Diffstat (limited to 'externals/dynarmic/src')
359 files changed, 84627 insertions, 0 deletions
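For context on how the public headers imported below (interface/A32/a32.h, interface/A32/config.h) are meant to be consumed, here is a minimal host-program sketch modeled on the example in dynarmic's README: an embedder derives from Dynarmic::A32::UserCallbacks to supply guest memory and timing, then drives translated code through Dynarmic::A32::Jit. The MyEnvironment class, its 2 KiB flat memory, and the two-instruction Thumb guest program are illustrative assumptions and not part of this commit; only the Jit/UserConfig/UserCallbacks names come from the interface headers added here.

// Hypothetical standalone host program; not part of this commit.
#include <algorithm>
#include <array>
#include <cstdint>
#include <cstdio>

#include <dynarmic/interface/A32/a32.h>
#include <dynarmic/interface/A32/config.h>

using u8 = std::uint8_t;
using u16 = std::uint16_t;
using u32 = std::uint32_t;
using u64 = std::uint64_t;

// Minimal guest environment: a flat 2 KiB memory and a simple tick budget.
class MyEnvironment final : public Dynarmic::A32::UserCallbacks {
public:
    u64 ticks_left = 0;
    std::array<u8, 2048> memory{};

    u8 MemoryRead8(u32 vaddr) override {
        return vaddr < memory.size() ? memory[vaddr] : 0;
    }
    u16 MemoryRead16(u32 vaddr) override {
        return u16(MemoryRead8(vaddr)) | u16(MemoryRead8(vaddr + 1)) << 8;
    }
    u32 MemoryRead32(u32 vaddr) override {
        return u32(MemoryRead16(vaddr)) | u32(MemoryRead16(vaddr + 2)) << 16;
    }
    u64 MemoryRead64(u32 vaddr) override {
        return u64(MemoryRead32(vaddr)) | u64(MemoryRead32(vaddr + 4)) << 32;
    }

    void MemoryWrite8(u32 vaddr, u8 value) override {
        if (vaddr < memory.size()) {
            memory[vaddr] = value;
        }
    }
    void MemoryWrite16(u32 vaddr, u16 value) override {
        MemoryWrite8(vaddr, u8(value));
        MemoryWrite8(vaddr + 1, u8(value >> 8));
    }
    void MemoryWrite32(u32 vaddr, u32 value) override {
        MemoryWrite16(vaddr, u16(value));
        MemoryWrite16(vaddr + 2, u16(value >> 16));
    }
    void MemoryWrite64(u32 vaddr, u64 value) override {
        MemoryWrite32(vaddr, u32(value));
        MemoryWrite32(vaddr + 4, u32(value >> 32));
    }

    void InterpreterFallback(u32 pc, std::size_t num_instructions) override {
        // Not expected to be reached by this tiny example.
        std::printf("interpreter fallback at %08x (%zu instructions)\n", pc, num_instructions);
    }
    void CallSVC(u32 swi) override {
        std::printf("svc #%u\n", swi);
    }
    void ExceptionRaised(u32 pc, Dynarmic::A32::Exception) override {
        std::printf("exception raised at %08x\n", pc);
    }

    void AddTicks(u64 ticks) override {
        ticks_left -= std::min(ticks, ticks_left);
    }
    u64 GetTicksRemaining() override {
        return ticks_left;
    }
};

int main() {
    MyEnvironment env;
    Dynarmic::A32::UserConfig user_config;
    user_config.callbacks = &env;
    Dynarmic::A32::Jit cpu{user_config};

    // Guest program (Thumb): lsls r0, r1, #2; b . (spin until the tick budget runs out).
    env.MemoryWrite16(0, 0x0088);
    env.MemoryWrite16(2, 0xE7FE);

    cpu.Regs()[1] = 2;
    cpu.Regs()[15] = 0;       // PC = 0
    cpu.SetCpsr(0x00000030);  // User mode, Thumb state
    env.ticks_left = 1;       // execute at least one instruction

    (void)cpu.Run();  // returns a HaltReason bitmask; here we just run until ticks expire

    std::printf("r0 = %u\n", static_cast<unsigned>(cpu.Regs()[0]));  // expected: 8
    return 0;
}

On an AArch64 host this path lands in the code added in this commit: A32Core::Run() fetches an entry point from A32AddressSpace and calls prelude_info.run_code, and the prelude emitted in A32AddressSpace::EmitPrelude() saves the host FPCR, installs guest state, checks the halt word, and branches into the translated block.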
diff --git a/externals/dynarmic/src/dynarmic/CMakeLists.txt b/externals/dynarmic/src/dynarmic/CMakeLists.txt new file mode 100644 index 0000000000..0d6068af6a --- /dev/null +++ b/externals/dynarmic/src/dynarmic/CMakeLists.txt @@ -0,0 +1,497 @@ +include(TargetArchitectureSpecificSources) + +add_library(dynarmic + backend/block_range_information.cpp + backend/block_range_information.h + backend/exception_handler.h + common/always_false.h + common/cast_util.h + common/crypto/aes.cpp + common/crypto/aes.h + common/crypto/crc32.cpp + common/crypto/crc32.h + common/crypto/sm4.cpp + common/crypto/sm4.h + common/fp/fpcr.h + common/fp/fpsr.h + common/fp/fused.cpp + common/fp/fused.h + common/fp/info.h + common/fp/mantissa_util.h + common/fp/op.h + common/fp/op/FPCompare.cpp + common/fp/op/FPCompare.h + common/fp/op/FPConvert.cpp + common/fp/op/FPConvert.h + common/fp/op/FPMulAdd.cpp + common/fp/op/FPMulAdd.h + common/fp/op/FPNeg.h + common/fp/op/FPRecipEstimate.cpp + common/fp/op/FPRecipEstimate.h + common/fp/op/FPRecipExponent.cpp + common/fp/op/FPRecipExponent.h + common/fp/op/FPRecipStepFused.cpp + common/fp/op/FPRecipStepFused.h + common/fp/op/FPRoundInt.cpp + common/fp/op/FPRoundInt.h + common/fp/op/FPRSqrtEstimate.cpp + common/fp/op/FPRSqrtEstimate.h + common/fp/op/FPRSqrtStepFused.cpp + common/fp/op/FPRSqrtStepFused.h + common/fp/op/FPToFixed.cpp + common/fp/op/FPToFixed.h + common/fp/process_exception.cpp + common/fp/process_exception.h + common/fp/process_nan.cpp + common/fp/process_nan.h + common/fp/rounding_mode.h + common/fp/unpacked.cpp + common/fp/unpacked.h + common/fp/util.h + common/llvm_disassemble.cpp + common/llvm_disassemble.h + common/lut_from_list.h + common/math_util.cpp + common/math_util.h + common/memory_pool.cpp + common/memory_pool.h + common/safe_ops.h + common/spin_lock.h + common/string_util.h + common/u128.cpp + common/u128.h + common/variant_util.h + frontend/A32/a32_types.cpp + frontend/A32/a32_types.h + frontend/A64/a64_types.cpp + frontend/A64/a64_types.h + frontend/decoder/decoder_detail.h + frontend/decoder/matcher.h + frontend/imm.cpp + frontend/imm.h + interface/exclusive_monitor.h + interface/optimization_flags.h + ir/acc_type.h + ir/basic_block.cpp + ir/basic_block.h + ir/cond.h + ir/ir_emitter.cpp + ir/ir_emitter.h + ir/location_descriptor.cpp + ir/location_descriptor.h + ir/microinstruction.cpp + ir/microinstruction.h + ir/opcodes.cpp + ir/opcodes.h + ir/opcodes.inc + ir/opt/constant_propagation_pass.cpp + ir/opt/dead_code_elimination_pass.cpp + ir/opt/identity_removal_pass.cpp + ir/opt/ir_matcher.h + ir/opt/naming_pass.cpp + ir/opt/passes.h + ir/opt/polyfill_pass.cpp + ir/opt/verification_pass.cpp + ir/terminal.h + ir/type.cpp + ir/type.h + ir/value.cpp + ir/value.h +) + +if ("A32" IN_LIST DYNARMIC_FRONTENDS) + target_sources(dynarmic PRIVATE + frontend/A32/a32_ir_emitter.cpp + frontend/A32/a32_ir_emitter.h + frontend/A32/a32_location_descriptor.cpp + frontend/A32/a32_location_descriptor.h + frontend/A32/decoder/arm.h + frontend/A32/decoder/arm.inc + frontend/A32/decoder/asimd.h + frontend/A32/decoder/asimd.inc + frontend/A32/decoder/thumb16.h + frontend/A32/decoder/thumb16.inc + frontend/A32/decoder/thumb32.h + frontend/A32/decoder/thumb32.inc + frontend/A32/decoder/vfp.h + frontend/A32/decoder/vfp.inc + frontend/A32/disassembler/disassembler.h + frontend/A32/disassembler/disassembler_arm.cpp + frontend/A32/disassembler/disassembler_thumb.cpp + frontend/A32/FPSCR.h + frontend/A32/ITState.h + frontend/A32/PSR.h + frontend/A32/translate/a32_translate.cpp 
+ frontend/A32/translate/a32_translate.h + frontend/A32/translate/conditional_state.cpp + frontend/A32/translate/conditional_state.h + frontend/A32/translate/impl/a32_branch.cpp + frontend/A32/translate/impl/a32_crc32.cpp + frontend/A32/translate/impl/a32_exception_generating.cpp + frontend/A32/translate/impl/a32_translate_impl.cpp + frontend/A32/translate/impl/a32_translate_impl.h + frontend/A32/translate/impl/asimd_load_store_structures.cpp + frontend/A32/translate/impl/asimd_misc.cpp + frontend/A32/translate/impl/asimd_one_reg_modified_immediate.cpp + frontend/A32/translate/impl/asimd_three_regs.cpp + frontend/A32/translate/impl/asimd_two_regs_misc.cpp + frontend/A32/translate/impl/asimd_two_regs_scalar.cpp + frontend/A32/translate/impl/asimd_two_regs_shift.cpp + frontend/A32/translate/impl/barrier.cpp + frontend/A32/translate/impl/coprocessor.cpp + frontend/A32/translate/impl/data_processing.cpp + frontend/A32/translate/impl/divide.cpp + frontend/A32/translate/impl/extension.cpp + frontend/A32/translate/impl/hint.cpp + frontend/A32/translate/impl/load_store.cpp + frontend/A32/translate/impl/misc.cpp + frontend/A32/translate/impl/multiply.cpp + frontend/A32/translate/impl/packing.cpp + frontend/A32/translate/impl/parallel.cpp + frontend/A32/translate/impl/reversal.cpp + frontend/A32/translate/impl/saturated.cpp + frontend/A32/translate/impl/status_register_access.cpp + frontend/A32/translate/impl/synchronization.cpp + frontend/A32/translate/impl/thumb16.cpp + frontend/A32/translate/impl/thumb32_branch.cpp + frontend/A32/translate/impl/thumb32_control.cpp + frontend/A32/translate/impl/thumb32_coprocessor.cpp + frontend/A32/translate/impl/thumb32_data_processing_modified_immediate.cpp + frontend/A32/translate/impl/thumb32_data_processing_plain_binary_immediate.cpp + frontend/A32/translate/impl/thumb32_data_processing_register.cpp + frontend/A32/translate/impl/thumb32_data_processing_shifted_register.cpp + frontend/A32/translate/impl/thumb32_load_byte.cpp + frontend/A32/translate/impl/thumb32_load_halfword.cpp + frontend/A32/translate/impl/thumb32_load_store_dual.cpp + frontend/A32/translate/impl/thumb32_load_store_multiple.cpp + frontend/A32/translate/impl/thumb32_load_word.cpp + frontend/A32/translate/impl/thumb32_long_multiply.cpp + frontend/A32/translate/impl/thumb32_misc.cpp + frontend/A32/translate/impl/thumb32_multiply.cpp + frontend/A32/translate/impl/thumb32_parallel.cpp + frontend/A32/translate/impl/thumb32_store_single_data_item.cpp + frontend/A32/translate/impl/vfp.cpp + frontend/A32/translate/translate_arm.cpp + frontend/A32/translate/translate_thumb.cpp + interface/A32/a32.h + interface/A32/arch_version.h + interface/A32/config.h + interface/A32/coprocessor.h + interface/A32/coprocessor_util.h + interface/A32/disassembler.h + ir/opt/a32_constant_memory_reads_pass.cpp + ir/opt/a32_get_set_elimination_pass.cpp + ) +endif() + +if ("A64" IN_LIST DYNARMIC_FRONTENDS) + target_sources(dynarmic PRIVATE + frontend/A64/a64_ir_emitter.cpp + frontend/A64/a64_ir_emitter.h + frontend/A64/a64_location_descriptor.cpp + frontend/A64/a64_location_descriptor.h + frontend/A64/decoder/a64.h + frontend/A64/decoder/a64.inc + frontend/A64/translate/a64_translate.cpp + frontend/A64/translate/a64_translate.h + frontend/A64/translate/impl/a64_branch.cpp + frontend/A64/translate/impl/a64_exception_generating.cpp + frontend/A64/translate/impl/data_processing_addsub.cpp + frontend/A64/translate/impl/data_processing_bitfield.cpp + frontend/A64/translate/impl/data_processing_conditional_compare.cpp + 
frontend/A64/translate/impl/data_processing_conditional_select.cpp + frontend/A64/translate/impl/data_processing_crc32.cpp + frontend/A64/translate/impl/data_processing_logical.cpp + frontend/A64/translate/impl/data_processing_multiply.cpp + frontend/A64/translate/impl/data_processing_pcrel.cpp + frontend/A64/translate/impl/data_processing_register.cpp + frontend/A64/translate/impl/data_processing_shift.cpp + frontend/A64/translate/impl/floating_point_compare.cpp + frontend/A64/translate/impl/floating_point_conditional_compare.cpp + frontend/A64/translate/impl/floating_point_conditional_select.cpp + frontend/A64/translate/impl/floating_point_conversion_fixed_point.cpp + frontend/A64/translate/impl/floating_point_conversion_integer.cpp + frontend/A64/translate/impl/floating_point_data_processing_one_register.cpp + frontend/A64/translate/impl/floating_point_data_processing_three_register.cpp + frontend/A64/translate/impl/floating_point_data_processing_two_register.cpp + frontend/A64/translate/impl/impl.cpp + frontend/A64/translate/impl/impl.h + frontend/A64/translate/impl/load_store_exclusive.cpp + frontend/A64/translate/impl/load_store_load_literal.cpp + frontend/A64/translate/impl/load_store_multiple_structures.cpp + frontend/A64/translate/impl/load_store_no_allocate_pair.cpp + frontend/A64/translate/impl/load_store_register_immediate.cpp + frontend/A64/translate/impl/load_store_register_pair.cpp + frontend/A64/translate/impl/load_store_register_register_offset.cpp + frontend/A64/translate/impl/load_store_register_unprivileged.cpp + frontend/A64/translate/impl/load_store_single_structure.cpp + frontend/A64/translate/impl/move_wide.cpp + frontend/A64/translate/impl/simd_across_lanes.cpp + frontend/A64/translate/impl/simd_aes.cpp + frontend/A64/translate/impl/simd_copy.cpp + frontend/A64/translate/impl/simd_crypto_four_register.cpp + frontend/A64/translate/impl/simd_crypto_three_register.cpp + frontend/A64/translate/impl/simd_extract.cpp + frontend/A64/translate/impl/simd_modified_immediate.cpp + frontend/A64/translate/impl/simd_permute.cpp + frontend/A64/translate/impl/simd_scalar_pairwise.cpp + frontend/A64/translate/impl/simd_scalar_shift_by_immediate.cpp + frontend/A64/translate/impl/simd_scalar_three_same.cpp + frontend/A64/translate/impl/simd_scalar_two_register_misc.cpp + frontend/A64/translate/impl/simd_scalar_x_indexed_element.cpp + frontend/A64/translate/impl/simd_sha.cpp + frontend/A64/translate/impl/simd_sha512.cpp + frontend/A64/translate/impl/simd_shift_by_immediate.cpp + frontend/A64/translate/impl/simd_table_lookup.cpp + frontend/A64/translate/impl/simd_three_different.cpp + frontend/A64/translate/impl/simd_three_same.cpp + frontend/A64/translate/impl/simd_three_same_extra.cpp + frontend/A64/translate/impl/simd_two_register_misc.cpp + frontend/A64/translate/impl/simd_vector_x_indexed_element.cpp + frontend/A64/translate/impl/sys_dc.cpp + frontend/A64/translate/impl/sys_ic.cpp + frontend/A64/translate/impl/system.cpp + frontend/A64/translate/impl/system_flag_format.cpp + frontend/A64/translate/impl/system_flag_manipulation.cpp + interface/A64/a64.h + interface/A64/config.h + ir/opt/a64_callback_config_pass.cpp + ir/opt/a64_get_set_elimination_pass.cpp + ir/opt/a64_merge_interpret_blocks.cpp + ) +endif() + +if ("x86_64" IN_LIST ARCHITECTURE) + target_link_libraries(dynarmic + PRIVATE + xbyak::xbyak + Zydis::Zydis + ) + + target_architecture_specific_sources(dynarmic "x86_64" + backend/x64/abi.cpp + backend/x64/abi.h + backend/x64/block_of_code.cpp + backend/x64/block_of_code.h 
+ backend/x64/callback.cpp + backend/x64/callback.h + backend/x64/constant_pool.cpp + backend/x64/constant_pool.h + backend/x64/constants.h + backend/x64/devirtualize.h + backend/x64/emit_x64.cpp + backend/x64/emit_x64.h + backend/x64/emit_x64_aes.cpp + backend/x64/emit_x64_crc32.cpp + backend/x64/emit_x64_data_processing.cpp + backend/x64/emit_x64_floating_point.cpp + backend/x64/emit_x64_memory.cpp.inc + backend/x64/emit_x64_memory.h + backend/x64/emit_x64_packed.cpp + backend/x64/emit_x64_saturation.cpp + backend/x64/emit_x64_sha.cpp + backend/x64/emit_x64_sm4.cpp + backend/x64/emit_x64_vector.cpp + backend/x64/emit_x64_vector_floating_point.cpp + backend/x64/emit_x64_vector_saturation.cpp + backend/x64/exclusive_monitor.cpp + backend/x64/exclusive_monitor_friend.h + backend/x64/host_feature.h + backend/x64/hostloc.cpp + backend/x64/hostloc.h + backend/x64/jitstate_info.h + backend/x64/oparg.h + backend/x64/perf_map.cpp + backend/x64/perf_map.h + backend/x64/reg_alloc.cpp + backend/x64/reg_alloc.h + backend/x64/stack_layout.h + backend/x64/verbose_debugging_output.cpp + backend/x64/verbose_debugging_output.h + common/spin_lock_x64.cpp + common/spin_lock_x64.h + common/x64_disassemble.cpp + common/x64_disassemble.h + ) + + if ("A32" IN_LIST DYNARMIC_FRONTENDS) + target_architecture_specific_sources(dynarmic "x86_64" + backend/x64/a32_emit_x64.cpp + backend/x64/a32_emit_x64.h + backend/x64/a32_emit_x64_memory.cpp + backend/x64/a32_interface.cpp + backend/x64/a32_jitstate.cpp + backend/x64/a32_jitstate.h + ) + endif() + + if ("A64" IN_LIST DYNARMIC_FRONTENDS) + target_architecture_specific_sources(dynarmic "x86_64" + backend/x64/a64_emit_x64.cpp + backend/x64/a64_emit_x64.h + backend/x64/a64_emit_x64_memory.cpp + backend/x64/a64_interface.cpp + backend/x64/a64_jitstate.cpp + backend/x64/a64_jitstate.h + ) + endif() +endif() + +if ("arm64" IN_LIST ARCHITECTURE) + target_link_libraries(dynarmic PRIVATE merry::oaknut) + + target_architecture_specific_sources(dynarmic "arm64" + backend/arm64/a32_jitstate.cpp + backend/arm64/a32_jitstate.h + backend/arm64/a64_jitstate.h + backend/arm64/abi.cpp + backend/arm64/abi.h + backend/arm64/address_space.cpp + backend/arm64/address_space.h + backend/arm64/devirtualize.h + backend/arm64/emit_arm64.cpp + backend/arm64/emit_arm64.h + backend/arm64/emit_arm64_a32.cpp + backend/arm64/emit_arm64_a32_coprocessor.cpp + backend/arm64/emit_arm64_a32_memory.cpp + backend/arm64/emit_arm64_a64.cpp + backend/arm64/emit_arm64_a64_memory.cpp + backend/arm64/emit_arm64_cryptography.cpp + backend/arm64/emit_arm64_data_processing.cpp + backend/arm64/emit_arm64_floating_point.cpp + backend/arm64/emit_arm64_memory.cpp + backend/arm64/emit_arm64_memory.h + backend/arm64/emit_arm64_packed.cpp + backend/arm64/emit_arm64_saturation.cpp + backend/arm64/emit_arm64_vector.cpp + backend/arm64/emit_arm64_vector_floating_point.cpp + backend/arm64/emit_arm64_vector_saturation.cpp + backend/arm64/emit_context.h + backend/arm64/exclusive_monitor.cpp + backend/arm64/fastmem.h + backend/arm64/fpsr_manager.cpp + backend/arm64/fpsr_manager.h + backend/arm64/reg_alloc.cpp + backend/arm64/reg_alloc.h + backend/arm64/stack_layout.h + backend/arm64/verbose_debugging_output.cpp + backend/arm64/verbose_debugging_output.h + common/spin_lock_arm64.cpp + common/spin_lock_arm64.h + ) + + if ("A32" IN_LIST DYNARMIC_FRONTENDS) + target_architecture_specific_sources(dynarmic "arm64" + backend/arm64/a32_address_space.cpp + backend/arm64/a32_address_space.h + backend/arm64/a32_core.h + 
backend/arm64/a32_interface.cpp + ) + endif() + + if ("A64" IN_LIST DYNARMIC_FRONTENDS) + target_architecture_specific_sources(dynarmic "arm64" + backend/arm64/a64_address_space.cpp + backend/arm64/a64_address_space.h + backend/arm64/a64_core.h + backend/arm64/a64_interface.cpp + ) + endif() +endif() + +if (WIN32) + target_sources(dynarmic PRIVATE backend/exception_handler_windows.cpp) +elseif (APPLE) + find_path(MACH_EXC_DEFS_DIR "mach/mach_exc.defs") + if (NOT MACH_EXC_DEFS_DIR) + message(WARNING "macOS fastmem disabled: unable to find mach/mach_exc.defs") + target_sources(dynarmic PRIVATE backend/exception_handler_generic.cpp) + else() + message(STATUS "mach/mach_exc.defs location: ${MACH_EXC_DEFS_DIR}") + execute_process( + COMMAND + mkdir -p "${CMAKE_CURRENT_SOURCE_DIR}/backend/x64/mig" + COMMAND + mig + -arch x86_64 + -user "${CMAKE_CURRENT_SOURCE_DIR}/backend/x64/mig/mach_exc_user.c" + -header "${CMAKE_CURRENT_SOURCE_DIR}/backend/x64/mig/mach_exc_user.h" + -server "${CMAKE_CURRENT_SOURCE_DIR}/backend/x64/mig/mach_exc_server.c" + -sheader "${CMAKE_CURRENT_SOURCE_DIR}/backend/x64/mig/mach_exc_server.h" + "${MACH_EXC_DEFS_DIR}/mach/mach_exc.defs" + ) + execute_process( + COMMAND + mkdir -p "${CMAKE_CURRENT_SOURCE_DIR}/backend/arm64/mig" + COMMAND + mig + -arch arm64 + -user "${CMAKE_CURRENT_SOURCE_DIR}/backend/arm64/mig/mach_exc_user.c" + -header "${CMAKE_CURRENT_SOURCE_DIR}/backend/arm64/mig/mach_exc_user.h" + -server "${CMAKE_CURRENT_SOURCE_DIR}/backend/arm64/mig/mach_exc_server.c" + -sheader "${CMAKE_CURRENT_SOURCE_DIR}/backend/arm64/mig/mach_exc_server.h" + "${MACH_EXC_DEFS_DIR}/mach/mach_exc.defs" + ) + target_sources(dynarmic PRIVATE + backend/exception_handler_macos.cpp + backend/exception_handler_macos_mig.c + ) + endif() +elseif (UNIX) + if (CMAKE_SYSTEM_NAME STREQUAL "Linux") + target_link_libraries(dynarmic PRIVATE rt) + endif() + target_sources(dynarmic PRIVATE backend/exception_handler_posix.cpp) +else() + target_sources(dynarmic PRIVATE backend/exception_handler_generic.cpp) +endif() + +include(CreateDirectoryGroups) +create_target_directory_groups(dynarmic) + +target_include_directories(dynarmic PUBLIC + $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/..> + $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}> +) +set_target_properties(dynarmic PROPERTIES + VERSION ${dynarmic_VERSION} + SOVERSION ${dynarmic_VERSION_MAJOR}.${dynarmic_VERSION_MINOR} +) +target_compile_options(dynarmic PRIVATE ${DYNARMIC_CXX_FLAGS}) +target_link_libraries(dynarmic + PRIVATE + Boost::boost + fmt::fmt + merry::mcl + tsl::robin_map +) +if (DYNARMIC_USE_LLVM) + target_include_directories(dynarmic PRIVATE ${LLVM_INCLUDE_DIRS}) + target_compile_definitions(dynarmic PRIVATE DYNARMIC_USE_LLVM=1 ${LLVM_DEFINITIONS}) + llvm_config(dynarmic USE_SHARED armdesc armdisassembler aarch64desc aarch64disassembler x86desc x86disassembler) +endif() +if (DYNARMIC_ENABLE_CPU_FEATURE_DETECTION) + target_compile_definitions(dynarmic PRIVATE DYNARMIC_ENABLE_CPU_FEATURE_DETECTION=1) +endif() +if (DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT) + target_compile_definitions(dynarmic PRIVATE DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT=1) +endif() +if (DYNARMIC_IGNORE_ASSERTS) + target_compile_definitions(dynarmic PRIVATE MCL_IGNORE_ASSERTS=1) +endif() +if (CMAKE_SYSTEM_NAME STREQUAL "Windows") + target_compile_definitions(dynarmic PRIVATE FMT_USE_WINDOWS_H=0) +endif() +target_compile_definitions(dynarmic PRIVATE FMT_USE_USER_DEFINED_LITERALS=1) + +if (DYNARMIC_USE_PRECOMPILED_HEADERS) + set(PRECOMPILED_HEADERS 
"$<$<COMPILE_LANGUAGE:CXX>:${CMAKE_CURRENT_SOURCE_DIR}/ir/ir_emitter.h>") + if ("x86_64" IN_LIST ARCHITECTURE) + list(PREPEND PRECOMPILED_HEADERS "$<$<COMPILE_LANGUAGE:CXX>:<xbyak/xbyak.h$<ANGLE-R>>") + endif() + if ("arm64" IN_LIST ARCHITECTURE) + list(PREPEND PRECOMPILED_HEADERS "$<$<COMPILE_LANGUAGE:CXX>:<oaknut/oaknut.hpp$<ANGLE-R>>") + endif() + target_precompile_headers(dynarmic PRIVATE ${PRECOMPILED_HEADERS}) + set(CMAKE_PCH_INSTANTIATE_TEMPLATES ON) +endif() diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/a32_address_space.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/a32_address_space.cpp new file mode 100644 index 0000000000..cafc47909f --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/a32_address_space.cpp @@ -0,0 +1,424 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/arm64/a32_address_space.h" + +#include "dynarmic/backend/arm64/a32_jitstate.h" +#include "dynarmic/backend/arm64/abi.h" +#include "dynarmic/backend/arm64/devirtualize.h" +#include "dynarmic/backend/arm64/emit_arm64.h" +#include "dynarmic/backend/arm64/stack_layout.h" +#include "dynarmic/common/cast_util.h" +#include "dynarmic/common/fp/fpcr.h" +#include "dynarmic/frontend/A32/a32_location_descriptor.h" +#include "dynarmic/frontend/A32/translate/a32_translate.h" +#include "dynarmic/interface/A32/config.h" +#include "dynarmic/interface/exclusive_monitor.h" +#include "dynarmic/ir/opt/passes.h" + +namespace Dynarmic::Backend::Arm64 { + +template<auto mfp, typename T> +static void* EmitCallTrampoline(oaknut::CodeGenerator& code, T* this_) { + using namespace oaknut::util; + + const auto info = Devirtualize<mfp>(this_); + + oaknut::Label l_addr, l_this; + + void* target = code.xptr<void*>(); + code.LDR(X0, l_this); + code.LDR(Xscratch0, l_addr); + code.BR(Xscratch0); + + code.align(8); + code.l(l_this); + code.dx(info.this_ptr); + code.l(l_addr); + code.dx(info.fn_ptr); + + return target; +} + +template<auto mfp, typename T> +static void* EmitWrappedReadCallTrampoline(oaknut::CodeGenerator& code, T* this_) { + using namespace oaknut::util; + + const auto info = Devirtualize<mfp>(this_); + + oaknut::Label l_addr, l_this; + + constexpr u64 save_regs = ABI_CALLER_SAVE & ~ToRegList(Xscratch0); + + void* target = code.xptr<void*>(); + ABI_PushRegisters(code, save_regs, 0); + code.LDR(X0, l_this); + code.MOV(X1, Xscratch0); + code.LDR(Xscratch0, l_addr); + code.BLR(Xscratch0); + code.MOV(Xscratch0, X0); + ABI_PopRegisters(code, save_regs, 0); + code.RET(); + + code.align(8); + code.l(l_this); + code.dx(info.this_ptr); + code.l(l_addr); + code.dx(info.fn_ptr); + + return target; +} + +template<auto callback, typename T> +static void* EmitExclusiveReadCallTrampoline(oaknut::CodeGenerator& code, const A32::UserConfig& conf) { + using namespace oaknut::util; + + oaknut::Label l_addr, l_this; + + auto fn = [](const A32::UserConfig& conf, A32::VAddr vaddr) -> T { + return conf.global_monitor->ReadAndMark<T>(conf.processor_id, vaddr, [&]() -> T { + return (conf.callbacks->*callback)(vaddr); + }); + }; + + void* target = code.xptr<void*>(); + code.LDR(X0, l_this); + code.LDR(Xscratch0, l_addr); + code.BR(Xscratch0); + + code.align(8); + code.l(l_this); + code.dx(mcl::bit_cast<u64>(&conf)); + code.l(l_addr); + code.dx(mcl::bit_cast<u64>(Common::FptrCast(fn))); + + return target; +} + +template<auto mfp, typename T> +static void* EmitWrappedWriteCallTrampoline(oaknut::CodeGenerator& code, T* this_) { + 
using namespace oaknut::util; + + const auto info = Devirtualize<mfp>(this_); + + oaknut::Label l_addr, l_this; + + constexpr u64 save_regs = ABI_CALLER_SAVE; + + void* target = code.xptr<void*>(); + ABI_PushRegisters(code, save_regs, 0); + code.LDR(X0, l_this); + code.MOV(X1, Xscratch0); + code.MOV(X2, Xscratch1); + code.LDR(Xscratch0, l_addr); + code.BLR(Xscratch0); + ABI_PopRegisters(code, save_regs, 0); + code.RET(); + + code.align(8); + code.l(l_this); + code.dx(info.this_ptr); + code.l(l_addr); + code.dx(info.fn_ptr); + + return target; +} + +template<auto callback, typename T> +static void* EmitExclusiveWriteCallTrampoline(oaknut::CodeGenerator& code, const A32::UserConfig& conf) { + using namespace oaknut::util; + + oaknut::Label l_addr, l_this; + + auto fn = [](const A32::UserConfig& conf, A32::VAddr vaddr, T value) -> u32 { + return conf.global_monitor->DoExclusiveOperation<T>(conf.processor_id, vaddr, + [&](T expected) -> bool { + return (conf.callbacks->*callback)(vaddr, value, expected); + }) + ? 0 + : 1; + }; + + void* target = code.xptr<void*>(); + code.LDR(X0, l_this); + code.LDR(Xscratch0, l_addr); + code.BR(Xscratch0); + + code.align(8); + code.l(l_this); + code.dx(mcl::bit_cast<u64>(&conf)); + code.l(l_addr); + code.dx(mcl::bit_cast<u64>(Common::FptrCast(fn))); + + return target; +} + +A32AddressSpace::A32AddressSpace(const A32::UserConfig& conf) + : AddressSpace(conf.code_cache_size) + , conf(conf) { + EmitPrelude(); +} + +IR::Block A32AddressSpace::GenerateIR(IR::LocationDescriptor descriptor) const { + IR::Block ir_block = A32::Translate(A32::LocationDescriptor{descriptor}, conf.callbacks, {conf.arch_version, conf.define_unpredictable_behaviour, conf.hook_hint_instructions}); + + Optimization::PolyfillPass(ir_block, {}); + Optimization::NamingPass(ir_block); + if (conf.HasOptimization(OptimizationFlag::GetSetElimination)) { + Optimization::A32GetSetElimination(ir_block, {.convert_nzc_to_nz = true}); + Optimization::DeadCodeElimination(ir_block); + } + if (conf.HasOptimization(OptimizationFlag::ConstProp)) { + Optimization::A32ConstantMemoryReads(ir_block, conf.callbacks); + Optimization::ConstantPropagation(ir_block); + Optimization::DeadCodeElimination(ir_block); + } + Optimization::IdentityRemovalPass(ir_block); + Optimization::VerificationPass(ir_block); + + return ir_block; +} + +void A32AddressSpace::InvalidateCacheRanges(const boost::icl::interval_set<u32>& ranges) { + InvalidateBasicBlocks(block_ranges.InvalidateRanges(ranges)); +} + +void A32AddressSpace::EmitPrelude() { + using namespace oaknut::util; + + UnprotectCodeMemory(); + + prelude_info.read_memory_8 = EmitCallTrampoline<&A32::UserCallbacks::MemoryRead8>(code, conf.callbacks); + prelude_info.read_memory_16 = EmitCallTrampoline<&A32::UserCallbacks::MemoryRead16>(code, conf.callbacks); + prelude_info.read_memory_32 = EmitCallTrampoline<&A32::UserCallbacks::MemoryRead32>(code, conf.callbacks); + prelude_info.read_memory_64 = EmitCallTrampoline<&A32::UserCallbacks::MemoryRead64>(code, conf.callbacks); + prelude_info.wrapped_read_memory_8 = EmitWrappedReadCallTrampoline<&A32::UserCallbacks::MemoryRead8>(code, conf.callbacks); + prelude_info.wrapped_read_memory_16 = EmitWrappedReadCallTrampoline<&A32::UserCallbacks::MemoryRead16>(code, conf.callbacks); + prelude_info.wrapped_read_memory_32 = EmitWrappedReadCallTrampoline<&A32::UserCallbacks::MemoryRead32>(code, conf.callbacks); + prelude_info.wrapped_read_memory_64 = EmitWrappedReadCallTrampoline<&A32::UserCallbacks::MemoryRead64>(code, conf.callbacks); + 
prelude_info.exclusive_read_memory_8 = EmitExclusiveReadCallTrampoline<&A32::UserCallbacks::MemoryRead8, u8>(code, conf); + prelude_info.exclusive_read_memory_16 = EmitExclusiveReadCallTrampoline<&A32::UserCallbacks::MemoryRead16, u16>(code, conf); + prelude_info.exclusive_read_memory_32 = EmitExclusiveReadCallTrampoline<&A32::UserCallbacks::MemoryRead32, u32>(code, conf); + prelude_info.exclusive_read_memory_64 = EmitExclusiveReadCallTrampoline<&A32::UserCallbacks::MemoryRead64, u64>(code, conf); + prelude_info.write_memory_8 = EmitCallTrampoline<&A32::UserCallbacks::MemoryWrite8>(code, conf.callbacks); + prelude_info.write_memory_16 = EmitCallTrampoline<&A32::UserCallbacks::MemoryWrite16>(code, conf.callbacks); + prelude_info.write_memory_32 = EmitCallTrampoline<&A32::UserCallbacks::MemoryWrite32>(code, conf.callbacks); + prelude_info.write_memory_64 = EmitCallTrampoline<&A32::UserCallbacks::MemoryWrite64>(code, conf.callbacks); + prelude_info.wrapped_write_memory_8 = EmitWrappedWriteCallTrampoline<&A32::UserCallbacks::MemoryWrite8>(code, conf.callbacks); + prelude_info.wrapped_write_memory_16 = EmitWrappedWriteCallTrampoline<&A32::UserCallbacks::MemoryWrite16>(code, conf.callbacks); + prelude_info.wrapped_write_memory_32 = EmitWrappedWriteCallTrampoline<&A32::UserCallbacks::MemoryWrite32>(code, conf.callbacks); + prelude_info.wrapped_write_memory_64 = EmitWrappedWriteCallTrampoline<&A32::UserCallbacks::MemoryWrite64>(code, conf.callbacks); + prelude_info.exclusive_write_memory_8 = EmitExclusiveWriteCallTrampoline<&A32::UserCallbacks::MemoryWriteExclusive8, u8>(code, conf); + prelude_info.exclusive_write_memory_16 = EmitExclusiveWriteCallTrampoline<&A32::UserCallbacks::MemoryWriteExclusive16, u16>(code, conf); + prelude_info.exclusive_write_memory_32 = EmitExclusiveWriteCallTrampoline<&A32::UserCallbacks::MemoryWriteExclusive32, u32>(code, conf); + prelude_info.exclusive_write_memory_64 = EmitExclusiveWriteCallTrampoline<&A32::UserCallbacks::MemoryWriteExclusive64, u64>(code, conf); + prelude_info.call_svc = EmitCallTrampoline<&A32::UserCallbacks::CallSVC>(code, conf.callbacks); + prelude_info.exception_raised = EmitCallTrampoline<&A32::UserCallbacks::ExceptionRaised>(code, conf.callbacks); + prelude_info.isb_raised = EmitCallTrampoline<&A32::UserCallbacks::InstructionSynchronizationBarrierRaised>(code, conf.callbacks); + prelude_info.add_ticks = EmitCallTrampoline<&A32::UserCallbacks::AddTicks>(code, conf.callbacks); + prelude_info.get_ticks_remaining = EmitCallTrampoline<&A32::UserCallbacks::GetTicksRemaining>(code, conf.callbacks); + + oaknut::Label return_from_run_code, l_return_to_dispatcher; + + prelude_info.run_code = code.xptr<PreludeInfo::RunCodeFuncType>(); + { + ABI_PushRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout)); + + code.MOV(X19, X0); + code.MOV(Xstate, X1); + code.MOV(Xhalt, X2); + if (conf.page_table) { + code.MOV(Xpagetable, mcl::bit_cast<u64>(conf.page_table)); + } + if (conf.fastmem_pointer) { + code.MOV(Xfastmem, mcl::bit_cast<u64>(conf.fastmem_pointer)); + } + + if (conf.HasOptimization(OptimizationFlag::ReturnStackBuffer)) { + code.LDR(Xscratch0, l_return_to_dispatcher); + for (size_t i = 0; i < RSBCount; i++) { + code.STR(Xscratch0, SP, offsetof(StackLayout, rsb) + offsetof(RSBEntry, code_ptr) + i * sizeof(RSBEntry)); + } + } + + if (conf.enable_cycle_counting) { + code.BL(prelude_info.get_ticks_remaining); + code.MOV(Xticks, X0); + code.STR(Xticks, SP, offsetof(StackLayout, cycles_to_run)); + } + + code.LDR(Wscratch0, Xstate, 
offsetof(A32JitState, upper_location_descriptor)); + code.AND(Wscratch0, Wscratch0, 0xffff0000); + code.MRS(Xscratch1, oaknut::SystemReg::FPCR); + code.STR(Wscratch1, SP, offsetof(StackLayout, save_host_fpcr)); + code.MSR(oaknut::SystemReg::FPCR, Xscratch0); + + code.LDAR(Wscratch0, Xhalt); + code.CBNZ(Wscratch0, return_from_run_code); + + code.BR(X19); + } + + prelude_info.step_code = code.xptr<PreludeInfo::RunCodeFuncType>(); + { + ABI_PushRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout)); + + code.MOV(X19, X0); + code.MOV(Xstate, X1); + code.MOV(Xhalt, X2); + if (conf.page_table) { + code.MOV(Xpagetable, mcl::bit_cast<u64>(conf.page_table)); + } + if (conf.fastmem_pointer) { + code.MOV(Xfastmem, mcl::bit_cast<u64>(conf.fastmem_pointer)); + } + + if (conf.HasOptimization(OptimizationFlag::ReturnStackBuffer)) { + code.LDR(Xscratch0, l_return_to_dispatcher); + for (size_t i = 0; i < RSBCount; i++) { + code.STR(Xscratch0, SP, offsetof(StackLayout, rsb) + offsetof(RSBEntry, code_ptr) + i * sizeof(RSBEntry)); + } + } + + if (conf.enable_cycle_counting) { + code.MOV(Xticks, 1); + code.STR(Xticks, SP, offsetof(StackLayout, cycles_to_run)); + } + + code.LDR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor)); + code.AND(Wscratch0, Wscratch0, 0xffff0000); + code.MRS(Xscratch1, oaknut::SystemReg::FPCR); + code.STR(Wscratch1, SP, offsetof(StackLayout, save_host_fpcr)); + code.MSR(oaknut::SystemReg::FPCR, Xscratch0); + + oaknut::Label step_hr_loop; + code.l(step_hr_loop); + code.LDAXR(Wscratch0, Xhalt); + code.CBNZ(Wscratch0, return_from_run_code); + code.ORR(Wscratch0, Wscratch0, static_cast<u32>(HaltReason::Step)); + code.STLXR(Wscratch1, Wscratch0, Xhalt); + code.CBNZ(Wscratch1, step_hr_loop); + + code.BR(X19); + } + + prelude_info.return_to_dispatcher = code.xptr<void*>(); + { + oaknut::Label l_this, l_addr; + + code.LDAR(Wscratch0, Xhalt); + code.CBNZ(Wscratch0, return_from_run_code); + + if (conf.enable_cycle_counting) { + code.CMP(Xticks, 0); + code.B(LE, return_from_run_code); + } + + code.LDR(X0, l_this); + code.MOV(X1, Xstate); + code.LDR(Xscratch0, l_addr); + code.BLR(Xscratch0); + code.BR(X0); + + const auto fn = [](A32AddressSpace& self, A32JitState& context) -> CodePtr { + return self.GetOrEmit(context.GetLocationDescriptor()); + }; + + code.align(8); + code.l(l_this); + code.dx(mcl::bit_cast<u64>(this)); + code.l(l_addr); + code.dx(mcl::bit_cast<u64>(Common::FptrCast(fn))); + } + + prelude_info.return_from_run_code = code.xptr<void*>(); + { + code.l(return_from_run_code); + + if (conf.enable_cycle_counting) { + code.LDR(X1, SP, offsetof(StackLayout, cycles_to_run)); + code.SUB(X1, X1, Xticks); + code.BL(prelude_info.add_ticks); + } + + code.LDR(Wscratch0, SP, offsetof(StackLayout, save_host_fpcr)); + code.MSR(oaknut::SystemReg::FPCR, Xscratch0); + + oaknut::Label exit_hr_loop; + code.l(exit_hr_loop); + code.LDAXR(W0, Xhalt); + code.STLXR(Wscratch0, WZR, Xhalt); + code.CBNZ(Wscratch0, exit_hr_loop); + + ABI_PopRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout)); + code.RET(); + } + + code.align(8); + code.l(l_return_to_dispatcher); + code.dx(mcl::bit_cast<u64>(prelude_info.return_to_dispatcher)); + + prelude_info.end_of_prelude = code.offset(); + + mem.invalidate_all(); + ProtectCodeMemory(); +} + +EmitConfig A32AddressSpace::GetEmitConfig() { + return EmitConfig{ + .optimizations = conf.unsafe_optimizations ? 
conf.optimizations : conf.optimizations & all_safe_optimizations, + + .hook_isb = conf.hook_isb, + + .cntfreq_el0{}, + .ctr_el0{}, + .dczid_el0{}, + .tpidrro_el0{}, + .tpidr_el0{}, + + .check_halt_on_memory_access = conf.check_halt_on_memory_access, + + .page_table_pointer = mcl::bit_cast<u64>(conf.page_table), + .page_table_address_space_bits = 32, + .page_table_pointer_mask_bits = conf.page_table_pointer_mask_bits, + .silently_mirror_page_table = true, + .absolute_offset_page_table = conf.absolute_offset_page_table, + .detect_misaligned_access_via_page_table = conf.detect_misaligned_access_via_page_table, + .only_detect_misalignment_via_page_table_on_page_boundary = conf.only_detect_misalignment_via_page_table_on_page_boundary, + + .fastmem_pointer = mcl::bit_cast<u64>(conf.fastmem_pointer), + .recompile_on_fastmem_failure = conf.recompile_on_fastmem_failure, + .fastmem_address_space_bits = 32, + .silently_mirror_fastmem = true, + + .wall_clock_cntpct = conf.wall_clock_cntpct, + .enable_cycle_counting = conf.enable_cycle_counting, + + .always_little_endian = conf.always_little_endian, + + .descriptor_to_fpcr = [](const IR::LocationDescriptor& location) { return FP::FPCR{A32::LocationDescriptor{location}.FPSCR().Value()}; }, + .emit_cond = EmitA32Cond, + .emit_condition_failed_terminal = EmitA32ConditionFailedTerminal, + .emit_terminal = EmitA32Terminal, + .emit_check_memory_abort = EmitA32CheckMemoryAbort, + + .state_nzcv_offset = offsetof(A32JitState, cpsr_nzcv), + .state_fpsr_offset = offsetof(A32JitState, fpsr), + .state_exclusive_state_offset = offsetof(A32JitState, exclusive_state), + + .coprocessors = conf.coprocessors, + + .very_verbose_debugging_output = conf.very_verbose_debugging_output, + }; +} + +void A32AddressSpace::RegisterNewBasicBlock(const IR::Block& block, const EmittedBlockInfo&) { + const A32::LocationDescriptor descriptor{block.Location()}; + const A32::LocationDescriptor end_location{block.EndLocation()}; + const auto range = boost::icl::discrete_interval<u32>::closed(descriptor.PC(), end_location.PC() - 1); + block_ranges.AddRange(range, descriptor); +} + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/a32_address_space.h b/externals/dynarmic/src/dynarmic/backend/arm64/a32_address_space.h new file mode 100644 index 0000000000..3b9e9fb2f6 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/a32_address_space.h @@ -0,0 +1,35 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include "dynarmic/backend/arm64/address_space.h" +#include "dynarmic/backend/block_range_information.h" +#include "dynarmic/interface/A32/config.h" + +namespace Dynarmic::Backend::Arm64 { + +struct EmittedBlockInfo; + +class A32AddressSpace final : public AddressSpace { +public: + explicit A32AddressSpace(const A32::UserConfig& conf); + + IR::Block GenerateIR(IR::LocationDescriptor) const override; + + void InvalidateCacheRanges(const boost::icl::interval_set<u32>& ranges); + +protected: + friend class A32Core; + + void EmitPrelude(); + EmitConfig GetEmitConfig() override; + void RegisterNewBasicBlock(const IR::Block& block, const EmittedBlockInfo& block_info) override; + + const A32::UserConfig conf; + BlockRangeInformation<u32> block_ranges; +}; + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/a32_core.h b/externals/dynarmic/src/dynarmic/backend/arm64/a32_core.h new file mode 100644 index 0000000000..86f6f284d8 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/a32_core.h @@ -0,0 +1,30 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include "dynarmic/backend/arm64/a32_address_space.h" +#include "dynarmic/backend/arm64/a32_jitstate.h" + +namespace Dynarmic::Backend::Arm64 { + +class A32Core final { +public: + explicit A32Core(const A32::UserConfig&) {} + + HaltReason Run(A32AddressSpace& process, A32JitState& thread_ctx, volatile u32* halt_reason) { + const auto location_descriptor = thread_ctx.GetLocationDescriptor(); + const auto entry_point = process.GetOrEmit(location_descriptor); + return process.prelude_info.run_code(entry_point, &thread_ctx, halt_reason); + } + + HaltReason Step(A32AddressSpace& process, A32JitState& thread_ctx, volatile u32* halt_reason) { + const auto location_descriptor = A32::LocationDescriptor{thread_ctx.GetLocationDescriptor()}.SetSingleStepping(true); + const auto entry_point = process.GetOrEmit(location_descriptor); + return process.prelude_info.step_code(entry_point, &thread_ctx, halt_reason); + } +}; + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/a32_interface.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/a32_interface.cpp new file mode 100644 index 0000000000..6b38c41093 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/a32_interface.cpp @@ -0,0 +1,239 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2021 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <memory> +#include <mutex> + +#include <boost/icl/interval_set.hpp> +#include <mcl/assert.hpp> +#include <mcl/scope_exit.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/backend/arm64/a32_address_space.h" +#include "dynarmic/backend/arm64/a32_core.h" +#include "dynarmic/backend/arm64/a32_jitstate.h" +#include "dynarmic/common/atomic.h" +#include "dynarmic/interface/A32/a32.h" + +namespace Dynarmic::A32 { + +using namespace Backend::Arm64; + +struct Jit::Impl final { + Impl(Jit* jit_interface, A32::UserConfig conf) + : jit_interface(jit_interface) + , conf(conf) + , current_address_space(conf) + , core(conf) {} + + HaltReason Run() { + ASSERT(!jit_interface->is_executing); + PerformRequestedCacheInvalidation(static_cast<HaltReason>(Atomic::Load(&halt_reason))); + + jit_interface->is_executing = true; + SCOPE_EXIT { + jit_interface->is_executing = false; + }; + + HaltReason hr = core.Run(current_address_space, current_state, &halt_reason); + + PerformRequestedCacheInvalidation(hr); + + return hr; + } + + HaltReason Step() { + ASSERT(!jit_interface->is_executing); + PerformRequestedCacheInvalidation(static_cast<HaltReason>(Atomic::Load(&halt_reason))); + + jit_interface->is_executing = true; + SCOPE_EXIT { + jit_interface->is_executing = false; + }; + + HaltReason hr = core.Step(current_address_space, current_state, &halt_reason); + + PerformRequestedCacheInvalidation(hr); + + return hr; + } + + void ClearCache() { + std::unique_lock lock{invalidation_mutex}; + invalidate_entire_cache = true; + HaltExecution(HaltReason::CacheInvalidation); + } + + void InvalidateCacheRange(std::uint32_t start_address, std::size_t length) { + std::unique_lock lock{invalidation_mutex}; + invalid_cache_ranges.add(boost::icl::discrete_interval<u32>::closed(start_address, static_cast<u32>(start_address + length - 1))); + HaltExecution(HaltReason::CacheInvalidation); + } + + void Reset() { + current_state = {}; + } + + void HaltExecution(HaltReason hr) { + Atomic::Or(&halt_reason, static_cast<u32>(hr)); + Atomic::Barrier(); + } + + void ClearHalt(HaltReason hr) { + Atomic::And(&halt_reason, ~static_cast<u32>(hr)); + Atomic::Barrier(); + } + + std::array<std::uint32_t, 16>& Regs() { + return current_state.regs; + } + + const std::array<std::uint32_t, 16>& Regs() const { + return current_state.regs; + } + + std::array<std::uint32_t, 64>& ExtRegs() { + return current_state.ext_regs; + } + + const std::array<std::uint32_t, 64>& ExtRegs() const { + return current_state.ext_regs; + } + + std::uint32_t Cpsr() const { + return current_state.Cpsr(); + } + + void SetCpsr(std::uint32_t value) { + current_state.SetCpsr(value); + } + + std::uint32_t Fpscr() const { + return current_state.Fpscr(); + } + + void SetFpscr(std::uint32_t value) { + current_state.SetFpscr(value); + } + + void ClearExclusiveState() { + current_state.exclusive_state = false; + } + + void DumpDisassembly() const { + ASSERT_FALSE("Unimplemented"); + } + +private: + void PerformRequestedCacheInvalidation(HaltReason hr) { + if (Has(hr, HaltReason::CacheInvalidation)) { + std::unique_lock lock{invalidation_mutex}; + + ClearHalt(HaltReason::CacheInvalidation); + + if (invalidate_entire_cache) { + current_address_space.ClearCache(); + + invalidate_entire_cache = false; + invalid_cache_ranges.clear(); + return; + } + + if (!invalid_cache_ranges.empty()) { + current_address_space.InvalidateCacheRanges(invalid_cache_ranges); + + invalid_cache_ranges.clear(); + return; + } + } 
+ } + + Jit* jit_interface; + A32::UserConfig conf; + A32JitState current_state{}; + A32AddressSpace current_address_space; + A32Core core; + + volatile u32 halt_reason = 0; + + std::mutex invalidation_mutex; + boost::icl::interval_set<u32> invalid_cache_ranges; + bool invalidate_entire_cache = false; +}; + +Jit::Jit(UserConfig conf) + : impl(std::make_unique<Impl>(this, conf)) {} + +Jit::~Jit() = default; + +HaltReason Jit::Run() { + return impl->Run(); +} + +HaltReason Jit::Step() { + return impl->Step(); +} + +void Jit::ClearCache() { + impl->ClearCache(); +} + +void Jit::InvalidateCacheRange(std::uint32_t start_address, std::size_t length) { + impl->InvalidateCacheRange(start_address, length); +} + +void Jit::Reset() { + impl->Reset(); +} + +void Jit::HaltExecution(HaltReason hr) { + impl->HaltExecution(hr); +} + +void Jit::ClearHalt(HaltReason hr) { + impl->ClearHalt(hr); +} + +std::array<std::uint32_t, 16>& Jit::Regs() { + return impl->Regs(); +} + +const std::array<std::uint32_t, 16>& Jit::Regs() const { + return impl->Regs(); +} + +std::array<std::uint32_t, 64>& Jit::ExtRegs() { + return impl->ExtRegs(); +} + +const std::array<std::uint32_t, 64>& Jit::ExtRegs() const { + return impl->ExtRegs(); +} + +std::uint32_t Jit::Cpsr() const { + return impl->Cpsr(); +} + +void Jit::SetCpsr(std::uint32_t value) { + impl->SetCpsr(value); +} + +std::uint32_t Jit::Fpscr() const { + return impl->Fpscr(); +} + +void Jit::SetFpscr(std::uint32_t value) { + impl->SetFpscr(value); +} + +void Jit::ClearExclusiveState() { + impl->ClearExclusiveState(); +} + +void Jit::DumpDisassembly() const { + impl->DumpDisassembly(); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/a32_jitstate.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/a32_jitstate.cpp new file mode 100644 index 0000000000..e24654c7db --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/a32_jitstate.cpp @@ -0,0 +1,74 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/arm64/a32_jitstate.h" + +#include <mcl/bit/bit_field.hpp> +#include <mcl/stdint.hpp> + +namespace Dynarmic::Backend::Arm64 { + +u32 A32JitState::Cpsr() const { + u32 cpsr = 0; + + // NZCV flags + cpsr |= cpsr_nzcv; + // Q flag + cpsr |= cpsr_q; + // GE flags + cpsr |= mcl::bit::get_bit<31>(cpsr_ge) ? 1 << 19 : 0; + cpsr |= mcl::bit::get_bit<23>(cpsr_ge) ? 1 << 18 : 0; + cpsr |= mcl::bit::get_bit<15>(cpsr_ge) ? 1 << 17 : 0; + cpsr |= mcl::bit::get_bit<7>(cpsr_ge) ? 1 << 16 : 0; + // E flag, T flag + cpsr |= mcl::bit::get_bit<1>(upper_location_descriptor) ? 1 << 9 : 0; + cpsr |= mcl::bit::get_bit<0>(upper_location_descriptor) ? 1 << 5 : 0; + // IT state + cpsr |= static_cast<u32>(upper_location_descriptor & 0b11111100'00000000); + cpsr |= static_cast<u32>(upper_location_descriptor & 0b00000011'00000000) << 17; + // Other flags + cpsr |= cpsr_jaifm; + + return cpsr; +} + +void A32JitState::SetCpsr(u32 cpsr) { + // NZCV flags + cpsr_nzcv = cpsr & 0xF0000000; + // Q flag + cpsr_q = cpsr & (1 << 27); + // GE flags + cpsr_ge = 0; + cpsr_ge |= mcl::bit::get_bit<19>(cpsr) ? 0xFF000000 : 0; + cpsr_ge |= mcl::bit::get_bit<18>(cpsr) ? 0x00FF0000 : 0; + cpsr_ge |= mcl::bit::get_bit<17>(cpsr) ? 0x0000FF00 : 0; + cpsr_ge |= mcl::bit::get_bit<16>(cpsr) ? 0x000000FF : 0; + + upper_location_descriptor &= 0xFFFF0000; + // E flag, T flag + upper_location_descriptor |= mcl::bit::get_bit<9>(cpsr) ? 
2 : 0;
+    upper_location_descriptor |= mcl::bit::get_bit<5>(cpsr) ? 1 : 0;
+    // IT state
+    upper_location_descriptor |= (cpsr >> 0) & 0b11111100'00000000;
+    upper_location_descriptor |= (cpsr >> 17) & 0b00000011'00000000;
+
+    // Other flags
+    cpsr_jaifm = cpsr & 0x010001DF;
+}
+
+constexpr u32 FPCR_MASK = A32::LocationDescriptor::FPSCR_MODE_MASK;
+constexpr u32 FPSR_MASK = 0x0800'009f;
+
+u32 A32JitState::Fpscr() const {
+    return (upper_location_descriptor & 0xffff'0000) | fpsr | fpsr_nzcv;
+}
+
+void A32JitState::SetFpscr(u32 fpscr) {
+    fpsr_nzcv = fpscr & 0xf000'0000;
+    fpsr = fpscr & FPSR_MASK;
+    upper_location_descriptor = (upper_location_descriptor & 0x0000'ffff) | (fpscr & FPCR_MASK);
+}
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/a32_jitstate.h b/externals/dynarmic/src/dynarmic/backend/arm64/a32_jitstate.h
new file mode 100644
index 0000000000..978bf84ad2
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/a32_jitstate.h
@@ -0,0 +1,45 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2021 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#pragma once
+
+#include <array>
+
+#include <mcl/stdint.hpp>
+
+#include "dynarmic/frontend/A32/a32_location_descriptor.h"
+#include "dynarmic/ir/location_descriptor.h"
+
+namespace Dynarmic::Backend::Arm64 {
+
+struct A32JitState {
+    u32 cpsr_nzcv = 0;
+    u32 cpsr_q = 0;
+    u32 cpsr_jaifm = 0;
+    u32 cpsr_ge = 0;
+
+    u32 fpsr = 0;
+    u32 fpsr_nzcv = 0;
+
+    std::array<u32, 16> regs{};
+
+    u32 upper_location_descriptor;
+
+    alignas(16) std::array<u32, 64> ext_regs{};
+
+    u32 exclusive_state = 0;
+
+    u32 Cpsr() const;
+    void SetCpsr(u32 cpsr);
+
+    u32 Fpscr() const;
+    void SetFpscr(u32 fpscr);
+
+    IR::LocationDescriptor GetLocationDescriptor() const {
+        return IR::LocationDescriptor{regs[15] | (static_cast<u64>(upper_location_descriptor) << 32)};
+    }
+};
+
+} // namespace Dynarmic::Backend::Arm64
diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/a64_address_space.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/a64_address_space.cpp
new file mode 100644
index 0000000000..f2e20c3d69
--- /dev/null
+++ b/externals/dynarmic/src/dynarmic/backend/arm64/a64_address_space.cpp
@@ -0,0 +1,600 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/arm64/a64_address_space.h" + +#include "dynarmic/backend/arm64/a64_jitstate.h" +#include "dynarmic/backend/arm64/abi.h" +#include "dynarmic/backend/arm64/devirtualize.h" +#include "dynarmic/backend/arm64/emit_arm64.h" +#include "dynarmic/backend/arm64/stack_layout.h" +#include "dynarmic/common/cast_util.h" +#include "dynarmic/frontend/A64/a64_location_descriptor.h" +#include "dynarmic/frontend/A64/translate/a64_translate.h" +#include "dynarmic/interface/A64/config.h" +#include "dynarmic/interface/exclusive_monitor.h" +#include "dynarmic/ir/opt/passes.h" + +namespace Dynarmic::Backend::Arm64 { + +template<auto mfp, typename T> +static void* EmitCallTrampoline(oaknut::CodeGenerator& code, T* this_) { + using namespace oaknut::util; + + const auto info = Devirtualize<mfp>(this_); + + oaknut::Label l_addr, l_this; + + void* target = code.xptr<void*>(); + code.LDR(X0, l_this); + code.LDR(Xscratch0, l_addr); + code.BR(Xscratch0); + + code.align(8); + code.l(l_this); + code.dx(info.this_ptr); + code.l(l_addr); + code.dx(info.fn_ptr); + + return target; +} + +template<auto mfp, typename T> +static void* EmitWrappedReadCallTrampoline(oaknut::CodeGenerator& code, T* this_) { + using namespace oaknut::util; + + const auto info = Devirtualize<mfp>(this_); + + oaknut::Label l_addr, l_this; + + constexpr u64 save_regs = ABI_CALLER_SAVE & ~ToRegList(Xscratch0); + + void* target = code.xptr<void*>(); + ABI_PushRegisters(code, save_regs, 0); + code.LDR(X0, l_this); + code.MOV(X1, Xscratch0); + code.LDR(Xscratch0, l_addr); + code.BLR(Xscratch0); + code.MOV(Xscratch0, X0); + ABI_PopRegisters(code, save_regs, 0); + code.RET(); + + code.align(8); + code.l(l_this); + code.dx(info.this_ptr); + code.l(l_addr); + code.dx(info.fn_ptr); + + return target; +} + +template<auto callback, typename T> +static void* EmitExclusiveReadCallTrampoline(oaknut::CodeGenerator& code, const A64::UserConfig& conf) { + using namespace oaknut::util; + + oaknut::Label l_addr, l_this; + + auto fn = [](const A64::UserConfig& conf, A64::VAddr vaddr) -> T { + return conf.global_monitor->ReadAndMark<T>(conf.processor_id, vaddr, [&]() -> T { + return (conf.callbacks->*callback)(vaddr); + }); + }; + + void* target = code.xptr<void*>(); + code.LDR(X0, l_this); + code.LDR(Xscratch0, l_addr); + code.BR(Xscratch0); + + code.align(8); + code.l(l_this); + code.dx(mcl::bit_cast<u64>(&conf)); + code.l(l_addr); + code.dx(mcl::bit_cast<u64>(Common::FptrCast(fn))); + + return target; +} + +template<auto mfp, typename T> +static void* EmitWrappedWriteCallTrampoline(oaknut::CodeGenerator& code, T* this_) { + using namespace oaknut::util; + + const auto info = Devirtualize<mfp>(this_); + + oaknut::Label l_addr, l_this; + + constexpr u64 save_regs = ABI_CALLER_SAVE; + + void* target = code.xptr<void*>(); + ABI_PushRegisters(code, save_regs, 0); + code.LDR(X0, l_this); + code.MOV(X1, Xscratch0); + code.MOV(X2, Xscratch1); + code.LDR(Xscratch0, l_addr); + code.BLR(Xscratch0); + ABI_PopRegisters(code, save_regs, 0); + code.RET(); + + code.align(8); + code.l(l_this); + code.dx(info.this_ptr); + code.l(l_addr); + code.dx(info.fn_ptr); + + return target; +} + +template<auto callback, typename T> +static void* EmitExclusiveWriteCallTrampoline(oaknut::CodeGenerator& code, const A64::UserConfig& conf) { + using namespace oaknut::util; + + oaknut::Label l_addr, l_this; + + auto fn = [](const A64::UserConfig& conf, A64::VAddr vaddr, T value) -> u32 { + return 
conf.global_monitor->DoExclusiveOperation<T>(conf.processor_id, vaddr, + [&](T expected) -> bool { + return (conf.callbacks->*callback)(vaddr, value, expected); + }) + ? 0 + : 1; + }; + + void* target = code.xptr<void*>(); + code.LDR(X0, l_this); + code.LDR(Xscratch0, l_addr); + code.BR(Xscratch0); + + code.align(8); + code.l(l_this); + code.dx(mcl::bit_cast<u64>(&conf)); + code.l(l_addr); + code.dx(mcl::bit_cast<u64>(Common::FptrCast(fn))); + + return target; +} + +static void* EmitRead128CallTrampoline(oaknut::CodeGenerator& code, A64::UserCallbacks* this_) { + using namespace oaknut::util; + + const auto info = Devirtualize<&A64::UserCallbacks::MemoryRead128>(this_); + + oaknut::Label l_addr, l_this; + + void* target = code.xptr<void*>(); + ABI_PushRegisters(code, (1ull << 29) | (1ull << 30), 0); + code.LDR(X0, l_this); + code.LDR(Xscratch0, l_addr); + code.BLR(Xscratch0); + code.FMOV(D0, X0); + code.FMOV(V0.D()[1], X1); + ABI_PopRegisters(code, (1ull << 29) | (1ull << 30), 0); + code.RET(); + + code.align(8); + code.l(l_this); + code.dx(info.this_ptr); + code.l(l_addr); + code.dx(info.fn_ptr); + + return target; +} + +static void* EmitWrappedRead128CallTrampoline(oaknut::CodeGenerator& code, A64::UserCallbacks* this_) { + using namespace oaknut::util; + + const auto info = Devirtualize<&A64::UserCallbacks::MemoryRead128>(this_); + + oaknut::Label l_addr, l_this; + + constexpr u64 save_regs = ABI_CALLER_SAVE & ~ToRegList(Q0); + + void* target = code.xptr<void*>(); + ABI_PushRegisters(code, save_regs, 0); + code.LDR(X0, l_this); + code.MOV(X1, Xscratch0); + code.LDR(Xscratch0, l_addr); + code.BLR(Xscratch0); + code.FMOV(D0, X0); + code.FMOV(V0.D()[1], X1); + ABI_PopRegisters(code, save_regs, 0); + code.RET(); + + code.align(8); + code.l(l_this); + code.dx(info.this_ptr); + code.l(l_addr); + code.dx(info.fn_ptr); + + return target; +} + +static void* EmitExclusiveRead128CallTrampoline(oaknut::CodeGenerator& code, const A64::UserConfig& conf) { + using namespace oaknut::util; + + oaknut::Label l_addr, l_this; + + auto fn = [](const A64::UserConfig& conf, A64::VAddr vaddr) -> Vector { + return conf.global_monitor->ReadAndMark<Vector>(conf.processor_id, vaddr, [&]() -> Vector { + return conf.callbacks->MemoryRead128(vaddr); + }); + }; + + void* target = code.xptr<void*>(); + ABI_PushRegisters(code, (1ull << 29) | (1ull << 30), 0); + code.LDR(X0, l_this); + code.LDR(Xscratch0, l_addr); + code.BLR(Xscratch0); + code.FMOV(D0, X0); + code.FMOV(V0.D()[1], X1); + ABI_PopRegisters(code, (1ull << 29) | (1ull << 30), 0); + code.RET(); + + code.align(8); + code.l(l_this); + code.dx(mcl::bit_cast<u64>(&conf)); + code.l(l_addr); + code.dx(mcl::bit_cast<u64>(Common::FptrCast(fn))); + + return target; +} + +static void* EmitWrite128CallTrampoline(oaknut::CodeGenerator& code, A64::UserCallbacks* this_) { + using namespace oaknut::util; + + const auto info = Devirtualize<&A64::UserCallbacks::MemoryWrite128>(this_); + + oaknut::Label l_addr, l_this; + + void* target = code.xptr<void*>(); + code.LDR(X0, l_this); + code.FMOV(X2, D0); + code.FMOV(X3, V0.D()[1]); + code.LDR(Xscratch0, l_addr); + code.BR(Xscratch0); + + code.align(8); + code.l(l_this); + code.dx(info.this_ptr); + code.l(l_addr); + code.dx(info.fn_ptr); + + return target; +} + +static void* EmitWrappedWrite128CallTrampoline(oaknut::CodeGenerator& code, A64::UserCallbacks* this_) { + using namespace oaknut::util; + + const auto info = Devirtualize<&A64::UserCallbacks::MemoryWrite128>(this_); + + oaknut::Label l_addr, l_this; + + constexpr u64 
save_regs = ABI_CALLER_SAVE; + + void* target = code.xptr<void*>(); + ABI_PushRegisters(code, save_regs, 0); + code.LDR(X0, l_this); + code.MOV(X1, Xscratch0); + code.FMOV(X2, D0); + code.FMOV(X3, V0.D()[1]); + code.LDR(Xscratch0, l_addr); + code.BLR(Xscratch0); + ABI_PopRegisters(code, save_regs, 0); + code.RET(); + + code.align(8); + code.l(l_this); + code.dx(info.this_ptr); + code.l(l_addr); + code.dx(info.fn_ptr); + + return target; +} + +static void* EmitExclusiveWrite128CallTrampoline(oaknut::CodeGenerator& code, const A64::UserConfig& conf) { + using namespace oaknut::util; + + oaknut::Label l_addr, l_this; + + auto fn = [](const A64::UserConfig& conf, A64::VAddr vaddr, Vector value) -> u32 { + return conf.global_monitor->DoExclusiveOperation<Vector>(conf.processor_id, vaddr, + [&](Vector expected) -> bool { + return conf.callbacks->MemoryWriteExclusive128(vaddr, value, expected); + }) + ? 0 + : 1; + }; + + void* target = code.xptr<void*>(); + code.LDR(X0, l_this); + code.FMOV(X2, D0); + code.FMOV(X3, V0.D()[1]); + code.LDR(Xscratch0, l_addr); + code.BR(Xscratch0); + + code.align(8); + code.l(l_this); + code.dx(mcl::bit_cast<u64>(&conf)); + code.l(l_addr); + code.dx(mcl::bit_cast<u64>(Common::FptrCast(fn))); + + return target; +} + +A64AddressSpace::A64AddressSpace(const A64::UserConfig& conf) + : AddressSpace(conf.code_cache_size) + , conf(conf) { + EmitPrelude(); +} + +IR::Block A64AddressSpace::GenerateIR(IR::LocationDescriptor descriptor) const { + const auto get_code = [this](u64 vaddr) { return conf.callbacks->MemoryReadCode(vaddr); }; + IR::Block ir_block = A64::Translate(A64::LocationDescriptor{descriptor}, get_code, + {conf.define_unpredictable_behaviour, conf.wall_clock_cntpct}); + + Optimization::A64CallbackConfigPass(ir_block, conf); + Optimization::NamingPass(ir_block); + if (conf.HasOptimization(OptimizationFlag::GetSetElimination) && !conf.check_halt_on_memory_access) { + Optimization::A64GetSetElimination(ir_block); + Optimization::DeadCodeElimination(ir_block); + } + if (conf.HasOptimization(OptimizationFlag::ConstProp)) { + Optimization::ConstantPropagation(ir_block); + Optimization::DeadCodeElimination(ir_block); + } + if (conf.HasOptimization(OptimizationFlag::MiscIROpt)) { + Optimization::A64MergeInterpretBlocksPass(ir_block, conf.callbacks); + } + Optimization::VerificationPass(ir_block); + + return ir_block; +} + +void A64AddressSpace::InvalidateCacheRanges(const boost::icl::interval_set<u64>& ranges) { + InvalidateBasicBlocks(block_ranges.InvalidateRanges(ranges)); +} + +void A64AddressSpace::EmitPrelude() { + using namespace oaknut::util; + + UnprotectCodeMemory(); + + prelude_info.read_memory_8 = EmitCallTrampoline<&A64::UserCallbacks::MemoryRead8>(code, conf.callbacks); + prelude_info.read_memory_16 = EmitCallTrampoline<&A64::UserCallbacks::MemoryRead16>(code, conf.callbacks); + prelude_info.read_memory_32 = EmitCallTrampoline<&A64::UserCallbacks::MemoryRead32>(code, conf.callbacks); + prelude_info.read_memory_64 = EmitCallTrampoline<&A64::UserCallbacks::MemoryRead64>(code, conf.callbacks); + prelude_info.read_memory_128 = EmitRead128CallTrampoline(code, conf.callbacks); + prelude_info.wrapped_read_memory_8 = EmitWrappedReadCallTrampoline<&A64::UserCallbacks::MemoryRead8>(code, conf.callbacks); + prelude_info.wrapped_read_memory_16 = EmitWrappedReadCallTrampoline<&A64::UserCallbacks::MemoryRead16>(code, conf.callbacks); + prelude_info.wrapped_read_memory_32 = EmitWrappedReadCallTrampoline<&A64::UserCallbacks::MemoryRead32>(code, conf.callbacks); + 
prelude_info.wrapped_read_memory_64 = EmitWrappedReadCallTrampoline<&A64::UserCallbacks::MemoryRead64>(code, conf.callbacks); + prelude_info.wrapped_read_memory_128 = EmitWrappedRead128CallTrampoline(code, conf.callbacks); + prelude_info.exclusive_read_memory_8 = EmitExclusiveReadCallTrampoline<&A64::UserCallbacks::MemoryRead8, u8>(code, conf); + prelude_info.exclusive_read_memory_16 = EmitExclusiveReadCallTrampoline<&A64::UserCallbacks::MemoryRead16, u16>(code, conf); + prelude_info.exclusive_read_memory_32 = EmitExclusiveReadCallTrampoline<&A64::UserCallbacks::MemoryRead32, u32>(code, conf); + prelude_info.exclusive_read_memory_64 = EmitExclusiveReadCallTrampoline<&A64::UserCallbacks::MemoryRead64, u64>(code, conf); + prelude_info.exclusive_read_memory_128 = EmitExclusiveRead128CallTrampoline(code, conf); + prelude_info.write_memory_8 = EmitCallTrampoline<&A64::UserCallbacks::MemoryWrite8>(code, conf.callbacks); + prelude_info.write_memory_16 = EmitCallTrampoline<&A64::UserCallbacks::MemoryWrite16>(code, conf.callbacks); + prelude_info.write_memory_32 = EmitCallTrampoline<&A64::UserCallbacks::MemoryWrite32>(code, conf.callbacks); + prelude_info.write_memory_64 = EmitCallTrampoline<&A64::UserCallbacks::MemoryWrite64>(code, conf.callbacks); + prelude_info.write_memory_128 = EmitWrite128CallTrampoline(code, conf.callbacks); + prelude_info.wrapped_write_memory_8 = EmitWrappedWriteCallTrampoline<&A64::UserCallbacks::MemoryWrite8>(code, conf.callbacks); + prelude_info.wrapped_write_memory_16 = EmitWrappedWriteCallTrampoline<&A64::UserCallbacks::MemoryWrite16>(code, conf.callbacks); + prelude_info.wrapped_write_memory_32 = EmitWrappedWriteCallTrampoline<&A64::UserCallbacks::MemoryWrite32>(code, conf.callbacks); + prelude_info.wrapped_write_memory_64 = EmitWrappedWriteCallTrampoline<&A64::UserCallbacks::MemoryWrite64>(code, conf.callbacks); + prelude_info.wrapped_write_memory_128 = EmitWrappedWrite128CallTrampoline(code, conf.callbacks); + prelude_info.exclusive_write_memory_8 = EmitExclusiveWriteCallTrampoline<&A64::UserCallbacks::MemoryWriteExclusive8, u8>(code, conf); + prelude_info.exclusive_write_memory_16 = EmitExclusiveWriteCallTrampoline<&A64::UserCallbacks::MemoryWriteExclusive16, u16>(code, conf); + prelude_info.exclusive_write_memory_32 = EmitExclusiveWriteCallTrampoline<&A64::UserCallbacks::MemoryWriteExclusive32, u32>(code, conf); + prelude_info.exclusive_write_memory_64 = EmitExclusiveWriteCallTrampoline<&A64::UserCallbacks::MemoryWriteExclusive64, u64>(code, conf); + prelude_info.exclusive_write_memory_128 = EmitExclusiveWrite128CallTrampoline(code, conf); + prelude_info.call_svc = EmitCallTrampoline<&A64::UserCallbacks::CallSVC>(code, conf.callbacks); + prelude_info.exception_raised = EmitCallTrampoline<&A64::UserCallbacks::ExceptionRaised>(code, conf.callbacks); + prelude_info.isb_raised = EmitCallTrampoline<&A64::UserCallbacks::InstructionSynchronizationBarrierRaised>(code, conf.callbacks); + prelude_info.ic_raised = EmitCallTrampoline<&A64::UserCallbacks::InstructionCacheOperationRaised>(code, conf.callbacks); + prelude_info.dc_raised = EmitCallTrampoline<&A64::UserCallbacks::DataCacheOperationRaised>(code, conf.callbacks); + prelude_info.get_cntpct = EmitCallTrampoline<&A64::UserCallbacks::GetCNTPCT>(code, conf.callbacks); + prelude_info.add_ticks = EmitCallTrampoline<&A64::UserCallbacks::AddTicks>(code, conf.callbacks); + prelude_info.get_ticks_remaining = EmitCallTrampoline<&A64::UserCallbacks::GetTicksRemaining>(code, conf.callbacks); + + oaknut::Label 
return_from_run_code, l_return_to_dispatcher; + + prelude_info.run_code = code.xptr<PreludeInfo::RunCodeFuncType>(); + { + ABI_PushRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout)); + + code.MOV(X19, X0); + code.MOV(Xstate, X1); + code.MOV(Xhalt, X2); + if (conf.page_table) { + code.MOV(Xpagetable, mcl::bit_cast<u64>(conf.page_table)); + } + if (conf.fastmem_pointer) { + code.MOV(Xfastmem, mcl::bit_cast<u64>(conf.fastmem_pointer)); + } + + if (conf.HasOptimization(OptimizationFlag::ReturnStackBuffer)) { + code.LDR(Xscratch0, l_return_to_dispatcher); + for (size_t i = 0; i < RSBCount; i++) { + code.STR(Xscratch0, SP, offsetof(StackLayout, rsb) + offsetof(RSBEntry, code_ptr) + i * sizeof(RSBEntry)); + } + } + + if (conf.enable_cycle_counting) { + code.BL(prelude_info.get_ticks_remaining); + code.MOV(Xticks, X0); + code.STR(Xticks, SP, offsetof(StackLayout, cycles_to_run)); + } + + code.MRS(Xscratch1, oaknut::SystemReg::FPCR); + code.STR(Wscratch1, SP, offsetof(StackLayout, save_host_fpcr)); + code.LDR(Wscratch0, Xstate, offsetof(A64JitState, fpcr)); + code.MSR(oaknut::SystemReg::FPCR, Xscratch0); + + code.LDAR(Wscratch0, Xhalt); + code.CBNZ(Wscratch0, return_from_run_code); + + code.BR(X19); + } + + prelude_info.step_code = code.xptr<PreludeInfo::RunCodeFuncType>(); + { + ABI_PushRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout)); + + code.MOV(X19, X0); + code.MOV(Xstate, X1); + code.MOV(Xhalt, X2); + if (conf.page_table) { + code.MOV(Xpagetable, mcl::bit_cast<u64>(conf.page_table)); + } + if (conf.fastmem_pointer) { + code.MOV(Xfastmem, mcl::bit_cast<u64>(conf.fastmem_pointer)); + } + + if (conf.HasOptimization(OptimizationFlag::ReturnStackBuffer)) { + code.LDR(Xscratch0, l_return_to_dispatcher); + for (size_t i = 0; i < RSBCount; i++) { + code.STR(Xscratch0, SP, offsetof(StackLayout, rsb) + offsetof(RSBEntry, code_ptr) + i * sizeof(RSBEntry)); + } + } + + if (conf.enable_cycle_counting) { + code.MOV(Xticks, 1); + code.STR(Xticks, SP, offsetof(StackLayout, cycles_to_run)); + } + + code.MRS(Xscratch1, oaknut::SystemReg::FPCR); + code.STR(Wscratch1, SP, offsetof(StackLayout, save_host_fpcr)); + code.LDR(Wscratch0, Xstate, offsetof(A64JitState, fpcr)); + code.MSR(oaknut::SystemReg::FPCR, Xscratch0); + + oaknut::Label step_hr_loop; + code.l(step_hr_loop); + code.LDAXR(Wscratch0, Xhalt); + code.CBNZ(Wscratch0, return_from_run_code); + code.ORR(Wscratch0, Wscratch0, static_cast<u32>(HaltReason::Step)); + code.STLXR(Wscratch1, Wscratch0, Xhalt); + code.CBNZ(Wscratch1, step_hr_loop); + + code.BR(X19); + } + + prelude_info.return_to_dispatcher = code.xptr<void*>(); + { + oaknut::Label l_this, l_addr; + + code.LDAR(Wscratch0, Xhalt); + code.CBNZ(Wscratch0, return_from_run_code); + + if (conf.enable_cycle_counting) { + code.CMP(Xticks, 0); + code.B(LE, return_from_run_code); + } + + code.LDR(X0, l_this); + code.MOV(X1, Xstate); + code.LDR(Xscratch0, l_addr); + code.BLR(Xscratch0); + code.BR(X0); + + const auto fn = [](A64AddressSpace& self, A64JitState& context) -> CodePtr { + return self.GetOrEmit(context.GetLocationDescriptor()); + }; + + code.align(8); + code.l(l_this); + code.dx(mcl::bit_cast<u64>(this)); + code.l(l_addr); + code.dx(mcl::bit_cast<u64>(Common::FptrCast(fn))); + } + + prelude_info.return_from_run_code = code.xptr<void*>(); + { + code.l(return_from_run_code); + + if (conf.enable_cycle_counting) { + code.LDR(X1, SP, offsetof(StackLayout, cycles_to_run)); + code.SUB(X1, X1, Xticks); + code.BL(prelude_info.add_ticks); + } + + code.LDR(Wscratch0, SP, 
offsetof(StackLayout, save_host_fpcr)); + code.MSR(oaknut::SystemReg::FPCR, Xscratch0); + + oaknut::Label exit_hr_loop; + code.l(exit_hr_loop); + code.LDAXR(W0, Xhalt); + code.STLXR(Wscratch0, WZR, Xhalt); + code.CBNZ(Wscratch0, exit_hr_loop); + + ABI_PopRegisters(code, ABI_CALLEE_SAVE | (1 << 30), sizeof(StackLayout)); + code.RET(); + } + + code.align(8); + code.l(l_return_to_dispatcher); + code.dx(mcl::bit_cast<u64>(prelude_info.return_to_dispatcher)); + + prelude_info.end_of_prelude = code.offset(); + + mem.invalidate_all(); + ProtectCodeMemory(); +} + +EmitConfig A64AddressSpace::GetEmitConfig() { + return EmitConfig{ + .optimizations = conf.unsafe_optimizations ? conf.optimizations : conf.optimizations & all_safe_optimizations, + + .hook_isb = conf.hook_isb, + + .cntfreq_el0 = conf.cntfrq_el0, + .ctr_el0 = conf.ctr_el0, + .dczid_el0 = conf.dczid_el0, + .tpidrro_el0 = conf.tpidrro_el0, + .tpidr_el0 = conf.tpidr_el0, + + .check_halt_on_memory_access = conf.check_halt_on_memory_access, + + .page_table_pointer = mcl::bit_cast<u64>(conf.page_table), + .page_table_address_space_bits = conf.page_table_address_space_bits, + .page_table_pointer_mask_bits = conf.page_table_pointer_mask_bits, + .silently_mirror_page_table = conf.silently_mirror_page_table, + .absolute_offset_page_table = conf.absolute_offset_page_table, + .detect_misaligned_access_via_page_table = conf.detect_misaligned_access_via_page_table, + .only_detect_misalignment_via_page_table_on_page_boundary = conf.only_detect_misalignment_via_page_table_on_page_boundary, + + .fastmem_pointer = mcl::bit_cast<u64>(conf.fastmem_pointer), + .recompile_on_fastmem_failure = conf.recompile_on_fastmem_failure, + .fastmem_address_space_bits = conf.fastmem_address_space_bits, + .silently_mirror_fastmem = conf.silently_mirror_fastmem, + + .wall_clock_cntpct = conf.wall_clock_cntpct, + .enable_cycle_counting = conf.enable_cycle_counting, + + .always_little_endian = true, + + .descriptor_to_fpcr = [](const IR::LocationDescriptor& location) { return A64::LocationDescriptor{location}.FPCR(); }, + .emit_cond = EmitA64Cond, + .emit_condition_failed_terminal = EmitA64ConditionFailedTerminal, + .emit_terminal = EmitA64Terminal, + .emit_check_memory_abort = EmitA64CheckMemoryAbort, + + .state_nzcv_offset = offsetof(A64JitState, cpsr_nzcv), + .state_fpsr_offset = offsetof(A64JitState, fpsr), + .state_exclusive_state_offset = offsetof(A64JitState, exclusive_state), + + .coprocessors{}, + + .very_verbose_debugging_output = conf.very_verbose_debugging_output, + }; +} + +void A64AddressSpace::RegisterNewBasicBlock(const IR::Block& block, const EmittedBlockInfo&) { + const A64::LocationDescriptor descriptor{block.Location()}; + const A64::LocationDescriptor end_location{block.EndLocation()}; + const auto range = boost::icl::discrete_interval<u64>::closed(descriptor.PC(), end_location.PC() - 1); + block_ranges.AddRange(range, descriptor); +} + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/a64_address_space.h b/externals/dynarmic/src/dynarmic/backend/arm64/a64_address_space.h new file mode 100644 index 0000000000..86eae4dd39 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/a64_address_space.h @@ -0,0 +1,35 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include "dynarmic/backend/arm64/address_space.h" +#include "dynarmic/backend/block_range_information.h" +#include "dynarmic/interface/A64/config.h" + +namespace Dynarmic::Backend::Arm64 { + +struct EmittedBlockInfo; + +class A64AddressSpace final : public AddressSpace { +public: + explicit A64AddressSpace(const A64::UserConfig& conf); + + IR::Block GenerateIR(IR::LocationDescriptor) const override; + + void InvalidateCacheRanges(const boost::icl::interval_set<u64>& ranges); + +protected: + friend class A64Core; + + void EmitPrelude(); + EmitConfig GetEmitConfig() override; + void RegisterNewBasicBlock(const IR::Block& block, const EmittedBlockInfo& block_info) override; + + const A64::UserConfig conf; + BlockRangeInformation<u64> block_ranges; +}; + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/a64_core.h b/externals/dynarmic/src/dynarmic/backend/arm64/a64_core.h new file mode 100644 index 0000000000..24fbb66b38 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/a64_core.h @@ -0,0 +1,30 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include "dynarmic/backend/arm64/a64_address_space.h" +#include "dynarmic/backend/arm64/a64_jitstate.h" + +namespace Dynarmic::Backend::Arm64 { + +class A64Core final { +public: + explicit A64Core(const A64::UserConfig&) {} + + HaltReason Run(A64AddressSpace& process, A64JitState& thread_ctx, volatile u32* halt_reason) { + const auto location_descriptor = thread_ctx.GetLocationDescriptor(); + const auto entry_point = process.GetOrEmit(location_descriptor); + return process.prelude_info.run_code(entry_point, &thread_ctx, halt_reason); + } + + HaltReason Step(A64AddressSpace& process, A64JitState& thread_ctx, volatile u32* halt_reason) { + const auto location_descriptor = A64::LocationDescriptor{thread_ctx.GetLocationDescriptor()}.SetSingleStepping(true); + const auto entry_point = process.GetOrEmit(location_descriptor); + return process.prelude_info.step_code(entry_point, &thread_ctx, halt_reason); + } +}; + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/a64_interface.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/a64_interface.cpp new file mode 100644 index 0000000000..b80e8b39dc --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/a64_interface.cpp @@ -0,0 +1,323 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <memory> +#include <mutex> + +#include <boost/icl/interval_set.hpp> +#include <mcl/assert.hpp> +#include <mcl/scope_exit.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/backend/arm64/a64_address_space.h" +#include "dynarmic/backend/arm64/a64_core.h" +#include "dynarmic/backend/arm64/a64_jitstate.h" +#include "dynarmic/common/atomic.h" +#include "dynarmic/interface/A64/a64.h" +#include "dynarmic/interface/A64/config.h" + +namespace Dynarmic::A64 { + +using namespace Backend::Arm64; + +struct Jit::Impl final { + Impl(Jit*, A64::UserConfig conf) + : conf(conf) + , current_address_space(conf) + , core(conf) {} + + HaltReason Run() { + ASSERT(!is_executing); + PerformRequestedCacheInvalidation(static_cast<HaltReason>(Atomic::Load(&halt_reason))); + + is_executing = true; + SCOPE_EXIT { + is_executing = false; + }; + + HaltReason hr = core.Run(current_address_space, current_state, &halt_reason); + + PerformRequestedCacheInvalidation(hr); + + return hr; + } + + HaltReason Step() { + ASSERT(!is_executing); + PerformRequestedCacheInvalidation(static_cast<HaltReason>(Atomic::Load(&halt_reason))); + + is_executing = true; + SCOPE_EXIT { + is_executing = false; + }; + + HaltReason hr = core.Step(current_address_space, current_state, &halt_reason); + + PerformRequestedCacheInvalidation(hr); + + return hr; + } + + void ClearCache() { + std::unique_lock lock{invalidation_mutex}; + invalidate_entire_cache = true; + HaltExecution(HaltReason::CacheInvalidation); + } + + void InvalidateCacheRange(std::uint64_t start_address, std::size_t length) { + std::unique_lock lock{invalidation_mutex}; + invalid_cache_ranges.add(boost::icl::discrete_interval<u64>::closed(start_address, start_address + length - 1)); + HaltExecution(HaltReason::CacheInvalidation); + } + + void Reset() { + current_state = {}; + } + + void HaltExecution(HaltReason hr) { + Atomic::Or(&halt_reason, static_cast<u32>(hr)); + } + + void ClearHalt(HaltReason hr) { + Atomic::And(&halt_reason, ~static_cast<u32>(hr)); + } + + std::uint64_t PC() const { + return current_state.pc; + } + + void SetPC(std::uint64_t value) { + current_state.pc = value; + } + + std::uint64_t SP() const { + return current_state.sp; + } + + void SetSP(std::uint64_t value) { + current_state.sp = value; + } + + std::array<std::uint64_t, 31>& Regs() { + return current_state.reg; + } + + const std::array<std::uint64_t, 31>& Regs() const { + return current_state.reg; + } + + std::array<std::uint64_t, 64>& VecRegs() { + return current_state.vec; + } + + const std::array<std::uint64_t, 64>& VecRegs() const { + return current_state.vec; + } + + std::uint32_t Fpcr() const { + return current_state.fpcr; + } + + void SetFpcr(std::uint32_t value) { + current_state.fpcr = value; + } + + std::uint32_t Fpsr() const { + return current_state.fpsr; + } + + void SetFpsr(std::uint32_t value) { + current_state.fpsr = value; + } + + std::uint32_t Pstate() const { + return current_state.cpsr_nzcv; + } + + void SetPstate(std::uint32_t value) { + current_state.cpsr_nzcv = value; + } + + void ClearExclusiveState() { + current_state.exclusive_state = false; + } + + bool IsExecuting() const { + return is_executing; + } + + void DumpDisassembly() const { + ASSERT_FALSE("Unimplemented"); + } + + std::vector<std::string> Disassemble() const { + ASSERT_FALSE("Unimplemented"); + } + +private: + void PerformRequestedCacheInvalidation(HaltReason hr) { + if (Has(hr, HaltReason::CacheInvalidation)) { + std::unique_lock 
lock{invalidation_mutex}; + + ClearHalt(HaltReason::CacheInvalidation); + + if (invalidate_entire_cache) { + current_address_space.ClearCache(); + + invalidate_entire_cache = false; + invalid_cache_ranges.clear(); + return; + } + + if (!invalid_cache_ranges.empty()) { + current_address_space.InvalidateCacheRanges(invalid_cache_ranges); + + invalid_cache_ranges.clear(); + return; + } + } + } + + A64::UserConfig conf; + A64JitState current_state{}; + A64AddressSpace current_address_space; + A64Core core; + + volatile u32 halt_reason = 0; + + std::mutex invalidation_mutex; + boost::icl::interval_set<u64> invalid_cache_ranges; + bool invalidate_entire_cache = false; + bool is_executing = false; +}; + +Jit::Jit(UserConfig conf) + : impl{std::make_unique<Jit::Impl>(this, conf)} { +} + +Jit::~Jit() = default; + +HaltReason Jit::Run() { + return impl->Run(); +} + +HaltReason Jit::Step() { + return impl->Step(); +} + +void Jit::ClearCache() { + impl->ClearCache(); +} + +void Jit::InvalidateCacheRange(std::uint64_t start_address, std::size_t length) { + impl->InvalidateCacheRange(start_address, length); +} + +void Jit::Reset() { + impl->Reset(); +} + +void Jit::HaltExecution(HaltReason hr) { + impl->HaltExecution(hr); +} + +void Jit::ClearHalt(HaltReason hr) { + impl->ClearHalt(hr); +} + +std::uint64_t Jit::GetSP() const { + return impl->SP(); +} + +void Jit::SetSP(std::uint64_t value) { + impl->SetSP(value); +} + +std::uint64_t Jit::GetPC() const { + return impl->PC(); +} + +void Jit::SetPC(std::uint64_t value) { + impl->SetPC(value); +} + +std::uint64_t Jit::GetRegister(std::size_t index) const { + return impl->Regs()[index]; +} + +void Jit::SetRegister(size_t index, std::uint64_t value) { + impl->Regs()[index] = value; +} + +std::array<std::uint64_t, 31> Jit::GetRegisters() const { + return impl->Regs(); +} + +void Jit::SetRegisters(const std::array<std::uint64_t, 31>& value) { + impl->Regs() = value; +} + +Vector Jit::GetVector(std::size_t index) const { + auto& vec = impl->VecRegs(); + return {vec[index * 2], vec[index * 2 + 1]}; +} + +void Jit::SetVector(std::size_t index, Vector value) { + auto& vec = impl->VecRegs(); + vec[index * 2] = value[0]; + vec[index * 2 + 1] = value[1]; +} + +std::array<Vector, 32> Jit::GetVectors() const { + std::array<Vector, 32> ret; + std::memcpy(ret.data(), impl->VecRegs().data(), sizeof(ret)); + return ret; +} + +void Jit::SetVectors(const std::array<Vector, 32>& value) { + std::memcpy(impl->VecRegs().data(), value.data(), sizeof(value)); +} + +std::uint32_t Jit::GetFpcr() const { + return impl->Fpcr(); +} + +void Jit::SetFpcr(std::uint32_t value) { + impl->SetFpcr(value); +} + +std::uint32_t Jit::GetFpsr() const { + return impl->Fpsr(); +} + +void Jit::SetFpsr(std::uint32_t value) { + impl->SetFpsr(value); +} + +std::uint32_t Jit::GetPstate() const { + return impl->Pstate(); +} + +void Jit::SetPstate(std::uint32_t value) { + impl->SetPstate(value); +} + +void Jit::ClearExclusiveState() { + impl->ClearExclusiveState(); +} + +bool Jit::IsExecuting() const { + return impl->IsExecuting(); +} + +void Jit::DumpDisassembly() const { + impl->DumpDisassembly(); +} + +std::vector<std::string> Jit::Disassemble() const { + return impl->Disassemble(); +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/a64_jitstate.h b/externals/dynarmic/src/dynarmic/backend/arm64/a64_jitstate.h new file mode 100644 index 0000000000..215e6987f3 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/a64_jitstate.h @@ -0,0 +1,37 @@ +/* 
This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <array> + +#include <mcl/stdint.hpp> + +#include "dynarmic/frontend/A64/a64_location_descriptor.h" + +namespace Dynarmic::Backend::Arm64 { + +struct A64JitState { + std::array<u64, 31> reg{}; + u64 sp = 0; + u64 pc = 0; + + u32 cpsr_nzcv = 0; + + alignas(16) std::array<u64, 64> vec{}; + + u32 exclusive_state = 0; + + u32 fpsr = 0; + u32 fpcr = 0; + + IR::LocationDescriptor GetLocationDescriptor() const { + const u64 fpcr_u64 = static_cast<u64>(fpcr & A64::LocationDescriptor::fpcr_mask) << A64::LocationDescriptor::fpcr_shift; + const u64 pc_u64 = pc & A64::LocationDescriptor::pc_mask; + return IR::LocationDescriptor{pc_u64 | fpcr_u64}; + } +}; + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/abi.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/abi.cpp new file mode 100644 index 0000000000..6d7b96379b --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/abi.cpp @@ -0,0 +1,91 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/arm64/abi.h" + +#include <vector> + +#include <mcl/bit/bit_field.hpp> +#include <mcl/stdint.hpp> +#include <oaknut/oaknut.hpp> + +namespace Dynarmic::Backend::Arm64 { + +using namespace oaknut::util; + +static constexpr size_t gpr_size = 8; +static constexpr size_t fpr_size = 16; + +struct FrameInfo { + std::vector<int> gprs; + std::vector<int> fprs; + size_t frame_size; + size_t gprs_size; + size_t fprs_size; +}; + +static std::vector<int> ListToIndexes(u32 list) { + std::vector<int> indexes; + for (int i = 0; i < 32; i++) { + if (mcl::bit::get_bit(i, list)) { + indexes.push_back(i); + } + } + return indexes; +} + +static FrameInfo CalculateFrameInfo(RegisterList rl, size_t frame_size) { + const auto gprs = ListToIndexes(static_cast<u32>(rl)); + const auto fprs = ListToIndexes(static_cast<u32>(rl >> 32)); + + const size_t num_gprs = gprs.size(); + const size_t num_fprs = fprs.size(); + + const size_t gprs_size = (num_gprs + 1) / 2 * 16; + const size_t fprs_size = num_fprs * 16; + + return { + gprs, + fprs, + frame_size, + gprs_size, + fprs_size, + }; +} + +#define DO_IT(TYPE, REG_TYPE, PAIR_OP, SINGLE_OP, OFFSET) \ + if (frame_info.TYPE##s.size() > 0) { \ + for (size_t i = 0; i < frame_info.TYPE##s.size() - 1; i += 2) { \ + code.PAIR_OP(oaknut::REG_TYPE{frame_info.TYPE##s[i]}, oaknut::REG_TYPE{frame_info.TYPE##s[i + 1]}, SP, (OFFSET) + i * TYPE##_size); \ + } \ + if (frame_info.TYPE##s.size() % 2 == 1) { \ + const size_t i = frame_info.TYPE##s.size() - 1; \ + code.SINGLE_OP(oaknut::REG_TYPE{frame_info.TYPE##s[i]}, SP, (OFFSET) + i * TYPE##_size); \ + } \ + } + +void ABI_PushRegisters(oaknut::CodeGenerator& code, RegisterList rl, size_t frame_size) { + const FrameInfo frame_info = CalculateFrameInfo(rl, frame_size); + + code.SUB(SP, SP, frame_info.gprs_size + frame_info.fprs_size); + + DO_IT(gpr, XReg, STP, STR, 0) + DO_IT(fpr, QReg, STP, STR, frame_info.gprs_size) + + code.SUB(SP, SP, frame_info.frame_size); +} + +void ABI_PopRegisters(oaknut::CodeGenerator& code, RegisterList rl, size_t frame_size) { + const FrameInfo frame_info = CalculateFrameInfo(rl, frame_size); + + code.ADD(SP, SP, frame_info.frame_size); + + DO_IT(gpr, XReg, LDP, LDR, 0) + DO_IT(fpr, QReg, LDP, LDR, frame_info.gprs_size) + + code.ADD(SP, SP, frame_info.gprs_size + frame_info.fprs_size); +} 
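+
+// A minimal worked example of the frame layout computed by CalculateFrameInfo above (a sketch,
+// assuming rl = (1ull << 29) | (1ull << 30), i.e. only X29 and X30, and frame_size == 0, as the
+// call trampolines elsewhere in this diff use):
+//   gprs = {29, 30}, fprs = {}  ->  gprs_size = (2 + 1) / 2 * 16 = 16, fprs_size = 0
+// so ABI_PushRegisters emits roughly:
+//   SUB SP, SP, #16
+//   STP X29, X30, [SP, #0]   // DO_IT(gpr, ...): the single pair lands at offset 0 + 0 * gpr_size
+//   SUB SP, SP, #0           // frame_size
+// and ABI_PopRegisters mirrors this with LDP/ADD in the reverse order.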
+ +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/abi.h b/externals/dynarmic/src/dynarmic/backend/arm64/abi.h new file mode 100644 index 0000000000..609b06cd22 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/abi.h @@ -0,0 +1,78 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <initializer_list> +#include <stdexcept> +#include <type_traits> + +#include <mcl/mp/metavalue/lift_value.hpp> +#include <mcl/stdint.hpp> +#include <oaknut/oaknut.hpp> + +#include "dynarmic/common/always_false.h" + +namespace Dynarmic::Backend::Arm64 { + +constexpr oaknut::XReg Xstate{28}; +constexpr oaknut::XReg Xhalt{27}; +constexpr oaknut::XReg Xticks{26}; +constexpr oaknut::XReg Xfastmem{25}; +constexpr oaknut::XReg Xpagetable{24}; + +constexpr oaknut::XReg Xscratch0{16}, Xscratch1{17}, Xscratch2{30}; +constexpr oaknut::WReg Wscratch0{16}, Wscratch1{17}, Wscratch2{30}; + +template<size_t bitsize> +constexpr auto Rscratch0() { + if constexpr (bitsize == 32) { + return Wscratch0; + } else if constexpr (bitsize == 64) { + return Xscratch0; + } else { + static_assert(Common::always_false_v<mcl::mp::lift_value<bitsize>>); + } +} + +template<size_t bitsize> +constexpr auto Rscratch1() { + if constexpr (bitsize == 32) { + return Wscratch1; + } else if constexpr (bitsize == 64) { + return Xscratch1; + } else { + static_assert(Common::always_false_v<mcl::mp::lift_value<bitsize>>); + } +} + +constexpr std::initializer_list<int> GPR_ORDER{19, 20, 21, 22, 23, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8}; +constexpr std::initializer_list<int> FPR_ORDER{8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + +using RegisterList = u64; + +constexpr RegisterList ToRegList(oaknut::Reg reg) { + if (reg.is_vector()) { + return RegisterList{1} << (reg.index() + 32); + } + + if (reg.index() == 31) { + throw std::out_of_range("ZR not allowed in reg list"); + } + + if (reg.index() == -1) { + return RegisterList{1} << 31; + } + + return RegisterList{1} << reg.index(); +} + +constexpr RegisterList ABI_CALLEE_SAVE = 0x0000ff00'7ff80000; +constexpr RegisterList ABI_CALLER_SAVE = 0xffffffff'4000ffff; + +void ABI_PushRegisters(oaknut::CodeGenerator& code, RegisterList rl, size_t stack_space); +void ABI_PopRegisters(oaknut::CodeGenerator& code, RegisterList rl, size_t stack_space); + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/address_space.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/address_space.cpp new file mode 100644 index 0000000000..10df477d19 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/address_space.cpp @@ -0,0 +1,342 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/arm64/a64_address_space.h" +#include "dynarmic/backend/arm64/a64_jitstate.h" +#include "dynarmic/backend/arm64/abi.h" +#include "dynarmic/backend/arm64/devirtualize.h" +#include "dynarmic/backend/arm64/emit_arm64.h" +#include "dynarmic/backend/arm64/stack_layout.h" +#include "dynarmic/common/cast_util.h" +#include "dynarmic/common/fp/fpcr.h" +#include "dynarmic/interface/exclusive_monitor.h" + +namespace Dynarmic::Backend::Arm64 { + +AddressSpace::AddressSpace(size_t code_cache_size) + : code_cache_size(code_cache_size) + , mem(code_cache_size) + , code(mem.ptr(), mem.ptr()) + , fastmem_manager(exception_handler) { + ASSERT_MSG(code_cache_size <= 128 * 1024 * 1024, "code_cache_size > 128 MiB not currently supported"); + + exception_handler.Register(mem, code_cache_size); + exception_handler.SetFastmemCallback([this](u64 host_pc) { + return FastmemCallback(host_pc); + }); +} + +AddressSpace::~AddressSpace() = default; + +CodePtr AddressSpace::Get(IR::LocationDescriptor descriptor) { + if (const auto iter = block_entries.find(descriptor); iter != block_entries.end()) { + return iter->second; + } + return nullptr; +} + +std::optional<IR::LocationDescriptor> AddressSpace::ReverseGetLocation(CodePtr host_pc) { + if (auto iter = reverse_block_entries.upper_bound(host_pc); iter != reverse_block_entries.begin()) { + // upper_bound locates the first value greater than host_pc, so we need to decrement + --iter; + return iter->second; + } + return std::nullopt; +} + +CodePtr AddressSpace::ReverseGetEntryPoint(CodePtr host_pc) { + if (auto iter = reverse_block_entries.upper_bound(host_pc); iter != reverse_block_entries.begin()) { + // upper_bound locates the first value greater than host_pc, so we need to decrement + --iter; + return iter->first; + } + return nullptr; +} + +CodePtr AddressSpace::GetOrEmit(IR::LocationDescriptor descriptor) { + if (CodePtr block_entry = Get(descriptor)) { + return block_entry; + } + + IR::Block ir_block = GenerateIR(descriptor); + const EmittedBlockInfo block_info = Emit(std::move(ir_block)); + return block_info.entry_point; +} + +void AddressSpace::InvalidateBasicBlocks(const tsl::robin_set<IR::LocationDescriptor>& descriptors) { + UnprotectCodeMemory(); + + for (const auto& descriptor : descriptors) { + const auto iter = block_entries.find(descriptor); + if (iter == block_entries.end()) { + continue; + } + + // Unlink before removal because InvalidateBasicBlocks can be called within a fastmem callback, + // and the currently executing block may have references to itself which need to be unlinked. 
+ RelinkForDescriptor(descriptor, nullptr); + + block_entries.erase(iter); + } + + ProtectCodeMemory(); +} + +void AddressSpace::ClearCache() { + block_entries.clear(); + reverse_block_entries.clear(); + block_infos.clear(); + block_references.clear(); + code.set_offset(prelude_info.end_of_prelude); +} + +size_t AddressSpace::GetRemainingSize() { + return code_cache_size - static_cast<size_t>(code.offset()); +} + +EmittedBlockInfo AddressSpace::Emit(IR::Block block) { + if (GetRemainingSize() < 1024 * 1024) { + ClearCache(); + } + + UnprotectCodeMemory(); + + EmittedBlockInfo block_info = EmitArm64(code, std::move(block), GetEmitConfig(), fastmem_manager); + + ASSERT(block_entries.insert({block.Location(), block_info.entry_point}).second); + ASSERT(reverse_block_entries.insert({block_info.entry_point, block.Location()}).second); + ASSERT(block_infos.insert({block_info.entry_point, block_info}).second); + + Link(block_info); + RelinkForDescriptor(block.Location(), block_info.entry_point); + + mem.invalidate(reinterpret_cast<u32*>(block_info.entry_point), block_info.size); + ProtectCodeMemory(); + + RegisterNewBasicBlock(block, block_info); + + return block_info; +} + +void AddressSpace::Link(EmittedBlockInfo& block_info) { + using namespace oaknut; + using namespace oaknut::util; + + for (auto [ptr_offset, target] : block_info.relocations) { + CodeGenerator c{mem.ptr(), mem.ptr()}; + c.set_xptr(reinterpret_cast<u32*>(block_info.entry_point + ptr_offset)); + + switch (target) { + case LinkTarget::ReturnToDispatcher: + c.B(prelude_info.return_to_dispatcher); + break; + case LinkTarget::ReturnFromRunCode: + c.B(prelude_info.return_from_run_code); + break; + case LinkTarget::ReadMemory8: + c.BL(prelude_info.read_memory_8); + break; + case LinkTarget::ReadMemory16: + c.BL(prelude_info.read_memory_16); + break; + case LinkTarget::ReadMemory32: + c.BL(prelude_info.read_memory_32); + break; + case LinkTarget::ReadMemory64: + c.BL(prelude_info.read_memory_64); + break; + case LinkTarget::ReadMemory128: + c.BL(prelude_info.read_memory_128); + break; + case LinkTarget::WrappedReadMemory8: + c.BL(prelude_info.wrapped_read_memory_8); + break; + case LinkTarget::WrappedReadMemory16: + c.BL(prelude_info.wrapped_read_memory_16); + break; + case LinkTarget::WrappedReadMemory32: + c.BL(prelude_info.wrapped_read_memory_32); + break; + case LinkTarget::WrappedReadMemory64: + c.BL(prelude_info.wrapped_read_memory_64); + break; + case LinkTarget::WrappedReadMemory128: + c.BL(prelude_info.wrapped_read_memory_128); + break; + case LinkTarget::ExclusiveReadMemory8: + c.BL(prelude_info.exclusive_read_memory_8); + break; + case LinkTarget::ExclusiveReadMemory16: + c.BL(prelude_info.exclusive_read_memory_16); + break; + case LinkTarget::ExclusiveReadMemory32: + c.BL(prelude_info.exclusive_read_memory_32); + break; + case LinkTarget::ExclusiveReadMemory64: + c.BL(prelude_info.exclusive_read_memory_64); + break; + case LinkTarget::ExclusiveReadMemory128: + c.BL(prelude_info.exclusive_read_memory_128); + break; + case LinkTarget::WriteMemory8: + c.BL(prelude_info.write_memory_8); + break; + case LinkTarget::WriteMemory16: + c.BL(prelude_info.write_memory_16); + break; + case LinkTarget::WriteMemory32: + c.BL(prelude_info.write_memory_32); + break; + case LinkTarget::WriteMemory64: + c.BL(prelude_info.write_memory_64); + break; + case LinkTarget::WriteMemory128: + c.BL(prelude_info.write_memory_128); + break; + case LinkTarget::WrappedWriteMemory8: + c.BL(prelude_info.wrapped_write_memory_8); + break; + case 
LinkTarget::WrappedWriteMemory16: + c.BL(prelude_info.wrapped_write_memory_16); + break; + case LinkTarget::WrappedWriteMemory32: + c.BL(prelude_info.wrapped_write_memory_32); + break; + case LinkTarget::WrappedWriteMemory64: + c.BL(prelude_info.wrapped_write_memory_64); + break; + case LinkTarget::WrappedWriteMemory128: + c.BL(prelude_info.wrapped_write_memory_128); + break; + case LinkTarget::ExclusiveWriteMemory8: + c.BL(prelude_info.exclusive_write_memory_8); + break; + case LinkTarget::ExclusiveWriteMemory16: + c.BL(prelude_info.exclusive_write_memory_16); + break; + case LinkTarget::ExclusiveWriteMemory32: + c.BL(prelude_info.exclusive_write_memory_32); + break; + case LinkTarget::ExclusiveWriteMemory64: + c.BL(prelude_info.exclusive_write_memory_64); + break; + case LinkTarget::ExclusiveWriteMemory128: + c.BL(prelude_info.exclusive_write_memory_128); + break; + case LinkTarget::CallSVC: + c.BL(prelude_info.call_svc); + break; + case LinkTarget::ExceptionRaised: + c.BL(prelude_info.exception_raised); + break; + case LinkTarget::InstructionSynchronizationBarrierRaised: + c.BL(prelude_info.isb_raised); + break; + case LinkTarget::InstructionCacheOperationRaised: + c.BL(prelude_info.ic_raised); + break; + case LinkTarget::DataCacheOperationRaised: + c.BL(prelude_info.dc_raised); + break; + case LinkTarget::GetCNTPCT: + c.BL(prelude_info.get_cntpct); + break; + case LinkTarget::AddTicks: + c.BL(prelude_info.add_ticks); + break; + case LinkTarget::GetTicksRemaining: + c.BL(prelude_info.get_ticks_remaining); + break; + default: + ASSERT_FALSE("Invalid relocation target"); + } + } + + for (auto [target_descriptor, list] : block_info.block_relocations) { + block_references[target_descriptor].insert(block_info.entry_point); + LinkBlockLinks(block_info.entry_point, Get(target_descriptor), list); + } +} + +void AddressSpace::LinkBlockLinks(const CodePtr entry_point, const CodePtr target_ptr, const std::vector<BlockRelocation>& block_relocations_list) { + using namespace oaknut; + using namespace oaknut::util; + + for (auto [ptr_offset, type] : block_relocations_list) { + CodeGenerator c{mem.ptr(), mem.ptr()}; + c.set_xptr(reinterpret_cast<u32*>(entry_point + ptr_offset)); + + switch (type) { + case BlockRelocationType::Branch: + if (target_ptr) { + c.B((void*)target_ptr); + } else { + c.NOP(); + } + break; + case BlockRelocationType::MoveToScratch1: + if (target_ptr) { + c.ADRL(Xscratch1, (void*)target_ptr); + } else { + c.ADRL(Xscratch1, prelude_info.return_to_dispatcher); + } + break; + default: + ASSERT_FALSE("Invalid BlockRelocationType"); + } + } +} + +void AddressSpace::RelinkForDescriptor(IR::LocationDescriptor target_descriptor, CodePtr target_ptr) { + for (auto code_ptr : block_references[target_descriptor]) { + if (auto block_iter = block_infos.find(code_ptr); block_iter != block_infos.end()) { + const EmittedBlockInfo& block_info = block_iter->second; + + if (auto relocation_iter = block_info.block_relocations.find(target_descriptor); relocation_iter != block_info.block_relocations.end()) { + LinkBlockLinks(block_info.entry_point, target_ptr, relocation_iter->second); + } + + mem.invalidate(reinterpret_cast<u32*>(block_info.entry_point), block_info.size); + } + } +} + +FakeCall AddressSpace::FastmemCallback(u64 host_pc) { + { + const auto host_ptr = mcl::bit_cast<CodePtr>(host_pc); + + const auto entry_point = ReverseGetEntryPoint(host_ptr); + if (!entry_point) { + goto fail; + } + + const auto block_info = block_infos.find(entry_point); + if (block_info == block_infos.end()) { + 
goto fail; + } + + const auto patch_entry = block_info->second.fastmem_patch_info.find(host_ptr - entry_point); + if (patch_entry == block_info->second.fastmem_patch_info.end()) { + goto fail; + } + + const auto fc = patch_entry->second.fc; + + if (patch_entry->second.recompile) { + const auto marker = patch_entry->second.marker; + fastmem_manager.MarkDoNotFastmem(marker); + InvalidateBasicBlocks({std::get<0>(marker)}); + } + + return fc; + } + +fail: + fmt::print("dynarmic: Segfault happened within JITted code at host_pc = {:016x}\n", host_pc); + fmt::print("Segfault wasn't at a fastmem patch location!\n"); + ASSERT_FALSE("segfault"); +} + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/address_space.h b/externals/dynarmic/src/dynarmic/backend/arm64/address_space.h new file mode 100644 index 0000000000..a5fe7513e2 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/address_space.h @@ -0,0 +1,136 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <map> +#include <optional> + +#include <mcl/stdint.hpp> +#include <oaknut/code_block.hpp> +#include <oaknut/oaknut.hpp> +#include <tsl/robin_map.h> +#include <tsl/robin_set.h> + +#include "dynarmic/backend/arm64/emit_arm64.h" +#include "dynarmic/backend/arm64/fastmem.h" +#include "dynarmic/interface/halt_reason.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/location_descriptor.h" + +namespace Dynarmic::Backend::Arm64 { + +class AddressSpace { +public: + explicit AddressSpace(size_t code_cache_size); + virtual ~AddressSpace(); + + virtual IR::Block GenerateIR(IR::LocationDescriptor) const = 0; + + CodePtr Get(IR::LocationDescriptor descriptor); + + // Returns "most likely" LocationDescriptor associated with the emitted code at that location + std::optional<IR::LocationDescriptor> ReverseGetLocation(CodePtr host_pc); + + // Returns "most likely" entry_point associated with the emitted code at that location + CodePtr ReverseGetEntryPoint(CodePtr host_pc); + + CodePtr GetOrEmit(IR::LocationDescriptor descriptor); + + void InvalidateBasicBlocks(const tsl::robin_set<IR::LocationDescriptor>& descriptors); + + void ClearCache(); + +protected: + virtual EmitConfig GetEmitConfig() = 0; + virtual void RegisterNewBasicBlock(const IR::Block& block, const EmittedBlockInfo& block_info) = 0; + + void ProtectCodeMemory() { +#if defined(DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT) || defined(__APPLE__) || defined(__OpenBSD__) + mem.protect(); +#endif + } + + void UnprotectCodeMemory() { +#if defined(DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT) || defined(__APPLE__) || defined(__OpenBSD__) + mem.unprotect(); +#endif + } + + size_t GetRemainingSize(); + EmittedBlockInfo Emit(IR::Block ir_block); + void Link(EmittedBlockInfo& block); + void LinkBlockLinks(const CodePtr entry_point, const CodePtr target_ptr, const std::vector<BlockRelocation>& block_relocations_list); + void RelinkForDescriptor(IR::LocationDescriptor target_descriptor, CodePtr target_ptr); + + FakeCall FastmemCallback(u64 host_pc); + + const size_t code_cache_size; + oaknut::CodeBlock mem; + oaknut::CodeGenerator code; + + // An IR::LocationDescriptor will have one current CodePtr. + // However, there can be multiple other CodePtrs which are older, previously invalidated blocks.
+ tsl::robin_map<IR::LocationDescriptor, CodePtr> block_entries; + std::map<CodePtr, IR::LocationDescriptor> reverse_block_entries; + tsl::robin_map<CodePtr, EmittedBlockInfo> block_infos; + tsl::robin_map<IR::LocationDescriptor, tsl::robin_set<CodePtr>> block_references; + + ExceptionHandler exception_handler; + FastmemManager fastmem_manager; + + struct PreludeInfo { + std::ptrdiff_t end_of_prelude; + + using RunCodeFuncType = HaltReason (*)(CodePtr entry_point, void* jit_state, volatile u32* halt_reason); + RunCodeFuncType run_code; + RunCodeFuncType step_code; + void* return_to_dispatcher; + void* return_from_run_code; + + void* read_memory_8; + void* read_memory_16; + void* read_memory_32; + void* read_memory_64; + void* read_memory_128; + void* wrapped_read_memory_8; + void* wrapped_read_memory_16; + void* wrapped_read_memory_32; + void* wrapped_read_memory_64; + void* wrapped_read_memory_128; + void* exclusive_read_memory_8; + void* exclusive_read_memory_16; + void* exclusive_read_memory_32; + void* exclusive_read_memory_64; + void* exclusive_read_memory_128; + void* write_memory_8; + void* write_memory_16; + void* write_memory_32; + void* write_memory_64; + void* write_memory_128; + void* wrapped_write_memory_8; + void* wrapped_write_memory_16; + void* wrapped_write_memory_32; + void* wrapped_write_memory_64; + void* wrapped_write_memory_128; + void* exclusive_write_memory_8; + void* exclusive_write_memory_16; + void* exclusive_write_memory_32; + void* exclusive_write_memory_64; + void* exclusive_write_memory_128; + + void* call_svc; + void* exception_raised; + void* dc_raised; + void* ic_raised; + void* isb_raised; + + void* get_cntpct; + void* add_ticks; + void* get_ticks_remaining; + } prelude_info; +}; + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/devirtualize.h b/externals/dynarmic/src/dynarmic/backend/arm64/devirtualize.h new file mode 100644 index 0000000000..c433fd0126 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/devirtualize.h @@ -0,0 +1,57 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <mcl/bit_cast.hpp> +#include <mcl/stdint.hpp> +#include <mcl/type_traits/function_info.hpp> + +namespace Dynarmic::Backend::Arm64 { + +struct DevirtualizedCall { + u64 fn_ptr; + u64 this_ptr; +}; + +// https://rants.vastheman.com/2021/09/21/msvc/ +template<auto mfp> +DevirtualizedCall DevirtualizeWindows(mcl::class_type<decltype(mfp)>* this_) { + static_assert(sizeof(mfp) == 8); + return DevirtualizedCall{mcl::bit_cast<u64>(mfp), reinterpret_cast<u64>(this_)}; +} + +// https://github.com/ARM-software/abi-aa/blob/main/cppabi64/cppabi64.rst#representation-of-pointer-to-member-function +template<auto mfp> +DevirtualizedCall DevirtualizeDefault(mcl::class_type<decltype(mfp)>* this_) { + struct MemberFunctionPointer { + // Address of non-virtual function or index into vtable. + u64 ptr; + // LSB is discriminator for if function is virtual. Other bits are this adjustment. 
+ u64 adj; + } mfp_struct = mcl::bit_cast<MemberFunctionPointer>(mfp); + + static_assert(sizeof(MemberFunctionPointer) == 16); + static_assert(sizeof(MemberFunctionPointer) == sizeof(mfp)); + + u64 fn_ptr = mfp_struct.ptr; + u64 this_ptr = mcl::bit_cast<u64>(this_) + (mfp_struct.adj >> 1); + if (mfp_struct.adj & 1) { + u64 vtable = mcl::bit_cast_pointee<u64>(this_ptr); + fn_ptr = mcl::bit_cast_pointee<u64>(vtable + fn_ptr); + } + return DevirtualizedCall{fn_ptr, this_ptr}; +} + +template<auto mfp> +DevirtualizedCall Devirtualize(mcl::class_type<decltype(mfp)>* this_) { +#if defined(_WIN32) && defined(_MSC_VER) + return DevirtualizeWindows<mfp>(this_); +#else + return DevirtualizeDefault<mfp>(this_); +#endif +} + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64.cpp new file mode 100644 index 0000000000..1ecb5424da --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64.cpp @@ -0,0 +1,290 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/arm64/emit_arm64.h" + +#include <oaknut/oaknut.hpp> + +#include "dynarmic/backend/arm64/abi.h" +#include "dynarmic/backend/arm64/emit_context.h" +#include "dynarmic/backend/arm64/fpsr_manager.h" +#include "dynarmic/backend/arm64/reg_alloc.h" +#include "dynarmic/backend/arm64/verbose_debugging_output.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" + +namespace Dynarmic::Backend::Arm64 { + +using namespace oaknut::util; + +template<> +void EmitIR<IR::Opcode::Void>(oaknut::CodeGenerator&, EmitContext&, IR::Inst*) {} + +template<> +void EmitIR<IR::Opcode::Identity>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ctx.reg_alloc.DefineAsExisting(inst, args[0]); +} + +template<> +void EmitIR<IR::Opcode::Breakpoint>(oaknut::CodeGenerator& code, EmitContext&, IR::Inst*) { + code.BRK(0); +} + +template<> +void EmitIR<IR::Opcode::CallHostFunction>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ctx.reg_alloc.PrepareForCall(args[1], args[2], args[3]); + code.MOV(Xscratch0, args[0].GetImmediateU64()); + code.BLR(Xscratch0); +} + +template<> +void EmitIR<IR::Opcode::PushRSB>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + if (!ctx.conf.HasOptimization(OptimizationFlag::ReturnStackBuffer)) { + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(args[0].IsImmediate()); + const IR::LocationDescriptor target{args[0].GetImmediateU64()}; + + code.LDR(Wscratch2, SP, offsetof(StackLayout, rsb_ptr)); + code.ADD(Wscratch2, Wscratch2, sizeof(RSBEntry)); + code.AND(Wscratch2, Wscratch2, RSBIndexMask); + code.STR(Wscratch2, SP, offsetof(StackLayout, rsb_ptr)); + code.ADD(Xscratch2, SP, Xscratch2); + + code.MOV(Xscratch0, target.Value()); + EmitBlockLinkRelocation(code, ctx, target, BlockRelocationType::MoveToScratch1); + code.STP(Xscratch0, Xscratch1, Xscratch2, offsetof(StackLayout, rsb)); +} + +template<> +void EmitIR<IR::Opcode::GetCarryFromOp>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) { + [[maybe_unused]] auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(ctx.reg_alloc.WasValueDefined(inst)); +} + +template<> +void 
EmitIR<IR::Opcode::GetOverflowFromOp>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) { + [[maybe_unused]] auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(ctx.reg_alloc.WasValueDefined(inst)); +} + +template<> +void EmitIR<IR::Opcode::GetGEFromOp>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) { + [[maybe_unused]] auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(ctx.reg_alloc.WasValueDefined(inst)); +} + +template<> +void EmitIR<IR::Opcode::GetNZCVFromOp>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (ctx.reg_alloc.WasValueDefined(inst)) { + return; + } + + switch (args[0].GetType()) { + case IR::Type::U32: { + auto Wvalue = ctx.reg_alloc.ReadW(args[0]); + auto flags = ctx.reg_alloc.WriteFlags(inst); + RegAlloc::Realize(Wvalue, flags); + + code.TST(*Wvalue, Wvalue); + break; + } + case IR::Type::U64: { + auto Xvalue = ctx.reg_alloc.ReadX(args[0]); + auto flags = ctx.reg_alloc.WriteFlags(inst); + RegAlloc::Realize(Xvalue, flags); + + code.TST(*Xvalue, Xvalue); + break; + } + default: + ASSERT_FALSE("Invalid type for GetNZCVFromOp"); + break; + } +} + +template<> +void EmitIR<IR::Opcode::GetNZFromOp>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (ctx.reg_alloc.WasValueDefined(inst)) { + return; + } + + switch (args[0].GetType()) { + case IR::Type::U32: { + auto Wvalue = ctx.reg_alloc.ReadW(args[0]); + auto flags = ctx.reg_alloc.WriteFlags(inst); + RegAlloc::Realize(Wvalue, flags); + + code.TST(*Wvalue, *Wvalue); + break; + } + case IR::Type::U64: { + auto Xvalue = ctx.reg_alloc.ReadX(args[0]); + auto flags = ctx.reg_alloc.WriteFlags(inst); + RegAlloc::Realize(Xvalue, flags); + + code.TST(*Xvalue, *Xvalue); + break; + } + default: + ASSERT_FALSE("Invalid type for GetNZFromOp"); + break; + } +} + +template<> +void EmitIR<IR::Opcode::GetUpperFromOp>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) { + [[maybe_unused]] auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(ctx.reg_alloc.WasValueDefined(inst)); +} + +template<> +void EmitIR<IR::Opcode::GetLowerFromOp>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) { + [[maybe_unused]] auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(ctx.reg_alloc.WasValueDefined(inst)); +} + +template<> +void EmitIR<IR::Opcode::GetCFlagFromNZCV>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + auto Wc = ctx.reg_alloc.WriteW(inst); + auto Wnzcv = ctx.reg_alloc.ReadW(args[0]); + RegAlloc::Realize(Wc, Wnzcv); + + code.AND(Wc, Wnzcv, 1 << 29); +} + +template<> +void EmitIR<IR::Opcode::NZCVFromPackedFlags>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ctx.reg_alloc.DefineAsExisting(inst, args[0]); +} + +static void EmitAddCycles(oaknut::CodeGenerator& code, EmitContext& ctx, size_t cycles_to_add) { + if (!ctx.conf.enable_cycle_counting) { + return; + } + if (cycles_to_add == 0) { + return; + } + + if (oaknut::AddSubImm::is_valid(cycles_to_add)) { + code.SUB(Xticks, Xticks, cycles_to_add); + } else { + code.MOV(Xscratch1, cycles_to_add); + code.SUB(Xticks, Xticks, Xscratch1); + } +} + +EmittedBlockInfo EmitArm64(oaknut::CodeGenerator& code, IR::Block block, const EmitConfig& conf, FastmemManager& fastmem_manager) { + if (conf.very_verbose_debugging_output) { + std::puts(IR::DumpBlock(block).c_str()); + } + + 
EmittedBlockInfo ebi; + + FpsrManager fpsr_manager{code, conf.state_fpsr_offset}; + RegAlloc reg_alloc{code, fpsr_manager, GPR_ORDER, FPR_ORDER}; + EmitContext ctx{block, reg_alloc, conf, ebi, fpsr_manager, fastmem_manager, {}}; + + ebi.entry_point = code.xptr<CodePtr>(); + + if (ctx.block.GetCondition() == IR::Cond::AL) { + ASSERT(!ctx.block.HasConditionFailedLocation()); + } else { + ASSERT(ctx.block.HasConditionFailedLocation()); + oaknut::Label pass; + + pass = conf.emit_cond(code, ctx, ctx.block.GetCondition()); + EmitAddCycles(code, ctx, ctx.block.ConditionFailedCycleCount()); + conf.emit_condition_failed_terminal(code, ctx); + + code.l(pass); + } + + for (auto iter = block.begin(); iter != block.end(); ++iter) { + IR::Inst* inst = &*iter; + + switch (inst->GetOpcode()) { +#define OPCODE(name, type, ...) \ + case IR::Opcode::name: \ + EmitIR<IR::Opcode::name>(code, ctx, inst); \ + break; +#define A32OPC(name, type, ...) \ + case IR::Opcode::A32##name: \ + EmitIR<IR::Opcode::A32##name>(code, ctx, inst); \ + break; +#define A64OPC(name, type, ...) \ + case IR::Opcode::A64##name: \ + EmitIR<IR::Opcode::A64##name>(code, ctx, inst); \ + break; +#include "dynarmic/ir/opcodes.inc" +#undef OPCODE +#undef A32OPC +#undef A64OPC + default: + ASSERT_FALSE("Invalid opcode: {}", inst->GetOpcode()); + break; + } + + reg_alloc.UpdateAllUses(); + reg_alloc.AssertAllUnlocked(); + + if (conf.very_verbose_debugging_output) { + EmitVerboseDebuggingOutput(code, ctx); + } + } + + fpsr_manager.Spill(); + + reg_alloc.AssertNoMoreUses(); + + EmitAddCycles(code, ctx, block.CycleCount()); + conf.emit_terminal(code, ctx); + code.BRK(0); + + for (const auto& deferred_emit : ctx.deferred_emits) { + deferred_emit(); + } + code.BRK(0); + + ebi.size = code.xptr<CodePtr>() - ebi.entry_point; + return ebi; +} + +void EmitRelocation(oaknut::CodeGenerator& code, EmitContext& ctx, LinkTarget link_target) { + ctx.ebi.relocations.emplace_back(Relocation{code.xptr<CodePtr>() - ctx.ebi.entry_point, link_target}); + code.NOP(); +} + +void EmitBlockLinkRelocation(oaknut::CodeGenerator& code, EmitContext& ctx, const IR::LocationDescriptor& descriptor, BlockRelocationType type) { + ctx.ebi.block_relocations[descriptor].emplace_back(BlockRelocation{code.xptr<CodePtr>() - ctx.ebi.entry_point, type}); + switch (type) { + case BlockRelocationType::Branch: + code.NOP(); + break; + case BlockRelocationType::MoveToScratch1: + code.BRK(0); + code.NOP(); + break; + default: + UNREACHABLE(); + } +} + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64.h b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64.h new file mode 100644 index 0000000000..6ac6207442 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64.h @@ -0,0 +1,181 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <array> +#include <cstddef> +#include <memory> +#include <vector> + +#include <mcl/stdint.hpp> +#include <tsl/robin_map.h> + +#include "dynarmic/backend/arm64/fastmem.h" +#include "dynarmic/interface/A32/coprocessor.h" +#include "dynarmic/interface/optimization_flags.h" +#include "dynarmic/ir/location_descriptor.h" + +namespace oaknut { +struct CodeGenerator; +struct Label; +} // namespace oaknut + +namespace Dynarmic::FP { +class FPCR; +} // namespace Dynarmic::FP + +namespace Dynarmic::IR { +class Block; +class Inst; +enum class Cond; +enum class Opcode; +} // namespace Dynarmic::IR + +namespace Dynarmic::Backend::Arm64 { + +struct EmitContext; + +using CodePtr = std::byte*; + +enum class LinkTarget { + ReturnToDispatcher, + ReturnFromRunCode, + ReadMemory8, + ReadMemory16, + ReadMemory32, + ReadMemory64, + ReadMemory128, + WrappedReadMemory8, + WrappedReadMemory16, + WrappedReadMemory32, + WrappedReadMemory64, + WrappedReadMemory128, + ExclusiveReadMemory8, + ExclusiveReadMemory16, + ExclusiveReadMemory32, + ExclusiveReadMemory64, + ExclusiveReadMemory128, + WriteMemory8, + WriteMemory16, + WriteMemory32, + WriteMemory64, + WriteMemory128, + WrappedWriteMemory8, + WrappedWriteMemory16, + WrappedWriteMemory32, + WrappedWriteMemory64, + WrappedWriteMemory128, + ExclusiveWriteMemory8, + ExclusiveWriteMemory16, + ExclusiveWriteMemory32, + ExclusiveWriteMemory64, + ExclusiveWriteMemory128, + CallSVC, + ExceptionRaised, + InstructionSynchronizationBarrierRaised, + InstructionCacheOperationRaised, + DataCacheOperationRaised, + GetCNTPCT, + AddTicks, + GetTicksRemaining, +}; + +struct Relocation { + std::ptrdiff_t code_offset; + LinkTarget target; +}; + +enum class BlockRelocationType { + Branch, + MoveToScratch1, +}; + +struct BlockRelocation { + std::ptrdiff_t code_offset; + BlockRelocationType type; +}; + +struct EmittedBlockInfo { + CodePtr entry_point; + size_t size; + std::vector<Relocation> relocations; + tsl::robin_map<IR::LocationDescriptor, std::vector<BlockRelocation>> block_relocations; + tsl::robin_map<std::ptrdiff_t, FastmemPatchInfo> fastmem_patch_info; +}; + +struct EmitConfig { + OptimizationFlag optimizations; + bool HasOptimization(OptimizationFlag f) const { return (f & optimizations) != no_optimizations; } + + bool hook_isb; + + // System registers + u64 cntfreq_el0; + u32 ctr_el0; + u32 dczid_el0; + const u64* tpidrro_el0; + u64* tpidr_el0; + + // Memory + bool check_halt_on_memory_access; + + // Page table + u64 page_table_pointer; + size_t page_table_address_space_bits; + int page_table_pointer_mask_bits; + bool silently_mirror_page_table; + bool absolute_offset_page_table; + u8 detect_misaligned_access_via_page_table; + bool only_detect_misalignment_via_page_table_on_page_boundary; + + // Fastmem + u64 fastmem_pointer; + bool recompile_on_fastmem_failure; + size_t fastmem_address_space_bits; + bool silently_mirror_fastmem; + + // Timing + bool wall_clock_cntpct; + bool enable_cycle_counting; + + // Endianness + bool always_little_endian; + + // Frontend specific callbacks + FP::FPCR (*descriptor_to_fpcr)(const IR::LocationDescriptor& descriptor); + oaknut::Label (*emit_cond)(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Cond cond); + void (*emit_condition_failed_terminal)(oaknut::CodeGenerator& code, EmitContext& ctx); + void (*emit_terminal)(oaknut::CodeGenerator& code, EmitContext& ctx); + void (*emit_check_memory_abort)(oaknut::CodeGenerator& code, EmitContext& 
ctx, IR::Inst* inst, oaknut::Label& end); + + // State offsets + size_t state_nzcv_offset; + size_t state_fpsr_offset; + size_t state_exclusive_state_offset; + + // A32 specific + std::array<std::shared_ptr<A32::Coprocessor>, 16> coprocessors{}; + + // Debugging + bool very_verbose_debugging_output; +}; + +EmittedBlockInfo EmitArm64(oaknut::CodeGenerator& code, IR::Block block, const EmitConfig& emit_conf, FastmemManager& fastmem_manager); + +template<IR::Opcode op> +void EmitIR(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); +void EmitRelocation(oaknut::CodeGenerator& code, EmitContext& ctx, LinkTarget link_target); +void EmitBlockLinkRelocation(oaknut::CodeGenerator& code, EmitContext& ctx, const IR::LocationDescriptor& descriptor, BlockRelocationType type); +oaknut::Label EmitA32Cond(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Cond cond); +oaknut::Label EmitA64Cond(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Cond cond); +void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx); +void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx); +void EmitA32ConditionFailedTerminal(oaknut::CodeGenerator& code, EmitContext& ctx); +void EmitA64ConditionFailedTerminal(oaknut::CodeGenerator& code, EmitContext& ctx); +void EmitA32CheckMemoryAbort(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, oaknut::Label& end); +void EmitA64CheckMemoryAbort(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, oaknut::Label& end); + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a32.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a32.cpp new file mode 100644 index 0000000000..909dde731b --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a32.cpp @@ -0,0 +1,707 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/bit/bit_field.hpp> +#include <oaknut/oaknut.hpp> + +#include "dynarmic/backend/arm64/a32_jitstate.h" +#include "dynarmic/backend/arm64/abi.h" +#include "dynarmic/backend/arm64/emit_arm64.h" +#include "dynarmic/backend/arm64/emit_context.h" +#include "dynarmic/backend/arm64/fpsr_manager.h" +#include "dynarmic/backend/arm64/reg_alloc.h" +#include "dynarmic/frontend/A32/a32_types.h" +#include "dynarmic/interface/halt_reason.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" + +namespace Dynarmic::Backend::Arm64 { + +using namespace oaknut::util; + +oaknut::Label EmitA32Cond(oaknut::CodeGenerator& code, EmitContext&, IR::Cond cond) { + oaknut::Label pass; + // TODO: Flags in host flags + code.LDR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_nzcv)); + code.MSR(oaknut::SystemReg::NZCV, Xscratch0); + code.B(static_cast<oaknut::Cond>(cond), pass); + return pass; +} + +void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::Terminal terminal, IR::LocationDescriptor initial_location, bool is_single_step); + +void EmitA32Terminal(oaknut::CodeGenerator&, EmitContext&, IR::Term::Interpret, IR::LocationDescriptor, bool) { + ASSERT_FALSE("Interpret should never be emitted."); +} + +void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::ReturnToDispatch, IR::LocationDescriptor, bool) { + EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher); +} + +static void EmitSetUpperLocationDescriptor(oaknut::CodeGenerator& code, EmitContext& ctx, IR::LocationDescriptor new_location, IR::LocationDescriptor old_location) { + auto get_upper = [](const IR::LocationDescriptor& desc) -> u32 { + return static_cast<u32>(A32::LocationDescriptor{desc}.SetSingleStepping(false).UniqueHash() >> 32); + }; + + const u32 old_upper = get_upper(old_location); + const u32 new_upper = [&] { + const u32 mask = ~u32(ctx.conf.always_little_endian ? 
0x2 : 0); + return get_upper(new_location) & mask; + }(); + + if (old_upper != new_upper) { + code.MOV(Wscratch0, new_upper); + code.STR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor)); + } +} + +void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::LinkBlock terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + EmitSetUpperLocationDescriptor(code, ctx, terminal.next, initial_location); + + oaknut::Label fail; + + if (ctx.conf.HasOptimization(OptimizationFlag::BlockLinking) && !is_single_step) { + if (ctx.conf.enable_cycle_counting) { + code.CMP(Xticks, 0); + code.B(LE, fail); + EmitBlockLinkRelocation(code, ctx, terminal.next, BlockRelocationType::Branch); + } else { + code.LDAR(Wscratch0, Xhalt); + code.CBNZ(Wscratch0, fail); + EmitBlockLinkRelocation(code, ctx, terminal.next, BlockRelocationType::Branch); + } + } + + code.l(fail); + code.MOV(Wscratch0, A32::LocationDescriptor{terminal.next}.PC()); + code.STR(Wscratch0, Xstate, offsetof(A32JitState, regs) + sizeof(u32) * 15); + EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher); +} + +void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::LinkBlockFast terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + EmitSetUpperLocationDescriptor(code, ctx, terminal.next, initial_location); + + if (ctx.conf.HasOptimization(OptimizationFlag::BlockLinking) && !is_single_step) { + EmitBlockLinkRelocation(code, ctx, terminal.next, BlockRelocationType::Branch); + } + + code.MOV(Wscratch0, A32::LocationDescriptor{terminal.next}.PC()); + code.STR(Wscratch0, Xstate, offsetof(A32JitState, regs) + sizeof(u32) * 15); + EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher); +} + +void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::PopRSBHint, IR::LocationDescriptor, bool is_single_step) { + if (ctx.conf.HasOptimization(OptimizationFlag::ReturnStackBuffer) && !is_single_step) { + oaknut::Label fail; + + code.LDR(Wscratch2, SP, offsetof(StackLayout, rsb_ptr)); + code.AND(Wscratch2, Wscratch2, RSBIndexMask); + code.ADD(X2, SP, Xscratch2); + code.SUB(Wscratch2, Wscratch2, sizeof(RSBEntry)); + code.STR(Wscratch2, SP, offsetof(StackLayout, rsb_ptr)); + + code.LDP(Xscratch0, Xscratch1, X2, offsetof(StackLayout, rsb)); + + static_assert(offsetof(A32JitState, regs) + 16 * sizeof(u32) == offsetof(A32JitState, upper_location_descriptor)); + code.LDUR(X0, Xstate, offsetof(A32JitState, regs) + 15 * sizeof(u32)); + + code.CMP(X0, Xscratch0); + code.B(NE, fail); + code.BR(Xscratch1); + + code.l(fail); + } + + EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher); +} + +void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::FastDispatchHint, IR::LocationDescriptor, bool) { + EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher); + + // TODO: Implement FastDispatchHint optimization +} + +void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::If terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + oaknut::Label pass = EmitA32Cond(code, ctx, terminal.if_); + EmitA32Terminal(code, ctx, terminal.else_, initial_location, is_single_step); + code.l(pass); + EmitA32Terminal(code, ctx, terminal.then_, initial_location, is_single_step); +} + +void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::CheckBit terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + oaknut::Label fail; + code.LDRB(Wscratch0, SP, 
offsetof(StackLayout, check_bit)); + code.CBZ(Wscratch0, fail); + EmitA32Terminal(code, ctx, terminal.then_, initial_location, is_single_step); + code.l(fail); + EmitA32Terminal(code, ctx, terminal.else_, initial_location, is_single_step); +} + +void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + oaknut::Label fail; + code.LDAR(Wscratch0, Xhalt); + code.CBNZ(Wscratch0, fail); + EmitA32Terminal(code, ctx, terminal.else_, initial_location, is_single_step); + code.l(fail); + EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher); +} + +void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::Terminal terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + boost::apply_visitor([&](const auto& t) { EmitA32Terminal(code, ctx, t, initial_location, is_single_step); }, terminal); +} + +void EmitA32Terminal(oaknut::CodeGenerator& code, EmitContext& ctx) { + const A32::LocationDescriptor location{ctx.block.Location()}; + EmitA32Terminal(code, ctx, ctx.block.GetTerminal(), location.SetSingleStepping(false), location.SingleStepping()); +} + +void EmitA32ConditionFailedTerminal(oaknut::CodeGenerator& code, EmitContext& ctx) { + const A32::LocationDescriptor location{ctx.block.Location()}; + EmitA32Terminal(code, ctx, IR::Term::LinkBlock{ctx.block.ConditionFailedLocation()}, location.SetSingleStepping(false), location.SingleStepping()); +} + +void EmitA32CheckMemoryAbort(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, oaknut::Label& end) { + if (!ctx.conf.check_halt_on_memory_access) { + return; + } + + const A32::LocationDescriptor current_location{IR::LocationDescriptor{inst->GetArg(0).GetU64()}}; + + code.LDAR(Xscratch0, Xhalt); + code.TST(Xscratch0, static_cast<u32>(HaltReason::MemoryAbort)); + code.B(EQ, end); + EmitSetUpperLocationDescriptor(code, ctx, current_location, ctx.block.Location()); + code.MOV(Wscratch0, current_location.PC()); + code.STR(Wscratch0, Xstate, offsetof(A32JitState, regs) + sizeof(u32) * 15); + EmitRelocation(code, ctx, LinkTarget::ReturnFromRunCode); +} + +template<> +void EmitIR<IR::Opcode::A32SetCheckBit>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (args[0].IsImmediate()) { + if (args[0].GetImmediateU1()) { + code.MOV(Wscratch0, 1); + code.STRB(Wscratch0, SP, offsetof(StackLayout, check_bit)); + } else { + code.STRB(WZR, SP, offsetof(StackLayout, check_bit)); + } + } else { + auto Wbit = ctx.reg_alloc.ReadW(args[0]); + RegAlloc::Realize(Wbit); + code.STRB(Wbit, SP, offsetof(StackLayout, check_bit)); + } +} + +template<> +void EmitIR<IR::Opcode::A32GetRegister>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const A32::Reg reg = inst->GetArg(0).GetA32RegRef(); + + auto Wresult = ctx.reg_alloc.WriteW(inst); + RegAlloc::Realize(Wresult); + + // TODO: Detect if Gpr vs Fpr is more appropriate + + code.LDR(Wresult, Xstate, offsetof(A32JitState, regs) + sizeof(u32) * static_cast<size_t>(reg)); +} + +template<> +void EmitIR<IR::Opcode::A32GetExtendedRegister32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + ASSERT(A32::IsSingleExtReg(reg)); + const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::S0); + + auto Sresult = ctx.reg_alloc.WriteS(inst); + RegAlloc::Realize(Sresult); + + // TODO: Detect if Gpr vs Fpr 
is more appropriate + + code.LDR(Sresult, Xstate, offsetof(A32JitState, ext_regs) + sizeof(u32) * index); +} + +template<> +void EmitIR<IR::Opcode::A32GetVector>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + ASSERT(A32::IsDoubleExtReg(reg) || A32::IsQuadExtReg(reg)); + + if (A32::IsDoubleExtReg(reg)) { + const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::D0); + auto Dresult = ctx.reg_alloc.WriteD(inst); + RegAlloc::Realize(Dresult); + code.LDR(Dresult, Xstate, offsetof(A32JitState, ext_regs) + sizeof(u64) * index); + } else { + const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::Q0); + auto Qresult = ctx.reg_alloc.WriteQ(inst); + RegAlloc::Realize(Qresult); + code.LDR(Qresult, Xstate, offsetof(A32JitState, ext_regs) + 2 * sizeof(u64) * index); + } +} + +template<> +void EmitIR<IR::Opcode::A32GetExtendedRegister64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + ASSERT(A32::IsDoubleExtReg(reg)); + const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::D0); + + auto Dresult = ctx.reg_alloc.WriteD(inst); + RegAlloc::Realize(Dresult); + + // TODO: Detect if Gpr vs Fpr is more appropriate + + code.LDR(Dresult, Xstate, offsetof(A32JitState, ext_regs) + 2 * sizeof(u32) * index); +} + +template<> +void EmitIR<IR::Opcode::A32SetRegister>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const A32::Reg reg = inst->GetArg(0).GetA32RegRef(); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + auto Wvalue = ctx.reg_alloc.ReadW(args[1]); + RegAlloc::Realize(Wvalue); + + // TODO: Detect if Gpr vs Fpr is more appropriate + + code.STR(Wvalue, Xstate, offsetof(A32JitState, regs) + sizeof(u32) * static_cast<size_t>(reg)); +} + +template<> +void EmitIR<IR::Opcode::A32SetExtendedRegister32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + ASSERT(A32::IsSingleExtReg(reg)); + const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::S0); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Svalue = ctx.reg_alloc.ReadS(args[1]); + RegAlloc::Realize(Svalue); + + // TODO: Detect if Gpr vs Fpr is more appropriate + + code.STR(Svalue, Xstate, offsetof(A32JitState, ext_regs) + sizeof(u32) * index); +} + +template<> +void EmitIR<IR::Opcode::A32SetExtendedRegister64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + ASSERT(A32::IsDoubleExtReg(reg)); + const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::D0); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Dvalue = ctx.reg_alloc.ReadD(args[1]); + RegAlloc::Realize(Dvalue); + + // TODO: Detect if Gpr vs Fpr is more appropriate + + code.STR(Dvalue, Xstate, offsetof(A32JitState, ext_regs) + 2 * sizeof(u32) * index); +} + +template<> +void EmitIR<IR::Opcode::A32SetVector>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + ASSERT(A32::IsDoubleExtReg(reg) || A32::IsQuadExtReg(reg)); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (A32::IsDoubleExtReg(reg)) { + const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::D0); + auto Dvalue = ctx.reg_alloc.ReadD(args[1]); + 
RegAlloc::Realize(Dvalue); + code.STR(Dvalue, Xstate, offsetof(A32JitState, ext_regs) + sizeof(u64) * index); + } else { + const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::Q0); + auto Qvalue = ctx.reg_alloc.ReadQ(args[1]); + RegAlloc::Realize(Qvalue); + code.STR(Qvalue, Xstate, offsetof(A32JitState, ext_regs) + 2 * sizeof(u64) * index); + } +} + +template<> +void EmitIR<IR::Opcode::A32GetCpsr>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto Wcpsr = ctx.reg_alloc.WriteW(inst); + RegAlloc::Realize(Wcpsr); + + static_assert(offsetof(A32JitState, cpsr_nzcv) + sizeof(u32) == offsetof(A32JitState, cpsr_q)); + + code.LDP(Wscratch0, Wscratch1, Xstate, offsetof(A32JitState, cpsr_nzcv)); + code.LDR(Wcpsr, Xstate, offsetof(A32JitState, cpsr_jaifm)); + code.ORR(Wcpsr, Wcpsr, Wscratch0); + code.ORR(Wcpsr, Wcpsr, Wscratch1); + + code.LDR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_ge)); + code.AND(Wscratch0, Wscratch0, 0x80808080); + code.MOV(Wscratch1, 0x00204081); + code.MUL(Wscratch0, Wscratch0, Wscratch1); + code.AND(Wscratch0, Wscratch0, 0xf0000000); + code.ORR(Wcpsr, Wcpsr, Wscratch0, LSR, 12); + + code.LDR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor)); + code.AND(Wscratch0, Wscratch0, 0b11); + // 9 8 7 6 5 + // E T + code.ORR(Wscratch0, Wscratch0, Wscratch0, LSL, 3); + code.AND(Wscratch0, Wscratch0, 0x11111111); + code.ORR(Wcpsr, Wcpsr, Wscratch0, LSL, 5); +} + +template<> +void EmitIR<IR::Opcode::A32SetCpsr>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Wcpsr = ctx.reg_alloc.ReadW(args[0]); + RegAlloc::Realize(Wcpsr); + + // NZCV, Q flags + code.AND(Wscratch0, Wcpsr, 0xF0000000); + code.AND(Wscratch1, Wcpsr, 1 << 27); + + static_assert(offsetof(A32JitState, cpsr_nzcv) + sizeof(u32) == offsetof(A32JitState, cpsr_q)); + code.STP(Wscratch0, Wscratch1, Xstate, offsetof(A32JitState, cpsr_nzcv)); + + // GE flags + // this does the following: + // cpsr_ge |= mcl::bit::get_bit<19>(cpsr) ? 0xFF000000 : 0; + // cpsr_ge |= mcl::bit::get_bit<18>(cpsr) ? 0x00FF0000 : 0; + // cpsr_ge |= mcl::bit::get_bit<17>(cpsr) ? 0x0000FF00 : 0; + // cpsr_ge |= mcl::bit::get_bit<16>(cpsr) ? 
0x000000FF : 0; + code.UBFX(Wscratch0, Wcpsr, 16, 4); + code.MOV(Wscratch1, 0x00204081); + code.MUL(Wscratch0, Wscratch0, Wscratch1); + code.AND(Wscratch0, Wscratch0, 0x01010101); + code.LSL(Wscratch1, Wscratch0, 8); + code.SUB(Wscratch0, Wscratch1, Wscratch0); + + // Other flags + code.MOV(Wscratch1, 0x010001DF); + code.AND(Wscratch1, Wcpsr, Wscratch1); + + static_assert(offsetof(A32JitState, cpsr_jaifm) + sizeof(u32) == offsetof(A32JitState, cpsr_ge)); + code.STP(Wscratch1, Wscratch0, Xstate, offsetof(A32JitState, cpsr_jaifm)); + + // IT state + code.AND(Wscratch0, Wcpsr, 0xFC00); + code.LSR(Wscratch1, Wcpsr, 17); + code.AND(Wscratch1, Wscratch1, 0x300); + code.ORR(Wscratch0, Wscratch0, Wscratch1); + + // E flag, T flag + code.LSR(Wscratch1, Wcpsr, 8); + code.AND(Wscratch1, Wscratch1, 0x2); + code.ORR(Wscratch0, Wscratch0, Wscratch1); + code.LDR(Wscratch1, Xstate, offsetof(A32JitState, upper_location_descriptor)); + code.BFXIL(Wscratch0, Wcpsr, 5, 1); + code.AND(Wscratch1, Wscratch1, 0xFFFF0000); + code.ORR(Wscratch0, Wscratch0, Wscratch1); + code.STR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor)); +} + +template<> +void EmitIR<IR::Opcode::A32SetCpsrNZCV>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Wnzcv = ctx.reg_alloc.ReadW(args[0]); + RegAlloc::Realize(Wnzcv); + + code.STR(Wnzcv, Xstate, offsetof(A32JitState, cpsr_nzcv)); +} + +template<> +void EmitIR<IR::Opcode::A32SetCpsrNZCVRaw>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Wnzcv = ctx.reg_alloc.ReadW(args[0]); + RegAlloc::Realize(Wnzcv); + + code.STR(Wnzcv, Xstate, offsetof(A32JitState, cpsr_nzcv)); +} + +template<> +void EmitIR<IR::Opcode::A32SetCpsrNZCVQ>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Wnzcv = ctx.reg_alloc.ReadW(args[0]); + RegAlloc::Realize(Wnzcv); + + static_assert(offsetof(A32JitState, cpsr_nzcv) + sizeof(u32) == offsetof(A32JitState, cpsr_q)); + + code.AND(Wscratch0, Wnzcv, 0xf000'0000); + code.AND(Wscratch1, Wnzcv, 0x0800'0000); + code.STP(Wscratch0, Wscratch1, Xstate, offsetof(A32JitState, cpsr_nzcv)); +} + +template<> +void EmitIR<IR::Opcode::A32SetCpsrNZ>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + auto Wnz = ctx.reg_alloc.ReadW(args[0]); + RegAlloc::Realize(Wnz); + + // TODO: Track latent value + + code.LDR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_nzcv)); + code.AND(Wscratch0, Wscratch0, 0x30000000); + code.ORR(Wscratch0, Wscratch0, Wnz); + code.STR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_nzcv)); +} + +template<> +void EmitIR<IR::Opcode::A32SetCpsrNZC>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + // TODO: Track latent value + + if (args[0].IsImmediate()) { + if (args[1].IsImmediate()) { + const u32 carry = args[1].GetImmediateU1() ? 
0x2000'0000 : 0; + + code.LDR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_nzcv)); + code.AND(Wscratch0, Wscratch0, 0x10000000); + if (carry) { + code.ORR(Wscratch0, Wscratch0, carry); + } + code.STR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_nzcv)); + } else { + auto Wc = ctx.reg_alloc.ReadW(args[1]); + RegAlloc::Realize(Wc); + + code.LDR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_nzcv)); + code.AND(Wscratch0, Wscratch0, 0x10000000); + code.ORR(Wscratch0, Wscratch0, Wc); + code.STR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_nzcv)); + } + } else { + if (args[1].IsImmediate()) { + const u32 carry = args[1].GetImmediateU1() ? 0x2000'0000 : 0; + auto Wnz = ctx.reg_alloc.ReadW(args[0]); + RegAlloc::Realize(Wnz); + + code.LDR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_nzcv)); + code.AND(Wscratch0, Wscratch0, 0x10000000); + code.ORR(Wscratch0, Wscratch0, Wnz); + if (carry) { + code.ORR(Wscratch0, Wscratch0, carry); + } + code.STR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_nzcv)); + } else { + auto Wnz = ctx.reg_alloc.ReadW(args[0]); + auto Wc = ctx.reg_alloc.ReadW(args[1]); + RegAlloc::Realize(Wnz, Wc); + + code.LDR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_nzcv)); + code.AND(Wscratch0, Wscratch0, 0x10000000); + code.ORR(Wscratch0, Wscratch0, Wnz); + code.ORR(Wscratch0, Wscratch0, Wc); + code.STR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_nzcv)); + } + } +} + +template<> +void EmitIR<IR::Opcode::A32GetCFlag>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto Wflag = ctx.reg_alloc.WriteW(inst); + RegAlloc::Realize(Wflag); + + code.LDR(Wflag, Xstate, offsetof(A32JitState, cpsr_nzcv)); + code.AND(Wflag, Wflag, 1 << 29); +} + +template<> +void EmitIR<IR::Opcode::A32OrQFlag>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Wflag = ctx.reg_alloc.ReadW(args[0]); + RegAlloc::Realize(Wflag); + + code.LDR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_q)); + code.ORR(Wscratch0, Wscratch0, Wflag, LSL, 27); + code.STR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_q)); +} + +template<> +void EmitIR<IR::Opcode::A32GetGEFlags>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto Snzcv = ctx.reg_alloc.WriteS(inst); + RegAlloc::Realize(Snzcv); + + code.LDR(Snzcv, Xstate, offsetof(A32JitState, cpsr_ge)); +} + +template<> +void EmitIR<IR::Opcode::A32SetGEFlags>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + auto Snzcv = ctx.reg_alloc.ReadS(args[0]); + RegAlloc::Realize(Snzcv); + + code.STR(Snzcv, Xstate, offsetof(A32JitState, cpsr_ge)); +} + +template<> +void EmitIR<IR::Opcode::A32SetGEFlagsCompressed>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Wge = ctx.reg_alloc.ReadW(args[0]); + RegAlloc::Realize(Wge); + + code.LSR(Wscratch0, Wge, 16); + code.MOV(Wscratch1, 0x00204081); + code.MUL(Wscratch0, Wscratch0, Wscratch1); + code.AND(Wscratch0, Wscratch0, 0x01010101); + code.LSL(Wscratch1, Wscratch0, 8); + code.SUB(Wscratch0, Wscratch1, Wscratch0); + code.STR(Wscratch0, Xstate, offsetof(A32JitState, cpsr_ge)); +} + +template<> +void EmitIR<IR::Opcode::A32BXWritePC>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const u32 upper_without_t = (A32::LocationDescriptor{ctx.block.EndLocation()}.SetSingleStepping(false).UniqueHash() >> 32) & 0xFFFFFFFE; + + 
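+    // upper_without_t is the upper half of the block's end location descriptor with the
+    // T bit cleared. Bit 0 of the branch target selects the instruction set (1 = Thumb,
+    // 0 = ARM): the PC is masked to the required alignment and the T bit is re-inserted
+    // into the upper descriptor, so regs[15] and upper_location_descriptor (adjacent per
+    // the static_assert below) can be written with a single 64-bit store.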
static_assert(offsetof(A32JitState, regs) + 16 * sizeof(u32) == offsetof(A32JitState, upper_location_descriptor)); + + if (args[0].IsImmediate()) { + const u32 new_pc = args[0].GetImmediateU32(); + const u32 mask = mcl::bit::get_bit<0>(new_pc) ? 0xFFFFFFFE : 0xFFFFFFFC; + const u32 new_upper = upper_without_t | (mcl::bit::get_bit<0>(new_pc) ? 1 : 0); + + code.MOV(Xscratch0, (u64{new_upper} << 32) | (new_pc & mask)); + code.STUR(Xscratch0, Xstate, offsetof(A32JitState, regs) + 15 * sizeof(u32)); + } else { + auto Wpc = ctx.reg_alloc.ReadW(args[0]); + RegAlloc::Realize(Wpc); + ctx.reg_alloc.SpillFlags(); + + code.ANDS(Wscratch0, Wpc, 1); + code.MOV(Wscratch1, 3); + code.CSEL(Wscratch1, Wscratch0, Wscratch1, NE); + code.BIC(Wscratch1, Wpc, Wscratch1); + code.MOV(Wscratch0, upper_without_t); + code.CINC(Wscratch0, Wscratch0, NE); + code.STP(Wscratch1, Wscratch0, Xstate, offsetof(A32JitState, regs) + 15 * sizeof(u32)); + } +} + +template<> +void EmitIR<IR::Opcode::A32UpdateUpperLocationDescriptor>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst*) { + for (auto& inst : ctx.block) { + if (inst.GetOpcode() == IR::Opcode::A32BXWritePC) { + return; + } + } + EmitSetUpperLocationDescriptor(code, ctx, ctx.block.EndLocation(), ctx.block.Location()); +} + +template<> +void EmitIR<IR::Opcode::A32CallSupervisor>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.PrepareForCall(); + + if (ctx.conf.enable_cycle_counting) { + code.LDR(X1, SP, offsetof(StackLayout, cycles_to_run)); + code.SUB(X1, X1, Xticks); + EmitRelocation(code, ctx, LinkTarget::AddTicks); + } + + code.MOV(W1, args[0].GetImmediateU32()); + EmitRelocation(code, ctx, LinkTarget::CallSVC); + + if (ctx.conf.enable_cycle_counting) { + EmitRelocation(code, ctx, LinkTarget::GetTicksRemaining); + code.STR(X0, SP, offsetof(StackLayout, cycles_to_run)); + code.MOV(Xticks, X0); + } +} + +template<> +void EmitIR<IR::Opcode::A32ExceptionRaised>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.PrepareForCall(); + + if (ctx.conf.enable_cycle_counting) { + code.LDR(X1, SP, offsetof(StackLayout, cycles_to_run)); + code.SUB(X1, X1, Xticks); + EmitRelocation(code, ctx, LinkTarget::AddTicks); + } + + code.MOV(W1, args[0].GetImmediateU32()); + code.MOV(W2, args[1].GetImmediateU32()); + EmitRelocation(code, ctx, LinkTarget::ExceptionRaised); + + if (ctx.conf.enable_cycle_counting) { + EmitRelocation(code, ctx, LinkTarget::GetTicksRemaining); + code.STR(X0, SP, offsetof(StackLayout, cycles_to_run)); + code.MOV(Xticks, X0); + } +} + +template<> +void EmitIR<IR::Opcode::A32DataSynchronizationBarrier>(oaknut::CodeGenerator& code, EmitContext&, IR::Inst*) { + code.DSB(oaknut::BarrierOp::SY); +} + +template<> +void EmitIR<IR::Opcode::A32DataMemoryBarrier>(oaknut::CodeGenerator& code, EmitContext&, IR::Inst*) { + code.DMB(oaknut::BarrierOp::SY); +} + +template<> +void EmitIR<IR::Opcode::A32InstructionSynchronizationBarrier>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst*) { + if (!ctx.conf.hook_isb) { + return; + } + + ctx.reg_alloc.PrepareForCall(); + EmitRelocation(code, ctx, LinkTarget::InstructionSynchronizationBarrierRaised); +} + +template<> +void EmitIR<IR::Opcode::A32GetFpscr>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto Wfpscr = ctx.reg_alloc.WriteW(inst); + RegAlloc::Realize(Wfpscr); + ctx.fpsr.Spill(); + + static_assert(offsetof(A32JitState, fpsr) + 
sizeof(u32) == offsetof(A32JitState, fpsr_nzcv)); + + code.LDR(Wfpscr, Xstate, offsetof(A32JitState, upper_location_descriptor)); + code.LDP(Wscratch0, Wscratch1, Xstate, offsetof(A32JitState, fpsr)); + code.AND(Wfpscr, Wfpscr, 0xffff'0000); + code.ORR(Wscratch0, Wscratch0, Wscratch1); + code.ORR(Wfpscr, Wfpscr, Wscratch0); +} + +template<> +void EmitIR<IR::Opcode::A32SetFpscr>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Wfpscr = ctx.reg_alloc.ReadW(args[0]); + RegAlloc::Realize(Wfpscr); + ctx.fpsr.Overwrite(); + + static_assert(offsetof(A32JitState, fpsr) + sizeof(u32) == offsetof(A32JitState, fpsr_nzcv)); + + code.LDR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor)); + code.MOV(Wscratch1, 0x07f7'0000); + code.AND(Wscratch1, Wfpscr, Wscratch1); + code.AND(Wscratch0, Wscratch0, 0x0000'ffff); + code.ORR(Wscratch0, Wscratch0, Wscratch1); + code.STR(Wscratch0, Xstate, offsetof(A32JitState, upper_location_descriptor)); + + code.MOV(Wscratch0, 0x0800'009f); + code.AND(Wscratch0, Wfpscr, Wscratch0); + code.AND(Wscratch1, Wfpscr, 0xf000'0000); + code.STP(Wscratch0, Wscratch1, Xstate, offsetof(A32JitState, fpsr)); +} + +template<> +void EmitIR<IR::Opcode::A32GetFpscrNZCV>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto Wnzcv = ctx.reg_alloc.WriteW(inst); + RegAlloc::Realize(Wnzcv); + + code.LDR(Wnzcv, Xstate, offsetof(A32JitState, fpsr_nzcv)); +} + +template<> +void EmitIR<IR::Opcode::A32SetFpscrNZCV>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Wnzcv = ctx.reg_alloc.ReadW(args[0]); + RegAlloc::Realize(Wnzcv); + + code.STR(Wnzcv, Xstate, offsetof(A32JitState, fpsr_nzcv)); +} + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a32_coprocessor.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a32_coprocessor.cpp new file mode 100644 index 0000000000..5115fbbbdb --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a32_coprocessor.cpp @@ -0,0 +1,299 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <oaknut/oaknut.hpp> + +#include "dynarmic/backend/arm64/a32_jitstate.h" +#include "dynarmic/backend/arm64/abi.h" +#include "dynarmic/backend/arm64/emit_arm64.h" +#include "dynarmic/backend/arm64/emit_context.h" +#include "dynarmic/backend/arm64/reg_alloc.h" +#include "dynarmic/interface/A32/coprocessor.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" + +namespace Dynarmic::Backend::Arm64 { + +using namespace oaknut::util; + +static void EmitCoprocessorException() { + ASSERT_FALSE("Should raise coproc exception here"); +} + +static void CallCoprocCallback(oaknut::CodeGenerator& code, EmitContext& ctx, A32::Coprocessor::Callback callback, IR::Inst* inst = nullptr, std::optional<Argument::copyable_reference> arg0 = {}, std::optional<Argument::copyable_reference> arg1 = {}) { + ctx.reg_alloc.PrepareForCall({}, arg0, arg1); + + if (callback.user_arg) { + code.MOV(X0, reinterpret_cast<u64>(*callback.user_arg)); + } + + code.MOV(Xscratch0, reinterpret_cast<u64>(callback.function)); + code.BLR(Xscratch0); + + if (inst) { + ctx.reg_alloc.DefineAsRegister(inst, X0); + } +} + +template<> +void EmitIR<IR::Opcode::A32CoprocInternalOperation>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + const size_t coproc_num = coproc_info[0]; + const bool two = coproc_info[1] != 0; + const auto opc1 = static_cast<unsigned>(coproc_info[2]); + const auto CRd = static_cast<A32::CoprocReg>(coproc_info[3]); + const auto CRn = static_cast<A32::CoprocReg>(coproc_info[4]); + const auto CRm = static_cast<A32::CoprocReg>(coproc_info[5]); + const auto opc2 = static_cast<unsigned>(coproc_info[6]); + + std::shared_ptr<A32::Coprocessor> coproc = ctx.conf.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + const auto action = coproc->CompileInternalOperation(two, opc1, CRd, CRn, CRm, opc2); + if (!action) { + EmitCoprocessorException(); + return; + } + + CallCoprocCallback(code, ctx, *action); +} + +template<> +void EmitIR<IR::Opcode::A32CoprocSendOneWord>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + const size_t coproc_num = coproc_info[0]; + const bool two = coproc_info[1] != 0; + const auto opc1 = static_cast<unsigned>(coproc_info[2]); + const auto CRn = static_cast<A32::CoprocReg>(coproc_info[3]); + const auto CRm = static_cast<A32::CoprocReg>(coproc_info[4]); + const auto opc2 = static_cast<unsigned>(coproc_info[5]); + + std::shared_ptr<A32::Coprocessor> coproc = ctx.conf.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + const auto action = coproc->CompileSendOneWord(two, opc1, CRn, CRm, opc2); + + if (std::holds_alternative<std::monostate>(action)) { + EmitCoprocessorException(); + return; + } + + if (const auto cb = std::get_if<A32::Coprocessor::Callback>(&action)) { + CallCoprocCallback(code, ctx, *cb, nullptr, args[1]); + return; + } + + if (const auto destination_ptr = std::get_if<u32*>(&action)) { + auto Wvalue = ctx.reg_alloc.ReadW(args[1]); + RegAlloc::Realize(Wvalue); + + code.MOV(Xscratch0, reinterpret_cast<u64>(*destination_ptr)); + code.STR(Wvalue, Xscratch0); + + return; + } + + UNREACHABLE(); +} + +template<> +void 
EmitIR<IR::Opcode::A32CoprocSendTwoWords>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + const size_t coproc_num = coproc_info[0]; + const bool two = coproc_info[1] != 0; + const auto opc = static_cast<unsigned>(coproc_info[2]); + const auto CRm = static_cast<A32::CoprocReg>(coproc_info[3]); + + std::shared_ptr<A32::Coprocessor> coproc = ctx.conf.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + const auto action = coproc->CompileSendTwoWords(two, opc, CRm); + + if (std::holds_alternative<std::monostate>(action)) { + EmitCoprocessorException(); + return; + } + + if (const auto cb = std::get_if<A32::Coprocessor::Callback>(&action)) { + CallCoprocCallback(code, ctx, *cb, nullptr, args[1], args[2]); + return; + } + + if (const auto destination_ptrs = std::get_if<std::array<u32*, 2>>(&action)) { + auto Wvalue1 = ctx.reg_alloc.ReadW(args[1]); + auto Wvalue2 = ctx.reg_alloc.ReadW(args[2]); + RegAlloc::Realize(Wvalue1, Wvalue2); + + code.MOV(Xscratch0, reinterpret_cast<u64>((*destination_ptrs)[0])); + code.MOV(Xscratch1, reinterpret_cast<u64>((*destination_ptrs)[1])); + code.STR(Wvalue1, Xscratch0); + code.STR(Wvalue2, Xscratch1); + + return; + } + + UNREACHABLE(); +} + +template<> +void EmitIR<IR::Opcode::A32CoprocGetOneWord>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + + const size_t coproc_num = coproc_info[0]; + const bool two = coproc_info[1] != 0; + const auto opc1 = static_cast<unsigned>(coproc_info[2]); + const auto CRn = static_cast<A32::CoprocReg>(coproc_info[3]); + const auto CRm = static_cast<A32::CoprocReg>(coproc_info[4]); + const auto opc2 = static_cast<unsigned>(coproc_info[5]); + + std::shared_ptr<A32::Coprocessor> coproc = ctx.conf.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + const auto action = coproc->CompileGetOneWord(two, opc1, CRn, CRm, opc2); + + if (std::holds_alternative<std::monostate>(action)) { + EmitCoprocessorException(); + return; + } + + if (const auto cb = std::get_if<A32::Coprocessor::Callback>(&action)) { + CallCoprocCallback(code, ctx, *cb, inst); + return; + } + + if (const auto source_ptr = std::get_if<u32*>(&action)) { + auto Wvalue = ctx.reg_alloc.WriteW(inst); + RegAlloc::Realize(Wvalue); + + code.MOV(Xscratch0, reinterpret_cast<u64>(*source_ptr)); + code.LDR(Wvalue, Xscratch0); + + return; + } + + UNREACHABLE(); +} + +template<> +void EmitIR<IR::Opcode::A32CoprocGetTwoWords>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + const size_t coproc_num = coproc_info[0]; + const bool two = coproc_info[1] != 0; + const unsigned opc = coproc_info[2]; + const auto CRm = static_cast<A32::CoprocReg>(coproc_info[3]); + + std::shared_ptr<A32::Coprocessor> coproc = ctx.conf.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + auto action = coproc->CompileGetTwoWords(two, opc, CRm); + + if (std::holds_alternative<std::monostate>(action)) { + EmitCoprocessorException(); + return; + } + + if (const auto cb = std::get_if<A32::Coprocessor::Callback>(&action)) { + CallCoprocCallback(code, ctx, *cb, inst); + return; + } + + if (const auto source_ptrs = std::get_if<std::array<u32*, 2>>(&action)) { + auto Xvalue = ctx.reg_alloc.WriteX(inst); + 
RegAlloc::Realize(Xvalue); + + code.MOV(Xscratch0, reinterpret_cast<u64>((*source_ptrs)[0])); + code.MOV(Xscratch1, reinterpret_cast<u64>((*source_ptrs)[1])); + code.LDR(Xvalue, Xscratch0); + code.LDR(Wscratch1, Xscratch1); + code.BFI(Xvalue, Xscratch1, 32, 32); + + return; + } + + UNREACHABLE(); +} + +template<> +void EmitIR<IR::Opcode::A32CoprocLoadWords>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + const size_t coproc_num = coproc_info[0]; + const bool two = coproc_info[1] != 0; + const bool long_transfer = coproc_info[2] != 0; + const auto CRd = static_cast<A32::CoprocReg>(coproc_info[3]); + const bool has_option = coproc_info[4] != 0; + + std::optional<u8> option = std::nullopt; + if (has_option) { + option = coproc_info[5]; + } + + std::shared_ptr<A32::Coprocessor> coproc = ctx.conf.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + const auto action = coproc->CompileLoadWords(two, long_transfer, CRd, option); + if (!action) { + EmitCoprocessorException(); + return; + } + + CallCoprocCallback(code, ctx, *action, nullptr, args[1]); +} + +template<> +void EmitIR<IR::Opcode::A32CoprocStoreWords>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + const size_t coproc_num = coproc_info[0]; + const bool two = coproc_info[1] != 0; + const bool long_transfer = coproc_info[2] != 0; + const auto CRd = static_cast<A32::CoprocReg>(coproc_info[3]); + const bool has_option = coproc_info[4] != 0; + + std::optional<u8> option = std::nullopt; + if (has_option) { + option = coproc_info[5]; + } + + std::shared_ptr<A32::Coprocessor> coproc = ctx.conf.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + const auto action = coproc->CompileStoreWords(two, long_transfer, CRd, option); + if (!action) { + EmitCoprocessorException(); + return; + } + + CallCoprocCallback(code, ctx, *action, nullptr, args[1]); +} + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a32_memory.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a32_memory.cpp new file mode 100644 index 0000000000..9e8426e230 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a32_memory.cpp @@ -0,0 +1,107 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <oaknut/oaknut.hpp> + +#include "dynarmic/backend/arm64/a32_jitstate.h" +#include "dynarmic/backend/arm64/abi.h" +#include "dynarmic/backend/arm64/emit_arm64.h" +#include "dynarmic/backend/arm64/emit_arm64_memory.h" +#include "dynarmic/backend/arm64/emit_context.h" +#include "dynarmic/backend/arm64/reg_alloc.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" + +namespace Dynarmic::Backend::Arm64 { + +using namespace oaknut::util; + +template<> +void EmitIR<IR::Opcode::A32ClearExclusive>(oaknut::CodeGenerator& code, EmitContext&, IR::Inst*) { + code.STR(WZR, Xstate, offsetof(A32JitState, exclusive_state)); +} + +template<> +void EmitIR<IR::Opcode::A32ReadMemory8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitReadMemory<8>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A32ReadMemory16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitReadMemory<16>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A32ReadMemory32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitReadMemory<32>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A32ReadMemory64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitReadMemory<64>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A32ExclusiveReadMemory8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitExclusiveReadMemory<8>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A32ExclusiveReadMemory16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitExclusiveReadMemory<16>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A32ExclusiveReadMemory32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitExclusiveReadMemory<32>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A32ExclusiveReadMemory64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitExclusiveReadMemory<64>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A32WriteMemory8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitWriteMemory<8>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A32WriteMemory16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitWriteMemory<16>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A32WriteMemory32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitWriteMemory<32>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A32WriteMemory64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitWriteMemory<64>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A32ExclusiveWriteMemory8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitExclusiveWriteMemory<8>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A32ExclusiveWriteMemory16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitExclusiveWriteMemory<16>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A32ExclusiveWriteMemory32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitExclusiveWriteMemory<32>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A32ExclusiveWriteMemory64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitExclusiveWriteMemory<64>(code, ctx, inst); +} + +} // namespace 
Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a64.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a64.cpp new file mode 100644 index 0000000000..67fd91f26a --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a64.cpp @@ -0,0 +1,518 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/bit_cast.hpp> +#include <oaknut/oaknut.hpp> + +#include "dynarmic/backend/arm64/a64_jitstate.h" +#include "dynarmic/backend/arm64/abi.h" +#include "dynarmic/backend/arm64/emit_arm64.h" +#include "dynarmic/backend/arm64/emit_context.h" +#include "dynarmic/backend/arm64/reg_alloc.h" +#include "dynarmic/interface/halt_reason.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" + +namespace Dynarmic::Backend::Arm64 { + +using namespace oaknut::util; + +oaknut::Label EmitA64Cond(oaknut::CodeGenerator& code, EmitContext&, IR::Cond cond) { + oaknut::Label pass; + // TODO: Flags in host flags + code.LDR(Wscratch0, Xstate, offsetof(A64JitState, cpsr_nzcv)); + code.MSR(oaknut::SystemReg::NZCV, Xscratch0); + code.B(static_cast<oaknut::Cond>(cond), pass); + return pass; +} + +void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::Terminal terminal, IR::LocationDescriptor initial_location, bool is_single_step); + +void EmitA64Terminal(oaknut::CodeGenerator&, EmitContext&, IR::Term::Interpret, IR::LocationDescriptor, bool) { + ASSERT_FALSE("Interpret should never be emitted."); +} + +void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::ReturnToDispatch, IR::LocationDescriptor, bool) { + EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher); +} + +void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::LinkBlock terminal, IR::LocationDescriptor, bool is_single_step) { + oaknut::Label fail; + + if (ctx.conf.HasOptimization(OptimizationFlag::BlockLinking) && !is_single_step) { + if (ctx.conf.enable_cycle_counting) { + code.CMP(Xticks, 0); + code.B(LE, fail); + EmitBlockLinkRelocation(code, ctx, terminal.next, BlockRelocationType::Branch); + } else { + code.LDAR(Wscratch0, Xhalt); + code.CBNZ(Wscratch0, fail); + EmitBlockLinkRelocation(code, ctx, terminal.next, BlockRelocationType::Branch); + } + } + + code.l(fail); + code.MOV(Xscratch0, A64::LocationDescriptor{terminal.next}.PC()); + code.STR(Xscratch0, Xstate, offsetof(A64JitState, pc)); + EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher); +} + +void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::LinkBlockFast terminal, IR::LocationDescriptor, bool is_single_step) { + if (ctx.conf.HasOptimization(OptimizationFlag::BlockLinking) && !is_single_step) { + EmitBlockLinkRelocation(code, ctx, terminal.next, BlockRelocationType::Branch); + } + + code.MOV(Xscratch0, A64::LocationDescriptor{terminal.next}.PC()); + code.STR(Xscratch0, Xstate, offsetof(A64JitState, pc)); + EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher); +} + +void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::PopRSBHint, IR::LocationDescriptor, bool is_single_step) { + if (ctx.conf.HasOptimization(OptimizationFlag::ReturnStackBuffer) && !is_single_step) { + oaknut::Label fail; + + code.MOV(Wscratch0, A64::LocationDescriptor::fpcr_mask); + code.LDR(W0, Xstate, offsetof(A64JitState, fpcr)); + code.LDR(X1, Xstate, offsetof(A64JitState, pc)); + 
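+        // The masked FPCR and PC loaded above are combined into X0 below, reproducing the
+        // location descriptor hash ((FPCR & fpcr_mask) << fpcr_shift) | (PC & pc_mask).
+        // This is compared against the location stored in the most recent RSB entry; on a
+        // match, control transfers directly to the cached host code pointer.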
code.AND(W0, W0, Wscratch0); + code.AND(X1, X1, A64::LocationDescriptor::pc_mask); + code.LSL(X0, X0, A64::LocationDescriptor::fpcr_shift); + code.ORR(X0, X0, X1); + + code.LDR(Wscratch2, SP, offsetof(StackLayout, rsb_ptr)); + code.AND(Wscratch2, Wscratch2, RSBIndexMask); + code.ADD(X2, SP, Xscratch2); + code.SUB(Wscratch2, Wscratch2, sizeof(RSBEntry)); + code.STR(Wscratch2, SP, offsetof(StackLayout, rsb_ptr)); + + code.LDP(Xscratch0, Xscratch1, X2, offsetof(StackLayout, rsb)); + + code.CMP(X0, Xscratch0); + code.B(NE, fail); + code.BR(Xscratch1); + + code.l(fail); + } + + EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher); +} + +void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::FastDispatchHint, IR::LocationDescriptor, bool) { + EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher); + + // TODO: Implement FastDispatchHint optimization +} + +void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::If terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + oaknut::Label pass = EmitA64Cond(code, ctx, terminal.if_); + EmitA64Terminal(code, ctx, terminal.else_, initial_location, is_single_step); + code.l(pass); + EmitA64Terminal(code, ctx, terminal.then_, initial_location, is_single_step); +} + +void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::CheckBit terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + oaknut::Label fail; + code.LDRB(Wscratch0, SP, offsetof(StackLayout, check_bit)); + code.CBZ(Wscratch0, fail); + EmitA64Terminal(code, ctx, terminal.then_, initial_location, is_single_step); + code.l(fail); + EmitA64Terminal(code, ctx, terminal.else_, initial_location, is_single_step); +} + +void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + oaknut::Label fail; + code.LDAR(Wscratch0, Xhalt); + code.CBNZ(Wscratch0, fail); + EmitA64Terminal(code, ctx, terminal.else_, initial_location, is_single_step); + code.l(fail); + EmitRelocation(code, ctx, LinkTarget::ReturnToDispatcher); +} + +void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Term::Terminal terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + boost::apply_visitor([&](const auto& t) { EmitA64Terminal(code, ctx, t, initial_location, is_single_step); }, terminal); +} + +void EmitA64Terminal(oaknut::CodeGenerator& code, EmitContext& ctx) { + const A64::LocationDescriptor location{ctx.block.Location()}; + EmitA64Terminal(code, ctx, ctx.block.GetTerminal(), location.SetSingleStepping(false), location.SingleStepping()); +} + +void EmitA64ConditionFailedTerminal(oaknut::CodeGenerator& code, EmitContext& ctx) { + const A64::LocationDescriptor location{ctx.block.Location()}; + EmitA64Terminal(code, ctx, IR::Term::LinkBlock{ctx.block.ConditionFailedLocation()}, location.SetSingleStepping(false), location.SingleStepping()); +} + +void EmitA64CheckMemoryAbort(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, oaknut::Label& end) { + if (!ctx.conf.check_halt_on_memory_access) { + return; + } + + const A64::LocationDescriptor current_location{IR::LocationDescriptor{inst->GetArg(0).GetU64()}}; + + code.LDAR(Xscratch0, Xhalt); + code.TST(Xscratch0, static_cast<u32>(HaltReason::MemoryAbort)); + code.B(EQ, end); + code.MOV(Xscratch0, current_location.PC()); + code.STR(Xscratch0, Xstate, offsetof(A64JitState, pc)); + EmitRelocation(code, ctx, 
LinkTarget::ReturnFromRunCode); +} + +template<> +void EmitIR<IR::Opcode::A64SetCheckBit>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (args[0].IsImmediate()) { + if (args[0].GetImmediateU1()) { + code.MOV(Wscratch0, 1); + code.STRB(Wscratch0, SP, offsetof(StackLayout, check_bit)); + } else { + code.STRB(WZR, SP, offsetof(StackLayout, check_bit)); + } + } else { + auto Wbit = ctx.reg_alloc.ReadW(args[0]); + RegAlloc::Realize(Wbit); + code.STRB(Wbit, SP, offsetof(StackLayout, check_bit)); + } +} + +template<> +void EmitIR<IR::Opcode::A64GetCFlag>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto Wflag = ctx.reg_alloc.WriteW(inst); + RegAlloc::Realize(Wflag); + code.LDR(Wflag, Xstate, offsetof(A64JitState, cpsr_nzcv)); + code.AND(Wflag, Wflag, 1 << 29); +} + +template<> +void EmitIR<IR::Opcode::A64GetNZCVRaw>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto Wnzcv = ctx.reg_alloc.WriteW(inst); + RegAlloc::Realize(Wnzcv); + + code.LDR(Wnzcv, Xstate, offsetof(A64JitState, cpsr_nzcv)); +} + +template<> +void EmitIR<IR::Opcode::A64SetNZCVRaw>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Wnzcv = ctx.reg_alloc.ReadW(args[0]); + RegAlloc::Realize(Wnzcv); + + code.STR(Wnzcv, Xstate, offsetof(A64JitState, cpsr_nzcv)); +} + +template<> +void EmitIR<IR::Opcode::A64SetNZCV>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Wnzcv = ctx.reg_alloc.ReadW(args[0]); + RegAlloc::Realize(Wnzcv); + + code.STR(Wnzcv, Xstate, offsetof(A64JitState, cpsr_nzcv)); +} + +template<> +void EmitIR<IR::Opcode::A64GetW>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const A64::Reg reg = inst->GetArg(0).GetA64RegRef(); + + auto Wresult = ctx.reg_alloc.WriteW(inst); + RegAlloc::Realize(Wresult); + + // TODO: Detect if Gpr vs Fpr is more appropriate + + code.LDR(Wresult, Xstate, offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)); +} + +template<> +void EmitIR<IR::Opcode::A64GetX>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const A64::Reg reg = inst->GetArg(0).GetA64RegRef(); + + auto Xresult = ctx.reg_alloc.WriteX(inst); + RegAlloc::Realize(Xresult); + + // TODO: Detect if Gpr vs Fpr is more appropriate + + code.LDR(Xresult, Xstate, offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)); +} + +template<> +void EmitIR<IR::Opcode::A64GetS>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const A64::Vec vec = inst->GetArg(0).GetA64VecRef(); + auto Sresult = ctx.reg_alloc.WriteS(inst); + RegAlloc::Realize(Sresult); + code.LDR(Sresult, Xstate, offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)); +} + +template<> +void EmitIR<IR::Opcode::A64GetD>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const A64::Vec vec = inst->GetArg(0).GetA64VecRef(); + auto Dresult = ctx.reg_alloc.WriteD(inst); + RegAlloc::Realize(Dresult); + code.LDR(Dresult, Xstate, offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)); +} + +template<> +void EmitIR<IR::Opcode::A64GetQ>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const A64::Vec vec = inst->GetArg(0).GetA64VecRef(); + auto Qresult = ctx.reg_alloc.WriteQ(inst); + RegAlloc::Realize(Qresult); + code.LDR(Qresult, Xstate, offsetof(A64JitState, vec) + sizeof(u64) * 2 * 
static_cast<size_t>(vec)); +} + +template<> +void EmitIR<IR::Opcode::A64GetSP>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto Xresult = ctx.reg_alloc.WriteX(inst); + RegAlloc::Realize(Xresult); + + code.LDR(Xresult, Xstate, offsetof(A64JitState, sp)); +} + +template<> +void EmitIR<IR::Opcode::A64GetFPCR>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto Wresult = ctx.reg_alloc.WriteW(inst); + RegAlloc::Realize(Wresult); + + code.LDR(Wresult, Xstate, offsetof(A64JitState, fpcr)); +} + +template<> +void EmitIR<IR::Opcode::A64GetFPSR>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto Wresult = ctx.reg_alloc.WriteW(inst); + RegAlloc::Realize(Wresult); + + code.LDR(Wresult, Xstate, offsetof(A64JitState, fpsr)); +} + +template<> +void EmitIR<IR::Opcode::A64SetW>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const A64::Reg reg = inst->GetArg(0).GetA64RegRef(); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + auto Wvalue = ctx.reg_alloc.ReadW(args[1]); + RegAlloc::Realize(Wvalue); + + // TODO: Detect if Gpr vs Fpr is more appropriate + code.MOV(*Wvalue, Wvalue); + code.STR(Wvalue->toX(), Xstate, offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)); +} + +template<> +void EmitIR<IR::Opcode::A64SetX>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const A64::Reg reg = inst->GetArg(0).GetA64RegRef(); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + auto Xvalue = ctx.reg_alloc.ReadX(args[1]); + RegAlloc::Realize(Xvalue); + + // TODO: Detect if Gpr vs Fpr is more appropriate + + code.STR(Xvalue, Xstate, offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)); +} + +template<> +void EmitIR<IR::Opcode::A64SetS>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const A64::Vec vec = inst->GetArg(0).GetA64VecRef(); + auto Svalue = ctx.reg_alloc.ReadS(args[1]); + RegAlloc::Realize(Svalue); + + code.FMOV(Svalue, Svalue); + code.STR(Svalue->toQ(), Xstate, offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)); +} + +template<> +void EmitIR<IR::Opcode::A64SetD>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const A64::Vec vec = inst->GetArg(0).GetA64VecRef(); + auto Dvalue = ctx.reg_alloc.ReadD(args[1]); + RegAlloc::Realize(Dvalue); + + code.FMOV(Dvalue, Dvalue); + code.STR(Dvalue->toQ(), Xstate, offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)); +} + +template<> +void EmitIR<IR::Opcode::A64SetQ>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const A64::Vec vec = inst->GetArg(0).GetA64VecRef(); + auto Qvalue = ctx.reg_alloc.ReadQ(args[1]); + RegAlloc::Realize(Qvalue); + code.STR(Qvalue, Xstate, offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)); +} + +template<> +void EmitIR<IR::Opcode::A64SetSP>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Xvalue = ctx.reg_alloc.ReadX(args[0]); + RegAlloc::Realize(Xvalue); + code.STR(Xvalue, Xstate, offsetof(A64JitState, sp)); +} + +template<> +void EmitIR<IR::Opcode::A64SetFPCR>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Wvalue = ctx.reg_alloc.ReadW(args[0]); + RegAlloc::Realize(Wvalue); + 
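+    // Store the new value into the guest state and mirror it into the host FPCR system
+    // register, keeping host floating-point behaviour consistent with the guest's FPCR.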
code.STR(Wvalue, Xstate, offsetof(A64JitState, fpcr)); + code.MSR(oaknut::SystemReg::FPCR, Wvalue->toX()); +} + +template<> +void EmitIR<IR::Opcode::A64SetFPSR>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Wvalue = ctx.reg_alloc.ReadW(args[0]); + RegAlloc::Realize(Wvalue); + code.STR(Wvalue, Xstate, offsetof(A64JitState, fpsr)); + code.MSR(oaknut::SystemReg::FPSR, Wvalue->toX()); +} + +template<> +void EmitIR<IR::Opcode::A64SetPC>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Xvalue = ctx.reg_alloc.ReadX(args[0]); + RegAlloc::Realize(Xvalue); + code.STR(Xvalue, Xstate, offsetof(A64JitState, pc)); +} + +template<> +void EmitIR<IR::Opcode::A64CallSupervisor>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.PrepareForCall(); + + if (ctx.conf.enable_cycle_counting) { + code.LDR(X1, SP, offsetof(StackLayout, cycles_to_run)); + code.SUB(X1, X1, Xticks); + EmitRelocation(code, ctx, LinkTarget::AddTicks); + } + + code.MOV(W1, args[0].GetImmediateU32()); + EmitRelocation(code, ctx, LinkTarget::CallSVC); + + if (ctx.conf.enable_cycle_counting) { + EmitRelocation(code, ctx, LinkTarget::GetTicksRemaining); + code.STR(X0, SP, offsetof(StackLayout, cycles_to_run)); + code.MOV(Xticks, X0); + } +} + +template<> +void EmitIR<IR::Opcode::A64ExceptionRaised>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.PrepareForCall(); + + if (ctx.conf.enable_cycle_counting) { + code.LDR(X1, SP, offsetof(StackLayout, cycles_to_run)); + code.SUB(X1, X1, Xticks); + EmitRelocation(code, ctx, LinkTarget::AddTicks); + } + + code.MOV(X1, args[0].GetImmediateU64()); + code.MOV(X2, args[1].GetImmediateU64()); + EmitRelocation(code, ctx, LinkTarget::ExceptionRaised); + + if (ctx.conf.enable_cycle_counting) { + EmitRelocation(code, ctx, LinkTarget::GetTicksRemaining); + code.STR(X0, SP, offsetof(StackLayout, cycles_to_run)); + code.MOV(Xticks, X0); + } +} + +template<> +void EmitIR<IR::Opcode::A64DataCacheOperationRaised>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.PrepareForCall({}, args[1], args[2]); + EmitRelocation(code, ctx, LinkTarget::DataCacheOperationRaised); +} + +template<> +void EmitIR<IR::Opcode::A64InstructionCacheOperationRaised>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.PrepareForCall({}, args[0], args[1]); + EmitRelocation(code, ctx, LinkTarget::InstructionCacheOperationRaised); +} + +template<> +void EmitIR<IR::Opcode::A64DataSynchronizationBarrier>(oaknut::CodeGenerator& code, EmitContext&, IR::Inst*) { + code.DSB(oaknut::BarrierOp::SY); +} + +template<> +void EmitIR<IR::Opcode::A64DataMemoryBarrier>(oaknut::CodeGenerator& code, EmitContext&, IR::Inst*) { + code.DMB(oaknut::BarrierOp::SY); +} + +template<> +void EmitIR<IR::Opcode::A64InstructionSynchronizationBarrier>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst*) { + if (!ctx.conf.hook_isb) { + return; + } + + ctx.reg_alloc.PrepareForCall(); + EmitRelocation(code, ctx, LinkTarget::InstructionSynchronizationBarrierRaised); +} + +template<> +void EmitIR<IR::Opcode::A64GetCNTFRQ>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto Xvalue = 
ctx.reg_alloc.WriteX(inst); + RegAlloc::Realize(Xvalue); + code.MOV(Xvalue, ctx.conf.cntfreq_el0); +} + +template<> +void EmitIR<IR::Opcode::A64GetCNTPCT>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + ctx.reg_alloc.PrepareForCall(); + if (!ctx.conf.wall_clock_cntpct && ctx.conf.enable_cycle_counting) { + code.LDR(X1, SP, offsetof(StackLayout, cycles_to_run)); + code.SUB(X1, X1, Xticks); + EmitRelocation(code, ctx, LinkTarget::AddTicks); + EmitRelocation(code, ctx, LinkTarget::GetTicksRemaining); + code.STR(X0, SP, offsetof(StackLayout, cycles_to_run)); + code.MOV(Xticks, X0); + } + EmitRelocation(code, ctx, LinkTarget::GetCNTPCT); + ctx.reg_alloc.DefineAsRegister(inst, X0); +} + +template<> +void EmitIR<IR::Opcode::A64GetCTR>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto Wvalue = ctx.reg_alloc.WriteW(inst); + RegAlloc::Realize(Wvalue); + code.MOV(Wvalue, ctx.conf.ctr_el0); +} + +template<> +void EmitIR<IR::Opcode::A64GetDCZID>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto Wvalue = ctx.reg_alloc.WriteW(inst); + RegAlloc::Realize(Wvalue); + code.MOV(Wvalue, ctx.conf.dczid_el0); +} + +template<> +void EmitIR<IR::Opcode::A64GetTPIDR>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto Xvalue = ctx.reg_alloc.WriteX(inst); + RegAlloc::Realize(Xvalue); + code.MOV(Xscratch0, mcl::bit_cast<u64>(ctx.conf.tpidr_el0)); + code.LDR(Xvalue, Xscratch0); +} + +template<> +void EmitIR<IR::Opcode::A64GetTPIDRRO>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto Xvalue = ctx.reg_alloc.WriteX(inst); + RegAlloc::Realize(Xvalue); + code.MOV(Xscratch0, mcl::bit_cast<u64>(ctx.conf.tpidrro_el0)); + code.LDR(Xvalue, Xscratch0); +} + +template<> +void EmitIR<IR::Opcode::A64SetTPIDR>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Xvalue = ctx.reg_alloc.ReadX(args[0]); + RegAlloc::Realize(Xvalue); + code.MOV(Xscratch0, mcl::bit_cast<u64>(ctx.conf.tpidr_el0)); + code.STR(Xvalue, Xscratch0); +} + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a64_memory.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a64_memory.cpp new file mode 100644 index 0000000000..099c81a4c3 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_a64_memory.cpp @@ -0,0 +1,128 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <oaknut/oaknut.hpp> + +#include "dynarmic/backend/arm64/a64_jitstate.h" +#include "dynarmic/backend/arm64/abi.h" +#include "dynarmic/backend/arm64/emit_arm64.h" +#include "dynarmic/backend/arm64/emit_arm64_memory.h" +#include "dynarmic/backend/arm64/emit_context.h" +#include "dynarmic/backend/arm64/reg_alloc.h" +#include "dynarmic/ir/acc_type.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" + +namespace Dynarmic::Backend::Arm64 { + +using namespace oaknut::util; + +template<> +void EmitIR<IR::Opcode::A64ClearExclusive>(oaknut::CodeGenerator& code, EmitContext&, IR::Inst*) { + code.STR(WZR, Xstate, offsetof(A64JitState, exclusive_state)); +} + +template<> +void EmitIR<IR::Opcode::A64ReadMemory8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitReadMemory<8>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A64ReadMemory16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitReadMemory<16>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A64ReadMemory32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitReadMemory<32>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A64ReadMemory64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitReadMemory<64>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A64ReadMemory128>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitReadMemory<128>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A64ExclusiveReadMemory8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitExclusiveReadMemory<8>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A64ExclusiveReadMemory16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitExclusiveReadMemory<16>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A64ExclusiveReadMemory32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitExclusiveReadMemory<32>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A64ExclusiveReadMemory64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitExclusiveReadMemory<64>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A64ExclusiveReadMemory128>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitExclusiveReadMemory<128>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A64WriteMemory8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitWriteMemory<8>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A64WriteMemory16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitWriteMemory<16>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A64WriteMemory32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitWriteMemory<32>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A64WriteMemory64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitWriteMemory<64>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A64WriteMemory128>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitWriteMemory<128>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A64ExclusiveWriteMemory8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitExclusiveWriteMemory<8>(code, ctx, inst); +} + +template<> +void 
EmitIR<IR::Opcode::A64ExclusiveWriteMemory16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitExclusiveWriteMemory<16>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A64ExclusiveWriteMemory32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitExclusiveWriteMemory<32>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A64ExclusiveWriteMemory64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitExclusiveWriteMemory<64>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::A64ExclusiveWriteMemory128>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitExclusiveWriteMemory<128>(code, ctx, inst); +} + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_cryptography.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_cryptography.cpp new file mode 100644 index 0000000000..b0a3cb4817 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_cryptography.cpp @@ -0,0 +1,166 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <oaknut/oaknut.hpp> + +#include "dynarmic/backend/arm64/a32_jitstate.h" +#include "dynarmic/backend/arm64/abi.h" +#include "dynarmic/backend/arm64/emit_arm64.h" +#include "dynarmic/backend/arm64/emit_context.h" +#include "dynarmic/backend/arm64/reg_alloc.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" + +namespace Dynarmic::Backend::Arm64 { + +using namespace oaknut::util; + +template<size_t bitsize, typename EmitFn> +static void EmitCRC(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit_fn) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + auto Woutput = ctx.reg_alloc.WriteW(inst); + auto Winput = ctx.reg_alloc.ReadW(args[0]); + auto Rdata = ctx.reg_alloc.ReadReg<bitsize>(args[1]); + RegAlloc::Realize(Woutput, Winput, Rdata); + + emit_fn(Woutput, Winput, Rdata); +} + +template<> +void EmitIR<IR::Opcode::CRC32Castagnoli8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitCRC<32>(code, ctx, inst, [&](auto& Woutput, auto& Winput, auto& Wdata) { code.CRC32CB(Woutput, Winput, Wdata); }); +} + +template<> +void EmitIR<IR::Opcode::CRC32Castagnoli16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitCRC<32>(code, ctx, inst, [&](auto& Woutput, auto& Winput, auto& Wdata) { code.CRC32CH(Woutput, Winput, Wdata); }); +} + +template<> +void EmitIR<IR::Opcode::CRC32Castagnoli32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitCRC<32>(code, ctx, inst, [&](auto& Woutput, auto& Winput, auto& Wdata) { code.CRC32CW(Woutput, Winput, Wdata); }); +} + +template<> +void EmitIR<IR::Opcode::CRC32Castagnoli64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitCRC<64>(code, ctx, inst, [&](auto& Woutput, auto& Winput, auto& Xdata) { code.CRC32CX(Woutput, Winput, Xdata); }); +} + +template<> +void EmitIR<IR::Opcode::CRC32ISO8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitCRC<32>(code, ctx, inst, [&](auto& Woutput, auto& Winput, auto& Wdata) { code.CRC32B(Woutput, Winput, Wdata); }); +} + +template<> +void EmitIR<IR::Opcode::CRC32ISO16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitCRC<32>(code, ctx, inst, [&](auto& Woutput, auto& Winput, auto& Wdata) { code.CRC32H(Woutput, Winput, Wdata); }); +} + 
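+// The CRC32{B,H,W,X} and CRC32C{B,H,W,X} instructions emitted by these handlers map the
+// IR's CRC32ISO* and CRC32Castagnoli* opcodes onto the ARMv8 CRC32 extension: the ISO
+// variant uses the IEEE 802.3 polynomial 0x04C11DB7, the Castagnoli variant 0x1EDC6F41,
+// both computed over bit-reflected data with no implicit initial/final inversion (guest
+// code performs those itself). As a rough single-byte software reference (a sketch only;
+// the helper name and the reflected constants 0xEDB88320 / 0x82F63B78 are illustrative,
+// not part of this codebase):
+//
+//     u32 crc32_byte(u32 acc, u8 data, u32 poly_reflected) {
+//         acc ^= data;                       // fold the next byte into the accumulator
+//         for (int i = 0; i < 8; ++i) {
+//             acc = (acc & 1) ? ((acc >> 1) ^ poly_reflected) : (acc >> 1);
+//         }
+//         return acc;                        // matches one CRC32B / CRC32CB step
+//     }
+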
+template<> +void EmitIR<IR::Opcode::CRC32ISO32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitCRC<32>(code, ctx, inst, [&](auto& Woutput, auto& Winput, auto& Wdata) { code.CRC32W(Woutput, Winput, Wdata); }); +} + +template<> +void EmitIR<IR::Opcode::CRC32ISO64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitCRC<64>(code, ctx, inst, [&](auto& Woutput, auto& Winput, auto& Xdata) { code.CRC32X(Woutput, Winput, Xdata); }); +} + +template<> +void EmitIR<IR::Opcode::AESDecryptSingleRound>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Qoutput = ctx.reg_alloc.WriteQ(inst); + auto Qinput = ctx.reg_alloc.ReadQ(args[0]); + RegAlloc::Realize(Qoutput, Qinput); + + code.MOVI(Qoutput->toD(), oaknut::RepImm{0}); + code.AESD(Qoutput->B16(), Qinput->B16()); +} + +template<> +void EmitIR<IR::Opcode::AESEncryptSingleRound>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Qoutput = ctx.reg_alloc.WriteQ(inst); + auto Qinput = ctx.reg_alloc.ReadQ(args[0]); + RegAlloc::Realize(Qoutput, Qinput); + + code.MOVI(Qoutput->toD(), oaknut::RepImm{0}); + code.AESE(Qoutput->B16(), Qinput->B16()); +} + +template<> +void EmitIR<IR::Opcode::AESInverseMixColumns>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Qoutput = ctx.reg_alloc.WriteQ(inst); + auto Qinput = ctx.reg_alloc.ReadQ(args[0]); + RegAlloc::Realize(Qoutput, Qinput); + + code.AESIMC(Qoutput->B16(), Qinput->B16()); +} + +template<> +void EmitIR<IR::Opcode::AESMixColumns>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Qoutput = ctx.reg_alloc.WriteQ(inst); + auto Qinput = ctx.reg_alloc.ReadQ(args[0]); + RegAlloc::Realize(Qoutput, Qinput); + + code.AESMC(Qoutput->B16(), Qinput->B16()); +} + +template<> +void EmitIR<IR::Opcode::SM4AccessSubstitutionBox>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::SHA256Hash>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool part1 = args[3].GetImmediateU1(); + + if (part1) { + auto Qx = ctx.reg_alloc.ReadWriteQ(args[0], inst); + auto Qy = ctx.reg_alloc.ReadQ(args[1]); + auto Qz = ctx.reg_alloc.ReadQ(args[2]); + RegAlloc::Realize(Qx, Qy, Qz); + + code.SHA256H(Qx, Qy, Qz->S4()); + } else { + auto Qx = ctx.reg_alloc.ReadQ(args[0]); + auto Qy = ctx.reg_alloc.ReadWriteQ(args[1], inst); + auto Qz = ctx.reg_alloc.ReadQ(args[2]); + RegAlloc::Realize(Qx, Qy, Qz); + + code.SHA256H2(Qy, Qx, Qz->S4()); // Yes x and y are swapped + } +} + +template<> +void EmitIR<IR::Opcode::SHA256MessageSchedule0>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Qa = ctx.reg_alloc.ReadWriteQ(args[0], inst); + auto Qb = ctx.reg_alloc.ReadQ(args[1]); + RegAlloc::Realize(Qa, Qb); + + code.SHA256SU0(Qa->S4(), Qb->S4()); +} + +template<> +void EmitIR<IR::Opcode::SHA256MessageSchedule1>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Qa = ctx.reg_alloc.ReadWriteQ(args[0], inst); + auto Qb = ctx.reg_alloc.ReadQ(args[1]); + auto Qc = ctx.reg_alloc.ReadQ(args[2]); + 
RegAlloc::Realize(Qa, Qb, Qc); + + code.SHA256SU1(Qa->S4(), Qb->S4(), Qc->S4()); +} + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_data_processing.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_data_processing.cpp new file mode 100644 index 0000000000..04395148c8 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_data_processing.cpp @@ -0,0 +1,1523 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <cstddef> + +#include <fmt/ostream.h> +#include <oaknut/oaknut.hpp> + +#include "dynarmic/backend/arm64/a32_jitstate.h" +#include "dynarmic/backend/arm64/abi.h" +#include "dynarmic/backend/arm64/emit_arm64.h" +#include "dynarmic/backend/arm64/emit_context.h" +#include "dynarmic/backend/arm64/reg_alloc.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" + +namespace Dynarmic::Backend::Arm64 { + +using namespace oaknut::util; + +template<size_t bitsize, typename EmitFn> +static void EmitTwoOp(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + auto Rresult = ctx.reg_alloc.WriteReg<bitsize>(inst); + auto Roperand = ctx.reg_alloc.ReadReg<bitsize>(args[0]); + RegAlloc::Realize(Rresult, Roperand); + + emit(Rresult, Roperand); +} + +template<size_t bitsize, typename EmitFn> +static void EmitThreeOp(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + auto Rresult = ctx.reg_alloc.WriteReg<bitsize>(inst); + auto Ra = ctx.reg_alloc.ReadReg<bitsize>(args[0]); + auto Rb = ctx.reg_alloc.ReadReg<bitsize>(args[1]); + RegAlloc::Realize(Rresult, Ra, Rb); + + emit(Rresult, Ra, Rb); +} + +template<> +void EmitIR<IR::Opcode::Pack2x32To1x64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + auto Wlo = ctx.reg_alloc.ReadW(args[0]); + auto Whi = ctx.reg_alloc.ReadW(args[1]); + auto Xresult = ctx.reg_alloc.WriteX(inst); + RegAlloc::Realize(Wlo, Whi, Xresult); + + code.MOV(Xresult->toW(), Wlo); // TODO: Move eliminiation + code.BFI(Xresult, Whi->toX(), 32, 32); +} + +template<> +void EmitIR<IR::Opcode::Pack2x64To1x128>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (args[0].IsInGpr() && args[1].IsInGpr()) { + auto Xlo = ctx.reg_alloc.ReadX(args[0]); + auto Xhi = ctx.reg_alloc.ReadX(args[1]); + auto Qresult = ctx.reg_alloc.WriteQ(inst); + RegAlloc::Realize(Xlo, Xhi, Qresult); + + code.FMOV(Qresult->toD(), Xlo); + code.MOV(oaknut::VRegSelector{Qresult->index()}.D()[1], Xhi); + } else if (args[0].IsInGpr()) { + auto Xlo = ctx.reg_alloc.ReadX(args[0]); + auto Dhi = ctx.reg_alloc.ReadD(args[1]); + auto Qresult = ctx.reg_alloc.WriteQ(inst); + RegAlloc::Realize(Xlo, Dhi, Qresult); + + code.FMOV(Qresult->toD(), Xlo); + code.MOV(oaknut::VRegSelector{Qresult->index()}.D()[1], oaknut::VRegSelector{Dhi->index()}.D()[0]); + } else if (args[1].IsInGpr()) { + auto Dlo = ctx.reg_alloc.ReadD(args[0]); + auto Xhi = ctx.reg_alloc.ReadX(args[1]); + auto Qresult = ctx.reg_alloc.WriteQ(inst); + RegAlloc::Realize(Dlo, Xhi, Qresult); + + code.FMOV(Qresult->toD(), Dlo); // TODO: Move eliminiation + code.MOV(oaknut::VRegSelector{Qresult->index()}.D()[1], Xhi); + } else { + auto Dlo = 
ctx.reg_alloc.ReadD(args[0]); + auto Dhi = ctx.reg_alloc.ReadD(args[1]); + auto Qresult = ctx.reg_alloc.WriteQ(inst); + RegAlloc::Realize(Dlo, Dhi, Qresult); + + code.FMOV(Qresult->toD(), Dlo); // TODO: Move eliminiation + code.MOV(oaknut::VRegSelector{Qresult->index()}.D()[1], oaknut::VRegSelector{Dhi->index()}.D()[0]); + } +} + +template<> +void EmitIR<IR::Opcode::LeastSignificantWord>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Xoperand = ctx.reg_alloc.ReadX(args[0]); + RegAlloc::Realize(Wresult, Xoperand); + + code.MOV(Wresult, Xoperand->toW()); // TODO: Zext elimination +} + +template<> +void EmitIR<IR::Opcode::LeastSignificantHalf>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Woperand = ctx.reg_alloc.ReadW(args[0]); + RegAlloc::Realize(Wresult, Woperand); + + code.UXTH(Wresult, Woperand); // TODO: Zext elimination +} + +template<> +void EmitIR<IR::Opcode::LeastSignificantByte>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Woperand = ctx.reg_alloc.ReadW(args[0]); + RegAlloc::Realize(Wresult, Woperand); + + code.UXTB(Wresult, Woperand); // TODO: Zext elimination +} + +template<> +void EmitIR<IR::Opcode::MostSignificantWord>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Xoperand = ctx.reg_alloc.ReadX(args[0]); + RegAlloc::Realize(Wresult, Xoperand); + + code.LSR(Wresult->toX(), Xoperand, 32); + + if (carry_inst) { + auto Wcarry = ctx.reg_alloc.WriteW(carry_inst); + RegAlloc::Realize(Wcarry); + + code.LSR(Wcarry, Xoperand->toW(), 31 - 29); + code.AND(Wcarry, Wcarry, 1 << 29); + } +} + +template<> +void EmitIR<IR::Opcode::MostSignificantBit>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Woperand = ctx.reg_alloc.ReadW(args[0]); + RegAlloc::Realize(Wresult, Woperand); + + code.LSR(Wresult, Woperand, 31); +} + +template<> +void EmitIR<IR::Opcode::IsZero32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Woperand = ctx.reg_alloc.ReadW(args[0]); + RegAlloc::Realize(Wresult, Woperand); + ctx.reg_alloc.SpillFlags(); + + code.CMP(Woperand, 0); + code.CSET(Wresult, EQ); +} + +template<> +void EmitIR<IR::Opcode::IsZero64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Xoperand = ctx.reg_alloc.ReadX(args[0]); + RegAlloc::Realize(Wresult, Xoperand); + ctx.reg_alloc.SpillFlags(); + + code.CMP(Xoperand, 0); + code.CSET(Wresult, EQ); +} + +template<> +void EmitIR<IR::Opcode::TestBit>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Xresult = ctx.reg_alloc.WriteX(inst); + auto Xoperand = ctx.reg_alloc.ReadX(args[0]); + RegAlloc::Realize(Xresult, Xoperand); + ASSERT(args[1].IsImmediate()); + 
ASSERT(args[1].GetImmediateU8() < 64); + + code.UBFX(Xresult, Xoperand, args[1].GetImmediateU8(), 1); +} + +template<> +void EmitIR<IR::Opcode::ConditionalSelect32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const IR::Cond cond = args[0].GetImmediateCond(); + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Wthen = ctx.reg_alloc.ReadW(args[1]); + auto Welse = ctx.reg_alloc.ReadW(args[2]); + RegAlloc::Realize(Wresult, Wthen, Welse); + ctx.reg_alloc.SpillFlags(); + + // TODO: FSEL for fprs + + code.LDR(Wscratch0, Xstate, ctx.conf.state_nzcv_offset); + code.MSR(oaknut::SystemReg::NZCV, Xscratch0); + code.CSEL(Wresult, Wthen, Welse, static_cast<oaknut::Cond>(cond)); +} + +template<> +void EmitIR<IR::Opcode::ConditionalSelect64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const IR::Cond cond = args[0].GetImmediateCond(); + auto Xresult = ctx.reg_alloc.WriteX(inst); + auto Xthen = ctx.reg_alloc.ReadX(args[1]); + auto Xelse = ctx.reg_alloc.ReadX(args[2]); + RegAlloc::Realize(Xresult, Xthen, Xelse); + ctx.reg_alloc.SpillFlags(); + + // TODO: FSEL for fprs + + code.LDR(Wscratch0, Xstate, ctx.conf.state_nzcv_offset); + code.MSR(oaknut::SystemReg::NZCV, Xscratch0); + code.CSEL(Xresult, Xthen, Xelse, static_cast<oaknut::Cond>(cond)); +} + +template<> +void EmitIR<IR::Opcode::ConditionalSelectNZCV>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitIR<IR::Opcode::ConditionalSelect32>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::LogicalShiftLeft32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + auto& carry_arg = args[2]; + + if (!carry_inst) { + if (shift_arg.IsImmediate()) { + const u8 shift = shift_arg.GetImmediateU8(); + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Woperand = ctx.reg_alloc.ReadW(operand_arg); + RegAlloc::Realize(Wresult, Woperand); + + if (shift <= 31) { + code.LSL(Wresult, Woperand, shift); + } else { + code.MOV(Wresult, WZR); + } + } else { + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Woperand = ctx.reg_alloc.ReadW(operand_arg); + auto Wshift = ctx.reg_alloc.ReadW(shift_arg); + RegAlloc::Realize(Wresult, Woperand, Wshift); + ctx.reg_alloc.SpillFlags(); + + code.AND(Wscratch0, Wshift, 0xff); + code.LSL(Wresult, Woperand, Wscratch0); + code.CMP(Wscratch0, 32); + code.CSEL(Wresult, Wresult, WZR, LT); + } + } else { + if (shift_arg.IsImmediate() && shift_arg.GetImmediateU8() == 0) { + ctx.reg_alloc.DefineAsExisting(carry_inst, carry_arg); + ctx.reg_alloc.DefineAsExisting(inst, operand_arg); + } else if (shift_arg.IsImmediate()) { + // TODO: Use RMIF + const u8 shift = shift_arg.GetImmediateU8(); + + if (shift < 32) { + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst); + auto Woperand = ctx.reg_alloc.ReadW(operand_arg); + RegAlloc::Realize(Wresult, Wcarry_out, Woperand); + + code.UBFX(Wcarry_out, Woperand, 32 - shift, 1); + code.LSL(Wcarry_out, Wcarry_out, 29); + code.LSL(Wresult, Woperand, shift); + } else if (shift > 32) { + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst); + RegAlloc::Realize(Wresult, Wcarry_out); + + code.MOV(Wresult, WZR); + code.MOV(Wcarry_out, WZR); + } else 
{ + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst); + auto Woperand = ctx.reg_alloc.ReadW(operand_arg); + RegAlloc::Realize(Wresult, Wcarry_out, Woperand); + + code.UBFIZ(Wcarry_out, Woperand, 29, 1); + code.MOV(Wresult, WZR); + } + } else { + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst); + auto Woperand = ctx.reg_alloc.ReadW(operand_arg); + auto Wshift = ctx.reg_alloc.ReadW(shift_arg); + auto Wcarry_in = ctx.reg_alloc.ReadW(carry_arg); + if (carry_arg.IsImmediate()) { + RegAlloc::Realize(Wresult, Wcarry_out, Woperand, Wshift); + } else { + RegAlloc::Realize(Wresult, Wcarry_out, Woperand, Wshift, Wcarry_in); + } + ctx.reg_alloc.SpillFlags(); + + // TODO: Use RMIF + + oaknut::Label zero, end; + + code.ANDS(Wscratch1, Wshift, 0xff); + code.B(EQ, zero); + + code.NEG(Wscratch0, Wshift); + code.LSR(Wcarry_out, Woperand, Wscratch0); + code.LSL(Wresult, Woperand, Wshift); + code.UBFIZ(Wcarry_out, Wcarry_out, 29, 1); + code.CMP(Wscratch1, 32); + code.CSEL(Wresult, Wresult, WZR, LT); + code.CSEL(Wcarry_out, Wcarry_out, WZR, LE); + code.B(end); + + code.l(zero); + code.MOV(*Wresult, Woperand); + if (carry_arg.IsImmediate()) { + code.MOV(Wcarry_out, carry_arg.GetImmediateU32() << 29); + } else { + code.MOV(*Wcarry_out, Wcarry_in); + } + + code.l(end); + } + } +} + +template<> +void EmitIR<IR::Opcode::LogicalShiftLeft64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (args[1].IsImmediate()) { + const u8 shift = args[1].GetImmediateU8(); + auto Xresult = ctx.reg_alloc.WriteX(inst); + auto Xoperand = ctx.reg_alloc.ReadX(args[0]); + RegAlloc::Realize(Xresult, Xoperand); + + if (shift <= 63) { + code.LSL(Xresult, Xoperand, shift); + } else { + code.MOV(Xresult, XZR); + } + } else { + auto Xresult = ctx.reg_alloc.WriteX(inst); + auto Xoperand = ctx.reg_alloc.ReadX(args[0]); + auto Xshift = ctx.reg_alloc.ReadX(args[1]); + RegAlloc::Realize(Xresult, Xoperand, Xshift); + ctx.reg_alloc.SpillFlags(); + + code.AND(Xscratch0, Xshift, 0xff); + code.LSL(Xresult, Xoperand, Xscratch0); + code.CMP(Xscratch0, 64); + code.CSEL(Xresult, Xresult, XZR, LT); + } +} + +template<> +void EmitIR<IR::Opcode::LogicalShiftRight32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + auto& carry_arg = args[2]; + + if (!carry_inst) { + if (shift_arg.IsImmediate()) { + const u8 shift = shift_arg.GetImmediateU8(); + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Woperand = ctx.reg_alloc.ReadW(operand_arg); + RegAlloc::Realize(Wresult, Woperand); + + if (shift <= 31) { + code.LSR(Wresult, Woperand, shift); + } else { + code.MOV(Wresult, WZR); + } + } else { + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Woperand = ctx.reg_alloc.ReadW(operand_arg); + auto Wshift = ctx.reg_alloc.ReadW(shift_arg); + RegAlloc::Realize(Wresult, Woperand, Wshift); + ctx.reg_alloc.SpillFlags(); + + code.AND(Wscratch0, Wshift, 0xff); + code.LSR(Wresult, Woperand, Wscratch0); + code.CMP(Wscratch0, 32); + code.CSEL(Wresult, Wresult, WZR, LT); + } + } else { + if (shift_arg.IsImmediate() && shift_arg.GetImmediateU8() == 0) { + ctx.reg_alloc.DefineAsExisting(carry_inst, carry_arg); + ctx.reg_alloc.DefineAsExisting(inst, operand_arg); + } else if (shift_arg.IsImmediate()) 
{ + // TODO: Use RMIF + const u8 shift = shift_arg.GetImmediateU8(); + + if (shift < 32) { + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst); + auto Woperand = ctx.reg_alloc.ReadW(operand_arg); + RegAlloc::Realize(Wresult, Wcarry_out, Woperand); + + code.UBFX(Wcarry_out, Woperand, shift - 1, 1); + code.LSL(Wcarry_out, Wcarry_out, 29); + code.LSR(Wresult, Woperand, shift); + } else if (shift > 32) { + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst); + RegAlloc::Realize(Wresult, Wcarry_out); + + code.MOV(Wresult, WZR); + code.MOV(Wcarry_out, WZR); + } else { + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst); + auto Woperand = ctx.reg_alloc.ReadW(operand_arg); + RegAlloc::Realize(Wresult, Wcarry_out, Woperand); + + code.LSR(Wcarry_out, Woperand, 31 - 29); + code.AND(Wcarry_out, Wcarry_out, 1 << 29); + code.MOV(Wresult, WZR); + } + } else { + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst); + auto Woperand = ctx.reg_alloc.ReadW(operand_arg); + auto Wshift = ctx.reg_alloc.ReadW(shift_arg); + auto Wcarry_in = ctx.reg_alloc.ReadW(carry_arg); + if (carry_arg.IsImmediate()) { + RegAlloc::Realize(Wresult, Wcarry_out, Woperand, Wshift); + } else { + RegAlloc::Realize(Wresult, Wcarry_out, Woperand, Wshift, Wcarry_in); + } + ctx.reg_alloc.SpillFlags(); + + // TODO: Use RMIF + + oaknut::Label zero, end; + + code.ANDS(Wscratch1, Wshift, 0xff); + code.B(EQ, zero); + + code.SUB(Wscratch0, Wshift, 1); + code.LSR(Wcarry_out, Woperand, Wscratch0); + code.LSR(Wresult, Woperand, Wshift); + code.UBFIZ(Wcarry_out, Wcarry_out, 29, 1); + code.CMP(Wscratch1, 32); + code.CSEL(Wresult, Wresult, WZR, LT); + code.CSEL(Wcarry_out, Wcarry_out, WZR, LE); + code.B(end); + + code.l(zero); + code.MOV(*Wresult, Woperand); + if (carry_arg.IsImmediate()) { + code.MOV(Wcarry_out, carry_arg.GetImmediateU32() << 29); + } else { + code.MOV(*Wcarry_out, Wcarry_in); + } + + code.l(end); + } + } +} + +template<> +void EmitIR<IR::Opcode::LogicalShiftRight64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (args[1].IsImmediate()) { + const u8 shift = args[1].GetImmediateU8(); + auto Xresult = ctx.reg_alloc.WriteX(inst); + auto Xoperand = ctx.reg_alloc.ReadX(args[0]); + RegAlloc::Realize(Xresult, Xoperand); + + if (shift <= 63) { + code.LSR(Xresult, Xoperand, shift); + } else { + code.MOV(Xresult, XZR); + } + } else { + auto Xresult = ctx.reg_alloc.WriteX(inst); + auto Xoperand = ctx.reg_alloc.ReadX(args[0]); + auto Xshift = ctx.reg_alloc.ReadX(args[1]); + RegAlloc::Realize(Xresult, Xoperand, Xshift); + ctx.reg_alloc.SpillFlags(); + + code.AND(Xscratch0, Xshift, 0xff); + code.LSR(Xresult, Xoperand, Xscratch0); + code.CMP(Xscratch0, 64); + code.CSEL(Xresult, Xresult, XZR, LT); + } +} + +template<> +void EmitIR<IR::Opcode::ArithmeticShiftRight32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + auto& carry_arg = args[2]; + + if (!carry_inst) { + if (shift_arg.IsImmediate()) { + const u8 shift = shift_arg.GetImmediateU8(); + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Woperand = ctx.reg_alloc.ReadW(operand_arg); + RegAlloc::Realize(Wresult, Woperand); + + 
code.ASR(Wresult, Woperand, shift <= 31 ? shift : 31); + } else { + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Woperand = ctx.reg_alloc.ReadW(operand_arg); + auto Wshift = ctx.reg_alloc.ReadW(shift_arg); + RegAlloc::Realize(Wresult, Woperand, Wshift); + ctx.reg_alloc.SpillFlags(); + + code.AND(Wscratch0, Wshift, 0xff); + code.MOV(Wscratch1, 31); + code.CMP(Wscratch0, 31); + code.CSEL(Wscratch0, Wscratch0, Wscratch1, LS); + code.ASR(Wresult, Woperand, Wscratch0); + } + } else { + if (shift_arg.IsImmediate() && shift_arg.GetImmediateU8() == 0) { + ctx.reg_alloc.DefineAsExisting(carry_inst, carry_arg); + ctx.reg_alloc.DefineAsExisting(inst, operand_arg); + } else if (shift_arg.IsImmediate()) { + // TODO: Use RMIF + + const u8 shift = shift_arg.GetImmediateU8(); + + if (shift <= 31) { + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst); + auto Woperand = ctx.reg_alloc.ReadW(operand_arg); + RegAlloc::Realize(Wresult, Wcarry_out, Woperand); + + code.UBFX(Wcarry_out, Woperand, shift - 1, 1); + code.LSL(Wcarry_out, Wcarry_out, 29); + code.ASR(Wresult, Woperand, shift); + } else { + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst); + auto Woperand = ctx.reg_alloc.ReadW(operand_arg); + RegAlloc::Realize(Wresult, Wcarry_out, Woperand); + + code.ASR(Wresult, Woperand, 31); + code.AND(Wcarry_out, Wresult, 1 << 29); + } + } else { + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst); + auto Woperand = ctx.reg_alloc.ReadW(operand_arg); + auto Wshift = ctx.reg_alloc.ReadW(shift_arg); + auto Wcarry_in = ctx.reg_alloc.ReadW(carry_arg); + if (carry_arg.IsImmediate()) { + RegAlloc::Realize(Wresult, Wcarry_out, Woperand, Wshift); + } else { + RegAlloc::Realize(Wresult, Wcarry_out, Woperand, Wshift, Wcarry_in); + } + ctx.reg_alloc.SpillFlags(); + + // TODO: Use RMIF + + oaknut::Label zero, end; + + code.ANDS(Wscratch0, Wshift, 0xff); + code.B(EQ, zero); + + code.MOV(Wscratch1, 63); + code.CMP(Wscratch0, 63); + code.CSEL(Wscratch0, Wscratch0, Wscratch1, LS); + + code.SXTW(Wresult->toX(), Woperand); + code.SUB(Wscratch1, Wscratch0, 1); + + code.ASR(Wcarry_out->toX(), Wresult->toX(), Xscratch1); + code.ASR(Wresult->toX(), Wresult->toX(), Xscratch0); + + code.UBFIZ(Wcarry_out, Wcarry_out, 29, 1); + code.MOV(*Wresult, Wresult); + + code.B(end); + + code.l(zero); + code.MOV(*Wresult, Woperand); + if (carry_arg.IsImmediate()) { + code.MOV(Wcarry_out, carry_arg.GetImmediateU32() << 29); + } else { + code.MOV(*Wcarry_out, Wcarry_in); + } + + code.l(end); + } + } +} + +template<> +void EmitIR<IR::Opcode::ArithmeticShiftRight64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + + if (shift_arg.IsImmediate()) { + const u8 shift = shift_arg.GetImmediateU8(); + auto Xresult = ctx.reg_alloc.WriteX(inst); + auto Xoperand = ctx.reg_alloc.ReadX(operand_arg); + RegAlloc::Realize(Xresult, Xoperand); + code.ASR(Xresult, Xoperand, shift <= 63 ? 
shift : 63); + } else { + auto Xresult = ctx.reg_alloc.WriteX(inst); + auto Xoperand = ctx.reg_alloc.ReadX(operand_arg); + auto Xshift = ctx.reg_alloc.ReadX(shift_arg); + RegAlloc::Realize(Xresult, Xoperand, Xshift); + code.ASR(Xresult, Xoperand, Xshift); + } +} + +template<> +void EmitIR<IR::Opcode::RotateRight32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + auto& carry_arg = args[2]; + + if (shift_arg.IsImmediate() && shift_arg.GetImmediateU8() == 0) { + if (carry_inst) { + ctx.reg_alloc.DefineAsExisting(carry_inst, carry_arg); + } + ctx.reg_alloc.DefineAsExisting(inst, operand_arg); + } else if (shift_arg.IsImmediate()) { + const u8 shift = shift_arg.GetImmediateU8() % 32; + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Woperand = ctx.reg_alloc.ReadW(operand_arg); + RegAlloc::Realize(Wresult, Woperand); + + code.ROR(Wresult, Woperand, shift); + + if (carry_inst) { + auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst); + RegAlloc::Realize(Wcarry_out); + + code.ROR(Wcarry_out, Woperand, ((shift + 31) - 29) % 32); + code.AND(Wcarry_out, Wcarry_out, 1 << 29); + } + } else { + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Woperand = ctx.reg_alloc.ReadW(operand_arg); + auto Wshift = ctx.reg_alloc.ReadW(shift_arg); + RegAlloc::Realize(Wresult, Woperand, Wshift); + + code.ROR(Wresult, Woperand, Wshift); + + if (carry_inst && carry_arg.IsImmediate()) { + const u32 carry_in = carry_arg.GetImmediateU32() << 29; + auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst); + RegAlloc::Realize(Wcarry_out); + ctx.reg_alloc.SpillFlags(); + + code.TST(Wshift, 0xff); + code.LSR(Wcarry_out, Wresult, 31 - 29); + code.AND(Wcarry_out, Wcarry_out, 1 << 29); + if (carry_in) { + code.MOV(Wscratch0, carry_in); + code.CSEL(Wcarry_out, Wscratch0, Wcarry_out, EQ); + } else { + code.CSEL(Wcarry_out, WZR, Wcarry_out, EQ); + } + } else if (carry_inst) { + auto Wcarry_in = ctx.reg_alloc.ReadW(carry_arg); + auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst); + RegAlloc::Realize(Wcarry_out, Wcarry_in); + ctx.reg_alloc.SpillFlags(); + + code.TST(Wshift, 0xff); + code.LSR(Wcarry_out, Wresult, 31 - 29); + code.AND(Wcarry_out, Wcarry_out, 1 << 29); + code.CSEL(Wcarry_out, Wcarry_in, Wcarry_out, EQ); + } + } +} + +template<> +void EmitIR<IR::Opcode::RotateRight64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + + if (shift_arg.IsImmediate()) { + const u8 shift = shift_arg.GetImmediateU8(); + auto Xresult = ctx.reg_alloc.WriteX(inst); + auto Xoperand = ctx.reg_alloc.ReadX(operand_arg); + RegAlloc::Realize(Xresult, Xoperand); + code.ROR(Xresult, Xoperand, shift); + } else { + auto Xresult = ctx.reg_alloc.WriteX(inst); + auto Xoperand = ctx.reg_alloc.ReadX(operand_arg); + auto Xshift = ctx.reg_alloc.ReadX(shift_arg); + RegAlloc::Realize(Xresult, Xoperand, Xshift); + code.ROR(Xresult, Xoperand, Xshift); + } +} + +template<> +void EmitIR<IR::Opcode::RotateRightExtended>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Woperand = ctx.reg_alloc.ReadW(args[0]); + + if 
(args[1].IsImmediate()) { + RegAlloc::Realize(Wresult, Woperand); + + code.LSR(Wresult, Woperand, 1); + if (args[1].GetImmediateU1()) { + code.ORR(Wresult, Wresult, 0x8000'0000); + } + } else { + auto Wcarry_in = ctx.reg_alloc.ReadW(args[1]); + RegAlloc::Realize(Wresult, Woperand, Wcarry_in); + + code.LSR(Wscratch0, Wcarry_in, 29); + code.EXTR(Wresult, Wscratch0, Woperand, 1); + } + + if (carry_inst) { + auto Wcarry_out = ctx.reg_alloc.WriteW(carry_inst); + RegAlloc::Realize(Wcarry_out); + code.UBFIZ(Wcarry_out, Woperand, 29, 1); + } +} + +template<typename ShiftI, typename ShiftR> +static void EmitMaskedShift32(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, ShiftI si_fn, ShiftR sr_fn) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + + if (shift_arg.IsImmediate()) { + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Woperand = ctx.reg_alloc.ReadW(operand_arg); + RegAlloc::Realize(Wresult, Woperand); + const u32 shift = shift_arg.GetImmediateU32(); + + si_fn(Wresult, Woperand, static_cast<int>(shift & 0x1F)); + } else { + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Woperand = ctx.reg_alloc.ReadW(operand_arg); + auto Wshift = ctx.reg_alloc.ReadW(shift_arg); + RegAlloc::Realize(Wresult, Woperand, Wshift); + + sr_fn(Wresult, Woperand, Wshift); + } +} + +template<typename ShiftI, typename ShiftR> +static void EmitMaskedShift64(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, ShiftI si_fn, ShiftR sr_fn) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + + if (shift_arg.IsImmediate()) { + auto Xresult = ctx.reg_alloc.WriteX(inst); + auto Xoperand = ctx.reg_alloc.ReadX(operand_arg); + RegAlloc::Realize(Xresult, Xoperand); + const u32 shift = shift_arg.GetImmediateU64(); + + si_fn(Xresult, Xoperand, static_cast<int>(shift & 0x3F)); + } else { + auto Xresult = ctx.reg_alloc.WriteX(inst); + auto Xoperand = ctx.reg_alloc.ReadX(operand_arg); + auto Xshift = ctx.reg_alloc.ReadX(shift_arg); + RegAlloc::Realize(Xresult, Xoperand, Xshift); + + sr_fn(Xresult, Xoperand, Xshift); + } +} + +template<> +void EmitIR<IR::Opcode::LogicalShiftLeftMasked32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitMaskedShift32( + code, ctx, inst, + [&](auto& Wresult, auto& Woperand, auto shift) { code.LSL(Wresult, Woperand, shift); }, + [&](auto& Wresult, auto& Woperand, auto& Wshift) { code.LSL(Wresult, Woperand, Wshift); }); +} + +template<> +void EmitIR<IR::Opcode::LogicalShiftLeftMasked64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitMaskedShift64( + code, ctx, inst, + [&](auto& Xresult, auto& Xoperand, auto shift) { code.LSL(Xresult, Xoperand, shift); }, + [&](auto& Xresult, auto& Xoperand, auto& Xshift) { code.LSL(Xresult, Xoperand, Xshift); }); +} + +template<> +void EmitIR<IR::Opcode::LogicalShiftRightMasked32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitMaskedShift32( + code, ctx, inst, + [&](auto& Wresult, auto& Woperand, auto shift) { code.LSR(Wresult, Woperand, shift); }, + [&](auto& Wresult, auto& Woperand, auto& Wshift) { code.LSR(Wresult, Woperand, Wshift); }); +} + +template<> +void EmitIR<IR::Opcode::LogicalShiftRightMasked64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitMaskedShift64( + code, ctx, inst, + [&](auto& Xresult, auto& Xoperand, auto shift) { code.LSR(Xresult, Xoperand, shift); }, + [&](auto& Xresult, auto& Xoperand, auto& Xshift) { 
code.LSR(Xresult, Xoperand, Xshift); }); +} + +template<> +void EmitIR<IR::Opcode::ArithmeticShiftRightMasked32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitMaskedShift32( + code, ctx, inst, + [&](auto& Wresult, auto& Woperand, auto shift) { code.ASR(Wresult, Woperand, shift); }, + [&](auto& Wresult, auto& Woperand, auto& Wshift) { code.ASR(Wresult, Woperand, Wshift); }); +} + +template<> +void EmitIR<IR::Opcode::ArithmeticShiftRightMasked64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitMaskedShift64( + code, ctx, inst, + [&](auto& Xresult, auto& Xoperand, auto shift) { code.ASR(Xresult, Xoperand, shift); }, + [&](auto& Xresult, auto& Xoperand, auto& Xshift) { code.ASR(Xresult, Xoperand, Xshift); }); +} + +template<> +void EmitIR<IR::Opcode::RotateRightMasked32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitMaskedShift32( + code, ctx, inst, + [&](auto& Wresult, auto& Woperand, auto shift) { code.ROR(Wresult, Woperand, shift); }, + [&](auto& Wresult, auto& Woperand, auto& Wshift) { code.ROR(Wresult, Woperand, Wshift); }); +} + +template<> +void EmitIR<IR::Opcode::RotateRightMasked64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitMaskedShift64( + code, ctx, inst, + [&](auto& Xresult, auto& Xoperand, auto shift) { code.ROR(Xresult, Xoperand, shift); }, + [&](auto& Xresult, auto& Xoperand, auto& Xshift) { code.ROR(Xresult, Xoperand, Xshift); }); +} + +template<size_t bitsize, typename EmitFn> +static void MaybeAddSubImm(oaknut::CodeGenerator& code, u64 imm, EmitFn emit_fn) { + static_assert(bitsize == 32 || bitsize == 64); + if constexpr (bitsize == 32) { + imm = static_cast<u32>(imm); + } + if (oaknut::AddSubImm::is_valid(imm)) { + emit_fn(imm); + } else { + code.MOV(Rscratch0<bitsize>(), imm); + emit_fn(Rscratch0<bitsize>()); + } +} + +template<size_t bitsize, bool sub> +static void EmitAddSub(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto nzcv_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetNZCVFromOp); + const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + auto Rresult = ctx.reg_alloc.WriteReg<bitsize>(inst); + auto Ra = ctx.reg_alloc.ReadReg<bitsize>(args[0]); + + if (overflow_inst) { + // There is a limited set of circumstances where this is required, so assert for this. + ASSERT(!sub); + ASSERT(!nzcv_inst); + ASSERT(args[2].IsImmediate() && args[2].GetImmediateU1() == false); + + auto Rb = ctx.reg_alloc.ReadReg<bitsize>(args[1]); + auto Woverflow = ctx.reg_alloc.WriteW(overflow_inst); + ctx.reg_alloc.SpillFlags(); + RegAlloc::Realize(Rresult, Ra, Rb, Woverflow); + + code.ADDS(Rresult, *Ra, Rb); + code.CSET(Woverflow, VS); + } else if (nzcv_inst) { + if (args[1].IsImmediate()) { + const u64 imm = args[1].GetImmediateU64(); + + if (args[2].IsImmediate()) { + auto flags = ctx.reg_alloc.WriteFlags(nzcv_inst); + RegAlloc::Realize(Rresult, Ra, flags); + + if (args[2].GetImmediateU1()) { + MaybeAddSubImm<bitsize>(code, sub ? imm : ~imm, [&](const auto b) { code.SUBS(Rresult, *Ra, b); }); + } else { + MaybeAddSubImm<bitsize>(code, sub ? ~imm : imm, [&](const auto b) { code.ADDS(Rresult, *Ra, b); }); + } + } else { + RegAlloc::Realize(Rresult, Ra); + ctx.reg_alloc.ReadWriteFlags(args[2], nzcv_inst); + + if (imm == 0) { + if constexpr (bitsize == 32) { + sub ? code.SBCS(Rresult, Ra, WZR) : code.ADCS(Rresult, Ra, WZR); + } else { + sub ? 
code.SBCS(Rresult, Ra, XZR) : code.ADCS(Rresult, Ra, XZR); + } + } else { + code.MOV(Rscratch0<bitsize>(), imm); + sub ? code.SBCS(Rresult, Ra, Rscratch0<bitsize>()) : code.ADCS(Rresult, Ra, Rscratch0<bitsize>()); + } + } + } else { + auto Rb = ctx.reg_alloc.ReadReg<bitsize>(args[1]); + + if (args[2].IsImmediate()) { + auto flags = ctx.reg_alloc.WriteFlags(nzcv_inst); + RegAlloc::Realize(Rresult, Ra, Rb, flags); + + if (args[2].GetImmediateU1()) { + if (sub) { + code.SUBS(Rresult, *Ra, Rb); + } else { + code.MVN(Rscratch0<bitsize>(), Rb); + code.SUBS(Rresult, *Ra, Rscratch0<bitsize>()); + } + } else { + if (sub) { + code.MVN(Rscratch0<bitsize>(), Rb); + code.ADDS(Rresult, *Ra, Rscratch0<bitsize>()); + } else { + code.ADDS(Rresult, *Ra, Rb); + } + } + } else { + RegAlloc::Realize(Rresult, Ra, Rb); + ctx.reg_alloc.ReadWriteFlags(args[2], nzcv_inst); + + sub ? code.SBCS(Rresult, Ra, Rb) : code.ADCS(Rresult, Ra, Rb); + } + } + } else { + if (args[1].IsImmediate()) { + const u64 imm = args[1].GetImmediateU64(); + + RegAlloc::Realize(Rresult, Ra); + + if (args[2].IsImmediate()) { + if (args[2].GetImmediateU1()) { + MaybeAddSubImm<bitsize>(code, sub ? imm : ~imm, [&](const auto b) { code.SUB(Rresult, *Ra, b); }); + } else { + MaybeAddSubImm<bitsize>(code, sub ? ~imm : imm, [&](const auto b) { code.ADD(Rresult, *Ra, b); }); + } + } else { + ctx.reg_alloc.ReadWriteFlags(args[2], nullptr); + + if (imm == 0) { + if constexpr (bitsize == 32) { + sub ? code.SBC(Rresult, Ra, WZR) : code.ADC(Rresult, Ra, WZR); + } else { + sub ? code.SBC(Rresult, Ra, XZR) : code.ADC(Rresult, Ra, XZR); + } + } else { + code.MOV(Rscratch0<bitsize>(), imm); + sub ? code.SBC(Rresult, Ra, Rscratch0<bitsize>()) : code.ADC(Rresult, Ra, Rscratch0<bitsize>()); + } + } + } else { + auto Rb = ctx.reg_alloc.ReadReg<bitsize>(args[1]); + + RegAlloc::Realize(Rresult, Ra, Rb); + + if (args[2].IsImmediate()) { + if (args[2].GetImmediateU1()) { + if (sub) { + code.SUB(Rresult, *Ra, Rb); + } else { + code.MVN(Rscratch0<bitsize>(), Rb); + code.SUB(Rresult, *Ra, Rscratch0<bitsize>()); + } + } else { + if (sub) { + code.MVN(Rscratch0<bitsize>(), Rb); + code.ADD(Rresult, *Ra, Rscratch0<bitsize>()); + } else { + code.ADD(Rresult, *Ra, Rb); + } + } + } else { + ctx.reg_alloc.ReadWriteFlags(args[2], nullptr); + + sub ? 
code.SBC(Rresult, Ra, Rb) : code.ADC(Rresult, Ra, Rb); + } + } + } +} + +template<> +void EmitIR<IR::Opcode::Add32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitAddSub<32, false>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::Add64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitAddSub<64, false>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::Sub32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitAddSub<32, true>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::Sub64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitAddSub<64, true>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::Mul32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<32>( + code, ctx, inst, + [&](auto& Wresult, auto& Wa, auto& Wb) { code.MUL(Wresult, Wa, Wb); }); +} + +template<> +void EmitIR<IR::Opcode::Mul64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<64>( + code, ctx, inst, + [&](auto& Xresult, auto& Xa, auto& Xb) { code.MUL(Xresult, Xa, Xb); }); +} + +template<> +void EmitIR<IR::Opcode::SignedMultiplyHigh64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Xresult = ctx.reg_alloc.WriteX(inst); + auto Xop1 = ctx.reg_alloc.ReadX(args[0]); + auto Xop2 = ctx.reg_alloc.ReadX(args[1]); + RegAlloc::Realize(Xresult, Xop1, Xop2); + + code.SMULH(Xresult, Xop1, Xop2); +} + +template<> +void EmitIR<IR::Opcode::UnsignedMultiplyHigh64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Xresult = ctx.reg_alloc.WriteX(inst); + auto Xop1 = ctx.reg_alloc.ReadX(args[0]); + auto Xop2 = ctx.reg_alloc.ReadX(args[1]); + RegAlloc::Realize(Xresult, Xop1, Xop2); + + code.UMULH(Xresult, Xop1, Xop2); +} + +template<> +void EmitIR<IR::Opcode::UnsignedDiv32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<32>( + code, ctx, inst, + [&](auto& Wresult, auto& Wa, auto& Wb) { code.UDIV(Wresult, Wa, Wb); }); +} + +template<> +void EmitIR<IR::Opcode::UnsignedDiv64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<64>( + code, ctx, inst, + [&](auto& Xresult, auto& Xa, auto& Xb) { code.UDIV(Xresult, Xa, Xb); }); +} + +template<> +void EmitIR<IR::Opcode::SignedDiv32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<32>( + code, ctx, inst, + [&](auto& Wresult, auto& Wa, auto& Wb) { code.SDIV(Wresult, Wa, Wb); }); +} + +template<> +void EmitIR<IR::Opcode::SignedDiv64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<64>( + code, ctx, inst, + [&](auto& Xresult, auto& Xa, auto& Xb) { code.SDIV(Xresult, Xa, Xb); }); +} + +template<size_t bitsize> +static bool IsValidBitImm(u64 imm) { + static_assert(bitsize == 32 || bitsize == 64); + if constexpr (bitsize == 32) { + return static_cast<bool>(oaknut::detail::encode_bit_imm(static_cast<u32>(imm))); + } else { + return static_cast<bool>(oaknut::detail::encode_bit_imm(imm)); + } +} + +template<size_t bitsize, typename EmitFn> +static void MaybeBitImm(oaknut::CodeGenerator& code, u64 imm, EmitFn emit_fn) { + static_assert(bitsize == 32 || bitsize == 64); + if constexpr (bitsize == 32) { + imm = static_cast<u32>(imm); + } + if (IsValidBitImm<bitsize>(imm)) { + emit_fn(imm); + } else { + code.MOV(Rscratch0<bitsize>(), imm); + 
emit_fn(Rscratch0<bitsize>()); + } +} + +template<size_t bitsize, typename EmitFn1, typename EmitFn2 = std::nullptr_t> +static void EmitBitOp(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn1 emit_without_flags, EmitFn2 emit_with_flags = nullptr) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Rresult = ctx.reg_alloc.WriteReg<bitsize>(inst); + auto Ra = ctx.reg_alloc.ReadReg<bitsize>(args[0]); + + if constexpr (!std::is_same_v<EmitFn2, std::nullptr_t>) { + const auto nz_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetNZFromOp); + const auto nzcv_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetNZCVFromOp); + ASSERT(!(nz_inst && nzcv_inst)); + const auto flag_inst = nz_inst ? nz_inst : nzcv_inst; + + if (flag_inst) { + auto Wflags = ctx.reg_alloc.WriteFlags(flag_inst); + + if (args[1].IsImmediate()) { + RegAlloc::Realize(Rresult, Ra, Wflags); + + MaybeBitImm<bitsize>(code, args[1].GetImmediateU64(), [&](const auto& b) { emit_with_flags(Rresult, Ra, b); }); + } else { + auto Rb = ctx.reg_alloc.ReadReg<bitsize>(args[1]); + RegAlloc::Realize(Rresult, Ra, Rb, Wflags); + + emit_with_flags(Rresult, Ra, Rb); + } + + return; + } + } + + if (args[1].IsImmediate()) { + RegAlloc::Realize(Rresult, Ra); + + MaybeBitImm<bitsize>(code, args[1].GetImmediateU64(), [&](const auto& b) { emit_without_flags(Rresult, Ra, b); }); + } else { + auto Rb = ctx.reg_alloc.ReadReg<bitsize>(args[1]); + RegAlloc::Realize(Rresult, Ra, Rb); + + emit_without_flags(Rresult, Ra, Rb); + } +} + +template<size_t bitsize> +static void EmitAndNot(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto nz_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetNZFromOp); + const auto nzcv_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetNZCVFromOp); + ASSERT(!(nz_inst && nzcv_inst)); + const auto flag_inst = nz_inst ? nz_inst : nzcv_inst; + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Rresult = ctx.reg_alloc.WriteReg<bitsize>(inst); + auto Ra = ctx.reg_alloc.ReadReg<bitsize>(args[0]); + + if (flag_inst) { + auto Wflags = ctx.reg_alloc.WriteFlags(flag_inst); + + if (args[1].IsImmediate()) { + RegAlloc::Realize(Rresult, Ra, Wflags); + + const u64 not_imm = bitsize == 32 ? static_cast<u32>(~args[1].GetImmediateU64()) : ~args[1].GetImmediateU64(); + + if (IsValidBitImm<bitsize>(not_imm)) { + code.ANDS(Rresult, Ra, not_imm); + } else { + code.MOV(Rscratch0<bitsize>(), args[1].GetImmediateU64()); + code.BICS(Rresult, Ra, Rscratch0<bitsize>()); + } + } else { + auto Rb = ctx.reg_alloc.ReadReg<bitsize>(args[1]); + RegAlloc::Realize(Rresult, Ra, Rb, Wflags); + + code.BICS(Rresult, Ra, Rb); + } + + return; + } + + if (args[1].IsImmediate()) { + RegAlloc::Realize(Rresult, Ra); + + const u64 not_imm = bitsize == 32 ? 
static_cast<u32>(~args[1].GetImmediateU64()) : ~args[1].GetImmediateU64(); + + if (IsValidBitImm<bitsize>(not_imm)) { + code.AND(Rresult, Ra, not_imm); + } else { + code.MOV(Rscratch0<bitsize>(), args[1].GetImmediateU64()); + code.BIC(Rresult, Ra, Rscratch0<bitsize>()); + } + } else { + auto Rb = ctx.reg_alloc.ReadReg<bitsize>(args[1]); + RegAlloc::Realize(Rresult, Ra, Rb); + + code.BIC(Rresult, Ra, Rb); + } +} + +template<> +void EmitIR<IR::Opcode::And32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitBitOp<32>( + code, ctx, inst, + [&](auto& result, auto& a, auto& b) { code.AND(result, a, b); }, + [&](auto& result, auto& a, auto& b) { code.ANDS(result, a, b); }); +} + +template<> +void EmitIR<IR::Opcode::And64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitBitOp<64>( + code, ctx, inst, + [&](auto& result, auto& a, auto& b) { code.AND(result, a, b); }, + [&](auto& result, auto& a, auto& b) { code.ANDS(result, a, b); }); +} + +template<> +void EmitIR<IR::Opcode::AndNot32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitAndNot<32>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::AndNot64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitAndNot<64>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::Eor32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitBitOp<32>( + code, ctx, inst, + [&](auto& result, auto& a, auto& b) { code.EOR(result, a, b); }); +} + +template<> +void EmitIR<IR::Opcode::Eor64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitBitOp<64>( + code, ctx, inst, + [&](auto& result, auto& a, auto& b) { code.EOR(result, a, b); }); +} + +template<> +void EmitIR<IR::Opcode::Or32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitBitOp<32>( + code, ctx, inst, + [&](auto& result, auto& a, auto& b) { code.ORR(result, a, b); }); +} + +template<> +void EmitIR<IR::Opcode::Or64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitBitOp<64>( + code, ctx, inst, + [&](auto& result, auto& a, auto& b) { code.ORR(result, a, b); }); +} + +template<> +void EmitIR<IR::Opcode::Not32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp<32>( + code, ctx, inst, + [&](auto& Wresult, auto& Woperand) { code.MVN(Wresult, Woperand); }); +} + +template<> +void EmitIR<IR::Opcode::Not64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp<64>( + code, ctx, inst, + [&](auto& Xresult, auto& Xoperand) { code.MVN(Xresult, Xoperand); }); +} + +template<> +void EmitIR<IR::Opcode::SignExtendByteToWord>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp<32>( + code, ctx, inst, + [&](auto& Wresult, auto& Woperand) { code.SXTB(Wresult, Woperand); }); +} + +template<> +void EmitIR<IR::Opcode::SignExtendHalfToWord>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp<32>( + code, ctx, inst, + [&](auto& Wresult, auto& Woperand) { code.SXTH(Wresult, Woperand); }); +} + +template<> +void EmitIR<IR::Opcode::SignExtendByteToLong>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp<64>( + code, ctx, inst, + [&](auto& Xresult, auto& Xoperand) { code.SXTB(Xresult, Xoperand->toW()); }); +} + +template<> +void EmitIR<IR::Opcode::SignExtendHalfToLong>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp<64>( + code, ctx, inst, + [&](auto& Xresult, auto& Xoperand) { code.SXTH(Xresult, 
Xoperand->toW()); }); +} + +template<> +void EmitIR<IR::Opcode::SignExtendWordToLong>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp<64>( + code, ctx, inst, + [&](auto& Xresult, auto& Xoperand) { code.SXTW(Xresult, Xoperand->toW()); }); +} + +template<> +void EmitIR<IR::Opcode::ZeroExtendByteToWord>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.DefineAsExisting(inst, args[0]); +} + +template<> +void EmitIR<IR::Opcode::ZeroExtendHalfToWord>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.DefineAsExisting(inst, args[0]); +} + +template<> +void EmitIR<IR::Opcode::ZeroExtendByteToLong>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.DefineAsExisting(inst, args[0]); +} + +template<> +void EmitIR<IR::Opcode::ZeroExtendHalfToLong>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.DefineAsExisting(inst, args[0]); +} + +template<> +void EmitIR<IR::Opcode::ZeroExtendWordToLong>(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.DefineAsExisting(inst, args[0]); +} + +template<> +void EmitIR<IR::Opcode::ZeroExtendLongToQuad>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Xvalue = ctx.reg_alloc.ReadX(args[0]); + auto Qresult = ctx.reg_alloc.WriteQ(inst); + RegAlloc::Realize(Xvalue, Qresult); + + code.FMOV(Qresult->toD(), Xvalue); +} + +template<> +void EmitIR<IR::Opcode::ByteReverseWord>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp<32>( + code, ctx, inst, + [&](auto& Wresult, auto& Woperand) { code.REV(Wresult, Woperand); }); +} + +template<> +void EmitIR<IR::Opcode::ByteReverseHalf>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp<32>( + code, ctx, inst, + [&](auto& Wresult, auto& Woperand) { code.REV16(Wresult, Woperand); }); +} + +template<> +void EmitIR<IR::Opcode::ByteReverseDual>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp<64>( + code, ctx, inst, + [&](auto& Xresult, auto& Xoperand) { code.REV(Xresult, Xoperand); }); +} + +template<> +void EmitIR<IR::Opcode::CountLeadingZeros32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp<32>( + code, ctx, inst, + [&](auto& Wresult, auto& Woperand) { code.CLZ(Wresult, Woperand); }); +} + +template<> +void EmitIR<IR::Opcode::CountLeadingZeros64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp<64>( + code, ctx, inst, + [&](auto& Xresult, auto& Xoperand) { code.CLZ(Xresult, Xoperand); }); +} + +template<> +void EmitIR<IR::Opcode::ExtractRegister32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(args[2].IsImmediate()); + + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Wop1 = ctx.reg_alloc.ReadW(args[0]); + auto Wop2 = ctx.reg_alloc.ReadW(args[1]); + RegAlloc::Realize(Wresult, Wop1, Wop2); + const u8 lsb = args[2].GetImmediateU8(); + + code.EXTR(Wresult, Wop2, Wop1, lsb); // NB: flipped +} + +template<> +void EmitIR<IR::Opcode::ExtractRegister64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = 
ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(args[2].IsImmediate()); + + auto Xresult = ctx.reg_alloc.WriteX(inst); + auto Xop1 = ctx.reg_alloc.ReadX(args[0]); + auto Xop2 = ctx.reg_alloc.ReadX(args[1]); + RegAlloc::Realize(Xresult, Xop1, Xop2); + const u8 lsb = args[2].GetImmediateU8(); + + code.EXTR(Xresult, Xop2, Xop1, lsb); // NB: flipped +} + +template<> +void EmitIR<IR::Opcode::ReplicateBit32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(args[1].IsImmediate()); + + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Wvalue = ctx.reg_alloc.ReadW(args[0]); + const u8 bit = args[1].GetImmediateU8(); + RegAlloc::Realize(Wresult, Wvalue); + + code.LSL(Wresult, Wvalue, 31 - bit); + code.ASR(Wresult, Wresult, 31); +} + +template<> +void EmitIR<IR::Opcode::ReplicateBit64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(args[1].IsImmediate()); + + auto Xresult = ctx.reg_alloc.WriteX(inst); + auto Xvalue = ctx.reg_alloc.ReadX(args[0]); + const u8 bit = args[1].GetImmediateU8(); + RegAlloc::Realize(Xresult, Xvalue); + + code.LSL(Xresult, Xvalue, 63 - bit); + code.ASR(Xresult, Xresult, 63); +} + +static void EmitMaxMin32(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, oaknut::Cond cond) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Wop1 = ctx.reg_alloc.ReadW(args[0]); + auto Wop2 = ctx.reg_alloc.ReadW(args[1]); + RegAlloc::Realize(Wresult, Wop1, Wop2); + ctx.reg_alloc.SpillFlags(); + + code.CMP(Wop1->toW(), Wop2); + code.CSEL(Wresult, Wop1, Wop2, cond); +} + +static void EmitMaxMin64(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, oaknut::Cond cond) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + auto Xresult = ctx.reg_alloc.WriteX(inst); + auto Xop1 = ctx.reg_alloc.ReadX(args[0]); + auto Xop2 = ctx.reg_alloc.ReadX(args[1]); + RegAlloc::Realize(Xresult, Xop1, Xop2); + ctx.reg_alloc.SpillFlags(); + + code.CMP(Xop1->toX(), Xop2); + code.CSEL(Xresult, Xop1, Xop2, cond); +} + +template<> +void EmitIR<IR::Opcode::MaxSigned32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitMaxMin32(code, ctx, inst, GT); +} + +template<> +void EmitIR<IR::Opcode::MaxSigned64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitMaxMin64(code, ctx, inst, GT); +} + +template<> +void EmitIR<IR::Opcode::MaxUnsigned32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitMaxMin32(code, ctx, inst, HI); +} + +template<> +void EmitIR<IR::Opcode::MaxUnsigned64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitMaxMin64(code, ctx, inst, HI); +} + +template<> +void EmitIR<IR::Opcode::MinSigned32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitMaxMin32(code, ctx, inst, LT); +} + +template<> +void EmitIR<IR::Opcode::MinSigned64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitMaxMin64(code, ctx, inst, LT); +} + +template<> +void EmitIR<IR::Opcode::MinUnsigned32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitMaxMin32(code, ctx, inst, LO); +} + +template<> +void EmitIR<IR::Opcode::MinUnsigned64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitMaxMin64(code, ctx, inst, LO); +} + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_floating_point.cpp 
b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_floating_point.cpp new file mode 100644 index 0000000000..22ec0ec24a --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_floating_point.cpp @@ -0,0 +1,801 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <oaknut/oaknut.hpp> + +#include "dynarmic/backend/arm64/a32_jitstate.h" +#include "dynarmic/backend/arm64/abi.h" +#include "dynarmic/backend/arm64/emit_arm64.h" +#include "dynarmic/backend/arm64/emit_context.h" +#include "dynarmic/backend/arm64/fpsr_manager.h" +#include "dynarmic/backend/arm64/reg_alloc.h" +#include "dynarmic/common/fp/fpcr.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" + +namespace Dynarmic::Backend::Arm64 { + +using namespace oaknut::util; + +template<size_t bitsize, typename EmitFn> +static void EmitTwoOp(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Vresult = ctx.reg_alloc.WriteVec<bitsize>(inst); + auto Voperand = ctx.reg_alloc.ReadVec<bitsize>(args[0]); + RegAlloc::Realize(Vresult, Voperand); + ctx.fpsr.Load(); + + emit(Vresult, Voperand); +} + +template<size_t bitsize, typename EmitFn> +static void EmitThreeOp(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Vresult = ctx.reg_alloc.WriteVec<bitsize>(inst); + auto Va = ctx.reg_alloc.ReadVec<bitsize>(args[0]); + auto Vb = ctx.reg_alloc.ReadVec<bitsize>(args[1]); + RegAlloc::Realize(Vresult, Va, Vb); + ctx.fpsr.Load(); + + emit(Vresult, Va, Vb); +} + +template<size_t bitsize, typename EmitFn> +static void EmitFourOp(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Vresult = ctx.reg_alloc.WriteVec<bitsize>(inst); + auto Va = ctx.reg_alloc.ReadVec<bitsize>(args[0]); + auto Vb = ctx.reg_alloc.ReadVec<bitsize>(args[1]); + auto Vc = ctx.reg_alloc.ReadVec<bitsize>(args[2]); + RegAlloc::Realize(Vresult, Va, Vb, Vc); + ctx.fpsr.Load(); + + emit(Vresult, Va, Vb, Vc); +} + +template<size_t bitsize_from, size_t bitsize_to, typename EmitFn> +static void EmitConvert(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Vto = ctx.reg_alloc.WriteVec<bitsize_to>(inst); + auto Vfrom = ctx.reg_alloc.ReadVec<bitsize_from>(args[0]); + const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8()); + RegAlloc::Realize(Vto, Vfrom); + ctx.fpsr.Load(); + + ASSERT(rounding_mode == ctx.FPCR().RMode()); + + emit(Vto, Vfrom); +} + +template<size_t bitsize_from, size_t bitsize_to, bool is_signed> +static void EmitToFixed(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Rto = ctx.reg_alloc.WriteReg<std::max<size_t>(bitsize_to, 32)>(inst); + auto Vfrom = ctx.reg_alloc.ReadVec<bitsize_from>(args[0]); + const size_t fbits = args[1].GetImmediateU8(); + const auto rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); + RegAlloc::Realize(Rto, Vfrom); + ctx.fpsr.Load(); + + if (rounding_mode == FP::RoundingMode::TowardsZero) { + if constexpr (is_signed) { + if constexpr (bitsize_to == 16) { + code.FCVTZS(Rto, Vfrom, fbits + 16); + code.ASR(Wscratch0, Rto, 31); + code.ADD(Rto, Rto, 
Wscratch0, LSR, 16); // Round towards zero when truncating + code.LSR(Rto, Rto, 16); + } else if (fbits) { + code.FCVTZS(Rto, Vfrom, fbits); + } else { + code.FCVTZS(Rto, Vfrom); + } + } else { + if constexpr (bitsize_to == 16) { + code.FCVTZU(Rto, Vfrom, fbits + 16); + code.LSR(Rto, Rto, 16); + } else if (fbits) { + code.FCVTZU(Rto, Vfrom, fbits); + } else { + code.FCVTZU(Rto, Vfrom); + } + } + } else { + ASSERT(fbits == 0); + ASSERT(bitsize_to != 16); + if constexpr (is_signed) { + switch (rounding_mode) { + case FP::RoundingMode::ToNearest_TieEven: + code.FCVTNS(Rto, Vfrom); + break; + case FP::RoundingMode::TowardsPlusInfinity: + code.FCVTPS(Rto, Vfrom); + break; + case FP::RoundingMode::TowardsMinusInfinity: + code.FCVTMS(Rto, Vfrom); + break; + case FP::RoundingMode::TowardsZero: + code.FCVTZS(Rto, Vfrom); + break; + case FP::RoundingMode::ToNearest_TieAwayFromZero: + code.FCVTAS(Rto, Vfrom); + break; + case FP::RoundingMode::ToOdd: + ASSERT_FALSE("Unimplemented"); + break; + default: + ASSERT_FALSE("Invalid RoundingMode"); + break; + } + } else { + switch (rounding_mode) { + case FP::RoundingMode::ToNearest_TieEven: + code.FCVTNU(Rto, Vfrom); + break; + case FP::RoundingMode::TowardsPlusInfinity: + code.FCVTPU(Rto, Vfrom); + break; + case FP::RoundingMode::TowardsMinusInfinity: + code.FCVTMU(Rto, Vfrom); + break; + case FP::RoundingMode::TowardsZero: + code.FCVTZU(Rto, Vfrom); + break; + case FP::RoundingMode::ToNearest_TieAwayFromZero: + code.FCVTAU(Rto, Vfrom); + break; + case FP::RoundingMode::ToOdd: + ASSERT_FALSE("Unimplemented"); + break; + default: + ASSERT_FALSE("Invalid RoundingMode"); + break; + } + } + } +} + +template<size_t bitsize_from, size_t bitsize_to, typename EmitFn> +static void EmitFromFixed(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Vto = ctx.reg_alloc.WriteVec<bitsize_to>(inst); + auto Rfrom = ctx.reg_alloc.ReadReg<std::max<size_t>(bitsize_from, 32)>(args[0]); + const size_t fbits = args[1].GetImmediateU8(); + const auto rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); + RegAlloc::Realize(Vto, Rfrom); + ctx.fpsr.Load(); + + if (rounding_mode == ctx.FPCR().RMode()) { + emit(Vto, Rfrom, fbits); + } else { + FP::FPCR new_fpcr = ctx.FPCR(); + new_fpcr.RMode(rounding_mode); + + code.MOV(Wscratch0, new_fpcr.Value()); + code.MSR(oaknut::SystemReg::FPCR, Xscratch0); + + emit(Vto, Rfrom, fbits); + + code.MOV(Wscratch0, ctx.FPCR().Value()); + code.MSR(oaknut::SystemReg::FPCR, Xscratch0); + } +} + +template<> +void EmitIR<IR::Opcode::FPAbs16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPAbs32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Soperand) { code.FABS(Sresult, Soperand); }); +} + +template<> +void EmitIR<IR::Opcode::FPAbs64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Doperand) { code.FABS(Dresult, Doperand); }); +} + +template<> +void EmitIR<IR::Opcode::FPAdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& Sb) { code.FADD(Sresult, Sa, Sb); }); +} + +template<> +void EmitIR<IR::Opcode::FPAdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + 
EmitThreeOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& Db) { code.FADD(Dresult, Da, Db); }); +} + +template<size_t size> +void EmitCompare(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto flags = ctx.reg_alloc.WriteFlags(inst); + auto Va = ctx.reg_alloc.ReadVec<size>(args[0]); + const bool exc_on_qnan = args[2].GetImmediateU1(); + + if (args[1].IsImmediate() && args[1].GetImmediateU64() == 0) { + RegAlloc::Realize(flags, Va); + ctx.fpsr.Load(); + + if (exc_on_qnan) { + code.FCMPE(Va, 0); + } else { + code.FCMP(Va, 0); + } + } else { + auto Vb = ctx.reg_alloc.ReadVec<size>(args[1]); + RegAlloc::Realize(flags, Va, Vb); + ctx.fpsr.Load(); + + if (exc_on_qnan) { + code.FCMPE(Va, Vb); + } else { + code.FCMP(Va, Vb); + } + } +} + +template<> +void EmitIR<IR::Opcode::FPCompare32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitCompare<32>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::FPCompare64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitCompare<64>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::FPDiv32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& Sb) { code.FDIV(Sresult, Sa, Sb); }); +} + +template<> +void EmitIR<IR::Opcode::FPDiv64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& Db) { code.FDIV(Dresult, Da, Db); }); +} + +template<> +void EmitIR<IR::Opcode::FPMax32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& Sb) { code.FMAX(Sresult, Sa, Sb); }); +} + +template<> +void EmitIR<IR::Opcode::FPMax64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& Db) { code.FMAX(Dresult, Da, Db); }); +} + +template<> +void EmitIR<IR::Opcode::FPMaxNumeric32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& Sb) { code.FMAXNM(Sresult, Sa, Sb); }); +} + +template<> +void EmitIR<IR::Opcode::FPMaxNumeric64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& Db) { code.FMAXNM(Dresult, Da, Db); }); +} + +template<> +void EmitIR<IR::Opcode::FPMin32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& Sb) { code.FMIN(Sresult, Sa, Sb); }); +} + +template<> +void EmitIR<IR::Opcode::FPMin64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& Db) { code.FMIN(Dresult, Da, Db); }); +} + +template<> +void EmitIR<IR::Opcode::FPMinNumeric32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& Sb) { code.FMINNM(Sresult, Sa, Sb); }); +} + +template<> +void EmitIR<IR::Opcode::FPMinNumeric64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& Db) { code.FMINNM(Dresult, Da, Db); }); +} + +template<> +void EmitIR<IR::Opcode::FPMul32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<32>(code, ctx, inst, 
[&](auto& Sresult, auto& Sa, auto& Sb) { code.FMUL(Sresult, Sa, Sb); }); +} + +template<> +void EmitIR<IR::Opcode::FPMul64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& Db) { code.FMUL(Dresult, Da, Db); }); +} + +template<> +void EmitIR<IR::Opcode::FPMulAdd16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPMulAdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitFourOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& S1, auto& S2) { code.FMADD(Sresult, S1, S2, Sa); }); +} + +template<> +void EmitIR<IR::Opcode::FPMulAdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitFourOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& D1, auto& D2) { code.FMADD(Dresult, D1, D2, Da); }); +} + +template<> +void EmitIR<IR::Opcode::FPMulSub16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPMulSub32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitFourOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& S1, auto& S2) { code.FMSUB(Sresult, S1, S2, Sa); }); +} + +template<> +void EmitIR<IR::Opcode::FPMulSub64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitFourOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& D1, auto& D2) { code.FMSUB(Dresult, D1, D2, Da); }); +} + +template<> +void EmitIR<IR::Opcode::FPMulX32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& Sb) { code.FMULX(Sresult, Sa, Sb); }); +} + +template<> +void EmitIR<IR::Opcode::FPMulX64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& Db) { code.FMULX(Dresult, Da, Db); }); +} + +template<> +void EmitIR<IR::Opcode::FPNeg16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPNeg32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Soperand) { code.FNEG(Sresult, Soperand); }); +} + +template<> +void EmitIR<IR::Opcode::FPNeg64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Doperand) { code.FNEG(Dresult, Doperand); }); +} + +template<> +void EmitIR<IR::Opcode::FPRecipEstimate16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPRecipEstimate32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Soperand) { code.FRECPE(Sresult, Soperand); }); +} + +template<> +void EmitIR<IR::Opcode::FPRecipEstimate64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Doperand) { code.FRECPE(Dresult, Doperand); }); +} + +template<> +void EmitIR<IR::Opcode::FPRecipExponent16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + 
(void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPRecipExponent32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Soperand) { code.FRECPX(Sresult, Soperand); }); +} + +template<> +void EmitIR<IR::Opcode::FPRecipExponent64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Doperand) { code.FRECPX(Dresult, Doperand); }); +} + +template<> +void EmitIR<IR::Opcode::FPRecipStepFused16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPRecipStepFused32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& Sb) { code.FRECPS(Sresult, Sa, Sb); }); +} + +template<> +void EmitIR<IR::Opcode::FPRecipStepFused64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& Db) { code.FRECPS(Dresult, Da, Db); }); +} + +template<> +void EmitIR<IR::Opcode::FPRoundInt16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPRoundInt32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto rounding_mode = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8()); + const bool exact = inst->GetArg(2).GetU1(); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Sresult = ctx.reg_alloc.WriteS(inst); + auto Soperand = ctx.reg_alloc.ReadS(args[0]); + RegAlloc::Realize(Sresult, Soperand); + ctx.fpsr.Load(); + + if (exact) { + ASSERT(ctx.FPCR().RMode() == rounding_mode); + code.FRINTX(Sresult, Soperand); + } else { + switch (rounding_mode) { + case FP::RoundingMode::ToNearest_TieEven: + code.FRINTN(Sresult, Soperand); + break; + case FP::RoundingMode::TowardsPlusInfinity: + code.FRINTP(Sresult, Soperand); + break; + case FP::RoundingMode::TowardsMinusInfinity: + code.FRINTM(Sresult, Soperand); + break; + case FP::RoundingMode::TowardsZero: + code.FRINTZ(Sresult, Soperand); + break; + case FP::RoundingMode::ToNearest_TieAwayFromZero: + code.FRINTA(Sresult, Soperand); + break; + default: + ASSERT_FALSE("Invalid RoundingMode"); + } + } +} + +template<> +void EmitIR<IR::Opcode::FPRoundInt64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto rounding_mode = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8()); + const bool exact = inst->GetArg(2).GetU1(); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Dresult = ctx.reg_alloc.WriteD(inst); + auto Doperand = ctx.reg_alloc.ReadD(args[0]); + RegAlloc::Realize(Dresult, Doperand); + ctx.fpsr.Load(); + + if (exact) { + ASSERT(ctx.FPCR().RMode() == rounding_mode); + code.FRINTX(Dresult, Doperand); + } else { + switch (rounding_mode) { + case FP::RoundingMode::ToNearest_TieEven: + code.FRINTN(Dresult, Doperand); + break; + case FP::RoundingMode::TowardsPlusInfinity: + code.FRINTP(Dresult, Doperand); + break; + case FP::RoundingMode::TowardsMinusInfinity: + code.FRINTM(Dresult, Doperand); + break; + case FP::RoundingMode::TowardsZero: + code.FRINTZ(Dresult, Doperand); + break; + case FP::RoundingMode::ToNearest_TieAwayFromZero: + code.FRINTA(Dresult, Doperand); + break; + default: + ASSERT_FALSE("Invalid 
RoundingMode"); + } + } +} + +template<> +void EmitIR<IR::Opcode::FPRSqrtEstimate16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPRSqrtEstimate32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Soperand) { code.FRSQRTE(Sresult, Soperand); }); +} + +template<> +void EmitIR<IR::Opcode::FPRSqrtEstimate64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Doperand) { code.FRSQRTE(Dresult, Doperand); }); +} + +template<> +void EmitIR<IR::Opcode::FPRSqrtStepFused16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPRSqrtStepFused32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& Sb) { code.FRSQRTS(Sresult, Sa, Sb); }); +} + +template<> +void EmitIR<IR::Opcode::FPRSqrtStepFused64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& Db) { code.FRSQRTS(Dresult, Da, Db); }); +} + +template<> +void EmitIR<IR::Opcode::FPSqrt32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Soperand) { code.FSQRT(Sresult, Soperand); }); +} + +template<> +void EmitIR<IR::Opcode::FPSqrt64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Doperand) { code.FSQRT(Dresult, Doperand); }); +} + +template<> +void EmitIR<IR::Opcode::FPSub32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<32>(code, ctx, inst, [&](auto& Sresult, auto& Sa, auto& Sb) { code.FSUB(Sresult, Sa, Sb); }); +} + +template<> +void EmitIR<IR::Opcode::FPSub64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp<64>(code, ctx, inst, [&](auto& Dresult, auto& Da, auto& Db) { code.FSUB(Dresult, Da, Db); }); +} + +template<> +void EmitIR<IR::Opcode::FPHalfToDouble>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitConvert<16, 64>(code, ctx, inst, [&](auto& Dto, auto& Hfrom) { code.FCVT(Dto, Hfrom); }); +} + +template<> +void EmitIR<IR::Opcode::FPHalfToSingle>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitConvert<16, 32>(code, ctx, inst, [&](auto& Sto, auto& Hfrom) { code.FCVT(Sto, Hfrom); }); +} + +template<> +void EmitIR<IR::Opcode::FPSingleToDouble>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitConvert<32, 64>(code, ctx, inst, [&](auto& Dto, auto& Sfrom) { code.FCVT(Dto, Sfrom); }); +} + +template<> +void EmitIR<IR::Opcode::FPSingleToHalf>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitConvert<32, 16>(code, ctx, inst, [&](auto& Hto, auto& Sfrom) { code.FCVT(Hto, Sfrom); }); +} + +template<> +void EmitIR<IR::Opcode::FPDoubleToHalf>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitConvert<64, 16>(code, ctx, inst, [&](auto& Hto, auto& Dfrom) { code.FCVT(Hto, Dfrom); }); +} + +template<> +void EmitIR<IR::Opcode::FPDoubleToSingle>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto rounding_mode = 
static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8()); + + if (rounding_mode == FP::RoundingMode::ToOdd) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Sto = ctx.reg_alloc.WriteS(inst); + auto Dfrom = ctx.reg_alloc.ReadD(args[0]); + RegAlloc::Realize(Sto, Dfrom); + ctx.fpsr.Load(); + + code.FCVTXN(Sto, Dfrom); + + return; + } + + EmitConvert<64, 32>(code, ctx, inst, [&](auto& Sto, auto& Dfrom) { code.FCVT(Sto, Dfrom); }); +} + +template<> +void EmitIR<IR::Opcode::FPDoubleToFixedS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitToFixed<64, 16, true>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::FPDoubleToFixedS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitToFixed<64, 32, true>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::FPDoubleToFixedS64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + // TODO: Consider fpr source + EmitToFixed<64, 64, true>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::FPDoubleToFixedU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitToFixed<64, 16, false>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::FPDoubleToFixedU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitToFixed<64, 32, false>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::FPDoubleToFixedU64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + // TODO: Consider fpr source + EmitToFixed<64, 64, false>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::FPHalfToFixedS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPHalfToFixedS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPHalfToFixedS64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPHalfToFixedU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPHalfToFixedU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPHalfToFixedU64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPSingleToFixedS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitToFixed<32, 16, true>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::FPSingleToFixedS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + // TODO: Consider fpr source + EmitToFixed<32, 32, true>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::FPSingleToFixedS64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitToFixed<32, 64, true>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::FPSingleToFixedU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitToFixed<32, 16, false>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::FPSingleToFixedU32>(oaknut::CodeGenerator& code, 
EmitContext& ctx, IR::Inst* inst) { + // TODO: Consider fpr source + EmitToFixed<32, 32, false>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::FPSingleToFixedU64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitToFixed<32, 64, false>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::FPFixedU16ToSingle>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitFromFixed<16, 32>(code, ctx, inst, [&](auto& Sto, auto& Wfrom, u8 fbits) { + code.LSL(Wscratch0, Wfrom, 16); + code.UCVTF(Sto, Wscratch0, fbits + 16); + }); +} + +template<> +void EmitIR<IR::Opcode::FPFixedS16ToSingle>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitFromFixed<16, 32>(code, ctx, inst, [&](auto& Sto, auto& Wfrom, u8 fbits) { + code.LSL(Wscratch0, Wfrom, 16); + code.SCVTF(Sto, Wscratch0, fbits + 16); + }); +} + +template<> +void EmitIR<IR::Opcode::FPFixedU16ToDouble>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitFromFixed<16, 64>(code, ctx, inst, [&](auto& Dto, auto& Wfrom, u8 fbits) { + code.LSL(Wscratch0, Wfrom, 16); + code.UCVTF(Dto, Wscratch0, fbits + 16); + }); +} + +template<> +void EmitIR<IR::Opcode::FPFixedS16ToDouble>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitFromFixed<16, 64>(code, ctx, inst, [&](auto& Dto, auto& Wfrom, u8 fbits) { + code.LSL(Wscratch0, Wfrom, 16); + code.SCVTF(Dto, Wscratch0, fbits + 16); + }); +} + +template<> +void EmitIR<IR::Opcode::FPFixedU32ToSingle>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + // TODO: Consider fpr source + EmitFromFixed<32, 32>(code, ctx, inst, [&](auto& Sto, auto& Wfrom, u8 fbits) { fbits ? code.UCVTF(Sto, Wfrom, fbits) : code.UCVTF(Sto, Wfrom); }); +} + +template<> +void EmitIR<IR::Opcode::FPFixedS32ToSingle>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + // TODO: Consider fpr source + EmitFromFixed<32, 32>(code, ctx, inst, [&](auto& Sto, auto& Wfrom, u8 fbits) { fbits ? code.SCVTF(Sto, Wfrom, fbits) : code.SCVTF(Sto, Wfrom); }); +} + +template<> +void EmitIR<IR::Opcode::FPFixedU32ToDouble>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitFromFixed<32, 64>(code, ctx, inst, [&](auto& Dto, auto& Wfrom, u8 fbits) { fbits ? code.UCVTF(Dto, Wfrom, fbits) : code.UCVTF(Dto, Wfrom); }); +} + +template<> +void EmitIR<IR::Opcode::FPFixedS32ToDouble>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitFromFixed<32, 64>(code, ctx, inst, [&](auto& Dto, auto& Wfrom, u8 fbits) { fbits ? code.SCVTF(Dto, Wfrom, fbits) : code.SCVTF(Dto, Wfrom); }); +} + +template<> +void EmitIR<IR::Opcode::FPFixedU64ToDouble>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + // TODO: Consider fpr source + EmitFromFixed<64, 64>(code, ctx, inst, [&](auto& Dto, auto& Xfrom, u8 fbits) { fbits ? code.UCVTF(Dto, Xfrom, fbits) : code.UCVTF(Dto, Xfrom); }); +} + +template<> +void EmitIR<IR::Opcode::FPFixedU64ToSingle>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitFromFixed<64, 32>(code, ctx, inst, [&](auto& Sto, auto& Xfrom, u8 fbits) { fbits ? code.UCVTF(Sto, Xfrom, fbits) : code.UCVTF(Sto, Xfrom); }); +} + +template<> +void EmitIR<IR::Opcode::FPFixedS64ToDouble>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + // TODO: Consider fpr source + EmitFromFixed<64, 64>(code, ctx, inst, [&](auto& Dto, auto& Xfrom, u8 fbits) { fbits ? 
code.SCVTF(Dto, Xfrom, fbits) : code.SCVTF(Dto, Xfrom); }); +} + +template<> +void EmitIR<IR::Opcode::FPFixedS64ToSingle>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitFromFixed<64, 32>(code, ctx, inst, [&](auto& Sto, auto& Xfrom, u8 fbits) { fbits ? code.SCVTF(Sto, Xfrom, fbits) : code.SCVTF(Sto, Xfrom); }); +} + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_memory.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_memory.cpp new file mode 100644 index 0000000000..339e2c59a0 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_memory.cpp @@ -0,0 +1,683 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/arm64/emit_arm64_memory.h" + +#include <optional> +#include <utility> + +#include <mcl/bit_cast.hpp> +#include <oaknut/oaknut.hpp> + +#include "dynarmic/backend/arm64/abi.h" +#include "dynarmic/backend/arm64/emit_arm64.h" +#include "dynarmic/backend/arm64/emit_context.h" +#include "dynarmic/backend/arm64/fastmem.h" +#include "dynarmic/backend/arm64/fpsr_manager.h" +#include "dynarmic/backend/arm64/reg_alloc.h" +#include "dynarmic/ir/acc_type.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" + +namespace Dynarmic::Backend::Arm64 { + +using namespace oaknut::util; + +namespace { + +bool IsOrdered(IR::AccType acctype) { + return acctype == IR::AccType::ORDERED || acctype == IR::AccType::ORDEREDRW || acctype == IR::AccType::LIMITEDORDERED; +} + +LinkTarget ReadMemoryLinkTarget(size_t bitsize) { + switch (bitsize) { + case 8: + return LinkTarget::ReadMemory8; + case 16: + return LinkTarget::ReadMemory16; + case 32: + return LinkTarget::ReadMemory32; + case 64: + return LinkTarget::ReadMemory64; + case 128: + return LinkTarget::ReadMemory128; + } + UNREACHABLE(); +} + +LinkTarget WriteMemoryLinkTarget(size_t bitsize) { + switch (bitsize) { + case 8: + return LinkTarget::WriteMemory8; + case 16: + return LinkTarget::WriteMemory16; + case 32: + return LinkTarget::WriteMemory32; + case 64: + return LinkTarget::WriteMemory64; + case 128: + return LinkTarget::WriteMemory128; + } + UNREACHABLE(); +} + +LinkTarget WrappedReadMemoryLinkTarget(size_t bitsize) { + switch (bitsize) { + case 8: + return LinkTarget::WrappedReadMemory8; + case 16: + return LinkTarget::WrappedReadMemory16; + case 32: + return LinkTarget::WrappedReadMemory32; + case 64: + return LinkTarget::WrappedReadMemory64; + case 128: + return LinkTarget::WrappedReadMemory128; + } + UNREACHABLE(); +} + +LinkTarget WrappedWriteMemoryLinkTarget(size_t bitsize) { + switch (bitsize) { + case 8: + return LinkTarget::WrappedWriteMemory8; + case 16: + return LinkTarget::WrappedWriteMemory16; + case 32: + return LinkTarget::WrappedWriteMemory32; + case 64: + return LinkTarget::WrappedWriteMemory64; + case 128: + return LinkTarget::WrappedWriteMemory128; + } + UNREACHABLE(); +} + +LinkTarget ExclusiveReadMemoryLinkTarget(size_t bitsize) { + switch (bitsize) { + case 8: + return LinkTarget::ExclusiveReadMemory8; + case 16: + return LinkTarget::ExclusiveReadMemory16; + case 32: + return LinkTarget::ExclusiveReadMemory32; + case 64: + return LinkTarget::ExclusiveReadMemory64; + case 128: + return LinkTarget::ExclusiveReadMemory128; + } + UNREACHABLE(); +} + +LinkTarget ExclusiveWriteMemoryLinkTarget(size_t bitsize) { + switch (bitsize) { + case 8: + return 
LinkTarget::ExclusiveWriteMemory8; + case 16: + return LinkTarget::ExclusiveWriteMemory16; + case 32: + return LinkTarget::ExclusiveWriteMemory32; + case 64: + return LinkTarget::ExclusiveWriteMemory64; + case 128: + return LinkTarget::ExclusiveWriteMemory128; + } + UNREACHABLE(); +} + +template<size_t bitsize> +void CallbackOnlyEmitReadMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.PrepareForCall({}, args[1]); + const bool ordered = IsOrdered(args[2].GetImmediateAccType()); + + EmitRelocation(code, ctx, ReadMemoryLinkTarget(bitsize)); + if (ordered) { + code.DMB(oaknut::BarrierOp::ISH); + } + + if constexpr (bitsize == 128) { + code.MOV(Q8.B16(), Q0.B16()); + ctx.reg_alloc.DefineAsRegister(inst, Q8); + } else { + ctx.reg_alloc.DefineAsRegister(inst, X0); + } +} + +template<size_t bitsize> +void CallbackOnlyEmitExclusiveReadMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.PrepareForCall({}, args[1]); + const bool ordered = IsOrdered(args[2].GetImmediateAccType()); + + code.MOV(Wscratch0, 1); + code.STRB(Wscratch0, Xstate, ctx.conf.state_exclusive_state_offset); + EmitRelocation(code, ctx, ExclusiveReadMemoryLinkTarget(bitsize)); + if (ordered) { + code.DMB(oaknut::BarrierOp::ISH); + } + + if constexpr (bitsize == 128) { + code.MOV(Q8.B16(), Q0.B16()); + ctx.reg_alloc.DefineAsRegister(inst, Q8); + } else { + ctx.reg_alloc.DefineAsRegister(inst, X0); + } +} + +template<size_t bitsize> +void CallbackOnlyEmitWriteMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.PrepareForCall({}, args[1], args[2]); + const bool ordered = IsOrdered(args[3].GetImmediateAccType()); + + if (ordered) { + code.DMB(oaknut::BarrierOp::ISH); + } + EmitRelocation(code, ctx, WriteMemoryLinkTarget(bitsize)); + if (ordered) { + code.DMB(oaknut::BarrierOp::ISH); + } +} + +template<size_t bitsize> +void CallbackOnlyEmitExclusiveWriteMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.PrepareForCall({}, args[1], args[2]); + const bool ordered = IsOrdered(args[3].GetImmediateAccType()); + + oaknut::Label end; + + if (ordered) { + code.DMB(oaknut::BarrierOp::ISH); + } + code.MOV(W0, 1); + code.LDRB(Wscratch0, Xstate, ctx.conf.state_exclusive_state_offset); + code.CBZ(Wscratch0, end); + code.STRB(WZR, Xstate, ctx.conf.state_exclusive_state_offset); + EmitRelocation(code, ctx, ExclusiveWriteMemoryLinkTarget(bitsize)); + if (ordered) { + code.DMB(oaknut::BarrierOp::ISH); + } + code.l(end); + ctx.reg_alloc.DefineAsRegister(inst, X0); +} + +constexpr size_t page_bits = 12; +constexpr size_t page_size = 1 << page_bits; +constexpr size_t page_mask = (1 << page_bits) - 1; + +// This function may use Xscratch0 as a scratch register +// Trashes NZCV +template<size_t bitsize> +void EmitDetectMisalignedVAddr(oaknut::CodeGenerator& code, EmitContext& ctx, oaknut::XReg Xaddr, const SharedLabel& fallback) { + static_assert(bitsize == 8 || bitsize == 16 || bitsize == 32 || bitsize == 64 || bitsize == 128); + + if (bitsize == 8 || (ctx.conf.detect_misaligned_access_via_page_table & bitsize) == 0) { + return; + } + + if (!ctx.conf.only_detect_misalignment_via_page_table_on_page_boundary) { + const u64 align_mask = []() -> u64 { + switch (bitsize) { + case 16: + return 0b1; + case 32: + return 
0b11; + case 64: + return 0b111; + case 128: + return 0b1111; + default: + UNREACHABLE(); + } + }(); + + code.TST(Xaddr, align_mask); + code.B(NE, *fallback); + } else { + // If (addr & page_mask) > page_size - byte_size, use fallback. + code.AND(Xscratch0, Xaddr, page_mask); + code.CMP(Xscratch0, page_size - bitsize / 8); + code.B(HI, *fallback); + } +} + +// Outputs Xscratch0 = page_table[addr >> page_bits] +// May use Xscratch1 as scratch register +// Address to read/write = [ret0 + ret1], ret0 is always Xscratch0 and ret1 is either Xaddr or Xscratch1 +// Trashes NZCV +template<size_t bitsize> +std::pair<oaknut::XReg, oaknut::XReg> InlinePageTableEmitVAddrLookup(oaknut::CodeGenerator& code, EmitContext& ctx, oaknut::XReg Xaddr, const SharedLabel& fallback) { + const size_t valid_page_index_bits = ctx.conf.page_table_address_space_bits - page_bits; + const size_t unused_top_bits = 64 - ctx.conf.page_table_address_space_bits; + + EmitDetectMisalignedVAddr<bitsize>(code, ctx, Xaddr, fallback); + + if (ctx.conf.silently_mirror_page_table || unused_top_bits == 0) { + code.UBFX(Xscratch0, Xaddr, page_bits, valid_page_index_bits); + } else { + code.LSR(Xscratch0, Xaddr, page_bits); + code.TST(Xscratch0, u64(~u64(0)) << valid_page_index_bits); + code.B(NE, *fallback); + } + + code.LDR(Xscratch0, Xpagetable, Xscratch0, LSL, 3); + + if (ctx.conf.page_table_pointer_mask_bits != 0) { + const u64 mask = u64(~u64(0)) << ctx.conf.page_table_pointer_mask_bits; + code.AND(Xscratch0, Xscratch0, mask); + } + + code.CBZ(Xscratch0, *fallback); + + if (ctx.conf.absolute_offset_page_table) { + return std::make_pair(Xscratch0, Xaddr); + } + code.AND(Xscratch1, Xaddr, page_mask); + return std::make_pair(Xscratch0, Xscratch1); +} + +template<std::size_t bitsize> +CodePtr EmitMemoryLdr(oaknut::CodeGenerator& code, int value_idx, oaknut::XReg Xbase, oaknut::XReg Xoffset, bool ordered, bool extend32 = false) { + const auto index_ext = extend32 ? oaknut::IndexExt::UXTW : oaknut::IndexExt::LSL; + const auto add_ext = extend32 ? oaknut::AddSubExt::UXTW : oaknut::AddSubExt::LSL; + const auto Roffset = extend32 ? 
oaknut::RReg{Xoffset.toW()} : oaknut::RReg{Xoffset}; + + CodePtr fastmem_location = code.xptr<CodePtr>(); + + if (ordered) { + code.ADD(Xscratch0, Xbase, Roffset, add_ext); + + fastmem_location = code.xptr<CodePtr>(); + + switch (bitsize) { + case 8: + code.LDARB(oaknut::WReg{value_idx}, Xscratch0); + break; + case 16: + code.LDARH(oaknut::WReg{value_idx}, Xscratch0); + break; + case 32: + code.LDAR(oaknut::WReg{value_idx}, Xscratch0); + break; + case 64: + code.LDAR(oaknut::XReg{value_idx}, Xscratch0); + break; + case 128: + code.LDR(oaknut::QReg{value_idx}, Xscratch0); + code.DMB(oaknut::BarrierOp::ISH); + break; + default: + ASSERT_FALSE("Invalid bitsize"); + } + } else { + fastmem_location = code.xptr<CodePtr>(); + + switch (bitsize) { + case 8: + code.LDRB(oaknut::WReg{value_idx}, Xbase, Roffset, index_ext); + break; + case 16: + code.LDRH(oaknut::WReg{value_idx}, Xbase, Roffset, index_ext); + break; + case 32: + code.LDR(oaknut::WReg{value_idx}, Xbase, Roffset, index_ext); + break; + case 64: + code.LDR(oaknut::XReg{value_idx}, Xbase, Roffset, index_ext); + break; + case 128: + code.LDR(oaknut::QReg{value_idx}, Xbase, Roffset, index_ext); + break; + default: + ASSERT_FALSE("Invalid bitsize"); + } + } + + return fastmem_location; +} + +template<std::size_t bitsize> +CodePtr EmitMemoryStr(oaknut::CodeGenerator& code, int value_idx, oaknut::XReg Xbase, oaknut::XReg Xoffset, bool ordered, bool extend32 = false) { + const auto index_ext = extend32 ? oaknut::IndexExt::UXTW : oaknut::IndexExt::LSL; + const auto add_ext = extend32 ? oaknut::AddSubExt::UXTW : oaknut::AddSubExt::LSL; + const auto Roffset = extend32 ? oaknut::RReg{Xoffset.toW()} : oaknut::RReg{Xoffset}; + + CodePtr fastmem_location; + + if (ordered) { + code.ADD(Xscratch0, Xbase, Roffset, add_ext); + + fastmem_location = code.xptr<CodePtr>(); + + switch (bitsize) { + case 8: + code.STLRB(oaknut::WReg{value_idx}, Xscratch0); + break; + case 16: + code.STLRH(oaknut::WReg{value_idx}, Xscratch0); + break; + case 32: + code.STLR(oaknut::WReg{value_idx}, Xscratch0); + break; + case 64: + code.STLR(oaknut::XReg{value_idx}, Xscratch0); + break; + case 128: + code.DMB(oaknut::BarrierOp::ISH); + code.STR(oaknut::QReg{value_idx}, Xscratch0); + code.DMB(oaknut::BarrierOp::ISH); + break; + default: + ASSERT_FALSE("Invalid bitsize"); + } + } else { + fastmem_location = code.xptr<CodePtr>(); + + switch (bitsize) { + case 8: + code.STRB(oaknut::WReg{value_idx}, Xbase, Roffset, index_ext); + break; + case 16: + code.STRH(oaknut::WReg{value_idx}, Xbase, Roffset, index_ext); + break; + case 32: + code.STR(oaknut::WReg{value_idx}, Xbase, Roffset, index_ext); + break; + case 64: + code.STR(oaknut::XReg{value_idx}, Xbase, Roffset, index_ext); + break; + case 128: + code.STR(oaknut::QReg{value_idx}, Xbase, Roffset, index_ext); + break; + default: + ASSERT_FALSE("Invalid bitsize"); + } + } + + return fastmem_location; +} + +template<size_t bitsize> +void InlinePageTableEmitReadMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Xaddr = ctx.reg_alloc.ReadX(args[1]); + auto Rvalue = [&] { + if constexpr (bitsize == 128) { + return ctx.reg_alloc.WriteQ(inst); + } else { + return ctx.reg_alloc.WriteReg<std::max<std::size_t>(bitsize, 32)>(inst); + } + }(); + const bool ordered = IsOrdered(args[2].GetImmediateAccType()); + ctx.fpsr.Spill(); + ctx.reg_alloc.SpillFlags(); + RegAlloc::Realize(Xaddr, Rvalue); + + SharedLabel fallback = GenSharedLabel(), end = GenSharedLabel(); + + 
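For reference, the fast path assembled at this point (InlinePageTableEmitVAddrLookup followed by EmitMemoryLdr) reduces to a simple table lookup. The scalar sketch below mirrors the page_bits/page_mask constants defined above; the helper name is invented, and the pointer-mask, absolute-offset and address-space-bits variants selected by the config flags are omitted.

#include <cstddef>
#include <cstdint>

constexpr std::size_t page_bits = 12;
constexpr std::uint64_t page_mask = (std::uint64_t{1} << page_bits) - 1;

// Scalar model of the emitted UBFX/LDR/CBZ sequence: index the table by the
// guest page number, bail out to the slow callback when the entry is null,
// otherwise add the page offset to the host base pointer.
inline std::uint8_t* ModelPageTableLookup(std::uint8_t** page_table, std::uint64_t vaddr) {
    std::uint8_t* host_page = page_table[vaddr >> page_bits];
    if (host_page == nullptr) {
        return nullptr;  // corresponds to the CBZ into the deferred fallback below
    }
    return host_page + (vaddr & page_mask);
}

The generated access then uses the returned pair as [base + offset], with LDAR/STLR (or DMB-bracketed accesses for the 128-bit case) substituted when the access is ordered.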
const auto [Xbase, Xoffset] = InlinePageTableEmitVAddrLookup<bitsize>(code, ctx, Xaddr, fallback); + EmitMemoryLdr<bitsize>(code, Rvalue->index(), Xbase, Xoffset, ordered); + + ctx.deferred_emits.emplace_back([&code, &ctx, inst, Xaddr = *Xaddr, Rvalue = *Rvalue, ordered, fallback, end] { + code.l(*fallback); + code.MOV(Xscratch0, Xaddr); + EmitRelocation(code, ctx, WrappedReadMemoryLinkTarget(bitsize)); + if (ordered) { + code.DMB(oaknut::BarrierOp::ISH); + } + if constexpr (bitsize == 128) { + code.MOV(Rvalue.B16(), Q0.B16()); + } else { + code.MOV(Rvalue.toX(), Xscratch0); + } + ctx.conf.emit_check_memory_abort(code, ctx, inst, *end); + code.B(*end); + }); + + code.l(*end); +} + +template<size_t bitsize> +void InlinePageTableEmitWriteMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Xaddr = ctx.reg_alloc.ReadX(args[1]); + auto Rvalue = [&] { + if constexpr (bitsize == 128) { + return ctx.reg_alloc.ReadQ(args[2]); + } else { + return ctx.reg_alloc.ReadReg<std::max<std::size_t>(bitsize, 32)>(args[2]); + } + }(); + const bool ordered = IsOrdered(args[3].GetImmediateAccType()); + ctx.fpsr.Spill(); + ctx.reg_alloc.SpillFlags(); + RegAlloc::Realize(Xaddr, Rvalue); + + SharedLabel fallback = GenSharedLabel(), end = GenSharedLabel(); + + const auto [Xbase, Xoffset] = InlinePageTableEmitVAddrLookup<bitsize>(code, ctx, Xaddr, fallback); + EmitMemoryStr<bitsize>(code, Rvalue->index(), Xbase, Xoffset, ordered); + + ctx.deferred_emits.emplace_back([&code, &ctx, inst, Xaddr = *Xaddr, Rvalue = *Rvalue, ordered, fallback, end] { + code.l(*fallback); + if constexpr (bitsize == 128) { + code.MOV(Xscratch0, Xaddr); + code.MOV(Q0.B16(), Rvalue.B16()); + } else { + code.MOV(Xscratch0, Xaddr); + code.MOV(Xscratch1, Rvalue.toX()); + } + if (ordered) { + code.DMB(oaknut::BarrierOp::ISH); + } + EmitRelocation(code, ctx, WrappedWriteMemoryLinkTarget(bitsize)); + if (ordered) { + code.DMB(oaknut::BarrierOp::ISH); + } + ctx.conf.emit_check_memory_abort(code, ctx, inst, *end); + code.B(*end); + }); + + code.l(*end); +} + +std::optional<DoNotFastmemMarker> ShouldFastmem(EmitContext& ctx, IR::Inst* inst) { + if (!ctx.conf.fastmem_pointer || !ctx.fastmem.SupportsFastmem()) { + return std::nullopt; + } + + const auto marker = std::make_tuple(ctx.block.Location(), inst->GetName()); + if (ctx.fastmem.ShouldFastmem(marker)) { + return marker; + } + return std::nullopt; +} + +inline bool ShouldExt32(EmitContext& ctx) { + return ctx.conf.fastmem_address_space_bits == 32 && ctx.conf.silently_mirror_fastmem; +} + +// May use Xscratch0 as scratch register +// Address to read/write = [ret0 + ret1], ret0 is always Xfastmem and ret1 is either Xaddr or Xscratch0 +// Trashes NZCV +template<size_t bitsize> +std::pair<oaknut::XReg, oaknut::XReg> FastmemEmitVAddrLookup(oaknut::CodeGenerator& code, EmitContext& ctx, oaknut::XReg Xaddr, const SharedLabel& fallback) { + if (ctx.conf.fastmem_address_space_bits == 64 || ShouldExt32(ctx)) { + return std::make_pair(Xfastmem, Xaddr); + } + + if (ctx.conf.silently_mirror_fastmem) { + code.UBFX(Xscratch0, Xaddr, 0, ctx.conf.fastmem_address_space_bits); + return std::make_pair(Xfastmem, Xscratch0); + } + + code.LSR(Xscratch0, Xaddr, ctx.conf.fastmem_address_space_bits); + code.CBNZ(Xscratch0, *fallback); + return std::make_pair(Xfastmem, Xaddr); +} + +template<size_t bitsize> +void FastmemEmitReadMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, DoNotFastmemMarker marker) { + auto args = 
ctx.reg_alloc.GetArgumentInfo(inst); + auto Xaddr = ctx.reg_alloc.ReadX(args[1]); + auto Rvalue = [&] { + if constexpr (bitsize == 128) { + return ctx.reg_alloc.WriteQ(inst); + } else { + return ctx.reg_alloc.WriteReg<std::max<std::size_t>(bitsize, 32)>(inst); + } + }(); + const bool ordered = IsOrdered(args[2].GetImmediateAccType()); + ctx.fpsr.Spill(); + ctx.reg_alloc.SpillFlags(); + RegAlloc::Realize(Xaddr, Rvalue); + + SharedLabel fallback = GenSharedLabel(), end = GenSharedLabel(); + + const auto [Xbase, Xoffset] = FastmemEmitVAddrLookup<bitsize>(code, ctx, Xaddr, fallback); + const auto fastmem_location = EmitMemoryLdr<bitsize>(code, Rvalue->index(), Xbase, Xoffset, ordered, ShouldExt32(ctx)); + + ctx.deferred_emits.emplace_back([&code, &ctx, inst, marker, Xaddr = *Xaddr, Rvalue = *Rvalue, ordered, fallback, end, fastmem_location] { + ctx.ebi.fastmem_patch_info.emplace( + fastmem_location - ctx.ebi.entry_point, + FastmemPatchInfo{ + .marker = marker, + .fc = FakeCall{ + .call_pc = mcl::bit_cast<u64>(code.xptr<void*>()), + }, + .recompile = ctx.conf.recompile_on_fastmem_failure, + }); + + code.l(*fallback); + code.MOV(Xscratch0, Xaddr); + EmitRelocation(code, ctx, WrappedReadMemoryLinkTarget(bitsize)); + if (ordered) { + code.DMB(oaknut::BarrierOp::ISH); + } + if constexpr (bitsize == 128) { + code.MOV(Rvalue.B16(), Q0.B16()); + } else { + code.MOV(Rvalue.toX(), Xscratch0); + } + ctx.conf.emit_check_memory_abort(code, ctx, inst, *end); + code.B(*end); + }); + + code.l(*end); +} + +template<size_t bitsize> +void FastmemEmitWriteMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, DoNotFastmemMarker marker) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Xaddr = ctx.reg_alloc.ReadX(args[1]); + auto Rvalue = [&] { + if constexpr (bitsize == 128) { + return ctx.reg_alloc.ReadQ(args[2]); + } else { + return ctx.reg_alloc.ReadReg<std::max<std::size_t>(bitsize, 32)>(args[2]); + } + }(); + const bool ordered = IsOrdered(args[3].GetImmediateAccType()); + ctx.fpsr.Spill(); + ctx.reg_alloc.SpillFlags(); + RegAlloc::Realize(Xaddr, Rvalue); + + SharedLabel fallback = GenSharedLabel(), end = GenSharedLabel(); + + const auto [Xbase, Xoffset] = FastmemEmitVAddrLookup<bitsize>(code, ctx, Xaddr, fallback); + const auto fastmem_location = EmitMemoryStr<bitsize>(code, Rvalue->index(), Xbase, Xoffset, ordered, ShouldExt32(ctx)); + + ctx.deferred_emits.emplace_back([&code, &ctx, inst, marker, Xaddr = *Xaddr, Rvalue = *Rvalue, ordered, fallback, end, fastmem_location] { + ctx.ebi.fastmem_patch_info.emplace( + fastmem_location - ctx.ebi.entry_point, + FastmemPatchInfo{ + .marker = marker, + .fc = FakeCall{ + .call_pc = mcl::bit_cast<u64>(code.xptr<void*>()), + }, + .recompile = ctx.conf.recompile_on_fastmem_failure, + }); + + code.l(*fallback); + if constexpr (bitsize == 128) { + code.MOV(Xscratch0, Xaddr); + code.MOV(Q0.B16(), Rvalue.B16()); + } else { + code.MOV(Xscratch0, Xaddr); + code.MOV(Xscratch1, Rvalue.toX()); + } + if (ordered) { + code.DMB(oaknut::BarrierOp::ISH); + } + EmitRelocation(code, ctx, WrappedWriteMemoryLinkTarget(bitsize)); + if (ordered) { + code.DMB(oaknut::BarrierOp::ISH); + } + ctx.conf.emit_check_memory_abort(code, ctx, inst, *end); + code.B(*end); + }); + + code.l(*end); +} + +} // namespace + +template<size_t bitsize> +void EmitReadMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + if (const auto marker = ShouldFastmem(ctx, inst)) { + FastmemEmitReadMemory<bitsize>(code, ctx, inst, *marker); + } else if 
(ctx.conf.page_table_pointer != 0) { + InlinePageTableEmitReadMemory<bitsize>(code, ctx, inst); + } else { + CallbackOnlyEmitReadMemory<bitsize>(code, ctx, inst); + } +} + +template<size_t bitsize> +void EmitExclusiveReadMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + CallbackOnlyEmitExclusiveReadMemory<bitsize>(code, ctx, inst); +} + +template<size_t bitsize> +void EmitWriteMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + if (const auto marker = ShouldFastmem(ctx, inst)) { + FastmemEmitWriteMemory<bitsize>(code, ctx, inst, *marker); + } else if (ctx.conf.page_table_pointer != 0) { + InlinePageTableEmitWriteMemory<bitsize>(code, ctx, inst); + } else { + CallbackOnlyEmitWriteMemory<bitsize>(code, ctx, inst); + } +} + +template<size_t bitsize> +void EmitExclusiveWriteMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + CallbackOnlyEmitExclusiveWriteMemory<bitsize>(code, ctx, inst); +} + +template void EmitReadMemory<8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); +template void EmitReadMemory<16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); +template void EmitReadMemory<32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); +template void EmitReadMemory<64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); +template void EmitReadMemory<128>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); +template void EmitExclusiveReadMemory<8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); +template void EmitExclusiveReadMemory<16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); +template void EmitExclusiveReadMemory<32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); +template void EmitExclusiveReadMemory<64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); +template void EmitExclusiveReadMemory<128>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); +template void EmitWriteMemory<8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); +template void EmitWriteMemory<16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); +template void EmitWriteMemory<32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); +template void EmitWriteMemory<64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); +template void EmitWriteMemory<128>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); +template void EmitExclusiveWriteMemory<8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); +template void EmitExclusiveWriteMemory<16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); +template void EmitExclusiveWriteMemory<32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); +template void EmitExclusiveWriteMemory<64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); +template void EmitExclusiveWriteMemory<128>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_memory.h b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_memory.h new file mode 100644 index 0000000000..bfc6d71f4e --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_memory.h @@ -0,0 +1,32 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/stdint.hpp> + +namespace oaknut { +struct CodeGenerator; +struct Label; +} // namespace oaknut + +namespace Dynarmic::IR { +enum class AccType; +class Inst; +} // namespace Dynarmic::IR + +namespace Dynarmic::Backend::Arm64 { + +struct EmitContext; +enum class LinkTarget; + +template<size_t bitsize> +void EmitReadMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); +template<size_t bitsize> +void EmitExclusiveReadMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); +template<size_t bitsize> +void EmitWriteMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); +template<size_t bitsize> +void EmitExclusiveWriteMemory(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst); + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_packed.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_packed.cpp new file mode 100644 index 0000000000..b9f13ac6f2 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_packed.cpp @@ -0,0 +1,409 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <oaknut/oaknut.hpp> + +#include "dynarmic/backend/arm64/a32_jitstate.h" +#include "dynarmic/backend/arm64/abi.h" +#include "dynarmic/backend/arm64/emit_arm64.h" +#include "dynarmic/backend/arm64/emit_context.h" +#include "dynarmic/backend/arm64/fpsr_manager.h" +#include "dynarmic/backend/arm64/reg_alloc.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" + +namespace Dynarmic::Backend::Arm64 { + +using namespace oaknut::util; + +template<typename EmitFn> +static void EmitPackedOp(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + auto Vresult = ctx.reg_alloc.WriteD(inst); + auto Va = ctx.reg_alloc.ReadD(args[0]); + auto Vb = ctx.reg_alloc.ReadD(args[1]); + RegAlloc::Realize(Vresult, Va, Vb); + + emit(Vresult, Va, Vb); +} + +template<typename EmitFn> +static void EmitSaturatedPackedOp(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + auto Vresult = ctx.reg_alloc.WriteD(inst); + auto Va = ctx.reg_alloc.ReadD(args[0]); + auto Vb = ctx.reg_alloc.ReadD(args[1]); + RegAlloc::Realize(Vresult, Va, Vb); + ctx.fpsr.Spill(); + + emit(Vresult, Va, Vb); +} + +template<> +void EmitIR<IR::Opcode::PackedAddU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Vresult = ctx.reg_alloc.WriteD(inst); + auto Va = ctx.reg_alloc.ReadD(args[0]); + auto Vb = ctx.reg_alloc.ReadD(args[1]); + RegAlloc::Realize(Vresult, Va, Vb); + + code.ADD(Vresult->B8(), Va->B8(), Vb->B8()); + + if (ge_inst) { + auto Vge = ctx.reg_alloc.WriteD(ge_inst); + RegAlloc::Realize(Vge); + + code.CMHI(Vge->B8(), Va->B8(), Vresult->B8()); + } +} + +template<> +void EmitIR<IR::Opcode::PackedAddS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Vresult = ctx.reg_alloc.WriteD(inst); + auto Va = ctx.reg_alloc.ReadD(args[0]); + auto 
Vb = ctx.reg_alloc.ReadD(args[1]); + RegAlloc::Realize(Vresult, Va, Vb); + + code.ADD(Vresult->B8(), Va->B8(), Vb->B8()); + + if (ge_inst) { + auto Vge = ctx.reg_alloc.WriteD(ge_inst); + RegAlloc::Realize(Vge); + + code.SHADD(Vge->B8(), Va->B8(), Vb->B8()); + code.CMGE(Vge->B8(), Vge->B8(), 0); + } +} + +template<> +void EmitIR<IR::Opcode::PackedSubU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Vresult = ctx.reg_alloc.WriteD(inst); + auto Va = ctx.reg_alloc.ReadD(args[0]); + auto Vb = ctx.reg_alloc.ReadD(args[1]); + RegAlloc::Realize(Vresult, Va, Vb); + + code.SUB(Vresult->B8(), Va->B8(), Vb->B8()); + + if (ge_inst) { + auto Vge = ctx.reg_alloc.WriteD(ge_inst); + RegAlloc::Realize(Vge); + + code.UHSUB(Vge->B8(), Va->B8(), Vb->B8()); + code.CMGE(Vge->B8(), Vge->B8(), 0); + } +} + +template<> +void EmitIR<IR::Opcode::PackedSubS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Vresult = ctx.reg_alloc.WriteD(inst); + auto Va = ctx.reg_alloc.ReadD(args[0]); + auto Vb = ctx.reg_alloc.ReadD(args[1]); + RegAlloc::Realize(Vresult, Va, Vb); + + code.SUB(Vresult->B8(), Va->B8(), Vb->B8()); + + if (ge_inst) { + auto Vge = ctx.reg_alloc.WriteD(ge_inst); + RegAlloc::Realize(Vge); + + code.SHSUB(Vge->B8(), Va->B8(), Vb->B8()); + code.CMGE(Vge->B8(), Vge->B8(), 0); + } +} + +template<> +void EmitIR<IR::Opcode::PackedAddU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Vresult = ctx.reg_alloc.WriteD(inst); + auto Va = ctx.reg_alloc.ReadD(args[0]); + auto Vb = ctx.reg_alloc.ReadD(args[1]); + RegAlloc::Realize(Vresult, Va, Vb); + + code.ADD(Vresult->H4(), Va->H4(), Vb->H4()); + + if (ge_inst) { + auto Vge = ctx.reg_alloc.WriteD(ge_inst); + RegAlloc::Realize(Vge); + + code.CMHI(Vge->H4(), Va->H4(), Vresult->H4()); + } +} + +template<> +void EmitIR<IR::Opcode::PackedAddS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Vresult = ctx.reg_alloc.WriteD(inst); + auto Va = ctx.reg_alloc.ReadD(args[0]); + auto Vb = ctx.reg_alloc.ReadD(args[1]); + RegAlloc::Realize(Vresult, Va, Vb); + + code.ADD(Vresult->H4(), Va->H4(), Vb->H4()); + + if (ge_inst) { + auto Vge = ctx.reg_alloc.WriteD(ge_inst); + RegAlloc::Realize(Vge); + + code.SHADD(Vge->H4(), Va->H4(), Vb->H4()); + code.CMGE(Vge->H4(), Vge->H4(), 0); + } +} + +template<> +void EmitIR<IR::Opcode::PackedSubU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Vresult = ctx.reg_alloc.WriteD(inst); + auto Va = ctx.reg_alloc.ReadD(args[0]); + auto Vb = ctx.reg_alloc.ReadD(args[1]); + RegAlloc::Realize(Vresult, Va, Vb); + + code.SUB(Vresult->H4(), Va->H4(), Vb->H4()); + + if (ge_inst) { + auto Vge = ctx.reg_alloc.WriteD(ge_inst); + RegAlloc::Realize(Vge); + + code.UHSUB(Vge->H4(), Va->H4(), Vb->H4()); + code.CMGE(Vge->H4(), Vge->H4(), 0); + } +} + +template<> +void 
EmitIR<IR::Opcode::PackedSubS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Vresult = ctx.reg_alloc.WriteD(inst); + auto Va = ctx.reg_alloc.ReadD(args[0]); + auto Vb = ctx.reg_alloc.ReadD(args[1]); + RegAlloc::Realize(Vresult, Va, Vb); + + code.SUB(Vresult->H4(), Va->H4(), Vb->H4()); + + if (ge_inst) { + auto Vge = ctx.reg_alloc.WriteD(ge_inst); + RegAlloc::Realize(Vge); + + code.SHSUB(Vge->H4(), Va->H4(), Vb->H4()); + code.CMGE(Vge->H4(), Vge->H4(), 0); + } +} + +template<bool add_is_hi, bool is_signed, bool is_halving> +static void EmitPackedAddSub(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Vresult = ctx.reg_alloc.WriteD(inst); + auto Va = ctx.reg_alloc.ReadD(args[0]); + auto Vb = ctx.reg_alloc.ReadD(args[1]); + RegAlloc::Realize(Vresult, Va, Vb); + + if (is_signed) { + code.SXTL(V0.S4(), Va->H4()); + code.SXTL(V1.S4(), Vb->H4()); + } else { + code.UXTL(V0.S4(), Va->H4()); + code.UXTL(V1.S4(), Vb->H4()); + } + code.EXT(V1.B8(), V1.B8(), V1.B8(), 4); + + code.MOVI(D2, oaknut::RepImm{add_is_hi ? 0b11110000 : 0b00001111}); + + code.EOR(V1.B8(), V1.B8(), V2.B8()); + code.SUB(V1.S2(), V1.S2(), V2.S2()); + code.SUB(Vresult->S2(), V0.S2(), V1.S2()); + + if (is_halving) { + if (is_signed) { + code.SSHR(Vresult->S2(), Vresult->S2(), 1); + } else { + code.USHR(Vresult->S2(), Vresult->S2(), 1); + } + } + + if (ge_inst) { + ASSERT(!is_halving); + + auto Vge = ctx.reg_alloc.WriteD(ge_inst); + RegAlloc::Realize(Vge); + + if (is_signed) { + code.CMGE(Vge->S2(), Vresult->S2(), 0); + code.XTN(Vge->H4(), Vge->toQ().S4()); + } else { + code.CMEQ(Vge->H4(), Vresult->H4(), 0); + code.EOR(Vge->B8(), Vge->B8(), V2.B8()); + code.SHRN(Vge->H4(), Vge->toQ().S4(), 16); + } + } + + code.XTN(Vresult->H4(), Vresult->toQ().S4()); +} + +template<> +void EmitIR<IR::Opcode::PackedAddSubU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitPackedAddSub<true, false, false>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::PackedAddSubS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitPackedAddSub<true, true, false>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::PackedSubAddU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitPackedAddSub<false, false, false>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::PackedSubAddS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitPackedAddSub<false, true, false>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::PackedHalvingAddU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.UHADD(Vresult->B8(), Va->B8(), Vb->B8()); }); +} + +template<> +void EmitIR<IR::Opcode::PackedHalvingAddS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.SHADD(Vresult->B8(), Va->B8(), Vb->B8()); }); +} + +template<> +void EmitIR<IR::Opcode::PackedHalvingSubU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.UHSUB(Vresult->B8(), Va->B8(), Vb->B8()); }); +} + 
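For reference, a minimal scalar model (illustrative only, not part of the diff; the helper name is hypothetical) of the GE computation that the CMHI in the PackedAddU8 emitter above encodes: an 8-bit unsigned add wraps exactly when the truncated sum is smaller than an operand, so comparing the original operand against the packed result recovers the per-byte GE mask.

#include <cstdint>

// Models one byte lane of PackedAddU8's GE output.
static std::uint8_t PackedAddU8GeLane(std::uint8_t a, std::uint8_t b) {
    const std::uint8_t sum = static_cast<std::uint8_t>(a + b);  // wraps mod 256
    return a > sum ? 0xFF : 0x00;  // carry-out <=> a > wrapped sum, as CMHI(Va, Vresult) tests
}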
+template<> +void EmitIR<IR::Opcode::PackedHalvingSubS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.SHSUB(Vresult->B8(), Va->B8(), Vb->B8()); }); +} + +template<> +void EmitIR<IR::Opcode::PackedHalvingAddU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.UHADD(Vresult->H4(), Va->H4(), Vb->H4()); }); +} + +template<> +void EmitIR<IR::Opcode::PackedHalvingAddS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.SHADD(Vresult->H4(), Va->H4(), Vb->H4()); }); +} + +template<> +void EmitIR<IR::Opcode::PackedHalvingSubU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.UHSUB(Vresult->H4(), Va->H4(), Vb->H4()); }); +} + +template<> +void EmitIR<IR::Opcode::PackedHalvingSubS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.SHSUB(Vresult->H4(), Va->H4(), Vb->H4()); }); +} + +template<> +void EmitIR<IR::Opcode::PackedHalvingAddSubU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitPackedAddSub<true, false, true>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::PackedHalvingAddSubS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitPackedAddSub<true, true, true>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::PackedHalvingSubAddU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitPackedAddSub<false, false, true>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::PackedHalvingSubAddS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitPackedAddSub<false, true, true>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::PackedSaturatedAddU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitSaturatedPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.UQADD(Vresult->B8(), Va->B8(), Vb->B8()); }); +} + +template<> +void EmitIR<IR::Opcode::PackedSaturatedAddS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitSaturatedPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.SQADD(Vresult->B8(), Va->B8(), Vb->B8()); }); +} + +template<> +void EmitIR<IR::Opcode::PackedSaturatedSubU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitSaturatedPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.UQSUB(Vresult->B8(), Va->B8(), Vb->B8()); }); +} + +template<> +void EmitIR<IR::Opcode::PackedSaturatedSubS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitSaturatedPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.SQSUB(Vresult->B8(), Va->B8(), Vb->B8()); }); +} + +template<> +void EmitIR<IR::Opcode::PackedSaturatedAddU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitSaturatedPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.UQADD(Vresult->H4(), Va->H4(), Vb->H4()); }); +} + +template<> +void EmitIR<IR::Opcode::PackedSaturatedAddS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitSaturatedPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.SQADD(Vresult->H4(), 
Va->H4(), Vb->H4()); }); +} + +template<> +void EmitIR<IR::Opcode::PackedSaturatedSubU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitSaturatedPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.UQSUB(Vresult->H4(), Va->H4(), Vb->H4()); }); +} + +template<> +void EmitIR<IR::Opcode::PackedSaturatedSubS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitSaturatedPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { code.SQSUB(Vresult->H4(), Va->H4(), Vb->H4()); }); +} + +template<> +void EmitIR<IR::Opcode::PackedAbsDiffSumU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitPackedOp(code, ctx, inst, [&](auto& Vresult, auto& Va, auto& Vb) { + code.MOVI(D2, oaknut::RepImm{0b00001111}); + code.UABD(Vresult->B8(), Va->B8(), Vb->B8()); + code.AND(Vresult->B8(), Vresult->B8(), V2.B8()); // TODO: Zext tracking + code.UADDLV(Vresult->toH(), Vresult->B8()); + }); +} + +template<> +void EmitIR<IR::Opcode::PackedSelect>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + auto Vresult = ctx.reg_alloc.WriteD(inst); + auto Vge = ctx.reg_alloc.ReadD(args[0]); + auto Va = ctx.reg_alloc.ReadD(args[1]); + auto Vb = ctx.reg_alloc.ReadD(args[2]); + RegAlloc::Realize(Vresult, Vge, Va, Vb); + + code.FMOV(Vresult, Vge); // TODO: Move elimination + code.BSL(Vresult->B8(), Vb->B8(), Va->B8()); +} + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_saturation.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_saturation.cpp new file mode 100644 index 0000000000..95b8b51ee9 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_saturation.cpp @@ -0,0 +1,273 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <oaknut/oaknut.hpp> + +#include "dynarmic/backend/arm64/a32_jitstate.h" +#include "dynarmic/backend/arm64/abi.h" +#include "dynarmic/backend/arm64/emit_arm64.h" +#include "dynarmic/backend/arm64/emit_context.h" +#include "dynarmic/backend/arm64/reg_alloc.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" + +namespace Dynarmic::Backend::Arm64 { + +using namespace oaknut::util; + +template<> +void EmitIR<IR::Opcode::SignedSaturatedAddWithFlag32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp); + ASSERT(overflow_inst); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Wa = ctx.reg_alloc.ReadW(args[0]); + auto Wb = ctx.reg_alloc.ReadW(args[1]); + auto Woverflow = ctx.reg_alloc.WriteW(overflow_inst); + RegAlloc::Realize(Wresult, Wa, Wb, Woverflow); + ctx.reg_alloc.SpillFlags(); + + code.ADDS(Wresult, *Wa, Wb); + code.ASR(Wscratch0, Wresult, 31); + code.EOR(Wscratch0, Wscratch0, 0x8000'0000); + code.CSEL(Wresult, Wresult, Wscratch0, VC); + code.CSET(Woverflow, VS); +} + +template<> +void EmitIR<IR::Opcode::SignedSaturatedSubWithFlag32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp); + ASSERT(overflow_inst); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Wa = ctx.reg_alloc.ReadW(args[0]); + auto Wb = ctx.reg_alloc.ReadW(args[1]); + auto Woverflow = ctx.reg_alloc.WriteW(overflow_inst); + RegAlloc::Realize(Wresult, Wa, Wb, Woverflow); + ctx.reg_alloc.SpillFlags(); + + code.SUBS(Wresult, *Wa, Wb); + code.ASR(Wscratch0, Wresult, 31); + code.EOR(Wscratch0, Wscratch0, 0x8000'0000); + code.CSEL(Wresult, Wresult, Wscratch0, VC); + code.CSET(Woverflow, VS); +} + +template<> +void EmitIR<IR::Opcode::SignedSaturation>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const size_t N = args[1].GetImmediateU8(); + ASSERT(N >= 1 && N <= 32); + + if (N == 32) { + ctx.reg_alloc.DefineAsExisting(inst, args[0]); + if (overflow_inst) { + auto Woverflow = ctx.reg_alloc.WriteW(overflow_inst); + RegAlloc::Realize(Woverflow); + code.MOV(*Woverflow, WZR); + } + return; + } + + const u32 positive_saturated_value = (1u << (N - 1)) - 1; + const u32 negative_saturated_value = ~u32{0} << (N - 1); + + auto Woperand = ctx.reg_alloc.ReadW(args[0]); + auto Wresult = ctx.reg_alloc.WriteW(inst); + RegAlloc::Realize(Woperand, Wresult); + ctx.reg_alloc.SpillFlags(); + + code.MOV(Wscratch0, negative_saturated_value); + code.MOV(Wscratch1, positive_saturated_value); + code.CMP(*Woperand, Wscratch0); + code.CSEL(Wresult, Woperand, Wscratch0, GT); + code.CMP(*Woperand, Wscratch1); + code.CSEL(Wresult, Wresult, Wscratch1, LT); + + if (overflow_inst) { + auto Woverflow = ctx.reg_alloc.WriteW(overflow_inst); + RegAlloc::Realize(Woverflow); + code.CMP(*Wresult, Woperand); + code.CSET(Woverflow, NE); + } +} + +template<> +void EmitIR<IR::Opcode::UnsignedSaturation>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto overflow_inst = 
inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Wresult = ctx.reg_alloc.WriteW(inst); + auto Woperand = ctx.reg_alloc.ReadW(args[0]); + RegAlloc::Realize(Wresult, Woperand); + ctx.reg_alloc.SpillFlags(); + + const size_t N = args[1].GetImmediateU8(); + ASSERT(N <= 31); + const u32 saturated_value = (1u << N) - 1; + + code.MOV(Wscratch0, saturated_value); + code.CMP(*Woperand, 0); + code.CSEL(Wresult, Woperand, WZR, GT); + code.CMP(*Woperand, Wscratch0); + code.CSEL(Wresult, Wresult, Wscratch0, LT); + + if (overflow_inst) { + auto Woverflow = ctx.reg_alloc.WriteW(overflow_inst); + RegAlloc::Realize(Woverflow); + code.CSET(Woverflow, HI); + } +} + +template<> +void EmitIR<IR::Opcode::SignedSaturatedAdd8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::SignedSaturatedAdd16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::SignedSaturatedAdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::SignedSaturatedAdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::SignedSaturatedDoublingMultiplyReturnHigh16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::SignedSaturatedDoublingMultiplyReturnHigh32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::SignedSaturatedSub8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::SignedSaturatedSub16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::SignedSaturatedSub32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::SignedSaturatedSub64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::UnsignedSaturatedAdd8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::UnsignedSaturatedAdd16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::UnsignedSaturatedAdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::UnsignedSaturatedAdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + 
(void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::UnsignedSaturatedSub8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::UnsignedSaturatedSub16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::UnsignedSaturatedSub32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::UnsignedSaturatedSub64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector.cpp new file mode 100644 index 0000000000..e14effca3b --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector.cpp @@ -0,0 +1,1889 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/mp/metavalue/lift_value.hpp> +#include <oaknut/oaknut.hpp> + +#include "dynarmic/backend/arm64/a32_jitstate.h" +#include "dynarmic/backend/arm64/abi.h" +#include "dynarmic/backend/arm64/emit_arm64.h" +#include "dynarmic/backend/arm64/emit_context.h" +#include "dynarmic/backend/arm64/fpsr_manager.h" +#include "dynarmic/backend/arm64/reg_alloc.h" +#include "dynarmic/common/always_false.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" + +namespace Dynarmic::Backend::Arm64 { + +using namespace oaknut::util; + +template<typename EmitFn> +static void EmitTwoOp(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Qresult = ctx.reg_alloc.WriteQ(inst); + auto Qoperand = ctx.reg_alloc.ReadQ(args[0]); + RegAlloc::Realize(Qresult, Qoperand); + + emit(Qresult, Qoperand); +} + +template<size_t size, typename EmitFn> +static void EmitTwoOpArranged(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + EmitTwoOp(code, ctx, inst, [&](auto& Qresult, auto& Qoperand) { + if constexpr (size == 8) { + emit(Qresult->B16(), Qoperand->B16()); + } else if constexpr (size == 16) { + emit(Qresult->H8(), Qoperand->H8()); + } else if constexpr (size == 32) { + emit(Qresult->S4(), Qoperand->S4()); + } else if constexpr (size == 64) { + emit(Qresult->D2(), Qoperand->D2()); + } else { + static_assert(Common::always_false_v<mcl::mp::lift_value<size>>); + } + }); +} + +template<size_t size, typename EmitFn> +static void EmitTwoOpArrangedSaturated(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + EmitTwoOpArranged<size>(code, ctx, inst, [&](auto Vresult, auto Voperand) { + ctx.fpsr.Load(); + emit(Vresult, Voperand); + }); +} + +template<size_t size, typename EmitFn> +static void EmitTwoOpArrangedWiden(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + EmitTwoOp(code, ctx, inst, [&](auto& Qresult, auto& Qoperand) { + if constexpr (size == 8) { + emit(Qresult->H8(), Qoperand->toD().B8()); + } else if constexpr (size == 16) { + emit(Qresult->S4(), 
Qoperand->toD().H4()); + } else if constexpr (size == 32) { + emit(Qresult->D2(), Qoperand->toD().S2()); + } else { + static_assert(Common::always_false_v<mcl::mp::lift_value<size>>); + } + }); +} + +template<size_t size, typename EmitFn> +static void EmitTwoOpArrangedNarrow(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + EmitTwoOp(code, ctx, inst, [&](auto& Qresult, auto& Qoperand) { + if constexpr (size == 16) { + emit(Qresult->toD().B8(), Qoperand->H8()); + } else if constexpr (size == 32) { + emit(Qresult->toD().H4(), Qoperand->S4()); + } else if constexpr (size == 64) { + emit(Qresult->toD().S2(), Qoperand->D2()); + } else { + static_assert(Common::always_false_v<mcl::mp::lift_value<size>>); + } + }); +} + +template<size_t size, typename EmitFn> +static void EmitTwoOpArrangedSaturatedNarrow(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + EmitTwoOpArrangedNarrow<size>(code, ctx, inst, [&](auto Vresult, auto Voperand) { + ctx.fpsr.Load(); + emit(Vresult, Voperand); + }); +} + +template<size_t size, typename EmitFn> +static void EmitTwoOpArrangedPairWiden(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + EmitTwoOp(code, ctx, inst, [&](auto& Qresult, auto& Qoperand) { + if constexpr (size == 8) { + emit(Qresult->H8(), Qoperand->B16()); + } else if constexpr (size == 16) { + emit(Qresult->S4(), Qoperand->H8()); + } else if constexpr (size == 32) { + emit(Qresult->D2(), Qoperand->S4()); + } else { + static_assert(Common::always_false_v<mcl::mp::lift_value<size>>); + } + }); +} + +template<size_t size, typename EmitFn> +static void EmitTwoOpArrangedLower(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + EmitTwoOp(code, ctx, inst, [&](auto& Qresult, auto& Qoperand) { + if constexpr (size == 8) { + emit(Qresult->toD().B8(), Qoperand->toD().B8()); + } else if constexpr (size == 16) { + emit(Qresult->toD().H4(), Qoperand->toD().H4()); + } else if constexpr (size == 32) { + emit(Qresult->toD().S2(), Qoperand->toD().S2()); + } else { + static_assert(Common::always_false_v<mcl::mp::lift_value<size>>); + } + }); +} + +template<typename EmitFn> +static void EmitThreeOp(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Qresult = ctx.reg_alloc.WriteQ(inst); + auto Qa = ctx.reg_alloc.ReadQ(args[0]); + auto Qb = ctx.reg_alloc.ReadQ(args[1]); + RegAlloc::Realize(Qresult, Qa, Qb); + + emit(Qresult, Qa, Qb); +} + +template<size_t size, typename EmitFn> +static void EmitThreeOpArranged(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + EmitThreeOp(code, ctx, inst, [&](auto& Qresult, auto& Qa, auto& Qb) { + if constexpr (size == 8) { + emit(Qresult->B16(), Qa->B16(), Qb->B16()); + } else if constexpr (size == 16) { + emit(Qresult->H8(), Qa->H8(), Qb->H8()); + } else if constexpr (size == 32) { + emit(Qresult->S4(), Qa->S4(), Qb->S4()); + } else if constexpr (size == 64) { + emit(Qresult->D2(), Qa->D2(), Qb->D2()); + } else { + static_assert(Common::always_false_v<mcl::mp::lift_value<size>>); + } + }); +} + +template<size_t size, typename EmitFn> +static void EmitThreeOpArrangedSaturated(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + EmitThreeOpArranged<size>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { + ctx.fpsr.Load(); + emit(Vresult, Va, Vb); + }); +} + +template<size_t size, typename EmitFn> +static void 
EmitThreeOpArrangedWiden(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + EmitThreeOp(code, ctx, inst, [&](auto& Qresult, auto& Qa, auto& Qb) { + if constexpr (size == 8) { + emit(Qresult->H8(), Qa->toD().B8(), Qb->toD().B8()); + } else if constexpr (size == 16) { + emit(Qresult->S4(), Qa->toD().H4(), Qb->toD().H4()); + } else if constexpr (size == 32) { + emit(Qresult->D2(), Qa->toD().S2(), Qb->toD().S2()); + } else if constexpr (size == 64) { + emit(Qresult->Q1(), Qa->toD().D1(), Qb->toD().D1()); + } else { + static_assert(Common::always_false_v<mcl::mp::lift_value<size>>); + } + }); +} + +template<size_t size, typename EmitFn> +static void EmitThreeOpArrangedSaturatedWiden(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + EmitThreeOpArrangedWiden<size>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { + ctx.fpsr.Load(); + emit(Vresult, Va, Vb); + }); +} + +template<size_t size, typename EmitFn> +static void EmitThreeOpArrangedLower(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + EmitThreeOp(code, ctx, inst, [&](auto& Qresult, auto& Qa, auto& Qb) { + if constexpr (size == 8) { + emit(Qresult->toD().B8(), Qa->toD().B8(), Qb->toD().B8()); + } else if constexpr (size == 16) { + emit(Qresult->toD().H4(), Qa->toD().H4(), Qb->toD().H4()); + } else if constexpr (size == 32) { + emit(Qresult->toD().S2(), Qa->toD().S2(), Qb->toD().S2()); + } else { + static_assert(Common::always_false_v<mcl::mp::lift_value<size>>); + } + }); +} + +template<size_t size, typename EmitFn> +static void EmitSaturatedAccumulate(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Qaccumulator = ctx.reg_alloc.ReadWriteQ(args[1], inst); // NB: Swapped + auto Qoperand = ctx.reg_alloc.ReadQ(args[0]); // NB: Swapped + RegAlloc::Realize(Qaccumulator, Qoperand); + ctx.fpsr.Load(); + + if constexpr (size == 8) { + emit(Qaccumulator->B16(), Qoperand->B16()); + } else if constexpr (size == 16) { + emit(Qaccumulator->H8(), Qoperand->H8()); + } else if constexpr (size == 32) { + emit(Qaccumulator->S4(), Qoperand->S4()); + } else if constexpr (size == 64) { + emit(Qaccumulator->D2(), Qoperand->D2()); + } else { + static_assert(Common::always_false_v<mcl::mp::lift_value<size>>); + } +} + +template<size_t size, typename EmitFn> +static void EmitImmShift(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Qresult = ctx.reg_alloc.WriteQ(inst); + auto Qoperand = ctx.reg_alloc.ReadQ(args[0]); + const u8 shift_amount = args[1].GetImmediateU8(); + RegAlloc::Realize(Qresult, Qoperand); + + if constexpr (size == 8) { + emit(Qresult->B16(), Qoperand->B16(), shift_amount); + } else if constexpr (size == 16) { + emit(Qresult->H8(), Qoperand->H8(), shift_amount); + } else if constexpr (size == 32) { + emit(Qresult->S4(), Qoperand->S4(), shift_amount); + } else if constexpr (size == 64) { + emit(Qresult->D2(), Qoperand->D2(), shift_amount); + } else { + static_assert(Common::always_false_v<mcl::mp::lift_value<size>>); + } +} + +template<size_t size, typename EmitFn> +static void EmitImmShiftSaturated(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + EmitImmShift<size>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { + ctx.fpsr.Load(); + emit(Vresult, Voperand, shift_amount); + }); +} + +template<size_t size, typename EmitFn> +static void 
EmitReduce(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Vresult = ctx.reg_alloc.WriteVec<size>(inst); + auto Qoperand = ctx.reg_alloc.ReadQ(args[0]); + RegAlloc::Realize(Vresult, Qoperand); + + if constexpr (size == 8) { + emit(Vresult, Qoperand->B16()); + } else if constexpr (size == 16) { + emit(Vresult, Qoperand->H8()); + } else if constexpr (size == 32) { + emit(Vresult, Qoperand->S4()); + } else if constexpr (size == 64) { + emit(Vresult, Qoperand->D2()); + } else { + static_assert(Common::always_false_v<mcl::mp::lift_value<size>>); + } +} + +template<size_t size, typename EmitFn> +static void EmitGetElement(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(args[1].IsImmediate()); + const u8 index = args[1].GetImmediateU8(); + + auto Rresult = ctx.reg_alloc.WriteReg<std::max<size_t>(32, size)>(inst); + auto Qvalue = ctx.reg_alloc.ReadQ(args[0]); + RegAlloc::Realize(Rresult, Qvalue); + + // TODO: fpr destination + + emit(Rresult, Qvalue, index); +} + +template<> +void EmitIR<IR::Opcode::VectorGetElement8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitGetElement<8>(code, ctx, inst, [&](auto& Wresult, auto& Qvalue, u8 index) { code.UMOV(Wresult, Qvalue->Belem()[index]); }); +} + +template<> +void EmitIR<IR::Opcode::VectorGetElement16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitGetElement<16>(code, ctx, inst, [&](auto& Wresult, auto& Qvalue, u8 index) { code.UMOV(Wresult, Qvalue->Helem()[index]); }); +} + +template<> +void EmitIR<IR::Opcode::VectorGetElement32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitGetElement<32>(code, ctx, inst, [&](auto& Wresult, auto& Qvalue, u8 index) { code.UMOV(Wresult, Qvalue->Selem()[index]); }); +} + +template<> +void EmitIR<IR::Opcode::VectorGetElement64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitGetElement<64>(code, ctx, inst, [&](auto& Xresult, auto& Qvalue, u8 index) { code.UMOV(Xresult, Qvalue->Delem()[index]); }); +} + +template<size_t size, typename EmitFn> +static void EmitSetElement(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(args[1].IsImmediate()); + const u8 index = args[1].GetImmediateU8(); + + auto Qvector = ctx.reg_alloc.ReadWriteQ(args[0], inst); + auto Rvalue = ctx.reg_alloc.ReadReg<std::max<size_t>(32, size)>(args[2]); + RegAlloc::Realize(Qvector, Rvalue); + + // TODO: fpr source + + emit(Qvector, Rvalue, index); +} + +template<> +void EmitIR<IR::Opcode::VectorSetElement8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitSetElement<8>(code, ctx, inst, [&](auto& Qvector, auto& Wvalue, u8 index) { code.MOV(Qvector->Belem()[index], Wvalue); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSetElement16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitSetElement<16>(code, ctx, inst, [&](auto& Qvector, auto& Wvalue, u8 index) { code.MOV(Qvector->Helem()[index], Wvalue); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSetElement32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitSetElement<32>(code, ctx, inst, [&](auto& Qvector, auto& Wvalue, u8 index) { code.MOV(Qvector->Selem()[index], Wvalue); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSetElement64>(oaknut::CodeGenerator& code, EmitContext& ctx, 
IR::Inst* inst) { + EmitSetElement<64>(code, ctx, inst, [&](auto& Qvector, auto& Xvalue, u8 index) { code.MOV(Qvector->Delem()[index], Xvalue); }); +} + +template<> +void EmitIR<IR::Opcode::VectorAbs8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.ABS(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorAbs16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.ABS(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorAbs32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.ABS(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorAbs64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.ABS(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorAdd8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ADD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorAdd16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ADD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorAdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ADD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorAdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ADD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorAnd>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.AND(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorAndNot>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.BIC(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorArithmeticShiftRight8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitImmShift<8>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.SSHR(Vresult, Voperand, shift_amount); }); +} + +template<> +void EmitIR<IR::Opcode::VectorArithmeticShiftRight16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitImmShift<16>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.SSHR(Vresult, Voperand, shift_amount); }); +} + +template<> +void EmitIR<IR::Opcode::VectorArithmeticShiftRight32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitImmShift<32>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.SSHR(Vresult, Voperand, shift_amount); }); +} + +template<> +void EmitIR<IR::Opcode::VectorArithmeticShiftRight64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitImmShift<64>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.SSHR(Vresult, Voperand, shift_amount); }); +} + 
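The EmitTwoOpArranged/EmitThreeOpArranged/EmitImmShift helpers above all dispatch on the lane size with an if-constexpr chain that ends in a dependent static_assert. A minimal self-contained sketch of that pattern (lift_value and always_false_v here are stand-ins for mcl::mp::lift_value and Common::always_false_v):

#include <cstddef>

template<std::size_t>
struct lift_value {};  // makes the assert condition depend on the template argument

template<typename>
inline constexpr bool always_false_v = false;

template<std::size_t size, typename EmitFn>
void emit_arranged(EmitFn emit) {
    if constexpr (size == 8 || size == 16 || size == 32 || size == 64) {
        emit();  // select the matching .B16/.H8/.S4/.D2 arrangement here
    } else {
        // Only evaluated when this branch is instantiated, so supported sizes compile cleanly.
        static_assert(always_false_v<lift_value<size>>, "unsupported lane size");
    }
}

Instantiating emit_arranged<24>(...) fails at compile time, while the supported sizes never touch the assert.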
+template<> +void EmitIR<IR::Opcode::VectorArithmeticVShift8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SSHL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorArithmeticVShift16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SSHL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorArithmeticVShift32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SSHL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorArithmeticVShift64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SSHL(Vresult, Va, Vb); }); +} + +template<size_t size, typename EmitFn> +static void EmitBroadcast(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Qvector = ctx.reg_alloc.WriteQ(inst); + auto Rvalue = ctx.reg_alloc.ReadReg<std::max<size_t>(32, size)>(args[0]); + RegAlloc::Realize(Qvector, Rvalue); + + // TODO: fpr source + + emit(Qvector, Rvalue); +} + +template<> +void EmitIR<IR::Opcode::VectorBroadcastLower8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitBroadcast<8>(code, ctx, inst, [&](auto& Qvector, auto& Wvalue) { code.DUP(Qvector->toD().B8(), Wvalue); }); +} + +template<> +void EmitIR<IR::Opcode::VectorBroadcastLower16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitBroadcast<16>(code, ctx, inst, [&](auto& Qvector, auto& Wvalue) { code.DUP(Qvector->toD().H4(), Wvalue); }); +} + +template<> +void EmitIR<IR::Opcode::VectorBroadcastLower32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitBroadcast<32>(code, ctx, inst, [&](auto& Qvector, auto& Wvalue) { code.DUP(Qvector->toD().S2(), Wvalue); }); +} + +template<> +void EmitIR<IR::Opcode::VectorBroadcast8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitBroadcast<8>(code, ctx, inst, [&](auto& Qvector, auto& Wvalue) { code.DUP(Qvector->B16(), Wvalue); }); +} + +template<> +void EmitIR<IR::Opcode::VectorBroadcast16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitBroadcast<16>(code, ctx, inst, [&](auto& Qvector, auto& Wvalue) { code.DUP(Qvector->H8(), Wvalue); }); +} + +template<> +void EmitIR<IR::Opcode::VectorBroadcast32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitBroadcast<32>(code, ctx, inst, [&](auto& Qvector, auto& Wvalue) { code.DUP(Qvector->S4(), Wvalue); }); +} + +template<> +void EmitIR<IR::Opcode::VectorBroadcast64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitBroadcast<64>(code, ctx, inst, [&](auto& Qvector, auto& Xvalue) { code.DUP(Qvector->D2(), Xvalue); }); +} + +template<size_t size, typename EmitFn> +static void EmitBroadcastElement(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Qvector = ctx.reg_alloc.WriteQ(inst); + auto Qvalue = ctx.reg_alloc.ReadQ(args[0]); + const u8 index = args[1].GetImmediateU8(); + RegAlloc::Realize(Qvector, Qvalue); + + emit(Qvector, Qvalue, index); +} + +template<> +void 
EmitIR<IR::Opcode::VectorBroadcastElementLower8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitBroadcastElement<8>(code, ctx, inst, [&](auto& Qvector, auto& Qvalue, u8 index) { code.DUP(Qvector->toD().B8(), Qvalue->Belem()[index]); }); +} + +template<> +void EmitIR<IR::Opcode::VectorBroadcastElementLower16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitBroadcastElement<16>(code, ctx, inst, [&](auto& Qvector, auto& Qvalue, u8 index) { code.DUP(Qvector->toD().H4(), Qvalue->Helem()[index]); }); +} + +template<> +void EmitIR<IR::Opcode::VectorBroadcastElementLower32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitBroadcastElement<32>(code, ctx, inst, [&](auto& Qvector, auto& Qvalue, u8 index) { code.DUP(Qvector->toD().S2(), Qvalue->Selem()[index]); }); +} + +template<> +void EmitIR<IR::Opcode::VectorBroadcastElement8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitBroadcastElement<8>(code, ctx, inst, [&](auto& Qvector, auto& Qvalue, u8 index) { code.DUP(Qvector->B16(), Qvalue->Belem()[index]); }); +} + +template<> +void EmitIR<IR::Opcode::VectorBroadcastElement16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitBroadcastElement<16>(code, ctx, inst, [&](auto& Qvector, auto& Qvalue, u8 index) { code.DUP(Qvector->H8(), Qvalue->Helem()[index]); }); +} + +template<> +void EmitIR<IR::Opcode::VectorBroadcastElement32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitBroadcastElement<32>(code, ctx, inst, [&](auto& Qvector, auto& Qvalue, u8 index) { code.DUP(Qvector->S4(), Qvalue->Selem()[index]); }); +} + +template<> +void EmitIR<IR::Opcode::VectorBroadcastElement64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitBroadcastElement<64>(code, ctx, inst, [&](auto& Qvector, auto& Qvalue, u8 index) { code.DUP(Qvector->D2(), Qvalue->Delem()[index]); }); +} + +template<> +void EmitIR<IR::Opcode::VectorCountLeadingZeros8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.CLZ(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorCountLeadingZeros16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.CLZ(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorCountLeadingZeros32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.CLZ(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorDeinterleaveEven8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP1(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorDeinterleaveEven16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP1(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorDeinterleaveEven32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP1(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorDeinterleaveEven64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + 
EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP1(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorDeinterleaveEvenLower8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedLower<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP1(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorDeinterleaveEvenLower16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedLower<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP1(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorDeinterleaveEvenLower32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedLower<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP1(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorDeinterleaveOdd8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP2(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorDeinterleaveOdd16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP2(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorDeinterleaveOdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP2(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorDeinterleaveOdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP2(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorDeinterleaveOddLower8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedLower<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP2(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorDeinterleaveOddLower16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedLower<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP2(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorDeinterleaveOddLower32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedLower<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UZP2(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorEor>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.EOR(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorEqual8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.CMEQ(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorEqual16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.CMEQ(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorEqual32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.CMEQ(Vresult, Va, Vb); }); +} + 
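As a reading aid, a scalar model (illustrative only; the function name is hypothetical) of what the UZP1-based VectorDeinterleaveEven8 emitter above computes: lane i of the result is lane 2*i of the concatenation {a, b}, with a occupying the lower lanes.

#include <array>
#include <cstddef>
#include <cstdint>

static std::array<std::uint8_t, 16> DeinterleaveEven8(const std::array<std::uint8_t, 16>& a,
                                                      const std::array<std::uint8_t, 16>& b) {
    std::array<std::uint8_t, 16> result{};
    for (std::size_t i = 0; i < 16; ++i) {
        const std::size_t src = 2 * i;                // even-indexed source lane
        result[i] = src < 16 ? a[src] : b[src - 16];  // first half from a, second from b
    }
    return result;
}

VectorDeinterleaveOdd8 (UZP2) is the same with src = 2 * i + 1.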
+template<> +void EmitIR<IR::Opcode::VectorEqual64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.CMEQ(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorEqual128>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::VectorExtract>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Qresult = ctx.reg_alloc.WriteQ(inst); + auto Qa = ctx.reg_alloc.ReadQ(args[0]); + auto Qb = ctx.reg_alloc.ReadQ(args[1]); + const u8 position = args[2].GetImmediateU8(); + ASSERT(position % 8 == 0); + RegAlloc::Realize(Qresult, Qa, Qb); + + code.EXT(Qresult->B16(), Qa->B16(), Qb->B16(), position / 8); +} + +template<> +void EmitIR<IR::Opcode::VectorExtractLower>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Dresult = ctx.reg_alloc.WriteD(inst); + auto Da = ctx.reg_alloc.ReadD(args[0]); + auto Db = ctx.reg_alloc.ReadD(args[1]); + const u8 position = args[2].GetImmediateU8(); + ASSERT(position % 8 == 0); + RegAlloc::Realize(Dresult, Da, Db); + + code.EXT(Dresult->B8(), Da->B8(), Db->B8(), position / 8); +} + +template<> +void EmitIR<IR::Opcode::VectorGreaterS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.CMGT(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorGreaterS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.CMGT(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorGreaterS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.CMGT(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorGreaterS64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.CMGT(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorHalvingAddS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SHADD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorHalvingAddS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SHADD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorHalvingAddS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SHADD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorHalvingAddU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UHADD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorHalvingAddU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UHADD(Vresult, Va, Vb); }); +} 
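For clarity, a scalar model (illustrative only; the helper names are hypothetical) of the halving adds emitted above: UHADD/SHADD form the sum at double width and shift right once, so the intermediate never wraps. The signed variant assumes arithmetic right shift of negative values (guaranteed since C++20).

#include <cstdint>

static std::uint8_t HalvingAddU8(std::uint8_t a, std::uint8_t b) {
    return static_cast<std::uint8_t>((std::uint16_t{a} + std::uint16_t{b}) >> 1);
}

static std::int8_t HalvingAddS8(std::int8_t a, std::int8_t b) {
    return static_cast<std::int8_t>((std::int16_t{a} + std::int16_t{b}) >> 1);  // floor((a + b) / 2)
}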
+ +template<> +void EmitIR<IR::Opcode::VectorHalvingAddU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UHADD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorHalvingSubS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SHSUB(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorHalvingSubS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SHSUB(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorHalvingSubS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SHSUB(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorHalvingSubU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UHSUB(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorHalvingSubU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UHSUB(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorHalvingSubU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UHSUB(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorInterleaveLower8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ZIP1(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorInterleaveLower16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ZIP1(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorInterleaveLower32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ZIP1(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorInterleaveLower64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ZIP1(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorInterleaveUpper8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ZIP2(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorInterleaveUpper16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ZIP2(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorInterleaveUpper32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ZIP2(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorInterleaveUpper64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + 
EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ZIP2(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorLogicalShiftLeft8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitImmShift<8>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.SHL(Vresult, Voperand, shift_amount); }); +} + +template<> +void EmitIR<IR::Opcode::VectorLogicalShiftLeft16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitImmShift<16>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.SHL(Vresult, Voperand, shift_amount); }); +} + +template<> +void EmitIR<IR::Opcode::VectorLogicalShiftLeft32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitImmShift<32>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.SHL(Vresult, Voperand, shift_amount); }); +} + +template<> +void EmitIR<IR::Opcode::VectorLogicalShiftLeft64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitImmShift<64>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.SHL(Vresult, Voperand, shift_amount); }); +} + +template<> +void EmitIR<IR::Opcode::VectorLogicalShiftRight8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitImmShift<8>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.USHR(Vresult, Voperand, shift_amount); }); +} + +template<> +void EmitIR<IR::Opcode::VectorLogicalShiftRight16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitImmShift<16>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.USHR(Vresult, Voperand, shift_amount); }); +} + +template<> +void EmitIR<IR::Opcode::VectorLogicalShiftRight32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitImmShift<32>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.USHR(Vresult, Voperand, shift_amount); }); +} + +template<> +void EmitIR<IR::Opcode::VectorLogicalShiftRight64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitImmShift<64>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.USHR(Vresult, Voperand, shift_amount); }); +} + +template<> +void EmitIR<IR::Opcode::VectorLogicalVShift8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.USHL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorLogicalVShift16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.USHL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorLogicalVShift32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.USHL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorLogicalVShift64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.USHL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorMaxS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMAX(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorMaxS16>(oaknut::CodeGenerator& code, 
EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMAX(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorMaxS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMAX(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorMaxS64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::VectorMaxU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMAX(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorMaxU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMAX(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorMaxU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMAX(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorMaxU64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::VectorMinS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMIN(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorMinS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMIN(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorMinS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMIN(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorMinS64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::VectorMinU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMIN(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorMinU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMIN(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorMinU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMIN(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorMinU64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::VectorMultiply8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.MUL(Vresult, Va, Vb); }); +} + +template<> +void 
EmitIR<IR::Opcode::VectorMultiply16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.MUL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorMultiply32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.MUL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorMultiply64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + ASSERT_MSG(ctx.conf.very_verbose_debugging_output, "VectorMultiply64 is for debugging only"); + EmitThreeOp(code, ctx, inst, [&](auto& Qresult, auto& Qa, auto& Qb) { + code.FMOV(Xscratch0, Qa->toD()); + code.FMOV(Xscratch1, Qb->toD()); + code.MUL(Xscratch0, Xscratch0, Xscratch1); + code.FMOV(Qresult->toD(), Xscratch0); + code.FMOV(Xscratch0, Qa->Delem()[1]); + code.FMOV(Xscratch1, Qb->Delem()[1]); + code.MUL(Xscratch0, Xscratch0, Xscratch1); + code.FMOV(Qresult->Delem()[1], Xscratch0); + }); +} + +template<> +void EmitIR<IR::Opcode::VectorMultiplySignedWiden8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedWiden<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMULL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorMultiplySignedWiden16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedWiden<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMULL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorMultiplySignedWiden32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedWiden<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMULL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorMultiplyUnsignedWiden8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedWiden<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMULL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorMultiplyUnsignedWiden16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedWiden<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMULL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorMultiplyUnsignedWiden32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedWiden<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMULL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorNarrow16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedNarrow<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.XTN(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorNarrow32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedNarrow<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.XTN(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorNarrow64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedNarrow<64>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.XTN(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorNot>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.NOT(Vresult, Voperand); }); +} + 
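+// The bitwise opcodes (VectorNot above, VectorOr below) are emitted with the
+// 8-bit arrangement; element width is irrelevant to a purely bitwise result.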
+template<> +void EmitIR<IR::Opcode::VectorOr>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ORR(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedAddLower8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedLower<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ADDP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedAddLower16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedLower<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ADDP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedAddLower32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedLower<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ADDP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedAddSignedWiden8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedPairWiden<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SADDLP(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedAddSignedWiden16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedPairWiden<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SADDLP(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedAddSignedWiden32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedPairWiden<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SADDLP(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedAddUnsignedWiden8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedPairWiden<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.UADDLP(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedAddUnsignedWiden16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedPairWiden<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.UADDLP(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedAddUnsignedWiden32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedPairWiden<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.UADDLP(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedAdd8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ADDP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedAdd16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ADDP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedAdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ADDP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedAdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.ADDP(Vresult, Va, Vb); }); +} + +template<> +void 
EmitIR<IR::Opcode::VectorPairedMaxS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMAXP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedMaxS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMAXP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedMaxS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMAXP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedMaxU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMAXP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedMaxU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMAXP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedMaxU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMAXP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedMinS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMINP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedMinS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMINP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedMinS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMINP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedMinU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMINP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedMinU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMINP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedMinU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMINP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedMaxLowerS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedLower<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMAXP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedMaxLowerS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedLower<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMAXP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedMaxLowerS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedLower<32>(code, ctx, inst, [&](auto 
Vresult, auto Va, auto Vb) { code.SMAXP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedMaxLowerU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedLower<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMAXP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedMaxLowerU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedLower<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMAXP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedMaxLowerU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedLower<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMAXP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedMinLowerS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedLower<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMINP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedMinLowerS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedLower<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMINP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedMinLowerS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedLower<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SMINP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedMinLowerU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedLower<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMINP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedMinLowerU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedLower<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMINP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPairedMinLowerU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedLower<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UMINP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPolynomialMultiply8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.PMUL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPolynomialMultiplyLong8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedWiden<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.PMULL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPolynomialMultiplyLong64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedWiden<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.PMULL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorPopulationCount>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.CNT(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorReverseBits>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.RBIT(Vresult, Voperand); }); +} + 
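+// The ReverseElementsIn{Half,Word,Long}Groups opcodes map onto REV16/REV32/REV64,
+// which reverse the order of the elements within each 16-, 32- or 64-bit group.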
+template<> +void EmitIR<IR::Opcode::VectorReverseElementsInHalfGroups8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.REV16(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorReverseElementsInWordGroups8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.REV32(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorReverseElementsInWordGroups16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.REV32(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorReverseElementsInLongGroups8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.REV64(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorReverseElementsInLongGroups16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.REV64(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorReverseElementsInLongGroups32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.REV64(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorReduceAdd8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitReduce<8>(code, ctx, inst, [&](auto& Bresult, auto Voperand) { code.ADDV(Bresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorReduceAdd16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitReduce<16>(code, ctx, inst, [&](auto& Hresult, auto Voperand) { code.ADDV(Hresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorReduceAdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitReduce<32>(code, ctx, inst, [&](auto& Sresult, auto Voperand) { code.ADDV(Sresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorReduceAdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitReduce<64>(code, ctx, inst, [&](auto& Dresult, auto Voperand) { code.ADDP(Dresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorRotateWholeVectorRight>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitImmShift<8>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { + ASSERT(shift_amount % 8 == 0); + const u8 ext_imm = (shift_amount % 128) / 8; + code.EXT(Vresult, Voperand, Voperand, ext_imm); + }); +} + +template<> +void EmitIR<IR::Opcode::VectorRoundingHalvingAddS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SRHADD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorRoundingHalvingAddS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SRHADD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorRoundingHalvingAddS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SRHADD(Vresult, Va, Vb); }); +} + 
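+// The rounding halving adds (SRHADD above, URHADD below) compute
+// (a + b + 1) >> 1 per element; the intermediate sum is effectively one bit
+// wider, so the carry out of the addition is not lost.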
+template<> +void EmitIR<IR::Opcode::VectorRoundingHalvingAddU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.URHADD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorRoundingHalvingAddU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.URHADD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorRoundingHalvingAddU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.URHADD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorRoundingShiftLeftS8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SRSHL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorRoundingShiftLeftS16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SRSHL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorRoundingShiftLeftS32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SRSHL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorRoundingShiftLeftS64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SRSHL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorRoundingShiftLeftU8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.URSHL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorRoundingShiftLeftU16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.URSHL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorRoundingShiftLeftU32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.URSHL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorRoundingShiftLeftU64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.URSHL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignExtend8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedWiden<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SXTL(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignExtend16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedWiden<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SXTL(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignExtend32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedWiden<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SXTL(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignExtend64>(oaknut::CodeGenerator& code, 
EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedAbsoluteDifference8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SABD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedAbsoluteDifference16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SABD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedAbsoluteDifference32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SABD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedMultiply16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedMultiply32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedAbs8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedSaturated<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQABS(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedAbs16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedSaturated<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQABS(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedAbs32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedSaturated<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQABS(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedAbs64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedSaturated<64>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQABS(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedAccumulateUnsigned8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitSaturatedAccumulate<8>(code, ctx, inst, [&](auto Vaccumulator, auto Voperand) { code.SUQADD(Vaccumulator, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedAccumulateUnsigned16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitSaturatedAccumulate<16>(code, ctx, inst, [&](auto Vaccumulator, auto Voperand) { code.SUQADD(Vaccumulator, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedAccumulateUnsigned32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitSaturatedAccumulate<32>(code, ctx, inst, [&](auto Vaccumulator, auto Voperand) { code.SUQADD(Vaccumulator, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedAccumulateUnsigned64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitSaturatedAccumulate<64>(code, ctx, inst, [&](auto Vaccumulator, auto Voperand) { code.SUQADD(Vaccumulator, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedDoublingMultiplyHigh16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* 
inst) { + EmitThreeOpArrangedSaturated<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQDMULH(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedDoublingMultiplyHigh32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedSaturated<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQDMULH(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedDoublingMultiplyHighRounding16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedSaturated<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQRDMULH(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedDoublingMultiplyHighRounding32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedSaturated<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQRDMULH(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedDoublingMultiplyLong16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedSaturatedWiden<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQDMULL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedDoublingMultiplyLong32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedSaturatedWiden<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQDMULL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedNarrowToSigned16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedSaturatedNarrow<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQXTN(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedNarrowToSigned32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedSaturatedNarrow<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQXTN(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedNarrowToSigned64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedSaturatedNarrow<64>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQXTN(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedNarrowToUnsigned16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedSaturatedNarrow<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQXTUN(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedNarrowToUnsigned32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedSaturatedNarrow<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQXTUN(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedNarrowToUnsigned64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedSaturatedNarrow<64>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQXTUN(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedNeg8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedSaturated<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQNEG(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedNeg16>(oaknut::CodeGenerator& code, EmitContext& 
ctx, IR::Inst* inst) { + EmitTwoOpArrangedSaturated<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQNEG(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedNeg32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedSaturated<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQNEG(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedNeg64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedSaturated<64>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.SQNEG(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedShiftLeft8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedSaturated<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQSHL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedShiftLeft16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedSaturated<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQSHL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedShiftLeft32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedSaturated<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQSHL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedShiftLeft64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedSaturated<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQSHL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedShiftLeftUnsigned8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitImmShiftSaturated<8>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.SQSHLU(Vresult, Voperand, shift_amount); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedShiftLeftUnsigned16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitImmShiftSaturated<16>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.SQSHLU(Vresult, Voperand, shift_amount); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedShiftLeftUnsigned32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitImmShiftSaturated<32>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.SQSHLU(Vresult, Voperand, shift_amount); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedShiftLeftUnsigned64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitImmShiftSaturated<64>(code, ctx, inst, [&](auto Vresult, auto Voperand, u8 shift_amount) { code.SQSHLU(Vresult, Voperand, shift_amount); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSub8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SUB(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSub16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SUB(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSub32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { 
code.SUB(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSub64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SUB(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorTable>(oaknut::CodeGenerator&, EmitContext&, IR::Inst* inst) { + // Do nothing. We *want* to hold on to the refcount for our arguments, so VectorTableLookup can use our arguments. + ASSERT_MSG(inst->UseCount() == 1, "Table cannot be used multiple times"); +} + +template<> +void EmitIR<IR::Opcode::VectorTableLookup64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + ASSERT(inst->GetArg(1).GetInst()->GetOpcode() == IR::Opcode::VectorTable); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto table = ctx.reg_alloc.GetArgumentInfo(inst->GetArg(1).GetInst()); + + const size_t table_size = std::count_if(table.begin(), table.end(), [](const auto& elem) { return !elem.IsVoid(); }); + const bool is_defaults_zero = inst->GetArg(0).IsZero(); + + auto Dresult = is_defaults_zero ? ctx.reg_alloc.WriteD(inst) : ctx.reg_alloc.ReadWriteD(args[0], inst); + auto Dindices = ctx.reg_alloc.ReadD(args[2]); + std::vector<RAReg<oaknut::DReg>> Dtable; + for (size_t i = 0; i < table_size; i++) { + Dtable.emplace_back(ctx.reg_alloc.ReadD(table[i])); + } + RegAlloc::Realize(Dresult, Dindices); + for (size_t i = 0; i < table_size; i++) { + RegAlloc::Realize(Dtable[i]); + } + + switch (table_size) { + case 1: + code.MOVI(V2.B16(), 0x08); + code.CMGE(V2.B8(), Dindices->B8(), V2.B8()); + code.ORR(V2.B8(), Dindices->B8(), V2.B8()); + code.FMOV(D0, Dtable[0]); + if (is_defaults_zero) { + code.TBL(Dresult->B8(), oaknut::List{V0.B16()}, D2.B8()); + } else { + code.TBX(Dresult->B8(), oaknut::List{V0.B16()}, D2.B8()); + } + break; + case 2: + code.ZIP1(V0.D2(), Dtable[0]->toQ().D2(), Dtable[1]->toQ().D2()); + if (is_defaults_zero) { + code.TBL(Dresult->B8(), oaknut::List{V0.B16()}, Dindices->B8()); + } else { + code.TBX(Dresult->B8(), oaknut::List{V0.B16()}, Dindices->B8()); + } + break; + case 3: + code.MOVI(V2.B16(), 0x18); + code.CMGE(V2.B8(), Dindices->B8(), V2.B8()); + code.ORR(V2.B8(), Dindices->B8(), V2.B8()); + code.ZIP1(V0.D2(), Dtable[0]->toQ().D2(), Dtable[1]->toQ().D2()); + code.FMOV(D1, Dtable[2]); + if (is_defaults_zero) { + code.TBL(Dresult->B8(), oaknut::List{V0.B16(), V1.B16()}, D2.B8()); + } else { + code.TBX(Dresult->B8(), oaknut::List{V0.B16(), V1.B16()}, D2.B8()); + } + break; + case 4: + code.ZIP1(V0.D2(), Dtable[0]->toQ().D2(), Dtable[1]->toQ().D2()); + code.ZIP1(V1.D2(), Dtable[2]->toQ().D2(), Dtable[3]->toQ().D2()); + if (is_defaults_zero) { + code.TBL(Dresult->B8(), oaknut::List{V0.B16(), V1.B16()}, Dindices->B8()); + } else { + code.TBX(Dresult->B8(), oaknut::List{V0.B16(), V1.B16()}, Dindices->B8()); + } + break; + default: + ASSERT_FALSE("Unsupported table_size"); + } +} + +template<> +void EmitIR<IR::Opcode::VectorTableLookup128>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + ASSERT(inst->GetArg(1).GetInst()->GetOpcode() == IR::Opcode::VectorTable); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto table = ctx.reg_alloc.GetArgumentInfo(inst->GetArg(1).GetInst()); + + const size_t table_size = std::count_if(table.begin(), table.end(), [](const auto& elem) { return !elem.IsVoid(); }); + const bool is_defaults_zero = inst->GetArg(0).IsZero(); + + auto Qresult = is_defaults_zero ? 
ctx.reg_alloc.WriteQ(inst) : ctx.reg_alloc.ReadWriteQ(args[0], inst); + auto Qindices = ctx.reg_alloc.ReadQ(args[2]); + std::vector<RAReg<oaknut::QReg>> Qtable; + for (size_t i = 0; i < table_size; i++) { + Qtable.emplace_back(ctx.reg_alloc.ReadQ(table[i])); + } + RegAlloc::Realize(Qresult, Qindices); + for (size_t i = 0; i < table_size; i++) { + RegAlloc::Realize(Qtable[i]); + } + + switch (table_size) { + case 1: + if (is_defaults_zero) { + code.TBL(Qresult->B16(), oaknut::List{Qtable[0]->B16()}, Qindices->B16()); + } else { + code.TBX(Qresult->B16(), oaknut::List{Qtable[0]->B16()}, Qindices->B16()); + } + break; + case 2: + code.MOV(V0.B16(), Qtable[0]->B16()); + code.MOV(V1.B16(), Qtable[1]->B16()); + if (is_defaults_zero) { + code.TBL(Qresult->B16(), oaknut::List{V0.B16(), V1.B16()}, Qindices->B16()); + } else { + code.TBX(Qresult->B16(), oaknut::List{V0.B16(), V1.B16()}, Qindices->B16()); + } + break; + case 3: + code.MOV(V0.B16(), Qtable[0]->B16()); + code.MOV(V1.B16(), Qtable[1]->B16()); + code.MOV(V2.B16(), Qtable[2]->B16()); + if (is_defaults_zero) { + code.TBL(Qresult->B16(), oaknut::List{V0.B16(), V1.B16(), V2.B16()}, Qindices->B16()); + } else { + code.TBX(Qresult->B16(), oaknut::List{V0.B16(), V1.B16(), V2.B16()}, Qindices->B16()); + } + break; + case 4: + code.MOV(V0.B16(), Qtable[0]->B16()); + code.MOV(V1.B16(), Qtable[1]->B16()); + code.MOV(V2.B16(), Qtable[2]->B16()); + code.MOV(V3.B16(), Qtable[3]->B16()); + if (is_defaults_zero) { + code.TBL(Qresult->B16(), oaknut::List{V0.B16(), V1.B16(), V2.B16(), V3.B16()}, Qindices->B16()); + } else { + code.TBX(Qresult->B16(), oaknut::List{V0.B16(), V1.B16(), V2.B16(), V3.B16()}, Qindices->B16()); + } + break; + default: + ASSERT_FALSE("Unsupported table_size"); + } +} + +template<> +void EmitIR<IR::Opcode::VectorTranspose8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const bool part = inst->GetArg(2).GetU1(); + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { !part ? code.TRN1(Vresult, Va, Vb) : code.TRN2(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorTranspose16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const bool part = inst->GetArg(2).GetU1(); + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { !part ? code.TRN1(Vresult, Va, Vb) : code.TRN2(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorTranspose32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const bool part = inst->GetArg(2).GetU1(); + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { !part ? code.TRN1(Vresult, Va, Vb) : code.TRN2(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorTranspose64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const bool part = inst->GetArg(2).GetU1(); + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { !part ? 
code.TRN1(Vresult, Va, Vb) : code.TRN2(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedAbsoluteDifference8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UABD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedAbsoluteDifference16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UABD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedAbsoluteDifference32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UABD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedMultiply16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedMultiply32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedRecipEstimate>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.URECPE(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedRecipSqrtEstimate>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.URSQRTE(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedSaturatedAccumulateSigned8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitSaturatedAccumulate<8>(code, ctx, inst, [&](auto Vaccumulator, auto Voperand) { code.USQADD(Vaccumulator, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedSaturatedAccumulateSigned16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitSaturatedAccumulate<16>(code, ctx, inst, [&](auto Vaccumulator, auto Voperand) { code.USQADD(Vaccumulator, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedSaturatedAccumulateSigned32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitSaturatedAccumulate<32>(code, ctx, inst, [&](auto Vaccumulator, auto Voperand) { code.USQADD(Vaccumulator, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedSaturatedAccumulateSigned64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitSaturatedAccumulate<64>(code, ctx, inst, [&](auto Vaccumulator, auto Voperand) { code.USQADD(Vaccumulator, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedSaturatedNarrow16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedSaturatedNarrow<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.UQXTN(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedSaturatedNarrow32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedSaturatedNarrow<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.UQXTN(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedSaturatedNarrow64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + 
EmitTwoOpArrangedSaturatedNarrow<64>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.UQXTN(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedSaturatedShiftLeft8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedSaturated<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UQSHL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedSaturatedShiftLeft16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedSaturated<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UQSHL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedSaturatedShiftLeft32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedSaturated<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UQSHL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedSaturatedShiftLeft64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArrangedSaturated<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UQSHL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorZeroExtend8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedWiden<8>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.UXTL(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorZeroExtend16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedWiden<16>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.UXTL(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorZeroExtend32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArrangedWiden<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.UXTL(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::VectorZeroExtend64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp(code, ctx, inst, [&](auto& Qresult, auto& Qoperand) { code.FMOV(Qresult->toD(), Qoperand->toD()); }); +} + +template<> +void EmitIR<IR::Opcode::VectorZeroUpper>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOp(code, ctx, inst, [&](auto& Qresult, auto& Qoperand) { code.FMOV(Qresult->toD(), Qoperand->toD()); }); +} + +template<> +void EmitIR<IR::Opcode::ZeroVector>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto Qresult = ctx.reg_alloc.WriteQ(inst); + RegAlloc::Realize(Qresult); + code.MOVI(Qresult->toD(), oaknut::RepImm{0}); +} + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector_floating_point.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector_floating_point.cpp new file mode 100644 index 0000000000..e0a53bf9e4 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector_floating_point.cpp @@ -0,0 +1,791 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/bit_cast.hpp> +#include <mcl/mp/metavalue/lift_value.hpp> +#include <mcl/mp/typelist/cartesian_product.hpp> +#include <mcl/mp/typelist/get.hpp> +#include <mcl/mp/typelist/lift_sequence.hpp> +#include <mcl/mp/typelist/list.hpp> +#include <mcl/mp/typelist/lower_to_tuple.hpp> +#include <mcl/type_traits/function_info.hpp> +#include <mcl/type_traits/integer_of_size.hpp> +#include <oaknut/oaknut.hpp> + +#include "dynarmic/backend/arm64/a32_jitstate.h" +#include "dynarmic/backend/arm64/a64_jitstate.h" +#include "dynarmic/backend/arm64/abi.h" +#include "dynarmic/backend/arm64/emit_arm64.h" +#include "dynarmic/backend/arm64/emit_context.h" +#include "dynarmic/backend/arm64/fpsr_manager.h" +#include "dynarmic/backend/arm64/reg_alloc.h" +#include "dynarmic/common/always_false.h" +#include "dynarmic/common/cast_util.h" +#include "dynarmic/common/fp/fpcr.h" +#include "dynarmic/common/fp/fpsr.h" +#include "dynarmic/common/fp/info.h" +#include "dynarmic/common/fp/op.h" +#include "dynarmic/common/fp/rounding_mode.h" +#include "dynarmic/common/lut_from_list.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" + +namespace Dynarmic::Backend::Arm64 { + +using namespace oaknut::util; +namespace mp = mcl::mp; + +using A64FullVectorWidth = std::integral_constant<size_t, 128>; + +// Array alias that always sizes itself according to the given type T +// relative to the size of a vector register. e.g. T = u32 would result +// in a std::array<u32, 4>. +template<typename T> +using VectorArray = std::array<T, A64FullVectorWidth::value / mcl::bitsizeof<T>>; + +template<typename EmitFn> +static void MaybeStandardFPSCRValue(oaknut::CodeGenerator& code, EmitContext& ctx, bool fpcr_controlled, EmitFn emit) { + if (ctx.FPCR(fpcr_controlled) != ctx.FPCR()) { + code.MOV(Wscratch0, ctx.FPCR(fpcr_controlled).Value()); + code.MSR(oaknut::SystemReg::FPCR, Xscratch0); + emit(); + code.MOV(Wscratch0, ctx.FPCR().Value()); + code.MSR(oaknut::SystemReg::FPCR, Xscratch0); + } else { + emit(); + } +} + +template<typename EmitFn> +static void EmitTwoOp(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Qresult = ctx.reg_alloc.WriteQ(inst); + auto Qa = ctx.reg_alloc.ReadQ(args[0]); + const bool fpcr_controlled = args[1].IsVoid() || args[1].GetImmediateU1(); + RegAlloc::Realize(Qresult, Qa); + ctx.fpsr.Load(); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { emit(Qresult, Qa); }); +} + +template<size_t size, typename EmitFn> +static void EmitTwoOpArranged(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + EmitTwoOp(code, ctx, inst, [&](auto& Qresult, auto& Qa) { + if constexpr (size == 16) { + emit(Qresult->H8(), Qa->H8()); + } else if constexpr (size == 32) { + emit(Qresult->S4(), Qa->S4()); + } else if constexpr (size == 64) { + emit(Qresult->D2(), Qa->D2()); + } else { + static_assert(Common::always_false_v<mcl::mp::lift_value<size>>); + } + }); +} + +template<typename EmitFn> +static void EmitThreeOp(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Qresult = ctx.reg_alloc.WriteQ(inst); + auto Qa = ctx.reg_alloc.ReadQ(args[0]); + auto Qb = ctx.reg_alloc.ReadQ(args[1]); + const bool fpcr_controlled = args[2].GetImmediateU1(); + RegAlloc::Realize(Qresult, Qa, Qb); + 
ctx.fpsr.Load(); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { emit(Qresult, Qa, Qb); }); +} + +template<size_t size, typename EmitFn> +static void EmitThreeOpArranged(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + EmitThreeOp(code, ctx, inst, [&](auto& Qresult, auto& Qa, auto& Qb) { + if constexpr (size == 16) { + emit(Qresult->H8(), Qa->H8(), Qb->H8()); + } else if constexpr (size == 32) { + emit(Qresult->S4(), Qa->S4(), Qb->S4()); + } else if constexpr (size == 64) { + emit(Qresult->D2(), Qa->D2(), Qb->D2()); + } else { + static_assert(Common::always_false_v<mcl::mp::lift_value<size>>); + } + }); +} + +template<size_t size, typename EmitFn> +static void EmitFMA(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Qresult = ctx.reg_alloc.ReadWriteQ(args[0], inst); + auto Qm = ctx.reg_alloc.ReadQ(args[1]); + auto Qn = ctx.reg_alloc.ReadQ(args[2]); + const bool fpcr_controlled = args[3].GetImmediateU1(); + RegAlloc::Realize(Qresult, Qm, Qn); + ctx.fpsr.Load(); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + if constexpr (size == 16) { + emit(Qresult->H8(), Qm->H8(), Qn->H8()); + } else if constexpr (size == 32) { + emit(Qresult->S4(), Qm->S4(), Qn->S4()); + } else if constexpr (size == 64) { + emit(Qresult->D2(), Qm->D2(), Qn->D2()); + } else { + static_assert(Common::always_false_v<mcl::mp::lift_value<size>>); + } + }); +} + +template<size_t size, typename EmitFn> +static void EmitFromFixed(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Qto = ctx.reg_alloc.WriteQ(inst); + auto Qfrom = ctx.reg_alloc.ReadQ(args[0]); + const u8 fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); + const bool fpcr_controlled = args[3].GetImmediateU1(); + ASSERT(rounding_mode == ctx.FPCR(fpcr_controlled).RMode()); + RegAlloc::Realize(Qto, Qfrom); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + if constexpr (size == 32) { + emit(Qto->S4(), Qfrom->S4(), fbits); + } else if constexpr (size == 64) { + emit(Qto->D2(), Qfrom->D2(), fbits); + } else { + static_assert(Common::always_false_v<mcl::mp::lift_value<size>>); + } + }); +} + +template<size_t fsize, bool is_signed> +void EmitToFixed(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Qto = ctx.reg_alloc.WriteQ(inst); + auto Qfrom = ctx.reg_alloc.ReadQ(args[0]); + const size_t fbits = args[1].GetImmediateU8(); + const auto rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); + const bool fpcr_controlled = inst->GetArg(3).GetU1(); + RegAlloc::Realize(Qto, Qfrom); + ctx.fpsr.Load(); + + auto Vto = [&] { + if constexpr (fsize == 32) { + return Qto->S4(); + } else if constexpr (fsize == 64) { + return Qto->D2(); + } else { + static_assert(Common::always_false_v<mcl::mp::lift_value<fsize>>); + } + }(); + auto Vfrom = [&] { + if constexpr (fsize == 32) { + return Qfrom->S4(); + } else if constexpr (fsize == 64) { + return Qfrom->D2(); + } else { + static_assert(Common::always_false_v<mcl::mp::lift_value<fsize>>); + } + }(); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + if (rounding_mode == FP::RoundingMode::TowardsZero) { + if constexpr (is_signed) { + if (fbits) { + code.FCVTZS(Vto, Vfrom, fbits); + } else { + code.FCVTZS(Vto, 
Vfrom); + } + } else { + if (fbits) { + code.FCVTZU(Vto, Vfrom, fbits); + } else { + code.FCVTZU(Vto, Vfrom); + } + } + } else { + ASSERT(fbits == 0); + if constexpr (is_signed) { + switch (rounding_mode) { + case FP::RoundingMode::ToNearest_TieEven: + code.FCVTNS(Vto, Vfrom); + break; + case FP::RoundingMode::TowardsPlusInfinity: + code.FCVTPS(Vto, Vfrom); + break; + case FP::RoundingMode::TowardsMinusInfinity: + code.FCVTMS(Vto, Vfrom); + break; + case FP::RoundingMode::TowardsZero: + code.FCVTZS(Vto, Vfrom); + break; + case FP::RoundingMode::ToNearest_TieAwayFromZero: + code.FCVTAS(Vto, Vfrom); + break; + case FP::RoundingMode::ToOdd: + ASSERT_FALSE("Unimplemented"); + break; + default: + ASSERT_FALSE("Invalid RoundingMode"); + break; + } + } else { + switch (rounding_mode) { + case FP::RoundingMode::ToNearest_TieEven: + code.FCVTNU(Vto, Vfrom); + break; + case FP::RoundingMode::TowardsPlusInfinity: + code.FCVTPU(Vto, Vfrom); + break; + case FP::RoundingMode::TowardsMinusInfinity: + code.FCVTMU(Vto, Vfrom); + break; + case FP::RoundingMode::TowardsZero: + code.FCVTZU(Vto, Vfrom); + break; + case FP::RoundingMode::ToNearest_TieAwayFromZero: + code.FCVTAU(Vto, Vfrom); + break; + case FP::RoundingMode::ToOdd: + ASSERT_FALSE("Unimplemented"); + break; + default: + ASSERT_FALSE("Invalid RoundingMode"); + break; + } + } + } + }); +} + +template<typename Lambda> +static void EmitTwoOpFallbackWithoutRegAlloc(oaknut::CodeGenerator& code, EmitContext& ctx, oaknut::QReg Qresult, oaknut::QReg Qarg1, Lambda lambda, bool fpcr_controlled) { + const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda); + + const u32 fpcr = ctx.FPCR(fpcr_controlled).Value(); + constexpr u64 stack_size = sizeof(u64) * 4; // sizeof(u128) * 2 + + ABI_PushRegisters(code, ABI_CALLER_SAVE & ~(1ull << Qresult.index()), stack_size); + + code.MOV(Xscratch0, mcl::bit_cast<u64>(fn)); + code.ADD(X0, SP, 0 * 16); + code.ADD(X1, SP, 1 * 16); + code.MOV(X2, fpcr); + code.ADD(X3, Xstate, ctx.conf.state_fpsr_offset); + code.STR(Qarg1, X1); + code.BLR(Xscratch0); + code.LDR(Qresult, SP); + + ABI_PopRegisters(code, ABI_CALLER_SAVE & ~(1ull << Qresult.index()), stack_size); +} + +template<size_t fpcr_controlled_arg_index = 1, typename Lambda> +static void EmitTwoOpFallback(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Qarg1 = ctx.reg_alloc.ReadQ(args[0]); + auto Qresult = ctx.reg_alloc.WriteQ(inst); + RegAlloc::Realize(Qarg1, Qresult); + ctx.reg_alloc.SpillFlags(); + ctx.fpsr.Spill(); + + const bool fpcr_controlled = args[fpcr_controlled_arg_index].GetImmediateU1(); + EmitTwoOpFallbackWithoutRegAlloc(code, ctx, Qresult, Qarg1, lambda, fpcr_controlled); +} + +template<> +void EmitIR<IR::Opcode::FPVectorAbs16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Qresult = ctx.reg_alloc.ReadWriteQ(args[0], inst); + RegAlloc::Realize(Qresult); + + code.BIC(Qresult->H8(), 0b10000000, LSL, 8); +} + +template<> +void EmitIR<IR::Opcode::FPVectorAbs32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va) { code.FABS(Vresult, Va); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorAbs64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va) { code.FABS(Vresult, Va); }); +} + +template<> +void 
EmitIR<IR::Opcode::FPVectorAdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FADD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorAdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FADD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorDiv32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FDIV(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorDiv64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FDIV(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorEqual16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPVectorEqual32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FCMEQ(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorEqual64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FCMEQ(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorFromHalf32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8()); + ASSERT(rounding_mode == FP::RoundingMode::ToNearest_TieEven); + const bool fpcr_controlled = args[2].GetImmediateU1(); + + auto Qresult = ctx.reg_alloc.WriteQ(inst); + auto Doperand = ctx.reg_alloc.ReadD(args[0]); + RegAlloc::Realize(Qresult, Doperand); + ctx.fpsr.Load(); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + code.FCVTL(Qresult->S4(), Doperand->H4()); + }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorFromSignedFixed32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitFromFixed<32>(code, ctx, inst, [&](auto Vto, auto Vfrom, u8 fbits) { fbits ? code.SCVTF(Vto, Vfrom, fbits) : code.SCVTF(Vto, Vfrom); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorFromSignedFixed64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitFromFixed<64>(code, ctx, inst, [&](auto Vto, auto Vfrom, u8 fbits) { fbits ? code.SCVTF(Vto, Vfrom, fbits) : code.SCVTF(Vto, Vfrom); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorFromUnsignedFixed32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitFromFixed<32>(code, ctx, inst, [&](auto Vto, auto Vfrom, u8 fbits) { fbits ? code.UCVTF(Vto, Vfrom, fbits) : code.UCVTF(Vto, Vfrom); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorFromUnsignedFixed64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitFromFixed<64>(code, ctx, inst, [&](auto Vto, auto Vfrom, u8 fbits) { fbits ? 
code.UCVTF(Vto, Vfrom, fbits) : code.UCVTF(Vto, Vfrom); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorGreater32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FCMGT(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorGreater64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FCMGT(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorGreaterEqual32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FCMGE(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorGreaterEqual64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FCMGE(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorMax32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMAX(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorMax64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMAX(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorMaxNumeric32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMAXNM(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorMaxNumeric64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMAXNM(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorMin32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMIN(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorMin64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMIN(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorMinNumeric32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMINNM(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorMinNumeric64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMINNM(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorMul32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMUL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorMul64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMUL(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorMulAdd16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; 
+ (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPVectorMulAdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitFMA<32>(code, ctx, inst, [&](auto Va, auto Vn, auto Vm) { code.FMLA(Va, Vn, Vm); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorMulAdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitFMA<64>(code, ctx, inst, [&](auto Va, auto Vn, auto Vm) { code.FMLA(Va, Vn, Vm); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorMulX32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMULX(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorMulX64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FMULX(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorNeg16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPVectorNeg32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va) { code.FNEG(Vresult, Va); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorNeg64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va) { code.FNEG(Vresult, Va); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorPairedAdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FADDP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorPairedAdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FADDP(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorPairedAddLower32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp(code, ctx, inst, [&](auto& Qresult, auto& Qa, auto& Qb) { + code.ZIP1(V0.D2(), Qa->D2(), Qb->D2()); + code.MOVI(D1, oaknut::RepImm{0}); + code.FADDP(Qresult->S4(), V0.S4(), V1.S4()); + }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorPairedAddLower64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOp(code, ctx, inst, [&](auto& Qresult, auto& Qa, auto& Qb) { + code.ZIP1(V0.D2(), Qa->D2(), Qb->D2()); + code.FADDP(Qresult->toD(), V0.D2()); + }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorRecipEstimate16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPVectorRecipEstimate32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.FRECPE(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorRecipEstimate64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.FRECPE(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorRecipStepFused16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; 
+ (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPVectorRecipStepFused32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FRECPS(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorRecipStepFused64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FRECPS(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorRoundInt16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto rounding = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8()); + const bool exact = inst->GetArg(2).GetU1(); + + using rounding_list = mp::list< + mp::lift_value<FP::RoundingMode::ToNearest_TieEven>, + mp::lift_value<FP::RoundingMode::TowardsPlusInfinity>, + mp::lift_value<FP::RoundingMode::TowardsMinusInfinity>, + mp::lift_value<FP::RoundingMode::TowardsZero>, + mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>; + using exact_list = mp::list<std::true_type, std::false_type>; + + static const auto lut = Common::GenerateLookupTableFromList( + []<typename I>(I) { + using FPT = u16; + return std::pair{ + mp::lower_to_tuple_v<I>, + Common::FptrCast( + [](VectorArray<FPT>& output, const VectorArray<FPT>& input, FP::FPCR fpcr, FP::FPSR& fpsr) { + constexpr FP::RoundingMode rounding_mode = mp::get<0, I>::value; + constexpr bool exact = mp::get<1, I>::value; + + for (size_t i = 0; i < output.size(); ++i) { + output[i] = static_cast<FPT>(FP::FPRoundInt<FPT>(input[i], fpcr, rounding_mode, exact, fpsr)); + } + })}; + }, + mp::cartesian_product<rounding_list, exact_list>{}); + + EmitTwoOpFallback<3>(code, ctx, inst, lut.at(std::make_tuple(rounding, exact))); +} + +template<> +void EmitIR<IR::Opcode::FPVectorRoundInt32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto rounding_mode = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8()); + const bool exact = inst->GetArg(2).GetU1(); + const bool fpcr_controlled = inst->GetArg(3).GetU1(); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Qresult = ctx.reg_alloc.WriteQ(inst); + auto Qoperand = ctx.reg_alloc.ReadQ(args[0]); + RegAlloc::Realize(Qresult, Qoperand); + ctx.fpsr.Load(); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + if (exact) { + ASSERT(ctx.FPCR(fpcr_controlled).RMode() == rounding_mode); + code.FRINTX(Qresult->S4(), Qoperand->S4()); + } else { + switch (rounding_mode) { + case FP::RoundingMode::ToNearest_TieEven: + code.FRINTN(Qresult->S4(), Qoperand->S4()); + break; + case FP::RoundingMode::TowardsPlusInfinity: + code.FRINTP(Qresult->S4(), Qoperand->S4()); + break; + case FP::RoundingMode::TowardsMinusInfinity: + code.FRINTM(Qresult->S4(), Qoperand->S4()); + break; + case FP::RoundingMode::TowardsZero: + code.FRINTZ(Qresult->S4(), Qoperand->S4()); + break; + case FP::RoundingMode::ToNearest_TieAwayFromZero: + code.FRINTA(Qresult->S4(), Qoperand->S4()); + break; + default: + ASSERT_FALSE("Invalid RoundingMode"); + } + } + }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorRoundInt64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + const auto rounding_mode = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8()); + const bool exact = inst->GetArg(2).GetU1(); + const bool fpcr_controlled = inst->GetArg(3).GetU1(); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + 
auto Qresult = ctx.reg_alloc.WriteQ(inst); + auto Qoperand = ctx.reg_alloc.ReadQ(args[0]); + RegAlloc::Realize(Qresult, Qoperand); + ctx.fpsr.Load(); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + if (exact) { + ASSERT(ctx.FPCR(fpcr_controlled).RMode() == rounding_mode); + code.FRINTX(Qresult->D2(), Qoperand->D2()); + } else { + switch (rounding_mode) { + case FP::RoundingMode::ToNearest_TieEven: + code.FRINTN(Qresult->D2(), Qoperand->D2()); + break; + case FP::RoundingMode::TowardsPlusInfinity: + code.FRINTP(Qresult->D2(), Qoperand->D2()); + break; + case FP::RoundingMode::TowardsMinusInfinity: + code.FRINTM(Qresult->D2(), Qoperand->D2()); + break; + case FP::RoundingMode::TowardsZero: + code.FRINTZ(Qresult->D2(), Qoperand->D2()); + break; + case FP::RoundingMode::ToNearest_TieAwayFromZero: + code.FRINTA(Qresult->D2(), Qoperand->D2()); + break; + default: + ASSERT_FALSE("Invalid RoundingMode"); + } + } + }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorRSqrtEstimate16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPVectorRSqrtEstimate32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.FRSQRTE(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorRSqrtEstimate64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Voperand) { code.FRSQRTE(Vresult, Voperand); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorRSqrtStepFused16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPVectorRSqrtStepFused32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FRSQRTS(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorRSqrtStepFused64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FRSQRTS(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorSqrt32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va) { code.FSQRT(Vresult, Va); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorSqrt64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va) { code.FSQRT(Vresult, Va); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorSub32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FSUB(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorSub64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FSUB(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorToHalf32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8()); + ASSERT(rounding_mode == 
FP::RoundingMode::ToNearest_TieEven); + const bool fpcr_controlled = args[2].GetImmediateU1(); + + auto Dresult = ctx.reg_alloc.WriteD(inst); + auto Qoperand = ctx.reg_alloc.ReadQ(args[0]); + RegAlloc::Realize(Dresult, Qoperand); + ctx.fpsr.Load(); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + code.FCVTN(Dresult->H4(), Qoperand->S4()); + }); +} + +template<> +void EmitIR<IR::Opcode::FPVectorToSignedFixed16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPVectorToSignedFixed32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitToFixed<32, true>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::FPVectorToSignedFixed64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitToFixed<64, true>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::FPVectorToUnsignedFixed16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + (void)code; + (void)ctx; + (void)inst; + ASSERT_FALSE("Unimplemented"); +} + +template<> +void EmitIR<IR::Opcode::FPVectorToUnsignedFixed32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitToFixed<32, false>(code, ctx, inst); +} + +template<> +void EmitIR<IR::Opcode::FPVectorToUnsignedFixed64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + EmitToFixed<64, false>(code, ctx, inst); +} + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector_saturation.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector_saturation.cpp new file mode 100644 index 0000000000..722a09176b --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector_saturation.cpp @@ -0,0 +1,126 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/mp/metavalue/lift_value.hpp> +#include <oaknut/oaknut.hpp> + +#include "dynarmic/backend/arm64/a32_jitstate.h" +#include "dynarmic/backend/arm64/abi.h" +#include "dynarmic/backend/arm64/emit_arm64.h" +#include "dynarmic/backend/arm64/emit_context.h" +#include "dynarmic/backend/arm64/fpsr_manager.h" +#include "dynarmic/backend/arm64/reg_alloc.h" +#include "dynarmic/common/always_false.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" + +namespace Dynarmic::Backend::Arm64 { + +using namespace oaknut::util; + +template<size_t size, typename EmitFn> +static void Emit(oaknut::CodeGenerator&, EmitContext& ctx, IR::Inst* inst, EmitFn emit) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto Qresult = ctx.reg_alloc.WriteQ(inst); + auto Qa = ctx.reg_alloc.ReadQ(args[0]); + auto Qb = ctx.reg_alloc.ReadQ(args[1]); + RegAlloc::Realize(Qresult, Qa, Qb); + ctx.fpsr.Load(); + + if constexpr (size == 8) { + emit(Qresult->B16(), Qa->B16(), Qb->B16()); + } else if constexpr (size == 16) { + emit(Qresult->H8(), Qa->H8(), Qb->H8()); + } else if constexpr (size == 32) { + emit(Qresult->S4(), Qa->S4(), Qb->S4()); + } else if constexpr (size == 64) { + emit(Qresult->D2(), Qa->D2(), Qb->D2()); + } else { + static_assert(Common::always_false_v<mcl::mp::lift_value<size>>); + } +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedAdd8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + Emit<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQADD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedAdd16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + Emit<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQADD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedAdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + Emit<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQADD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedAdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + Emit<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQADD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedSub8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + Emit<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQSUB(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedSub16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + Emit<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQSUB(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedSub32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + Emit<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQSUB(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorSignedSaturatedSub64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + Emit<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.SQSUB(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedSaturatedAdd8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + Emit<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UQADD(Vresult, Va, Vb); }); +} + 
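+// Note: the saturating SQADD/SQSUB/UQADD/UQSUB forms emitted in this file set the cumulative FPSR.QC flag when a result saturates; Emit<> loads the FPSR via ctx.fpsr.Load() before issuing them, and FpsrManager::Spill() later ORs the host FPSR (including QC) back into the guest state.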
+template<> +void EmitIR<IR::Opcode::VectorUnsignedSaturatedAdd16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + Emit<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UQADD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedSaturatedAdd32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + Emit<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UQADD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedSaturatedAdd64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + Emit<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UQADD(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedSaturatedSub8>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + Emit<8>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UQSUB(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedSaturatedSub16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + Emit<16>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UQSUB(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedSaturatedSub32>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + Emit<32>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UQSUB(Vresult, Va, Vb); }); +} + +template<> +void EmitIR<IR::Opcode::VectorUnsignedSaturatedSub64>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { + Emit<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.UQSUB(Vresult, Va, Vb); }); +} + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/emit_context.h b/externals/dynarmic/src/dynarmic/backend/arm64/emit_context.h new file mode 100644 index 0000000000..6bfc6f3cae --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/emit_context.h @@ -0,0 +1,51 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <functional> +#include <memory> +#include <vector> + +#include <oaknut/oaknut.hpp> + +#include "dynarmic/backend/arm64/emit_arm64.h" +#include "dynarmic/backend/arm64/reg_alloc.h" +#include "dynarmic/common/fp/fpcr.h" +#include "dynarmic/ir/basic_block.h" + +namespace Dynarmic::IR { +class Block; +} // namespace Dynarmic::IR + +namespace Dynarmic::Backend::Arm64 { + +struct EmitConfig; +class FastmemManager; +class FpsrManager; + +using SharedLabel = std::shared_ptr<oaknut::Label>; + +inline SharedLabel GenSharedLabel() { + return std::make_shared<oaknut::Label>(); +} + +struct EmitContext { + IR::Block& block; + RegAlloc& reg_alloc; + const EmitConfig& conf; + EmittedBlockInfo& ebi; + FpsrManager& fpsr; + FastmemManager& fastmem; + + std::vector<std::function<void()>> deferred_emits; + + FP::FPCR FPCR(bool fpcr_controlled = true) const { + const FP::FPCR fpcr = conf.descriptor_to_fpcr(block.Location()); + return fpcr_controlled ? fpcr : fpcr.ASIMDStandardValue(); + } +}; + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/exclusive_monitor.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/exclusive_monitor.cpp new file mode 100644 index 0000000000..d57a29cd85 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/exclusive_monitor.cpp @@ -0,0 +1,58 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/interface/exclusive_monitor.h" + +#include <algorithm> + +#include <mcl/assert.hpp> + +namespace Dynarmic { + +ExclusiveMonitor::ExclusiveMonitor(size_t processor_count) + : exclusive_addresses(processor_count, INVALID_EXCLUSIVE_ADDRESS), exclusive_values(processor_count) {} + +size_t ExclusiveMonitor::GetProcessorCount() const { + return exclusive_addresses.size(); +} + +void ExclusiveMonitor::Lock() { + lock.Lock(); +} + +void ExclusiveMonitor::Unlock() { + lock.Unlock(); +} + +bool ExclusiveMonitor::CheckAndClear(size_t processor_id, VAddr address) { + const VAddr masked_address = address & RESERVATION_GRANULE_MASK; + + Lock(); + if (exclusive_addresses[processor_id] != masked_address) { + Unlock(); + return false; + } + + for (VAddr& other_address : exclusive_addresses) { + if (other_address == masked_address) { + other_address = INVALID_EXCLUSIVE_ADDRESS; + } + } + return true; +} + +void ExclusiveMonitor::Clear() { + Lock(); + std::fill(exclusive_addresses.begin(), exclusive_addresses.end(), INVALID_EXCLUSIVE_ADDRESS); + Unlock(); +} + +void ExclusiveMonitor::ClearProcessor(size_t processor_id) { + Lock(); + exclusive_addresses[processor_id] = INVALID_EXCLUSIVE_ADDRESS; + Unlock(); +} + +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/fastmem.h b/externals/dynarmic/src/dynarmic/backend/arm64/fastmem.h new file mode 100644 index 0000000000..8ed686aeaf --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/fastmem.h @@ -0,0 +1,56 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <cstddef> +#include <tuple> + +#include <mcl/hash/xmrx.hpp> +#include <mcl/stdint.hpp> +#include <tsl/robin_set.h> + +#include "dynarmic/backend/exception_handler.h" +#include "dynarmic/ir/location_descriptor.h" + +namespace Dynarmic::Backend::Arm64 { + +using DoNotFastmemMarker = std::tuple<IR::LocationDescriptor, unsigned>; + +struct DoNotFastmemMarkerHash { + size_t operator()(const DoNotFastmemMarker& value) const { + return mcl::hash::xmrx(std::get<0>(value).Value() ^ static_cast<u64>(std::get<1>(value))); + } +}; + +struct FastmemPatchInfo { + DoNotFastmemMarker marker; + FakeCall fc; + bool recompile; +}; + +class FastmemManager { +public: + explicit FastmemManager(ExceptionHandler& eh) + : exception_handler(eh) {} + + bool SupportsFastmem() const { + return exception_handler.SupportsFastmem(); + } + + bool ShouldFastmem(DoNotFastmemMarker marker) const { + return do_not_fastmem.count(marker) == 0; + } + + void MarkDoNotFastmem(DoNotFastmemMarker marker) { + do_not_fastmem.insert(marker); + } + +private: + ExceptionHandler& exception_handler; + tsl::robin_set<DoNotFastmemMarker, DoNotFastmemMarkerHash> do_not_fastmem; +}; + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/fpsr_manager.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/fpsr_manager.cpp new file mode 100644 index 0000000000..1a1c323e65 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/fpsr_manager.cpp @@ -0,0 +1,40 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/arm64/fpsr_manager.h" + +#include <oaknut/oaknut.hpp> + +#include "dynarmic/backend/arm64/abi.h" + +namespace Dynarmic::Backend::Arm64 { + +using namespace oaknut::util; + +FpsrManager::FpsrManager(oaknut::CodeGenerator& code, size_t state_fpsr_offset) + : code{code}, state_fpsr_offset{state_fpsr_offset} {} + +void FpsrManager::Spill() { + if (!fpsr_loaded) + return; + + code.LDR(Wscratch0, Xstate, state_fpsr_offset); + code.MRS(Xscratch1, oaknut::SystemReg::FPSR); + code.ORR(Wscratch0, Wscratch0, Wscratch1); + code.STR(Wscratch0, Xstate, state_fpsr_offset); + + fpsr_loaded = false; +} + +void FpsrManager::Load() { + if (fpsr_loaded) + return; + + code.MSR(oaknut::SystemReg::FPSR, XZR); + + fpsr_loaded = true; +} + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/fpsr_manager.h b/externals/dynarmic/src/dynarmic/backend/arm64/fpsr_manager.h new file mode 100644 index 0000000000..e003522dde --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/fpsr_manager.h @@ -0,0 +1,30 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <mcl/stdint.hpp> + +namespace oaknut { +struct CodeGenerator; +} // namespace oaknut + +namespace Dynarmic::Backend::Arm64 { + +class FpsrManager { +public: + explicit FpsrManager(oaknut::CodeGenerator& code, size_t state_fpsr_offset); + + void Spill(); + void Load(); + void Overwrite() { fpsr_loaded = false; } + +private: + oaknut::CodeGenerator& code; + size_t state_fpsr_offset; + bool fpsr_loaded = false; +}; + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/reg_alloc.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/reg_alloc.cpp new file mode 100644 index 0000000000..fedee02f07 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/reg_alloc.cpp @@ -0,0 +1,613 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/arm64/reg_alloc.h" + +#include <algorithm> +#include <array> +#include <iterator> + +#include <mcl/assert.hpp> +#include <mcl/bit/bit_field.hpp> +#include <mcl/bit_cast.hpp> +#include <mcl/mp/metavalue/lift_value.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/backend/arm64/abi.h" +#include "dynarmic/backend/arm64/emit_context.h" +#include "dynarmic/backend/arm64/fpsr_manager.h" +#include "dynarmic/backend/arm64/verbose_debugging_output.h" +#include "dynarmic/common/always_false.h" + +namespace Dynarmic::Backend::Arm64 { + +using namespace oaknut::util; + +constexpr size_t spill_offset = offsetof(StackLayout, spill); +constexpr size_t spill_slot_size = sizeof(decltype(StackLayout::spill)::value_type); + +static bool IsValuelessType(IR::Type type) { + switch (type) { + case IR::Type::Table: + return true; + default: + return false; + } +} + +IR::Type Argument::GetType() const { + return value.GetType(); +} + +bool Argument::IsImmediate() const { + return value.IsImmediate(); +} + +bool Argument::GetImmediateU1() const { + return value.GetU1(); +} + +u8 Argument::GetImmediateU8() const { + const u64 imm = value.GetImmediateAsU64(); + ASSERT(imm < 0x100); + return u8(imm); +} + +u16 Argument::GetImmediateU16() const { + const u64 imm = value.GetImmediateAsU64(); + ASSERT(imm < 0x10000); + return u16(imm); +} + +u32 Argument::GetImmediateU32() const { + const u64 imm = value.GetImmediateAsU64(); + ASSERT(imm < 0x100000000); + return u32(imm); +} + +u64 Argument::GetImmediateU64() const { + return value.GetImmediateAsU64(); +} + +IR::Cond Argument::GetImmediateCond() const { + ASSERT(IsImmediate() && GetType() == IR::Type::Cond); + return value.GetCond(); +} + +IR::AccType Argument::GetImmediateAccType() const { + ASSERT(IsImmediate() && GetType() == IR::Type::AccType); + return value.GetAccType(); +} + +HostLoc::Kind Argument::CurrentLocationKind() const { + return reg_alloc.ValueLocation(value.GetInst())->kind; +} + +bool HostLocInfo::Contains(const IR::Inst* value) const { + return std::find(values.begin(), values.end(), value) != values.end(); +} + +void HostLocInfo::SetupScratchLocation() { + ASSERT(IsCompletelyEmpty()); + realized = true; +} + +void HostLocInfo::SetupLocation(const IR::Inst* value) { + ASSERT(IsCompletelyEmpty()); + values.clear(); + values.push_back(value); + realized = true; + uses_this_inst = 0; + accumulated_uses = 0; + expected_uses = value->UseCount(); +} + +bool HostLocInfo::IsCompletelyEmpty() const { + return values.empty() && !locked && !realized && !accumulated_uses && !expected_uses && !uses_this_inst; +} + +bool HostLocInfo::MaybeAllocatable() const { + return !locked && !realized; +} + +bool HostLocInfo::IsOneRemainingUse() const { + return accumulated_uses + 1 == expected_uses && uses_this_inst == 1; +} + +void HostLocInfo::UpdateUses() { + accumulated_uses += uses_this_inst; + uses_this_inst = 0; + + if (accumulated_uses == expected_uses) { + values.clear(); + accumulated_uses = 0; + expected_uses = 0; + } +} + +RegAlloc::ArgumentInfo RegAlloc::GetArgumentInfo(IR::Inst* inst) { + ArgumentInfo ret = {Argument{*this}, Argument{*this}, Argument{*this}, Argument{*this}}; + for (size_t i = 0; i < inst->NumArgs(); i++) { + const IR::Value arg = inst->GetArg(i); + ret[i].value = arg; + if (!arg.IsImmediate() && !IsValuelessType(arg.GetType())) { + ASSERT_MSG(ValueLocation(arg.GetInst()), "argument must already been defined"); + 
ValueInfo(arg.GetInst()).uses_this_inst++; + } + } + return ret; +} + +bool RegAlloc::WasValueDefined(IR::Inst* inst) const { + return defined_insts.count(inst) > 0; +} + +void RegAlloc::PrepareForCall(std::optional<Argument::copyable_reference> arg0, std::optional<Argument::copyable_reference> arg1, std::optional<Argument::copyable_reference> arg2, std::optional<Argument::copyable_reference> arg3) { + fpsr_manager.Spill(); + SpillFlags(); + + // TODO: Spill into callee-save registers + + for (int i = 0; i < 32; i++) { + if (mcl::bit::get_bit(i, static_cast<u32>(ABI_CALLER_SAVE))) { + SpillGpr(i); + } + } + + for (int i = 0; i < 32; i++) { + if (mcl::bit::get_bit(i, static_cast<u32>(ABI_CALLER_SAVE >> 32))) { + SpillFpr(i); + } + } + + const std::array<std::optional<Argument::copyable_reference>, 4> args{arg0, arg1, arg2, arg3}; + + // AAPCS64 Next General-purpose Register Number + int ngrn = 0; + // AAPCS64 Next SIMD and Floating-point Register Number + int nsrn = 0; + + for (int i = 0; i < 4; i++) { + if (args[i]) { + if (args[i]->get().GetType() == IR::Type::U128) { + ASSERT(fprs[nsrn].IsCompletelyEmpty()); + LoadCopyInto(args[i]->get().value, oaknut::QReg{nsrn}); + nsrn++; + } else { + ASSERT(gprs[ngrn].IsCompletelyEmpty()); + LoadCopyInto(args[i]->get().value, oaknut::XReg{ngrn}); + ngrn++; + } + } else { + // Gaps are assumed to be in general-purpose registers + // TODO: should there be a separate list passed for FPRs instead? + ngrn++; + } + } +} + +void RegAlloc::DefineAsExisting(IR::Inst* inst, Argument& arg) { + defined_insts.insert(inst); + + ASSERT(!ValueLocation(inst)); + + if (arg.value.IsImmediate()) { + inst->ReplaceUsesWith(arg.value); + return; + } + + auto& info = ValueInfo(arg.value.GetInst()); + info.values.push_back(inst); + info.expected_uses += inst->UseCount(); +} + +void RegAlloc::DefineAsRegister(IR::Inst* inst, oaknut::Reg reg) { + defined_insts.insert(inst); + + ASSERT(!ValueLocation(inst)); + auto& info = reg.is_vector() ? 
fprs[reg.index()] : gprs[reg.index()]; + ASSERT(info.IsCompletelyEmpty()); + info.values.push_back(inst); + info.expected_uses += inst->UseCount(); +} + +void RegAlloc::UpdateAllUses() { + for (auto& gpr : gprs) { + gpr.UpdateUses(); + } + for (auto& fpr : fprs) { + fpr.UpdateUses(); + } + flags.UpdateUses(); + for (auto& spill : spills) { + spill.UpdateUses(); + } +} + +void RegAlloc::AssertAllUnlocked() const { + const auto is_unlocked = [](const auto& i) { return !i.locked && !i.realized; }; + ASSERT(std::all_of(gprs.begin(), gprs.end(), is_unlocked)); + ASSERT(std::all_of(fprs.begin(), fprs.end(), is_unlocked)); + ASSERT(is_unlocked(flags)); + ASSERT(std::all_of(spills.begin(), spills.end(), is_unlocked)); +} + +void RegAlloc::AssertNoMoreUses() const { + const auto is_empty = [](const auto& i) { return i.IsCompletelyEmpty(); }; + ASSERT(std::all_of(gprs.begin(), gprs.end(), is_empty)); + ASSERT(std::all_of(fprs.begin(), fprs.end(), is_empty)); + ASSERT(is_empty(flags)); + ASSERT(std::all_of(spills.begin(), spills.end(), is_empty)); +} + +void RegAlloc::EmitVerboseDebuggingOutput() { + code.MOV(X19, mcl::bit_cast<u64>(&PrintVerboseDebuggingOutputLine)); // Non-volatile register + + const auto do_location = [&](HostLocInfo& info, HostLocType type, size_t index) { + using namespace oaknut::util; + for (const IR::Inst* value : info.values) { + code.MOV(X0, SP); + code.MOV(X1, static_cast<u64>(type)); + code.MOV(X2, index); + code.MOV(X3, value->GetName()); + code.MOV(X4, static_cast<u64>(value->GetType())); + code.BLR(X19); + } + }; + + for (size_t i = 0; i < gprs.size(); i++) { + do_location(gprs[i], HostLocType::X, i); + } + for (size_t i = 0; i < fprs.size(); i++) { + do_location(fprs[i], HostLocType::Q, i); + } + do_location(flags, HostLocType::Nzcv, 0); + for (size_t i = 0; i < spills.size(); i++) { + do_location(spills[i], HostLocType::Spill, i); + } +} + +template<HostLoc::Kind kind> +int RegAlloc::GenerateImmediate(const IR::Value& value) { + ASSERT(value.GetType() != IR::Type::U1); + if constexpr (kind == HostLoc::Kind::Gpr) { + const int new_location_index = AllocateRegister(gprs, gpr_order); + SpillGpr(new_location_index); + gprs[new_location_index].SetupScratchLocation(); + + code.MOV(oaknut::XReg{new_location_index}, value.GetImmediateAsU64()); + + return new_location_index; + } else if constexpr (kind == HostLoc::Kind::Fpr) { + const int new_location_index = AllocateRegister(fprs, fpr_order); + SpillFpr(new_location_index); + fprs[new_location_index].SetupScratchLocation(); + + code.MOV(Xscratch0, value.GetImmediateAsU64()); + code.FMOV(oaknut::DReg{new_location_index}, Xscratch0); + + return new_location_index; + } else if constexpr (kind == HostLoc::Kind::Flags) { + SpillFlags(); + flags.SetupScratchLocation(); + + code.MOV(Xscratch0, value.GetImmediateAsU64()); + code.MSR(oaknut::SystemReg::NZCV, Xscratch0); + + return 0; + } else { + static_assert(Common::always_false_v<mcl::mp::lift_value<kind>>); + } +} + +template<HostLoc::Kind required_kind> +int RegAlloc::RealizeReadImpl(const IR::Value& value) { + if (value.IsImmediate()) { + return GenerateImmediate<required_kind>(value); + } + + const auto current_location = ValueLocation(value.GetInst()); + ASSERT(current_location); + + if (current_location->kind == required_kind) { + ValueInfo(*current_location).realized = true; + return current_location->index; + } + + ASSERT(!ValueInfo(*current_location).realized); + ASSERT(ValueInfo(*current_location).locked); + + if constexpr (required_kind == HostLoc::Kind::Gpr) { + const 
int new_location_index = AllocateRegister(gprs, gpr_order); + SpillGpr(new_location_index); + + switch (current_location->kind) { + case HostLoc::Kind::Gpr: + ASSERT_FALSE("Logic error"); + break; + case HostLoc::Kind::Fpr: + code.FMOV(oaknut::XReg{new_location_index}, oaknut::DReg{current_location->index}); + // ASSERT size fits + break; + case HostLoc::Kind::Spill: + code.LDR(oaknut::XReg{new_location_index}, SP, spill_offset + current_location->index * spill_slot_size); + break; + case HostLoc::Kind::Flags: + code.MRS(oaknut::XReg{new_location_index}, oaknut::SystemReg::NZCV); + break; + } + + gprs[new_location_index] = std::exchange(ValueInfo(*current_location), {}); + gprs[new_location_index].realized = true; + return new_location_index; + } else if constexpr (required_kind == HostLoc::Kind::Fpr) { + const int new_location_index = AllocateRegister(fprs, fpr_order); + SpillFpr(new_location_index); + + switch (current_location->kind) { + case HostLoc::Kind::Gpr: + code.FMOV(oaknut::DReg{new_location_index}, oaknut::XReg{current_location->index}); + break; + case HostLoc::Kind::Fpr: + ASSERT_FALSE("Logic error"); + break; + case HostLoc::Kind::Spill: + code.LDR(oaknut::QReg{new_location_index}, SP, spill_offset + current_location->index * spill_slot_size); + break; + case HostLoc::Kind::Flags: + ASSERT_FALSE("Moving from flags into fprs is not currently supported"); + break; + } + + fprs[new_location_index] = std::exchange(ValueInfo(*current_location), {}); + fprs[new_location_index].realized = true; + return new_location_index; + } else if constexpr (required_kind == HostLoc::Kind::Flags) { + ASSERT_FALSE("A simple read from flags is likely a logic error."); + } else { + static_assert(Common::always_false_v<mcl::mp::lift_value<required_kind>>); + } +} + +template<HostLoc::Kind kind> +int RegAlloc::RealizeWriteImpl(const IR::Inst* value) { + defined_insts.insert(value); + + ASSERT(!ValueLocation(value)); + + if constexpr (kind == HostLoc::Kind::Gpr) { + const int new_location_index = AllocateRegister(gprs, gpr_order); + SpillGpr(new_location_index); + gprs[new_location_index].SetupLocation(value); + return new_location_index; + } else if constexpr (kind == HostLoc::Kind::Fpr) { + const int new_location_index = AllocateRegister(fprs, fpr_order); + SpillFpr(new_location_index); + fprs[new_location_index].SetupLocation(value); + return new_location_index; + } else if constexpr (kind == HostLoc::Kind::Flags) { + SpillFlags(); + flags.SetupLocation(value); + return 0; + } else { + static_assert(Common::always_false_v<mcl::mp::lift_value<kind>>); + } +} + +template<HostLoc::Kind kind> +int RegAlloc::RealizeReadWriteImpl(const IR::Value& read_value, const IR::Inst* write_value) { + defined_insts.insert(write_value); + + // TODO: Move elimination + + const int write_loc = RealizeWriteImpl<kind>(write_value); + + if constexpr (kind == HostLoc::Kind::Gpr) { + LoadCopyInto(read_value, oaknut::XReg{write_loc}); + return write_loc; + } else if constexpr (kind == HostLoc::Kind::Fpr) { + LoadCopyInto(read_value, oaknut::QReg{write_loc}); + return write_loc; + } else if constexpr (kind == HostLoc::Kind::Flags) { + ASSERT_FALSE("Incorrect function for ReadWrite of flags"); + } else { + static_assert(Common::always_false_v<mcl::mp::lift_value<kind>>); + } +} + +template int RegAlloc::RealizeReadImpl<HostLoc::Kind::Gpr>(const IR::Value& value); +template int RegAlloc::RealizeReadImpl<HostLoc::Kind::Fpr>(const IR::Value& value); +template int RegAlloc::RealizeReadImpl<HostLoc::Kind::Flags>(const IR::Value& 
value); +template int RegAlloc::RealizeWriteImpl<HostLoc::Kind::Gpr>(const IR::Inst* value); +template int RegAlloc::RealizeWriteImpl<HostLoc::Kind::Fpr>(const IR::Inst* value); +template int RegAlloc::RealizeWriteImpl<HostLoc::Kind::Flags>(const IR::Inst* value); +template int RegAlloc::RealizeReadWriteImpl<HostLoc::Kind::Gpr>(const IR::Value&, const IR::Inst*); +template int RegAlloc::RealizeReadWriteImpl<HostLoc::Kind::Fpr>(const IR::Value&, const IR::Inst*); +template int RegAlloc::RealizeReadWriteImpl<HostLoc::Kind::Flags>(const IR::Value&, const IR::Inst*); + +int RegAlloc::AllocateRegister(const std::array<HostLocInfo, 32>& regs, const std::vector<int>& order) const { + const auto empty = std::find_if(order.begin(), order.end(), [&](int i) { return regs[i].IsCompletelyEmpty(); }); + if (empty != order.end()) { + return *empty; + } + + std::vector<int> candidates; + std::copy_if(order.begin(), order.end(), std::back_inserter(candidates), [&](int i) { return regs[i].MaybeAllocatable(); }); + + // TODO: LRU + std::uniform_int_distribution<size_t> dis{0, candidates.size() - 1}; + return candidates[dis(rand_gen)]; +} + +void RegAlloc::SpillGpr(int index) { + ASSERT(!gprs[index].locked && !gprs[index].realized); + if (gprs[index].values.empty()) { + return; + } + const int new_location_index = FindFreeSpill(); + code.STR(oaknut::XReg{index}, SP, spill_offset + new_location_index * spill_slot_size); + spills[new_location_index] = std::exchange(gprs[index], {}); +} + +void RegAlloc::SpillFpr(int index) { + ASSERT(!fprs[index].locked && !fprs[index].realized); + if (fprs[index].values.empty()) { + return; + } + const int new_location_index = FindFreeSpill(); + code.STR(oaknut::QReg{index}, SP, spill_offset + new_location_index * spill_slot_size); + spills[new_location_index] = std::exchange(fprs[index], {}); +} + +void RegAlloc::ReadWriteFlags(Argument& read, IR::Inst* write) { + defined_insts.insert(write); + + const auto current_location = ValueLocation(read.value.GetInst()); + ASSERT(current_location); + + if (current_location->kind == HostLoc::Kind::Flags) { + if (!flags.IsOneRemainingUse()) { + SpillFlags(); + } + } else if (current_location->kind == HostLoc::Kind::Gpr) { + if (!flags.values.empty()) { + SpillFlags(); + } + code.MSR(oaknut::SystemReg::NZCV, oaknut::XReg{current_location->index}); + } else if (current_location->kind == HostLoc::Kind::Spill) { + if (!flags.values.empty()) { + SpillFlags(); + } + code.LDR(Wscratch0, SP, spill_offset + current_location->index * spill_slot_size); + code.MSR(oaknut::SystemReg::NZCV, Xscratch0); + } else { + ASSERT_FALSE("Invalid current location for flags"); + } + + if (write) { + flags.SetupLocation(write); + flags.realized = false; + } +} + +void RegAlloc::SpillFlags() { + ASSERT(!flags.locked && !flags.realized); + if (flags.values.empty()) { + return; + } + const int new_location_index = AllocateRegister(gprs, gpr_order); + SpillGpr(new_location_index); + code.MRS(oaknut::XReg{new_location_index}, oaknut::SystemReg::NZCV); + gprs[new_location_index] = std::exchange(flags, {}); +} + +int RegAlloc::FindFreeSpill() const { + const auto iter = std::find_if(spills.begin(), spills.end(), [](const HostLocInfo& info) { return info.values.empty(); }); + ASSERT_MSG(iter != spills.end(), "All spill locations are full"); + return static_cast<int>(iter - spills.begin()); +} + +void RegAlloc::LoadCopyInto(const IR::Value& value, oaknut::XReg reg) { + if (value.IsImmediate()) { + code.MOV(reg, value.GetImmediateAsU64()); + return; + } + + const auto 
current_location = ValueLocation(value.GetInst()); + ASSERT(current_location); + switch (current_location->kind) { + case HostLoc::Kind::Gpr: + code.MOV(reg, oaknut::XReg{current_location->index}); + break; + case HostLoc::Kind::Fpr: + code.FMOV(reg, oaknut::DReg{current_location->index}); + // ASSERT size fits + break; + case HostLoc::Kind::Spill: + code.LDR(reg, SP, spill_offset + current_location->index * spill_slot_size); + break; + case HostLoc::Kind::Flags: + code.MRS(reg, oaknut::SystemReg::NZCV); + break; + } +} + +void RegAlloc::LoadCopyInto(const IR::Value& value, oaknut::QReg reg) { + if (value.IsImmediate()) { + code.MOV(Xscratch0, value.GetImmediateAsU64()); + code.FMOV(reg.toD(), Xscratch0); + return; + } + + const auto current_location = ValueLocation(value.GetInst()); + ASSERT(current_location); + switch (current_location->kind) { + case HostLoc::Kind::Gpr: + code.FMOV(reg.toD(), oaknut::XReg{current_location->index}); + break; + case HostLoc::Kind::Fpr: + code.MOV(reg.B16(), oaknut::QReg{current_location->index}.B16()); + break; + case HostLoc::Kind::Spill: + // TODO: Minimize move size to max value width + code.LDR(reg, SP, spill_offset + current_location->index * spill_slot_size); + break; + case HostLoc::Kind::Flags: + ASSERT_FALSE("Moving from flags into fprs is not currently supported"); + break; + } +} + +std::optional<HostLoc> RegAlloc::ValueLocation(const IR::Inst* value) const { + const auto contains_value = [value](const HostLocInfo& info) { return info.Contains(value); }; + + if (const auto iter = std::find_if(gprs.begin(), gprs.end(), contains_value); iter != gprs.end()) { + return HostLoc{HostLoc::Kind::Gpr, static_cast<int>(iter - gprs.begin())}; + } + if (const auto iter = std::find_if(fprs.begin(), fprs.end(), contains_value); iter != fprs.end()) { + return HostLoc{HostLoc::Kind::Fpr, static_cast<int>(iter - fprs.begin())}; + } + if (contains_value(flags)) { + return HostLoc{HostLoc::Kind::Flags, 0}; + } + if (const auto iter = std::find_if(spills.begin(), spills.end(), contains_value); iter != spills.end()) { + return HostLoc{HostLoc::Kind::Spill, static_cast<int>(iter - spills.begin())}; + } + return std::nullopt; +} + +HostLocInfo& RegAlloc::ValueInfo(HostLoc host_loc) { + switch (host_loc.kind) { + case HostLoc::Kind::Gpr: + return gprs[static_cast<size_t>(host_loc.index)]; + case HostLoc::Kind::Fpr: + return fprs[static_cast<size_t>(host_loc.index)]; + case HostLoc::Kind::Flags: + return flags; + case HostLoc::Kind::Spill: + return spills[static_cast<size_t>(host_loc.index)]; + } + ASSERT_FALSE("RegAlloc::ValueInfo: Invalid HostLoc::Kind"); +} + +HostLocInfo& RegAlloc::ValueInfo(const IR::Inst* value) { + const auto contains_value = [value](const HostLocInfo& info) { return info.Contains(value); }; + + if (const auto iter = std::find_if(gprs.begin(), gprs.end(), contains_value); iter != gprs.end()) { + return *iter; + } + if (const auto iter = std::find_if(fprs.begin(), fprs.end(), contains_value); iter != fprs.end()) { + return *iter; + } + if (contains_value(flags)) { + return flags; + } + if (const auto iter = std::find_if(spills.begin(), spills.end(), contains_value); iter != spills.end()) { + return *iter; + } + ASSERT_FALSE("RegAlloc::ValueInfo: Value not found"); +} + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/reg_alloc.h b/externals/dynarmic/src/dynarmic/backend/arm64/reg_alloc.h new file mode 100644 index 0000000000..3a2d538819 --- /dev/null +++ 
b/externals/dynarmic/src/dynarmic/backend/arm64/reg_alloc.h @@ -0,0 +1,376 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <array> +#include <optional> +#include <random> +#include <utility> +#include <vector> + +#include <mcl/assert.hpp> +#include <mcl/stdint.hpp> +#include <mcl/type_traits/is_instance_of_template.hpp> +#include <oaknut/oaknut.hpp> +#include <tsl/robin_set.h> + +#include "dynarmic/backend/arm64/stack_layout.h" +#include "dynarmic/ir/cond.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/value.h" + +namespace Dynarmic::Backend::Arm64 { + +class FpsrManager; +class RegAlloc; + +struct HostLoc final { + enum class Kind { + Gpr, + Fpr, + Flags, + Spill, + } kind; + int index; +}; + +enum RWType { + Void, + Read, + Write, + ReadWrite, +}; + +struct Argument final { +public: + using copyable_reference = std::reference_wrapper<Argument>; + + IR::Type GetType() const; + bool IsVoid() const { return GetType() == IR::Type::Void; } + bool IsImmediate() const; + + bool GetImmediateU1() const; + u8 GetImmediateU8() const; + u16 GetImmediateU16() const; + u32 GetImmediateU32() const; + u64 GetImmediateU64() const; + IR::Cond GetImmediateCond() const; + IR::AccType GetImmediateAccType() const; + + // Only valid if not immediate + HostLoc::Kind CurrentLocationKind() const; + bool IsInGpr() const { return !IsImmediate() && CurrentLocationKind() == HostLoc::Kind::Gpr; } + bool IsInFpr() const { return !IsImmediate() && CurrentLocationKind() == HostLoc::Kind::Fpr; } + +private: + friend class RegAlloc; + explicit Argument(RegAlloc& reg_alloc) + : reg_alloc{reg_alloc} {} + + bool allocated = false; + RegAlloc& reg_alloc; + IR::Value value; +}; + +struct FlagsTag final { +private: + template<typename> + friend struct RAReg; + + explicit FlagsTag(int) {} + int index() const { return 0; } +}; + +template<typename T> +struct RAReg final { +public: + static constexpr HostLoc::Kind kind = !std::is_same_v<FlagsTag, T> + ? std::is_base_of_v<oaknut::VReg, T> + ? 
HostLoc::Kind::Fpr + : HostLoc::Kind::Gpr + : HostLoc::Kind::Flags; + + operator T() const { return reg.value(); } + + operator oaknut::WRegWsp() const + requires(std::is_same_v<T, oaknut::WReg>) + { + return reg.value(); + } + + operator oaknut::XRegSp() const + requires(std::is_same_v<T, oaknut::XReg>) + { + return reg.value(); + } + + T operator*() const { return reg.value(); } + const T* operator->() const { return &reg.value(); } + + ~RAReg(); + RAReg(RAReg&& other) + : reg_alloc{other.reg_alloc} + , rw{std::exchange(other.rw, RWType::Void)} + , read_value{std::exchange(other.read_value, {})} + , write_value{std::exchange(other.write_value, nullptr)} + , reg{std::exchange(other.reg, std::nullopt)} { + } + RAReg& operator=(RAReg&&) = delete; + +private: + friend class RegAlloc; + explicit RAReg(RegAlloc& reg_alloc, RWType rw, const IR::Value& read_value, const IR::Inst* write_value); + + RAReg(const RAReg&) = delete; + RAReg& operator=(const RAReg&) = delete; + + void Realize(); + + RegAlloc& reg_alloc; + RWType rw; + IR::Value read_value; + const IR::Inst* write_value; + std::optional<T> reg; +}; + +struct HostLocInfo final { + std::vector<const IR::Inst*> values; + size_t locked = 0; + bool realized = false; + size_t uses_this_inst = 0; + size_t accumulated_uses = 0; + size_t expected_uses = 0; + + bool Contains(const IR::Inst*) const; + void SetupScratchLocation(); + void SetupLocation(const IR::Inst*); + bool IsCompletelyEmpty() const; + bool MaybeAllocatable() const; + bool IsOneRemainingUse() const; + void UpdateUses(); +}; + +class RegAlloc final { +public: + using ArgumentInfo = std::array<Argument, IR::max_arg_count>; + + explicit RegAlloc(oaknut::CodeGenerator& code, FpsrManager& fpsr_manager, std::vector<int> gpr_order, std::vector<int> fpr_order) + : code{code}, fpsr_manager{fpsr_manager}, gpr_order{gpr_order}, fpr_order{fpr_order}, rand_gen{std::random_device{}()} {} + + ArgumentInfo GetArgumentInfo(IR::Inst* inst); + bool WasValueDefined(IR::Inst* inst) const; + + auto ReadX(Argument& arg) { return RAReg<oaknut::XReg>{*this, RWType::Read, arg.value, nullptr}; } + auto ReadW(Argument& arg) { return RAReg<oaknut::WReg>{*this, RWType::Read, arg.value, nullptr}; } + + auto ReadQ(Argument& arg) { return RAReg<oaknut::QReg>{*this, RWType::Read, arg.value, nullptr}; } + auto ReadD(Argument& arg) { return RAReg<oaknut::DReg>{*this, RWType::Read, arg.value, nullptr}; } + auto ReadS(Argument& arg) { return RAReg<oaknut::SReg>{*this, RWType::Read, arg.value, nullptr}; } + auto ReadH(Argument& arg) { return RAReg<oaknut::HReg>{*this, RWType::Read, arg.value, nullptr}; } + auto ReadB(Argument& arg) { return RAReg<oaknut::BReg>{*this, RWType::Read, arg.value, nullptr}; } + + template<size_t size> + auto ReadReg(Argument& arg) { + if constexpr (size == 64) { + return ReadX(arg); + } else if constexpr (size == 32) { + return ReadW(arg); + } else { + ASSERT_FALSE("Invalid size to ReadReg {}", size); + } + } + + template<size_t size> + auto ReadVec(Argument& arg) { + if constexpr (size == 128) { + return ReadQ(arg); + } else if constexpr (size == 64) { + return ReadD(arg); + } else if constexpr (size == 32) { + return ReadS(arg); + } else if constexpr (size == 16) { + return ReadH(arg); + } else if constexpr (size == 8) { + return ReadB(arg); + } else { + ASSERT_FALSE("Invalid size to ReadVec {}", size); + } + } + + auto WriteX(IR::Inst* inst) { return RAReg<oaknut::XReg>{*this, RWType::Write, {}, inst}; } + auto WriteW(IR::Inst* inst) { return RAReg<oaknut::WReg>{*this, RWType::Write, {}, 
inst}; } + + auto WriteQ(IR::Inst* inst) { return RAReg<oaknut::QReg>{*this, RWType::Write, {}, inst}; } + auto WriteD(IR::Inst* inst) { return RAReg<oaknut::DReg>{*this, RWType::Write, {}, inst}; } + auto WriteS(IR::Inst* inst) { return RAReg<oaknut::SReg>{*this, RWType::Write, {}, inst}; } + auto WriteH(IR::Inst* inst) { return RAReg<oaknut::HReg>{*this, RWType::Write, {}, inst}; } + auto WriteB(IR::Inst* inst) { return RAReg<oaknut::BReg>{*this, RWType::Write, {}, inst}; } + + auto WriteFlags(IR::Inst* inst) { return RAReg<FlagsTag>{*this, RWType::Write, {}, inst}; } + + template<size_t size> + auto WriteReg(IR::Inst* inst) { + if constexpr (size == 64) { + return WriteX(inst); + } else if constexpr (size == 32) { + return WriteW(inst); + } else { + ASSERT_FALSE("Invalid size to WriteReg {}", size); + } + } + + template<size_t size> + auto WriteVec(IR::Inst* inst) { + if constexpr (size == 128) { + return WriteQ(inst); + } else if constexpr (size == 64) { + return WriteD(inst); + } else if constexpr (size == 32) { + return WriteS(inst); + } else if constexpr (size == 16) { + return WriteH(inst); + } else if constexpr (size == 8) { + return WriteB(inst); + } else { + ASSERT_FALSE("Invalid size to WriteVec {}", size); + } + } + + auto ReadWriteX(Argument& arg, const IR::Inst* inst) { return RAReg<oaknut::XReg>{*this, RWType::ReadWrite, arg.value, inst}; } + auto ReadWriteW(Argument& arg, const IR::Inst* inst) { return RAReg<oaknut::WReg>{*this, RWType::ReadWrite, arg.value, inst}; } + + auto ReadWriteQ(Argument& arg, const IR::Inst* inst) { return RAReg<oaknut::QReg>{*this, RWType::ReadWrite, arg.value, inst}; } + auto ReadWriteD(Argument& arg, const IR::Inst* inst) { return RAReg<oaknut::DReg>{*this, RWType::ReadWrite, arg.value, inst}; } + auto ReadWriteS(Argument& arg, const IR::Inst* inst) { return RAReg<oaknut::SReg>{*this, RWType::ReadWrite, arg.value, inst}; } + auto ReadWriteH(Argument& arg, const IR::Inst* inst) { return RAReg<oaknut::HReg>{*this, RWType::ReadWrite, arg.value, inst}; } + auto ReadWriteB(Argument& arg, const IR::Inst* inst) { return RAReg<oaknut::BReg>{*this, RWType::ReadWrite, arg.value, inst}; } + + template<size_t size> + auto ReadWriteReg(Argument& arg, const IR::Inst* inst) { + if constexpr (size == 64) { + return ReadWriteX(arg, inst); + } else if constexpr (size == 32) { + return ReadWriteW(arg, inst); + } else { + ASSERT_FALSE("Invalid size to ReadWriteReg {}", size); + } + } + + template<size_t size> + auto ReadWriteVec(Argument& arg, const IR::Inst* inst) { + if constexpr (size == 128) { + return ReadWriteQ(arg, inst); + } else if constexpr (size == 64) { + return ReadWriteD(arg, inst); + } else if constexpr (size == 32) { + return ReadWriteS(arg, inst); + } else if constexpr (size == 16) { + return ReadWriteH(arg, inst); + } else if constexpr (size == 8) { + return ReadWriteB(arg, inst); + } else { + ASSERT_FALSE("Invalid size to ReadWriteVec {}", size); + } + } + + void PrepareForCall(std::optional<Argument::copyable_reference> arg0 = {}, std::optional<Argument::copyable_reference> arg1 = {}, std::optional<Argument::copyable_reference> arg2 = {}, std::optional<Argument::copyable_reference> arg3 = {}); + + void DefineAsExisting(IR::Inst* inst, Argument& arg); + void DefineAsRegister(IR::Inst* inst, oaknut::Reg reg); + + void ReadWriteFlags(Argument& read, IR::Inst* write); + void SpillFlags(); + void SpillAll(); + + template<typename... Ts> + static void Realize(Ts&... 
rs) { + static_assert((mcl::is_instance_of_template<RAReg, Ts>() && ...)); + (rs.Realize(), ...); + } + + void UpdateAllUses(); + void AssertAllUnlocked() const; + void AssertNoMoreUses() const; + + void EmitVerboseDebuggingOutput(); + +private: + friend struct Argument; + template<typename> + friend struct RAReg; + + template<HostLoc::Kind kind> + int GenerateImmediate(const IR::Value& value); + template<HostLoc::Kind kind> + int RealizeReadImpl(const IR::Value& value); + template<HostLoc::Kind kind> + int RealizeWriteImpl(const IR::Inst* value); + template<HostLoc::Kind kind> + int RealizeReadWriteImpl(const IR::Value& read_value, const IR::Inst* write_value); + + int AllocateRegister(const std::array<HostLocInfo, 32>& regs, const std::vector<int>& order) const; + void SpillGpr(int index); + void SpillFpr(int index); + int FindFreeSpill() const; + + void LoadCopyInto(const IR::Value& value, oaknut::XReg reg); + void LoadCopyInto(const IR::Value& value, oaknut::QReg reg); + + std::optional<HostLoc> ValueLocation(const IR::Inst* value) const; + HostLocInfo& ValueInfo(HostLoc host_loc); + HostLocInfo& ValueInfo(const IR::Inst* value); + + oaknut::CodeGenerator& code; + FpsrManager& fpsr_manager; + std::vector<int> gpr_order; + std::vector<int> fpr_order; + + std::array<HostLocInfo, 32> gprs; + std::array<HostLocInfo, 32> fprs; + HostLocInfo flags; + std::array<HostLocInfo, SpillCount> spills; + + mutable std::mt19937 rand_gen; + + tsl::robin_set<const IR::Inst*> defined_insts; +}; + +template<typename T> +RAReg<T>::RAReg(RegAlloc& reg_alloc, RWType rw, const IR::Value& read_value, const IR::Inst* write_value) + : reg_alloc{reg_alloc}, rw{rw}, read_value{read_value}, write_value{write_value} { + if (rw != RWType::Write && !read_value.IsImmediate()) { + reg_alloc.ValueInfo(read_value.GetInst()).locked++; + } +} + +template<typename T> +RAReg<T>::~RAReg() { + if (rw != RWType::Write && !read_value.IsImmediate()) { + reg_alloc.ValueInfo(read_value.GetInst()).locked--; + } + if (reg) { + reg_alloc.ValueInfo(HostLoc{kind, reg->index()}).realized = false; + } +} + +template<typename T> +void RAReg<T>::Realize() { + switch (rw) { + case RWType::Read: + reg = T{reg_alloc.RealizeReadImpl<kind>(read_value)}; + break; + case RWType::Write: + reg = T{reg_alloc.RealizeWriteImpl<kind>(write_value)}; + break; + case RWType::ReadWrite: + reg = T{reg_alloc.RealizeReadWriteImpl<kind>(read_value, write_value)}; + break; + default: + ASSERT_FALSE("Invalid RWType"); + } +} + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/stack_layout.h b/externals/dynarmic/src/dynarmic/backend/arm64/stack_layout.h new file mode 100644 index 0000000000..cf7f3259a9 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/stack_layout.h @@ -0,0 +1,49 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <array> + +#include <mcl/stdint.hpp> + +namespace Dynarmic::Backend::Arm64 { + +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable : 4324) // Structure was padded due to alignment specifier +#endif + +constexpr size_t SpillCount = 64; + +struct alignas(16) RSBEntry { + u64 target; + u64 code_ptr; +}; + +constexpr size_t RSBCount = 8; +constexpr u64 RSBIndexMask = (RSBCount - 1) * sizeof(RSBEntry); + +struct alignas(16) StackLayout { + std::array<RSBEntry, RSBCount> rsb; + + std::array<std::array<u64, 2>, SpillCount> spill; + + u32 rsb_ptr; + + s64 cycles_to_run; + + u32 save_host_fpcr; + + bool check_bit; +}; + +#ifdef _MSC_VER +# pragma warning(pop) +#endif + +static_assert(sizeof(StackLayout) % 16 == 0); + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/verbose_debugging_output.cpp b/externals/dynarmic/src/dynarmic/backend/arm64/verbose_debugging_output.cpp new file mode 100644 index 0000000000..aec0472f68 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/verbose_debugging_output.cpp @@ -0,0 +1,108 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2023 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/arm64/verbose_debugging_output.h" + +#include <fmt/format.h> +#include <oaknut/oaknut.hpp> + +#include "dynarmic/backend/arm64/emit_context.h" +#include "dynarmic/ir/type.h" + +namespace Dynarmic::Backend::Arm64 { + +using namespace oaknut::util; + +void EmitVerboseDebuggingOutput(oaknut::CodeGenerator& code, EmitContext& ctx) { + code.SUB(SP, SP, sizeof(RegisterData)); + for (int i = 0; i < 30; i++) { + if (i == 18) { + continue; // Platform register + } + code.STR(oaknut::XReg{i}, SP, offsetof(RegisterData, x) + i * sizeof(u64)); + } + for (int i = 0; i < 32; i++) { + code.STR(oaknut::QReg{i}, SP, offsetof(RegisterData, q) + i * sizeof(Vector)); + } + code.MRS(X0, oaknut::SystemReg::NZCV); + code.STR(X0, SP, offsetof(RegisterData, nzcv)); + code.ADD(X0, SP, sizeof(RegisterData) + offsetof(StackLayout, spill)); + code.STR(X0, SP, offsetof(RegisterData, spill)); + code.MRS(X0, oaknut::SystemReg::FPSR); + code.STR(X0, SP, offsetof(RegisterData, fpsr)); + + ctx.reg_alloc.EmitVerboseDebuggingOutput(); + + code.LDR(X0, SP, offsetof(RegisterData, fpsr)); + code.MSR(oaknut::SystemReg::FPSR, X0); + code.LDR(X0, SP, offsetof(RegisterData, nzcv)); + code.MSR(oaknut::SystemReg::NZCV, X0); + for (int i = 0; i < 32; i++) { + code.LDR(oaknut::QReg{i}, SP, offsetof(RegisterData, q) + i * sizeof(Vector)); + } + for (int i = 0; i < 30; i++) { + if (i == 18) { + continue; // Platform register + } + code.LDR(oaknut::XReg{i}, SP, offsetof(RegisterData, x) + i * sizeof(u64)); + } + code.ADD(SP, SP, sizeof(RegisterData)); +} + +void PrintVerboseDebuggingOutputLine(RegisterData& reg_data, HostLocType reg_type, size_t reg_index, size_t inst_index, IR::Type inst_type) { + fmt::print("dynarmic debug: %{:05} = ", inst_index); + + Vector value = [&]() -> Vector { + switch (reg_type) { + case HostLocType::X: + return {reg_data.x[reg_index], 0}; + case HostLocType::Q: + return reg_data.q[reg_index]; + case HostLocType::Nzcv: + return {reg_data.nzcv, 0}; + case HostLocType::Spill: + return (*reg_data.spill)[reg_index]; + } + fmt::print("invalid reg_type! 
"); + return {0, 0}; + }(); + + switch (inst_type) { + case IR::Type::U1: + case IR::Type::U8: + fmt::print("{:02x}", value[0] & 0xff); + break; + case IR::Type::U16: + fmt::print("{:04x}", value[0] & 0xffff); + break; + case IR::Type::U32: + case IR::Type::NZCVFlags: + fmt::print("{:08x}", value[0] & 0xffffffff); + break; + case IR::Type::U64: + fmt::print("{:016x}", value[0]); + break; + case IR::Type::U128: + fmt::print("{:016x}{:016x}", value[1], value[0]); + break; + case IR::Type::A32Reg: + case IR::Type::A32ExtReg: + case IR::Type::A64Reg: + case IR::Type::A64Vec: + case IR::Type::CoprocInfo: + case IR::Type::Cond: + case IR::Type::Void: + case IR::Type::Table: + case IR::Type::AccType: + case IR::Type::Opaque: + default: + fmt::print("invalid inst_type!"); + break; + } + + fmt::print("\n"); +} + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/arm64/verbose_debugging_output.h b/externals/dynarmic/src/dynarmic/backend/arm64/verbose_debugging_output.h new file mode 100644 index 0000000000..73c6646139 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/arm64/verbose_debugging_output.h @@ -0,0 +1,56 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2023 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <array> + +#include <mcl/stdint.hpp> + +#include "dynarmic/backend/arm64/stack_layout.h" + +namespace oaknut { +struct CodeGenerator; +struct Label; +} // namespace oaknut + +namespace Dynarmic::IR { +enum class Type; +} // namespace Dynarmic::IR + +namespace Dynarmic::Backend::Arm64 { + +struct EmitContext; + +using Vector = std::array<u64, 2>; + +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable : 4324) // Structure was padded due to alignment specifier +#endif + +enum class HostLocType { + X, + Q, + Nzcv, + Spill, +}; + +struct alignas(16) RegisterData { + std::array<u64, 30> x; + std::array<Vector, 32> q; + u32 nzcv; + decltype(StackLayout::spill)* spill; + u32 fpsr; +}; + +#ifdef _MSC_VER +# pragma warning(pop) +#endif + +void EmitVerboseDebuggingOutput(oaknut::CodeGenerator& code, EmitContext& ctx); +void PrintVerboseDebuggingOutputLine(RegisterData& reg_data, HostLocType reg_type, size_t reg_index, size_t inst_index, IR::Type inst_type); + +} // namespace Dynarmic::Backend::Arm64 diff --git a/externals/dynarmic/src/dynarmic/backend/block_range_information.cpp b/externals/dynarmic/src/dynarmic/backend/block_range_information.cpp new file mode 100644 index 0000000000..47e512f5ce --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/block_range_information.cpp @@ -0,0 +1,43 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/block_range_information.h" + +#include <boost/icl/interval_map.hpp> +#include <boost/icl/interval_set.hpp> +#include <mcl/stdint.hpp> +#include <tsl/robin_set.h> + +namespace Dynarmic::Backend { + +template<typename ProgramCounterType> +void BlockRangeInformation<ProgramCounterType>::AddRange(boost::icl::discrete_interval<ProgramCounterType> range, IR::LocationDescriptor location) { + block_ranges.add(std::make_pair(range, std::set<IR::LocationDescriptor>{location})); +} + +template<typename ProgramCounterType> +void BlockRangeInformation<ProgramCounterType>::ClearCache() { + block_ranges.clear(); +} + +template<typename ProgramCounterType> +tsl::robin_set<IR::LocationDescriptor> BlockRangeInformation<ProgramCounterType>::InvalidateRanges(const boost::icl::interval_set<ProgramCounterType>& ranges) { + tsl::robin_set<IR::LocationDescriptor> erase_locations; + for (auto invalidate_interval : ranges) { + auto pair = block_ranges.equal_range(invalidate_interval); + for (auto it = pair.first; it != pair.second; ++it) { + for (const auto& descriptor : it->second) { + erase_locations.insert(descriptor); + } + } + } + // TODO: EFFICIENCY: Remove ranges that are to be erased. + return erase_locations; +} + +template class BlockRangeInformation<u32>; +template class BlockRangeInformation<u64>; + +} // namespace Dynarmic::Backend diff --git a/externals/dynarmic/src/dynarmic/backend/block_range_information.h b/externals/dynarmic/src/dynarmic/backend/block_range_information.h new file mode 100644 index 0000000000..3402b155bb --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/block_range_information.h @@ -0,0 +1,29 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <set> + +#include <boost/icl/interval_map.hpp> +#include <boost/icl/interval_set.hpp> +#include <tsl/robin_set.h> + +#include "dynarmic/ir/location_descriptor.h" + +namespace Dynarmic::Backend { + +template<typename ProgramCounterType> +class BlockRangeInformation { +public: + void AddRange(boost::icl::discrete_interval<ProgramCounterType> range, IR::LocationDescriptor location); + void ClearCache(); + tsl::robin_set<IR::LocationDescriptor> InvalidateRanges(const boost::icl::interval_set<ProgramCounterType>& ranges); + +private: + boost::icl::interval_map<ProgramCounterType, std::set<IR::LocationDescriptor>> block_ranges; +}; + +} // namespace Dynarmic::Backend diff --git a/externals/dynarmic/src/dynarmic/backend/exception_handler.h b/externals/dynarmic/src/dynarmic/backend/exception_handler.h new file mode 100644 index 0000000000..179433ba5d --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/exception_handler.h @@ -0,0 +1,63 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2020 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <functional> +#include <memory> +#include <optional> + +#include <mcl/macro/architecture.hpp> +#include <mcl/stdint.hpp> + +#if defined(MCL_ARCHITECTURE_X86_64) +namespace Dynarmic::Backend::X64 { +class BlockOfCode; +} // namespace Dynarmic::Backend::X64 +#elif defined(MCL_ARCHITECTURE_ARM64) +namespace oaknut { +class CodeBlock; +} // namespace oaknut +#else +# error "Invalid architecture" +#endif + +namespace Dynarmic::Backend { + +#if defined(MCL_ARCHITECTURE_X86_64) +struct FakeCall { + u64 call_rip; + u64 ret_rip; +}; +#elif defined(MCL_ARCHITECTURE_ARM64) +struct FakeCall { + u64 call_pc; +}; +#else +# error "Invalid architecture" +#endif + +class ExceptionHandler final { +public: + ExceptionHandler(); + ~ExceptionHandler(); + +#if defined(MCL_ARCHITECTURE_X86_64) + void Register(X64::BlockOfCode& code); +#elif defined(MCL_ARCHITECTURE_ARM64) + void Register(oaknut::CodeBlock& mem, std::size_t mem_size); +#else +# error "Invalid architecture" +#endif + + bool SupportsFastmem() const noexcept; + void SetFastmemCallback(std::function<FakeCall(u64)> cb); + +private: + struct Impl; + std::unique_ptr<Impl> impl; +}; + +} // namespace Dynarmic::Backend diff --git a/externals/dynarmic/src/dynarmic/backend/exception_handler_generic.cpp b/externals/dynarmic/src/dynarmic/backend/exception_handler_generic.cpp new file mode 100644 index 0000000000..bbaa071a5a --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/exception_handler_generic.cpp @@ -0,0 +1,36 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/exception_handler.h" + +namespace Dynarmic::Backend { + +struct ExceptionHandler::Impl final { +}; + +ExceptionHandler::ExceptionHandler() = default; +ExceptionHandler::~ExceptionHandler() = default; + +#if defined(MCL_ARCHITECTURE_X86_64) +void ExceptionHandler::Register(X64::BlockOfCode&) { + // Do nothing +} +#elif defined(MCL_ARCHITECTURE_ARM64) +void ExceptionHandler::Register(oaknut::CodeBlock&, std::size_t) { + // Do nothing +} +#else +# error "Invalid architecture" +#endif + +bool ExceptionHandler::SupportsFastmem() const noexcept { + return false; +} + +void ExceptionHandler::SetFastmemCallback(std::function<FakeCall(u64)>) { + // Do nothing +} + +} // namespace Dynarmic::Backend diff --git a/externals/dynarmic/src/dynarmic/backend/exception_handler_macos.cpp b/externals/dynarmic/src/dynarmic/backend/exception_handler_macos.cpp new file mode 100644 index 0000000000..bac8d9c951 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/exception_handler_macos.cpp @@ -0,0 +1,293 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2019 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mach/mach.h> +#include <mach/message.h> + +#include <cstring> +#include <functional> +#include <memory> +#include <mutex> +#include <optional> +#include <thread> +#include <vector> + +#include <fmt/format.h> +#include <mcl/assert.hpp> +#include <mcl/bit_cast.hpp> +#include <mcl/macro/architecture.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/backend/exception_handler.h" + +#if defined(MCL_ARCHITECTURE_X86_64) + +# include "dynarmic/backend/x64/block_of_code.h" +# define mig_external extern "C" +# include "dynarmic/backend/x64/mig/mach_exc_server.h" + +# define THREAD_STATE x86_THREAD_STATE64 +# define THREAD_STATE_COUNT x86_THREAD_STATE64_COUNT + +using dynarmic_thread_state_t = x86_thread_state64_t; + +#elif defined(MCL_ARCHITECTURE_ARM64) + +# include <oaknut/code_block.hpp> +# define mig_external extern "C" +# include "dynarmic/backend/arm64/mig/mach_exc_server.h" + +# define THREAD_STATE ARM_THREAD_STATE64 +# define THREAD_STATE_COUNT ARM_THREAD_STATE64_COUNT + +using dynarmic_thread_state_t = arm_thread_state64_t; + +#endif + +namespace Dynarmic::Backend { + +namespace { + +struct CodeBlockInfo { + u64 code_begin, code_end; + std::function<FakeCall(u64)> cb; +}; + +struct MachMessage { + mach_msg_header_t head; + char data[2048]; ///< Arbitrary size +}; + +class MachHandler final { +public: + MachHandler(); + ~MachHandler(); + + kern_return_t HandleRequest(dynarmic_thread_state_t* thread_state); + + void AddCodeBlock(CodeBlockInfo info); + void RemoveCodeBlock(u64 rip); + +private: + auto FindCodeBlockInfo(u64 rip) { + return std::find_if(code_block_infos.begin(), code_block_infos.end(), [&](const auto& x) { return x.code_begin <= rip && x.code_end > rip; }); + } + + std::vector<CodeBlockInfo> code_block_infos; + std::mutex code_block_infos_mutex; + + std::thread thread; + mach_port_t server_port; + + void MessagePump(); +}; + +MachHandler::MachHandler() { +#define KCHECK(x) ASSERT_MSG((x) == KERN_SUCCESS, "dynarmic: macOS MachHandler: init failure at {}", #x) + + KCHECK(mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &server_port)); + KCHECK(mach_port_insert_right(mach_task_self(), server_port, server_port, MACH_MSG_TYPE_MAKE_SEND)); + KCHECK(task_set_exception_ports(mach_task_self(), EXC_MASK_BAD_ACCESS, server_port, EXCEPTION_STATE | MACH_EXCEPTION_CODES, THREAD_STATE)); + + // The below doesn't actually work, and I'm not sure why; since this doesn't work we'll have a spurious error message upon shutdown. + mach_port_t prev; + KCHECK(mach_port_request_notification(mach_task_self(), server_port, MACH_NOTIFY_PORT_DESTROYED, 0, server_port, MACH_MSG_TYPE_MAKE_SEND_ONCE, &prev)); + +#undef KCHECK + + thread = std::thread(&MachHandler::MessagePump, this); + thread.detach(); +} + +MachHandler::~MachHandler() { + mach_port_deallocate(mach_task_self(), server_port); +} + +void MachHandler::MessagePump() { + mach_msg_return_t mr; + MachMessage request; + MachMessage reply; + + while (true) { + mr = mach_msg(&request.head, MACH_RCV_MSG | MACH_RCV_LARGE, 0, sizeof(request), server_port, MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL); + if (mr != MACH_MSG_SUCCESS) { + fmt::print(stderr, "dynarmic: macOS MachHandler: Failed to receive mach message. 
error: {:#08x} ({})\n", mr, mach_error_string(mr)); + return; + } + + if (!mach_exc_server(&request.head, &reply.head)) { + fmt::print(stderr, "dynarmic: macOS MachHandler: Unexpected mach message\n"); + return; + } + + mr = mach_msg(&reply.head, MACH_SEND_MSG, reply.head.msgh_size, 0, MACH_PORT_NULL, MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL); + if (mr != MACH_MSG_SUCCESS) { + fmt::print(stderr, "dynarmic: macOS MachHandler: Failed to send mach message. error: {:#08x} ({})\n", mr, mach_error_string(mr)); + return; + } + } +} + +#if defined(MCL_ARCHITECTURE_X86_64) +kern_return_t MachHandler::HandleRequest(x86_thread_state64_t* ts) { + std::lock_guard<std::mutex> guard(code_block_infos_mutex); + + const auto iter = FindCodeBlockInfo(ts->__rip); + if (iter == code_block_infos.end()) { + fmt::print(stderr, "Unhandled EXC_BAD_ACCESS at rip {:#016x}\n", ts->__rip); + return KERN_FAILURE; + } + + FakeCall fc = iter->cb(ts->__rip); + + ts->__rsp -= sizeof(u64); + *mcl::bit_cast<u64*>(ts->__rsp) = fc.ret_rip; + ts->__rip = fc.call_rip; + + return KERN_SUCCESS; +} +#elif defined(MCL_ARCHITECTURE_ARM64) +kern_return_t MachHandler::HandleRequest(arm_thread_state64_t* ts) { + std::lock_guard<std::mutex> guard(code_block_infos_mutex); + + const auto iter = FindCodeBlockInfo(ts->__pc); + if (iter == code_block_infos.end()) { + fmt::print(stderr, "Unhandled EXC_BAD_ACCESS at pc {:#016x}\n", ts->__pc); + return KERN_FAILURE; + } + + FakeCall fc = iter->cb(ts->__pc); + + // TODO: Sign with ptrauth_sign_unauthenticated if pointer authentication is enabled. + ts->__pc = fc.call_pc; + + return KERN_SUCCESS; +} +#endif + +void MachHandler::AddCodeBlock(CodeBlockInfo cbi) { + std::lock_guard<std::mutex> guard(code_block_infos_mutex); + if (auto iter = FindCodeBlockInfo(cbi.code_begin); iter != code_block_infos.end()) { + code_block_infos.erase(iter); + } + code_block_infos.push_back(cbi); +} + +void MachHandler::RemoveCodeBlock(u64 rip) { + std::lock_guard<std::mutex> guard(code_block_infos_mutex); + const auto iter = FindCodeBlockInfo(rip); + if (iter == code_block_infos.end()) { + return; + } + code_block_infos.erase(iter); +} + +std::mutex handler_lock; +std::optional<MachHandler> mach_handler; + +void RegisterHandler() { + std::lock_guard<std::mutex> guard(handler_lock); + if (!mach_handler) { + mach_handler.emplace(); + } +} + +} // anonymous namespace + +mig_external kern_return_t catch_mach_exception_raise(mach_port_t, mach_port_t, mach_port_t, exception_type_t, mach_exception_data_t, mach_msg_type_number_t) { + fmt::print(stderr, "dynarmic: Unexpected mach message: mach_exception_raise\n"); + return KERN_FAILURE; +} + +mig_external kern_return_t catch_mach_exception_raise_state_identity(mach_port_t, mach_port_t, mach_port_t, exception_type_t, mach_exception_data_t, mach_msg_type_number_t, int*, thread_state_t, mach_msg_type_number_t, thread_state_t, mach_msg_type_number_t*) { + fmt::print(stderr, "dynarmic: Unexpected mach message: mach_exception_raise_state_identity\n"); + return KERN_FAILURE; +} + +mig_external kern_return_t catch_mach_exception_raise_state( + mach_port_t /*exception_port*/, + exception_type_t exception, + const mach_exception_data_t /*code*/, // code[0] is as per kern_return.h, code[1] is rip. 
+ mach_msg_type_number_t /*codeCnt*/, + int* flavor, + const thread_state_t old_state, + mach_msg_type_number_t old_stateCnt, + thread_state_t new_state, + mach_msg_type_number_t* new_stateCnt) { + if (!flavor || !new_stateCnt) { + fmt::print(stderr, "dynarmic: catch_mach_exception_raise_state: Invalid arguments.\n"); + return KERN_INVALID_ARGUMENT; + } + if (*flavor != THREAD_STATE || old_stateCnt != THREAD_STATE_COUNT || *new_stateCnt < THREAD_STATE_COUNT) { + fmt::print(stderr, "dynarmic: catch_mach_exception_raise_state: Unexpected flavor.\n"); + return KERN_INVALID_ARGUMENT; + } + if (exception != EXC_BAD_ACCESS) { + fmt::print(stderr, "dynarmic: catch_mach_exception_raise_state: Unexpected exception type.\n"); + return KERN_FAILURE; + } + + // The input/output pointers are not necessarily 8-byte aligned. + dynarmic_thread_state_t ts; + std::memcpy(&ts, old_state, sizeof(ts)); + + kern_return_t ret = mach_handler->HandleRequest(&ts); + + std::memcpy(new_state, &ts, sizeof(ts)); + *new_stateCnt = THREAD_STATE_COUNT; + return ret; +} + +struct ExceptionHandler::Impl final { + Impl(u64 code_begin_, u64 code_end_) + : code_begin(code_begin_) + , code_end(code_end_) { + RegisterHandler(); + } + + void SetCallback(std::function<FakeCall(u64)> cb) { + CodeBlockInfo cbi; + cbi.code_begin = code_begin; + cbi.code_end = code_end; + cbi.cb = cb; + mach_handler->AddCodeBlock(cbi); + } + + ~Impl() { + mach_handler->RemoveCodeBlock(code_begin); + } + +private: + u64 code_begin, code_end; +}; + +ExceptionHandler::ExceptionHandler() = default; +ExceptionHandler::~ExceptionHandler() = default; + +#if defined(MCL_ARCHITECTURE_X86_64) +void ExceptionHandler::Register(X64::BlockOfCode& code) { + const u64 code_begin = mcl::bit_cast<u64>(code.getCode()); + const u64 code_end = code_begin + code.GetTotalCodeSize(); + impl = std::make_unique<Impl>(code_begin, code_end); +} +#elif defined(MCL_ARCHITECTURE_ARM64) +void ExceptionHandler::Register(oaknut::CodeBlock& mem, std::size_t size) { + const u64 code_begin = mcl::bit_cast<u64>(mem.ptr()); + const u64 code_end = code_begin + size; + impl = std::make_unique<Impl>(code_begin, code_end); +} +#else +# error "Invalid architecture" +#endif + +bool ExceptionHandler::SupportsFastmem() const noexcept { + return static_cast<bool>(impl); +} + +void ExceptionHandler::SetFastmemCallback(std::function<FakeCall(u64)> cb) { + impl->SetCallback(cb); +} + +} // namespace Dynarmic::Backend diff --git a/externals/dynarmic/src/dynarmic/backend/exception_handler_macos_mig.c b/externals/dynarmic/src/dynarmic/backend/exception_handler_macos_mig.c new file mode 100644 index 0000000000..762a80ca42 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/exception_handler_macos_mig.c @@ -0,0 +1,14 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2023 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/macro/architecture.hpp> + +#if defined(MCL_ARCHITECTURE_X86_64) +# include "dynarmic/backend/x64/mig/mach_exc_server.c" +#elif defined(MCL_ARCHITECTURE_ARM64) +# include "dynarmic/backend/arm64/mig/mach_exc_server.c" +#else +# error "Invalid architecture" +#endif diff --git a/externals/dynarmic/src/dynarmic/backend/exception_handler_posix.cpp b/externals/dynarmic/src/dynarmic/backend/exception_handler_posix.cpp new file mode 100644 index 0000000000..0a1c270b5e --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/exception_handler_posix.cpp @@ -0,0 +1,319 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2019 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/exception_handler.h" + +#ifdef __APPLE__ +# include <signal.h> +# include <sys/ucontext.h> +#else +# include <signal.h> +# ifndef __OpenBSD__ +# include <ucontext.h> +# endif +#endif + +#include <cstring> +#include <functional> +#include <memory> +#include <mutex> +#include <optional> +#include <vector> + +#include <mcl/assert.hpp> +#include <mcl/bit_cast.hpp> +#include <mcl/stdint.hpp> + +#if defined(MCL_ARCHITECTURE_X86_64) +# include "dynarmic/backend/x64/block_of_code.h" +#elif defined(MCL_ARCHITECTURE_ARM64) +# include <oaknut/code_block.hpp> + +# include "dynarmic/backend/arm64/abi.h" +#else +# error "Invalid architecture" +#endif + +namespace Dynarmic::Backend { + +namespace { + +struct CodeBlockInfo { + u64 code_begin, code_end; + std::function<FakeCall(u64)> cb; +}; + +class SigHandler { +public: + SigHandler(); + ~SigHandler(); + + void AddCodeBlock(CodeBlockInfo info); + void RemoveCodeBlock(u64 host_pc); + + bool SupportsFastmem() const { return supports_fast_mem; } + +private: + auto FindCodeBlockInfo(u64 host_pc) { + return std::find_if(code_block_infos.begin(), code_block_infos.end(), [&](const auto& x) { return x.code_begin <= host_pc && x.code_end > host_pc; }); + } + + bool supports_fast_mem = true; + + void* signal_stack_memory = nullptr; + + std::vector<CodeBlockInfo> code_block_infos; + std::mutex code_block_infos_mutex; + + struct sigaction old_sa_segv; + struct sigaction old_sa_bus; + + static void SigAction(int sig, siginfo_t* info, void* raw_context); +}; + +std::mutex handler_lock; +std::optional<SigHandler> sig_handler; + +void RegisterHandler() { + std::lock_guard<std::mutex> guard(handler_lock); + if (!sig_handler) { + sig_handler.emplace(); + } +} + +SigHandler::SigHandler() { + const size_t signal_stack_size = std::max<size_t>(SIGSTKSZ, 2 * 1024 * 1024); + + signal_stack_memory = std::malloc(signal_stack_size); + + stack_t signal_stack; + signal_stack.ss_sp = signal_stack_memory; + signal_stack.ss_size = signal_stack_size; + signal_stack.ss_flags = 0; + if (sigaltstack(&signal_stack, nullptr) != 0) { + fmt::print(stderr, "dynarmic: POSIX SigHandler: init failure at sigaltstack\n"); + supports_fast_mem = false; + return; + } + + struct sigaction sa; + sa.sa_handler = nullptr; + sa.sa_sigaction = &SigHandler::SigAction; + sa.sa_flags = SA_SIGINFO | SA_ONSTACK | SA_RESTART; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGSEGV, &sa, &old_sa_segv) != 0) { + fmt::print(stderr, "dynarmic: POSIX SigHandler: could not set SIGSEGV handler\n"); + supports_fast_mem = false; + return; + } +#ifdef __APPLE__ + if (sigaction(SIGBUS, &sa, &old_sa_bus) != 0) { + fmt::print(stderr, "dynarmic: POSIX SigHandler: could not set SIGBUS handler\n"); + supports_fast_mem = false; + return; + } +#endif +} + +SigHandler::~SigHandler() { + std::free(signal_stack_memory); +} + +void SigHandler::AddCodeBlock(CodeBlockInfo cbi) { + std::lock_guard<std::mutex> guard(code_block_infos_mutex); + if (auto iter = FindCodeBlockInfo(cbi.code_begin); iter != code_block_infos.end()) { + code_block_infos.erase(iter); + } + code_block_infos.push_back(cbi); +} + +void SigHandler::RemoveCodeBlock(u64 host_pc) { + std::lock_guard<std::mutex> guard(code_block_infos_mutex); + const auto iter = FindCodeBlockInfo(host_pc); + if (iter == code_block_infos.end()) { + return; + } + code_block_infos.erase(iter); +} + +void SigHandler::SigAction(int sig, siginfo_t* info, void* raw_context) { + ASSERT(sig == SIGSEGV 
|| sig == SIGBUS); + + ucontext_t* ucontext = reinterpret_cast<ucontext_t*>(raw_context); +#ifndef __OpenBSD__ + auto& mctx = ucontext->uc_mcontext; +#endif + +#if defined(MCL_ARCHITECTURE_X86_64) + +# if defined(__APPLE__) +# define CTX_RIP (mctx->__ss.__rip) +# define CTX_RSP (mctx->__ss.__rsp) +# elif defined(__linux__) +# define CTX_RIP (mctx.gregs[REG_RIP]) +# define CTX_RSP (mctx.gregs[REG_RSP]) +# elif defined(__FreeBSD__) +# define CTX_RIP (mctx.mc_rip) +# define CTX_RSP (mctx.mc_rsp) +# elif defined(__NetBSD__) +# define CTX_RIP (mctx.__gregs[_REG_RIP]) +# define CTX_RSP (mctx.__gregs[_REG_RSP]) +# elif defined(__OpenBSD__) +# define CTX_RIP (ucontext->sc_rip) +# define CTX_RSP (ucontext->sc_rsp) +# else +# error "Unknown platform" +# endif + + { + std::lock_guard<std::mutex> guard(sig_handler->code_block_infos_mutex); + + const auto iter = sig_handler->FindCodeBlockInfo(CTX_RIP); + if (iter != sig_handler->code_block_infos.end()) { + FakeCall fc = iter->cb(CTX_RIP); + + CTX_RSP -= sizeof(u64); + *mcl::bit_cast<u64*>(CTX_RSP) = fc.ret_rip; + CTX_RIP = fc.call_rip; + + return; + } + } + + fmt::print(stderr, "Unhandled {} at rip {:#018x}\n", sig == SIGSEGV ? "SIGSEGV" : "SIGBUS", CTX_RIP); + +#elif defined(MCL_ARCHITECTURE_ARM64) + +# if defined(__APPLE__) +# define CTX_PC (mctx->__ss.__pc) +# define CTX_SP (mctx->__ss.__sp) +# define CTX_LR (mctx->__ss.__lr) +# define CTX_X(i) (mctx->__ss.__x[i]) +# define CTX_Q(i) (mctx->__ns.__v[i]) +# elif defined(__linux__) +# define CTX_PC (mctx.pc) +# define CTX_SP (mctx.sp) +# define CTX_LR (mctx.regs[30]) +# define CTX_X(i) (mctx.regs[i]) +# define CTX_Q(i) (fpctx->vregs[i]) + [[maybe_unused]] const auto fpctx = [&mctx] { + _aarch64_ctx* header = (_aarch64_ctx*)&mctx.__reserved; + while (header->magic != FPSIMD_MAGIC) { + ASSERT(header->magic && header->size); + header = (_aarch64_ctx*)((char*)header + header->size); + } + return (fpsimd_context*)header; + }(); +# elif defined(__FreeBSD__) +# define CTX_PC (mctx.mc_gpregs.gp_elr) +# define CTX_SP (mctx.mc_gpregs.gp_sp) +# define CTX_LR (mctx.mc_gpregs.gp_lr) +# define CTX_X(i) (mctx.mc_gpregs.gp_x[i]) +# define CTX_Q(i) (mctx.mc_fpregs.fp_q[i]) +# elif defined(__NetBSD__) +# define CTX_PC (mctx.mc_gpregs.gp_elr) +# define CTX_SP (mctx.mc_gpregs.gp_sp) +# define CTX_LR (mctx.mc_gpregs.gp_lr) +# define CTX_X(i) (mctx.mc_gpregs.gp_x[i]) +# define CTX_Q(i) (mctx.mc_fpregs.fp_q[i]) +# elif defined(__OpenBSD__) +# define CTX_PC (ucontext->sc_elr) +# define CTX_SP (ucontext->sc_sp) +# define CTX_LR (ucontext->sc_lr) +# define CTX_X(i) (ucontext->sc_x[i]) +# define CTX_Q(i) (ucontext->sc_q[i]) +# else +# error "Unknown platform" +# endif + + { + std::lock_guard<std::mutex> guard(sig_handler->code_block_infos_mutex); + + const auto iter = sig_handler->FindCodeBlockInfo(CTX_PC); + if (iter != sig_handler->code_block_infos.end()) { + FakeCall fc = iter->cb(CTX_PC); + + CTX_PC = fc.call_pc; + + return; + } + } + + fmt::print(stderr, "Unhandled {} at pc {:#018x}\n", sig == SIGSEGV ? "SIGSEGV" : "SIGBUS", CTX_PC); + +#else + +# error "Invalid architecture" + +#endif + + struct sigaction* retry_sa = sig == SIGSEGV ? 
&sig_handler->old_sa_segv : &sig_handler->old_sa_bus; + if (retry_sa->sa_flags & SA_SIGINFO) { + retry_sa->sa_sigaction(sig, info, raw_context); + return; + } + if (retry_sa->sa_handler == SIG_DFL) { + signal(sig, SIG_DFL); + return; + } + if (retry_sa->sa_handler == SIG_IGN) { + return; + } + retry_sa->sa_handler(sig); +} + +} // anonymous namespace + +struct ExceptionHandler::Impl final { + Impl(u64 code_begin_, u64 code_end_) + : code_begin(code_begin_) + , code_end(code_end_) { + RegisterHandler(); + } + + void SetCallback(std::function<FakeCall(u64)> cb) { + CodeBlockInfo cbi; + cbi.code_begin = code_begin; + cbi.code_end = code_end; + cbi.cb = cb; + sig_handler->AddCodeBlock(cbi); + } + + ~Impl() { + sig_handler->RemoveCodeBlock(code_begin); + } + +private: + u64 code_begin, code_end; +}; + +ExceptionHandler::ExceptionHandler() = default; +ExceptionHandler::~ExceptionHandler() = default; + +#if defined(MCL_ARCHITECTURE_X86_64) +void ExceptionHandler::Register(X64::BlockOfCode& code) { + const u64 code_begin = mcl::bit_cast<u64>(code.getCode()); + const u64 code_end = code_begin + code.GetTotalCodeSize(); + impl = std::make_unique<Impl>(code_begin, code_end); +} +#elif defined(MCL_ARCHITECTURE_ARM64) +void ExceptionHandler::Register(oaknut::CodeBlock& mem, std::size_t size) { + const u64 code_begin = mcl::bit_cast<u64>(mem.ptr()); + const u64 code_end = code_begin + size; + impl = std::make_unique<Impl>(code_begin, code_end); +} +#else +# error "Invalid architecture" +#endif + +bool ExceptionHandler::SupportsFastmem() const noexcept { + return static_cast<bool>(impl) && sig_handler->SupportsFastmem(); +} + +void ExceptionHandler::SetFastmemCallback(std::function<FakeCall(u64)> cb) { + impl->SetCallback(cb); +} + +} // namespace Dynarmic::Backend diff --git a/externals/dynarmic/src/dynarmic/backend/exception_handler_windows.cpp b/externals/dynarmic/src/dynarmic/backend/exception_handler_windows.cpp new file mode 100644 index 0000000000..719c007594 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/exception_handler_windows.cpp @@ -0,0 +1,14 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2023 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/macro/architecture.hpp> + +#if defined(MCL_ARCHITECTURE_X86_64) +# include "dynarmic/backend/x64/exception_handler_windows.cpp" +#elif defined(MCL_ARCHITECTURE_ARM64) +# include "dynarmic/backend/exception_handler_generic.cpp" +#else +# error "Invalid architecture" +#endif diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp b/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp new file mode 100644 index 0000000000..fc8e8bae21 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp @@ -0,0 +1,1299 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/x64/a32_emit_x64.h" + +#include <algorithm> +#include <optional> +#include <utility> + +#include <fmt/format.h> +#include <fmt/ostream.h> +#include <mcl/assert.hpp> +#include <mcl/bit/bit_field.hpp> +#include <mcl/scope_exit.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/backend/x64/a32_jitstate.h" +#include "dynarmic/backend/x64/abi.h" +#include "dynarmic/backend/x64/block_of_code.h" +#include "dynarmic/backend/x64/devirtualize.h" +#include "dynarmic/backend/x64/emit_x64.h" +#include "dynarmic/backend/x64/nzcv_util.h" +#include "dynarmic/backend/x64/perf_map.h" +#include "dynarmic/backend/x64/stack_layout.h" +#include "dynarmic/common/variant_util.h" +#include "dynarmic/frontend/A32/a32_location_descriptor.h" +#include "dynarmic/frontend/A32/a32_types.h" +#include "dynarmic/interface/A32/coprocessor.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" + +// TODO: Have ARM flags in host flags and not have them use up GPR registers unless necessary. +// TODO: Actually implement that proper instruction selector you've always wanted to sweetheart. + +namespace Dynarmic::Backend::X64 { + +using namespace Xbyak::util; + +static Xbyak::Address MJitStateReg(A32::Reg reg) { + return dword[r15 + offsetof(A32JitState, Reg) + sizeof(u32) * static_cast<size_t>(reg)]; +} + +static Xbyak::Address MJitStateExtReg(A32::ExtReg reg) { + if (A32::IsSingleExtReg(reg)) { + const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::S0); + return dword[r15 + offsetof(A32JitState, ExtReg) + sizeof(u32) * index]; + } + if (A32::IsDoubleExtReg(reg)) { + const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::D0); + return qword[r15 + offsetof(A32JitState, ExtReg) + sizeof(u64) * index]; + } + if (A32::IsQuadExtReg(reg)) { + const size_t index = static_cast<size_t>(reg) - static_cast<size_t>(A32::ExtReg::Q0); + return xword[r15 + offsetof(A32JitState, ExtReg) + 2 * sizeof(u64) * index]; + } + ASSERT_FALSE("Should never happen."); +} + +A32EmitContext::A32EmitContext(const A32::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block) + : EmitContext(reg_alloc, block), conf(conf) {} + +A32::LocationDescriptor A32EmitContext::Location() const { + return A32::LocationDescriptor{block.Location()}; +} + +A32::LocationDescriptor A32EmitContext::EndLocation() const { + return A32::LocationDescriptor{block.EndLocation()}; +} + +bool A32EmitContext::IsSingleStep() const { + return Location().SingleStepping(); +} + +FP::FPCR A32EmitContext::FPCR(bool fpcr_controlled) const { + const FP::FPCR fpcr = FP::FPCR{Location().FPSCR().Value()}; + return fpcr_controlled ? 
fpcr : fpcr.ASIMDStandardValue(); +} + +A32EmitX64::A32EmitX64(BlockOfCode& code, A32::UserConfig conf, A32::Jit* jit_interface) + : EmitX64(code), conf(std::move(conf)), jit_interface(jit_interface) { + GenFastmemFallbacks(); + GenTerminalHandlers(); + code.PreludeComplete(); + ClearFastDispatchTable(); + + exception_handler.SetFastmemCallback([this](u64 rip_) { + return FastmemCallback(rip_); + }); +} + +A32EmitX64::~A32EmitX64() = default; + +A32EmitX64::BlockDescriptor A32EmitX64::Emit(IR::Block& block) { + if (conf.very_verbose_debugging_output) { + std::puts(IR::DumpBlock(block).c_str()); + } + + code.EnableWriting(); + SCOPE_EXIT { + code.DisableWriting(); + }; + + const std::vector<HostLoc> gpr_order = [this] { + std::vector<HostLoc> gprs{any_gpr}; + if (conf.page_table) { + gprs.erase(std::find(gprs.begin(), gprs.end(), HostLoc::R14)); + } + if (conf.fastmem_pointer) { + gprs.erase(std::find(gprs.begin(), gprs.end(), HostLoc::R13)); + } + return gprs; + }(); + + RegAlloc reg_alloc{code, gpr_order, any_xmm}; + A32EmitContext ctx{conf, reg_alloc, block}; + + // Start emitting. + code.align(); + const u8* const entrypoint = code.getCurr(); + + EmitCondPrelude(ctx); + + for (auto iter = block.begin(); iter != block.end(); ++iter) { + IR::Inst* inst = &*iter; + + // Call the relevant Emit* member function. + switch (inst->GetOpcode()) { +#define OPCODE(name, type, ...) \ + case IR::Opcode::name: \ + A32EmitX64::Emit##name(ctx, inst); \ + break; +#define A32OPC(name, type, ...) \ + case IR::Opcode::A32##name: \ + A32EmitX64::EmitA32##name(ctx, inst); \ + break; +#define A64OPC(...) +#include "dynarmic/ir/opcodes.inc" +#undef OPCODE +#undef A32OPC +#undef A64OPC + + default: + ASSERT_FALSE("Invalid opcode: {}", inst->GetOpcode()); + break; + } + + reg_alloc.EndOfAllocScope(); + + if (conf.very_verbose_debugging_output) { + EmitVerboseDebuggingOutput(reg_alloc); + } + } + + reg_alloc.AssertNoMoreUses(); + + if (conf.enable_cycle_counting) { + EmitAddCycles(block.CycleCount()); + } + EmitX64::EmitTerminal(block.GetTerminal(), ctx.Location().SetSingleStepping(false), ctx.IsSingleStep()); + code.int3(); + + for (auto& deferred_emit : ctx.deferred_emits) { + deferred_emit(); + } + code.int3(); + + const size_t size = static_cast<size_t>(code.getCurr() - entrypoint); + + const A32::LocationDescriptor descriptor{block.Location()}; + const A32::LocationDescriptor end_location{block.EndLocation()}; + + const auto range = boost::icl::discrete_interval<u32>::closed(descriptor.PC(), end_location.PC() - 1); + block_ranges.AddRange(range, descriptor); + + return RegisterBlock(descriptor, entrypoint, size); +} + +void A32EmitX64::ClearCache() { + EmitX64::ClearCache(); + block_ranges.ClearCache(); + ClearFastDispatchTable(); + fastmem_patch_info.clear(); +} + +void A32EmitX64::InvalidateCacheRanges(const boost::icl::interval_set<u32>& ranges) { + InvalidateBasicBlocks(block_ranges.InvalidateRanges(ranges)); +} + +void A32EmitX64::EmitCondPrelude(const A32EmitContext& ctx) { + if (ctx.block.GetCondition() == IR::Cond::AL) { + ASSERT(!ctx.block.HasConditionFailedLocation()); + return; + } + + ASSERT(ctx.block.HasConditionFailedLocation()); + + Xbyak::Label pass = EmitCond(ctx.block.GetCondition()); + if (conf.enable_cycle_counting) { + EmitAddCycles(ctx.block.ConditionFailedCycleCount()); + } + EmitTerminal(IR::Term::LinkBlock{ctx.block.ConditionFailedLocation()}, ctx.Location().SetSingleStepping(false), ctx.IsSingleStep()); + code.L(pass); +} + +void A32EmitX64::ClearFastDispatchTable() { + if 
(conf.HasOptimization(OptimizationFlag::FastDispatch)) { + fast_dispatch_table.fill({}); + } +} + +void A32EmitX64::GenTerminalHandlers() { + // PC ends up in ebp, location_descriptor ends up in rbx + const auto calculate_location_descriptor = [this] { + // This calculation has to match up with IREmitter::PushRSB + code.mov(ebx, dword[r15 + offsetof(A32JitState, upper_location_descriptor)]); + code.shl(rbx, 32); + code.mov(ecx, MJitStateReg(A32::Reg::PC)); + code.mov(ebp, ecx); + code.or_(rbx, rcx); + }; + + Xbyak::Label fast_dispatch_cache_miss, rsb_cache_miss; + + code.align(); + terminal_handler_pop_rsb_hint = code.getCurr<const void*>(); + calculate_location_descriptor(); + code.mov(eax, dword[r15 + offsetof(A32JitState, rsb_ptr)]); + code.sub(eax, 1); + code.and_(eax, u32(A32JitState::RSBPtrMask)); + code.mov(dword[r15 + offsetof(A32JitState, rsb_ptr)], eax); + code.cmp(rbx, qword[r15 + offsetof(A32JitState, rsb_location_descriptors) + rax * sizeof(u64)]); + if (conf.HasOptimization(OptimizationFlag::FastDispatch)) { + code.jne(rsb_cache_miss); + } else { + code.jne(code.GetReturnFromRunCodeAddress()); + } + code.mov(rax, qword[r15 + offsetof(A32JitState, rsb_codeptrs) + rax * sizeof(u64)]); + code.jmp(rax); + PerfMapRegister(terminal_handler_pop_rsb_hint, code.getCurr(), "a32_terminal_handler_pop_rsb_hint"); + + if (conf.HasOptimization(OptimizationFlag::FastDispatch)) { + code.align(); + terminal_handler_fast_dispatch_hint = code.getCurr<const void*>(); + calculate_location_descriptor(); + code.L(rsb_cache_miss); + code.mov(r12, reinterpret_cast<u64>(fast_dispatch_table.data())); + code.mov(rbp, rbx); + if (code.HasHostFeature(HostFeature::SSE42)) { + code.crc32(rbp, r12); + } + code.and_(ebp, fast_dispatch_table_mask); + code.lea(rbp, ptr[r12 + rbp]); + code.cmp(rbx, qword[rbp + offsetof(FastDispatchEntry, location_descriptor)]); + code.jne(fast_dispatch_cache_miss); + code.jmp(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)]); + code.L(fast_dispatch_cache_miss); + code.mov(qword[rbp + offsetof(FastDispatchEntry, location_descriptor)], rbx); + code.LookupBlock(); + code.mov(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)], rax); + code.jmp(rax); + PerfMapRegister(terminal_handler_fast_dispatch_hint, code.getCurr(), "a32_terminal_handler_fast_dispatch_hint"); + + code.align(); + fast_dispatch_table_lookup = code.getCurr<FastDispatchEntry& (*)(u64)>(); + code.mov(code.ABI_PARAM2, reinterpret_cast<u64>(fast_dispatch_table.data())); + if (code.HasHostFeature(HostFeature::SSE42)) { + code.crc32(code.ABI_PARAM1, code.ABI_PARAM2); + } + code.and_(code.ABI_PARAM1.cvt32(), fast_dispatch_table_mask); + code.lea(code.ABI_RETURN, code.ptr[code.ABI_PARAM1 + code.ABI_PARAM2]); + code.ret(); + PerfMapRegister(fast_dispatch_table_lookup, code.getCurr(), "a32_fast_dispatch_table_lookup"); + } +} + +void A32EmitX64::EmitA32SetCheckBit(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg8 to_store = ctx.reg_alloc.UseGpr(args[0]).cvt8(); + code.mov(code.byte[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, check_bit)], to_store); +} + +void A32EmitX64::EmitA32GetRegister(A32EmitContext& ctx, IR::Inst* inst) { + const A32::Reg reg = inst->GetArg(0).GetA32RegRef(); + const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); + + code.mov(result, MJitStateReg(reg)); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitX64::EmitA32GetExtendedRegister32(A32EmitContext& ctx, IR::Inst* inst) { + const A32::ExtReg reg = 
inst->GetArg(0).GetA32ExtRegRef(); + ASSERT(A32::IsSingleExtReg(reg)); + + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + code.movss(result, MJitStateExtReg(reg)); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitX64::EmitA32GetExtendedRegister64(A32EmitContext& ctx, IR::Inst* inst) { + const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + ASSERT(A32::IsDoubleExtReg(reg)); + + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + code.movsd(result, MJitStateExtReg(reg)); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitX64::EmitA32GetVector(A32EmitContext& ctx, IR::Inst* inst) { + const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + ASSERT(A32::IsDoubleExtReg(reg) || A32::IsQuadExtReg(reg)); + + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + if (A32::IsDoubleExtReg(reg)) { + code.movsd(result, MJitStateExtReg(reg)); + } else { + code.movaps(result, MJitStateExtReg(reg)); + } + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitX64::EmitA32SetRegister(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const A32::Reg reg = inst->GetArg(0).GetA32RegRef(); + + if (args[1].IsImmediate()) { + code.mov(MJitStateReg(reg), args[1].GetImmediateU32()); + } else if (args[1].IsInXmm()) { + const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]); + code.movd(MJitStateReg(reg), to_store); + } else { + const Xbyak::Reg32 to_store = ctx.reg_alloc.UseGpr(args[1]).cvt32(); + code.mov(MJitStateReg(reg), to_store); + } +} + +void A32EmitX64::EmitA32SetExtendedRegister32(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + ASSERT(A32::IsSingleExtReg(reg)); + + if (args[1].IsInXmm()) { + Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]); + code.movss(MJitStateExtReg(reg), to_store); + } else { + Xbyak::Reg32 to_store = ctx.reg_alloc.UseGpr(args[1]).cvt32(); + code.mov(MJitStateExtReg(reg), to_store); + } +} + +void A32EmitX64::EmitA32SetExtendedRegister64(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + ASSERT(A32::IsDoubleExtReg(reg)); + + if (args[1].IsInXmm()) { + const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]); + code.movsd(MJitStateExtReg(reg), to_store); + } else { + const Xbyak::Reg64 to_store = ctx.reg_alloc.UseGpr(args[1]); + code.mov(MJitStateExtReg(reg), to_store); + } +} + +void A32EmitX64::EmitA32SetVector(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + ASSERT(A32::IsDoubleExtReg(reg) || A32::IsQuadExtReg(reg)); + + const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]); + if (A32::IsDoubleExtReg(reg)) { + code.movsd(MJitStateExtReg(reg), to_store); + } else { + code.movaps(MJitStateExtReg(reg), to_store); + } +} + +void A32EmitX64::EmitA32GetCpsr(A32EmitContext& ctx, IR::Inst* inst) { + const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr().cvt32(); + + if (code.HasHostFeature(HostFeature::FastBMI2)) { + // Here we observe that cpsr_et and cpsr_ge are right next to each other in memory, + // so we load them both at the same time with one 64-bit read. This allows us to + // extract all of their bits together at once with one pext. 
+ static_assert(offsetof(A32JitState, upper_location_descriptor) + 4 == offsetof(A32JitState, cpsr_ge)); + code.mov(result.cvt64(), qword[r15 + offsetof(A32JitState, upper_location_descriptor)]); + code.mov(tmp.cvt64(), 0x80808080'00000003ull); + code.pext(result.cvt64(), result.cvt64(), tmp.cvt64()); + code.mov(tmp, 0x000f0220); + code.pdep(result, result, tmp); + } else { + code.mov(result, dword[r15 + offsetof(A32JitState, upper_location_descriptor)]); + code.imul(result, result, 0x120); + code.and_(result, 0x00000220); + + code.mov(tmp, dword[r15 + offsetof(A32JitState, cpsr_ge)]); + code.and_(tmp, 0x80808080); + code.imul(tmp, tmp, 0x00204081); + code.shr(tmp, 12); + code.and_(tmp, 0x000f0000); + code.or_(result, tmp); + } + + code.mov(tmp, dword[r15 + offsetof(A32JitState, cpsr_q)]); + code.shl(tmp, 27); + code.or_(result, tmp); + + code.mov(tmp2, dword[r15 + offsetof(A32JitState, cpsr_nzcv)]); + if (code.HasHostFeature(HostFeature::FastBMI2)) { + code.mov(tmp, NZCV::x64_mask); + code.pext(tmp2, tmp2, tmp); + code.shl(tmp2, 28); + } else { + code.and_(tmp2, NZCV::x64_mask); + code.imul(tmp2, tmp2, NZCV::from_x64_multiplier); + code.and_(tmp2, NZCV::arm_mask); + } + code.or_(result, tmp2); + + code.or_(result, dword[r15 + offsetof(A32JitState, cpsr_jaifm)]); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg32 cpsr = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr().cvt32(); + + if (conf.always_little_endian) { + code.and_(cpsr, 0xFFFFFDFF); + } + + // cpsr_q + code.bt(cpsr, 27); + code.setc(code.byte[r15 + offsetof(A32JitState, cpsr_q)]); + + // cpsr_nzcv + code.mov(tmp, cpsr); + code.shr(tmp, 28); + if (code.HasHostFeature(HostFeature::FastBMI2)) { + code.mov(tmp2, NZCV::x64_mask); + code.pdep(tmp, tmp, tmp2); + } else { + code.imul(tmp, tmp, NZCV::to_x64_multiplier); + code.and_(tmp, NZCV::x64_mask); + } + code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], tmp); + + // cpsr_jaifm + code.mov(tmp, cpsr); + code.and_(tmp, 0x010001DF); + code.mov(dword[r15 + offsetof(A32JitState, cpsr_jaifm)], tmp); + + if (code.HasHostFeature(HostFeature::FastBMI2)) { + // cpsr_et and cpsr_ge + static_assert(offsetof(A32JitState, upper_location_descriptor) + 4 == offsetof(A32JitState, cpsr_ge)); + // This mask is 0x7FFF0000, because we do not want the MSB to be sign extended to the upper dword. + static_assert((A32::LocationDescriptor::FPSCR_MODE_MASK & ~0x7FFF0000) == 0); + + code.and_(qword[r15 + offsetof(A32JitState, upper_location_descriptor)], u32(0x7FFF0000)); + code.mov(tmp, 0x000f0220); + code.pext(cpsr, cpsr, tmp); + code.mov(tmp.cvt64(), 0x01010101'00000003ull); + code.pdep(cpsr.cvt64(), cpsr.cvt64(), tmp.cvt64()); + // We perform SWAR partitioned subtraction here, to negate the GE bytes. 
+ code.mov(tmp.cvt64(), 0x80808080'00000003ull); + code.mov(tmp2.cvt64(), tmp.cvt64()); + code.sub(tmp.cvt64(), cpsr.cvt64()); + code.xor_(tmp.cvt64(), tmp2.cvt64()); + code.or_(qword[r15 + offsetof(A32JitState, upper_location_descriptor)], tmp.cvt64()); + } else { + code.and_(dword[r15 + offsetof(A32JitState, upper_location_descriptor)], u32(0xFFFF0000)); + code.mov(tmp, cpsr); + code.and_(tmp, 0x00000220); + code.imul(tmp, tmp, 0x00900000); + code.shr(tmp, 28); + code.or_(dword[r15 + offsetof(A32JitState, upper_location_descriptor)], tmp); + + code.and_(cpsr, 0x000f0000); + code.shr(cpsr, 16); + code.imul(cpsr, cpsr, 0x00204081); + code.and_(cpsr, 0x01010101); + code.mov(tmp, 0x80808080); + code.sub(tmp, cpsr); + code.xor_(tmp, 0x80808080); + code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], tmp); + } +} + +void A32EmitX64::EmitA32SetCpsrNZCV(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg32 to_store = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], to_store); +} + +void A32EmitX64::EmitA32SetCpsrNZCVRaw(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + if (args[0].IsImmediate()) { + const u32 imm = args[0].GetImmediateU32(); + + code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], NZCV::ToX64(imm)); + } else if (code.HasHostFeature(HostFeature::FastBMI2)) { + const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32(); + + code.shr(a, 28); + code.mov(b, NZCV::x64_mask); + code.pdep(a, a, b); + code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], a); + } else { + const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + + code.shr(a, 28); + code.imul(a, a, NZCV::to_x64_multiplier); + code.and_(a, NZCV::x64_mask); + code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], a); + } +} + +void A32EmitX64::EmitA32SetCpsrNZCVQ(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + if (args[0].IsImmediate()) { + const u32 imm = args[0].GetImmediateU32(); + + code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], NZCV::ToX64(imm)); + code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_q)], u8((imm & 0x08000000) != 0 ? 
1 : 0)); + } else if (code.HasHostFeature(HostFeature::FastBMI2)) { + const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32(); + + code.shr(a, 28); + code.setc(code.byte[r15 + offsetof(A32JitState, cpsr_q)]); + code.mov(b, NZCV::x64_mask); + code.pdep(a, a, b); + code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], a); + } else { + const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + + code.shr(a, 28); + code.setc(code.byte[r15 + offsetof(A32JitState, cpsr_q)]); + code.imul(a, a, NZCV::to_x64_multiplier); + code.and_(a, NZCV::x64_mask); + code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], a); + } +} + +void A32EmitX64::EmitA32SetCpsrNZ(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg32 nz = ctx.reg_alloc.UseGpr(args[0]).cvt32(); + const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32(); + + code.movzx(tmp, code.byte[r15 + offsetof(A32JitState, cpsr_nzcv) + 1]); + code.and_(tmp, 1); + code.or_(tmp, nz); + code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_nzcv) + 1], tmp.cvt8()); +} + +void A32EmitX64::EmitA32SetCpsrNZC(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (args[0].IsImmediate()) { + if (args[1].IsImmediate()) { + const bool c = args[1].GetImmediateU1(); + + code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_nzcv) + 1], c); + } else { + const Xbyak::Reg8 c = ctx.reg_alloc.UseGpr(args[1]).cvt8(); + + code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_nzcv) + 1], c); + } + } else { + const Xbyak::Reg32 nz = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + + if (args[1].IsImmediate()) { + const bool c = args[1].GetImmediateU1(); + + code.or_(nz, c); + code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_nzcv) + 1], nz.cvt8()); + } else { + const Xbyak::Reg32 c = ctx.reg_alloc.UseGpr(args[1]).cvt32(); + + code.or_(nz, c); + code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_nzcv) + 1], nz.cvt8()); + } + } +} + +static void EmitGetFlag(BlockOfCode& code, A32EmitContext& ctx, IR::Inst* inst, size_t flag_bit) { + const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); + code.mov(result, dword[r15 + offsetof(A32JitState, cpsr_nzcv)]); + if (flag_bit != 0) { + code.shr(result, static_cast<int>(flag_bit)); + } + code.and_(result, 1); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitX64::EmitA32GetCFlag(A32EmitContext& ctx, IR::Inst* inst) { + EmitGetFlag(code, ctx, inst, NZCV::x64_c_flag_bit); +} + +void A32EmitX64::EmitA32OrQFlag(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + if (args[0].IsImmediate()) { + if (args[0].GetImmediateU1()) { + code.mov(dword[r15 + offsetof(A32JitState, cpsr_q)], 1); + } + } else { + const Xbyak::Reg8 to_store = ctx.reg_alloc.UseGpr(args[0]).cvt8(); + + code.or_(code.byte[r15 + offsetof(A32JitState, cpsr_q)], to_store); + } +} + +void A32EmitX64::EmitA32GetGEFlags(A32EmitContext& ctx, IR::Inst* inst) { + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + code.movd(result, dword[r15 + offsetof(A32JitState, cpsr_ge)]); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitX64::EmitA32SetGEFlags(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(!args[0].IsImmediate()); + + if (args[0].IsInXmm()) { + const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[0]); + code.movd(dword[r15 + offsetof(A32JitState, cpsr_ge)], to_store); + } 
else { + const Xbyak::Reg32 to_store = ctx.reg_alloc.UseGpr(args[0]).cvt32(); + code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], to_store); + } +} + +void A32EmitX64::EmitA32SetGEFlagsCompressed(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + if (args[0].IsImmediate()) { + const u32 imm = args[0].GetImmediateU32(); + u32 ge = 0; + ge |= mcl::bit::get_bit<19>(imm) ? 0xFF000000 : 0; + ge |= mcl::bit::get_bit<18>(imm) ? 0x00FF0000 : 0; + ge |= mcl::bit::get_bit<17>(imm) ? 0x0000FF00 : 0; + ge |= mcl::bit::get_bit<16>(imm) ? 0x000000FF : 0; + + code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], ge); + } else if (code.HasHostFeature(HostFeature::FastBMI2)) { + const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32(); + + code.mov(b, 0x01010101); + code.shr(a, 16); + code.pdep(a, a, b); + code.imul(a, a, 0xFF); + code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], a); + } else { + const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + + code.shr(a, 16); + code.and_(a, 0xF); + code.imul(a, a, 0x00204081); + code.and_(a, 0x01010101); + code.imul(a, a, 0xFF); + code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], a); + } +} + +void A32EmitX64::EmitA32DataSynchronizationBarrier(A32EmitContext&, IR::Inst*) { + code.mfence(); + code.lfence(); +} + +void A32EmitX64::EmitA32DataMemoryBarrier(A32EmitContext&, IR::Inst*) { + code.mfence(); +} + +void A32EmitX64::EmitA32InstructionSynchronizationBarrier(A32EmitContext& ctx, IR::Inst*) { + if (!conf.hook_isb) { + return; + } + + ctx.reg_alloc.HostCall(nullptr); + Devirtualize<&A32::UserCallbacks::InstructionSynchronizationBarrierRaised>(conf.callbacks).EmitCall(code); +} + +void A32EmitX64::EmitA32BXWritePC(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& arg = args[0]; + + const u32 upper_without_t = (ctx.EndLocation().SetSingleStepping(false).UniqueHash() >> 32) & 0xFFFFFFFE; + + // Pseudocode: + // if (new_pc & 1) { + // new_pc &= 0xFFFFFFFE; + // cpsr.T = true; + // } else { + // new_pc &= 0xFFFFFFFC; + // cpsr.T = false; + // } + // We rely on the fact we disallow EFlag from changing within a block. + + if (arg.IsImmediate()) { + const u32 new_pc = arg.GetImmediateU32(); + const u32 mask = mcl::bit::get_bit<0>(new_pc) ? 0xFFFFFFFE : 0xFFFFFFFC; + const u32 new_upper = upper_without_t | (mcl::bit::get_bit<0>(new_pc) ? 1 : 0); + + code.mov(MJitStateReg(A32::Reg::PC), new_pc & mask); + code.mov(dword[r15 + offsetof(A32JitState, upper_location_descriptor)], new_upper); + } else { + const Xbyak::Reg32 new_pc = ctx.reg_alloc.UseScratchGpr(arg).cvt32(); + const Xbyak::Reg32 mask = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Reg32 new_upper = ctx.reg_alloc.ScratchGpr().cvt32(); + + code.mov(mask, new_pc); + code.and_(mask, 1); + code.lea(new_upper, ptr[mask.cvt64() + upper_without_t]); + code.lea(mask, ptr[mask.cvt64() + mask.cvt64() * 1 - 4]); // mask = pc & 1 ? 
0xFFFFFFFE : 0xFFFFFFFC + code.and_(new_pc, mask); + code.mov(MJitStateReg(A32::Reg::PC), new_pc); + code.mov(dword[r15 + offsetof(A32JitState, upper_location_descriptor)], new_upper); + } +} + +void A32EmitX64::EmitA32UpdateUpperLocationDescriptor(A32EmitContext& ctx, IR::Inst*) { + for (auto& inst : ctx.block) { + if (inst.GetOpcode() == IR::Opcode::A32BXWritePC) { + return; + } + } + EmitSetUpperLocationDescriptor(ctx.EndLocation(), ctx.Location()); +} + +void A32EmitX64::EmitA32CallSupervisor(A32EmitContext& ctx, IR::Inst* inst) { + code.SwitchMxcsrOnExit(); + + if (conf.enable_cycle_counting) { + ctx.reg_alloc.HostCall(nullptr); + code.mov(code.ABI_PARAM2, qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_to_run)]); + code.sub(code.ABI_PARAM2, qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)]); + Devirtualize<&A32::UserCallbacks::AddTicks>(conf.callbacks).EmitCall(code); + ctx.reg_alloc.EndOfAllocScope(); + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.HostCall(nullptr, {}, args[0]); + Devirtualize<&A32::UserCallbacks::CallSVC>(conf.callbacks).EmitCall(code); + + if (conf.enable_cycle_counting) { + Devirtualize<&A32::UserCallbacks::GetTicksRemaining>(conf.callbacks).EmitCall(code); + code.mov(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_to_run)], code.ABI_RETURN); + code.mov(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], code.ABI_RETURN); + code.SwitchMxcsrOnEntry(); + } +} + +void A32EmitX64::EmitA32ExceptionRaised(A32EmitContext& ctx, IR::Inst* inst) { + code.SwitchMxcsrOnExit(); + + ctx.reg_alloc.HostCall(nullptr); + if (conf.enable_cycle_counting) { + code.mov(code.ABI_PARAM2, qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_to_run)]); + code.sub(code.ABI_PARAM2, qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)]); + Devirtualize<&A32::UserCallbacks::AddTicks>(conf.callbacks).EmitCall(code); + } + ctx.reg_alloc.EndOfAllocScope(); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(args[0].IsImmediate() && args[1].IsImmediate()); + const u32 pc = args[0].GetImmediateU32(); + const u64 exception = args[1].GetImmediateU64(); + Devirtualize<&A32::UserCallbacks::ExceptionRaised>(conf.callbacks).EmitCall(code, [&](RegList param) { + code.mov(param[0], pc); + code.mov(param[1], exception); + }); + + if (conf.enable_cycle_counting) { + Devirtualize<&A32::UserCallbacks::GetTicksRemaining>(conf.callbacks).EmitCall(code); + code.mov(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_to_run)], code.ABI_RETURN); + code.mov(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], code.ABI_RETURN); + code.SwitchMxcsrOnEntry(); + } +} + +static u32 GetFpscrImpl(A32JitState* jit_state) { + return jit_state->Fpscr(); +} + +void A32EmitX64::EmitA32GetFpscr(A32EmitContext& ctx, IR::Inst* inst) { + ctx.reg_alloc.HostCall(inst); + code.mov(code.ABI_PARAM1, code.r15); + + code.stmxcsr(code.dword[code.r15 + offsetof(A32JitState, guest_MXCSR)]); + code.CallFunction(&GetFpscrImpl); +} + +static void SetFpscrImpl(u32 value, A32JitState* jit_state) { + jit_state->SetFpscr(value); +} + +void A32EmitX64::EmitA32SetFpscr(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.HostCall(nullptr, args[0]); + code.mov(code.ABI_PARAM2, code.r15); + + code.CallFunction(&SetFpscrImpl); + code.ldmxcsr(code.dword[code.r15 + offsetof(A32JitState, guest_MXCSR)]); +} + +void 
A32EmitX64::EmitA32GetFpscrNZCV(A32EmitContext& ctx, IR::Inst* inst) { + const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); + code.mov(result, dword[r15 + offsetof(A32JitState, fpsr_nzcv)]); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitX64::EmitA32SetFpscrNZCV(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (code.HasHostFeature(HostFeature::FastBMI2)) { + const Xbyak::Reg32 value = ctx.reg_alloc.UseGpr(args[0]).cvt32(); + const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32(); + + code.mov(tmp, NZCV::x64_mask); + code.pext(tmp, value, tmp); + code.shl(tmp, 28); + code.mov(dword[r15 + offsetof(A32JitState, fpsr_nzcv)], tmp); + + return; + } + + const Xbyak::Reg32 value = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + + code.and_(value, NZCV::x64_mask); + code.imul(value, value, NZCV::from_x64_multiplier); + code.and_(value, NZCV::arm_mask); + code.mov(dword[r15 + offsetof(A32JitState, fpsr_nzcv)], value); +} + +static void EmitCoprocessorException() { + ASSERT_FALSE("Should raise coproc exception here"); +} + +static void CallCoprocCallback(BlockOfCode& code, RegAlloc& reg_alloc, A32::Coprocessor::Callback callback, IR::Inst* inst = nullptr, std::optional<Argument::copyable_reference> arg0 = {}, std::optional<Argument::copyable_reference> arg1 = {}) { + reg_alloc.HostCall(inst, {}, arg0, arg1); + + if (callback.user_arg) { + code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(*callback.user_arg)); + } + + code.CallFunction(callback.function); +} + +void A32EmitX64::EmitA32CoprocInternalOperation(A32EmitContext& ctx, IR::Inst* inst) { + const auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + const size_t coproc_num = coproc_info[0]; + const bool two = coproc_info[1] != 0; + const auto opc1 = static_cast<unsigned>(coproc_info[2]); + const auto CRd = static_cast<A32::CoprocReg>(coproc_info[3]); + const auto CRn = static_cast<A32::CoprocReg>(coproc_info[4]); + const auto CRm = static_cast<A32::CoprocReg>(coproc_info[5]); + const auto opc2 = static_cast<unsigned>(coproc_info[6]); + + std::shared_ptr<A32::Coprocessor> coproc = conf.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + const auto action = coproc->CompileInternalOperation(two, opc1, CRd, CRn, CRm, opc2); + if (!action) { + EmitCoprocessorException(); + return; + } + + CallCoprocCallback(code, ctx.reg_alloc, *action); +} + +void A32EmitX64::EmitA32CoprocSendOneWord(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + const size_t coproc_num = coproc_info[0]; + const bool two = coproc_info[1] != 0; + const auto opc1 = static_cast<unsigned>(coproc_info[2]); + const auto CRn = static_cast<A32::CoprocReg>(coproc_info[3]); + const auto CRm = static_cast<A32::CoprocReg>(coproc_info[4]); + const auto opc2 = static_cast<unsigned>(coproc_info[5]); + + std::shared_ptr<A32::Coprocessor> coproc = conf.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + const auto action = coproc->CompileSendOneWord(two, opc1, CRn, CRm, opc2); + + if (std::holds_alternative<std::monostate>(action)) { + EmitCoprocessorException(); + return; + } + + if (const auto cb = std::get_if<A32::Coprocessor::Callback>(&action)) { + CallCoprocCallback(code, ctx.reg_alloc, *cb, nullptr, args[1]); + return; + } + + if (const auto destination_ptr = std::get_if<u32*>(&action)) { + const Xbyak::Reg32 reg_word = 
ctx.reg_alloc.UseGpr(args[1]).cvt32(); + const Xbyak::Reg64 reg_destination_addr = ctx.reg_alloc.ScratchGpr(); + + code.mov(reg_destination_addr, reinterpret_cast<u64>(*destination_ptr)); + code.mov(code.dword[reg_destination_addr], reg_word); + + return; + } + + UNREACHABLE(); +} + +void A32EmitX64::EmitA32CoprocSendTwoWords(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + const size_t coproc_num = coproc_info[0]; + const bool two = coproc_info[1] != 0; + const auto opc = static_cast<unsigned>(coproc_info[2]); + const auto CRm = static_cast<A32::CoprocReg>(coproc_info[3]); + + std::shared_ptr<A32::Coprocessor> coproc = conf.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + const auto action = coproc->CompileSendTwoWords(two, opc, CRm); + + if (std::holds_alternative<std::monostate>(action)) { + EmitCoprocessorException(); + return; + } + + if (const auto cb = std::get_if<A32::Coprocessor::Callback>(&action)) { + CallCoprocCallback(code, ctx.reg_alloc, *cb, nullptr, args[1], args[2]); + return; + } + + if (const auto destination_ptrs = std::get_if<std::array<u32*, 2>>(&action)) { + const Xbyak::Reg32 reg_word1 = ctx.reg_alloc.UseGpr(args[1]).cvt32(); + const Xbyak::Reg32 reg_word2 = ctx.reg_alloc.UseGpr(args[2]).cvt32(); + const Xbyak::Reg64 reg_destination_addr = ctx.reg_alloc.ScratchGpr(); + + code.mov(reg_destination_addr, reinterpret_cast<u64>((*destination_ptrs)[0])); + code.mov(code.dword[reg_destination_addr], reg_word1); + code.mov(reg_destination_addr, reinterpret_cast<u64>((*destination_ptrs)[1])); + code.mov(code.dword[reg_destination_addr], reg_word2); + + return; + } + + UNREACHABLE(); +} + +void A32EmitX64::EmitA32CoprocGetOneWord(A32EmitContext& ctx, IR::Inst* inst) { + const auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + + const size_t coproc_num = coproc_info[0]; + const bool two = coproc_info[1] != 0; + const auto opc1 = static_cast<unsigned>(coproc_info[2]); + const auto CRn = static_cast<A32::CoprocReg>(coproc_info[3]); + const auto CRm = static_cast<A32::CoprocReg>(coproc_info[4]); + const auto opc2 = static_cast<unsigned>(coproc_info[5]); + + std::shared_ptr<A32::Coprocessor> coproc = conf.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + const auto action = coproc->CompileGetOneWord(two, opc1, CRn, CRm, opc2); + + if (std::holds_alternative<std::monostate>(action)) { + EmitCoprocessorException(); + return; + } + + if (const auto cb = std::get_if<A32::Coprocessor::Callback>(&action)) { + CallCoprocCallback(code, ctx.reg_alloc, *cb, inst); + return; + } + + if (const auto source_ptr = std::get_if<u32*>(&action)) { + const Xbyak::Reg32 reg_word = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Reg64 reg_source_addr = ctx.reg_alloc.ScratchGpr(); + + code.mov(reg_source_addr, reinterpret_cast<u64>(*source_ptr)); + code.mov(reg_word, code.dword[reg_source_addr]); + + ctx.reg_alloc.DefineValue(inst, reg_word); + + return; + } + + UNREACHABLE(); +} + +void A32EmitX64::EmitA32CoprocGetTwoWords(A32EmitContext& ctx, IR::Inst* inst) { + const auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + const size_t coproc_num = coproc_info[0]; + const bool two = coproc_info[1] != 0; + const unsigned opc = coproc_info[2]; + const auto CRm = static_cast<A32::CoprocReg>(coproc_info[3]); + + std::shared_ptr<A32::Coprocessor> coproc = conf.coprocessors[coproc_num]; + if (!coproc) { + 
EmitCoprocessorException(); + return; + } + + auto action = coproc->CompileGetTwoWords(two, opc, CRm); + + if (std::holds_alternative<std::monostate>(action)) { + EmitCoprocessorException(); + return; + } + + if (const auto cb = std::get_if<A32::Coprocessor::Callback>(&action)) { + CallCoprocCallback(code, ctx.reg_alloc, *cb, inst); + return; + } + + if (const auto source_ptrs = std::get_if<std::array<u32*, 2>>(&action)) { + const Xbyak::Reg64 reg_result = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 reg_destination_addr = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 reg_tmp = ctx.reg_alloc.ScratchGpr(); + + code.mov(reg_destination_addr, reinterpret_cast<u64>((*source_ptrs)[1])); + code.mov(reg_result.cvt32(), code.dword[reg_destination_addr]); + code.shl(reg_result, 32); + code.mov(reg_destination_addr, reinterpret_cast<u64>((*source_ptrs)[0])); + code.mov(reg_tmp.cvt32(), code.dword[reg_destination_addr]); + code.or_(reg_result, reg_tmp); + + ctx.reg_alloc.DefineValue(inst, reg_result); + + return; + } + + UNREACHABLE(); +} + +void A32EmitX64::EmitA32CoprocLoadWords(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + const size_t coproc_num = coproc_info[0]; + const bool two = coproc_info[1] != 0; + const bool long_transfer = coproc_info[2] != 0; + const auto CRd = static_cast<A32::CoprocReg>(coproc_info[3]); + const bool has_option = coproc_info[4] != 0; + + std::optional<u8> option = std::nullopt; + if (has_option) { + option = coproc_info[5]; + } + + std::shared_ptr<A32::Coprocessor> coproc = conf.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + const auto action = coproc->CompileLoadWords(two, long_transfer, CRd, option); + if (!action) { + EmitCoprocessorException(); + return; + } + + CallCoprocCallback(code, ctx.reg_alloc, *action, nullptr, args[1]); +} + +void A32EmitX64::EmitA32CoprocStoreWords(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + const size_t coproc_num = coproc_info[0]; + const bool two = coproc_info[1] != 0; + const bool long_transfer = coproc_info[2] != 0; + const auto CRd = static_cast<A32::CoprocReg>(coproc_info[3]); + const bool has_option = coproc_info[4] != 0; + + std::optional<u8> option = std::nullopt; + if (has_option) { + option = coproc_info[5]; + } + + std::shared_ptr<A32::Coprocessor> coproc = conf.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + const auto action = coproc->CompileStoreWords(two, long_transfer, CRd, option); + if (!action) { + EmitCoprocessorException(); + return; + } + + CallCoprocCallback(code, ctx.reg_alloc, *action, nullptr, args[1]); +} + +std::string A32EmitX64::LocationDescriptorToFriendlyName(const IR::LocationDescriptor& ir_descriptor) const { + const A32::LocationDescriptor descriptor{ir_descriptor}; + return fmt::format("a32_{}{:08X}_{}_fpcr{:08X}", + descriptor.TFlag() ? "t" : "a", + descriptor.PC(), + descriptor.EFlag() ? 
"be" : "le", + descriptor.FPSCR().Value()); +} + +void A32EmitX64::EmitTerminalImpl(IR::Term::Interpret terminal, IR::LocationDescriptor initial_location, bool) { + ASSERT_MSG(A32::LocationDescriptor{terminal.next}.TFlag() == A32::LocationDescriptor{initial_location}.TFlag(), "Unimplemented"); + ASSERT_MSG(A32::LocationDescriptor{terminal.next}.EFlag() == A32::LocationDescriptor{initial_location}.EFlag(), "Unimplemented"); + ASSERT_MSG(terminal.num_instructions == 1, "Unimplemented"); + + code.mov(code.ABI_PARAM2.cvt32(), A32::LocationDescriptor{terminal.next}.PC()); + code.mov(code.ABI_PARAM3.cvt32(), 1); + code.mov(MJitStateReg(A32::Reg::PC), code.ABI_PARAM2.cvt32()); + code.SwitchMxcsrOnExit(); + Devirtualize<&A32::UserCallbacks::InterpreterFallback>(conf.callbacks).EmitCall(code); + code.ReturnFromRunCode(true); // TODO: Check cycles +} + +void A32EmitX64::EmitTerminalImpl(IR::Term::ReturnToDispatch, IR::LocationDescriptor, bool) { + code.ReturnFromRunCode(); +} + +void A32EmitX64::EmitSetUpperLocationDescriptor(IR::LocationDescriptor new_location, IR::LocationDescriptor old_location) { + auto get_upper = [](const IR::LocationDescriptor& desc) -> u32 { + return static_cast<u32>(A32::LocationDescriptor{desc}.SetSingleStepping(false).UniqueHash() >> 32); + }; + + const u32 old_upper = get_upper(old_location); + const u32 new_upper = [&] { + const u32 mask = ~u32(conf.always_little_endian ? 0x2 : 0); + return get_upper(new_location) & mask; + }(); + + if (old_upper != new_upper) { + code.mov(dword[r15 + offsetof(A32JitState, upper_location_descriptor)], new_upper); + } +} + +void A32EmitX64::EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + EmitSetUpperLocationDescriptor(terminal.next, initial_location); + + if (!conf.HasOptimization(OptimizationFlag::BlockLinking) || is_single_step) { + code.mov(MJitStateReg(A32::Reg::PC), A32::LocationDescriptor{terminal.next}.PC()); + code.ReturnFromRunCode(); + return; + } + + if (conf.enable_cycle_counting) { + code.cmp(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], 0); + + patch_information[terminal.next].jg.push_back(code.getCurr()); + if (const auto next_bb = GetBasicBlock(terminal.next)) { + EmitPatchJg(terminal.next, next_bb->entrypoint); + } else { + EmitPatchJg(terminal.next); + } + } else { + code.cmp(dword[r15 + offsetof(A32JitState, halt_reason)], 0); + + patch_information[terminal.next].jz.push_back(code.getCurr()); + if (const auto next_bb = GetBasicBlock(terminal.next)) { + EmitPatchJz(terminal.next, next_bb->entrypoint); + } else { + EmitPatchJz(terminal.next); + } + } + + code.mov(MJitStateReg(A32::Reg::PC), A32::LocationDescriptor{terminal.next}.PC()); + PushRSBHelper(rax, rbx, terminal.next); + code.ForceReturnFromRunCode(); +} + +void A32EmitX64::EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + EmitSetUpperLocationDescriptor(terminal.next, initial_location); + + if (!conf.HasOptimization(OptimizationFlag::BlockLinking) || is_single_step) { + code.mov(MJitStateReg(A32::Reg::PC), A32::LocationDescriptor{terminal.next}.PC()); + code.ReturnFromRunCode(); + return; + } + + patch_information[terminal.next].jmp.push_back(code.getCurr()); + if (const auto next_bb = GetBasicBlock(terminal.next)) { + EmitPatchJmp(terminal.next, next_bb->entrypoint); + } else { + EmitPatchJmp(terminal.next); + } +} + +void A32EmitX64::EmitTerminalImpl(IR::Term::PopRSBHint, IR::LocationDescriptor, bool 
is_single_step) { + if (!conf.HasOptimization(OptimizationFlag::ReturnStackBuffer) || is_single_step) { + code.ReturnFromRunCode(); + return; + } + + code.jmp(terminal_handler_pop_rsb_hint); +} + +void A32EmitX64::EmitTerminalImpl(IR::Term::FastDispatchHint, IR::LocationDescriptor, bool is_single_step) { + if (!conf.HasOptimization(OptimizationFlag::FastDispatch) || is_single_step) { + code.ReturnFromRunCode(); + return; + } + + code.jmp(terminal_handler_fast_dispatch_hint); +} + +void A32EmitX64::EmitTerminalImpl(IR::Term::If terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + Xbyak::Label pass = EmitCond(terminal.if_); + EmitTerminal(terminal.else_, initial_location, is_single_step); + code.L(pass); + EmitTerminal(terminal.then_, initial_location, is_single_step); +} + +void A32EmitX64::EmitTerminalImpl(IR::Term::CheckBit terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + Xbyak::Label fail; + code.cmp(code.byte[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, check_bit)], u8(0)); + code.jz(fail); + EmitTerminal(terminal.then_, initial_location, is_single_step); + code.L(fail); + EmitTerminal(terminal.else_, initial_location, is_single_step); +} + +void A32EmitX64::EmitTerminalImpl(IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + code.cmp(dword[r15 + offsetof(A32JitState, halt_reason)], 0); + code.jne(code.GetForceReturnFromRunCodeAddress()); + EmitTerminal(terminal.else_, initial_location, is_single_step); +} + +void A32EmitX64::EmitPatchJg(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr) { + const CodePtr patch_location = code.getCurr(); + if (target_code_ptr) { + code.jg(target_code_ptr); + } else { + code.mov(MJitStateReg(A32::Reg::PC), A32::LocationDescriptor{target_desc}.PC()); + code.jg(code.GetReturnFromRunCodeAddress()); + } + code.EnsurePatchLocationSize(patch_location, 14); +} + +void A32EmitX64::EmitPatchJz(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr) { + const CodePtr patch_location = code.getCurr(); + if (target_code_ptr) { + code.jz(target_code_ptr); + } else { + code.mov(MJitStateReg(A32::Reg::PC), A32::LocationDescriptor{target_desc}.PC()); + code.jz(code.GetReturnFromRunCodeAddress()); + } + code.EnsurePatchLocationSize(patch_location, 14); +} + +void A32EmitX64::EmitPatchJmp(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr) { + const CodePtr patch_location = code.getCurr(); + if (target_code_ptr) { + code.jmp(target_code_ptr); + } else { + code.mov(MJitStateReg(A32::Reg::PC), A32::LocationDescriptor{target_desc}.PC()); + code.jmp(code.GetReturnFromRunCodeAddress()); + } + code.EnsurePatchLocationSize(patch_location, 13); +} + +void A32EmitX64::EmitPatchMovRcx(CodePtr target_code_ptr) { + if (!target_code_ptr) { + target_code_ptr = code.GetReturnFromRunCodeAddress(); + } + const CodePtr patch_location = code.getCurr(); + code.mov(code.rcx, reinterpret_cast<u64>(target_code_ptr)); + code.EnsurePatchLocationSize(patch_location, 10); +} + +void A32EmitX64::Unpatch(const IR::LocationDescriptor& location) { + EmitX64::Unpatch(location); + if (conf.HasOptimization(OptimizationFlag::FastDispatch)) { + code.DisableWriting(); + (*fast_dispatch_table_lookup)(location.Value()) = {}; + code.EnableWriting(); + } +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h b/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h new file mode 100644 index 
0000000000..3706b30aae --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h @@ -0,0 +1,147 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <array> +#include <optional> +#include <set> +#include <tuple> + +#include <tsl/robin_map.h> + +#include "dynarmic/backend/block_range_information.h" +#include "dynarmic/backend/x64/a32_jitstate.h" +#include "dynarmic/backend/x64/emit_x64.h" +#include "dynarmic/frontend/A32/a32_location_descriptor.h" +#include "dynarmic/interface/A32/a32.h" +#include "dynarmic/interface/A32/config.h" +#include "dynarmic/ir/terminal.h" + +namespace Dynarmic::Backend::X64 { + +class RegAlloc; + +struct A32EmitContext final : public EmitContext { + A32EmitContext(const A32::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block); + + A32::LocationDescriptor Location() const; + A32::LocationDescriptor EndLocation() const; + bool IsSingleStep() const; + FP::FPCR FPCR(bool fpcr_controlled = true) const override; + + bool HasOptimization(OptimizationFlag flag) const override { + return conf.HasOptimization(flag); + } + + const A32::UserConfig& conf; +}; + +class A32EmitX64 final : public EmitX64 { +public: + A32EmitX64(BlockOfCode& code, A32::UserConfig conf, A32::Jit* jit_interface); + ~A32EmitX64() override; + + /** + * Emit host machine code for a basic block with intermediate representation `block`. + * @note block is modified. + */ + BlockDescriptor Emit(IR::Block& block); + + void ClearCache() override; + + void InvalidateCacheRanges(const boost::icl::interval_set<u32>& ranges); + +protected: + const A32::UserConfig conf; + A32::Jit* jit_interface; + BlockRangeInformation<u32> block_ranges; + + void EmitCondPrelude(const A32EmitContext& ctx); + + struct FastDispatchEntry { + u64 location_descriptor = 0xFFFF'FFFF'FFFF'FFFFull; + const void* code_ptr = nullptr; + }; + static_assert(sizeof(FastDispatchEntry) == 0x10); + static constexpr u64 fast_dispatch_table_mask = 0xFFFF0; + static constexpr size_t fast_dispatch_table_size = 0x10000; + std::array<FastDispatchEntry, fast_dispatch_table_size> fast_dispatch_table; + void ClearFastDispatchTable(); + + void (*memory_read_128)() = nullptr; // Dummy + void (*memory_write_128)() = nullptr; // Dummy + + std::map<std::tuple<bool, size_t, int, int>, void (*)()> read_fallbacks; + std::map<std::tuple<bool, size_t, int, int>, void (*)()> write_fallbacks; + std::map<std::tuple<bool, size_t, int, int>, void (*)()> exclusive_write_fallbacks; + void GenFastmemFallbacks(); + + const void* terminal_handler_pop_rsb_hint; + const void* terminal_handler_fast_dispatch_hint = nullptr; + FastDispatchEntry& (*fast_dispatch_table_lookup)(u64) = nullptr; + void GenTerminalHandlers(); + + // Microinstruction emitters +#define OPCODE(...) +#define A32OPC(name, type, ...) void EmitA32##name(A32EmitContext& ctx, IR::Inst* inst); +#define A64OPC(...) 
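This is the usual X-macro idiom: ir/opcodes.inc is one flat list of OPCODE/A32OPC/A64OPC entries, and each includer temporarily defines those macros to stamp out whatever it needs; here, one EmitA32* member declaration per A32 opcode, elsewhere in this patch the opcode enum and its name table. A self-contained sketch of the pattern, using an invented three-entry list in place of the real opcodes.inc and omitting A64OPC for brevity:

    #include <cstdio>

    // Tiny stand-in for ir/opcodes.inc; these entries are invented for the sketch.
    #define DEMO_OPCODE_LIST(OPCODE, A32OPC)  \
        OPCODE(Add32, U32, U32, U32)          \
        A32OPC(GetCFlag, U1)                  \
        A32OPC(SetCpsrNZCV, Void, NZCV)

    // Expansion 1: an enum with one value per opcode.
    enum class DemoOpcode {
    #define OPCODE(name, ...) name,
    #define A32OPC(name, ...) A32##name,
        DEMO_OPCODE_LIST(OPCODE, A32OPC)
    #undef OPCODE
    #undef A32OPC
    };

    // Expansion 2: a name table generated from the very same list,
    // so the two can never drift apart.
    constexpr const char* demo_opcode_names[] = {
    #define OPCODE(name, ...) #name,
    #define A32OPC(name, ...) "A32" #name,
        DEMO_OPCODE_LIST(OPCODE, A32OPC)
    #undef OPCODE
    #undef A32OPC
    };

    int main() {
        std::printf("%s\n", demo_opcode_names[static_cast<int>(DemoOpcode::A32GetCFlag)]);
    }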
+#include "dynarmic/ir/opcodes.inc" +#undef OPCODE +#undef A32OPC +#undef A64OPC + + // Helpers + std::string LocationDescriptorToFriendlyName(const IR::LocationDescriptor&) const override; + + // Fastmem information + using DoNotFastmemMarker = std::tuple<IR::LocationDescriptor, unsigned>; + struct FastmemPatchInfo { + u64 resume_rip; + u64 callback; + DoNotFastmemMarker marker; + bool recompile; + }; + tsl::robin_map<u64, FastmemPatchInfo> fastmem_patch_info; + std::set<DoNotFastmemMarker> do_not_fastmem; + std::optional<DoNotFastmemMarker> ShouldFastmem(A32EmitContext& ctx, IR::Inst* inst) const; + FakeCall FastmemCallback(u64 rip); + + // Memory access helpers + void EmitCheckMemoryAbort(A32EmitContext& ctx, IR::Inst* inst, Xbyak::Label* end = nullptr); + template<std::size_t bitsize, auto callback> + void EmitMemoryRead(A32EmitContext& ctx, IR::Inst* inst); + template<std::size_t bitsize, auto callback> + void EmitMemoryWrite(A32EmitContext& ctx, IR::Inst* inst); + template<std::size_t bitsize, auto callback> + void EmitExclusiveReadMemory(A32EmitContext& ctx, IR::Inst* inst); + template<std::size_t bitsize, auto callback> + void EmitExclusiveWriteMemory(A32EmitContext& ctx, IR::Inst* inst); + template<std::size_t bitsize, auto callback> + void EmitExclusiveReadMemoryInline(A32EmitContext& ctx, IR::Inst* inst); + template<std::size_t bitsize, auto callback> + void EmitExclusiveWriteMemoryInline(A32EmitContext& ctx, IR::Inst* inst); + + // Terminal instruction emitters + void EmitSetUpperLocationDescriptor(IR::LocationDescriptor new_location, IR::LocationDescriptor old_location); + void EmitTerminalImpl(IR::Term::Interpret terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::ReturnToDispatch terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::PopRSBHint terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::FastDispatchHint terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::If terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::CheckBit terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + + // Patching + void Unpatch(const IR::LocationDescriptor& target_desc) override; + void EmitPatchJg(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) override; + void EmitPatchJz(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) override; + void EmitPatchJmp(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) override; + void EmitPatchMovRcx(CodePtr target_code_ptr = nullptr) override; +}; + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64_memory.cpp b/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64_memory.cpp new file mode 100644 index 0000000000..f2919485be --- /dev/null +++ 
b/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64_memory.cpp @@ -0,0 +1,259 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <array> +#include <initializer_list> +#include <tuple> +#include <utility> + +#include <fmt/format.h> +#include <fmt/ostream.h> +#include <mcl/type_traits/integer_of_size.hpp> +#include <xbyak/xbyak.h> + +#include "dynarmic/backend/x64/a32_emit_x64.h" +#include "dynarmic/backend/x64/abi.h" +#include "dynarmic/backend/x64/devirtualize.h" +#include "dynarmic/backend/x64/emit_x64_memory.h" +#include "dynarmic/backend/x64/exclusive_monitor_friend.h" +#include "dynarmic/backend/x64/perf_map.h" +#include "dynarmic/common/x64_disassemble.h" +#include "dynarmic/interface/exclusive_monitor.h" + +namespace Dynarmic::Backend::X64 { + +using namespace Xbyak::util; + +void A32EmitX64::GenFastmemFallbacks() { + const std::initializer_list<int> idxes{0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}; + const std::array<std::pair<size_t, ArgCallback>, 4> read_callbacks{{ + {8, Devirtualize<&A32::UserCallbacks::MemoryRead8>(conf.callbacks)}, + {16, Devirtualize<&A32::UserCallbacks::MemoryRead16>(conf.callbacks)}, + {32, Devirtualize<&A32::UserCallbacks::MemoryRead32>(conf.callbacks)}, + {64, Devirtualize<&A32::UserCallbacks::MemoryRead64>(conf.callbacks)}, + }}; + const std::array<std::pair<size_t, ArgCallback>, 4> write_callbacks{{ + {8, Devirtualize<&A32::UserCallbacks::MemoryWrite8>(conf.callbacks)}, + {16, Devirtualize<&A32::UserCallbacks::MemoryWrite16>(conf.callbacks)}, + {32, Devirtualize<&A32::UserCallbacks::MemoryWrite32>(conf.callbacks)}, + {64, Devirtualize<&A32::UserCallbacks::MemoryWrite64>(conf.callbacks)}, + }}; + const std::array<std::pair<size_t, ArgCallback>, 4> exclusive_write_callbacks{{ + {8, Devirtualize<&A32::UserCallbacks::MemoryWriteExclusive8>(conf.callbacks)}, + {16, Devirtualize<&A32::UserCallbacks::MemoryWriteExclusive16>(conf.callbacks)}, + {32, Devirtualize<&A32::UserCallbacks::MemoryWriteExclusive32>(conf.callbacks)}, + {64, Devirtualize<&A32::UserCallbacks::MemoryWriteExclusive64>(conf.callbacks)}, + }}; + + for (bool ordered : {false, true}) { + for (int vaddr_idx : idxes) { + for (int value_idx : idxes) { + for (const auto& [bitsize, callback] : read_callbacks) { + code.align(); + read_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + if (ordered) { + code.mfence(); + } + callback.EmitCall(code); + if (value_idx != code.ABI_RETURN.getIdx()) { + code.mov(Xbyak::Reg64{value_idx}, code.ABI_RETURN); + } + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); + code.ZeroExtendFrom(bitsize, Xbyak::Reg64{value_idx}); + code.ret(); + PerfMapRegister(read_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_read_fallback_{}", bitsize)); + } + + for (const auto& [bitsize, callback] : write_callbacks) { + code.align(); + write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); + ABI_PushCallerSaveRegistersAndAdjustStack(code); + if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { + code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); + } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { + 
code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + } else { + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + } + code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); + callback.EmitCall(code); + if (ordered) { + code.mfence(); + } + ABI_PopCallerSaveRegistersAndAdjustStack(code); + code.ret(); + PerfMapRegister(write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_write_fallback_{}", bitsize)); + } + + for (const auto& [bitsize, callback] : exclusive_write_callbacks) { + code.align(); + exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); + if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { + code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); + } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + } else { + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + } + code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); + code.mov(code.ABI_PARAM4, rax); + code.ZeroExtendFrom(bitsize, code.ABI_PARAM4); + callback.EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); + code.ret(); + PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_exclusive_write_fallback_{}", bitsize)); + } + } + } + } +} + +#define Axx A32 +#include "dynarmic/backend/x64/emit_x64_memory.cpp.inc" +#undef Axx + +void A32EmitX64::EmitA32ReadMemory8(A32EmitContext& ctx, IR::Inst* inst) { + EmitMemoryRead<8, &A32::UserCallbacks::MemoryRead8>(ctx, inst); +} + +void A32EmitX64::EmitA32ReadMemory16(A32EmitContext& ctx, IR::Inst* inst) { + EmitMemoryRead<16, &A32::UserCallbacks::MemoryRead16>(ctx, inst); +} + +void A32EmitX64::EmitA32ReadMemory32(A32EmitContext& ctx, IR::Inst* inst) { + EmitMemoryRead<32, &A32::UserCallbacks::MemoryRead32>(ctx, inst); +} + +void A32EmitX64::EmitA32ReadMemory64(A32EmitContext& ctx, IR::Inst* inst) { + EmitMemoryRead<64, &A32::UserCallbacks::MemoryRead64>(ctx, inst); +} + +void A32EmitX64::EmitA32WriteMemory8(A32EmitContext& ctx, IR::Inst* inst) { + EmitMemoryWrite<8, &A32::UserCallbacks::MemoryWrite8>(ctx, inst); +} + +void A32EmitX64::EmitA32WriteMemory16(A32EmitContext& ctx, IR::Inst* inst) { + EmitMemoryWrite<16, &A32::UserCallbacks::MemoryWrite16>(ctx, inst); +} + +void A32EmitX64::EmitA32WriteMemory32(A32EmitContext& ctx, IR::Inst* inst) { + EmitMemoryWrite<32, &A32::UserCallbacks::MemoryWrite32>(ctx, inst); +} + +void A32EmitX64::EmitA32WriteMemory64(A32EmitContext& ctx, IR::Inst* inst) { + EmitMemoryWrite<64, &A32::UserCallbacks::MemoryWrite64>(ctx, inst); +} + +void A32EmitX64::EmitA32ClearExclusive(A32EmitContext&, IR::Inst*) { + code.mov(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(0)); +} + +void A32EmitX64::EmitA32ExclusiveReadMemory8(A32EmitContext& ctx, IR::Inst* inst) { + if 
(conf.fastmem_exclusive_access) { + EmitExclusiveReadMemoryInline<8, &A32::UserCallbacks::MemoryRead8>(ctx, inst); + } else { + EmitExclusiveReadMemory<8, &A32::UserCallbacks::MemoryRead8>(ctx, inst); + } +} + +void A32EmitX64::EmitA32ExclusiveReadMemory16(A32EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveReadMemoryInline<16, &A32::UserCallbacks::MemoryRead16>(ctx, inst); + } else { + EmitExclusiveReadMemory<16, &A32::UserCallbacks::MemoryRead16>(ctx, inst); + } +} + +void A32EmitX64::EmitA32ExclusiveReadMemory32(A32EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveReadMemoryInline<32, &A32::UserCallbacks::MemoryRead32>(ctx, inst); + } else { + EmitExclusiveReadMemory<32, &A32::UserCallbacks::MemoryRead32>(ctx, inst); + } +} + +void A32EmitX64::EmitA32ExclusiveReadMemory64(A32EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveReadMemoryInline<64, &A32::UserCallbacks::MemoryRead64>(ctx, inst); + } else { + EmitExclusiveReadMemory<64, &A32::UserCallbacks::MemoryRead64>(ctx, inst); + } +} + +void A32EmitX64::EmitA32ExclusiveWriteMemory8(A32EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveWriteMemoryInline<8, &A32::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); + } else { + EmitExclusiveWriteMemory<8, &A32::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); + } +} + +void A32EmitX64::EmitA32ExclusiveWriteMemory16(A32EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveWriteMemoryInline<16, &A32::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); + } else { + EmitExclusiveWriteMemory<16, &A32::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); + } +} + +void A32EmitX64::EmitA32ExclusiveWriteMemory32(A32EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveWriteMemoryInline<32, &A32::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); + } else { + EmitExclusiveWriteMemory<32, &A32::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); + } +} + +void A32EmitX64::EmitA32ExclusiveWriteMemory64(A32EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveWriteMemoryInline<64, &A32::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); + } else { + EmitExclusiveWriteMemory<64, &A32::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); + } +} + +void A32EmitX64::EmitCheckMemoryAbort(A32EmitContext& ctx, IR::Inst* inst, Xbyak::Label* end) { + if (!conf.check_halt_on_memory_access) { + return; + } + + Xbyak::Label skip; + + const A32::LocationDescriptor current_location{IR::LocationDescriptor{inst->GetArg(0).GetU64()}}; + + code.test(dword[r15 + offsetof(A32JitState, halt_reason)], static_cast<u32>(HaltReason::MemoryAbort)); + if (end) { + code.jz(*end, code.T_NEAR); + } else { + code.jz(skip, code.T_NEAR); + } + EmitSetUpperLocationDescriptor(current_location, ctx.Location()); + code.mov(dword[r15 + offsetof(A32JitState, Reg) + sizeof(u32) * 15], current_location.PC()); + code.ForceReturnFromRunCode(); + code.L(skip); +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a32_interface.cpp b/externals/dynarmic/src/dynarmic/backend/x64/a32_interface.cpp new file mode 100644 index 0000000000..84f84f5db9 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/a32_interface.cpp @@ -0,0 +1,343 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <functional> +#include <memory> +#include <mutex> + +#include <boost/icl/interval_set.hpp> +#include <fmt/format.h> +#include <mcl/assert.hpp> +#include <mcl/bit_cast.hpp> +#include <mcl/scope_exit.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/backend/x64/a32_emit_x64.h" +#include "dynarmic/backend/x64/a32_jitstate.h" +#include "dynarmic/backend/x64/block_of_code.h" +#include "dynarmic/backend/x64/callback.h" +#include "dynarmic/backend/x64/devirtualize.h" +#include "dynarmic/backend/x64/jitstate_info.h" +#include "dynarmic/common/atomic.h" +#include "dynarmic/common/x64_disassemble.h" +#include "dynarmic/frontend/A32/translate/a32_translate.h" +#include "dynarmic/interface/A32/a32.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/location_descriptor.h" +#include "dynarmic/ir/opt/passes.h" + +namespace Dynarmic::A32 { + +using namespace Backend::X64; + +static RunCodeCallbacks GenRunCodeCallbacks(A32::UserCallbacks* cb, CodePtr (*LookupBlock)(void* lookup_block_arg), void* arg, const A32::UserConfig& conf) { + return RunCodeCallbacks{ + std::make_unique<ArgCallback>(LookupBlock, reinterpret_cast<u64>(arg)), + std::make_unique<ArgCallback>(Devirtualize<&A32::UserCallbacks::AddTicks>(cb)), + std::make_unique<ArgCallback>(Devirtualize<&A32::UserCallbacks::GetTicksRemaining>(cb)), + conf.enable_cycle_counting, + }; +} + +static std::function<void(BlockOfCode&)> GenRCP(const A32::UserConfig& conf) { + return [conf](BlockOfCode& code) { + if (conf.page_table) { + code.mov(code.r14, mcl::bit_cast<u64>(conf.page_table)); + } + if (conf.fastmem_pointer) { + code.mov(code.r13, mcl::bit_cast<u64>(conf.fastmem_pointer)); + } + }; +} + +static Optimization::PolyfillOptions GenPolyfillOptions(const BlockOfCode& code) { + return Optimization::PolyfillOptions{ + .sha256 = !code.HasHostFeature(HostFeature::SHA), + .vector_multiply_widen = true, + }; +} + +struct Jit::Impl { + Impl(Jit* jit, A32::UserConfig conf) + : block_of_code(GenRunCodeCallbacks(conf.callbacks, &GetCurrentBlockThunk, this, conf), JitStateInfo{jit_state}, conf.code_cache_size, GenRCP(conf)) + , emitter(block_of_code, conf, jit) + , polyfill_options(GenPolyfillOptions(block_of_code)) + , conf(std::move(conf)) + , jit_interface(jit) {} + + ~Impl() = default; + + HaltReason Run() { + ASSERT(!jit_interface->is_executing); + PerformRequestedCacheInvalidation(static_cast<HaltReason>(Atomic::Load(&jit_state.halt_reason))); + + jit_interface->is_executing = true; + SCOPE_EXIT { + jit_interface->is_executing = false; + }; + + const CodePtr current_codeptr = [this] { + // RSB optimization + const u32 new_rsb_ptr = (jit_state.rsb_ptr - 1) & A32JitState::RSBPtrMask; + if (jit_state.GetUniqueHash() == jit_state.rsb_location_descriptors[new_rsb_ptr]) { + jit_state.rsb_ptr = new_rsb_ptr; + return reinterpret_cast<CodePtr>(jit_state.rsb_codeptrs[new_rsb_ptr]); + } + + return GetCurrentBlock(); + }(); + + const HaltReason hr = block_of_code.RunCode(&jit_state, current_codeptr); + + PerformRequestedCacheInvalidation(hr); + + return hr; + } + + HaltReason Step() { + ASSERT(!jit_interface->is_executing); + PerformRequestedCacheInvalidation(static_cast<HaltReason>(Atomic::Load(&jit_state.halt_reason))); + + jit_interface->is_executing = true; + SCOPE_EXIT { + jit_interface->is_executing = false; + }; + + const HaltReason hr = block_of_code.StepCode(&jit_state, GetCurrentSingleStep()); + + PerformRequestedCacheInvalidation(hr); + + return hr; + } + + 
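Run() above first tries a return-stack-buffer prediction: the most recently pushed pair of location-descriptor hash and host entrypoint is peeked and, if the hash matches the current guest state, execution enters the cached pointer directly instead of going through the block map. A simplified plain-C++ stand-in for that ring buffer; the real fields are rsb_ptr, rsb_location_descriptors and rsb_codeptrs in A32JitState, the push side is emitted as machine code by PushRSBHelper, and empty slots are really sentinel-filled by ResetRSB rather than zero as assumed here:

    #include <array>
    #include <cassert>
    #include <cstdint>

    using CodePtr = const void*;

    struct RsbDemo {
        static constexpr std::uint32_t size = 8;  // must be a power of two
        static constexpr std::uint32_t mask = size - 1;

        std::uint32_t ptr = 0;
        std::array<std::uint64_t, size> location_descriptors{};
        std::array<CodePtr, size> codeptrs{};

        // Block epilogues record the predicted return target.
        void Push(std::uint64_t hash, CodePtr target) {
            location_descriptors[ptr] = hash;
            codeptrs[ptr] = target;
            ptr = (ptr + 1) & mask;
        }

        // Returns the predicted entrypoint, or nullptr to force a slow lookup.
        CodePtr Pop(std::uint64_t current_hash) {
            const std::uint32_t top = (ptr - 1) & mask;
            if (location_descriptors[top] != current_hash) {
                return nullptr;
            }
            ptr = top;  // consume the entry, mirroring jit_state.rsb_ptr = new_rsb_ptr
            return codeptrs[top];
        }
    };

    int main() {
        static const int dummy_block = 0;  // stands in for emitted host code
        RsbDemo rsb;
        rsb.Push(0x1234, &dummy_block);
        assert(rsb.Pop(0x1234) == &dummy_block);  // hit: jump straight to the block
        assert(rsb.Pop(0x1234) == nullptr);       // miss: fall back to the block map
    }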
void ClearCache() { + std::unique_lock lock{invalidation_mutex}; + invalidate_entire_cache = true; + HaltExecution(HaltReason::CacheInvalidation); + } + + void InvalidateCacheRange(std::uint32_t start_address, std::size_t length) { + std::unique_lock lock{invalidation_mutex}; + invalid_cache_ranges.add(boost::icl::discrete_interval<u32>::closed(start_address, static_cast<u32>(start_address + length - 1))); + HaltExecution(HaltReason::CacheInvalidation); + } + + void Reset() { + ASSERT(!jit_interface->is_executing); + jit_state = {}; + } + + void HaltExecution(HaltReason hr) { + Atomic::Or(&jit_state.halt_reason, static_cast<u32>(hr)); + } + + void ClearHalt(HaltReason hr) { + Atomic::And(&jit_state.halt_reason, ~static_cast<u32>(hr)); + } + + void ClearExclusiveState() { + jit_state.exclusive_state = 0; + } + + std::array<u32, 16>& Regs() { + return jit_state.Reg; + } + + const std::array<u32, 16>& Regs() const { + return jit_state.Reg; + } + + std::array<u32, 64>& ExtRegs() { + return jit_state.ExtReg; + } + + const std::array<u32, 64>& ExtRegs() const { + return jit_state.ExtReg; + } + + u32 Cpsr() const { + return jit_state.Cpsr(); + } + + void SetCpsr(u32 value) { + return jit_state.SetCpsr(value); + } + + u32 Fpscr() const { + return jit_state.Fpscr(); + } + + void SetFpscr(u32 value) { + return jit_state.SetFpscr(value); + } + + void DumpDisassembly() const { + const size_t size = reinterpret_cast<const char*>(block_of_code.getCurr()) - reinterpret_cast<const char*>(block_of_code.GetCodeBegin()); + Common::DumpDisassembledX64(block_of_code.GetCodeBegin(), size); + } + + std::vector<std::string> Disassemble() const { + const size_t size = reinterpret_cast<const char*>(block_of_code.getCurr()) - reinterpret_cast<const char*>(block_of_code.GetCodeBegin()); + return Common::DisassembleX64(block_of_code.GetCodeBegin(), size); + } + +private: + static CodePtr GetCurrentBlockThunk(void* this_voidptr) { + Jit::Impl& this_ = *static_cast<Jit::Impl*>(this_voidptr); + return this_.GetCurrentBlock(); + } + + IR::LocationDescriptor GetCurrentLocation() const { + return IR::LocationDescriptor{jit_state.GetUniqueHash()}; + } + + CodePtr GetCurrentBlock() { + return GetBasicBlock(GetCurrentLocation()).entrypoint; + } + + CodePtr GetCurrentSingleStep() { + return GetBasicBlock(A32::LocationDescriptor{GetCurrentLocation()}.SetSingleStepping(true)).entrypoint; + } + + A32EmitX64::BlockDescriptor GetBasicBlock(IR::LocationDescriptor descriptor) { + auto block = emitter.GetBasicBlock(descriptor); + if (block) + return *block; + + constexpr size_t MINIMUM_REMAINING_CODESIZE = 1 * 1024 * 1024; + if (block_of_code.SpaceRemaining() < MINIMUM_REMAINING_CODESIZE) { + invalidate_entire_cache = true; + PerformRequestedCacheInvalidation(HaltReason::CacheInvalidation); + } + block_of_code.EnsureMemoryCommitted(MINIMUM_REMAINING_CODESIZE); + + IR::Block ir_block = A32::Translate(A32::LocationDescriptor{descriptor}, conf.callbacks, {conf.arch_version, conf.define_unpredictable_behaviour, conf.hook_hint_instructions}); + Optimization::PolyfillPass(ir_block, polyfill_options); + Optimization::NamingPass(ir_block); + if (conf.HasOptimization(OptimizationFlag::GetSetElimination) && !conf.check_halt_on_memory_access) { + Optimization::A32GetSetElimination(ir_block, {.convert_nz_to_nzc = true}); + Optimization::DeadCodeElimination(ir_block); + } + if (conf.HasOptimization(OptimizationFlag::ConstProp)) { + Optimization::A32ConstantMemoryReads(ir_block, conf.callbacks); + Optimization::ConstantPropagation(ir_block); + 
Optimization::DeadCodeElimination(ir_block); + } + Optimization::IdentityRemovalPass(ir_block); + Optimization::VerificationPass(ir_block); + return emitter.Emit(ir_block); + } + + void PerformRequestedCacheInvalidation(HaltReason hr) { + if (Has(hr, HaltReason::CacheInvalidation)) { + std::unique_lock lock{invalidation_mutex}; + + ClearHalt(HaltReason::CacheInvalidation); + + if (!invalidate_entire_cache && invalid_cache_ranges.empty()) { + return; + } + + jit_state.ResetRSB(); + if (invalidate_entire_cache) { + block_of_code.ClearCache(); + emitter.ClearCache(); + } else { + emitter.InvalidateCacheRanges(invalid_cache_ranges); + } + invalid_cache_ranges.clear(); + invalidate_entire_cache = false; + } + } + + A32JitState jit_state; + BlockOfCode block_of_code; + A32EmitX64 emitter; + Optimization::PolyfillOptions polyfill_options; + + const A32::UserConfig conf; + + Jit* jit_interface; + + // Requests made during execution to invalidate the cache are queued up here. + bool invalidate_entire_cache = false; + boost::icl::interval_set<u32> invalid_cache_ranges; + std::mutex invalidation_mutex; +}; + +Jit::Jit(UserConfig conf) + : impl(std::make_unique<Impl>(this, std::move(conf))) {} + +Jit::~Jit() = default; + +HaltReason Jit::Run() { + return impl->Run(); +} + +HaltReason Jit::Step() { + return impl->Step(); +} + +void Jit::ClearCache() { + impl->ClearCache(); +} + +void Jit::InvalidateCacheRange(std::uint32_t start_address, std::size_t length) { + impl->InvalidateCacheRange(start_address, length); +} + +void Jit::Reset() { + impl->Reset(); +} + +void Jit::HaltExecution(HaltReason hr) { + impl->HaltExecution(hr); +} + +void Jit::ClearHalt(HaltReason hr) { + impl->ClearHalt(hr); +} + +std::array<std::uint32_t, 16>& Jit::Regs() { + return impl->Regs(); +} + +const std::array<std::uint32_t, 16>& Jit::Regs() const { + return impl->Regs(); +} + +std::array<std::uint32_t, 64>& Jit::ExtRegs() { + return impl->ExtRegs(); +} + +const std::array<std::uint32_t, 64>& Jit::ExtRegs() const { + return impl->ExtRegs(); +} + +std::uint32_t Jit::Cpsr() const { + return impl->Cpsr(); +} + +void Jit::SetCpsr(std::uint32_t value) { + impl->SetCpsr(value); +} + +std::uint32_t Jit::Fpscr() const { + return impl->Fpscr(); +} + +void Jit::SetFpscr(std::uint32_t value) { + impl->SetFpscr(value); +} + +void Jit::ClearExclusiveState() { + impl->ClearExclusiveState(); +} + +void Jit::DumpDisassembly() const { + impl->DumpDisassembly(); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a32_jitstate.cpp b/externals/dynarmic/src/dynarmic/backend/x64/a32_jitstate.cpp new file mode 100644 index 0000000000..2e201c1cc4 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/a32_jitstate.cpp @@ -0,0 +1,208 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/x64/a32_jitstate.h" + +#include <mcl/assert.hpp> +#include <mcl/bit/bit_field.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/backend/x64/block_of_code.h" +#include "dynarmic/backend/x64/nzcv_util.h" +#include "dynarmic/frontend/A32/a32_location_descriptor.h" + +namespace Dynarmic::Backend::X64 { + +/** + * CPSR Bits + * ========= + * + * ARM CPSR flags + * -------------- + * N bit 31 Negative flag + * Z bit 30 Zero flag + * C bit 29 Carry flag + * V bit 28 oVerflow flag + * Q bit 27 Saturation flag + * IT[1:0] bits 25-26 If-Then execution state (lower 2 bits) + * J bit 24 Jazelle instruction set flag + * GE bits 16-19 Greater than or Equal flags + * IT[7:2] bits 10-15 If-Then execution state (upper 6 bits) + * E bit 9 Data Endianness flag + * A bit 8 Disable imprecise Aborts + * I bit 7 Disable IRQ interrupts + * F bit 6 Disable FIQ interrupts + * T bit 5 Thumb instruction set flag + * M bits 0-4 Processor Mode bits + * + * x64 LAHF+SETO flags + * ------------------- + * SF bit 15 Sign flag + * ZF bit 14 Zero flag + * AF bit 12 Auxiliary flag + * PF bit 10 Parity flag + * CF bit 8 Carry flag + * OF bit 0 Overflow flag + */ + +u32 A32JitState::Cpsr() const { + DEBUG_ASSERT((cpsr_q & ~1) == 0); + DEBUG_ASSERT((cpsr_jaifm & ~0x010001DF) == 0); + + u32 cpsr = 0; + + // NZCV flags + cpsr |= NZCV::FromX64(cpsr_nzcv); + // Q flag + cpsr |= cpsr_q ? 1 << 27 : 0; + // GE flags + cpsr |= mcl::bit::get_bit<31>(cpsr_ge) ? 1 << 19 : 0; + cpsr |= mcl::bit::get_bit<23>(cpsr_ge) ? 1 << 18 : 0; + cpsr |= mcl::bit::get_bit<15>(cpsr_ge) ? 1 << 17 : 0; + cpsr |= mcl::bit::get_bit<7>(cpsr_ge) ? 1 << 16 : 0; + // E flag, T flag + cpsr |= mcl::bit::get_bit<1>(upper_location_descriptor) ? 1 << 9 : 0; + cpsr |= mcl::bit::get_bit<0>(upper_location_descriptor) ? 1 << 5 : 0; + // IT state + cpsr |= static_cast<u32>(upper_location_descriptor & 0b11111100'00000000); + cpsr |= static_cast<u32>(upper_location_descriptor & 0b00000011'00000000) << 17; + // Other flags + cpsr |= cpsr_jaifm; + + return cpsr; +} + +void A32JitState::SetCpsr(u32 cpsr) { + // NZCV flags + cpsr_nzcv = NZCV::ToX64(cpsr); + // Q flag + cpsr_q = mcl::bit::get_bit<27>(cpsr) ? 1 : 0; + // GE flags + cpsr_ge = 0; + cpsr_ge |= mcl::bit::get_bit<19>(cpsr) ? 0xFF000000 : 0; + cpsr_ge |= mcl::bit::get_bit<18>(cpsr) ? 0x00FF0000 : 0; + cpsr_ge |= mcl::bit::get_bit<17>(cpsr) ? 0x0000FF00 : 0; + cpsr_ge |= mcl::bit::get_bit<16>(cpsr) ? 0x000000FF : 0; + + upper_location_descriptor &= 0xFFFF0000; + // E flag, T flag + upper_location_descriptor |= mcl::bit::get_bit<9>(cpsr) ? 2 : 0; + upper_location_descriptor |= mcl::bit::get_bit<5>(cpsr) ? 
1 : 0; + // IT state + upper_location_descriptor |= (cpsr >> 0) & 0b11111100'00000000; + upper_location_descriptor |= (cpsr >> 17) & 0b00000011'00000000; + + // Other flags + cpsr_jaifm = cpsr & 0x010001DF; +} + +void A32JitState::ResetRSB() { + rsb_location_descriptors.fill(0xFFFFFFFFFFFFFFFFull); + rsb_codeptrs.fill(0); +} + +/** + * Comparing MXCSR and FPSCR + * ========================= + * + * SSE MXCSR exception flags + * ------------------------- + * PE bit 5 Precision Flag + * UE bit 4 Underflow Flag + * OE bit 3 Overflow Flag + * ZE bit 2 Divide By Zero Flag + * DE bit 1 Denormal Flag // Appears to only be set when MXCSR.DAZ = 0 + * IE bit 0 Invalid Operation Flag + * + * VFP FPSCR cumulative exception bits + * ----------------------------------- + * IDC bit 7 Input Denormal cumulative exception bit // Only ever set when FPSCR.FTZ = 1 + * IXC bit 4 Inexact cumulative exception bit + * UFC bit 3 Underflow cumulative exception bit + * OFC bit 2 Overflow cumulative exception bit + * DZC bit 1 Division by Zero cumulative exception bit + * IOC bit 0 Invalid Operation cumulative exception bit + * + * SSE MSCSR exception masks + * ------------------------- + * PM bit 12 Precision Mask + * UM bit 11 Underflow Mask + * OM bit 10 Overflow Mask + * ZM bit 9 Divide By Zero Mask + * DM bit 8 Denormal Mask + * IM bit 7 Invalid Operation Mask + * + * VFP FPSCR exception trap enables + * -------------------------------- + * IDE bit 15 Input Denormal exception trap enable + * IXE bit 12 Inexact exception trap enable + * UFE bit 11 Underflow exception trap enable + * OFE bit 10 Overflow exception trap enable + * DZE bit 9 Division by Zero exception trap enable + * IOE bit 8 Invalid Operation exception trap enable + * + * SSE MXCSR mode bits + * ------------------- + * FZ bit 15 Flush To Zero + * DAZ bit 6 Denormals Are Zero + * RN bits 13-14 Round to {0 = Nearest, 1 = Negative, 2 = Positive, 3 = Zero} + * + * VFP FPSCR mode bits + * ------------------- + * AHP bit 26 Alternate half-precision + * DN bit 25 Default NaN + * FZ bit 24 Flush to Zero + * RMode bits 22-23 Round to {0 = Nearest, 1 = Positive, 2 = Negative, 3 = Zero} + * Stride bits 20-21 Vector stride + * Len bits 16-18 Vector length + */ + +// NZCV; QC (ASIMD only), AHP; DN, FZ, RMode, Stride; SBZP; Len; trap enables; cumulative bits +constexpr u32 FPSCR_MODE_MASK = A32::LocationDescriptor::FPSCR_MODE_MASK; +constexpr u32 FPSCR_NZCV_MASK = 0xF0000000; + +u32 A32JitState::Fpscr() const { + DEBUG_ASSERT((fpsr_nzcv & ~FPSCR_NZCV_MASK) == 0); + + const u32 fpcr_mode = static_cast<u32>(upper_location_descriptor) & FPSCR_MODE_MASK; + const u32 mxcsr = guest_MXCSR | asimd_MXCSR; + + u32 FPSCR = fpcr_mode | fpsr_nzcv; + FPSCR |= (mxcsr & 0b0000000000001); // IOC = IE + FPSCR |= (mxcsr & 0b0000000111100) >> 1; // IXC, UFC, OFC, DZC = PE, UE, OE, ZE + FPSCR |= fpsr_exc; + FPSCR |= fpsr_qc != 0 ? 1 << 27 : 0; + + return FPSCR; +} + +void A32JitState::SetFpscr(u32 FPSCR) { + // Ensure that only upper half of upper_location_descriptor is used for FPSCR bits. 
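Fpscr() above folds the sticky MXCSR exception bits back into their FPSCR positions: IE maps directly onto IOC, and PE/UE/OE/ZE land one bit lower as IXC/UFC/OFC/DZC, exactly as the comparison table describes. A small worked check of that shift, written standalone for illustration:

    #include <cassert>
    #include <cstdint>

    // MXCSR: IE=0, DE=1, ZE=2, OE=3, UE=4, PE=5.  FPSCR: IOC=0, DZC=1, OFC=2, UFC=3, IXC=4.
    constexpr std::uint32_t MxcsrExceptionsToFpscr(std::uint32_t mxcsr) {
        return (mxcsr & 0b1u) | ((mxcsr & 0b111100u) >> 1);
    }

    int main() {
        assert(MxcsrExceptionsToFpscr(1u << 0) == (1u << 0));  // IE -> IOC
        assert(MxcsrExceptionsToFpscr(1u << 2) == (1u << 1));  // ZE -> DZC
        assert(MxcsrExceptionsToFpscr(1u << 5) == (1u << 4));  // PE -> IXC
        assert(MxcsrExceptionsToFpscr(1u << 1) == 0u);         // DE has no direct bit here
    }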
+ static_assert((FPSCR_MODE_MASK & 0xFFFF0000) == FPSCR_MODE_MASK); + + upper_location_descriptor &= 0x0000FFFF; + upper_location_descriptor |= FPSCR & FPSCR_MODE_MASK; + + fpsr_nzcv = FPSCR & FPSCR_NZCV_MASK; + fpsr_qc = (FPSCR >> 27) & 1; + + guest_MXCSR = 0x00001f80; + asimd_MXCSR = 0x00009fc0; + + // RMode + const std::array<u32, 4> MXCSR_RMode{0x0, 0x4000, 0x2000, 0x6000}; + guest_MXCSR |= MXCSR_RMode[(FPSCR >> 22) & 0x3]; + + // Cumulative flags IDC, IOC, IXC, UFC, OFC, DZC + fpsr_exc = FPSCR & 0x9F; + + if (mcl::bit::get_bit<24>(FPSCR)) { + // VFP Flush to Zero + guest_MXCSR |= (1 << 15); // SSE Flush to Zero + guest_MXCSR |= (1 << 6); // SSE Denormals are Zero + } +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a32_jitstate.h b/externals/dynarmic/src/dynarmic/backend/x64/a32_jitstate.h new file mode 100644 index 0000000000..cc13abf8e2 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/a32_jitstate.h @@ -0,0 +1,97 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <array> + +#include <mcl/stdint.hpp> + +namespace Dynarmic::Backend::X64 { + +class BlockOfCode; + +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable : 4324) // Structure was padded due to alignment specifier +#endif + +struct A32JitState { + using ProgramCounterType = u32; + + A32JitState() { ResetRSB(); } + + std::array<u32, 16> Reg{}; // Current register file. + // TODO: Mode-specific register sets unimplemented. + + u32 upper_location_descriptor = 0; + + u32 cpsr_ge = 0; + u32 cpsr_q = 0; + u32 cpsr_nzcv = 0; + u32 cpsr_jaifm = 0; + u32 Cpsr() const; + void SetCpsr(u32 cpsr); + + alignas(16) std::array<u32, 64> ExtReg{}; // Extension registers. + + // For internal use (See: BlockOfCode::RunCode) + u32 guest_MXCSR = 0x00001f80; + u32 asimd_MXCSR = 0x00009fc0; + volatile u32 halt_reason = 0; + + // Exclusive state + u32 exclusive_state = 0; + + static constexpr size_t RSBSize = 8; // MUST be a power of 2. 
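SetFpscr above converts the guest rounding mode with a four-entry lookup: the two encodings agree on round-to-nearest and round-toward-zero, but the toward-plus-infinity and toward-minus-infinity values are swapped between FPSCR.RMode and MXCSR.RC. A standalone restatement of that table, with bit positions taken from the comparison comment above:

    #include <array>
    #include <cassert>
    #include <cstdint>

    // FPSCR.RMode (bits 22-23): 0 = nearest, 1 = toward +inf, 2 = toward -inf, 3 = toward zero.
    // MXCSR.RC    (bits 13-14): 0 = nearest, 1 = toward -inf, 2 = toward +inf, 3 = toward zero.
    constexpr std::array<std::uint32_t, 4> kMxcsrRc{0x0000u, 0x4000u, 0x2000u, 0x6000u};

    constexpr std::uint32_t FpscrRModeToMxcsr(std::uint32_t fpscr) {
        return kMxcsrRc[(fpscr >> 22) & 0x3];
    }

    int main() {
        assert(FpscrRModeToMxcsr(0u << 22) == 0x0000u);  // nearest -> nearest
        assert(FpscrRModeToMxcsr(1u << 22) == 0x4000u);  // +inf    -> RC = 10 (bit 14)
        assert(FpscrRModeToMxcsr(2u << 22) == 0x2000u);  // -inf    -> RC = 01 (bit 13)
        assert(FpscrRModeToMxcsr(3u << 22) == 0x6000u);  // zero    -> zero
    }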
+ static constexpr size_t RSBPtrMask = RSBSize - 1; + u32 rsb_ptr = 0; + std::array<u64, RSBSize> rsb_location_descriptors; + std::array<u64, RSBSize> rsb_codeptrs; + void ResetRSB(); + + u32 fpsr_exc = 0; + u32 fpsr_qc = 0; + u32 fpsr_nzcv = 0; + u32 Fpscr() const; + void SetFpscr(u32 FPSCR); + + u64 GetUniqueHash() const noexcept { + return (static_cast<u64>(upper_location_descriptor) << 32) | (static_cast<u64>(Reg[15])); + } + + void TransferJitState(const A32JitState& src, bool reset_rsb) { + Reg = src.Reg; + upper_location_descriptor = src.upper_location_descriptor; + cpsr_ge = src.cpsr_ge; + cpsr_q = src.cpsr_q; + cpsr_nzcv = src.cpsr_nzcv; + cpsr_jaifm = src.cpsr_jaifm; + ExtReg = src.ExtReg; + guest_MXCSR = src.guest_MXCSR; + asimd_MXCSR = src.asimd_MXCSR; + fpsr_exc = src.fpsr_exc; + fpsr_qc = src.fpsr_qc; + fpsr_nzcv = src.fpsr_nzcv; + + exclusive_state = 0; + + if (reset_rsb) { + ResetRSB(); + } else { + rsb_ptr = src.rsb_ptr; + rsb_location_descriptors = src.rsb_location_descriptors; + rsb_codeptrs = src.rsb_codeptrs; + } + } +}; + +#ifdef _MSC_VER +# pragma warning(pop) +#endif + +using CodePtr = const void*; + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp b/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp new file mode 100644 index 0000000000..cec781380b --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp @@ -0,0 +1,763 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/x64/a64_emit_x64.h" + +#include <fmt/format.h> +#include <fmt/ostream.h> +#include <mcl/assert.hpp> +#include <mcl/scope_exit.hpp> +#include <mcl/stdint.hpp> +#include <mcl/type_traits/integer_of_size.hpp> + +#include "dynarmic/backend/x64/a64_jitstate.h" +#include "dynarmic/backend/x64/abi.h" +#include "dynarmic/backend/x64/block_of_code.h" +#include "dynarmic/backend/x64/devirtualize.h" +#include "dynarmic/backend/x64/emit_x64.h" +#include "dynarmic/backend/x64/nzcv_util.h" +#include "dynarmic/backend/x64/perf_map.h" +#include "dynarmic/backend/x64/stack_layout.h" +#include "dynarmic/frontend/A64/a64_location_descriptor.h" +#include "dynarmic/frontend/A64/a64_types.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/cond.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" + +// TODO: Have ARM flags in host flags and not have them use up GPR registers unless necessary. +// TODO: Actually implement that proper instruction selector you've always wanted to sweetheart. + +namespace Dynarmic::Backend::X64 { + +using namespace Xbyak::util; + +A64EmitContext::A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block) + : EmitContext(reg_alloc, block), conf(conf) {} + +A64::LocationDescriptor A64EmitContext::Location() const { + return A64::LocationDescriptor{block.Location()}; +} + +bool A64EmitContext::IsSingleStep() const { + return Location().SingleStepping(); +} + +FP::FPCR A64EmitContext::FPCR(bool fpcr_controlled) const { + return fpcr_controlled ? 
Location().FPCR() : Location().FPCR().ASIMDStandardValue(); +} + +A64EmitX64::A64EmitX64(BlockOfCode& code, A64::UserConfig conf, A64::Jit* jit_interface) + : EmitX64(code), conf(conf), jit_interface{jit_interface} { + GenMemory128Accessors(); + GenFastmemFallbacks(); + GenTerminalHandlers(); + code.PreludeComplete(); + ClearFastDispatchTable(); + + exception_handler.SetFastmemCallback([this](u64 rip_) { + return FastmemCallback(rip_); + }); +} + +A64EmitX64::~A64EmitX64() = default; + +A64EmitX64::BlockDescriptor A64EmitX64::Emit(IR::Block& block) { + if (conf.very_verbose_debugging_output) { + std::puts(IR::DumpBlock(block).c_str()); + } + + code.EnableWriting(); + SCOPE_EXIT { + code.DisableWriting(); + }; + + const std::vector<HostLoc> gpr_order = [this] { + std::vector<HostLoc> gprs{any_gpr}; + if (conf.page_table) { + gprs.erase(std::find(gprs.begin(), gprs.end(), HostLoc::R14)); + } + if (conf.fastmem_pointer) { + gprs.erase(std::find(gprs.begin(), gprs.end(), HostLoc::R13)); + } + return gprs; + }(); + + RegAlloc reg_alloc{code, gpr_order, any_xmm}; + A64EmitContext ctx{conf, reg_alloc, block}; + + // Start emitting. + code.align(); + const u8* const entrypoint = code.getCurr(); + + ASSERT(block.GetCondition() == IR::Cond::AL); + + for (auto iter = block.begin(); iter != block.end(); ++iter) { + IR::Inst* inst = &*iter; + + // Call the relevant Emit* member function. + switch (inst->GetOpcode()) { +#define OPCODE(name, type, ...) \ + case IR::Opcode::name: \ + A64EmitX64::Emit##name(ctx, inst); \ + break; +#define A32OPC(...) +#define A64OPC(name, type, ...) \ + case IR::Opcode::A64##name: \ + A64EmitX64::EmitA64##name(ctx, inst); \ + break; +#include "dynarmic/ir/opcodes.inc" +#undef OPCODE +#undef A32OPC +#undef A64OPC + + default: + ASSERT_MSG(false, "Invalid opcode: {}", inst->GetOpcode()); + break; + } + + ctx.reg_alloc.EndOfAllocScope(); + + if (conf.very_verbose_debugging_output) { + EmitVerboseDebuggingOutput(reg_alloc); + } + } + + reg_alloc.AssertNoMoreUses(); + + if (conf.enable_cycle_counting) { + EmitAddCycles(block.CycleCount()); + } + EmitX64::EmitTerminal(block.GetTerminal(), ctx.Location().SetSingleStepping(false), ctx.IsSingleStep()); + code.int3(); + + for (auto& deferred_emit : ctx.deferred_emits) { + deferred_emit(); + } + code.int3(); + + const size_t size = static_cast<size_t>(code.getCurr() - entrypoint); + + const A64::LocationDescriptor descriptor{block.Location()}; + const A64::LocationDescriptor end_location{block.EndLocation()}; + + const auto range = boost::icl::discrete_interval<u64>::closed(descriptor.PC(), end_location.PC() - 1); + block_ranges.AddRange(range, descriptor); + + return RegisterBlock(descriptor, entrypoint, size); +} + +void A64EmitX64::ClearCache() { + EmitX64::ClearCache(); + block_ranges.ClearCache(); + ClearFastDispatchTable(); + fastmem_patch_info.clear(); +} + +void A64EmitX64::InvalidateCacheRanges(const boost::icl::interval_set<u64>& ranges) { + InvalidateBasicBlocks(block_ranges.InvalidateRanges(ranges)); +} + +void A64EmitX64::ClearFastDispatchTable() { + if (conf.HasOptimization(OptimizationFlag::FastDispatch)) { + fast_dispatch_table.fill({}); + } +} + +void A64EmitX64::GenTerminalHandlers() { + // PC ends up in rbp, location_descriptor ends up in rbx + const auto calculate_location_descriptor = [this] { + // This calculation has to match up with A64::LocationDescriptor::UniqueHash + // TODO: Optimization is available here based on known state of fpcr. 
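+        // Hash layout as computed below:
+        //   rbx = (pc & pc_mask) | ((fpcr & fpcr_mask) << fpcr_shift)
+        // i.e. the masked PC in the low bits with the masked FPCR field packed above it;
+        // this is the value the RSB and fast-dispatch lookups compare against.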
+ code.mov(rbp, qword[r15 + offsetof(A64JitState, pc)]); + code.mov(rcx, A64::LocationDescriptor::pc_mask); + code.and_(rcx, rbp); + code.mov(ebx, dword[r15 + offsetof(A64JitState, fpcr)]); + code.and_(ebx, A64::LocationDescriptor::fpcr_mask); + code.shl(rbx, A64::LocationDescriptor::fpcr_shift); + code.or_(rbx, rcx); + }; + + Xbyak::Label fast_dispatch_cache_miss, rsb_cache_miss; + + code.align(); + terminal_handler_pop_rsb_hint = code.getCurr<const void*>(); + calculate_location_descriptor(); + code.mov(eax, dword[r15 + offsetof(A64JitState, rsb_ptr)]); + code.sub(eax, 1); + code.and_(eax, u32(A64JitState::RSBPtrMask)); + code.mov(dword[r15 + offsetof(A64JitState, rsb_ptr)], eax); + code.cmp(rbx, qword[r15 + offsetof(A64JitState, rsb_location_descriptors) + rax * sizeof(u64)]); + if (conf.HasOptimization(OptimizationFlag::FastDispatch)) { + code.jne(rsb_cache_miss); + } else { + code.jne(code.GetReturnFromRunCodeAddress()); + } + code.mov(rax, qword[r15 + offsetof(A64JitState, rsb_codeptrs) + rax * sizeof(u64)]); + code.jmp(rax); + PerfMapRegister(terminal_handler_pop_rsb_hint, code.getCurr(), "a64_terminal_handler_pop_rsb_hint"); + + if (conf.HasOptimization(OptimizationFlag::FastDispatch)) { + code.align(); + terminal_handler_fast_dispatch_hint = code.getCurr<const void*>(); + calculate_location_descriptor(); + code.L(rsb_cache_miss); + code.mov(r12, reinterpret_cast<u64>(fast_dispatch_table.data())); + code.mov(rbp, rbx); + if (code.HasHostFeature(HostFeature::SSE42)) { + code.crc32(rbp, r12); + } + code.and_(ebp, fast_dispatch_table_mask); + code.lea(rbp, ptr[r12 + rbp]); + code.cmp(rbx, qword[rbp + offsetof(FastDispatchEntry, location_descriptor)]); + code.jne(fast_dispatch_cache_miss); + code.jmp(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)]); + code.L(fast_dispatch_cache_miss); + code.mov(qword[rbp + offsetof(FastDispatchEntry, location_descriptor)], rbx); + code.LookupBlock(); + code.mov(ptr[rbp + offsetof(FastDispatchEntry, code_ptr)], rax); + code.jmp(rax); + PerfMapRegister(terminal_handler_fast_dispatch_hint, code.getCurr(), "a64_terminal_handler_fast_dispatch_hint"); + + code.align(); + fast_dispatch_table_lookup = code.getCurr<FastDispatchEntry& (*)(u64)>(); + code.mov(code.ABI_PARAM2, reinterpret_cast<u64>(fast_dispatch_table.data())); + if (code.HasHostFeature(HostFeature::SSE42)) { + code.crc32(code.ABI_PARAM1, code.ABI_PARAM2); + } + code.and_(code.ABI_PARAM1.cvt32(), fast_dispatch_table_mask); + code.lea(code.ABI_RETURN, code.ptr[code.ABI_PARAM2 + code.ABI_PARAM1]); + code.ret(); + PerfMapRegister(fast_dispatch_table_lookup, code.getCurr(), "a64_fast_dispatch_table_lookup"); + } +} + +void A64EmitX64::EmitPushRSB(EmitContext& ctx, IR::Inst* inst) { + if (!conf.HasOptimization(OptimizationFlag::ReturnStackBuffer)) { + return; + } + + EmitX64::EmitPushRSB(ctx, inst); +} + +void A64EmitX64::EmitA64SetCheckBit(A64EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg8 to_store = ctx.reg_alloc.UseGpr(args[0]).cvt8(); + code.mov(code.byte[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, check_bit)], to_store); +} + +void A64EmitX64::EmitA64GetCFlag(A64EmitContext& ctx, IR::Inst* inst) { + const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); + code.mov(result, dword[r15 + offsetof(A64JitState, cpsr_nzcv)]); + code.shr(result, NZCV::x64_c_flag_bit); + code.and_(result, 1); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A64EmitX64::EmitA64GetNZCVRaw(A64EmitContext& ctx, IR::Inst* inst) { + const Xbyak::Reg32 
nzcv_raw = ctx.reg_alloc.ScratchGpr().cvt32(); + + code.mov(nzcv_raw, dword[r15 + offsetof(A64JitState, cpsr_nzcv)]); + + if (code.HasHostFeature(HostFeature::FastBMI2)) { + const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32(); + code.mov(tmp, NZCV::x64_mask); + code.pext(nzcv_raw, nzcv_raw, tmp); + code.shl(nzcv_raw, 28); + } else { + code.and_(nzcv_raw, NZCV::x64_mask); + code.imul(nzcv_raw, nzcv_raw, NZCV::from_x64_multiplier); + code.and_(nzcv_raw, NZCV::arm_mask); + } + + ctx.reg_alloc.DefineValue(inst, nzcv_raw); +} + +void A64EmitX64::EmitA64SetNZCVRaw(A64EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg32 nzcv_raw = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + + code.shr(nzcv_raw, 28); + if (code.HasHostFeature(HostFeature::FastBMI2)) { + const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32(); + code.mov(tmp, NZCV::x64_mask); + code.pdep(nzcv_raw, nzcv_raw, tmp); + } else { + code.imul(nzcv_raw, nzcv_raw, NZCV::to_x64_multiplier); + code.and_(nzcv_raw, NZCV::x64_mask); + } + code.mov(dword[r15 + offsetof(A64JitState, cpsr_nzcv)], nzcv_raw); +} + +void A64EmitX64::EmitA64SetNZCV(A64EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg32 to_store = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + code.mov(dword[r15 + offsetof(A64JitState, cpsr_nzcv)], to_store); +} + +void A64EmitX64::EmitA64GetW(A64EmitContext& ctx, IR::Inst* inst) { + const A64::Reg reg = inst->GetArg(0).GetA64RegRef(); + const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); + + code.mov(result, dword[r15 + offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)]); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A64EmitX64::EmitA64GetX(A64EmitContext& ctx, IR::Inst* inst) { + const A64::Reg reg = inst->GetArg(0).GetA64RegRef(); + const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr(); + + code.mov(result, qword[r15 + offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)]); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A64EmitX64::EmitA64GetS(A64EmitContext& ctx, IR::Inst* inst) { + const A64::Vec vec = inst->GetArg(0).GetA64VecRef(); + const auto addr = qword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)]; + + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + code.movd(result, addr); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A64EmitX64::EmitA64GetD(A64EmitContext& ctx, IR::Inst* inst) { + const A64::Vec vec = inst->GetArg(0).GetA64VecRef(); + const auto addr = qword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)]; + + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + code.movq(result, addr); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A64EmitX64::EmitA64GetQ(A64EmitContext& ctx, IR::Inst* inst) { + const A64::Vec vec = inst->GetArg(0).GetA64VecRef(); + const auto addr = xword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)]; + + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + code.movaps(result, addr); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A64EmitX64::EmitA64GetSP(A64EmitContext& ctx, IR::Inst* inst) { + const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr(); + code.mov(result, qword[r15 + offsetof(A64JitState, sp)]); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A64EmitX64::EmitA64GetFPCR(A64EmitContext& ctx, IR::Inst* inst) { + const Xbyak::Reg32 result = 
ctx.reg_alloc.ScratchGpr().cvt32(); + code.mov(result, dword[r15 + offsetof(A64JitState, fpcr)]); + ctx.reg_alloc.DefineValue(inst, result); +} + +static u32 GetFPSRImpl(A64JitState* jit_state) { + return jit_state->GetFpsr(); +} + +void A64EmitX64::EmitA64GetFPSR(A64EmitContext& ctx, IR::Inst* inst) { + ctx.reg_alloc.HostCall(inst); + code.mov(code.ABI_PARAM1, code.r15); + code.stmxcsr(code.dword[code.r15 + offsetof(A64JitState, guest_MXCSR)]); + code.CallFunction(GetFPSRImpl); +} + +void A64EmitX64::EmitA64SetW(A64EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const A64::Reg reg = inst->GetArg(0).GetA64RegRef(); + const auto addr = qword[r15 + offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)]; + if (args[1].FitsInImmediateS32()) { + code.mov(addr, args[1].GetImmediateS32()); + } else { + // TODO: zext tracking, xmm variant + const Xbyak::Reg64 to_store = ctx.reg_alloc.UseScratchGpr(args[1]); + code.mov(to_store.cvt32(), to_store.cvt32()); + code.mov(addr, to_store); + } +} + +void A64EmitX64::EmitA64SetX(A64EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const A64::Reg reg = inst->GetArg(0).GetA64RegRef(); + const auto addr = qword[r15 + offsetof(A64JitState, reg) + sizeof(u64) * static_cast<size_t>(reg)]; + if (args[1].FitsInImmediateS32()) { + code.mov(addr, args[1].GetImmediateS32()); + } else if (args[1].IsInXmm()) { + const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]); + code.movq(addr, to_store); + } else { + const Xbyak::Reg64 to_store = ctx.reg_alloc.UseGpr(args[1]); + code.mov(addr, to_store); + } +} + +void A64EmitX64::EmitA64SetS(A64EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const A64::Vec vec = inst->GetArg(0).GetA64VecRef(); + const auto addr = xword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)]; + + const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + // TODO: Optimize + code.pxor(tmp, tmp); + code.movss(tmp, to_store); + code.movaps(addr, tmp); +} + +void A64EmitX64::EmitA64SetD(A64EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const A64::Vec vec = inst->GetArg(0).GetA64VecRef(); + const auto addr = xword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)]; + + const Xbyak::Xmm to_store = ctx.reg_alloc.UseScratchXmm(args[1]); + code.movq(to_store, to_store); // TODO: Remove when able + code.movaps(addr, to_store); +} + +void A64EmitX64::EmitA64SetQ(A64EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const A64::Vec vec = inst->GetArg(0).GetA64VecRef(); + const auto addr = xword[r15 + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)]; + + const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[1]); + code.movaps(addr, to_store); +} + +void A64EmitX64::EmitA64SetSP(A64EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto addr = qword[r15 + offsetof(A64JitState, sp)]; + if (args[0].FitsInImmediateS32()) { + code.mov(addr, args[0].GetImmediateS32()); + } else if (args[0].IsInXmm()) { + const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[0]); + code.movq(addr, to_store); + } else { + const Xbyak::Reg64 to_store = ctx.reg_alloc.UseGpr(args[0]); + code.mov(addr, to_store); + } +} + +static void SetFPCRImpl(A64JitState* jit_state, u32 value) { + 
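+    // Out-of-line helper called from JIT code; EmitA64SetFPCR below reloads guest_MXCSR
+    // with ldmxcsr once this returns, so the MXCSR state derived from the new FPCR
+    // (e.g. the rounding mode) takes effect on the host.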
jit_state->SetFpcr(value); +} + +void A64EmitX64::EmitA64SetFPCR(A64EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.HostCall(nullptr, {}, args[0]); + code.mov(code.ABI_PARAM1, code.r15); + code.CallFunction(SetFPCRImpl); + code.ldmxcsr(code.dword[code.r15 + offsetof(A64JitState, guest_MXCSR)]); +} + +static void SetFPSRImpl(A64JitState* jit_state, u32 value) { + jit_state->SetFpsr(value); +} + +void A64EmitX64::EmitA64SetFPSR(A64EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.HostCall(nullptr, {}, args[0]); + code.mov(code.ABI_PARAM1, code.r15); + code.CallFunction(SetFPSRImpl); + code.ldmxcsr(code.dword[code.r15 + offsetof(A64JitState, guest_MXCSR)]); +} + +void A64EmitX64::EmitA64SetPC(A64EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto addr = qword[r15 + offsetof(A64JitState, pc)]; + if (args[0].FitsInImmediateS32()) { + code.mov(addr, args[0].GetImmediateS32()); + } else if (args[0].IsInXmm()) { + const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(args[0]); + code.movq(addr, to_store); + } else { + const Xbyak::Reg64 to_store = ctx.reg_alloc.UseGpr(args[0]); + code.mov(addr, to_store); + } +} + +void A64EmitX64::EmitA64CallSupervisor(A64EmitContext& ctx, IR::Inst* inst) { + ctx.reg_alloc.HostCall(nullptr); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(args[0].IsImmediate()); + const u32 imm = args[0].GetImmediateU32(); + Devirtualize<&A64::UserCallbacks::CallSVC>(conf.callbacks).EmitCall(code, [&](RegList param) { + code.mov(param[0], imm); + }); + // The kernel would have to execute ERET to get here, which would clear exclusive state. + code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); +} + +void A64EmitX64::EmitA64ExceptionRaised(A64EmitContext& ctx, IR::Inst* inst) { + ctx.reg_alloc.HostCall(nullptr); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(args[0].IsImmediate() && args[1].IsImmediate()); + const u64 pc = args[0].GetImmediateU64(); + const u64 exception = args[1].GetImmediateU64(); + Devirtualize<&A64::UserCallbacks::ExceptionRaised>(conf.callbacks).EmitCall(code, [&](RegList param) { + code.mov(param[0], pc); + code.mov(param[1], exception); + }); +} + +void A64EmitX64::EmitA64DataCacheOperationRaised(A64EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.HostCall(nullptr, {}, args[1], args[2]); + Devirtualize<&A64::UserCallbacks::DataCacheOperationRaised>(conf.callbacks).EmitCall(code); +} + +void A64EmitX64::EmitA64InstructionCacheOperationRaised(A64EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.HostCall(nullptr, {}, args[0], args[1]); + Devirtualize<&A64::UserCallbacks::InstructionCacheOperationRaised>(conf.callbacks).EmitCall(code); +} + +void A64EmitX64::EmitA64DataSynchronizationBarrier(A64EmitContext&, IR::Inst*) { + code.mfence(); + code.lfence(); +} + +void A64EmitX64::EmitA64DataMemoryBarrier(A64EmitContext&, IR::Inst*) { + code.mfence(); +} + +void A64EmitX64::EmitA64InstructionSynchronizationBarrier(A64EmitContext& ctx, IR::Inst*) { + if (!conf.hook_isb) { + return; + } + + ctx.reg_alloc.HostCall(nullptr); + Devirtualize<&A64::UserCallbacks::InstructionSynchronizationBarrierRaised>(conf.callbacks).EmitCall(code); +} + +void A64EmitX64::EmitA64GetCNTFRQ(A64EmitContext& ctx, IR::Inst* inst) { + const Xbyak::Reg32 result = 
ctx.reg_alloc.ScratchGpr().cvt32(); + code.mov(result, conf.cntfrq_el0); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A64EmitX64::EmitA64GetCNTPCT(A64EmitContext& ctx, IR::Inst* inst) { + ctx.reg_alloc.HostCall(inst); + if (!conf.wall_clock_cntpct) { + code.UpdateTicks(); + } + Devirtualize<&A64::UserCallbacks::GetCNTPCT>(conf.callbacks).EmitCall(code); +} + +void A64EmitX64::EmitA64GetCTR(A64EmitContext& ctx, IR::Inst* inst) { + const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); + code.mov(result, conf.ctr_el0); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A64EmitX64::EmitA64GetDCZID(A64EmitContext& ctx, IR::Inst* inst) { + const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); + code.mov(result, conf.dczid_el0); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A64EmitX64::EmitA64GetTPIDR(A64EmitContext& ctx, IR::Inst* inst) { + const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr(); + if (conf.tpidr_el0) { + code.mov(result, u64(conf.tpidr_el0)); + code.mov(result, qword[result]); + } else { + code.xor_(result.cvt32(), result.cvt32()); + } + ctx.reg_alloc.DefineValue(inst, result); +} + +void A64EmitX64::EmitA64GetTPIDRRO(A64EmitContext& ctx, IR::Inst* inst) { + const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr(); + if (conf.tpidrro_el0) { + code.mov(result, u64(conf.tpidrro_el0)); + code.mov(result, qword[result]); + } else { + code.xor_(result.cvt32(), result.cvt32()); + } + ctx.reg_alloc.DefineValue(inst, result); +} + +void A64EmitX64::EmitA64SetTPIDR(A64EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(args[0]); + const Xbyak::Reg64 addr = ctx.reg_alloc.ScratchGpr(); + if (conf.tpidr_el0) { + code.mov(addr, u64(conf.tpidr_el0)); + code.mov(qword[addr], value); + } +} + +std::string A64EmitX64::LocationDescriptorToFriendlyName(const IR::LocationDescriptor& ir_descriptor) const { + const A64::LocationDescriptor descriptor{ir_descriptor}; + return fmt::format("a64_{:016X}_fpcr{:08X}", + descriptor.PC(), + descriptor.FPCR().Value()); +} + +void A64EmitX64::EmitTerminalImpl(IR::Term::Interpret terminal, IR::LocationDescriptor, bool) { + code.SwitchMxcsrOnExit(); + Devirtualize<&A64::UserCallbacks::InterpreterFallback>(conf.callbacks).EmitCall(code, [&](RegList param) { + code.mov(param[0], A64::LocationDescriptor{terminal.next}.PC()); + code.mov(qword[r15 + offsetof(A64JitState, pc)], param[0]); + code.mov(param[1].cvt32(), terminal.num_instructions); + }); + code.ReturnFromRunCode(true); // TODO: Check cycles +} + +void A64EmitX64::EmitTerminalImpl(IR::Term::ReturnToDispatch, IR::LocationDescriptor, bool) { + code.ReturnFromRunCode(); +} + +void A64EmitX64::EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDescriptor, bool is_single_step) { + if (!conf.HasOptimization(OptimizationFlag::BlockLinking) || is_single_step) { + code.mov(rax, A64::LocationDescriptor{terminal.next}.PC()); + code.mov(qword[r15 + offsetof(A64JitState, pc)], rax); + code.ReturnFromRunCode(); + return; + } + + if (conf.enable_cycle_counting) { + code.cmp(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], 0); + + patch_information[terminal.next].jg.push_back(code.getCurr()); + if (const auto next_bb = GetBasicBlock(terminal.next)) { + EmitPatchJg(terminal.next, next_bb->entrypoint); + } else { + EmitPatchJg(terminal.next); + } + } else { + code.cmp(dword[r15 + offsetof(A64JitState, halt_reason)], 0); + + 
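+        // With cycle counting disabled, block linking is gated on halt_reason instead of the
+        // remaining-cycles counter: the patched jz below jumps to the next block's entrypoint
+        // while halt_reason is zero, and otherwise falls through to store the PC and return
+        // from run code.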
patch_information[terminal.next].jz.push_back(code.getCurr()); + if (const auto next_bb = GetBasicBlock(terminal.next)) { + EmitPatchJz(terminal.next, next_bb->entrypoint); + } else { + EmitPatchJz(terminal.next); + } + } + + code.mov(rax, A64::LocationDescriptor{terminal.next}.PC()); + code.mov(qword[r15 + offsetof(A64JitState, pc)], rax); + code.ForceReturnFromRunCode(); +} + +void A64EmitX64::EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor, bool is_single_step) { + if (!conf.HasOptimization(OptimizationFlag::BlockLinking) || is_single_step) { + code.mov(rax, A64::LocationDescriptor{terminal.next}.PC()); + code.mov(qword[r15 + offsetof(A64JitState, pc)], rax); + code.ReturnFromRunCode(); + return; + } + + patch_information[terminal.next].jmp.push_back(code.getCurr()); + if (auto next_bb = GetBasicBlock(terminal.next)) { + EmitPatchJmp(terminal.next, next_bb->entrypoint); + } else { + EmitPatchJmp(terminal.next); + } +} + +void A64EmitX64::EmitTerminalImpl(IR::Term::PopRSBHint, IR::LocationDescriptor, bool is_single_step) { + if (!conf.HasOptimization(OptimizationFlag::ReturnStackBuffer) || is_single_step) { + code.ReturnFromRunCode(); + return; + } + + code.jmp(terminal_handler_pop_rsb_hint); +} + +void A64EmitX64::EmitTerminalImpl(IR::Term::FastDispatchHint, IR::LocationDescriptor, bool is_single_step) { + if (!conf.HasOptimization(OptimizationFlag::FastDispatch) || is_single_step) { + code.ReturnFromRunCode(); + return; + } + + code.jmp(terminal_handler_fast_dispatch_hint); +} + +void A64EmitX64::EmitTerminalImpl(IR::Term::If terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + switch (terminal.if_) { + case IR::Cond::AL: + case IR::Cond::NV: + EmitTerminal(terminal.then_, initial_location, is_single_step); + break; + default: + Xbyak::Label pass = EmitCond(terminal.if_); + EmitTerminal(terminal.else_, initial_location, is_single_step); + code.L(pass); + EmitTerminal(terminal.then_, initial_location, is_single_step); + break; + } +} + +void A64EmitX64::EmitTerminalImpl(IR::Term::CheckBit terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + Xbyak::Label fail; + code.cmp(code.byte[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, check_bit)], u8(0)); + code.jz(fail); + EmitTerminal(terminal.then_, initial_location, is_single_step); + code.L(fail); + EmitTerminal(terminal.else_, initial_location, is_single_step); +} + +void A64EmitX64::EmitTerminalImpl(IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + code.cmp(dword[r15 + offsetof(A64JitState, halt_reason)], 0); + code.jne(code.GetForceReturnFromRunCodeAddress()); + EmitTerminal(terminal.else_, initial_location, is_single_step); +} + +void A64EmitX64::EmitPatchJg(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr) { + const CodePtr patch_location = code.getCurr(); + if (target_code_ptr) { + code.jg(target_code_ptr); + } else { + code.mov(rax, A64::LocationDescriptor{target_desc}.PC()); + code.mov(qword[r15 + offsetof(A64JitState, pc)], rax); + code.jg(code.GetReturnFromRunCodeAddress()); + } + code.EnsurePatchLocationSize(patch_location, 23); +} + +void A64EmitX64::EmitPatchJz(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr) { + const CodePtr patch_location = code.getCurr(); + if (target_code_ptr) { + code.jz(target_code_ptr); + } else { + code.mov(rax, A64::LocationDescriptor{target_desc}.PC()); + code.mov(qword[r15 + offsetof(A64JitState, pc)], rax); + 
code.jz(code.GetReturnFromRunCodeAddress()); + } + code.EnsurePatchLocationSize(patch_location, 23); +} + +void A64EmitX64::EmitPatchJmp(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr) { + const CodePtr patch_location = code.getCurr(); + if (target_code_ptr) { + code.jmp(target_code_ptr); + } else { + code.mov(rax, A64::LocationDescriptor{target_desc}.PC()); + code.mov(qword[r15 + offsetof(A64JitState, pc)], rax); + code.jmp(code.GetReturnFromRunCodeAddress()); + } + code.EnsurePatchLocationSize(patch_location, 22); +} + +void A64EmitX64::EmitPatchMovRcx(CodePtr target_code_ptr) { + if (!target_code_ptr) { + target_code_ptr = code.GetReturnFromRunCodeAddress(); + } + const CodePtr patch_location = code.getCurr(); + code.mov(code.rcx, reinterpret_cast<u64>(target_code_ptr)); + code.EnsurePatchLocationSize(patch_location, 10); +} + +void A64EmitX64::Unpatch(const IR::LocationDescriptor& location) { + EmitX64::Unpatch(location); + if (conf.HasOptimization(OptimizationFlag::FastDispatch)) { + code.DisableWriting(); + (*fast_dispatch_table_lookup)(location.Value()) = {}; + code.EnableWriting(); + } +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h b/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h new file mode 100644 index 0000000000..ed49315512 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h @@ -0,0 +1,144 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <array> +#include <map> +#include <optional> +#include <tuple> + +#include "dynarmic/backend/block_range_information.h" +#include "dynarmic/backend/x64/a64_jitstate.h" +#include "dynarmic/backend/x64/emit_x64.h" +#include "dynarmic/frontend/A64/a64_location_descriptor.h" +#include "dynarmic/interface/A64/a64.h" +#include "dynarmic/interface/A64/config.h" +#include "dynarmic/ir/terminal.h" + +namespace Dynarmic::Backend::X64 { + +class RegAlloc; + +struct A64EmitContext final : public EmitContext { + A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block); + + A64::LocationDescriptor Location() const; + bool IsSingleStep() const; + FP::FPCR FPCR(bool fpcr_controlled = true) const override; + + bool HasOptimization(OptimizationFlag flag) const override { + return conf.HasOptimization(flag); + } + + const A64::UserConfig& conf; +}; + +class A64EmitX64 final : public EmitX64 { +public: + A64EmitX64(BlockOfCode& code, A64::UserConfig conf, A64::Jit* jit_interface); + ~A64EmitX64() override; + + /** + * Emit host machine code for a basic block with intermediate representation `block`. + * @note block is modified. 
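+     * @return Descriptor of the host code emitted for this block.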
+ */ + BlockDescriptor Emit(IR::Block& block); + + void ClearCache() override; + + void InvalidateCacheRanges(const boost::icl::interval_set<u64>& ranges); + +protected: + const A64::UserConfig conf; + A64::Jit* jit_interface; + BlockRangeInformation<u64> block_ranges; + + struct FastDispatchEntry { + u64 location_descriptor = 0xFFFF'FFFF'FFFF'FFFFull; + const void* code_ptr = nullptr; + }; + static_assert(sizeof(FastDispatchEntry) == 0x10); + static constexpr u64 fast_dispatch_table_mask = 0xFFFFF0; + static constexpr size_t fast_dispatch_table_size = 0x100000; + std::array<FastDispatchEntry, fast_dispatch_table_size> fast_dispatch_table; + void ClearFastDispatchTable(); + + void (*memory_read_128)(); + void (*memory_write_128)(); + void (*memory_exclusive_write_128)(); + void GenMemory128Accessors(); + + std::map<std::tuple<bool, size_t, int, int>, void (*)()> read_fallbacks; + std::map<std::tuple<bool, size_t, int, int>, void (*)()> write_fallbacks; + std::map<std::tuple<bool, size_t, int, int>, void (*)()> exclusive_write_fallbacks; + void GenFastmemFallbacks(); + + const void* terminal_handler_pop_rsb_hint; + const void* terminal_handler_fast_dispatch_hint = nullptr; + FastDispatchEntry& (*fast_dispatch_table_lookup)(u64) = nullptr; + void GenTerminalHandlers(); + + // Microinstruction emitters + void EmitPushRSB(EmitContext& ctx, IR::Inst* inst); +#define OPCODE(...) +#define A32OPC(...) +#define A64OPC(name, type, ...) void EmitA64##name(A64EmitContext& ctx, IR::Inst* inst); +#include "dynarmic/ir/opcodes.inc" +#undef OPCODE +#undef A32OPC +#undef A64OPC + + // Helpers + std::string LocationDescriptorToFriendlyName(const IR::LocationDescriptor&) const override; + + // Fastmem information + using DoNotFastmemMarker = std::tuple<IR::LocationDescriptor, unsigned>; + struct FastmemPatchInfo { + u64 resume_rip; + u64 callback; + DoNotFastmemMarker marker; + bool recompile; + }; + tsl::robin_map<u64, FastmemPatchInfo> fastmem_patch_info; + std::set<DoNotFastmemMarker> do_not_fastmem; + std::optional<DoNotFastmemMarker> ShouldFastmem(A64EmitContext& ctx, IR::Inst* inst) const; + FakeCall FastmemCallback(u64 rip); + + // Memory access helpers + void EmitCheckMemoryAbort(A64EmitContext& ctx, IR::Inst* inst, Xbyak::Label* end = nullptr); + template<std::size_t bitsize, auto callback> + void EmitMemoryRead(A64EmitContext& ctx, IR::Inst* inst); + template<std::size_t bitsize, auto callback> + void EmitMemoryWrite(A64EmitContext& ctx, IR::Inst* inst); + template<std::size_t bitsize, auto callback> + void EmitExclusiveReadMemory(A64EmitContext& ctx, IR::Inst* inst); + template<std::size_t bitsize, auto callback> + void EmitExclusiveWriteMemory(A64EmitContext& ctx, IR::Inst* inst); + template<std::size_t bitsize, auto callback> + void EmitExclusiveReadMemoryInline(A64EmitContext& ctx, IR::Inst* inst); + template<std::size_t bitsize, auto callback> + void EmitExclusiveWriteMemoryInline(A64EmitContext& ctx, IR::Inst* inst); + + // Terminal instruction emitters + void EmitTerminalImpl(IR::Term::Interpret terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::ReturnToDispatch terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void 
EmitTerminalImpl(IR::Term::PopRSBHint terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::FastDispatchHint terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::If terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::CheckBit terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + + // Patching + void Unpatch(const IR::LocationDescriptor& target_desc) override; + void EmitPatchJg(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) override; + void EmitPatchJz(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) override; + void EmitPatchJmp(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) override; + void EmitPatchMovRcx(CodePtr target_code_ptr = nullptr) override; +}; + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64_memory.cpp b/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64_memory.cpp new file mode 100644 index 0000000000..450b16d000 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64_memory.cpp @@ -0,0 +1,431 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <array> +#include <initializer_list> +#include <tuple> +#include <utility> + +#include <fmt/format.h> +#include <fmt/ostream.h> +#include <mcl/type_traits/integer_of_size.hpp> +#include <xbyak/xbyak.h> + +#include "dynarmic/backend/x64/a64_emit_x64.h" +#include "dynarmic/backend/x64/abi.h" +#include "dynarmic/backend/x64/devirtualize.h" +#include "dynarmic/backend/x64/emit_x64_memory.h" +#include "dynarmic/backend/x64/exclusive_monitor_friend.h" +#include "dynarmic/backend/x64/perf_map.h" +#include "dynarmic/common/spin_lock_x64.h" +#include "dynarmic/common/x64_disassemble.h" +#include "dynarmic/interface/exclusive_monitor.h" + +namespace Dynarmic::Backend::X64 { + +using namespace Xbyak::util; + +void A64EmitX64::GenMemory128Accessors() { + code.align(); + memory_read_128 = code.getCurr<void (*)()>(); +#ifdef _WIN32 + Devirtualize<&A64::UserCallbacks::MemoryRead128>(conf.callbacks).EmitCallWithReturnPointer(code, [&](Xbyak::Reg64 return_value_ptr, [[maybe_unused]] RegList args) { + code.mov(code.ABI_PARAM3, code.ABI_PARAM2); + code.sub(rsp, 8 + 16 + ABI_SHADOW_SPACE); + code.lea(return_value_ptr, ptr[rsp + ABI_SHADOW_SPACE]); + }); + code.movups(xmm1, xword[code.ABI_RETURN]); + code.add(rsp, 8 + 16 + ABI_SHADOW_SPACE); +#else + code.sub(rsp, 8); + Devirtualize<&A64::UserCallbacks::MemoryRead128>(conf.callbacks).EmitCall(code); + if (code.HasHostFeature(HostFeature::SSE41)) { + code.movq(xmm1, code.ABI_RETURN); + code.pinsrq(xmm1, code.ABI_RETURN2, 1); + } else { + code.movq(xmm1, code.ABI_RETURN); + code.movq(xmm2, code.ABI_RETURN2); + code.punpcklqdq(xmm1, xmm2); + } + code.add(rsp, 8); +#endif + code.ret(); + PerfMapRegister(memory_read_128, code.getCurr(), "a64_memory_read_128"); + + code.align(); + memory_write_128 = code.getCurr<void (*)()>(); +#ifdef _WIN32 + code.sub(rsp, 8 + 16 + ABI_SHADOW_SPACE); + code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]); + code.movaps(xword[code.ABI_PARAM3], xmm1); + 
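+    // Win64 ABI: arguments wider than 64 bits are passed by pointer, so the 128-bit value is
+    // spilled from xmm1 to a stack slot above the shadow space and its address is handed to
+    // the callback in ABI_PARAM3.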
Devirtualize<&A64::UserCallbacks::MemoryWrite128>(conf.callbacks).EmitCall(code); + code.add(rsp, 8 + 16 + ABI_SHADOW_SPACE); +#else + code.sub(rsp, 8); + if (code.HasHostFeature(HostFeature::SSE41)) { + code.movq(code.ABI_PARAM3, xmm1); + code.pextrq(code.ABI_PARAM4, xmm1, 1); + } else { + code.movq(code.ABI_PARAM3, xmm1); + code.punpckhqdq(xmm1, xmm1); + code.movq(code.ABI_PARAM4, xmm1); + } + Devirtualize<&A64::UserCallbacks::MemoryWrite128>(conf.callbacks).EmitCall(code); + code.add(rsp, 8); +#endif + code.ret(); + PerfMapRegister(memory_write_128, code.getCurr(), "a64_memory_write_128"); + + code.align(); + memory_exclusive_write_128 = code.getCurr<void (*)()>(); +#ifdef _WIN32 + code.sub(rsp, 8 + 32 + ABI_SHADOW_SPACE); + code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]); + code.lea(code.ABI_PARAM4, ptr[rsp + ABI_SHADOW_SPACE + 16]); + code.movaps(xword[code.ABI_PARAM3], xmm1); + code.movaps(xword[code.ABI_PARAM4], xmm2); + Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive128>(conf.callbacks).EmitCall(code); + code.add(rsp, 8 + 32 + ABI_SHADOW_SPACE); +#else + code.sub(rsp, 8); + if (code.HasHostFeature(HostFeature::SSE41)) { + code.movq(code.ABI_PARAM3, xmm1); + code.pextrq(code.ABI_PARAM4, xmm1, 1); + code.movq(code.ABI_PARAM5, xmm2); + code.pextrq(code.ABI_PARAM6, xmm2, 1); + } else { + code.movq(code.ABI_PARAM3, xmm1); + code.punpckhqdq(xmm1, xmm1); + code.movq(code.ABI_PARAM4, xmm1); + code.movq(code.ABI_PARAM5, xmm2); + code.punpckhqdq(xmm2, xmm2); + code.movq(code.ABI_PARAM6, xmm2); + } + Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive128>(conf.callbacks).EmitCall(code); + code.add(rsp, 8); +#endif + code.ret(); + PerfMapRegister(memory_exclusive_write_128, code.getCurr(), "a64_memory_exclusive_write_128"); +} + +void A64EmitX64::GenFastmemFallbacks() { + const std::initializer_list<int> idxes{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + const std::array<std::pair<size_t, ArgCallback>, 4> read_callbacks{{ + {8, Devirtualize<&A64::UserCallbacks::MemoryRead8>(conf.callbacks)}, + {16, Devirtualize<&A64::UserCallbacks::MemoryRead16>(conf.callbacks)}, + {32, Devirtualize<&A64::UserCallbacks::MemoryRead32>(conf.callbacks)}, + {64, Devirtualize<&A64::UserCallbacks::MemoryRead64>(conf.callbacks)}, + }}; + const std::array<std::pair<size_t, ArgCallback>, 4> write_callbacks{{ + {8, Devirtualize<&A64::UserCallbacks::MemoryWrite8>(conf.callbacks)}, + {16, Devirtualize<&A64::UserCallbacks::MemoryWrite16>(conf.callbacks)}, + {32, Devirtualize<&A64::UserCallbacks::MemoryWrite32>(conf.callbacks)}, + {64, Devirtualize<&A64::UserCallbacks::MemoryWrite64>(conf.callbacks)}, + }}; + const std::array<std::pair<size_t, ArgCallback>, 4> exclusive_write_callbacks{{ + {8, Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive8>(conf.callbacks)}, + {16, Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive16>(conf.callbacks)}, + {32, Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive32>(conf.callbacks)}, + {64, Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive64>(conf.callbacks)}, + }}; + + for (bool ordered : {false, true}) { + for (int vaddr_idx : idxes) { + if (vaddr_idx == 4 || vaddr_idx == 15) { + continue; + } + + for (int value_idx : idxes) { + code.align(); + read_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(value_idx)); + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + if (ordered) { 
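+                    // Ordered access: conservatively fence before falling back to the read callback.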
+ code.mfence(); + } + code.call(memory_read_128); + if (value_idx != 1) { + code.movaps(Xbyak::Xmm{value_idx}, xmm1); + } + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(value_idx)); + code.ret(); + PerfMapRegister(read_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)], code.getCurr(), "a64_read_fallback_128"); + + code.align(); + write_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); + ABI_PushCallerSaveRegistersAndAdjustStack(code); + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + if (value_idx != 1) { + code.movaps(xmm1, Xbyak::Xmm{value_idx}); + } + code.call(memory_write_128); + if (ordered) { + code.mfence(); + } + ABI_PopCallerSaveRegistersAndAdjustStack(code); + code.ret(); + PerfMapRegister(write_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)], code.getCurr(), "a64_write_fallback_128"); + + code.align(); + exclusive_write_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); + if (value_idx != 1) { + code.movaps(xmm1, Xbyak::Xmm{value_idx}); + } + if (code.HasHostFeature(HostFeature::SSE41)) { + code.movq(xmm2, rax); + code.pinsrq(xmm2, rdx, 1); + } else { + code.movq(xmm2, rax); + code.movq(xmm0, rdx); + code.punpcklqdq(xmm2, xmm0); + } + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + code.call(memory_exclusive_write_128); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); + code.ret(); + PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)], code.getCurr(), "a64_exclusive_write_fallback_128"); + + if (value_idx == 4 || value_idx == 15) { + continue; + } + + for (const auto& [bitsize, callback] : read_callbacks) { + code.align(); + read_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + if (ordered) { + code.mfence(); + } + callback.EmitCall(code); + if (value_idx != code.ABI_RETURN.getIdx()) { + code.mov(Xbyak::Reg64{value_idx}, code.ABI_RETURN); + } + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); + code.ZeroExtendFrom(bitsize, Xbyak::Reg64{value_idx}); + code.ret(); + PerfMapRegister(read_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_read_fallback_{}", bitsize)); + } + + for (const auto& [bitsize, callback] : write_callbacks) { + code.align(); + write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); + ABI_PushCallerSaveRegistersAndAdjustStack(code); + if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { + code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); + } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + } else { + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + } + 
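+                    // The xchg/mov sequence above routes the address into ABI_PARAM2 and the value
+                    // into ABI_PARAM3 without either clobbering the other; the value is then
+                    // zero-extended from the access width before the callback is invoked.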
code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); + callback.EmitCall(code); + if (ordered) { + code.mfence(); + } + ABI_PopCallerSaveRegistersAndAdjustStack(code); + code.ret(); + PerfMapRegister(write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_write_fallback_{}", bitsize)); + } + + for (const auto& [bitsize, callback] : exclusive_write_callbacks) { + code.align(); + exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); + if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { + code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); + } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + } else { + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + } + code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); + code.mov(code.ABI_PARAM4, rax); + code.ZeroExtendFrom(bitsize, code.ABI_PARAM4); + callback.EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); + code.ret(); + PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_exclusive_write_fallback_{}", bitsize)); + } + } + } + } +} + +#define Axx A64 +#include "dynarmic/backend/x64/emit_x64_memory.cpp.inc" +#undef Axx + +void A64EmitX64::EmitA64ReadMemory8(A64EmitContext& ctx, IR::Inst* inst) { + EmitMemoryRead<8, &A64::UserCallbacks::MemoryRead8>(ctx, inst); +} + +void A64EmitX64::EmitA64ReadMemory16(A64EmitContext& ctx, IR::Inst* inst) { + EmitMemoryRead<16, &A64::UserCallbacks::MemoryRead16>(ctx, inst); +} + +void A64EmitX64::EmitA64ReadMemory32(A64EmitContext& ctx, IR::Inst* inst) { + EmitMemoryRead<32, &A64::UserCallbacks::MemoryRead32>(ctx, inst); +} + +void A64EmitX64::EmitA64ReadMemory64(A64EmitContext& ctx, IR::Inst* inst) { + EmitMemoryRead<64, &A64::UserCallbacks::MemoryRead64>(ctx, inst); +} + +void A64EmitX64::EmitA64ReadMemory128(A64EmitContext& ctx, IR::Inst* inst) { + EmitMemoryRead<128, &A64::UserCallbacks::MemoryRead128>(ctx, inst); +} + +void A64EmitX64::EmitA64WriteMemory8(A64EmitContext& ctx, IR::Inst* inst) { + EmitMemoryWrite<8, &A64::UserCallbacks::MemoryWrite8>(ctx, inst); +} + +void A64EmitX64::EmitA64WriteMemory16(A64EmitContext& ctx, IR::Inst* inst) { + EmitMemoryWrite<16, &A64::UserCallbacks::MemoryWrite16>(ctx, inst); +} + +void A64EmitX64::EmitA64WriteMemory32(A64EmitContext& ctx, IR::Inst* inst) { + EmitMemoryWrite<32, &A64::UserCallbacks::MemoryWrite32>(ctx, inst); +} + +void A64EmitX64::EmitA64WriteMemory64(A64EmitContext& ctx, IR::Inst* inst) { + EmitMemoryWrite<64, &A64::UserCallbacks::MemoryWrite64>(ctx, inst); +} + +void A64EmitX64::EmitA64WriteMemory128(A64EmitContext& ctx, IR::Inst* inst) { + EmitMemoryWrite<128, &A64::UserCallbacks::MemoryWrite64>(ctx, inst); +} + +void A64EmitX64::EmitA64ClearExclusive(A64EmitContext&, IR::Inst*) { + code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); +} + +void A64EmitX64::EmitA64ExclusiveReadMemory8(A64EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveReadMemoryInline<8, 
&A64::UserCallbacks::MemoryRead8>(ctx, inst); + } else { + EmitExclusiveReadMemory<8, &A64::UserCallbacks::MemoryRead8>(ctx, inst); + } +} + +void A64EmitX64::EmitA64ExclusiveReadMemory16(A64EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveReadMemoryInline<16, &A64::UserCallbacks::MemoryRead16>(ctx, inst); + } else { + EmitExclusiveReadMemory<16, &A64::UserCallbacks::MemoryRead16>(ctx, inst); + } +} + +void A64EmitX64::EmitA64ExclusiveReadMemory32(A64EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveReadMemoryInline<32, &A64::UserCallbacks::MemoryRead32>(ctx, inst); + } else { + EmitExclusiveReadMemory<32, &A64::UserCallbacks::MemoryRead32>(ctx, inst); + } +} + +void A64EmitX64::EmitA64ExclusiveReadMemory64(A64EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveReadMemoryInline<64, &A64::UserCallbacks::MemoryRead64>(ctx, inst); + } else { + EmitExclusiveReadMemory<64, &A64::UserCallbacks::MemoryRead64>(ctx, inst); + } +} + +void A64EmitX64::EmitA64ExclusiveReadMemory128(A64EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveReadMemoryInline<128, &A64::UserCallbacks::MemoryRead128>(ctx, inst); + } else { + EmitExclusiveReadMemory<128, &A64::UserCallbacks::MemoryRead128>(ctx, inst); + } +} + +void A64EmitX64::EmitA64ExclusiveWriteMemory8(A64EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveWriteMemoryInline<8, &A64::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); + } else { + EmitExclusiveWriteMemory<8, &A64::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); + } +} + +void A64EmitX64::EmitA64ExclusiveWriteMemory16(A64EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveWriteMemoryInline<16, &A64::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); + } else { + EmitExclusiveWriteMemory<16, &A64::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); + } +} + +void A64EmitX64::EmitA64ExclusiveWriteMemory32(A64EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveWriteMemoryInline<32, &A64::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); + } else { + EmitExclusiveWriteMemory<32, &A64::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); + } +} + +void A64EmitX64::EmitA64ExclusiveWriteMemory64(A64EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveWriteMemoryInline<64, &A64::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); + } else { + EmitExclusiveWriteMemory<64, &A64::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); + } +} + +void A64EmitX64::EmitA64ExclusiveWriteMemory128(A64EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveWriteMemoryInline<128, &A64::UserCallbacks::MemoryWriteExclusive128>(ctx, inst); + } else { + EmitExclusiveWriteMemory<128, &A64::UserCallbacks::MemoryWriteExclusive128>(ctx, inst); + } +} + +void A64EmitX64::EmitCheckMemoryAbort(A64EmitContext&, IR::Inst* inst, Xbyak::Label* end) { + if (!conf.check_halt_on_memory_access) { + return; + } + + Xbyak::Label skip; + + const A64::LocationDescriptor current_location{IR::LocationDescriptor{inst->GetArg(0).GetU64()}}; + + code.test(dword[r15 + offsetof(A64JitState, halt_reason)], static_cast<u32>(HaltReason::MemoryAbort)); + if (end) { + code.jz(*end, code.T_NEAR); + } else { + code.jz(skip, code.T_NEAR); + } + code.mov(rax, current_location.PC()); + code.mov(qword[r15 + 
offsetof(A64JitState, pc)], rax); + code.ForceReturnFromRunCode(); + code.L(skip); +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a64_interface.cpp b/externals/dynarmic/src/dynarmic/backend/x64/a64_interface.cpp new file mode 100644 index 0000000000..57f75df003 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/a64_interface.cpp @@ -0,0 +1,449 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <cstring> +#include <memory> +#include <mutex> + +#include <boost/icl/interval_set.hpp> +#include <mcl/assert.hpp> +#include <mcl/bit_cast.hpp> +#include <mcl/scope_exit.hpp> + +#include "dynarmic/backend/x64/a64_emit_x64.h" +#include "dynarmic/backend/x64/a64_jitstate.h" +#include "dynarmic/backend/x64/block_of_code.h" +#include "dynarmic/backend/x64/devirtualize.h" +#include "dynarmic/backend/x64/jitstate_info.h" +#include "dynarmic/common/atomic.h" +#include "dynarmic/common/x64_disassemble.h" +#include "dynarmic/frontend/A64/translate/a64_translate.h" +#include "dynarmic/interface/A64/a64.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/opt/passes.h" + +namespace Dynarmic::A64 { + +using namespace Backend::X64; + +static RunCodeCallbacks GenRunCodeCallbacks(A64::UserCallbacks* cb, CodePtr (*LookupBlock)(void* lookup_block_arg), void* arg, const A64::UserConfig& conf) { + return RunCodeCallbacks{ + std::make_unique<ArgCallback>(LookupBlock, reinterpret_cast<u64>(arg)), + std::make_unique<ArgCallback>(Devirtualize<&A64::UserCallbacks::AddTicks>(cb)), + std::make_unique<ArgCallback>(Devirtualize<&A64::UserCallbacks::GetTicksRemaining>(cb)), + conf.enable_cycle_counting, + }; +} + +static std::function<void(BlockOfCode&)> GenRCP(const A64::UserConfig& conf) { + return [conf](BlockOfCode& code) { + if (conf.page_table) { + code.mov(code.r14, mcl::bit_cast<u64>(conf.page_table)); + } + if (conf.fastmem_pointer) { + code.mov(code.r13, mcl::bit_cast<u64>(conf.fastmem_pointer)); + } + }; +} + +static Optimization::PolyfillOptions GenPolyfillOptions(const BlockOfCode& code) { + return Optimization::PolyfillOptions{ + .sha256 = !code.HasHostFeature(HostFeature::SHA), + .vector_multiply_widen = true, + }; +} + +struct Jit::Impl final { +public: + Impl(Jit* jit, UserConfig conf) + : conf(conf) + , block_of_code(GenRunCodeCallbacks(conf.callbacks, &GetCurrentBlockThunk, this, conf), JitStateInfo{jit_state}, conf.code_cache_size, GenRCP(conf)) + , emitter(block_of_code, conf, jit) + , polyfill_options(GenPolyfillOptions(block_of_code)) { + ASSERT(conf.page_table_address_space_bits >= 12 && conf.page_table_address_space_bits <= 64); + } + + ~Impl() = default; + + HaltReason Run() { + ASSERT(!is_executing); + PerformRequestedCacheInvalidation(static_cast<HaltReason>(Atomic::Load(&jit_state.halt_reason))); + + is_executing = true; + SCOPE_EXIT { + this->is_executing = false; + }; + + // TODO: Check code alignment + + const CodePtr current_code_ptr = [this] { + // RSB optimization + const u32 new_rsb_ptr = (jit_state.rsb_ptr - 1) & A64JitState::RSBPtrMask; + if (jit_state.GetUniqueHash() == jit_state.rsb_location_descriptors[new_rsb_ptr]) { + jit_state.rsb_ptr = new_rsb_ptr; + return reinterpret_cast<CodePtr>(jit_state.rsb_codeptrs[new_rsb_ptr]); + } + + return GetCurrentBlock(); + }(); + + const HaltReason hr = block_of_code.RunCode(&jit_state, current_code_ptr); + + PerformRequestedCacheInvalidation(hr); + + return hr; + } + + HaltReason Step() { + 
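+        // Same cache-invalidation and halt handling as Run(), but dispatches through the
+        // single-stepping variant of the current block (GetCurrentSingleStep), so guest
+        // instructions are executed one at a time.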
ASSERT(!is_executing); + PerformRequestedCacheInvalidation(static_cast<HaltReason>(Atomic::Load(&jit_state.halt_reason))); + + is_executing = true; + SCOPE_EXIT { + this->is_executing = false; + }; + + const HaltReason hr = block_of_code.StepCode(&jit_state, GetCurrentSingleStep()); + + PerformRequestedCacheInvalidation(hr); + + return hr; + } + + void ClearCache() { + std::unique_lock lock{invalidation_mutex}; + invalidate_entire_cache = true; + HaltExecution(HaltReason::CacheInvalidation); + } + + void InvalidateCacheRange(u64 start_address, size_t length) { + std::unique_lock lock{invalidation_mutex}; + const auto end_address = static_cast<u64>(start_address + length - 1); + const auto range = boost::icl::discrete_interval<u64>::closed(start_address, end_address); + invalid_cache_ranges.add(range); + HaltExecution(HaltReason::CacheInvalidation); + } + + void Reset() { + ASSERT(!is_executing); + jit_state = {}; + } + + void HaltExecution(HaltReason hr) { + Atomic::Or(&jit_state.halt_reason, static_cast<u32>(hr)); + } + + void ClearHalt(HaltReason hr) { + Atomic::And(&jit_state.halt_reason, ~static_cast<u32>(hr)); + } + + u64 GetSP() const { + return jit_state.sp; + } + + void SetSP(u64 value) { + jit_state.sp = value; + } + + u64 GetPC() const { + return jit_state.pc; + } + + void SetPC(u64 value) { + jit_state.pc = value; + } + + u64 GetRegister(size_t index) const { + if (index == 31) + return GetSP(); + return jit_state.reg.at(index); + } + + void SetRegister(size_t index, u64 value) { + if (index == 31) + return SetSP(value); + jit_state.reg.at(index) = value; + } + + std::array<u64, 31> GetRegisters() const { + return jit_state.reg; + } + + void SetRegisters(const std::array<u64, 31>& value) { + jit_state.reg = value; + } + + Vector GetVector(size_t index) const { + return {jit_state.vec.at(index * 2), jit_state.vec.at(index * 2 + 1)}; + } + + void SetVector(size_t index, Vector value) { + jit_state.vec.at(index * 2) = value[0]; + jit_state.vec.at(index * 2 + 1) = value[1]; + } + + std::array<Vector, 32> GetVectors() const { + std::array<Vector, 32> ret; + static_assert(sizeof(ret) == sizeof(jit_state.vec)); + std::memcpy(ret.data(), jit_state.vec.data(), sizeof(jit_state.vec)); + return ret; + } + + void SetVectors(const std::array<Vector, 32>& value) { + static_assert(sizeof(value) == sizeof(jit_state.vec)); + std::memcpy(jit_state.vec.data(), value.data(), sizeof(jit_state.vec)); + } + + u32 GetFpcr() const { + return jit_state.GetFpcr(); + } + + void SetFpcr(u32 value) { + jit_state.SetFpcr(value); + } + + u32 GetFpsr() const { + return jit_state.GetFpsr(); + } + + void SetFpsr(u32 value) { + jit_state.SetFpsr(value); + } + + u32 GetPstate() const { + return jit_state.GetPstate(); + } + + void SetPstate(u32 value) { + jit_state.SetPstate(value); + } + + void ClearExclusiveState() { + jit_state.exclusive_state = 0; + } + + bool IsExecuting() const { + return is_executing; + } + + void DumpDisassembly() const { + const size_t size = reinterpret_cast<const char*>(block_of_code.getCurr()) - reinterpret_cast<const char*>(block_of_code.GetCodeBegin()); + Common::DumpDisassembledX64(block_of_code.GetCodeBegin(), size); + } + + std::vector<std::string> Disassemble() const { + const size_t size = reinterpret_cast<const char*>(block_of_code.getCurr()) - reinterpret_cast<const char*>(block_of_code.GetCodeBegin()); + return Common::DisassembleX64(block_of_code.GetCodeBegin(), size); + } + +private: + static CodePtr GetCurrentBlockThunk(void* thisptr) { + Jit::Impl* this_ = 
static_cast<Jit::Impl*>(thisptr); + return this_->GetCurrentBlock(); + } + + IR::LocationDescriptor GetCurrentLocation() const { + return IR::LocationDescriptor{jit_state.GetUniqueHash()}; + } + + CodePtr GetCurrentBlock() { + return GetBlock(GetCurrentLocation()); + } + + CodePtr GetCurrentSingleStep() { + return GetBlock(A64::LocationDescriptor{GetCurrentLocation()}.SetSingleStepping(true)); + } + + CodePtr GetBlock(IR::LocationDescriptor current_location) { + if (auto block = emitter.GetBasicBlock(current_location)) + return block->entrypoint; + + constexpr size_t MINIMUM_REMAINING_CODESIZE = 1 * 1024 * 1024; + if (block_of_code.SpaceRemaining() < MINIMUM_REMAINING_CODESIZE) { + // Immediately evacuate cache + invalidate_entire_cache = true; + PerformRequestedCacheInvalidation(HaltReason::CacheInvalidation); + } + block_of_code.EnsureMemoryCommitted(MINIMUM_REMAINING_CODESIZE); + + // JIT Compile + const auto get_code = [this](u64 vaddr) { return conf.callbacks->MemoryReadCode(vaddr); }; + IR::Block ir_block = A64::Translate(A64::LocationDescriptor{current_location}, get_code, + {conf.define_unpredictable_behaviour, conf.wall_clock_cntpct}); + Optimization::PolyfillPass(ir_block, polyfill_options); + Optimization::A64CallbackConfigPass(ir_block, conf); + Optimization::NamingPass(ir_block); + if (conf.HasOptimization(OptimizationFlag::GetSetElimination) && !conf.check_halt_on_memory_access) { + Optimization::A64GetSetElimination(ir_block); + Optimization::DeadCodeElimination(ir_block); + } + if (conf.HasOptimization(OptimizationFlag::ConstProp)) { + Optimization::ConstantPropagation(ir_block); + Optimization::DeadCodeElimination(ir_block); + } + if (conf.HasOptimization(OptimizationFlag::MiscIROpt)) { + Optimization::A64MergeInterpretBlocksPass(ir_block, conf.callbacks); + } + Optimization::VerificationPass(ir_block); + return emitter.Emit(ir_block).entrypoint; + } + + void PerformRequestedCacheInvalidation(HaltReason hr) { + if (Has(hr, HaltReason::CacheInvalidation)) { + std::unique_lock lock{invalidation_mutex}; + + ClearHalt(HaltReason::CacheInvalidation); + + if (!invalidate_entire_cache && invalid_cache_ranges.empty()) { + return; + } + + jit_state.ResetRSB(); + if (invalidate_entire_cache) { + block_of_code.ClearCache(); + emitter.ClearCache(); + } else { + emitter.InvalidateCacheRanges(invalid_cache_ranges); + } + invalid_cache_ranges.clear(); + invalidate_entire_cache = false; + } + } + + bool is_executing = false; + + const UserConfig conf; + A64JitState jit_state; + BlockOfCode block_of_code; + A64EmitX64 emitter; + Optimization::PolyfillOptions polyfill_options; + + bool invalidate_entire_cache = false; + boost::icl::interval_set<u64> invalid_cache_ranges; + std::mutex invalidation_mutex; +}; + +Jit::Jit(UserConfig conf) + : impl(std::make_unique<Jit::Impl>(this, conf)) {} + +Jit::~Jit() = default; + +HaltReason Jit::Run() { + return impl->Run(); +} + +HaltReason Jit::Step() { + return impl->Step(); +} + +void Jit::ClearCache() { + impl->ClearCache(); +} + +void Jit::InvalidateCacheRange(u64 start_address, size_t length) { + impl->InvalidateCacheRange(start_address, length); +} + +void Jit::Reset() { + impl->Reset(); +} + +void Jit::HaltExecution(HaltReason hr) { + impl->HaltExecution(hr); +} + +void Jit::ClearHalt(HaltReason hr) { + impl->ClearHalt(hr); +} + +u64 Jit::GetSP() const { + return impl->GetSP(); +} + +void Jit::SetSP(u64 value) { + impl->SetSP(value); +} + +u64 Jit::GetPC() const { + return impl->GetPC(); +} + +void Jit::SetPC(u64 value) { + impl->SetPC(value); 
+} + +u64 Jit::GetRegister(size_t index) const { + return impl->GetRegister(index); +} + +void Jit::SetRegister(size_t index, u64 value) { + impl->SetRegister(index, value); +} + +std::array<u64, 31> Jit::GetRegisters() const { + return impl->GetRegisters(); +} + +void Jit::SetRegisters(const std::array<u64, 31>& value) { + impl->SetRegisters(value); +} + +Vector Jit::GetVector(size_t index) const { + return impl->GetVector(index); +} + +void Jit::SetVector(size_t index, Vector value) { + impl->SetVector(index, value); +} + +std::array<Vector, 32> Jit::GetVectors() const { + return impl->GetVectors(); +} + +void Jit::SetVectors(const std::array<Vector, 32>& value) { + impl->SetVectors(value); +} + +u32 Jit::GetFpcr() const { + return impl->GetFpcr(); +} + +void Jit::SetFpcr(u32 value) { + impl->SetFpcr(value); +} + +u32 Jit::GetFpsr() const { + return impl->GetFpsr(); +} + +void Jit::SetFpsr(u32 value) { + impl->SetFpsr(value); +} + +u32 Jit::GetPstate() const { + return impl->GetPstate(); +} + +void Jit::SetPstate(u32 value) { + impl->SetPstate(value); +} + +void Jit::ClearExclusiveState() { + impl->ClearExclusiveState(); +} + +bool Jit::IsExecuting() const { + return impl->IsExecuting(); +} + +void Jit::DumpDisassembly() const { + return impl->DumpDisassembly(); +} + +std::vector<std::string> Jit::Disassemble() const { + return impl->Disassemble(); +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a64_jitstate.cpp b/externals/dynarmic/src/dynarmic/backend/x64/a64_jitstate.cpp new file mode 100644 index 0000000000..9f983f3955 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/a64_jitstate.cpp @@ -0,0 +1,116 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/x64/a64_jitstate.h" + +#include <mcl/bit/bit_field.hpp> + +#include "dynarmic/frontend/A64/a64_location_descriptor.h" + +namespace Dynarmic::Backend::X64 { + +/** + * Comparing MXCSR and FPCR + * ======================== + * + * SSE MSCSR exception masks + * ------------------------- + * PM bit 12 Precision Mask + * UM bit 11 Underflow Mask + * OM bit 10 Overflow Mask + * ZM bit 9 Divide By Zero Mask + * DM bit 8 Denormal Mask + * IM bit 7 Invalid Operation Mask + * + * A64 FPCR exception trap enables + * ------------------------------- + * IDE bit 15 Input Denormal exception trap enable + * IXE bit 12 Inexact exception trap enable + * UFE bit 11 Underflow exception trap enable + * OFE bit 10 Overflow exception trap enable + * DZE bit 9 Division by Zero exception trap enable + * IOE bit 8 Invalid Operation exception trap enable + * + * SSE MXCSR mode bits + * ------------------- + * FZ bit 15 Flush To Zero + * DAZ bit 6 Denormals Are Zero + * RN bits 13-14 Round to {0 = Nearest, 1 = Negative, 2 = Positive, 3 = Zero} + * + * A64 FPCR mode bits + * ------------------ + * AHP bit 26 Alternative half-precision + * DN bit 25 Default NaN + * FZ bit 24 Flush to Zero + * RMode bits 22-23 Round to {0 = Nearest, 1 = Positive, 2 = Negative, 3 = Zero} + * FZ16 bit 19 Flush to Zero for half-precision + */ + +constexpr u32 FPCR_MASK = 0x07C89F00; + +u32 A64JitState::GetFpcr() const { + return fpcr; +} + +void A64JitState::SetFpcr(u32 value) { + fpcr = value & FPCR_MASK; + + asimd_MXCSR &= 0x0000003D; + guest_MXCSR &= 0x0000003D; + asimd_MXCSR |= 0x00001f80; + guest_MXCSR |= 0x00001f80; // Mask all exceptions + + // RMode + const std::array<u32, 4> MXCSR_RMode{0x0, 0x4000, 0x2000, 0x6000}; + 
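+    // [Illustrative note, not in the upstream source] FPCR.RMode and MXCSR.RC (bits 13-14) encode the
+    // same four rounding modes but swap the +Inf/-Inf encodings (see the table above), hence the
+    // lookup table {0x0, 0x4000, 0x2000, 0x6000}:
+    //   FPCR.RMode 0 (Nearest) -> RC = 00 -> 0x0000
+    //   FPCR.RMode 1 (+Inf)    -> RC = 10 -> 0x4000
+    //   FPCR.RMode 2 (-Inf)    -> RC = 01 -> 0x2000
+    //   FPCR.RMode 3 (Zero)    -> RC = 11 -> 0x6000
+    // e.g. an FPCR value with RMode == 1 selects MXCSR_RMode[1] == 0x4000, i.e. SSE round-towards-plus-infinity.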
guest_MXCSR |= MXCSR_RMode[(value >> 22) & 0x3]; + + if (mcl::bit::get_bit<24>(value)) { + guest_MXCSR |= (1 << 15); // SSE Flush to Zero + guest_MXCSR |= (1 << 6); // SSE Denormals are Zero + } +} + +/** + * Comparing MXCSR and FPSR + * ======================== + * + * SSE MXCSR exception flags + * ------------------------- + * PE bit 5 Precision Flag + * UE bit 4 Underflow Flag + * OE bit 3 Overflow Flag + * ZE bit 2 Divide By Zero Flag + * DE bit 1 Denormal Flag // Appears to only be set when MXCSR.DAZ = 0 + * IE bit 0 Invalid Operation Flag + * + * A64 FPSR cumulative exception bits + * ---------------------------------- + * QC bit 27 Cumulative saturation bit + * IDC bit 7 Input Denormal cumulative exception bit // Only ever set when FPCR.FTZ = 1 + * IXC bit 4 Inexact cumulative exception bit + * UFC bit 3 Underflow cumulative exception bit + * OFC bit 2 Overflow cumulative exception bit + * DZC bit 1 Division by Zero cumulative exception bit + * IOC bit 0 Invalid Operation cumulative exception bit + */ + +u32 A64JitState::GetFpsr() const { + const u32 mxcsr = guest_MXCSR | asimd_MXCSR; + u32 fpsr = 0; + fpsr |= (mxcsr & 0b0000000000001); // IOC = IE + fpsr |= (mxcsr & 0b0000000111100) >> 1; // IXC, UFC, OFC, DZC = PE, UE, OE, ZE + fpsr |= fpsr_exc; + fpsr |= (fpsr_qc == 0 ? 0 : 1) << 27; + return fpsr; +} + +void A64JitState::SetFpsr(u32 value) { + guest_MXCSR &= ~0x0000003D; + asimd_MXCSR &= ~0x0000003D; + fpsr_qc = (value >> 27) & 1; + fpsr_exc = value & 0x9F; +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a64_jitstate.h b/externals/dynarmic/src/dynarmic/backend/x64/a64_jitstate.h new file mode 100644 index 0000000000..0929e81ec5 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/a64_jitstate.h @@ -0,0 +1,84 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <array> + +#include <mcl/stdint.hpp> + +#include "dynarmic/backend/x64/nzcv_util.h" +#include "dynarmic/frontend/A64/a64_location_descriptor.h" + +namespace Dynarmic::Backend::X64 { + +class BlockOfCode; + +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable : 4324) // Structure was padded due to alignment specifier +#endif + +struct A64JitState { + using ProgramCounterType = u64; + + A64JitState() { ResetRSB(); } + + std::array<u64, 31> reg{}; + u64 sp = 0; + u64 pc = 0; + + u32 cpsr_nzcv = 0; + + u32 GetPstate() const { + return NZCV::FromX64(cpsr_nzcv); + } + void SetPstate(u32 new_pstate) { + cpsr_nzcv = NZCV::ToX64(new_pstate); + } + + alignas(16) std::array<u64, 64> vec{}; // Extension registers. + + // For internal use (See: BlockOfCode::RunCode) + u32 guest_MXCSR = 0x00001f80; + u32 asimd_MXCSR = 0x00009fc0; + volatile u32 halt_reason = 0; + + // Exclusive state + static constexpr u64 RESERVATION_GRANULE_MASK = 0xFFFF'FFFF'FFFF'FFF0ull; + u8 exclusive_state = 0; + + static constexpr size_t RSBSize = 8; // MUST be a power of 2. 
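+    // [Illustrative note, not in the upstream source] The return stack buffer (RSB) is a small ring of
+    // (location descriptor, host code pointer) pairs used to predict return targets: emitted code pushes
+    // entries via EmitX64::PushRSBHelper, and Jit::Impl::Run() pops one by comparing
+    // rsb_location_descriptors[(rsb_ptr - 1) & RSBPtrMask] against GetUniqueHash(), skipping the block
+    // lookup on a hit. RSBSize must be a power of two so that wrapping can be done with the RSBPtrMask
+    // AND below instead of a modulo.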
+ static constexpr size_t RSBPtrMask = RSBSize - 1; + u32 rsb_ptr = 0; + std::array<u64, RSBSize> rsb_location_descriptors; + std::array<u64, RSBSize> rsb_codeptrs; + void ResetRSB() { + rsb_location_descriptors.fill(0xFFFFFFFFFFFFFFFFull); + rsb_codeptrs.fill(0); + } + + u32 fpsr_exc = 0; + u32 fpsr_qc = 0; + u32 fpcr = 0; + u32 GetFpcr() const; + u32 GetFpsr() const; + void SetFpcr(u32 value); + void SetFpsr(u32 value); + + u64 GetUniqueHash() const noexcept { + const u64 fpcr_u64 = static_cast<u64>(fpcr & A64::LocationDescriptor::fpcr_mask) << A64::LocationDescriptor::fpcr_shift; + const u64 pc_u64 = pc & A64::LocationDescriptor::pc_mask; + return pc_u64 | fpcr_u64; + } +}; + +#ifdef _MSC_VER +# pragma warning(pop) +#endif + +using CodePtr = const void*; + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/abi.cpp b/externals/dynarmic/src/dynarmic/backend/x64/abi.cpp new file mode 100644 index 0000000000..d6a83b65b4 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/abi.cpp @@ -0,0 +1,135 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2020 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/x64/abi.h" + +#include <algorithm> +#include <vector> + +#include <mcl/iterator/reverse.hpp> +#include <mcl/stdint.hpp> +#include <xbyak/xbyak.h> + +#include "dynarmic/backend/x64/block_of_code.h" + +namespace Dynarmic::Backend::X64 { + +constexpr size_t XMM_SIZE = 16; + +struct FrameInfo { + size_t stack_subtraction; + size_t xmm_offset; + size_t frame_offset; +}; + +static FrameInfo CalculateFrameInfo(size_t num_gprs, size_t num_xmms, size_t frame_size) { + // We are initially 8 byte aligned because the return value is pushed onto an aligned stack after a call. + const size_t rsp_alignment = (num_gprs % 2 == 0) ? 
8 : 0; + const size_t total_xmm_size = num_xmms * XMM_SIZE; + + if (frame_size & 0xF) { + frame_size += 0x10 - (frame_size & 0xF); + } + + return { + rsp_alignment + total_xmm_size + frame_size + ABI_SHADOW_SPACE, + frame_size + ABI_SHADOW_SPACE, + ABI_SHADOW_SPACE, + }; +} + +template<typename RegisterArrayT> +void ABI_PushRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size, const RegisterArrayT& regs) { + using namespace Xbyak::util; + + const size_t num_gprs = std::count_if(regs.begin(), regs.end(), HostLocIsGPR); + const size_t num_xmms = std::count_if(regs.begin(), regs.end(), HostLocIsXMM); + + FrameInfo frame_info = CalculateFrameInfo(num_gprs, num_xmms, frame_size); + + for (HostLoc gpr : regs) { + if (HostLocIsGPR(gpr)) { + code.push(HostLocToReg64(gpr)); + } + } + + if (frame_info.stack_subtraction != 0) { + code.sub(rsp, u32(frame_info.stack_subtraction)); + } + + size_t xmm_offset = frame_info.xmm_offset; + for (HostLoc xmm : regs) { + if (HostLocIsXMM(xmm)) { + if (code.HasHostFeature(HostFeature::AVX)) { + code.vmovaps(code.xword[rsp + xmm_offset], HostLocToXmm(xmm)); + } else { + code.movaps(code.xword[rsp + xmm_offset], HostLocToXmm(xmm)); + } + xmm_offset += XMM_SIZE; + } + } +} + +template<typename RegisterArrayT> +void ABI_PopRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size, const RegisterArrayT& regs) { + using namespace Xbyak::util; + + const size_t num_gprs = std::count_if(regs.begin(), regs.end(), HostLocIsGPR); + const size_t num_xmms = std::count_if(regs.begin(), regs.end(), HostLocIsXMM); + + FrameInfo frame_info = CalculateFrameInfo(num_gprs, num_xmms, frame_size); + + size_t xmm_offset = frame_info.xmm_offset; + for (HostLoc xmm : regs) { + if (HostLocIsXMM(xmm)) { + if (code.HasHostFeature(HostFeature::AVX)) { + code.vmovaps(HostLocToXmm(xmm), code.xword[rsp + xmm_offset]); + } else { + code.movaps(HostLocToXmm(xmm), code.xword[rsp + xmm_offset]); + } + xmm_offset += XMM_SIZE; + } + } + + if (frame_info.stack_subtraction != 0) { + code.add(rsp, u32(frame_info.stack_subtraction)); + } + + for (HostLoc gpr : mcl::iterator::reverse(regs)) { + if (HostLocIsGPR(gpr)) { + code.pop(HostLocToReg64(gpr)); + } + } +} + +void ABI_PushCalleeSaveRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size) { + ABI_PushRegistersAndAdjustStack(code, frame_size, ABI_ALL_CALLEE_SAVE); +} + +void ABI_PopCalleeSaveRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size) { + ABI_PopRegistersAndAdjustStack(code, frame_size, ABI_ALL_CALLEE_SAVE); +} + +void ABI_PushCallerSaveRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size) { + ABI_PushRegistersAndAdjustStack(code, frame_size, ABI_ALL_CALLER_SAVE); +} + +void ABI_PopCallerSaveRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size) { + ABI_PopRegistersAndAdjustStack(code, frame_size, ABI_ALL_CALLER_SAVE); +} + +void ABI_PushCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, HostLoc exception) { + std::vector<HostLoc> regs; + std::remove_copy(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end(), std::back_inserter(regs), exception); + ABI_PushRegistersAndAdjustStack(code, 0, regs); +} + +void ABI_PopCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, HostLoc exception) { + std::vector<HostLoc> regs; + std::remove_copy(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end(), std::back_inserter(regs), exception); + ABI_PopRegistersAndAdjustStack(code, 0, regs); +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/abi.h 
b/externals/dynarmic/src/dynarmic/backend/x64/abi.h new file mode 100644 index 0000000000..4bddf51bad --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/abi.h @@ -0,0 +1,132 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ +#pragma once + +#include <array> + +#include <mcl/stdint.hpp> + +#include "dynarmic/backend/x64/hostloc.h" + +namespace Dynarmic::Backend::X64 { + +class BlockOfCode; + +#ifdef _WIN32 + +constexpr HostLoc ABI_RETURN = HostLoc::RAX; + +constexpr size_t ABI_PARAM_COUNT = 4; + +constexpr HostLoc ABI_PARAM1 = HostLoc::RCX; +constexpr HostLoc ABI_PARAM2 = HostLoc::RDX; +constexpr HostLoc ABI_PARAM3 = HostLoc::R8; +constexpr HostLoc ABI_PARAM4 = HostLoc::R9; + +constexpr std::array<HostLoc, 13> ABI_ALL_CALLER_SAVE = { + HostLoc::RAX, + HostLoc::RCX, + HostLoc::RDX, + HostLoc::R8, + HostLoc::R9, + HostLoc::R10, + HostLoc::R11, + HostLoc::XMM0, + HostLoc::XMM1, + HostLoc::XMM2, + HostLoc::XMM3, + HostLoc::XMM4, + HostLoc::XMM5, +}; + +constexpr std::array<HostLoc, 18> ABI_ALL_CALLEE_SAVE = { + HostLoc::RBX, + HostLoc::RSI, + HostLoc::RDI, + HostLoc::RBP, + HostLoc::R12, + HostLoc::R13, + HostLoc::R14, + HostLoc::R15, + HostLoc::XMM6, + HostLoc::XMM7, + HostLoc::XMM8, + HostLoc::XMM9, + HostLoc::XMM10, + HostLoc::XMM11, + HostLoc::XMM12, + HostLoc::XMM13, + HostLoc::XMM14, + HostLoc::XMM15, +}; + +constexpr size_t ABI_SHADOW_SPACE = 32; // bytes + +#else + +constexpr HostLoc ABI_RETURN = HostLoc::RAX; +constexpr HostLoc ABI_RETURN2 = HostLoc::RDX; + +constexpr size_t ABI_PARAM_COUNT = 6; + +constexpr HostLoc ABI_PARAM1 = HostLoc::RDI; +constexpr HostLoc ABI_PARAM2 = HostLoc::RSI; +constexpr HostLoc ABI_PARAM3 = HostLoc::RDX; +constexpr HostLoc ABI_PARAM4 = HostLoc::RCX; +constexpr HostLoc ABI_PARAM5 = HostLoc::R8; +constexpr HostLoc ABI_PARAM6 = HostLoc::R9; + +constexpr std::array<HostLoc, 25> ABI_ALL_CALLER_SAVE = { + HostLoc::RAX, + HostLoc::RCX, + HostLoc::RDX, + HostLoc::RDI, + HostLoc::RSI, + HostLoc::R8, + HostLoc::R9, + HostLoc::R10, + HostLoc::R11, + HostLoc::XMM0, + HostLoc::XMM1, + HostLoc::XMM2, + HostLoc::XMM3, + HostLoc::XMM4, + HostLoc::XMM5, + HostLoc::XMM6, + HostLoc::XMM7, + HostLoc::XMM8, + HostLoc::XMM9, + HostLoc::XMM10, + HostLoc::XMM11, + HostLoc::XMM12, + HostLoc::XMM13, + HostLoc::XMM14, + HostLoc::XMM15, +}; + +constexpr std::array<HostLoc, 6> ABI_ALL_CALLEE_SAVE = { + HostLoc::RBX, + HostLoc::RBP, + HostLoc::R12, + HostLoc::R13, + HostLoc::R14, + HostLoc::R15, +}; + +constexpr size_t ABI_SHADOW_SPACE = 0; // bytes + +#endif + +static_assert(ABI_ALL_CALLER_SAVE.size() + ABI_ALL_CALLEE_SAVE.size() == 31, "Invalid total number of registers"); + +void ABI_PushCalleeSaveRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size = 0); +void ABI_PopCalleeSaveRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size = 0); +void ABI_PushCallerSaveRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size = 0); +void ABI_PopCallerSaveRegistersAndAdjustStack(BlockOfCode& code, size_t frame_size = 0); + +void ABI_PushCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, HostLoc exception); +void ABI_PopCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, HostLoc exception); + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp b/externals/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp new file mode 100644 index 0000000000..22d9868fc5 --- /dev/null +++ 
b/externals/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp @@ -0,0 +1,540 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/x64/block_of_code.h" + +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# include <windows.h> +#else +# include <sys/mman.h> +#endif + +#ifdef __APPLE__ +# include <errno.h> +# include <fmt/format.h> +# include <sys/sysctl.h> +#endif + +#include <array> +#include <cstring> + +#include <mcl/assert.hpp> +#include <mcl/bit/bit_field.hpp> +#include <xbyak/xbyak.h> + +#include "dynarmic/backend/x64/a32_jitstate.h" +#include "dynarmic/backend/x64/abi.h" +#include "dynarmic/backend/x64/hostloc.h" +#include "dynarmic/backend/x64/perf_map.h" +#include "dynarmic/backend/x64/stack_layout.h" + +namespace Dynarmic::Backend::X64 { + +#ifdef _WIN32 +const Xbyak::Reg64 BlockOfCode::ABI_RETURN = HostLocToReg64(Dynarmic::Backend::X64::ABI_RETURN); +const Xbyak::Reg64 BlockOfCode::ABI_PARAM1 = HostLocToReg64(Dynarmic::Backend::X64::ABI_PARAM1); +const Xbyak::Reg64 BlockOfCode::ABI_PARAM2 = HostLocToReg64(Dynarmic::Backend::X64::ABI_PARAM2); +const Xbyak::Reg64 BlockOfCode::ABI_PARAM3 = HostLocToReg64(Dynarmic::Backend::X64::ABI_PARAM3); +const Xbyak::Reg64 BlockOfCode::ABI_PARAM4 = HostLocToReg64(Dynarmic::Backend::X64::ABI_PARAM4); +const std::array<Xbyak::Reg64, ABI_PARAM_COUNT> BlockOfCode::ABI_PARAMS = {BlockOfCode::ABI_PARAM1, BlockOfCode::ABI_PARAM2, BlockOfCode::ABI_PARAM3, BlockOfCode::ABI_PARAM4}; +#else +const Xbyak::Reg64 BlockOfCode::ABI_RETURN = HostLocToReg64(Dynarmic::Backend::X64::ABI_RETURN); +const Xbyak::Reg64 BlockOfCode::ABI_RETURN2 = HostLocToReg64(Dynarmic::Backend::X64::ABI_RETURN2); +const Xbyak::Reg64 BlockOfCode::ABI_PARAM1 = HostLocToReg64(Dynarmic::Backend::X64::ABI_PARAM1); +const Xbyak::Reg64 BlockOfCode::ABI_PARAM2 = HostLocToReg64(Dynarmic::Backend::X64::ABI_PARAM2); +const Xbyak::Reg64 BlockOfCode::ABI_PARAM3 = HostLocToReg64(Dynarmic::Backend::X64::ABI_PARAM3); +const Xbyak::Reg64 BlockOfCode::ABI_PARAM4 = HostLocToReg64(Dynarmic::Backend::X64::ABI_PARAM4); +const Xbyak::Reg64 BlockOfCode::ABI_PARAM5 = HostLocToReg64(Dynarmic::Backend::X64::ABI_PARAM5); +const Xbyak::Reg64 BlockOfCode::ABI_PARAM6 = HostLocToReg64(Dynarmic::Backend::X64::ABI_PARAM6); +const std::array<Xbyak::Reg64, ABI_PARAM_COUNT> BlockOfCode::ABI_PARAMS = {BlockOfCode::ABI_PARAM1, BlockOfCode::ABI_PARAM2, BlockOfCode::ABI_PARAM3, BlockOfCode::ABI_PARAM4, BlockOfCode::ABI_PARAM5, BlockOfCode::ABI_PARAM6}; +#endif + +namespace { + +constexpr size_t CONSTANT_POOL_SIZE = 2 * 1024 * 1024; +constexpr size_t PRELUDE_COMMIT_SIZE = 16 * 1024 * 1024; + +class CustomXbyakAllocator : public Xbyak::Allocator { +public: +#ifdef _WIN32 + uint8_t* alloc(size_t size) override { + void* p = VirtualAlloc(nullptr, size, MEM_RESERVE, PAGE_READWRITE); + if (p == nullptr) { + throw Xbyak::Error(Xbyak::ERR_CANT_ALLOC); + } + return static_cast<uint8_t*>(p); + } + + void free(uint8_t* p) override { + VirtualFree(static_cast<void*>(p), 0, MEM_RELEASE); + } + + bool useProtect() const override { return false; } +#else + static constexpr size_t DYNARMIC_PAGE_SIZE = 4096; + + // Can't subclass Xbyak::MmapAllocator because it is not a pure interface + // and doesn't expose its construtor + uint8_t* alloc(size_t size) override { + // Waste a page to store the size + size += DYNARMIC_PAGE_SIZE; + +# if defined(MAP_ANONYMOUS) + int mode = MAP_PRIVATE | MAP_ANONYMOUS; +# elif defined(MAP_ANON) + int mode = 
MAP_PRIVATE | MAP_ANON; +# else +# error "not supported" +# endif +# ifdef MAP_JIT + mode |= MAP_JIT; +# endif + + void* p = mmap(nullptr, size, PROT_READ | PROT_WRITE, mode, -1, 0); + if (p == MAP_FAILED) { + throw Xbyak::Error(Xbyak::ERR_CANT_ALLOC); + } + std::memcpy(p, &size, sizeof(size_t)); + return static_cast<uint8_t*>(p) + DYNARMIC_PAGE_SIZE; + } + + void free(uint8_t* p) override { + size_t size; + std::memcpy(&size, p - DYNARMIC_PAGE_SIZE, sizeof(size_t)); + munmap(p - DYNARMIC_PAGE_SIZE, size); + } + +# ifdef DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT + bool useProtect() const override { return false; } +# endif +#endif +}; + +// This is threadsafe as Xbyak::Allocator does not contain any state; it is a pure interface. +CustomXbyakAllocator s_allocator; + +#ifdef DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT +void ProtectMemory(const void* base, size_t size, bool is_executable) { +# ifdef _WIN32 + DWORD oldProtect = 0; + VirtualProtect(const_cast<void*>(base), size, is_executable ? PAGE_EXECUTE_READ : PAGE_READWRITE, &oldProtect); +# else + static const size_t pageSize = sysconf(_SC_PAGESIZE); + const size_t iaddr = reinterpret_cast<size_t>(base); + const size_t roundAddr = iaddr & ~(pageSize - static_cast<size_t>(1)); + const int mode = is_executable ? (PROT_READ | PROT_EXEC) : (PROT_READ | PROT_WRITE); + mprotect(reinterpret_cast<void*>(roundAddr), size + (iaddr - roundAddr), mode); +# endif +} +#endif + +HostFeature GetHostFeatures() { + HostFeature features = {}; + +#ifdef DYNARMIC_ENABLE_CPU_FEATURE_DETECTION + using Cpu = Xbyak::util::Cpu; + Xbyak::util::Cpu cpu_info; + + if (cpu_info.has(Cpu::tSSSE3)) + features |= HostFeature::SSSE3; + if (cpu_info.has(Cpu::tSSE41)) + features |= HostFeature::SSE41; + if (cpu_info.has(Cpu::tSSE42)) + features |= HostFeature::SSE42; + if (cpu_info.has(Cpu::tAVX)) + features |= HostFeature::AVX; + if (cpu_info.has(Cpu::tAVX2)) + features |= HostFeature::AVX2; + if (cpu_info.has(Cpu::tAVX512F)) + features |= HostFeature::AVX512F; + if (cpu_info.has(Cpu::tAVX512CD)) + features |= HostFeature::AVX512CD; + if (cpu_info.has(Cpu::tAVX512VL)) + features |= HostFeature::AVX512VL; + if (cpu_info.has(Cpu::tAVX512BW)) + features |= HostFeature::AVX512BW; + if (cpu_info.has(Cpu::tAVX512DQ)) + features |= HostFeature::AVX512DQ; + if (cpu_info.has(Cpu::tAVX512_BITALG)) + features |= HostFeature::AVX512BITALG; + if (cpu_info.has(Cpu::tAVX512VBMI)) + features |= HostFeature::AVX512VBMI; + if (cpu_info.has(Cpu::tPCLMULQDQ)) + features |= HostFeature::PCLMULQDQ; + if (cpu_info.has(Cpu::tF16C)) + features |= HostFeature::F16C; + if (cpu_info.has(Cpu::tFMA)) + features |= HostFeature::FMA; + if (cpu_info.has(Cpu::tAESNI)) + features |= HostFeature::AES; + if (cpu_info.has(Cpu::tSHA)) + features |= HostFeature::SHA; + if (cpu_info.has(Cpu::tPOPCNT)) + features |= HostFeature::POPCNT; + if (cpu_info.has(Cpu::tBMI1)) + features |= HostFeature::BMI1; + if (cpu_info.has(Cpu::tBMI2)) + features |= HostFeature::BMI2; + if (cpu_info.has(Cpu::tLZCNT)) + features |= HostFeature::LZCNT; + if (cpu_info.has(Cpu::tGFNI)) + features |= HostFeature::GFNI; + + if (cpu_info.has(Cpu::tBMI2)) { + // BMI2 instructions such as pdep and pext have been very slow up until Zen 3. + // Check for Zen 3 or newer by its family (0x19). 
+ // See also: https://en.wikichip.org/wiki/amd/cpuid + if (cpu_info.has(Cpu::tAMD)) { + std::array<u32, 4> data{}; + cpu_info.getCpuid(1, data.data()); + const u32 family_base = mcl::bit::get_bits<8, 11>(data[0]); + const u32 family_extended = mcl::bit::get_bits<20, 27>(data[0]); + const u32 family = family_base + family_extended; + if (family >= 0x19) + features |= HostFeature::FastBMI2; + } else { + features |= HostFeature::FastBMI2; + } + } +#endif + + return features; +} + +#ifdef __APPLE__ +bool IsUnderRosetta() { + int result = 0; + size_t result_size = sizeof(result); + if (sysctlbyname("sysctl.proc_translated", &result, &result_size, nullptr, 0) == -1) { + if (errno != ENOENT) + fmt::print("IsUnderRosetta: Failed to detect Rosetta state, assuming not under Rosetta"); + return false; + } + return result != 0; +} +#endif + +} // anonymous namespace + +BlockOfCode::BlockOfCode(RunCodeCallbacks cb, JitStateInfo jsi, size_t total_code_size, std::function<void(BlockOfCode&)> rcp) + : Xbyak::CodeGenerator(total_code_size, nullptr, &s_allocator) + , cb(std::move(cb)) + , jsi(jsi) + , constant_pool(*this, CONSTANT_POOL_SIZE) + , host_features(GetHostFeatures()) { + EnableWriting(); + EnsureMemoryCommitted(PRELUDE_COMMIT_SIZE); + GenRunCode(rcp); +} + +void BlockOfCode::PreludeComplete() { + prelude_complete = true; + code_begin = getCurr(); + ClearCache(); + DisableWriting(); +} + +void BlockOfCode::EnableWriting() { +#ifdef DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT +# ifdef _WIN32 + ProtectMemory(getCode(), committed_size, false); +# else + ProtectMemory(getCode(), maxSize_, false); +# endif +#endif +} + +void BlockOfCode::DisableWriting() { +#ifdef DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT +# ifdef _WIN32 + ProtectMemory(getCode(), committed_size, true); +# else + ProtectMemory(getCode(), maxSize_, true); +# endif +#endif +} + +void BlockOfCode::ClearCache() { + ASSERT(prelude_complete); + SetCodePtr(code_begin); +} + +size_t BlockOfCode::SpaceRemaining() const { + ASSERT(prelude_complete); + const u8* current_ptr = getCurr<const u8*>(); + if (current_ptr >= &top_[maxSize_]) + return 0; + return &top_[maxSize_] - current_ptr; +} + +void BlockOfCode::EnsureMemoryCommitted([[maybe_unused]] size_t codesize) { +#ifdef _WIN32 + if (committed_size < size_ + codesize) { + committed_size = std::min<size_t>(maxSize_, committed_size + codesize); +# ifdef DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT + VirtualAlloc(top_, committed_size, MEM_COMMIT, PAGE_READWRITE); +# else + VirtualAlloc(top_, committed_size, MEM_COMMIT, PAGE_EXECUTE_READWRITE); +# endif + } +#endif +} + +HaltReason BlockOfCode::RunCode(void* jit_state, CodePtr code_ptr) const { + return run_code(jit_state, code_ptr); +} + +HaltReason BlockOfCode::StepCode(void* jit_state, CodePtr code_ptr) const { + return step_code(jit_state, code_ptr); +} + +void BlockOfCode::ReturnFromRunCode(bool mxcsr_already_exited) { + size_t index = 0; + if (mxcsr_already_exited) + index |= MXCSR_ALREADY_EXITED; + jmp(return_from_run_code[index]); +} + +void BlockOfCode::ForceReturnFromRunCode(bool mxcsr_already_exited) { + size_t index = FORCE_RETURN; + if (mxcsr_already_exited) + index |= MXCSR_ALREADY_EXITED; + jmp(return_from_run_code[index]); +} + +void BlockOfCode::GenRunCode(std::function<void(BlockOfCode&)> rcp) { + Xbyak::Label return_to_caller, return_to_caller_mxcsr_already_exited; + + align(); + run_code = getCurr<RunCodeFuncType>(); + + // This serves two purposes: + // 1. It saves all the registers we as a callee need to save. + // 2. 
It aligns the stack so that the code the JIT emits can assume + // that the stack is appropriately aligned for CALLs. + ABI_PushCalleeSaveRegistersAndAdjustStack(*this, sizeof(StackLayout)); + + mov(r15, ABI_PARAM1); + mov(rbx, ABI_PARAM2); // save temporarily in non-volatile register + + if (cb.enable_cycle_counting) { + cb.GetTicksRemaining->EmitCall(*this); + mov(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_to_run)], ABI_RETURN); + mov(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], ABI_RETURN); + } + + rcp(*this); + + cmp(dword[r15 + jsi.offsetof_halt_reason], 0); + jne(return_to_caller_mxcsr_already_exited, T_NEAR); + + SwitchMxcsrOnEntry(); + jmp(rbx); + + align(); + step_code = getCurr<RunCodeFuncType>(); + + ABI_PushCalleeSaveRegistersAndAdjustStack(*this, sizeof(StackLayout)); + + mov(r15, ABI_PARAM1); + + if (cb.enable_cycle_counting) { + mov(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_to_run)], 1); + mov(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], 1); + } + + rcp(*this); + + cmp(dword[r15 + jsi.offsetof_halt_reason], 0); + jne(return_to_caller_mxcsr_already_exited, T_NEAR); + lock(); + or_(dword[r15 + jsi.offsetof_halt_reason], static_cast<u32>(HaltReason::Step)); + + SwitchMxcsrOnEntry(); + jmp(ABI_PARAM2); + + // Dispatcher loop + + align(); + return_from_run_code[0] = getCurr<const void*>(); + + cmp(dword[r15 + jsi.offsetof_halt_reason], 0); + jne(return_to_caller); + if (cb.enable_cycle_counting) { + cmp(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], 0); + jng(return_to_caller); + } + cb.LookupBlock->EmitCall(*this); + jmp(ABI_RETURN); + + align(); + return_from_run_code[MXCSR_ALREADY_EXITED] = getCurr<const void*>(); + + cmp(dword[r15 + jsi.offsetof_halt_reason], 0); + jne(return_to_caller_mxcsr_already_exited); + if (cb.enable_cycle_counting) { + cmp(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], 0); + jng(return_to_caller_mxcsr_already_exited); + } + SwitchMxcsrOnEntry(); + cb.LookupBlock->EmitCall(*this); + jmp(ABI_RETURN); + + align(); + return_from_run_code[FORCE_RETURN] = getCurr<const void*>(); + L(return_to_caller); + + SwitchMxcsrOnExit(); + // fallthrough + + return_from_run_code[MXCSR_ALREADY_EXITED | FORCE_RETURN] = getCurr<const void*>(); + L(return_to_caller_mxcsr_already_exited); + + if (cb.enable_cycle_counting) { + cb.AddTicks->EmitCall(*this, [this](RegList param) { + mov(param[0], qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_to_run)]); + sub(param[0], qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)]); + }); + } + + xor_(eax, eax); + lock(); + xchg(dword[r15 + jsi.offsetof_halt_reason], eax); + + ABI_PopCalleeSaveRegistersAndAdjustStack(*this, sizeof(StackLayout)); + ret(); + + PerfMapRegister(run_code, getCurr(), "dynarmic_dispatcher"); +} + +void BlockOfCode::SwitchMxcsrOnEntry() { + stmxcsr(dword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, save_host_MXCSR)]); + ldmxcsr(dword[r15 + jsi.offsetof_guest_MXCSR]); +} + +void BlockOfCode::SwitchMxcsrOnExit() { + stmxcsr(dword[r15 + jsi.offsetof_guest_MXCSR]); + ldmxcsr(dword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, save_host_MXCSR)]); +} + +void BlockOfCode::EnterStandardASIMD() { + stmxcsr(dword[r15 + jsi.offsetof_guest_MXCSR]); + ldmxcsr(dword[r15 + jsi.offsetof_asimd_MXCSR]); +} + +void BlockOfCode::LeaveStandardASIMD() { + stmxcsr(dword[r15 + jsi.offsetof_asimd_MXCSR]); + ldmxcsr(dword[r15 + jsi.offsetof_guest_MXCSR]); +} + 
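+// [Illustrative note, not in the upstream source] Three MXCSR images are juggled here: the host MXCSR is
+// saved to StackLayout::save_host_MXCSR on entry and restored on exit; guest_MXCSR (0x1f80 by default in
+// A64JitState: all exceptions masked, round-to-nearest) tracks the guest FPCR; and asimd_MXCSR (0x9fc0 by
+// default: additionally FZ, bit 15, and DAZ, bit 6) is swapped in for "standard FP" ASIMD regions.
+// A hypothetical emitter needing those semantics would bracket its code like:
+//     code.EnterStandardASIMD();
+//     code.addps(xmm0, xmm1);      // executes with flush-to-zero / denormals-are-zero
+//     code.LeaveStandardASIMD();   // restores the guest MXCSR image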
+void BlockOfCode::UpdateTicks() { + if (!cb.enable_cycle_counting) { + return; + } + + cb.AddTicks->EmitCall(*this, [this](RegList param) { + mov(param[0], qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_to_run)]); + sub(param[0], qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)]); + }); + + cb.GetTicksRemaining->EmitCall(*this); + mov(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_to_run)], ABI_RETURN); + mov(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], ABI_RETURN); +} + +void BlockOfCode::LookupBlock() { + cb.LookupBlock->EmitCall(*this); +} + +void BlockOfCode::LoadRequiredFlagsForCondFromRax(IR::Cond cond) { +#ifdef __APPLE__ + static const bool is_rosetta = IsUnderRosetta(); +#endif + + // sahf restores SF, ZF, CF + // add al, 0x7F restores OF + + switch (cond) { + case IR::Cond::EQ: // z + case IR::Cond::NE: // !z + case IR::Cond::CS: // c + case IR::Cond::CC: // !c + case IR::Cond::MI: // n + case IR::Cond::PL: // !n + sahf(); + break; + case IR::Cond::VS: // v + case IR::Cond::VC: // !v + cmp(al, 0x81); + break; + case IR::Cond::HI: // c & !z + case IR::Cond::LS: // !c | z + sahf(); + cmc(); + break; + case IR::Cond::GE: // n == v + case IR::Cond::LT: // n != v + case IR::Cond::GT: // !z & (n == v) + case IR::Cond::LE: // z | (n != v) +#ifdef __APPLE__ + if (is_rosetta) { + shl(al, 3); + xchg(al, ah); + push(rax); + popf(); + break; + } +#endif + cmp(al, 0x81); + sahf(); + break; + case IR::Cond::AL: + case IR::Cond::NV: + break; + default: + ASSERT_MSG(false, "Unknown cond {}", static_cast<size_t>(cond)); + break; + } +} + +Xbyak::Address BlockOfCode::Const(const Xbyak::AddressFrame& frame, u64 lower, u64 upper) { + return constant_pool.GetConstant(frame, lower, upper); +} + +CodePtr BlockOfCode::GetCodeBegin() const { + return code_begin; +} + +size_t BlockOfCode::GetTotalCodeSize() const { + return maxSize_; +} + +void* BlockOfCode::AllocateFromCodeSpace(size_t alloc_size) { + if (size_ + alloc_size >= maxSize_) { + throw Xbyak::Error(Xbyak::ERR_CODE_IS_TOO_BIG); + } + + EnsureMemoryCommitted(alloc_size); + + void* ret = getCurr<void*>(); + size_ += alloc_size; + memset(ret, 0, alloc_size); + return ret; +} + +void BlockOfCode::SetCodePtr(CodePtr code_ptr) { + // The "size" defines where top_, the insertion point, is. + size_t required_size = reinterpret_cast<const u8*>(code_ptr) - getCode(); + setSize(required_size); +} + +void BlockOfCode::EnsurePatchLocationSize(CodePtr begin, size_t size) { + size_t current_size = getCurr<const u8*>() - reinterpret_cast<const u8*>(begin); + ASSERT(current_size <= size); + nop(size - current_size); +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/block_of_code.h b/externals/dynarmic/src/dynarmic/backend/x64/block_of_code.h new file mode 100644 index 0000000000..3f7573af25 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/block_of_code.h @@ -0,0 +1,205 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <array> +#include <functional> +#include <memory> +#include <type_traits> + +#include <mcl/bit/bit_field.hpp> +#include <mcl/stdint.hpp> +#include <xbyak/xbyak.h> +#include <xbyak/xbyak_util.h> + +#include "dynarmic/backend/x64/abi.h" +#include "dynarmic/backend/x64/callback.h" +#include "dynarmic/backend/x64/constant_pool.h" +#include "dynarmic/backend/x64/host_feature.h" +#include "dynarmic/backend/x64/jitstate_info.h" +#include "dynarmic/common/cast_util.h" +#include "dynarmic/interface/halt_reason.h" +#include "dynarmic/ir/cond.h" + +namespace Dynarmic::Backend::X64 { + +using CodePtr = const void*; + +struct RunCodeCallbacks { + std::unique_ptr<Callback> LookupBlock; + std::unique_ptr<Callback> AddTicks; + std::unique_ptr<Callback> GetTicksRemaining; + bool enable_cycle_counting; +}; + +class BlockOfCode final : public Xbyak::CodeGenerator { +public: + BlockOfCode(RunCodeCallbacks cb, JitStateInfo jsi, size_t total_code_size, std::function<void(BlockOfCode&)> rcp); + BlockOfCode(const BlockOfCode&) = delete; + + /// Call when external emitters have finished emitting their preludes. + void PreludeComplete(); + + /// Change permissions to RW. This is required to support systems with W^X enforced. + void EnableWriting(); + /// Change permissions to RX. This is required to support systems with W^X enforced. + void DisableWriting(); + + /// Clears this block of code and resets code pointer to beginning. + void ClearCache(); + /// Calculates how much space is remaining to use. + size_t SpaceRemaining() const; + /// Ensure at least codesize bytes of code cache memory are committed at the current code_ptr. + void EnsureMemoryCommitted(size_t codesize); + + /// Runs emulated code from code_ptr. + HaltReason RunCode(void* jit_state, CodePtr code_ptr) const; + /// Runs emulated code from code_ptr for a single cycle. 
+ HaltReason StepCode(void* jit_state, CodePtr code_ptr) const; + /// Code emitter: Returns to dispatcher + void ReturnFromRunCode(bool mxcsr_already_exited = false); + /// Code emitter: Returns to dispatcher, forces return to host + void ForceReturnFromRunCode(bool mxcsr_already_exited = false); + /// Code emitter: Makes guest MXCSR the current MXCSR + void SwitchMxcsrOnEntry(); + /// Code emitter: Makes saved host MXCSR the current MXCSR + void SwitchMxcsrOnExit(); + /// Code emitter: Enter standard ASIMD MXCSR region + void EnterStandardASIMD(); + /// Code emitter: Leave standard ASIMD MXCSR region + void LeaveStandardASIMD(); + /// Code emitter: Updates cycles remaining my calling cb.AddTicks and cb.GetTicksRemaining + /// @note this clobbers ABI caller-save registers + void UpdateTicks(); + /// Code emitter: Performs a block lookup based on current state + /// @note this clobbers ABI caller-save registers + void LookupBlock(); + + /// Code emitter: Load required flags for conditional cond from rax into host rflags + void LoadRequiredFlagsForCondFromRax(IR::Cond cond); + + /// Code emitter: Calls the function + template<typename FunctionPointer> + void CallFunction(FunctionPointer fn) { + static_assert(std::is_pointer_v<FunctionPointer> && std::is_function_v<std::remove_pointer_t<FunctionPointer>>, + "Supplied type must be a pointer to a function"); + + const u64 address = reinterpret_cast<u64>(fn); + const u64 distance = address - (getCurr<u64>() + 5); + + if (distance >= 0x0000000080000000ULL && distance < 0xFFFFFFFF80000000ULL) { + // Far call + mov(rax, address); + call(rax); + } else { + call(fn); + } + } + + /// Code emitter: Calls the lambda. Lambda must not have any captures. + template<typename Lambda> + void CallLambda(Lambda l) { + CallFunction(Common::FptrCast(l)); + } + + void ZeroExtendFrom(size_t bitsize, Xbyak::Reg64 reg) { + switch (bitsize) { + case 8: + movzx(reg.cvt32(), reg.cvt8()); + return; + case 16: + movzx(reg.cvt32(), reg.cvt16()); + return; + case 32: + mov(reg.cvt32(), reg.cvt32()); + return; + case 64: + return; + default: + UNREACHABLE(); + } + } + + Xbyak::Address Const(const Xbyak::AddressFrame& frame, u64 lower, u64 upper = 0); + + template<size_t esize> + Xbyak::Address BConst(const Xbyak::AddressFrame& frame, u64 value) { + return Const(frame, mcl::bit::replicate_element<u64>(esize, value), + mcl::bit::replicate_element<u64>(esize, value)); + } + + CodePtr GetCodeBegin() const; + size_t GetTotalCodeSize() const; + + const void* GetReturnFromRunCodeAddress() const { + return return_from_run_code[0]; + } + + const void* GetForceReturnFromRunCodeAddress() const { + return return_from_run_code[FORCE_RETURN]; + } + + void int3() { db(0xCC); } + + /// Allocate memory of `size` bytes from the same block of memory the code is in. + /// This is useful for objects that need to be placed close to or within code. + /// The lifetime of this memory is the same as the code around it. 
+ void* AllocateFromCodeSpace(size_t size); + + void SetCodePtr(CodePtr code_ptr); + void EnsurePatchLocationSize(CodePtr begin, size_t size); + + // ABI registers +#ifdef _WIN32 + static const Xbyak::Reg64 ABI_RETURN; + static const Xbyak::Reg64 ABI_PARAM1; + static const Xbyak::Reg64 ABI_PARAM2; + static const Xbyak::Reg64 ABI_PARAM3; + static const Xbyak::Reg64 ABI_PARAM4; + static const std::array<Xbyak::Reg64, ABI_PARAM_COUNT> ABI_PARAMS; +#else + static const Xbyak::Reg64 ABI_RETURN; + static const Xbyak::Reg64 ABI_RETURN2; + static const Xbyak::Reg64 ABI_PARAM1; + static const Xbyak::Reg64 ABI_PARAM2; + static const Xbyak::Reg64 ABI_PARAM3; + static const Xbyak::Reg64 ABI_PARAM4; + static const Xbyak::Reg64 ABI_PARAM5; + static const Xbyak::Reg64 ABI_PARAM6; + static const std::array<Xbyak::Reg64, ABI_PARAM_COUNT> ABI_PARAMS; +#endif + + JitStateInfo GetJitStateInfo() const { return jsi; } + + bool HasHostFeature(HostFeature feature) const { + return (host_features & feature) == feature; + } + +private: + RunCodeCallbacks cb; + JitStateInfo jsi; + + bool prelude_complete = false; + CodePtr code_begin = nullptr; + +#ifdef _WIN32 + size_t committed_size = 0; +#endif + + ConstantPool constant_pool; + + using RunCodeFuncType = HaltReason (*)(void*, CodePtr); + RunCodeFuncType run_code = nullptr; + RunCodeFuncType step_code = nullptr; + static constexpr size_t MXCSR_ALREADY_EXITED = 1 << 0; + static constexpr size_t FORCE_RETURN = 1 << 1; + std::array<const void*, 4> return_from_run_code; + void GenRunCode(std::function<void(BlockOfCode&)> rcp); + + const HostFeature host_features; +}; + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/callback.cpp b/externals/dynarmic/src/dynarmic/backend/x64/callback.cpp new file mode 100644 index 0000000000..1ecd8304f1 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/callback.cpp @@ -0,0 +1,41 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/x64/callback.h" + +#include "dynarmic/backend/x64/block_of_code.h" + +namespace Dynarmic::Backend::X64 { + +Callback::~Callback() = default; + +void SimpleCallback::EmitCall(BlockOfCode& code, std::function<void(RegList)> l) const { + l({code.ABI_PARAM1, code.ABI_PARAM2, code.ABI_PARAM3, code.ABI_PARAM4}); + code.CallFunction(fn); +} + +void SimpleCallback::EmitCallWithReturnPointer(BlockOfCode& code, std::function<void(Xbyak::Reg64, RegList)> l) const { + l(code.ABI_PARAM1, {code.ABI_PARAM2, code.ABI_PARAM3, code.ABI_PARAM4}); + code.CallFunction(fn); +} + +void ArgCallback::EmitCall(BlockOfCode& code, std::function<void(RegList)> l) const { + l({code.ABI_PARAM2, code.ABI_PARAM3, code.ABI_PARAM4}); + code.mov(code.ABI_PARAM1, arg); + code.CallFunction(fn); +} + +void ArgCallback::EmitCallWithReturnPointer(BlockOfCode& code, std::function<void(Xbyak::Reg64, RegList)> l) const { +#if defined(WIN32) && !defined(__MINGW64__) + l(code.ABI_PARAM2, {code.ABI_PARAM3, code.ABI_PARAM4}); + code.mov(code.ABI_PARAM1, arg); +#else + l(code.ABI_PARAM1, {code.ABI_PARAM3, code.ABI_PARAM4}); + code.mov(code.ABI_PARAM2, arg); +#endif + code.CallFunction(fn); +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/callback.h b/externals/dynarmic/src/dynarmic/backend/x64/callback.h new file mode 100644 index 0000000000..716555daed --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/callback.h @@ -0,0 +1,63 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <functional> +#include <vector> + +#include <mcl/stdint.hpp> +#include <xbyak/xbyak.h> + +namespace Dynarmic::Backend::X64 { + +using RegList = std::vector<Xbyak::Reg64>; + +class BlockOfCode; + +class Callback { +public: + virtual ~Callback(); + + void EmitCall(BlockOfCode& code) const { + EmitCall(code, [](RegList) {}); + } + + virtual void EmitCall(BlockOfCode& code, std::function<void(RegList)> fn) const = 0; + virtual void EmitCallWithReturnPointer(BlockOfCode& code, std::function<void(Xbyak::Reg64, RegList)> fn) const = 0; +}; + +class SimpleCallback final : public Callback { +public: + template<typename Function> + SimpleCallback(Function fn) + : fn(reinterpret_cast<void (*)()>(fn)) {} + + using Callback::EmitCall; + + void EmitCall(BlockOfCode& code, std::function<void(RegList)> fn) const override; + void EmitCallWithReturnPointer(BlockOfCode& code, std::function<void(Xbyak::Reg64, RegList)> fn) const override; + +private: + void (*fn)(); +}; + +class ArgCallback final : public Callback { +public: + template<typename Function> + ArgCallback(Function fn, u64 arg) + : fn(reinterpret_cast<void (*)()>(fn)), arg(arg) {} + + using Callback::EmitCall; + + void EmitCall(BlockOfCode& code, std::function<void(RegList)> fn) const override; + void EmitCallWithReturnPointer(BlockOfCode& code, std::function<void(Xbyak::Reg64, RegList)> fn) const override; + +private: + void (*fn)(); + u64 arg; +}; + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/constant_pool.cpp b/externals/dynarmic/src/dynarmic/backend/x64/constant_pool.cpp new file mode 100644 index 0000000000..ba003262b3 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/constant_pool.cpp @@ -0,0 +1,38 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/x64/constant_pool.h" + +#include <cstring> + +#include <mcl/assert.hpp> + +#include "dynarmic/backend/x64/block_of_code.h" + +namespace Dynarmic::Backend::X64 { + +ConstantPool::ConstantPool(BlockOfCode& code, size_t size) + : code(code), insertion_point(0) { + code.EnsureMemoryCommitted(align_size + size); + code.int3(); + code.align(align_size); + pool = std::span<ConstantT>( + reinterpret_cast<ConstantT*>(code.AllocateFromCodeSpace(size)), size / align_size); +} + +Xbyak::Address ConstantPool::GetConstant(const Xbyak::AddressFrame& frame, u64 lower, u64 upper) { + const auto constant = ConstantT(lower, upper); + auto iter = constant_info.find(constant); + if (iter == constant_info.end()) { + ASSERT(insertion_point < pool.size()); + ConstantT& target_constant = pool[insertion_point]; + target_constant = constant; + iter = constant_info.insert({constant, &target_constant}).first; + ++insertion_point; + } + return frame[code.rip + iter->second]; +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/constant_pool.h b/externals/dynarmic/src/dynarmic/backend/x64/constant_pool.h new file mode 100644 index 0000000000..2ab6421539 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/constant_pool.h @@ -0,0 +1,50 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <bit> +#include <cstddef> +#include <span> +#include <utility> + +#include <mcl/stdint.hpp> +#include <tsl/robin_map.h> +#include <xbyak/xbyak.h> + +namespace Dynarmic::Backend::X64 { + +class BlockOfCode; + +/// ConstantPool allocates a block of memory from BlockOfCode. +/// It places constants into this block of memory, returning the address +/// of the memory location where the constant is placed. If the constant +/// already exists, its memory location is reused. +class ConstantPool final { +public: + ConstantPool(BlockOfCode& code, size_t size); + + Xbyak::Address GetConstant(const Xbyak::AddressFrame& frame, u64 lower, u64 upper = 0); + +private: + static constexpr size_t align_size = 16; // bytes + + using ConstantT = std::pair<u64, u64>; + static_assert(sizeof(ConstantT) == align_size); + + struct ConstantHash { + std::size_t operator()(const ConstantT& constant) const noexcept { + return constant.first ^ std::rotl<u64>(constant.second, 1); + } + }; + + tsl::robin_map<ConstantT, void*, ConstantHash> constant_info; + + BlockOfCode& code; + std::span<ConstantT> pool; + std::size_t insertion_point; +}; + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/constants.h b/externals/dynarmic/src/dynarmic/backend/x64/constants.h new file mode 100644 index 0000000000..f817324142 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/constants.h @@ -0,0 +1,177 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <optional> + +#include <mcl/bit/bit_field.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/common/fp/rounding_mode.h" + +namespace Dynarmic::Backend::X64 { + +// Redefinition of _MM_CMP_* constants for use with the 'vcmp' instruction +namespace Cmp { +constexpr u8 Equal_OQ = 0; // Equal (Quiet, Ordered). +constexpr u8 LessThan_OS = 1; // Less (Signaling, Ordered). 
+constexpr u8 LessEqual_OS = 2; // Less/Equal (Signaling, Ordered). +constexpr u8 Unordered_Q = 3; // Unordered (Quiet). +constexpr u8 NotEqual_UQ = 4; // Not Equal (Quiet, Unordered). +constexpr u8 NotLessThan_US = 5; // Not Less (Signaling, Unordered). +constexpr u8 NotLessEqual_US = 6; // Not Less/Equal (Signaling, Unordered). +constexpr u8 Ordered_Q = 7; // Ordered (Quiet). +constexpr u8 Equal_UQ = 8; // Equal (Quiet, Unordered). +constexpr u8 NotGreaterEqual_US = 9; // Not Greater/Equal (Signaling, Unordered). +constexpr u8 NotGreaterThan_US = 10; // Not Greater (Signaling, Unordered). +constexpr u8 False_OQ = 11; // False (Quiet, Ordered). +constexpr u8 NotEqual_OQ = 12; // Not Equal (Quiet, Ordered). +constexpr u8 GreaterEqual_OS = 13; // Greater/Equal (Signaling, Ordered). +constexpr u8 GreaterThan_OS = 14; // Greater (Signaling, Ordered). +constexpr u8 True_UQ = 15; // True (Quiet, Unordered). +constexpr u8 Equal_OS = 16; // Equal (Signaling, Ordered). +constexpr u8 LessThan_OQ = 17; // Less (Quiet, Ordered). +constexpr u8 LessEqual_OQ = 18; // Less/Equal (Quiet, Ordered). +constexpr u8 Unordered_S = 19; // Unordered (Signaling). +constexpr u8 NotEqual_US = 20; // Not Equal (Signaling, Unordered). +constexpr u8 NotLessThan_UQ = 21; // Not Less (Quiet, Unordered). +constexpr u8 NotLessEqual_UQ = 22; // Not Less/Equal (Quiet, Unordered). +constexpr u8 Ordered_S = 23; // Ordered (Signaling). +constexpr u8 Equal_US = 24; // Equal (Signaling, Unordered). +constexpr u8 NotGreaterEqual_UQ = 25; // Not Greater/Equal (Quiet, Unordered). +constexpr u8 NotGreaterThan_UQ = 26; // Not Greater (Quiet, Unordered). +constexpr u8 False_OS = 27; // False (Signaling, Ordered). +constexpr u8 NotEqual_OS = 28; // Not Equal (Signaling, Ordered). +constexpr u8 GreaterEqual_OQ = 29; // Greater/Equal (Quiet, Ordered). +constexpr u8 GreaterThan_OQ = 30; // Greater (Quiet, Ordered). +constexpr u8 True_US = 31; // True (Signaling, Unordered). 
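+// [Illustrative usage, not in the upstream source] The predicate is passed as the immediate byte of the
+// AVX compare instructions, e.g. (operand names hypothetical):
+//     code.vcmpps(result, a, b, Cmp::GreaterEqual_OQ);  // per-lane all-ones where a >= b, quiet on NaN
+// The _O*/_U* and _*Q/_*S suffixes mirror the Ordered/Unordered and Quiet/Signaling variants of the
+// corresponding _MM_CMP_* constants.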
+} // namespace Cmp + +// Redefinition of _MM_CMPINT_* constants for use with the 'vpcmp' instruction +namespace CmpInt { +constexpr u8 Equal = 0x0; +constexpr u8 LessThan = 0x1; +constexpr u8 LessEqual = 0x2; +constexpr u8 False = 0x3; +constexpr u8 NotEqual = 0x4; +constexpr u8 NotLessThan = 0x5; +constexpr u8 GreaterEqual = 0x5; +constexpr u8 NotLessEqual = 0x6; +constexpr u8 GreaterThan = 0x6; +constexpr u8 True = 0x7; +} // namespace CmpInt + +// Used to generate ternary logic truth-tables for vpternlog +// Use these to directly refer to terms and perform binary operations upon them +// and the resulting value will be the ternary lookup table +// ex: +// (Tern::a | ~Tern::b) & Tern::c +// = 0b10100010 +// = 0xa2 +// vpternlog a, b, c, 0xa2 +// +// ~(Tern::a ^ Tern::b) & Tern::c +// = 0b10000010 +// = 0x82 +// vpternlog a, b, c, 0x82 +namespace Tern { +constexpr u8 a = 0b11110000; +constexpr u8 b = 0b11001100; +constexpr u8 c = 0b10101010; +} // namespace Tern + +// For use as a bitmask with vfpclass instruction +namespace FpClass { +constexpr u8 QNaN = 0b00000001; +constexpr u8 ZeroPos = 0b00000010; +constexpr u8 ZeroNeg = 0b00000100; +constexpr u8 InfPos = 0b00001000; +constexpr u8 InfNeg = 0b00010000; +constexpr u8 Denormal = 0b00100000; +constexpr u8 Negative = 0b01000000; // Negative finite value +constexpr u8 SNaN = 0b10000000; +} // namespace FpClass + +// Opcodes for use with vfixupimm +enum class FpFixup : u8 { + Dest = 0b0000, // Preserve destination + Norm_Src = 0b0001, // Source operand (Denormal as positive-zero) + QNaN_Src = 0b0010, // QNaN with sign of source (Denormal as positive-zero) + IndefNaN = 0b0011, // Indefinite QNaN (Negative QNaN with no payload on x86) + NegInf = 0b0100, // -Infinity + PosInf = 0b0101, // +Infinity + Inf_Src = 0b0110, // Infinity with sign of source (Denormal as positive-zero) + NegZero = 0b0111, // -0.0 + PosZero = 0b1000, // +0.0 + NegOne = 0b1001, // -1.0 + PosOne = 0b1010, // +1.0 + Half = 0b1011, // 0.5 + Ninety = 0b1100, // 90.0 + HalfPi = 0b1101, // PI/2 + PosMax = 0b1110, // +{FLT_MAX,DBL_MAX} + NegMax = 0b1111, // -{FLT_MAX,DBL_MAX} +}; + +// Generates 32-bit LUT for vfixupimm instruction +constexpr u32 FixupLUT(FpFixup src_qnan = FpFixup::Dest, + FpFixup src_snan = FpFixup::Dest, + FpFixup src_zero = FpFixup::Dest, + FpFixup src_posone = FpFixup::Dest, + FpFixup src_neginf = FpFixup::Dest, + FpFixup src_posinf = FpFixup::Dest, + FpFixup src_neg = FpFixup::Dest, + FpFixup src_pos = FpFixup::Dest) { + u32 fixup_lut = 0; + fixup_lut = mcl::bit::set_bits<0, 3, u32>(fixup_lut, static_cast<u32>(src_qnan)); + fixup_lut = mcl::bit::set_bits<4, 7, u32>(fixup_lut, static_cast<u32>(src_snan)); + fixup_lut = mcl::bit::set_bits<8, 11, u32>(fixup_lut, static_cast<u32>(src_zero)); + fixup_lut = mcl::bit::set_bits<12, 15, u32>(fixup_lut, static_cast<u32>(src_posone)); + fixup_lut = mcl::bit::set_bits<16, 19, u32>(fixup_lut, static_cast<u32>(src_neginf)); + fixup_lut = mcl::bit::set_bits<20, 23, u32>(fixup_lut, static_cast<u32>(src_posinf)); + fixup_lut = mcl::bit::set_bits<24, 27, u32>(fixup_lut, static_cast<u32>(src_neg)); + fixup_lut = mcl::bit::set_bits<28, 31, u32>(fixup_lut, static_cast<u32>(src_pos)); + return fixup_lut; +} + +// Opcodes for use with vrange* instructions +enum class FpRangeSelect : u8 { + Min = 0b00, + Max = 0b01, + AbsMin = 0b10, // Smaller absolute value + AbsMax = 0b11, // Larger absolute value +}; + +enum class FpRangeSign : u8 { + A = 0b00, // Copy sign of operand A + Preserve = 0b01, // Leave sign as is + Positive = 
0b10, // Set Positive + Negative = 0b11, // Set Negative +}; + +// Generates 8-bit immediate LUT for vrange instruction +constexpr u8 FpRangeLUT(FpRangeSelect range_select, FpRangeSign range_sign) { + u8 range_lut = 0; + range_lut = mcl::bit::set_bits<0, 1, u8>(range_lut, static_cast<u8>(range_select)); + range_lut = mcl::bit::set_bits<2, 3, u8>(range_lut, static_cast<u8>(range_sign)); + return range_lut; +} + +constexpr std::optional<int> ConvertRoundingModeToX64Immediate(FP::RoundingMode rounding_mode) { + switch (rounding_mode) { + case FP::RoundingMode::ToNearest_TieEven: + return 0b00; + case FP::RoundingMode::TowardsPlusInfinity: + return 0b10; + case FP::RoundingMode::TowardsMinusInfinity: + return 0b01; + case FP::RoundingMode::TowardsZero: + return 0b11; + default: + return std::nullopt; + } +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/devirtualize.h b/externals/dynarmic/src/dynarmic/backend/x64/devirtualize.h new file mode 100644 index 0000000000..778536a3bb --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/devirtualize.h @@ -0,0 +1,81 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <cstring> +#include <utility> + +#include <mcl/bit_cast.hpp> +#include <mcl/stdint.hpp> +#include <mcl/type_traits/function_info.hpp> + +#include "dynarmic/backend/x64/callback.h" + +namespace Dynarmic { +namespace Backend::X64 { + +namespace impl { + +template<typename FunctionType, FunctionType mfp> +struct ThunkBuilder; + +template<typename C, typename R, typename... Args, R (C::*mfp)(Args...)> +struct ThunkBuilder<R (C::*)(Args...), mfp> { + static R Thunk(C* this_, Args... args) { + return (this_->*mfp)(std::forward<Args>(args)...); + } +}; + +} // namespace impl + +template<auto mfp> +ArgCallback DevirtualizeGeneric(mcl::class_type<decltype(mfp)>* this_) { + return ArgCallback{&impl::ThunkBuilder<decltype(mfp), mfp>::Thunk, reinterpret_cast<u64>(this_)}; +} + +template<auto mfp> +ArgCallback DevirtualizeWindows(mcl::class_type<decltype(mfp)>* this_) { + static_assert(sizeof(mfp) == 8); + return ArgCallback{mcl::bit_cast<u64>(mfp), reinterpret_cast<u64>(this_)}; +} + +template<auto mfp> +ArgCallback DevirtualizeItanium(mcl::class_type<decltype(mfp)>* this_) { + struct MemberFunctionPointer { + /// For a non-virtual function, this is a simple function pointer. + /// For a virtual function, it is (1 + virtual table offset in bytes). + u64 ptr; + /// The required adjustment to `this`, prior to the call. 
+ u64 adj; + } mfp_struct = mcl::bit_cast<MemberFunctionPointer>(mfp); + + static_assert(sizeof(MemberFunctionPointer) == 16); + static_assert(sizeof(MemberFunctionPointer) == sizeof(mfp)); + + u64 fn_ptr = mfp_struct.ptr; + u64 this_ptr = reinterpret_cast<u64>(this_) + mfp_struct.adj; + if (mfp_struct.ptr & 1) { + u64 vtable = mcl::bit_cast_pointee<u64>(this_ptr); + fn_ptr = mcl::bit_cast_pointee<u64>(vtable + fn_ptr - 1); + } + return ArgCallback{fn_ptr, this_ptr}; +} + +template<auto mfp> +ArgCallback Devirtualize(mcl::class_type<decltype(mfp)>* this_) { +#if defined(__APPLE__) || defined(linux) || defined(__linux) || defined(__linux__) + return DevirtualizeItanium<mfp>(this_); +#elif defined(__MINGW64__) + return DevirtualizeItanium<mfp>(this_); +#elif defined(_WIN32) + return DevirtualizeWindows<mfp>(this_); +#else + return DevirtualizeGeneric<mfp>(this_); +#endif +} + +} // namespace Backend::X64 +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp new file mode 100644 index 0000000000..902ddef9a7 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp @@ -0,0 +1,415 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/x64/emit_x64.h" + +#include <iterator> + +#include <mcl/assert.hpp> +#include <mcl/bit/bit_field.hpp> +#include <mcl/scope_exit.hpp> +#include <mcl/stdint.hpp> +#include <tsl/robin_set.h> + +#include "dynarmic/backend/x64/block_of_code.h" +#include "dynarmic/backend/x64/nzcv_util.h" +#include "dynarmic/backend/x64/perf_map.h" +#include "dynarmic/backend/x64/stack_layout.h" +#include "dynarmic/backend/x64/verbose_debugging_output.h" +#include "dynarmic/common/variant_util.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" + +// TODO: Have ARM flags in host flags and not have them use up GPR registers unless necessary. +// TODO: Actually implement that proper instruction selector you've always wanted to sweetheart. 
+ +namespace Dynarmic::Backend::X64 { + +using namespace Xbyak::util; + +EmitContext::EmitContext(RegAlloc& reg_alloc, IR::Block& block) + : reg_alloc(reg_alloc), block(block) {} + +EmitContext::~EmitContext() = default; + +void EmitContext::EraseInstruction(IR::Inst* inst) { + block.Instructions().erase(inst); + inst->ClearArgs(); +} + +EmitX64::EmitX64(BlockOfCode& code) + : code(code) { + exception_handler.Register(code); +} + +EmitX64::~EmitX64() = default; + +std::optional<EmitX64::BlockDescriptor> EmitX64::GetBasicBlock(IR::LocationDescriptor descriptor) const { + const auto iter = block_descriptors.find(descriptor); + if (iter == block_descriptors.end()) { + return std::nullopt; + } + return iter->second; +} + +void EmitX64::EmitVoid(EmitContext&, IR::Inst*) { +} + +void EmitX64::EmitIdentity(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + if (!args[0].IsImmediate()) { + ctx.reg_alloc.DefineValue(inst, args[0]); + } +} + +void EmitX64::EmitBreakpoint(EmitContext&, IR::Inst*) { + code.int3(); +} + +void EmitX64::EmitCallHostFunction(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.HostCall(nullptr, args[1], args[2], args[3]); + code.mov(rax, args[0].GetImmediateU64()); + code.call(rax); +} + +void EmitX64::PushRSBHelper(Xbyak::Reg64 loc_desc_reg, Xbyak::Reg64 index_reg, IR::LocationDescriptor target) { + using namespace Xbyak::util; + + const auto iter = block_descriptors.find(target); + CodePtr target_code_ptr = iter != block_descriptors.end() + ? iter->second.entrypoint + : code.GetReturnFromRunCodeAddress(); + + code.mov(index_reg.cvt32(), dword[r15 + code.GetJitStateInfo().offsetof_rsb_ptr]); + + code.mov(loc_desc_reg, target.Value()); + + patch_information[target].mov_rcx.push_back(code.getCurr()); + EmitPatchMovRcx(target_code_ptr); + + code.mov(qword[r15 + index_reg * 8 + code.GetJitStateInfo().offsetof_rsb_location_descriptors], loc_desc_reg); + code.mov(qword[r15 + index_reg * 8 + code.GetJitStateInfo().offsetof_rsb_codeptrs], rcx); + + code.add(index_reg.cvt32(), 1); + code.and_(index_reg.cvt32(), u32(code.GetJitStateInfo().rsb_ptr_mask)); + code.mov(dword[r15 + code.GetJitStateInfo().offsetof_rsb_ptr], index_reg.cvt32()); +} + +void EmitX64::EmitVerboseDebuggingOutput(RegAlloc& reg_alloc) { + code.sub(rsp, sizeof(RegisterData)); + code.stmxcsr(dword[rsp + offsetof(RegisterData, mxcsr)]); + for (int i = 0; i < 16; i++) { + if (rsp.getIdx() == i) { + continue; + } + code.mov(qword[rsp + offsetof(RegisterData, gprs) + sizeof(u64) * i], Xbyak::Reg64{i}); + } + for (int i = 0; i < 16; i++) { + code.movaps(xword[rsp + offsetof(RegisterData, xmms) + 2 * sizeof(u64) * i], Xbyak::Xmm{i}); + } + code.lea(rax, ptr[rsp + sizeof(RegisterData) + offsetof(StackLayout, spill)]); + code.mov(xword[rsp + offsetof(RegisterData, spill)], rax); + + reg_alloc.EmitVerboseDebuggingOutput(); + + for (int i = 0; i < 16; i++) { + if (rsp.getIdx() == i) { + continue; + } + code.mov(Xbyak::Reg64{i}, qword[rsp + offsetof(RegisterData, gprs) + sizeof(u64) * i]); + } + for (int i = 0; i < 16; i++) { + code.movaps(Xbyak::Xmm{i}, xword[rsp + offsetof(RegisterData, xmms) + 2 * sizeof(u64) * i]); + } + code.ldmxcsr(dword[rsp + offsetof(RegisterData, mxcsr)]); + code.add(rsp, sizeof(RegisterData)); +} + +void EmitX64::EmitPushRSB(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(args[0].IsImmediate()); + const u64 unique_hash_of_target = args[0].GetImmediateU64(); + + 
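+    // RCX in particular is claimed below because PushRSBHelper emits a patchable
+    // `mov rcx, <target code ptr>` (EmitPatchMovRcx) and then stores rcx into the RSB
+    // code-pointer slot, so rcx must not hold a live value at this point.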
ctx.reg_alloc.ScratchGpr(HostLoc::RCX); + const Xbyak::Reg64 loc_desc_reg = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 index_reg = ctx.reg_alloc.ScratchGpr(); + + PushRSBHelper(loc_desc_reg, index_reg, IR::LocationDescriptor{unique_hash_of_target}); +} + +void EmitX64::EmitGetCarryFromOp(EmitContext& ctx, IR::Inst* inst) { + ctx.reg_alloc.RegisterPseudoOperation(inst); +} + +void EmitX64::EmitGetOverflowFromOp(EmitContext& ctx, IR::Inst* inst) { + ctx.reg_alloc.RegisterPseudoOperation(inst); +} + +void EmitX64::EmitGetGEFromOp(EmitContext& ctx, IR::Inst* inst) { + ctx.reg_alloc.RegisterPseudoOperation(inst); +} + +void EmitX64::EmitGetUpperFromOp(EmitContext& ctx, IR::Inst* inst) { + ctx.reg_alloc.RegisterPseudoOperation(inst); +} + +void EmitX64::EmitGetLowerFromOp(EmitContext& ctx, IR::Inst* inst) { + ctx.reg_alloc.RegisterPseudoOperation(inst); +} + +void EmitX64::EmitGetNZFromOp(EmitContext& ctx, IR::Inst* inst) { + if (ctx.reg_alloc.IsValueLive(inst)) { + ctx.reg_alloc.RegisterPseudoOperation(inst); + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const int bitsize = [&] { + switch (args[0].GetType()) { + case IR::Type::U8: + return 8; + case IR::Type::U16: + return 16; + case IR::Type::U32: + return 32; + case IR::Type::U64: + return 64; + default: + UNREACHABLE(); + } + }(); + + const Xbyak::Reg64 nz = ctx.reg_alloc.ScratchGpr(HostLoc::RAX); + const Xbyak::Reg value = ctx.reg_alloc.UseGpr(args[0]).changeBit(bitsize); + code.test(value, value); + code.lahf(); + code.movzx(eax, ah); + ctx.reg_alloc.DefineValue(inst, nz); +} + +void EmitX64::EmitGetNZCVFromOp(EmitContext& ctx, IR::Inst* inst) { + if (ctx.reg_alloc.IsValueLive(inst)) { + ctx.reg_alloc.RegisterPseudoOperation(inst); + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const int bitsize = [&] { + switch (args[0].GetType()) { + case IR::Type::U8: + return 8; + case IR::Type::U16: + return 16; + case IR::Type::U32: + return 32; + case IR::Type::U64: + return 64; + default: + UNREACHABLE(); + } + }(); + + const Xbyak::Reg64 nzcv = ctx.reg_alloc.ScratchGpr(HostLoc::RAX); + const Xbyak::Reg value = ctx.reg_alloc.UseGpr(args[0]).changeBit(bitsize); + code.test(value, value); + code.lahf(); + code.mov(al, 0); + ctx.reg_alloc.DefineValue(inst, nzcv); +} + +void EmitX64::EmitGetCFlagFromNZCV(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (args[0].IsImmediate()) { + const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); + const u32 value = (args[0].GetImmediateU32() >> 8) & 1; + code.mov(result, value); + ctx.reg_alloc.DefineValue(inst, result); + } else { + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + code.shr(result, 8); + code.and_(result, 1); + ctx.reg_alloc.DefineValue(inst, result); + } +} + +void EmitX64::EmitNZCVFromPackedFlags(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (args[0].IsImmediate()) { + const Xbyak::Reg32 nzcv = ctx.reg_alloc.ScratchGpr().cvt32(); + u32 value = 0; + value |= mcl::bit::get_bit<31>(args[0].GetImmediateU32()) ? (1 << 15) : 0; + value |= mcl::bit::get_bit<30>(args[0].GetImmediateU32()) ? (1 << 14) : 0; + value |= mcl::bit::get_bit<29>(args[0].GetImmediateU32()) ? (1 << 8) : 0; + value |= mcl::bit::get_bit<28>(args[0].GetImmediateU32()) ? 
(1 << 0) : 0; + code.mov(nzcv, value); + ctx.reg_alloc.DefineValue(inst, nzcv); + } else if (code.HasHostFeature(HostFeature::FastBMI2)) { + const Xbyak::Reg32 nzcv = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32(); + + code.shr(nzcv, 28); + code.mov(tmp, NZCV::x64_mask); + code.pdep(nzcv, nzcv, tmp); + + ctx.reg_alloc.DefineValue(inst, nzcv); + } else { + const Xbyak::Reg32 nzcv = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + + code.shr(nzcv, 28); + code.imul(nzcv, nzcv, NZCV::to_x64_multiplier); + code.and_(nzcv, NZCV::x64_mask); + + ctx.reg_alloc.DefineValue(inst, nzcv); + } +} + +void EmitX64::EmitAddCycles(size_t cycles) { + ASSERT(cycles < std::numeric_limits<s32>::max()); + code.sub(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], static_cast<u32>(cycles)); +} + +Xbyak::Label EmitX64::EmitCond(IR::Cond cond) { + Xbyak::Label pass; + + code.mov(eax, dword[r15 + code.GetJitStateInfo().offsetof_cpsr_nzcv]); + + code.LoadRequiredFlagsForCondFromRax(cond); + + switch (cond) { + case IR::Cond::EQ: + code.jz(pass); + break; + case IR::Cond::NE: + code.jnz(pass); + break; + case IR::Cond::CS: + code.jc(pass); + break; + case IR::Cond::CC: + code.jnc(pass); + break; + case IR::Cond::MI: + code.js(pass); + break; + case IR::Cond::PL: + code.jns(pass); + break; + case IR::Cond::VS: + code.jo(pass); + break; + case IR::Cond::VC: + code.jno(pass); + break; + case IR::Cond::HI: + code.ja(pass); + break; + case IR::Cond::LS: + code.jna(pass); + break; + case IR::Cond::GE: + code.jge(pass); + break; + case IR::Cond::LT: + code.jl(pass); + break; + case IR::Cond::GT: + code.jg(pass); + break; + case IR::Cond::LE: + code.jle(pass); + break; + default: + ASSERT_MSG(false, "Unknown cond {}", static_cast<size_t>(cond)); + break; + } + + return pass; +} + +EmitX64::BlockDescriptor EmitX64::RegisterBlock(const IR::LocationDescriptor& descriptor, CodePtr entrypoint, size_t size) { + PerfMapRegister(entrypoint, code.getCurr(), LocationDescriptorToFriendlyName(descriptor)); + Patch(descriptor, entrypoint); + + BlockDescriptor block_desc{entrypoint, size}; + block_descriptors.insert({IR::LocationDescriptor{descriptor.Value()}, block_desc}); + return block_desc; +} + +void EmitX64::EmitTerminal(IR::Terminal terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + Common::VisitVariant<void>(terminal, [this, initial_location, is_single_step](auto x) { + using T = std::decay_t<decltype(x)>; + if constexpr (!std::is_same_v<T, IR::Term::Invalid>) { + this->EmitTerminalImpl(x, initial_location, is_single_step); + } else { + ASSERT_MSG(false, "Invalid terminal"); + } + }); +} + +void EmitX64::Patch(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr) { + const CodePtr save_code_ptr = code.getCurr(); + const PatchInformation& patch_info = patch_information[target_desc]; + + for (CodePtr location : patch_info.jg) { + code.SetCodePtr(location); + EmitPatchJg(target_desc, target_code_ptr); + } + + for (CodePtr location : patch_info.jz) { + code.SetCodePtr(location); + EmitPatchJz(target_desc, target_code_ptr); + } + + for (CodePtr location : patch_info.jmp) { + code.SetCodePtr(location); + EmitPatchJmp(target_desc, target_code_ptr); + } + + for (CodePtr location : patch_info.mov_rcx) { + code.SetCodePtr(location); + EmitPatchMovRcx(target_code_ptr); + } + + code.SetCodePtr(save_code_ptr); +} + +void EmitX64::Unpatch(const IR::LocationDescriptor& target_desc) { + if (patch_information.count(target_desc)) 
{ + Patch(target_desc, nullptr); + } +} + +void EmitX64::ClearCache() { + block_descriptors.clear(); + patch_information.clear(); + + PerfMapClear(); +} + +void EmitX64::InvalidateBasicBlocks(const tsl::robin_set<IR::LocationDescriptor>& locations) { + code.EnableWriting(); + SCOPE_EXIT { + code.DisableWriting(); + }; + + for (const auto& descriptor : locations) { + const auto it = block_descriptors.find(descriptor); + if (it == block_descriptors.end()) { + continue; + } + + Unpatch(descriptor); + + block_descriptors.erase(it); + } +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64.h b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64.h new file mode 100644 index 0000000000..5a46f04ee6 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64.h @@ -0,0 +1,145 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <array> +#include <functional> +#include <memory> +#include <optional> +#include <string> +#include <type_traits> +#include <vector> + +#include <mcl/bitsizeof.hpp> +#include <tsl/robin_map.h> +#include <tsl/robin_set.h> +#include <xbyak/xbyak.h> +#include <xbyak/xbyak_util.h> + +#include "dynarmic/backend/exception_handler.h" +#include "dynarmic/backend/x64/reg_alloc.h" +#include "dynarmic/common/fp/fpcr.h" +#include "dynarmic/ir/location_descriptor.h" +#include "dynarmic/ir/terminal.h" + +namespace Dynarmic::IR { +class Block; +class Inst; +} // namespace Dynarmic::IR + +namespace Dynarmic { +enum class OptimizationFlag : u32; +} // namespace Dynarmic + +namespace Dynarmic::Backend::X64 { + +class BlockOfCode; + +using A64FullVectorWidth = std::integral_constant<size_t, 128>; + +// Array alias that always sizes itself according to the given type T +// relative to the size of a vector register. e.g. T = u32 would result +// in a std::array<u32, 4>. +template<typename T> +using VectorArray = std::array<T, A64FullVectorWidth::value / mcl::bitsizeof<T>>; + +template<typename T> +using HalfVectorArray = std::array<T, A64FullVectorWidth::value / mcl::bitsizeof<T> / 2>; + +struct EmitContext { + EmitContext(RegAlloc& reg_alloc, IR::Block& block); + virtual ~EmitContext(); + + void EraseInstruction(IR::Inst* inst); + + virtual FP::FPCR FPCR(bool fpcr_controlled = true) const = 0; + + virtual bool HasOptimization(OptimizationFlag flag) const = 0; + + RegAlloc& reg_alloc; + IR::Block& block; + + std::vector<std::function<void()>> deferred_emits; +}; + +using SharedLabel = std::shared_ptr<Xbyak::Label>; + +inline SharedLabel GenSharedLabel() { + return std::make_shared<Xbyak::Label>(); +} + +class EmitX64 { +public: + struct BlockDescriptor { + CodePtr entrypoint; // Entrypoint of emitted code + size_t size; // Length in bytes of emitted code + }; + + explicit EmitX64(BlockOfCode& code); + virtual ~EmitX64(); + + /// Looks up an emitted host block in the cache. + std::optional<BlockDescriptor> GetBasicBlock(IR::LocationDescriptor descriptor) const; + + /// Empties the entire cache. + virtual void ClearCache(); + + /// Invalidates a selection of basic blocks. + void InvalidateBasicBlocks(const tsl::robin_set<IR::LocationDescriptor>& locations); + +protected: + // Microinstruction emitters +#define OPCODE(name, type, ...) void Emit##name(EmitContext& ctx, IR::Inst* inst); +#define A32OPC(...) +#define A64OPC(...) 
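+    // As an illustration (the entry shown here is illustrative; see opcodes.inc for the real list),
+    // a line such as
+    //     OPCODE(Add32, U32, U32, U32, U1)
+    // expands under the OPCODE definition above into
+    //     void EmitAdd32(EmitContext& ctx, IR::Inst* inst);
+    // while A32OPC/A64OPC entries expand to nothing, since only architecture-independent
+    // emitters are declared here.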
+#include "dynarmic/ir/opcodes.inc" +#undef OPCODE +#undef A32OPC +#undef A64OPC + + // Helpers + virtual std::string LocationDescriptorToFriendlyName(const IR::LocationDescriptor&) const = 0; + void EmitAddCycles(size_t cycles); + Xbyak::Label EmitCond(IR::Cond cond); + BlockDescriptor RegisterBlock(const IR::LocationDescriptor& location_descriptor, CodePtr entrypoint, size_t size); + void PushRSBHelper(Xbyak::Reg64 loc_desc_reg, Xbyak::Reg64 index_reg, IR::LocationDescriptor target); + + void EmitVerboseDebuggingOutput(RegAlloc& reg_alloc); + + // Terminal instruction emitters + void EmitTerminal(IR::Terminal terminal, IR::LocationDescriptor initial_location, bool is_single_step); + virtual void EmitTerminalImpl(IR::Term::Interpret terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + virtual void EmitTerminalImpl(IR::Term::ReturnToDispatch terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + virtual void EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + virtual void EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + virtual void EmitTerminalImpl(IR::Term::PopRSBHint terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + virtual void EmitTerminalImpl(IR::Term::FastDispatchHint terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + virtual void EmitTerminalImpl(IR::Term::If terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + virtual void EmitTerminalImpl(IR::Term::CheckBit terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + virtual void EmitTerminalImpl(IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + + // Patching + struct PatchInformation { + std::vector<CodePtr> jg; + std::vector<CodePtr> jz; + std::vector<CodePtr> jmp; + std::vector<CodePtr> mov_rcx; + }; + void Patch(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr); + virtual void Unpatch(const IR::LocationDescriptor& target_desc); + virtual void EmitPatchJg(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) = 0; + virtual void EmitPatchJz(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) = 0; + virtual void EmitPatchJmp(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) = 0; + virtual void EmitPatchMovRcx(CodePtr target_code_ptr = nullptr) = 0; + + // State + BlockOfCode& code; + ExceptionHandler exception_handler; + tsl::robin_map<IR::LocationDescriptor, BlockDescriptor> block_descriptors; + tsl::robin_map<IR::LocationDescriptor, PatchInformation> patch_information; +}; + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_aes.cpp b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_aes.cpp new file mode 100644 index 0000000000..9430f0726b --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_aes.cpp @@ -0,0 +1,109 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/stdint.hpp> + +#include "dynarmic/backend/x64/abi.h" +#include "dynarmic/backend/x64/block_of_code.h" +#include "dynarmic/backend/x64/emit_x64.h" +#include "dynarmic/common/crypto/aes.h" +#include "dynarmic/ir/microinstruction.h" + +namespace Dynarmic::Backend::X64 { + +using namespace Xbyak::util; +namespace AES = Common::Crypto::AES; + +using AESFn = void(AES::State&, const AES::State&); + +static void EmitAESFunction(RegAlloc::ArgumentInfo args, EmitContext& ctx, BlockOfCode& code, IR::Inst* inst, AESFn fn) { + constexpr u32 stack_space = static_cast<u32>(sizeof(AES::State)) * 2; + const Xbyak::Xmm input = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + ctx.reg_alloc.EndOfAllocScope(); + + ctx.reg_alloc.HostCall(nullptr); + + ctx.reg_alloc.AllocStackSpace(stack_space + ABI_SHADOW_SPACE); + + code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]); + code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + sizeof(AES::State)]); + code.movaps(xword[code.ABI_PARAM2], input); + code.CallFunction(fn); + code.movaps(result, xword[rsp + ABI_SHADOW_SPACE]); + + ctx.reg_alloc.ReleaseStackSpace(stack_space + ABI_SHADOW_SPACE); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitAESDecryptSingleRound(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (code.HasHostFeature(HostFeature::AES)) { + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(); + + code.pxor(zero, zero); + code.aesdeclast(data, zero); + + ctx.reg_alloc.DefineValue(inst, data); + return; + } + + EmitAESFunction(args, ctx, code, inst, AES::DecryptSingleRound); +} + +void EmitX64::EmitAESEncryptSingleRound(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (code.HasHostFeature(HostFeature::AES)) { + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(); + + code.pxor(zero, zero); + code.aesenclast(data, zero); + + ctx.reg_alloc.DefineValue(inst, data); + return; + } + + EmitAESFunction(args, ctx, code, inst, AES::EncryptSingleRound); +} + +void EmitX64::EmitAESInverseMixColumns(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (code.HasHostFeature(HostFeature::AES)) { + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + + code.aesimc(data, data); + + ctx.reg_alloc.DefineValue(inst, data); + return; + } + + EmitAESFunction(args, ctx, code, inst, AES::InverseMixColumns); +} + +void EmitX64::EmitAESMixColumns(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (code.HasHostFeature(HostFeature::AES)) { + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(); + + code.pxor(zero, zero); + code.aesdeclast(data, zero); + code.aesenc(data, zero); + + ctx.reg_alloc.DefineValue(inst, data); + return; + } + + EmitAESFunction(args, ctx, code, inst, AES::MixColumns); +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_crc32.cpp b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_crc32.cpp new file mode 100644 index 0000000000..842a8612ee --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_crc32.cpp @@ -0,0 +1,154 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <array> +#include <climits> + +#include "dynarmic/backend/x64/block_of_code.h" +#include "dynarmic/backend/x64/emit_x64.h" +#include "dynarmic/common/crypto/crc32.h" +#include "dynarmic/ir/microinstruction.h" + +namespace Dynarmic::Backend::X64 { + +using namespace Xbyak::util; +namespace CRC32 = Common::Crypto::CRC32; + +static void EmitCRC32Castagnoli(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, const int data_size) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (code.HasHostFeature(HostFeature::SSE42)) { + const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + const Xbyak::Reg value = ctx.reg_alloc.UseGpr(args[1]).changeBit(data_size); + + if (data_size != 64) { + code.crc32(crc, value); + } else { + code.crc32(crc.cvt64(), value); + } + + ctx.reg_alloc.DefineValue(inst, crc); + return; + } + + ctx.reg_alloc.HostCall(inst, args[0], args[1], {}); + code.mov(code.ABI_PARAM3, data_size / CHAR_BIT); + code.CallFunction(&CRC32::ComputeCRC32Castagnoli); +} + +static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, const int data_size) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (code.HasHostFeature(HostFeature::PCLMULQDQ) && data_size < 32) { + const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + const Xbyak::Reg64 value = ctx.reg_alloc.UseScratchGpr(args[1]); + const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm xmm_const = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(xmm_const, code.Const(xword, 0xb4e5b025'f7011641, 0x00000001'DB710641)); + + code.movzx(value.cvt32(), value.changeBit(data_size)); + code.xor_(value.cvt32(), crc); + code.movd(xmm_tmp, value.cvt32()); + code.pslldq(xmm_tmp, (64 - data_size) / 8); + + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpclmulqdq(xmm_value, xmm_tmp, xmm_const, 0x00); + code.pclmulqdq(xmm_value, xmm_const, 0x10); + code.pxor(xmm_value, xmm_tmp); + } else { + code.movdqa(xmm_value, xmm_tmp); + code.pclmulqdq(xmm_value, xmm_const, 0x00); + code.pclmulqdq(xmm_value, xmm_const, 0x10); + code.pxor(xmm_value, xmm_tmp); + } + + code.pextrd(crc, xmm_value, 2); + + ctx.reg_alloc.DefineValue(inst, crc); + return; + } + + if (code.HasHostFeature(HostFeature::PCLMULQDQ) && data_size == 32) { + const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + const Xbyak::Reg32 value = ctx.reg_alloc.UseGpr(args[1]).cvt32(); + const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm xmm_const = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(xmm_const, code.Const(xword, 0xb4e5b025'f7011641, 0x00000001'DB710641)); + + code.xor_(crc, value); + code.shl(crc.cvt64(), 32); + code.movq(xmm_value, crc.cvt64()); + + code.pclmulqdq(xmm_value, xmm_const, 0x00); + code.pclmulqdq(xmm_value, xmm_const, 0x10); + + code.pextrd(crc, xmm_value, 2); + + ctx.reg_alloc.DefineValue(inst, crc); + return; + } + + if (code.HasHostFeature(HostFeature::PCLMULQDQ) && data_size == 64) { + const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(args[1]); + const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm xmm_const = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(xmm_const, code.Const(xword, 0xb4e5b025'f7011641, 0x00000001'DB710641)); + + code.mov(crc, crc); + code.xor_(crc.cvt64(), value); + code.movq(xmm_value, 
crc.cvt64()); + + code.pclmulqdq(xmm_value, xmm_const, 0x00); + code.pclmulqdq(xmm_value, xmm_const, 0x10); + + code.pextrd(crc, xmm_value, 2); + + ctx.reg_alloc.DefineValue(inst, crc); + return; + } + + ctx.reg_alloc.HostCall(inst, args[0], args[1], {}); + code.mov(code.ABI_PARAM3, data_size / CHAR_BIT); + code.CallFunction(&CRC32::ComputeCRC32ISO); +} + +void EmitX64::EmitCRC32Castagnoli8(EmitContext& ctx, IR::Inst* inst) { + EmitCRC32Castagnoli(code, ctx, inst, 8); +} + +void EmitX64::EmitCRC32Castagnoli16(EmitContext& ctx, IR::Inst* inst) { + EmitCRC32Castagnoli(code, ctx, inst, 16); +} + +void EmitX64::EmitCRC32Castagnoli32(EmitContext& ctx, IR::Inst* inst) { + EmitCRC32Castagnoli(code, ctx, inst, 32); +} + +void EmitX64::EmitCRC32Castagnoli64(EmitContext& ctx, IR::Inst* inst) { + EmitCRC32Castagnoli(code, ctx, inst, 64); +} + +void EmitX64::EmitCRC32ISO8(EmitContext& ctx, IR::Inst* inst) { + EmitCRC32ISO(code, ctx, inst, 8); +} + +void EmitX64::EmitCRC32ISO16(EmitContext& ctx, IR::Inst* inst) { + EmitCRC32ISO(code, ctx, inst, 16); +} + +void EmitX64::EmitCRC32ISO32(EmitContext& ctx, IR::Inst* inst) { + EmitCRC32ISO(code, ctx, inst, 32); +} + +void EmitX64::EmitCRC32ISO64(EmitContext& ctx, IR::Inst* inst) { + EmitCRC32ISO(code, ctx, inst, 64); +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_data_processing.cpp b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_data_processing.cpp new file mode 100644 index 0000000000..98197c2db3 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_data_processing.cpp @@ -0,0 +1,1701 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <cstddef> +#include <type_traits> + +#include <mcl/assert.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/backend/x64/block_of_code.h" +#include "dynarmic/backend/x64/emit_x64.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" + +namespace Dynarmic::Backend::X64 { + +using namespace Xbyak::util; + +void EmitX64::EmitPack2x32To1x64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg64 lo = ctx.reg_alloc.UseScratchGpr(args[0]); + const Xbyak::Reg64 hi = ctx.reg_alloc.UseScratchGpr(args[1]); + + code.shl(hi, 32); + code.mov(lo.cvt32(), lo.cvt32()); // Zero extend to 64-bits + code.or_(lo, hi); + + ctx.reg_alloc.DefineValue(inst, lo); +} + +void EmitX64::EmitPack2x64To1x128(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg64 lo = ctx.reg_alloc.UseGpr(args[0]); + const Xbyak::Reg64 hi = ctx.reg_alloc.UseGpr(args[1]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + if (code.HasHostFeature(HostFeature::SSE41)) { + code.movq(result, lo); + code.pinsrq(result, hi, 1); + } else { + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + code.movq(result, lo); + code.movq(tmp, hi); + code.punpcklqdq(result, tmp); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitLeastSignificantWord(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + // TODO: DefineValue directly on Argument + const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 source = ctx.reg_alloc.UseGpr(args[0]); + code.mov(result.cvt32(), source.cvt32()); + ctx.reg_alloc.DefineValue(inst, result); +} + +void 
EmitX64::EmitMostSignificantWord(EmitContext& ctx, IR::Inst* inst) { + const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]); + code.shr(result, 32); + + if (carry_inst) { + const Xbyak::Reg64 carry = ctx.reg_alloc.ScratchGpr(); + code.setc(carry.cvt8()); + ctx.reg_alloc.DefineValue(carry_inst, carry); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitLeastSignificantHalf(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + // TODO: DefineValue directly on Argument + const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 source = ctx.reg_alloc.UseGpr(args[0]); + code.movzx(result.cvt32(), source.cvt16()); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitLeastSignificantByte(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + // TODO: DefineValue directly on Argument + const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 source = ctx.reg_alloc.UseGpr(args[0]); + code.movzx(result.cvt32(), source.cvt8()); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitMostSignificantBit(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + // TODO: Flag optimization + code.shr(result, 31); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitIsZero32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + // TODO: Flag optimization + code.test(result, result); + code.sete(result.cvt8()); + code.movzx(result, result.cvt8()); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitIsZero64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]); + // TODO: Flag optimization + code.test(result, result); + code.sete(result.cvt8()); + code.movzx(result, result.cvt8()); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitTestBit(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]); + ASSERT(args[1].IsImmediate()); + // TODO: Flag optimization + code.bt(result, args[1].GetImmediateU8()); + code.setc(result.cvt8()); + ctx.reg_alloc.DefineValue(inst, result); +} + +static void EmitConditionalSelect(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bitsize) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg32 nzcv = ctx.reg_alloc.ScratchGpr(HostLoc::RAX).cvt32(); + const Xbyak::Reg then_ = ctx.reg_alloc.UseGpr(args[1]).changeBit(bitsize); + const Xbyak::Reg else_ = ctx.reg_alloc.UseScratchGpr(args[2]).changeBit(bitsize); + + code.mov(nzcv, dword[r15 + code.GetJitStateInfo().offsetof_cpsr_nzcv]); + + code.LoadRequiredFlagsForCondFromRax(args[0].GetImmediateCond()); + + switch (args[0].GetImmediateCond()) { + case IR::Cond::EQ: + code.cmovz(else_, then_); + break; + case IR::Cond::NE: + code.cmovnz(else_, then_); + break; + case IR::Cond::CS: + code.cmovc(else_, then_); + break; + case IR::Cond::CC: + code.cmovnc(else_, then_); + break; + case IR::Cond::MI: + code.cmovs(else_, then_); + break; + case 
IR::Cond::PL: + code.cmovns(else_, then_); + break; + case IR::Cond::VS: + code.cmovo(else_, then_); + break; + case IR::Cond::VC: + code.cmovno(else_, then_); + break; + case IR::Cond::HI: + code.cmova(else_, then_); + break; + case IR::Cond::LS: + code.cmovna(else_, then_); + break; + case IR::Cond::GE: + code.cmovge(else_, then_); + break; + case IR::Cond::LT: + code.cmovl(else_, then_); + break; + case IR::Cond::GT: + code.cmovg(else_, then_); + break; + case IR::Cond::LE: + code.cmovle(else_, then_); + break; + case IR::Cond::AL: + case IR::Cond::NV: + code.mov(else_, then_); + break; + default: + ASSERT_MSG(false, "Invalid cond {}", static_cast<size_t>(args[0].GetImmediateCond())); + } + + ctx.reg_alloc.DefineValue(inst, else_); +} + +void EmitX64::EmitConditionalSelect32(EmitContext& ctx, IR::Inst* inst) { + EmitConditionalSelect(code, ctx, inst, 32); +} + +void EmitX64::EmitConditionalSelect64(EmitContext& ctx, IR::Inst* inst) { + EmitConditionalSelect(code, ctx, inst, 64); +} + +void EmitX64::EmitConditionalSelectNZCV(EmitContext& ctx, IR::Inst* inst) { + EmitConditionalSelect(code, ctx, inst, 32); +} + +static void EmitExtractRegister(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bit_size) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg result = ctx.reg_alloc.UseScratchGpr(args[0]).changeBit(bit_size); + const Xbyak::Reg operand = ctx.reg_alloc.UseGpr(args[1]).changeBit(bit_size); + const u8 lsb = args[2].GetImmediateU8(); + + code.shrd(result, operand, lsb); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitExtractRegister32(Dynarmic::Backend::X64::EmitContext& ctx, IR::Inst* inst) { + EmitExtractRegister(code, ctx, inst, 32); +} + +void EmitX64::EmitExtractRegister64(Dynarmic::Backend::X64::EmitContext& ctx, IR::Inst* inst) { + EmitExtractRegister(code, ctx, inst, 64); +} + +static void EmitReplicateBit(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bit_size) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const u8 bit = args[1].GetImmediateU8(); + + if (bit == bit_size - 1) { + const Xbyak::Reg result = ctx.reg_alloc.UseScratchGpr(args[0]).changeBit(bit_size); + + code.sar(result, bit_size - 1); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + const Xbyak::Reg value = ctx.reg_alloc.UseGpr(args[0]).changeBit(bit_size); + const Xbyak::Reg result = ctx.reg_alloc.ScratchGpr().changeBit(bit_size); + + code.xor_(result, result); + code.bt(value, bit); + code.sbb(result, result); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitReplicateBit32(Dynarmic::Backend::X64::EmitContext& ctx, IR::Inst* inst) { + EmitReplicateBit(code, ctx, inst, 32); +} + +void EmitX64::EmitReplicateBit64(Dynarmic::Backend::X64::EmitContext& ctx, IR::Inst* inst) { + EmitReplicateBit(code, ctx, inst, 64); +} + +void EmitX64::EmitLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) { + const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + auto& carry_arg = args[2]; + + if (!carry_inst) { + if (shift_arg.IsImmediate()) { + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32(); + const u8 shift = shift_arg.GetImmediateU8(); + + if (shift <= 31) { + code.shl(result, shift); + } else { + code.xor_(result, result); + } + + ctx.reg_alloc.DefineValue(inst, result); + } else if (code.HasHostFeature(HostFeature::BMI2)) { + const Xbyak::Reg32 
shift = ctx.reg_alloc.UseGpr(shift_arg).cvt32(); + const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32(); + const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Reg32 zero = ctx.reg_alloc.ScratchGpr().cvt32(); + + code.shlx(result, operand, shift); + code.xor_(zero, zero); + code.cmp(shift.cvt8(), 32); + code.cmovnb(result, zero); + + ctx.reg_alloc.DefineValue(inst, result); + } else { + ctx.reg_alloc.Use(shift_arg, HostLoc::RCX); + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32(); + const Xbyak::Reg32 zero = ctx.reg_alloc.ScratchGpr().cvt32(); + + // The 32-bit x64 SHL instruction masks the shift count by 0x1F before performing the shift. + // ARM differs from the behaviour: It does not mask the count, so shifts above 31 result in zeros. + + code.shl(result, code.cl); + code.xor_(zero, zero); + code.cmp(code.cl, 32); + code.cmovnb(result, zero); + + ctx.reg_alloc.DefineValue(inst, result); + } + } else { + if (shift_arg.IsImmediate()) { + const u8 shift = shift_arg.GetImmediateU8(); + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32(); + const Xbyak::Reg32 carry = ctx.reg_alloc.UseScratchGpr(carry_arg).cvt32(); + + if (shift == 0) { + // There is nothing more to do. + } else if (shift < 32) { + code.bt(carry.cvt32(), 0); + code.shl(result, shift); + code.setc(carry.cvt8()); + } else if (shift > 32) { + code.xor_(result, result); + code.xor_(carry, carry); + } else { + code.mov(carry, result); + code.xor_(result, result); + code.and_(carry, 1); + } + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.reg_alloc.DefineValue(inst, result); + } else { + ctx.reg_alloc.UseScratch(shift_arg, HostLoc::RCX); + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32(); + const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Reg32 carry = ctx.reg_alloc.UseScratchGpr(carry_arg).cvt32(); + + code.mov(tmp, 63); + code.cmp(code.cl, 63); + code.cmova(code.ecx, tmp); + code.shl(result.cvt64(), 32); + code.bt(carry.cvt32(), 0); + code.shl(result.cvt64(), code.cl); + code.setc(carry.cvt8()); + code.shr(result.cvt64(), 32); + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.reg_alloc.DefineValue(inst, result); + } + } +} + +void EmitX64::EmitLogicalShiftLeft64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + + if (shift_arg.IsImmediate()) { + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg); + const u8 shift = shift_arg.GetImmediateU8(); + + if (shift < 64) { + code.shl(result, shift); + } else { + code.xor_(result.cvt32(), result.cvt32()); + } + + ctx.reg_alloc.DefineValue(inst, result); + } else if (code.HasHostFeature(HostFeature::BMI2)) { + const Xbyak::Reg64 shift = ctx.reg_alloc.UseGpr(shift_arg); + const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg); + const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 zero = ctx.reg_alloc.ScratchGpr(); + + code.shlx(result, operand, shift); + code.xor_(zero.cvt32(), zero.cvt32()); + code.cmp(shift.cvt8(), 64); + code.cmovnb(result, zero); + + ctx.reg_alloc.DefineValue(inst, result); + } else { + ctx.reg_alloc.Use(shift_arg, HostLoc::RCX); + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg); + const Xbyak::Reg64 zero = ctx.reg_alloc.ScratchGpr(); + + // The x64 SHL instruction masks the shift count by 0x1F before performing the shift. 
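+        // (More precisely, the 64-bit operand form used here masks the count by 0x3F.)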
+ // ARM differs from the behaviour: It does not mask the count, so shifts above 63 result in zeros. + + code.shl(result, code.cl); + code.xor_(zero.cvt32(), zero.cvt32()); + code.cmp(code.cl, 64); + code.cmovnb(result, zero); + + ctx.reg_alloc.DefineValue(inst, result); + } +} + +void EmitX64::EmitLogicalShiftRight32(EmitContext& ctx, IR::Inst* inst) { + const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + auto& carry_arg = args[2]; + + if (!carry_inst) { + if (shift_arg.IsImmediate()) { + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32(); + const u8 shift = shift_arg.GetImmediateU8(); + + if (shift <= 31) { + code.shr(result, shift); + } else { + code.xor_(result, result); + } + + ctx.reg_alloc.DefineValue(inst, result); + } else if (code.HasHostFeature(HostFeature::BMI2)) { + const Xbyak::Reg32 shift = ctx.reg_alloc.UseGpr(shift_arg).cvt32(); + const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32(); + const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Reg32 zero = ctx.reg_alloc.ScratchGpr().cvt32(); + + code.shrx(result, operand, shift); + code.xor_(zero, zero); + code.cmp(shift.cvt8(), 32); + code.cmovnb(result, zero); + + ctx.reg_alloc.DefineValue(inst, result); + } else { + ctx.reg_alloc.Use(shift_arg, HostLoc::RCX); + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32(); + const Xbyak::Reg32 zero = ctx.reg_alloc.ScratchGpr().cvt32(); + + // The 32-bit x64 SHR instruction masks the shift count by 0x1F before performing the shift. + // ARM differs from the behaviour: It does not mask the count, so shifts above 31 result in zeros. + + code.shr(result, code.cl); + code.xor_(zero, zero); + code.cmp(code.cl, 32); + code.cmovnb(result, zero); + + ctx.reg_alloc.DefineValue(inst, result); + } + } else { + if (shift_arg.IsImmediate()) { + const u8 shift = shift_arg.GetImmediateU8(); + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32(); + const Xbyak::Reg32 carry = ctx.reg_alloc.UseScratchGpr(carry_arg).cvt32(); + + if (shift == 0) { + // There is nothing more to do. 
+ } else if (shift < 32) { + code.shr(result, shift); + code.setc(carry.cvt8()); + } else if (shift == 32) { + code.bt(result, 31); + code.setc(carry.cvt8()); + code.mov(result, 0); + } else { + code.xor_(result, result); + code.xor_(carry, carry); + } + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.reg_alloc.DefineValue(inst, result); + } else { + ctx.reg_alloc.UseScratch(shift_arg, HostLoc::RCX); + const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32(); + const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Reg32 carry = ctx.reg_alloc.UseScratchGpr(carry_arg).cvt32(); + + code.mov(result, 63); + code.cmp(code.cl, 63); + code.cmovnb(code.ecx, result); + code.mov(result, operand); + code.bt(carry.cvt32(), 0); + code.shr(result.cvt64(), code.cl); + code.setc(carry.cvt8()); + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.reg_alloc.DefineValue(inst, result); + } + } +} + +void EmitX64::EmitLogicalShiftRight64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + + if (shift_arg.IsImmediate()) { + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg); + const u8 shift = shift_arg.GetImmediateU8(); + + if (shift < 64) { + code.shr(result, shift); + } else { + code.xor_(result.cvt32(), result.cvt32()); + } + + ctx.reg_alloc.DefineValue(inst, result); + } else if (code.HasHostFeature(HostFeature::BMI2)) { + const Xbyak::Reg64 shift = ctx.reg_alloc.UseGpr(shift_arg); + const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg); + const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 zero = ctx.reg_alloc.ScratchGpr(); + + code.shrx(result, operand, shift); + code.xor_(zero.cvt32(), zero.cvt32()); + code.cmp(shift.cvt8(), 63); + code.cmovnb(result, zero); + + ctx.reg_alloc.DefineValue(inst, result); + } else { + ctx.reg_alloc.Use(shift_arg, HostLoc::RCX); + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg); + const Xbyak::Reg64 zero = ctx.reg_alloc.ScratchGpr(); + + // The x64 SHR instruction masks the shift count by 0x1F before performing the shift. + // ARM differs from the behaviour: It does not mask the count, so shifts above 63 result in zeros. + + code.shr(result, code.cl); + code.xor_(zero.cvt32(), zero.cvt32()); + code.cmp(code.cl, 64); + code.cmovnb(result, zero); + + ctx.reg_alloc.DefineValue(inst, result); + } +} + +void EmitX64::EmitArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst) { + const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + auto& carry_arg = args[2]; + + if (!carry_inst) { + if (shift_arg.IsImmediate()) { + const u8 shift = shift_arg.GetImmediateU8(); + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32(); + + code.sar(result, u8(shift < 31 ? shift : 31)); + + ctx.reg_alloc.DefineValue(inst, result); + } else if (code.HasHostFeature(HostFeature::BMI2)) { + const Xbyak::Reg32 shift = ctx.reg_alloc.UseScratchGpr(shift_arg).cvt32(); + const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32(); + const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Reg32 const31 = ctx.reg_alloc.ScratchGpr().cvt32(); + + // The 32-bit x64 SAR instruction masks the shift count by 0x1F before performing the shift. 
+ // ARM differs from the behaviour: It does not mask the count. + + // We note that all shift values above 31 have the same behaviour as 31 does, so we saturate `shift` to 31. + code.mov(const31, 31); + code.cmp(shift.cvt8(), 31); + code.cmovnb(shift, const31); + code.sarx(result, operand, shift); + + ctx.reg_alloc.DefineValue(inst, result); + } else { + ctx.reg_alloc.UseScratch(shift_arg, HostLoc::RCX); + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32(); + const Xbyak::Reg32 const31 = ctx.reg_alloc.ScratchGpr().cvt32(); + + // The 32-bit x64 SAR instruction masks the shift count by 0x1F before performing the shift. + // ARM differs from the behaviour: It does not mask the count. + + // We note that all shift values above 31 have the same behaviour as 31 does, so we saturate `shift` to 31. + code.mov(const31, 31); + code.cmp(code.cl, u32(31)); + code.cmova(code.ecx, const31); + code.sar(result, code.cl); + + ctx.reg_alloc.DefineValue(inst, result); + } + } else { + if (shift_arg.IsImmediate()) { + const u8 shift = shift_arg.GetImmediateU8(); + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32(); + const Xbyak::Reg8 carry = ctx.reg_alloc.UseScratchGpr(carry_arg).cvt8(); + + if (shift == 0) { + // There is nothing more to do. + } else if (shift <= 31) { + code.sar(result, shift); + code.setc(carry); + } else { + code.sar(result, 31); + code.bt(result, 31); + code.setc(carry); + } + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.reg_alloc.DefineValue(inst, result); + } else { + ctx.reg_alloc.UseScratch(shift_arg, HostLoc::RCX); + const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32(); + const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Reg32 carry = ctx.reg_alloc.UseScratchGpr(carry_arg).cvt32(); + + code.mov(result, 63); + code.cmp(code.cl, 63); + code.cmovnb(code.ecx, result); + code.movsxd(result.cvt64(), operand); + code.bt(carry.cvt32(), 0); + code.sar(result.cvt64(), code.cl); + code.setc(carry.cvt8()); + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.reg_alloc.DefineValue(inst, result); + } + } +} + +void EmitX64::EmitArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + + if (shift_arg.IsImmediate()) { + const u8 shift = shift_arg.GetImmediateU8(); + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg); + + code.sar(result, u8(shift < 63 ? shift : 63)); + + ctx.reg_alloc.DefineValue(inst, result); + } else if (code.HasHostFeature(HostFeature::BMI2)) { + const Xbyak::Reg64 shift = ctx.reg_alloc.UseScratchGpr(shift_arg); + const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg); + const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 const63 = ctx.reg_alloc.ScratchGpr(); + + code.mov(const63.cvt32(), 63); + code.cmp(shift.cvt8(), 63); + code.cmovnb(shift, const63); + code.sarx(result, operand, shift); + + ctx.reg_alloc.DefineValue(inst, result); + } else { + ctx.reg_alloc.UseScratch(shift_arg, HostLoc::RCX); + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg); + const Xbyak::Reg64 const63 = ctx.reg_alloc.ScratchGpr(); + + // The 64-bit x64 SAR instruction masks the shift count by 0x3F before performing the shift. + // ARM differs from the behaviour: It does not mask the count. + + // We note that all shift values above 63 have the same behaviour as 63 does, so we saturate `shift` to 63. 
+ code.mov(const63, 63); + code.cmp(code.cl, u32(63)); + code.cmovnb(code.ecx, const63); + code.sar(result, code.cl); + + ctx.reg_alloc.DefineValue(inst, result); + } +} + +void EmitX64::EmitRotateRight32(EmitContext& ctx, IR::Inst* inst) { + const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + auto& carry_arg = args[2]; + + if (!carry_inst) { + if (shift_arg.IsImmediate() && code.HasHostFeature(HostFeature::BMI2)) { + const u8 shift = shift_arg.GetImmediateU8(); + const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32(); + const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); + + code.rorx(result, operand, shift); + + ctx.reg_alloc.DefineValue(inst, result); + } else if (shift_arg.IsImmediate()) { + const u8 shift = shift_arg.GetImmediateU8(); + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32(); + + code.ror(result, u8(shift & 0x1F)); + + ctx.reg_alloc.DefineValue(inst, result); + } else { + ctx.reg_alloc.Use(shift_arg, HostLoc::RCX); + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32(); + + // x64 ROR instruction does (shift & 0x1F) for us. + code.ror(result, code.cl); + + ctx.reg_alloc.DefineValue(inst, result); + } + } else { + if (shift_arg.IsImmediate()) { + const u8 shift = shift_arg.GetImmediateU8(); + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32(); + const Xbyak::Reg8 carry = ctx.reg_alloc.UseScratchGpr(carry_arg).cvt8(); + + if (shift == 0) { + // There is nothing more to do. + } else if ((shift & 0x1F) == 0) { + code.bt(result, u8(31)); + code.setc(carry); + } else { + code.ror(result, shift); + code.setc(carry); + } + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.reg_alloc.DefineValue(inst, result); + } else { + ctx.reg_alloc.UseScratch(shift_arg, HostLoc::RCX); + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32(); + const Xbyak::Reg8 carry = ctx.reg_alloc.UseScratchGpr(carry_arg).cvt8(); + + Xbyak::Label end; + + code.test(code.cl, code.cl); + code.jz(end); + + code.ror(result, code.cl); + code.bt(result, u8(31)); + code.setc(carry); + + code.L(end); + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.reg_alloc.DefineValue(inst, result); + } + } +} + +void EmitX64::EmitRotateRight64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + + if (shift_arg.IsImmediate() && code.HasHostFeature(HostFeature::BMI2)) { + const u8 shift = shift_arg.GetImmediateU8(); + const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg); + const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr(); + + code.rorx(result, operand, shift); + + ctx.reg_alloc.DefineValue(inst, result); + } else if (shift_arg.IsImmediate()) { + const u8 shift = shift_arg.GetImmediateU8(); + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg); + + code.ror(result, u8(shift & 0x3F)); + + ctx.reg_alloc.DefineValue(inst, result); + } else { + ctx.reg_alloc.Use(shift_arg, HostLoc::RCX); + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg); + + // x64 ROR instruction does (shift & 0x3F) for us. 
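+        // Rotation is periodic in the operand width, so the hardware masking never changes
+        // the result and no additional fixup is needed.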
+ code.ror(result, code.cl); + + ctx.reg_alloc.DefineValue(inst, result); + } +} + +void EmitX64::EmitRotateRightExtended(EmitContext& ctx, IR::Inst* inst) { + const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + const Xbyak::Reg8 carry = ctx.reg_alloc.UseScratchGpr(args[1]).cvt8(); + + code.bt(carry.cvt32(), 0); + code.rcr(result, 1); + + if (carry_inst) { + code.setc(carry); + + ctx.reg_alloc.DefineValue(carry_inst, carry); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +template<typename ShfitFT, typename BMI2FT> +static void EmitMaskedShift32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, ShfitFT shift_fn, [[maybe_unused]] BMI2FT bmi2_shift) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + + if (shift_arg.IsImmediate()) { + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32(); + const u32 shift = shift_arg.GetImmediateU32(); + + shift_fn(result, static_cast<int>(shift & 0x1F)); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + if constexpr (!std::is_same_v<BMI2FT, std::nullptr_t>) { + if (code.HasHostFeature(HostFeature::BMI2)) { + const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32(); + const Xbyak::Reg32 shift = ctx.reg_alloc.UseGpr(shift_arg).cvt32(); + + (code.*bmi2_shift)(result, operand, shift); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + } + + ctx.reg_alloc.Use(shift_arg, HostLoc::RCX); + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32(); + + shift_fn(result, code.cl); + + ctx.reg_alloc.DefineValue(inst, result); +} + +template<typename ShfitFT, typename BMI2FT> +static void EmitMaskedShift64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, ShfitFT shift_fn, [[maybe_unused]] BMI2FT bmi2_shift) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + + if (shift_arg.IsImmediate()) { + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg); + const u64 shift = shift_arg.GetImmediateU64(); + + shift_fn(result, static_cast<int>(shift & 0x3F)); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + if constexpr (!std::is_same_v<BMI2FT, std::nullptr_t>) { + if (code.HasHostFeature(HostFeature::BMI2)) { + const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg); + const Xbyak::Reg64 shift = ctx.reg_alloc.UseGpr(shift_arg); + + (code.*bmi2_shift)(result, operand, shift); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + } + + ctx.reg_alloc.Use(shift_arg, HostLoc::RCX); + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg); + + shift_fn(result, code.cl); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitLogicalShiftLeftMasked32(EmitContext& ctx, IR::Inst* inst) { + EmitMaskedShift32( + code, ctx, inst, [&](auto result, auto shift) { code.shl(result, shift); }, &Xbyak::CodeGenerator::shlx); +} + +void EmitX64::EmitLogicalShiftLeftMasked64(EmitContext& ctx, IR::Inst* inst) { + EmitMaskedShift64( + code, ctx, inst, [&](auto result, auto shift) { code.shl(result, shift); }, &Xbyak::CodeGenerator::shlx); +} + +void EmitX64::EmitLogicalShiftRightMasked32(EmitContext& ctx, IR::Inst* inst) { + 
EmitMaskedShift32( + code, ctx, inst, [&](auto result, auto shift) { code.shr(result, shift); }, &Xbyak::CodeGenerator::shrx); +} + +void EmitX64::EmitLogicalShiftRightMasked64(EmitContext& ctx, IR::Inst* inst) { + EmitMaskedShift64( + code, ctx, inst, [&](auto result, auto shift) { code.shr(result, shift); }, &Xbyak::CodeGenerator::shrx); +} + +void EmitX64::EmitArithmeticShiftRightMasked32(EmitContext& ctx, IR::Inst* inst) { + EmitMaskedShift32( + code, ctx, inst, [&](auto result, auto shift) { code.sar(result, shift); }, &Xbyak::CodeGenerator::sarx); +} + +void EmitX64::EmitArithmeticShiftRightMasked64(EmitContext& ctx, IR::Inst* inst) { + EmitMaskedShift64( + code, ctx, inst, [&](auto result, auto shift) { code.sar(result, shift); }, &Xbyak::CodeGenerator::sarx); +} + +void EmitX64::EmitRotateRightMasked32(EmitContext& ctx, IR::Inst* inst) { + EmitMaskedShift32( + code, ctx, inst, [&](auto result, auto shift) { code.ror(result, shift); }, nullptr); +} + +void EmitX64::EmitRotateRightMasked64(EmitContext& ctx, IR::Inst* inst) { + EmitMaskedShift64( + code, ctx, inst, [&](auto result, auto shift) { code.ror(result, shift); }, nullptr); +} + +static Xbyak::Reg8 DoCarry(RegAlloc& reg_alloc, Argument& carry_in, IR::Inst* carry_out) { + if (carry_in.IsImmediate()) { + return carry_out ? reg_alloc.ScratchGpr().cvt8() : Xbyak::Reg8{-1}; + } else { + return carry_out ? reg_alloc.UseScratchGpr(carry_in).cvt8() : reg_alloc.UseGpr(carry_in).cvt8(); + } +} + +static Xbyak::Reg64 DoNZCV(BlockOfCode& code, RegAlloc& reg_alloc, IR::Inst* nzcv_out) { + if (!nzcv_out) { + return Xbyak::Reg64{-1}; + } + + const Xbyak::Reg64 nzcv = reg_alloc.ScratchGpr(HostLoc::RAX); + code.xor_(nzcv.cvt32(), nzcv.cvt32()); + return nzcv; +} + +static void EmitAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bitsize) { + const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp); + const auto nzcv_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetNZCVFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& carry_in = args[2]; + + // Consider using LEA. + if (!carry_inst && !overflow_inst && !nzcv_inst && carry_in.IsImmediate() && !carry_in.GetImmediateU1()) { + if (args[1].IsImmediate() && args[1].FitsInImmediateS32()) { + const Xbyak::Reg op1 = ctx.reg_alloc.UseGpr(args[0]).changeBit(bitsize); + const Xbyak::Reg result = ctx.reg_alloc.ScratchGpr().changeBit(bitsize); + + code.lea(result, code.ptr[op1 + args[1].GetImmediateS32()]); + + ctx.reg_alloc.DefineValue(inst, result); + } else { + const Xbyak::Reg op1 = ctx.reg_alloc.UseGpr(args[0]).changeBit(bitsize); + const Xbyak::Reg op2 = ctx.reg_alloc.UseGpr(args[1]).changeBit(bitsize); + const Xbyak::Reg result = ctx.reg_alloc.ScratchGpr().changeBit(bitsize); + + code.lea(result, code.ptr[op1 + op2]); + + ctx.reg_alloc.DefineValue(inst, result); + } + return; + } + + const Xbyak::Reg64 nzcv = DoNZCV(code, ctx.reg_alloc, nzcv_inst); + const Xbyak::Reg result = ctx.reg_alloc.UseScratchGpr(args[0]).changeBit(bitsize); + const Xbyak::Reg8 carry = DoCarry(ctx.reg_alloc, carry_in, carry_inst); + const Xbyak::Reg8 overflow = overflow_inst ? 
ctx.reg_alloc.ScratchGpr().cvt8() : Xbyak::Reg8{-1}; + + if (args[1].IsImmediate() && args[1].GetType() == IR::Type::U32) { + const u32 op_arg = args[1].GetImmediateU32(); + if (carry_in.IsImmediate()) { + if (carry_in.GetImmediateU1()) { + code.stc(); + code.adc(result, op_arg); + } else { + code.add(result, op_arg); + } + } else { + code.bt(carry.cvt32(), 0); + code.adc(result, op_arg); + } + } else { + OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]); + op_arg.setBit(bitsize); + if (carry_in.IsImmediate()) { + if (carry_in.GetImmediateU1()) { + code.stc(); + code.adc(result, *op_arg); + } else { + code.add(result, *op_arg); + } + } else { + code.bt(carry.cvt32(), 0); + code.adc(result, *op_arg); + } + } + + if (nzcv_inst) { + code.lahf(); + code.seto(code.al); + ctx.reg_alloc.DefineValue(nzcv_inst, nzcv); + } + if (carry_inst) { + code.setc(carry); + ctx.reg_alloc.DefineValue(carry_inst, carry); + } + if (overflow_inst) { + code.seto(overflow); + ctx.reg_alloc.DefineValue(overflow_inst, overflow); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitAdd32(EmitContext& ctx, IR::Inst* inst) { + EmitAdd(code, ctx, inst, 32); +} + +void EmitX64::EmitAdd64(EmitContext& ctx, IR::Inst* inst) { + EmitAdd(code, ctx, inst, 64); +} + +static void EmitSub(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bitsize) { + const auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp); + const auto nzcv_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetNZCVFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& carry_in = args[2]; + const bool is_cmp = inst->UseCount() == size_t(!!carry_inst + !!overflow_inst + !!nzcv_inst) && carry_in.IsImmediate() && carry_in.GetImmediateU1(); + + // Consider using LEA. + if (!carry_inst && !overflow_inst && !nzcv_inst && carry_in.IsImmediate() && carry_in.GetImmediateU1() && args[1].IsImmediate() && args[1].FitsInImmediateS32() && args[1].GetImmediateS32() != 0xffff'ffff'8000'0000) { + const Xbyak::Reg op1 = ctx.reg_alloc.UseGpr(args[0]).changeBit(bitsize); + const Xbyak::Reg result = ctx.reg_alloc.ScratchGpr().changeBit(bitsize); + + code.lea(result, code.ptr[op1 - args[1].GetImmediateS32()]); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + const Xbyak::Reg64 nzcv = DoNZCV(code, ctx.reg_alloc, nzcv_inst); + const Xbyak::Reg result = (is_cmp ? ctx.reg_alloc.UseGpr(args[0]) : ctx.reg_alloc.UseScratchGpr(args[0])).changeBit(bitsize); + const Xbyak::Reg8 carry = DoCarry(ctx.reg_alloc, carry_in, carry_inst); + const Xbyak::Reg8 overflow = overflow_inst ? ctx.reg_alloc.ScratchGpr().cvt8() : Xbyak::Reg8{-1}; + + // Note that x64 CF is inverse of what the ARM carry flag is here. 
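+    // ARM's C flag after a subtraction means "no borrow occurred" (a - b is computed as a + ~b + carry),
+    // while x86's CF means "a borrow occurred". The paths below therefore either emit SUB/SBB/CMP and
+    // invert the resulting carry afterwards, or rewrite the operation as ADD/ADC with ~op2 so that CF
+    // already has the ARM sense; invert_output_carry records which of the two forms was emitted.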
+ + bool invert_output_carry = true; + + if (is_cmp) { + if (args[1].IsImmediate() && args[1].GetType() == IR::Type::U32) { + const u32 op_arg = args[1].GetImmediateU32(); + code.cmp(result, op_arg); + } else { + OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]); + op_arg.setBit(bitsize); + code.cmp(result, *op_arg); + } + } else if (args[1].IsImmediate() && args[1].GetType() == IR::Type::U32) { + const u32 op_arg = args[1].GetImmediateU32(); + if (carry_in.IsImmediate()) { + if (carry_in.GetImmediateU1()) { + code.sub(result, op_arg); + } else { + code.add(result, ~op_arg); + invert_output_carry = false; + } + } else { + code.bt(carry.cvt32(), 0); + code.adc(result, ~op_arg); + invert_output_carry = false; + } + } else { + OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]); + op_arg.setBit(bitsize); + if (carry_in.IsImmediate()) { + if (carry_in.GetImmediateU1()) { + code.sub(result, *op_arg); + } else { + code.stc(); + code.sbb(result, *op_arg); + } + } else { + code.bt(carry.cvt32(), 0); + code.cmc(); + code.sbb(result, *op_arg); + } + } + + if (nzcv_inst) { + if (invert_output_carry) { + code.cmc(); + } + code.lahf(); + code.seto(code.al); + ctx.reg_alloc.DefineValue(nzcv_inst, nzcv); + } + if (carry_inst) { + if (invert_output_carry) { + code.setnc(carry); + } else { + code.setc(carry); + } + ctx.reg_alloc.DefineValue(carry_inst, carry); + } + if (overflow_inst) { + code.seto(overflow); + ctx.reg_alloc.DefineValue(overflow_inst, overflow); + } + if (!is_cmp) { + ctx.reg_alloc.DefineValue(inst, result); + } +} + +void EmitX64::EmitSub32(EmitContext& ctx, IR::Inst* inst) { + EmitSub(code, ctx, inst, 32); +} + +void EmitX64::EmitSub64(EmitContext& ctx, IR::Inst* inst) { + EmitSub(code, ctx, inst, 64); +} + +void EmitX64::EmitMul32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + if (args[1].IsImmediate()) { + code.imul(result, result, args[1].GetImmediateU32()); + } else { + OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]); + op_arg.setBit(32); + + code.imul(result, *op_arg); + } + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitMul64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]); + OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]); + + code.imul(result, *op_arg); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitUnsignedMultiplyHigh64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ctx.reg_alloc.ScratchGpr(HostLoc::RDX); + ctx.reg_alloc.UseScratch(args[0], HostLoc::RAX); + OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]); + code.mul(*op_arg); + + ctx.reg_alloc.DefineValue(inst, rdx); +} + +void EmitX64::EmitSignedMultiplyHigh64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ctx.reg_alloc.ScratchGpr(HostLoc::RDX); + ctx.reg_alloc.UseScratch(args[0], HostLoc::RAX); + OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]); + code.imul(*op_arg); + + ctx.reg_alloc.DefineValue(inst, rdx); +} + +void EmitX64::EmitUnsignedDiv32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ctx.reg_alloc.ScratchGpr(HostLoc::RAX); + ctx.reg_alloc.ScratchGpr(HostLoc::RDX); + const Xbyak::Reg32 dividend = ctx.reg_alloc.UseGpr(args[0]).cvt32(); + const Xbyak::Reg32 divisor = ctx.reg_alloc.UseGpr(args[1]).cvt32(); + + Xbyak::Label end; + + 
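+    // ARM UDIV defines division by zero to return 0 without trapping, whereas x86 DIV raises #DE.
+    // Pre-zero EAX and skip the DIV entirely when the divisor is zero so the defined result is produced.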
code.xor_(eax, eax); + code.test(divisor, divisor); + code.jz(end); + code.mov(eax, dividend); + code.xor_(edx, edx); + code.div(divisor); + code.L(end); + + ctx.reg_alloc.DefineValue(inst, eax); +} + +void EmitX64::EmitUnsignedDiv64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ctx.reg_alloc.ScratchGpr(HostLoc::RAX); + ctx.reg_alloc.ScratchGpr(HostLoc::RDX); + const Xbyak::Reg64 dividend = ctx.reg_alloc.UseGpr(args[0]); + const Xbyak::Reg64 divisor = ctx.reg_alloc.UseGpr(args[1]); + + Xbyak::Label end; + + code.xor_(eax, eax); + code.test(divisor, divisor); + code.jz(end); + code.mov(rax, dividend); + code.xor_(edx, edx); + code.div(divisor); + code.L(end); + + ctx.reg_alloc.DefineValue(inst, rax); +} + +void EmitX64::EmitSignedDiv32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ctx.reg_alloc.ScratchGpr(HostLoc::RAX); + ctx.reg_alloc.ScratchGpr(HostLoc::RDX); + const Xbyak::Reg32 dividend = ctx.reg_alloc.UseGpr(args[0]).cvt32(); + const Xbyak::Reg32 divisor = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32(); + + Xbyak::Label end; + + code.xor_(eax, eax); + code.test(divisor, divisor); + code.jz(end); + code.movsxd(rax, dividend); + code.movsxd(divisor.cvt64(), divisor); + code.cqo(); + code.idiv(divisor.cvt64()); + code.L(end); + + ctx.reg_alloc.DefineValue(inst, eax); +} + +void EmitX64::EmitSignedDiv64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ctx.reg_alloc.ScratchGpr(HostLoc::RAX); + ctx.reg_alloc.ScratchGpr(HostLoc::RDX); + const Xbyak::Reg64 dividend = ctx.reg_alloc.UseGpr(args[0]); + const Xbyak::Reg64 divisor = ctx.reg_alloc.UseGpr(args[1]); + + Xbyak::Label end, ok; + + code.xor_(eax, eax); + code.test(divisor, divisor); + code.jz(end); + code.cmp(divisor, 0xffffffff); // is sign extended + code.jne(ok); + code.mov(rax, 0x8000000000000000); + code.cmp(dividend, rax); + code.je(end); + code.L(ok); + code.mov(rax, dividend); + code.cqo(); + code.idiv(divisor); + code.L(end); + + ctx.reg_alloc.DefineValue(inst, rax); +} + +void EmitX64::EmitAnd32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + + if (args[1].IsImmediate()) { + const u32 op_arg = args[1].GetImmediateU32(); + + code.and_(result, op_arg); + } else { + OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]); + op_arg.setBit(32); + + code.and_(result, *op_arg); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitAnd64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]); + + if (args[1].FitsInImmediateS32()) { + const u32 op_arg = u32(args[1].GetImmediateS32()); + + code.and_(result, op_arg); + } else { + OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]); + op_arg.setBit(64); + + code.and_(result, *op_arg); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitAndNot32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (!args[0].IsImmediate() && !args[1].IsImmediate() && code.HasHostFeature(HostFeature::BMI1)) { + Xbyak::Reg32 op_a = ctx.reg_alloc.UseGpr(args[0]).cvt32(); + Xbyak::Reg32 op_b = ctx.reg_alloc.UseGpr(args[1]).cvt32(); + Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); + code.andn(result, op_b, op_a); + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + Xbyak::Reg32 result; 
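+    // Fallback without BMI1: materialise ~args[1] in result and AND it with args[0], giving
+    // args[0] & ~args[1] -- the same value as the ANDN path above (andn computes ~src1 & src2,
+    // which is why its operands are passed as (result, op_b, op_a)).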
+ if (args[1].IsImmediate()) { + result = ctx.reg_alloc.ScratchGpr().cvt32(); + code.mov(result, u32(~args[1].GetImmediateU32())); + } else { + result = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32(); + code.not_(result); + } + + if (args[0].IsImmediate()) { + const u32 op_arg = args[0].GetImmediateU32(); + code.and_(result, op_arg); + } else { + OpArg op_arg = ctx.reg_alloc.UseOpArg(args[0]); + op_arg.setBit(32); + code.and_(result, *op_arg); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitAndNot64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (!args[0].IsImmediate() && !args[1].IsImmediate() && code.HasHostFeature(HostFeature::BMI1)) { + Xbyak::Reg64 op_a = ctx.reg_alloc.UseGpr(args[0]); + Xbyak::Reg64 op_b = ctx.reg_alloc.UseGpr(args[1]); + Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr(); + code.andn(result, op_b, op_a); + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + Xbyak::Reg64 result; + if (args[1].IsImmediate()) { + result = ctx.reg_alloc.ScratchGpr(); + code.mov(result, ~args[1].GetImmediateU64()); + } else { + result = ctx.reg_alloc.UseScratchGpr(args[1]); + code.not_(result); + } + + if (args[0].FitsInImmediateS32()) { + const u32 op_arg = u32(args[0].GetImmediateS32()); + code.and_(result, op_arg); + } else { + OpArg op_arg = ctx.reg_alloc.UseOpArg(args[0]); + op_arg.setBit(64); + code.and_(result, *op_arg); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitEor32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + + if (args[1].IsImmediate()) { + const u32 op_arg = args[1].GetImmediateU32(); + + code.xor_(result, op_arg); + } else { + OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]); + op_arg.setBit(32); + + code.xor_(result, *op_arg); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitEor64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]); + + if (args[1].FitsInImmediateS32()) { + const u32 op_arg = u32(args[1].GetImmediateS32()); + + code.xor_(result, op_arg); + } else { + OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]); + op_arg.setBit(64); + + code.xor_(result, *op_arg); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitOr32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + + if (args[1].IsImmediate()) { + const u32 op_arg = args[1].GetImmediateU32(); + + code.or_(result, op_arg); + } else { + OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]); + op_arg.setBit(32); + + code.or_(result, *op_arg); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitOr64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]); + + if (args[1].FitsInImmediateS32()) { + const u32 op_arg = u32(args[1].GetImmediateS32()); + + code.or_(result, op_arg); + } else { + OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]); + op_arg.setBit(64); + + code.or_(result, *op_arg); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitNot32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Xbyak::Reg32 result; + if (args[0].IsImmediate()) { + result = 
ctx.reg_alloc.ScratchGpr().cvt32(); + code.mov(result, u32(~args[0].GetImmediateU32())); + } else { + result = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + code.not_(result); + } + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitNot64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Xbyak::Reg64 result; + if (args[0].IsImmediate()) { + result = ctx.reg_alloc.ScratchGpr(); + code.mov(result, ~args[0].GetImmediateU64()); + } else { + result = ctx.reg_alloc.UseScratchGpr(args[0]); + code.not_(result); + } + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitSignExtendByteToWord(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]); + code.movsx(result.cvt32(), result.cvt8()); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitSignExtendHalfToWord(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]); + code.movsx(result.cvt32(), result.cvt16()); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitSignExtendByteToLong(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]); + code.movsx(result.cvt64(), result.cvt8()); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitSignExtendHalfToLong(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]); + code.movsx(result.cvt64(), result.cvt16()); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitSignExtendWordToLong(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]); + code.movsxd(result.cvt64(), result.cvt32()); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitZeroExtendByteToWord(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]); + code.movzx(result.cvt32(), result.cvt8()); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitZeroExtendHalfToWord(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]); + code.movzx(result.cvt32(), result.cvt16()); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitZeroExtendByteToLong(EmitContext& ctx, IR::Inst* inst) { + // x64 zeros upper 32 bits on a 32-bit move + EmitZeroExtendByteToWord(ctx, inst); +} + +void EmitX64::EmitZeroExtendHalfToLong(EmitContext& ctx, IR::Inst* inst) { + // x64 zeros upper 32 bits on a 32-bit move + EmitZeroExtendHalfToWord(ctx, inst); +} + +void EmitX64::EmitZeroExtendWordToLong(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]); + code.mov(result.cvt32(), result.cvt32()); // x64 zeros upper 32 bits on a 32-bit move + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitZeroExtendLongToQuad(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + if (args[0].IsInGpr()) { + const Xbyak::Reg64 source = ctx.reg_alloc.UseGpr(args[0]); + const Xbyak::Xmm result = 
ctx.reg_alloc.ScratchXmm(); + code.movq(result, source); + ctx.reg_alloc.DefineValue(inst, result); + } else { + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + code.movq(result, result); + ctx.reg_alloc.DefineValue(inst, result); + } +} + +void EmitX64::EmitByteReverseWord(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + code.bswap(result); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitByteReverseHalf(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg16 result = ctx.reg_alloc.UseScratchGpr(args[0]).cvt16(); + code.rol(result, 8); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitByteReverseDual(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]); + code.bswap(result); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + if (code.HasHostFeature(HostFeature::LZCNT)) { + const Xbyak::Reg32 source = ctx.reg_alloc.UseGpr(args[0]).cvt32(); + const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); + + code.lzcnt(result, source); + + ctx.reg_alloc.DefineValue(inst, result); + } else { + const Xbyak::Reg32 source = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); + + // The result of a bsr of zero is undefined, but zf is set after it. + code.bsr(result, source); + code.mov(source, 0xFFFFFFFF); + code.cmovz(result, source); + code.neg(result); + code.add(result, 31); + + ctx.reg_alloc.DefineValue(inst, result); + } +} + +void EmitX64::EmitCountLeadingZeros64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + if (code.HasHostFeature(HostFeature::LZCNT)) { + const Xbyak::Reg64 source = ctx.reg_alloc.UseGpr(args[0]).cvt64(); + const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr().cvt64(); + + code.lzcnt(result, source); + + ctx.reg_alloc.DefineValue(inst, result); + } else { + const Xbyak::Reg64 source = ctx.reg_alloc.UseScratchGpr(args[0]).cvt64(); + const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr().cvt64(); + + // The result of a bsr of zero is undefined, but zf is set after it. 
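+        // BSR yields the index of the highest set bit, so CLZ = 63 - index. When the source is zero,
+        // CMOVZ substitutes -1, and 63 - (-1) gives the architecturally required result of 64.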
+ code.bsr(result, source); + code.mov(source.cvt32(), 0xFFFFFFFF); + code.cmovz(result.cvt32(), source.cvt32()); + code.neg(result.cvt32()); + code.add(result.cvt32(), 63); + + ctx.reg_alloc.DefineValue(inst, result); + } +} + +void EmitX64::EmitMaxSigned32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg32 x = ctx.reg_alloc.UseGpr(args[0]).cvt32(); + const Xbyak::Reg32 y = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32(); + + code.cmp(x, y); + code.cmovge(y, x); + + ctx.reg_alloc.DefineValue(inst, y); +} + +void EmitX64::EmitMaxSigned64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg64 x = ctx.reg_alloc.UseGpr(args[0]); + const Xbyak::Reg64 y = ctx.reg_alloc.UseScratchGpr(args[1]); + + code.cmp(x, y); + code.cmovge(y, x); + + ctx.reg_alloc.DefineValue(inst, y); +} + +void EmitX64::EmitMaxUnsigned32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg32 x = ctx.reg_alloc.UseGpr(args[0]).cvt32(); + const Xbyak::Reg32 y = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32(); + + code.cmp(x, y); + code.cmova(y, x); + + ctx.reg_alloc.DefineValue(inst, y); +} + +void EmitX64::EmitMaxUnsigned64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg64 x = ctx.reg_alloc.UseGpr(args[0]); + const Xbyak::Reg64 y = ctx.reg_alloc.UseScratchGpr(args[1]); + + code.cmp(x, y); + code.cmova(y, x); + + ctx.reg_alloc.DefineValue(inst, y); +} + +void EmitX64::EmitMinSigned32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg32 x = ctx.reg_alloc.UseGpr(args[0]).cvt32(); + const Xbyak::Reg32 y = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32(); + + code.cmp(x, y); + code.cmovle(y, x); + + ctx.reg_alloc.DefineValue(inst, y); +} + +void EmitX64::EmitMinSigned64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg64 x = ctx.reg_alloc.UseGpr(args[0]); + const Xbyak::Reg64 y = ctx.reg_alloc.UseScratchGpr(args[1]); + + code.cmp(x, y); + code.cmovle(y, x); + + ctx.reg_alloc.DefineValue(inst, y); +} + +void EmitX64::EmitMinUnsigned32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg32 x = ctx.reg_alloc.UseGpr(args[0]).cvt32(); + const Xbyak::Reg32 y = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32(); + + code.cmp(x, y); + code.cmovb(y, x); + + ctx.reg_alloc.DefineValue(inst, y); +} + +void EmitX64::EmitMinUnsigned64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg64 x = ctx.reg_alloc.UseGpr(args[0]); + const Xbyak::Reg64 y = ctx.reg_alloc.UseScratchGpr(args[1]); + + code.cmp(x, y); + code.cmovb(y, x); + + ctx.reg_alloc.DefineValue(inst, y); +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp new file mode 100644 index 0000000000..182c887538 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp @@ -0,0 +1,2134 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <optional> +#include <type_traits> +#include <utility> + +#include <mcl/assert.hpp> +#include <mcl/mp/metavalue/lift_value.hpp> +#include <mcl/mp/typelist/cartesian_product.hpp> +#include <mcl/mp/typelist/get.hpp> +#include <mcl/mp/typelist/lift_sequence.hpp> +#include <mcl/mp/typelist/list.hpp> +#include <mcl/mp/typelist/lower_to_tuple.hpp> +#include <mcl/stdint.hpp> +#include <mcl/type_traits/integer_of_size.hpp> +#include <xbyak/xbyak.h> + +#include "dynarmic/backend/x64/abi.h" +#include "dynarmic/backend/x64/block_of_code.h" +#include "dynarmic/backend/x64/constants.h" +#include "dynarmic/backend/x64/emit_x64.h" +#include "dynarmic/common/cast_util.h" +#include "dynarmic/common/fp/fpcr.h" +#include "dynarmic/common/fp/fpsr.h" +#include "dynarmic/common/fp/info.h" +#include "dynarmic/common/fp/op.h" +#include "dynarmic/common/fp/rounding_mode.h" +#include "dynarmic/common/lut_from_list.h" +#include "dynarmic/interface/optimization_flags.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" + +namespace Dynarmic::Backend::X64 { + +using namespace Xbyak::util; +namespace mp = mcl::mp; + +namespace { + +const Xbyak::Reg64 INVALID_REG = Xbyak::Reg64(-1); + +constexpr u64 f32_negative_zero = 0x80000000u; +constexpr u64 f32_nan = 0x7fc00000u; +constexpr u64 f32_non_sign_mask = 0x7fffffffu; +constexpr u64 f32_smallest_normal = 0x00800000u; + +constexpr u64 f64_negative_zero = 0x8000000000000000u; +constexpr u64 f64_nan = 0x7ff8000000000000u; +constexpr u64 f64_non_sign_mask = 0x7fffffffffffffffu; +constexpr u64 f64_smallest_normal = 0x0010000000000000u; + +constexpr u64 f64_min_s16 = 0xc0e0000000000000u; // -32768 as a double +constexpr u64 f64_max_s16 = 0x40dfffc000000000u; // 32767 as a double +constexpr u64 f64_min_u16 = 0x0000000000000000u; // 0 as a double +constexpr u64 f64_max_u16 = 0x40efffe000000000u; // 65535 as a double +constexpr u64 f64_max_s32 = 0x41dfffffffc00000u; // 2147483647 as a double +constexpr u64 f64_max_u32 = 0x41efffffffe00000u; // 4294967295 as a double +constexpr u64 f64_max_s64_lim = 0x43e0000000000000u; // 2^63 as a double (actual maximum unrepresentable) + +#define FCODE(NAME) \ + [&code](auto... args) { \ + if constexpr (fsize == 32) { \ + code.NAME##s(args...); \ + } else { \ + code.NAME##d(args...); \ + } \ + } +#define ICODE(NAME) \ + [&code](auto... args) { \ + if constexpr (fsize == 32) { \ + code.NAME##d(args...); \ + } else { \ + code.NAME##q(args...); \ + } \ + } + +template<size_t fsize> +void ForceDenormalsToZero(BlockOfCode& code, std::initializer_list<Xbyak::Xmm> to_daz) { + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + constexpr u32 denormal_to_zero = FixupLUT( + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src); + + const Xbyak::Xmm tmp = xmm16; + FCODE(vmovap)(tmp, code.BConst<fsize>(xword, denormal_to_zero)); + + for (const Xbyak::Xmm& xmm : to_daz) { + FCODE(vfixupimms)(xmm, xmm, tmp, u8(0)); + } + return; + } + + for (const Xbyak::Xmm& xmm : to_daz) { + code.movaps(xmm0, code.Const(xword, fsize == 32 ? 
f32_non_sign_mask : f64_non_sign_mask)); + code.andps(xmm0, xmm); + if constexpr (fsize == 32) { + code.pcmpgtd(xmm0, code.Const(xword, f32_smallest_normal - 1)); + } else if (code.HasHostFeature(HostFeature::SSE42)) { + code.pcmpgtq(xmm0, code.Const(xword, f64_smallest_normal - 1)); + } else { + code.pcmpgtd(xmm0, code.Const(xword, f64_smallest_normal - 1)); + code.pshufd(xmm0, xmm0, 0b11100101); + } + code.orps(xmm0, code.Const(xword, fsize == 32 ? f32_negative_zero : f64_negative_zero)); + code.andps(xmm, xmm0); + } +} + +template<size_t fsize> +void DenormalsAreZero(BlockOfCode& code, EmitContext& ctx, std::initializer_list<Xbyak::Xmm> to_daz) { + if (ctx.FPCR().FZ()) { + ForceDenormalsToZero<fsize>(code, to_daz); + } +} + +template<size_t fsize> +void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Xmm xmm_scratch) { + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + constexpr u32 nan_to_zero = FixupLUT(FpFixup::PosZero, + FpFixup::PosZero); + FCODE(vfixupimms)(xmm_value, xmm_value, code.Const(ptr, u64(nan_to_zero)), u8(0)); + } else if (code.HasHostFeature(HostFeature::AVX)) { + FCODE(vcmpords)(xmm_scratch, xmm_value, xmm_value); + FCODE(vandp)(xmm_value, xmm_value, xmm_scratch); + } else { + code.xorps(xmm_scratch, xmm_scratch); + FCODE(cmpords)(xmm_scratch, xmm_value); // true mask when ordered (i.e.: when not an NaN) + code.pand(xmm_value, xmm_scratch); + } +} + +template<size_t fsize> +void ForceToDefaultNaN(BlockOfCode& code, Xbyak::Xmm result) { + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + const Xbyak::Opmask nan_mask = k1; + FCODE(vfpclasss)(nan_mask, result, u8(FpClass::QNaN | FpClass::SNaN)); + FCODE(vblendmp)(result | nan_mask, result, code.Const(ptr_b, fsize == 32 ? f32_nan : f64_nan)); + } else if (code.HasHostFeature(HostFeature::AVX)) { + FCODE(vcmpunords)(xmm0, result, result); + FCODE(blendvp)(result, code.Const(xword, fsize == 32 ? f32_nan : f64_nan)); + } else { + Xbyak::Label end; + FCODE(ucomis)(result, result); + code.jnp(end); + code.movaps(result, code.Const(xword, fsize == 32 ? f32_nan : f64_nan)); + code.L(end); + } +} + +template<size_t fsize> +SharedLabel ProcessNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm a) { + SharedLabel nan = GenSharedLabel(), end = GenSharedLabel(); + + FCODE(ucomis)(a, a); + code.jp(*nan, code.T_NEAR); + + ctx.deferred_emits.emplace_back([=, &code] { + code.L(*nan); + code.orps(a, code.Const(xword, fsize == 32 ? 0x00400000 : 0x0008'0000'0000'0000)); + code.jmp(*end, code.T_NEAR); + }); + + return end; +} + +template<size_t fsize> +void PostProcessNaN(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm tmp) { + code.movaps(tmp, result); + FCODE(cmpunordp)(tmp, tmp); + ICODE(psll)(tmp, int(fsize - 1)); + code.xorps(result, tmp); +} + +// This is necessary because x86 and ARM differ in they way they return NaNs from floating point operations +// +// ARM behaviour: +// op1 op2 result +// SNaN SNaN/QNaN op1 +// QNaN SNaN op2 +// QNaN QNaN op1 +// SNaN/QNaN other op1 +// other SNaN/QNaN op2 +// +// x86 behaviour: +// op1 op2 result +// SNaN/QNaN SNaN/QNaN op1 +// SNaN/QNaN other op1 +// other SNaN/QNaN op2 +// +// With ARM: SNaNs take priority. With x86: it doesn't matter. +// +// From the above we can see what differs between the architectures is +// the case when op1 == QNaN and op2 == SNaN. +// +// We assume that registers op1 and op2 are read-only. This function also trashes xmm0. +// We allow for the case where op1 and result are the same register. 
We do not read from op1 once result is written to. +template<size_t fsize> +void EmitPostProcessNaNs(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm op1, Xbyak::Xmm op2, Xbyak::Reg64 tmp, Xbyak::Label end) { + using FPT = mcl::unsigned_integer_of_size<fsize>; + constexpr FPT exponent_mask = FP::FPInfo<FPT>::exponent_mask; + constexpr FPT mantissa_msb = FP::FPInfo<FPT>::mantissa_msb; + constexpr u8 mantissa_msb_bit = static_cast<u8>(FP::FPInfo<FPT>::explicit_mantissa_width - 1); + + // At this point we know that at least one of op1 and op2 is a NaN. + // Thus in op1 ^ op2 at least one of the two would have all 1 bits in the exponent. + // Keeping in mind xor is commutative, there are only four cases: + // SNaN ^ SNaN/Inf -> exponent == 0, mantissa_msb == 0 + // QNaN ^ QNaN -> exponent == 0, mantissa_msb == 0 + // QNaN ^ SNaN/Inf -> exponent == 0, mantissa_msb == 1 + // SNaN/QNaN ^ Otherwise -> exponent != 0, mantissa_msb == ? + // + // We're only really interested in op1 == QNaN and op2 == SNaN, + // so we filter out everything else. + // + // We do it this way instead of checking that op1 is QNaN because + // op1 == QNaN && op2 == QNaN is the most common case. With this method + // that case would only require one branch. + + if (code.HasHostFeature(HostFeature::AVX)) { + code.vxorps(xmm0, op1, op2); + } else { + code.movaps(xmm0, op1); + code.xorps(xmm0, op2); + } + + constexpr size_t shift = fsize == 32 ? 0 : 48; + if constexpr (fsize == 32) { + code.movd(tmp.cvt32(), xmm0); + } else { + // We do this to avoid requiring 64-bit immediates + code.pextrw(tmp.cvt32(), xmm0, shift / 16); + } + code.and_(tmp.cvt32(), static_cast<u32>((exponent_mask | mantissa_msb) >> shift)); + code.cmp(tmp.cvt32(), static_cast<u32>(mantissa_msb >> shift)); + code.jne(end, code.T_NEAR); + + // If we're here there are four cases left: + // op1 == SNaN && op2 == QNaN + // op1 == Inf && op2 == QNaN + // op1 == QNaN && op2 == SNaN <<< The problematic case + // op1 == QNaN && op2 == Inf + + if constexpr (fsize == 32) { + code.movd(tmp.cvt32(), op2); + code.shl(tmp.cvt32(), 32 - mantissa_msb_bit); + } else { + code.movq(tmp, op2); + code.shl(tmp, 64 - mantissa_msb_bit); + } + // If op2 is a SNaN, CF = 0 and ZF = 0. + code.jna(end, code.T_NEAR); + + // Silence the SNaN as required by spec. 
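+    // (Setting the most significant mantissa bit turns the signalling NaN into the corresponding
+    // quiet NaN, which is the value the ARM FPProcessNaN pseudocode returns for op2 here.)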
+ if (code.HasHostFeature(HostFeature::AVX)) { + code.vorps(result, op2, code.Const(xword, mantissa_msb)); + } else { + code.movaps(result, op2); + code.orps(result, code.Const(xword, mantissa_msb)); + } + code.jmp(end, code.T_NEAR); +} + +template<size_t fsize, typename Function> +void FPTwoOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + SharedLabel end = GenSharedLabel(); + + Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + + if (!ctx.FPCR().DN() && !ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) { + end = ProcessNaN<fsize>(code, ctx, result); + } + if constexpr (std::is_member_function_pointer_v<Function>) { + (code.*fn)(result, result); + } else { + fn(result); + } + if (ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) { + // Do nothing + } else if (ctx.FPCR().DN()) { + ForceToDefaultNaN<fsize>(code, result); + } else { + PostProcessNaN<fsize>(code, result, xmm0); + } + code.L(*end); + + ctx.reg_alloc.DefineValue(inst, result); +} + +template<size_t fsize, typename Function> +void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { + using FPT = mcl::unsigned_integer_of_size<fsize>; + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (ctx.FPCR().DN() || ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) { + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]); + + if constexpr (std::is_member_function_pointer_v<Function>) { + (code.*fn)(result, operand); + } else { + fn(result, operand); + } + + if (!ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) { + ForceToDefaultNaN<fsize>(code, result); + } + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + const Xbyak::Xmm op1 = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm op2 = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); + + SharedLabel end = GenSharedLabel(), nan = GenSharedLabel(); + + code.movaps(result, op1); + if constexpr (std::is_member_function_pointer_v<Function>) { + (code.*fn)(result, op2); + } else { + fn(result, op2); + } + FCODE(ucomis)(result, result); + code.jp(*nan, code.T_NEAR); + code.L(*end); + + ctx.deferred_emits.emplace_back([=, &code] { + Xbyak::Label op_are_nans; + + code.L(*nan); + FCODE(ucomis)(op1, op2); + code.jp(op_are_nans); + // Here we must return a positive NaN, because the indefinite value on x86 is a negative NaN! 
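+        // For reference: x86's "QNaN floating-point indefinite" has the sign bit set
+        // (0xFFC00000 / 0xFFF8000000000000), whereas the ARM default NaN is the positive
+        // 0x7FC00000 / 0x7FF8000000000000, so we load FPInfo<FPT>::DefaultNaN() explicitly.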
+ code.movaps(result, code.Const(xword, FP::FPInfo<FPT>::DefaultNaN())); + code.jmp(*end, code.T_NEAR); + code.L(op_are_nans); + EmitPostProcessNaNs<fsize>(code, result, op1, op2, tmp, *end); + }); + + ctx.reg_alloc.DefineValue(inst, result); +} + +} // anonymous namespace + +template<size_t fsize> +void FPAbs(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + using FPT = mcl::unsigned_integer_of_size<fsize>; + constexpr FPT non_sign_mask = FP::FPInfo<FPT>::sign_mask - FPT(1u); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Address mask = code.Const(xword, non_sign_mask); + + code.andps(result, mask); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitFPAbs16(EmitContext& ctx, IR::Inst* inst) { + FPAbs<16>(code, ctx, inst); +} + +void EmitX64::EmitFPAbs32(EmitContext& ctx, IR::Inst* inst) { + FPAbs<32>(code, ctx, inst); +} + +void EmitX64::EmitFPAbs64(EmitContext& ctx, IR::Inst* inst) { + FPAbs<64>(code, ctx, inst); +} + +template<size_t fsize> +void FPNeg(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + using FPT = mcl::unsigned_integer_of_size<fsize>; + constexpr FPT sign_mask = FP::FPInfo<FPT>::sign_mask; + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Address mask = code.Const(xword, u64(sign_mask)); + + code.xorps(result, mask); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitFPNeg16(EmitContext& ctx, IR::Inst* inst) { + FPNeg<16>(code, ctx, inst); +} + +void EmitX64::EmitFPNeg32(EmitContext& ctx, IR::Inst* inst) { + FPNeg<32>(code, ctx, inst); +} + +void EmitX64::EmitFPNeg64(EmitContext& ctx, IR::Inst* inst) { + FPNeg<64>(code, ctx, inst); +} + +void EmitX64::EmitFPAdd32(EmitContext& ctx, IR::Inst* inst) { + FPThreeOp<32>(code, ctx, inst, &Xbyak::CodeGenerator::addss); +} + +void EmitX64::EmitFPAdd64(EmitContext& ctx, IR::Inst* inst) { + FPThreeOp<64>(code, ctx, inst, &Xbyak::CodeGenerator::addsd); +} + +void EmitX64::EmitFPDiv32(EmitContext& ctx, IR::Inst* inst) { + FPThreeOp<32>(code, ctx, inst, &Xbyak::CodeGenerator::divss); +} + +void EmitX64::EmitFPDiv64(EmitContext& ctx, IR::Inst* inst) { + FPThreeOp<64>(code, ctx, inst, &Xbyak::CodeGenerator::divsd); +} + +template<size_t fsize, bool is_max> +static void EmitFPMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Reg64 gpr_scratch = ctx.reg_alloc.ScratchGpr(); + + DenormalsAreZero<fsize>(code, ctx, {result, operand}); + + SharedLabel equal = GenSharedLabel(), end = GenSharedLabel(); + + FCODE(ucomis)(result, operand); + code.jz(*equal, code.T_NEAR); + if constexpr (is_max) { + FCODE(maxs)(result, operand); + } else { + FCODE(mins)(result, operand); + } + code.L(*end); + + ctx.deferred_emits.emplace_back([=, &code, &ctx] { + Xbyak::Label nan; + + code.L(*equal); + code.jp(nan); + if constexpr (is_max) { + code.andps(result, operand); + } else { + code.orps(result, operand); + } + code.jmp(*end); + + code.L(nan); + if (ctx.FPCR().DN()) { + code.movaps(result, code.Const(xword, fsize == 32 ? 
f32_nan : f64_nan)); + code.jmp(*end); + } else { + code.movaps(tmp, result); + FCODE(adds)(result, operand); + EmitPostProcessNaNs<fsize>(code, result, tmp, operand, gpr_scratch, *end); + } + }); + + ctx.reg_alloc.DefineValue(inst, result); +} + +template<size_t fsize, bool is_max> +static void EmitFPMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + using FPT = mcl::unsigned_integer_of_size<fsize>; + constexpr FPT default_nan = FP::FPInfo<FPT>::DefaultNaN(); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm op1 = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm op2 = ctx.reg_alloc.UseScratchXmm(args[1]); // Result stored here! + + DenormalsAreZero<fsize>(code, ctx, {op1, op2}); + + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + // vrangep{s,d} will already correctly handle comparing + // signed zeros and propagating NaNs similar to ARM + constexpr FpRangeSelect range_select = is_max ? FpRangeSelect::Max : FpRangeSelect::Min; + FCODE(vranges)(op2, op1, op2, FpRangeLUT(range_select, FpRangeSign::Preserve)); + + if (ctx.FPCR().DN()) { + FCODE(vcmps)(k1, op2, op2, Cmp::Unordered_Q); + FCODE(vmovs)(op2 | k1, code.Const(xword, default_nan)); + } + } else { + Xbyak::Reg tmp = ctx.reg_alloc.ScratchGpr(); + tmp.setBit(fsize); + + const auto move_to_tmp = [=, &code](const Xbyak::Xmm& xmm) { + if constexpr (fsize == 32) { + code.movd(tmp.cvt32(), xmm); + } else { + code.movq(tmp.cvt64(), xmm); + } + }; + + SharedLabel end = GenSharedLabel(), z = GenSharedLabel(); + + FCODE(ucomis)(op1, op2); + code.jz(*z, code.T_NEAR); + if constexpr (is_max) { + FCODE(maxs)(op2, op1); + } else { + FCODE(mins)(op2, op1); + } + code.L(*end); + + ctx.deferred_emits.emplace_back([=, &code, &ctx] { + Xbyak::Label nan, op2_is_nan, snan, maybe_both_nan; + + constexpr u8 mantissa_msb_bit = static_cast<u8>(FP::FPInfo<FPT>::explicit_mantissa_width - 1); + + code.L(*z); + code.jp(nan); + if constexpr (is_max) { + code.andps(op2, op1); + } else { + code.orps(op2, op1); + } + code.jmp(*end); + + // NaN requirements: + // op1 op2 result + // SNaN anything op1 + // !SNaN SNaN op2 + // QNaN !NaN op2 + // !NaN QNaN op1 + // QNaN QNaN op1 + + code.L(nan); + FCODE(ucomis)(op1, op1); + code.jnp(op2_is_nan); + + // op1 is NaN + move_to_tmp(op1); + code.bt(tmp, mantissa_msb_bit); + code.jc(maybe_both_nan); + if (ctx.FPCR().DN()) { + code.L(snan); + code.movaps(op2, code.Const(xword, default_nan)); + code.jmp(*end); + } else { + code.movaps(op2, op1); + code.L(snan); + code.orps(op2, code.Const(xword, FP::FPInfo<FPT>::mantissa_msb)); + code.jmp(*end); + } + + code.L(maybe_both_nan); + FCODE(ucomis)(op2, op2); + code.jnp(*end, code.T_NEAR); + if (ctx.FPCR().DN()) { + code.jmp(snan); + } else { + move_to_tmp(op2); + code.bt(tmp.cvt64(), mantissa_msb_bit); + code.jnc(snan); + code.movaps(op2, op1); + code.jmp(*end); + } + + // op2 is NaN + code.L(op2_is_nan); + move_to_tmp(op2); + code.bt(tmp, mantissa_msb_bit); + code.jnc(snan); + code.movaps(op2, op1); + code.jmp(*end); + }); + } + + ctx.reg_alloc.DefineValue(inst, op2); +} + +void EmitX64::EmitFPMax32(EmitContext& ctx, IR::Inst* inst) { + EmitFPMinMax<32, true>(code, ctx, inst); +} + +void EmitX64::EmitFPMax64(EmitContext& ctx, IR::Inst* inst) { + EmitFPMinMax<64, true>(code, ctx, inst); +} + +void EmitX64::EmitFPMaxNumeric32(EmitContext& ctx, IR::Inst* inst) { + EmitFPMinMaxNumeric<32, true>(code, ctx, inst); +} + +void EmitX64::EmitFPMaxNumeric64(EmitContext& ctx, IR::Inst* inst) { + EmitFPMinMaxNumeric<64, true>(code, 
ctx, inst); +} + +void EmitX64::EmitFPMin32(EmitContext& ctx, IR::Inst* inst) { + EmitFPMinMax<32, false>(code, ctx, inst); +} + +void EmitX64::EmitFPMin64(EmitContext& ctx, IR::Inst* inst) { + EmitFPMinMax<64, false>(code, ctx, inst); +} + +void EmitX64::EmitFPMinNumeric32(EmitContext& ctx, IR::Inst* inst) { + EmitFPMinMaxNumeric<32, false>(code, ctx, inst); +} + +void EmitX64::EmitFPMinNumeric64(EmitContext& ctx, IR::Inst* inst) { + EmitFPMinMaxNumeric<64, false>(code, ctx, inst); +} + +void EmitX64::EmitFPMul32(EmitContext& ctx, IR::Inst* inst) { + FPThreeOp<32>(code, ctx, inst, &Xbyak::CodeGenerator::mulss); +} + +void EmitX64::EmitFPMul64(EmitContext& ctx, IR::Inst* inst) { + FPThreeOp<64>(code, ctx, inst, &Xbyak::CodeGenerator::mulsd); +} + +template<size_t fsize, bool negate_product> +static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + using FPT = mcl::unsigned_integer_of_size<fsize>; + const auto fallback_fn = negate_product ? &FP::FPMulSub<FPT> : &FP::FPMulAdd<FPT>; + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if constexpr (fsize != 16) { + const bool needs_rounding_correction = ctx.FPCR().FZ(); + const bool needs_nan_correction = !ctx.FPCR().DN(); + + if (code.HasHostFeature(HostFeature::FMA) && !needs_rounding_correction && !needs_nan_correction) { + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]); + + if constexpr (negate_product) { + FCODE(vfnmadd231s)(result, operand2, operand3); + } else { + FCODE(vfmadd231s)(result, operand2, operand3); + } + if (ctx.FPCR().DN()) { + ForceToDefaultNaN<fsize>(code, result); + } + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) { + SharedLabel fallback = GenSharedLabel(), end = GenSharedLabel(); + + const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + code.movaps(result, operand1); + if constexpr (negate_product) { + FCODE(vfnmadd231s)(result, operand2, operand3); + } else { + FCODE(vfmadd231s)(result, operand2, operand3); + } + + if (needs_rounding_correction && needs_nan_correction) { + code.vandps(xmm0, result, code.Const(xword, fsize == 32 ? f32_non_sign_mask : f64_non_sign_mask)); + FCODE(ucomis)(xmm0, code.Const(xword, fsize == 32 ? f32_smallest_normal : f64_smallest_normal)); + code.jz(*fallback, code.T_NEAR); + } else if (needs_rounding_correction) { + code.vandps(xmm0, result, code.Const(xword, fsize == 32 ? f32_non_sign_mask : f64_non_sign_mask)); + code.vxorps(xmm0, xmm0, code.Const(xword, fsize == 32 ? 
f32_smallest_normal : f64_smallest_normal)); + code.ptest(xmm0, xmm0); + code.jz(*fallback, code.T_NEAR); + } else if (needs_nan_correction) { + FCODE(ucomis)(result, result); + code.jp(*fallback, code.T_NEAR); + } else { + UNREACHABLE(); + } + if (ctx.FPCR().DN()) { + ForceToDefaultNaN<fsize>(code, result); + } + code.L(*end); + + ctx.deferred_emits.emplace_back([=, &code, &ctx] { + code.L(*fallback); + + Xbyak::Label nan; + + if (needs_rounding_correction && needs_nan_correction) { + code.jp(nan, code.T_NEAR); + } + + if (needs_rounding_correction) { + // x64 rounds before flushing to zero + // AArch64 rounds after flushing to zero + // This difference of behaviour is noticable if something would round to a smallest normalized number + + code.sub(rsp, 8); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); + code.movq(code.ABI_PARAM1, operand1); + code.movq(code.ABI_PARAM2, operand2); + code.movq(code.ABI_PARAM3, operand3); + code.mov(code.ABI_PARAM4.cvt32(), ctx.FPCR().Value()); +#ifdef _WIN32 + code.sub(rsp, 16 + ABI_SHADOW_SPACE); + code.lea(rax, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); + code.mov(qword[rsp + ABI_SHADOW_SPACE], rax); + code.CallFunction(fallback_fn); + code.add(rsp, 16 + ABI_SHADOW_SPACE); +#else + code.lea(code.ABI_PARAM5, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); + code.CallFunction(fallback_fn); +#endif + code.movq(result, code.ABI_RETURN); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); + code.add(rsp, 8); + code.jmp(*end); + } + + if (needs_nan_correction) { + code.L(nan); + + // AArch64 preferentially returns the first SNaN over the first QNaN + // For x64 vfmadd231ss, x64 returns the first of {op2, op3, op1} that is a NaN, irregardless of signalling state + + Xbyak::Label has_nan, indeterminate, op1_snan, op1_done, op2_done, op3_done; + + code.vmovaps(xmm0, code.Const(xword, FP::FPInfo<FPT>::mantissa_msb)); + + FCODE(ucomis)(operand2, operand3); + code.jp(has_nan); + FCODE(ucomis)(operand1, operand1); + code.jnp(indeterminate); + + // AArch64 specifically emits a default NaN for the case when the addend is a QNaN and the two other arguments are {inf, zero} + code.ptest(operand1, xmm0); + code.jz(op1_snan); + FCODE(vmuls)(xmm0, operand2, operand3); // check if {op2, op3} are {inf, zero}/{zero, inf} + FCODE(ucomis)(xmm0, xmm0); + code.jnp(*end); + + code.L(indeterminate); + code.vmovaps(result, code.Const(xword, FP::FPInfo<FPT>::DefaultNaN())); + code.jmp(*end); + + code.L(has_nan); + + FCODE(ucomis)(operand1, operand1); + code.jnp(op1_done); + code.movaps(result, operand1); // this is done because of NaN behavior of vfmadd231s (priority of op2, op3, op1) + code.ptest(operand1, xmm0); + code.jnz(op1_done); + code.L(op1_snan); + code.vorps(result, operand1, xmm0); + code.jmp(*end); + code.L(op1_done); + + FCODE(ucomis)(operand2, operand2); + code.jnp(op2_done); + code.ptest(operand2, xmm0); + code.jnz(op2_done); + code.vorps(result, operand2, xmm0); + if constexpr (negate_product) { + code.xorps(result, code.Const(xword, FP::FPInfo<FPT>::sign_mask)); + } + code.jmp(*end); + code.L(op2_done); + + FCODE(ucomis)(operand3, operand3); + code.jnp(op3_done); + code.ptest(operand3, xmm0); + code.jnz(op3_done); + code.vorps(result, operand3, xmm0); + code.jmp(*end); + code.L(op3_done); + + // at this point, all SNaNs have been handled + // if op1 was not a QNaN and op2 is, negate the result + if constexpr (negate_product) { + FCODE(ucomis)(operand1, 
operand1); + code.jp(*end); + FCODE(ucomis)(operand2, operand2); + code.jnp(*end); + code.xorps(result, code.Const(xword, FP::FPInfo<FPT>::sign_mask)); + } + + code.jmp(*end); + } + }); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) { + const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm operand2 = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]); + + if constexpr (negate_product) { + code.xorps(operand2, code.Const(xword, FP::FPInfo<FPT>::sign_mask)); + } + FCODE(muls)(operand2, operand3); + FCODE(adds)(operand1, operand2); + + ctx.reg_alloc.DefineValue(inst, operand1); + return; + } + } + + ctx.reg_alloc.HostCall(inst, args[0], args[1], args[2]); + code.mov(code.ABI_PARAM4.cvt32(), ctx.FPCR().Value()); +#ifdef _WIN32 + ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE); + code.lea(rax, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); + code.mov(qword[rsp + ABI_SHADOW_SPACE], rax); + code.CallFunction(fallback_fn); + ctx.reg_alloc.ReleaseStackSpace(16 + ABI_SHADOW_SPACE); +#else + code.lea(code.ABI_PARAM5, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); + code.CallFunction(fallback_fn); +#endif +} + +void EmitX64::EmitFPMulAdd16(EmitContext& ctx, IR::Inst* inst) { + EmitFPMulAdd<16, false>(code, ctx, inst); +} + +void EmitX64::EmitFPMulAdd32(EmitContext& ctx, IR::Inst* inst) { + EmitFPMulAdd<32, false>(code, ctx, inst); +} + +void EmitX64::EmitFPMulAdd64(EmitContext& ctx, IR::Inst* inst) { + EmitFPMulAdd<64, false>(code, ctx, inst); +} + +void EmitX64::EmitFPMulSub16(EmitContext& ctx, IR::Inst* inst) { + EmitFPMulAdd<16, true>(code, ctx, inst); +} + +void EmitX64::EmitFPMulSub32(EmitContext& ctx, IR::Inst* inst) { + EmitFPMulAdd<32, true>(code, ctx, inst); +} + +void EmitX64::EmitFPMulSub64(EmitContext& ctx, IR::Inst* inst) { + EmitFPMulAdd<64, true>(code, ctx, inst); +} + +template<size_t fsize> +static void EmitFPMulX(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + using FPT = mcl::unsigned_integer_of_size<fsize>; + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const bool do_default_nan = ctx.FPCR().DN(); + + const Xbyak::Xmm op1 = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm op2 = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Reg64 tmp = do_default_nan ? 
INVALID_REG : ctx.reg_alloc.ScratchGpr(); + + SharedLabel end = GenSharedLabel(), nan = GenSharedLabel(); + + if (code.HasHostFeature(HostFeature::AVX)) { + FCODE(vmuls)(result, op1, op2); + } else { + code.movaps(result, op1); + FCODE(muls)(result, op2); + } + FCODE(ucomis)(result, result); + code.jp(*nan, code.T_NEAR); + code.L(*end); + + ctx.deferred_emits.emplace_back([=, &code] { + Xbyak::Label op_are_nans; + + code.L(*nan); + FCODE(ucomis)(op1, op2); + code.jp(op_are_nans); + if (code.HasHostFeature(HostFeature::AVX)) { + code.vxorps(result, op1, op2); + } else { + code.movaps(result, op1); + code.xorps(result, op2); + } + code.andps(result, code.Const(xword, FP::FPInfo<FPT>::sign_mask)); + code.orps(result, code.Const(xword, FP::FPValue<FPT, false, 0, 2>())); + code.jmp(*end, code.T_NEAR); + code.L(op_are_nans); + if (do_default_nan) { + code.movaps(result, code.Const(xword, FP::FPInfo<FPT>::DefaultNaN())); + code.jmp(*end, code.T_NEAR); + } else { + EmitPostProcessNaNs<fsize>(code, result, op1, op2, tmp, *end); + } + }); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitFPMulX32(EmitContext& ctx, IR::Inst* inst) { + EmitFPMulX<32>(code, ctx, inst); +} + +void EmitX64::EmitFPMulX64(EmitContext& ctx, IR::Inst* inst) { + EmitFPMulX<64>(code, ctx, inst); +} + +template<size_t fsize> +static void EmitFPRecipEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + using FPT = mcl::unsigned_integer_of_size<fsize>; + + if constexpr (fsize != 16) { + if (ctx.HasOptimization(OptimizationFlag::Unsafe_ReducedErrorFP)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + FCODE(vrcp14s)(result, operand, operand); + } else { + if constexpr (fsize == 32) { + code.rcpss(result, operand); + } else { + code.cvtsd2ss(result, operand); + code.rcpss(result, result); + code.cvtss2sd(result, result); + } + } + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.HostCall(inst, args[0]); + code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value()); + code.lea(code.ABI_PARAM3, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); + code.CallFunction(&FP::FPRecipEstimate<FPT>); +} + +void EmitX64::EmitFPRecipEstimate16(EmitContext& ctx, IR::Inst* inst) { + EmitFPRecipEstimate<16>(code, ctx, inst); +} + +void EmitX64::EmitFPRecipEstimate32(EmitContext& ctx, IR::Inst* inst) { + EmitFPRecipEstimate<32>(code, ctx, inst); +} + +void EmitX64::EmitFPRecipEstimate64(EmitContext& ctx, IR::Inst* inst) { + EmitFPRecipEstimate<64>(code, ctx, inst); +} + +template<size_t fsize> +static void EmitFPRecipExponent(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + using FPT = mcl::unsigned_integer_of_size<fsize>; + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.HostCall(inst, args[0]); + code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value()); + code.lea(code.ABI_PARAM3, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); + code.CallFunction(&FP::FPRecipExponent<FPT>); +} + +void EmitX64::EmitFPRecipExponent16(EmitContext& ctx, IR::Inst* inst) { + EmitFPRecipExponent<16>(code, ctx, inst); +} + +void EmitX64::EmitFPRecipExponent32(EmitContext& ctx, IR::Inst* inst) { + EmitFPRecipExponent<32>(code, ctx, inst); +} + +void EmitX64::EmitFPRecipExponent64(EmitContext& ctx, IR::Inst* inst) { + 
EmitFPRecipExponent<64>(code, ctx, inst); +} + +template<size_t fsize> +static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + using FPT = mcl::unsigned_integer_of_size<fsize>; + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if constexpr (fsize != 16) { + if (code.HasHostFeature(HostFeature::FMA) && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) { + Xbyak::Label end, fallback; + + const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + code.movaps(result, code.Const(xword, FP::FPValue<FPT, false, 0, 2>())); + FCODE(vfnmadd231s)(result, operand1, operand2); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + if (code.HasHostFeature(HostFeature::FMA)) { + SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel(); + + const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + code.movaps(result, code.Const(xword, FP::FPValue<FPT, false, 0, 2>())); + FCODE(vfnmadd231s)(result, operand1, operand2); + FCODE(ucomis)(result, result); + code.jp(*fallback, code.T_NEAR); + code.L(*end); + + ctx.deferred_emits.emplace_back([=, &code, &ctx] { + code.L(*fallback); + + code.sub(rsp, 8); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); + code.movq(code.ABI_PARAM1, operand1); + code.movq(code.ABI_PARAM2, operand2); + code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value()); + code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); + code.CallFunction(&FP::FPRecipStepFused<FPT>); + code.movq(result, code.ABI_RETURN); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); + code.add(rsp, 8); + + code.jmp(*end, code.T_NEAR); + }); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) { + const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + code.movaps(result, code.Const(xword, FP::FPValue<FPT, false, 0, 2>())); + FCODE(muls)(operand1, operand2); + FCODE(subs)(result, operand1); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + } + + ctx.reg_alloc.HostCall(inst, args[0], args[1]); + code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value()); + code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); + code.CallFunction(&FP::FPRecipStepFused<FPT>); +} + +void EmitX64::EmitFPRecipStepFused16(EmitContext& ctx, IR::Inst* inst) { + EmitFPRecipStepFused<16>(code, ctx, inst); +} + +void EmitX64::EmitFPRecipStepFused32(EmitContext& ctx, IR::Inst* inst) { + EmitFPRecipStepFused<32>(code, ctx, inst); +} + +void EmitX64::EmitFPRecipStepFused64(EmitContext& ctx, IR::Inst* inst) { + EmitFPRecipStepFused<64>(code, ctx, inst); +} + +static void EmitFPRound(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, size_t fsize) { + const auto rounding_mode = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8()); + const bool exact = inst->GetArg(2).GetU1(); + const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode); + + if (fsize != 16 && code.HasHostFeature(HostFeature::SSE41) && round_imm && !exact) { + if (fsize == 64) { + FPTwoOp<64>(code, ctx, inst, 
[&](Xbyak::Xmm result) { + code.roundsd(result, result, *round_imm); + }); + } else { + FPTwoOp<32>(code, ctx, inst, [&](Xbyak::Xmm result) { + code.roundss(result, result, *round_imm); + }); + } + + return; + } + + using fsize_list = mp::list<mp::lift_value<size_t(16)>, + mp::lift_value<size_t(32)>, + mp::lift_value<size_t(64)>>; + using rounding_list = mp::list< + mp::lift_value<FP::RoundingMode::ToNearest_TieEven>, + mp::lift_value<FP::RoundingMode::TowardsPlusInfinity>, + mp::lift_value<FP::RoundingMode::TowardsMinusInfinity>, + mp::lift_value<FP::RoundingMode::TowardsZero>, + mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>; + using exact_list = mp::list<std::true_type, std::false_type>; + + static const auto lut = Common::GenerateLookupTableFromList( + []<typename I>(I) { + return std::pair{ + mp::lower_to_tuple_v<I>, + Common::FptrCast( + [](u64 input, FP::FPSR& fpsr, FP::FPCR fpcr) { + constexpr size_t fsize = mp::get<0, I>::value; + constexpr FP::RoundingMode rounding_mode = mp::get<1, I>::value; + constexpr bool exact = mp::get<2, I>::value; + using InputSize = mcl::unsigned_integer_of_size<fsize>; + + return FP::FPRoundInt<InputSize>(static_cast<InputSize>(input), fpcr, rounding_mode, exact, fpsr); + })}; + }, + mp::cartesian_product<fsize_list, rounding_list, exact_list>{}); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.HostCall(inst, args[0]); + code.lea(code.ABI_PARAM2, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); + code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value()); + code.CallFunction(lut.at(std::make_tuple(fsize, rounding_mode, exact))); +} + +void EmitX64::EmitFPRoundInt16(EmitContext& ctx, IR::Inst* inst) { + EmitFPRound(code, ctx, inst, 16); +} + +void EmitX64::EmitFPRoundInt32(EmitContext& ctx, IR::Inst* inst) { + EmitFPRound(code, ctx, inst, 32); +} + +void EmitX64::EmitFPRoundInt64(EmitContext& ctx, IR::Inst* inst) { + EmitFPRound(code, ctx, inst, 64); +} + +template<size_t fsize> +static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + using FPT = mcl::unsigned_integer_of_size<fsize>; + + if constexpr (fsize != 16) { + if (ctx.HasOptimization(OptimizationFlag::Unsafe_ReducedErrorFP)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + FCODE(vrsqrt14s)(result, operand, operand); + } else { + if constexpr (fsize == 32) { + code.rsqrtss(result, operand); + } else { + code.cvtsd2ss(result, operand); + code.rsqrtss(result, result); + code.cvtss2sd(result, result); + } + } + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm value = ctx.reg_alloc.ScratchXmm(); + [[maybe_unused]] const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32(); + + SharedLabel bad_values = GenSharedLabel(), end = GenSharedLabel(); + + code.movaps(value, operand); + + code.movaps(xmm0, code.Const(xword, fsize == 32 ? 0xFFFF8000 : 0xFFFF'F000'0000'0000)); + code.pand(value, xmm0); + code.por(value, code.Const(xword, fsize == 32 ? 
0x00008000 : 0x0000'1000'0000'0000)); + + // Detect NaNs, negatives, zeros, denormals and infinities + FCODE(ucomis)(value, code.Const(xword, FPT(1) << FP::FPInfo<FPT>::explicit_mantissa_width)); + code.jna(*bad_values, code.T_NEAR); + + FCODE(sqrts)(value, value); + ICODE(mov)(result, code.Const(xword, FP::FPValue<FPT, false, 0, 1>())); + FCODE(divs)(result, value); + + ICODE(padd)(result, code.Const(xword, fsize == 32 ? 0x00004000 : 0x0000'0800'0000'0000)); + code.pand(result, xmm0); + + code.L(*end); + + ctx.deferred_emits.emplace_back([=, &code, &ctx] { + Xbyak::Label fallback, default_nan; + bool needs_fallback = false; + + code.L(*bad_values); + if constexpr (fsize == 32) { + code.movd(tmp, operand); + + if (!ctx.FPCR().FZ()) { + if (ctx.FPCR().DN()) { + // a > 0x80000000 + code.cmp(tmp, 0x80000000); + code.ja(default_nan, code.T_NEAR); + } + + // a > 0 && a < 0x00800000; + code.sub(tmp, 1); + code.cmp(tmp, 0x007FFFFF); + code.jb(fallback); + needs_fallback = true; + } + + code.rsqrtss(result, operand); + + if (ctx.FPCR().DN()) { + code.ucomiss(result, result); + code.jnp(*end, code.T_NEAR); + } else { + // FZ ? (a >= 0x80800000 && a <= 0xFF800000) : (a >= 0x80000001 && a <= 0xFF800000) + // !FZ path takes into account the subtraction by one from the earlier block + code.add(tmp, ctx.FPCR().FZ() ? 0x7F800000 : 0x80000000); + code.cmp(tmp, ctx.FPCR().FZ() ? 0x7F000001 : 0x7F800000); + code.jnb(*end, code.T_NEAR); + } + + code.L(default_nan); + code.movd(result, code.Const(xword, 0x7FC00000)); + code.jmp(*end, code.T_NEAR); + } else { + Xbyak::Label nan, zero; + + code.movaps(value, operand); + DenormalsAreZero<fsize>(code, ctx, {value}); + code.pxor(result, result); + + code.ucomisd(value, result); + if (ctx.FPCR().DN()) { + code.jc(default_nan); + code.je(zero); + } else { + code.jp(nan); + code.je(zero); + code.jc(default_nan); + } + + if (!ctx.FPCR().FZ()) { + needs_fallback = true; + code.jmp(fallback); + } else { + // result = 0 + code.jmp(*end, code.T_NEAR); + } + + code.L(zero); + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpor(result, value, code.Const(xword, 0x7FF0'0000'0000'0000)); + } else { + code.movaps(result, value); + code.por(result, code.Const(xword, 0x7FF0'0000'0000'0000)); + } + code.jmp(*end, code.T_NEAR); + + code.L(nan); + if (!ctx.FPCR().DN()) { + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpor(result, operand, code.Const(xword, 0x0008'0000'0000'0000)); + } else { + code.movaps(result, operand); + code.por(result, code.Const(xword, 0x0008'0000'0000'0000)); + } + code.jmp(*end, code.T_NEAR); + } + + code.L(default_nan); + code.movq(result, code.Const(xword, 0x7FF8'0000'0000'0000)); + code.jmp(*end, code.T_NEAR); + } + + code.L(fallback); + if (needs_fallback) { + code.sub(rsp, 8); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); + code.movq(code.ABI_PARAM1, operand); + code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value()); + code.lea(code.ABI_PARAM3, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); + code.CallFunction(&FP::FPRSqrtEstimate<FPT>); + code.movq(result, rax); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); + code.add(rsp, 8); + code.jmp(*end, code.T_NEAR); + } + }); + + ctx.reg_alloc.DefineValue(inst, result); + } else { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.HostCall(inst, args[0]); + code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value()); + code.lea(code.ABI_PARAM3, code.ptr[code.r15 + 
code.GetJitStateInfo().offsetof_fpsr_exc]); + code.CallFunction(&FP::FPRSqrtEstimate<FPT>); + } +} + +void EmitX64::EmitFPRSqrtEstimate16(EmitContext& ctx, IR::Inst* inst) { + EmitFPRSqrtEstimate<16>(code, ctx, inst); +} + +void EmitX64::EmitFPRSqrtEstimate32(EmitContext& ctx, IR::Inst* inst) { + EmitFPRSqrtEstimate<32>(code, ctx, inst); +} + +void EmitX64::EmitFPRSqrtEstimate64(EmitContext& ctx, IR::Inst* inst) { + EmitFPRSqrtEstimate<64>(code, ctx, inst); +} + +template<size_t fsize> +static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + using FPT = mcl::unsigned_integer_of_size<fsize>; + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if constexpr (fsize != 16) { + if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX) && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) { + const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + code.vmovaps(result, code.Const(xword, FP::FPValue<FPT, false, 0, 3>())); + FCODE(vfnmadd231s)(result, operand1, operand2); + FCODE(vmuls)(result, result, code.Const(xword, FP::FPValue<FPT, false, -1, 1>())); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) { + SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel(); + + const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + code.vmovaps(result, code.Const(xword, FP::FPValue<FPT, false, 0, 3>())); + FCODE(vfnmadd231s)(result, operand1, operand2); + + // Detect if the intermediate result is infinity or NaN or nearly an infinity. + // Why do we need to care about infinities? This is because x86 doesn't allow us + // to fuse the divide-by-two with the rest of the FMA operation. Therefore the + // intermediate value may overflow and we would like to handle this case. + const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32(); + code.vpextrw(tmp, result, fsize == 32 ? 1 : 3); + code.and_(tmp.cvt16(), fsize == 32 ? 0x7f80 : 0x7ff0); + code.cmp(tmp.cvt16(), fsize == 32 ? 
0x7f00 : 0x7fe0);
+            ctx.reg_alloc.Release(tmp);
+
+            code.jae(*fallback, code.T_NEAR);
+
+            FCODE(vmuls)(result, result, code.Const(xword, FP::FPValue<FPT, false, -1, 1>()));
+            code.L(*end);
+
+            ctx.deferred_emits.emplace_back([=, &code, &ctx] {
+                code.L(*fallback);
+
+                code.sub(rsp, 8);
+                ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+                code.movq(code.ABI_PARAM1, operand1);
+                code.movq(code.ABI_PARAM2, operand2);
+                code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value());
+                code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+                code.CallFunction(&FP::FPRSqrtStepFused<FPT>);
+                code.movq(result, code.ABI_RETURN);
+                ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
+                code.add(rsp, 8);
+
+                code.jmp(*end, code.T_NEAR);
+            });
+
+            ctx.reg_alloc.DefineValue(inst, result);
+            return;
+        }
+
+        if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
+            const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
+            const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+            code.movaps(result, code.Const(xword, FP::FPValue<FPT, false, 0, 3>()));
+            FCODE(muls)(operand1, operand2);
+            FCODE(subs)(result, operand1);
+            FCODE(muls)(result, code.Const(xword, FP::FPValue<FPT, false, -1, 1>()));
+
+            ctx.reg_alloc.DefineValue(inst, result);
+            return;
+        }
+    }
+
+    ctx.reg_alloc.HostCall(inst, args[0], args[1]);
+    code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value());
+    code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]);
+    code.CallFunction(&FP::FPRSqrtStepFused<FPT>);
+}
+
+void EmitX64::EmitFPRSqrtStepFused16(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPRSqrtStepFused<16>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPRSqrtStepFused32(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPRSqrtStepFused<32>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPRSqrtStepFused64(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPRSqrtStepFused<64>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPSqrt32(EmitContext& ctx, IR::Inst* inst) {
+    FPTwoOp<32>(code, ctx, inst, &Xbyak::CodeGenerator::sqrtss);
+}
+
+void EmitX64::EmitFPSqrt64(EmitContext& ctx, IR::Inst* inst) {
+    FPTwoOp<64>(code, ctx, inst, &Xbyak::CodeGenerator::sqrtsd);
+}
+
+void EmitX64::EmitFPSub32(EmitContext& ctx, IR::Inst* inst) {
+    FPThreeOp<32>(code, ctx, inst, &Xbyak::CodeGenerator::subss);
+}
+
+void EmitX64::EmitFPSub64(EmitContext& ctx, IR::Inst* inst) {
+    FPThreeOp<64>(code, ctx, inst, &Xbyak::CodeGenerator::subsd);
+}
+
+static Xbyak::Reg64 SetFpscrNzcvFromFlags(BlockOfCode& code, EmitContext& ctx) {
+    ctx.reg_alloc.ScratchGpr(HostLoc::RCX);  // shifting requires use of cl
+    const Xbyak::Reg64 nzcv = ctx.reg_alloc.ScratchGpr();
+
+    //               x64 flags    ARM flags
+    //               ZF  PF  CF     NZCV
+    // Unordered      1   1   1     0011
+    // Greater than   0   0   0     0010
+    // Less than      0   0   1     1000
+    // Equal          1   0   0     0110
+    //
+    // Thus we can use ZF:CF as an index into an array like so:
+    //  x64      ARM      ARM as x64
+    // ZF:CF     NZCV     NZ-----C-------V
+    //   0       0010     0000000100000000 = 0x0100
+    //   1       1000     1000000000000000 = 0x8000
+    //   2       0110     0100000100000000 = 0x4100
+    //   3       0011     0000000100000001 = 0x0101
+
+    code.mov(nzcv, 0x0101'4100'8000'0100);
+    code.sete(cl);
+    code.rcl(cl, 5);  // cl = ZF:CF:0000
+    code.shr(nzcv, cl);
+
+    return nzcv;
+}
+
+void EmitX64::EmitFPCompare32(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm reg_a = ctx.reg_alloc.UseXmm(args[0]);
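// For reference, a worked example of the lookup performed by SetFpscrNzcvFromFlags above,
// derived from the table there: an ordered, equal compare leaves ZF=1, PF=0, CF=0, so
//     sete cl        ; cl = 0b0000'0001
//     rcl  cl, 5     ; cl = 0b0010'0000 = 32   (ZF:CF:0000, i.e. index 2 * 16)
//     shr  nzcv, cl  ; 0x0101'4100'8000'0100 >> 32, low 16 bits = 0x4100
// and 0x4100 encodes N=0 Z=1 C=1 V=0, matching the "Equal -> 0110" row of that table.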
const Xbyak::Xmm reg_b = ctx.reg_alloc.UseXmm(args[1]); + const bool exc_on_qnan = args[2].GetImmediateU1(); + + if (exc_on_qnan) { + code.comiss(reg_a, reg_b); + } else { + code.ucomiss(reg_a, reg_b); + } + + const Xbyak::Reg64 nzcv = SetFpscrNzcvFromFlags(code, ctx); + ctx.reg_alloc.DefineValue(inst, nzcv); +} + +void EmitX64::EmitFPCompare64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm reg_a = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm reg_b = ctx.reg_alloc.UseXmm(args[1]); + const bool exc_on_qnan = args[2].GetImmediateU1(); + + if (exc_on_qnan) { + code.comisd(reg_a, reg_b); + } else { + code.ucomisd(reg_a, reg_b); + } + + const Xbyak::Reg64 nzcv = SetFpscrNzcvFromFlags(code, ctx); + ctx.reg_alloc.DefineValue(inst, nzcv); +} + +void EmitX64::EmitFPHalfToDouble(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8()); + + if (code.HasHostFeature(HostFeature::F16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) { + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm value = ctx.reg_alloc.UseXmm(args[0]); + + // Double-conversion here is acceptable as this is expanding precision. + code.vcvtph2ps(result, value); + code.vcvtps2pd(result, result); + if (ctx.FPCR().DN()) { + ForceToDefaultNaN<64>(code, result); + } + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + ctx.reg_alloc.HostCall(inst, args[0]); + code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value()); + code.mov(code.ABI_PARAM3.cvt32(), static_cast<u32>(rounding_mode)); + code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); + code.CallFunction(&FP::FPConvert<u64, u16>); +} + +void EmitX64::EmitFPHalfToSingle(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8()); + + if (code.HasHostFeature(HostFeature::F16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) { + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm value = ctx.reg_alloc.UseXmm(args[0]); + + code.vcvtph2ps(result, value); + if (ctx.FPCR().DN()) { + ForceToDefaultNaN<32>(code, result); + } + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + ctx.reg_alloc.HostCall(inst, args[0]); + code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value()); + code.mov(code.ABI_PARAM3.cvt32(), static_cast<u32>(rounding_mode)); + code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); + code.CallFunction(&FP::FPConvert<u32, u16>); +} + +void EmitX64::EmitFPSingleToDouble(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8()); + + // We special-case the non-IEEE-defined ToOdd rounding mode. 
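// (ToOdd has no MXCSR equivalent — x64 only offers nearest/down/up/zero — so it, like any
//  rounding mode other than the current FPCR mode, falls through to the FP::FPConvert call
//  below instead of using cvtss2sd.)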
+ if (rounding_mode == ctx.FPCR().RMode() && rounding_mode != FP::RoundingMode::ToOdd) { + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + + code.cvtss2sd(result, result); + if (ctx.FPCR().DN()) { + ForceToDefaultNaN<64>(code, result); + } + ctx.reg_alloc.DefineValue(inst, result); + } else { + ctx.reg_alloc.HostCall(inst, args[0]); + code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value()); + code.mov(code.ABI_PARAM3.cvt32(), static_cast<u32>(rounding_mode)); + code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); + code.CallFunction(&FP::FPConvert<u64, u32>); + } +} + +void EmitX64::EmitFPSingleToHalf(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8()); + const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode); + + if (code.HasHostFeature(HostFeature::F16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) { + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + + if (ctx.FPCR().DN()) { + ForceToDefaultNaN<32>(code, result); + } + code.vcvtps2ph(result, result, static_cast<u8>(*round_imm)); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + ctx.reg_alloc.HostCall(inst, args[0]); + code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value()); + code.mov(code.ABI_PARAM3.cvt32(), static_cast<u32>(rounding_mode)); + code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); + code.CallFunction(&FP::FPConvert<u16, u32>); +} + +void EmitX64::EmitFPDoubleToHalf(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8()); + + // NOTE: Do not double-convert here as that is inaccurate. + // To be accurate, the first conversion would need to be "round-to-odd", which x64 doesn't support. + + ctx.reg_alloc.HostCall(inst, args[0]); + code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value()); + code.mov(code.ABI_PARAM3.cvt32(), static_cast<u32>(rounding_mode)); + code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); + code.CallFunction(&FP::FPConvert<u16, u64>); +} + +void EmitX64::EmitFPDoubleToSingle(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto rounding_mode = static_cast<FP::RoundingMode>(args[1].GetImmediateU8()); + + // We special-case the non-IEEE-defined ToOdd rounding mode. 
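// (ToOdd is the round-to-odd mode used for narrowing without double rounding, e.g. by
//  A64's FCVTXN; since x64 cannot express it in MXCSR, it always takes the FP::FPConvert
//  path below.)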
+ if (rounding_mode == ctx.FPCR().RMode() && rounding_mode != FP::RoundingMode::ToOdd) { + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + + code.cvtsd2ss(result, result); + if (ctx.FPCR().DN()) { + ForceToDefaultNaN<32>(code, result); + } + ctx.reg_alloc.DefineValue(inst, result); + } else { + ctx.reg_alloc.HostCall(inst, args[0]); + code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value()); + code.mov(code.ABI_PARAM3.cvt32(), static_cast<u32>(rounding_mode)); + code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); + code.CallFunction(&FP::FPConvert<u32, u64>); + } +} + +template<size_t fsize, bool unsigned_, size_t isize> +static void EmitFPToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const size_t fbits = args[1].GetImmediateU8(); + const auto rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); + + if constexpr (fsize != 16) { + const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode); + + // cvttsd2si truncates during operation so rounding (and thus SSE4.1) not required + const bool truncating = rounding_mode == FP::RoundingMode::TowardsZero; + + if (round_imm && (truncating || code.HasHostFeature(HostFeature::SSE41))) { + const Xbyak::Xmm src = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr().cvt64(); + + if constexpr (fsize == 64) { + if (fbits != 0) { + const u64 scale_factor = static_cast<u64>((fbits + 1023) << 52); + code.mulsd(src, code.Const(xword, scale_factor)); + } + + if (!truncating) { + code.roundsd(src, src, *round_imm); + } + } else { + if (fbits != 0) { + const u32 scale_factor = static_cast<u32>((fbits + 127) << 23); + code.mulss(src, code.Const(xword, scale_factor)); + } + + if (!truncating) { + code.roundss(src, src, *round_imm); + } + + code.cvtss2sd(src, src); + } + + if constexpr (isize == 64) { + const Xbyak::Xmm scratch = ctx.reg_alloc.ScratchXmm(); + + if (!unsigned_) { + SharedLabel saturate_max = GenSharedLabel(), end = GenSharedLabel(); + + ZeroIfNaN<64>(code, src, scratch); + + code.movsd(scratch, code.Const(xword, f64_max_s64_lim)); + code.comisd(scratch, src); + code.jna(*saturate_max, code.T_NEAR); + code.cvttsd2si(result, src); // 64 bit gpr + code.L(*end); + + ctx.deferred_emits.emplace_back([=, &code] { + code.L(*saturate_max); + code.mov(result, 0x7FFF'FFFF'FFFF'FFFF); + code.jmp(*end, code.T_NEAR); + }); + } else { + Xbyak::Label below_max; + + const Xbyak::Reg64 result2 = ctx.reg_alloc.ScratchGpr().cvt64(); + + code.pxor(xmm0, xmm0); + + code.movaps(scratch, src); + code.subsd(scratch, code.Const(xword, f64_max_s64_lim)); + + // these both result in zero if src/scratch are NaN + code.maxsd(src, xmm0); + code.maxsd(scratch, xmm0); + + code.cvttsd2si(result, src); + code.cvttsd2si(result2, scratch); + code.or_(result, result2); + + // when src < 2^63, result2 == 0, and result contains the final result + // when src >= 2^63, result contains 0x800.... 
and result2 contains the non-MSB bits + // MSB if result2 is 1 when src >= 2^64 + + code.sar(result2, 63); + code.or_(result, result2); + } + } else if constexpr (isize == 32) { + if (!unsigned_) { + const Xbyak::Xmm scratch = ctx.reg_alloc.ScratchXmm(); + + ZeroIfNaN<64>(code, src, scratch); + code.minsd(src, code.Const(xword, f64_max_s32)); + // maxsd not required as cvttsd2si results in 0x8000'0000 when out of range + code.cvttsd2si(result.cvt32(), src); // 32 bit gpr + } else { + code.pxor(xmm0, xmm0); + code.maxsd(src, xmm0); // results in a zero if src is NaN + code.minsd(src, code.Const(xword, f64_max_u32)); + code.cvttsd2si(result, src); // 64 bit gpr + } + } else { + const Xbyak::Xmm scratch = ctx.reg_alloc.ScratchXmm(); + + ZeroIfNaN<64>(code, src, scratch); + code.maxsd(src, code.Const(xword, unsigned_ ? f64_min_u16 : f64_min_s16)); + code.minsd(src, code.Const(xword, unsigned_ ? f64_max_u16 : f64_max_s16)); + code.cvttsd2si(result, src); // 64 bit gpr + } + + ctx.reg_alloc.DefineValue(inst, result); + + return; + } + } + + using fbits_list = mp::lift_sequence<std::make_index_sequence<isize + 1>>; + using rounding_list = mp::list< + mp::lift_value<FP::RoundingMode::ToNearest_TieEven>, + mp::lift_value<FP::RoundingMode::TowardsPlusInfinity>, + mp::lift_value<FP::RoundingMode::TowardsMinusInfinity>, + mp::lift_value<FP::RoundingMode::TowardsZero>, + mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>; + + static const auto lut = Common::GenerateLookupTableFromList( + []<typename I>(I) { + return std::pair{ + mp::lower_to_tuple_v<I>, + Common::FptrCast( + [](u64 input, FP::FPSR& fpsr, FP::FPCR fpcr) { + constexpr size_t fbits = mp::get<0, I>::value; + constexpr FP::RoundingMode rounding_mode = mp::get<1, I>::value; + using FPT = mcl::unsigned_integer_of_size<fsize>; + + return FP::FPToFixed<FPT>(isize, static_cast<FPT>(input), fbits, unsigned_, fpcr, rounding_mode, fpsr); + })}; + }, + mp::cartesian_product<fbits_list, rounding_list>{}); + + ctx.reg_alloc.HostCall(inst, args[0]); + code.lea(code.ABI_PARAM2, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); + code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value()); + code.CallFunction(lut.at(std::make_tuple(fbits, rounding_mode))); +} + +void EmitX64::EmitFPDoubleToFixedS16(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<64, false, 16>(code, ctx, inst); +} + +void EmitX64::EmitFPDoubleToFixedS32(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<64, false, 32>(code, ctx, inst); +} + +void EmitX64::EmitFPDoubleToFixedS64(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<64, false, 64>(code, ctx, inst); +} + +void EmitX64::EmitFPDoubleToFixedU16(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<64, true, 16>(code, ctx, inst); +} + +void EmitX64::EmitFPDoubleToFixedU32(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<64, true, 32>(code, ctx, inst); +} + +void EmitX64::EmitFPDoubleToFixedU64(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<64, true, 64>(code, ctx, inst); +} + +void EmitX64::EmitFPHalfToFixedS16(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<16, false, 16>(code, ctx, inst); +} + +void EmitX64::EmitFPHalfToFixedS32(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<16, false, 32>(code, ctx, inst); +} + +void EmitX64::EmitFPHalfToFixedS64(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<16, false, 64>(code, ctx, inst); +} + +void EmitX64::EmitFPHalfToFixedU16(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<16, true, 16>(code, ctx, inst); +} + +void 
EmitX64::EmitFPHalfToFixedU32(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<16, true, 32>(code, ctx, inst); +} + +void EmitX64::EmitFPHalfToFixedU64(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<16, true, 64>(code, ctx, inst); +} + +void EmitX64::EmitFPSingleToFixedS16(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<32, false, 16>(code, ctx, inst); +} + +void EmitX64::EmitFPSingleToFixedS32(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<32, false, 32>(code, ctx, inst); +} + +void EmitX64::EmitFPSingleToFixedS64(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<32, false, 64>(code, ctx, inst); +} + +void EmitX64::EmitFPSingleToFixedU16(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<32, true, 16>(code, ctx, inst); +} + +void EmitX64::EmitFPSingleToFixedU32(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<32, true, 32>(code, ctx, inst); +} + +void EmitX64::EmitFPSingleToFixedU64(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<32, true, 64>(code, ctx, inst); +} + +void EmitX64::EmitFPFixedS16ToSingle(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg16 from = ctx.reg_alloc.UseGpr(args[0]).cvt16(); + const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const size_t fbits = args[1].GetImmediateU8(); + [[maybe_unused]] const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); // Not required + + code.movsx(tmp, from); + code.cvtsi2ss(result, tmp); + + if (fbits != 0) { + const u32 scale_factor = static_cast<u32>((127 - fbits) << 23); + code.mulss(result, code.Const(xword, scale_factor)); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitFPFixedU16ToSingle(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg16 from = ctx.reg_alloc.UseGpr(args[0]).cvt16(); + const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const size_t fbits = args[1].GetImmediateU8(); + [[maybe_unused]] const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); // Not required + + code.movzx(tmp, from); + code.cvtsi2ss(result, tmp); + + if (fbits != 0) { + const u32 scale_factor = static_cast<u32>((127 - fbits) << 23); + code.mulss(result, code.Const(xword, scale_factor)); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitFPFixedS32ToSingle(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg32 from = ctx.reg_alloc.UseGpr(args[0]).cvt32(); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const size_t fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); + + if (rounding_mode == ctx.FPCR().RMode() || ctx.HasOptimization(OptimizationFlag::Unsafe_IgnoreStandardFPCRValue)) { + code.cvtsi2ss(result, from); + } else { + ASSERT(rounding_mode == FP::RoundingMode::ToNearest_TieEven); + code.EnterStandardASIMD(); + code.cvtsi2ss(result, from); + code.LeaveStandardASIMD(); + } + + if (fbits != 0) { + const u32 scale_factor = static_cast<u32>((127 - fbits) << 23); + code.mulss(result, code.Const(xword, scale_factor)); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitFPFixedU32ToSingle(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const 
Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const size_t fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); + + const auto op = [&] { + if (code.HasHostFeature(HostFeature::AVX512F)) { + const Xbyak::Reg64 from = ctx.reg_alloc.UseGpr(args[0]); + code.vcvtusi2ss(result, result, from.cvt32()); + } else { + // We are using a 64-bit GPR register to ensure we don't end up treating the input as signed + const Xbyak::Reg64 from = ctx.reg_alloc.UseScratchGpr(args[0]); + code.mov(from.cvt32(), from.cvt32()); // TODO: Verify if this is necessary + code.cvtsi2ss(result, from); + } + }; + + if (rounding_mode == ctx.FPCR().RMode() || ctx.HasOptimization(OptimizationFlag::Unsafe_IgnoreStandardFPCRValue)) { + op(); + } else { + ASSERT(rounding_mode == FP::RoundingMode::ToNearest_TieEven); + code.EnterStandardASIMD(); + op(); + code.LeaveStandardASIMD(); + } + + if (fbits != 0) { + const u32 scale_factor = static_cast<u32>((127 - fbits) << 23); + code.mulss(result, code.Const(xword, scale_factor)); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitFPFixedS16ToDouble(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg16 from = ctx.reg_alloc.UseGpr(args[0]).cvt16(); + const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const size_t fbits = args[1].GetImmediateU8(); + [[maybe_unused]] const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); // Not required + + code.movsx(tmp, from); + code.cvtsi2sd(result, tmp); + + if (fbits != 0) { + const u64 scale_factor = static_cast<u64>((1023 - fbits) << 52); + code.mulsd(result, code.Const(xword, scale_factor)); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitFPFixedU16ToDouble(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg16 from = ctx.reg_alloc.UseGpr(args[0]).cvt16(); + const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const size_t fbits = args[1].GetImmediateU8(); + [[maybe_unused]] const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); // Not required + + code.movzx(tmp, from); + code.cvtsi2sd(result, tmp); + + if (fbits != 0) { + const u64 scale_factor = static_cast<u64>((1023 - fbits) << 52); + code.mulsd(result, code.Const(xword, scale_factor)); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitFPFixedS32ToDouble(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg32 from = ctx.reg_alloc.UseGpr(args[0]).cvt32(); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const size_t fbits = args[1].GetImmediateU8(); + [[maybe_unused]] const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); // Not required + + code.cvtsi2sd(result, from); + + if (fbits != 0) { + const u64 scale_factor = static_cast<u64>((1023 - fbits) << 52); + code.mulsd(result, code.Const(xword, scale_factor)); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitFPFixedU32ToDouble(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm to = ctx.reg_alloc.ScratchXmm(); + const size_t fbits = args[1].GetImmediateU8(); + [[maybe_unused]] const 
FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); // Not required + + code.xorps(to, to); + + if (code.HasHostFeature(HostFeature::AVX512F)) { + const Xbyak::Reg64 from = ctx.reg_alloc.UseGpr(args[0]); + code.vcvtusi2sd(to, to, from.cvt32()); + } else { + // We are using a 64-bit GPR register to ensure we don't end up treating the input as signed + const Xbyak::Reg64 from = ctx.reg_alloc.UseScratchGpr(args[0]); + code.mov(from.cvt32(), from.cvt32()); // TODO: Verify if this is necessary + code.cvtsi2sd(to, from); + } + + if (fbits != 0) { + const u64 scale_factor = static_cast<u64>((1023 - fbits) << 52); + code.mulsd(to, code.Const(xword, scale_factor)); + } + + ctx.reg_alloc.DefineValue(inst, to); +} + +void EmitX64::EmitFPFixedS64ToDouble(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg64 from = ctx.reg_alloc.UseGpr(args[0]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const size_t fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); + ASSERT(rounding_mode == ctx.FPCR().RMode()); + + code.cvtsi2sd(result, from); + + if (fbits != 0) { + const u64 scale_factor = static_cast<u64>((1023 - fbits) << 52); + code.mulsd(result, code.Const(xword, scale_factor)); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitFPFixedS64ToSingle(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg64 from = ctx.reg_alloc.UseGpr(args[0]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const size_t fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); + ASSERT(rounding_mode == ctx.FPCR().RMode()); + + code.cvtsi2ss(result, from); + + if (fbits != 0) { + const u32 scale_factor = static_cast<u32>((127 - fbits) << 23); + code.mulss(result, code.Const(xword, scale_factor)); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitFPFixedU64ToDouble(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg64 from = ctx.reg_alloc.UseGpr(args[0]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const size_t fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); + ASSERT(rounding_mode == ctx.FPCR().RMode()); + + if (code.HasHostFeature(HostFeature::AVX512F)) { + code.vcvtusi2sd(result, result, from); + } else { + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.movq(tmp, from); + code.punpckldq(tmp, code.Const(xword, 0x4530000043300000, 0)); + code.subpd(tmp, code.Const(xword, 0x4330000000000000, 0x4530000000000000)); + code.pshufd(result, tmp, 0b01001110); + code.addpd(result, tmp); + if (ctx.FPCR().RMode() == FP::RoundingMode::TowardsMinusInfinity) { + code.pand(result, code.Const(xword, f64_non_sign_mask)); + } + } + + if (fbits != 0) { + const u64 scale_factor = static_cast<u64>((1023 - fbits) << 52); + code.mulsd(result, code.Const(xword, scale_factor)); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitFPFixedU64ToSingle(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const size_t fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = 
static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); + ASSERT(rounding_mode == ctx.FPCR().RMode()); + + if (code.HasHostFeature(HostFeature::AVX512F)) { + const Xbyak::Reg64 from = ctx.reg_alloc.UseGpr(args[0]); + code.vcvtusi2ss(result, result, from); + } else { + const Xbyak::Reg64 from = ctx.reg_alloc.UseScratchGpr(args[0]); + code.pxor(result, result); + + Xbyak::Label negative; + Xbyak::Label end; + + code.test(from, from); + code.js(negative); + + code.cvtsi2ss(result, from); + code.jmp(end); + + code.L(negative); + const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); + code.mov(tmp, from); + code.shr(tmp, 1); + code.and_(from.cvt32(), 1); + code.or_(from, tmp); + code.cvtsi2ss(result, from); + code.addss(result, result); + + code.L(end); + } + + if (fbits != 0) { + const u32 scale_factor = static_cast<u32>((127 - fbits) << 23); + code.mulss(result, code.Const(xword, scale_factor)); + } + + ctx.reg_alloc.DefineValue(inst, result); +} +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc new file mode 100644 index 0000000000..272b896ae3 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc @@ -0,0 +1,545 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/macro/concatenate_tokens.hpp> + +#define AxxEmitX64 CONCATENATE_TOKENS(Axx, EmitX64) +#define AxxEmitContext CONCATENATE_TOKENS(Axx, EmitContext) +#define AxxJitState CONCATENATE_TOKENS(Axx, JitState) +#define AxxUserConfig Axx::UserConfig + +namespace { +using Vector = std::array<u64, 2>; +} + +std::optional<AxxEmitX64::DoNotFastmemMarker> AxxEmitX64::ShouldFastmem(AxxEmitContext& ctx, IR::Inst* inst) const { + if (!conf.fastmem_pointer || !exception_handler.SupportsFastmem()) { + return std::nullopt; + } + + const auto marker = std::make_tuple(ctx.Location(), inst->GetName()); + if (do_not_fastmem.count(marker) > 0) { + return std::nullopt; + } + return marker; +} + +FakeCall AxxEmitX64::FastmemCallback(u64 rip_) { + const auto iter = fastmem_patch_info.find(rip_); + + if (iter == fastmem_patch_info.end()) { + fmt::print("dynarmic: Segfault happened within JITted code at rip = {:016x}\n", rip_); + fmt::print("Segfault wasn't at a fastmem patch location!\n"); + fmt::print("Now dumping code.......\n\n"); + Common::DumpDisassembledX64((void*)(rip_ & ~u64(0xFFF)), 0x1000); + ASSERT_FALSE("iter != fastmem_patch_info.end()"); + } + + FakeCall result{ + .call_rip = iter->second.callback, + .ret_rip = iter->second.resume_rip, + }; + + if (iter->second.recompile) { + const auto marker = iter->second.marker; + do_not_fastmem.insert(marker); + InvalidateBasicBlocks({std::get<0>(marker)}); + } + + return result; +} + +template<std::size_t bitsize, auto callback> +void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool ordered = IsOrdered(args[2].GetImmediateAccType()); + const auto fastmem_marker = ShouldFastmem(ctx, inst); + + if (!conf.page_table && !fastmem_marker) { + // Neither fastmem nor page table: Use callbacks + if constexpr (bitsize == 128) { + ctx.reg_alloc.HostCall(nullptr, {}, args[1]); + if (ordered) { + code.mfence(); + } + code.CallFunction(memory_read_128); + ctx.reg_alloc.DefineValue(inst, xmm1); + } else { + ctx.reg_alloc.HostCall(inst, {}, args[1]); + if (ordered) { + code.mfence(); + } + 
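// (Ordered accesses are handled conservatively with a full mfence: it is emitted before
//  the read callback here, and after the write callback in EmitMemoryWrite below.)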
Devirtualize<callback>(conf.callbacks).EmitCall(code); + code.ZeroExtendFrom(bitsize, code.ABI_RETURN); + } + EmitCheckMemoryAbort(ctx, inst); + return; + } + + if (ordered && bitsize == 128) { + // Required for atomic 128-bit loads/stores + ctx.reg_alloc.ScratchGpr(HostLoc::RAX); + ctx.reg_alloc.ScratchGpr(HostLoc::RBX); + ctx.reg_alloc.ScratchGpr(HostLoc::RCX); + ctx.reg_alloc.ScratchGpr(HostLoc::RDX); + } + + const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[1]); + const int value_idx = bitsize == 128 ? ctx.reg_alloc.ScratchXmm().getIdx() : ctx.reg_alloc.ScratchGpr().getIdx(); + + const auto wrapped_fn = read_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)]; + + SharedLabel abort = GenSharedLabel(), end = GenSharedLabel(); + + if (fastmem_marker) { + // Use fastmem + bool require_abort_handling; + const auto src_ptr = EmitFastmemVAddr(code, ctx, *abort, vaddr, require_abort_handling); + + const auto location = EmitReadMemoryMov<bitsize>(code, value_idx, src_ptr, ordered); + + ctx.deferred_emits.emplace_back([=, this, &ctx] { + code.L(*abort); + code.call(wrapped_fn); + + fastmem_patch_info.emplace( + mcl::bit_cast<u64>(location), + FastmemPatchInfo{ + mcl::bit_cast<u64>(code.getCurr()), + mcl::bit_cast<u64>(wrapped_fn), + *fastmem_marker, + conf.recompile_on_fastmem_failure, + }); + + EmitCheckMemoryAbort(ctx, inst, end.get()); + code.jmp(*end, code.T_NEAR); + }); + } else { + // Use page table + ASSERT(conf.page_table); + const auto src_ptr = EmitVAddrLookup(code, ctx, bitsize, *abort, vaddr); + EmitReadMemoryMov<bitsize>(code, value_idx, src_ptr, ordered); + + ctx.deferred_emits.emplace_back([=, this, &ctx] { + code.L(*abort); + code.call(wrapped_fn); + EmitCheckMemoryAbort(ctx, inst, end.get()); + code.jmp(*end, code.T_NEAR); + }); + } + code.L(*end); + + if constexpr (bitsize == 128) { + ctx.reg_alloc.DefineValue(inst, Xbyak::Xmm{value_idx}); + } else { + ctx.reg_alloc.DefineValue(inst, Xbyak::Reg64{value_idx}); + } +} + +template<std::size_t bitsize, auto callback> +void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool ordered = IsOrdered(args[3].GetImmediateAccType()); + const auto fastmem_marker = ShouldFastmem(ctx, inst); + + if (!conf.page_table && !fastmem_marker) { + // Neither fastmem nor page table: Use callbacks + if constexpr (bitsize == 128) { + ctx.reg_alloc.Use(args[1], ABI_PARAM2); + ctx.reg_alloc.Use(args[2], HostLoc::XMM1); + ctx.reg_alloc.EndOfAllocScope(); + ctx.reg_alloc.HostCall(nullptr); + code.CallFunction(memory_write_128); + } else { + ctx.reg_alloc.HostCall(nullptr, {}, args[1], args[2]); + Devirtualize<callback>(conf.callbacks).EmitCall(code); + } + if (ordered) { + code.mfence(); + } + EmitCheckMemoryAbort(ctx, inst); + return; + } + + if (ordered && bitsize == 128) { + // Required for atomic 128-bit loads/stores + ctx.reg_alloc.ScratchGpr(HostLoc::RAX); + ctx.reg_alloc.ScratchGpr(HostLoc::RBX); + ctx.reg_alloc.ScratchGpr(HostLoc::RCX); + ctx.reg_alloc.ScratchGpr(HostLoc::RDX); + } + + const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[1]); + const int value_idx = bitsize == 128 + ? ctx.reg_alloc.UseXmm(args[2]).getIdx() + : (ordered ? 
ctx.reg_alloc.UseScratchGpr(args[2]).getIdx() : ctx.reg_alloc.UseGpr(args[2]).getIdx()); + + const auto wrapped_fn = write_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)]; + + SharedLabel abort = GenSharedLabel(), end = GenSharedLabel(); + + if (fastmem_marker) { + // Use fastmem + bool require_abort_handling; + const auto dest_ptr = EmitFastmemVAddr(code, ctx, *abort, vaddr, require_abort_handling); + + const auto location = EmitWriteMemoryMov<bitsize>(code, dest_ptr, value_idx, ordered); + + ctx.deferred_emits.emplace_back([=, this, &ctx] { + code.L(*abort); + code.call(wrapped_fn); + + fastmem_patch_info.emplace( + mcl::bit_cast<u64>(location), + FastmemPatchInfo{ + mcl::bit_cast<u64>(code.getCurr()), + mcl::bit_cast<u64>(wrapped_fn), + *fastmem_marker, + conf.recompile_on_fastmem_failure, + }); + + EmitCheckMemoryAbort(ctx, inst, end.get()); + code.jmp(*end, code.T_NEAR); + }); + } else { + // Use page table + ASSERT(conf.page_table); + const auto dest_ptr = EmitVAddrLookup(code, ctx, bitsize, *abort, vaddr); + EmitWriteMemoryMov<bitsize>(code, dest_ptr, value_idx, ordered); + + ctx.deferred_emits.emplace_back([=, this, &ctx] { + code.L(*abort); + code.call(wrapped_fn); + EmitCheckMemoryAbort(ctx, inst, end.get()); + code.jmp(*end, code.T_NEAR); + }); + } + code.L(*end); +} + +template<std::size_t bitsize, auto callback> +void AxxEmitX64::EmitExclusiveReadMemory(AxxEmitContext& ctx, IR::Inst* inst) { + ASSERT(conf.global_monitor != nullptr); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool ordered = IsOrdered(args[2].GetImmediateAccType()); + + if constexpr (bitsize != 128) { + using T = mcl::unsigned_integer_of_size<bitsize>; + + ctx.reg_alloc.HostCall(inst, {}, args[1]); + + code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(1)); + code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(&conf)); + if (ordered) { + code.mfence(); + } + code.CallLambda( + [](AxxUserConfig& conf, Axx::VAddr vaddr) -> T { + return conf.global_monitor->ReadAndMark<T>(conf.processor_id, vaddr, [&]() -> T { + return (conf.callbacks->*callback)(vaddr); + }); + }); + code.ZeroExtendFrom(bitsize, code.ABI_RETURN); + } else { + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + ctx.reg_alloc.Use(args[1], ABI_PARAM2); + ctx.reg_alloc.EndOfAllocScope(); + ctx.reg_alloc.HostCall(nullptr); + + code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(1)); + code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(&conf)); + ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE); + code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]); + if (ordered) { + code.mfence(); + } + code.CallLambda( + [](AxxUserConfig& conf, Axx::VAddr vaddr, Vector& ret) { + ret = conf.global_monitor->ReadAndMark<Vector>(conf.processor_id, vaddr, [&]() -> Vector { + return (conf.callbacks->*callback)(vaddr); + }); + }); + code.movups(result, xword[rsp + ABI_SHADOW_SPACE]); + ctx.reg_alloc.ReleaseStackSpace(16 + ABI_SHADOW_SPACE); + + ctx.reg_alloc.DefineValue(inst, result); + } + + EmitCheckMemoryAbort(ctx, inst); +} + +template<std::size_t bitsize, auto callback> +void AxxEmitX64::EmitExclusiveWriteMemory(AxxEmitContext& ctx, IR::Inst* inst) { + ASSERT(conf.global_monitor != nullptr); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool ordered = IsOrdered(args[3].GetImmediateAccType()); + + if constexpr (bitsize != 128) { + ctx.reg_alloc.HostCall(inst, {}, args[1], args[2]); + } else { + ctx.reg_alloc.Use(args[1], ABI_PARAM2); + ctx.reg_alloc.Use(args[2], HostLoc::XMM1); + 
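// (The 128-bit value is pinned to XMM1 here so that the lambda call below can spill it
//  with movaps into the stack slot above the shadow space and pass it as a Vector&.)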
ctx.reg_alloc.EndOfAllocScope(); + ctx.reg_alloc.HostCall(inst); + } + + Xbyak::Label end; + + code.mov(code.ABI_RETURN, u32(1)); + code.cmp(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(0)); + code.je(end); + code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(0)); + code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(&conf)); + if constexpr (bitsize != 128) { + using T = mcl::unsigned_integer_of_size<bitsize>; + + code.CallLambda( + [](AxxUserConfig& conf, Axx::VAddr vaddr, T value) -> u32 { + return conf.global_monitor->DoExclusiveOperation<T>(conf.processor_id, vaddr, + [&](T expected) -> bool { + return (conf.callbacks->*callback)(vaddr, value, expected); + }) + ? 0 + : 1; + }); + if (ordered) { + code.mfence(); + } + } else { + ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE); + code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]); + code.movaps(xword[code.ABI_PARAM3], xmm1); + code.CallLambda( + [](AxxUserConfig& conf, Axx::VAddr vaddr, Vector& value) -> u32 { + return conf.global_monitor->DoExclusiveOperation<Vector>(conf.processor_id, vaddr, + [&](Vector expected) -> bool { + return (conf.callbacks->*callback)(vaddr, value, expected); + }) + ? 0 + : 1; + }); + if (ordered) { + code.mfence(); + } + ctx.reg_alloc.ReleaseStackSpace(16 + ABI_SHADOW_SPACE); + } + code.L(end); + + EmitCheckMemoryAbort(ctx, inst); +} + +template<std::size_t bitsize, auto callback> +void AxxEmitX64::EmitExclusiveReadMemoryInline(AxxEmitContext& ctx, IR::Inst* inst) { + ASSERT(conf.global_monitor && conf.fastmem_pointer); + if (!exception_handler.SupportsFastmem()) { + EmitExclusiveReadMemory<bitsize, callback>(ctx, inst); + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + constexpr bool ordered = true; + + if constexpr (ordered && bitsize == 128) { + // Required for atomic 128-bit loads/stores + ctx.reg_alloc.ScratchGpr(HostLoc::RAX); + ctx.reg_alloc.ScratchGpr(HostLoc::RBX); + ctx.reg_alloc.ScratchGpr(HostLoc::RCX); + ctx.reg_alloc.ScratchGpr(HostLoc::RDX); + } + + const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[1]); + const int value_idx = bitsize == 128 ? 
ctx.reg_alloc.ScratchXmm().getIdx() : ctx.reg_alloc.ScratchGpr().getIdx(); + const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 tmp2 = ctx.reg_alloc.ScratchGpr(); + + const auto wrapped_fn = read_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)]; + + EmitExclusiveLock(code, conf, tmp, tmp2.cvt32()); + + code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(1)); + code.mov(tmp, mcl::bit_cast<u64>(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id))); + code.mov(qword[tmp], vaddr); + + const auto fastmem_marker = ShouldFastmem(ctx, inst); + if (fastmem_marker) { + SharedLabel abort = GenSharedLabel(), end = GenSharedLabel(); + bool require_abort_handling = false; + + const auto src_ptr = EmitFastmemVAddr(code, ctx, *abort, vaddr, require_abort_handling); + + const auto location = EmitReadMemoryMov<bitsize>(code, value_idx, src_ptr, ordered); + + fastmem_patch_info.emplace( + mcl::bit_cast<u64>(location), + FastmemPatchInfo{ + mcl::bit_cast<u64>(code.getCurr()), + mcl::bit_cast<u64>(wrapped_fn), + *fastmem_marker, + conf.recompile_on_exclusive_fastmem_failure, + }); + + code.L(*end); + + if (require_abort_handling) { + ctx.deferred_emits.emplace_back([=, this] { + code.L(*abort); + code.call(wrapped_fn); + code.jmp(*end, code.T_NEAR); + }); + } + } else { + code.call(wrapped_fn); + } + + code.mov(tmp, mcl::bit_cast<u64>(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id))); + EmitWriteMemoryMov<bitsize>(code, tmp, value_idx, false); + + EmitExclusiveUnlock(code, conf, tmp, tmp2.cvt32()); + + if constexpr (bitsize == 128) { + ctx.reg_alloc.DefineValue(inst, Xbyak::Xmm{value_idx}); + } else { + ctx.reg_alloc.DefineValue(inst, Xbyak::Reg64{value_idx}); + } + + EmitCheckMemoryAbort(ctx, inst); +} + +template<std::size_t bitsize, auto callback> +void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* inst) { + ASSERT(conf.global_monitor && conf.fastmem_pointer); + if (!exception_handler.SupportsFastmem()) { + EmitExclusiveWriteMemory<bitsize, callback>(ctx, inst); + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + constexpr bool ordered = true; + + const auto value = [&] { + if constexpr (bitsize == 128) { + ctx.reg_alloc.ScratchGpr(HostLoc::RAX); + ctx.reg_alloc.ScratchGpr(HostLoc::RBX); + ctx.reg_alloc.ScratchGpr(HostLoc::RCX); + ctx.reg_alloc.ScratchGpr(HostLoc::RDX); + return ctx.reg_alloc.UseXmm(args[2]); + } else { + ctx.reg_alloc.ScratchGpr(HostLoc::RAX); + return ctx.reg_alloc.UseGpr(args[2]); + } + }(); + const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[1]); + const Xbyak::Reg32 status = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); + + const auto wrapped_fn = exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value.getIdx())]; + + EmitExclusiveLock(code, conf, tmp, eax); + + SharedLabel end = GenSharedLabel(); + + code.mov(tmp, mcl::bit_cast<u64>(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id))); + code.mov(status, u32(1)); + code.cmp(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(0)); + code.je(*end, code.T_NEAR); + code.cmp(qword[tmp], vaddr); + code.jne(*end, code.T_NEAR); + + EmitExclusiveTestAndClear(code, conf, vaddr, tmp, rax); + + code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(0)); + code.mov(tmp, mcl::bit_cast<u64>(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id))); + + if constexpr 
(bitsize == 128) { + code.mov(rax, qword[tmp + 0]); + code.mov(rdx, qword[tmp + 8]); + if (code.HasHostFeature(HostFeature::SSE41)) { + code.movq(rbx, value); + code.pextrq(rcx, value, 1); + } else { + code.movaps(xmm0, value); + code.movq(rbx, xmm0); + code.punpckhqdq(xmm0, xmm0); + code.movq(rcx, xmm0); + } + } else { + EmitReadMemoryMov<bitsize>(code, rax.getIdx(), tmp, false); + } + + const auto fastmem_marker = ShouldFastmem(ctx, inst); + if (fastmem_marker) { + SharedLabel abort = GenSharedLabel(); + bool require_abort_handling = false; + + const auto dest_ptr = EmitFastmemVAddr(code, ctx, *abort, vaddr, require_abort_handling, tmp); + + const auto location = code.getCurr(); + + if constexpr (bitsize == 128) { + code.lock(); + code.cmpxchg16b(ptr[dest_ptr]); + } else { + switch (bitsize) { + case 8: + code.lock(); + code.cmpxchg(code.byte[dest_ptr], value.cvt8()); + break; + case 16: + code.lock(); + code.cmpxchg(word[dest_ptr], value.cvt16()); + break; + case 32: + code.lock(); + code.cmpxchg(dword[dest_ptr], value.cvt32()); + break; + case 64: + code.lock(); + code.cmpxchg(qword[dest_ptr], value.cvt64()); + break; + default: + UNREACHABLE(); + } + } + + code.setnz(status.cvt8()); + + ctx.deferred_emits.emplace_back([=, this] { + code.L(*abort); + code.call(wrapped_fn); + + fastmem_patch_info.emplace( + mcl::bit_cast<u64>(location), + FastmemPatchInfo{ + mcl::bit_cast<u64>(code.getCurr()), + mcl::bit_cast<u64>(wrapped_fn), + *fastmem_marker, + conf.recompile_on_exclusive_fastmem_failure, + }); + + code.cmp(al, 0); + code.setz(status.cvt8()); + code.movzx(status.cvt32(), status.cvt8()); + code.jmp(*end, code.T_NEAR); + }); + } else { + code.call(wrapped_fn); + code.cmp(al, 0); + code.setz(status.cvt8()); + code.movzx(status.cvt32(), status.cvt8()); + } + + code.L(*end); + + EmitExclusiveUnlock(code, conf, tmp, eax); + + ctx.reg_alloc.DefineValue(inst, status); + + EmitCheckMemoryAbort(ctx, inst); +} + +#undef AxxEmitX64 +#undef AxxEmitContext +#undef AxxJitState +#undef AxxUserConfig diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h new file mode 100644 index 0000000000..c99980d617 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h @@ -0,0 +1,388 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/bit_cast.hpp> +#include <xbyak/xbyak.h> + +#include "dynarmic/backend/x64/a32_emit_x64.h" +#include "dynarmic/backend/x64/a64_emit_x64.h" +#include "dynarmic/backend/x64/exclusive_monitor_friend.h" +#include "dynarmic/common/spin_lock_x64.h" +#include "dynarmic/interface/exclusive_monitor.h" +#include "dynarmic/ir/acc_type.h" + +namespace Dynarmic::Backend::X64 { + +namespace { + +using namespace Xbyak::util; + +constexpr size_t page_bits = 12; +constexpr size_t page_size = 1 << page_bits; +constexpr size_t page_mask = (1 << page_bits) - 1; + +template<typename EmitContext> +void EmitDetectMisalignedVAddr(BlockOfCode& code, EmitContext& ctx, size_t bitsize, Xbyak::Label& abort, Xbyak::Reg64 vaddr, Xbyak::Reg64 tmp) { + if (bitsize == 8 || (ctx.conf.detect_misaligned_access_via_page_table & bitsize) == 0) { + return; + } + + const u32 align_mask = [bitsize]() -> u32 { + switch (bitsize) { + case 16: + return 0b1; + case 32: + return 0b11; + case 64: + return 0b111; + case 128: + return 0b1111; + default: + UNREACHABLE(); + } + }(); + + code.test(vaddr, align_mask); + + if (!ctx.conf.only_detect_misalignment_via_page_table_on_page_boundary) { + code.jnz(abort, code.T_NEAR); + return; + } + + const u32 page_align_mask = static_cast<u32>(page_size - 1) & ~align_mask; + + SharedLabel detect_boundary = GenSharedLabel(), resume = GenSharedLabel(); + + code.jnz(*detect_boundary, code.T_NEAR); + code.L(*resume); + + ctx.deferred_emits.emplace_back([=, &code] { + code.L(*detect_boundary); + code.mov(tmp, vaddr); + code.and_(tmp, page_align_mask); + code.cmp(tmp, page_align_mask); + code.jne(*resume, code.T_NEAR); + // NOTE: We expect to fallthrough into abort code here. + }); +} + +template<typename EmitContext> +Xbyak::RegExp EmitVAddrLookup(BlockOfCode& code, EmitContext& ctx, size_t bitsize, Xbyak::Label& abort, Xbyak::Reg64 vaddr); + +template<> +[[maybe_unused]] Xbyak::RegExp EmitVAddrLookup<A32EmitContext>(BlockOfCode& code, A32EmitContext& ctx, size_t bitsize, Xbyak::Label& abort, Xbyak::Reg64 vaddr) { + const Xbyak::Reg64 page = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg32 tmp = ctx.conf.absolute_offset_page_table ? page.cvt32() : ctx.reg_alloc.ScratchGpr().cvt32(); + + EmitDetectMisalignedVAddr(code, ctx, bitsize, abort, vaddr, tmp.cvt64()); + + // TODO: This code assumes vaddr has been zext from 32-bits to 64-bits. + + code.mov(tmp, vaddr.cvt32()); + code.shr(tmp, static_cast<int>(page_bits)); + + code.mov(page, qword[r14 + tmp.cvt64() * sizeof(void*)]); + if (ctx.conf.page_table_pointer_mask_bits == 0) { + code.test(page, page); + } else { + code.and_(page, ~u32(0) << ctx.conf.page_table_pointer_mask_bits); + } + code.jz(abort, code.T_NEAR); + if (ctx.conf.absolute_offset_page_table) { + return page + vaddr; + } + code.mov(tmp, vaddr.cvt32()); + code.and_(tmp, static_cast<u32>(page_mask)); + return page + tmp.cvt64(); +} + +template<> +[[maybe_unused]] Xbyak::RegExp EmitVAddrLookup<A64EmitContext>(BlockOfCode& code, A64EmitContext& ctx, size_t bitsize, Xbyak::Label& abort, Xbyak::Reg64 vaddr) { + const size_t valid_page_index_bits = ctx.conf.page_table_address_space_bits - page_bits; + const size_t unused_top_bits = 64 - ctx.conf.page_table_address_space_bits; + + const Xbyak::Reg64 page = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 tmp = ctx.conf.absolute_offset_page_table ? 
page : ctx.reg_alloc.ScratchGpr(); + + EmitDetectMisalignedVAddr(code, ctx, bitsize, abort, vaddr, tmp); + + if (unused_top_bits == 0) { + code.mov(tmp, vaddr); + code.shr(tmp, int(page_bits)); + } else if (ctx.conf.silently_mirror_page_table) { + if (valid_page_index_bits >= 32) { + if (code.HasHostFeature(HostFeature::BMI2)) { + const Xbyak::Reg64 bit_count = ctx.reg_alloc.ScratchGpr(); + code.mov(bit_count, unused_top_bits); + code.bzhi(tmp, vaddr, bit_count); + code.shr(tmp, int(page_bits)); + ctx.reg_alloc.Release(bit_count); + } else { + code.mov(tmp, vaddr); + code.shl(tmp, int(unused_top_bits)); + code.shr(tmp, int(unused_top_bits + page_bits)); + } + } else { + code.mov(tmp, vaddr); + code.shr(tmp, int(page_bits)); + code.and_(tmp, u32((1 << valid_page_index_bits) - 1)); + } + } else { + ASSERT(valid_page_index_bits < 32); + code.mov(tmp, vaddr); + code.shr(tmp, int(page_bits)); + code.test(tmp, u32(-(1 << valid_page_index_bits))); + code.jnz(abort, code.T_NEAR); + } + code.mov(page, qword[r14 + tmp * sizeof(void*)]); + if (ctx.conf.page_table_pointer_mask_bits == 0) { + code.test(page, page); + } else { + code.and_(page, ~u32(0) << ctx.conf.page_table_pointer_mask_bits); + } + code.jz(abort, code.T_NEAR); + if (ctx.conf.absolute_offset_page_table) { + return page + vaddr; + } + code.mov(tmp, vaddr); + code.and_(tmp, static_cast<u32>(page_mask)); + return page + tmp; +} + +template<typename EmitContext> +Xbyak::RegExp EmitFastmemVAddr(BlockOfCode& code, EmitContext& ctx, Xbyak::Label& abort, Xbyak::Reg64 vaddr, bool& require_abort_handling, std::optional<Xbyak::Reg64> tmp = std::nullopt); + +template<> +[[maybe_unused]] Xbyak::RegExp EmitFastmemVAddr<A32EmitContext>(BlockOfCode&, A32EmitContext&, Xbyak::Label&, Xbyak::Reg64 vaddr, bool&, std::optional<Xbyak::Reg64>) { + return r13 + vaddr; +} + +template<> +[[maybe_unused]] Xbyak::RegExp EmitFastmemVAddr<A64EmitContext>(BlockOfCode& code, A64EmitContext& ctx, Xbyak::Label& abort, Xbyak::Reg64 vaddr, bool& require_abort_handling, std::optional<Xbyak::Reg64> tmp) { + const size_t unused_top_bits = 64 - ctx.conf.fastmem_address_space_bits; + + if (unused_top_bits == 0) { + return r13 + vaddr; + } else if (ctx.conf.silently_mirror_fastmem) { + if (!tmp) { + tmp = ctx.reg_alloc.ScratchGpr(); + } + if (unused_top_bits < 32) { + code.mov(*tmp, vaddr); + code.shl(*tmp, int(unused_top_bits)); + code.shr(*tmp, int(unused_top_bits)); + } else if (unused_top_bits == 32) { + code.mov(tmp->cvt32(), vaddr.cvt32()); + } else { + code.mov(tmp->cvt32(), vaddr.cvt32()); + code.and_(*tmp, u32((1 << ctx.conf.fastmem_address_space_bits) - 1)); + } + return r13 + *tmp; + } else { + if (ctx.conf.fastmem_address_space_bits < 32) { + code.test(vaddr, u32(-(1 << ctx.conf.fastmem_address_space_bits))); + code.jnz(abort, code.T_NEAR); + require_abort_handling = true; + } else { + // TODO: Consider having TEST as above but coalesce 64-bit constant in register allocator + if (!tmp) { + tmp = ctx.reg_alloc.ScratchGpr(); + } + code.mov(*tmp, vaddr); + code.shr(*tmp, int(ctx.conf.fastmem_address_space_bits)); + code.jnz(abort, code.T_NEAR); + require_abort_handling = true; + } + return r13 + vaddr; + } +} + +template<std::size_t bitsize> +const void* EmitReadMemoryMov(BlockOfCode& code, int value_idx, const Xbyak::RegExp& addr, bool ordered) { + if (ordered) { + if constexpr (bitsize != 128) { + code.xor_(Xbyak::Reg32{value_idx}, Xbyak::Reg32{value_idx}); + } else { + code.xor_(eax, eax); + code.xor_(ebx, ebx); + code.xor_(ecx, ecx); + code.xor_(edx, edx); + } + + 
const void* fastmem_location = code.getCurr(); + switch (bitsize) { + case 8: + code.lock(); + code.xadd(code.byte[addr], Xbyak::Reg32{value_idx}.cvt8()); + break; + case 16: + code.lock(); + code.xadd(word[addr], Xbyak::Reg16{value_idx}); + break; + case 32: + code.lock(); + code.xadd(dword[addr], Xbyak::Reg32{value_idx}); + break; + case 64: + code.lock(); + code.xadd(qword[addr], Xbyak::Reg64{value_idx}); + break; + case 128: + code.lock(); + code.cmpxchg16b(xword[addr]); + if (code.HasHostFeature(HostFeature::SSE41)) { + code.movq(Xbyak::Xmm{value_idx}, rax); + code.pinsrq(Xbyak::Xmm{value_idx}, rdx, 1); + } else { + code.movq(Xbyak::Xmm{value_idx}, rax); + code.movq(xmm0, rdx); + code.punpcklqdq(Xbyak::Xmm{value_idx}, xmm0); + } + break; + default: + ASSERT_FALSE("Invalid bitsize"); + } + return fastmem_location; + } + + const void* fastmem_location = code.getCurr(); + switch (bitsize) { + case 8: + code.movzx(Xbyak::Reg32{value_idx}, code.byte[addr]); + break; + case 16: + code.movzx(Xbyak::Reg32{value_idx}, word[addr]); + break; + case 32: + code.mov(Xbyak::Reg32{value_idx}, dword[addr]); + break; + case 64: + code.mov(Xbyak::Reg64{value_idx}, qword[addr]); + break; + case 128: + code.movups(Xbyak::Xmm{value_idx}, xword[addr]); + break; + default: + ASSERT_FALSE("Invalid bitsize"); + } + return fastmem_location; +} + +template<std::size_t bitsize> +const void* EmitWriteMemoryMov(BlockOfCode& code, const Xbyak::RegExp& addr, int value_idx, bool ordered) { + if (ordered) { + if constexpr (bitsize == 128) { + code.xor_(eax, eax); + code.xor_(edx, edx); + if (code.HasHostFeature(HostFeature::SSE41)) { + code.movq(rbx, Xbyak::Xmm{value_idx}); + code.pextrq(rcx, Xbyak::Xmm{value_idx}, 1); + } else { + code.movaps(xmm0, Xbyak::Xmm{value_idx}); + code.movq(rbx, xmm0); + code.punpckhqdq(xmm0, xmm0); + code.movq(rcx, xmm0); + } + } + + const void* fastmem_location = code.getCurr(); + switch (bitsize) { + case 8: + code.xchg(code.byte[addr], Xbyak::Reg64{value_idx}.cvt8()); + break; + case 16: + code.xchg(word[addr], Xbyak::Reg16{value_idx}); + break; + case 32: + code.xchg(dword[addr], Xbyak::Reg32{value_idx}); + break; + case 64: + code.xchg(qword[addr], Xbyak::Reg64{value_idx}); + break; + case 128: { + Xbyak::Label loop; + code.L(loop); + code.lock(); + code.cmpxchg16b(xword[addr]); + code.jnz(loop); + break; + } + default: + ASSERT_FALSE("Invalid bitsize"); + } + return fastmem_location; + } + + const void* fastmem_location = code.getCurr(); + switch (bitsize) { + case 8: + code.mov(code.byte[addr], Xbyak::Reg64{value_idx}.cvt8()); + break; + case 16: + code.mov(word[addr], Xbyak::Reg16{value_idx}); + break; + case 32: + code.mov(dword[addr], Xbyak::Reg32{value_idx}); + break; + case 64: + code.mov(qword[addr], Xbyak::Reg64{value_idx}); + break; + case 128: + code.movups(xword[addr], Xbyak::Xmm{value_idx}); + break; + default: + ASSERT_FALSE("Invalid bitsize"); + } + return fastmem_location; +} + +template<typename UserConfig> +void EmitExclusiveLock(BlockOfCode& code, const UserConfig& conf, Xbyak::Reg64 pointer, Xbyak::Reg32 tmp) { + if (conf.HasOptimization(OptimizationFlag::Unsafe_IgnoreGlobalMonitor)) { + return; + } + + code.mov(pointer, mcl::bit_cast<u64>(GetExclusiveMonitorLockPointer(conf.global_monitor))); + EmitSpinLockLock(code, pointer, tmp); +} + +template<typename UserConfig> +void EmitExclusiveUnlock(BlockOfCode& code, const UserConfig& conf, Xbyak::Reg64 pointer, Xbyak::Reg32 tmp) { + if (conf.HasOptimization(OptimizationFlag::Unsafe_IgnoreGlobalMonitor)) { + return; + } 
+ + code.mov(pointer, mcl::bit_cast<u64>(GetExclusiveMonitorLockPointer(conf.global_monitor))); + EmitSpinLockUnlock(code, pointer, tmp); +} + +template<typename UserConfig> +void EmitExclusiveTestAndClear(BlockOfCode& code, const UserConfig& conf, Xbyak::Reg64 vaddr, Xbyak::Reg64 pointer, Xbyak::Reg64 tmp) { + if (conf.HasOptimization(OptimizationFlag::Unsafe_IgnoreGlobalMonitor)) { + return; + } + + code.mov(tmp, 0xDEAD'DEAD'DEAD'DEAD); + const size_t processor_count = GetExclusiveMonitorProcessorCount(conf.global_monitor); + for (size_t processor_index = 0; processor_index < processor_count; processor_index++) { + if (processor_index == conf.processor_id) { + continue; + } + Xbyak::Label ok; + code.mov(pointer, mcl::bit_cast<u64>(GetExclusiveMonitorAddressPointer(conf.global_monitor, processor_index))); + code.cmp(qword[pointer], vaddr); + code.jne(ok); + code.mov(qword[pointer], tmp); + code.L(ok); + } +} + +inline bool IsOrdered(IR::AccType acctype) { + return acctype == IR::AccType::ORDERED || acctype == IR::AccType::ORDEREDRW || acctype == IR::AccType::LIMITEDORDERED; +} + +} // namespace + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_packed.cpp b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_packed.cpp new file mode 100644 index 0000000000..90f06a9015 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_packed.cpp @@ -0,0 +1,693 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/x64/block_of_code.h" +#include "dynarmic/backend/x64/emit_x64.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" + +namespace Dynarmic::Backend::X64 { + +using namespace Xbyak::util; + +void EmitX64::EmitPackedAddU8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + + code.paddb(xmm_a, xmm_b); + + if (ge_inst) { + const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm(); + + code.pcmpeqb(ones, ones); + + code.movdqa(xmm_ge, xmm_a); + code.pminub(xmm_ge, xmm_b); + code.pcmpeqb(xmm_ge, xmm_b); + code.pxor(xmm_ge, ones); + + ctx.reg_alloc.DefineValue(ge_inst, xmm_ge); + } + + ctx.reg_alloc.DefineValue(inst, xmm_a); +} + +void EmitX64::EmitPackedAddS8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + + if (ge_inst) { + const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(); + + code.pcmpeqb(xmm0, xmm0); + + code.movdqa(xmm_ge, xmm_a); + code.paddsb(xmm_ge, xmm_b); + code.pcmpgtb(xmm_ge, xmm0); + + ctx.reg_alloc.DefineValue(ge_inst, xmm_ge); + } + + code.paddb(xmm_a, xmm_b); + + ctx.reg_alloc.DefineValue(inst, xmm_a); +} + +void EmitX64::EmitPackedAddU16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + + code.paddw(xmm_a, 
xmm_b); + + if (ge_inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm(); + + code.pcmpeqb(ones, ones); + + code.movdqa(xmm_ge, xmm_a); + code.pminuw(xmm_ge, xmm_b); + code.pcmpeqw(xmm_ge, xmm_b); + code.pxor(xmm_ge, ones); + + ctx.reg_alloc.DefineValue(ge_inst, xmm_ge); + } else { + const Xbyak::Xmm tmp_a = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm(); + + // !(b <= a+b) == b > a+b + code.movdqa(tmp_a, xmm_a); + code.movdqa(tmp_b, xmm_b); + code.paddw(tmp_a, code.Const(xword, 0x80008000)); + code.paddw(tmp_b, code.Const(xword, 0x80008000)); + code.pcmpgtw(tmp_b, tmp_a); // *Signed* comparison! + + ctx.reg_alloc.DefineValue(ge_inst, tmp_b); + } + } + + ctx.reg_alloc.DefineValue(inst, xmm_a); +} + +void EmitX64::EmitPackedAddS16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + + if (ge_inst) { + const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(); + + code.pcmpeqw(xmm0, xmm0); + + code.movdqa(xmm_ge, xmm_a); + code.paddsw(xmm_ge, xmm_b); + code.pcmpgtw(xmm_ge, xmm0); + + ctx.reg_alloc.DefineValue(ge_inst, xmm_ge); + } + + code.paddw(xmm_a, xmm_b); + + ctx.reg_alloc.DefineValue(inst, xmm_a); +} + +void EmitX64::EmitPackedSubU8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + + if (ge_inst) { + const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(xmm_ge, xmm_a); + code.pmaxub(xmm_ge, xmm_b); + code.pcmpeqb(xmm_ge, xmm_a); + + ctx.reg_alloc.DefineValue(ge_inst, xmm_ge); + } + + code.psubb(xmm_a, xmm_b); + + ctx.reg_alloc.DefineValue(inst, xmm_a); +} + +void EmitX64::EmitPackedSubS8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + + if (ge_inst) { + const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(); + + code.pcmpeqb(xmm0, xmm0); + + code.movdqa(xmm_ge, xmm_a); + code.psubsb(xmm_ge, xmm_b); + code.pcmpgtb(xmm_ge, xmm0); + + ctx.reg_alloc.DefineValue(ge_inst, xmm_ge); + } + + code.psubb(xmm_a, xmm_b); + + ctx.reg_alloc.DefineValue(inst, xmm_a); +} + +void EmitX64::EmitPackedSubU16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + if (!ge_inst) { + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + + code.psubw(xmm_a, xmm_b); + + ctx.reg_alloc.DefineValue(inst, xmm_a); + return; + } + + if (code.HasHostFeature(HostFeature::SSE41)) { + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(xmm_ge, xmm_a); + code.pmaxuw(xmm_ge, xmm_b); // Requires SSE 4.1 + 
code.pcmpeqw(xmm_ge, xmm_a); + + code.psubw(xmm_a, xmm_b); + + ctx.reg_alloc.DefineValue(ge_inst, xmm_ge); + ctx.reg_alloc.DefineValue(inst, xmm_a); + return; + } + + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm(); + + // (a >= b) == !(b > a) + code.pcmpeqb(ones, ones); + code.paddw(xmm_a, code.Const(xword, 0x80008000)); + code.paddw(xmm_b, code.Const(xword, 0x80008000)); + code.movdqa(xmm_ge, xmm_b); + code.pcmpgtw(xmm_ge, xmm_a); // *Signed* comparison! + code.pxor(xmm_ge, ones); + + code.psubw(xmm_a, xmm_b); + + ctx.reg_alloc.DefineValue(ge_inst, xmm_ge); + ctx.reg_alloc.DefineValue(inst, xmm_a); +} + +void EmitX64::EmitPackedSubS16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + + if (ge_inst) { + const Xbyak::Xmm xmm_ge = ctx.reg_alloc.ScratchXmm(); + + code.pcmpeqw(xmm0, xmm0); + + code.movdqa(xmm_ge, xmm_a); + code.psubsw(xmm_ge, xmm_b); + code.pcmpgtw(xmm_ge, xmm0); + + ctx.reg_alloc.DefineValue(ge_inst, xmm_ge); + } + + code.psubw(xmm_a, xmm_b); + + ctx.reg_alloc.DefineValue(inst, xmm_a); +} + +void EmitX64::EmitPackedHalvingAddU8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (args[0].IsInXmm() || args[1].IsInXmm()) { + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm ones = ctx.reg_alloc.ScratchXmm(); + + // Since, + // pavg(a, b) == (a + b + 1) >> 1 + // Therefore, + // ~pavg(~a, ~b) == (a + b) >> 1 + + code.pcmpeqb(ones, ones); + code.pxor(xmm_a, ones); + code.pxor(xmm_b, ones); + code.pavgb(xmm_a, xmm_b); + code.pxor(xmm_a, ones); + + ctx.reg_alloc.DefineValue(inst, xmm_a); + } else { + const Xbyak::Reg32 reg_a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + const Xbyak::Reg32 reg_b = ctx.reg_alloc.UseGpr(args[1]).cvt32(); + const Xbyak::Reg32 xor_a_b = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Reg32 and_a_b = reg_a; + const Xbyak::Reg32 result = reg_a; + + // This relies on the equality x+y == ((x&y) << 1) + (x^y). + // Note that x^y always contains the LSB of the result. + // Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>1). + // We mask by 0x7F to remove the LSB so that it doesn't leak into the field below. 
+ + code.mov(xor_a_b, reg_a); + code.and_(and_a_b, reg_b); + code.xor_(xor_a_b, reg_b); + code.shr(xor_a_b, 1); + code.and_(xor_a_b, 0x7F7F7F7F); + code.add(result, xor_a_b); + + ctx.reg_alloc.DefineValue(inst, result); + } +} + +void EmitX64::EmitPackedHalvingAddU16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (args[0].IsInXmm() || args[1].IsInXmm()) { + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(tmp, xmm_a); + code.pand(xmm_a, xmm_b); + code.pxor(tmp, xmm_b); + code.psrlw(tmp, 1); + code.paddw(xmm_a, tmp); + + ctx.reg_alloc.DefineValue(inst, xmm_a); + } else { + const Xbyak::Reg32 reg_a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + const Xbyak::Reg32 reg_b = ctx.reg_alloc.UseGpr(args[1]).cvt32(); + const Xbyak::Reg32 xor_a_b = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Reg32 and_a_b = reg_a; + const Xbyak::Reg32 result = reg_a; + + // This relies on the equality x+y == ((x&y) << 1) + (x^y). + // Note that x^y always contains the LSB of the result. + // Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>1). + // We mask by 0x7FFF to remove the LSB so that it doesn't leak into the field below. + + code.mov(xor_a_b, reg_a); + code.and_(and_a_b, reg_b); + code.xor_(xor_a_b, reg_b); + code.shr(xor_a_b, 1); + code.and_(xor_a_b, 0x7FFF7FFF); + code.add(result, xor_a_b); + + ctx.reg_alloc.DefineValue(inst, result); + } +} + +void EmitX64::EmitPackedHalvingAddS8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg32 reg_a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + const Xbyak::Reg32 reg_b = ctx.reg_alloc.UseGpr(args[1]).cvt32(); + const Xbyak::Reg32 xor_a_b = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Reg32 and_a_b = reg_a; + const Xbyak::Reg32 result = reg_a; + const Xbyak::Reg32 carry = ctx.reg_alloc.ScratchGpr().cvt32(); + + // This relies on the equality x+y == ((x&y) << 1) + (x^y). + // Note that x^y always contains the LSB of the result. + // Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>1). + // We mask by 0x7F to remove the LSB so that it doesn't leak into the field below. + // carry propagates the sign bit from (x^y)>>1 upwards by one. + + code.mov(xor_a_b, reg_a); + code.and_(and_a_b, reg_b); + code.xor_(xor_a_b, reg_b); + code.mov(carry, xor_a_b); + code.and_(carry, 0x80808080); + code.shr(xor_a_b, 1); + code.and_(xor_a_b, 0x7F7F7F7F); + code.add(result, xor_a_b); + code.xor_(result, carry); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitPackedHalvingAddS16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + // This relies on the equality x+y == ((x&y) << 1) + (x^y). + // Note that x^y always contains the LSB of the result. + // Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>>1). + // The arithmetic shift right makes this signed. 
+
+ code.movdqa(tmp, xmm_a);
+ code.pand(xmm_a, xmm_b);
+ code.pxor(tmp, xmm_b);
+ code.psraw(tmp, 1);
+ code.paddw(xmm_a, tmp);
+
+ ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+void EmitX64::EmitPackedHalvingSubU8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg32 minuend = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ const Xbyak::Reg32 subtrahend = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
+
+ // This relies on the equality x-y == (x^y) - (((x^y)&y) << 1).
+ // Note that x^y always contains the LSB of the result.
+ // Since we want to calculate (x-y)/2, we can instead calculate ((x^y)>>1) - ((x^y)&y).
+
+ code.xor_(minuend, subtrahend);
+ code.and_(subtrahend, minuend);
+ code.shr(minuend, 1);
+
+ // At this point,
+ // minuend := (a^b) >> 1
+ // subtrahend := (a^b) & b
+
+ // We must now perform a partitioned subtraction.
+ // We can do this because minuend contains 7 bit fields.
+ // We use the extra bit in minuend as a bit to borrow from; we set this bit.
+ // We invert this bit at the end as this tells us if that bit was borrowed from.
+ code.or_(minuend, 0x80808080);
+ code.sub(minuend, subtrahend);
+ code.xor_(minuend, 0x80808080);
+
+ // minuend now contains the desired result.
+ ctx.reg_alloc.DefineValue(inst, minuend);
+}
+
+void EmitX64::EmitPackedHalvingSubS8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Reg32 minuend = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+ const Xbyak::Reg32 subtrahend = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
+
+ const Xbyak::Reg32 carry = ctx.reg_alloc.ScratchGpr().cvt32();
+
+ // This relies on the equality x-y == (x^y) - (((x^y)&y) << 1).
+ // Note that x^y always contains the LSB of the result.
+ // Since we want to calculate (x-y)/2, we can instead calculate ((x^y)>>1) - ((x^y)&y).
+
+ code.xor_(minuend, subtrahend);
+ code.and_(subtrahend, minuend);
+ code.mov(carry, minuend);
+ code.and_(carry, 0x80808080);
+ code.shr(minuend, 1);
+
+ // At this point,
+ // minuend := (a^b) >> 1
+ // subtrahend := (a^b) & b
+ // carry := (a^b) & 0x80808080
+
+ // We must now perform a partitioned subtraction.
+ // We can do this because minuend contains 7 bit fields.
+ // We use the extra bit in minuend as a bit to borrow from; we set this bit.
+ // We invert this bit at the end as this tells us if that bit was borrowed from.
+ // We then sign extend the result into this bit.
+ code.or_(minuend, 0x80808080);
+ code.sub(minuend, subtrahend);
+ code.xor_(minuend, 0x80808080);
+ code.xor_(minuend, carry);
+
+ ctx.reg_alloc.DefineValue(inst, minuend);
+}
+
+void EmitX64::EmitPackedHalvingSubU16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+ const Xbyak::Xmm minuend = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm subtrahend = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+ // This relies on the equality x-y == (x^y) - (((x^y)&y) << 1).
+ // Note that x^y always contains the LSB of the result.
+ // Since we want to calculate (x-y)/2, we can instead calculate ((x^y)>>1) - ((x^y)&y).
+ + code.pxor(minuend, subtrahend); + code.pand(subtrahend, minuend); + code.psrlw(minuend, 1); + + // At this point, + // minuend := (a^b) >> 1 + // subtrahend := (a^b) & b + + code.psubw(minuend, subtrahend); + + ctx.reg_alloc.DefineValue(inst, minuend); +} + +void EmitX64::EmitPackedHalvingSubS16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm minuend = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm subtrahend = ctx.reg_alloc.UseScratchXmm(args[1]); + + // This relies on the equality x-y == (x^y) - (((x^y)&y) << 1). + // Note that x^y always contains the LSB of the result. + // Since we want to calculate (x-y)/2, we can instead calculate ((x^y)>>>1) - ((x^y)&y). + + code.pxor(minuend, subtrahend); + code.pand(subtrahend, minuend); + code.psraw(minuend, 1); + + // At this point, + // minuend := (a^b) >>> 1 + // subtrahend := (a^b) & b + + code.psubw(minuend, subtrahend); + + ctx.reg_alloc.DefineValue(inst, minuend); +} + +static void EmitPackedSubAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, bool hi_is_sum, bool is_signed, bool is_halving) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const Xbyak::Reg32 reg_a_hi = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + const Xbyak::Reg32 reg_b_hi = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32(); + const Xbyak::Reg32 reg_a_lo = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Reg32 reg_b_lo = ctx.reg_alloc.ScratchGpr().cvt32(); + Xbyak::Reg32 reg_sum, reg_diff; + + if (is_signed) { + code.movsx(reg_a_lo, reg_a_hi.cvt16()); + code.movsx(reg_b_lo, reg_b_hi.cvt16()); + code.sar(reg_a_hi, 16); + code.sar(reg_b_hi, 16); + } else { + code.movzx(reg_a_lo, reg_a_hi.cvt16()); + code.movzx(reg_b_lo, reg_b_hi.cvt16()); + code.shr(reg_a_hi, 16); + code.shr(reg_b_hi, 16); + } + + if (hi_is_sum) { + code.sub(reg_a_lo, reg_b_hi); + code.add(reg_a_hi, reg_b_lo); + reg_diff = reg_a_lo; + reg_sum = reg_a_hi; + } else { + code.add(reg_a_lo, reg_b_hi); + code.sub(reg_a_hi, reg_b_lo); + reg_diff = reg_a_hi; + reg_sum = reg_a_lo; + } + + if (ge_inst) { + // The reg_b registers are no longer required. + const Xbyak::Reg32 ge_sum = reg_b_hi; + const Xbyak::Reg32 ge_diff = reg_b_lo; + + code.mov(ge_sum, reg_sum); + code.mov(ge_diff, reg_diff); + + if (!is_signed) { + code.shl(ge_sum, 15); + code.sar(ge_sum, 31); + } else { + code.not_(ge_sum); + code.sar(ge_sum, 31); + } + code.not_(ge_diff); + code.sar(ge_diff, 31); + code.and_(ge_sum, hi_is_sum ? 0xFFFF0000 : 0x0000FFFF); + code.and_(ge_diff, hi_is_sum ? 0x0000FFFF : 0xFFFF0000); + code.or_(ge_sum, ge_diff); + + ctx.reg_alloc.DefineValue(ge_inst, ge_sum); + } + + if (is_halving) { + code.shl(reg_a_lo, 15); + code.shr(reg_a_hi, 1); + } else { + code.shl(reg_a_lo, 16); + } + + // reg_a_lo now contains the low word and reg_a_hi now contains the high word. + // Merge them. 
+ code.shld(reg_a_hi, reg_a_lo, 16); + + ctx.reg_alloc.DefineValue(inst, reg_a_hi); +} + +void EmitX64::EmitPackedAddSubU16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, true, false, false); +} + +void EmitX64::EmitPackedAddSubS16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, true, true, false); +} + +void EmitX64::EmitPackedSubAddU16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, false, false, false); +} + +void EmitX64::EmitPackedSubAddS16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, false, true, false); +} + +void EmitX64::EmitPackedHalvingAddSubU16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, true, false, true); +} + +void EmitX64::EmitPackedHalvingAddSubS16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, true, true, true); +} + +void EmitX64::EmitPackedHalvingSubAddU16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, false, false, true); +} + +void EmitX64::EmitPackedHalvingSubAddS16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, false, true, true); +} + +static void EmitPackedOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + + (code.*fn)(xmm_a, xmm_b); + + ctx.reg_alloc.DefineValue(inst, xmm_a); +} + +void EmitX64::EmitPackedSaturatedAddU8(EmitContext& ctx, IR::Inst* inst) { + EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddusb); +} + +void EmitX64::EmitPackedSaturatedAddS8(EmitContext& ctx, IR::Inst* inst) { + EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddsb); +} + +void EmitX64::EmitPackedSaturatedSubU8(EmitContext& ctx, IR::Inst* inst) { + EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubusb); +} + +void EmitX64::EmitPackedSaturatedSubS8(EmitContext& ctx, IR::Inst* inst) { + EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubsb); +} + +void EmitX64::EmitPackedSaturatedAddU16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddusw); +} + +void EmitX64::EmitPackedSaturatedAddS16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddsw); +} + +void EmitX64::EmitPackedSaturatedSubU16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubusw); +} + +void EmitX64::EmitPackedSaturatedSubS16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubsw); +} + +void EmitX64::EmitPackedAbsDiffSumU8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + // TODO: Optimize with zero-extension detection + code.movaps(tmp, code.Const(xword, 0x0000'0000'ffff'ffff)); + code.pand(xmm_a, tmp); + code.pand(xmm_b, tmp); + code.psadbw(xmm_a, xmm_b); + + ctx.reg_alloc.DefineValue(inst, xmm_a); +} + +void EmitX64::EmitPackedSelect(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const size_t num_args_in_xmm = args[0].IsInXmm() + 
args[1].IsInXmm() + args[2].IsInXmm(); + + if (num_args_in_xmm >= 2) { + const Xbyak::Xmm ge = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm to = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm from = ctx.reg_alloc.UseScratchXmm(args[2]); + + code.pand(from, ge); + code.pandn(ge, to); + code.por(from, ge); + + ctx.reg_alloc.DefineValue(inst, from); + } else if (code.HasHostFeature(HostFeature::BMI1)) { + const Xbyak::Reg32 ge = ctx.reg_alloc.UseGpr(args[0]).cvt32(); + const Xbyak::Reg32 to = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32(); + const Xbyak::Reg32 from = ctx.reg_alloc.UseScratchGpr(args[2]).cvt32(); + + code.and_(from, ge); + code.andn(to, ge, to); + code.or_(from, to); + + ctx.reg_alloc.DefineValue(inst, from); + } else { + const Xbyak::Reg32 ge = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + const Xbyak::Reg32 to = ctx.reg_alloc.UseGpr(args[1]).cvt32(); + const Xbyak::Reg32 from = ctx.reg_alloc.UseScratchGpr(args[2]).cvt32(); + + code.and_(from, ge); + code.not_(ge); + code.and_(ge, to); + code.or_(from, ge); + + ctx.reg_alloc.DefineValue(inst, from); + } +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_saturation.cpp b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_saturation.cpp new file mode 100644 index 0000000000..24fb895b61 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_saturation.cpp @@ -0,0 +1,303 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <limits> + +#include <mcl/assert.hpp> +#include <mcl/bit/bit_field.hpp> +#include <mcl/stdint.hpp> +#include <mcl/type_traits/integer_of_size.hpp> + +#include "dynarmic/backend/x64/block_of_code.h" +#include "dynarmic/backend/x64/emit_x64.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" + +namespace Dynarmic::Backend::X64 { + +using namespace Xbyak::util; + +namespace { + +enum class Op { + Add, + Sub, +}; + +template<Op op, size_t size, bool has_overflow_inst = false> +void EmitSignedSaturatedOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Xbyak::Reg result = ctx.reg_alloc.UseScratchGpr(args[0]).changeBit(size); + Xbyak::Reg addend = ctx.reg_alloc.UseGpr(args[1]).changeBit(size); + Xbyak::Reg overflow = ctx.reg_alloc.ScratchGpr().changeBit(size); + + constexpr u64 int_max = static_cast<u64>(std::numeric_limits<mcl::signed_integer_of_size<size>>::max()); + if constexpr (size < 64) { + code.xor_(overflow.cvt32(), overflow.cvt32()); + code.bt(result.cvt32(), size - 1); + code.adc(overflow.cvt32(), int_max); + } else { + code.mov(overflow, int_max); + code.bt(result, 63); + code.adc(overflow, 0); + } + + // overflow now contains 0x7F... if a was positive, or 0x80... 
if a was negative + + if constexpr (op == Op::Add) { + code.add(result, addend); + } else { + code.sub(result, addend); + } + + if constexpr (size == 8) { + code.cmovo(result.cvt32(), overflow.cvt32()); + } else { + code.cmovo(result, overflow); + } + + code.seto(overflow.cvt8()); + if constexpr (has_overflow_inst) { + if (const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp)) { + ctx.reg_alloc.DefineValue(overflow_inst, overflow); + } + } else { + code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow.cvt8()); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +template<Op op, size_t size> +void EmitUnsignedSaturatedOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Xbyak::Reg op_result = ctx.reg_alloc.UseScratchGpr(args[0]).changeBit(size); + Xbyak::Reg addend = ctx.reg_alloc.UseScratchGpr(args[1]).changeBit(size); + + constexpr u64 boundary = op == Op::Add ? std::numeric_limits<mcl::unsigned_integer_of_size<size>>::max() : 0; + + if constexpr (op == Op::Add) { + code.add(op_result, addend); + } else { + code.sub(op_result, addend); + } + code.mov(addend, boundary); + if constexpr (size == 8) { + code.cmovae(addend.cvt32(), op_result.cvt32()); + } else { + code.cmovae(addend, op_result); + } + + const Xbyak::Reg overflow = ctx.reg_alloc.ScratchGpr(); + code.setb(overflow.cvt8()); + code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow.cvt8()); + + ctx.reg_alloc.DefineValue(inst, addend); +} + +} // anonymous namespace + +void EmitX64::EmitSignedSaturatedAddWithFlag32(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp<Op::Add, 32, true>(code, ctx, inst); +} + +void EmitX64::EmitSignedSaturatedSubWithFlag32(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp<Op::Sub, 32, true>(code, ctx, inst); +} + +void EmitX64::EmitSignedSaturation(EmitContext& ctx, IR::Inst* inst) { + const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const size_t N = args[1].GetImmediateU8(); + ASSERT(N >= 1 && N <= 32); + + if (N == 32) { + if (overflow_inst) { + const auto no_overflow = IR::Value(false); + overflow_inst->ReplaceUsesWith(no_overflow); + } + // TODO: DefineValue directly on Argument + const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 source = ctx.reg_alloc.UseGpr(args[0]); + code.mov(result.cvt32(), source.cvt32()); + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + const u32 mask = (1u << N) - 1; + const u32 positive_saturated_value = (1u << (N - 1)) - 1; + const u32 negative_saturated_value = 1u << (N - 1); + + const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Reg32 reg_a = ctx.reg_alloc.UseGpr(args[0]).cvt32(); + const Xbyak::Reg32 overflow = ctx.reg_alloc.ScratchGpr().cvt32(); + + // overflow now contains a value between 0 and mask if it was originally between {negative,positive}_saturated_value. 
+ code.lea(overflow, code.ptr[reg_a.cvt64() + negative_saturated_value]); + + // Put the appropriate saturated value in result + code.mov(result, reg_a); + code.sar(result, 31); + code.xor_(result, positive_saturated_value); + + // Do the saturation + code.cmp(overflow, mask); + code.cmovbe(result, reg_a); + + if (overflow_inst) { + code.seta(overflow.cvt8()); + + ctx.reg_alloc.DefineValue(overflow_inst, overflow); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitUnsignedSaturation(EmitContext& ctx, IR::Inst* inst) { + const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const size_t N = args[1].GetImmediateU8(); + ASSERT(N <= 31); + + const u32 saturated_value = (1u << N) - 1; + + const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Reg32 reg_a = ctx.reg_alloc.UseGpr(args[0]).cvt32(); + const Xbyak::Reg32 overflow = ctx.reg_alloc.ScratchGpr().cvt32(); + + // Pseudocode: result = clamp(reg_a, 0, saturated_value); + code.xor_(overflow, overflow); + code.cmp(reg_a, saturated_value); + code.mov(result, saturated_value); + code.cmovle(result, overflow); + code.cmovbe(result, reg_a); + + if (overflow_inst) { + code.seta(overflow.cvt8()); + + ctx.reg_alloc.DefineValue(overflow_inst, overflow); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitSignedSaturatedAdd8(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp<Op::Add, 8>(code, ctx, inst); +} + +void EmitX64::EmitSignedSaturatedAdd16(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp<Op::Add, 16>(code, ctx, inst); +} + +void EmitX64::EmitSignedSaturatedAdd32(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp<Op::Add, 32>(code, ctx, inst); +} + +void EmitX64::EmitSignedSaturatedAdd64(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp<Op::Add, 64>(code, ctx, inst); +} + +void EmitX64::EmitSignedSaturatedDoublingMultiplyReturnHigh16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg32 x = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); + const Xbyak::Reg32 y = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32(); + const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32(); + + code.movsx(x, x.cvt16()); + code.movsx(y, y.cvt16()); + + code.imul(x, y); + code.lea(y, ptr[x.cvt64() + x.cvt64()]); + code.mov(tmp, x); + code.shr(tmp, 15); + code.xor_(y, x); + code.mov(y, 0x7FFF); + code.cmovns(y, tmp); + + code.sets(tmp.cvt8()); + code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], tmp.cvt8()); + + ctx.reg_alloc.DefineValue(inst, y); +} + +void EmitX64::EmitSignedSaturatedDoublingMultiplyReturnHigh32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg64 x = ctx.reg_alloc.UseScratchGpr(args[0]); + const Xbyak::Reg64 y = ctx.reg_alloc.UseScratchGpr(args[1]); + const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); + + code.movsxd(x, x.cvt32()); + code.movsxd(y, y.cvt32()); + + code.imul(x, y); + code.lea(y, ptr[x + x]); + code.mov(tmp, x); + code.shr(tmp, 31); + code.xor_(y, x); + code.mov(y.cvt32(), 0x7FFFFFFF); + code.cmovns(y.cvt32(), tmp.cvt32()); + + code.sets(tmp.cvt8()); + code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], tmp.cvt8()); + + ctx.reg_alloc.DefineValue(inst, y); +} + +void EmitX64::EmitSignedSaturatedSub8(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp<Op::Sub, 8>(code, ctx, 
inst); +} + +void EmitX64::EmitSignedSaturatedSub16(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp<Op::Sub, 16>(code, ctx, inst); +} + +void EmitX64::EmitSignedSaturatedSub32(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp<Op::Sub, 32>(code, ctx, inst); +} + +void EmitX64::EmitSignedSaturatedSub64(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp<Op::Sub, 64>(code, ctx, inst); +} + +void EmitX64::EmitUnsignedSaturatedAdd8(EmitContext& ctx, IR::Inst* inst) { + EmitUnsignedSaturatedOp<Op::Add, 8>(code, ctx, inst); +} + +void EmitX64::EmitUnsignedSaturatedAdd16(EmitContext& ctx, IR::Inst* inst) { + EmitUnsignedSaturatedOp<Op::Add, 16>(code, ctx, inst); +} + +void EmitX64::EmitUnsignedSaturatedAdd32(EmitContext& ctx, IR::Inst* inst) { + EmitUnsignedSaturatedOp<Op::Add, 32>(code, ctx, inst); +} + +void EmitX64::EmitUnsignedSaturatedAdd64(EmitContext& ctx, IR::Inst* inst) { + EmitUnsignedSaturatedOp<Op::Add, 64>(code, ctx, inst); +} + +void EmitX64::EmitUnsignedSaturatedSub8(EmitContext& ctx, IR::Inst* inst) { + EmitUnsignedSaturatedOp<Op::Sub, 8>(code, ctx, inst); +} + +void EmitX64::EmitUnsignedSaturatedSub16(EmitContext& ctx, IR::Inst* inst) { + EmitUnsignedSaturatedOp<Op::Sub, 16>(code, ctx, inst); +} + +void EmitX64::EmitUnsignedSaturatedSub32(EmitContext& ctx, IR::Inst* inst) { + EmitUnsignedSaturatedOp<Op::Sub, 32>(code, ctx, inst); +} + +void EmitX64::EmitUnsignedSaturatedSub64(EmitContext& ctx, IR::Inst* inst) { + EmitUnsignedSaturatedOp<Op::Sub, 64>(code, ctx, inst); +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_sha.cpp b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_sha.cpp new file mode 100644 index 0000000000..92e0841d8b --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_sha.cpp @@ -0,0 +1,81 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/x64/block_of_code.h" +#include "dynarmic/backend/x64/emit_x64.h" + +namespace Dynarmic::Backend::X64 { + +using namespace Xbyak::util; + +void EmitX64::EmitSHA256Hash(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const bool part1 = args[3].GetImmediateU1(); + + ASSERT(code.HasHostFeature(HostFeature::SHA)); + + // 3 2 1 0 + // x = d c b a + // y = h g f e + // w = wk3 wk2 wk1 wk0 + + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm w = ctx.reg_alloc.UseXmm(args[2]); + + // x64 expects: + // 3 2 1 0 + // src1 = c d g h + // src2 = a b e f + // xmm0 = - - wk1 wk0 + + code.movaps(xmm0, y); + code.shufps(xmm0, x, 0b10111011); // src1 + code.shufps(y, x, 0b00010001); // src2 + code.movaps(x, xmm0); + + code.movaps(xmm0, w); + code.sha256rnds2(x, y); + + code.punpckhqdq(xmm0, xmm0); + code.sha256rnds2(y, x); + + code.shufps(y, x, part1 ? 
0b10111011 : 0b00010001); + + ctx.reg_alloc.DefineValue(inst, y); +} + +void EmitX64::EmitSHA256MessageSchedule0(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ASSERT(code.HasHostFeature(HostFeature::SHA)); + + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); + + code.sha256msg1(x, y); + + ctx.reg_alloc.DefineValue(inst, x); +} + +void EmitX64::EmitSHA256MessageSchedule1(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ASSERT(code.HasHostFeature(HostFeature::SHA)); + + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm z = ctx.reg_alloc.UseXmm(args[2]); + + code.movaps(xmm0, z); + code.palignr(xmm0, y, 4); + code.paddd(x, xmm0); + code.sha256msg2(x, z); + + ctx.reg_alloc.DefineValue(inst, x); +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_sm4.cpp b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_sm4.cpp new file mode 100644 index 0000000000..b084a92f91 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_sm4.cpp @@ -0,0 +1,21 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/x64/block_of_code.h" +#include "dynarmic/backend/x64/emit_x64.h" +#include "dynarmic/common/crypto/sm4.h" +#include "dynarmic/ir/microinstruction.h" + +namespace Dynarmic::Backend::X64 { + +void EmitX64::EmitSM4AccessSubstitutionBox(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ctx.reg_alloc.HostCall(inst, args[0]); + code.CallFunction(&Common::Crypto::SM4::AccessSubstitutionBox); + code.movzx(code.ABI_RETURN.cvt32(), code.ABI_RETURN.cvt8()); +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp new file mode 100644 index 0000000000..47180de04d --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp @@ -0,0 +1,5929 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <algorithm> +#include <bit> +#include <bitset> +#include <cstdlib> +#include <type_traits> + +#include <mcl/assert.hpp> +#include <mcl/bit/bit_count.hpp> +#include <mcl/bit/bit_field.hpp> +#include <mcl/bitsizeof.hpp> +#include <mcl/stdint.hpp> +#include <mcl/type_traits/function_info.hpp> +#include <xbyak/xbyak.h> + +#include "dynarmic/backend/x64/abi.h" +#include "dynarmic/backend/x64/block_of_code.h" +#include "dynarmic/backend/x64/constants.h" +#include "dynarmic/backend/x64/emit_x64.h" +#include "dynarmic/common/math_util.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" + +namespace Dynarmic::Backend::X64 { + +using namespace Xbyak::util; + +template<typename Function> +static void EmitVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + + (code.*fn)(xmm_a, xmm_b); + + ctx.reg_alloc.DefineValue(inst, xmm_a); +} + +template<typename Function> +static void EmitAVXVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + + (code.*fn)(xmm_a, xmm_a, xmm_b); + + ctx.reg_alloc.DefineValue(inst, xmm_a); +} + +template<typename Lambda> +static void EmitOneArgumentFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) { + const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda); + constexpr u32 stack_space = 2 * 16; + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + ctx.reg_alloc.EndOfAllocScope(); + + ctx.reg_alloc.HostCall(nullptr); + ctx.reg_alloc.AllocStackSpace(stack_space + ABI_SHADOW_SPACE); + code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]); + code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]); + + code.movaps(xword[code.ABI_PARAM2], arg1); + code.CallFunction(fn); + code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]); + + ctx.reg_alloc.ReleaseStackSpace(stack_space + ABI_SHADOW_SPACE); + + ctx.reg_alloc.DefineValue(inst, result); +} + +template<typename Lambda> +static void EmitOneArgumentFallbackWithSaturation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) { + const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda); + constexpr u32 stack_space = 2 * 16; + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + ctx.reg_alloc.EndOfAllocScope(); + + ctx.reg_alloc.HostCall(nullptr); + ctx.reg_alloc.AllocStackSpace(stack_space + ABI_SHADOW_SPACE); + code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]); + code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]); + + code.movaps(xword[code.ABI_PARAM2], arg1); + code.CallFunction(fn); + code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]); + + ctx.reg_alloc.ReleaseStackSpace(stack_space + ABI_SHADOW_SPACE); + + code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], code.ABI_RETURN.cvt8()); + + 
ctx.reg_alloc.DefineValue(inst, result); +} + +template<typename Lambda> +static void EmitTwoArgumentFallbackWithSaturation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) { + const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda); + constexpr u32 stack_space = 3 * 16; + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + ctx.reg_alloc.EndOfAllocScope(); + + ctx.reg_alloc.HostCall(nullptr); + ctx.reg_alloc.AllocStackSpace(stack_space + ABI_SHADOW_SPACE); + code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]); + code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]); + code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]); + + code.movaps(xword[code.ABI_PARAM2], arg1); + code.movaps(xword[code.ABI_PARAM3], arg2); + code.CallFunction(fn); + code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]); + + ctx.reg_alloc.ReleaseStackSpace(stack_space + ABI_SHADOW_SPACE); + + code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], code.ABI_RETURN.cvt8()); + + ctx.reg_alloc.DefineValue(inst, result); +} + +template<typename Lambda> +static void EmitTwoArgumentFallbackWithSaturationAndImmediate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) { + const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda); + constexpr u32 stack_space = 2 * 16; + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]); + const u8 arg2 = args[1].GetImmediateU8(); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + ctx.reg_alloc.EndOfAllocScope(); + + ctx.reg_alloc.HostCall(nullptr); + ctx.reg_alloc.AllocStackSpace(stack_space + ABI_SHADOW_SPACE); + code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]); + code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]); + + code.movaps(xword[code.ABI_PARAM2], arg1); + code.mov(code.ABI_PARAM3, arg2); + code.CallFunction(fn); + code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]); + + ctx.reg_alloc.ReleaseStackSpace(stack_space + ABI_SHADOW_SPACE); + + code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], code.ABI_RETURN.cvt8()); + + ctx.reg_alloc.DefineValue(inst, result); +} + +template<typename Lambda> +static void EmitTwoArgumentFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) { + const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda); + constexpr u32 stack_space = 3 * 16; + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + ctx.reg_alloc.EndOfAllocScope(); + + ctx.reg_alloc.HostCall(nullptr); + ctx.reg_alloc.AllocStackSpace(stack_space + ABI_SHADOW_SPACE); + code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]); + code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]); + code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]); + + code.movaps(xword[code.ABI_PARAM2], arg1); + code.movaps(xword[code.ABI_PARAM3], arg2); + code.CallFunction(fn); + code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]); + + ctx.reg_alloc.ReleaseStackSpace(stack_space + ABI_SHADOW_SPACE); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void 
EmitX64::EmitVectorGetElement8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(args[1].IsImmediate()); + const u8 index = args[1].GetImmediateU8(); + + // TODO: DefineValue directly on Argument for index == 0 + + const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Reg32 dest = ctx.reg_alloc.ScratchGpr().cvt32(); + + if (code.HasHostFeature(HostFeature::SSE41)) { + code.pextrb(dest, source, index); + } else { + code.pextrw(dest, source, u8(index / 2)); + if (index % 2 == 1) { + code.shr(dest, 8); + } else { + code.and_(dest, 0xFF); // TODO: Remove when zext handling is corrected + } + } + + ctx.reg_alloc.DefineValue(inst, dest); +} + +void EmitX64::EmitVectorGetElement16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(args[1].IsImmediate()); + const u8 index = args[1].GetImmediateU8(); + + // TODO: DefineValue directly on Argument for index == 0 + + const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Reg32 dest = ctx.reg_alloc.ScratchGpr().cvt32(); + code.pextrw(dest, source, index); + ctx.reg_alloc.DefineValue(inst, dest); +} + +void EmitX64::EmitVectorGetElement32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(args[1].IsImmediate()); + const u8 index = args[1].GetImmediateU8(); + + // TODO: DefineValue directly on Argument for index == 0 + + const Xbyak::Reg32 dest = ctx.reg_alloc.ScratchGpr().cvt32(); + + if (code.HasHostFeature(HostFeature::SSE41)) { + const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(args[0]); + code.pextrd(dest, source, index); + } else { + const Xbyak::Xmm source = ctx.reg_alloc.UseScratchXmm(args[0]); + code.pshufd(source, source, index); + code.movd(dest, source); + } + + ctx.reg_alloc.DefineValue(inst, dest); +} + +void EmitX64::EmitVectorGetElement64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(args[1].IsImmediate()); + const u8 index = args[1].GetImmediateU8(); + + if (index == 0) { + // TODO: DefineValue directly on Argument for index == 0 + const Xbyak::Reg64 dest = ctx.reg_alloc.ScratchGpr().cvt64(); + const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(args[0]); + code.movq(dest, source); + ctx.reg_alloc.DefineValue(inst, dest); + return; + } + + const Xbyak::Reg64 dest = ctx.reg_alloc.ScratchGpr().cvt64(); + + if (code.HasHostFeature(HostFeature::SSE41)) { + const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(args[0]); + code.pextrq(dest, source, 1); + } else { + const Xbyak::Xmm source = ctx.reg_alloc.UseScratchXmm(args[0]); + code.punpckhqdq(source, source); + code.movq(dest, source); + } + + ctx.reg_alloc.DefineValue(inst, dest); +} + +void EmitX64::EmitVectorSetElement8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(args[1].IsImmediate()); + const u8 index = args[1].GetImmediateU8(); + const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(args[0]); + + if (code.HasHostFeature(HostFeature::SSE41)) { + const Xbyak::Reg8 source_elem = ctx.reg_alloc.UseGpr(args[2]).cvt8(); + + code.pinsrb(source_vector, source_elem.cvt32(), index); + + ctx.reg_alloc.DefineValue(inst, source_vector); + } else { + const Xbyak::Reg32 source_elem = ctx.reg_alloc.UseScratchGpr(args[2]).cvt32(); + const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32(); + + code.pextrw(tmp, source_vector, index / 2); + if (index % 2 == 0) { + code.and_(tmp, 0xFF00); + code.and_(source_elem, 0x00FF); + code.or_(tmp, 
source_elem); + } else { + code.and_(tmp, 0x00FF); + code.shl(source_elem, 8); + code.or_(tmp, source_elem); + } + code.pinsrw(source_vector, tmp, index / 2); + + ctx.reg_alloc.DefineValue(inst, source_vector); + } +} + +void EmitX64::EmitVectorSetElement16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(args[1].IsImmediate()); + const u8 index = args[1].GetImmediateU8(); + + const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Reg16 source_elem = ctx.reg_alloc.UseGpr(args[2]).cvt16(); + + code.pinsrw(source_vector, source_elem.cvt32(), index); + + ctx.reg_alloc.DefineValue(inst, source_vector); +} + +void EmitX64::EmitVectorSetElement32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(args[1].IsImmediate()); + const u8 index = args[1].GetImmediateU8(); + const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(args[0]); + + if (code.HasHostFeature(HostFeature::SSE41)) { + const Xbyak::Reg32 source_elem = ctx.reg_alloc.UseGpr(args[2]).cvt32(); + + code.pinsrd(source_vector, source_elem, index); + + ctx.reg_alloc.DefineValue(inst, source_vector); + } else { + const Xbyak::Reg32 source_elem = ctx.reg_alloc.UseScratchGpr(args[2]).cvt32(); + + code.pinsrw(source_vector, source_elem, index * 2); + code.shr(source_elem, 16); + code.pinsrw(source_vector, source_elem, index * 2 + 1); + + ctx.reg_alloc.DefineValue(inst, source_vector); + } +} + +void EmitX64::EmitVectorSetElement64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(args[1].IsImmediate()); + const u8 index = args[1].GetImmediateU8(); + const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(args[0]); + + if (code.HasHostFeature(HostFeature::SSE41)) { + const Xbyak::Reg64 source_elem = ctx.reg_alloc.UseGpr(args[2]); + + code.pinsrq(source_vector, source_elem, index); + + ctx.reg_alloc.DefineValue(inst, source_vector); + } else { + const Xbyak::Reg64 source_elem = ctx.reg_alloc.UseGpr(args[2]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.movq(tmp, source_elem); + + if (index == 0) { + code.movsd(source_vector, tmp); + } else { + code.punpcklqdq(source_vector, tmp); + } + + ctx.reg_alloc.DefineValue(inst, source_vector); + } +} + +static void VectorAbs8(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) { + if (code.HasHostFeature(HostFeature::SSSE3)) { + code.pabsb(data, data); + } else { + const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(); + code.pxor(temp, temp); + code.psubb(temp, data); + code.pminub(data, temp); + } +} + +static void VectorAbs16(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) { + if (code.HasHostFeature(HostFeature::SSSE3)) { + code.pabsw(data, data); + } else { + const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(); + code.pxor(temp, temp); + code.psubw(temp, data); + code.pmaxsw(data, temp); + } +} + +static void VectorAbs32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) { + if (code.HasHostFeature(HostFeature::SSSE3)) { + code.pabsd(data, data); + } else { + const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(); + code.movdqa(temp, data); + code.psrad(temp, 31); + code.pxor(data, temp); + code.psubd(data, temp); + } +} + +static void VectorAbs64(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) { + if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { + code.vpabsq(data, data); + } else { + const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(); + code.pshufd(temp, data, 
0b11110101); + code.psrad(temp, 31); + code.pxor(data, temp); + code.psubq(data, temp); + } +} + +static void EmitVectorAbs(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + + switch (esize) { + case 8: + VectorAbs8(code, ctx, data); + break; + case 16: + VectorAbs16(code, ctx, data); + break; + case 32: + VectorAbs32(code, ctx, data); + break; + case 64: + VectorAbs64(code, ctx, data); + break; + } + + ctx.reg_alloc.DefineValue(inst, data); +} + +void EmitX64::EmitVectorAbs8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorAbs(8, ctx, inst, code); +} + +void EmitX64::EmitVectorAbs16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorAbs(16, ctx, inst, code); +} + +void EmitX64::EmitVectorAbs32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorAbs(32, ctx, inst, code); +} + +void EmitX64::EmitVectorAbs64(EmitContext& ctx, IR::Inst* inst) { + EmitVectorAbs(64, ctx, inst, code); +} + +void EmitX64::EmitVectorAdd8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddb); +} + +void EmitX64::EmitVectorAdd16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddw); +} + +void EmitX64::EmitVectorAdd32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddd); +} + +void EmitX64::EmitVectorAdd64(EmitContext& ctx, IR::Inst* inst) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::paddq); +} + +void EmitX64::EmitVectorAnd(EmitContext& ctx, IR::Inst* inst) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pand); +} + +void EmitX64::EmitVectorAndNot(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]); + + code.pandn(xmm_b, xmm_a); + + ctx.reg_alloc.DefineValue(inst, xmm_b); +} + +static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, const Xbyak::Xmm& result, u8 shift_amount) { + if (code.HasHostFeature(HostFeature::GFNI)) { + const u64 shift_matrix = shift_amount < 8 + ? 
(0x0102040810204080 << (shift_amount * 8)) | (0x8080808080808080 >> (64 - shift_amount * 8)) + : 0x8080808080808080; + code.gf2p8affineqb(result, code.Const(xword, shift_matrix, shift_matrix), 0); + return; + } + + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.punpckhbw(tmp, result); + code.punpcklbw(result, result); + code.psraw(tmp, 8 + shift_amount); + code.psraw(result, 8 + shift_amount); + code.packsswb(result, tmp); +} + +void EmitX64::EmitVectorArithmeticShiftRight8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const u8 shift_amount = args[1].GetImmediateU8(); + + ArithmeticShiftRightByte(ctx, code, result, shift_amount); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitVectorArithmeticShiftRight16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const u8 shift_amount = args[1].GetImmediateU8(); + + code.psraw(result, shift_amount); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitVectorArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const u8 shift_amount = args[1].GetImmediateU8(); + + code.psrad(result, shift_amount); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitVectorArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const u8 shift_amount = std::min(args[1].GetImmediateU8(), u8(63)); + + if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { + code.vpsraq(result, result, shift_amount); + } else { + const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(); + + const u64 sign_bit = 0x80000000'00000000u >> shift_amount; + + code.pxor(tmp2, tmp2); + code.psrlq(result, shift_amount); + code.movdqa(tmp1, code.Const(xword, sign_bit, sign_bit)); + code.pand(tmp1, result); + code.psubq(tmp2, tmp1); + code.por(result, tmp2); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +template<typename T> +static constexpr T VShift(T x, T y) { + const s8 shift_amount = static_cast<s8>(static_cast<u8>(y)); + const s64 bit_size = static_cast<s64>(mcl::bitsizeof<T>); + + if constexpr (std::is_signed_v<T>) { + if (shift_amount >= bit_size) { + return 0; + } + + if (shift_amount <= -bit_size) { + // Parentheses necessary, as MSVC doesn't appear to consider cast parentheses + // as a grouping in terms of precedence, causing warning C4554 to fire. 
See: + // https://developercommunity.visualstudio.com/content/problem/144783/msvc-2017-does-not-understand-that-static-cast-cou.html + return x >> (T(bit_size - 1)); + } + } else if (shift_amount <= -bit_size || shift_amount >= bit_size) { + return 0; + } + + if (shift_amount < 0) { + return x >> T(-shift_amount); + } + + using unsigned_type = std::make_unsigned_t<T>; + return static_cast<T>(static_cast<unsigned_type>(x) << static_cast<unsigned_type>(shift_amount)); +} + +void EmitX64::EmitVectorArithmeticVShift8(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s8>& result, const VectorArray<s8>& a, const VectorArray<s8>& b) { + std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<s8>); + }); +} + +void EmitX64::EmitVectorArithmeticVShift16(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm right_shift = xmm16; + const Xbyak::Xmm tmp = xmm17; + + code.vmovdqa32(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF)); + code.vpxord(right_shift, right_shift, right_shift); + code.vpsubw(right_shift, right_shift, left_shift); + + code.vpsllw(xmm0, left_shift, 8); + code.vpsraw(xmm0, xmm0, 15); + + const Xbyak::Opmask mask = k1; + code.vpmovb2m(mask, xmm0); + + code.vpandd(right_shift, right_shift, tmp); + code.vpandd(left_shift, left_shift, tmp); + + code.vpsravw(tmp, result, right_shift); + code.vpsllvw(result, result, left_shift); + code.vpblendmb(result | mask, result, tmp); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s16>& result, const VectorArray<s16>& a, const VectorArray<s16>& b) { + std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<s16>); + }); +} + +void EmitX64::EmitVectorArithmeticVShift32(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::AVX2)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.vmovdqa(tmp, code.Const(xword, 0x000000FF000000FF, 0x000000FF000000FF)); + code.vpxor(right_shift, right_shift, right_shift); + code.vpsubd(right_shift, right_shift, left_shift); + + code.vpslld(xmm0, left_shift, 24); + + code.vpand(right_shift, right_shift, tmp); + code.vpand(left_shift, left_shift, tmp); + + code.vpsravd(tmp, result, right_shift); + code.vpsllvd(result, result, left_shift); + code.blendvps(result, tmp); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s32>& result, const VectorArray<s32>& a, const VectorArray<s32>& b) { + std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<s32>); + }); +} + +void EmitX64::EmitVectorArithmeticVShift64(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm right_shift = xmm16; + const Xbyak::Xmm tmp = 
xmm17; + + code.vmovdqa32(tmp, code.Const(xword, 0x00000000000000FF, 0x00000000000000FF)); + code.vpxorq(right_shift, right_shift, right_shift); + code.vpsubq(right_shift, right_shift, left_shift); + + code.vpsllq(xmm0, left_shift, 56); + const Xbyak::Opmask mask = k1; + code.vpmovq2m(mask, xmm0); + + code.vpandq(right_shift, right_shift, tmp); + code.vpandq(left_shift, left_shift, tmp); + + code.vpsravq(tmp, result, right_shift); + code.vpsllvq(result, result, left_shift); + code.vpblendmq(result | mask, result, tmp); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& a, const VectorArray<s64>& b) { + std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<s64>); + }); +} + +void EmitX64::EmitVectorBroadcastLower8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + + if (code.HasHostFeature(HostFeature::AVX2)) { + code.vpbroadcastb(a, a); + code.vmovq(a, a); + } else if (code.HasHostFeature(HostFeature::SSSE3)) { + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.pxor(tmp, tmp); + code.pshufb(a, tmp); + code.movq(a, a); + } else { + code.punpcklbw(a, a); + code.pshuflw(a, a, 0); + } + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorBroadcastLower16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + + code.pshuflw(a, a, 0); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorBroadcastLower32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + + code.pshuflw(a, a, 0b01000100); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorBroadcast8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + + if (code.HasHostFeature(HostFeature::AVX2)) { + code.vpbroadcastb(a, a); + } else if (code.HasHostFeature(HostFeature::SSSE3)) { + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.pxor(tmp, tmp); + code.pshufb(a, tmp); + } else { + code.punpcklbw(a, a); + code.pshuflw(a, a, 0); + code.punpcklqdq(a, a); + } + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorBroadcast16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + + if (code.HasHostFeature(HostFeature::AVX2)) { + code.vpbroadcastw(a, a); + } else { + code.pshuflw(a, a, 0); + code.punpcklqdq(a, a); + } + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorBroadcast32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + + if (code.HasHostFeature(HostFeature::AVX2)) { + code.vpbroadcastd(a, a); + } else { + code.pshufd(a, a, 0); + } + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorBroadcast64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + + if (code.HasHostFeature(HostFeature::AVX2)) { + code.vpbroadcastq(a, a); + } else { + code.punpcklqdq(a, a); + } + + ctx.reg_alloc.DefineValue(inst, a); +} + +void 
EmitX64::EmitVectorBroadcastElementLower8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + ASSERT(args[1].IsImmediate()); + const u8 index = args[1].GetImmediateU8(); + ASSERT(index < 16); + + if (index > 0) { + code.psrldq(a, index); + } + + if (code.HasHostFeature(HostFeature::AVX2)) { + code.vpbroadcastb(a, a); + code.vmovq(a, a); + } else if (code.HasHostFeature(HostFeature::SSSE3)) { + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.pxor(tmp, tmp); + code.pshufb(a, tmp); + code.movq(a, a); + } else { + code.punpcklbw(a, a); + code.pshuflw(a, a, 0); + } + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorBroadcastElementLower16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + ASSERT(args[1].IsImmediate()); + const u8 index = args[1].GetImmediateU8(); + ASSERT(index < 8); + + if (index > 0) { + code.psrldq(a, u8(index * 2)); + } + + code.pshuflw(a, a, 0); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorBroadcastElementLower32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + ASSERT(args[1].IsImmediate()); + const u8 index = args[1].GetImmediateU8(); + ASSERT(index < 4); + + if (index > 0) { + code.psrldq(a, u8(index * 4)); + } + + code.pshuflw(a, a, 0b01'00'01'00); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorBroadcastElement8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + ASSERT(args[1].IsImmediate()); + const u8 index = args[1].GetImmediateU8(); + ASSERT(index < 16); + + if (index > 0) { + code.psrldq(a, index); + } + + if (code.HasHostFeature(HostFeature::AVX2)) { + code.vpbroadcastb(a, a); + } else if (code.HasHostFeature(HostFeature::SSSE3)) { + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.pxor(tmp, tmp); + code.pshufb(a, tmp); + } else { + code.punpcklbw(a, a); + code.pshuflw(a, a, 0); + code.punpcklqdq(a, a); + } + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorBroadcastElement16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + ASSERT(args[1].IsImmediate()); + const u8 index = args[1].GetImmediateU8(); + ASSERT(index < 8); + + if (index == 0 && code.HasHostFeature(HostFeature::AVX2)) { + code.vpbroadcastw(a, a); + + ctx.reg_alloc.DefineValue(inst, a); + return; + } + + if (index < 4) { + code.pshuflw(a, a, mcl::bit::replicate_element<2, u8>(index)); + code.punpcklqdq(a, a); + } else { + code.pshufhw(a, a, mcl::bit::replicate_element<2, u8>(u8(index - 4))); + code.punpckhqdq(a, a); + } + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorBroadcastElement32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + ASSERT(args[1].IsImmediate()); + const u8 index = args[1].GetImmediateU8(); + ASSERT(index < 4); + + code.pshufd(a, a, mcl::bit::replicate_element<2, u8>(index)); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorBroadcastElement64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = 
ctx.reg_alloc.UseScratchXmm(args[0]); + ASSERT(args[1].IsImmediate()); + const u8 index = args[1].GetImmediateU8(); + ASSERT(index < 2); + + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpermilpd(a, a, mcl::bit::replicate_element<1, u8>(index)); + } else { + if (index == 0) { + code.punpcklqdq(a, a); + } else { + code.punpckhqdq(a, a); + } + } + ctx.reg_alloc.DefineValue(inst, a); +} + +template<typename T> +static void EmitVectorCountLeadingZeros(VectorArray<T>& result, const VectorArray<T>& data) { + for (size_t i = 0; i < result.size(); i++) { + T element = data[i]; + + size_t count = mcl::bitsizeof<T>; + while (element != 0) { + element >>= 1; + --count; + } + + result[i] = static_cast<T>(count); + } +} + +void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::GFNI)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + // Reverse bits: + code.gf2p8affineqb(data, code.BConst<64>(xword, 0x8040201008040201), 0); + + // Perform a tzcnt: + // Isolate lowest set bit + code.pcmpeqb(result, result); + code.paddb(result, data); + code.pandn(result, data); + // Convert lowest set bit into an index + code.gf2p8affineqb(result, code.BConst<64>(xword, 0xaaccf0ff'00000000), 8); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + if (code.HasHostFeature(HostFeature::SSSE3)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(tmp1, code.Const(xword, 0x0101010102020304, 0x0000000000000000)); + code.movdqa(tmp2, tmp1); + + code.pshufb(tmp2, data); + code.psrlw(data, 4); + code.pand(data, code.Const(xword, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F)); + code.pshufb(tmp1, data); + + code.movdqa(data, code.Const(xword, 0x0404040404040404, 0x0404040404040404)); + + code.pcmpeqb(data, tmp1); + code.pand(data, tmp2); + code.paddb(data, tmp1); + + ctx.reg_alloc.DefineValue(inst, data); + return; + } + + EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u8>); +} + +void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::AVX)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.vpsrlw(tmp, data, 1); + code.vpor(data, data, tmp); + code.vpsrlw(tmp, data, 2); + code.vpor(data, data, tmp); + code.vpsrlw(tmp, data, 4); + code.vpor(data, data, tmp); + code.vpsrlw(tmp, data, 8); + code.vpor(data, data, tmp); + code.vpcmpeqw(zeros, zeros, zeros); + code.vpcmpeqw(tmp, tmp, tmp); + code.vpcmpeqw(zeros, zeros, data); + code.vpmullw(data, data, code.Const(xword, 0xf0d3f0d3f0d3f0d3, 0xf0d3f0d3f0d3f0d3)); + code.vpsllw(tmp, tmp, 15); + code.vpsllw(zeros, zeros, 7); + code.vpsrlw(data, data, 12); + code.vmovdqa(result, code.Const(xword, 0x0903060a040b0c10, 0x0f080e0207050d01)); + code.vpor(tmp, tmp, zeros); + code.vpor(data, data, tmp); + code.vpshufb(result, result, data); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + if (code.HasHostFeature(HostFeature::SSSE3)) { + auto args = 
ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(tmp, data); + code.psrlw(tmp, 1); + code.por(data, tmp); + code.movdqa(tmp, data); + code.psrlw(tmp, 2); + code.por(data, tmp); + code.movdqa(tmp, data); + code.psrlw(tmp, 4); + code.por(data, tmp); + code.movdqa(tmp, data); + code.psrlw(tmp, 8); + code.por(data, tmp); + code.pcmpeqw(zeros, zeros); + code.pcmpeqw(tmp, tmp); + code.pcmpeqw(zeros, data); + code.pmullw(data, code.Const(xword, 0xf0d3f0d3f0d3f0d3, 0xf0d3f0d3f0d3f0d3)); + code.psllw(tmp, 15); + code.psllw(zeros, 7); + code.psrlw(data, 12); + code.movdqa(result, code.Const(xword, 0x0903060a040b0c10, 0x0f080e0207050d01)); + code.por(tmp, zeros); + code.por(data, tmp); + code.pshufb(result, data); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u16>); +} + +void EmitX64::EmitVectorCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512CD)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + code.vplzcntd(data, data); + + ctx.reg_alloc.DefineValue(inst, data); + return; + } + + EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u32>); +} + +void EmitX64::EmitVectorDeinterleaveEven8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF)); + code.pand(lhs, tmp); + code.pand(rhs, tmp); + code.packuswb(lhs, rhs); + + ctx.reg_alloc.DefineValue(inst, lhs); +} + +void EmitX64::EmitVectorDeinterleaveEven16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]); + + if (code.HasHostFeature(HostFeature::SSE41)) { + const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(); + code.pxor(zero, zero); + + code.pblendw(lhs, zero, 0b10101010); + code.pblendw(rhs, zero, 0b10101010); + code.packusdw(lhs, rhs); + } else { + code.pslld(lhs, 16); + code.psrad(lhs, 16); + + code.pslld(rhs, 16); + code.psrad(rhs, 16); + + code.packssdw(lhs, rhs); + } + + ctx.reg_alloc.DefineValue(inst, lhs); +} + +void EmitX64::EmitVectorDeinterleaveEven32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]); + + code.shufps(lhs, rhs, 0b10001000); + + ctx.reg_alloc.DefineValue(inst, lhs); +} + +void EmitX64::EmitVectorDeinterleaveEven64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]); + + code.shufpd(lhs, rhs, 0b00); + + ctx.reg_alloc.DefineValue(inst, lhs); +} + +void EmitX64::EmitVectorDeinterleaveEvenLower8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm lhs = 
ctx.reg_alloc.UseScratchXmm(args[0]); + + if (code.HasHostFeature(HostFeature::SSSE3)) { + const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]); + + code.punpcklbw(lhs, rhs); + code.pshufb(lhs, code.Const(xword, 0x0D'09'05'01'0C'08'04'00, 0x8080808080808080)); + } else { + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]); + + code.movdqa(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF)); + code.pand(lhs, tmp); + code.pand(rhs, tmp); + code.packuswb(lhs, rhs); + code.pshufd(lhs, lhs, 0b11011000); + code.movq(lhs, lhs); + } + + ctx.reg_alloc.DefineValue(inst, lhs); +} + +void EmitX64::EmitVectorDeinterleaveEvenLower16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); + + if (code.HasHostFeature(HostFeature::SSSE3)) { + const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]); + + code.punpcklwd(lhs, rhs); + code.pshufb(lhs, code.Const(xword, 0x0B0A'0302'0908'0100, 0x8080'8080'8080'8080)); + } else { + const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]); + + code.pslld(lhs, 16); + code.psrad(lhs, 16); + + code.pslld(rhs, 16); + code.psrad(rhs, 16); + + code.packssdw(lhs, rhs); + code.pshufd(lhs, lhs, 0b11011000); + code.movq(lhs, lhs); + } + + ctx.reg_alloc.DefineValue(inst, lhs); +} + +void EmitX64::EmitVectorDeinterleaveEvenLower32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]); + + if (code.HasHostFeature(HostFeature::SSE41)) { + // copy bytes 0:3 of rhs to lhs, zero out upper 8 bytes + code.insertps(lhs, rhs, 0b00011100); + } else { + code.unpcklps(lhs, rhs); + code.movq(lhs, lhs); + } + + ctx.reg_alloc.DefineValue(inst, lhs); +} + +void EmitX64::EmitVectorDeinterleaveOdd8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]); + + code.psraw(lhs, 8); + code.psraw(rhs, 8); + code.packsswb(lhs, rhs); + + ctx.reg_alloc.DefineValue(inst, lhs); +} + +void EmitX64::EmitVectorDeinterleaveOdd16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]); + + code.psrad(lhs, 16); + code.psrad(rhs, 16); + code.packssdw(lhs, rhs); + + ctx.reg_alloc.DefineValue(inst, lhs); +} + +void EmitX64::EmitVectorDeinterleaveOdd32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]); + + code.shufps(lhs, rhs, 0b11011101); + + ctx.reg_alloc.DefineValue(inst, lhs); +} + +void EmitX64::EmitVectorDeinterleaveOdd64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]); + + code.shufpd(lhs, rhs, 0b11); + + ctx.reg_alloc.DefineValue(inst, lhs); +} + +void EmitX64::EmitVectorDeinterleaveOddLower8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); + + if 
(code.HasHostFeature(HostFeature::SSSE3)) { + const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]); + + code.punpcklbw(lhs, rhs); + code.pshufb(lhs, code.Const(xword, 0x0F'0B'07'03'0E'0A'06'02, 0x8080808080808080)); + } else { + const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]); + + code.psraw(lhs, 8); + code.psraw(rhs, 8); + code.packsswb(lhs, rhs); + code.pshufd(lhs, lhs, 0b11011000); + code.movq(lhs, lhs); + } + + ctx.reg_alloc.DefineValue(inst, lhs); +} + +void EmitX64::EmitVectorDeinterleaveOddLower16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); + + if (code.HasHostFeature(HostFeature::SSSE3)) { + const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]); + + code.punpcklwd(lhs, rhs); + code.pshufb(lhs, code.Const(xword, 0x0F0E'0706'0D0C'0504, 0x8080'8080'8080'8080)); + } else { + const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]); + + code.psrad(lhs, 16); + code.psrad(rhs, 16); + code.packssdw(lhs, rhs); + code.pshufd(lhs, lhs, 0b11011000); + code.movq(lhs, lhs); + } + + ctx.reg_alloc.DefineValue(inst, lhs); +} + +void EmitX64::EmitVectorDeinterleaveOddLower32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (code.HasHostFeature(HostFeature::SSE41)) { + const Xbyak::Xmm lhs = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(args[1]); + + // copy bytes 4:7 of lhs to bytes 0:3 of rhs, zero out upper 8 bytes + code.insertps(rhs, lhs, 0b01001100); + + ctx.reg_alloc.DefineValue(inst, rhs); + } else { + const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(); + + code.xorps(zero, zero); + code.unpcklps(lhs, rhs); + code.unpckhpd(lhs, zero); + + ctx.reg_alloc.DefineValue(inst, lhs); + } +} + +void EmitX64::EmitVectorEor(EmitContext& ctx, IR::Inst* inst) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pxor); +} + +void EmitX64::EmitVectorEqual8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpeqb); +} + +void EmitX64::EmitVectorEqual16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpeqw); +} + +void EmitX64::EmitVectorEqual32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpeqd); +} + +void EmitX64::EmitVectorEqual64(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpeqq); + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.pcmpeqd(xmm_a, xmm_b); + code.pshufd(tmp, xmm_a, 0b10110001); + code.pand(xmm_a, tmp); + + ctx.reg_alloc.DefineValue(inst, xmm_a); +} + +void EmitX64::EmitVectorEqual128(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (code.HasHostFeature(HostFeature::SSE41)) { + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.pcmpeqq(xmm_a, xmm_b); + code.pshufd(tmp, xmm_a, 0b01001110); + code.pand(xmm_a, tmp); + + 
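+        // pcmpeqq leaves a per-qword equality mask; swapping the qwords with pshufd and ANDing folds the two halves together, so xmm_a is all-ones only when the full 128 bits compare equal.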
ctx.reg_alloc.DefineValue(inst, xmm_a); + } else { + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.pcmpeqd(xmm_a, xmm_b); + code.pshufd(tmp, xmm_a, 0b10110001); + code.pand(xmm_a, tmp); + code.pshufd(tmp, xmm_a, 0b01001110); + code.pand(xmm_a, tmp); + + ctx.reg_alloc.DefineValue(inst, xmm_a); + } +} + +void EmitX64::EmitVectorExtract(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const u8 position = args[2].GetImmediateU8(); + ASSERT(position % 8 == 0); + + if (position == 0) { + ctx.reg_alloc.DefineValue(inst, args[0]); + return; + } + + if (code.HasHostFeature(HostFeature::SSSE3)) { + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]); + + code.palignr(xmm_b, xmm_a, position / 8); + ctx.reg_alloc.DefineValue(inst, xmm_b); + return; + } + + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]); + + code.psrldq(xmm_a, position / 8); + code.pslldq(xmm_b, (128 - position) / 8); + code.por(xmm_a, xmm_b); + + ctx.reg_alloc.DefineValue(inst, xmm_a); +} + +void EmitX64::EmitVectorExtractLower(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + + const u8 position = args[2].GetImmediateU8(); + ASSERT(position % 8 == 0); + + if (position != 0) { + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + + code.punpcklqdq(xmm_a, xmm_b); + code.psrldq(xmm_a, position / 8); + } + code.movq(xmm_a, xmm_a); + + ctx.reg_alloc.DefineValue(inst, xmm_a); +} + +void EmitX64::EmitVectorGreaterS8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpgtb); +} + +void EmitX64::EmitVectorGreaterS16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpgtw); +} + +void EmitX64::EmitVectorGreaterS32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpgtd); +} + +void EmitX64::EmitVectorGreaterS64(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE42)) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpgtq); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<s64>& a, const VectorArray<s64>& b) { + for (size_t i = 0; i < result.size(); ++i) { + result[i] = (a[i] > b[i]) ? 
~u64(0) : 0; + } + }); +} + +static void EmitVectorHalvingAddSigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(tmp, b); + code.pand(tmp, a); + code.pxor(a, b); + + switch (esize) { + case 8: + ArithmeticShiftRightByte(ctx, code, a, 1); + code.paddb(a, tmp); + break; + case 16: + code.psraw(a, 1); + code.paddw(a, tmp); + break; + case 32: + code.psrad(a, 1); + code.paddd(a, tmp); + break; + } + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorHalvingAddS8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorHalvingAddSigned(8, ctx, inst, code); +} + +void EmitX64::EmitVectorHalvingAddS16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorHalvingAddSigned(16, ctx, inst, code); +} + +void EmitX64::EmitVectorHalvingAddS32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorHalvingAddSigned(32, ctx, inst, code); +} + +static void EmitVectorHalvingAddUnsigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(tmp, b); + + switch (esize) { + case 8: + code.pavgb(tmp, a); + code.pxor(a, b); + code.pand(a, code.Const(xword, 0x0101010101010101, 0x0101010101010101)); + code.psubb(tmp, a); + break; + case 16: + code.pavgw(tmp, a); + code.pxor(a, b); + code.pand(a, code.Const(xword, 0x0001000100010001, 0x0001000100010001)); + code.psubw(tmp, a); + break; + case 32: + code.pand(tmp, a); + code.pxor(a, b); + code.psrld(a, 1); + code.paddd(tmp, a); + break; + } + + ctx.reg_alloc.DefineValue(inst, tmp); +} + +void EmitX64::EmitVectorHalvingAddU8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorHalvingAddUnsigned(8, ctx, inst, code); +} + +void EmitX64::EmitVectorHalvingAddU16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorHalvingAddUnsigned(16, ctx, inst, code); +} + +void EmitX64::EmitVectorHalvingAddU32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorHalvingAddUnsigned(32, ctx, inst, code); +} + +static void EmitVectorHalvingSubSigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); + + switch (esize) { + case 8: { + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + code.movdqa(tmp, code.Const(xword, 0x8080808080808080, 0x8080808080808080)); + code.pxor(a, tmp); + code.pxor(b, tmp); + code.pavgb(b, a); + code.psubb(a, b); + break; + } + case 16: { + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + code.movdqa(tmp, code.Const(xword, 0x8000800080008000, 0x8000800080008000)); + code.pxor(a, tmp); + code.pxor(b, tmp); + code.pavgw(b, a); + code.psubw(a, b); + break; + } + case 32: + code.pxor(a, b); + code.pand(b, a); + code.psrad(a, 1); + code.psubd(a, b); + break; + } + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorHalvingSubS8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorHalvingSubSigned(8, ctx, inst, code); +} + +void EmitX64::EmitVectorHalvingSubS16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorHalvingSubSigned(16, ctx, inst, code); +} + +void 
EmitX64::EmitVectorHalvingSubS32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorHalvingSubSigned(32, ctx, inst, code); +} + +static void EmitVectorHalvingSubUnsigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); + + switch (esize) { + case 8: + code.pavgb(b, a); + code.psubb(a, b); + break; + case 16: + code.pavgw(b, a); + code.psubw(a, b); + break; + case 32: + code.pxor(a, b); + code.pand(b, a); + code.psrld(a, 1); + code.psubd(a, b); + break; + } + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorHalvingSubU8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorHalvingSubUnsigned(8, ctx, inst, code); +} + +void EmitX64::EmitVectorHalvingSubU16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorHalvingSubUnsigned(16, ctx, inst, code); +} + +void EmitX64::EmitVectorHalvingSubU32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorHalvingSubUnsigned(32, ctx, inst, code); +} + +static void EmitVectorInterleaveLower(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int size) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]); + + switch (size) { + case 8: + code.punpcklbw(a, b); + break; + case 16: + code.punpcklwd(a, b); + break; + case 32: + code.punpckldq(a, b); + break; + case 64: + code.punpcklqdq(a, b); + break; + } + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorInterleaveLower8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorInterleaveLower(code, ctx, inst, 8); +} + +void EmitX64::EmitVectorInterleaveLower16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorInterleaveLower(code, ctx, inst, 16); +} + +void EmitX64::EmitVectorInterleaveLower32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorInterleaveLower(code, ctx, inst, 32); +} + +void EmitX64::EmitVectorInterleaveLower64(EmitContext& ctx, IR::Inst* inst) { + EmitVectorInterleaveLower(code, ctx, inst, 64); +} + +static void EmitVectorInterleaveUpper(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int size) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]); + + switch (size) { + case 8: + code.punpckhbw(a, b); + break; + case 16: + code.punpckhwd(a, b); + break; + case 32: + code.punpckhdq(a, b); + break; + case 64: + code.punpckhqdq(a, b); + break; + } + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorInterleaveUpper8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorInterleaveUpper(code, ctx, inst, 8); +} + +void EmitX64::EmitVectorInterleaveUpper16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorInterleaveUpper(code, ctx, inst, 16); +} + +void EmitX64::EmitVectorInterleaveUpper32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorInterleaveUpper(code, ctx, inst, 32); +} + +void EmitX64::EmitVectorInterleaveUpper64(EmitContext& ctx, IR::Inst* inst) { + EmitVectorInterleaveUpper(code, ctx, inst, 64); +} + +void EmitX64::EmitVectorLogicalShiftLeft8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const u8 shift_amount = args[1].GetImmediateU8(); + + if (shift_amount == 0) { + // do nothing + } else if (shift_amount >= 8) { + code.pxor(result, result); 
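+        // A byte shifted left by 8 or more is always zero, so just clear the register.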
+ } else if (shift_amount == 1) { + code.paddb(result, result); + } else if (code.HasHostFeature(HostFeature::GFNI)) { + const u64 shift_matrix = 0x0102040810204080 >> (shift_amount * 8); + code.gf2p8affineqb(result, code.Const(xword, shift_matrix, shift_matrix), 0); + } else { + const u64 replicand = (0xFFULL << shift_amount) & 0xFF; + const u64 mask = mcl::bit::replicate_element<u8, u64>(replicand); + + code.psllw(result, shift_amount); + code.pand(result, code.Const(xword, mask, mask)); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitVectorLogicalShiftLeft16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const u8 shift_amount = args[1].GetImmediateU8(); + + code.psllw(result, shift_amount); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitVectorLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const u8 shift_amount = args[1].GetImmediateU8(); + + code.pslld(result, shift_amount); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitVectorLogicalShiftLeft64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const u8 shift_amount = args[1].GetImmediateU8(); + + code.psllq(result, shift_amount); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitVectorLogicalShiftRight8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const u8 shift_amount = args[1].GetImmediateU8(); + + if (shift_amount == 0) { + // Do nothing + } else if (shift_amount >= 8) { + code.pxor(result, result); + } else if (code.HasHostFeature(HostFeature::GFNI)) { + const u64 shift_matrix = 0x0102040810204080 << (shift_amount * 8); + code.gf2p8affineqb(result, code.Const(xword, shift_matrix, shift_matrix), 0); + } else { + const u64 replicand = 0xFEULL >> shift_amount; + const u64 mask = mcl::bit::replicate_element<u8, u64>(replicand); + + code.psrlw(result, shift_amount); + code.pand(result, code.Const(xword, mask, mask)); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitVectorLogicalShiftRight16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const u8 shift_amount = args[1].GetImmediateU8(); + + code.psrlw(result, shift_amount); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitVectorLogicalShiftRight32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const u8 shift_amount = args[1].GetImmediateU8(); + + code.psrld(result, shift_amount); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitVectorLogicalShiftRight64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const u8 shift_amount = args[1].GetImmediateU8(); + + code.psrlq(result, shift_amount); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitVectorLogicalVShift8(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::AVX512_Ortho | 
HostFeature::AVX512BW | HostFeature::GFNI)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + const Xbyak::Opmask negative_mask = k1; + code.pxor(tmp, tmp); + code.vpcmpb(negative_mask, left_shift, tmp, CmpInt::LessThan); + + // Reverse bits of negative-shifts + code.vmovaps(xmm0, code.BConst<64>(xword, 0x8040201008040201)); + code.vgf2p8affineqb(result | negative_mask, result, xmm0, 0); + + // Turn all negative shifts into left-shifts + code.pabsb(left_shift, left_shift); + + const Xbyak::Opmask valid_index = k2; + code.vptestnmb(valid_index, left_shift, code.BConst<8>(xword, 0xF8)); + + // gf2p8mulb's "x8 + x4 + x3 + x + 1"-polynomial-reduction only applies + // when the multiplication overflows. Masking away any bits that would have + // overflowed turns the polynomial-multiplication into regular modulo-multiplication + code.movdqa(tmp, code.Const(xword, 0x01'03'07'0f'1f'3f'7f'ff, 0)); + code.vpshufb(tmp | valid_index | T_z, tmp, left_shift); + code.pand(result, tmp); + + // n << 0 == n * 1 | n << 1 == n * 2 | n << 2 == n * 4 | etc + code.pxor(tmp, tmp); + code.movsd(tmp, xmm0); + code.pshufb(tmp, left_shift); + + code.gf2p8mulb(result, tmp); + + // Un-reverse bits of negative-shifts + code.vgf2p8affineqb(result | negative_mask, result, xmm0, 0); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) { + std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u8>); + }); +} + +void EmitX64::EmitVectorLogicalVShift16(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm right_shift = xmm16; + const Xbyak::Xmm tmp = xmm17; + + code.vmovdqa32(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF)); + code.vpxord(right_shift, right_shift, right_shift); + code.vpsubw(right_shift, right_shift, left_shift); + code.vpandd(left_shift, left_shift, tmp); + code.vpandd(right_shift, right_shift, tmp); + + code.vpsllvw(tmp, result, left_shift); + code.vpsrlvw(result, result, right_shift); + code.vpord(result, result, tmp); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u16>& a, const VectorArray<u16>& b) { + std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u16>); + }); +} + +void EmitX64::EmitVectorLogicalVShift32(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::AVX2)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.vmovdqa(tmp, code.Const(xword, 0x000000FF000000FF, 0x000000FF000000FF)); + code.vpxor(right_shift, right_shift, right_shift); + code.vpsubd(right_shift, right_shift, left_shift); + code.vpand(left_shift, left_shift, tmp); + code.vpand(right_shift, right_shift, tmp); + + 
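+        // vpsllvd/vpsrlvd produce zero for any per-lane count >= 32, so for a non-zero signed shift only the matching direction survives; OR-ing the two partial results gives the positive-means-left, negative-means-right semantics.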
code.vpsllvd(tmp, result, left_shift); + code.vpsrlvd(result, result, right_shift); + code.vpor(result, result, tmp); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& a, const VectorArray<u32>& b) { + std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u32>); + }); +} + +void EmitX64::EmitVectorLogicalVShift64(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::AVX2)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.vmovdqa(tmp, code.Const(xword, 0x00000000000000FF, 0x00000000000000FF)); + code.vpxor(right_shift, right_shift, right_shift); + code.vpsubq(right_shift, right_shift, left_shift); + code.vpand(left_shift, left_shift, tmp); + code.vpand(right_shift, right_shift, tmp); + + code.vpsllvq(tmp, result, left_shift); + code.vpsrlvq(result, result, right_shift); + code.vpor(result, result, tmp); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& a, const VectorArray<u64>& b) { + std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u64>); + }); +} + +void EmitX64::EmitVectorMaxS8(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsb); + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); + + const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm(); + code.movdqa(tmp_b, b); + + code.pcmpgtb(tmp_b, a); + code.pand(b, tmp_b); + code.pandn(tmp_b, a); + code.por(tmp_b, b); + + ctx.reg_alloc.DefineValue(inst, tmp_b); +} + +void EmitX64::EmitVectorMaxS16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsw); +} + +void EmitX64::EmitVectorMaxS32(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsd); + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); + + const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm(); + code.movdqa(tmp_b, b); + + code.pcmpgtd(tmp_b, a); + code.pand(b, tmp_b); + code.pandn(tmp_b, a); + code.por(tmp_b, b); + + ctx.reg_alloc.DefineValue(inst, tmp_b); +} + +void EmitX64::EmitVectorMaxS64(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { + EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmaxsq); + return; + } + + if (code.HasHostFeature(HostFeature::AVX)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); + + code.vpcmpgtq(xmm0, y, x); + code.pblendvb(x, y); + + ctx.reg_alloc.DefineValue(inst, x); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& a, const VectorArray<s64>& b) { + std::transform(a.begin(), 
a.end(), b.begin(), result.begin(), [](auto x, auto y) { return std::max(x, y); }); + }); +} + +void EmitX64::EmitVectorMaxU8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxub); +} + +void EmitX64::EmitVectorMaxU16(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxuw); + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]); + + code.psubusw(a, b); + code.paddw(a, b); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorMaxU32(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxud); + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]); + + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + code.movdqa(tmp, code.Const(xword, 0x8000000080000000, 0x8000000080000000)); + + const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm(); + code.movdqa(tmp_b, b); + + code.pxor(tmp_b, tmp); + code.pxor(tmp, a); + + code.pcmpgtd(tmp, tmp_b); + code.pand(a, tmp); + code.pandn(tmp, b); + code.por(a, tmp); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorMaxU64(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { + EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmaxuq); + return; + } + + if (code.HasHostFeature(HostFeature::AVX)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.vmovdqa(xmm0, code.Const(xword, 0x8000000000000000, 0x8000000000000000)); + code.vpsubq(tmp, y, xmm0); + code.vpsubq(xmm0, x, xmm0); + code.vpcmpgtq(xmm0, tmp, xmm0); + code.pblendvb(x, y); + + ctx.reg_alloc.DefineValue(inst, x); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& a, const VectorArray<u64>& b) { + std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return std::max(x, y); }); + }); +} + +void EmitX64::EmitVectorMinS8(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminsb); + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]); + + const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm(); + code.movdqa(tmp_b, b); + + code.pcmpgtb(tmp_b, a); + code.pand(a, tmp_b); + code.pandn(tmp_b, b); + code.por(a, tmp_b); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorMinS16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminsw); +} + +void EmitX64::EmitVectorMinS32(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminsd); + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = 
ctx.reg_alloc.UseXmm(args[1]); + + const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm(); + code.movdqa(tmp_b, b); + + code.pcmpgtd(tmp_b, a); + code.pand(a, tmp_b); + code.pandn(tmp_b, b); + code.por(a, tmp_b); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorMinS64(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { + EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpminsq); + return; + } + + if (code.HasHostFeature(HostFeature::AVX)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); + + code.vpcmpgtq(xmm0, y, x); + code.pblendvb(y, x); + + ctx.reg_alloc.DefineValue(inst, y); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& a, const VectorArray<s64>& b) { + std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return std::min(x, y); }); + }); +} + +void EmitX64::EmitVectorMinU8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminub); +} + +void EmitX64::EmitVectorMinU16(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminuw); + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); + + const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm(); + code.movdqa(tmp_b, b); + + code.psubusw(tmp_b, a); + code.psubw(b, tmp_b); + + ctx.reg_alloc.DefineValue(inst, b); +} + +void EmitX64::EmitVectorMinU32(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminud); + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]); + + const Xbyak::Xmm sint_max_plus_one = ctx.reg_alloc.ScratchXmm(); + code.movdqa(sint_max_plus_one, code.Const(xword, 0x8000000080000000, 0x8000000080000000)); + + const Xbyak::Xmm tmp_a = ctx.reg_alloc.ScratchXmm(); + code.movdqa(tmp_a, a); + code.psubd(tmp_a, sint_max_plus_one); + + const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm(); + code.movdqa(tmp_b, b); + code.psubd(tmp_b, sint_max_plus_one); + + code.pcmpgtd(tmp_b, tmp_a); + code.pand(a, tmp_b); + code.pandn(tmp_b, b); + code.por(a, tmp_b); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorMinU64(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { + EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpminuq); + return; + } + + if (code.HasHostFeature(HostFeature::AVX)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.vmovdqa(xmm0, code.Const(xword, 0x8000000000000000, 0x8000000000000000)); + code.vpsubq(tmp, y, xmm0); + code.vpsubq(xmm0, x, xmm0); + code.vpcmpgtq(xmm0, tmp, xmm0); + code.pblendvb(y, x); + + ctx.reg_alloc.DefineValue(inst, y); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& a, const VectorArray<u64>& b) { + std::transform(a.begin(), 
a.end(), b.begin(), result.begin(), [](auto x, auto y) { return std::min(x, y); }); + }); +} + +void EmitX64::EmitVectorMultiply8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm tmp_a = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm(); + + // TODO: Optimize + code.movdqa(tmp_a, a); + code.movdqa(tmp_b, b); + code.pmullw(a, b); + code.psrlw(tmp_a, 8); + code.psrlw(tmp_b, 8); + code.pmullw(tmp_a, tmp_b); + code.pand(a, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF)); + code.psllw(tmp_a, 8); + code.por(a, tmp_a); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorMultiply16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmullw); +} + +void EmitX64::EmitVectorMultiply32(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmulld); + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(tmp, a); + code.psrlq(a, 32); + code.pmuludq(tmp, b); + code.psrlq(b, 32); + code.pmuludq(a, b); + code.pshufd(tmp, tmp, 0b00001000); + code.pshufd(b, a, 0b00001000); + code.punpckldq(tmp, b); + + ctx.reg_alloc.DefineValue(inst, tmp); +} + +void EmitX64::EmitVectorMultiply64(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) { + EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmullq); + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (code.HasHostFeature(HostFeature::SSE41)) { + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Reg64 tmp1 = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 tmp2 = ctx.reg_alloc.ScratchGpr(); + + code.movq(tmp1, a); + code.movq(tmp2, b); + code.imul(tmp2, tmp1); + code.pextrq(tmp1, a, 1); + code.movq(a, tmp2); + code.pextrq(tmp2, b, 1); + code.imul(tmp1, tmp2); + code.pinsrq(a, tmp1, 1); + + ctx.reg_alloc.DefineValue(inst, a); + return; + } + + const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm tmp3 = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(tmp1, a); + code.movdqa(tmp2, a); + code.movdqa(tmp3, b); + + code.psrlq(tmp1, 32); + code.psrlq(tmp3, 32); + + code.pmuludq(tmp2, b); + code.pmuludq(tmp3, a); + code.pmuludq(b, tmp1); + + code.paddq(b, tmp3); + code.psllq(b, 32); + code.paddq(tmp2, b); + + ctx.reg_alloc.DefineValue(inst, tmp2); +} + +void EmitX64::EmitVectorMultiplySignedWiden8(EmitContext&, IR::Inst*) { + ASSERT_FALSE("Unexpected VectorMultiplySignedWiden8"); +} + +void EmitX64::EmitVectorMultiplySignedWiden16(EmitContext&, IR::Inst*) { + ASSERT_FALSE("Unexpected VectorMultiplySignedWiden16"); +} + +void EmitX64::EmitVectorMultiplySignedWiden32(EmitContext&, IR::Inst*) { + ASSERT_FALSE("Unexpected VectorMultiplySignedWiden32"); +} + +void EmitX64::EmitVectorMultiplyUnsignedWiden8(EmitContext&, IR::Inst*) { + 
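+    // Note: like the signed variants above, the widening multiplies are expected
+    // to have been lowered before reaching the x64 backend (presumably by the IR
+    // polyfill pass), so hitting this emitter indicates a pipeline bug rather
+    // than a missing host feature.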
ASSERT_FALSE("Unexpected VectorMultiplyUnsignedWiden8"); +} + +void EmitX64::EmitVectorMultiplyUnsignedWiden16(EmitContext&, IR::Inst*) { + ASSERT_FALSE("Unexpected VectorMultiplyUnsignedWiden16"); +} + +void EmitX64::EmitVectorMultiplyUnsignedWiden32(EmitContext&, IR::Inst*) { + ASSERT_FALSE("Unexpected VectorMultiplyUnsignedWiden32"); +} + +void EmitX64::EmitVectorNarrow16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) { + const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + code.vpmovwb(result, a); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(); + + code.pxor(zeros, zeros); + code.pand(a, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF)); + code.packuswb(a, zeros); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorNarrow32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { + const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + code.vpmovdw(result, a); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(); + + code.pxor(zeros, zeros); + if (code.HasHostFeature(HostFeature::SSE41)) { + code.pblendw(a, zeros, 0b10101010); + code.packusdw(a, zeros); + } else { + code.pslld(a, 16); + code.psrad(a, 16); + code.packssdw(a, zeros); + } + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorNarrow64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { + const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + code.vpmovqd(result, a); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(); + + code.pxor(zeros, zeros); + code.shufps(a, zeros, 0b00001000); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorNot(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]); + code.vpternlogq(result, operand, operand, u8(~Tern::c)); + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.ScratchXmm(); + code.pcmpeqw(xmm_b, xmm_b); + code.pxor(xmm_a, xmm_b); + ctx.reg_alloc.DefineValue(inst, xmm_a); +} + +void EmitX64::EmitVectorOr(EmitContext& ctx, IR::Inst* inst) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::por); +} + +void EmitX64::EmitVectorPairedAddLower8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.punpcklqdq(xmm_a, xmm_b); + code.movdqa(tmp, xmm_a); + 
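+    // Pairwise add of adjacent bytes within [a_lo | b_lo]: shifting each 16-bit
+    // lane left by 8 and adding the original leaves (low byte + high byte) in the
+    // high byte of every lane; the psrlw/packuswb below extract those sums as u8.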
code.psllw(xmm_a, 8); + code.paddw(xmm_a, tmp); + code.pxor(tmp, tmp); + code.psrlw(xmm_a, 8); + code.packuswb(xmm_a, tmp); + + ctx.reg_alloc.DefineValue(inst, xmm_a); +} + +void EmitX64::EmitVectorPairedAddLower16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.punpcklqdq(xmm_a, xmm_b); + if (code.HasHostFeature(HostFeature::SSSE3)) { + code.pxor(tmp, tmp); + code.phaddw(xmm_a, tmp); + } else { + code.movdqa(tmp, xmm_a); + code.pslld(xmm_a, 16); + code.paddd(xmm_a, tmp); + code.pxor(tmp, tmp); + code.psrad(xmm_a, 16); + code.packssdw(xmm_a, tmp); // Note: packusdw is SSE4.1, hence the arithmetic shift above. + } + + ctx.reg_alloc.DefineValue(inst, xmm_a); +} + +void EmitX64::EmitVectorPairedAddLower32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.punpcklqdq(xmm_a, xmm_b); + if (code.HasHostFeature(HostFeature::SSSE3)) { + code.pxor(tmp, tmp); + code.phaddd(xmm_a, tmp); + } else { + code.movdqa(tmp, xmm_a); + code.psllq(xmm_a, 32); + code.paddq(xmm_a, tmp); + code.psrlq(xmm_a, 32); + code.pshufd(xmm_a, xmm_a, 0b11011000); + } + + ctx.reg_alloc.DefineValue(inst, xmm_a); +} + +void EmitX64::EmitVectorPairedAdd8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(c, a); + code.movdqa(d, b); + code.psllw(a, 8); + code.psllw(b, 8); + code.paddw(a, c); + code.paddw(b, d); + code.psrlw(a, 8); + code.psrlw(b, 8); + code.packuswb(a, b); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorPairedAdd16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (code.HasHostFeature(HostFeature::SSSE3)) { + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]); + + code.phaddw(a, b); + + ctx.reg_alloc.DefineValue(inst, a); + } else { + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(c, a); + code.movdqa(d, b); + code.pslld(a, 16); + code.pslld(b, 16); + code.paddd(a, c); + code.paddd(b, d); + code.psrad(a, 16); + code.psrad(b, 16); + code.packssdw(a, b); + + ctx.reg_alloc.DefineValue(inst, a); + } +} + +void EmitX64::EmitVectorPairedAdd32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (code.HasHostFeature(HostFeature::SSSE3)) { + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]); + + code.phaddd(a, b); + + ctx.reg_alloc.DefineValue(inst, a); + } else { + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm(); + + 
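+        // SSE2 fallback: shifting each 64-bit lane left by 32 and adding the
+        // original places the pairwise sum (low dword + high dword) in the upper
+        // dword of every qword; shufps then gathers those upper dwords from a and b.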
code.movdqa(c, a); + code.movdqa(d, b); + code.psllq(a, 32); + code.psllq(b, 32); + code.paddq(a, c); + code.paddq(b, d); + code.shufps(a, b, 0b11011101); + + ctx.reg_alloc.DefineValue(inst, a); + } +} + +void EmitX64::EmitVectorPairedAdd64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(c, a); + code.punpcklqdq(a, b); + code.punpckhqdq(c, b); + code.paddq(a, c); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorPairedAddSignedWiden8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(c, a); + code.psllw(a, 8); + code.psraw(c, 8); + code.psraw(a, 8); + code.paddw(a, c); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorPairedAddSignedWiden16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(c, a); + code.pslld(a, 16); + code.psrad(c, 16); + code.psrad(a, 16); + code.paddd(a, c); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorPairedAddSignedWiden32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + + if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { + const Xbyak::Xmm c = xmm16; + code.vpsraq(c, a, 32); + code.vpsllq(a, a, 32); + code.vpsraq(a, a, 32); + code.vpaddq(a, a, c); + } else { + const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(c, a); + code.psllq(a, 32); + code.movdqa(tmp1, code.Const(xword, 0x80000000'00000000, 0x80000000'00000000)); + code.movdqa(tmp2, tmp1); + code.pand(tmp1, a); + code.pand(tmp2, c); + code.psrlq(a, 32); + code.psrlq(c, 32); + code.psrad(tmp1, 31); + code.psrad(tmp2, 31); + code.por(a, tmp1); + code.por(c, tmp2); + code.paddq(a, c); + } + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorPairedAddUnsignedWiden8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(c, a); + code.psllw(a, 8); + code.psrlw(c, 8); + code.psrlw(a, 8); + code.paddw(a, c); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorPairedAddUnsignedWiden16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(c, a); + code.pslld(a, 16); + code.psrld(c, 16); + code.psrld(a, 16); + code.paddd(a, c); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorPairedAddUnsignedWiden32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(c, a); + code.psllq(a, 32); + code.psrlq(c, 32); + code.psrlq(a, 32); + code.paddq(a, c); + + 
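+    // Each 64-bit lane of a now holds the zero-extended sum of one pair of
+    // adjacent unsigned 32-bit elements (a held the even elements, c the odd ones).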
ctx.reg_alloc.DefineValue(inst, a); +} + +template<typename T, typename Function> +static void PairedOperation(VectorArray<T>& result, const VectorArray<T>& x, const VectorArray<T>& y, Function fn) { + const size_t range = x.size() / 2; + + for (size_t i = 0; i < range; i++) { + result[i] = fn(x[2 * i], x[2 * i + 1]); + } + + for (size_t i = 0; i < range; i++) { + result[range + i] = fn(y[2 * i], y[2 * i + 1]); + } +} + +template<typename T, typename Function> +static void LowerPairedOperation(VectorArray<T>& result, const VectorArray<T>& x, const VectorArray<T>& y, Function fn) { + const size_t range = x.size() / 4; + + for (size_t i = 0; i < range; i++) { + result[i] = fn(x[2 * i], x[2 * i + 1]); + } + + for (size_t i = 0; i < range; i++) { + result[range + i] = fn(y[2 * i], y[2 * i + 1]); + } +} + +template<typename T> +static void PairedMax(VectorArray<T>& result, const VectorArray<T>& x, const VectorArray<T>& y) { + PairedOperation(result, x, y, [](auto a, auto b) { return std::max(a, b); }); +} + +template<typename T> +static void PairedMin(VectorArray<T>& result, const VectorArray<T>& x, const VectorArray<T>& y) { + PairedOperation(result, x, y, [](auto a, auto b) { return std::min(a, b); }); +} + +template<typename T> +static void LowerPairedMax(VectorArray<T>& result, const VectorArray<T>& x, const VectorArray<T>& y) { + LowerPairedOperation(result, x, y, [](auto a, auto b) { return std::max(a, b); }); +} + +template<typename T> +static void LowerPairedMin(VectorArray<T>& result, const VectorArray<T>& x, const VectorArray<T>& y) { + LowerPairedOperation(result, x, y, [](auto a, auto b) { return std::min(a, b); }); +} + +template<typename Function> +static void EmitVectorPairedMinMaxLower8(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); + + code.punpcklqdq(x, y); + code.pshufb(x, code.Const(xword, 0x0E'0C'0A'08'06'04'02'00, 0x0F'0D'0B'09'07'05'03'01)); + code.movhlps(y, x); + code.movq(x, x); + (code.*fn)(x, y); + + ctx.reg_alloc.DefineValue(inst, x); +} + +template<typename Function> +static void EmitVectorPairedMinMax8(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(tmp, code.Const(xword, 0x0E'0C'0A'08'06'04'02'00, 0x0F'0D'0B'09'07'05'03'01)); + code.pshufb(x, tmp); + code.pshufb(y, tmp); + + code.movaps(tmp, x); + code.shufps(tmp, y, 0b01'00'01'00); + + code.shufps(x, y, 0b11'10'11'10); + + (code.*fn)(x, tmp); + ctx.reg_alloc.DefineValue(inst, x); +} + +template<typename Function> +static void EmitVectorPairedMinMax16(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + // swap idxs 1 and 2 within 64-bit lanes so that both registers contain [even, odd, even, odd]-indexed pairs of elements + code.pshuflw(x, x, 0b11'01'10'00); + code.pshuflw(y, y, 0b11'01'10'00); + + code.pshufhw(x, x, 0b11'01'10'00); + code.pshufhw(y, y, 0b11'01'10'00); + + // move pairs of even/odd-indexed elements into 
one register each + + // tmp = x[0, 2], x[4, 6], y[0, 2], y[4, 6] + code.movaps(tmp, x); + code.shufps(tmp, y, 0b10'00'10'00); + // x = x[1, 3], x[5, 7], y[1, 3], y[5, 7] + code.shufps(x, y, 0b11'01'11'01); + + (code.*fn)(x, tmp); + + ctx.reg_alloc.DefineValue(inst, x); +} + +template<typename Function> +static void EmitVectorPairedMinMaxLower16(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + // swap idxs 1 and 2 so that both registers contain even then odd-indexed pairs of elements + code.pshuflw(x, x, 0b11'01'10'00); + code.pshuflw(y, y, 0b11'01'10'00); + + // move pairs of even/odd-indexed elements into one register each + + // tmp = x[0, 2], y[0, 2], 0s... + code.movaps(tmp, y); + code.insertps(tmp, x, 0b01001100); + // x = x[1, 3], y[1, 3], 0s... + code.insertps(x, y, 0b00011100); + + (code.*fn)(x, tmp); + + ctx.reg_alloc.DefineValue(inst, x); +} + +static void EmitVectorPairedMinMaxLower32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Xmm&, const Xbyak::Operand&)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + // tmp = x[1], y[1], 0, 0 + code.movaps(tmp, y); + code.insertps(tmp, x, 0b01001100); + // x = x[0], y[0], 0, 0 + code.insertps(x, y, 0b00011100); + + (code.*fn)(x, tmp); + + ctx.reg_alloc.DefineValue(inst, x); +} + +void EmitX64::EmitVectorPairedMaxS8(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorPairedMinMax8(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsb); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s8>& result, const VectorArray<s8>& a, const VectorArray<s8>& b) { + PairedMax(result, a, b); + }); +} + +void EmitX64::EmitVectorPairedMaxS16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorPairedMinMax16(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsw); +} + +void EmitX64::EmitVectorPairedMaxS32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(tmp, x); + code.shufps(tmp, y, 0b10001000); + code.shufps(x, y, 0b11011101); + + if (code.HasHostFeature(HostFeature::SSE41)) { + code.pmaxsd(x, tmp); + + ctx.reg_alloc.DefineValue(inst, x); + } else { + const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(tmp2, tmp); + code.pcmpgtd(tmp2, x); + code.pand(tmp, tmp2); + code.pandn(tmp2, x); + code.por(tmp2, tmp); + + ctx.reg_alloc.DefineValue(inst, tmp2); + } +} + +void EmitX64::EmitVectorPairedMaxU8(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSSE3)) { + EmitVectorPairedMinMax8(code, ctx, inst, &Xbyak::CodeGenerator::pmaxub); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) { + PairedMax(result, a, b); + }); +} + +void EmitX64::EmitVectorPairedMaxU16(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorPairedMinMax16(code, ctx, inst, 
&Xbyak::CodeGenerator::pmaxuw); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u16>& a, const VectorArray<u16>& b) { + PairedMax(result, a, b); + }); +} + +void EmitX64::EmitVectorPairedMaxU32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(tmp1, x); + code.shufps(tmp1, y, 0b10001000); + code.shufps(x, y, 0b11011101); + + if (code.HasHostFeature(HostFeature::SSE41)) { + code.pmaxud(x, tmp1); + + ctx.reg_alloc.DefineValue(inst, x); + } else { + const Xbyak::Xmm tmp3 = ctx.reg_alloc.ScratchXmm(); + code.movdqa(tmp3, code.Const(xword, 0x8000000080000000, 0x8000000080000000)); + + const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(); + code.movdqa(tmp2, x); + + code.pxor(tmp2, tmp3); + code.pxor(tmp3, tmp1); + code.pcmpgtd(tmp3, tmp2); + code.pand(tmp1, tmp3); + code.pandn(tmp3, x); + code.por(tmp1, tmp3); + + ctx.reg_alloc.DefineValue(inst, tmp1); + } +} + +void EmitX64::EmitVectorPairedMinS8(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorPairedMinMax8(code, ctx, inst, &Xbyak::CodeGenerator::pminsb); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s8>& result, const VectorArray<s8>& a, const VectorArray<s8>& b) { + PairedMin(result, a, b); + }); +} + +void EmitX64::EmitVectorPairedMinS16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorPairedMinMax16(code, ctx, inst, &Xbyak::CodeGenerator::pminsw); +} + +void EmitX64::EmitVectorPairedMinS32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(tmp, x); + code.shufps(tmp, y, 0b10001000); + code.shufps(x, y, 0b11011101); + + if (code.HasHostFeature(HostFeature::SSE41)) { + code.pminsd(x, tmp); + + ctx.reg_alloc.DefineValue(inst, x); + } else { + const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(); + + code.movaps(tmp2, x); + code.pcmpgtd(tmp2, tmp); + code.pand(tmp, tmp2); + code.pandn(tmp2, x); + code.por(tmp2, tmp); + + ctx.reg_alloc.DefineValue(inst, tmp2); + } +} + +void EmitX64::EmitVectorPairedMinU8(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSSE3)) { + EmitVectorPairedMinMax8(code, ctx, inst, &Xbyak::CodeGenerator::pminub); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) { + PairedMin(result, a, b); + }); +} + +void EmitX64::EmitVectorPairedMinU16(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorPairedMinMax16(code, ctx, inst, &Xbyak::CodeGenerator::pminuw); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u16>& a, const VectorArray<u16>& b) { + PairedMin(result, a, b); + }); +} + +void EmitX64::EmitVectorPairedMinU32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(tmp1, x); + code.shufps(tmp1, y, 0b10001000); + code.shufps(x, 
y, 0b11011101); + + if (code.HasHostFeature(HostFeature::SSE41)) { + code.pminud(x, tmp1); + + ctx.reg_alloc.DefineValue(inst, x); + } else { + const Xbyak::Xmm tmp3 = ctx.reg_alloc.ScratchXmm(); + code.movdqa(tmp3, code.Const(xword, 0x8000000080000000, 0x8000000080000000)); + + const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(); + code.movdqa(tmp2, tmp1); + + code.pxor(tmp2, tmp3); + code.pxor(tmp3, x); + code.pcmpgtd(tmp3, tmp2); + code.pand(tmp1, tmp3); + code.pandn(tmp3, x); + code.por(tmp1, tmp3); + + ctx.reg_alloc.DefineValue(inst, tmp1); + } +} + +void EmitX64::EmitVectorPairedMaxLowerS8(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorPairedMinMaxLower8(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsb); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s8>& result, const VectorArray<s8>& a, const VectorArray<s8>& b) { + LowerPairedMax(result, a, b); + }); +} + +void EmitX64::EmitVectorPairedMaxLowerS16(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsw); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s16>& result, const VectorArray<s16>& a, const VectorArray<s16>& b) { + LowerPairedMax(result, a, b); + }); +} + +void EmitX64::EmitVectorPairedMaxLowerS32(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsd); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s32>& result, const VectorArray<s32>& a, const VectorArray<s32>& b) { + LowerPairedMax(result, a, b); + }); +} + +void EmitX64::EmitVectorPairedMaxLowerU8(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorPairedMinMaxLower8(code, ctx, inst, &Xbyak::CodeGenerator::pmaxub); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) { + LowerPairedMax(result, a, b); + }); +} + +void EmitX64::EmitVectorPairedMaxLowerU16(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pmaxuw); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u16>& a, const VectorArray<u16>& b) { + LowerPairedMax(result, a, b); + }); +} + +void EmitX64::EmitVectorPairedMaxLowerU32(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pmaxud); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& a, const VectorArray<u32>& b) { + LowerPairedMax(result, a, b); + }); +} + +void EmitX64::EmitVectorPairedMinLowerS8(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorPairedMinMaxLower8(code, ctx, inst, &Xbyak::CodeGenerator::pminsb); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s8>& result, const VectorArray<s8>& a, const VectorArray<s8>& b) { + LowerPairedMin(result, a, b); + }); +} + +void EmitX64::EmitVectorPairedMinLowerS16(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pminsw); + return; + } + + 
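+    // The SSE4.1 gate above comes from insertps in the shared lower-pair helper
+    // rather than from pminsw itself (which is SSE2); without it, fall back to a
+    // scalar per-element implementation.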
EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s16>& result, const VectorArray<s16>& a, const VectorArray<s16>& b) { + LowerPairedMin(result, a, b); + }); +} + +void EmitX64::EmitVectorPairedMinLowerS32(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pminsd); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s32>& result, const VectorArray<s32>& a, const VectorArray<s32>& b) { + LowerPairedMin(result, a, b); + }); +} + +void EmitX64::EmitVectorPairedMinLowerU8(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorPairedMinMaxLower8(code, ctx, inst, &Xbyak::CodeGenerator::pminub); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) { + LowerPairedMin(result, a, b); + }); +} + +void EmitX64::EmitVectorPairedMinLowerU16(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pminuw); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u16>& a, const VectorArray<u16>& b) { + LowerPairedMin(result, a, b); + }); +} + +void EmitX64::EmitVectorPairedMinLowerU32(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pminud); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& a, const VectorArray<u32>& b) { + LowerPairedMin(result, a, b); + }); +} + +template<typename D, typename T> +static D PolynomialMultiply(T lhs, T rhs) { + constexpr size_t bit_size = mcl::bitsizeof<T>; + const std::bitset<bit_size> operand(lhs); + + D res = 0; + for (size_t i = 0; i < bit_size; i++) { + if (operand[i]) { + res ^= rhs << i; + } + } + + return res; +} + +void EmitX64::EmitVectorPolynomialMultiply8(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm alternate = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Reg32 counter = ctx.reg_alloc.ScratchGpr().cvt32(); + + Xbyak::Label loop; + + code.pxor(result, result); + code.movdqa(mask, code.Const(xword, 0x0101010101010101, 0x0101010101010101)); + code.mov(counter, 8); + + code.L(loop); + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpand(xmm0, xmm_b, mask); + code.vpxor(alternate, result, xmm_a); + } else { + code.movdqa(xmm0, xmm_b); + code.movdqa(alternate, result); + code.pand(xmm0, mask); + code.pxor(alternate, xmm_a); + } + code.pcmpeqb(xmm0, mask); + code.paddb(mask, mask); + code.paddb(xmm_a, xmm_a); + code.pblendvb(result, alternate); + code.dec(counter); + code.jnz(loop); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) { + std::transform(a.begin(), a.end(), b.begin(), result.begin(), PolynomialMultiply<u8, u8>); + }); +} + +void EmitX64::EmitVectorPolynomialMultiplyLong8(EmitContext& ctx, IR::Inst* inst) { + if 
(code.HasHostFeature(HostFeature::SSE41)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm alternate = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Reg32 counter = ctx.reg_alloc.ScratchGpr().cvt32(); + + Xbyak::Label loop; + + code.pmovzxbw(xmm_a, xmm_a); + code.pmovzxbw(xmm_b, xmm_b); + code.pxor(result, result); + code.movdqa(mask, code.Const(xword, 0x0001000100010001, 0x0001000100010001)); + code.mov(counter, 8); + + code.L(loop); + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpand(xmm0, xmm_b, mask); + code.vpxor(alternate, result, xmm_a); + } else { + code.movdqa(xmm0, xmm_b); + code.movdqa(alternate, result); + code.pand(xmm0, mask); + code.pxor(alternate, xmm_a); + } + code.pcmpeqw(xmm0, mask); + code.paddw(mask, mask); + code.paddw(xmm_a, xmm_a); + code.pblendvb(result, alternate); + code.dec(counter); + code.jnz(loop); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) { + for (size_t i = 0; i < result.size(); i++) { + result[i] = PolynomialMultiply<u16, u8>(a[i], b[i]); + } + }); +} + +void EmitX64::EmitVectorPolynomialMultiplyLong64(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::PCLMULQDQ)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + + code.pclmulqdq(xmm_a, xmm_b, 0x00); + + ctx.reg_alloc.DefineValue(inst, xmm_a); + return; + } + + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& a, const VectorArray<u64>& b) { + const auto handle_high_bits = [](u64 lhs, u64 rhs) { + constexpr size_t bit_size = mcl::bitsizeof<u64>; + u64 result = 0; + + for (size_t i = 1; i < bit_size; i++) { + if (mcl::bit::get_bit(i, lhs)) { + result ^= rhs >> (bit_size - i); + } + } + + return result; + }; + + result[0] = PolynomialMultiply<u64, u64>(a[0], b[0]); + result[1] = handle_high_bits(a[0], b[0]); + }); +} + +void EmitX64::EmitVectorPopulationCount(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::AVX512VL | HostFeature::AVX512BITALG)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + + code.vpopcntb(data, data); + + ctx.reg_alloc.DefineValue(inst, data); + return; + } + + if (code.HasHostFeature(HostFeature::SSSE3)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm low_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm high_a = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(high_a, low_a); + code.psrlw(high_a, 4); + code.movdqa(tmp1, code.Const(xword, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F)); + code.pand(high_a, tmp1); // High nibbles + code.pand(low_a, tmp1); // Low nibbles + + code.movdqa(tmp1, code.Const(xword, 0x0302020102010100, 0x0403030203020201)); + code.movdqa(tmp2, tmp1); + code.pshufb(tmp1, low_a); + code.pshufb(tmp2, high_a); + + code.paddb(tmp1, tmp2); + + ctx.reg_alloc.DefineValue(inst, tmp1); + return; + } + + EmitOneArgumentFallback(code, ctx, inst, 
[](VectorArray<u8>& result, const VectorArray<u8>& a) { + std::transform(a.begin(), a.end(), result.begin(), [](u8 val) { + return static_cast<u8>(mcl::bit::count_ones(val)); + }); + }); +} + +void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + + if (code.HasHostFeature(HostFeature::GFNI)) { + code.gf2p8affineqb(data, code.Const(xword, 0x8040201008040201, 0x8040201008040201), 0); + } else { + const Xbyak::Xmm high_nibble_reg = ctx.reg_alloc.ScratchXmm(); + code.movdqa(high_nibble_reg, code.Const(xword, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0)); + code.pand(high_nibble_reg, data); + code.pxor(data, high_nibble_reg); + code.psrld(high_nibble_reg, 4); + + if (code.HasHostFeature(HostFeature::SSSE3)) { + // High lookup + const Xbyak::Xmm high_reversed_reg = ctx.reg_alloc.ScratchXmm(); + code.movdqa(high_reversed_reg, code.Const(xword, 0xE060A020C0408000, 0xF070B030D0509010)); + code.pshufb(high_reversed_reg, data); + + // Low lookup (low nibble equivalent of the above) + code.movdqa(data, code.Const(xword, 0x0E060A020C040800, 0x0F070B030D050901)); + code.pshufb(data, high_nibble_reg); + code.por(data, high_reversed_reg); + } else { + code.pslld(data, 4); + code.por(data, high_nibble_reg); + + code.movdqa(high_nibble_reg, code.Const(xword, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC)); + code.pand(high_nibble_reg, data); + code.pxor(data, high_nibble_reg); + code.psrld(high_nibble_reg, 2); + code.pslld(data, 2); + code.por(data, high_nibble_reg); + + code.movdqa(high_nibble_reg, code.Const(xword, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA)); + code.pand(high_nibble_reg, data); + code.pxor(data, high_nibble_reg); + code.psrld(high_nibble_reg, 1); + code.paddd(data, data); + code.por(data, high_nibble_reg); + } + } + + ctx.reg_alloc.DefineValue(inst, data); +} + +void EmitX64::EmitVectorReverseElementsInHalfGroups8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(tmp, data); + code.psllw(tmp, 8); + code.psrlw(data, 8); + code.por(data, tmp); + + ctx.reg_alloc.DefineValue(inst, data); +} + +void EmitX64::EmitVectorReverseElementsInWordGroups8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + // TODO: PSHUFB + + code.movdqa(tmp, data); + code.psllw(tmp, 8); + code.psrlw(data, 8); + code.por(data, tmp); + code.pshuflw(data, data, 0b10110001); + code.pshufhw(data, data, 0b10110001); + + ctx.reg_alloc.DefineValue(inst, data); +} + +void EmitX64::EmitVectorReverseElementsInWordGroups16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + + code.pshuflw(data, data, 0b10110001); + code.pshufhw(data, data, 0b10110001); + + ctx.reg_alloc.DefineValue(inst, data); +} + +void EmitX64::EmitVectorReverseElementsInLongGroups8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + // TODO: PSHUFB + + code.movdqa(tmp, data); + code.psllw(tmp, 8); + code.psrlw(data, 8); + code.por(data, tmp); + 
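+    // Bytes within each 16-bit lane are now swapped; reversing the four words of
+    // each 64-bit half below completes the byte reversal within every long group.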
code.pshuflw(data, data, 0b00011011); + code.pshufhw(data, data, 0b00011011); + + ctx.reg_alloc.DefineValue(inst, data); +} + +void EmitX64::EmitVectorReverseElementsInLongGroups16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + + code.pshuflw(data, data, 0b00011011); + code.pshufhw(data, data, 0b00011011); + + ctx.reg_alloc.DefineValue(inst, data); +} + +void EmitX64::EmitVectorReverseElementsInLongGroups32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + + code.pshuflw(data, data, 0b01001110); + code.pshufhw(data, data, 0b01001110); + + ctx.reg_alloc.DefineValue(inst, data); +} + +void EmitX64::EmitVectorReduceAdd8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm temp = xmm0; + + // Add upper elements to lower elements + code.pshufd(temp, data, 0b01'00'11'10); + code.paddb(data, temp); + + // Add adjacent 8-bit values into 64-bit lanes + code.pxor(temp, temp); + code.psadbw(data, temp); + + // Zero-extend lower 8-bits + code.pslldq(data, 15); + code.psrldq(data, 15); + + ctx.reg_alloc.DefineValue(inst, data); +} + +void EmitX64::EmitVectorReduceAdd16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm temp = xmm0; + + if (code.HasHostFeature(HostFeature::SSSE3)) { + code.pxor(temp, temp); + code.phaddw(data, xmm0); + code.phaddw(data, xmm0); + code.phaddw(data, xmm0); + } else { + // Add upper elements to lower elements + code.pshufd(temp, data, 0b00'01'10'11); + code.paddw(data, temp); + + // Add pairs of 16-bit values into 32-bit lanes + code.movdqa(temp, code.Const(xword, 0x0001000100010001, 0x0001000100010001)); + code.pmaddwd(data, temp); + + // Sum adjacent 32-bit lanes + code.pshufd(temp, data, 0b10'11'00'01); + code.paddd(data, temp); + // Zero-extend lower 16-bits + code.pslldq(data, 14); + code.psrldq(data, 14); + } + + ctx.reg_alloc.DefineValue(inst, data); +} + +void EmitX64::EmitVectorReduceAdd32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm temp = xmm0; + + // Add upper elements to lower elements(reversed) + code.pshufd(temp, data, 0b00'01'10'11); + code.paddd(data, temp); + + // Sum adjacent 32-bit lanes + if (code.HasHostFeature(HostFeature::SSSE3)) { + code.phaddd(data, data); + } else { + code.pshufd(temp, data, 0b10'11'00'01); + code.paddd(data, temp); + } + + // shift upper-most result into lower-most lane + code.psrldq(data, 12); + + ctx.reg_alloc.DefineValue(inst, data); +} + +void EmitX64::EmitVectorReduceAdd64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm temp = xmm0; + + // Add upper elements to lower elements + code.pshufd(temp, data, 0b01'00'11'10); + code.paddq(data, temp); + + // Zero-extend lower 64-bits + code.movq(data, data); + + ctx.reg_alloc.DefineValue(inst, data); +} + +void EmitX64::EmitVectorRotateWholeVectorRight(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm operand = 
ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const u8 shift_amount = args[1].GetImmediateU8(); + ASSERT(shift_amount % 32 == 0); + const u8 shuffle_imm = std::rotr<u8>(0b11100100, shift_amount / 32 * 2); + + code.pshufd(result, operand, shuffle_imm); + + ctx.reg_alloc.DefineValue(inst, result); +} + +static void EmitVectorRoundingHalvingAddSigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); + + switch (esize) { + case 8: { + const Xbyak::Xmm vec_128 = ctx.reg_alloc.ScratchXmm(); + code.movdqa(vec_128, code.Const(xword, 0x8080808080808080, 0x8080808080808080)); + + code.paddb(a, vec_128); + code.paddb(b, vec_128); + code.pavgb(a, b); + code.paddb(a, vec_128); + break; + } + case 16: { + const Xbyak::Xmm vec_32768 = ctx.reg_alloc.ScratchXmm(); + code.movdqa(vec_32768, code.Const(xword, 0x8000800080008000, 0x8000800080008000)); + + code.paddw(a, vec_32768); + code.paddw(b, vec_32768); + code.pavgw(a, b); + code.paddw(a, vec_32768); + break; + } + case 32: { + const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(); + code.movdqa(tmp1, a); + + code.por(a, b); + code.psrad(tmp1, 1); + code.psrad(b, 1); + code.pslld(a, 31); + code.paddd(b, tmp1); + code.psrld(a, 31); + code.paddd(a, b); + break; + } + } + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorRoundingHalvingAddS8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorRoundingHalvingAddSigned(8, ctx, inst, code); +} + +void EmitX64::EmitVectorRoundingHalvingAddS16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorRoundingHalvingAddSigned(16, ctx, inst, code); +} + +void EmitX64::EmitVectorRoundingHalvingAddS32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorRoundingHalvingAddSigned(32, ctx, inst, code); +} + +static void EmitVectorRoundingHalvingAddUnsigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) { + switch (esize) { + case 8: + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pavgb); + return; + case 16: + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pavgw); + return; + case 32: { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(tmp1, a); + + code.por(a, b); + code.psrld(tmp1, 1); + code.psrld(b, 1); + code.pslld(a, 31); + code.paddd(b, tmp1); + code.psrld(a, 31); + code.paddd(a, b); + + ctx.reg_alloc.DefineValue(inst, a); + break; + } + } +} + +void EmitX64::EmitVectorRoundingHalvingAddU8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorRoundingHalvingAddUnsigned(8, ctx, inst, code); +} + +void EmitX64::EmitVectorRoundingHalvingAddU16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorRoundingHalvingAddUnsigned(16, ctx, inst, code); +} + +void EmitX64::EmitVectorRoundingHalvingAddU32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorRoundingHalvingAddUnsigned(32, ctx, inst, code); +} + +template<typename T, typename U> +static void RoundingShiftLeft(VectorArray<T>& out, const VectorArray<T>& lhs, const VectorArray<U>& rhs) { + using signed_type = std::make_signed_t<T>; + using unsigned_type = std::make_unsigned_t<T>; + + constexpr auto bit_size = static_cast<s64>(mcl::bitsizeof<T>); + + for (size_t i = 0; i < out.size(); i++) { + const s64 
extended_shift = static_cast<s64>(mcl::bit::sign_extend<8, u64>(rhs[i] & 0xFF)); + + if (extended_shift >= 0) { + if (extended_shift >= bit_size) { + out[i] = 0; + } else { + out[i] = static_cast<T>(static_cast<unsigned_type>(lhs[i]) << extended_shift); + } + } else { + if ((std::is_unsigned_v<T> && extended_shift < -bit_size) || (std::is_signed_v<T> && extended_shift <= -bit_size)) { + out[i] = 0; + } else { + const s64 shift_value = -extended_shift - 1; + const T shifted = (lhs[i] & (static_cast<signed_type>(1) << shift_value)) >> shift_value; + + if (extended_shift == -bit_size) { + out[i] = shifted; + } else { + out[i] = (lhs[i] >> -extended_shift) + shifted; + } + } + } + } +} + +void EmitX64::EmitVectorRoundingShiftLeftS8(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s8>& result, const VectorArray<s8>& lhs, const VectorArray<s8>& rhs) { + RoundingShiftLeft(result, lhs, rhs); + }); +} + +void EmitX64::EmitVectorRoundingShiftLeftS16(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s16>& result, const VectorArray<s16>& lhs, const VectorArray<s16>& rhs) { + RoundingShiftLeft(result, lhs, rhs); + }); +} + +void EmitX64::EmitVectorRoundingShiftLeftS32(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s32>& result, const VectorArray<s32>& lhs, const VectorArray<s32>& rhs) { + RoundingShiftLeft(result, lhs, rhs); + }); +} + +void EmitX64::EmitVectorRoundingShiftLeftS64(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& lhs, const VectorArray<s64>& rhs) { + RoundingShiftLeft(result, lhs, rhs); + }); +} + +void EmitX64::EmitVectorRoundingShiftLeftU8(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& lhs, const VectorArray<s8>& rhs) { + RoundingShiftLeft(result, lhs, rhs); + }); +} + +void EmitX64::EmitVectorRoundingShiftLeftU16(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u16>& lhs, const VectorArray<s16>& rhs) { + RoundingShiftLeft(result, lhs, rhs); + }); +} + +void EmitX64::EmitVectorRoundingShiftLeftU32(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& lhs, const VectorArray<s32>& rhs) { + RoundingShiftLeft(result, lhs, rhs); + }); +} + +void EmitX64::EmitVectorRoundingShiftLeftU64(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& lhs, const VectorArray<s64>& rhs) { + RoundingShiftLeft(result, lhs, rhs); + }); +} + +void EmitX64::EmitVectorSignExtend8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + if (code.HasHostFeature(HostFeature::SSE41)) { + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + code.pmovsxbw(a, a); + ctx.reg_alloc.DefineValue(inst, a); + } else { + const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + code.pxor(result, result); + code.punpcklbw(result, a); + code.psraw(result, 8); + ctx.reg_alloc.DefineValue(inst, result); + } +} + +void EmitX64::EmitVectorSignExtend16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + if (code.HasHostFeature(HostFeature::SSE41)) { + const Xbyak::Xmm a = 
ctx.reg_alloc.UseScratchXmm(args[0]); + code.pmovsxwd(a, a); + ctx.reg_alloc.DefineValue(inst, a); + } else { + const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + code.pxor(result, result); + code.punpcklwd(result, a); + code.psrad(result, 16); + ctx.reg_alloc.DefineValue(inst, result); + } +} + +void EmitX64::EmitVectorSignExtend32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + + if (code.HasHostFeature(HostFeature::SSE41)) { + code.pmovsxdq(a, a); + } else { + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + code.movaps(tmp, a); + code.psrad(tmp, 31); + code.punpckldq(a, tmp); + } + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorSignExtend64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Reg64 gpr_tmp = ctx.reg_alloc.ScratchGpr(); + + code.movq(gpr_tmp, data); + code.sar(gpr_tmp, 63); + + if (code.HasHostFeature(HostFeature::SSE41)) { + code.pinsrq(data, gpr_tmp, 1); + } else { + const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm(); + + code.movq(xmm_tmp, gpr_tmp); + code.punpcklqdq(data, xmm_tmp); + } + + ctx.reg_alloc.DefineValue(inst, data); +} + +static void EmitVectorSignedAbsoluteDifference(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + // only signed 16-bit min/max are available below SSE4.1 + if (code.HasHostFeature(HostFeature::SSE41) || esize == 16) { + code.movdqa(tmp, x); + + switch (esize) { + case 8: + code.pminsb(tmp, y); + code.pmaxsb(x, y); + code.psubb(x, tmp); + break; + case 16: + code.pminsw(tmp, y); + code.pmaxsw(x, y); + code.psubw(x, tmp); + break; + case 32: + code.pminsd(tmp, y); + code.pmaxsd(x, y); + code.psubd(x, tmp); + break; + default: + UNREACHABLE(); + } + } else { + code.movdqa(tmp, y); + + switch (esize) { + case 8: + code.pcmpgtb(tmp, x); + code.psubb(x, y); + code.pxor(x, tmp); + code.psubb(x, tmp); + break; + case 32: + code.pcmpgtd(tmp, x); + code.psubd(x, y); + code.pxor(x, tmp); + code.psubd(x, tmp); + break; + default: + UNREACHABLE(); + } + } + + ctx.reg_alloc.DefineValue(inst, x); +} + +void EmitX64::EmitVectorSignedAbsoluteDifference8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedAbsoluteDifference(8, ctx, inst, code); +} + +void EmitX64::EmitVectorSignedAbsoluteDifference16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedAbsoluteDifference(16, ctx, inst, code); +} + +void EmitX64::EmitVectorSignedAbsoluteDifference32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedAbsoluteDifference(32, ctx, inst, code); +} + +void EmitX64::EmitVectorSignedMultiply16(EmitContext& ctx, IR::Inst* inst) { + const auto upper_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetUpperFromOp); + const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); + + if (upper_inst) { + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpmulhw(result, x, y); + } 
else { + code.movdqa(result, x); + code.pmulhw(result, y); + } + + ctx.reg_alloc.DefineValue(upper_inst, result); + } + + if (lower_inst) { + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpmullw(result, x, y); + } else { + code.movdqa(result, x); + code.pmullw(result, y); + } + ctx.reg_alloc.DefineValue(lower_inst, result); + } +} + +void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) { + const auto upper_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetUpperFromOp); + const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (lower_inst && !upper_inst && code.HasHostFeature(HostFeature::AVX)) { + const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + code.vpmulld(result, x, y); + + ctx.reg_alloc.DefineValue(lower_inst, result); + return; + } + + if (code.HasHostFeature(HostFeature::AVX)) { + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); + + if (lower_inst) { + const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm(); + code.vpmulld(lower_result, x, y); + ctx.reg_alloc.DefineValue(lower_inst, lower_result); + } + + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + code.vpmuldq(result, x, y); + code.vpsrlq(x, x, 32); + code.vpsrlq(y, y, 32); + code.vpmuldq(x, x, y); + code.shufps(result, x, 0b11011101); + + ctx.reg_alloc.DefineValue(upper_inst, result); + return; + } + + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm sign_correction = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm upper_result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm(); + + // calculate sign correction + code.movdqa(tmp, x); + code.movdqa(sign_correction, y); + code.psrad(tmp, 31); + code.psrad(sign_correction, 31); + code.pand(tmp, y); + code.pand(sign_correction, x); + code.paddd(sign_correction, tmp); + code.pand(sign_correction, code.Const(xword, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF)); + + // calculate unsigned multiply + code.movdqa(tmp, x); + code.pmuludq(tmp, y); + code.psrlq(x, 32); + code.psrlq(y, 32); + code.pmuludq(x, y); + + // put everything into place + code.pcmpeqw(upper_result, upper_result); + code.pcmpeqw(lower_result, lower_result); + code.psllq(upper_result, 32); + code.psrlq(lower_result, 32); + code.pand(upper_result, x); + code.pand(lower_result, tmp); + code.psrlq(tmp, 32); + code.psllq(x, 32); + code.por(upper_result, tmp); + code.por(lower_result, x); + code.psubd(upper_result, sign_correction); + + if (upper_inst) { + ctx.reg_alloc.DefineValue(upper_inst, upper_result); + } + if (lower_inst) { + ctx.reg_alloc.DefineValue(lower_inst, lower_result); + } +} + +static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm data_test = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm sign = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Address mask = [esize, &code] { + switch (esize) { + case 8: + return code.Const(xword, 0x8080808080808080, 0x8080808080808080); + 
case 16: + return code.Const(xword, 0x8000800080008000, 0x8000800080008000); + case 32: + return code.Const(xword, 0x8000000080000000, 0x8000000080000000); + case 64: + return code.Const(xword, 0x8000000000000000, 0x8000000000000000); + default: + UNREACHABLE(); + } + }(); + + const auto vector_equality = [esize, &code](const Xbyak::Xmm& x, const Xbyak::Xmm& y) { + switch (esize) { + case 8: + code.pcmpeqb(x, y); + break; + case 16: + code.pcmpeqw(x, y); + break; + case 32: + code.pcmpeqd(x, y); + break; + case 64: + code.pcmpeqq(x, y); + break; + } + }; + + // Keep a copy of the initial data for determining whether or not + // to set the Q flag + code.movdqa(data_test, data); + + switch (esize) { + case 8: + VectorAbs8(code, ctx, data); + break; + case 16: + VectorAbs16(code, ctx, data); + break; + case 32: + VectorAbs32(code, ctx, data); + break; + case 64: + VectorAbs64(code, ctx, data); + break; + } + + code.movdqa(sign, mask); + vector_equality(sign, data); + code.pxor(data, sign); + + // Check if the initial data contained any elements with the value 0x80. + // If any exist, then the Q flag needs to be set. + const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32(); + code.movdqa(sign, mask); + vector_equality(data_test, sign); + code.pmovmskb(bit, data_test); + code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit); + + ctx.reg_alloc.DefineValue(inst, data); +} + +void EmitX64::EmitVectorSignedSaturatedAbs8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedSaturatedAbs(8, code, ctx, inst); +} + +void EmitX64::EmitVectorSignedSaturatedAbs16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedSaturatedAbs(16, code, ctx, inst); +} + +void EmitX64::EmitVectorSignedSaturatedAbs32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedSaturatedAbs(32, code, ctx, inst); +} + +void EmitX64::EmitVectorSignedSaturatedAbs64(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorSignedSaturatedAbs(64, code, ctx, inst); + return; + } + + EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& data) { + bool qc_flag = false; + + for (size_t i = 0; i < result.size(); i++) { + if (static_cast<u64>(data[i]) == 0x8000000000000000) { + result[i] = 0x7FFFFFFFFFFFFFFF; + qc_flag = true; + } else { + result[i] = std::abs(data[i]); + } + } + + return qc_flag; + }); +} + +template<size_t bit_width> +static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); + code.movdqa(xmm0, y); + ctx.reg_alloc.Release(y); + + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + switch (bit_width) { + case 8: + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpaddb(result, x, xmm0); + } else { + code.movdqa(result, x); + code.paddb(result, xmm0); + } + break; + case 16: + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpaddw(result, x, xmm0); + } else { + code.movdqa(result, x); + code.paddw(result, xmm0); + } + break; + case 32: + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpaddd(result, x, xmm0); + } else { + code.movdqa(result, x); + code.paddd(result, xmm0); + } + break; + case 64: + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpaddq(result, x, xmm0); + } else { + code.movdqa(result, x); + 
code.paddq(result, xmm0); + } + break; + } + + if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { + // xmm0 = majority(~y, x, res) + code.vpternlogd(xmm0, x, result, 0b10001110); + } else if (code.HasHostFeature(HostFeature::AVX)) { + code.vpor(tmp, x, result); + code.pand(x, result); + code.vpblendvb(xmm0, tmp, x, xmm0); + } else { + code.movdqa(tmp, x); + code.pxor(x, result); + code.pand(tmp, result); + code.pandn(xmm0, x); + code.por(xmm0, tmp); + } + + ctx.reg_alloc.Release(x); + + switch (bit_width) { + case 8: + if (code.HasHostFeature(HostFeature::AVX)) { + const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(); + code.pcmpeqb(tmp2, tmp2); + code.pxor(tmp, tmp); + code.vpblendvb(xmm0, tmp, tmp2, xmm0); + ctx.reg_alloc.Release(tmp2); + } else { + code.pand(xmm0, code.Const(xword, 0x8080808080808080, 0x8080808080808080)); + code.movdqa(tmp, xmm0); + code.psrlw(tmp, 7); + code.pxor(xmm0, xmm0); + code.psubb(xmm0, tmp); + } + break; + case 16: + code.psraw(xmm0, 15); + break; + case 32: + code.psrad(xmm0, 31); + break; + case 64: + if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { + code.vpsraq(xmm0, xmm0, 63); + } else { + code.psrad(xmm0, 31); + code.pshufd(xmm0, xmm0, 0b11110101); + } + break; + } + + code.movdqa(tmp, xmm0); + switch (bit_width) { + case 8: + code.paddb(tmp, tmp); + code.psrlw(tmp, 1); + break; + case 16: + code.psrlw(tmp, 1); + break; + case 32: + code.psrld(tmp, 1); + break; + case 64: + code.psrlq(tmp, 1); + break; + } + + const Xbyak::Reg32 mask = ctx.reg_alloc.ScratchGpr().cvt32(); + code.pmovmskb(mask, xmm0); + code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], mask); + + if (code.HasHostFeature(HostFeature::SSE41)) { + code.pblendvb(result, tmp); + } else { + code.pandn(xmm0, result); + code.por(xmm0, tmp); + code.movdqa(result, xmm0); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedSaturatedAccumulateUnsigned<8>(code, ctx, inst); +} + +void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedSaturatedAccumulateUnsigned<16>(code, ctx, inst); +} + +void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedSaturatedAccumulateUnsigned<32>(code, ctx, inst); +} + +void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned64(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedSaturatedAccumulateUnsigned<64>(code, ctx, inst); +} + +template<bool is_rounding> +static void EmitVectorSignedSaturatedDoublingMultiply16(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm upper_tmp = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm lower_tmp = ctx.reg_alloc.ScratchXmm(); + + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpmulhw(upper_tmp, x, y); + } else { + code.movdqa(upper_tmp, x); + code.pmulhw(upper_tmp, y); + } + + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpmullw(lower_tmp, x, y); + } else { + code.movdqa(lower_tmp, x); + code.pmullw(lower_tmp, y); + } + + ctx.reg_alloc.Release(x); + ctx.reg_alloc.Release(y); + + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + if (code.HasHostFeature(HostFeature::AVX)) { + if constexpr (is_rounding) { + code.vpsrlw(lower_tmp, lower_tmp, 14); + code.vpaddw(lower_tmp, lower_tmp, 
code.Const(xword, 0x0001000100010001, 0x0001000100010001)); + code.vpsrlw(lower_tmp, lower_tmp, 1); + } else { + code.vpsrlw(lower_tmp, lower_tmp, 15); + } + code.vpaddw(upper_tmp, upper_tmp, upper_tmp); + code.vpaddw(result, upper_tmp, lower_tmp); + code.vpcmpeqw(upper_tmp, result, code.Const(xword, 0x8000800080008000, 0x8000800080008000)); + code.vpxor(result, result, upper_tmp); + } else { + code.paddw(upper_tmp, upper_tmp); + if constexpr (is_rounding) { + code.psrlw(lower_tmp, 14); + code.paddw(lower_tmp, code.Const(xword, 0x0001000100010001, 0x0001000100010001)); + code.psrlw(lower_tmp, 1); + } else { + code.psrlw(lower_tmp, 15); + } + code.movdqa(result, upper_tmp); + code.paddw(result, lower_tmp); + code.movdqa(upper_tmp, code.Const(xword, 0x8000800080008000, 0x8000800080008000)); + code.pcmpeqw(upper_tmp, result); + code.pxor(result, upper_tmp); + } + + const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32(); + code.pmovmskb(bit, upper_tmp); + code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyHigh16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedSaturatedDoublingMultiply16<false>(code, ctx, inst); +} + +void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyHighRounding16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedSaturatedDoublingMultiply16<true>(code, ctx, inst); +} + +template<bool is_rounding> +void EmitVectorSignedSaturatedDoublingMultiply32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (code.HasHostFeature(HostFeature::AVX)) { + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm odds = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm even = ctx.reg_alloc.ScratchXmm(); + + code.vpmuldq(odds, x, y); + code.vpsrlq(x, x, 32); + code.vpsrlq(y, y, 32); + code.vpmuldq(even, x, y); + + ctx.reg_alloc.Release(x); + ctx.reg_alloc.Release(y); + + code.vpaddq(odds, odds, odds); + code.vpaddq(even, even, even); + + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + if constexpr (is_rounding) { + code.vmovdqa(result, code.Const(xword, 0x0000000080000000, 0x0000000080000000)); + code.vpaddq(odds, odds, result); + code.vpaddq(even, even, result); + } + + code.vpsrlq(result, odds, 32); + code.vblendps(result, result, even, 0b1010); + + const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32(); + + code.vpcmpeqd(mask, result, code.Const(xword, 0x8000000080000000, 0x8000000080000000)); + code.vpxor(result, result, mask); + code.pmovmskb(bit, mask); + code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit); + + ctx.reg_alloc.Release(mask); + ctx.reg_alloc.Release(bit); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm sign_correction = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + // calculate sign correction + code.movdqa(tmp, x); + code.movdqa(sign_correction, y); + code.psrad(tmp, 31); + code.psrad(sign_correction, 31); + code.pand(tmp, y); + code.pand(sign_correction, x); + code.paddd(sign_correction, tmp); + code.pslld(sign_correction, 1); + + // unsigned 
multiply + code.movdqa(tmp, x); + code.pmuludq(tmp, y); + code.psrlq(x, 32); + code.psrlq(y, 32); + code.pmuludq(x, y); + + // double + code.paddq(tmp, tmp); + code.paddq(x, x); + + if constexpr (is_rounding) { + code.movdqa(result, code.Const(xword, 0x0000000080000000, 0x0000000080000000)); + code.paddq(tmp, result); + code.paddq(x, result); + } + + // put everything into place + code.pcmpeqw(result, result); + code.psllq(result, 32); + code.pand(result, x); + code.psrlq(tmp, 32); + code.por(result, tmp); + code.psubd(result, sign_correction); + + const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32(); + + code.movdqa(tmp, code.Const(xword, 0x8000000080000000, 0x8000000080000000)); + code.pcmpeqd(tmp, result); + code.pxor(result, tmp); + code.pmovmskb(bit, tmp); + code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyHigh32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedSaturatedDoublingMultiply32<false>(code, ctx, inst); +} + +void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyHighRounding32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedSaturatedDoublingMultiply32<true>(code, ctx, inst); +} + +void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); + + code.punpcklwd(x, x); + code.punpcklwd(y, y); + code.pmaddwd(x, y); + + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpcmpeqd(y, x, code.Const(xword, 0x8000000080000000, 0x8000000080000000)); + code.vpxor(x, x, y); + } else { + code.movdqa(y, code.Const(xword, 0x8000000080000000, 0x8000000080000000)); + code.pcmpeqd(y, x); + code.pxor(x, y); + } + + const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32(); + code.pmovmskb(bit, y); + code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit); + + ctx.reg_alloc.DefineValue(inst, x); +} + +void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); + + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpmovsxdq(x, x); + code.vpmovsxdq(y, y); + code.vpmuldq(x, x, y); + code.vpaddq(x, x, x); + } else { + const Xbyak::Reg64 a = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 b = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 c = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 d = ctx.reg_alloc.ScratchGpr(); + + code.movq(c, x); + code.movq(d, y); + code.movsxd(a, c.cvt32()); + code.movsxd(b, d.cvt32()); + code.sar(c, 32); + code.sar(d, 32); + code.imul(a, b); + code.imul(c, d); + code.movq(x, a); + code.movq(y, c); + code.punpcklqdq(x, y); + code.paddq(x, x); + + ctx.reg_alloc.Release(a); + ctx.reg_alloc.Release(b); + ctx.reg_alloc.Release(c); + ctx.reg_alloc.Release(d); + } + + const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32(); + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpcmpeqq(y, x, code.Const(xword, 0x8000000000000000, 0x8000000000000000)); + code.vpxor(x, x, y); + code.vpmovmskb(bit, y); + } else { + code.movdqa(y, code.Const(xword, 0x8000000000000000, 0x8000000000000000)); + code.pcmpeqd(y, x); + code.shufps(y, y, 0b11110101); + code.pxor(x, y); + 
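// Only 0x80000000 * 0x80000000 can saturate this doubling multiply: its doubled product wraps to
+ // the bit pattern 0x8000000000000000, and no other attainable doubled product has 0x80000000 in its
+ // high dword, which is why the shufps above only broadcasts the high-dword comparison across each
+ // 64-bit lane. The pxor has already flipped such lanes to 0x7FFFFFFFFFFFFFFF; the same comparison
+ // mask now also feeds the sticky QC flag. + 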
code.pmovmskb(bit, y); + } + code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit); + + ctx.reg_alloc.DefineValue(inst, x); +} + +static void EmitVectorSignedSaturatedNarrowToSigned(size_t original_esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm src = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm dest = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm reconstructed = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm sign = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(dest, src); + code.pxor(xmm0, xmm0); + + switch (original_esize) { + case 16: + code.packsswb(dest, xmm0); + code.movdqa(sign, src); + code.psraw(sign, 15); + code.packsswb(sign, sign); + code.movdqa(reconstructed, dest); + code.punpcklbw(reconstructed, sign); + break; + case 32: + code.packssdw(dest, xmm0); + code.movdqa(reconstructed, dest); + code.movdqa(sign, dest); + code.psraw(sign, 15); + code.punpcklwd(reconstructed, sign); + break; + default: + UNREACHABLE(); + } + + const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32(); + code.pcmpeqd(reconstructed, src); + code.movmskps(bit, reconstructed); + code.xor_(bit, 0b1111); + code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit); + + ctx.reg_alloc.DefineValue(inst, dest); +} + +void EmitX64::EmitVectorSignedSaturatedNarrowToSigned16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedSaturatedNarrowToSigned(16, code, ctx, inst); +} + +void EmitX64::EmitVectorSignedSaturatedNarrowToSigned32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedSaturatedNarrowToSigned(32, code, ctx, inst); +} + +void EmitX64::EmitVectorSignedSaturatedNarrowToSigned64(EmitContext& ctx, IR::Inst* inst) { + EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<s32>& result, const VectorArray<s64>& a) { + result = {}; + bool qc_flag = false; + for (size_t i = 0; i < a.size(); ++i) { + const s64 saturated = std::clamp<s64>(a[i], -s64(0x80000000), s64(0x7FFFFFFF)); + result[i] = static_cast<s32>(saturated); + qc_flag |= saturated != a[i]; + } + return qc_flag; + }); +} + +static void EmitVectorSignedSaturatedNarrowToUnsigned(size_t original_esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm src = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm dest = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm reconstructed = ctx.reg_alloc.ScratchXmm(); + + code.movdqa(dest, src); + code.pxor(xmm0, xmm0); + + switch (original_esize) { + case 16: + code.packuswb(dest, xmm0); + code.movdqa(reconstructed, dest); + code.punpcklbw(reconstructed, xmm0); + break; + case 32: + ASSERT(code.HasHostFeature(HostFeature::SSE41)); + code.packusdw(dest, xmm0); // SSE4.1 + code.movdqa(reconstructed, dest); + code.punpcklwd(reconstructed, xmm0); + break; + default: + UNREACHABLE(); + } + + const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32(); + code.pcmpeqd(reconstructed, src); + code.movmskps(bit, reconstructed); + code.xor_(bit, 0b1111); + code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit); + + ctx.reg_alloc.DefineValue(inst, dest); +} + +void EmitX64::EmitVectorSignedSaturatedNarrowToUnsigned16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedSaturatedNarrowToUnsigned(16, code, ctx, inst); +} + +void EmitX64::EmitVectorSignedSaturatedNarrowToUnsigned32(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + 
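// The shared helper narrows 32 -> 16 with packusdw, an SSE4.1 instruction, so this path is gated on
+ // SSE4.1; without it, the scalar fallback below performs the clamp instead. + 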
EmitVectorSignedSaturatedNarrowToUnsigned(32, code, ctx, inst); + return; + } + + EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<s32>& a) { + result = {}; + bool qc_flag = false; + for (size_t i = 0; i < a.size(); ++i) { + const s32 saturated = std::clamp<s32>(a[i], 0, 0xFFFF); + result[i] = static_cast<u16>(saturated); + qc_flag |= saturated != a[i]; + } + return qc_flag; + }); +} + +void EmitX64::EmitVectorSignedSaturatedNarrowToUnsigned64(EmitContext& ctx, IR::Inst* inst) { + EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<s64>& a) { + result = {}; + bool qc_flag = false; + for (size_t i = 0; i < a.size(); ++i) { + const s64 saturated = std::clamp<s64>(a[i], 0, 0xFFFFFFFF); + result[i] = static_cast<u32>(saturated); + qc_flag |= saturated != a[i]; + } + return qc_flag; + }); +} + +static void EmitVectorSignedSaturatedNeg(size_t esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm data = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Address mask = [esize, &code] { + switch (esize) { + case 8: + return code.Const(xword, 0x8080808080808080, 0x8080808080808080); + case 16: + return code.Const(xword, 0x8000800080008000, 0x8000800080008000); + case 32: + return code.Const(xword, 0x8000000080000000, 0x8000000080000000); + case 64: + return code.Const(xword, 0x8000000000000000, 0x8000000000000000); + default: + UNREACHABLE(); + } + }(); + + const auto vector_equality = [esize, &code](const Xbyak::Xmm& x, const auto& y) { + switch (esize) { + case 8: + code.pcmpeqb(x, y); + break; + case 16: + code.pcmpeqw(x, y); + break; + case 32: + code.pcmpeqd(x, y); + break; + case 64: + code.pcmpeqq(x, y); + break; + } + }; + + code.movdqa(tmp, data); + vector_equality(tmp, mask); + + // Perform negation + code.pxor(zero, zero); + switch (esize) { + case 8: + code.psubsb(zero, data); + break; + case 16: + code.psubsw(zero, data); + break; + case 32: + code.psubd(zero, data); + code.pxor(zero, tmp); + break; + case 64: + code.psubq(zero, data); + code.pxor(zero, tmp); + break; + } + + // Check if any elements matched the mask prior to performing saturation. If so, set the Q bit. 
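+ // tmp still holds the equality mask against the minimum signed value for this esize (e.g. 0x80 for
+ // esize == 8, which the saturating psubsb above turns into +127). pmovmskb collapses that mask into
+ // a byte bitmask, and OR-ing any non-zero value into fpsr_qc keeps the QC flag sticky.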
+ const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32(); + code.pmovmskb(bit, tmp); + code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit); + + ctx.reg_alloc.DefineValue(inst, zero); +} + +void EmitX64::EmitVectorSignedSaturatedNeg8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedSaturatedNeg(8, code, ctx, inst); +} + +void EmitX64::EmitVectorSignedSaturatedNeg16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedSaturatedNeg(16, code, ctx, inst); +} + +void EmitX64::EmitVectorSignedSaturatedNeg32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedSaturatedNeg(32, code, ctx, inst); +} + +void EmitX64::EmitVectorSignedSaturatedNeg64(EmitContext& ctx, IR::Inst* inst) { + if (code.HasHostFeature(HostFeature::SSE41)) { + EmitVectorSignedSaturatedNeg(64, code, ctx, inst); + return; + } + + EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& data) { + bool qc_flag = false; + + for (size_t i = 0; i < result.size(); i++) { + if (static_cast<u64>(data[i]) == 0x8000000000000000) { + result[i] = 0x7FFFFFFFFFFFFFFF; + qc_flag = true; + } else { + result[i] = -data[i]; + } + } + + return qc_flag; + }); +} + +// MSVC requires the capture within the saturate lambda, but it's +// determined to be unnecessary via clang and GCC. +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wunused-lambda-capture" +#endif +template<typename T, typename U = std::make_unsigned_t<T>> +static bool VectorSignedSaturatedShiftLeft(VectorArray<T>& dst, const VectorArray<T>& data, const VectorArray<T>& shift_values) { + static_assert(std::is_signed_v<T>, "T must be signed."); + + bool qc_flag = false; + + constexpr size_t bit_size_minus_one = mcl::bitsizeof<T> - 1; + + const auto saturate = [bit_size_minus_one](T value) { + return static_cast<T>((static_cast<U>(value) >> bit_size_minus_one) + (U{1} << bit_size_minus_one) - 1); + }; + + for (size_t i = 0; i < dst.size(); i++) { + const T element = data[i]; + const T shift = std::clamp<T>(static_cast<T>(mcl::bit::sign_extend<8>(static_cast<U>(shift_values[i] & 0xFF))), + -static_cast<T>(bit_size_minus_one), std::numeric_limits<T>::max()); + + if (element == 0) { + dst[i] = 0; + } else if (shift < 0) { + dst[i] = static_cast<T>(element >> -shift); + } else if (static_cast<U>(shift) > bit_size_minus_one) { + dst[i] = saturate(element); + qc_flag = true; + } else { + const T shifted = T(U(element) << shift); + + if ((shifted >> shift) != element) { + dst[i] = saturate(element); + qc_flag = true; + } else { + dst[i] = shifted; + } + } + } + + return qc_flag; +} +#ifdef __clang__ +# pragma clang diagnostic pop +#endif + +void EmitX64::EmitVectorSignedSaturatedShiftLeft8(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, VectorSignedSaturatedShiftLeft<s8>); +} + +void EmitX64::EmitVectorSignedSaturatedShiftLeft16(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, VectorSignedSaturatedShiftLeft<s16>); +} + +void EmitX64::EmitVectorSignedSaturatedShiftLeft32(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, VectorSignedSaturatedShiftLeft<s32>); +} + +void EmitX64::EmitVectorSignedSaturatedShiftLeft64(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, VectorSignedSaturatedShiftLeft<s64>); +} + +template<typename T, typename U = std::make_unsigned_t<T>> +static bool 
VectorSignedSaturatedShiftLeftUnsigned(VectorArray<T>& dst, const VectorArray<T>& data, u8 shift_amount) { + static_assert(std::is_signed_v<T>, "T must be signed."); + + bool qc_flag = false; + for (size_t i = 0; i < dst.size(); i++) { + const T element = data[i]; + const T shift = static_cast<T>(shift_amount); + + if (element == 0) { + dst[i] = 0; + } else if (element < 0) { + dst[i] = 0; + qc_flag = true; + } else { + const U shifted = static_cast<U>(element) << static_cast<U>(shift); + const U shifted_test = shifted >> static_cast<U>(shift); + + if (shifted_test != static_cast<U>(element)) { + dst[i] = static_cast<T>(std::numeric_limits<U>::max()); + qc_flag = true; + } else { + dst[i] = shifted; + } + } + } + + return qc_flag; +} + +void EmitX64::EmitVectorSignedSaturatedShiftLeftUnsigned8(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallbackWithSaturationAndImmediate(code, ctx, inst, VectorSignedSaturatedShiftLeftUnsigned<s8>); +} + +void EmitX64::EmitVectorSignedSaturatedShiftLeftUnsigned16(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallbackWithSaturationAndImmediate(code, ctx, inst, VectorSignedSaturatedShiftLeftUnsigned<s16>); +} + +void EmitX64::EmitVectorSignedSaturatedShiftLeftUnsigned32(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallbackWithSaturationAndImmediate(code, ctx, inst, VectorSignedSaturatedShiftLeftUnsigned<s32>); +} + +void EmitX64::EmitVectorSignedSaturatedShiftLeftUnsigned64(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallbackWithSaturationAndImmediate(code, ctx, inst, VectorSignedSaturatedShiftLeftUnsigned<s64>); +} + +void EmitX64::EmitVectorSub8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubb); +} + +void EmitX64::EmitVectorSub16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubw); +} + +void EmitX64::EmitVectorSub32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubd); +} + +void EmitX64::EmitVectorSub64(EmitContext& ctx, IR::Inst* inst) { + EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::psubq); +} + +void EmitX64::EmitVectorTable(EmitContext&, IR::Inst* inst) { + // Do nothing. We *want* to hold on to the refcount for our arguments, so VectorTableLookup can use our arguments. + ASSERT_MSG(inst->UseCount() == 1, "Table cannot be used multiple times"); +} + +void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) { + ASSERT(inst->GetArg(1).GetInst()->GetOpcode() == IR::Opcode::VectorTable); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto table = ctx.reg_alloc.GetArgumentInfo(inst->GetArg(1).GetInst()); + + const size_t table_size = std::count_if(table.begin(), table.end(), [](const auto& elem) { return !elem.IsVoid(); }); + const bool is_defaults_zero = inst->GetArg(0).IsZero(); + + if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI)) { + const Xbyak::Xmm indicies = table_size <= 2 ? 
ctx.reg_alloc.UseXmm(args[2]) : ctx.reg_alloc.UseScratchXmm(args[2]); + + const u64 index_count = mcl::bit::replicate_element<u8, u64>(static_cast<u8>(table_size * 8)); + + code.vpcmpub(k1, indicies, code.Const(xword, index_count, 0), CmpInt::LessThan); + + switch (table_size) { + case 1: { + const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(table[0]); + if (is_defaults_zero) { + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + code.vpermb(result | k1 | T_z, indicies, xmm_table0); + ctx.reg_alloc.DefineValue(inst, result); + } else { + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + code.vpermb(result | k1, indicies, xmm_table0); + ctx.reg_alloc.DefineValue(inst, result); + } + break; + } + case 2: { + const Xbyak::Xmm xmm_table0_lower = ctx.reg_alloc.UseXmm(table[0]); + const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]); + code.vpunpcklqdq(xmm0, xmm_table0_lower, xmm_table0_upper); + if (is_defaults_zero) { + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + code.vpermb(result | k1 | T_z, indicies, xmm0); + ctx.reg_alloc.DefineValue(inst, result); + } else { + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + code.vpermb(result | k1, indicies, xmm0); + ctx.reg_alloc.DefineValue(inst, result); + } + break; + } + case 3: { + const Xbyak::Xmm xmm_table0_lower = ctx.reg_alloc.UseXmm(table[0]); + const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]); + const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(table[2]); + code.vpunpcklqdq(xmm0, xmm_table0_lower, xmm_table0_upper); + if (is_defaults_zero) { + code.vpermi2b(indicies | k1 | T_z, xmm0, xmm_table1); + ctx.reg_alloc.DefineValue(inst, indicies); + } else { + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + code.vpermi2b(indicies, xmm0, xmm_table1); + code.vmovdqu8(result | k1, indicies); + ctx.reg_alloc.DefineValue(inst, result); + } + break; + } + case 4: { + const Xbyak::Xmm xmm_table0_lower = ctx.reg_alloc.UseXmm(table[0]); + const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]); + const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[2]); + const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(table[3]); + code.vpunpcklqdq(xmm0, xmm_table0_lower, xmm_table0_upper); + code.vpunpcklqdq(xmm_table1, xmm_table1, xmm_table1_upper); + if (is_defaults_zero) { + code.vpermi2b(indicies | k1 | T_z, xmm0, xmm_table1); + ctx.reg_alloc.DefineValue(inst, indicies); + } else { + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + code.vpermi2b(indicies, xmm0, xmm_table1); + code.vmovdqu8(result | k1, indicies); + ctx.reg_alloc.DefineValue(inst, result); + } + break; + } + default: + UNREACHABLE(); + break; + } + return; + } + + const std::array<u64, 5> sat_const{ + 0, + 0x7878787878787878, + 0x7070707070707070, + 0x6868686868686868, + 0x6060606060606060, + }; + + if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 1) { + const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]); + const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(table[0]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + code.xorps(result, result); + code.movsd(result, xmm_table0); + code.paddusb(indicies, code.Const(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF)); + code.pshufb(result, indicies); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 2) { + const Xbyak::Xmm indicies = 
ctx.reg_alloc.UseScratchXmm(args[2]); + const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]); + const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]); + + code.punpcklqdq(xmm_table0, xmm_table0_upper); + code.paddusb(indicies, code.Const(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF)); + code.pshufb(xmm_table0, indicies); + + ctx.reg_alloc.DefineValue(inst, xmm_table0); + return; + } + + if (code.HasHostFeature(HostFeature::SSE41) && table_size <= 2) { + const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]); + const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]); + + if (table_size == 2) { + const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]); + code.punpcklqdq(xmm_table0, xmm_table0_upper); + ctx.reg_alloc.Release(xmm_table0_upper); + } + + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpaddusb(xmm0, indicies, code.Const(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF)); + } else { + code.movaps(xmm0, indicies); + code.paddusb(xmm0, code.Const(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF)); + } + code.pshufb(xmm_table0, indicies); + code.pblendvb(xmm_table0, defaults); + + ctx.reg_alloc.DefineValue(inst, xmm_table0); + return; + } + + if (code.HasHostFeature(HostFeature::SSE41) && is_defaults_zero) { + const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]); + const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]); + const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[2]); + + { + const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]); + code.punpcklqdq(xmm_table0, xmm_table0_upper); + ctx.reg_alloc.Release(xmm_table0_upper); + } + if (table_size == 3) { + code.pxor(xmm0, xmm0); + code.punpcklqdq(xmm_table1, xmm0); + } else { + ASSERT(table_size == 4); + const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(table[3]); + code.punpcklqdq(xmm_table1, xmm_table1_upper); + ctx.reg_alloc.Release(xmm_table1_upper); + } + + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpaddusb(xmm0, indicies, code.Const(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF)); + } else { + code.movaps(xmm0, indicies); + code.paddusb(xmm0, code.Const(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF)); + } + code.paddusb(indicies, code.Const(xword, 0x6060606060606060, 0xFFFFFFFFFFFFFFFF)); + code.pshufb(xmm_table0, xmm0); + code.pshufb(xmm_table1, indicies); + code.pblendvb(xmm_table0, xmm_table1); + + ctx.reg_alloc.DefineValue(inst, xmm_table0); + return; + } + + if (code.HasHostFeature(HostFeature::SSE41)) { + const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]); + const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]); + const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[2]); + + { + const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(table[1]); + code.punpcklqdq(xmm_table0, xmm_table0_upper); + ctx.reg_alloc.Release(xmm_table0_upper); + } + if (table_size == 4) { + const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(table[3]); + code.punpcklqdq(xmm_table1, xmm_table1_upper); + ctx.reg_alloc.Release(xmm_table1_upper); + } + + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpaddusb(xmm0, indicies, code.Const(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF)); + } else { + code.movaps(xmm0, indicies); + code.paddusb(xmm0, code.Const(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF)); + } + code.pshufb(xmm_table0, indicies); + 
code.pshufb(xmm_table1, indicies); + code.pblendvb(xmm_table0, xmm_table1); + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpaddusb(xmm0, indicies, code.Const(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF)); + } else { + code.movaps(xmm0, indicies); + code.paddusb(xmm0, code.Const(xword, sat_const[table_size], 0xFFFFFFFFFFFFFFFF)); + } + code.pblendvb(xmm_table0, defaults); + + ctx.reg_alloc.DefineValue(inst, xmm_table0); + return; + } + + const u32 stack_space = static_cast<u32>(6 * 8); + ctx.reg_alloc.AllocStackSpace(stack_space + ABI_SHADOW_SPACE); + for (size_t i = 0; i < table_size; ++i) { + const Xbyak::Xmm table_value = ctx.reg_alloc.UseXmm(table[i]); + code.movq(qword[rsp + ABI_SHADOW_SPACE + i * 8], table_value); + ctx.reg_alloc.Release(table_value); + } + const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + ctx.reg_alloc.EndOfAllocScope(); + ctx.reg_alloc.HostCall(nullptr); + + code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]); + code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 4 * 8]); + code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 5 * 8]); + code.mov(code.ABI_PARAM4.cvt32(), table_size); + code.movq(qword[code.ABI_PARAM2], defaults); + code.movq(qword[code.ABI_PARAM3], indicies); + + code.CallLambda( + [](const HalfVectorArray<u8>* table, HalfVectorArray<u8>& result, const HalfVectorArray<u8>& indicies, size_t table_size) { + for (size_t i = 0; i < result.size(); ++i) { + const size_t index = indicies[i] / table[0].size(); + const size_t elem = indicies[i] % table[0].size(); + if (index < table_size) { + result[i] = table[index][elem]; + } + } + }); + + code.movq(result, qword[rsp + ABI_SHADOW_SPACE + 4 * 8]); + ctx.reg_alloc.ReleaseStackSpace(stack_space + ABI_SHADOW_SPACE); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { + ASSERT(inst->GetArg(1).GetInst()->GetOpcode() == IR::Opcode::VectorTable); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto table = ctx.reg_alloc.GetArgumentInfo(inst->GetArg(1).GetInst()); + + const size_t table_size = std::count_if(table.begin(), table.end(), [](const auto& elem) { return !elem.IsVoid(); }); + const bool is_defaults_zero = !inst->GetArg(0).IsImmediate() && inst->GetArg(0).GetInst()->GetOpcode() == IR::Opcode::ZeroVector; + + if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 4) { + const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]); + + code.vpcmpub(k1, indicies, code.BConst<8>(xword, 2 * 16), CmpInt::LessThan); + code.vpcmpub(k2, indicies, code.BConst<8>(xword, 4 * 16), CmpInt::LessThan); + + // Handle vector-table 0,1 + const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(table[0]); + const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(table[1]); + + code.vpermi2b(indicies | k1, xmm_table0, xmm_table1); + + ctx.reg_alloc.Release(xmm_table0); + ctx.reg_alloc.Release(xmm_table1); + + // Handle vector-table 2,3 + const Xbyak::Xmm xmm_table2 = ctx.reg_alloc.UseXmm(table[2]); + const Xbyak::Xmm xmm_table3 = ctx.reg_alloc.UseXmm(table[3]); + + code.kandnw(k1, k1, k2); + code.vpermi2b(indicies | k1, xmm_table2, xmm_table3); + + if (is_defaults_zero) { + code.vmovdqu8(indicies | k2 | T_z, indicies); + ctx.reg_alloc.DefineValue(inst, indicies); + } else { + const Xbyak::Xmm defaults = ctx.reg_alloc.UseScratchXmm(args[0]); + 
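// k2 is set exactly for lanes whose index is in range (less than 4 * 16 bytes); the merge-masked
+ // vmovdqu8 below copies the looked-up bytes into the caller's defaults only for those lanes and
+ // leaves the default bytes untouched everywhere else. + 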
code.vmovdqu8(defaults | k2, indicies); + ctx.reg_alloc.DefineValue(inst, defaults); + } + return; + } + + if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 3) { + const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]); + + code.vpcmpub(k1, indicies, code.BConst<8>(xword, 2 * 16), CmpInt::LessThan); + code.vpcmpub(k2, indicies, code.BConst<8>(xword, 3 * 16), CmpInt::LessThan); + + // Handle vector-table 0,1 + const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(table[0]); + const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(table[1]); + + code.vpermi2b(indicies | k1, xmm_table0, xmm_table1); + + ctx.reg_alloc.Release(xmm_table0); + ctx.reg_alloc.Release(xmm_table1); + + // Handle vector-table 2 + const Xbyak::Xmm xmm_table2 = ctx.reg_alloc.UseXmm(table[2]); + + code.kandnw(k1, k1, k2); + code.vpermb(indicies | k1, indicies, xmm_table2); + + if (is_defaults_zero) { + code.vmovdqu8(indicies | k2 | T_z, indicies); + ctx.reg_alloc.DefineValue(inst, indicies); + } else { + const Xbyak::Xmm defaults = ctx.reg_alloc.UseScratchXmm(args[0]); + code.vmovdqu8(defaults | k2, indicies); + ctx.reg_alloc.DefineValue(inst, defaults); + } + return; + } + + if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 2) { + const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]); + const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(table[0]); + const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(table[1]); + + code.vpcmpub(k1, indicies, code.BConst<8>(xword, 2 * 16), CmpInt::LessThan); + + if (is_defaults_zero) { + code.vpermi2b(indicies | k1 | T_z, xmm_table0, xmm_table1); + ctx.reg_alloc.DefineValue(inst, indicies); + } else { + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + code.vpermi2b(indicies, xmm_table0, xmm_table1); + code.vmovdqu8(result | k1, indicies); + ctx.reg_alloc.DefineValue(inst, result); + } + return; + } + + if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 1) { + const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]); + const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(table[0]); + + code.vpcmpub(k1, indicies, code.BConst<8>(xword, 1 * 16), CmpInt::LessThan); + + if (is_defaults_zero) { + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + code.vpermb(result | k1 | T_z, indicies, xmm_table0); + ctx.reg_alloc.DefineValue(inst, result); + } else { + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + code.vpermb(result | k1, indicies, xmm_table0); + ctx.reg_alloc.DefineValue(inst, result); + } + return; + } + + if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 1) { + const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]); + const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]); + + code.paddusb(indicies, code.Const(xword, 0x7070707070707070, 0x7070707070707070)); + code.pshufb(xmm_table0, indicies); + + ctx.reg_alloc.DefineValue(inst, xmm_table0); + return; + } + + if (code.HasHostFeature(HostFeature::SSE41) && table_size == 1) { + const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]); + const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]); + + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpaddusb(xmm0, indicies, code.Const(xword, 0x7070707070707070, 0x7070707070707070)); + } else { + code.movaps(xmm0, 
indicies); + code.paddusb(xmm0, code.Const(xword, 0x7070707070707070, 0x7070707070707070)); + } + code.pshufb(xmm_table0, indicies); + code.pblendvb(xmm_table0, defaults); + + ctx.reg_alloc.DefineValue(inst, xmm_table0); + return; + } + + if (code.HasHostFeature(HostFeature::SSE41) && is_defaults_zero && table_size == 2) { + const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(args[2]); + const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(table[0]); + const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(table[1]); + + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpaddusb(xmm0, indicies, code.Const(xword, 0x7070707070707070, 0x7070707070707070)); + } else { + code.movaps(xmm0, indicies); + code.paddusb(xmm0, code.Const(xword, 0x7070707070707070, 0x7070707070707070)); + } + code.paddusb(indicies, code.Const(xword, 0x6060606060606060, 0x6060606060606060)); + code.pshufb(xmm_table0, xmm0); + code.pshufb(xmm_table1, indicies); + code.pblendvb(xmm_table0, xmm_table1); + + ctx.reg_alloc.DefineValue(inst, xmm_table0); + return; + } + + if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) { + const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]); + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm masked = xmm16; + + code.vpandd(masked, indicies, code.Const(xword_b, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0)); + + for (size_t i = 0; i < table_size; ++i) { + const Xbyak::Xmm xmm_table = ctx.reg_alloc.UseScratchXmm(table[i]); + const Xbyak::Opmask table_mask = k1; + const u64 table_index = mcl::bit::replicate_element<u8, u64>(i * 16); + + code.vpcmpeqb(table_mask, masked, code.Const(xword, table_index, table_index)); + + if (table_index == 0 && is_defaults_zero) { + code.vpshufb(result | table_mask | T_z, xmm_table, indicies); + } else { + code.vpshufb(result | table_mask, xmm_table, indicies); + } + + ctx.reg_alloc.Release(xmm_table); + } + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + if (code.HasHostFeature(HostFeature::SSE41)) { + const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]); + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm masked = ctx.reg_alloc.ScratchXmm(); + + code.movaps(masked, code.Const(xword, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0)); + code.pand(masked, indicies); + + for (size_t i = 0; i < table_size; ++i) { + const Xbyak::Xmm xmm_table = ctx.reg_alloc.UseScratchXmm(table[i]); + + const u64 table_index = mcl::bit::replicate_element<u8, u64>(i * 16); + + if (table_index == 0) { + code.pxor(xmm0, xmm0); + code.pcmpeqb(xmm0, masked); + } else if (code.HasHostFeature(HostFeature::AVX)) { + code.vpcmpeqb(xmm0, masked, code.Const(xword, table_index, table_index)); + } else { + code.movaps(xmm0, code.Const(xword, table_index, table_index)); + code.pcmpeqb(xmm0, masked); + } + code.pshufb(xmm_table, indicies); + code.pblendvb(result, xmm_table); + + ctx.reg_alloc.Release(xmm_table); + } + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + const u32 stack_space = static_cast<u32>((table_size + 2) * 16); + ctx.reg_alloc.AllocStackSpace(stack_space + ABI_SHADOW_SPACE); + for (size_t i = 0; i < table_size; ++i) { + const Xbyak::Xmm table_value = ctx.reg_alloc.UseXmm(table[i]); + code.movaps(xword[rsp + ABI_SHADOW_SPACE + i * 16], table_value); + ctx.reg_alloc.Release(table_value); + } + const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(args[2]); + const Xbyak::Xmm result = 
ctx.reg_alloc.ScratchXmm(); + ctx.reg_alloc.EndOfAllocScope(); + ctx.reg_alloc.HostCall(nullptr); + + code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]); + code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + (table_size + 0) * 16]); + code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + (table_size + 1) * 16]); + code.mov(code.ABI_PARAM4.cvt32(), table_size); + code.movaps(xword[code.ABI_PARAM2], defaults); + code.movaps(xword[code.ABI_PARAM3], indicies); + + code.CallLambda( + [](const VectorArray<u8>* table, VectorArray<u8>& result, const VectorArray<u8>& indicies, size_t table_size) { + for (size_t i = 0; i < result.size(); ++i) { + const size_t index = indicies[i] / table[0].size(); + const size_t elem = indicies[i] % table[0].size(); + if (index < table_size) { + result[i] = table[index][elem]; + } + } + }); + + code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + (table_size + 0) * 16]); + ctx.reg_alloc.ReleaseStackSpace(stack_space + ABI_SHADOW_SPACE); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitVectorTranspose8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm upper = ctx.reg_alloc.UseScratchXmm(args[1]); + const bool part = args[2].GetImmediateU1(); + + if (!part) { + code.pand(lower, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF)); + code.psllw(upper, 8); + } else { + code.psrlw(lower, 8); + code.pand(upper, code.Const(xword, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00)); + } + code.por(lower, upper); + + ctx.reg_alloc.DefineValue(inst, lower); +} + +void EmitX64::EmitVectorTranspose16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm upper = ctx.reg_alloc.UseScratchXmm(args[1]); + const bool part = args[2].GetImmediateU1(); + + if (!part) { + code.pand(lower, code.Const(xword, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF)); + code.pslld(upper, 16); + } else { + code.psrld(lower, 16); + code.pand(upper, code.Const(xword, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000)); + } + code.por(lower, upper); + + ctx.reg_alloc.DefineValue(inst, lower); +} + +void EmitX64::EmitVectorTranspose32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm upper = ctx.reg_alloc.UseXmm(args[1]); + const bool part = args[2].GetImmediateU1(); + + code.shufps(lower, upper, !part ? 0b10001000 : 0b11011101); + code.pshufd(lower, lower, 0b11011000); + + ctx.reg_alloc.DefineValue(inst, lower); +} + +void EmitX64::EmitVectorTranspose64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm upper = ctx.reg_alloc.UseXmm(args[1]); + const bool part = args[2].GetImmediateU1(); + + code.shufpd(lower, upper, !part ? 
0b00 : 0b11); + + ctx.reg_alloc.DefineValue(inst, lower); +} + +static void EmitVectorUnsignedAbsoluteDifference(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(); + + switch (esize) { + case 8: { + const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); + + code.movdqa(temp, x); + code.psubusb(temp, y); + code.psubusb(y, x); + code.por(temp, y); + break; + } + case 16: { + const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); + + code.movdqa(temp, x); + code.psubusw(temp, y); + code.psubusw(y, x); + code.por(temp, y); + break; + } + case 32: + if (code.HasHostFeature(HostFeature::SSE41)) { + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); + + code.movdqa(temp, x); + code.pminud(x, y); + code.pmaxud(temp, y); + code.psubd(temp, x); + } else { + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); + + code.movdqa(temp, code.Const(xword, 0x8000000080000000, 0x8000000080000000)); + code.pxor(x, temp); + code.pxor(y, temp); + code.movdqa(temp, x); + code.psubd(temp, y); + code.pcmpgtd(y, x); + code.psrld(y, 1); + code.pxor(temp, y); + code.psubd(temp, y); + } + break; + } + + ctx.reg_alloc.DefineValue(inst, temp); +} + +void EmitX64::EmitVectorUnsignedAbsoluteDifference8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorUnsignedAbsoluteDifference(8, ctx, inst, code); +} + +void EmitX64::EmitVectorUnsignedAbsoluteDifference16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorUnsignedAbsoluteDifference(16, ctx, inst, code); +} + +void EmitX64::EmitVectorUnsignedAbsoluteDifference32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorUnsignedAbsoluteDifference(32, ctx, inst, code); +} + +void EmitX64::EmitVectorUnsignedMultiply16(EmitContext& ctx, IR::Inst* inst) { + const auto upper_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetUpperFromOp); + const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); + + if (upper_inst) { + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpmulhuw(result, x, y); + } else { + code.movdqa(result, x); + code.pmulhuw(result, y); + } + + ctx.reg_alloc.DefineValue(upper_inst, result); + } + + if (lower_inst) { + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpmullw(result, x, y); + } else { + code.movdqa(result, x); + code.pmullw(result, y); + } + ctx.reg_alloc.DefineValue(lower_inst, result); + } +} + +void EmitX64::EmitVectorUnsignedMultiply32(EmitContext& ctx, IR::Inst* inst) { + const auto upper_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetUpperFromOp); + const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (lower_inst && !upper_inst && code.HasHostFeature(HostFeature::AVX)) { + const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + code.vpmulld(result, x, y); + + 
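// vpmulld already produces the low 32 bits of each 32x32 product, which is exactly the "lower" half
+ // requested here, so no widening multiply or shuffle is needed when only lower_inst is present. + 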
ctx.reg_alloc.DefineValue(lower_inst, result); + return; + } + + if (code.HasHostFeature(HostFeature::AVX)) { + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); + + if (lower_inst) { + const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm(); + code.vpmulld(lower_result, x, y); + ctx.reg_alloc.DefineValue(lower_inst, lower_result); + } + + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + code.vpmuludq(result, x, y); + code.vpsrlq(x, x, 32); + code.vpsrlq(y, y, 32); + code.vpmuludq(x, x, y); + code.shufps(result, x, 0b11011101); + + ctx.reg_alloc.DefineValue(upper_inst, result); + return; + } + + const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm upper_result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm(); + + // calculate unsigned multiply + code.movdqa(tmp, x); + code.pmuludq(tmp, y); + code.psrlq(x, 32); + code.psrlq(y, 32); + code.pmuludq(x, y); + + // put everything into place + code.pcmpeqw(upper_result, upper_result); + code.pcmpeqw(lower_result, lower_result); + code.psllq(upper_result, 32); + code.psrlq(lower_result, 32); + code.pand(upper_result, x); + code.pand(lower_result, tmp); + code.psrlq(tmp, 32); + code.psllq(x, 32); + code.por(upper_result, tmp); + code.por(lower_result, x); + + if (upper_inst) { + ctx.reg_alloc.DefineValue(upper_inst, upper_result); + } + if (lower_inst) { + ctx.reg_alloc.DefineValue(lower_inst, lower_result); + } +} + +void EmitX64::EmitVectorUnsignedRecipEstimate(EmitContext& ctx, IR::Inst* inst) { + EmitOneArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& a) { + for (size_t i = 0; i < result.size(); i++) { + if ((a[i] & 0x80000000) == 0) { + result[i] = 0xFFFFFFFF; + continue; + } + + const u32 input = mcl::bit::get_bits<23, 31>(a[i]); + const u32 estimate = Common::RecipEstimate(input); + + result[i] = (0b100000000 | estimate) << 23; + } + }); +} + +void EmitX64::EmitVectorUnsignedRecipSqrtEstimate(EmitContext& ctx, IR::Inst* inst) { + EmitOneArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& a) { + for (size_t i = 0; i < result.size(); i++) { + if ((a[i] & 0xC0000000) == 0) { + result[i] = 0xFFFFFFFF; + continue; + } + + const u32 input = mcl::bit::get_bits<23, 31>(a[i]); + const u32 estimate = Common::RecipSqrtEstimate(input); + + result[i] = (0b100000000 | estimate) << 23; + } + }); +} + +// Simple generic case for 8, 16, and 32-bit values. 64-bit values +// will need to be special-cased as we can't simply use a larger integral size. +template<typename T, typename U = std::make_unsigned_t<T>> +static bool EmitVectorUnsignedSaturatedAccumulateSigned(VectorArray<U>& result, const VectorArray<T>& lhs, const VectorArray<T>& rhs) { + static_assert(std::is_signed_v<T>, "T must be signed."); + static_assert(mcl::bitsizeof<T> < 64, "T must be less than 64 bits in size."); + + bool qc_flag = false; + + for (size_t i = 0; i < result.size(); i++) { + // We treat rhs' members as unsigned, so cast to unsigned before signed to inhibit sign-extension. + // We use the unsigned equivalent of T, as we want zero-extension to occur, rather than a plain move. 
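+ // Worked example with 8-bit elements: lhs[i] == -100 with rhs[i] holding the bit pattern 0x28 (40 as
+ // unsigned) sums to -60, which clamps to 0 and sets QC; lhs[i] == 100 with rhs[i] == 0xC8 (200) sums
+ // to 300, which clamps to 255 and sets QC.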
+ const s64 x = s64{lhs[i]}; + const s64 y = static_cast<s64>(static_cast<std::make_unsigned_t<U>>(rhs[i])); + const s64 sum = x + y; + + if (sum > std::numeric_limits<U>::max()) { + result[i] = std::numeric_limits<U>::max(); + qc_flag = true; + } else if (sum < 0) { + result[i] = std::numeric_limits<U>::min(); + qc_flag = true; + } else { + result[i] = static_cast<U>(sum); + } + } + + return qc_flag; +} + +void EmitX64::EmitVectorUnsignedSaturatedAccumulateSigned8(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, EmitVectorUnsignedSaturatedAccumulateSigned<s8>); +} + +void EmitX64::EmitVectorUnsignedSaturatedAccumulateSigned16(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, EmitVectorUnsignedSaturatedAccumulateSigned<s16>); +} + +void EmitX64::EmitVectorUnsignedSaturatedAccumulateSigned32(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, EmitVectorUnsignedSaturatedAccumulateSigned<s32>); +} + +void EmitX64::EmitVectorUnsignedSaturatedAccumulateSigned64(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& lhs, const VectorArray<u64>& rhs) { + bool qc_flag = false; + + for (size_t i = 0; i < result.size(); i++) { + const u64 x = lhs[i]; + const u64 y = rhs[i]; + const u64 res = x + y; + + // Check sign bits to determine if an overflow occurred. + if ((~x & y & ~res) & 0x8000000000000000) { + result[i] = UINT64_MAX; + qc_flag = true; + } else if ((x & ~y & res) & 0x8000000000000000) { + result[i] = 0; + qc_flag = true; + } else { + result[i] = res; + } + } + + return qc_flag; + }); +} + +void EmitX64::EmitVectorUnsignedSaturatedNarrow16(EmitContext& ctx, IR::Inst* inst) { + EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u16>& a) { + result = {}; + bool qc_flag = false; + for (size_t i = 0; i < a.size(); ++i) { + const u16 saturated = std::clamp<u16>(a[i], 0, 0xFF); + result[i] = static_cast<u8>(saturated); + qc_flag |= saturated != a[i]; + } + return qc_flag; + }); +} + +void EmitX64::EmitVectorUnsignedSaturatedNarrow32(EmitContext& ctx, IR::Inst* inst) { + EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u32>& a) { + result = {}; + bool qc_flag = false; + for (size_t i = 0; i < a.size(); ++i) { + const u32 saturated = std::clamp<u32>(a[i], 0, 0xFFFF); + result[i] = static_cast<u16>(saturated); + qc_flag |= saturated != a[i]; + } + return qc_flag; + }); +} + +void EmitX64::EmitVectorUnsignedSaturatedNarrow64(EmitContext& ctx, IR::Inst* inst) { + EmitOneArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u64>& a) { + result = {}; + bool qc_flag = false; + for (size_t i = 0; i < a.size(); ++i) { + const u64 saturated = std::clamp<u64>(a[i], 0, 0xFFFFFFFF); + result[i] = static_cast<u32>(saturated); + qc_flag |= saturated != a[i]; + } + return qc_flag; + }); +} + +template<typename T, typename S = std::make_signed_t<T>> +static bool VectorUnsignedSaturatedShiftLeft(VectorArray<T>& dst, const VectorArray<T>& data, const VectorArray<T>& shift_values) { + static_assert(std::is_unsigned_v<T>, "T must be an unsigned type."); + + bool qc_flag = false; + + constexpr size_t bit_size = mcl::bitsizeof<T>; + constexpr S negative_bit_size = -static_cast<S>(bit_size); + + for (size_t i = 0; i < dst.size(); i++) { + const T element = 
data[i]; + const S shift = std::clamp(static_cast<S>(mcl::bit::sign_extend<8>(static_cast<T>(shift_values[i] & 0xFF))), + negative_bit_size, std::numeric_limits<S>::max()); + + if (element == 0 || shift <= negative_bit_size) { + dst[i] = 0; + } else if (shift < 0) { + dst[i] = static_cast<T>(element >> -shift); + } else if (shift >= static_cast<S>(bit_size)) { + dst[i] = std::numeric_limits<T>::max(); + qc_flag = true; + } else { + const T shifted = element << shift; + + if ((shifted >> shift) != element) { + dst[i] = std::numeric_limits<T>::max(); + qc_flag = true; + } else { + dst[i] = shifted; + } + } + } + + return qc_flag; +} + +void EmitX64::EmitVectorUnsignedSaturatedShiftLeft8(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, VectorUnsignedSaturatedShiftLeft<u8>); +} + +void EmitX64::EmitVectorUnsignedSaturatedShiftLeft16(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, VectorUnsignedSaturatedShiftLeft<u16>); +} + +void EmitX64::EmitVectorUnsignedSaturatedShiftLeft32(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, VectorUnsignedSaturatedShiftLeft<u32>); +} + +void EmitX64::EmitVectorUnsignedSaturatedShiftLeft64(EmitContext& ctx, IR::Inst* inst) { + EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, VectorUnsignedSaturatedShiftLeft<u64>); +} + +void EmitX64::EmitVectorZeroExtend8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + if (code.HasHostFeature(HostFeature::SSE41)) { + code.pmovzxbw(a, a); + } else { + const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(); + code.pxor(zeros, zeros); + code.punpcklbw(a, zeros); + } + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorZeroExtend16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + if (code.HasHostFeature(HostFeature::SSE41)) { + code.pmovzxwd(a, a); + } else { + const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(); + code.pxor(zeros, zeros); + code.punpcklwd(a, zeros); + } + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorZeroExtend32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + if (code.HasHostFeature(HostFeature::SSE41)) { + code.pmovzxdq(a, a); + } else { + const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(); + code.pxor(zeros, zeros); + code.punpckldq(a, zeros); + } + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorZeroExtend64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(); + code.pxor(zeros, zeros); + code.punpcklqdq(a, zeros); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitVectorZeroUpper(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + + code.movq(a, a); // TODO: !IsLastUse + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitZeroVector(EmitContext& ctx, IR::Inst* inst) { + const Xbyak::Xmm a = ctx.reg_alloc.ScratchXmm(); + code.pxor(a, a); + ctx.reg_alloc.DefineValue(inst, a); +} + +} // namespace Dynarmic::Backend::X64 diff --git 
a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp new file mode 100644 index 0000000000..b8aa3eb653 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp @@ -0,0 +1,2182 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <array> +#include <limits> +#include <tuple> +#include <type_traits> +#include <utility> + +#include <mcl/assert.hpp> +#include <mcl/mp/metavalue/lift_value.hpp> +#include <mcl/mp/typelist/cartesian_product.hpp> +#include <mcl/mp/typelist/get.hpp> +#include <mcl/mp/typelist/lift_sequence.hpp> +#include <mcl/mp/typelist/list.hpp> +#include <mcl/mp/typelist/lower_to_tuple.hpp> +#include <mcl/type_traits/function_info.hpp> +#include <mcl/type_traits/integer_of_size.hpp> +#include <xbyak/xbyak.h> + +#include "dynarmic/backend/x64/abi.h" +#include "dynarmic/backend/x64/block_of_code.h" +#include "dynarmic/backend/x64/constants.h" +#include "dynarmic/backend/x64/emit_x64.h" +#include "dynarmic/common/fp/fpcr.h" +#include "dynarmic/common/fp/info.h" +#include "dynarmic/common/fp/op.h" +#include "dynarmic/common/fp/util.h" +#include "dynarmic/common/lut_from_list.h" +#include "dynarmic/interface/optimization_flags.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" + +namespace Dynarmic::Backend::X64 { + +using namespace Xbyak::util; +namespace mp = mcl::mp; + +namespace { + +#define FCODE(NAME) \ + [&code](auto... args) { \ + if constexpr (fsize == 32) { \ + code.NAME##s(args...); \ + } else { \ + code.NAME##d(args...); \ + } \ + } +#define ICODE(NAME) \ + [&code](auto... args) { \ + if constexpr (fsize == 32) { \ + code.NAME##d(args...); \ + } else { \ + code.NAME##q(args...); \ + } \ + } + +template<typename Lambda> +void MaybeStandardFPSCRValue(BlockOfCode& code, EmitContext& ctx, bool fpcr_controlled, Lambda lambda) { + const bool switch_mxcsr = ctx.FPCR(fpcr_controlled) != ctx.FPCR(); + + if (switch_mxcsr && !ctx.HasOptimization(OptimizationFlag::Unsafe_IgnoreStandardFPCRValue)) { + code.EnterStandardASIMD(); + lambda(); + code.LeaveStandardASIMD(); + } else { + lambda(); + } +} + +template<size_t fsize, template<typename> class Indexer, size_t narg> +struct NaNHandler { +public: + using FPT = mcl::unsigned_integer_of_size<fsize>; + + using function_type = void (*)(std::array<VectorArray<FPT>, narg>&, FP::FPCR); + + static function_type GetDefault() { + return GetDefaultImpl(std::make_index_sequence<narg - 1>{}); + } + +private: + template<size_t... 
argi> + static function_type GetDefaultImpl(std::index_sequence<argi...>) { + const auto result = [](std::array<VectorArray<FPT>, narg>& values, FP::FPCR) { + VectorArray<FPT>& result = values[0]; + for (size_t elementi = 0; elementi < result.size(); ++elementi) { + const auto current_values = Indexer<FPT>{}(elementi, values[argi + 1]...); + if (auto r = FP::ProcessNaNs(std::get<argi>(current_values)...)) { + result[elementi] = *r; + } else if (FP::IsNaN(result[elementi])) { + result[elementi] = FP::FPInfo<FPT>::DefaultNaN(); + } + } + }; + + return static_cast<function_type>(result); + } +}; + +template<size_t fsize, size_t nargs, typename NaNHandler> +void HandleNaNs(BlockOfCode& code, EmitContext& ctx, bool fpcr_controlled, std::array<Xbyak::Xmm, nargs + 1> xmms, const Xbyak::Xmm& nan_mask, NaNHandler nan_handler) { + static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64"); + + if (code.HasHostFeature(HostFeature::SSE41)) { + code.ptest(nan_mask, nan_mask); + } else { + const Xbyak::Reg32 bitmask = ctx.reg_alloc.ScratchGpr().cvt32(); + code.movmskps(bitmask, nan_mask); + code.cmp(bitmask, 0); + } + + SharedLabel end = GenSharedLabel(), nan = GenSharedLabel(); + + code.jnz(*nan, code.T_NEAR); + code.L(*end); + + ctx.deferred_emits.emplace_back([=, &code, &ctx] { + code.L(*nan); + + const Xbyak::Xmm result = xmms[0]; + + code.sub(rsp, 8); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); + + const size_t stack_space = xmms.size() * 16; + code.sub(rsp, static_cast<u32>(stack_space + ABI_SHADOW_SPACE)); + for (size_t i = 0; i < xmms.size(); ++i) { + code.movaps(xword[rsp + ABI_SHADOW_SPACE + i * 16], xmms[i]); + } + code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]); + code.mov(code.ABI_PARAM2, ctx.FPCR(fpcr_controlled).Value()); + + code.CallFunction(nan_handler); + + code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]); + code.add(rsp, static_cast<u32>(stack_space + ABI_SHADOW_SPACE)); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); + code.add(rsp, 8); + code.jmp(*end, code.T_NEAR); + }); +} + +template<size_t fsize> +Xbyak::Address GetVectorOf(BlockOfCode& code, u64 value) { + return code.BConst<fsize>(xword, value); +} + +template<size_t fsize, u64 value> +Xbyak::Address GetVectorOf(BlockOfCode& code) { + return code.BConst<fsize>(xword, value); +} + +template<size_t fsize> +Xbyak::Address GetNaNVector(BlockOfCode& code) { + using FPT = mcl::unsigned_integer_of_size<fsize>; + return GetVectorOf<fsize, FP::FPInfo<FPT>::DefaultNaN()>(code); +} + +template<size_t fsize> +Xbyak::Address GetNegativeZeroVector(BlockOfCode& code) { + using FPT = mcl::unsigned_integer_of_size<fsize>; + return GetVectorOf<fsize, FP::FPInfo<FPT>::Zero(true)>(code); +} + +template<size_t fsize> +Xbyak::Address GetNonSignMaskVector(BlockOfCode& code) { + using FPT = mcl::unsigned_integer_of_size<fsize>; + constexpr FPT non_sign_mask = FP::FPInfo<FPT>::exponent_mask | FP::FPInfo<FPT>::mantissa_mask; + return GetVectorOf<fsize, non_sign_mask>(code); +} + +template<size_t fsize> +Xbyak::Address GetSmallestNormalVector(BlockOfCode& code) { + using FPT = mcl::unsigned_integer_of_size<fsize>; + constexpr FPT smallest_normal_number = FP::FPValue<FPT, false, FP::FPInfo<FPT>::exponent_min, 1>(); + return GetVectorOf<fsize, smallest_normal_number>(code); +} + +template<size_t fsize, bool sign, int exponent, mcl::unsigned_integer_of_size<fsize> value> +Xbyak::Address GetVectorOf(BlockOfCode& code) { + using 
FPT = mcl::unsigned_integer_of_size<fsize>; + return GetVectorOf<fsize, FP::FPValue<FPT, sign, exponent, value>()>(code); +} + +template<size_t fsize> +void ForceToDefaultNaN(BlockOfCode& code, FP::FPCR fpcr, Xbyak::Xmm result) { + if (fpcr.DN()) { + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + const Xbyak::Opmask nan_mask = k1; + FCODE(vfpclassp)(nan_mask, result, u8(FpClass::QNaN | FpClass::SNaN)); + FCODE(vblendmp)(result | nan_mask, result, GetNaNVector<fsize>(code)); + } else if (code.HasHostFeature(HostFeature::AVX)) { + const Xbyak::Xmm nan_mask = xmm0; + FCODE(vcmpunordp)(nan_mask, result, result); + FCODE(blendvp)(result, GetNaNVector<fsize>(code)); + } else { + const Xbyak::Xmm nan_mask = xmm0; + code.movaps(nan_mask, result); + FCODE(cmpordp)(nan_mask, nan_mask); + code.andps(result, nan_mask); + code.andnps(nan_mask, GetNaNVector<fsize>(code)); + code.orps(result, nan_mask); + } + } +} + +template<size_t fsize> +void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm result) { + const Xbyak::Xmm nan_mask = xmm0; + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + constexpr u32 nan_to_zero = FixupLUT(FpFixup::PosZero, + FpFixup::PosZero); + FCODE(vfixupimmp)(result, result, code.BConst<32>(ptr_b, nan_to_zero), u8(0)); + } else if (code.HasHostFeature(HostFeature::AVX)) { + FCODE(vcmpordp)(nan_mask, result, result); + FCODE(vandp)(result, result, nan_mask); + } else { + code.movaps(nan_mask, result); + FCODE(cmpordp)(nan_mask, nan_mask); + code.andps(result, nan_mask); + } +} + +template<size_t fsize> +void DenormalsAreZero(BlockOfCode& code, FP::FPCR fpcr, std::initializer_list<Xbyak::Xmm> to_daz, Xbyak::Xmm tmp) { + if (fpcr.FZ()) { + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + constexpr u32 denormal_to_zero = FixupLUT( + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src); + + FCODE(vmovap)(tmp, code.BConst<fsize>(xword, denormal_to_zero)); + + for (const Xbyak::Xmm& xmm : to_daz) { + FCODE(vfixupimmp)(xmm, xmm, tmp, u8(0)); + } + return; + } + + if (fpcr.RMode() != FP::RoundingMode::TowardsMinusInfinity) { + code.movaps(tmp, GetNegativeZeroVector<fsize>(code)); + } else { + code.xorps(tmp, tmp); + } + for (const Xbyak::Xmm& xmm : to_daz) { + FCODE(addp)(xmm, tmp); + } + } +} + +template<typename T> +struct DefaultIndexer { + std::tuple<T> operator()(size_t i, const VectorArray<T>& a) { + return std::make_tuple(a[i]); + } + + std::tuple<T, T> operator()(size_t i, const VectorArray<T>& a, const VectorArray<T>& b) { + return std::make_tuple(a[i], b[i]); + } + + std::tuple<T, T, T> operator()(size_t i, const VectorArray<T>& a, const VectorArray<T>& b, const VectorArray<T>& c) { + return std::make_tuple(a[i], b[i], c[i]); + } +}; + +template<typename T> +struct PairedIndexer { + std::tuple<T, T> operator()(size_t i, const VectorArray<T>& a, const VectorArray<T>& b) { + constexpr size_t halfway = std::tuple_size_v<VectorArray<T>> / 2; + const size_t which_array = i / halfway; + i %= halfway; + switch (which_array) { + case 0: + return std::make_tuple(a[2 * i], a[2 * i + 1]); + case 1: + return std::make_tuple(b[2 * i], b[2 * i + 1]); + } + UNREACHABLE(); + } +}; + +template<typename T> +struct PairedLowerIndexer { + std::tuple<T, T> operator()(size_t i, const VectorArray<T>& a, const VectorArray<T>& b) { + constexpr size_t array_size = std::tuple_size_v<VectorArray<T>>; + if constexpr (array_size == 4) { + switch (i) { + case 0: + return 
std::make_tuple(a[0], a[1]); + case 1: + return std::make_tuple(b[0], b[1]); + default: + return std::make_tuple(0, 0); + } + } else if constexpr (array_size == 2) { + if (i == 0) { + return std::make_tuple(a[0], b[0]); + } + return std::make_tuple(0, 0); + } else { + UNREACHABLE(); + } + } +}; + +template<size_t fsize, template<typename> class Indexer, size_t fpcr_controlled_arg_index = 1, typename Function> +void EmitTwoOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn, typename NaNHandler<fsize, Indexer, 2>::function_type nan_handler = NaNHandler<fsize, Indexer, 2>::GetDefault()) { + static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64"); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool fpcr_controlled = args[fpcr_controlled_arg_index].GetImmediateU1(); + + if (ctx.FPCR(fpcr_controlled).DN() || ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) { + Xbyak::Xmm result; + + if constexpr (std::is_member_function_pointer_v<Function>) { + result = ctx.reg_alloc.UseScratchXmm(args[0]); + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + (code.*fn)(result); + }); + } else { + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]); + result = ctx.reg_alloc.ScratchXmm(); + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + fn(result, xmm_a); + }); + } + + if (!ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) { + ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), result); + } + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm(); + + if constexpr (std::is_member_function_pointer_v<Function>) { + code.movaps(result, xmm_a); + (code.*fn)(result); + } else { + fn(result, xmm_a); + } + + if (code.HasHostFeature(HostFeature::AVX)) { + FCODE(vcmpunordp)(nan_mask, result, result); + } else { + code.movaps(nan_mask, result); + FCODE(cmpunordp)(nan_mask, nan_mask); + } + + HandleNaNs<fsize, 1>(code, ctx, fpcr_controlled, {result, xmm_a}, nan_mask, nan_handler); + + ctx.reg_alloc.DefineValue(inst, result); +} + +enum class CheckInputNaN { + Yes, + No, +}; + +template<size_t fsize, template<typename> class Indexer, typename Function> +void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn, CheckInputNaN check_input_nan = CheckInputNaN::No, typename NaNHandler<fsize, Indexer, 3>::function_type nan_handler = NaNHandler<fsize, Indexer, 3>::GetDefault()) { + static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64"); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool fpcr_controlled = args[2].GetImmediateU1(); + + if (ctx.FPCR(fpcr_controlled).DN() || ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) { + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + + if constexpr (std::is_member_function_pointer_v<Function>) { + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + (code.*fn)(xmm_a, xmm_b); + }); + } else { + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + fn(xmm_a, xmm_b); + }); + } + + if (!ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) { + ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), xmm_a); + } + + ctx.reg_alloc.DefineValue(inst, xmm_a); + return; + } + + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + 
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm(); + + code.movaps(result, xmm_a); + + if (check_input_nan == CheckInputNaN::Yes) { + if (code.HasHostFeature(HostFeature::AVX)) { + FCODE(vcmpunordp)(nan_mask, xmm_a, xmm_b); + } else { + code.movaps(nan_mask, xmm_b); + FCODE(cmpunordp)(nan_mask, xmm_a); + } + } + + if constexpr (std::is_member_function_pointer_v<Function>) { + (code.*fn)(result, xmm_b); + } else { + fn(result, xmm_b); + } + + if (check_input_nan == CheckInputNaN::Yes) { + FCODE(cmpunordp)(nan_mask, result); + } else if (code.HasHostFeature(HostFeature::AVX)) { + FCODE(vcmpunordp)(nan_mask, result, result); + } else { + code.movaps(nan_mask, result); + FCODE(cmpunordp)(nan_mask, nan_mask); + } + + HandleNaNs<fsize, 2>(code, ctx, fpcr_controlled, {result, xmm_a, xmm_b}, nan_mask, nan_handler); + + ctx.reg_alloc.DefineValue(inst, result); +} + +template<typename Lambda> +void EmitTwoOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result, Xbyak::Xmm arg1, Lambda lambda, bool fpcr_controlled) { + const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda); + + const u32 fpcr = ctx.FPCR(fpcr_controlled).Value(); + + constexpr u32 stack_space = 2 * 16; + code.sub(rsp, stack_space + ABI_SHADOW_SPACE); + code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]); + code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]); + code.mov(code.ABI_PARAM3.cvt32(), fpcr); + code.lea(code.ABI_PARAM4, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); + + code.movaps(xword[code.ABI_PARAM2], arg1); + code.CallFunction(fn); + code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]); + + code.add(rsp, stack_space + ABI_SHADOW_SPACE); +} + +template<size_t fpcr_controlled_arg_index = 1, typename Lambda> +void EmitTwoOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + ctx.reg_alloc.EndOfAllocScope(); + ctx.reg_alloc.HostCall(nullptr); + + const bool fpcr_controlled = args[fpcr_controlled_arg_index].GetImmediateU1(); + + EmitTwoOpFallbackWithoutRegAlloc(code, ctx, result, arg1, lambda, fpcr_controlled); + + ctx.reg_alloc.DefineValue(inst, result); +} + +template<typename Lambda> +void EmitThreeOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result, Xbyak::Xmm arg1, Xbyak::Xmm arg2, Lambda lambda, bool fpcr_controlled) { + const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda); + + const u32 fpcr = ctx.FPCR(fpcr_controlled).Value(); + +#ifdef _WIN32 + constexpr u32 stack_space = 4 * 16; + code.sub(rsp, stack_space + ABI_SHADOW_SPACE); + code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]); + code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]); + code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 3 * 16]); + code.mov(code.ABI_PARAM4.cvt32(), fpcr); + code.lea(rax, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); + code.mov(qword[rsp + ABI_SHADOW_SPACE + 0], rax); +#else + constexpr u32 stack_space = 3 * 16; + code.sub(rsp, stack_space + ABI_SHADOW_SPACE); + code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]); + code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]); + code.lea(code.ABI_PARAM3, ptr[rsp + 
ABI_SHADOW_SPACE + 2 * 16]); + code.mov(code.ABI_PARAM4.cvt32(), fpcr); + code.lea(code.ABI_PARAM5, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); +#endif + + code.movaps(xword[code.ABI_PARAM2], arg1); + code.movaps(xword[code.ABI_PARAM3], arg2); + code.CallFunction(fn); + +#ifdef _WIN32 + code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 1 * 16]); +#else + code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]); +#endif + + code.add(rsp, stack_space + ABI_SHADOW_SPACE); +} + +template<typename Lambda> +void EmitThreeOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + ctx.reg_alloc.EndOfAllocScope(); + ctx.reg_alloc.HostCall(nullptr); + + const bool fpcr_controlled = args[2].GetImmediateU1(); + + EmitThreeOpFallbackWithoutRegAlloc(code, ctx, result, arg1, arg2, lambda, fpcr_controlled); + + ctx.reg_alloc.DefineValue(inst, result); +} + +enum class LoadPreviousResult { + Yes, + No, +}; + +template<LoadPreviousResult load_previous_result = LoadPreviousResult::No, typename Lambda> +void EmitFourOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result, Xbyak::Xmm arg1, Xbyak::Xmm arg2, Xbyak::Xmm arg3, Lambda lambda, bool fpcr_controlled) { + const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda); + +#ifdef _WIN32 + constexpr u32 stack_space = 5 * 16; + code.sub(rsp, stack_space + ABI_SHADOW_SPACE); + code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]); + code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]); + code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 3 * 16]); + code.lea(code.ABI_PARAM4, ptr[rsp + ABI_SHADOW_SPACE + 4 * 16]); + code.mov(qword[rsp + ABI_SHADOW_SPACE + 0], ctx.FPCR(fpcr_controlled).Value()); + code.lea(rax, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); + code.mov(qword[rsp + ABI_SHADOW_SPACE + 8], rax); +#else + constexpr u32 stack_space = 4 * 16; + code.sub(rsp, stack_space + ABI_SHADOW_SPACE); + code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]); + code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]); + code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]); + code.lea(code.ABI_PARAM4, ptr[rsp + ABI_SHADOW_SPACE + 3 * 16]); + code.mov(code.ABI_PARAM5.cvt32(), ctx.FPCR(fpcr_controlled).Value()); + code.lea(code.ABI_PARAM6, code.ptr[code.r15 + code.GetJitStateInfo().offsetof_fpsr_exc]); +#endif + + if constexpr (load_previous_result == LoadPreviousResult::Yes) { + code.movaps(xword[code.ABI_PARAM1], result); + } + code.movaps(xword[code.ABI_PARAM2], arg1); + code.movaps(xword[code.ABI_PARAM3], arg2); + code.movaps(xword[code.ABI_PARAM4], arg3); + code.CallFunction(fn); + +#ifdef _WIN32 + code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 1 * 16]); +#else + code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]); +#endif + + code.add(rsp, stack_space + ABI_SHADOW_SPACE); +} + +template<typename Lambda> +void EmitFourOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool fpcr_controlled = args[3].GetImmediateU1(); + const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm arg3 = ctx.reg_alloc.UseXmm(args[2]); + const 
Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + ctx.reg_alloc.EndOfAllocScope(); + ctx.reg_alloc.HostCall(nullptr); + + EmitFourOpFallbackWithoutRegAlloc(code, ctx, result, arg1, arg2, arg3, lambda, fpcr_controlled); + + ctx.reg_alloc.DefineValue(inst, result); +} + +} // anonymous namespace + +template<size_t fsize> +void FPVectorAbs(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + code.andps(a, GetNonSignMaskVector<fsize>(code)); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitFPVectorAbs16(EmitContext& ctx, IR::Inst* inst) { + FPVectorAbs<16>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorAbs32(EmitContext& ctx, IR::Inst* inst) { + FPVectorAbs<32>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorAbs64(EmitContext& ctx, IR::Inst* inst) { + FPVectorAbs<64>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorAdd32(EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpVectorOperation<32, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::addps); +} + +void EmitX64::EmitFPVectorAdd64(EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpVectorOperation<64, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::addpd); +} + +void EmitX64::EmitFPVectorDiv32(EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpVectorOperation<32, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::divps); +} + +void EmitX64::EmitFPVectorDiv64(EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpVectorOperation<64, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::divpd); +} + +void EmitX64::EmitFPVectorEqual16(EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u16>& op1, const VectorArray<u16>& op2, FP::FPCR fpcr, FP::FPSR& fpsr) { + for (size_t i = 0; i < result.size(); i++) { + result[i] = FP::FPCompareEQ(op1[i], op2[i], fpcr, fpsr) ? 0xFFFF : 0; + } + }); +} + +void EmitX64::EmitFPVectorEqual32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool fpcr_controlled = args[2].GetImmediateU1(); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + DenormalsAreZero<32>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0); + code.cmpeqps(a, b); + }); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitFPVectorEqual64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool fpcr_controlled = args[2].GetImmediateU1(); + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm b = ctx.FPCR(fpcr_controlled).FZ() ? 
ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + DenormalsAreZero<64>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0); + code.cmpeqpd(a, b); + }); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitFPVectorFromHalf32(EmitContext& ctx, IR::Inst* inst) { + const auto rounding_mode = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8()); + const bool fpcr_controlled = inst->GetArg(2).GetU1(); + + if (code.HasHostFeature(HostFeature::F16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm value = ctx.reg_alloc.UseXmm(args[0]); + + code.vcvtph2ps(result, value); + ForceToDefaultNaN<32>(code, ctx.FPCR(fpcr_controlled), result); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + using rounding_list = mp::list< + mp::lift_value<FP::RoundingMode::ToNearest_TieEven>, + mp::lift_value<FP::RoundingMode::TowardsPlusInfinity>, + mp::lift_value<FP::RoundingMode::TowardsMinusInfinity>, + mp::lift_value<FP::RoundingMode::TowardsZero>, + mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>; + + static const auto lut = Common::GenerateLookupTableFromList( + []<typename I>(I) { + return std::pair{ + mp::lower_to_tuple_v<I>, + Common::FptrCast( + [](VectorArray<u32>& output, const VectorArray<u16>& input, FP::FPCR fpcr, FP::FPSR& fpsr) { + constexpr FP::RoundingMode rounding_mode = mp::get<0, I>::value; + + for (size_t i = 0; i < output.size(); ++i) { + output[i] = FP::FPConvert<u32, u16>(input[i], fpcr, rounding_mode, fpsr); + } + })}; + }, + mp::cartesian_product<rounding_list>{}); + + EmitTwoOpFallback<2>(code, ctx, inst, lut.at(std::make_tuple(rounding_mode))); +} + +void EmitX64::EmitFPVectorFromSignedFixed32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm xmm = ctx.reg_alloc.UseScratchXmm(args[0]); + const int fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); + const bool fpcr_controlled = args[3].GetImmediateU1(); + ASSERT(rounding_mode == ctx.FPCR(fpcr_controlled).RMode()); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + code.cvtdq2ps(xmm, xmm); + if (fbits != 0) { + code.mulps(xmm, GetVectorOf<32>(code, static_cast<u32>(127 - fbits) << 23)); + } + }); + + ctx.reg_alloc.DefineValue(inst, xmm); +} + +void EmitX64::EmitFPVectorFromSignedFixed64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm xmm = ctx.reg_alloc.UseScratchXmm(args[0]); + const int fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); + const bool fpcr_controlled = args[3].GetImmediateU1(); + ASSERT(rounding_mode == ctx.FPCR(fpcr_controlled).RMode()); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + code.vcvtqq2pd(xmm, xmm); + } else if (code.HasHostFeature(HostFeature::SSE41)) { + const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); + + // First quadword + code.movq(tmp, xmm); + code.cvtsi2sd(xmm, tmp); + + // Second quadword + code.pextrq(tmp, xmm, 1); + code.cvtsi2sd(xmm_tmp, tmp); + + // Combine + code.unpcklpd(xmm, xmm_tmp); + } else { + const Xbyak::Xmm high_xmm = 
ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); + + // First quadword + code.movhlps(high_xmm, xmm); + code.movq(tmp, xmm); + code.cvtsi2sd(xmm, tmp); + + // Second quadword + code.movq(tmp, high_xmm); + code.cvtsi2sd(xmm_tmp, tmp); + + // Combine + code.unpcklpd(xmm, xmm_tmp); + } + + if (fbits != 0) { + code.mulpd(xmm, GetVectorOf<64>(code, static_cast<u64>(1023 - fbits) << 52)); + } + }); + + ctx.reg_alloc.DefineValue(inst, xmm); +} + +void EmitX64::EmitFPVectorFromUnsignedFixed32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm xmm = ctx.reg_alloc.UseScratchXmm(args[0]); + const int fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); + const bool fpcr_controlled = args[3].GetImmediateU1(); + ASSERT(rounding_mode == ctx.FPCR(fpcr_controlled).RMode()); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { + code.vcvtudq2ps(xmm, xmm); + } else { + const Xbyak::Address mem_4B000000 = code.BConst<32>(xword, 0x4B000000); + const Xbyak::Address mem_53000000 = code.BConst<32>(xword, 0x53000000); + const Xbyak::Address mem_D3000080 = code.BConst<32>(xword, 0xD3000080); + + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpblendw(tmp, xmm, mem_4B000000, 0b10101010); + code.vpsrld(xmm, xmm, 16); + code.vpblendw(xmm, xmm, mem_53000000, 0b10101010); + code.vaddps(xmm, xmm, mem_D3000080); + code.vaddps(xmm, tmp, xmm); + } else { + const Xbyak::Address mem_0xFFFF = code.BConst<32>(xword, 0x0000FFFF); + + code.movdqa(tmp, mem_0xFFFF); + + code.pand(tmp, xmm); + code.por(tmp, mem_4B000000); + code.psrld(xmm, 16); + code.por(xmm, mem_53000000); + code.addps(xmm, mem_D3000080); + code.addps(xmm, tmp); + } + } + + if (fbits != 0) { + code.mulps(xmm, GetVectorOf<32>(code, static_cast<u32>(127 - fbits) << 23)); + } + + if (ctx.FPCR(fpcr_controlled).RMode() == FP::RoundingMode::TowardsMinusInfinity) { + code.pand(xmm, code.BConst<32>(xword, 0x7FFFFFFF)); + } + }); + + ctx.reg_alloc.DefineValue(inst, xmm); +} + +void EmitX64::EmitFPVectorFromUnsignedFixed64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm xmm = ctx.reg_alloc.UseScratchXmm(args[0]); + const int fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8()); + const bool fpcr_controlled = args[3].GetImmediateU1(); + ASSERT(rounding_mode == ctx.FPCR(fpcr_controlled).RMode()); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + code.vcvtuqq2pd(xmm, xmm); + } else { + const Xbyak::Address unpack = code.Const(xword, 0x4530000043300000, 0); + const Xbyak::Address subtrahend = code.Const(xword, 0x4330000000000000, 0x4530000000000000); + + const Xbyak::Xmm unpack_reg = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm subtrahend_reg = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(); + + if (code.HasHostFeature(HostFeature::AVX)) { + code.vmovapd(unpack_reg, unpack); + code.vmovapd(subtrahend_reg, subtrahend); + + code.vunpcklps(tmp1, xmm, unpack_reg); + code.vsubpd(tmp1, tmp1, subtrahend_reg); + + code.vpermilps(xmm, xmm, 0b01001110); + + code.vunpcklps(xmm, xmm, unpack_reg); + 
code.vsubpd(xmm, xmm, subtrahend_reg); + + code.vhaddpd(xmm, tmp1, xmm); + } else { + const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(); + + code.movapd(unpack_reg, unpack); + code.movapd(subtrahend_reg, subtrahend); + + code.pshufd(tmp1, xmm, 0b01001110); + + code.punpckldq(xmm, unpack_reg); + code.subpd(xmm, subtrahend_reg); + code.pshufd(tmp2, xmm, 0b01001110); + code.addpd(xmm, tmp2); + + code.punpckldq(tmp1, unpack_reg); + code.subpd(tmp1, subtrahend_reg); + + code.pshufd(unpack_reg, tmp1, 0b01001110); + code.addpd(unpack_reg, tmp1); + + code.unpcklpd(xmm, unpack_reg); + } + } + + if (fbits != 0) { + code.mulpd(xmm, GetVectorOf<64>(code, static_cast<u64>(1023 - fbits) << 52)); + } + + if (ctx.FPCR(fpcr_controlled).RMode() == FP::RoundingMode::TowardsMinusInfinity) { + code.pand(xmm, code.BConst<64>(xword, 0x7FFFFFFFFFFFFFFF)); + } + }); + + ctx.reg_alloc.DefineValue(inst, xmm); +} + +void EmitX64::EmitFPVectorGreater32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool fpcr_controlled = args[2].GetImmediateU1(); + const Xbyak::Xmm a = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[0]) : ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + DenormalsAreZero<32>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0); + code.cmpltps(b, a); + }); + + ctx.reg_alloc.DefineValue(inst, b); +} + +void EmitX64::EmitFPVectorGreater64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool fpcr_controlled = args[2].GetImmediateU1(); + const Xbyak::Xmm a = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[0]) : ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + DenormalsAreZero<64>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0); + code.cmpltpd(b, a); + }); + + ctx.reg_alloc.DefineValue(inst, b); +} + +void EmitX64::EmitFPVectorGreaterEqual32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool fpcr_controlled = args[2].GetImmediateU1(); + const Xbyak::Xmm a = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[0]) : ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + DenormalsAreZero<32>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0); + code.cmpleps(b, a); + }); + + ctx.reg_alloc.DefineValue(inst, b); +} + +void EmitX64::EmitFPVectorGreaterEqual64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool fpcr_controlled = args[2].GetImmediateU1(); + const Xbyak::Xmm a = ctx.FPCR(fpcr_controlled).FZ() ? 
ctx.reg_alloc.UseScratchXmm(args[0]) : ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + DenormalsAreZero<64>(code, ctx.FPCR(fpcr_controlled), {a, b}, xmm0); + code.cmplepd(b, a); + }); + + ctx.reg_alloc.DefineValue(inst, b); +} + +template<size_t fsize, bool is_max> +static void EmitFPVectorMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + const bool fpcr_controlled = inst->GetArg(2).GetU1(); + + if (ctx.FPCR(fpcr_controlled).DN()) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]); + + const Xbyak::Xmm mask = xmm0; + const Xbyak::Xmm eq = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm(); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + DenormalsAreZero<fsize>(code, ctx.FPCR(fpcr_controlled), {result, xmm_b}, mask); + + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + constexpr FpRangeSelect range_select = is_max ? FpRangeSelect::Max : FpRangeSelect::Min; + FCODE(vcmpp)(k1, result, xmm_b, Cmp::Unordered_Q); + FCODE(vrangep)(result, result, xmm_b, FpRangeLUT(range_select, FpRangeSign::Preserve)); + FCODE(vblendmp)(result | k1, result, GetNaNVector<fsize>(code)); + } else if (code.HasHostFeature(HostFeature::AVX)) { + FCODE(vcmpeqp)(mask, result, xmm_b); + FCODE(vcmpunordp)(nan_mask, result, xmm_b); + if constexpr (is_max) { + FCODE(vandp)(eq, result, xmm_b); + FCODE(vmaxp)(result, result, xmm_b); + } else { + FCODE(vorp)(eq, result, xmm_b); + FCODE(vminp)(result, result, xmm_b); + } + FCODE(blendvp)(result, eq); + FCODE(vblendvp)(result, result, GetNaNVector<fsize>(code), nan_mask); + } else { + code.movaps(mask, result); + code.movaps(eq, result); + code.movaps(nan_mask, result); + FCODE(cmpneqp)(mask, xmm_b); + FCODE(cmpordp)(nan_mask, xmm_b); + + if constexpr (is_max) { + code.andps(eq, xmm_b); + FCODE(maxp)(result, xmm_b); + } else { + code.orps(eq, xmm_b); + FCODE(minp)(result, xmm_b); + } + + code.andps(result, mask); + code.andnps(mask, eq); + code.orps(result, mask); + + code.andps(result, nan_mask); + code.andnps(nan_mask, GetNaNVector<fsize>(code)); + code.orps(result, nan_mask); + } + }); + + ctx.reg_alloc.DefineValue(inst, result); + + return; + } + + EmitThreeOpVectorOperation<fsize, DefaultIndexer>( + code, ctx, inst, [&](const Xbyak::Xmm& result, Xbyak::Xmm xmm_b) { + const Xbyak::Xmm mask = xmm0; + const Xbyak::Xmm eq = ctx.reg_alloc.ScratchXmm(); + + if (ctx.FPCR(fpcr_controlled).FZ()) { + const Xbyak::Xmm prev_xmm_b = xmm_b; + xmm_b = ctx.reg_alloc.ScratchXmm(); + code.movaps(xmm_b, prev_xmm_b); + DenormalsAreZero<fsize>(code, ctx.FPCR(fpcr_controlled), {result, xmm_b}, mask); + } + + // What we are doing here is handling the case when the inputs are differently signed zeros. + // x86-64 treats differently signed zeros as equal while ARM does not. + // Thus if we AND together things that x86-64 thinks are equal we'll get the positive zero. 
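+ // For example, with fsize == 32: +0.0 is 0x00000000 and -0.0 is 0x80000000, so ANDing the pair
+ // gives 0x00000000 == +0.0, matching ARM's FMAX(+0, -0); the min path ORs instead, giving
+ // 0x80000000 == -0.0 to match FMIN(+0, -0). The equality mask then applies this correction
+ // only to the lanes that x86-64 considers equal.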
+ + // vrangep{s,d} here ends up not being significantly shorter than the AVX implementation + + if (code.HasHostFeature(HostFeature::AVX)) { + FCODE(vcmpeqp)(mask, result, xmm_b); + if constexpr (is_max) { + FCODE(vandp)(eq, result, xmm_b); + FCODE(vmaxp)(result, result, xmm_b); + } else { + FCODE(vorp)(eq, result, xmm_b); + FCODE(vminp)(result, result, xmm_b); + } + FCODE(blendvp)(result, eq); + } else { + code.movaps(mask, result); + code.movaps(eq, result); + FCODE(cmpneqp)(mask, xmm_b); + + if constexpr (is_max) { + code.andps(eq, xmm_b); + FCODE(maxp)(result, xmm_b); + } else { + code.orps(eq, xmm_b); + FCODE(minp)(result, xmm_b); + } + + code.andps(result, mask); + code.andnps(mask, eq); + code.orps(result, mask); + } + }, + CheckInputNaN::Yes); +} + +template<size_t fsize, bool is_max> +static void EmitFPVectorMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + const bool fpcr_controlled = inst->GetArg(2).GetU1(); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm intermediate_result = ctx.reg_alloc.ScratchXmm(); + + const Xbyak::Xmm tmp1 = xmm0; + const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(); + + // NaN requirements: + // op1 op2 result + // SNaN anything op1 + // !SNaN SNaN op2 + // QNaN !NaN op2 + // !NaN QNaN op1 + // QNaN QNaN op1 + + if (code.HasHostFeature(HostFeature::AVX)) { + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + using FPT = mcl::unsigned_integer_of_size<fsize>; + + // result = xmm_a == SNaN || xmm_b == QNaN + { + // evaluate xmm_b == QNaN + FCODE(vcmpunordp)(tmp1, xmm_b, xmm_b); + ICODE(vpsll)(tmp2, xmm_b, static_cast<u8>(fsize - FP::FPInfo<FPT>::explicit_mantissa_width)); + { + code.vpsrad(tmp2, tmp2, 31); + if constexpr (fsize == 64) { + code.vpshufd(tmp2, tmp2, 0b11110101); + } + } + code.vandps(result, tmp1, tmp2); + + // evaluate xmm_a == SNaN + FCODE(vcmpunordp)(tmp1, xmm_a, xmm_a); + ICODE(vpsll)(tmp2, xmm_a, static_cast<u8>(fsize - FP::FPInfo<FPT>::explicit_mantissa_width)); + { + code.vpsrad(tmp2, tmp2, 31); + if constexpr (fsize == 64) { + code.vpshufd(tmp2, tmp2, 0b11110101); + } + } + code.vandnps(tmp2, tmp2, tmp1); + + code.vorps(result, tmp2); + } + + // Denormalization quiets SNaNs, therefore should happen after SNaN detection! 
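+ // (DenormalsAreZero performs the flush with arithmetic instructions - addps/addpd on the
+ // non-AVX512 path - and x86 arithmetic quiets SNaN operands, so flushing first would make
+ // every SNaN indistinguishable from a QNaN in the check above.)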
+ DenormalsAreZero<fsize>(code, ctx.FPCR(fpcr_controlled), {xmm_a, xmm_b}, tmp1); + + // intermediate result = max/min(xmm_a, xmm_b) + { + const Xbyak::Xmm eq_mask = tmp1; + const Xbyak::Xmm eq = tmp2; + + FCODE(vcmpeqp)(eq_mask, xmm_a, xmm_b); + + if constexpr (is_max) { + code.vandps(eq, xmm_a, xmm_b); + FCODE(vmaxp)(intermediate_result, xmm_a, xmm_b); + } else { + code.vorps(eq, xmm_a, xmm_b); + FCODE(vminp)(intermediate_result, xmm_a, xmm_b); + } + + code.blendvps(intermediate_result, eq); // eq_mask is in xmm0 + } + + { + code.vblendvps(result, intermediate_result, xmm_a, result); + } + + if (ctx.FPCR(fpcr_controlled).DN()) { + const Xbyak::Xmm ord_mask = tmp1; + + FCODE(vcmpunordp)(ord_mask, result, result); + code.blendvps(result, GetNaNVector<fsize>(code)); // ord_mask is in xmm0 + } else { + const Xbyak::Xmm nan_mask = tmp1; + + FCODE(vcmpunordp)(nan_mask, result, result); + code.vandps(nan_mask, nan_mask, GetVectorOf<fsize, FP::FPInfo<FPT>::mantissa_msb>(code)); + code.vorps(result, result, nan_mask); + } + }); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + using FPT = mcl::unsigned_integer_of_size<fsize>; + + // result = xmm_a == SNaN || xmm_b == QNaN + { + // evaluate xmm_b == QNaN + code.xorps(tmp1, tmp1); + FCODE(cmpunordp)(tmp1, xmm_b); + code.movaps(tmp2, xmm_b); + ICODE(psll)(tmp2, static_cast<int>(fsize - FP::FPInfo<FPT>::explicit_mantissa_width)); + { + code.psrad(tmp2, 31); + if constexpr (fsize == 64) { + code.pshufd(tmp2, tmp2, 0b11110101); + } + } + code.andps(tmp1, tmp2); + + code.movaps(result, tmp1); + + // evaluate xmm_a == SNaN + code.xorps(tmp1, tmp1); + FCODE(cmpunordp)(tmp1, xmm_a); + code.movaps(tmp2, xmm_a); + ICODE(psll)(tmp2, static_cast<int>(fsize - FP::FPInfo<FPT>::explicit_mantissa_width)); + { + code.psrad(tmp2, 31); + if constexpr (fsize == 64) { + code.pshufd(tmp2, tmp2, 0b11110101); + } + } + code.andnps(tmp2, tmp1); + + code.orps(result, tmp2); + } + + // Denormalization quiets SNaNs, therefore should happen after SNaN detection! 
+ DenormalsAreZero<fsize>(code, ctx.FPCR(fpcr_controlled), {xmm_a, xmm_b}, tmp1); + + // intermediate result = max/min(xmm_a, xmm_b) + { + const Xbyak::Xmm eq_mask = tmp1; + const Xbyak::Xmm eq = tmp2; + + code.movaps(eq_mask, xmm_a); + FCODE(cmpneqp)(eq_mask, xmm_b); + + code.movaps(eq, xmm_a); + code.movaps(intermediate_result, xmm_a); + if constexpr (is_max) { + code.andps(eq, xmm_b); + FCODE(maxp)(intermediate_result, xmm_b); + } else { + code.orps(eq, xmm_b); + FCODE(minp)(intermediate_result, xmm_b); + } + + code.andps(intermediate_result, eq_mask); + code.andnps(eq_mask, eq); + code.orps(intermediate_result, eq_mask); + } + + { + code.andps(xmm_a, result); + code.andnps(result, intermediate_result); + code.orps(result, xmm_a); + } + + if (ctx.FPCR(fpcr_controlled).DN()) { + const Xbyak::Xmm ord_mask = tmp1; + + code.xorps(ord_mask, ord_mask); + FCODE(cmpordp)(ord_mask, result); + + code.andps(result, ord_mask); + code.andnps(ord_mask, GetNaNVector<fsize>(code)); + code.orps(result, ord_mask); + } else { + const Xbyak::Xmm nan_mask = tmp1; + + code.xorps(nan_mask, nan_mask); + FCODE(cmpunordp)(nan_mask, result); + code.andps(nan_mask, GetVectorOf<fsize, FP::FPInfo<FPT>::mantissa_msb>(code)); + code.orps(result, nan_mask); + } + }); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitFPVectorMax32(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorMinMax<32, true>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorMax64(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorMinMax<64, true>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorMaxNumeric32(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorMinMaxNumeric<32, true>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorMaxNumeric64(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorMinMaxNumeric<64, true>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorMin32(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorMinMax<32, false>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorMin64(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorMinMax<64, false>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorMinNumeric32(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorMinMaxNumeric<32, false>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorMinNumeric64(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorMinMaxNumeric<64, false>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorMul32(EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpVectorOperation<32, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::mulps); +} + +void EmitX64::EmitFPVectorMul64(EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpVectorOperation<64, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::mulpd); +} + +template<typename FPT, bool needs_rounding_correction, bool needs_nan_correction> +static void EmitFPVectorMulAddFallback(VectorArray<FPT>& result, const VectorArray<FPT>& addend, const VectorArray<FPT>& op1, const VectorArray<FPT>& op2, FP::FPCR fpcr, [[maybe_unused]] FP::FPSR& fpsr) { + for (size_t i = 0; i < result.size(); i++) { + if constexpr (needs_rounding_correction) { + constexpr FPT non_sign_mask = FP::FPInfo<FPT>::exponent_mask | FP::FPInfo<FPT>::mantissa_mask; + constexpr FPT smallest_normal_number = FP::FPValue<FPT, false, FP::FPInfo<FPT>::exponent_min, 1>(); + if ((result[i] & non_sign_mask) == smallest_normal_number) { + result[i] = FP::FPMulAdd<FPT>(addend[i], op1[i], op2[i], fpcr, fpsr); + continue; + } + } + if constexpr (needs_nan_correction) { + if (FP::IsNaN(result[i])) { + if (FP::IsQNaN(addend[i]) && ((FP::IsZero(op1[i], 
fpcr) && FP::IsInf(op2[i])) || (FP::IsInf(op1[i]) && FP::IsZero(op2[i], fpcr)))) { + result[i] = FP::FPInfo<FPT>::DefaultNaN(); + } else if (auto r = FP::ProcessNaNs(addend[i], op1[i], op2[i])) { + result[i] = *r; + } else { + result[i] = FP::FPInfo<FPT>::DefaultNaN(); + } + } + } + } +} + +template<size_t fsize> +void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + using FPT = mcl::unsigned_integer_of_size<fsize>; + + const auto fallback_fn = [](VectorArray<FPT>& result, const VectorArray<FPT>& addend, const VectorArray<FPT>& op1, const VectorArray<FPT>& op2, FP::FPCR fpcr, FP::FPSR& fpsr) { + for (size_t i = 0; i < result.size(); i++) { + result[i] = FP::FPMulAdd<FPT>(addend[i], op1[i], op2[i], fpcr, fpsr); + } + }; + + if constexpr (fsize != 16) { + const bool fpcr_controlled = inst->GetArg(3).GetU1(); + const bool needs_rounding_correction = ctx.FPCR(fpcr_controlled).FZ(); + const bool needs_nan_correction = !(ctx.FPCR(fpcr_controlled).DN() || ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)); + + if (code.HasHostFeature(HostFeature::FMA) && !needs_rounding_correction && !needs_nan_correction) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm xmm_c = ctx.reg_alloc.UseXmm(args[2]); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + FCODE(vfmadd231p)(result, xmm_b, xmm_c); + ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), result); + }); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm xmm_c = ctx.reg_alloc.UseXmm(args[2]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel(); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + code.movaps(result, xmm_a); + FCODE(vfmadd231p)(result, xmm_b, xmm_c); + + if (needs_rounding_correction && needs_nan_correction) { + code.vandps(tmp, result, GetNonSignMaskVector<fsize>(code)); + FCODE(vcmpeq_uqp)(tmp, tmp, GetSmallestNormalVector<fsize>(code)); + } else if (needs_rounding_correction) { + code.vandps(tmp, result, GetNonSignMaskVector<fsize>(code)); + ICODE(vpcmpeq)(tmp, tmp, GetSmallestNormalVector<fsize>(code)); + } else if (needs_nan_correction) { + FCODE(vcmpunordp)(tmp, result, result); + } + code.vptest(tmp, tmp); + code.jnz(*fallback, code.T_NEAR); + code.L(*end); + ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), result); + }); + + ctx.deferred_emits.emplace_back([=, &code, &ctx] { + code.L(*fallback); + code.sub(rsp, 8); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); + if (needs_rounding_correction && needs_nan_correction) { + EmitFourOpFallbackWithoutRegAlloc<LoadPreviousResult::Yes>(code, ctx, result, xmm_a, xmm_b, xmm_c, EmitFPVectorMulAddFallback<FPT, true, true>, fpcr_controlled); + } else if (needs_rounding_correction) { + EmitFourOpFallbackWithoutRegAlloc<LoadPreviousResult::Yes>(code, ctx, result, xmm_a, xmm_b, xmm_c, EmitFPVectorMulAddFallback<FPT, true, false>, fpcr_controlled); + } else if (needs_nan_correction) { + 
EmitFourOpFallbackWithoutRegAlloc<LoadPreviousResult::Yes>(code, ctx, result, xmm_a, xmm_b, xmm_c, EmitFPVectorMulAddFallback<FPT, false, true>, fpcr_controlled); + } + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); + code.add(rsp, 8); + code.jmp(*end, code.T_NEAR); + }); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm operand2 = ctx.reg_alloc.UseScratchXmm(args[1]); + const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]); + + FCODE(mulp)(operand2, operand3); + FCODE(addp)(operand1, operand2); + + ctx.reg_alloc.DefineValue(inst, operand1); + return; + } + } + + EmitFourOpFallback(code, ctx, inst, fallback_fn); +} + +void EmitX64::EmitFPVectorMulAdd16(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorMulAdd<16>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorMulAdd32(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorMulAdd<32>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorMulAdd64(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorMulAdd<64>(code, ctx, inst); +} + +template<size_t fsize> +static void EmitFPVectorMulX(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + using FPT = mcl::unsigned_integer_of_size<fsize>; + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool fpcr_controlled = args[2].GetImmediateU1(); + + if (ctx.FPCR(fpcr_controlled).DN() && code.HasHostFeature(HostFeature::AVX)) { + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm twos = ctx.reg_alloc.ScratchXmm(); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + FCODE(vcmpunordp)(xmm0, result, operand); + FCODE(vxorp)(twos, result, operand); + FCODE(mulp)(result, operand); + FCODE(andp)(twos, GetNegativeZeroVector<fsize>(code)); + FCODE(vcmpunordp)(tmp, result, result); + FCODE(blendvp)(result, GetNaNVector<fsize>(code)); + FCODE(orp)(twos, GetVectorOf<fsize, false, 0, 2>(code)); + FCODE(andnp)(xmm0, tmp); + FCODE(blendvp)(result, twos); + }); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm(); + + code.movaps(nan_mask, xmm_b); + code.movaps(result, xmm_a); + FCODE(cmpunordp)(nan_mask, xmm_a); + FCODE(mulp)(result, xmm_b); + FCODE(cmpunordp)(nan_mask, result); + + const auto nan_handler = Common::FptrCast( + [](std::array<VectorArray<FPT>, 3>& values, FP::FPCR fpcr) { + VectorArray<FPT>& result = values[0]; + for (size_t elementi = 0; elementi < result.size(); ++elementi) { + if (auto r = FP::ProcessNaNs(values[1][elementi], values[2][elementi])) { + result[elementi] = fpcr.DN() ? 
FP::FPInfo<FPT>::DefaultNaN() : *r; + } else if (FP::IsNaN(result[elementi])) { + const FPT sign = (values[1][elementi] ^ values[2][elementi]) & FP::FPInfo<FPT>::sign_mask; + result[elementi] = sign | FP::FPValue<FPT, false, 0, 2>(); + } + } + }); + + HandleNaNs<fsize, 2>(code, ctx, fpcr_controlled, {result, xmm_a, xmm_b}, nan_mask, nan_handler); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitX64::EmitFPVectorMulX32(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorMulX<32>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorMulX64(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorMulX<64>(code, ctx, inst); +} + +template<size_t fsize> +void FPVectorNeg(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + using FPT = mcl::unsigned_integer_of_size<fsize>; + constexpr FPT sign_mask = FP::FPInfo<FPT>::sign_mask; + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Address mask = code.BConst<fsize>(xword, sign_mask); + + code.xorps(a, mask); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitX64::EmitFPVectorNeg16(EmitContext& ctx, IR::Inst* inst) { + FPVectorNeg<16>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorNeg32(EmitContext& ctx, IR::Inst* inst) { + FPVectorNeg<32>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorNeg64(EmitContext& ctx, IR::Inst* inst) { + FPVectorNeg<64>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorPairedAdd32(EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpVectorOperation<32, PairedIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::haddps); +} + +void EmitX64::EmitFPVectorPairedAdd64(EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpVectorOperation<64, PairedIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::haddpd); +} + +void EmitX64::EmitFPVectorPairedAddLower32(EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpVectorOperation<32, PairedLowerIndexer>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm xmm_b) { + const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(); + code.xorps(zero, zero); + code.punpcklqdq(result, xmm_b); + code.haddps(result, zero); + }); +} + +void EmitX64::EmitFPVectorPairedAddLower64(EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpVectorOperation<64, PairedLowerIndexer>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm xmm_b) { + const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(); + code.xorps(zero, zero); + code.punpcklqdq(result, xmm_b); + code.haddpd(result, zero); + }); +} + +template<size_t fsize> +static void EmitRecipEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + using FPT = mcl::unsigned_integer_of_size<fsize>; + + if constexpr (fsize != 16) { + if (ctx.HasOptimization(OptimizationFlag::Unsafe_ReducedErrorFP)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + FCODE(vrcp14p)(result, operand); + } else { + if constexpr (fsize == 32) { + code.rcpps(result, operand); + } else { + code.cvtpd2ps(result, operand); + code.rcpps(result, result); + code.cvtps2pd(result, result); + } + } + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + } + + EmitTwoOpFallback(code, ctx, inst, [](VectorArray<FPT>& result, const VectorArray<FPT>& operand, FP::FPCR fpcr, FP::FPSR& fpsr) { + for (size_t i = 0; i < result.size(); i++) { + result[i] = FP::FPRecipEstimate<FPT>(operand[i], fpcr, fpsr); + } + }); +} + +void 
EmitX64::EmitFPVectorRecipEstimate16(EmitContext& ctx, IR::Inst* inst) { + EmitRecipEstimate<16>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorRecipEstimate32(EmitContext& ctx, IR::Inst* inst) { + EmitRecipEstimate<32>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorRecipEstimate64(EmitContext& ctx, IR::Inst* inst) { + EmitRecipEstimate<64>(code, ctx, inst); +} + +template<size_t fsize> +static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + using FPT = mcl::unsigned_integer_of_size<fsize>; + + const auto fallback_fn = [](VectorArray<FPT>& result, const VectorArray<FPT>& op1, const VectorArray<FPT>& op2, FP::FPCR fpcr, FP::FPSR& fpsr) { + for (size_t i = 0; i < result.size(); i++) { + result[i] = FP::FPRecipStepFused<FPT>(op1[i], op2[i], fpcr, fpsr); + } + }; + + if constexpr (fsize != 16) { + if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX) && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool fpcr_controlled = args[2].GetImmediateU1(); + + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + code.movaps(result, GetVectorOf<fsize, false, 0, 2>(code)); + FCODE(vfnmadd231p)(result, operand1, operand2); + }); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool fpcr_controlled = args[2].GetImmediateU1(); + + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel(); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + code.movaps(result, GetVectorOf<fsize, false, 0, 2>(code)); + FCODE(vfnmadd231p)(result, operand1, operand2); + + FCODE(vcmpunordp)(tmp, result, result); + code.vptest(tmp, tmp); + code.jnz(*fallback, code.T_NEAR); + code.L(*end); + }); + + ctx.deferred_emits.emplace_back([=, &code, &ctx] { + code.L(*fallback); + code.sub(rsp, 8); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); + EmitThreeOpFallbackWithoutRegAlloc(code, ctx, result, operand1, operand2, fallback_fn, fpcr_controlled); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); + code.add(rsp, 8); + code.jmp(*end, code.T_NEAR); + }); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + code.movaps(result, GetVectorOf<fsize, false, 0, 2>(code)); + FCODE(mulp)(operand1, operand2); + FCODE(subp)(result, operand1); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + } + + EmitThreeOpFallback(code, ctx, inst, fallback_fn); +} + +void EmitX64::EmitFPVectorRecipStepFused16(EmitContext& ctx, IR::Inst* inst) { + EmitRecipStepFused<16>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorRecipStepFused32(EmitContext& ctx, IR::Inst* inst) { + 
EmitRecipStepFused<32>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorRecipStepFused64(EmitContext& ctx, IR::Inst* inst) { + EmitRecipStepFused<64>(code, ctx, inst); +} + +template<size_t fsize> +void EmitFPVectorRoundInt(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + const auto rounding = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8()); + const bool exact = inst->GetArg(2).GetU1(); + + if constexpr (fsize != 16) { + if (code.HasHostFeature(HostFeature::SSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero && !exact) { + const u8 round_imm = [&]() -> u8 { + switch (rounding) { + case FP::RoundingMode::ToNearest_TieEven: + return 0b00; + case FP::RoundingMode::TowardsPlusInfinity: + return 0b10; + case FP::RoundingMode::TowardsMinusInfinity: + return 0b01; + case FP::RoundingMode::TowardsZero: + return 0b11; + default: + UNREACHABLE(); + } + }(); + + EmitTwoOpVectorOperation<fsize, DefaultIndexer, 3>(code, ctx, inst, [&](const Xbyak::Xmm& result, const Xbyak::Xmm& xmm_a) { + FCODE(roundp)(result, xmm_a, round_imm); + }); + + return; + } + } + + using rounding_list = mp::list< + mp::lift_value<FP::RoundingMode::ToNearest_TieEven>, + mp::lift_value<FP::RoundingMode::TowardsPlusInfinity>, + mp::lift_value<FP::RoundingMode::TowardsMinusInfinity>, + mp::lift_value<FP::RoundingMode::TowardsZero>, + mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>; + using exact_list = mp::list<std::true_type, std::false_type>; + + static const auto lut = Common::GenerateLookupTableFromList( + []<typename I>(I) { + using FPT = mcl::unsigned_integer_of_size<fsize>; // WORKAROUND: For issue 678 on MSVC + return std::pair{ + mp::lower_to_tuple_v<I>, + Common::FptrCast( + [](VectorArray<FPT>& output, const VectorArray<FPT>& input, FP::FPCR fpcr, FP::FPSR& fpsr) { + constexpr FP::RoundingMode rounding_mode = mp::get<0, I>::value; + constexpr bool exact = mp::get<1, I>::value; + + for (size_t i = 0; i < output.size(); ++i) { + output[i] = static_cast<FPT>(FP::FPRoundInt<FPT>(input[i], fpcr, rounding_mode, exact, fpsr)); + } + })}; + }, + mp::cartesian_product<rounding_list, exact_list>{}); + + EmitTwoOpFallback<3>(code, ctx, inst, lut.at(std::make_tuple(rounding, exact))); +} + +void EmitX64::EmitFPVectorRoundInt16(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorRoundInt<16>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorRoundInt32(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorRoundInt<32>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorRoundInt64(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorRoundInt<64>(code, ctx, inst); +} + +template<size_t fsize> +static void EmitRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + using FPT = mcl::unsigned_integer_of_size<fsize>; + + const auto fallback_fn = [](VectorArray<FPT>& result, const VectorArray<FPT>& operand, FP::FPCR fpcr, FP::FPSR& fpsr) { + for (size_t i = 0; i < result.size(); i++) { + result[i] = FP::FPRSqrtEstimate<FPT>(operand[i], fpcr, fpsr); + } + }; + + if constexpr (fsize != 16) { + if (ctx.HasOptimization(OptimizationFlag::Unsafe_ReducedErrorFP)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + FCODE(vrsqrt14p)(result, operand); + } else { + if constexpr (fsize == 32) { + code.rsqrtps(result, operand); + } else { + code.cvtpd2ps(result, operand); + code.rsqrtps(result, result); + code.cvtps2pd(result, 
result); + } + } + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + if (code.HasHostFeature(HostFeature::AVX)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool fpcr_controlled = args[1].GetImmediateU1(); + + const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm value = ctx.reg_alloc.ScratchXmm(); + + SharedLabel bad_values = GenSharedLabel(), end = GenSharedLabel(); + + code.movaps(value, operand); + + code.movaps(xmm0, GetVectorOf<fsize, (fsize == 32 ? 0xFFFF8000 : 0xFFFF'F000'0000'0000)>(code)); + code.pand(value, xmm0); + code.por(value, GetVectorOf<fsize, (fsize == 32 ? 0x00008000 : 0x0000'1000'0000'0000)>(code)); + + // Detect NaNs, negatives, zeros, denormals and infinities + FCODE(vcmpnge_uqp)(result, value, GetVectorOf<fsize, (FPT(1) << FP::FPInfo<FPT>::explicit_mantissa_width)>(code)); + code.vptest(result, result); + code.jnz(*bad_values, code.T_NEAR); + + FCODE(sqrtp)(value, value); + code.vmovaps(result, GetVectorOf<fsize, FP::FPValue<FPT, false, 0, 1>()>(code)); + FCODE(divp)(result, value); + + ICODE(padd)(result, GetVectorOf<fsize, (fsize == 32 ? 0x00004000 : 0x0000'0800'0000'0000)>(code)); + code.pand(result, xmm0); + + code.L(*end); + + ctx.deferred_emits.emplace_back([=, &code, &ctx] { + code.L(*bad_values); + code.sub(rsp, 8); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); + EmitTwoOpFallbackWithoutRegAlloc(code, ctx, result, operand, fallback_fn, fpcr_controlled); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); + code.add(rsp, 8); + code.jmp(*end, code.T_NEAR); + }); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + } + + EmitTwoOpFallback(code, ctx, inst, fallback_fn); +} + +void EmitX64::EmitFPVectorRSqrtEstimate16(EmitContext& ctx, IR::Inst* inst) { + EmitRSqrtEstimate<16>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorRSqrtEstimate32(EmitContext& ctx, IR::Inst* inst) { + EmitRSqrtEstimate<32>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorRSqrtEstimate64(EmitContext& ctx, IR::Inst* inst) { + EmitRSqrtEstimate<64>(code, ctx, inst); +} + +template<size_t fsize> +static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + using FPT = mcl::unsigned_integer_of_size<fsize>; + + const auto fallback_fn = [](VectorArray<FPT>& result, const VectorArray<FPT>& op1, const VectorArray<FPT>& op2, FP::FPCR fpcr, FP::FPSR& fpsr) { + for (size_t i = 0; i < result.size(); i++) { + result[i] = FP::FPRSqrtStepFused<FPT>(op1[i], op2[i], fpcr, fpsr); + } + }; + + if constexpr (fsize != 16) { + if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX) && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool fpcr_controlled = args[2].GetImmediateU1(); + + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + code.vmovaps(result, GetVectorOf<fsize, false, 0, 3>(code)); + FCODE(vfnmadd231p)(result, operand1, operand2); + FCODE(vmulp)(result, result, GetVectorOf<fsize, false, -1, 1>(code)); + }); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool fpcr_controlled = 
args[2].GetImmediateU1(); + + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(); + + SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel(); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + code.vmovaps(result, GetVectorOf<fsize, false, 0, 3>(code)); + FCODE(vfnmadd231p)(result, operand1, operand2); + + // An explanation for this is given in EmitFPRSqrtStepFused. + code.vmovaps(mask, GetVectorOf<fsize, (fsize == 32 ? 0x7f000000 : 0x7fe0000000000000)>(code)); + FCODE(vandp)(tmp, result, mask); + ICODE(vpcmpeq)(tmp, tmp, mask); + code.ptest(tmp, tmp); + code.jnz(*fallback, code.T_NEAR); + + FCODE(vmulp)(result, result, GetVectorOf<fsize, false, -1, 1>(code)); + code.L(*end); + }); + + ctx.deferred_emits.emplace_back([=, &code, &ctx] { + code.L(*fallback); + code.sub(rsp, 8); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); + EmitThreeOpFallbackWithoutRegAlloc(code, ctx, result, operand1, operand2, fallback_fn, fpcr_controlled); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); + code.add(rsp, 8); + code.jmp(*end, code.T_NEAR); + }); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + + code.movaps(result, GetVectorOf<fsize, false, 0, 3>(code)); + FCODE(mulp)(operand1, operand2); + FCODE(subp)(result, operand1); + FCODE(mulp)(result, GetVectorOf<fsize, false, -1, 1>(code)); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + } + + EmitThreeOpFallback(code, ctx, inst, fallback_fn); +} + +void EmitX64::EmitFPVectorRSqrtStepFused16(EmitContext& ctx, IR::Inst* inst) { + EmitRSqrtStepFused<16>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorRSqrtStepFused32(EmitContext& ctx, IR::Inst* inst) { + EmitRSqrtStepFused<32>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorRSqrtStepFused64(EmitContext& ctx, IR::Inst* inst) { + EmitRSqrtStepFused<64>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorSqrt32(EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpVectorOperation<32, DefaultIndexer>(code, ctx, inst, [this](const Xbyak::Xmm& result, const Xbyak::Xmm& operand) { + code.sqrtps(result, operand); + }); +} + +void EmitX64::EmitFPVectorSqrt64(EmitContext& ctx, IR::Inst* inst) { + EmitTwoOpVectorOperation<64, DefaultIndexer>(code, ctx, inst, [this](const Xbyak::Xmm& result, const Xbyak::Xmm& operand) { + code.sqrtpd(result, operand); + }); +} + +void EmitX64::EmitFPVectorSub32(EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpVectorOperation<32, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::subps); +} + +void EmitX64::EmitFPVectorSub64(EmitContext& ctx, IR::Inst* inst) { + EmitThreeOpVectorOperation<64, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::subpd); +} + +void EmitX64::EmitFPVectorToHalf32(EmitContext& ctx, IR::Inst* inst) { + const auto rounding_mode = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8()); + const bool fpcr_controlled = inst->GetArg(2).GetU1(); + + if (code.HasHostFeature(HostFeature::F16C) && !ctx.FPCR().AHP() && 
!ctx.FPCR().FZ16()) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode); + + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + + ForceToDefaultNaN<32>(code, ctx.FPCR(fpcr_controlled), result); + code.vcvtps2ph(result, result, static_cast<u8>(*round_imm)); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + using rounding_list = mp::list< + mp::lift_value<FP::RoundingMode::ToNearest_TieEven>, + mp::lift_value<FP::RoundingMode::TowardsPlusInfinity>, + mp::lift_value<FP::RoundingMode::TowardsMinusInfinity>, + mp::lift_value<FP::RoundingMode::TowardsZero>, + mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>; + + static const auto lut = Common::GenerateLookupTableFromList( + []<typename I>(I) { + return std::pair{ + mp::lower_to_tuple_v<I>, + Common::FptrCast( + [](VectorArray<u16>& output, const VectorArray<u32>& input, FP::FPCR fpcr, FP::FPSR& fpsr) { + constexpr FP::RoundingMode rounding_mode = mp::get<0, I>::value; + + for (size_t i = 0; i < output.size(); ++i) { + if (i < input.size()) { + output[i] = FP::FPConvert<u16, u32>(input[i], fpcr, rounding_mode, fpsr); + } else { + output[i] = 0; + } + } + })}; + }, + mp::cartesian_product<rounding_list>{}); + + EmitTwoOpFallback<2>(code, ctx, inst, lut.at(std::make_tuple(rounding_mode))); +} + +template<size_t fsize, bool unsigned_> +void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + const size_t fbits = inst->GetArg(1).GetU8(); + const auto rounding = static_cast<FP::RoundingMode>(inst->GetArg(2).GetU8()); + [[maybe_unused]] const bool fpcr_controlled = inst->GetArg(3).GetU1(); + + if constexpr (fsize != 16) { + if (code.HasHostFeature(HostFeature::SSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm src = ctx.reg_alloc.UseScratchXmm(args[0]); + + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + const int round_imm = [&] { + switch (rounding) { + case FP::RoundingMode::ToNearest_TieEven: + default: + return 0b00; + case FP::RoundingMode::TowardsPlusInfinity: + return 0b10; + case FP::RoundingMode::TowardsMinusInfinity: + return 0b01; + case FP::RoundingMode::TowardsZero: + return 0b11; + } + }(); + + const auto perform_conversion = [&code, &ctx](const Xbyak::Xmm& src) { + // MSVC doesn't allow us to use a [&] capture, so we have to do this instead. + (void)ctx; + + if constexpr (fsize == 32) { + code.cvttps2dq(src, src); + } else { + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + code.vcvttpd2qq(src, src); + } else { + const Xbyak::Reg64 hi = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 lo = ctx.reg_alloc.ScratchGpr(); + + code.cvttsd2si(lo, src); + code.punpckhqdq(src, src); + code.cvttsd2si(hi, src); + code.movq(src, lo); + code.pinsrq(src, hi, 1); + + ctx.reg_alloc.Release(hi); + ctx.reg_alloc.Release(lo); + } + } + }; + + if (fbits != 0) { + const u64 scale_factor = fsize == 32 + ? static_cast<u64>(fbits + 127) << 23 + : static_cast<u64>(fbits + 1023) << 52; + FCODE(mulp)(src, GetVectorOf<fsize>(code, scale_factor)); + } + + FCODE(roundp)(src, src, static_cast<u8>(round_imm)); + ZeroIfNaN<fsize>(code, src); + + constexpr u64 float_upper_limit_signed = fsize == 32 ? 0x4f000000 : 0x43e0000000000000; + [[maybe_unused]] constexpr u64 float_upper_limit_unsigned = fsize == 32 ? 
0x4f800000 : 0x43f0000000000000; + + if constexpr (unsigned_) { + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + // Mask positive values + code.xorps(xmm0, xmm0); + FCODE(vcmpp)(k1, src, xmm0, Cmp::GreaterEqual_OQ); + + // Convert positive values to unsigned integers, write 0 anywhere else + // vcvttp*2u*q already saturates out-of-range values to (0xFFFF...) + if constexpr (fsize == 32) { + code.vcvttps2udq(src | k1 | T_z, src); + } else { + code.vcvttpd2uqq(src | k1 | T_z, src); + } + } else { + // Zero is minimum + code.xorps(xmm0, xmm0); + FCODE(cmplep)(xmm0, src); + FCODE(andp)(src, xmm0); + + // Will we exceed unsigned range? + const Xbyak::Xmm exceed_unsigned = ctx.reg_alloc.ScratchXmm(); + code.movaps(exceed_unsigned, GetVectorOf<fsize, float_upper_limit_unsigned>(code)); + FCODE(cmplep)(exceed_unsigned, src); + + // Will we exceed signed range? + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + code.movaps(tmp, GetVectorOf<fsize, float_upper_limit_signed>(code)); + code.movaps(xmm0, tmp); + FCODE(cmplep)(xmm0, src); + FCODE(andp)(tmp, xmm0); + FCODE(subp)(src, tmp); + perform_conversion(src); + ICODE(psll)(xmm0, static_cast<u8>(fsize - 1)); + FCODE(orp)(src, xmm0); + + // Saturate to max + FCODE(orp)(src, exceed_unsigned); + } + } else { + using FPT = mcl::unsigned_integer_of_size<fsize>; // WORKAROUND: For issue 678 on MSVC + constexpr u64 integer_max = static_cast<FPT>(std::numeric_limits<std::conditional_t<unsigned_, FPT, std::make_signed_t<FPT>>>::max()); + + code.movaps(xmm0, GetVectorOf<fsize, float_upper_limit_signed>(code)); + FCODE(cmplep)(xmm0, src); + perform_conversion(src); + FCODE(blendvp)(src, GetVectorOf<fsize, integer_max>(code)); + } + }); + + ctx.reg_alloc.DefineValue(inst, src); + return; + } + } + + using fbits_list = mp::lift_sequence<std::make_index_sequence<fsize + 1>>; + using rounding_list = mp::list< + mp::lift_value<FP::RoundingMode::ToNearest_TieEven>, + mp::lift_value<FP::RoundingMode::TowardsPlusInfinity>, + mp::lift_value<FP::RoundingMode::TowardsMinusInfinity>, + mp::lift_value<FP::RoundingMode::TowardsZero>, + mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>; + + static const auto lut = Common::GenerateLookupTableFromList( + []<typename I>(I) { + using FPT = mcl::unsigned_integer_of_size<fsize>; // WORKAROUND: For issue 678 on MSVC + return std::pair{ + mp::lower_to_tuple_v<I>, + Common::FptrCast( + [](VectorArray<FPT>& output, const VectorArray<FPT>& input, FP::FPCR fpcr, FP::FPSR& fpsr) { + constexpr size_t fbits = mp::get<0, I>::value; + constexpr FP::RoundingMode rounding_mode = mp::get<1, I>::value; + + for (size_t i = 0; i < output.size(); ++i) { + output[i] = static_cast<FPT>(FP::FPToFixed<FPT>(fsize, input[i], fbits, unsigned_, fpcr, rounding_mode, fpsr)); + } + })}; + }, + mp::cartesian_product<fbits_list, rounding_list>{}); + + EmitTwoOpFallback<3>(code, ctx, inst, lut.at(std::make_tuple(fbits, rounding))); +} + +void EmitX64::EmitFPVectorToSignedFixed16(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorToFixed<16, false>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorToSignedFixed32(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorToFixed<32, false>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorToSignedFixed64(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorToFixed<64, false>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorToUnsignedFixed16(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorToFixed<16, true>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorToUnsignedFixed32(EmitContext& ctx, 
IR::Inst* inst) { + EmitFPVectorToFixed<32, true>(code, ctx, inst); +} + +void EmitX64::EmitFPVectorToUnsignedFixed64(EmitContext& ctx, IR::Inst* inst) { + EmitFPVectorToFixed<64, true>(code, ctx, inst); +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_saturation.cpp b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_saturation.cpp new file mode 100644 index 0000000000..fb30549fb0 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_saturation.cpp @@ -0,0 +1,340 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/stdint.hpp> + +#include "dynarmic/backend/x64/block_of_code.h" +#include "dynarmic/backend/x64/constants.h" +#include "dynarmic/backend/x64/emit_x64.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" + +#define FCODE(NAME) \ + [&code](auto... args) { \ + if constexpr (esize == 32) { \ + code.NAME##s(args...); \ + } else { \ + code.NAME##d(args...); \ + } \ + } + +#define ICODE(NAME) \ + [&code](auto... args) { \ + if constexpr (esize == 32) { \ + code.NAME##d(args...); \ + } else { \ + code.NAME##q(args...); \ + } \ + } + +namespace Dynarmic::Backend::X64 { + +using namespace Xbyak::util; + +namespace { + +void EmitVectorSaturatedNative(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*saturated_fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&), void (Xbyak::CodeGenerator::*unsaturated_fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&), void (Xbyak::CodeGenerator::*sub_fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&)) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm addend = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8(); + + code.movaps(xmm0, result); + + (code.*saturated_fn)(result, addend); + + (code.*unsaturated_fn)(xmm0, addend); + (code.*sub_fn)(xmm0, result); + if (code.HasHostFeature(HostFeature::SSE41)) { + code.ptest(xmm0, xmm0); + } else { + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + code.pxor(tmp, tmp); + code.pcmpeqw(xmm0, tmp); + code.pmovmskb(overflow.cvt32(), xmm0); + code.xor_(overflow.cvt32(), 0xFFFF); + code.test(overflow.cvt32(), overflow.cvt32()); + } + code.setnz(overflow); + code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow); + + ctx.reg_alloc.DefineValue(inst, result); +} + +enum class Op { + Add, + Sub, +}; + +template<Op op, size_t esize> +void EmitVectorSignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + static_assert(esize == 32 || esize == 64); + constexpr u64 msb_mask = esize == 32 ? 
0x8000000080000000 : 0x8000000000000000; + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) { + const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8(); + + code.movaps(xmm0, operand1); + + if constexpr (op == Op::Add) { + ICODE(vpadd)(result, operand1, operand2); + code.vpternlogd(xmm0, result, operand2, 0b00100100); + } else { + ICODE(vpsub)(result, operand1, operand2); + code.vpternlogd(xmm0, result, operand2, 0b00011000); + } + if constexpr (esize == 32) { + code.vpmovd2m(k1, xmm0); + } else { + code.vpmovq2m(k1, xmm0); + } + ICODE(vpsra)(result | k1, result, u8(esize - 1)); + ICODE(vpxor)(result | k1, result, code.BConst<esize>(xword_b, msb_mask)); + + code.ktestb(k1, k1); + code.setnz(overflow); + code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + const Xbyak::Xmm operand1 = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.UseXmm(args[0]) : ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm result = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.ScratchXmm() : operand1; + const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8(); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + if (code.HasHostFeature(HostFeature::AVX)) { + if constexpr (op == Op::Add) { + ICODE(vpadd)(result, operand1, operand2); + } else { + ICODE(vpsub)(result, operand1, operand2); + } + code.vpxor(xmm0, operand1, operand2); + code.vpxor(tmp, operand1, result); + } else { + code.movaps(xmm0, operand1); + code.movaps(tmp, operand1); + if constexpr (op == Op::Add) { + ICODE(padd)(result, operand2); + } else { + ICODE(psub)(result, operand2); + } + code.pxor(xmm0, operand2); + code.pxor(tmp, result); + } + + if constexpr (op == Op::Add) { + code.pandn(xmm0, tmp); + } else { + code.pand(xmm0, tmp); + } + + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpsrad(tmp, result, 31); + } else { + code.movaps(tmp, result); + code.psrad(tmp, 31); + } + if constexpr (esize == 64) { + code.pshufd(tmp, tmp, 0b11110101); + } + code.pxor(tmp, code.BConst<esize>(xword, msb_mask)); + + if (code.HasHostFeature(HostFeature::SSE41)) { + code.ptest(xmm0, code.Const(xword, msb_mask, msb_mask)); + } else { + FCODE(movmskp)(overflow.cvt32(), xmm0); + code.test(overflow.cvt32(), overflow.cvt32()); + } + code.setnz(overflow); + code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow); + + if (code.HasHostFeature(HostFeature::SSE41)) { + FCODE(blendvp)(result, tmp); + + ctx.reg_alloc.DefineValue(inst, result); + } else { + code.psrad(xmm0, 31); + if constexpr (esize == 64) { + code.pshufd(xmm0, xmm0, 0b11110101); + } + + code.pand(tmp, xmm0); + code.pandn(xmm0, result); + code.por(tmp, xmm0); + + ctx.reg_alloc.DefineValue(inst, tmp); + } +} + +template<Op op, size_t esize> +void EmitVectorUnsignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + static_assert(esize == 32 || esize == 64); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) { + const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]); + const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); + const 
Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8(); + + if constexpr (op == Op::Add) { + ICODE(vpadd)(result, operand1, operand2); + ICODE(vpcmpu)(k1, result, operand1, CmpInt::LessThan); + ICODE(vpternlog)(result | k1, result, result, u8(0xFF)); + } else { + ICODE(vpsub)(result, operand1, operand2); + ICODE(vpcmpu)(k1, result, operand1, CmpInt::GreaterThan); + ICODE(vpxor)(result | k1, result, result); + } + + code.ktestb(k1, k1); + code.setnz(overflow); + code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + const Xbyak::Xmm operand1 = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.UseXmm(args[0]) : ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); + const Xbyak::Xmm result = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.ScratchXmm() : operand1; + const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8(); + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + + if constexpr (op == Op::Add) { + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpxor(xmm0, operand1, operand2); + code.vpand(tmp, operand1, operand2); + ICODE(vpadd)(result, operand1, operand2); + } else { + code.movaps(tmp, operand1); + code.movaps(xmm0, operand1); + + code.pxor(xmm0, operand2); + code.pand(tmp, operand2); + ICODE(padd)(result, operand2); + } + + ICODE(psrl)(xmm0, 1); + ICODE(padd)(tmp, xmm0); + } else { + if (code.HasHostFeature(HostFeature::AVX)) { + code.vpxor(tmp, operand1, operand2); + ICODE(vpsub)(result, operand1, operand2); + code.vpand(xmm0, operand2, tmp); + } else { + code.movaps(tmp, operand1); + code.movaps(xmm0, operand2); + + code.pxor(tmp, operand2); + ICODE(psub)(result, operand2); + code.pand(xmm0, tmp); + } + + ICODE(psrl)(tmp, 1); + ICODE(psub)(tmp, xmm0); + } + + code.psrad(tmp, 31); + if constexpr (esize == 64) { + code.pshufd(tmp, tmp, 0b11110101); + } + + if (code.HasHostFeature(HostFeature::SSE41)) { + code.ptest(tmp, tmp); + } else { + FCODE(movmskp)(overflow.cvt32(), tmp); + code.test(overflow.cvt32(), overflow.cvt32()); + } + + code.setnz(overflow); + code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow); + + if constexpr (op == Op::Add) { + code.por(result, tmp); + ctx.reg_alloc.DefineValue(inst, result); + } else { + code.pandn(tmp, result); + ctx.reg_alloc.DefineValue(inst, tmp); + } +} + +} // anonymous namespace + +void EmitX64::EmitVectorSignedSaturatedAdd8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSaturatedNative(code, ctx, inst, &Xbyak::CodeGenerator::paddsb, &Xbyak::CodeGenerator::paddb, &Xbyak::CodeGenerator::psubb); +} + +void EmitX64::EmitVectorSignedSaturatedAdd16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSaturatedNative(code, ctx, inst, &Xbyak::CodeGenerator::paddsw, &Xbyak::CodeGenerator::paddw, &Xbyak::CodeGenerator::psubw); +} + +void EmitX64::EmitVectorSignedSaturatedAdd32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedSaturated<Op::Add, 32>(code, ctx, inst); +} + +void EmitX64::EmitVectorSignedSaturatedAdd64(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedSaturated<Op::Add, 64>(code, ctx, inst); +} + +void EmitX64::EmitVectorSignedSaturatedSub8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSaturatedNative(code, ctx, inst, &Xbyak::CodeGenerator::psubsb, &Xbyak::CodeGenerator::psubb, &Xbyak::CodeGenerator::psubb); +} + +void EmitX64::EmitVectorSignedSaturatedSub16(EmitContext& ctx, 
IR::Inst* inst) { + EmitVectorSaturatedNative(code, ctx, inst, &Xbyak::CodeGenerator::psubsw, &Xbyak::CodeGenerator::psubw, &Xbyak::CodeGenerator::psubw); +} + +void EmitX64::EmitVectorSignedSaturatedSub32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedSaturated<Op::Sub, 32>(code, ctx, inst); +} + +void EmitX64::EmitVectorSignedSaturatedSub64(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSignedSaturated<Op::Sub, 64>(code, ctx, inst); +} + +void EmitX64::EmitVectorUnsignedSaturatedAdd8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSaturatedNative(code, ctx, inst, &Xbyak::CodeGenerator::paddusb, &Xbyak::CodeGenerator::paddb, &Xbyak::CodeGenerator::psubb); +} + +void EmitX64::EmitVectorUnsignedSaturatedAdd16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSaturatedNative(code, ctx, inst, &Xbyak::CodeGenerator::paddusw, &Xbyak::CodeGenerator::paddw, &Xbyak::CodeGenerator::psubw); +} + +void EmitX64::EmitVectorUnsignedSaturatedAdd32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorUnsignedSaturated<Op::Add, 32>(code, ctx, inst); +} + +void EmitX64::EmitVectorUnsignedSaturatedAdd64(EmitContext& ctx, IR::Inst* inst) { + EmitVectorUnsignedSaturated<Op::Add, 64>(code, ctx, inst); +} + +void EmitX64::EmitVectorUnsignedSaturatedSub8(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSaturatedNative(code, ctx, inst, &Xbyak::CodeGenerator::psubusb, &Xbyak::CodeGenerator::psubb, &Xbyak::CodeGenerator::psubb); +} + +void EmitX64::EmitVectorUnsignedSaturatedSub16(EmitContext& ctx, IR::Inst* inst) { + EmitVectorSaturatedNative(code, ctx, inst, &Xbyak::CodeGenerator::psubusw, &Xbyak::CodeGenerator::psubw, &Xbyak::CodeGenerator::psubw); +} + +void EmitX64::EmitVectorUnsignedSaturatedSub32(EmitContext& ctx, IR::Inst* inst) { + EmitVectorUnsignedSaturated<Op::Sub, 32>(code, ctx, inst); +} + +void EmitX64::EmitVectorUnsignedSaturatedSub64(EmitContext& ctx, IR::Inst* inst) { + EmitVectorUnsignedSaturated<Op::Sub, 64>(code, ctx, inst); +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/exception_handler_windows.cpp b/externals/dynarmic/src/dynarmic/backend/x64/exception_handler_windows.cpp new file mode 100644 index 0000000000..a7f964337a --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/exception_handler_windows.cpp @@ -0,0 +1,264 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +#include <cstring> +#include <vector> + +#include <mcl/assert.hpp> +#include <mcl/bit_cast.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/backend/exception_handler.h" +#include "dynarmic/backend/x64/block_of_code.h" +#include "dynarmic/common/safe_ops.h" + +using UBYTE = u8; + +enum UNWIND_REGISTER_CODES { + UWRC_RAX, + UWRC_RCX, + UWRC_RDX, + UWRC_RBX, + UWRC_RSP, + UWRC_RBP, + UWRC_RSI, + UWRC_RDI, + UWRC_R8, + UWRC_R9, + UWRC_R10, + UWRC_R11, + UWRC_R12, + UWRC_R13, + UWRC_R14, + UWRC_R15, +}; + +enum UNWIND_OPCODE { + UWOP_PUSH_NONVOL = 0, + UWOP_ALLOC_LARGE = 1, + UWOP_ALLOC_SMALL = 2, + UWOP_SET_FPREG = 3, + UWOP_SAVE_NONVOL = 4, + UWOP_SAVE_NONVOL_FAR = 5, + UWOP_SAVE_XMM128 = 8, + UWOP_SAVE_XMM128_FAR = 9, + UWOP_PUSH_MACHFRAME = 10, +}; + +union UNWIND_CODE { + struct { + UBYTE CodeOffset; + UBYTE UnwindOp : 4; + UBYTE OpInfo : 4; + } code; + USHORT FrameOffset; +}; + +// UNWIND_INFO is a tail-padded structure +struct UNWIND_INFO { + UBYTE Version : 3; + UBYTE Flags : 5; + UBYTE SizeOfProlog; + UBYTE CountOfCodes; + UBYTE FrameRegister : 4; + UBYTE FrameOffset : 4; + // UNWIND_CODE UnwindCode[]; + // With Flags == 0 there are no additional fields. + // OPTIONAL UNW_EXCEPTION_INFO ExceptionInfo; +}; + +struct UNW_EXCEPTION_INFO { + ULONG ExceptionHandler; + // OPTIONAL ARBITRARY HandlerData; +}; + +namespace Dynarmic::Backend { + +using namespace Dynarmic::Backend::X64; + +struct PrologueInformation { + std::vector<UNWIND_CODE> unwind_code; + size_t number_of_unwind_code_entries; + u8 prolog_size; +}; + +static PrologueInformation GetPrologueInformation() { + PrologueInformation ret; + + const auto next_entry = [&]() -> UNWIND_CODE& { + ret.unwind_code.emplace_back(); + return ret.unwind_code.back(); + }; + const auto push_nonvol = [&](u8 offset, UNWIND_REGISTER_CODES reg) { + auto& entry = next_entry(); + entry.code.CodeOffset = offset; + entry.code.UnwindOp = UWOP_PUSH_NONVOL; + entry.code.OpInfo = reg; + }; + const auto alloc_large = [&](u8 offset, size_t size) { + ASSERT(size % 8 == 0); + size /= 8; + + auto& entry = next_entry(); + entry.code.CodeOffset = offset; + entry.code.UnwindOp = UWOP_ALLOC_LARGE; + if (size <= 0xFFFF) { + entry.code.OpInfo = 0; + auto& size_entry = next_entry(); + size_entry.FrameOffset = static_cast<USHORT>(size); + } else { + entry.code.OpInfo = 1; + auto& size_entry_1 = next_entry(); + size_entry_1.FrameOffset = static_cast<USHORT>(size); + auto& size_entry_2 = next_entry(); + size_entry_2.FrameOffset = static_cast<USHORT>(size >> 16); + } + }; + const auto save_xmm128 = [&](u8 offset, u8 reg, size_t frame_offset) { + ASSERT(frame_offset % 16 == 0); + + auto& entry = next_entry(); + entry.code.CodeOffset = offset; + entry.code.UnwindOp = UWOP_SAVE_XMM128; + entry.code.OpInfo = reg; + auto& offset_entry = next_entry(); + offset_entry.FrameOffset = static_cast<USHORT>(frame_offset / 16); + }; + + // This is a list of operations that occur in the prologue. + // The debugger uses this information to retrieve register values and + // to calculate the size of the stack frame. 
+ ret.prolog_size = 89; + save_xmm128(89, 15, 0xB0); // +050 44 0F 29 BC 24 B0 00 00 00 movaps xmmword ptr [rsp+0B0h],xmm15 + save_xmm128(80, 14, 0xA0); // +047 44 0F 29 B4 24 A0 00 00 00 movaps xmmword ptr [rsp+0A0h],xmm14 + save_xmm128(71, 13, 0x90); // +03E 44 0F 29 AC 24 90 00 00 00 movaps xmmword ptr [rsp+90h],xmm13 + save_xmm128(62, 12, 0x80); // +035 44 0F 29 A4 24 80 00 00 00 movaps xmmword ptr [rsp+80h],xmm12 + save_xmm128(53, 11, 0x70); // +02F 44 0F 29 5C 24 70 movaps xmmword ptr [rsp+70h],xmm11 + save_xmm128(47, 10, 0x60); // +029 44 0F 29 54 24 60 movaps xmmword ptr [rsp+60h],xmm10 + save_xmm128(41, 9, 0x50); // +023 44 0F 29 4C 24 50 movaps xmmword ptr [rsp+50h],xmm9 + save_xmm128(35, 8, 0x40); // +01D 44 0F 29 44 24 40 movaps xmmword ptr [rsp+40h],xmm8 + save_xmm128(29, 7, 0x30); // +018 0F 29 7C 24 30 movaps xmmword ptr [rsp+30h],xmm7 + save_xmm128(24, 6, 0x20); // +013 0F 29 74 24 20 movaps xmmword ptr [rsp+20h],xmm6 + alloc_large(19, 0xC8); // +00C 48 81 EC C8 00 00 00 sub rsp,0C8h + push_nonvol(12, UWRC_R15); // +00A 41 57 push r15 + push_nonvol(10, UWRC_R14); // +008 41 56 push r14 + push_nonvol(8, UWRC_R13); // +006 41 55 push r13 + push_nonvol(6, UWRC_R12); // +004 41 54 push r12 + push_nonvol(4, UWRC_RBP); // +003 55 push rbp + push_nonvol(3, UWRC_RDI); // +002 57 push rdi + push_nonvol(2, UWRC_RSI); // +001 56 push rsi + push_nonvol(1, UWRC_RBX); // +000 53 push rbx + + ret.number_of_unwind_code_entries = ret.unwind_code.size(); + + // The Windows API requires the size of the unwind_code array + // to be a multiple of two for alignment reasons. + if (ret.unwind_code.size() % 2 == 1) { + auto& last_entry = next_entry(); + last_entry.FrameOffset = 0; + } + ASSERT(ret.unwind_code.size() % 2 == 0); + + return ret; +} + +struct ExceptionHandler::Impl final { + Impl(BlockOfCode& code) { + const auto prolog_info = GetPrologueInformation(); + + code.align(16); + const u8* exception_handler_without_cb = code.getCurr<u8*>(); + code.mov(code.eax, static_cast<u32>(ExceptionContinueSearch)); + code.ret(); + + code.align(16); + const u8* exception_handler_with_cb = code.getCurr<u8*>(); + // Our 3rd argument is a PCONTEXT. + + // If not within our codeblock, ignore this exception. 
+ code.mov(code.rax, Safe::Negate(mcl::bit_cast<u64>(code.getCode()))); + code.add(code.rax, code.qword[code.ABI_PARAM3 + Xbyak::RegExp(offsetof(CONTEXT, Rip))]); + code.cmp(code.rax, static_cast<u32>(code.GetTotalCodeSize())); + code.ja(exception_handler_without_cb); + + code.sub(code.rsp, 8); + code.mov(code.ABI_PARAM1, mcl::bit_cast<u64>(&cb)); + code.mov(code.ABI_PARAM2, code.ABI_PARAM3); + code.CallLambda( + [](const std::function<FakeCall(u64)>& cb_, PCONTEXT ctx) { + FakeCall fc = cb_(ctx->Rip); + + ctx->Rsp -= sizeof(u64); + *mcl::bit_cast<u64*>(ctx->Rsp) = fc.ret_rip; + ctx->Rip = fc.call_rip; + }); + code.add(code.rsp, 8); + code.mov(code.eax, static_cast<u32>(ExceptionContinueExecution)); + code.ret(); + + exception_handler_without_cb_offset = static_cast<ULONG>(exception_handler_without_cb - code.getCode<u8*>()); + exception_handler_with_cb_offset = static_cast<ULONG>(exception_handler_with_cb - code.getCode<u8*>()); + + code.align(16); + UNWIND_INFO* unwind_info = static_cast<UNWIND_INFO*>(code.AllocateFromCodeSpace(sizeof(UNWIND_INFO))); + unwind_info->Version = 1; + unwind_info->Flags = UNW_FLAG_EHANDLER; + unwind_info->SizeOfProlog = prolog_info.prolog_size; + unwind_info->CountOfCodes = static_cast<UBYTE>(prolog_info.number_of_unwind_code_entries); + unwind_info->FrameRegister = 0; // No frame register present + unwind_info->FrameOffset = 0; // Unused because FrameRegister == 0 + // UNWIND_INFO::UnwindCode field: + const size_t size_of_unwind_code = sizeof(UNWIND_CODE) * prolog_info.unwind_code.size(); + UNWIND_CODE* unwind_code = static_cast<UNWIND_CODE*>(code.AllocateFromCodeSpace(size_of_unwind_code)); + memcpy(unwind_code, prolog_info.unwind_code.data(), size_of_unwind_code); + // UNWIND_INFO::ExceptionInfo field: + except_info = static_cast<UNW_EXCEPTION_INFO*>(code.AllocateFromCodeSpace(sizeof(UNW_EXCEPTION_INFO))); + except_info->ExceptionHandler = exception_handler_without_cb_offset; + + code.align(16); + rfuncs = static_cast<RUNTIME_FUNCTION*>(code.AllocateFromCodeSpace(sizeof(RUNTIME_FUNCTION))); + rfuncs->BeginAddress = static_cast<DWORD>(0); + rfuncs->EndAddress = static_cast<DWORD>(code.GetTotalCodeSize()); + rfuncs->UnwindData = static_cast<DWORD>(reinterpret_cast<u8*>(unwind_info) - code.getCode()); + + RtlAddFunctionTable(rfuncs, 1, reinterpret_cast<DWORD64>(code.getCode())); + } + + void SetCallback(std::function<FakeCall(u64)> new_cb) { + cb = new_cb; + except_info->ExceptionHandler = cb ? 
exception_handler_with_cb_offset : exception_handler_without_cb_offset; + } + + ~Impl() { + RtlDeleteFunctionTable(rfuncs); + } + +private: + RUNTIME_FUNCTION* rfuncs; + std::function<FakeCall(u64)> cb; + UNW_EXCEPTION_INFO* except_info; + ULONG exception_handler_without_cb_offset; + ULONG exception_handler_with_cb_offset; +}; + +ExceptionHandler::ExceptionHandler() = default; +ExceptionHandler::~ExceptionHandler() = default; + +void ExceptionHandler::Register(BlockOfCode& code) { + impl = std::make_unique<Impl>(code); +} + +bool ExceptionHandler::SupportsFastmem() const noexcept { + return static_cast<bool>(impl); +} + +void ExceptionHandler::SetFastmemCallback(std::function<FakeCall(u64)> cb) { + impl->SetCallback(cb); +} + +} // namespace Dynarmic::Backend diff --git a/externals/dynarmic/src/dynarmic/backend/x64/exclusive_monitor.cpp b/externals/dynarmic/src/dynarmic/backend/x64/exclusive_monitor.cpp new file mode 100644 index 0000000000..984b67bb02 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/exclusive_monitor.cpp @@ -0,0 +1,58 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/interface/exclusive_monitor.h" + +#include <algorithm> + +#include <mcl/assert.hpp> + +namespace Dynarmic { + +ExclusiveMonitor::ExclusiveMonitor(size_t processor_count) + : exclusive_addresses(processor_count, INVALID_EXCLUSIVE_ADDRESS), exclusive_values(processor_count) {} + +size_t ExclusiveMonitor::GetProcessorCount() const { + return exclusive_addresses.size(); +} + +void ExclusiveMonitor::Lock() { + lock.Lock(); +} + +void ExclusiveMonitor::Unlock() { + lock.Unlock(); +} + +bool ExclusiveMonitor::CheckAndClear(size_t processor_id, VAddr address) { + const VAddr masked_address = address & RESERVATION_GRANULE_MASK; + + Lock(); + if (exclusive_addresses[processor_id] != masked_address) { + Unlock(); + return false; + } + + for (VAddr& other_address : exclusive_addresses) { + if (other_address == masked_address) { + other_address = INVALID_EXCLUSIVE_ADDRESS; + } + } + return true; +} + +void ExclusiveMonitor::Clear() { + Lock(); + std::fill(exclusive_addresses.begin(), exclusive_addresses.end(), INVALID_EXCLUSIVE_ADDRESS); + Unlock(); +} + +void ExclusiveMonitor::ClearProcessor(size_t processor_id) { + Lock(); + exclusive_addresses[processor_id] = INVALID_EXCLUSIVE_ADDRESS; + Unlock(); +} + +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/backend/x64/exclusive_monitor_friend.h b/externals/dynarmic/src/dynarmic/backend/x64/exclusive_monitor_friend.h new file mode 100644 index 0000000000..7f7fa24259 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/exclusive_monitor_friend.h @@ -0,0 +1,28 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include "dynarmic/interface/exclusive_monitor.h" + +namespace Dynarmic { + +inline volatile int* GetExclusiveMonitorLockPointer(ExclusiveMonitor* monitor) { + return &monitor->lock.storage; +} + +inline size_t GetExclusiveMonitorProcessorCount(ExclusiveMonitor* monitor) { + return monitor->exclusive_addresses.size(); +} + +inline VAddr* GetExclusiveMonitorAddressPointer(ExclusiveMonitor* monitor, size_t index) { + return monitor->exclusive_addresses.data() + index; +} + +inline Vector* GetExclusiveMonitorValuePointer(ExclusiveMonitor* monitor, size_t index) { + return monitor->exclusive_values.data() + index; +} + +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/backend/x64/host_feature.h b/externals/dynarmic/src/dynarmic/backend/x64/host_feature.h new file mode 100644 index 0000000000..8e3b14c7bb --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/host_feature.h @@ -0,0 +1,66 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2021 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <mcl/stdint.hpp> + +namespace Dynarmic::Backend::X64 { + +enum class HostFeature : u64 { + SSSE3 = 1ULL << 0, + SSE41 = 1ULL << 1, + SSE42 = 1ULL << 2, + AVX = 1ULL << 3, + AVX2 = 1ULL << 4, + AVX512F = 1ULL << 5, + AVX512CD = 1ULL << 6, + AVX512VL = 1ULL << 7, + AVX512BW = 1ULL << 8, + AVX512DQ = 1ULL << 9, + AVX512BITALG = 1ULL << 10, + AVX512VBMI = 1ULL << 11, + PCLMULQDQ = 1ULL << 12, + F16C = 1ULL << 13, + FMA = 1ULL << 14, + AES = 1ULL << 15, + SHA = 1ULL << 16, + POPCNT = 1ULL << 17, + BMI1 = 1ULL << 18, + BMI2 = 1ULL << 19, + LZCNT = 1ULL << 20, + GFNI = 1ULL << 21, + + // Zen-based BMI2 + FastBMI2 = 1ULL << 22, + + // Orthographic AVX512 features on 128 and 256 vectors + AVX512_Ortho = AVX512F | AVX512VL, + + // Orthographic AVX512 features for both 32-bit and 64-bit floats + AVX512_OrthoFloat = AVX512_Ortho | AVX512DQ, +}; + +constexpr HostFeature operator~(HostFeature f) { + return static_cast<HostFeature>(~static_cast<u64>(f)); +} + +constexpr HostFeature operator|(HostFeature f1, HostFeature f2) { + return static_cast<HostFeature>(static_cast<u64>(f1) | static_cast<u64>(f2)); +} + +constexpr HostFeature operator&(HostFeature f1, HostFeature f2) { + return static_cast<HostFeature>(static_cast<u64>(f1) & static_cast<u64>(f2)); +} + +constexpr HostFeature operator|=(HostFeature& result, HostFeature f) { + return result = (result | f); +} + +constexpr HostFeature operator&=(HostFeature& result, HostFeature f) { + return result = (result & f); +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/hostloc.cpp b/externals/dynarmic/src/dynarmic/backend/x64/hostloc.cpp new file mode 100644 index 0000000000..b5e2a5f18b --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/hostloc.cpp @@ -0,0 +1,25 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/x64/hostloc.h" + +#include <xbyak/xbyak.h> + +#include "dynarmic/backend/x64/abi.h" +#include "dynarmic/backend/x64/stack_layout.h" + +namespace Dynarmic::Backend::X64 { + +Xbyak::Reg64 HostLocToReg64(HostLoc loc) { + ASSERT(HostLocIsGPR(loc)); + return Xbyak::Reg64(static_cast<int>(loc)); +} + +Xbyak::Xmm HostLocToXmm(HostLoc loc) { + ASSERT(HostLocIsXMM(loc)); + return Xbyak::Xmm(static_cast<int>(loc) - static_cast<int>(HostLoc::XMM0)); +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/hostloc.h b/externals/dynarmic/src/dynarmic/backend/x64/hostloc.h new file mode 100644 index 0000000000..dbf526b77a --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/hostloc.h @@ -0,0 +1,147 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ +#pragma once + +#include <mcl/assert.hpp> +#include <mcl/stdint.hpp> +#include <xbyak/xbyak.h> + +namespace Dynarmic::Backend::X64 { + +enum class HostLoc { + // Ordering of the registers is intentional. See also: HostLocToX64. + RAX, + RCX, + RDX, + RBX, + RSP, + RBP, + RSI, + RDI, + R8, + R9, + R10, + R11, + R12, + R13, + R14, + R15, + XMM0, + XMM1, + XMM2, + XMM3, + XMM4, + XMM5, + XMM6, + XMM7, + XMM8, + XMM9, + XMM10, + XMM11, + XMM12, + XMM13, + XMM14, + XMM15, + CF, + PF, + AF, + ZF, + SF, + OF, + FirstSpill, +}; + +constexpr size_t NonSpillHostLocCount = static_cast<size_t>(HostLoc::FirstSpill); + +inline bool HostLocIsGPR(HostLoc reg) { + return reg >= HostLoc::RAX && reg <= HostLoc::R15; +} + +inline bool HostLocIsXMM(HostLoc reg) { + return reg >= HostLoc::XMM0 && reg <= HostLoc::XMM15; +} + +inline bool HostLocIsRegister(HostLoc reg) { + return HostLocIsGPR(reg) || HostLocIsXMM(reg); +} + +inline bool HostLocIsFlag(HostLoc reg) { + return reg >= HostLoc::CF && reg <= HostLoc::OF; +} + +inline HostLoc HostLocRegIdx(int idx) { + ASSERT(idx >= 0 && idx <= 15); + return static_cast<HostLoc>(idx); +} + +inline HostLoc HostLocXmmIdx(int idx) { + ASSERT(idx >= 0 && idx <= 15); + return static_cast<HostLoc>(static_cast<size_t>(HostLoc::XMM0) + idx); +} + +inline HostLoc HostLocSpill(size_t i) { + return static_cast<HostLoc>(static_cast<size_t>(HostLoc::FirstSpill) + i); +} + +inline bool HostLocIsSpill(HostLoc reg) { + return reg >= HostLoc::FirstSpill; +} + +inline size_t HostLocBitWidth(HostLoc loc) { + if (HostLocIsGPR(loc)) + return 64; + if (HostLocIsXMM(loc)) + return 128; + if (HostLocIsSpill(loc)) + return 128; + if (HostLocIsFlag(loc)) + return 1; + UNREACHABLE(); +} + +using HostLocList = std::initializer_list<HostLoc>; + +// RSP is preserved for function calls +// R15 contains the JitState pointer +const HostLocList any_gpr = { + HostLoc::RAX, + HostLoc::RBX, + HostLoc::RCX, + HostLoc::RDX, + HostLoc::RSI, + HostLoc::RDI, + HostLoc::RBP, + HostLoc::R8, + HostLoc::R9, + HostLoc::R10, + HostLoc::R11, + HostLoc::R12, + HostLoc::R13, + HostLoc::R14, +}; + +// XMM0 is reserved for use by instructions that implicitly use it as an argument +const HostLocList any_xmm = { + HostLoc::XMM1, + HostLoc::XMM2, + HostLoc::XMM3, + HostLoc::XMM4, + HostLoc::XMM5, + HostLoc::XMM6, + HostLoc::XMM7, + HostLoc::XMM8, + HostLoc::XMM9, + HostLoc::XMM10, + HostLoc::XMM11, + HostLoc::XMM12, + HostLoc::XMM13, + HostLoc::XMM14, + HostLoc::XMM15, +}; + +Xbyak::Reg64 HostLocToReg64(HostLoc loc); +Xbyak::Xmm HostLocToXmm(HostLoc loc); + +} // 
namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/jitstate_info.h b/externals/dynarmic/src/dynarmic/backend/x64/jitstate_info.h new file mode 100644 index 0000000000..654b3f0400 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/jitstate_info.h @@ -0,0 +1,38 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <cstddef> + +namespace Dynarmic::Backend::X64 { + +struct JitStateInfo { + template<typename JitStateType> + JitStateInfo(const JitStateType&) + : offsetof_guest_MXCSR(offsetof(JitStateType, guest_MXCSR)) + , offsetof_asimd_MXCSR(offsetof(JitStateType, asimd_MXCSR)) + , offsetof_rsb_ptr(offsetof(JitStateType, rsb_ptr)) + , rsb_ptr_mask(JitStateType::RSBPtrMask) + , offsetof_rsb_location_descriptors(offsetof(JitStateType, rsb_location_descriptors)) + , offsetof_rsb_codeptrs(offsetof(JitStateType, rsb_codeptrs)) + , offsetof_cpsr_nzcv(offsetof(JitStateType, cpsr_nzcv)) + , offsetof_fpsr_exc(offsetof(JitStateType, fpsr_exc)) + , offsetof_fpsr_qc(offsetof(JitStateType, fpsr_qc)) + , offsetof_halt_reason(offsetof(JitStateType, halt_reason)) {} + + const size_t offsetof_guest_MXCSR; + const size_t offsetof_asimd_MXCSR; + const size_t offsetof_rsb_ptr; + const size_t rsb_ptr_mask; + const size_t offsetof_rsb_location_descriptors; + const size_t offsetof_rsb_codeptrs; + const size_t offsetof_cpsr_nzcv; + const size_t offsetof_fpsr_exc; + const size_t offsetof_fpsr_qc; + const size_t offsetof_halt_reason; +}; + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/nzcv_util.h b/externals/dynarmic/src/dynarmic/backend/x64/nzcv_util.h new file mode 100644 index 0000000000..3a70cf4f0b --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/nzcv_util.h @@ -0,0 +1,52 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <mcl/stdint.hpp> + +namespace Dynarmic::Backend::X64::NZCV { + +constexpr u32 arm_mask = 0xF000'0000; +constexpr u32 x64_mask = 0xC101; + +constexpr size_t x64_n_flag_bit = 15; +constexpr size_t x64_z_flag_bit = 14; +constexpr size_t x64_c_flag_bit = 8; +constexpr size_t x64_v_flag_bit = 0; + +/// This is a constant used to create the x64 flags format from the ARM format. +/// NZCV * multiplier: NZCV0NZCV000NZCV +/// x64_flags format: NZ-----C-------V +constexpr u32 to_x64_multiplier = 0x1081; + +/// This is a constant used to create the ARM format from the x64 flags format. +constexpr u32 from_x64_multiplier = 0x1021'0000; + +inline u32 ToX64(u32 nzcv) { + /* Naive implementation: + u32 x64_flags = 0; + x64_flags |= mcl::bit::get_bit<31>(cpsr) ? 1 << 15 : 0; + x64_flags |= mcl::bit::get_bit<30>(cpsr) ? 1 << 14 : 0; + x64_flags |= mcl::bit::get_bit<29>(cpsr) ? 1 << 8 : 0; + x64_flags |= mcl::bit::get_bit<28>(cpsr) ? 1 : 0; + return x64_flags; + */ + return ((nzcv >> 28) * to_x64_multiplier) & x64_mask; +} + +inline u32 FromX64(u32 x64_flags) { + /* Naive implementation: + u32 nzcv = 0; + nzcv |= mcl::bit::get_bit<15>(x64_flags) ? 1 << 31 : 0; + nzcv |= mcl::bit::get_bit<14>(x64_flags) ? 1 << 30 : 0; + nzcv |= mcl::bit::get_bit<8>(x64_flags) ? 1 << 29 : 0; + nzcv |= mcl::bit::get_bit<0>(x64_flags) ? 
1 << 28 : 0; + return nzcv; + */ + return ((x64_flags & x64_mask) * from_x64_multiplier) & arm_mask; +} + +} // namespace Dynarmic::Backend::X64::NZCV diff --git a/externals/dynarmic/src/dynarmic/backend/x64/oparg.h b/externals/dynarmic/src/dynarmic/backend/x64/oparg.h new file mode 100644 index 0000000000..70c60dfb15 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/oparg.h @@ -0,0 +1,79 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <mcl/assert.hpp> +#include <xbyak/xbyak.h> + +namespace Dynarmic::Backend::X64 { + +struct OpArg { + OpArg() + : type(Type::Operand), inner_operand() {} + /* implicit */ OpArg(const Xbyak::Address& address) + : type(Type::Address), inner_address(address) {} + /* implicit */ OpArg(const Xbyak::Reg& reg) + : type(Type::Reg), inner_reg(reg) {} + + Xbyak::Operand& operator*() { + switch (type) { + case Type::Address: + return inner_address; + case Type::Operand: + return inner_operand; + case Type::Reg: + return inner_reg; + } + UNREACHABLE(); + } + + void setBit(int bits) { + switch (type) { + case Type::Address: + inner_address.setBit(bits); + return; + case Type::Operand: + inner_operand.setBit(bits); + return; + case Type::Reg: + switch (bits) { + case 8: + inner_reg = inner_reg.cvt8(); + return; + case 16: + inner_reg = inner_reg.cvt16(); + return; + case 32: + inner_reg = inner_reg.cvt32(); + return; + case 64: + inner_reg = inner_reg.cvt64(); + return; + default: + ASSERT_MSG(false, "Invalid bits"); + return; + } + } + UNREACHABLE(); + } + +private: + enum class Type { + Operand, + Address, + Reg, + }; + + Type type; + + union { + Xbyak::Operand inner_operand; + Xbyak::Address inner_address; + Xbyak::Reg inner_reg; + }; +}; + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/perf_map.cpp b/externals/dynarmic/src/dynarmic/backend/x64/perf_map.cpp new file mode 100644 index 0000000000..88691dbffc --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/perf_map.cpp @@ -0,0 +1,94 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/x64/perf_map.h" + +#include <cstddef> +#include <string> + +#ifdef __linux__ + +# include <cstdio> +# include <cstdlib> +# include <mutex> + +# include <fmt/format.h> +# include <mcl/stdint.hpp> +# include <sys/types.h> +# include <unistd.h> + +namespace Dynarmic::Backend::X64 { + +namespace { +std::mutex mutex; +std::FILE* file = nullptr; + +void OpenFile() { + const char* perf_dir = std::getenv("PERF_BUILDID_DIR"); + if (!perf_dir) { + file = nullptr; + return; + } + + const pid_t pid = getpid(); + const std::string filename = fmt::format("{:s}/perf-{:d}.map", perf_dir, pid); + + file = std::fopen(filename.c_str(), "w"); + if (!file) { + return; + } + + std::setvbuf(file, nullptr, _IONBF, 0); +} +} // anonymous namespace + +namespace detail { +void PerfMapRegister(const void* start, const void* end, std::string_view friendly_name) { + if (start == end) { + // Nothing to register + return; + } + + std::lock_guard guard{mutex}; + + if (!file) { + OpenFile(); + if (!file) { + return; + } + } + + const std::string line = fmt::format("{:016x} {:016x} {:s}\n", reinterpret_cast<u64>(start), reinterpret_cast<u64>(end) - reinterpret_cast<u64>(start), friendly_name); + std::fwrite(line.data(), sizeof *line.data(), line.size(), file); +} +} // namespace detail + +void PerfMapClear() { + std::lock_guard guard{mutex}; + + if (!file) { + return; + } + + std::fclose(file); + file = nullptr; + OpenFile(); +} + +} // namespace Dynarmic::Backend::X64 + +#else + +namespace Dynarmic::Backend::X64 { + +namespace detail { +void PerfMapRegister(const void*, const void*, std::string_view) {} +} // namespace detail + +void PerfMapClear() {} + +} // namespace Dynarmic::Backend::X64 + +#endif diff --git a/externals/dynarmic/src/dynarmic/backend/x64/perf_map.h b/externals/dynarmic/src/dynarmic/backend/x64/perf_map.h new file mode 100644 index 0000000000..7d739ae63e --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/perf_map.h @@ -0,0 +1,25 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <string_view> + +#include <mcl/bit_cast.hpp> + +namespace Dynarmic::Backend::X64 { + +namespace detail { +void PerfMapRegister(const void* start, const void* end, std::string_view friendly_name); +} // namespace detail + +template<typename T> +void PerfMapRegister(T start, const void* end, std::string_view friendly_name) { + detail::PerfMapRegister(mcl::bit_cast<const void*>(start), end, friendly_name); +} + +void PerfMapClear(); + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp b/externals/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp new file mode 100644 index 0000000000..61a3a6cd91 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp @@ -0,0 +1,777 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/x64/reg_alloc.h" + +#include <algorithm> +#include <numeric> +#include <utility> + +#include <fmt/ostream.h> +#include <mcl/assert.hpp> +#include <mcl/bit_cast.hpp> +#include <xbyak/xbyak.h> + +#include "dynarmic/backend/x64/abi.h" +#include "dynarmic/backend/x64/stack_layout.h" +#include "dynarmic/backend/x64/verbose_debugging_output.h" + +namespace Dynarmic::Backend::X64 { + +#define MAYBE_AVX(OPCODE, ...) 
\ + [&] { \ + if (code.HasHostFeature(HostFeature::AVX)) { \ + code.v##OPCODE(__VA_ARGS__); \ + } else { \ + code.OPCODE(__VA_ARGS__); \ + } \ + }() + +static bool CanExchange(HostLoc a, HostLoc b) { + return HostLocIsGPR(a) && HostLocIsGPR(b); +} + +// Minimum number of bits required to represent a type +static size_t GetBitWidth(IR::Type type) { + switch (type) { + case IR::Type::A32Reg: + case IR::Type::A32ExtReg: + case IR::Type::A64Reg: + case IR::Type::A64Vec: + case IR::Type::CoprocInfo: + case IR::Type::Cond: + case IR::Type::Void: + case IR::Type::Table: + case IR::Type::AccType: + ASSERT_FALSE("Type {} cannot be represented at runtime", type); + case IR::Type::Opaque: + ASSERT_FALSE("Not a concrete type"); + case IR::Type::U1: + return 8; + case IR::Type::U8: + return 8; + case IR::Type::U16: + return 16; + case IR::Type::U32: + return 32; + case IR::Type::U64: + return 64; + case IR::Type::U128: + return 128; + case IR::Type::NZCVFlags: + return 32; // TODO: Update to 16 when flags optimization is done + } + UNREACHABLE(); +} + +static bool IsValuelessType(IR::Type type) { + switch (type) { + case IR::Type::Table: + return true; + default: + return false; + } +} + +bool HostLocInfo::IsLocked() const { + return is_being_used_count > 0; +} + +bool HostLocInfo::IsEmpty() const { + return is_being_used_count == 0 && values.empty(); +} + +bool HostLocInfo::IsLastUse() const { + return is_being_used_count == 0 && current_references == 1 && accumulated_uses + 1 == total_uses; +} + +void HostLocInfo::SetLastUse() { + ASSERT(IsLastUse()); + is_set_last_use = true; +} + +void HostLocInfo::ReadLock() { + ASSERT(!is_scratch); + is_being_used_count++; +} + +void HostLocInfo::WriteLock() { + ASSERT(is_being_used_count == 0); + is_being_used_count++; + is_scratch = true; +} + +void HostLocInfo::AddArgReference() { + current_references++; + ASSERT(accumulated_uses + current_references <= total_uses); +} + +void HostLocInfo::ReleaseOne() { + is_being_used_count--; + is_scratch = false; + + if (current_references == 0) + return; + + accumulated_uses++; + current_references--; + + if (current_references == 0) + ReleaseAll(); +} + +void HostLocInfo::ReleaseAll() { + accumulated_uses += current_references; + current_references = 0; + + is_set_last_use = false; + + if (total_uses == accumulated_uses) { + values.clear(); + accumulated_uses = 0; + total_uses = 0; + max_bit_width = 0; + } + + is_being_used_count = 0; + is_scratch = false; +} + +bool HostLocInfo::ContainsValue(const IR::Inst* inst) const { + return std::find(values.begin(), values.end(), inst) != values.end(); +} + +size_t HostLocInfo::GetMaxBitWidth() const { + return max_bit_width; +} + +void HostLocInfo::AddValue(IR::Inst* inst) { + if (is_set_last_use) { + is_set_last_use = false; + values.clear(); + } + values.push_back(inst); + total_uses += inst->UseCount(); + max_bit_width = std::max(max_bit_width, GetBitWidth(inst->GetType())); +} + +void HostLocInfo::EmitVerboseDebuggingOutput(BlockOfCode& code, size_t host_loc_index) const { + using namespace Xbyak::util; + for (IR::Inst* value : values) { + code.mov(code.ABI_PARAM1, rsp); + code.mov(code.ABI_PARAM2, host_loc_index); + code.mov(code.ABI_PARAM3, value->GetName()); + code.mov(code.ABI_PARAM4, GetBitWidth(value->GetType())); + code.CallFunction(PrintVerboseDebuggingOutputLine); + } +} + +IR::Type Argument::GetType() const { + return value.GetType(); +} + +bool Argument::IsImmediate() const { + return value.IsImmediate(); +} + +bool Argument::IsVoid() const { + return GetType() == 
IR::Type::Void; +} + +bool Argument::FitsInImmediateU32() const { + if (!IsImmediate()) + return false; + const u64 imm = value.GetImmediateAsU64(); + return imm < 0x100000000; +} + +bool Argument::FitsInImmediateS32() const { + if (!IsImmediate()) + return false; + const s64 imm = static_cast<s64>(value.GetImmediateAsU64()); + return -s64(0x80000000) <= imm && imm <= s64(0x7FFFFFFF); +} + +bool Argument::GetImmediateU1() const { + return value.GetU1(); +} + +u8 Argument::GetImmediateU8() const { + const u64 imm = value.GetImmediateAsU64(); + ASSERT(imm < 0x100); + return u8(imm); +} + +u16 Argument::GetImmediateU16() const { + const u64 imm = value.GetImmediateAsU64(); + ASSERT(imm < 0x10000); + return u16(imm); +} + +u32 Argument::GetImmediateU32() const { + const u64 imm = value.GetImmediateAsU64(); + ASSERT(imm < 0x100000000); + return u32(imm); +} + +u64 Argument::GetImmediateS32() const { + ASSERT(FitsInImmediateS32()); + return value.GetImmediateAsU64(); +} + +u64 Argument::GetImmediateU64() const { + return value.GetImmediateAsU64(); +} + +IR::Cond Argument::GetImmediateCond() const { + ASSERT(IsImmediate() && GetType() == IR::Type::Cond); + return value.GetCond(); +} + +IR::AccType Argument::GetImmediateAccType() const { + ASSERT(IsImmediate() && GetType() == IR::Type::AccType); + return value.GetAccType(); +} + +bool Argument::IsInGpr() const { + if (IsImmediate()) + return false; + return HostLocIsGPR(*reg_alloc.ValueLocation(value.GetInst())); +} + +bool Argument::IsInXmm() const { + if (IsImmediate()) + return false; + return HostLocIsXMM(*reg_alloc.ValueLocation(value.GetInst())); +} + +bool Argument::IsInMemory() const { + if (IsImmediate()) + return false; + return HostLocIsSpill(*reg_alloc.ValueLocation(value.GetInst())); +} + +RegAlloc::RegAlloc(BlockOfCode& code, std::vector<HostLoc> gpr_order, std::vector<HostLoc> xmm_order) + : gpr_order(gpr_order) + , xmm_order(xmm_order) + , hostloc_info(NonSpillHostLocCount + SpillCount) + , code(code) {} + +RegAlloc::ArgumentInfo RegAlloc::GetArgumentInfo(IR::Inst* inst) { + ArgumentInfo ret = {Argument{*this}, Argument{*this}, Argument{*this}, Argument{*this}}; + for (size_t i = 0; i < inst->NumArgs(); i++) { + const IR::Value arg = inst->GetArg(i); + ret[i].value = arg; + if (!arg.IsImmediate() && !IsValuelessType(arg.GetType())) { + ASSERT_MSG(ValueLocation(arg.GetInst()), "argument must already been defined"); + LocInfo(*ValueLocation(arg.GetInst())).AddArgReference(); + } + } + return ret; +} + +void RegAlloc::RegisterPseudoOperation(IR::Inst* inst) { + ASSERT(IsValueLive(inst) || !inst->HasUses()); + + for (size_t i = 0; i < inst->NumArgs(); i++) { + const IR::Value arg = inst->GetArg(i); + if (!arg.IsImmediate() && !IsValuelessType(arg.GetType())) { + if (const auto loc = ValueLocation(arg.GetInst())) { + // May not necessarily have a value (e.g. CMP variant of Sub32). 
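+                // (A flags-only form defines no register value, so ValueLocation() can be empty here and the reference is simply skipped.)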
+ LocInfo(*loc).AddArgReference(); + } + } + } +} + +bool RegAlloc::IsValueLive(IR::Inst* inst) const { + return !!ValueLocation(inst); +} + +Xbyak::Reg64 RegAlloc::UseGpr(Argument& arg) { + ASSERT(!arg.allocated); + arg.allocated = true; + return HostLocToReg64(UseImpl(arg.value, gpr_order)); +} + +Xbyak::Xmm RegAlloc::UseXmm(Argument& arg) { + ASSERT(!arg.allocated); + arg.allocated = true; + return HostLocToXmm(UseImpl(arg.value, xmm_order)); +} + +OpArg RegAlloc::UseOpArg(Argument& arg) { + return UseGpr(arg); +} + +void RegAlloc::Use(Argument& arg, HostLoc host_loc) { + ASSERT(!arg.allocated); + arg.allocated = true; + UseImpl(arg.value, {host_loc}); +} + +Xbyak::Reg64 RegAlloc::UseScratchGpr(Argument& arg) { + ASSERT(!arg.allocated); + arg.allocated = true; + return HostLocToReg64(UseScratchImpl(arg.value, gpr_order)); +} + +Xbyak::Xmm RegAlloc::UseScratchXmm(Argument& arg) { + ASSERT(!arg.allocated); + arg.allocated = true; + return HostLocToXmm(UseScratchImpl(arg.value, xmm_order)); +} + +void RegAlloc::UseScratch(Argument& arg, HostLoc host_loc) { + ASSERT(!arg.allocated); + arg.allocated = true; + UseScratchImpl(arg.value, {host_loc}); +} + +void RegAlloc::DefineValue(IR::Inst* inst, const Xbyak::Reg& reg) { + ASSERT(reg.getKind() == Xbyak::Operand::XMM || reg.getKind() == Xbyak::Operand::REG); + const auto hostloc = static_cast<HostLoc>(reg.getIdx() + static_cast<size_t>(reg.getKind() == Xbyak::Operand::XMM ? HostLoc::XMM0 : HostLoc::RAX)); + DefineValueImpl(inst, hostloc); +} + +void RegAlloc::DefineValue(IR::Inst* inst, Argument& arg) { + ASSERT(!arg.allocated); + arg.allocated = true; + DefineValueImpl(inst, arg.value); +} + +void RegAlloc::Release(const Xbyak::Reg& reg) { + ASSERT(reg.getKind() == Xbyak::Operand::XMM || reg.getKind() == Xbyak::Operand::REG); + const auto hostloc = static_cast<HostLoc>(reg.getIdx() + static_cast<size_t>(reg.getKind() == Xbyak::Operand::XMM ? 
HostLoc::XMM0 : HostLoc::RAX)); + LocInfo(hostloc).ReleaseOne(); +} + +Xbyak::Reg64 RegAlloc::ScratchGpr() { + return HostLocToReg64(ScratchImpl(gpr_order)); +} + +Xbyak::Reg64 RegAlloc::ScratchGpr(HostLoc desired_location) { + return HostLocToReg64(ScratchImpl({desired_location})); +} + +Xbyak::Xmm RegAlloc::ScratchXmm() { + return HostLocToXmm(ScratchImpl(xmm_order)); +} + +Xbyak::Xmm RegAlloc::ScratchXmm(HostLoc desired_location) { + return HostLocToXmm(ScratchImpl({desired_location})); +} + +HostLoc RegAlloc::UseImpl(IR::Value use_value, const std::vector<HostLoc>& desired_locations) { + if (use_value.IsImmediate()) { + return LoadImmediate(use_value, ScratchImpl(desired_locations)); + } + + const IR::Inst* use_inst = use_value.GetInst(); + const HostLoc current_location = *ValueLocation(use_inst); + const size_t max_bit_width = LocInfo(current_location).GetMaxBitWidth(); + + const bool can_use_current_location = std::find(desired_locations.begin(), desired_locations.end(), current_location) != desired_locations.end(); + if (can_use_current_location) { + LocInfo(current_location).ReadLock(); + return current_location; + } + + if (LocInfo(current_location).IsLocked()) { + return UseScratchImpl(use_value, desired_locations); + } + + const HostLoc destination_location = SelectARegister(desired_locations); + if (max_bit_width > HostLocBitWidth(destination_location)) { + return UseScratchImpl(use_value, desired_locations); + } else if (CanExchange(destination_location, current_location)) { + Exchange(destination_location, current_location); + } else { + MoveOutOfTheWay(destination_location); + Move(destination_location, current_location); + } + LocInfo(destination_location).ReadLock(); + return destination_location; +} + +HostLoc RegAlloc::UseScratchImpl(IR::Value use_value, const std::vector<HostLoc>& desired_locations) { + if (use_value.IsImmediate()) { + return LoadImmediate(use_value, ScratchImpl(desired_locations)); + } + + const IR::Inst* use_inst = use_value.GetInst(); + const HostLoc current_location = *ValueLocation(use_inst); + const size_t bit_width = GetBitWidth(use_inst->GetType()); + + const bool can_use_current_location = std::find(desired_locations.begin(), desired_locations.end(), current_location) != desired_locations.end(); + if (can_use_current_location && !LocInfo(current_location).IsLocked()) { + if (!LocInfo(current_location).IsLastUse()) { + MoveOutOfTheWay(current_location); + } else { + LocInfo(current_location).SetLastUse(); + } + LocInfo(current_location).WriteLock(); + return current_location; + } + + const HostLoc destination_location = SelectARegister(desired_locations); + MoveOutOfTheWay(destination_location); + CopyToScratch(bit_width, destination_location, current_location); + LocInfo(destination_location).WriteLock(); + return destination_location; +} + +HostLoc RegAlloc::ScratchImpl(const std::vector<HostLoc>& desired_locations) { + const HostLoc location = SelectARegister(desired_locations); + MoveOutOfTheWay(location); + LocInfo(location).WriteLock(); + return location; +} + +void RegAlloc::HostCall(IR::Inst* result_def, + std::optional<Argument::copyable_reference> arg0, + std::optional<Argument::copyable_reference> arg1, + std::optional<Argument::copyable_reference> arg2, + std::optional<Argument::copyable_reference> arg3) { + constexpr size_t args_count = 4; + constexpr std::array<HostLoc, args_count> args_hostloc = {ABI_PARAM1, ABI_PARAM2, ABI_PARAM3, ABI_PARAM4}; + const std::array<std::optional<Argument::copyable_reference>, args_count> args = 
{arg0, arg1, arg2, arg3}; + + static const std::vector<HostLoc> other_caller_save = [args_hostloc]() { + std::vector<HostLoc> ret(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end()); + + ret.erase(std::find(ret.begin(), ret.end(), ABI_RETURN)); + for (auto hostloc : args_hostloc) { + ret.erase(std::find(ret.begin(), ret.end(), hostloc)); + } + + return ret; + }(); + + ScratchGpr(ABI_RETURN); + if (result_def) { + DefineValueImpl(result_def, ABI_RETURN); + } + + for (size_t i = 0; i < args_count; i++) { + if (args[i] && !args[i]->get().IsVoid()) { + UseScratch(*args[i], args_hostloc[i]); + + // LLVM puts the burden of zero-extension of 8 and 16 bit values on the caller instead of the callee + const Xbyak::Reg64 reg = HostLocToReg64(args_hostloc[i]); + switch (args[i]->get().GetType()) { + case IR::Type::U8: + code.movzx(reg.cvt32(), reg.cvt8()); + break; + case IR::Type::U16: + code.movzx(reg.cvt32(), reg.cvt16()); + break; + case IR::Type::U32: + code.mov(reg.cvt32(), reg.cvt32()); + break; + default: + break; // Nothing needs to be done + } + } + } + + for (size_t i = 0; i < args_count; i++) { + if (!args[i]) { + // TODO: Force spill + ScratchGpr(args_hostloc[i]); + } + } + + for (HostLoc caller_saved : other_caller_save) { + ScratchImpl({caller_saved}); + } +} + +void RegAlloc::AllocStackSpace(size_t stack_space) { + ASSERT(stack_space < static_cast<size_t>(std::numeric_limits<s32>::max())); + ASSERT(reserved_stack_space == 0); + reserved_stack_space = stack_space; + code.sub(code.rsp, static_cast<u32>(stack_space)); +} + +void RegAlloc::ReleaseStackSpace(size_t stack_space) { + ASSERT(stack_space < static_cast<size_t>(std::numeric_limits<s32>::max())); + ASSERT(reserved_stack_space == stack_space); + reserved_stack_space = 0; + code.add(code.rsp, static_cast<u32>(stack_space)); +} + +void RegAlloc::EndOfAllocScope() { + for (auto& iter : hostloc_info) { + iter.ReleaseAll(); + } +} + +void RegAlloc::AssertNoMoreUses() { + ASSERT(std::all_of(hostloc_info.begin(), hostloc_info.end(), [](const auto& i) { return i.IsEmpty(); })); +} + +void RegAlloc::EmitVerboseDebuggingOutput() { + for (size_t i = 0; i < hostloc_info.size(); i++) { + hostloc_info[i].EmitVerboseDebuggingOutput(code, i); + } +} + +HostLoc RegAlloc::SelectARegister(const std::vector<HostLoc>& desired_locations) const { + std::vector<HostLoc> candidates = desired_locations; + + // Find all locations that have not been allocated.. + const auto allocated_locs = std::partition(candidates.begin(), candidates.end(), [this](auto loc) { + return !this->LocInfo(loc).IsLocked(); + }); + candidates.erase(allocated_locs, candidates.end()); + ASSERT_MSG(!candidates.empty(), "All candidate registers have already been allocated"); + + // Selects the best location out of the available locations. + // TODO: Actually do LRU or something. Currently we just try to pick something without a value if possible. 
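+    // (std::partition below just moves unoccupied locations to the front of the candidate list; the front entry is then returned, so no use-recency is tracked.)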
+ + std::partition(candidates.begin(), candidates.end(), [this](auto loc) { + return this->LocInfo(loc).IsEmpty(); + }); + + return candidates.front(); +} + +std::optional<HostLoc> RegAlloc::ValueLocation(const IR::Inst* value) const { + for (size_t i = 0; i < hostloc_info.size(); i++) { + if (hostloc_info[i].ContainsValue(value)) { + return static_cast<HostLoc>(i); + } + } + + return std::nullopt; +} + +void RegAlloc::DefineValueImpl(IR::Inst* def_inst, HostLoc host_loc) { + ASSERT_MSG(!ValueLocation(def_inst), "def_inst has already been defined"); + LocInfo(host_loc).AddValue(def_inst); +} + +void RegAlloc::DefineValueImpl(IR::Inst* def_inst, const IR::Value& use_inst) { + ASSERT_MSG(!ValueLocation(def_inst), "def_inst has already been defined"); + + if (use_inst.IsImmediate()) { + const HostLoc location = ScratchImpl(gpr_order); + DefineValueImpl(def_inst, location); + LoadImmediate(use_inst, location); + return; + } + + ASSERT_MSG(ValueLocation(use_inst.GetInst()), "use_inst must already be defined"); + const HostLoc location = *ValueLocation(use_inst.GetInst()); + DefineValueImpl(def_inst, location); +} + +HostLoc RegAlloc::LoadImmediate(IR::Value imm, HostLoc host_loc) { + ASSERT_MSG(imm.IsImmediate(), "imm is not an immediate"); + + if (HostLocIsGPR(host_loc)) { + const Xbyak::Reg64 reg = HostLocToReg64(host_loc); + const u64 imm_value = imm.GetImmediateAsU64(); + if (imm_value == 0) { + code.xor_(reg.cvt32(), reg.cvt32()); + } else { + code.mov(reg, imm_value); + } + return host_loc; + } + + if (HostLocIsXMM(host_loc)) { + const Xbyak::Xmm reg = HostLocToXmm(host_loc); + const u64 imm_value = imm.GetImmediateAsU64(); + if (imm_value == 0) { + MAYBE_AVX(xorps, reg, reg); + } else { + MAYBE_AVX(movaps, reg, code.Const(code.xword, imm_value)); + } + return host_loc; + } + + UNREACHABLE(); +} + +void RegAlloc::Move(HostLoc to, HostLoc from) { + const size_t bit_width = LocInfo(from).GetMaxBitWidth(); + + ASSERT(LocInfo(to).IsEmpty() && !LocInfo(from).IsLocked()); + ASSERT(bit_width <= HostLocBitWidth(to)); + + if (LocInfo(from).IsEmpty()) { + return; + } + + EmitMove(bit_width, to, from); + + LocInfo(to) = std::exchange(LocInfo(from), {}); +} + +void RegAlloc::CopyToScratch(size_t bit_width, HostLoc to, HostLoc from) { + ASSERT(LocInfo(to).IsEmpty() && !LocInfo(from).IsEmpty()); + + EmitMove(bit_width, to, from); +} + +void RegAlloc::Exchange(HostLoc a, HostLoc b) { + ASSERT(!LocInfo(a).IsLocked() && !LocInfo(b).IsLocked()); + ASSERT(LocInfo(a).GetMaxBitWidth() <= HostLocBitWidth(b)); + ASSERT(LocInfo(b).GetMaxBitWidth() <= HostLocBitWidth(a)); + + if (LocInfo(a).IsEmpty()) { + Move(a, b); + return; + } + + if (LocInfo(b).IsEmpty()) { + Move(b, a); + return; + } + + EmitExchange(a, b); + + std::swap(LocInfo(a), LocInfo(b)); +} + +void RegAlloc::MoveOutOfTheWay(HostLoc reg) { + ASSERT(!LocInfo(reg).IsLocked()); + if (!LocInfo(reg).IsEmpty()) { + SpillRegister(reg); + } +} + +void RegAlloc::SpillRegister(HostLoc loc) { + ASSERT_MSG(HostLocIsRegister(loc), "Only registers can be spilled"); + ASSERT_MSG(!LocInfo(loc).IsEmpty(), "There is no need to spill unoccupied registers"); + ASSERT_MSG(!LocInfo(loc).IsLocked(), "Registers that have been allocated must not be spilt"); + + const HostLoc new_loc = FindFreeSpill(); + Move(new_loc, loc); +} + +HostLoc RegAlloc::FindFreeSpill() const { + for (size_t i = static_cast<size_t>(HostLoc::FirstSpill); i < hostloc_info.size(); i++) { + const auto loc = static_cast<HostLoc>(i); + if (LocInfo(loc).IsEmpty()) { + return loc; + } + } + + 
ASSERT_FALSE("All spill locations are full"); +} + +HostLocInfo& RegAlloc::LocInfo(HostLoc loc) { + ASSERT(loc != HostLoc::RSP && loc != HostLoc::R15); + return hostloc_info[static_cast<size_t>(loc)]; +} + +const HostLocInfo& RegAlloc::LocInfo(HostLoc loc) const { + ASSERT(loc != HostLoc::RSP && loc != HostLoc::R15); + return hostloc_info[static_cast<size_t>(loc)]; +} + +void RegAlloc::EmitMove(size_t bit_width, HostLoc to, HostLoc from) { + if (HostLocIsXMM(to) && HostLocIsXMM(from)) { + MAYBE_AVX(movaps, HostLocToXmm(to), HostLocToXmm(from)); + } else if (HostLocIsGPR(to) && HostLocIsGPR(from)) { + ASSERT(bit_width != 128); + if (bit_width == 64) { + code.mov(HostLocToReg64(to), HostLocToReg64(from)); + } else { + code.mov(HostLocToReg64(to).cvt32(), HostLocToReg64(from).cvt32()); + } + } else if (HostLocIsXMM(to) && HostLocIsGPR(from)) { + ASSERT(bit_width != 128); + if (bit_width == 64) { + MAYBE_AVX(movq, HostLocToXmm(to), HostLocToReg64(from)); + } else { + MAYBE_AVX(movd, HostLocToXmm(to), HostLocToReg64(from).cvt32()); + } + } else if (HostLocIsGPR(to) && HostLocIsXMM(from)) { + ASSERT(bit_width != 128); + if (bit_width == 64) { + MAYBE_AVX(movq, HostLocToReg64(to), HostLocToXmm(from)); + } else { + MAYBE_AVX(movd, HostLocToReg64(to).cvt32(), HostLocToXmm(from)); + } + } else if (HostLocIsXMM(to) && HostLocIsSpill(from)) { + const Xbyak::Address spill_addr = SpillToOpArg(from); + ASSERT(spill_addr.getBit() >= bit_width); + switch (bit_width) { + case 128: + MAYBE_AVX(movaps, HostLocToXmm(to), spill_addr); + break; + case 64: + MAYBE_AVX(movsd, HostLocToXmm(to), spill_addr); + break; + case 32: + case 16: + case 8: + MAYBE_AVX(movss, HostLocToXmm(to), spill_addr); + break; + default: + UNREACHABLE(); + } + } else if (HostLocIsSpill(to) && HostLocIsXMM(from)) { + const Xbyak::Address spill_addr = SpillToOpArg(to); + ASSERT(spill_addr.getBit() >= bit_width); + switch (bit_width) { + case 128: + MAYBE_AVX(movaps, spill_addr, HostLocToXmm(from)); + break; + case 64: + MAYBE_AVX(movsd, spill_addr, HostLocToXmm(from)); + break; + case 32: + case 16: + case 8: + MAYBE_AVX(movss, spill_addr, HostLocToXmm(from)); + break; + default: + UNREACHABLE(); + } + } else if (HostLocIsGPR(to) && HostLocIsSpill(from)) { + ASSERT(bit_width != 128); + if (bit_width == 64) { + code.mov(HostLocToReg64(to), SpillToOpArg(from)); + } else { + code.mov(HostLocToReg64(to).cvt32(), SpillToOpArg(from)); + } + } else if (HostLocIsSpill(to) && HostLocIsGPR(from)) { + ASSERT(bit_width != 128); + if (bit_width == 64) { + code.mov(SpillToOpArg(to), HostLocToReg64(from)); + } else { + code.mov(SpillToOpArg(to), HostLocToReg64(from).cvt32()); + } + } else { + ASSERT_FALSE("Invalid RegAlloc::EmitMove"); + } +} + +void RegAlloc::EmitExchange(HostLoc a, HostLoc b) { + if (HostLocIsGPR(a) && HostLocIsGPR(b)) { + code.xchg(HostLocToReg64(a), HostLocToReg64(b)); + } else if (HostLocIsXMM(a) && HostLocIsXMM(b)) { + ASSERT_FALSE("Check your code: Exchanging XMM registers is unnecessary"); + } else { + ASSERT_FALSE("Invalid RegAlloc::EmitExchange"); + } +} + +Xbyak::Address RegAlloc::SpillToOpArg(HostLoc loc) { + ASSERT(HostLocIsSpill(loc)); + + size_t i = static_cast<size_t>(loc) - static_cast<size_t>(HostLoc::FirstSpill); + ASSERT_MSG(i < SpillCount, "Spill index greater than number of available spill locations"); + + using namespace Xbyak::util; + return xword[rsp + reserved_stack_space + ABI_SHADOW_SPACE + offsetof(StackLayout, spill) + i * sizeof(StackLayout::spill[0])]; +} + +} // namespace Dynarmic::Backend::X64 diff 
--git a/externals/dynarmic/src/dynarmic/backend/x64/reg_alloc.h b/externals/dynarmic/src/dynarmic/backend/x64/reg_alloc.h new file mode 100644 index 0000000000..19debf5f48 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/reg_alloc.h @@ -0,0 +1,188 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <array> +#include <functional> +#include <optional> +#include <utility> +#include <vector> + +#include <mcl/stdint.hpp> +#include <xbyak/xbyak.h> + +#include "dynarmic/backend/x64/block_of_code.h" +#include "dynarmic/backend/x64/hostloc.h" +#include "dynarmic/backend/x64/oparg.h" +#include "dynarmic/ir/cond.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/value.h" + +namespace Dynarmic::IR { +enum class AccType; +} // namespace Dynarmic::IR + +namespace Dynarmic::Backend::X64 { + +class RegAlloc; + +struct HostLocInfo { +public: + bool IsLocked() const; + bool IsEmpty() const; + bool IsLastUse() const; + + void SetLastUse(); + + void ReadLock(); + void WriteLock(); + void AddArgReference(); + void ReleaseOne(); + void ReleaseAll(); + + bool ContainsValue(const IR::Inst* inst) const; + size_t GetMaxBitWidth() const; + + void AddValue(IR::Inst* inst); + + void EmitVerboseDebuggingOutput(BlockOfCode& code, size_t host_loc_index) const; + +private: + // Current instruction state + size_t is_being_used_count = 0; + bool is_scratch = false; + bool is_set_last_use = false; + + // Block state + size_t current_references = 0; + size_t accumulated_uses = 0; + size_t total_uses = 0; + + // Value state + std::vector<IR::Inst*> values; + size_t max_bit_width = 0; +}; + +struct Argument { +public: + using copyable_reference = std::reference_wrapper<Argument>; + + IR::Type GetType() const; + bool IsImmediate() const; + bool IsVoid() const; + + bool FitsInImmediateU32() const; + bool FitsInImmediateS32() const; + + bool GetImmediateU1() const; + u8 GetImmediateU8() const; + u16 GetImmediateU16() const; + u32 GetImmediateU32() const; + u64 GetImmediateS32() const; + u64 GetImmediateU64() const; + IR::Cond GetImmediateCond() const; + IR::AccType GetImmediateAccType() const; + + /// Is this value currently in a GPR? + bool IsInGpr() const; + /// Is this value currently in a XMM? + bool IsInXmm() const; + /// Is this value currently in memory? 
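+    /// (i.e. spilled to a stack slot rather than held in a host register)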
+ bool IsInMemory() const; + +private: + friend class RegAlloc; + explicit Argument(RegAlloc& reg_alloc) + : reg_alloc(reg_alloc) {} + + bool allocated = false; + RegAlloc& reg_alloc; + IR::Value value; +}; + +class RegAlloc final { +public: + using ArgumentInfo = std::array<Argument, IR::max_arg_count>; + + explicit RegAlloc(BlockOfCode& code, std::vector<HostLoc> gpr_order, std::vector<HostLoc> xmm_order); + + ArgumentInfo GetArgumentInfo(IR::Inst* inst); + void RegisterPseudoOperation(IR::Inst* inst); + bool IsValueLive(IR::Inst* inst) const; + + Xbyak::Reg64 UseGpr(Argument& arg); + Xbyak::Xmm UseXmm(Argument& arg); + OpArg UseOpArg(Argument& arg); + void Use(Argument& arg, HostLoc host_loc); + + Xbyak::Reg64 UseScratchGpr(Argument& arg); + Xbyak::Xmm UseScratchXmm(Argument& arg); + void UseScratch(Argument& arg, HostLoc host_loc); + + void DefineValue(IR::Inst* inst, const Xbyak::Reg& reg); + void DefineValue(IR::Inst* inst, Argument& arg); + + void Release(const Xbyak::Reg& reg); + + Xbyak::Reg64 ScratchGpr(); + Xbyak::Reg64 ScratchGpr(HostLoc desired_location); + Xbyak::Xmm ScratchXmm(); + Xbyak::Xmm ScratchXmm(HostLoc desired_location); + + void HostCall(IR::Inst* result_def = nullptr, + std::optional<Argument::copyable_reference> arg0 = {}, + std::optional<Argument::copyable_reference> arg1 = {}, + std::optional<Argument::copyable_reference> arg2 = {}, + std::optional<Argument::copyable_reference> arg3 = {}); + + // TODO: Values in host flags + + void AllocStackSpace(size_t stack_space); + void ReleaseStackSpace(size_t stack_space); + + void EndOfAllocScope(); + + void AssertNoMoreUses(); + + void EmitVerboseDebuggingOutput(); + +private: + friend struct Argument; + + std::vector<HostLoc> gpr_order; + std::vector<HostLoc> xmm_order; + + HostLoc SelectARegister(const std::vector<HostLoc>& desired_locations) const; + std::optional<HostLoc> ValueLocation(const IR::Inst* value) const; + + HostLoc UseImpl(IR::Value use_value, const std::vector<HostLoc>& desired_locations); + HostLoc UseScratchImpl(IR::Value use_value, const std::vector<HostLoc>& desired_locations); + HostLoc ScratchImpl(const std::vector<HostLoc>& desired_locations); + void DefineValueImpl(IR::Inst* def_inst, HostLoc host_loc); + void DefineValueImpl(IR::Inst* def_inst, const IR::Value& use_inst); + + HostLoc LoadImmediate(IR::Value imm, HostLoc host_loc); + void Move(HostLoc to, HostLoc from); + void CopyToScratch(size_t bit_width, HostLoc to, HostLoc from); + void Exchange(HostLoc a, HostLoc b); + void MoveOutOfTheWay(HostLoc reg); + + void SpillRegister(HostLoc loc); + HostLoc FindFreeSpill() const; + + std::vector<HostLocInfo> hostloc_info; + HostLocInfo& LocInfo(HostLoc loc); + const HostLocInfo& LocInfo(HostLoc loc) const; + + BlockOfCode& code; + size_t reserved_stack_space = 0; + void EmitMove(size_t bit_width, HostLoc to, HostLoc from); + void EmitExchange(HostLoc a, HostLoc b); + + Xbyak::Address SpillToOpArg(HostLoc loc); +}; + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/stack_layout.h b/externals/dynarmic/src/dynarmic/backend/x64/stack_layout.h new file mode 100644 index 0000000000..7e6799f1cf --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/stack_layout.h @@ -0,0 +1,38 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <array> + +#include <mcl/stdint.hpp> + +namespace Dynarmic::Backend::X64 { + +constexpr size_t SpillCount = 64; + +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable : 4324) // Structure was padded due to alignment specifier +#endif + +struct alignas(16) StackLayout { + s64 cycles_remaining; + s64 cycles_to_run; + + std::array<std::array<u64, 2>, SpillCount> spill; + + u32 save_host_MXCSR; + + bool check_bit; +}; + +#ifdef _MSC_VER +# pragma warning(pop) +#endif + +static_assert(sizeof(StackLayout) % 16 == 0); + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/verbose_debugging_output.cpp b/externals/dynarmic/src/dynarmic/backend/x64/verbose_debugging_output.cpp new file mode 100644 index 0000000000..3378786c46 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/verbose_debugging_output.cpp @@ -0,0 +1,56 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2023 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/backend/x64/verbose_debugging_output.h" + +#include <iterator> + +#include <fmt/format.h> + +#include "dynarmic/backend/x64/hostloc.h" + +namespace Dynarmic::Backend::X64 { + +void PrintVerboseDebuggingOutputLine(RegisterData& reg_data, HostLoc hostloc, size_t inst_index, size_t bitsize) { + fmt::print("dynarmic debug: %{:05} = ", inst_index); + + Vector value = [&]() -> Vector { + if (HostLocIsGPR(hostloc)) { + return {reg_data.gprs[HostLocToReg64(hostloc).getIdx()], 0}; + } else if (HostLocIsXMM(hostloc)) { + return reg_data.xmms[HostLocToXmm(hostloc).getIdx()]; + } else if (HostLocIsSpill(hostloc)) { + return (*reg_data.spill)[static_cast<size_t>(hostloc) - static_cast<size_t>(HostLoc::FirstSpill)]; + } else { + fmt::print("invalid hostloc! "); + return {0, 0}; + } + }(); + + switch (bitsize) { + case 8: + fmt::print("{:02x}", value[0] & 0xff); + break; + case 16: + fmt::print("{:04x}", value[0] & 0xffff); + break; + case 32: + fmt::print("{:08x}", value[0] & 0xffffffff); + break; + case 64: + fmt::print("{:016x}", value[0]); + break; + case 128: + fmt::print("{:016x}{:016x}", value[1], value[0]); + break; + default: + fmt::print("invalid bitsize!"); + break; + } + + fmt::print("\n"); +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/verbose_debugging_output.h b/externals/dynarmic/src/dynarmic/backend/x64/verbose_debugging_output.h new file mode 100644 index 0000000000..4eb2646d77 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/verbose_debugging_output.h @@ -0,0 +1,37 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2023 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <array> + +#include <mcl/stdint.hpp> + +#include "dynarmic/backend/x64/stack_layout.h" + +namespace Dynarmic::Backend::X64 { + +enum class HostLoc; +using Vector = std::array<u64, 2>; + +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable : 4324) // Structure was padded due to alignment specifier +#endif + +struct alignas(16) RegisterData { + std::array<u64, 16> gprs; + std::array<Vector, 16> xmms; + decltype(StackLayout::spill)* spill; + u32 mxcsr; +}; + +#ifdef _MSC_VER +# pragma warning(pop) +#endif + +void PrintVerboseDebuggingOutputLine(RegisterData& reg_data, HostLoc hostloc, size_t inst_index, size_t bitsize); + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/common/always_false.h b/externals/dynarmic/src/dynarmic/common/always_false.h new file mode 100644 index 0000000000..b980fea00c --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/always_false.h @@ -0,0 +1,13 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2023 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +namespace Dynarmic::Common { + +template<typename T> +inline constexpr bool always_false_v = false; + +} // namespace Dynarmic::Common diff --git a/externals/dynarmic/src/dynarmic/common/atomic.h b/externals/dynarmic/src/dynarmic/common/atomic.h new file mode 100644 index 0000000000..34042d3802 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/atomic.h @@ -0,0 +1,44 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <mcl/stdint.hpp> + +namespace Dynarmic::Atomic { + +inline u32 Load(volatile u32* ptr) { +#ifdef _MSC_VER + return _InterlockedOr(reinterpret_cast<volatile long*>(ptr), 0); +#else + return __atomic_load_n(ptr, __ATOMIC_SEQ_CST); +#endif +} + +inline void Or(volatile u32* ptr, u32 value) { +#ifdef _MSC_VER + _InterlockedOr(reinterpret_cast<volatile long*>(ptr), value); +#else + __atomic_or_fetch(ptr, value, __ATOMIC_SEQ_CST); +#endif +} + +inline void And(volatile u32* ptr, u32 value) { +#ifdef _MSC_VER + _InterlockedAnd(reinterpret_cast<volatile long*>(ptr), value); +#else + __atomic_and_fetch(ptr, value, __ATOMIC_SEQ_CST); +#endif +} + +inline void Barrier() { +#ifdef _MSC_VER + _ReadWriteBarrier(); +#else + __atomic_thread_fence(__ATOMIC_SEQ_CST); +#endif +} + +} // namespace Dynarmic::Atomic diff --git a/externals/dynarmic/src/dynarmic/common/cast_util.h b/externals/dynarmic/src/dynarmic/common/cast_util.h new file mode 100644 index 0000000000..92c9a259b3 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/cast_util.h @@ -0,0 +1,18 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <mcl/type_traits/function_info.hpp> + +namespace Dynarmic::Common { + +/// Cast a lambda into an equivalent function pointer. +template<class Function> +inline auto FptrCast(Function f) noexcept { + return static_cast<mcl::equivalent_function_type<Function>*>(f); +} + +} // namespace Dynarmic::Common diff --git a/externals/dynarmic/src/dynarmic/common/crypto/aes.cpp b/externals/dynarmic/src/dynarmic/common/crypto/aes.cpp new file mode 100644 index 0000000000..c431758e57 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/crypto/aes.cpp @@ -0,0 +1,180 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/common/crypto/aes.h" + +#include <array> + +#include <mcl/stdint.hpp> + +namespace Dynarmic::Common::Crypto::AES { + +using SubstitutionTable = std::array<u8, 256>; + +// See section 5.1.1 Figure 7 in FIPS 197 +constexpr SubstitutionTable substitution_box{ + {// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, + 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, + 0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, + 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75, + 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, + 0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, + 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8, + 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, + 0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, + 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB, + 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, + 0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, + 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A, + 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, + 0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, + 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16}}; + +// See section 5.3.2 Figure 14 in FIPS 197 +constexpr SubstitutionTable inverse_substitution_box{ + {// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38, 0xBF, 0x40, 0xA3, 0x9E, 0x81, 0xF3, 0xD7, 0xFB, + 0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87, 0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB, + 0x54, 0x7B, 0x94, 0x32, 0xA6, 0xC2, 0x23, 0x3D, 0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E, + 0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2, 0x76, 0x5B, 0xA2, 0x49, 0x6D, 0x8B, 0xD1, 0x25, + 0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92, + 0x6C, 0x70, 0x48, 0x50, 0xFD, 0xED, 0xB9, 0xDA, 0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84, + 0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A, 0xF7, 0xE4, 0x58, 0x05, 0xB8, 0xB3, 0x45, 0x06, + 0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02, 0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B, + 0x3A, 0x91, 0x11, 0x41, 0x4F, 0x67, 0xDC, 0xEA, 0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73, + 0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85, 0xE2, 0xF9, 0x37, 0xE8, 0x1C, 0x75, 0xDF, 0x6E, + 0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89, 0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B, + 0xFC, 0x56, 0x3E, 0x4B, 0xC6, 0xD2, 0x79, 0x20, 0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4, + 0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31, 0xB1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xEC, 0x5F, + 0x60, 0x51, 0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D, 0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF, + 0xA0, 0xE0, 0x3B, 0x4D, 0xAE, 0x2A, 0xF5, 0xB0, 0xC8, 0xEB, 0xBB, 0x3C, 0x83, 
0x53, 0x99, 0x61, + 0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26, 0xE1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0C, 0x7D}}; + +// See section 4.2.1 in FIPS 197. +static constexpr u8 xtime(u8 x) { + return static_cast<u8>((x << 1) ^ (((x >> 7) & 1) * 0x1B)); +} + +// Galois Field multiplication. +static constexpr u8 Multiply(u8 x, u8 y) { + return static_cast<u8>(((y & 1) * x) + ^ ((y >> 1 & 1) * xtime(x)) + ^ ((y >> 2 & 1) * xtime(xtime(x))) + ^ ((y >> 3 & 1) * xtime(xtime(xtime(x)))) + ^ ((y >> 4 & 1) * xtime(xtime(xtime(xtime(x)))))); +} + +static void ShiftRows(State& out_state, const State& state) { + // Move zeroth row over + out_state[0] = state[0]; + out_state[4] = state[4]; + out_state[8] = state[8]; + out_state[12] = state[12]; + + // Rotate first row 1 column left. + u8 temp = state[1]; + out_state[1] = state[5]; + out_state[5] = state[9]; + out_state[9] = state[13]; + out_state[13] = temp; + + // Rotate second row 2 columns left + temp = state[2]; + out_state[2] = state[10]; + out_state[10] = temp; + + temp = state[6]; + out_state[6] = state[14]; + out_state[14] = temp; + + // Rotate third row 3 columns left + temp = state[3]; + out_state[3] = state[15]; + out_state[15] = state[11]; + out_state[11] = state[7]; + out_state[7] = temp; +} + +static void InverseShiftRows(State& out_state, const State& state) { + // Move zeroth row over + out_state[0] = state[0]; + out_state[4] = state[4]; + out_state[8] = state[8]; + out_state[12] = state[12]; + + // Rotate first row 1 column right. + u8 temp = state[13]; + out_state[13] = state[9]; + out_state[9] = state[5]; + out_state[5] = state[1]; + out_state[1] = temp; + + // Rotate second row 2 columns right + temp = state[2]; + out_state[2] = state[10]; + out_state[10] = temp; + + temp = state[6]; + out_state[6] = state[14]; + out_state[14] = temp; + + // Rotate third row 3 columns right + temp = state[3]; + out_state[3] = state[7]; + out_state[7] = state[11]; + out_state[11] = state[15]; + out_state[15] = temp; +} + +static void SubBytes(State& state, const SubstitutionTable& table) { + for (size_t i = 0; i < 4; i++) { + for (size_t j = 0; j < 4; j++) { + state[4 * i + j] = table[state[4 * i + j]]; + } + } +} + +void DecryptSingleRound(State& out_state, const State& state) { + InverseShiftRows(out_state, state); + SubBytes(out_state, inverse_substitution_box); +} + +void EncryptSingleRound(State& out_state, const State& state) { + ShiftRows(out_state, state); + SubBytes(out_state, substitution_box); +} + +void MixColumns(State& out_state, const State& state) { + for (size_t i = 0; i < out_state.size(); i += 4) { + const u8 a = state[i]; + const u8 b = state[i + 1]; + const u8 c = state[i + 2]; + const u8 d = state[i + 3]; + + const u8 tmp = a ^ b ^ c ^ d; + + out_state[i + 0] = a ^ xtime(a ^ b) ^ tmp; + out_state[i + 1] = b ^ xtime(b ^ c) ^ tmp; + out_state[i + 2] = c ^ xtime(c ^ d) ^ tmp; + out_state[i + 3] = d ^ xtime(d ^ a) ^ tmp; + } +} + +void InverseMixColumns(State& out_state, const State& state) { + for (size_t i = 0; i < out_state.size(); i += 4) { + const u8 a = state[i]; + const u8 b = state[i + 1]; + const u8 c = state[i + 2]; + const u8 d = state[i + 3]; + + out_state[i + 0] = Multiply(a, 0x0E) ^ Multiply(b, 0x0B) ^ Multiply(c, 0x0D) ^ Multiply(d, 0x09); + out_state[i + 1] = Multiply(a, 0x09) ^ Multiply(b, 0x0E) ^ Multiply(c, 0x0B) ^ Multiply(d, 0x0D); + out_state[i + 2] = Multiply(a, 0x0D) ^ Multiply(b, 0x09) ^ Multiply(c, 0x0E) ^ Multiply(d, 0x0B); + out_state[i + 3] = Multiply(a, 0x0B) ^ Multiply(b, 0x0D) ^ Multiply(c, 0x09) ^ 
Multiply(d, 0x0E); + } +} + +} // namespace Dynarmic::Common::Crypto::AES diff --git a/externals/dynarmic/src/dynarmic/common/crypto/aes.h b/externals/dynarmic/src/dynarmic/common/crypto/aes.h new file mode 100644 index 0000000000..fa6d5a81b3 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/crypto/aes.h @@ -0,0 +1,23 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <array> + +#include <mcl/stdint.hpp> + +namespace Dynarmic::Common::Crypto::AES { + +using State = std::array<u8, 16>; + +// Assumes the state has already been XORed by the round key. +void DecryptSingleRound(State& out_state, const State& state); +void EncryptSingleRound(State& out_state, const State& state); + +void MixColumns(State& out_state, const State& state); +void InverseMixColumns(State& out_state, const State& state); + +} // namespace Dynarmic::Common::Crypto::AES diff --git a/externals/dynarmic/src/dynarmic/common/crypto/crc32.cpp b/externals/dynarmic/src/dynarmic/common/crypto/crc32.cpp new file mode 100644 index 0000000000..c00385078b --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/crypto/crc32.cpp @@ -0,0 +1,168 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/common/crypto/crc32.h" + +#include <array> + +#include <mcl/stdint.hpp> + +namespace Dynarmic::Common::Crypto::CRC32 { + +using CRC32Table = std::array<u32, 256>; + +// CRC32 algorithm that uses polynomial 0x1EDC6F41 +constexpr CRC32Table castagnoli_table{ + {0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, + 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB, + 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, + 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24, + 0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B, + 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384, + 0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54, + 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B, + 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A, + 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35, + 0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5, + 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA, + 0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45, + 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A, + 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A, + 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595, + 0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48, + 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957, + 0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687, + 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198, + 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927, + 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38, + 0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8, + 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7, + 0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096, + 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789, + 0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859, + 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46, + 0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9, + 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6, + 0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36, + 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829, + 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C, + 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93, + 0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043, + 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C, + 0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3, + 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC, + 0x1871A4D8, 
0xEA1A27DB, 0xF94AD42F, 0x0B21572C, + 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033, + 0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652, + 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D, + 0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D, + 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982, + 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D, + 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622, + 0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2, + 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED, + 0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530, + 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F, + 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF, + 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0, + 0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F, + 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540, + 0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90, + 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F, + 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE, + 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1, + 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321, + 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E, + 0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81, + 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E, + 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, + 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351}}; + +// CRC32 algorithm that uses polynomial 0x04C11DB7 +constexpr CRC32Table iso_table{ + {0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, + 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3, + 0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, + 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91, + 0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, + 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7, + 0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, + 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5, + 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, + 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, + 0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, + 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59, + 0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, + 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F, + 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, + 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, + 0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, + 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433, + 0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, + 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01, + 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, + 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, + 0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, + 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65, + 0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, + 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB, + 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, + 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, + 0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, + 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F, + 0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, + 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD, + 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, + 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683, + 0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, + 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1, + 0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, + 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7, + 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, + 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, + 0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, + 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B, + 0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, + 0xDF60EFC3, 0xA867DF55, 
0x316E8EEF, 0x4669BE79, + 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, + 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, + 0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, + 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D, + 0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, + 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713, + 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, + 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, + 0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, + 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777, + 0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, + 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45, + 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, + 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, + 0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, + 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9, + 0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, + 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF, + 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, + 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D}}; + +static u32 ComputeCRC32(const CRC32Table& table, u32 crc, const u64 value, int length) { + const auto* data = reinterpret_cast<const unsigned char*>(&value); + + while (length-- > 0) { + crc = (crc >> 8) ^ table[(crc ^ (*data++)) & 0xFF]; + } + + return crc; +} + +u32 ComputeCRC32Castagnoli(u32 crc, u64 value, int length) { + return ComputeCRC32(castagnoli_table, crc, value, length); +} + +u32 ComputeCRC32ISO(u32 crc, u64 value, int length) { + return ComputeCRC32(iso_table, crc, value, length); +} + +} // namespace Dynarmic::Common::Crypto::CRC32 diff --git a/externals/dynarmic/src/dynarmic/common/crypto/crc32.h b/externals/dynarmic/src/dynarmic/common/crypto/crc32.h new file mode 100644 index 0000000000..30942327d4 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/crypto/crc32.h @@ -0,0 +1,40 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <mcl/stdint.hpp> + +namespace Dynarmic::Common::Crypto::CRC32 { + +/** + * Computes a CRC32 value using Castagnoli polynomial (0x1EDC6F41). + * + * @param crc The initial CRC value + * @param value The value to compute the CRC of. + * @param length The length of the data to compute. + * + * @remark @p value is interpreted internally as an array of + * unsigned char with @p length data. + * + * @return The computed CRC32 value. + */ +u32 ComputeCRC32Castagnoli(u32 crc, u64 value, int length); + +/** + * Computes a CRC32 value using the ISO polynomial (0x04C11DB7). + * + * @param crc The initial CRC value + * @param value The value to compute the CRC of. + * @param length The length of the data to compute. + * + * @remark @p value is interpreted internally as an array of + * unsigned char with @p length data. + * + * @return The computed CRC32 value. + */ +u32 ComputeCRC32ISO(u32 crc, u64 value, int length); + +} // namespace Dynarmic::Common::Crypto::CRC32 diff --git a/externals/dynarmic/src/dynarmic/common/crypto/sm4.cpp b/externals/dynarmic/src/dynarmic/common/crypto/sm4.cpp new file mode 100644 index 0000000000..5743e5be9a --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/crypto/sm4.cpp @@ -0,0 +1,54 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/common/crypto/sm4.h" + +#include <array> + +#include <mcl/stdint.hpp> + +namespace Dynarmic::Common::Crypto::SM4 { + +using SubstitutionTable = std::array<u8, 256>; + +constexpr SubstitutionTable substitution_box{ + {0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, + 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, 0x2C, 0x05, + 0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, + 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99, + 0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, + 0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, + 0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, + 0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6, + 0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, 0x17, 0xBA, + 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8, + 0x68, 0x6B, 0x81, 0xB2, 0x71, 0x64, 0xDA, 0x8B, + 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35, + 0x1E, 0x24, 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, + 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87, + 0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, + 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E, + 0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, + 0xA3, 0xF7, 0xF2, 0xCE, 0xF9, 0x61, 0x15, 0xA1, + 0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, + 0xAD, 0x93, 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3, + 0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, + 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F, + 0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD, 0x8E, 0x2F, + 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51, + 0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F, + 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8, + 0x0A, 0xC1, 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, + 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0, + 0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E, + 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84, + 0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, + 0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48}}; + +u8 AccessSubstitutionBox(u8 index) { + return substitution_box[index]; +} + +} // namespace Dynarmic::Common::Crypto::SM4 diff --git a/externals/dynarmic/src/dynarmic/common/crypto/sm4.h b/externals/dynarmic/src/dynarmic/common/crypto/sm4.h new file mode 100644 index 0000000000..417e9b9263 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/crypto/sm4.h @@ -0,0 +1,14 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <mcl/stdint.hpp> + +namespace Dynarmic::Common::Crypto::SM4 { + +u8 AccessSubstitutionBox(u8 index); + +} // namespace Dynarmic::Common::Crypto::SM4 diff --git a/externals/dynarmic/src/dynarmic/common/fp/fpcr.h b/externals/dynarmic/src/dynarmic/common/fp/fpcr.h new file mode 100644 index 0000000000..287f6d2889 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/fpcr.h @@ -0,0 +1,209 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <optional> + +#include <mcl/assert.hpp> +#include <mcl/bit/bit_field.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/common/fp/rounding_mode.h" + +namespace Dynarmic::FP { + +/** + * Representation of the Floating-Point Control Register. 
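+ * Construction and assignment mask the incoming value so that reserved bits are never stored.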
+ */ +class FPCR final { +public: + FPCR() = default; + FPCR(const FPCR&) = default; + FPCR(FPCR&&) = default; + explicit FPCR(u32 data) + : value{data & mask} {} + + FPCR& operator=(const FPCR&) = default; + FPCR& operator=(FPCR&&) = default; + FPCR& operator=(u32 data) { + value = data & mask; + return *this; + } + + /// Get alternate half-precision control flag. + bool AHP() const { + return mcl::bit::get_bit<26>(value); + } + + /// Set alternate half-precision control flag. + void AHP(bool ahp) { + value = mcl::bit::set_bit<26>(value, ahp); + } + + /// Get default NaN mode control bit. + bool DN() const { + return mcl::bit::get_bit<25>(value); + } + + /// Set default NaN mode control bit. + void DN(bool dn) { + value = mcl::bit::set_bit<25>(value, dn); + } + + /// Get flush-to-zero mode control bit. + bool FZ() const { + return mcl::bit::get_bit<24>(value); + } + + /// Set flush-to-zero mode control bit. + void FZ(bool fz) { + value = mcl::bit::set_bit<24>(value, fz); + } + + /// Get rounding mode control field. + FP::RoundingMode RMode() const { + return static_cast<FP::RoundingMode>(mcl::bit::get_bits<22, 23>(value)); + } + + /// Set rounding mode control field. + void RMode(FP::RoundingMode rounding_mode) { + ASSERT_MSG(static_cast<u32>(rounding_mode) <= 0b11, "FPCR: Invalid rounding mode"); + value = mcl::bit::set_bits<22, 23>(value, static_cast<u32>(rounding_mode)); + } + + /// Get the stride of a vector when executing AArch32 VFP instructions. + /// This field has no function in AArch64 state. + std::optional<size_t> Stride() const { + switch (mcl::bit::get_bits<20, 21>(value)) { + case 0b00: + return 1; + case 0b11: + return 2; + default: + return std::nullopt; + } + } + + /// Set the stride of a vector when executing AArch32 VFP instructions. + /// This field has no function in AArch64 state. + void Stride(size_t stride) { + ASSERT_MSG(stride >= 1 && stride <= 2, "FPCR: Invalid stride"); + value = mcl::bit::set_bits<20, 21>(value, stride == 1 ? 0b00u : 0b11u); + } + + /// Get flush-to-zero (half-precision specific) mode control bit. + bool FZ16() const { + return mcl::bit::get_bit<19>(value); + } + + /// Set flush-to-zero (half-precision specific) mode control bit. + void FZ16(bool fz16) { + value = mcl::bit::set_bit<19>(value, fz16); + } + + /// Gets the length of a vector when executing AArch32 VFP instructions. + /// This field has no function in AArch64 state. + size_t Len() const { + return mcl::bit::get_bits<16, 18>(value) + 1; + } + + /// Sets the length of a vector when executing AArch32 VFP instructions. + /// This field has no function in AArch64 state. + void Len(size_t len) { + ASSERT_MSG(len >= 1 && len <= 8, "FPCR: Invalid len"); + value = mcl::bit::set_bits<16, 18>(value, static_cast<u32>(len - 1)); + } + + /// Get input denormal exception trap enable flag. + bool IDE() const { + return mcl::bit::get_bit<15>(value); + } + + /// Set input denormal exception trap enable flag. + void IDE(bool ide) { + value = mcl::bit::set_bit<15>(value, ide); + } + + /// Get inexact exception trap enable flag. + bool IXE() const { + return mcl::bit::get_bit<12>(value); + } + + /// Set inexact exception trap enable flag. + void IXE(bool ixe) { + value = mcl::bit::set_bit<12>(value, ixe); + } + + /// Get underflow exception trap enable flag. + bool UFE() const { + return mcl::bit::get_bit<11>(value); + } + + /// Set underflow exception trap enable flag. + void UFE(bool ufe) { + value = mcl::bit::set_bit<11>(value, ufe); + } + + /// Get overflow exception trap enable flag. 
+ bool OFE() const { + return mcl::bit::get_bit<10>(value); + } + + /// Set overflow exception trap enable flag. + void OFE(bool ofe) { + value = mcl::bit::set_bit<10>(value, ofe); + } + + /// Get division by zero exception trap enable flag. + bool DZE() const { + return mcl::bit::get_bit<9>(value); + } + + /// Set division by zero exception trap enable flag. + void DZE(bool dze) { + value = mcl::bit::set_bit<9>(value, dze); + } + + /// Get invalid operation exception trap enable flag. + bool IOE() const { + return mcl::bit::get_bit<8>(value); + } + + /// Set invalid operation exception trap enable flag. + void IOE(bool ioe) { + value = mcl::bit::set_bit<8>(value, ioe); + } + + /// Gets the underlying raw value within the FPCR. + u32 Value() const { + return value; + } + + /// Gets the StandardFPSCRValue (A32 ASIMD). + FPCR ASIMDStandardValue() const { + FPCR stdvalue; + stdvalue.AHP(AHP()); + stdvalue.FZ16(FZ16()); + stdvalue.FZ(true); + stdvalue.DN(true); + return stdvalue; + } + +private: + // Bits 0-7, 13-14, and 27-31 are reserved. + static constexpr u32 mask = 0x07FF9F00; + u32 value = 0; +}; + +inline bool operator==(FPCR lhs, FPCR rhs) { + return lhs.Value() == rhs.Value(); +} + +inline bool operator!=(FPCR lhs, FPCR rhs) { + return !operator==(lhs, rhs); +} + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/fpsr.h b/externals/dynarmic/src/dynarmic/common/fp/fpsr.h new file mode 100644 index 0000000000..fba58ebbb7 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/fpsr.h @@ -0,0 +1,160 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <mcl/bit/bit_field.hpp> +#include <mcl/stdint.hpp> + +namespace Dynarmic::FP { + +/** + * Representation of the Floating-Point Status Register. 
+ */ +class FPSR final { +public: + FPSR() = default; + FPSR(const FPSR&) = default; + FPSR(FPSR&&) = default; + explicit FPSR(u32 data) + : value{data & mask} {} + + FPSR& operator=(const FPSR&) = default; + FPSR& operator=(FPSR&&) = default; + FPSR& operator=(u32 data) { + value = data & mask; + return *this; + } + + /// Get negative condition flag + bool N() const { + return mcl::bit::get_bit<31>(value); + } + + /// Set negative condition flag + void N(bool N_) { + value = mcl::bit::set_bit<31>(value, N_); + } + + /// Get zero condition flag + bool Z() const { + return mcl::bit::get_bit<30>(value); + } + + /// Set zero condition flag + void Z(bool Z_) { + value = mcl::bit::set_bit<30>(value, Z_); + } + + /// Get carry condition flag + bool C() const { + return mcl::bit::get_bit<29>(value); + } + + /// Set carry condition flag + void C(bool C_) { + value = mcl::bit::set_bit<29>(value, C_); + } + + /// Get overflow condition flag + bool V() const { + return mcl::bit::get_bit<28>(value); + } + + /// Set overflow condition flag + void V(bool V_) { + value = mcl::bit::set_bit<28>(value, V_); + } + + /// Get cumulative saturation bit + bool QC() const { + return mcl::bit::get_bit<27>(value); + } + + /// Set cumulative saturation bit + void QC(bool QC_) { + value = mcl::bit::set_bit<27>(value, QC_); + } + + /// Get input denormal floating-point exception bit + bool IDC() const { + return mcl::bit::get_bit<7>(value); + } + + /// Set input denormal floating-point exception bit + void IDC(bool IDC_) { + value = mcl::bit::set_bit<7>(value, IDC_); + } + + /// Get inexact cumulative floating-point exception bit + bool IXC() const { + return mcl::bit::get_bit<4>(value); + } + + /// Set inexact cumulative floating-point exception bit + void IXC(bool IXC_) { + value = mcl::bit::set_bit<4>(value, IXC_); + } + + /// Get underflow cumulative floating-point exception bit + bool UFC() const { + return mcl::bit::get_bit<3>(value); + } + + /// Set underflow cumulative floating-point exception bit + void UFC(bool UFC_) { + value = mcl::bit::set_bit<3>(value, UFC_); + } + + /// Get overflow cumulative floating-point exception bit + bool OFC() const { + return mcl::bit::get_bit<2>(value); + } + + /// Set overflow cumulative floating-point exception bit + void OFC(bool OFC_) { + value = mcl::bit::set_bit<2>(value, OFC_); + } + + /// Get divide by zero cumulative floating-point exception bit + bool DZC() const { + return mcl::bit::get_bit<1>(value); + } + + /// Set divide by zero cumulative floating-point exception bit + void DZC(bool DZC_) { + value = mcl::bit::set_bit<1>(value, DZC_); + } + + /// Get invalid operation cumulative floating-point exception bit + bool IOC() const { + return mcl::bit::get_bit<0>(value); + } + + /// Set invalid operation cumulative floating-point exception bit + void IOC(bool IOC_) { + value = mcl::bit::set_bit<0>(value, IOC_); + } + + /// Gets the underlying raw value within the FPSR. + u32 Value() const { + return value; + } + +private: + // Bits 5-6 and 8-26 are reserved. 
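+    // 0xF800009F = N/Z/C/V/QC (bits 31-27) | IDC (bit 7) | IXC/UFC/OFC/DZC/IOC (bits 4-0)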
+ static constexpr u32 mask = 0xF800009F; + u32 value = 0; +}; + +inline bool operator==(FPSR lhs, FPSR rhs) { + return lhs.Value() == rhs.Value(); +} + +inline bool operator!=(FPSR lhs, FPSR rhs) { + return !operator==(lhs, rhs); +} + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/fused.cpp b/externals/dynarmic/src/dynarmic/common/fp/fused.cpp new file mode 100644 index 0000000000..5c32b05eb4 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/fused.cpp @@ -0,0 +1,91 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/common/fp/fused.h" + +#include <mcl/bit/bit_count.hpp> + +#include "dynarmic/common/fp/unpacked.h" +#include "dynarmic/common/u128.h" + +namespace Dynarmic::FP { + +constexpr size_t product_point_position = normalized_point_position * 2; + +static FPUnpacked ReduceMantissa(bool sign, int exponent, const u128& mantissa) { + constexpr int point_position_correction = normalized_point_position - (product_point_position - 64); + // We round-to-odd here when reducing the bitwidth of the mantissa so that subsequent roundings are accurate. + return {sign, exponent + point_position_correction, mantissa.upper | static_cast<u64>(mantissa.lower != 0)}; +} + +FPUnpacked FusedMulAdd(FPUnpacked addend, FPUnpacked op1, FPUnpacked op2) { + const bool product_sign = op1.sign != op2.sign; + const auto [product_exponent, product_value] = [op1, op2] { + int exponent = op1.exponent + op2.exponent; + u128 value = Multiply64To128(op1.mantissa, op2.mantissa); + if (value.Bit<product_point_position + 1>()) { + value = value >> 1; + exponent++; + } + return std::make_tuple(exponent, value); + }(); + + if (product_value == 0) { + return addend; + } + + if (addend.mantissa == 0) { + return ReduceMantissa(product_sign, product_exponent, product_value); + } + + const int exp_diff = product_exponent - addend.exponent; + + if (product_sign == addend.sign) { + // Addition + + if (exp_diff <= 0) { + // addend > product + const u64 result = addend.mantissa + StickyLogicalShiftRight(product_value, normalized_point_position - exp_diff).lower; + return FPUnpacked{addend.sign, addend.exponent, result}; + } + + // addend < product + const u128 result = product_value + StickyLogicalShiftRight(addend.mantissa, exp_diff - normalized_point_position); + return ReduceMantissa(product_sign, product_exponent, result); + } + + // Subtraction + + const u128 addend_long = u128(addend.mantissa) << normalized_point_position; + + bool result_sign; + u128 result; + int result_exponent; + + if (exp_diff == 0 && product_value > addend_long) { + result_sign = product_sign; + result_exponent = product_exponent; + result = product_value - addend_long; + } else if (exp_diff <= 0) { + result_sign = !product_sign; + result_exponent = addend.exponent; + result = addend_long - StickyLogicalShiftRight(product_value, -exp_diff); + } else { + result_sign = product_sign; + result_exponent = product_exponent; + result = product_value - StickyLogicalShiftRight(addend_long, exp_diff); + } + + if (result.upper == 0) { + return FPUnpacked{result_sign, result_exponent, result.lower}; + } + + const int required_shift = normalized_point_position - mcl::bit::highest_set_bit(result.upper); + result = result << required_shift; + result_exponent -= required_shift; + return ReduceMantissa(result_sign, result_exponent, result); +} + +} // namespace Dynarmic::FP diff --git 
a/externals/dynarmic/src/dynarmic/common/fp/fused.h b/externals/dynarmic/src/dynarmic/common/fp/fused.h new file mode 100644 index 0000000000..0ffa2683c1 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/fused.h @@ -0,0 +1,15 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +namespace Dynarmic::FP { + +struct FPUnpacked; + +/// This function assumes all arguments have been normalized. +FPUnpacked FusedMulAdd(FPUnpacked addend, FPUnpacked op1, FPUnpacked op2); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/info.h b/externals/dynarmic/src/dynarmic/common/fp/info.h new file mode 100644 index 0000000000..8cc2d29de4 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/info.h @@ -0,0 +1,138 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <mcl/bit/bit_count.hpp> +#include <mcl/stdint.hpp> + +namespace Dynarmic::FP { + +template<typename FPT> +struct FPInfo {}; + +template<> +struct FPInfo<u16> { + static constexpr size_t total_width = 16; + static constexpr size_t exponent_width = 5; + static constexpr size_t explicit_mantissa_width = 10; + static constexpr size_t mantissa_width = explicit_mantissa_width + 1; + + static constexpr u32 implicit_leading_bit = u32(1) << explicit_mantissa_width; + static constexpr u32 sign_mask = 0x8000; + static constexpr u32 exponent_mask = 0x7C00; + static constexpr u32 mantissa_mask = 0x3FF; + static constexpr u32 mantissa_msb = 0x200; + + static constexpr int exponent_min = -14; + static constexpr int exponent_max = 15; + static constexpr int exponent_bias = 15; + + static constexpr u16 Zero(bool sign) { + return sign ? static_cast<u16>(sign_mask) : u16{0}; + } + + static constexpr u16 Infinity(bool sign) { + return static_cast<u16>(exponent_mask | Zero(sign)); + } + + static constexpr u16 MaxNormal(bool sign) { + return static_cast<u16>((exponent_mask - 1) | Zero(sign)); + } + + static constexpr u16 DefaultNaN() { + return static_cast<u16>(exponent_mask | (u32(1) << (explicit_mantissa_width - 1))); + } +}; + +template<> +struct FPInfo<u32> { + static constexpr size_t total_width = 32; + static constexpr size_t exponent_width = 8; + static constexpr size_t explicit_mantissa_width = 23; + static constexpr size_t mantissa_width = explicit_mantissa_width + 1; + + static constexpr u32 implicit_leading_bit = u32(1) << explicit_mantissa_width; + static constexpr u32 sign_mask = 0x80000000; + static constexpr u32 exponent_mask = 0x7F800000; + static constexpr u32 mantissa_mask = 0x007FFFFF; + static constexpr u32 mantissa_msb = 0x00400000; + + static constexpr int exponent_min = -126; + static constexpr int exponent_max = 127; + static constexpr int exponent_bias = 127; + + static constexpr u32 Zero(bool sign) { + return sign ? 
sign_mask : 0; + } + + static constexpr u32 Infinity(bool sign) { + return exponent_mask | Zero(sign); + } + + static constexpr u32 MaxNormal(bool sign) { + return (exponent_mask - 1) | Zero(sign); + } + + static constexpr u32 DefaultNaN() { + return exponent_mask | (u32(1) << (explicit_mantissa_width - 1)); + } +}; + +template<> +struct FPInfo<u64> { + static constexpr size_t total_width = 64; + static constexpr size_t exponent_width = 11; + static constexpr size_t explicit_mantissa_width = 52; + static constexpr size_t mantissa_width = explicit_mantissa_width + 1; + + static constexpr u64 implicit_leading_bit = u64(1) << explicit_mantissa_width; + static constexpr u64 sign_mask = 0x8000'0000'0000'0000; + static constexpr u64 exponent_mask = 0x7FF0'0000'0000'0000; + static constexpr u64 mantissa_mask = 0x000F'FFFF'FFFF'FFFF; + static constexpr u64 mantissa_msb = 0x0008'0000'0000'0000; + + static constexpr int exponent_min = -1022; + static constexpr int exponent_max = 1023; + static constexpr int exponent_bias = 1023; + + static constexpr u64 Zero(bool sign) { + return sign ? sign_mask : 0; + } + + static constexpr u64 Infinity(bool sign) { + return exponent_mask | Zero(sign); + } + + static constexpr u64 MaxNormal(bool sign) { + return (exponent_mask - 1) | Zero(sign); + } + + static constexpr u64 DefaultNaN() { + return exponent_mask | (u64(1) << (explicit_mantissa_width - 1)); + } +}; + +/// value = (sign ? -1 : +1) * 2^exponent * value +/// @note We do not handle denormals. Denormals will static_assert. +template<typename FPT, bool sign, int exponent, FPT value> +constexpr FPT FPValue() { + if constexpr (value == 0) { + return FPInfo<FPT>::Zero(sign); + } + + constexpr int point_position = static_cast<int>(FPInfo<FPT>::explicit_mantissa_width); + constexpr int highest_bit = mcl::bit::highest_set_bit(value); + constexpr int offset = point_position - highest_bit; + constexpr int normalized_exponent = exponent - offset + point_position; + static_assert(offset >= 0); + static_assert(normalized_exponent >= FPInfo<FPT>::exponent_min && normalized_exponent <= FPInfo<FPT>::exponent_max); + + constexpr FPT mantissa = (value << offset) & FPInfo<FPT>::mantissa_mask; + constexpr FPT biased_exponent = static_cast<FPT>(normalized_exponent + FPInfo<FPT>::exponent_bias); + return FPT(FPInfo<FPT>::Zero(sign) | mantissa | (biased_exponent << FPInfo<FPT>::explicit_mantissa_width)); +} + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/mantissa_util.h b/externals/dynarmic/src/dynarmic/common/fp/mantissa_util.h new file mode 100644 index 0000000000..31be52f761 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/mantissa_util.h @@ -0,0 +1,47 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <mcl/bit/bit_field.hpp> +#include <mcl/bitsizeof.hpp> +#include <mcl/stdint.hpp> + +namespace Dynarmic::FP { + +enum class ResidualError { + Zero, + LessThanHalf, + Half, + GreaterThanHalf, +}; + +inline ResidualError ResidualErrorOnRightShift(u64 mantissa, int shift_amount) { + if (shift_amount <= 0 || mantissa == 0) { + return ResidualError::Zero; + } + + if (shift_amount > static_cast<int>(mcl::bitsizeof<u64>)) { + return mcl::bit::most_significant_bit(mantissa) ? 
ResidualError::GreaterThanHalf : ResidualError::LessThanHalf; + } + + const size_t half_bit_position = static_cast<size_t>(shift_amount - 1); + const u64 half = static_cast<u64>(1) << half_bit_position; + const u64 error_mask = mcl::bit::ones<u64>(static_cast<size_t>(shift_amount)); + const u64 error = mantissa & error_mask; + + if (error == 0) { + return ResidualError::Zero; + } + if (error < half) { + return ResidualError::LessThanHalf; + } + if (error == half) { + return ResidualError::Half; + } + return ResidualError::GreaterThanHalf; +} + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/op.h b/externals/dynarmic/src/dynarmic/common/fp/op.h new file mode 100644 index 0000000000..b30ff39c44 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/op.h @@ -0,0 +1,17 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include "dynarmic/common/fp/op/FPCompare.h" +#include "dynarmic/common/fp/op/FPConvert.h" +#include "dynarmic/common/fp/op/FPMulAdd.h" +#include "dynarmic/common/fp/op/FPRSqrtEstimate.h" +#include "dynarmic/common/fp/op/FPRSqrtStepFused.h" +#include "dynarmic/common/fp/op/FPRecipEstimate.h" +#include "dynarmic/common/fp/op/FPRecipExponent.h" +#include "dynarmic/common/fp/op/FPRecipStepFused.h" +#include "dynarmic/common/fp/op/FPRoundInt.h" +#include "dynarmic/common/fp/op/FPToFixed.h" diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPCompare.cpp b/externals/dynarmic/src/dynarmic/common/fp/op/FPCompare.cpp new file mode 100644 index 0000000000..8aeaadc573 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPCompare.cpp @@ -0,0 +1,40 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2019 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/common/fp/op/FPCompare.h" + +#include "dynarmic/common/fp/fpcr.h" +#include "dynarmic/common/fp/fpsr.h" +#include "dynarmic/common/fp/process_exception.h" +#include "dynarmic/common/fp/unpacked.h" + +namespace Dynarmic::FP { + +template<typename FPT> +bool FPCompareEQ(FPT lhs, FPT rhs, FPCR fpcr, FPSR& fpsr) { + const auto unpacked1 = FPUnpack(lhs, fpcr, fpsr); + const auto unpacked2 = FPUnpack(rhs, fpcr, fpsr); + const auto type1 = std::get<FPType>(unpacked1); + const auto type2 = std::get<FPType>(unpacked2); + const auto& value1 = std::get<FPUnpacked>(unpacked1); + const auto& value2 = std::get<FPUnpacked>(unpacked2); + + if (type1 == FPType::QNaN || type1 == FPType::SNaN || type2 == FPType::QNaN || type2 == FPType::SNaN) { + if (type1 == FPType::SNaN || type2 == FPType::SNaN) { + FPProcessException(FPExc::InvalidOp, fpcr, fpsr); + } + + // Comparisons against NaN are never equal. + return false; + } + + return value1 == value2 || (type1 == FPType::Zero && type2 == FPType::Zero); +} + +template bool FPCompareEQ<u16>(u16 lhs, u16 rhs, FPCR fpcr, FPSR& fpsr); +template bool FPCompareEQ<u32>(u32 lhs, u32 rhs, FPCR fpcr, FPSR& fpsr); +template bool FPCompareEQ<u64>(u64 lhs, u64 rhs, FPCR fpcr, FPSR& fpsr); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPCompare.h b/externals/dynarmic/src/dynarmic/common/fp/op/FPCompare.h new file mode 100644 index 0000000000..e0d5ca0374 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPCompare.h @@ -0,0 +1,16 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2019 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +namespace Dynarmic::FP { + +class FPCR; +class FPSR; + +template<typename FPT> +bool FPCompareEQ(FPT lhs, FPT rhs, FPCR fpcr, FPSR& fpsr); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPConvert.cpp b/externals/dynarmic/src/dynarmic/common/fp/op/FPConvert.cpp new file mode 100644 index 0000000000..85f1eaf899 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPConvert.cpp @@ -0,0 +1,93 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2019 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/common/fp/op/FPConvert.h" + +#include <mcl/bit/bit_field.hpp> +#include <mcl/bitsizeof.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/common/fp/fpcr.h" +#include "dynarmic/common/fp/fpsr.h" +#include "dynarmic/common/fp/info.h" +#include "dynarmic/common/fp/process_exception.h" +#include "dynarmic/common/fp/unpacked.h" + +namespace Dynarmic::FP { +namespace { +template<typename FPT_TO, typename FPT_FROM> +FPT_TO FPConvertNaN(FPT_FROM op) { + const bool sign = mcl::bit::get_bit<mcl::bitsizeof<FPT_FROM> - 1>(op); + const u64 frac = [op] { + if constexpr (sizeof(FPT_FROM) == sizeof(u64)) { + return mcl::bit::get_bits<0, 50>(op); + } else if constexpr (sizeof(FPT_FROM) == sizeof(u32)) { + return u64{mcl::bit::get_bits<0, 21>(op)} << 29; + } else { + return u64{mcl::bit::get_bits<0, 8>(op)} << 42; + } + }(); + + const size_t dest_bit_size = mcl::bitsizeof<FPT_TO>; + const u64 shifted_sign = u64{sign} << (dest_bit_size - 1); + const u64 exponent = mcl::bit::ones<u64>(dest_bit_size - FPInfo<FPT_TO>::explicit_mantissa_width); + + if constexpr (sizeof(FPT_TO) == sizeof(u64)) { + return FPT_TO(shifted_sign | exponent << 51 | frac); + } else if constexpr (sizeof(FPT_TO) == sizeof(u32)) { + return FPT_TO(shifted_sign | exponent << 22 | mcl::bit::get_bits<29, 50>(frac)); + } else { + return FPT_TO(shifted_sign | exponent << 9 | mcl::bit::get_bits<42, 50>(frac)); + } +} +} // Anonymous namespace + +template<typename FPT_TO, typename FPT_FROM> +FPT_TO FPConvert(FPT_FROM op, FPCR fpcr, RoundingMode rounding_mode, FPSR& fpsr) { + const auto [type, sign, value] = FPUnpackCV<FPT_FROM>(op, fpcr, fpsr); + const bool is_althp = mcl::bitsizeof<FPT_TO> == 16 && fpcr.AHP(); + + if (type == FPType::SNaN || type == FPType::QNaN) { + std::uintmax_t result{}; + + if (is_althp) { + result = FPInfo<FPT_TO>::Zero(sign); + } else if (fpcr.DN()) { + result = FPInfo<FPT_TO>::DefaultNaN(); + } else { + result = FPConvertNaN<FPT_TO>(op); + } + + if (type == FPType::SNaN || is_althp) { + FPProcessException(FPExc::InvalidOp, fpcr, fpsr); + } + + return FPT_TO(result); + } + + if (type == FPType::Infinity) { + if (is_althp) { + FPProcessException(FPExc::InvalidOp, fpcr, fpsr); + return FPT_TO(u32{sign} << 15 | 0b111111111111111); + } + + return FPInfo<FPT_TO>::Infinity(sign); + } + + if (type == FPType::Zero) { + return FPInfo<FPT_TO>::Zero(sign); + } + + return FPRoundCV<FPT_TO>(value, fpcr, rounding_mode, fpsr); +} + +template u16 FPConvert<u16, u32>(u32 op, FPCR fpcr, RoundingMode rounding_mode, FPSR& fpsr); +template u16 FPConvert<u16, u64>(u64 op, FPCR fpcr, RoundingMode rounding_mode, FPSR& fpsr); +template u32 FPConvert<u32, u16>(u16 op, FPCR fpcr, RoundingMode rounding_mode, FPSR& fpsr); +template u32 FPConvert<u32, u64>(u64 op, FPCR fpcr, RoundingMode rounding_mode, FPSR& fpsr); +template u64 FPConvert<u64, u16>(u16 op, FPCR fpcr, RoundingMode 
rounding_mode, FPSR& fpsr); +template u64 FPConvert<u64, u32>(u32 op, FPCR fpcr, RoundingMode rounding_mode, FPSR& fpsr); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPConvert.h b/externals/dynarmic/src/dynarmic/common/fp/op/FPConvert.h new file mode 100644 index 0000000000..0fea22e482 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPConvert.h @@ -0,0 +1,17 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2019 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +namespace Dynarmic::FP { + +class FPCR; +class FPSR; +enum class RoundingMode; + +template<typename FPT_TO, typename FPT_FROM> +FPT_TO FPConvert(FPT_FROM op, FPCR fpcr, RoundingMode rounding_mode, FPSR& fpsr); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPMulAdd.cpp b/externals/dynarmic/src/dynarmic/common/fp/op/FPMulAdd.cpp new file mode 100644 index 0000000000..f97f88de17 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPMulAdd.cpp @@ -0,0 +1,90 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/common/fp/op/FPMulAdd.h" + +#include <mcl/stdint.hpp> + +#include "dynarmic/common/fp/fpcr.h" +#include "dynarmic/common/fp/fpsr.h" +#include "dynarmic/common/fp/fused.h" +#include "dynarmic/common/fp/info.h" +#include "dynarmic/common/fp/process_exception.h" +#include "dynarmic/common/fp/process_nan.h" +#include "dynarmic/common/fp/unpacked.h" + +namespace Dynarmic::FP { + +template<typename FPT> +FPT FPMulAdd(FPT addend, FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr) { + const RoundingMode rounding = fpcr.RMode(); + + const auto [typeA, signA, valueA] = FPUnpack(addend, fpcr, fpsr); + const auto [type1, sign1, value1] = FPUnpack(op1, fpcr, fpsr); + const auto [type2, sign2, value2] = FPUnpack(op2, fpcr, fpsr); + + const bool infA = typeA == FPType::Infinity; + const bool inf1 = type1 == FPType::Infinity; + const bool inf2 = type2 == FPType::Infinity; + const bool zeroA = typeA == FPType::Zero; + const bool zero1 = type1 == FPType::Zero; + const bool zero2 = type2 == FPType::Zero; + + const auto maybe_nan = FPProcessNaNs3<FPT>(typeA, type1, type2, addend, op1, op2, fpcr, fpsr); + + if (typeA == FPType::QNaN && ((inf1 && zero2) || (zero1 && inf2))) { + FPProcessException(FPExc::InvalidOp, fpcr, fpsr); + return FPInfo<FPT>::DefaultNaN(); + } + + if (maybe_nan) { + return *maybe_nan; + } + + // Calculate properties of product (op1 * op2). + const bool signP = sign1 != sign2; + const bool infP = inf1 || inf2; + const bool zeroP = zero1 || zero2; + + // Raise NaN on (inf * inf) of opposite signs or (inf * zero). 
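+    // That is: the product is 0 * inf, or the addend and the product are infinities of opposite sign (an inf - inf situation).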
+ if ((inf1 && zero2) || (zero1 && inf2) || (infA && infP && signA != signP)) { + FPProcessException(FPExc::InvalidOp, fpcr, fpsr); + return FPInfo<FPT>::DefaultNaN(); + } + + // Handle infinities + if ((infA && !signA) || (infP && !signP)) { + return FPInfo<FPT>::Infinity(false); + } + if ((infA && signA) || (infP && signP)) { + return FPInfo<FPT>::Infinity(true); + } + + // Result is exactly zero + if (zeroA && zeroP && signA == signP) { + return FPInfo<FPT>::Zero(signA); + } + + const FPUnpacked result_value = FusedMulAdd(valueA, value1, value2); + if (result_value.mantissa == 0) { + return FPInfo<FPT>::Zero(rounding == RoundingMode::TowardsMinusInfinity); + } + return FPRound<FPT>(result_value, fpcr, fpsr); +} + +template u16 FPMulAdd<u16>(u16 addend, u16 op1, u16 op2, FPCR fpcr, FPSR& fpsr); +template u32 FPMulAdd<u32>(u32 addend, u32 op1, u32 op2, FPCR fpcr, FPSR& fpsr); +template u64 FPMulAdd<u64>(u64 addend, u64 op1, u64 op2, FPCR fpcr, FPSR& fpsr); + +template<typename FPT> +FPT FPMulSub(FPT minuend, FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr) { + return FPMulAdd<FPT>(minuend, (op1 ^ FPInfo<FPT>::sign_mask), op2, fpcr, fpsr); +} + +template u16 FPMulSub<u16>(u16 minuend, u16 op1, u16 op2, FPCR fpcr, FPSR& fpsr); +template u32 FPMulSub<u32>(u32 minuend, u32 op1, u32 op2, FPCR fpcr, FPSR& fpsr); +template u64 FPMulSub<u64>(u64 minuend, u64 op1, u64 op2, FPCR fpcr, FPSR& fpsr); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPMulAdd.h b/externals/dynarmic/src/dynarmic/common/fp/op/FPMulAdd.h new file mode 100644 index 0000000000..8e0a16cda5 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPMulAdd.h @@ -0,0 +1,19 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +namespace Dynarmic::FP { + +class FPCR; +class FPSR; + +template<typename FPT> +FPT FPMulAdd(FPT addend, FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr); + +template<typename FPT> +FPT FPMulSub(FPT minuend, FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPNeg.h b/externals/dynarmic/src/dynarmic/common/fp/op/FPNeg.h new file mode 100644 index 0000000000..4f172a598e --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPNeg.h @@ -0,0 +1,17 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include "dynarmic/common/fp/info.h" + +namespace Dynarmic::FP { + +template<typename FPT> +constexpr FPT FPNeg(FPT op) { + return op ^ FPInfo<FPT>::sign_mask; +} + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtEstimate.cpp b/externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtEstimate.cpp new file mode 100644 index 0000000000..ed28bcc5cc --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtEstimate.cpp @@ -0,0 +1,59 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/common/fp/op/FPRSqrtEstimate.h" + +#include <mcl/stdint.hpp> + +#include "dynarmic/common/fp/fpcr.h" +#include "dynarmic/common/fp/fpsr.h" +#include "dynarmic/common/fp/info.h" +#include "dynarmic/common/fp/process_exception.h" +#include "dynarmic/common/fp/process_nan.h" +#include "dynarmic/common/fp/unpacked.h" +#include "dynarmic/common/math_util.h" +#include "dynarmic/common/safe_ops.h" + +namespace Dynarmic::FP { + +template<typename FPT> +FPT FPRSqrtEstimate(FPT op, FPCR fpcr, FPSR& fpsr) { + const auto [type, sign, value] = FPUnpack<FPT>(op, fpcr, fpsr); + + if (type == FPType::SNaN || type == FPType::QNaN) { + return FPProcessNaN(type, op, fpcr, fpsr); + } + + if (type == FPType::Zero) { + FPProcessException(FPExc::DivideByZero, fpcr, fpsr); + return FPInfo<FPT>::Infinity(sign); + } + + if (sign) { + FPProcessException(FPExc::InvalidOp, fpcr, fpsr); + return FPInfo<FPT>::DefaultNaN(); + } + + if (type == FPType::Infinity) { + // Note: Just +Inf reaches here, negatives are handled in the case above. + return FPInfo<FPT>::Zero(false); + } + + const int result_exponent = (-(value.exponent + 1)) >> 1; + const bool was_exponent_odd = (value.exponent) % 2 == 0; + + const u64 scaled = Safe::LogicalShiftRight(value.mantissa, normalized_point_position - (was_exponent_odd ? 7 : 8)); + const u64 estimate = Common::RecipSqrtEstimate(scaled); + + const FPT bits_exponent = static_cast<FPT>(result_exponent + FPInfo<FPT>::exponent_bias); + const FPT bits_mantissa = static_cast<FPT>(estimate << (FPInfo<FPT>::explicit_mantissa_width - 8)); + return (bits_exponent << FPInfo<FPT>::explicit_mantissa_width) | (bits_mantissa & FPInfo<FPT>::mantissa_mask); +} + +template u16 FPRSqrtEstimate<u16>(u16 op, FPCR fpcr, FPSR& fpsr); +template u32 FPRSqrtEstimate<u32>(u32 op, FPCR fpcr, FPSR& fpsr); +template u64 FPRSqrtEstimate<u64>(u64 op, FPCR fpcr, FPSR& fpsr); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtEstimate.h b/externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtEstimate.h new file mode 100644 index 0000000000..f51be1e8bc --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtEstimate.h @@ -0,0 +1,16 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +namespace Dynarmic::FP { + +class FPCR; +class FPSR; + +template<typename FPT> +FPT FPRSqrtEstimate(FPT op, FPCR fpcr, FPSR& fpsr); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtStepFused.cpp b/externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtStepFused.cpp new file mode 100644 index 0000000000..06fe96d44b --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtStepFused.cpp @@ -0,0 +1,57 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/common/fp/op/FPRSqrtStepFused.h" + +#include "dynarmic/common/fp/fpcr.h" +#include "dynarmic/common/fp/fpsr.h" +#include "dynarmic/common/fp/fused.h" +#include "dynarmic/common/fp/info.h" +#include "dynarmic/common/fp/op/FPNeg.h" +#include "dynarmic/common/fp/process_nan.h" +#include "dynarmic/common/fp/unpacked.h" + +namespace Dynarmic::FP { + +template<typename FPT> +FPT FPRSqrtStepFused(FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr) { + op1 = FPNeg(op1); + + const auto [type1, sign1, value1] = FPUnpack(op1, fpcr, fpsr); + const auto [type2, sign2, value2] = FPUnpack(op2, fpcr, fpsr); + + if (const auto maybe_nan = FPProcessNaNs(type1, type2, op1, op2, fpcr, fpsr)) { + return *maybe_nan; + } + + const bool inf1 = type1 == FPType::Infinity; + const bool inf2 = type2 == FPType::Infinity; + const bool zero1 = type1 == FPType::Zero; + const bool zero2 = type2 == FPType::Zero; + + if ((inf1 && zero2) || (zero1 && inf2)) { + // return +1.5 + return FPValue<FPT, false, -1, 3>(); + } + + if (inf1 || inf2) { + return FPInfo<FPT>::Infinity(sign1 != sign2); + } + + // result_value = (3.0 + (value1 * value2)) / 2.0 + FPUnpacked result_value = FusedMulAdd(ToNormalized(false, 0, 3), value1, value2); + result_value.exponent--; + + if (result_value.mantissa == 0) { + return FPInfo<FPT>::Zero(fpcr.RMode() == RoundingMode::TowardsMinusInfinity); + } + return FPRound<FPT>(result_value, fpcr, fpsr); +} + +template u16 FPRSqrtStepFused<u16>(u16 op1, u16 op2, FPCR fpcr, FPSR& fpsr); +template u32 FPRSqrtStepFused<u32>(u32 op1, u32 op2, FPCR fpcr, FPSR& fpsr); +template u64 FPRSqrtStepFused<u64>(u64 op1, u64 op2, FPCR fpcr, FPSR& fpsr); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtStepFused.h b/externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtStepFused.h new file mode 100644 index 0000000000..384a75924d --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPRSqrtStepFused.h @@ -0,0 +1,16 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +namespace Dynarmic::FP { + +class FPCR; +class FPSR; + +template<typename FPT> +FPT FPRSqrtStepFused(FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipEstimate.cpp b/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipEstimate.cpp new file mode 100644 index 0000000000..1fd4b924b2 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipEstimate.cpp @@ -0,0 +1,100 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/common/fp/op/FPRecipEstimate.h" + +#include <tuple> + +#include <mcl/assert.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/common/fp/fpcr.h" +#include "dynarmic/common/fp/fpsr.h" +#include "dynarmic/common/fp/info.h" +#include "dynarmic/common/fp/process_exception.h" +#include "dynarmic/common/fp/process_nan.h" +#include "dynarmic/common/fp/unpacked.h" +#include "dynarmic/common/math_util.h" + +namespace Dynarmic::FP { + +template<typename FPT> +FPT FPRecipEstimate(FPT op, FPCR fpcr, FPSR& fpsr) { + FPType type; + bool sign; + FPUnpacked value; + std::tie(type, sign, value) = FPUnpack<FPT>(op, fpcr, fpsr); + + if (type == FPType::SNaN || type == FPType::QNaN) { + return FPProcessNaN(type, op, fpcr, fpsr); + } + + if (type == FPType::Infinity) { + return FPInfo<FPT>::Zero(sign); + } + + if (type == FPType::Zero) { + FPProcessException(FPExc::DivideByZero, fpcr, fpsr); + return FPInfo<FPT>::Infinity(sign); + } + + if (value.exponent < FPInfo<FPT>::exponent_min - 2) { + const bool overflow_to_inf = [&] { + switch (fpcr.RMode()) { + case RoundingMode::ToNearest_TieEven: + return true; + case RoundingMode::TowardsPlusInfinity: + return !sign; + case RoundingMode::TowardsMinusInfinity: + return sign; + case RoundingMode::TowardsZero: + return false; + default: + UNREACHABLE(); + } + }(); + + FPProcessException(FPExc::Overflow, fpcr, fpsr); + FPProcessException(FPExc::Inexact, fpcr, fpsr); + return overflow_to_inf ? FPInfo<FPT>::Infinity(sign) : FPInfo<FPT>::MaxNormal(sign); + } + + if ((fpcr.FZ() && !std::is_same_v<FPT, u16>) || (fpcr.FZ16() && std::is_same_v<FPT, u16>)) { + if (value.exponent >= -FPInfo<FPT>::exponent_min) { + fpsr.UFC(true); + return FPInfo<FPT>::Zero(sign); + } + } + + const u64 scaled = value.mantissa >> (normalized_point_position - 8); + u64 estimate = static_cast<u64>(Common::RecipEstimate(scaled)) << (FPInfo<FPT>::explicit_mantissa_width - 8); + int result_exponent = -(value.exponent + 1); + if (result_exponent < FPInfo<FPT>::exponent_min) { + switch (result_exponent) { + case (FPInfo<FPT>::exponent_min - 1): + estimate |= FPInfo<FPT>::implicit_leading_bit; + estimate >>= 1; + break; + case (FPInfo<FPT>::exponent_min - 2): + estimate |= FPInfo<FPT>::implicit_leading_bit; + estimate >>= 2; + result_exponent++; + break; + default: + UNREACHABLE(); + } + } + + const FPT bits_sign = FPInfo<FPT>::Zero(sign); + const FPT bits_exponent = static_cast<FPT>(result_exponent + FPInfo<FPT>::exponent_bias); + const FPT bits_mantissa = static_cast<FPT>(estimate); + return FPT((bits_exponent << FPInfo<FPT>::explicit_mantissa_width) | (bits_mantissa & FPInfo<FPT>::mantissa_mask) | bits_sign); +} + +template u16 FPRecipEstimate<u16>(u16 op, FPCR fpcr, FPSR& fpsr); +template u32 FPRecipEstimate<u32>(u32 op, FPCR fpcr, FPSR& fpsr); +template u64 FPRecipEstimate<u64>(u64 op, FPCR fpcr, FPSR& fpsr); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipEstimate.h b/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipEstimate.h new file mode 100644 index 0000000000..eaf6fee5bf --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipEstimate.h @@ -0,0 +1,16 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +namespace Dynarmic::FP { + +class FPCR; +class FPSR; + +template<typename FPT> +FPT FPRecipEstimate(FPT op, FPCR fpcr, FPSR& fpsr); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipExponent.cpp b/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipExponent.cpp new file mode 100644 index 0000000000..171b94e96e --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipExponent.cpp @@ -0,0 +1,59 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/common/fp/op/FPRecipExponent.h" + +#include <mcl/bit/bit_field.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/common/fp/fpcr.h" +#include "dynarmic/common/fp/fpsr.h" +#include "dynarmic/common/fp/info.h" +#include "dynarmic/common/fp/process_nan.h" +#include "dynarmic/common/fp/unpacked.h" + +namespace Dynarmic::FP { +namespace { +template<typename FPT> +FPT DetermineExponentValue(size_t value) { + if constexpr (sizeof(FPT) == sizeof(u32)) { + return static_cast<FPT>(mcl::bit::get_bits<23, 30>(value)); + } else if constexpr (sizeof(FPT) == sizeof(u64)) { + return static_cast<FPT>(mcl::bit::get_bits<52, 62>(value)); + } else { + return static_cast<FPT>(mcl::bit::get_bits<10, 14>(value)); + } +} +} // Anonymous namespace + +template<typename FPT> +FPT FPRecipExponent(FPT op, FPCR fpcr, FPSR& fpsr) { + const auto [type, sign, value] = FPUnpack<FPT>(op, fpcr, fpsr); + (void)value; + + if (type == FPType::SNaN || type == FPType::QNaN) { + return FPProcessNaN(type, op, fpcr, fpsr); + } + + const FPT sign_bits = FPInfo<FPT>::Zero(sign); + const FPT exponent = DetermineExponentValue<FPT>(op); + + // Zero and denormals + if (exponent == 0) { + const FPT max_exponent = mcl::bit::ones<FPT>(FPInfo<FPT>::exponent_width) - 1; + return FPT(sign_bits | (max_exponent << FPInfo<FPT>::explicit_mantissa_width)); + } + + // Infinities and normals + const FPT negated_exponent = FPT(~exponent); + const FPT adjusted_exponent = FPT(negated_exponent << FPInfo<FPT>::explicit_mantissa_width) & FPInfo<FPT>::exponent_mask; + return FPT(sign_bits | adjusted_exponent); +} + +template u16 FPRecipExponent<u16>(u16 op, FPCR fpcr, FPSR& fpsr); +template u32 FPRecipExponent<u32>(u32 op, FPCR fpcr, FPSR& fpsr); +template u64 FPRecipExponent<u64>(u64 op, FPCR fpcr, FPSR& fpsr); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipExponent.h b/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipExponent.h new file mode 100644 index 0000000000..63ff3530b2 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipExponent.h @@ -0,0 +1,16 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2019 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +namespace Dynarmic::FP { + +class FPCR; +class FPSR; + +template<typename FPT> +FPT FPRecipExponent(FPT op, FPCR fpcr, FPSR& fpsr); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipStepFused.cpp b/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipStepFused.cpp new file mode 100644 index 0000000000..b01477feca --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipStepFused.cpp @@ -0,0 +1,56 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/common/fp/op/FPRecipStepFused.h" + +#include "dynarmic/common/fp/fpcr.h" +#include "dynarmic/common/fp/fpsr.h" +#include "dynarmic/common/fp/fused.h" +#include "dynarmic/common/fp/info.h" +#include "dynarmic/common/fp/op/FPNeg.h" +#include "dynarmic/common/fp/process_nan.h" +#include "dynarmic/common/fp/unpacked.h" + +namespace Dynarmic::FP { + +template<typename FPT> +FPT FPRecipStepFused(FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr) { + op1 = FPNeg(op1); + + const auto [type1, sign1, value1] = FPUnpack<FPT>(op1, fpcr, fpsr); + const auto [type2, sign2, value2] = FPUnpack<FPT>(op2, fpcr, fpsr); + + if (const auto maybe_nan = FPProcessNaNs(type1, type2, op1, op2, fpcr, fpsr)) { + return *maybe_nan; + } + + const bool inf1 = type1 == FPType::Infinity; + const bool inf2 = type2 == FPType::Infinity; + const bool zero1 = type1 == FPType::Zero; + const bool zero2 = type2 == FPType::Zero; + + if ((inf1 && zero2) || (zero1 && inf2)) { + // return +2.0 + return FPValue<FPT, false, 0, 2>(); + } + + if (inf1 || inf2) { + return FPInfo<FPT>::Infinity(sign1 != sign2); + } + + // result_value = 2.0 + (value1 * value2) + const FPUnpacked result_value = FusedMulAdd(ToNormalized(false, 0, 2), value1, value2); + + if (result_value.mantissa == 0) { + return FPInfo<FPT>::Zero(fpcr.RMode() == RoundingMode::TowardsMinusInfinity); + } + return FPRound<FPT>(result_value, fpcr, fpsr); +} + +template u16 FPRecipStepFused<u16>(u16 op1, u16 op2, FPCR fpcr, FPSR& fpsr); +template u32 FPRecipStepFused<u32>(u32 op1, u32 op2, FPCR fpcr, FPSR& fpsr); +template u64 FPRecipStepFused<u64>(u64 op1, u64 op2, FPCR fpcr, FPSR& fpsr); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipStepFused.h b/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipStepFused.h new file mode 100644 index 0000000000..abe447b500 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPRecipStepFused.h @@ -0,0 +1,16 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +namespace Dynarmic::FP { + +class FPCR; +class FPSR; + +template<typename FPT> +FPT FPRecipStepFused(FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPRoundInt.cpp b/externals/dynarmic/src/dynarmic/common/fp/op/FPRoundInt.cpp new file mode 100644 index 0000000000..7325909075 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPRoundInt.cpp @@ -0,0 +1,97 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/common/fp/op/FPRoundInt.h" + +#include <mcl/assert.hpp> +#include <mcl/bit/bit_field.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/common/fp/fpcr.h" +#include "dynarmic/common/fp/fpsr.h" +#include "dynarmic/common/fp/info.h" +#include "dynarmic/common/fp/mantissa_util.h" +#include "dynarmic/common/fp/process_exception.h" +#include "dynarmic/common/fp/process_nan.h" +#include "dynarmic/common/fp/rounding_mode.h" +#include "dynarmic/common/fp/unpacked.h" +#include "dynarmic/common/safe_ops.h" + +namespace Dynarmic::FP { + +template<typename FPT> +u64 FPRoundInt(FPT op, FPCR fpcr, RoundingMode rounding, bool exact, FPSR& fpsr) { + ASSERT(rounding != RoundingMode::ToOdd); + + auto [type, sign, value] = FPUnpack<FPT>(op, fpcr, fpsr); + + if (type == FPType::SNaN || type == FPType::QNaN) { + return FPProcessNaN(type, op, fpcr, fpsr); + } + + if (type == FPType::Infinity) { + return FPInfo<FPT>::Infinity(sign); + } + + if (type == FPType::Zero) { + return FPInfo<FPT>::Zero(sign); + } + + // Reshift decimal point back to bit zero. + const int exponent = value.exponent - normalized_point_position; + + if (exponent >= 0) { + // Guaranteed to be an integer + return op; + } + + u64 int_result = sign ? Safe::Negate<u64>(value.mantissa) : static_cast<u64>(value.mantissa); + const ResidualError error = ResidualErrorOnRightShift(int_result, -exponent); + int_result = Safe::ArithmeticShiftLeft(int_result, exponent); + + bool round_up = false; + switch (rounding) { + case RoundingMode::ToNearest_TieEven: + round_up = error > ResidualError::Half || (error == ResidualError::Half && mcl::bit::get_bit<0>(int_result)); + break; + case RoundingMode::TowardsPlusInfinity: + round_up = error != ResidualError::Zero; + break; + case RoundingMode::TowardsMinusInfinity: + round_up = false; + break; + case RoundingMode::TowardsZero: + round_up = error != ResidualError::Zero && mcl::bit::most_significant_bit(int_result); + break; + case RoundingMode::ToNearest_TieAwayFromZero: + round_up = error > ResidualError::Half || (error == ResidualError::Half && !mcl::bit::most_significant_bit(int_result)); + break; + case RoundingMode::ToOdd: + UNREACHABLE(); + } + + if (round_up) { + int_result++; + } + + const bool new_sign = mcl::bit::most_significant_bit(int_result); + const u64 abs_int_result = new_sign ? Safe::Negate<u64>(int_result) : static_cast<u64>(int_result); + + const FPT result = int_result == 0 + ? FPInfo<FPT>::Zero(sign) + : FPRound<FPT>(FPUnpacked{new_sign, normalized_point_position, abs_int_result}, fpcr, RoundingMode::TowardsZero, fpsr); + + if (error != ResidualError::Zero && exact) { + FPProcessException(FPExc::Inexact, fpcr, fpsr); + } + + return result; +} + +template u64 FPRoundInt<u16>(u16 op, FPCR fpcr, RoundingMode rounding, bool exact, FPSR& fpsr); +template u64 FPRoundInt<u32>(u32 op, FPCR fpcr, RoundingMode rounding, bool exact, FPSR& fpsr); +template u64 FPRoundInt<u64>(u64 op, FPCR fpcr, RoundingMode rounding, bool exact, FPSR& fpsr); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPRoundInt.h b/externals/dynarmic/src/dynarmic/common/fp/op/FPRoundInt.h new file mode 100644 index 0000000000..e326627ce1 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPRoundInt.h @@ -0,0 +1,19 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <mcl/stdint.hpp> + +namespace Dynarmic::FP { + +class FPCR; +class FPSR; +enum class RoundingMode; + +template<typename FPT> +u64 FPRoundInt(FPT op, FPCR fpcr, RoundingMode rounding, bool exact, FPSR& fpsr); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPToFixed.cpp b/externals/dynarmic/src/dynarmic/common/fp/op/FPToFixed.cpp new file mode 100644 index 0000000000..b1a57d3104 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPToFixed.cpp @@ -0,0 +1,105 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/common/fp/op/FPToFixed.h" + +#include <fmt/format.h> +#include <mcl/assert.hpp> +#include <mcl/bit/bit_count.hpp> +#include <mcl/bit/bit_field.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/common/fp/fpcr.h" +#include "dynarmic/common/fp/fpsr.h" +#include "dynarmic/common/fp/mantissa_util.h" +#include "dynarmic/common/fp/process_exception.h" +#include "dynarmic/common/fp/rounding_mode.h" +#include "dynarmic/common/fp/unpacked.h" +#include "dynarmic/common/safe_ops.h" + +namespace Dynarmic::FP { + +template<typename FPT> +u64 FPToFixed(size_t ibits, FPT op, size_t fbits, bool unsigned_, FPCR fpcr, RoundingMode rounding, FPSR& fpsr) { + ASSERT(rounding != RoundingMode::ToOdd); + ASSERT(ibits <= 64); + ASSERT(fbits <= ibits); + + auto [type, sign, value] = FPUnpack<FPT>(op, fpcr, fpsr); + + if (type == FPType::SNaN || type == FPType::QNaN) { + FPProcessException(FPExc::InvalidOp, fpcr, fpsr); + } + + // Handle zero + if (value.mantissa == 0) { + return 0; + } + + if (sign && unsigned_) { + FPProcessException(FPExc::InvalidOp, fpcr, fpsr); + return 0; + } + + // value *= 2.0^fbits and reshift the decimal point back to bit zero. + int exponent = value.exponent + static_cast<int>(fbits) - normalized_point_position; + + u64 int_result = sign ? Safe::Negate<u64>(value.mantissa) : static_cast<u64>(value.mantissa); + const ResidualError error = ResidualErrorOnRightShift(int_result, -exponent); + int_result = Safe::ArithmeticShiftLeft(int_result, exponent); + + bool round_up = false; + switch (rounding) { + case RoundingMode::ToNearest_TieEven: + round_up = error > ResidualError::Half || (error == ResidualError::Half && mcl::bit::get_bit<0>(int_result)); + break; + case RoundingMode::TowardsPlusInfinity: + round_up = error != ResidualError::Zero; + break; + case RoundingMode::TowardsMinusInfinity: + round_up = false; + break; + case RoundingMode::TowardsZero: + round_up = error != ResidualError::Zero && mcl::bit::most_significant_bit(int_result); + break; + case RoundingMode::ToNearest_TieAwayFromZero: + round_up = error > ResidualError::Half || (error == ResidualError::Half && !mcl::bit::most_significant_bit(int_result)); + break; + case RoundingMode::ToOdd: + UNREACHABLE(); + } + + if (round_up) { + int_result++; + } + + // Detect Overflow + const int min_exponent_for_overflow = static_cast<int>(ibits) - static_cast<int>(mcl::bit::highest_set_bit(value.mantissa + (round_up ? Safe::LogicalShiftRight<u64>(1, exponent) : 0))) - (unsigned_ ? 0 : 1); + if (exponent >= min_exponent_for_overflow) { + // Positive overflow + if (unsigned_ || !sign) { + FPProcessException(FPExc::InvalidOp, fpcr, fpsr); + return mcl::bit::ones<u64>(ibits - (unsigned_ ? 
0 : 1)); + } + + // Negative overflow + const u64 min_value = Safe::Negate<u64>(static_cast<u64>(1) << (ibits - 1)); + if (!(exponent == min_exponent_for_overflow && int_result == min_value)) { + FPProcessException(FPExc::InvalidOp, fpcr, fpsr); + return static_cast<u64>(1) << (ibits - 1); + } + } + + if (error != ResidualError::Zero) { + FPProcessException(FPExc::Inexact, fpcr, fpsr); + } + return int_result & mcl::bit::ones<u64>(ibits); +} + +template u64 FPToFixed<u16>(size_t ibits, u16 op, size_t fbits, bool unsigned_, FPCR fpcr, RoundingMode rounding, FPSR& fpsr); +template u64 FPToFixed<u32>(size_t ibits, u32 op, size_t fbits, bool unsigned_, FPCR fpcr, RoundingMode rounding, FPSR& fpsr); +template u64 FPToFixed<u64>(size_t ibits, u64 op, size_t fbits, bool unsigned_, FPCR fpcr, RoundingMode rounding, FPSR& fpsr); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/op/FPToFixed.h b/externals/dynarmic/src/dynarmic/common/fp/op/FPToFixed.h new file mode 100644 index 0000000000..53a952836e --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/op/FPToFixed.h @@ -0,0 +1,19 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <mcl/stdint.hpp> + +namespace Dynarmic::FP { + +class FPCR; +class FPSR; +enum class RoundingMode; + +template<typename FPT> +u64 FPToFixed(size_t ibits, FPT op, size_t fbits, bool unsigned_, FPCR fpcr, RoundingMode rounding, FPSR& fpsr); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/process_exception.cpp b/externals/dynarmic/src/dynarmic/common/fp/process_exception.cpp new file mode 100644 index 0000000000..a97e0ec820 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/process_exception.cpp @@ -0,0 +1,59 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/common/fp/process_exception.h" + +#include <mcl/assert.hpp> + +#include "dynarmic/common/fp/fpcr.h" +#include "dynarmic/common/fp/fpsr.h" + +namespace Dynarmic::FP { + +void FPProcessException(FPExc exception, FPCR fpcr, FPSR& fpsr) { + switch (exception) { + case FPExc::InvalidOp: + if (fpcr.IOE()) { + ASSERT_FALSE("Raising floating point exceptions unimplemented"); + } + fpsr.IOC(true); + break; + case FPExc::DivideByZero: + if (fpcr.DZE()) { + ASSERT_FALSE("Raising floating point exceptions unimplemented"); + } + fpsr.DZC(true); + break; + case FPExc::Overflow: + if (fpcr.OFE()) { + ASSERT_FALSE("Raising floating point exceptions unimplemented"); + } + fpsr.OFC(true); + break; + case FPExc::Underflow: + if (fpcr.UFE()) { + ASSERT_FALSE("Raising floating point exceptions unimplemented"); + } + fpsr.UFC(true); + break; + case FPExc::Inexact: + if (fpcr.IXE()) { + ASSERT_FALSE("Raising floating point exceptions unimplemented"); + } + fpsr.IXC(true); + break; + case FPExc::InputDenorm: + if (fpcr.IDE()) { + ASSERT_FALSE("Raising floating point exceptions unimplemented"); + } + fpsr.IDC(true); + break; + default: + UNREACHABLE(); + break; + } +} + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/process_exception.h b/externals/dynarmic/src/dynarmic/common/fp/process_exception.h new file mode 100644 index 0000000000..436a9daa28 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/process_exception.h @@ -0,0 +1,24 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +namespace Dynarmic::FP { + +class FPCR; +class FPSR; + +enum class FPExc { + InvalidOp, + DivideByZero, + Overflow, + Underflow, + Inexact, + InputDenorm, +}; + +void FPProcessException(FPExc exception, FPCR fpcr, FPSR& fpsr); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/process_nan.cpp b/externals/dynarmic/src/dynarmic/common/fp/process_nan.cpp new file mode 100644 index 0000000000..516b92adb6 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/process_nan.cpp @@ -0,0 +1,93 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/common/fp/process_nan.h" + +#include <optional> + +#include <mcl/assert.hpp> +#include <mcl/bit/bit_field.hpp> + +#include "dynarmic/common/fp/fpcr.h" +#include "dynarmic/common/fp/fpsr.h" +#include "dynarmic/common/fp/info.h" +#include "dynarmic/common/fp/process_exception.h" +#include "dynarmic/common/fp/unpacked.h" + +namespace Dynarmic::FP { + +template<typename FPT> +FPT FPProcessNaN(FPType type, FPT op, FPCR fpcr, FPSR& fpsr) { + ASSERT(type == FPType::QNaN || type == FPType::SNaN); + + constexpr size_t topfrac = FPInfo<FPT>::explicit_mantissa_width - 1; + + FPT result = op; + + if (type == FPType::SNaN) { + result = mcl::bit::set_bit<topfrac>(op, true); + FPProcessException(FPExc::InvalidOp, fpcr, fpsr); + } + + if (fpcr.DN()) { + result = FPInfo<FPT>::DefaultNaN(); + } + + return result; +} + +template u16 FPProcessNaN<u16>(FPType type, u16 op, FPCR fpcr, FPSR& fpsr); +template u32 FPProcessNaN<u32>(FPType type, u32 op, FPCR fpcr, FPSR& fpsr); +template u64 FPProcessNaN<u64>(FPType type, u64 op, FPCR fpcr, FPSR& fpsr); + +template<typename FPT> +std::optional<FPT> FPProcessNaNs(FPType type1, FPType type2, FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr) { + if (type1 == FPType::SNaN) { + return FPProcessNaN<FPT>(type1, op1, fpcr, fpsr); + } + if (type2 == FPType::SNaN) { + return FPProcessNaN<FPT>(type2, op2, fpcr, fpsr); + } + if (type1 == FPType::QNaN) { + return FPProcessNaN<FPT>(type1, op1, fpcr, fpsr); + } + if (type2 == FPType::QNaN) { + return FPProcessNaN<FPT>(type2, op2, fpcr, fpsr); + } + return std::nullopt; +} + +template std::optional<u16> FPProcessNaNs<u16>(FPType type1, FPType type2, u16 op1, u16 op2, FPCR fpcr, FPSR& fpsr); +template std::optional<u32> FPProcessNaNs<u32>(FPType type1, FPType type2, u32 op1, u32 op2, FPCR fpcr, FPSR& fpsr); +template std::optional<u64> FPProcessNaNs<u64>(FPType type1, FPType type2, u64 op1, u64 op2, FPCR fpcr, FPSR& fpsr); + +template<typename FPT> +std::optional<FPT> FPProcessNaNs3(FPType type1, FPType type2, FPType type3, FPT op1, FPT op2, FPT op3, FPCR fpcr, FPSR& fpsr) { + if (type1 == FPType::SNaN) { + return FPProcessNaN<FPT>(type1, op1, fpcr, fpsr); + } + if (type2 == FPType::SNaN) { + return FPProcessNaN<FPT>(type2, op2, fpcr, fpsr); + } + if (type3 == FPType::SNaN) { + return FPProcessNaN<FPT>(type3, op3, fpcr, fpsr); + } + if (type1 == FPType::QNaN) { + return FPProcessNaN<FPT>(type1, op1, fpcr, fpsr); + } + if (type2 == FPType::QNaN) { + return FPProcessNaN<FPT>(type2, op2, fpcr, fpsr); + } + if (type3 == FPType::QNaN) { + return FPProcessNaN<FPT>(type3, op3, fpcr, fpsr); + } + return std::nullopt; +} + +template std::optional<u16> FPProcessNaNs3<u16>(FPType type1, FPType type2, FPType type3, u16 op1, u16 op2, u16 op3, FPCR fpcr, FPSR& fpsr); +template std::optional<u32> 
FPProcessNaNs3<u32>(FPType type1, FPType type2, FPType type3, u32 op1, u32 op2, u32 op3, FPCR fpcr, FPSR& fpsr); +template std::optional<u64> FPProcessNaNs3<u64>(FPType type1, FPType type2, FPType type3, u64 op1, u64 op2, u64 op3, FPCR fpcr, FPSR& fpsr); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/process_nan.h b/externals/dynarmic/src/dynarmic/common/fp/process_nan.h new file mode 100644 index 0000000000..331b7943fc --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/process_nan.h @@ -0,0 +1,25 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <optional> + +namespace Dynarmic::FP { + +class FPCR; +class FPSR; +enum class FPType; + +template<typename FPT> +FPT FPProcessNaN(FPType type, FPT op, FPCR fpcr, FPSR& fpsr); + +template<typename FPT> +std::optional<FPT> FPProcessNaNs(FPType type1, FPType type2, FPT op1, FPT op2, FPCR fpcr, FPSR& fpsr); + +template<typename FPT> +std::optional<FPT> FPProcessNaNs3(FPType type1, FPType type2, FPType type3, FPT op1, FPT op2, FPT op3, FPCR fpcr, FPSR& fpsr); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/rounding_mode.h b/externals/dynarmic/src/dynarmic/common/fp/rounding_mode.h new file mode 100644 index 0000000000..fcf36f6100 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/rounding_mode.h @@ -0,0 +1,27 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +namespace Dynarmic::FP { + +/// Ordering of first four values is important as they correspond to bits in FPCR. +enum class RoundingMode { + /// Round to nearest floating point. If there is a tie, round to nearest even digit in required position. + ToNearest_TieEven, + /// Round up towards positive infinity. + TowardsPlusInfinity, + /// Round downwards towards negative infinity. + TowardsMinusInfinity, + /// Truncate towards zero. + TowardsZero, + /// Round to nearest floating point. If there is a tie, round away from zero. + ToNearest_TieAwayFromZero, + /// Von Neumann rounding (as modified by Brent). Also known as sticky rounding. + /// Set the least significant bit to 1 if the result is not exact. + ToOdd, +}; + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/unpacked.cpp b/externals/dynarmic/src/dynarmic/common/fp/unpacked.cpp new file mode 100644 index 0000000000..f853ab07e1 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/unpacked.cpp @@ -0,0 +1,197 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/common/fp/unpacked.h" + +#include <algorithm> + +#include <mcl/bit/bit_count.hpp> +#include <mcl/bit/bit_field.hpp> + +#include "dynarmic/common/fp/fpsr.h" +#include "dynarmic/common/fp/info.h" +#include "dynarmic/common/fp/mantissa_util.h" +#include "dynarmic/common/fp/process_exception.h" +#include "dynarmic/common/fp/rounding_mode.h" +#include "dynarmic/common/safe_ops.h" + +namespace Dynarmic::FP { + +template<typename FPT> +std::tuple<FPType, bool, FPUnpacked> FPUnpackBase(FPT op, FPCR fpcr, [[maybe_unused]] FPSR& fpsr) { + constexpr size_t sign_bit = FPInfo<FPT>::exponent_width + FPInfo<FPT>::explicit_mantissa_width; + constexpr size_t exponent_high_bit = FPInfo<FPT>::exponent_width + FPInfo<FPT>::explicit_mantissa_width - 1; + constexpr size_t exponent_low_bit = FPInfo<FPT>::explicit_mantissa_width; + constexpr size_t mantissa_high_bit = FPInfo<FPT>::explicit_mantissa_width - 1; + constexpr size_t mantissa_low_bit = 0; + constexpr int denormal_exponent = FPInfo<FPT>::exponent_min - int(FPInfo<FPT>::explicit_mantissa_width); + + constexpr bool is_half_precision = std::is_same_v<FPT, u16>; + const bool sign = mcl::bit::get_bit<sign_bit>(op); + const FPT exp_raw = mcl::bit::get_bits<exponent_low_bit, exponent_high_bit>(op); + const FPT frac_raw = mcl::bit::get_bits<mantissa_low_bit, mantissa_high_bit>(op); + + if (exp_raw == 0) { + if constexpr (is_half_precision) { + if (frac_raw == 0 || fpcr.FZ16()) { + return {FPType::Zero, sign, {sign, 0, 0}}; + } + return {FPType::Nonzero, sign, ToNormalized(sign, denormal_exponent, frac_raw)}; + } else { + if (frac_raw == 0 || fpcr.FZ()) { + if (frac_raw != 0) { + FPProcessException(FPExc::InputDenorm, fpcr, fpsr); + } + return {FPType::Zero, sign, {sign, 0, 0}}; + } + + return {FPType::Nonzero, sign, ToNormalized(sign, denormal_exponent, frac_raw)}; + } + } + + const bool exp_all_ones = exp_raw == mcl::bit::ones<FPT>(FPInfo<FPT>::exponent_width); + const bool ahp_disabled = is_half_precision && !fpcr.AHP(); + if ((exp_all_ones && !is_half_precision) || (exp_all_ones && ahp_disabled)) { + if (frac_raw == 0) { + return {FPType::Infinity, sign, ToNormalized(sign, 1000000, 1)}; + } + + const bool is_quiet = mcl::bit::get_bit<mantissa_high_bit>(frac_raw); + return {is_quiet ? 
FPType::QNaN : FPType::SNaN, sign, {sign, 0, 0}}; + } + + const int exp = static_cast<int>(exp_raw) - FPInfo<FPT>::exponent_bias; + const u64 frac = static_cast<u64>(frac_raw | FPInfo<FPT>::implicit_leading_bit) << (normalized_point_position - FPInfo<FPT>::explicit_mantissa_width); + return {FPType::Nonzero, sign, {sign, exp, frac}}; +} + +template std::tuple<FPType, bool, FPUnpacked> FPUnpackBase<u16>(u16 op, FPCR fpcr, FPSR& fpsr); +template std::tuple<FPType, bool, FPUnpacked> FPUnpackBase<u32>(u32 op, FPCR fpcr, FPSR& fpsr); +template std::tuple<FPType, bool, FPUnpacked> FPUnpackBase<u64>(u64 op, FPCR fpcr, FPSR& fpsr); + +template<size_t F> +std::tuple<bool, int, u64, ResidualError> Normalize(FPUnpacked op, int extra_right_shift = 0) { + const int highest_set_bit = mcl::bit::highest_set_bit(op.mantissa); + const int shift_amount = highest_set_bit - static_cast<int>(F) + extra_right_shift; + const u64 mantissa = Safe::LogicalShiftRight(op.mantissa, shift_amount); + const ResidualError error = ResidualErrorOnRightShift(op.mantissa, shift_amount); + const int exponent = op.exponent + highest_set_bit - normalized_point_position; + return std::make_tuple(op.sign, exponent, mantissa, error); +} + +template<typename FPT> +FPT FPRoundBase(FPUnpacked op, FPCR fpcr, RoundingMode rounding, FPSR& fpsr) { + ASSERT(op.mantissa != 0); + ASSERT(rounding != RoundingMode::ToNearest_TieAwayFromZero); + + constexpr int minimum_exp = FPInfo<FPT>::exponent_min; + constexpr size_t E = FPInfo<FPT>::exponent_width; + constexpr size_t F = FPInfo<FPT>::explicit_mantissa_width; + constexpr bool isFP16 = FPInfo<FPT>::total_width == 16; + + auto [sign, exponent, mantissa, error] = Normalize<F>(op); + + if (((!isFP16 && fpcr.FZ()) || (isFP16 && fpcr.FZ16())) && exponent < minimum_exp) { + fpsr.UFC(true); + return FPInfo<FPT>::Zero(sign); + } + + int biased_exp = std::max<int>(exponent - minimum_exp + 1, 0); + if (biased_exp == 0) { + std::tie(sign, exponent, mantissa, error) = Normalize<F>(op, minimum_exp - exponent); + } + + if (biased_exp == 0 && (error != ResidualError::Zero || fpcr.UFE())) { + FPProcessException(FPExc::Underflow, fpcr, fpsr); + } + + bool round_up = false, overflow_to_inf = false; + switch (rounding) { + case RoundingMode::ToNearest_TieEven: { + round_up = (error > ResidualError::Half) || (error == ResidualError::Half && mcl::bit::get_bit<0>(mantissa)); + overflow_to_inf = true; + break; + } + case RoundingMode::TowardsPlusInfinity: + round_up = error != ResidualError::Zero && !sign; + overflow_to_inf = !sign; + break; + case RoundingMode::TowardsMinusInfinity: + round_up = error != ResidualError::Zero && sign; + overflow_to_inf = sign; + break; + default: + break; + } + + if (round_up) { + if ((mantissa & FPInfo<FPT>::mantissa_mask) == FPInfo<FPT>::mantissa_mask) { + // Overflow on rounding up is going to happen + if (mantissa == FPInfo<FPT>::mantissa_mask) { + // Rounding up from denormal to normal + mantissa++; + biased_exp++; + } else { + // Rounding up to next exponent + mantissa = (mantissa + 1) / 2; + biased_exp++; + } + } else { + mantissa++; + } + } + + if (error != ResidualError::Zero && rounding == RoundingMode::ToOdd) { + mantissa = mcl::bit::set_bit<0>(mantissa, true); + } + + FPT result = 0; +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable : 4127) // C4127: conditional expression is constant +#endif + if (!isFP16 || !fpcr.AHP()) { +#ifdef _MSC_VER +# pragma warning(pop) +#endif + constexpr int max_biased_exp = (1 << E) - 1; + if (biased_exp >= max_biased_exp) { + 
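+            // The biased exponent has saturated: depending on the rounding direction chosen above,
+            // the result either overflows to infinity or is clamped to the largest normal value.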
result = overflow_to_inf ? FPInfo<FPT>::Infinity(sign) : FPInfo<FPT>::MaxNormal(sign); + FPProcessException(FPExc::Overflow, fpcr, fpsr); + FPProcessException(FPExc::Inexact, fpcr, fpsr); + } else { + result = sign ? 1 : 0; + result <<= E; + result += FPT(biased_exp); + result <<= F; + result |= static_cast<FPT>(mantissa) & FPInfo<FPT>::mantissa_mask; + if (error != ResidualError::Zero) { + FPProcessException(FPExc::Inexact, fpcr, fpsr); + } + } + } else { + constexpr int max_biased_exp = (1 << E); + if (biased_exp >= max_biased_exp) { + result = sign ? 0xFFFF : 0x7FFF; + FPProcessException(FPExc::InvalidOp, fpcr, fpsr); + } else { + result = sign ? 1 : 0; + result <<= E; + result += FPT(biased_exp); + result <<= F; + result |= static_cast<FPT>(mantissa) & FPInfo<FPT>::mantissa_mask; + if (error != ResidualError::Zero) { + FPProcessException(FPExc::Inexact, fpcr, fpsr); + } + } + } + return result; +} + +template u16 FPRoundBase<u16>(FPUnpacked op, FPCR fpcr, RoundingMode rounding, FPSR& fpsr); +template u32 FPRoundBase<u32>(FPUnpacked op, FPCR fpcr, RoundingMode rounding, FPSR& fpsr); +template u64 FPRoundBase<u64>(FPUnpacked op, FPCR fpcr, RoundingMode rounding, FPSR& fpsr); + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/unpacked.h b/externals/dynarmic/src/dynarmic/common/fp/unpacked.h new file mode 100644 index 0000000000..77f33d8966 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/unpacked.h @@ -0,0 +1,90 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <tuple> + +#include <mcl/bit/bit_count.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/common/fp/fpcr.h" + +namespace Dynarmic::FP { + +class FPSR; +enum class RoundingMode; + +enum class FPType { + Nonzero, + Zero, + Infinity, + QNaN, + SNaN, +}; + +constexpr size_t normalized_point_position = 62; + +/// value = (sign ? -1 : +1) * mantissa/(2^62) * 2^exponent +/// 63rd bit of mantissa is always set (unless value is zero) +struct FPUnpacked { + bool sign; + int exponent; + u64 mantissa; +}; + +inline bool operator==(const FPUnpacked& a, const FPUnpacked& b) { + return std::tie(a.sign, a.exponent, a.mantissa) == std::tie(b.sign, b.exponent, b.mantissa); +} + +/// return value = (sign ? 
-1 : +1) * value * 2^exponent +constexpr FPUnpacked ToNormalized(bool sign, int exponent, u64 value) { + if (value == 0) { + return {sign, 0, 0}; + } + + const int highest_bit = mcl::bit::highest_set_bit(value); + const int offset = static_cast<int>(normalized_point_position) - highest_bit; + value <<= offset; + exponent -= offset - static_cast<int>(normalized_point_position); + return {sign, exponent, value}; +} + +template<typename FPT> +std::tuple<FPType, bool, FPUnpacked> FPUnpackBase(FPT op, FPCR fpcr, FPSR& fpsr); + +template<typename FPT> +std::tuple<FPType, bool, FPUnpacked> FPUnpack(FPT op, FPCR fpcr, FPSR& fpsr) { + fpcr.AHP(false); + return FPUnpackBase(op, fpcr, fpsr); +} + +template<typename FPT> +std::tuple<FPType, bool, FPUnpacked> FPUnpackCV(FPT op, FPCR fpcr, FPSR& fpsr) { + fpcr.FZ16(false); + return FPUnpackBase(op, fpcr, fpsr); +} + +template<typename FPT> +FPT FPRoundBase(FPUnpacked op, FPCR fpcr, RoundingMode rounding, FPSR& fpsr); + +template<typename FPT> +FPT FPRound(FPUnpacked op, FPCR fpcr, RoundingMode rounding, FPSR& fpsr) { + fpcr.AHP(false); + return FPRoundBase<FPT>(op, fpcr, rounding, fpsr); +} + +template<typename FPT> +FPT FPRoundCV(FPUnpacked op, FPCR fpcr, RoundingMode rounding, FPSR& fpsr) { + fpcr.FZ16(false); + return FPRoundBase<FPT>(op, fpcr, rounding, fpsr); +} + +template<typename FPT> +FPT FPRound(FPUnpacked op, FPCR fpcr, FPSR& fpsr) { + return FPRound<FPT>(op, fpcr, fpcr.RMode(), fpsr); +} + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/fp/util.h b/externals/dynarmic/src/dynarmic/common/fp/util.h new file mode 100644 index 0000000000..fda34e3ee5 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/fp/util.h @@ -0,0 +1,99 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <optional> + +#include "dynarmic/common/fp/fpcr.h" +#include "dynarmic/common/fp/info.h" + +namespace Dynarmic::FP { + +/// Is floating point value a zero? +template<typename FPT> +inline bool IsZero(FPT value, FPCR fpcr) { + if (fpcr.FZ()) { + return (value & FPInfo<FPT>::exponent_mask) == 0; + } + return (value & ~FPInfo<FPT>::sign_mask) == 0; +} + +/// Is floating point value an infinity? +template<typename FPT> +constexpr bool IsInf(FPT value) { + return (value & ~FPInfo<FPT>::sign_mask) == FPInfo<FPT>::Infinity(false); +} + +/// Is floating point value a QNaN? +template<typename FPT> +constexpr bool IsQNaN(FPT value) { + constexpr FPT qnan_bits = FPInfo<FPT>::exponent_mask | FPInfo<FPT>::mantissa_msb; + return (value & qnan_bits) == qnan_bits; +} + +/// Is floating point value a SNaN? +template<typename FPT> +constexpr bool IsSNaN(FPT value) { + constexpr FPT qnan_bits = FPInfo<FPT>::exponent_mask | FPInfo<FPT>::mantissa_msb; + constexpr FPT snan_bits = FPInfo<FPT>::exponent_mask; + return (value & qnan_bits) == snan_bits && (value & FPInfo<FPT>::mantissa_mask) != 0; +} + +/// Is floating point value a NaN? +template<typename FPT> +constexpr bool IsNaN(FPT value) { + return IsQNaN(value) || IsSNaN(value); +} + +/// Given a single argument, return the NaN value which would be returned by an ARM processor. +/// If the argument isn't a NaN, returns std::nullopt. 
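+/// Illustrative example (assuming the IEEE-754 binary32 encoding for u32): the
+/// signalling NaN 0x7FA00000 is quieted to 0x7FE00000 by setting the most
+/// significant mantissa bit, while a quiet NaN such as 0x7FC00000 is returned
+/// unchanged and any non-NaN input yields std::nullopt.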
+template<typename FPT> +constexpr std::optional<FPT> ProcessNaNs(FPT a) { + if (IsSNaN(a)) { + return a | FPInfo<FPT>::mantissa_msb; + } else if (IsQNaN(a)) { + return a; + } + return std::nullopt; +} + +/// Given a pair of arguments, return the NaN value which would be returned by an ARM processor. +/// If neither argument is a NaN, returns std::nullopt. +template<typename FPT> +constexpr std::optional<FPT> ProcessNaNs(FPT a, FPT b) { + if (IsSNaN(a)) { + return a | FPInfo<FPT>::mantissa_msb; + } else if (IsSNaN(b)) { + return b | FPInfo<FPT>::mantissa_msb; + } else if (IsQNaN(a)) { + return a; + } else if (IsQNaN(b)) { + return b; + } + return std::nullopt; +} + +/// Given three arguments, return the NaN value which would be returned by an ARM processor. +/// If none of the arguments is a NaN, returns std::nullopt. +template<typename FPT> +constexpr std::optional<FPT> ProcessNaNs(FPT a, FPT b, FPT c) { + if (IsSNaN(a)) { + return a | FPInfo<FPT>::mantissa_msb; + } else if (IsSNaN(b)) { + return b | FPInfo<FPT>::mantissa_msb; + } else if (IsSNaN(c)) { + return c | FPInfo<FPT>::mantissa_msb; + } else if (IsQNaN(a)) { + return a; + } else if (IsQNaN(b)) { + return b; + } else if (IsQNaN(c)) { + return c; + } + return std::nullopt; +} + +} // namespace Dynarmic::FP diff --git a/externals/dynarmic/src/dynarmic/common/llvm_disassemble.cpp b/externals/dynarmic/src/dynarmic/common/llvm_disassemble.cpp new file mode 100644 index 0000000000..636ee3a250 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/llvm_disassemble.cpp @@ -0,0 +1,128 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <string> + +#include <fmt/format.h> + +#ifdef DYNARMIC_USE_LLVM +# include <llvm-c/Disassembler.h> +# include <llvm-c/Target.h> +#endif + +#include <mcl/assert.hpp> +#include <mcl/bit_cast.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/common/llvm_disassemble.h" + +namespace Dynarmic::Common { + +std::string DisassembleX64(const void* begin, const void* end) { + std::string result; + +#ifdef DYNARMIC_USE_LLVM + LLVMInitializeX86TargetInfo(); + LLVMInitializeX86TargetMC(); + LLVMInitializeX86Disassembler(); + LLVMDisasmContextRef llvm_ctx = LLVMCreateDisasm("x86_64", nullptr, 0, nullptr, nullptr); + LLVMSetDisasmOptions(llvm_ctx, LLVMDisassembler_Option_AsmPrinterVariant); + + const u8* pos = reinterpret_cast<const u8*>(begin); + size_t remaining = reinterpret_cast<size_t>(end) - reinterpret_cast<size_t>(pos); + while (pos < end) { + char buffer[80]; + size_t inst_size = LLVMDisasmInstruction(llvm_ctx, const_cast<u8*>(pos), remaining, reinterpret_cast<u64>(pos), buffer, sizeof(buffer)); + ASSERT(inst_size); + for (const u8* i = pos; i < pos + inst_size; i++) + result += fmt::format("{:02x} ", *i); + for (size_t i = inst_size; i < 10; i++) + result += " "; + result += buffer; + result += '\n'; + + pos += inst_size; + remaining -= inst_size; + } + + LLVMDisasmDispose(llvm_ctx); +#else + result += fmt::format("(recompile with DYNARMIC_USE_LLVM=ON to disassemble the generated x86_64 code)\n"); + result += fmt::format("start: {:016x}, end: {:016x}\n", mcl::bit_cast<u64>(begin), mcl::bit_cast<u64>(end)); +#endif + + return result; +} + +std::string DisassembleAArch32([[maybe_unused]] bool is_thumb, [[maybe_unused]] u32 pc, [[maybe_unused]] const u8* instructions, [[maybe_unused]] size_t length) { + std::string result; + +#ifdef DYNARMIC_USE_LLVM + LLVMInitializeARMTargetInfo(); + LLVMInitializeARMTargetMC(); + 
LLVMInitializeARMDisassembler(); + LLVMDisasmContextRef llvm_ctx = LLVMCreateDisasm(is_thumb ? "thumbv8-arm" : "armv8-arm", nullptr, 0, nullptr, nullptr); + LLVMSetDisasmOptions(llvm_ctx, LLVMDisassembler_Option_AsmPrinterVariant); + + char buffer[1024]; + while (length) { + size_t inst_size = LLVMDisasmInstruction(llvm_ctx, const_cast<u8*>(instructions), length, pc, buffer, sizeof(buffer)); + const char* const disassembled = inst_size > 0 ? buffer : "<invalid instruction>"; + + if (inst_size == 0) + inst_size = is_thumb ? 2 : 4; + + result += fmt::format("{:08x} ", pc); + for (size_t i = 0; i < 4; i++) { + if (i < inst_size) { + result += fmt::format("{:02x}", instructions[inst_size - i - 1]); + } else { + result += " "; + } + } + result += disassembled; + result += '\n'; + + if (length <= inst_size) + break; + + pc += inst_size; + instructions += inst_size; + length -= inst_size; + } + + LLVMDisasmDispose(llvm_ctx); +#else + result += fmt::format("(disassembly disabled)\n"); +#endif + + return result; +} + +std::string DisassembleAArch64([[maybe_unused]] u32 instruction, [[maybe_unused]] u64 pc) { + std::string result; + +#ifdef DYNARMIC_USE_LLVM + LLVMInitializeAArch64TargetInfo(); + LLVMInitializeAArch64TargetMC(); + LLVMInitializeAArch64Disassembler(); + LLVMDisasmContextRef llvm_ctx = LLVMCreateDisasm("aarch64", nullptr, 0, nullptr, nullptr); + LLVMSetDisasmOptions(llvm_ctx, LLVMDisassembler_Option_AsmPrinterVariant); + + char buffer[80]; + size_t inst_size = LLVMDisasmInstruction(llvm_ctx, (u8*)&instruction, sizeof(instruction), pc, buffer, sizeof(buffer)); + result = fmt::format("{:016x} {:08x} ", pc, instruction); + result += inst_size > 0 ? buffer : "<invalid instruction>"; + result += '\n'; + + LLVMDisasmDispose(llvm_ctx); +#else + result += fmt::format("(disassembly disabled)\n"); +#endif + + return result; +} + +} // namespace Dynarmic::Common diff --git a/externals/dynarmic/src/dynarmic/common/llvm_disassemble.h b/externals/dynarmic/src/dynarmic/common/llvm_disassemble.h new file mode 100644 index 0000000000..56de791a56 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/llvm_disassemble.h @@ -0,0 +1,18 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <string> + +#include <mcl/stdint.hpp> + +namespace Dynarmic::Common { + +std::string DisassembleX64(const void* pos, const void* end); +std::string DisassembleAArch32(bool is_thumb, u32 pc, const u8* instructions, size_t length); +std::string DisassembleAArch64(u32 instruction, u64 pc = 0); + +} // namespace Dynarmic::Common diff --git a/externals/dynarmic/src/dynarmic/common/lut_from_list.h b/externals/dynarmic/src/dynarmic/common/lut_from_list.h new file mode 100644 index 0000000000..ed9e3dc046 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/lut_from_list.h @@ -0,0 +1,37 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <initializer_list> +#include <map> +#include <type_traits> + +#include <mcl/mp/metafunction/apply.hpp> +#include <mcl/mp/typelist/list.hpp> +#include <mcl/type_traits/is_instance_of_template.hpp> + +#ifdef _MSC_VER +# include <mcl/mp/typelist/head.hpp> +#endif + +namespace Dynarmic::Common { + +template<typename Function, typename... 
Values> +inline auto GenerateLookupTableFromList(Function f, mcl::mp::list<Values...>) { +#ifdef _MSC_VER + using PairT = std::invoke_result_t<Function, mcl::mp::head<mcl::mp::list<Values...>>>; +#else + using PairT = std::common_type_t<std::invoke_result_t<Function, Values>...>; +#endif + using MapT = mcl::mp::apply<std::map, PairT>; + + static_assert(mcl::is_instance_of_template_v<std::pair, PairT>); + + const std::initializer_list<PairT> pair_array{f(Values{})...}; + return MapT(pair_array.begin(), pair_array.end()); +} + +} // namespace Dynarmic::Common diff --git a/externals/dynarmic/src/dynarmic/common/math_util.cpp b/externals/dynarmic/src/dynarmic/common/math_util.cpp new file mode 100644 index 0000000000..32211fd979 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/math_util.cpp @@ -0,0 +1,68 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/common/math_util.h" + +#include <array> + +namespace Dynarmic::Common { + +u8 RecipEstimate(u64 a) { + using LUT = std::array<u8, 256>; + static constexpr u64 lut_offset = 256; + + static const LUT lut = [] { + LUT result{}; + for (u64 i = 0; i < result.size(); i++) { + u64 a = i + lut_offset; + + a = a * 2 + 1; + u64 b = (1u << 19) / a; + result[i] = static_cast<u8>((b + 1) / 2); + } + return result; + }(); + + return lut[a - lut_offset]; +} + +/// Input is a u0.9 fixed point number. Only values in [0.25, 1.0) are valid. +/// Output is a u0.8 fixed point number, with an implied 1 prefixed. +/// i.e.: The output is a value in [1.0, 2.0). +u8 RecipSqrtEstimate(u64 a) { + using LUT = std::array<u8, 512>; + + static const LUT lut = [] { + LUT result{}; + for (u64 i = 128; i < result.size(); i++) { + u64 a = i; + + // Convert to u.10 (with 8 significant bits), force to odd + if (a < 256) { + // [0.25, 0.5) + a = a * 2 + 1; + } else { + // [0.5, 1.0) + a = (a | 1) * 2; + } + + // Calculate largest b which for which b < 1.0 / sqrt(a). + // Start from b = 1.0 (in u.9) since b cannot be smaller. + u64 b = 512; + // u.10 * u.9 * u.9 -> u.28 + while (a * (b + 1) * (b + 1) < (1u << 28)) { + b++; + } + + // Round to nearest u0.8 (with implied set integer bit). + result[i] = static_cast<u8>((b + 1) / 2); + } + return result; + }(); + + return lut[a & 0x1FF]; +} + +} // namespace Dynarmic::Common diff --git a/externals/dynarmic/src/dynarmic/common/math_util.h b/externals/dynarmic/src/dynarmic/common/math_util.h new file mode 100644 index 0000000000..5c1f784c89 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/math_util.h @@ -0,0 +1,47 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <utility> + +#include <mcl/stdint.hpp> + +namespace Dynarmic::Common { + +/** + * This function is a workaround for a bug in MSVC 19.12 where fold expressions + * do not work when the /permissive- flag is enabled. + */ +template<typename T, typename... Ts> +constexpr T Sum(T first, Ts&&... rest) { + if constexpr (sizeof...(rest) == 0) { + return first; + } else { + return first + Sum(std::forward<Ts>(rest)...); + } +} + +/** + * Input is a u0.9 fixed point number. Only values in [0.5, 1.0) are valid. + * Output is a u0.8 fixed point number, with an implied 1 prefixed. + * i.e.: The output is a value in [1.0, 2.0). + * + * @see RecipEstimate() within the ARMv8 architecture reference manual + * for a general overview of the requirements of the algorithm. 
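+ *
+ * Worked example (illustrative): an input of 0x100 (0.5 in u0.9) yields 0xFF
+ * from the lookup table built in math_util.cpp, i.e. an estimate of about
+ * 1.996 against the exact reciprocal 2.0.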
+ */ +u8 RecipEstimate(u64 a); + +/** + * Input is a u0.9 fixed point number. Only values in [0.25, 1.0) are valid. + * Output is a u0.8 fixed point number, with an implied 1 prefixed. + * i.e.: The output is a value in [1.0, 2.0). + * + * @see RecipSqrtEstimate() within the ARMv8 architecture reference manual + * for a general overview of the requirements of the algorithm. + */ +u8 RecipSqrtEstimate(u64 a); + +} // namespace Dynarmic::Common diff --git a/externals/dynarmic/src/dynarmic/common/memory_pool.cpp b/externals/dynarmic/src/dynarmic/common/memory_pool.cpp new file mode 100644 index 0000000000..a62cd11295 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/memory_pool.cpp @@ -0,0 +1,44 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/common/memory_pool.h" + +#include <cstdlib> + +namespace Dynarmic::Common { + +Pool::Pool(size_t object_size, size_t initial_pool_size) + : object_size(object_size), slab_size(initial_pool_size) { + AllocateNewSlab(); +} + +Pool::~Pool() { + std::free(current_slab); + + for (char* slab : slabs) { + std::free(slab); + } +} + +void* Pool::Alloc() { + if (remaining == 0) { + slabs.push_back(current_slab); + AllocateNewSlab(); + } + + void* ret = static_cast<void*>(current_ptr); + current_ptr += object_size; + remaining--; + + return ret; +} + +void Pool::AllocateNewSlab() { + current_slab = static_cast<char*>(std::malloc(object_size * slab_size)); + current_ptr = current_slab; + remaining = slab_size; +} + +} // namespace Dynarmic::Common diff --git a/externals/dynarmic/src/dynarmic/common/memory_pool.h b/externals/dynarmic/src/dynarmic/common/memory_pool.h new file mode 100644 index 0000000000..1cbdd9095e --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/memory_pool.h @@ -0,0 +1,45 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <cstddef> +#include <vector> + +namespace Dynarmic::Common { + +class Pool { +public: + /** + * @param object_size Byte-size of objects to construct + * @param initial_pool_size Number of objects to have per slab + */ + Pool(size_t object_size, size_t initial_pool_size); + ~Pool(); + + Pool(const Pool&) = delete; + Pool(Pool&&) = delete; + + Pool& operator=(const Pool&) = delete; + Pool& operator=(Pool&&) = delete; + + /// Returns a pointer to an `object_size`-bytes block of memory. + void* Alloc(); + +private: + // Allocates a completely new memory slab. + // Used when an entirely new slab is needed + // due the current one running out of usable space. + void AllocateNewSlab(); + + size_t object_size; + size_t slab_size; + char* current_slab; + char* current_ptr; + size_t remaining; + std::vector<char*> slabs; +}; + +} // namespace Dynarmic::Common diff --git a/externals/dynarmic/src/dynarmic/common/safe_ops.h b/externals/dynarmic/src/dynarmic/common/safe_ops.h new file mode 100644 index 0000000000..aef3134762 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/safe_ops.h @@ -0,0 +1,115 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <type_traits> + +#include <mcl/bitsizeof.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/common/u128.h" + +namespace Dynarmic::Safe { + +template<typename T> +T LogicalShiftLeft(T value, int shift_amount); +template<typename T> +T LogicalShiftRight(T value, int shift_amount); +template<typename T> +T ArithmeticShiftLeft(T value, int shift_amount); +template<typename T> +T ArithmeticShiftRight(T value, int shift_amount); + +template<typename T> +T LogicalShiftLeft(T value, int shift_amount) { + static_assert(std::is_integral_v<T>); + + if (shift_amount >= static_cast<int>(mcl::bitsizeof<T>)) { + return 0; + } + + if (shift_amount < 0) { + return LogicalShiftRight(value, -shift_amount); + } + + auto unsigned_value = static_cast<std::make_unsigned_t<T>>(value); + return static_cast<T>(unsigned_value << shift_amount); +} + +template<> +inline u128 LogicalShiftLeft(u128 value, int shift_amount) { + return value << shift_amount; +} + +template<typename T> +T LogicalShiftRight(T value, int shift_amount) { + static_assert(std::is_integral_v<T>); + + if (shift_amount >= static_cast<int>(mcl::bitsizeof<T>)) { + return 0; + } + + if (shift_amount < 0) { + return LogicalShiftLeft(value, -shift_amount); + } + + auto unsigned_value = static_cast<std::make_unsigned_t<T>>(value); + return static_cast<T>(unsigned_value >> shift_amount); +} + +template<> +inline u128 LogicalShiftRight(u128 value, int shift_amount) { + return value >> shift_amount; +} + +template<typename T> +T LogicalShiftRightDouble(T top, T bottom, int shift_amount) { + return LogicalShiftLeft(top, int(mcl::bitsizeof<T>) - shift_amount) | LogicalShiftRight(bottom, shift_amount); +} + +template<typename T> +T ArithmeticShiftLeft(T value, int shift_amount) { + static_assert(std::is_integral_v<T>); + + if (shift_amount >= static_cast<int>(mcl::bitsizeof<T>)) { + return 0; + } + + if (shift_amount < 0) { + return ArithmeticShiftRight(value, -shift_amount); + } + + auto unsigned_value = static_cast<std::make_unsigned_t<T>>(value); + return static_cast<T>(unsigned_value << shift_amount); +} + +template<typename T> +T ArithmeticShiftRight(T value, int shift_amount) { + static_assert(std::is_integral_v<T>); + + if (shift_amount >= static_cast<int>(mcl::bitsizeof<T>)) { + return mcl::bit::most_significant_bit(value) ? ~static_cast<T>(0) : 0; + } + + if (shift_amount < 0) { + return ArithmeticShiftLeft(value, -shift_amount); + } + + auto signed_value = static_cast<std::make_signed_t<T>>(value); + return static_cast<T>(signed_value >> shift_amount); +} + +template<typename T> +T ArithmeticShiftRightDouble(T top, T bottom, int shift_amount) { + return ArithmeticShiftLeft(top, int(mcl::bitsizeof<T>) - shift_amount) | LogicalShiftRight(bottom, shift_amount); +} + +template<typename T> +T Negate(T value) { + return static_cast<T>(~static_cast<std::uintmax_t>(value) + 1); +} + +} // namespace Dynarmic::Safe diff --git a/externals/dynarmic/src/dynarmic/common/spin_lock.h b/externals/dynarmic/src/dynarmic/common/spin_lock.h new file mode 100644 index 0000000000..f653704db6 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/spin_lock.h @@ -0,0 +1,17 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +namespace Dynarmic { + +struct SpinLock { + void Lock(); + void Unlock(); + + volatile int storage = 0; +}; + +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/common/spin_lock_arm64.cpp b/externals/dynarmic/src/dynarmic/common/spin_lock_arm64.cpp new file mode 100644 index 0000000000..ccf807e2d2 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/spin_lock_arm64.cpp @@ -0,0 +1,86 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mutex> + +#include <oaknut/code_block.hpp> +#include <oaknut/oaknut.hpp> + +#include "dynarmic/backend/arm64/abi.h" +#include "dynarmic/common/spin_lock.h" + +namespace Dynarmic { + +using Backend::Arm64::Wscratch0; +using Backend::Arm64::Wscratch1; +using namespace oaknut::util; + +void EmitSpinLockLock(oaknut::CodeGenerator& code, oaknut::XReg ptr) { + oaknut::Label start, loop; + + code.MOV(Wscratch1, 1); + code.SEVL(); + code.l(start); + code.WFE(); + code.l(loop); + code.LDAXR(Wscratch0, ptr); + code.CBNZ(Wscratch0, start); + code.STXR(Wscratch0, Wscratch1, ptr); + code.CBNZ(Wscratch0, loop); +} + +void EmitSpinLockUnlock(oaknut::CodeGenerator& code, oaknut::XReg ptr) { + code.STLR(WZR, ptr); +} + +namespace { + +struct SpinLockImpl { + SpinLockImpl(); + + void Initialize(); + + oaknut::CodeBlock mem; + oaknut::CodeGenerator code; + + void (*lock)(volatile int*); + void (*unlock)(volatile int*); +}; + +std::once_flag flag; +SpinLockImpl impl; + +SpinLockImpl::SpinLockImpl() + : mem{4096} + , code{mem.ptr(), mem.ptr()} {} + +void SpinLockImpl::Initialize() { + mem.unprotect(); + + lock = code.xptr<void (*)(volatile int*)>(); + EmitSpinLockLock(code, X0); + code.RET(); + + unlock = code.xptr<void (*)(volatile int*)>(); + EmitSpinLockUnlock(code, X0); + code.RET(); + + mem.protect(); + mem.invalidate_all(); +} + +} // namespace + +void SpinLock::Lock() { + std::call_once(flag, &SpinLockImpl::Initialize, impl); + impl.lock(&storage); +} + +void SpinLock::Unlock() { + std::call_once(flag, &SpinLockImpl::Initialize, impl); + impl.unlock(&storage); +} + +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/common/spin_lock_arm64.h b/externals/dynarmic/src/dynarmic/common/spin_lock_arm64.h new file mode 100644 index 0000000000..c0a86bfed3 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/spin_lock_arm64.h @@ -0,0 +1,15 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <oaknut/oaknut.hpp> + +namespace Dynarmic { + +void EmitSpinLockLock(oaknut::CodeGenerator& code, oaknut::XReg ptr); +void EmitSpinLockUnlock(oaknut::CodeGenerator& code, oaknut::XReg ptr); + +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/common/spin_lock_x64.cpp b/externals/dynarmic/src/dynarmic/common/spin_lock_x64.cpp new file mode 100644 index 0000000000..fdea94f4be --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/spin_lock_x64.cpp @@ -0,0 +1,76 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mutex> + +#include <xbyak/xbyak.h> + +#include "dynarmic/backend/x64/abi.h" +#include "dynarmic/backend/x64/hostloc.h" +#include "dynarmic/common/spin_lock.h" + +namespace Dynarmic { + +void EmitSpinLockLock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp) { + Xbyak::Label start, loop; + + code.jmp(start); + code.L(loop); + code.pause(); + code.L(start); + code.mov(tmp, 1); + code.lock(); + code.xchg(code.dword[ptr], tmp); + code.test(tmp, tmp); + code.jnz(loop); +} + +void EmitSpinLockUnlock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp) { + code.xor_(tmp, tmp); + code.xchg(code.dword[ptr], tmp); + code.mfence(); +} + +namespace { + +struct SpinLockImpl { + void Initialize(); + + Xbyak::CodeGenerator code; + + void (*lock)(volatile int*); + void (*unlock)(volatile int*); +}; + +std::once_flag flag; +SpinLockImpl impl; + +void SpinLockImpl::Initialize() { + const Xbyak::Reg64 ABI_PARAM1 = Backend::X64::HostLocToReg64(Backend::X64::ABI_PARAM1); + + code.align(); + lock = code.getCurr<void (*)(volatile int*)>(); + EmitSpinLockLock(code, ABI_PARAM1, code.eax); + code.ret(); + + code.align(); + unlock = code.getCurr<void (*)(volatile int*)>(); + EmitSpinLockUnlock(code, ABI_PARAM1, code.eax); + code.ret(); +} + +} // namespace + +void SpinLock::Lock() { + std::call_once(flag, &SpinLockImpl::Initialize, impl); + impl.lock(&storage); +} + +void SpinLock::Unlock() { + std::call_once(flag, &SpinLockImpl::Initialize, impl); + impl.unlock(&storage); +} + +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/common/spin_lock_x64.h b/externals/dynarmic/src/dynarmic/common/spin_lock_x64.h new file mode 100644 index 0000000000..df6a3d7407 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/spin_lock_x64.h @@ -0,0 +1,15 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <xbyak/xbyak.h> + +namespace Dynarmic { + +void EmitSpinLockLock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp); +void EmitSpinLockUnlock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp); + +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/common/string_util.h b/externals/dynarmic/src/dynarmic/common/string_util.h new file mode 100644 index 0000000000..900b3aadd2 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/string_util.h @@ -0,0 +1,15 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +namespace Dynarmic::Common { + +template<typename T> +constexpr char SignToChar(T value) { + return value >= 0 ? '+' : '-'; +} + +} // namespace Dynarmic::Common diff --git a/externals/dynarmic/src/dynarmic/common/u128.cpp b/externals/dynarmic/src/dynarmic/common/u128.cpp new file mode 100644 index 0000000000..dd78ac97ef --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/u128.cpp @@ -0,0 +1,142 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/common/u128.h" + +#include <mcl/stdint.hpp> + +namespace Dynarmic { + +u128 Multiply64To128(u64 a, u64 b) { + const u32 a0 = static_cast<u32>(a); + const u32 b0 = static_cast<u32>(b); + const u32 a1 = static_cast<u32>(a >> 32); + const u32 b1 = static_cast<u32>(b >> 32); + + // result = (c2 << 64) + (c1 << 32) + c0 + // c2 = a1 * b1 + // c1 = a1 * b0 + a0 * b1 + // c0 = a0 * b0 + // noting that these operations may overflow + + const u64 c0 = static_cast<u64>(a0) * b0; + const u64 c1_0 = static_cast<u64>(a1) * b0; + const u64 c1_1 = static_cast<u64>(a0) * b1; + const u64 c2 = static_cast<u64>(a1) * b1; + + const u64 c1 = c1_0 + c1_1; + const u64 c1_overflow = c1 < c1_0; + + const u64 lower = c0 + (c1 << 32); + const u64 lower_overflow = lower < c0; + + const u64 upper = lower_overflow + (c1 >> 32) + (c1_overflow << 32) + c2; + + u128 result; + result.lower = lower; + result.upper = upper; + return result; +} + +u128 operator<<(u128 operand, int amount) { + if (amount < 0) { + return operand >> -amount; + } + + if (amount == 0) { + return operand; + } + + if (amount < 64) { + u128 result; + result.lower = (operand.lower << amount); + result.upper = (operand.upper << amount) | (operand.lower >> (64 - amount)); + return result; + } + + if (amount < 128) { + u128 result; + result.upper = operand.lower << (amount - 64); + return result; + } + + return {}; +} + +u128 operator>>(u128 operand, int amount) { + if (amount < 0) { + return operand << -amount; + } + + if (amount == 0) { + return operand; + } + + if (amount < 64) { + u128 result; + result.lower = (operand.lower >> amount) | (operand.upper << (64 - amount)); + result.upper = (operand.upper >> amount); + return result; + } + + if (amount < 128) { + u128 result; + result.lower = operand.upper >> (amount - 64); + return result; + } + + return {}; +} + +u128 StickyLogicalShiftRight(u128 operand, int amount) { + if (amount < 0) { + return operand << -amount; + } + + if (amount == 0) { + return operand; + } + + if (amount < 64) { + u128 result; + result.lower = (operand.lower >> amount) | (operand.upper << (64 - amount)); + result.upper = (operand.upper >> amount); + // Sticky bit + if ((operand.lower << (64 - amount)) != 0) { + result.lower |= 1; + } + return result; + } + + if (amount == 64) { + u128 result; + result.lower = operand.upper; + // Sticky bit + if (operand.lower != 0) { + result.lower |= 1; + } + return result; + } + + if (amount < 128) { + u128 result; + result.lower = operand.upper >> (amount - 64); + // Sticky bit + if (operand.lower != 0) { + result.lower |= 1; + } + if ((operand.upper << (128 - amount)) != 0) { + result.lower |= 1; + } + return result; + } + + if (operand.lower != 0 || operand.upper != 0) { + return u128(1); + } + return {}; +} + +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/common/u128.h b/externals/dynarmic/src/dynarmic/common/u128.h new file mode 100644 index 0000000000..f6df1ae6cc --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/u128.h @@ -0,0 +1,99 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <tuple> +#include <type_traits> + +#include <mcl/bit/bit_field.hpp> +#include <mcl/bitsizeof.hpp> +#include <mcl/stdint.hpp> + +namespace Dynarmic { + +struct u128 { + u128() = default; + u128(const u128&) = default; + u128(u128&&) = default; + u128& operator=(const u128&) = default; + u128& operator=(u128&&) = default; + + u128(u64 lower_, u64 upper_) + : lower(lower_), upper(upper_) {} + + template<typename T> + /* implicit */ u128(T value) + : lower(value), upper(0) { + static_assert(std::is_integral_v<T>); + static_assert(mcl::bitsizeof<T> <= mcl::bitsizeof<u64>); + } + + u64 lower = 0; + u64 upper = 0; + + template<size_t bit_position> + bool Bit() const { + static_assert(bit_position < 128); + if constexpr (bit_position < 64) { + return mcl::bit::get_bit<bit_position>(lower); + } else { + return mcl::bit::get_bit<bit_position - 64>(upper); + } + } +}; + +static_assert(mcl::bitsizeof<u128> == 128); +static_assert(std::is_standard_layout_v<u128>); +static_assert(std::is_trivially_copyable_v<u128>); + +u128 Multiply64To128(u64 a, u64 b); + +inline u128 operator+(u128 a, u128 b) { + u128 result; + result.lower = a.lower + b.lower; + result.upper = a.upper + b.upper + (a.lower > result.lower); + return result; +} + +inline u128 operator-(u128 a, u128 b) { + u128 result; + result.lower = a.lower - b.lower; + result.upper = a.upper - b.upper - (a.lower < result.lower); + return result; +} + +inline bool operator<(u128 a, u128 b) { + return std::tie(a.upper, a.lower) < std::tie(b.upper, b.lower); +} + +inline bool operator>(u128 a, u128 b) { + return operator<(b, a); +} + +inline bool operator<=(u128 a, u128 b) { + return !operator>(a, b); +} + +inline bool operator>=(u128 a, u128 b) { + return !operator<(a, b); +} + +inline bool operator==(u128 a, u128 b) { + return std::tie(a.upper, a.lower) == std::tie(b.upper, b.lower); +} + +inline bool operator!=(u128 a, u128 b) { + return !operator==(a, b); +} + +u128 operator<<(u128 operand, int amount); +u128 operator>>(u128 operand, int amount); + +/// LSB is a "sticky-bit". +/// If a 1 is shifted off, the LSB would be set. +u128 StickyLogicalShiftRight(u128 operand, int amount); + +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/common/variant_util.h b/externals/dynarmic/src/dynarmic/common/variant_util.h new file mode 100644 index 0000000000..4dd7f67167 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/variant_util.h @@ -0,0 +1,29 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <boost/variant.hpp> + +namespace Dynarmic::Common { +namespace detail { + +template<typename ReturnT, typename Lambda> +struct VariantVisitor : boost::static_visitor<ReturnT> + , Lambda { + VariantVisitor(Lambda&& lambda) + : Lambda(std::move(lambda)) {} + + using Lambda::operator(); +}; + +} // namespace detail + +template<typename ReturnT, typename Variant, typename Lambda> +inline ReturnT VisitVariant(Variant&& variant, Lambda&& lambda) { + return boost::apply_visitor(detail::VariantVisitor<ReturnT, Lambda>(std::move(lambda)), variant); +} + +} // namespace Dynarmic::Common diff --git a/externals/dynarmic/src/dynarmic/common/x64_disassemble.cpp b/externals/dynarmic/src/dynarmic/common/x64_disassemble.cpp new file mode 100644 index 0000000000..854de23a77 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/x64_disassemble.cpp @@ -0,0 +1,57 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2021 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/common/x64_disassemble.h" + +#include <Zydis/Zydis.h> +#include <fmt/printf.h> +#include <mcl/stdint.hpp> + +namespace Dynarmic::Common { + +void DumpDisassembledX64(const void* ptr, size_t size) { + ZydisDecoder decoder; + ZydisDecoderInit(&decoder, ZYDIS_MACHINE_MODE_LONG_64, ZYDIS_STACK_WIDTH_64); + + ZydisFormatter formatter; + ZydisFormatterInit(&formatter, ZYDIS_FORMATTER_STYLE_INTEL); + + size_t offset = 0; + ZydisDecodedInstruction instruction; + ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT]; + while (ZYAN_SUCCESS(ZydisDecoderDecodeFull(&decoder, static_cast<const char*>(ptr) + offset, size - offset, &instruction, operands))) { + fmt::print("{:016x} ", (u64)ptr + offset); + + char buffer[256]; + ZydisFormatterFormatInstruction(&formatter, &instruction, operands, instruction.operand_count_visible, buffer, sizeof(buffer), reinterpret_cast<u64>(ptr) + offset, ZYAN_NULL); + puts(buffer); + + offset += instruction.length; + } +} + +std::vector<std::string> DisassembleX64(const void* ptr, size_t size) { + std::vector<std::string> result; + ZydisDecoder decoder; + ZydisDecoderInit(&decoder, ZYDIS_MACHINE_MODE_LONG_64, ZYDIS_STACK_WIDTH_64); + + ZydisFormatter formatter; + ZydisFormatterInit(&formatter, ZYDIS_FORMATTER_STYLE_INTEL); + + size_t offset = 0; + ZydisDecodedInstruction instruction; + ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT]; + while (ZYAN_SUCCESS(ZydisDecoderDecodeFull(&decoder, static_cast<const char*>(ptr) + offset, size - offset, &instruction, operands))) { + char buffer[256]; + ZydisFormatterFormatInstruction(&formatter, &instruction, operands, instruction.operand_count_visible, buffer, sizeof(buffer), reinterpret_cast<u64>(ptr) + offset, ZYAN_NULL); + + result.push_back(fmt::format("{:016x} {}", (u64)ptr + offset, buffer)); + + offset += instruction.length; + } + + return result; +} +} // namespace Dynarmic::Common diff --git a/externals/dynarmic/src/dynarmic/common/x64_disassemble.h b/externals/dynarmic/src/dynarmic/common/x64_disassemble.h new file mode 100644 index 0000000000..03c511bfda --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/x64_disassemble.h @@ -0,0 +1,21 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2021 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <string> +#include <vector> + +#include <mcl/stdint.hpp> + +namespace Dynarmic::Common { + +void DumpDisassembledX64(const void* ptr, size_t size); +/** + * Disassemble `size' bytes from `ptr' and return the disassembled lines as a vector + * of strings. + */ +std::vector<std::string> DisassembleX64(const void* ptr, size_t size); +} // namespace Dynarmic::Common diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/FPSCR.h b/externals/dynarmic/src/dynarmic/frontend/A32/FPSCR.h new file mode 100644 index 0000000000..28414e9fa1 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/FPSCR.h @@ -0,0 +1,191 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <optional> + +#include <mcl/bit/bit_field.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/common/fp/rounding_mode.h" + +namespace Dynarmic::A32 { + +/** + * Representation of the Floating-Point Status and Control Register. + */ +class FPSCR final { +public: + FPSCR() = default; + FPSCR(const FPSCR&) = default; + FPSCR(FPSCR&&) = default; + explicit FPSCR(u32 data) + : value{data & mask} {} + + FPSCR& operator=(const FPSCR&) = default; + FPSCR& operator=(FPSCR&&) = default; + FPSCR& operator=(u32 data) { + value = data & mask; + return *this; + } + + /// Negative condition flag. + bool N() const { + return mcl::bit::get_bit<31>(value); + } + + /// Zero condition flag. + bool Z() const { + return mcl::bit::get_bit<30>(value); + } + + /// Carry condition flag. + bool C() const { + return mcl::bit::get_bit<29>(value); + } + + /// Overflow condition flag. + bool V() const { + return mcl::bit::get_bit<28>(value); + } + + /// Cumulative saturation flag. + bool QC() const { + return mcl::bit::get_bit<27>(value); + } + + /// Alternate half-precision control flag. + bool AHP() const { + return mcl::bit::get_bit<26>(value); + } + + /// Default NaN mode control bit. + bool DN() const { + return mcl::bit::get_bit<25>(value); + } + + /// Flush-to-zero mode control bit. + bool FTZ() const { + return mcl::bit::get_bit<24>(value); + } + + /// Rounding mode control field. + FP::RoundingMode RMode() const { + return static_cast<FP::RoundingMode>(mcl::bit::get_bits<22, 23>(value)); + } + + /// Indicates the stride of a vector. + std::optional<size_t> Stride() const { + switch (mcl::bit::get_bits<20, 21>(value)) { + case 0b00: + return 1; + case 0b11: + return 2; + default: + return std::nullopt; + } + } + + /// Indicates the length of a vector. + size_t Len() const { + return mcl::bit::get_bits<16, 18>(value) + 1; + } + + /// Input denormal exception trap enable flag. + bool IDE() const { + return mcl::bit::get_bit<15>(value); + } + + /// Inexact exception trap enable flag. + bool IXE() const { + return mcl::bit::get_bit<12>(value); + } + + /// Underflow exception trap enable flag. + bool UFE() const { + return mcl::bit::get_bit<11>(value); + } + + /// Overflow exception trap enable flag. + bool OFE() const { + return mcl::bit::get_bit<10>(value); + } + + /// Division by zero exception trap enable flag. + bool DZE() const { + return mcl::bit::get_bit<9>(value); + } + + /// Invalid operation exception trap enable flag. + bool IOE() const { + return mcl::bit::get_bit<8>(value); + } + + /// Input denormal cumulative exception bit. + bool IDC() const { + return mcl::bit::get_bit<7>(value); + } + + /// Inexact cumulative exception bit. 
+ bool IXC() const { + return mcl::bit::get_bit<4>(value); + } + + /// Underflow cumulative exception bit. + bool UFC() const { + return mcl::bit::get_bit<3>(value); + } + + /// Overflow cumulative exception bit. + bool OFC() const { + return mcl::bit::get_bit<2>(value); + } + + /// Division by zero cumulative exception bit. + bool DZC() const { + return mcl::bit::get_bit<1>(value); + } + + /// Invalid operation cumulative exception bit. + bool IOC() const { + return mcl::bit::get_bit<0>(value); + } + + /** + * Whether or not the FPSCR indicates RunFast mode. + * + * RunFast mode is enabled when: + * - Flush-to-zero is enabled + * - Default NaNs are enabled. + * - All exception enable bits are cleared. + */ + bool InRunFastMode() const { + constexpr u32 runfast_mask = 0x03001F00; + constexpr u32 expected = 0x03000000; + + return (value & runfast_mask) == expected; + } + + /// Gets the underlying raw value within the FPSCR. + u32 Value() const { + return value; + } + +private: + // Bits 5-6, 13-14, and 19 are reserved. + static constexpr u32 mask = 0xFFF79F9F; + u32 value = 0; +}; + +inline bool operator==(FPSCR lhs, FPSCR rhs) { + return lhs.Value() == rhs.Value(); +} + +inline bool operator!=(FPSCR lhs, FPSCR rhs) { + return !operator==(lhs, rhs); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/ITState.h b/externals/dynarmic/src/dynarmic/frontend/A32/ITState.h new file mode 100644 index 0000000000..ae69fa1e3f --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/ITState.h @@ -0,0 +1,64 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2019 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <mcl/bit/bit_field.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/ir/cond.h" + +namespace Dynarmic::A32 { + +class ITState final { +public: + ITState() = default; + explicit ITState(u8 data) + : value(data) {} + + ITState& operator=(u8 data) { + value = data; + return *this; + } + + IR::Cond Cond() const { + if (value == 0b00000000) { + return IR::Cond::AL; + } + return static_cast<IR::Cond>(mcl::bit::get_bits<4, 7>(value)); + } + + bool IsInITBlock() const { + return mcl::bit::get_bits<0, 3>(value) != 0b0000; + } + + bool IsLastInITBlock() const { + return mcl::bit::get_bits<0, 3>(value) == 0b1000; + } + + ITState Advance() const { + if (mcl::bit::get_bits<0, 2>(value) == 0b000) { + return ITState{0b00000000}; + } + return ITState{mcl::bit::set_bits<0, 4>(value, static_cast<u8>(mcl::bit::get_bits<0, 4>(value) << 1))}; + } + + u8 Value() const { + return value; + } + +private: + u8 value = 0; +}; + +inline bool operator==(ITState lhs, ITState rhs) { + return lhs.Value() == rhs.Value(); +} + +inline bool operator!=(ITState lhs, ITState rhs) { + return !operator==(lhs, rhs); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/PSR.h b/externals/dynarmic/src/dynarmic/frontend/A32/PSR.h new file mode 100644 index 0000000000..9af78eaae5 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/PSR.h @@ -0,0 +1,224 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <mcl/bit/bit_field.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/frontend/A32/ITState.h" + +namespace Dynarmic::A32 { + +/** + * Program Status Register + * + * | Bit(s) | Description | + * |:-------:|:----------------------------------------------| + * | N | Negative | + * | Z | Zero | + * | C | Carry | + * | V | Overflow | + * | Q | Sticky overflow for DSP-oriented instructions | + * | IT[1:0] | Lower two bits of the If-Then execution state | + * | J | Jazelle bit | + * | GE | Greater-than or Equal | + * | IT[7:2] | Upper six bits of the If-Then execution state | + * | E | Endian (0 is little endian, 1 is big endian) | + * | A | Imprecise data abort (disables them when set) | + * | I | IRQ interrupts (disabled when set) | + * | F | FIQ interrupts (disabled when set) | + * | T | Thumb bit | + * | M | Current processor mode | + */ +class PSR final { +public: + /// Valid processor modes that may be indicated. + enum class Mode : u32 { + User = 0b10000, + FIQ = 0b10001, + IRQ = 0b10010, + Supervisor = 0b10011, + Monitor = 0b10110, + Abort = 0b10111, + Hypervisor = 0b11010, + Undefined = 0b11011, + System = 0b11111 + }; + + /// Instruction sets that may be signified through a PSR. + enum class InstructionSet { + ARM, + Jazelle, + Thumb, + ThumbEE + }; + + PSR() = default; + explicit PSR(u32 data) + : value{data & mask} {} + + PSR& operator=(u32 data) { + value = data & mask; + return *this; + } + + bool N() const { + return mcl::bit::get_bit<31>(value); + } + void N(bool set) { + value = mcl::bit::set_bit<31>(value, set); + } + + bool Z() const { + return mcl::bit::get_bit<30>(value); + } + void Z(bool set) { + value = mcl::bit::set_bit<30>(value, set); + } + + bool C() const { + return mcl::bit::get_bit<29>(value); + } + void C(bool set) { + value = mcl::bit::set_bit<29>(value, set); + } + + bool V() const { + return mcl::bit::get_bit<28>(value); + } + void V(bool set) { + value = mcl::bit::set_bit<28>(value, set); + } + + bool Q() const { + return mcl::bit::get_bit<27>(value); + } + void Q(bool set) { + value = mcl::bit::set_bit<27>(value, set); + } + + bool J() const { + return mcl::bit::get_bit<24>(value); + } + void J(bool set) { + value = mcl::bit::set_bit<24>(value, set); + } + + u32 GE() const { + return mcl::bit::get_bits<16, 19>(value); + } + void GE(u32 data) { + value = mcl::bit::set_bits<16, 19>(value, data); + } + + ITState IT() const { + return ITState{static_cast<u8>((value & 0x6000000) >> 25 | (value & 0xFC00) >> 8)}; + } + void IT(ITState it_state) { + const u32 data = it_state.Value(); + value = (value & ~0x000FC00) | (data & 0b11111100) << 8; + value = (value & ~0x6000000) | (data & 0b00000011) << 25; + } + + bool E() const { + return mcl::bit::get_bit<9>(value); + } + void E(bool set) { + value = mcl::bit::set_bit<9>(value, set); + } + + bool A() const { + return mcl::bit::get_bit<8>(value); + } + void A(bool set) { + value = mcl::bit::set_bit<8>(value, set); + } + + bool I() const { + return mcl::bit::get_bit<7>(value); + } + void I(bool set) { + value = mcl::bit::set_bit<7>(value, set); + } + + bool F() const { + return mcl::bit::get_bit<6>(value); + } + void F(bool set) { + value = mcl::bit::set_bit<6>(value, set); + } + + bool T() const { + return mcl::bit::get_bit<5>(value); + } + void T(bool set) { + value = mcl::bit::set_bit<5>(value, set); + } + + Mode M() const { + return static_cast<Mode>(mcl::bit::get_bits<0, 4>(value)); + } + void M(Mode mode) { + value 
= mcl::bit::set_bits<0, 4>(value, static_cast<u32>(mode)); + } + + u32 Value() const { + return value; + } + + InstructionSet CurrentInstructionSet() const { + const bool j_bit = J(); + const bool t_bit = T(); + + if (j_bit && t_bit) + return InstructionSet::ThumbEE; + + if (t_bit) + return InstructionSet::Thumb; + + if (j_bit) + return InstructionSet::Jazelle; + + return InstructionSet::ARM; + } + + void CurrentInstructionSet(InstructionSet instruction_set) { + switch (instruction_set) { + case InstructionSet::ARM: + T(false); + J(false); + break; + case InstructionSet::Jazelle: + T(false); + J(true); + break; + case InstructionSet::Thumb: + T(true); + J(false); + break; + case InstructionSet::ThumbEE: + T(true); + J(true); + break; + } + } + +private: + // Bits 20-23 are reserved and should be zero. + static constexpr u32 mask = 0xFF0FFFFF; + + u32 value = 0; +}; + +inline bool operator==(PSR lhs, PSR rhs) { + return lhs.Value() == rhs.Value(); +} + +inline bool operator!=(PSR lhs, PSR rhs) { + return !operator==(lhs, rhs); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/a32_ir_emitter.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/a32_ir_emitter.cpp new file mode 100644 index 0000000000..343e521aba --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/a32_ir_emitter.cpp @@ -0,0 +1,442 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/a32_ir_emitter.h" + +#include <mcl/assert.hpp> + +#include "dynarmic/frontend/A32/a32_types.h" +#include "dynarmic/interface/A32/arch_version.h" +#include "dynarmic/ir/opcodes.h" + +namespace Dynarmic::A32 { + +using Opcode = IR::Opcode; + +size_t IREmitter::ArchVersion() const { + switch (arch_version) { + case ArchVersion::v3: + return 3; + case ArchVersion::v4: + case ArchVersion::v4T: + return 4; + case ArchVersion::v5TE: + return 5; + case ArchVersion::v6K: + case ArchVersion::v6T2: + return 6; + case ArchVersion::v7: + return 7; + case ArchVersion::v8: + return 8; + } + UNREACHABLE(); +} + +u32 IREmitter::PC() const { + const u32 offset = current_location.TFlag() ? 
4 : 8; + return current_location.PC() + offset; +} + +u32 IREmitter::AlignPC(size_t alignment) const { + const u32 pc = PC(); + return static_cast<u32>(pc - pc % alignment); +} + +IR::U32 IREmitter::GetRegister(Reg reg) { + if (reg == A32::Reg::PC) { + return Imm32(PC()); + } + return Inst<IR::U32>(Opcode::A32GetRegister, IR::Value(reg)); +} + +IR::U32U64 IREmitter::GetExtendedRegister(ExtReg reg) { + if (A32::IsSingleExtReg(reg)) { + return Inst<IR::U32U64>(Opcode::A32GetExtendedRegister32, IR::Value(reg)); + } + + if (A32::IsDoubleExtReg(reg)) { + return Inst<IR::U32U64>(Opcode::A32GetExtendedRegister64, IR::Value(reg)); + } + + ASSERT_FALSE("Invalid reg."); +} + +IR::U128 IREmitter::GetVector(ExtReg reg) { + ASSERT(A32::IsDoubleExtReg(reg) || A32::IsQuadExtReg(reg)); + return Inst<IR::U128>(Opcode::A32GetVector, IR::Value(reg)); +} + +void IREmitter::SetRegister(const Reg reg, const IR::U32& value) { + ASSERT(reg != A32::Reg::PC); + Inst(Opcode::A32SetRegister, IR::Value(reg), value); +} + +void IREmitter::SetExtendedRegister(const ExtReg reg, const IR::U32U64& value) { + if (A32::IsSingleExtReg(reg)) { + Inst(Opcode::A32SetExtendedRegister32, IR::Value(reg), value); + } else if (A32::IsDoubleExtReg(reg)) { + Inst(Opcode::A32SetExtendedRegister64, IR::Value(reg), value); + } else { + ASSERT_FALSE("Invalid reg."); + } +} + +void IREmitter::SetVector(ExtReg reg, const IR::U128& value) { + ASSERT(A32::IsDoubleExtReg(reg) || A32::IsQuadExtReg(reg)); + Inst(Opcode::A32SetVector, IR::Value(reg), value); +} + +void IREmitter::ALUWritePC(const IR::U32& value) { + // This behaviour is ARM version-dependent. + if (ArchVersion() >= 7 && !current_location.TFlag()) { + BXWritePC(value); + } else { + BranchWritePC(value); + } +} + +void IREmitter::BranchWritePC(const IR::U32& value) { + if (!current_location.TFlag()) { + // Note that for ArchVersion() < 6, this is UNPREDICTABLE when value<1:0> != 0b00 + const auto new_pc = And(value, Imm32(0xFFFFFFFC)); + Inst(Opcode::A32SetRegister, IR::Value(A32::Reg::PC), new_pc); + } else { + const auto new_pc = And(value, Imm32(0xFFFFFFFE)); + Inst(Opcode::A32SetRegister, IR::Value(A32::Reg::PC), new_pc); + } +} + +void IREmitter::BXWritePC(const IR::U32& value) { + Inst(Opcode::A32BXWritePC, value); +} + +void IREmitter::LoadWritePC(const IR::U32& value) { + // This behaviour is ARM version-dependent. 
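+ // (On ARMv5 and later, a load into the PC interworks like BX: bit 0 of the
+ // loaded value selects the ARM or Thumb instruction set. Earlier architecture
+ // versions simply branch, as the check below reflects.)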
+ if (ArchVersion() >= 5) { + BXWritePC(value); + } else { + BranchWritePC(value); + } +} + +void IREmitter::UpdateUpperLocationDescriptor() { + Inst(Opcode::A32UpdateUpperLocationDescriptor); +} + +void IREmitter::CallSupervisor(const IR::U32& value) { + Inst(Opcode::A32CallSupervisor, value); +} + +void IREmitter::ExceptionRaised(const Exception exception) { + Inst(Opcode::A32ExceptionRaised, Imm32(current_location.PC()), Imm64(static_cast<u64>(exception))); +} + +IR::U32 IREmitter::GetCpsr() { + return Inst<IR::U32>(Opcode::A32GetCpsr); +} + +void IREmitter::SetCpsr(const IR::U32& value) { + Inst(Opcode::A32SetCpsr, value); +} + +void IREmitter::SetCpsrNZCV(const IR::NZCV& value) { + Inst(Opcode::A32SetCpsrNZCV, value); +} + +void IREmitter::SetCpsrNZCVRaw(const IR::U32& value) { + Inst(Opcode::A32SetCpsrNZCVRaw, value); +} + +void IREmitter::SetCpsrNZCVQ(const IR::U32& value) { + Inst(Opcode::A32SetCpsrNZCVQ, value); +} + +void IREmitter::SetCheckBit(const IR::U1& value) { + Inst(Opcode::A32SetCheckBit, value); +} + +IR::U1 IREmitter::GetOverflowFrom(const IR::Value& value) { + return Inst<IR::U1>(Opcode::GetOverflowFromOp, value); +} + +IR::U1 IREmitter::GetCFlag() { + return Inst<IR::U1>(Opcode::A32GetCFlag); +} + +void IREmitter::OrQFlag(const IR::U1& value) { + Inst(Opcode::A32OrQFlag, value); +} + +IR::U32 IREmitter::GetGEFlags() { + return Inst<IR::U32>(Opcode::A32GetGEFlags); +} + +void IREmitter::SetGEFlags(const IR::U32& value) { + Inst(Opcode::A32SetGEFlags, value); +} + +void IREmitter::SetGEFlagsCompressed(const IR::U32& value) { + Inst(Opcode::A32SetGEFlagsCompressed, value); +} + +IR::NZCV IREmitter::NZFrom(const IR::Value& value) { + return Inst<IR::NZCV>(Opcode::GetNZFromOp, value); +} + +void IREmitter::SetCpsrNZ(const IR::NZCV& nz) { + Inst(Opcode::A32SetCpsrNZ, nz); +} + +void IREmitter::SetCpsrNZC(const IR::NZCV& nz, const IR::U1& c) { + Inst(Opcode::A32SetCpsrNZC, nz, c); +} + +void IREmitter::DataSynchronizationBarrier() { + Inst(Opcode::A32DataSynchronizationBarrier); +} + +void IREmitter::DataMemoryBarrier() { + Inst(Opcode::A32DataMemoryBarrier); +} + +void IREmitter::InstructionSynchronizationBarrier() { + Inst(Opcode::A32InstructionSynchronizationBarrier); +} + +IR::U32 IREmitter::GetFpscr() { + return Inst<IR::U32>(Opcode::A32GetFpscr); +} + +void IREmitter::SetFpscr(const IR::U32& new_fpscr) { + Inst(Opcode::A32SetFpscr, new_fpscr); +} + +IR::U32 IREmitter::GetFpscrNZCV() { + return Inst<IR::U32>(Opcode::A32GetFpscrNZCV); +} + +void IREmitter::SetFpscrNZCV(const IR::NZCV& new_fpscr_nzcv) { + Inst(Opcode::A32SetFpscrNZCV, new_fpscr_nzcv); +} + +void IREmitter::ClearExclusive() { + Inst(Opcode::A32ClearExclusive); +} + +IR::UAny IREmitter::ReadMemory(size_t bitsize, const IR::U32& vaddr, IR::AccType acc_type) { + switch (bitsize) { + case 8: + return ReadMemory8(vaddr, acc_type); + case 16: + return ReadMemory16(vaddr, acc_type); + case 32: + return ReadMemory32(vaddr, acc_type); + case 64: + return ReadMemory64(vaddr, acc_type); + } + ASSERT_FALSE("Invalid bitsize"); +} + +IR::U8 IREmitter::ReadMemory8(const IR::U32& vaddr, IR::AccType acc_type) { + return Inst<IR::U8>(Opcode::A32ReadMemory8, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type}); +} + +IR::U16 IREmitter::ReadMemory16(const IR::U32& vaddr, IR::AccType acc_type) { + const auto value = Inst<IR::U16>(Opcode::A32ReadMemory16, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type}); + return current_location.EFlag() ? 
ByteReverseHalf(value) : value; +} + +IR::U32 IREmitter::ReadMemory32(const IR::U32& vaddr, IR::AccType acc_type) { + const auto value = Inst<IR::U32>(Opcode::A32ReadMemory32, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type}); + return current_location.EFlag() ? ByteReverseWord(value) : value; +} + +IR::U64 IREmitter::ReadMemory64(const IR::U32& vaddr, IR::AccType acc_type) { + const auto value = Inst<IR::U64>(Opcode::A32ReadMemory64, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type}); + return current_location.EFlag() ? ByteReverseDual(value) : value; +} + +IR::U8 IREmitter::ExclusiveReadMemory8(const IR::U32& vaddr, IR::AccType acc_type) { + return Inst<IR::U8>(Opcode::A32ExclusiveReadMemory8, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type}); +} + +IR::U16 IREmitter::ExclusiveReadMemory16(const IR::U32& vaddr, IR::AccType acc_type) { + const auto value = Inst<IR::U16>(Opcode::A32ExclusiveReadMemory16, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type}); + return current_location.EFlag() ? ByteReverseHalf(value) : value; +} + +IR::U32 IREmitter::ExclusiveReadMemory32(const IR::U32& vaddr, IR::AccType acc_type) { + const auto value = Inst<IR::U32>(Opcode::A32ExclusiveReadMemory32, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type}); + return current_location.EFlag() ? ByteReverseWord(value) : value; +} + +std::pair<IR::U32, IR::U32> IREmitter::ExclusiveReadMemory64(const IR::U32& vaddr, IR::AccType acc_type) { + const auto value = Inst<IR::U64>(Opcode::A32ExclusiveReadMemory64, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type}); + const auto lo = LeastSignificantWord(value); + const auto hi = MostSignificantWord(value).result; + if (current_location.EFlag()) { + // DO NOT SWAP hi AND lo IN BIG ENDIAN MODE, THIS IS CORRECT BEHAVIOUR + return std::make_pair(ByteReverseWord(lo), ByteReverseWord(hi)); + } + return std::make_pair(lo, hi); +} + +void IREmitter::WriteMemory(size_t bitsize, const IR::U32& vaddr, const IR::UAny& value, IR::AccType acc_type) { + switch (bitsize) { + case 8: + return WriteMemory8(vaddr, value, acc_type); + case 16: + return WriteMemory16(vaddr, value, acc_type); + case 32: + return WriteMemory32(vaddr, value, acc_type); + case 64: + return WriteMemory64(vaddr, value, acc_type); + } + ASSERT_FALSE("Invalid bitsize"); +} + +void IREmitter::WriteMemory8(const IR::U32& vaddr, const IR::U8& value, IR::AccType acc_type) { + Inst(Opcode::A32WriteMemory8, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type}); +} + +void IREmitter::WriteMemory16(const IR::U32& vaddr, const IR::U16& value, IR::AccType acc_type) { + if (current_location.EFlag()) { + const auto v = ByteReverseHalf(value); + Inst(Opcode::A32WriteMemory16, ImmCurrentLocationDescriptor(), vaddr, v, IR::Value{acc_type}); + } else { + Inst(Opcode::A32WriteMemory16, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type}); + } +} + +void IREmitter::WriteMemory32(const IR::U32& vaddr, const IR::U32& value, IR::AccType acc_type) { + if (current_location.EFlag()) { + const auto v = ByteReverseWord(value); + Inst(Opcode::A32WriteMemory32, ImmCurrentLocationDescriptor(), vaddr, v, IR::Value{acc_type}); + } else { + Inst(Opcode::A32WriteMemory32, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type}); + } +} + +void IREmitter::WriteMemory64(const IR::U32& vaddr, const IR::U64& value, IR::AccType acc_type) { + if (current_location.EFlag()) { + const auto v = ByteReverseDual(value); + Inst(Opcode::A32WriteMemory64, 
ImmCurrentLocationDescriptor(), vaddr, v, IR::Value{acc_type}); + } else { + Inst(Opcode::A32WriteMemory64, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type}); + } +} + +IR::U32 IREmitter::ExclusiveWriteMemory8(const IR::U32& vaddr, const IR::U8& value, IR::AccType acc_type) { + return Inst<IR::U32>(Opcode::A32ExclusiveWriteMemory8, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type}); +} + +IR::U32 IREmitter::ExclusiveWriteMemory16(const IR::U32& vaddr, const IR::U16& value, IR::AccType acc_type) { + if (current_location.EFlag()) { + const auto v = ByteReverseHalf(value); + return Inst<IR::U32>(Opcode::A32ExclusiveWriteMemory16, ImmCurrentLocationDescriptor(), vaddr, v, IR::Value{acc_type}); + } else { + return Inst<IR::U32>(Opcode::A32ExclusiveWriteMemory16, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type}); + } +} + +IR::U32 IREmitter::ExclusiveWriteMemory32(const IR::U32& vaddr, const IR::U32& value, IR::AccType acc_type) { + if (current_location.EFlag()) { + const auto v = ByteReverseWord(value); + return Inst<IR::U32>(Opcode::A32ExclusiveWriteMemory32, ImmCurrentLocationDescriptor(), vaddr, v, IR::Value{acc_type}); + } else { + return Inst<IR::U32>(Opcode::A32ExclusiveWriteMemory32, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type}); + } +} + +IR::U32 IREmitter::ExclusiveWriteMemory64(const IR::U32& vaddr, const IR::U32& value_lo, const IR::U32& value_hi, IR::AccType acc_type) { + if (current_location.EFlag()) { + const auto vlo = ByteReverseWord(value_lo); + const auto vhi = ByteReverseWord(value_hi); + return Inst<IR::U32>(Opcode::A32ExclusiveWriteMemory64, ImmCurrentLocationDescriptor(), vaddr, Pack2x32To1x64(vlo, vhi), IR::Value{acc_type}); + } else { + return Inst<IR::U32>(Opcode::A32ExclusiveWriteMemory64, ImmCurrentLocationDescriptor(), vaddr, Pack2x32To1x64(value_lo, value_hi), IR::Value{acc_type}); + } +} + +void IREmitter::CoprocInternalOperation(size_t coproc_no, bool two, size_t opc1, CoprocReg CRd, CoprocReg CRn, CoprocReg CRm, size_t opc2) { + ASSERT(coproc_no <= 15); + const IR::Value::CoprocessorInfo coproc_info{static_cast<u8>(coproc_no), + static_cast<u8>(two ? 1 : 0), + static_cast<u8>(opc1), + static_cast<u8>(CRd), + static_cast<u8>(CRn), + static_cast<u8>(CRm), + static_cast<u8>(opc2)}; + Inst(Opcode::A32CoprocInternalOperation, IR::Value(coproc_info)); +} + +void IREmitter::CoprocSendOneWord(size_t coproc_no, bool two, size_t opc1, CoprocReg CRn, CoprocReg CRm, size_t opc2, const IR::U32& word) { + ASSERT(coproc_no <= 15); + const IR::Value::CoprocessorInfo coproc_info{static_cast<u8>(coproc_no), + static_cast<u8>(two ? 1 : 0), + static_cast<u8>(opc1), + static_cast<u8>(CRn), + static_cast<u8>(CRm), + static_cast<u8>(opc2)}; + Inst(Opcode::A32CoprocSendOneWord, IR::Value(coproc_info), word); +} + +void IREmitter::CoprocSendTwoWords(size_t coproc_no, bool two, size_t opc, CoprocReg CRm, const IR::U32& word1, const IR::U32& word2) { + ASSERT(coproc_no <= 15); + const IR::Value::CoprocessorInfo coproc_info{static_cast<u8>(coproc_no), + static_cast<u8>(two ? 1 : 0), + static_cast<u8>(opc), + static_cast<u8>(CRm)}; + Inst(Opcode::A32CoprocSendTwoWords, IR::Value(coproc_info), word1, word2); +} + +IR::U32 IREmitter::CoprocGetOneWord(size_t coproc_no, bool two, size_t opc1, CoprocReg CRn, CoprocReg CRm, size_t opc2) { + ASSERT(coproc_no <= 15); + const IR::Value::CoprocessorInfo coproc_info{static_cast<u8>(coproc_no), + static_cast<u8>(two ? 
1 : 0), + static_cast<u8>(opc1), + static_cast<u8>(CRn), + static_cast<u8>(CRm), + static_cast<u8>(opc2)}; + return Inst<IR::U32>(Opcode::A32CoprocGetOneWord, IR::Value(coproc_info)); +} + +IR::U64 IREmitter::CoprocGetTwoWords(size_t coproc_no, bool two, size_t opc, CoprocReg CRm) { + ASSERT(coproc_no <= 15); + const IR::Value::CoprocessorInfo coproc_info{static_cast<u8>(coproc_no), + static_cast<u8>(two ? 1 : 0), + static_cast<u8>(opc), + static_cast<u8>(CRm)}; + return Inst<IR::U64>(Opcode::A32CoprocGetTwoWords, IR::Value(coproc_info)); +} + +void IREmitter::CoprocLoadWords(size_t coproc_no, bool two, bool long_transfer, CoprocReg CRd, const IR::U32& address, bool has_option, u8 option) { + ASSERT(coproc_no <= 15); + const IR::Value::CoprocessorInfo coproc_info{static_cast<u8>(coproc_no), + static_cast<u8>(two ? 1 : 0), + static_cast<u8>(long_transfer ? 1 : 0), + static_cast<u8>(CRd), + static_cast<u8>(has_option ? 1 : 0), + static_cast<u8>(option)}; + Inst(Opcode::A32CoprocLoadWords, IR::Value(coproc_info), address); +} + +void IREmitter::CoprocStoreWords(size_t coproc_no, bool two, bool long_transfer, CoprocReg CRd, const IR::U32& address, bool has_option, u8 option) { + ASSERT(coproc_no <= 15); + const IR::Value::CoprocessorInfo coproc_info{static_cast<u8>(coproc_no), + static_cast<u8>(two ? 1 : 0), + static_cast<u8>(long_transfer ? 1 : 0), + static_cast<u8>(CRd), + static_cast<u8>(has_option ? 1 : 0), + static_cast<u8>(option)}; + Inst(Opcode::A32CoprocStoreWords, IR::Value(coproc_info), address); +} + +IR::U64 IREmitter::ImmCurrentLocationDescriptor() { + return Imm64(IR::LocationDescriptor{current_location}.Value()); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/a32_ir_emitter.h b/externals/dynarmic/src/dynarmic/frontend/A32/a32_ir_emitter.h new file mode 100644 index 0000000000..9fde4f8775 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/a32_ir_emitter.h @@ -0,0 +1,116 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <utility> + +#include <mcl/stdint.hpp> + +#include "dynarmic/frontend/A32/a32_location_descriptor.h" +#include "dynarmic/ir/ir_emitter.h" +#include "dynarmic/ir/value.h" + +namespace Dynarmic::A32 { + +enum class ArchVersion; +enum class CoprocReg; +enum class Exception; +enum class ExtReg; +enum class Reg; + +/** + * Convenience class to construct a basic block of the intermediate representation. + * `block` is the resulting block. + * The user of this class updates `current_location` as appropriate. 
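+ *
+ * A rough usage sketch (illustrative only; the exact call sites live in the A32 translators,
+ * and `block`, `descriptor` and `arch_version` are whatever the caller already holds):
+ *
+ *     A32::IREmitter ir{block, descriptor, arch_version};
+ *     ir.SetRegister(Reg::R0, ir.Imm32(0));          // r0 := 0
+ *     ir.BranchWritePC(ir.Imm32(ir.PC() + 4));       // continue at the following instruction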
+ */ +class IREmitter : public IR::IREmitter { +public: + IREmitter(IR::Block& block, LocationDescriptor descriptor, ArchVersion arch_version) + : IR::IREmitter(block), current_location(descriptor), arch_version(arch_version) {} + + LocationDescriptor current_location; + + size_t ArchVersion() const; + + u32 PC() const; + u32 AlignPC(size_t alignment) const; + + IR::U32 GetRegister(Reg source_reg); + IR::U32U64 GetExtendedRegister(ExtReg source_reg); + IR::U128 GetVector(ExtReg source_reg); + void SetRegister(Reg dest_reg, const IR::U32& value); + void SetExtendedRegister(ExtReg dest_reg, const IR::U32U64& value); + void SetVector(ExtReg dest_reg, const IR::U128& value); + + void ALUWritePC(const IR::U32& value); + void BranchWritePC(const IR::U32& value); + void BXWritePC(const IR::U32& value); + void LoadWritePC(const IR::U32& value); + void UpdateUpperLocationDescriptor(); + + void CallSupervisor(const IR::U32& value); + void ExceptionRaised(Exception exception); + + IR::U32 GetCpsr(); + void SetCpsr(const IR::U32& value); + void SetCpsrNZCV(const IR::NZCV& value); + void SetCpsrNZCVRaw(const IR::U32& value); + void SetCpsrNZCVQ(const IR::U32& value); + void SetCheckBit(const IR::U1& value); + IR::U1 GetOverflowFrom(const IR::Value& value); + IR::U1 GetCFlag(); + void OrQFlag(const IR::U1& value); + IR::U32 GetGEFlags(); + void SetGEFlags(const IR::U32& value); + void SetGEFlagsCompressed(const IR::U32& value); + + IR::NZCV NZFrom(const IR::Value& value); + void SetCpsrNZ(const IR::NZCV& nz); + void SetCpsrNZC(const IR::NZCV& nz, const IR::U1& c); + + void DataSynchronizationBarrier(); + void DataMemoryBarrier(); + void InstructionSynchronizationBarrier(); + + IR::U32 GetFpscr(); + void SetFpscr(const IR::U32& new_fpscr); + IR::U32 GetFpscrNZCV(); + void SetFpscrNZCV(const IR::NZCV& new_fpscr_nzcv); + + void ClearExclusive(); + IR::UAny ReadMemory(size_t bitsize, const IR::U32& vaddr, IR::AccType acc_type); + IR::U8 ReadMemory8(const IR::U32& vaddr, IR::AccType acc_type); + IR::U16 ReadMemory16(const IR::U32& vaddr, IR::AccType acc_type); + IR::U32 ReadMemory32(const IR::U32& vaddr, IR::AccType acc_type); + IR::U64 ReadMemory64(const IR::U32& vaddr, IR::AccType acc_type); + IR::U8 ExclusiveReadMemory8(const IR::U32& vaddr, IR::AccType acc_type); + IR::U16 ExclusiveReadMemory16(const IR::U32& vaddr, IR::AccType acc_type); + IR::U32 ExclusiveReadMemory32(const IR::U32& vaddr, IR::AccType acc_type); + std::pair<IR::U32, IR::U32> ExclusiveReadMemory64(const IR::U32& vaddr, IR::AccType acc_type); + void WriteMemory(size_t bitsize, const IR::U32& vaddr, const IR::UAny& value, IR::AccType acc_type); + void WriteMemory8(const IR::U32& vaddr, const IR::U8& value, IR::AccType acc_type); + void WriteMemory16(const IR::U32& vaddr, const IR::U16& value, IR::AccType acc_type); + void WriteMemory32(const IR::U32& vaddr, const IR::U32& value, IR::AccType acc_type); + void WriteMemory64(const IR::U32& vaddr, const IR::U64& value, IR::AccType acc_type); + IR::U32 ExclusiveWriteMemory8(const IR::U32& vaddr, const IR::U8& value, IR::AccType acc_type); + IR::U32 ExclusiveWriteMemory16(const IR::U32& vaddr, const IR::U16& value, IR::AccType acc_type); + IR::U32 ExclusiveWriteMemory32(const IR::U32& vaddr, const IR::U32& value, IR::AccType acc_type); + IR::U32 ExclusiveWriteMemory64(const IR::U32& vaddr, const IR::U32& value_lo, const IR::U32& value_hi, IR::AccType acc_type); + + void CoprocInternalOperation(size_t coproc_no, bool two, size_t opc1, CoprocReg CRd, CoprocReg CRn, CoprocReg CRm, size_t opc2); + 
void CoprocSendOneWord(size_t coproc_no, bool two, size_t opc1, CoprocReg CRn, CoprocReg CRm, size_t opc2, const IR::U32& word); + void CoprocSendTwoWords(size_t coproc_no, bool two, size_t opc, CoprocReg CRm, const IR::U32& word1, const IR::U32& word2); + IR::U32 CoprocGetOneWord(size_t coproc_no, bool two, size_t opc1, CoprocReg CRn, CoprocReg CRm, size_t opc2); + IR::U64 CoprocGetTwoWords(size_t coproc_no, bool two, size_t opc, CoprocReg CRm); + void CoprocLoadWords(size_t coproc_no, bool two, bool long_transfer, CoprocReg CRd, const IR::U32& address, bool has_option, u8 option); + void CoprocStoreWords(size_t coproc_no, bool two, bool long_transfer, CoprocReg CRd, const IR::U32& address, bool has_option, u8 option); + +private: + enum ArchVersion arch_version; + IR::U64 ImmCurrentLocationDescriptor(); +}; + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/a32_location_descriptor.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/a32_location_descriptor.cpp new file mode 100644 index 0000000000..ac3a374e0d --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/a32_location_descriptor.cpp @@ -0,0 +1,21 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/a32_location_descriptor.h" + +#include <fmt/format.h> + +namespace Dynarmic::A32 { + +std::string ToString(const LocationDescriptor& descriptor) { + return fmt::format("{{{:08x},{},{},{:08x}{}}}", + descriptor.PC(), + descriptor.TFlag() ? "T" : "!T", + descriptor.EFlag() ? "E" : "!E", + descriptor.FPSCR().Value(), + descriptor.SingleStepping() ? ",step" : ""); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/a32_location_descriptor.h b/externals/dynarmic/src/dynarmic/frontend/A32/a32_location_descriptor.h new file mode 100644 index 0000000000..c53e75d4b7 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/a32_location_descriptor.h @@ -0,0 +1,162 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <functional> +#include <string> +#include <tuple> + +#include <fmt/format.h> +#include <mcl/stdint.hpp> + +#include "dynarmic/frontend/A32/FPSCR.h" +#include "dynarmic/frontend/A32/ITState.h" +#include "dynarmic/frontend/A32/PSR.h" +#include "dynarmic/ir/location_descriptor.h" + +namespace Dynarmic::A32 { + +/** + * LocationDescriptor describes the location of a basic block. + * The location is not solely based on the PC because other flags influence the way + * instructions should be translated. The CPSR.T flag is most notable since it + * tells us if the processor is in Thumb or Arm mode. + */ +class LocationDescriptor { +public: + // Indicates bits that should be preserved within descriptors. 
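+ // Concretely (decoding the constants below): CPSR_MODE_MASK keeps CPSR.T (bit 5), CPSR.E (bit 9)
+ // and the IT[7:0] bits (bits 15:10 and 26:25); FPSCR_MODE_MASK keeps the FPSCR mode bits
+ // AHP, DN, FZ, RMode, Stride and Len. All other bits are dropped, since only these affect
+ // how instructions are translated.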
+ static constexpr u32 CPSR_MODE_MASK = 0x0600FE20; + static constexpr u32 FPSCR_MODE_MASK = 0x07F70000; + + LocationDescriptor(u32 arm_pc, PSR cpsr, FPSCR fpscr, bool single_stepping = false) + : arm_pc(arm_pc) + , cpsr(cpsr.Value() & CPSR_MODE_MASK) + , fpscr(fpscr.Value() & FPSCR_MODE_MASK) + , single_stepping(single_stepping) {} + + explicit LocationDescriptor(const IR::LocationDescriptor& o) { + arm_pc = static_cast<u32>(o.Value()); + cpsr.T((o.Value() >> 32) & 1); + cpsr.E((o.Value() >> 32) & 2); + fpscr = (o.Value() >> 32) & FPSCR_MODE_MASK; + cpsr.IT(ITState{static_cast<u8>(o.Value() >> 40)}); + single_stepping = (o.Value() >> 32) & 4; + } + + u32 PC() const { return arm_pc; } + bool TFlag() const { return cpsr.T(); } + bool EFlag() const { return cpsr.E(); } + ITState IT() const { return cpsr.IT(); } + + A32::PSR CPSR() const { return cpsr; } + A32::FPSCR FPSCR() const { return fpscr; } + + bool SingleStepping() const { return single_stepping; } + + bool operator==(const LocationDescriptor& o) const { + return std::tie(arm_pc, cpsr, fpscr, single_stepping) == std::tie(o.arm_pc, o.cpsr, o.fpscr, o.single_stepping); + } + + bool operator!=(const LocationDescriptor& o) const { + return !operator==(o); + } + + LocationDescriptor SetPC(u32 new_arm_pc) const { + return LocationDescriptor(new_arm_pc, cpsr, fpscr, single_stepping); + } + + LocationDescriptor AdvancePC(int amount) const { + return LocationDescriptor(static_cast<u32>(arm_pc + amount), cpsr, fpscr, single_stepping); + } + + LocationDescriptor SetTFlag(bool new_tflag) const { + PSR new_cpsr = cpsr; + new_cpsr.T(new_tflag); + + return LocationDescriptor(arm_pc, new_cpsr, fpscr, single_stepping); + } + + LocationDescriptor SetEFlag(bool new_eflag) const { + PSR new_cpsr = cpsr; + new_cpsr.E(new_eflag); + + return LocationDescriptor(arm_pc, new_cpsr, fpscr, single_stepping); + } + + LocationDescriptor SetFPSCR(u32 new_fpscr) const { + return LocationDescriptor(arm_pc, cpsr, A32::FPSCR{new_fpscr & FPSCR_MODE_MASK}, single_stepping); + } + + LocationDescriptor SetIT(ITState new_it) const { + PSR new_cpsr = cpsr; + new_cpsr.IT(new_it); + + return LocationDescriptor(arm_pc, new_cpsr, fpscr, single_stepping); + } + + LocationDescriptor AdvanceIT() const { + return SetIT(IT().Advance()); + } + + LocationDescriptor SetSingleStepping(bool new_single_stepping) const { + return LocationDescriptor(arm_pc, cpsr, fpscr, new_single_stepping); + } + + u64 UniqueHash() const noexcept { + // This value MUST BE UNIQUE. + // This calculation has to match up with EmitX64::EmitTerminalPopRSBHint + const u64 pc_u64 = arm_pc; + const u64 fpscr_u64 = fpscr.Value(); + const u64 t_u64 = cpsr.T() ? 1 : 0; + const u64 e_u64 = cpsr.E() ? 2 : 0; + const u64 single_stepping_u64 = single_stepping ? 4 : 0; + const u64 it_u64 = u64(cpsr.IT().Value()) << 8; + const u64 upper = (fpscr_u64 | t_u64 | e_u64 | single_stepping_u64 | it_u64) << 32; + return pc_u64 | upper; + } + + operator IR::LocationDescriptor() const { + return IR::LocationDescriptor{UniqueHash()}; + } + +private: + u32 arm_pc; ///< Current program counter value. + PSR cpsr; ///< Current program status register. + A32::FPSCR fpscr; ///< Floating point status control register. + bool single_stepping; +}; + +/** + * Provides a string representation of a LocationDescriptor. 
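+ * For example, a descriptor at PC 0x00008000 in Thumb, little-endian mode, with FPSCR mode
+ * bits 0x03c00000 and single-stepping enabled is rendered as "{00008000,T,!E,03c00000,step}"
+ * (see the fmt::format call in a32_location_descriptor.cpp).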
+ * + * @param descriptor The descriptor to get a string representation of + */ +std::string ToString(const LocationDescriptor& descriptor); + +} // namespace Dynarmic::A32 + +namespace std { +template<> +struct less<Dynarmic::A32::LocationDescriptor> { + bool operator()(const Dynarmic::A32::LocationDescriptor& x, const Dynarmic::A32::LocationDescriptor& y) const noexcept { + return x.UniqueHash() < y.UniqueHash(); + } +}; +template<> +struct hash<Dynarmic::A32::LocationDescriptor> { + size_t operator()(const Dynarmic::A32::LocationDescriptor& x) const noexcept { + return std::hash<u64>()(x.UniqueHash()); + } +}; +} // namespace std + +template<> +struct fmt::formatter<Dynarmic::A32::LocationDescriptor> : fmt::formatter<std::string> { + template<typename FormatContext> + auto format(Dynarmic::A32::LocationDescriptor descriptor, FormatContext& ctx) const { + return formatter<std::string>::format(Dynarmic::A32::ToString(descriptor), ctx); + } +}; diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/a32_types.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/a32_types.cpp new file mode 100644 index 0000000000..a47ce0b78b --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/a32_types.cpp @@ -0,0 +1,60 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/a32_types.h" + +#include <array> +#include <ostream> + +#include <mcl/bit/bit_field.hpp> + +namespace Dynarmic::A32 { + +const char* CondToString(Cond cond, bool explicit_al) { + static constexpr std::array cond_strs = {"eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc", "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"}; + return (!explicit_al && cond == Cond::AL) ? "" : cond_strs.at(static_cast<size_t>(cond)); +} + +const char* RegToString(Reg reg) { + static constexpr std::array reg_strs = {"r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", + "r8", "r9", "r10", "r11", "r12", "sp", "lr", "pc"}; + return reg_strs.at(static_cast<size_t>(reg)); +} + +const char* ExtRegToString(ExtReg reg) { + static constexpr std::array reg_strs = { + "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s16", + "s17", "s18", "s19", "s20", "s21", "s22", "s23", "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31", + + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", + "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", + + "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "q16"}; + return reg_strs.at(static_cast<size_t>(reg)); +} + +const char* CoprocRegToString(CoprocReg reg) { + static constexpr std::array reg_strs = { + "c0", "c1", "c2", "c3", "c4", "c5", "c6", "c7", "c8", + "c9", "c10", "c11", "c12", "c13", "c14", "c15"}; + return reg_strs.at(static_cast<size_t>(reg)); +} + +std::string RegListToString(RegList reg_list) { + std::string ret; + bool first_reg = true; + for (size_t i = 0; i < 16; i++) { + if (mcl::bit::get_bit(i, reg_list)) { + if (!first_reg) { + ret += ", "; + } + ret += RegToString(static_cast<Reg>(i)); + first_reg = false; + } + } + return ret; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/a32_types.h b/externals/dynarmic/src/dynarmic/frontend/A32/a32_types.h new file mode 100644 index 0000000000..6f56bea51a --- /dev/null +++ 
b/externals/dynarmic/src/dynarmic/frontend/A32/a32_types.h @@ -0,0 +1,177 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <string> +#include <utility> + +#include <fmt/format.h> +#include <mcl/assert.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/interface/A32/coprocessor_util.h" +#include "dynarmic/ir/cond.h" + +namespace Dynarmic::A32 { + +using Cond = IR::Cond; + +enum class Reg { + R0, + R1, + R2, + R3, + R4, + R5, + R6, + R7, + R8, + R9, + R10, + R11, + R12, + R13, + R14, + R15, + SP = R13, + LR = R14, + PC = R15, + INVALID_REG = 99 +}; + +enum class ExtReg { + // clang-format off + S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S30, S31, + D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15, D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, D31, + Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15 + // clang-format on +}; + +using RegList = u16; + +enum class ShiftType { + LSL, + LSR, + ASR, + ROR ///< RRX falls under this too +}; + +enum class SignExtendRotation { + ROR_0, ///< ROR #0 or omitted + ROR_8, ///< ROR #8 + ROR_16, ///< ROR #16 + ROR_24 ///< ROR #24 +}; + +const char* CondToString(Cond cond, bool explicit_al = false); +const char* RegToString(Reg reg); +const char* ExtRegToString(ExtReg reg); +const char* CoprocRegToString(CoprocReg reg); +std::string RegListToString(RegList reg_list); + +constexpr bool IsSingleExtReg(ExtReg reg) { + return reg >= ExtReg::S0 && reg <= ExtReg::S31; +} + +constexpr bool IsDoubleExtReg(ExtReg reg) { + return reg >= ExtReg::D0 && reg <= ExtReg::D31; +} + +constexpr bool IsQuadExtReg(ExtReg reg) { + return reg >= ExtReg::Q0 && reg <= ExtReg::Q15; +} + +inline size_t RegNumber(Reg reg) { + ASSERT(reg != Reg::INVALID_REG); + return static_cast<size_t>(reg); +} + +inline size_t RegNumber(ExtReg reg) { + if (IsSingleExtReg(reg)) { + return static_cast<size_t>(reg) - static_cast<size_t>(ExtReg::S0); + } + + if (IsDoubleExtReg(reg)) { + return static_cast<size_t>(reg) - static_cast<size_t>(ExtReg::D0); + } + + if (IsQuadExtReg(reg)) { + return static_cast<size_t>(reg) - static_cast<size_t>(ExtReg::Q0); + } + + ASSERT_FALSE("Invalid extended register"); +} + +inline Reg operator+(Reg reg, size_t number) { + const size_t new_reg = RegNumber(reg) + number; + ASSERT(new_reg <= 15); + + return static_cast<Reg>(new_reg); +} + +inline ExtReg operator+(ExtReg reg, size_t number) { + const auto new_reg = static_cast<ExtReg>(static_cast<size_t>(reg) + number); + + ASSERT((IsSingleExtReg(reg) && IsSingleExtReg(new_reg)) + || (IsDoubleExtReg(reg) && IsDoubleExtReg(new_reg)) + || (IsQuadExtReg(reg) && IsQuadExtReg(new_reg))); + + return new_reg; +} + +inline ExtReg ToExtRegQ(size_t base, bool bit) { + return ExtReg::Q0 + ((base >> 1) + (bit ? 8 : 0)); +} + +inline ExtReg ToExtRegD(size_t base, bool bit) { + return ExtReg::D0 + (base + (bit ? 16 : 0)); +} + +inline ExtReg ToExtRegS(size_t base, bool bit) { + return ExtReg::S0 + ((base << 1) + (bit ? 1 : 0)); +} + +inline ExtReg ToExtReg(bool sz, size_t base, bool bit) { + return sz ? ToExtRegD(base, bit) : ToExtRegS(base, bit); +} + +inline ExtReg ToVector(bool Q, size_t base, bool bit) { + return Q ? 
ToExtRegQ(base, bit) : ToExtRegD(base, bit); +} + +} // namespace Dynarmic::A32 + +template<> +struct fmt::formatter<Dynarmic::A32::Reg> : fmt::formatter<const char*> { + template<typename FormatContext> + auto format(Dynarmic::A32::Reg reg, FormatContext& ctx) const { + return formatter<const char*>::format(Dynarmic::A32::RegToString(reg), ctx); + } +}; + +template<> +struct fmt::formatter<Dynarmic::A32::ExtReg> : fmt::formatter<const char*> { + template<typename FormatContext> + auto format(Dynarmic::A32::ExtReg reg, FormatContext& ctx) const { + return formatter<const char*>::format(Dynarmic::A32::ExtRegToString(reg), ctx); + } +}; + +template<> +struct fmt::formatter<Dynarmic::A32::CoprocReg> : fmt::formatter<const char*> { + template<typename FormatContext> + auto format(Dynarmic::A32::CoprocReg reg, FormatContext& ctx) const { + return formatter<const char*>::format(Dynarmic::A32::CoprocRegToString(reg), ctx); + } +}; + +template<> +struct fmt::formatter<Dynarmic::A32::RegList> : fmt::formatter<std::string> { + template<typename FormatContext> + auto format(Dynarmic::A32::RegList reg_list, FormatContext& ctx) const { + return formatter<std::string>::format(Dynarmic::A32::RegListToString(reg_list), ctx); + } +}; diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/decoder/arm.h b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/arm.h new file mode 100644 index 0000000000..16ae52e13a --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/arm.h @@ -0,0 +1,74 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + * + * Original version of table by Lioncash. + */ + +#pragma once + +#include <algorithm> +#include <functional> +#include <optional> +#include <vector> + +#include <mcl/bit/bit_count.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/frontend/decoder/decoder_detail.h" +#include "dynarmic/frontend/decoder/matcher.h" + +namespace Dynarmic::A32 { + +template<typename Visitor> +using ArmMatcher = Decoder::Matcher<Visitor, u32>; + +template<typename Visitor> +using ArmDecodeTable = std::array<std::vector<ArmMatcher<Visitor>>, 0x1000>; + +namespace detail { +inline size_t ToFastLookupIndexArm(u32 instruction) { + return ((instruction >> 4) & 0x00F) | ((instruction >> 16) & 0xFF0); +} +} // namespace detail + +template<typename V> +ArmDecodeTable<V> GetArmDecodeTable() { + std::vector<ArmMatcher<V>> list = { + +#define INST(fn, name, bitstring) DYNARMIC_DECODER_GET_MATCHER(ArmMatcher, fn, name, Decoder::detail::StringToArray<32>(bitstring)), +#include "./arm.inc" +#undef INST + + }; + + // If a matcher has more bits in its mask it is more specific, so it should come first. 
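+ // e.g. SEV ("----0011001000001111000000000100", 28 mask bits) is placed ahead of the
+ // "Reserved Hint" catch-all ("----0011001000001111------------", 16 mask bits), so the
+ // specific hint encodings are tried before the pattern that would also match them.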
+ std::stable_sort(list.begin(), list.end(), [](const auto& matcher1, const auto& matcher2) { + return mcl::bit::count_ones(matcher1.GetMask()) > mcl::bit::count_ones(matcher2.GetMask()); + }); + + ArmDecodeTable<V> table{}; + for (size_t i = 0; i < table.size(); ++i) { + for (auto matcher : list) { + const auto expect = detail::ToFastLookupIndexArm(matcher.GetExpected()); + const auto mask = detail::ToFastLookupIndexArm(matcher.GetMask()); + if ((i & mask) == expect) { + table[i].push_back(matcher); + } + } + } + return table; +} + +template<typename V> +std::optional<std::reference_wrapper<const ArmMatcher<V>>> DecodeArm(u32 instruction) { + static const auto table = GetArmDecodeTable<V>(); + + const auto matches_instruction = [instruction](const auto& matcher) { return matcher.Matches(instruction); }; + + const auto& subtable = table[detail::ToFastLookupIndexArm(instruction)]; + auto iter = std::find_if(subtable.begin(), subtable.end(), matches_instruction); + return iter != subtable.end() ? std::optional<std::reference_wrapper<const ArmMatcher<V>>>(*iter) : std::nullopt; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/decoder/arm.inc b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/arm.inc new file mode 100644 index 0000000000..90995c4c05 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/arm.inc @@ -0,0 +1,316 @@ +// Barrier instructions +INST(arm_DMB, "DMB", "1111010101111111111100000101oooo") // v7 +INST(arm_DSB, "DSB", "1111010101111111111100000100oooo") // v7 +INST(arm_ISB, "ISB", "1111010101111111111100000110oooo") // v7 + +// Branch instructions +INST(arm_BLX_imm, "BLX (imm)", "1111101hvvvvvvvvvvvvvvvvvvvvvvvv") // v5 +INST(arm_BLX_reg, "BLX (reg)", "cccc000100101111111111110011mmmm") // v5 +INST(arm_B, "B", "cccc1010vvvvvvvvvvvvvvvvvvvvvvvv") // v1 +INST(arm_BL, "BL", "cccc1011vvvvvvvvvvvvvvvvvvvvvvvv") // v1 +INST(arm_BX, "BX", "cccc000100101111111111110001mmmm") // v4T +INST(arm_BXJ, "BXJ", "cccc000100101111111111110010mmmm") // v5J + +// CRC32 instructions +INST(arm_CRC32, "CRC32", "cccc00010zz0nnnndddd00000100mmmm") // v8 +INST(arm_CRC32C, "CRC32C", "cccc00010zz0nnnndddd00100100mmmm") // v8 + +// Coprocessor instructions +INST(arm_CDP, "CDP", "cccc1110ooooNNNNDDDDppppooo0MMMM") // v2 (CDP2: v5) +INST(arm_LDC, "LDC", "cccc110pudw1nnnnDDDDppppvvvvvvvv") // v2 (LDC2: v5) +INST(arm_MCR, "MCR", "cccc1110ooo0NNNNttttppppooo1MMMM") // v2 (MCR2: v5) +INST(arm_MCRR, "MCRR", "cccc11000100uuuuttttppppooooMMMM") // v5E (MCRR2: v6) +INST(arm_MRC, "MRC", "cccc1110ooo1NNNNttttppppooo1MMMM") // v2 (MRC2: v5) +INST(arm_MRRC, "MRRC", "cccc11000101uuuuttttppppooooMMMM") // v5E (MRRC2: v6) +INST(arm_STC, "STC", "cccc110pudw0nnnnDDDDppppvvvvvvvv") // v2 (STC2: v5) + +// Data Processing instructions +INST(arm_ADC_imm, "ADC (imm)", "cccc0010101Snnnnddddrrrrvvvvvvvv") // v1 +INST(arm_ADC_reg, "ADC (reg)", "cccc0000101Snnnnddddvvvvvrr0mmmm") // v1 +INST(arm_ADC_rsr, "ADC (rsr)", "cccc0000101Snnnnddddssss0rr1mmmm") // v1 +INST(arm_ADD_imm, "ADD (imm)", "cccc0010100Snnnnddddrrrrvvvvvvvv") // v1 +INST(arm_ADD_reg, "ADD (reg)", "cccc0000100Snnnnddddvvvvvrr0mmmm") // v1 +INST(arm_ADD_rsr, "ADD (rsr)", "cccc0000100Snnnnddddssss0rr1mmmm") // v1 +INST(arm_AND_imm, "AND (imm)", "cccc0010000Snnnnddddrrrrvvvvvvvv") // v1 +INST(arm_AND_reg, "AND (reg)", "cccc0000000Snnnnddddvvvvvrr0mmmm") // v1 +INST(arm_AND_rsr, "AND (rsr)", "cccc0000000Snnnnddddssss0rr1mmmm") // v1 +INST(arm_BIC_imm, "BIC (imm)", "cccc0011110Snnnnddddrrrrvvvvvvvv") // 
v1 +INST(arm_BIC_reg, "BIC (reg)", "cccc0001110Snnnnddddvvvvvrr0mmmm") // v1 +INST(arm_BIC_rsr, "BIC (rsr)", "cccc0001110Snnnnddddssss0rr1mmmm") // v1 +INST(arm_CMN_imm, "CMN (imm)", "cccc00110111nnnn0000rrrrvvvvvvvv") // v1 +INST(arm_CMN_reg, "CMN (reg)", "cccc00010111nnnn0000vvvvvrr0mmmm") // v1 +INST(arm_CMN_rsr, "CMN (rsr)", "cccc00010111nnnn0000ssss0rr1mmmm") // v1 +INST(arm_CMP_imm, "CMP (imm)", "cccc00110101nnnn0000rrrrvvvvvvvv") // v1 +INST(arm_CMP_reg, "CMP (reg)", "cccc00010101nnnn0000vvvvvrr0mmmm") // v1 +INST(arm_CMP_rsr, "CMP (rsr)", "cccc00010101nnnn0000ssss0rr1mmmm") // v1 +INST(arm_EOR_imm, "EOR (imm)", "cccc0010001Snnnnddddrrrrvvvvvvvv") // v1 +INST(arm_EOR_reg, "EOR (reg)", "cccc0000001Snnnnddddvvvvvrr0mmmm") // v1 +INST(arm_EOR_rsr, "EOR (rsr)", "cccc0000001Snnnnddddssss0rr1mmmm") // v1 +INST(arm_MOV_imm, "MOV (imm)", "cccc0011101S0000ddddrrrrvvvvvvvv") // v1 +INST(arm_MOV_reg, "MOV (reg)", "cccc0001101S0000ddddvvvvvrr0mmmm") // v1 +INST(arm_MOV_rsr, "MOV (rsr)", "cccc0001101S0000ddddssss0rr1mmmm") // v1 +INST(arm_MVN_imm, "MVN (imm)", "cccc0011111S0000ddddrrrrvvvvvvvv") // v1 +INST(arm_MVN_reg, "MVN (reg)", "cccc0001111S0000ddddvvvvvrr0mmmm") // v1 +INST(arm_MVN_rsr, "MVN (rsr)", "cccc0001111S0000ddddssss0rr1mmmm") // v1 +INST(arm_ORR_imm, "ORR (imm)", "cccc0011100Snnnnddddrrrrvvvvvvvv") // v1 +INST(arm_ORR_reg, "ORR (reg)", "cccc0001100Snnnnddddvvvvvrr0mmmm") // v1 +INST(arm_ORR_rsr, "ORR (rsr)", "cccc0001100Snnnnddddssss0rr1mmmm") // v1 +INST(arm_RSB_imm, "RSB (imm)", "cccc0010011Snnnnddddrrrrvvvvvvvv") // v1 +INST(arm_RSB_reg, "RSB (reg)", "cccc0000011Snnnnddddvvvvvrr0mmmm") // v1 +INST(arm_RSB_rsr, "RSB (rsr)", "cccc0000011Snnnnddddssss0rr1mmmm") // v1 +INST(arm_RSC_imm, "RSC (imm)", "cccc0010111Snnnnddddrrrrvvvvvvvv") // v1 +INST(arm_RSC_reg, "RSC (reg)", "cccc0000111Snnnnddddvvvvvrr0mmmm") // v1 +INST(arm_RSC_rsr, "RSC (rsr)", "cccc0000111Snnnnddddssss0rr1mmmm") // v1 +INST(arm_SBC_imm, "SBC (imm)", "cccc0010110Snnnnddddrrrrvvvvvvvv") // v1 +INST(arm_SBC_reg, "SBC (reg)", "cccc0000110Snnnnddddvvvvvrr0mmmm") // v1 +INST(arm_SBC_rsr, "SBC (rsr)", "cccc0000110Snnnnddddssss0rr1mmmm") // v1 +INST(arm_SUB_imm, "SUB (imm)", "cccc0010010Snnnnddddrrrrvvvvvvvv") // v1 +INST(arm_SUB_reg, "SUB (reg)", "cccc0000010Snnnnddddvvvvvrr0mmmm") // v1 +INST(arm_SUB_rsr, "SUB (rsr)", "cccc0000010Snnnnddddssss0rr1mmmm") // v1 +INST(arm_TEQ_imm, "TEQ (imm)", "cccc00110011nnnn0000rrrrvvvvvvvv") // v1 +INST(arm_TEQ_reg, "TEQ (reg)", "cccc00010011nnnn0000vvvvvrr0mmmm") // v1 +INST(arm_TEQ_rsr, "TEQ (rsr)", "cccc00010011nnnn0000ssss0rr1mmmm") // v1 +INST(arm_TST_imm, "TST (imm)", "cccc00110001nnnn0000rrrrvvvvvvvv") // v1 +INST(arm_TST_reg, "TST (reg)", "cccc00010001nnnn0000vvvvvrr0mmmm") // v1 +INST(arm_TST_rsr, "TST (rsr)", "cccc00010001nnnn0000ssss0rr1mmmm") // v1 + +// Exception Generating instructions +INST(arm_BKPT, "BKPT", "cccc00010010vvvvvvvvvvvv0111vvvv") // v5 +INST(arm_SVC, "SVC", "cccc1111vvvvvvvvvvvvvvvvvvvvvvvv") // v1 +INST(arm_UDF, "UDF", "111001111111------------1111----") + +// Extension instructions +INST(arm_SXTB, "SXTB", "cccc011010101111ddddrr000111mmmm") // v6 +INST(arm_SXTB16, "SXTB16", "cccc011010001111ddddrr000111mmmm") // v6 +INST(arm_SXTH, "SXTH", "cccc011010111111ddddrr000111mmmm") // v6 +INST(arm_SXTAB, "SXTAB", "cccc01101010nnnnddddrr000111mmmm") // v6 +INST(arm_SXTAB16, "SXTAB16", "cccc01101000nnnnddddrr000111mmmm") // v6 +INST(arm_SXTAH, "SXTAH", "cccc01101011nnnnddddrr000111mmmm") // v6 +INST(arm_UXTB, "UXTB", "cccc011011101111ddddrr000111mmmm") // v6 
+INST(arm_UXTB16, "UXTB16", "cccc011011001111ddddrr000111mmmm") // v6 +INST(arm_UXTH, "UXTH", "cccc011011111111ddddrr000111mmmm") // v6 +INST(arm_UXTAB, "UXTAB", "cccc01101110nnnnddddrr000111mmmm") // v6 +INST(arm_UXTAB16, "UXTAB16", "cccc01101100nnnnddddrr000111mmmm") // v6 +INST(arm_UXTAH, "UXTAH", "cccc01101111nnnnddddrr000111mmmm") // v6 + +// Hint instructions +INST(arm_PLD_imm, "PLD (imm)", "11110101uz01nnnn1111iiiiiiiiiiii") // v5E for PLD; v7 for PLDW +INST(arm_PLD_reg, "PLD (reg)", "11110111uz01nnnn1111iiiiitt0mmmm") // v5E for PLD; v7 for PLDW +INST(arm_SEV, "SEV", "----0011001000001111000000000100") // v6K +INST(arm_SEVL, "SEVL", "----0011001000001111000000000101") // v8 +INST(arm_WFE, "WFE", "----0011001000001111000000000010") // v6K +INST(arm_WFI, "WFI", "----0011001000001111000000000011") // v6K +INST(arm_YIELD, "YIELD", "----0011001000001111000000000001") // v6K +INST(arm_NOP, "Reserved Hint", "----0011001000001111------------") +INST(arm_NOP, "Reserved Hint", "----001100100000111100000000----") + +// Synchronization Primitive instructions +INST(arm_CLREX, "CLREX", "11110101011111111111000000011111") // v6K +INST(arm_SWP, "SWP", "cccc00010000nnnntttt00001001uuuu") // v2S (v6: Deprecated) +INST(arm_SWPB, "SWPB", "cccc00010100nnnntttt00001001uuuu") // v2S (v6: Deprecated) +INST(arm_STL, "STL", "cccc00011000nnnn111111001001tttt") // v8 +INST(arm_STLEX, "STLEX", "cccc00011000nnnndddd11101001tttt") // v8 +INST(arm_STREX, "STREX", "cccc00011000nnnndddd11111001mmmm") // v6 +INST(arm_LDA, "LDA", "cccc00011001nnnndddd110010011111") // v8 +INST(arm_LDAEX, "LDAEX", "cccc00011001nnnndddd111010011111") // v8 +INST(arm_LDREX, "LDREX", "cccc00011001nnnndddd111110011111") // v6 +INST(arm_STLEXD, "STLEXD", "cccc00011010nnnndddd11101001mmmm") // v8 +INST(arm_STREXD, "STREXD", "cccc00011010nnnndddd11111001mmmm") // v6K +INST(arm_LDAEXD, "LDAEXD", "cccc00011011nnnndddd111010011111") // v8 +INST(arm_LDREXD, "LDREXD", "cccc00011011nnnndddd111110011111") // v6K +INST(arm_STLB, "STLB", "cccc00011100nnnn111111001001tttt") // v8 +INST(arm_STLEXB, "STLEXB", "cccc00011100nnnndddd11101001mmmm") // v8 +INST(arm_STREXB, "STREXB", "cccc00011100nnnndddd11111001mmmm") // v6K +INST(arm_LDAB, "LDAB", "cccc00011101nnnndddd110010011111") // v8 +INST(arm_LDAEXB, "LDAEXB", "cccc00011101nnnndddd111010011111") // v8 +INST(arm_LDREXB, "LDREXB", "cccc00011101nnnndddd111110011111") // v6K +INST(arm_STLH, "STLH", "cccc00011110nnnn111111001001mmmm") // v8 +INST(arm_STLEXH, "STLEXH", "cccc00011110nnnndddd11101001mmmm") // v8 +INST(arm_STREXH, "STREXH", "cccc00011110nnnndddd11111001mmmm") // v6K +INST(arm_LDAH, "LDAH", "cccc00011111nnnndddd110010011111") // v8 +INST(arm_LDAEXH, "LDAEXH", "cccc00011111nnnndddd111010011111") // v8 +INST(arm_LDREXH, "LDREXH", "cccc00011111nnnndddd111110011111") // v6K + +// Load/Store instructions +INST(arm_LDRBT, "LDRBT (A1)", "----0100-111--------------------") // v1 +INST(arm_LDRBT, "LDRBT (A2)", "----0110-111---------------0----") // v1 +INST(arm_LDRHT, "LDRHT (A1)", "----0000-111------------1011----") // v6T2 +INST(arm_LDRHT, "LDRHT (A1)", "----0000-1111111--------1011----") // v6T2 +INST(arm_LDRHT, "LDRHT (A2)", "----0000-011--------00001011----") // v6T2 +INST(arm_LDRSBT, "LDRSBT (A1)", "----0000-111------------1101----") // v6T2 +INST(arm_LDRSBT, "LDRSBT (A2)", "----0000-011--------00001101----") // v6T2 +INST(arm_LDRSHT, "LDRSHT (A1)", "----0000-111------------1111----") // v6T2 +INST(arm_LDRSHT, "LDRSHT (A2)", "----0000-011--------00001111----") // v6T2 +INST(arm_LDRT, "LDRT (A1)", 
"----0100-011--------------------") // v1 +INST(arm_LDRT, "LDRT (A2)", "----0110-011---------------0----") // v1 +INST(arm_STRBT, "STRBT (A1)", "----0100-110--------------------") // v1 +INST(arm_STRBT, "STRBT (A2)", "----0110-110---------------0----") // v1 +INST(arm_STRHT, "STRHT (A1)", "----0000-110------------1011----") // v6T2 +INST(arm_STRHT, "STRHT (A2)", "----0000-010--------00001011----") // v6T2 +INST(arm_STRT, "STRT (A1)", "----0100-010--------------------") // v1 +INST(arm_STRT, "STRT (A2)", "----0110-010---------------0----") // v1 +INST(arm_LDR_lit, "LDR (lit)", "cccc0101u0011111ttttvvvvvvvvvvvv") // v1 +INST(arm_LDR_imm, "LDR (imm)", "cccc010pu0w1nnnnttttvvvvvvvvvvvv") // v1 +INST(arm_LDR_reg, "LDR (reg)", "cccc011pu0w1nnnnttttvvvvvrr0mmmm") // v1 +INST(arm_LDRB_lit, "LDRB (lit)", "cccc0101u1011111ttttvvvvvvvvvvvv") // v1 +INST(arm_LDRB_imm, "LDRB (imm)", "cccc010pu1w1nnnnttttvvvvvvvvvvvv") // v1 +INST(arm_LDRB_reg, "LDRB (reg)", "cccc011pu1w1nnnnttttvvvvvrr0mmmm") // v1 +INST(arm_LDRD_lit, "LDRD (lit)", "cccc0001u1001111ttttvvvv1101vvvv") // v5E +INST(arm_LDRD_imm, "LDRD (imm)", "cccc000pu1w0nnnnttttvvvv1101vvvv") // v5E +INST(arm_LDRD_reg, "LDRD (reg)", "cccc000pu0w0nnnntttt00001101mmmm") // v5E +INST(arm_LDRH_lit, "LDRH (lit)", "cccc000pu1w11111ttttvvvv1011vvvv") // v4 +INST(arm_LDRH_imm, "LDRH (imm)", "cccc000pu1w1nnnnttttvvvv1011vvvv") // v4 +INST(arm_LDRH_reg, "LDRH (reg)", "cccc000pu0w1nnnntttt00001011mmmm") // v4 +INST(arm_LDRSB_lit, "LDRSB (lit)", "cccc0001u1011111ttttvvvv1101vvvv") // v4 +INST(arm_LDRSB_imm, "LDRSB (imm)", "cccc000pu1w1nnnnttttvvvv1101vvvv") // v4 +INST(arm_LDRSB_reg, "LDRSB (reg)", "cccc000pu0w1nnnntttt00001101mmmm") // v4 +INST(arm_LDRSH_lit, "LDRSH (lit)", "cccc0001u1011111ttttvvvv1111vvvv") // v4 +INST(arm_LDRSH_imm, "LDRSH (imm)", "cccc000pu1w1nnnnttttvvvv1111vvvv") // v4 +INST(arm_LDRSH_reg, "LDRSH (reg)", "cccc000pu0w1nnnntttt00001111mmmm") // v4 +INST(arm_STR_imm, "STR (imm)", "cccc010pu0w0nnnnttttvvvvvvvvvvvv") // v1 +INST(arm_STR_reg, "STR (reg)", "cccc011pu0w0nnnnttttvvvvvrr0mmmm") // v1 +INST(arm_STRB_imm, "STRB (imm)", "cccc010pu1w0nnnnttttvvvvvvvvvvvv") // v1 +INST(arm_STRB_reg, "STRB (reg)", "cccc011pu1w0nnnnttttvvvvvrr0mmmm") // v1 +INST(arm_STRD_imm, "STRD (imm)", "cccc000pu1w0nnnnttttvvvv1111vvvv") // v5E +INST(arm_STRD_reg, "STRD (reg)", "cccc000pu0w0nnnntttt00001111mmmm") // v5E +INST(arm_STRH_imm, "STRH (imm)", "cccc000pu1w0nnnnttttvvvv1011vvvv") // v4 +INST(arm_STRH_reg, "STRH (reg)", "cccc000pu0w0nnnntttt00001011mmmm") // v4 + +// Load/Store Multiple instructions +INST(arm_LDM, "LDM", "cccc100010w1nnnnxxxxxxxxxxxxxxxx") // v1 +INST(arm_LDMDA, "LDMDA", "cccc100000w1nnnnxxxxxxxxxxxxxxxx") // v1 +INST(arm_LDMDB, "LDMDB", "cccc100100w1nnnnxxxxxxxxxxxxxxxx") // v1 +INST(arm_LDMIB, "LDMIB", "cccc100110w1nnnnxxxxxxxxxxxxxxxx") // v1 +INST(arm_LDM_usr, "LDM (usr reg)", "----100--101--------------------") // v1 +INST(arm_LDM_eret, "LDM (exce ret)", "----100--1-1----1---------------") // v1 +INST(arm_STM, "STM", "cccc100010w0nnnnxxxxxxxxxxxxxxxx") // v1 +INST(arm_STMDA, "STMDA", "cccc100000w0nnnnxxxxxxxxxxxxxxxx") // v1 +INST(arm_STMDB, "STMDB", "cccc100100w0nnnnxxxxxxxxxxxxxxxx") // v1 +INST(arm_STMIB, "STMIB", "cccc100110w0nnnnxxxxxxxxxxxxxxxx") // v1 +INST(arm_STM_usr, "STM (usr reg)", "----100--100--------------------") // v1 + +// Miscellaneous instructions +INST(arm_BFC, "BFC", "cccc0111110vvvvvddddvvvvv0011111") // v6T2 +INST(arm_BFI, "BFI", "cccc0111110vvvvvddddvvvvv001nnnn") // v6T2 +INST(arm_CLZ, "CLZ", 
"cccc000101101111dddd11110001mmmm") // v5 +INST(arm_MOVT, "MOVT", "cccc00110100vvvvddddvvvvvvvvvvvv") // v6T2 +INST(arm_MOVW, "MOVW", "cccc00110000vvvvddddvvvvvvvvvvvv") // v6T2 +INST(arm_NOP, "NOP", "----0011001000001111000000000000") // v6K +INST(arm_SBFX, "SBFX", "cccc0111101wwwwwddddvvvvv101nnnn") // v6T2 +INST(arm_SEL, "SEL", "cccc01101000nnnndddd11111011mmmm") // v6 +INST(arm_UBFX, "UBFX", "cccc0111111wwwwwddddvvvvv101nnnn") // v6T2 + +// Unsigned Sum of Absolute Differences instructions +INST(arm_USAD8, "USAD8", "cccc01111000dddd1111mmmm0001nnnn") // v6 +INST(arm_USADA8, "USADA8", "cccc01111000ddddaaaammmm0001nnnn") // v6 + +// Packing instructions +INST(arm_PKHBT, "PKHBT", "cccc01101000nnnnddddvvvvv001mmmm") // v6K +INST(arm_PKHTB, "PKHTB", "cccc01101000nnnnddddvvvvv101mmmm") // v6K + +// Reversal instructions +INST(arm_RBIT, "RBIT", "cccc011011111111dddd11110011mmmm") // v6T2 +INST(arm_REV, "REV", "cccc011010111111dddd11110011mmmm") // v6 +INST(arm_REV16, "REV16", "cccc011010111111dddd11111011mmmm") // v6 +INST(arm_REVSH, "REVSH", "cccc011011111111dddd11111011mmmm") // v6 + +// Saturation instructions +INST(arm_SSAT, "SSAT", "cccc0110101vvvvvddddvvvvvr01nnnn") // v6 +INST(arm_SSAT16, "SSAT16", "cccc01101010vvvvdddd11110011nnnn") // v6 +INST(arm_USAT, "USAT", "cccc0110111vvvvvddddvvvvvr01nnnn") // v6 +INST(arm_USAT16, "USAT16", "cccc01101110vvvvdddd11110011nnnn") // v6 + +// Divide instructions +INST(arm_SDIV, "SDIV", "cccc01110001dddd1111mmmm0001nnnn") // v7a +INST(arm_UDIV, "UDIV", "cccc01110011dddd1111mmmm0001nnnn") // v7a + +// Multiply (Normal) instructions +INST(arm_MLA, "MLA", "cccc0000001Sddddaaaammmm1001nnnn") // v2 +INST(arm_MLS, "MLS", "cccc00000110ddddaaaammmm1001nnnn") // v6T2 +INST(arm_MUL, "MUL", "cccc0000000Sdddd0000mmmm1001nnnn") // v2 + +// Multiply (Long) instructions +INST(arm_SMLAL, "SMLAL", "cccc0000111Sddddaaaammmm1001nnnn") // v3M +INST(arm_SMULL, "SMULL", "cccc0000110Sddddaaaammmm1001nnnn") // v3M +INST(arm_UMAAL, "UMAAL", "cccc00000100ddddaaaammmm1001nnnn") // v6 +INST(arm_UMLAL, "UMLAL", "cccc0000101Sddddaaaammmm1001nnnn") // v3M +INST(arm_UMULL, "UMULL", "cccc0000100Sddddaaaammmm1001nnnn") // v3M + +// Multiply (Halfword) instructions +INST(arm_SMLALxy, "SMLALXY", "cccc00010100ddddaaaammmm1xy0nnnn") // v5xP +INST(arm_SMLAxy, "SMLAXY", "cccc00010000ddddaaaammmm1xy0nnnn") // v5xP +INST(arm_SMULxy, "SMULXY", "cccc00010110dddd0000mmmm1xy0nnnn") // v5xP + +// Multiply (Word by Halfword) instructions +INST(arm_SMLAWy, "SMLAWY", "cccc00010010ddddaaaammmm1y00nnnn") // v5xP +INST(arm_SMULWy, "SMULWY", "cccc00010010dddd0000mmmm1y10nnnn") // v5xP + +// Multiply (Most Significant Word) instructions +INST(arm_SMMUL, "SMMUL", "cccc01110101dddd1111mmmm00R1nnnn") // v6 +INST(arm_SMMLA, "SMMLA", "cccc01110101ddddaaaammmm00R1nnnn") // v6 +INST(arm_SMMLS, "SMMLS", "cccc01110101ddddaaaammmm11R1nnnn") // v6 + +// Multiply (Dual) instructions +INST(arm_SMLAD, "SMLAD", "cccc01110000ddddaaaammmm00M1nnnn") // v6 +INST(arm_SMLALD, "SMLALD", "cccc01110100ddddaaaammmm00M1nnnn") // v6 +INST(arm_SMLSD, "SMLSD", "cccc01110000ddddaaaammmm01M1nnnn") // v6 +INST(arm_SMLSLD, "SMLSLD", "cccc01110100ddddaaaammmm01M1nnnn") // v6 +INST(arm_SMUAD, "SMUAD", "cccc01110000dddd1111mmmm00M1nnnn") // v6 +INST(arm_SMUSD, "SMUSD", "cccc01110000dddd1111mmmm01M1nnnn") // v6 + +// Parallel Add/Subtract (Modulo) instructions +INST(arm_SADD8, "SADD8", "cccc01100001nnnndddd11111001mmmm") // v6 +INST(arm_SADD16, "SADD16", "cccc01100001nnnndddd11110001mmmm") // v6 +INST(arm_SASX, "SASX", 
"cccc01100001nnnndddd11110011mmmm") // v6 +INST(arm_SSAX, "SSAX", "cccc01100001nnnndddd11110101mmmm") // v6 +INST(arm_SSUB8, "SSUB8", "cccc01100001nnnndddd11111111mmmm") // v6 +INST(arm_SSUB16, "SSUB16", "cccc01100001nnnndddd11110111mmmm") // v6 +INST(arm_UADD8, "UADD8", "cccc01100101nnnndddd11111001mmmm") // v6 +INST(arm_UADD16, "UADD16", "cccc01100101nnnndddd11110001mmmm") // v6 +INST(arm_UASX, "UASX", "cccc01100101nnnndddd11110011mmmm") // v6 +INST(arm_USAX, "USAX", "cccc01100101nnnndddd11110101mmmm") // v6 +INST(arm_USUB8, "USUB8", "cccc01100101nnnndddd11111111mmmm") // v6 +INST(arm_USUB16, "USUB16", "cccc01100101nnnndddd11110111mmmm") // v6 + +// Parallel Add/Subtract (Saturating) instructions +INST(arm_QADD8, "QADD8", "cccc01100010nnnndddd11111001mmmm") // v6 +INST(arm_QADD16, "QADD16", "cccc01100010nnnndddd11110001mmmm") // v6 +INST(arm_QASX, "QASX", "cccc01100010nnnndddd11110011mmmm") // v6 +INST(arm_QSAX, "QSAX", "cccc01100010nnnndddd11110101mmmm") // v6 +INST(arm_QSUB8, "QSUB8", "cccc01100010nnnndddd11111111mmmm") // v6 +INST(arm_QSUB16, "QSUB16", "cccc01100010nnnndddd11110111mmmm") // v6 +INST(arm_UQADD8, "UQADD8", "cccc01100110nnnndddd11111001mmmm") // v6 +INST(arm_UQADD16, "UQADD16", "cccc01100110nnnndddd11110001mmmm") // v6 +INST(arm_UQASX, "UQASX", "cccc01100110nnnndddd11110011mmmm") // v6 +INST(arm_UQSAX, "UQSAX", "cccc01100110nnnndddd11110101mmmm") // v6 +INST(arm_UQSUB8, "UQSUB8", "cccc01100110nnnndddd11111111mmmm") // v6 +INST(arm_UQSUB16, "UQSUB16", "cccc01100110nnnndddd11110111mmmm") // v6 + +// Parallel Add/Subtract (Halving) instructions +INST(arm_SHADD8, "SHADD8", "cccc01100011nnnndddd11111001mmmm") // v6 +INST(arm_SHADD16, "SHADD16", "cccc01100011nnnndddd11110001mmmm") // v6 +INST(arm_SHASX, "SHASX", "cccc01100011nnnndddd11110011mmmm") // v6 +INST(arm_SHSAX, "SHSAX", "cccc01100011nnnndddd11110101mmmm") // v6 +INST(arm_SHSUB8, "SHSUB8", "cccc01100011nnnndddd11111111mmmm") // v6 +INST(arm_SHSUB16, "SHSUB16", "cccc01100011nnnndddd11110111mmmm") // v6 +INST(arm_UHADD8, "UHADD8", "cccc01100111nnnndddd11111001mmmm") // v6 +INST(arm_UHADD16, "UHADD16", "cccc01100111nnnndddd11110001mmmm") // v6 +INST(arm_UHASX, "UHASX", "cccc01100111nnnndddd11110011mmmm") // v6 +INST(arm_UHSAX, "UHSAX", "cccc01100111nnnndddd11110101mmmm") // v6 +INST(arm_UHSUB8, "UHSUB8", "cccc01100111nnnndddd11111111mmmm") // v6 +INST(arm_UHSUB16, "UHSUB16", "cccc01100111nnnndddd11110111mmmm") // v6 + +// Saturated Add/Subtract instructions +INST(arm_QADD, "QADD", "cccc00010000nnnndddd00000101mmmm") // v5xP +INST(arm_QSUB, "QSUB", "cccc00010010nnnndddd00000101mmmm") // v5xP +INST(arm_QDADD, "QDADD", "cccc00010100nnnndddd00000101mmmm") // v5xP +INST(arm_QDSUB, "QDSUB", "cccc00010110nnnndddd00000101mmmm") // v5xP + +// Status Register Access instructions +INST(arm_CPS, "CPS", "111100010000---00000000---0-----") // v6 +INST(arm_SETEND, "SETEND", "1111000100000001000000e000000000") // v6 +INST(arm_MRS, "MRS", "cccc000100001111dddd000000000000") // v3 +INST(arm_MSR_imm, "MSR (imm)", "cccc00110010mmmm1111rrrrvvvvvvvv") // v3 +INST(arm_MSR_reg, "MSR (reg)", "cccc00010010mmmm111100000000nnnn") // v3 +INST(arm_RFE, "RFE", "1111100--0-1----0000101000000000") // v6 +INST(arm_SRS, "SRS", "1111100--1-0110100000101000-----") // v6 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/decoder/asimd.h b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/asimd.h new file mode 100644 index 0000000000..cfcd28c6e6 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/asimd.h @@ -0,0 +1,78 @@ +/* This 
file is part of the dynarmic project. + * Copyright (c) 2020 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <algorithm> +#include <functional> +#include <optional> +#include <set> +#include <vector> + +#include <mcl/bit/bit_count.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/frontend/decoder/decoder_detail.h" +#include "dynarmic/frontend/decoder/matcher.h" + +namespace Dynarmic::A32 { + +template<typename Visitor> +using ASIMDMatcher = Decoder::Matcher<Visitor, u32>; + +template<typename V> +std::vector<ASIMDMatcher<V>> GetASIMDDecodeTable() { + std::vector<ASIMDMatcher<V>> table = { + +#define INST(fn, name, bitstring) DYNARMIC_DECODER_GET_MATCHER(ASIMDMatcher, fn, name, Decoder::detail::StringToArray<32>(bitstring)), +#include "./asimd.inc" +#undef INST + + }; + + // Exceptions to the rule of thumb. + const std::set<std::string> comes_first{ + "VBIC, VMOV, VMVN, VORR (immediate)", + "VEXT", + "VTBL", + "VTBX", + "VDUP (scalar)", + }; + const std::set<std::string> comes_last{ + "VMLA (scalar)", + "VMLAL (scalar)", + "VQDMLAL/VQDMLSL (scalar)", + "VMUL (scalar)", + "VMULL (scalar)", + "VQDMULL (scalar)", + "VQDMULH (scalar)", + "VQRDMULH (scalar)", + }; + const auto sort_begin = std::stable_partition(table.begin(), table.end(), [&](const auto& matcher) { + return comes_first.count(matcher.GetName()) > 0; + }); + const auto sort_end = std::stable_partition(table.begin(), table.end(), [&](const auto& matcher) { + return comes_last.count(matcher.GetName()) == 0; + }); + + // If a matcher has more bits in its mask it is more specific, so it should come first. + std::stable_sort(sort_begin, sort_end, [](const auto& matcher1, const auto& matcher2) { + return mcl::bit::count_ones(matcher1.GetMask()) > mcl::bit::count_ones(matcher2.GetMask()); + }); + + return table; +} + +template<typename V> +std::optional<std::reference_wrapper<const ASIMDMatcher<V>>> DecodeASIMD(u32 instruction) { + static const auto table = GetASIMDDecodeTable<V>(); + + const auto matches_instruction = [instruction](const auto& matcher) { return matcher.Matches(instruction); }; + + auto iter = std::find_if(table.begin(), table.end(), matches_instruction); + return iter != table.end() ? 
std::optional<std::reference_wrapper<const ASIMDMatcher<V>>>(*iter) : std::nullopt; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/decoder/asimd.inc b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/asimd.inc new file mode 100644 index 0000000000..d0dfd86752 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/asimd.inc @@ -0,0 +1,172 @@ +// Three registers of the same length +INST(asimd_VHADD, "VHADD", "1111001U0Dzznnnndddd0000NQM0mmmm") // ASIMD +INST(asimd_VQADD, "VQADD", "1111001U0Dzznnnndddd0000NQM1mmmm") // ASIMD +INST(asimd_VRHADD, "VRHADD", "1111001U0Dzznnnndddd0001NQM0mmmm") // ASIMD +INST(asimd_VAND_reg, "VAND (register)", "111100100D00nnnndddd0001NQM1mmmm") // ASIMD +INST(asimd_VBIC_reg, "VBIC (register)", "111100100D01nnnndddd0001NQM1mmmm") // ASIMD +INST(asimd_VORR_reg, "VORR (register)", "111100100D10nnnndddd0001NQM1mmmm") // ASIMD +INST(asimd_VORN_reg, "VORN (register)", "111100100D11nnnndddd0001NQM1mmmm") // ASIMD +INST(asimd_VEOR_reg, "VEOR (register)", "111100110D00nnnndddd0001NQM1mmmm") // ASIMD +INST(asimd_VBSL, "VBSL", "111100110D01nnnndddd0001NQM1mmmm") // ASIMD +INST(asimd_VBIT, "VBIT", "111100110D10nnnndddd0001NQM1mmmm") // ASIMD +INST(asimd_VBIF, "VBIF", "111100110D11nnnndddd0001NQM1mmmm") // ASIMD +INST(asimd_VHSUB, "VHSUB", "1111001U0Dzznnnndddd0010NQM0mmmm") // ASIMD +INST(asimd_VQSUB, "VQSUB", "1111001U0Dzznnnndddd0010NQM1mmmm") // ASIMD +INST(asimd_VCGT_reg, "VCGT (register)", "1111001U0Dzznnnndddd0011NQM0mmmm") // ASIMD +INST(asimd_VCGE_reg, "VCGE (register)", "1111001U0Dzznnnndddd0011NQM1mmmm") // ASIMD +INST(asimd_VSHL_reg, "VSHL (register)", "1111001U0Dzznnnndddd0100NQM0mmmm") // ASIMD +INST(asimd_VQSHL_reg, "VQSHL (register)", "1111001U0Dzznnnndddd0100NQM1mmmm") // ASIMD +INST(asimd_VRSHL, "VRSHL", "1111001U0Dzznnnndddd0101NQM0mmmm") // ASIMD +//INST(asimd_VQRSHL, "VQRSHL", "1111001U0-CC--------0101---1----") // ASIMD +INST(asimd_VMAX, "VMAX/VMIN (integer)", "1111001U0Dzznnnnmmmm0110NQMommmm") // ASIMD +INST(asimd_VABD, "VABD", "1111001U0Dzznnnndddd0111NQM0mmmm") // ASIMD +INST(asimd_VABA, "VABA", "1111001U0Dzznnnndddd0111NQM1mmmm") // ASIMD +INST(asimd_VADD_int, "VADD (integer)", "111100100Dzznnnndddd1000NQM0mmmm") // ASIMD +INST(asimd_VSUB_int, "VSUB (integer)", "111100110Dzznnnndddd1000NQM0mmmm") // ASIMD +INST(asimd_VTST, "VTST", "111100100Dzznnnndddd1000NQM1mmmm") // ASIMD +INST(asimd_VCEQ_reg, "VCEG (register)", "111100110Dzznnnndddd1000NQM1mmmm") // ASIMD +INST(asimd_VMLA, "VMLA/VMLS", "1111001o0Dzznnnndddd1001NQM0mmmm") // ASIMD +INST(asimd_VMUL, "VMUL", "1111001P0Dzznnnndddd1001NQM1mmmm") // ASIMD +INST(asimd_VPMAX_int, "VPMAX/VPMIN (integer)", "1111001U0Dzznnnndddd1010NQMommmm") // ASIMD +INST(v8_VMAXNM, "VMAXNM", "111100110D0znnnndddd1111NQM1mmmm") // v8 +INST(v8_VMINNM, "VMINNM", "111100110D1znnnndddd1111NQM1mmmm") // v8 +INST(asimd_VQDMULH, "VQDMULH", "111100100Dzznnnndddd1011NQM0mmmm") // ASIMD +INST(asimd_VQRDMULH, "VQRDMULH", "111100110Dzznnnndddd1011NQM0mmmm") // ASIMD +INST(asimd_VPADD, "VPADD", "111100100Dzznnnndddd1011NQM1mmmm") // ASIMD +INST(asimd_VFMA, "VFMA", "111100100D0znnnndddd1100NQM1mmmm") // ASIMD +INST(asimd_VFMS, "VFMS", "111100100D1znnnndddd1100NQM1mmmm") // ASIMD +INST(asimd_VADD_float, "VADD (floating-point)", "111100100D0znnnndddd1101NQM0mmmm") // ASIMD +INST(asimd_VSUB_float, "VSUB (floating-point)", "111100100D1znnnndddd1101NQM0mmmm") // ASIMD +INST(asimd_VPADD_float, "VPADD (floating-point)", "111100110D0znnnndddd1101NQM0mmmm") // ASIMD 
+INST(asimd_VABD_float, "VABD (floating-point)", "111100110D1znnnndddd1101NQM0mmmm") // ASIMD +INST(asimd_VMLA_float, "VMLA (floating-point)", "111100100D0znnnndddd1101NQM1mmmm") // ASIMD +INST(asimd_VMLS_float, "VMLS (floating-point)", "111100100D1znnnndddd1101NQM1mmmm") // ASIMD +INST(asimd_VMUL_float, "VMUL (floating-point)", "111100110D0znnnndddd1101NQM1mmmm") // ASIMD +INST(asimd_VCEQ_reg_float, "VCEQ (register)", "111100100D0znnnndddd1110NQM0mmmm") // ASIMD +INST(asimd_VCGE_reg_float, "VCGE (register)", "111100110D0znnnndddd1110NQM0mmmm") // ASIMD +INST(asimd_VCGT_reg_float, "VCGT (register)", "111100110D1znnnndddd1110NQM0mmmm") // ASIMD +INST(asimd_VACGE, "VACGE", "111100110Doznnnndddd1110NQM1mmmm") // ASIMD +INST(asimd_VMAX_float, "VMAX (floating-point)", "111100100D0znnnndddd1111NQM0mmmm") // ASIMD +INST(asimd_VMIN_float, "VMIN (floating-point)", "111100100D1znnnndddd1111NQM0mmmm") // ASIMD +INST(asimd_VPMAX_float, "VPMAX (floating-point)", "111100110D0znnnndddd1111NQM0mmmm") // ASIMD +INST(asimd_VPMIN_float, "VPMIN (floating-point)", "111100110D1znnnndddd1111NQM0mmmm") // ASIMD +INST(asimd_VRECPS, "VRECPS", "111100100D0znnnndddd1111NQM1mmmm") // ASIMD +INST(asimd_VRSQRTS, "VRSQRTS", "111100100D1znnnndddd1111NQM1mmmm") // ASIMD +INST(v8_SHA256H, "SHA256H", "111100110D00nnnndddd1100NQM0mmmm") // v8 +INST(v8_SHA256H2, "SHA256H2", "111100110D01nnnndddd1100NQM0mmmm") // v8 +INST(v8_SHA256SU1, "SHA256SU1", "111100110D10nnnndddd1100NQM0mmmm") // v8 + +// Three registers of different lengths +INST(asimd_VADDL, "VADDL/VADDW", "1111001U1Dzznnnndddd000oN0M0mmmm") // ASIMD +INST(asimd_VSUBL, "VSUBL/VSUBW", "1111001U1Dzznnnndddd001oN0M0mmmm") // ASIMD +//INST(asimd_VADDHN, "VADDHN", "111100101-----------0100-0-0----") // ASIMD +//INST(asimd_VRADDHN, "VRADDHN", "111100111-----------0100-0-0----") // ASIMD +INST(asimd_VABAL, "VABAL", "1111001U1Dzznnnndddd0101N0M0mmmm") // ASIMD +//INST(asimd_VSUBHN, "VSUBHN", "111100101-----------0110-0-0----") // ASIMD +//INST(asimd_VRSUBHN, "VRSUBHN", "111100111-----------0110-0-0----") // ASIMD +INST(asimd_VABDL, "VABDL", "1111001U1Dzznnnndddd0111N0M0mmmm") // ASIMD +INST(asimd_VMLAL, "VMLAL/VMLSL", "1111001U1Dzznnnndddd10o0N0M0mmmm") // ASIMD +//INST(asimd_VQDMLAL, "VQDMLAL", "111100101-----------10-1-0-0----") // ASIMD +INST(asimd_VMULL, "VMULL", "1111001U1Dzznnnndddd11P0N0M0mmmm") // ASIMD +//INST(asimd_VQDMULL, "VQDMULL", "111100101-----------1101-0-0----") // ASIMD + +// Two registers and a scalar +INST(asimd_VMLA_scalar, "VMLA (scalar)", "1111001Q1Dzznnnndddd0o0FN1M0mmmm") // ASIMD +INST(asimd_VMLAL_scalar, "VMLAL (scalar)", "1111001U1dzznnnndddd0o10N1M0mmmm") // ASIMD +//INST(asimd_VQDMLAL_scalar, "VQDMLAL/VQDMLSL (scalar)", "111100101-BB--------0x11-1-0----") // ASIMD +INST(asimd_VMUL_scalar, "VMUL (scalar)", "1111001Q1Dzznnnndddd100FN1M0mmmm") // ASIMD +INST(asimd_VMULL_scalar, "VMULL (scalar)", "1111001U1Dzznnnndddd1010N1M0mmmm") // ASIMD +INST(asimd_VQDMULL_scalar, "VQDMULL (scalar)", "111100101Dzznnnndddd1011N1M0mmmm") // ASIMD +INST(asimd_VQDMULH_scalar, "VQDMULH (scalar)", "1111001Q1Dzznnnndddd1100N1M0mmmm") // ASIMD +INST(asimd_VQRDMULH_scalar, "VQRDMULH (scalar)", "1111001Q1Dzznnnndddd1101N1M0mmmm") // ASIMD + +// Two registers and a shift amount +INST(asimd_SHR, "SHR", "1111001U1Diiiiiidddd0000LQM1mmmm") // ASIMD +INST(asimd_SRA, "SRA", "1111001U1Diiiiiidddd0001LQM1mmmm") // ASIMD +INST(asimd_VRSHR, "VRSHR", "1111001U1Diiiiiidddd0010LQM1mmmm") // ASIMD +INST(asimd_VRSRA, "VRSRA", "1111001U1Diiiiiidddd0011LQM1mmmm") // ASIMD +INST(asimd_VSRI, 
"VSRI", "111100111Diiiiiidddd0100LQM1mmmm") // ASIMD +INST(asimd_VSHL, "VSHL", "111100101Diiiiiidddd0101LQM1mmmm") // ASIMD +INST(asimd_VSLI, "VSLI", "111100111Diiiiiidddd0101LQM1mmmm") // ASIMD +INST(asimd_VQSHL, "VQSHL" , "1111001U1Diiiiiidddd011oLQM1mmmm") // ASIMD +INST(asimd_VSHRN, "VSHRN", "111100101Diiiiiidddd100000M1mmmm") // ASIMD +INST(asimd_VRSHRN, "VRSHRN", "111100101Diiiiiidddd100001M1mmmm") // ASIMD +INST(asimd_VQSHRUN, "VQSHRUN", "111100111Diiiiiidddd100000M1mmmm") // ASIMD +INST(asimd_VQRSHRUN, "VQRSHRUN", "111100111Diiiiiidddd100001M1mmmm") // ASIMD +INST(asimd_VQSHRN, "VQSHRN", "1111001U1Diiiiiidddd100100M1mmmm") // ASIMD +INST(asimd_VQRSHRN, "VQRSHRN", "1111001U1Diiiiiidddd100101M1mmmm") // ASIMD +INST(asimd_VSHLL, "VSHLL", "1111001U1Diiiiiidddd101000M1mmmm") // ASIMD +INST(asimd_VCVT_fixed, "VCVT (fixed-point)", "1111001U1Diiiiiidddd111o0QM1mmmm") // ASIMD + +// Two registers, miscellaneous +INST(asimd_VREV, "VREV{16,32,64}", "111100111D11zz00dddd000ooQM0mmmm") // ASIMD +INST(asimd_VPADDL, "VPADDL", "111100111D11zz00dddd0010oQM0mmmm") // ASIMD +INST(asimd_VCLS, "VCLS", "111100111D11zz00dddd01000QM0mmmm") // ASIMD +INST(asimd_VCLZ, "VCLZ", "111100111D11zz00dddd01001QM0mmmm") // ASIMD +INST(asimd_VCNT, "VCNT", "111100111D11zz00dddd01010QM0mmmm") // ASIMD +INST(asimd_VMVN_reg, "VMVN_reg", "111100111D11zz00dddd01011QM0mmmm") // ASIMD +INST(asimd_VPADAL, "VPADAL", "111100111D11zz00dddd0110oQM0mmmm") // ASIMD +INST(asimd_VQABS, "VQABS", "111100111D11zz00dddd01110QM0mmmm") // ASIMD +INST(asimd_VQNEG, "VQNEG", "111100111D11zz00dddd01111QM0mmmm") // ASIMD +INST(asimd_VCGT_zero, "VCGT (zero)", "111100111D11zz01dddd0F000QM0mmmm") // ASIMD +INST(asimd_VCGE_zero, "VCGE (zero)", "111100111D11zz01dddd0F001QM0mmmm") // ASIMD +INST(asimd_VCEQ_zero, "VCEQ (zero)", "111100111D11zz01dddd0F010QM0mmmm") // ASIMD +INST(asimd_VCLE_zero, "VCLE (zero)", "111100111D11zz01dddd0F011QM0mmmm") // ASIMD +INST(asimd_VCLT_zero, "VCLT (zero)", "111100111D11zz01dddd0F100QM0mmmm") // ASIMD +INST(arm_UDF, "UNALLOCATED", "111100111-11--01----01101--0----") // v8 +INST(asimd_VABS, "VABS", "111100111D11zz01dddd0F110QM0mmmm") // ASIMD +INST(asimd_VNEG, "VNEG", "111100111D11zz01dddd0F111QM0mmmm") // ASIMD +INST(asimd_VSWP, "VSWP", "111100111D110010dddd00000QM0mmmm") // ASIMD +INST(arm_UDF, "UNALLOCATED", "111100111-11--10----00000--0----") // ASIMD +INST(asimd_VTRN, "VTRN", "111100111D11zz10dddd00001QM0mmmm") // ASIMD +INST(asimd_VUZP, "VUZP", "111100111D11zz10dddd00010QM0mmmm") // ASIMD +INST(asimd_VZIP, "VZIP", "111100111D11zz10dddd00011QM0mmmm") // ASIMD +INST(asimd_VMOVN, "VMOVN", "111100111D11zz10dddd001000M0mmmm") // ASIMD +INST(asimd_VQMOVUN, "VQMOVUN", "111100111D11zz10dddd001001M0mmmm") // ASIMD +INST(asimd_VQMOVN, "VQMOVN", "111100111D11zz10dddd00101oM0mmmm") // ASIMD +INST(asimd_VSHLL_max, "VSHLL_max", "111100111D11zz10dddd001100M0mmmm") // ASIMD +INST(v8_VRINTN, "VRINTN", "111100111D11zz10dddd01000QM0mmmm") // v8 +INST(v8_VRINTX, "VRINTX", "111100111D11zz10dddd01001QM0mmmm") // v8 +INST(v8_VRINTA, "VRINTA", "111100111D11zz10dddd01010QM0mmmm") // v8 +INST(v8_VRINTZ, "VRINTZ", "111100111D11zz10dddd01011QM0mmmm") // v8 +INST(v8_VRINTM, "VRINTM", "111100111D11zz10dddd01101QM0mmmm") // v8 +INST(v8_VRINTP, "VRINTP", "111100111D11zz10dddd01111QM0mmmm") // v8 +INST(asimd_VCVT_half, "VCVT (half-precision)", "111100111D11zz10dddd011o00M0mmmm") // ASIMD +INST(arm_UDF, "UNALLOCATED", "111100111-11--10----011-01-0----") // ASIMD +INST(v8_VCVTA, "VCVTA", "111100111D11zz11dddd0000oQM0mmmm") // v8 +INST(v8_VCVTN, 
"VCVTN", "111100111D11zz11dddd0001oQM0mmmm") // v8 +INST(v8_VCVTP, "VCVTP", "111100111D11zz11dddd0010oQM0mmmm") // v8 +INST(v8_VCVTM, "VCVTM", "111100111D11zz11dddd0011oQM0mmmm") // v8 +INST(asimd_VRECPE, "VRECPE", "111100111D11zz11dddd010F0QM0mmmm") // ASIMD +INST(asimd_VRSQRTE, "VRSQRTE", "111100111D11zz11dddd010F1QM0mmmm") // ASIMD +INST(asimd_VCVT_integer, "VCVT (integer)", "111100111D11zz11dddd011oUQM0mmmm") // ASIMD + +// Two registers, cryptography +INST(v8_AESE, "AESE", "111100111D11zz00dddd001100M0mmmm") // v8 +INST(v8_AESD, "AESD", "111100111D11zz00dddd001101M0mmmm") // v8 +INST(v8_AESMC, "AESMC", "111100111D11zz00dddd001110M0mmmm") // v8 +INST(v8_AESIMC, "AESIMC", "111100111D11zz00dddd001111M0mmmm") // v8 +INST(arm_UDF, "UNALLOCATED", "111100111-11--01----001010-0----") // v8 +INST(arm_UDF, "UNALLOCATED (SHA1H)", "111100111-11--01----001011-0----") // v8 +INST(arm_UDF, "UNALLOCATED (SHA1SU1)", "111100111-11--10----001110-0----") // v8 +INST(v8_SHA256SU0, "SHA256SU0", "111100111D11zz10dddd001111M0mmmm") // v8 + +// One register and modified immediate +INST(asimd_VMOV_imm, "VBIC, VMOV, VMVN, VORR (immediate)", "1111001a1D000bcdVVVVmmmm0Qo1efgh") // ASIMD + +// Miscellaneous +INST(asimd_VEXT, "VEXT", "111100101D11nnnnddddiiiiNQM0mmmm") // ASIMD +INST(asimd_VTBL, "VTBL", "111100111D11nnnndddd10zzN0M0mmmm") // ASIMD +INST(asimd_VTBX, "VTBX", "111100111D11nnnndddd10zzN1M0mmmm") // ASIMD +INST(asimd_VDUP_scalar, "VDUP (scalar)", "111100111D11iiiidddd11000QM0mmmm") // ASIMD +INST(arm_UDF, "UNALLOCATED", "111100111-11--------11-----0----") // ASIMD + +// Advanced SIMD load/store structures +INST(v8_VST_multiple, "VST{1-4} (multiple)", "111101000D00nnnnddddxxxxzzaammmm") // v8 +INST(v8_VLD_multiple, "VLD{1-4} (multiple)", "111101000D10nnnnddddxxxxzzaammmm") // v8 +INST(arm_UDF, "UNALLOCATED", "111101000--0--------1011--------") // v8 +INST(arm_UDF, "UNALLOCATED", "111101000--0--------11----------") // v8 +INST(arm_UDF, "UNALLOCATED", "111101001-00--------11----------") // v8 +INST(v8_VLD_all_lanes, "VLD{1-4} (all lanes)", "111101001D10nnnndddd11nnzzTammmm") // v8 +INST(v8_VST_single, "VST{1-4} (single)", "111101001D00nnnnddddzzNNaaaammmm") // v8 +INST(v8_VLD_single, "VLD{1-4} (single)", "111101001D10nnnnddddzzNNaaaammmm") // v8 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb16.h b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb16.h new file mode 100644 index 0000000000..6a4275f726 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb16.h @@ -0,0 +1,39 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <algorithm> +#include <functional> +#include <optional> +#include <vector> + +#include <mcl/stdint.hpp> + +#include "dynarmic/frontend/decoder/decoder_detail.h" +#include "dynarmic/frontend/decoder/matcher.h" + +namespace Dynarmic::A32 { + +template<typename Visitor> +using Thumb16Matcher = Decoder::Matcher<Visitor, u16>; + +template<typename V> +std::optional<std::reference_wrapper<const Thumb16Matcher<V>>> DecodeThumb16(u16 instruction) { + static const std::vector<Thumb16Matcher<V>> table = { + +#define INST(fn, name, bitstring) DYNARMIC_DECODER_GET_MATCHER(Thumb16Matcher, fn, name, Decoder::detail::StringToArray<16>(bitstring)), +#include "./thumb16.inc" +#undef INST + + }; + + const auto matches_instruction = [instruction](const auto& matcher) { return matcher.Matches(instruction); }; + + auto iter = std::find_if(table.begin(), table.end(), matches_instruction); + return iter != table.end() ? std::optional<std::reference_wrapper<const Thumb16Matcher<V>>>(*iter) : std::nullopt; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb16.inc b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb16.inc new file mode 100644 index 0000000000..f01c98aa05 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb16.inc @@ -0,0 +1,98 @@ +// Shift (immediate) add, subtract, move and compare instructions +INST(thumb16_LSL_imm, "LSL (imm)", "00000vvvvvmmmddd") +INST(thumb16_LSR_imm, "LSR (imm)", "00001vvvvvmmmddd") +INST(thumb16_ASR_imm, "ASR (imm)", "00010vvvvvmmmddd") +INST(thumb16_ADD_reg_t1, "ADD (reg, T1)", "0001100mmmnnnddd") +INST(thumb16_SUB_reg, "SUB (reg)", "0001101mmmnnnddd") +INST(thumb16_ADD_imm_t1, "ADD (imm, T1)", "0001110vvvnnnddd") +INST(thumb16_SUB_imm_t1, "SUB (imm, T1)", "0001111vvvnnnddd") +INST(thumb16_MOV_imm, "MOV (imm)", "00100dddvvvvvvvv") +INST(thumb16_CMP_imm, "CMP (imm)", "00101nnnvvvvvvvv") +INST(thumb16_ADD_imm_t2, "ADD (imm, T2)", "00110dddvvvvvvvv") +INST(thumb16_SUB_imm_t2, "SUB (imm, T2)", "00111dddvvvvvvvv") + + // Data-processing instructions +INST(thumb16_AND_reg, "AND (reg)", "0100000000mmmddd") +INST(thumb16_EOR_reg, "EOR (reg)", "0100000001mmmddd") +INST(thumb16_LSL_reg, "LSL (reg)", "0100000010mmmddd") +INST(thumb16_LSR_reg, "LSR (reg)", "0100000011mmmddd") +INST(thumb16_ASR_reg, "ASR (reg)", "0100000100mmmddd") +INST(thumb16_ADC_reg, "ADC (reg)", "0100000101mmmddd") +INST(thumb16_SBC_reg, "SBC (reg)", "0100000110mmmddd") +INST(thumb16_ROR_reg, "ROR (reg)", "0100000111sssddd") +INST(thumb16_TST_reg, "TST (reg)", "0100001000mmmnnn") +INST(thumb16_RSB_imm, "RSB (imm)", "0100001001nnnddd") +INST(thumb16_CMP_reg_t1, "CMP (reg, T1)", "0100001010mmmnnn") +INST(thumb16_CMN_reg, "CMN (reg)", "0100001011mmmnnn") +INST(thumb16_ORR_reg, "ORR (reg)", "0100001100mmmddd") +INST(thumb16_MUL_reg, "MUL (reg)", "0100001101nnnddd") +INST(thumb16_BIC_reg, "BIC (reg)", "0100001110mmmddd") +INST(thumb16_MVN_reg, "MVN (reg)", "0100001111mmmddd") + +// Special data instructions +INST(thumb16_ADD_reg_t2, "ADD (reg, T2)", "01000100Dmmmmddd") // v4T, Low regs: v6T2 +INST(thumb16_CMP_reg_t2, "CMP (reg, T2)", "01000101Nmmmmnnn") // v4T +INST(thumb16_MOV_reg, "MOV (reg)", "01000110Dmmmmddd") // v4T, Low regs: v6 + +// Store/Load single data item instructions +INST(thumb16_LDR_literal, "LDR (literal)", "01001tttvvvvvvvv") +INST(thumb16_STR_reg, "STR (reg)", "0101000mmmnnnttt") +INST(thumb16_STRH_reg, "STRH 
(reg)", "0101001mmmnnnttt") +INST(thumb16_STRB_reg, "STRB (reg)", "0101010mmmnnnttt") +INST(thumb16_LDRSB_reg, "LDRSB (reg)", "0101011mmmnnnttt") +INST(thumb16_LDR_reg, "LDR (reg)", "0101100mmmnnnttt") +INST(thumb16_LDRH_reg, "LDRH (reg)", "0101101mmmnnnttt") +INST(thumb16_LDRB_reg, "LDRB (reg)", "0101110mmmnnnttt") +INST(thumb16_LDRSH_reg, "LDRSH (reg)", "0101111mmmnnnttt") +INST(thumb16_STR_imm_t1, "STR (imm, T1)", "01100vvvvvnnnttt") +INST(thumb16_LDR_imm_t1, "LDR (imm, T1)", "01101vvvvvnnnttt") +INST(thumb16_STRB_imm, "STRB (imm)", "01110vvvvvnnnttt") +INST(thumb16_LDRB_imm, "LDRB (imm)", "01111vvvvvnnnttt") +INST(thumb16_STRH_imm, "STRH (imm)", "10000vvvvvnnnttt") +INST(thumb16_LDRH_imm, "LDRH (imm)", "10001vvvvvnnnttt") +INST(thumb16_STR_imm_t2, "STR (imm, T2)", "10010tttvvvvvvvv") +INST(thumb16_LDR_imm_t2, "LDR (imm, T2)", "10011tttvvvvvvvv") + +// Generate relative address instructions +INST(thumb16_ADR, "ADR", "10100dddvvvvvvvv") +INST(thumb16_ADD_sp_t1, "ADD (SP plus imm, T1)", "10101dddvvvvvvvv") +INST(thumb16_ADD_sp_t2, "ADD (SP plus imm, T2)", "101100000vvvvvvv") // v4T +INST(thumb16_SUB_sp, "SUB (SP minus imm)", "101100001vvvvvvv") // v4T + +// Hint instructions +INST(thumb16_SEV, "SEV", "1011111101000000") // v7 +INST(thumb16_SEVL, "SEVL", "1011111101010000") // v8 +INST(thumb16_WFE, "WFE", "1011111100100000") // v7 +INST(thumb16_WFI, "WFI", "1011111100110000") // v7 +INST(thumb16_YIELD, "YIELD", "1011111100010000") // v7 +INST(thumb16_NOP, "NOP", "10111111----0000") // v7 + +// IT instruction +INST(thumb16_IT, "IT", "10111111iiiiiiii") // v7 + +// Miscellaneous 16-bit instructions +INST(thumb16_SXTH, "SXTH", "1011001000mmmddd") // v6 +INST(thumb16_SXTB, "SXTB", "1011001001mmmddd") // v6 +INST(thumb16_UXTH, "UXTH", "1011001010mmmddd") // v6 +INST(thumb16_UXTB, "UXTB", "1011001011mmmddd") // v6 +INST(thumb16_PUSH, "PUSH", "1011010Mxxxxxxxx") // v4T +INST(thumb16_POP, "POP", "1011110Pxxxxxxxx") // v4T +INST(thumb16_SETEND, "SETEND", "101101100101x000") // v6 +INST(thumb16_CPS, "CPS", "10110110011m0aif") // v6 +INST(thumb16_REV, "REV", "1011101000mmmddd") // v6 +INST(thumb16_REV16, "REV16", "1011101001mmmddd") // v6 +INST(thumb16_REVSH, "REVSH", "1011101011mmmddd") // v6 +INST(thumb16_BKPT, "BKPT", "10111110xxxxxxxx") // v5 + +// Store/Load multiple registers +INST(thumb16_STMIA, "STMIA", "11000nnnxxxxxxxx") +INST(thumb16_LDMIA, "LDMIA", "11001nnnxxxxxxxx") + +// Branch instructions +INST(thumb16_BX, "BX", "010001110mmmm000") // v4T +INST(thumb16_BLX_reg, "BLX (reg)", "010001111mmmm000") // v5T +INST(thumb16_CBZ_CBNZ, "CBZ/CBNZ", "1011o0i1iiiiinnn") // v6T2 +INST(thumb16_UDF, "UDF", "11011110--------") +INST(thumb16_SVC, "SVC", "11011111xxxxxxxx") +INST(thumb16_B_t1, "B (T1)", "1101ccccvvvvvvvv") +INST(thumb16_B_t2, "B (T2)", "11100vvvvvvvvvvv") diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb32.h b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb32.h new file mode 100644 index 0000000000..f3f4b3b9ee --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb32.h @@ -0,0 +1,38 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <algorithm> +#include <optional> +#include <vector> + +#include <mcl/stdint.hpp> + +#include "dynarmic/frontend/decoder/decoder_detail.h" +#include "dynarmic/frontend/decoder/matcher.h" + +namespace Dynarmic::A32 { + +template<typename Visitor> +using Thumb32Matcher = Decoder::Matcher<Visitor, u32>; + +template<typename V> +std::optional<std::reference_wrapper<const Thumb32Matcher<V>>> DecodeThumb32(u32 instruction) { + static const std::vector<Thumb32Matcher<V>> table = { + +#define INST(fn, name, bitstring) DYNARMIC_DECODER_GET_MATCHER(Thumb32Matcher, fn, name, Decoder::detail::StringToArray<32>(bitstring)), +#include "./thumb32.inc" +#undef INST + + }; + + const auto matches_instruction = [instruction](const auto& matcher) { return matcher.Matches(instruction); }; + + auto iter = std::find_if(table.begin(), table.end(), matches_instruction); + return iter != table.end() ? std::optional<std::reference_wrapper<const Thumb32Matcher<V>>>(*iter) : std::nullopt; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb32.inc b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb32.inc new file mode 100644 index 0000000000..ecc7a65538 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb32.inc @@ -0,0 +1,292 @@ +// Load/Store Multiple +//INST(thumb32_SRS_1, "SRS", "1110100000-0--------------------") +//INST(thumb32_RFE_2, "RFE", "1110100000-1--------------------") +INST(thumb32_STMIA, "STMIA/STMEA", "1110100010W0nnnn0iiiiiiiiiiiiiii") +INST(thumb32_POP, "POP", "1110100010111101iiiiiiiiiiiiiiii") +INST(thumb32_LDMIA, "LDMIA/LDMFD", "1110100010W1nnnniiiiiiiiiiiiiiii") +INST(thumb32_PUSH, "PUSH", "11101001001011010iiiiiiiiiiiiiii") +INST(thumb32_STMDB, "STMDB/STMFD", "1110100100W0nnnn0iiiiiiiiiiiiiii") +INST(thumb32_LDMDB, "LDMDB/LDMEA", "1110100100W1nnnniiiiiiiiiiiiiiii") +//INST(thumb32_SRS_1, "SRS", "1110100110-0--------------------") +//INST(thumb32_RFE_2, "RFE", "1110100110-1--------------------") + +// Load/Store Dual, Load/Store Exclusive, Table Branch +INST(thumb32_STREX, "STREX", "111010000100nnnnttttddddiiiiiiii") +INST(thumb32_LDREX, "LDREX", "111010000101nnnntttt1111iiiiiiii") +INST(thumb32_STRD_imm_1, "STRD (imm)", "11101000U110nnnnttttssssiiiiiiii") +INST(thumb32_STRD_imm_2, "STRD (imm)", "11101001U1W0nnnnttttssssiiiiiiii") +INST(thumb32_LDRD_lit_1, "LDRD (lit)", "11101000U1111111ttttssssiiiiiiii") +INST(thumb32_LDRD_lit_2, "LDRD (lit)", "11101001U1W11111ttttssssiiiiiiii") +INST(thumb32_LDRD_imm_1, "LDRD (imm)", "11101000U111nnnnttttssssiiiiiiii") +INST(thumb32_LDRD_imm_2, "LDRD (imm)", "11101001U1W1nnnnttttssssiiiiiiii") +INST(thumb32_STL, "STL", "111010001100nnnntttt111110101111") // v8 +INST(thumb32_LDA, "LDA", "111010001101nnnntttt111110101111") // v8 +INST(thumb32_STREXB, "STREXB", "111010001100nnnntttt11110100dddd") +INST(thumb32_STREXH, "STREXH", "111010001100nnnntttt11110101dddd") +INST(thumb32_STREXD, "STREXD", "111010001100nnnnttttuuuu0111dddd") +INST(thumb32_TBB, "TBB", "111010001101nnnn111100000000mmmm") +INST(thumb32_TBH, "TBH", "111010001101nnnn111100000001mmmm") +INST(thumb32_LDREXB, "LDREXB", "111010001101nnnntttt111101001111") +INST(thumb32_LDREXH, "LDREXH", "111010001101nnnntttt111101011111") +INST(thumb32_LDREXD, "LDREXD", "111010001101nnnnttttuuuu01111111") + +// Data Processing (Shifted Register) +INST(thumb32_TST_reg, "TST (reg)", "111010100001nnnn0vvv1111vvttmmmm") 
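Throughout these .inc tables, '0' and '1' are fixed opcode bits, '-' marks a bit the matcher ignores, and each run of a letter names an operand field (nnnn, dddd, tttt, and so on) that the generated matcher extracts and hands to the corresponding visitor handler. As a rough standalone sketch of the matching half of that scheme (illustrative only; ToMaskExpect and PatternMatches are invented names, not part of dynarmic's decoder_detail.h), a 32-character pattern can be collapsed into a mask/expected-value pair and tested against an instruction word:

    #include <cstdint>
    #include <string_view>

    struct MaskExpect {
        std::uint32_t mask = 0;    // 1 wherever the pattern pins a bit
        std::uint32_t expect = 0;  // required value of the pinned bits
    };

    // Assumes a 32-character pattern in the convention used by these tables.
    constexpr MaskExpect ToMaskExpect(std::string_view pattern) {
        MaskExpect me{};
        for (const char c : pattern) {
            me.mask <<= 1;
            me.expect <<= 1;
            if (c == '0' || c == '1') {  // fixed opcode bit
                me.mask |= 1;
                me.expect |= (c == '1') ? 1 : 0;
            }
            // letters (operand fields) and '-' leave both bits clear
        }
        return me;
    }

    constexpr bool PatternMatches(std::string_view pattern, std::uint32_t inst) {
        const MaskExpect me = ToMaskExpect(pattern);
        return (inst & me.mask) == me.expect;
    }

    // Sanity check against the TST (reg) row above: 0xEA100F00 has the right fixed bits.
    static_assert(PatternMatches("111010100001nnnn0vvv1111vvttmmmm", 0xEA100F00));

This only reproduces the mask/expect test; the real Decoder::Matcher additionally records how to slice out each lettered field and which visitor handler to invoke.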
+INST(thumb32_AND_reg, "AND (reg)", "11101010000Snnnn0vvvddddvvttmmmm") +INST(thumb32_BIC_reg, "BIC (reg)", "11101010001Snnnn0vvvddddvvttmmmm") +INST(thumb32_MOV_reg, "MOV (reg)", "11101010010S11110vvvddddvvttmmmm") +INST(thumb32_ORR_reg, "ORR (reg)", "11101010010Snnnn0vvvddddvvttmmmm") +INST(thumb32_MVN_reg, "MVN (reg)", "11101010011S11110vvvddddvvttmmmm") +INST(thumb32_ORN_reg, "ORN (reg)", "11101010011Snnnn0vvvddddvvttmmmm") +INST(thumb32_TEQ_reg, "TEQ (reg)", "111010101001nnnn0vvv1111vvttmmmm") +INST(thumb32_EOR_reg, "EOR (reg)", "11101010100Snnnn0vvvddddvvttmmmm") +INST(thumb32_PKH, "PKH", "111010101100nnnn0vvvddddvvt0mmmm") +INST(thumb32_CMN_reg, "CMN (reg)", "111010110001nnnn0vvv1111vvttmmmm") +INST(thumb32_ADD_reg, "ADD (reg)", "11101011000Snnnn0vvvddddvvttmmmm") +INST(thumb32_ADC_reg, "ADC (reg)", "11101011010Snnnn0vvvddddvvttmmmm") +INST(thumb32_SBC_reg, "SBC (reg)", "11101011011Snnnn0vvvddddvvttmmmm") +INST(thumb32_CMP_reg, "CMP (reg)", "111010111011nnnn0vvv1111vvttmmmm") +INST(thumb32_SUB_reg, "SUB (reg)", "11101011101Snnnn0vvvddddvvttmmmm") +INST(thumb32_RSB_reg, "RSB (reg)", "11101011110Snnnn0vvvddddvvttmmmm") + +// Data Processing (Modified Immediate) +INST(thumb32_TST_imm, "TST (imm)", "11110v000001nnnn0vvv1111vvvvvvvv") +INST(thumb32_AND_imm, "AND (imm)", "11110v00000Snnnn0vvvddddvvvvvvvv") +INST(thumb32_BIC_imm, "BIC (imm)", "11110v00001Snnnn0vvvddddvvvvvvvv") +INST(thumb32_MOV_imm, "MOV (imm)", "11110v00010S11110vvvddddvvvvvvvv") +INST(thumb32_ORR_imm, "ORR (imm)", "11110v00010Snnnn0vvvddddvvvvvvvv") +INST(thumb32_MVN_imm, "MVN (imm)", "11110v00011S11110vvvddddvvvvvvvv") +INST(thumb32_ORN_imm, "ORN (imm)", "11110v00011Snnnn0vvvddddvvvvvvvv") +INST(thumb32_TEQ_imm, "TEQ (imm)", "11110v001001nnnn0vvv1111vvvvvvvv") +INST(thumb32_EOR_imm, "EOR (imm)", "11110v00100Snnnn0vvvddddvvvvvvvv") +INST(thumb32_CMN_imm, "CMN (imm)", "11110v010001nnnn0vvv1111vvvvvvvv") +INST(thumb32_ADD_imm_1, "ADD (imm)", "11110v01000Snnnn0vvvddddvvvvvvvv") +INST(thumb32_ADC_imm, "ADC (imm)", "11110v01010Snnnn0vvvddddvvvvvvvv") +INST(thumb32_SBC_imm, "SBC (imm)", "11110v01011Snnnn0vvvddddvvvvvvvv") +INST(thumb32_CMP_imm, "CMP (imm)", "11110v011011nnnn0vvv1111vvvvvvvv") +INST(thumb32_SUB_imm_1, "SUB (imm)", "11110v01101Snnnn0vvvddddvvvvvvvv") +INST(thumb32_RSB_imm, "RSB (imm)", "11110v01110Snnnn0vvvddddvvvvvvvv") + +// Data Processing (Plain Binary Immediate) +INST(thumb32_ADR_t3, "ADR", "11110i10000011110iiiddddiiiiiiii") +INST(thumb32_ADD_imm_2, "ADD (imm)", "11110i100000nnnn0iiiddddiiiiiiii") +INST(thumb32_MOVW_imm, "MOVW (imm)", "11110i100100iiii0iiiddddiiiiiiii") +INST(thumb32_ADR_t2, "ADR", "11110i10101011110iiiddddiiiiiiii") +INST(thumb32_SUB_imm_2, "SUB (imm)", "11110i101010nnnn0iiiddddiiiiiiii") +INST(thumb32_MOVT, "MOVT", "11110i101100iiii0iiiddddiiiiiiii") +INST(thumb32_UDF, "Invalid decoding", "11110011-010----0000----0001----") +INST(thumb32_SSAT16, "SSAT16", "111100110010nnnn0000dddd0000iiii") +INST(thumb32_USAT16, "USAT16", "111100111010nnnn0000dddd0000iiii") +INST(thumb32_SSAT, "SSAT", "1111001100s0nnnn0iiiddddii0bbbbb") +INST(thumb32_USAT, "USAT", "1111001110s0nnnn0iiiddddii0bbbbb") +INST(thumb32_SBFX, "SBFX", "111100110100nnnn0iiiddddii0wwwww") +INST(thumb32_BFC, "BFC", "11110011011011110iiiddddii0bbbbb") +INST(thumb32_BFI, "BFI", "111100110110nnnn0iiiddddii0bbbbb") +INST(thumb32_UBFX, "UBFX", "111100111100nnnn0iiiddddii0wwwww") + +// Branches and Miscellaneous Control +//INST(thumb32_MSR_banked, "MSR (banked)", "11110011100-----10-0------1-----") +INST(thumb32_MSR_reg, "MSR (reg)", 
"11110011100Rnnnn1000mmmm00000000") + +INST(thumb32_NOP, "NOP", "11110011101011111000000000000000") +INST(thumb32_YIELD, "YIELD", "11110011101011111000000000000001") +INST(thumb32_WFE, "WFE", "11110011101011111000000000000010") +INST(thumb32_WFI, "WFI", "11110011101011111000000000000011") +INST(thumb32_SEV, "SEV", "11110011101011111000000000000100") +INST(thumb32_SEVL, "SEVL", "11110011101011111000000000000101") +//INST(thumb32_DBG, "DBG", "111100111010----10-0-0001111----") +//INST(thumb32_CPS, "CPS", "111100111010----10-0------------") + +//INST(thumb32_ENTERX, "ENTERX", "111100111011----10-0----0001----") +//INST(thumb32_LEAVEX, "LEAVEX", "111100111011----10-0----0000----") +INST(thumb32_CLREX, "CLREX", "11110011101111111000111100101111") +INST(thumb32_DSB, "DSB", "1111001110111111100011110100oooo") +INST(thumb32_DMB, "DMB", "1111001110111111100011110101oooo") +INST(thumb32_ISB, "ISB", "1111001110111111100011110110oooo") + +INST(thumb32_BXJ, "BXJ", "111100111100mmmm1000111100000000") +//INST(thumb32_ERET, "ERET", "11110011110111101000111100000000") +//INST(thumb32_SUBS_pc_lr, "SUBS PC, LR", "111100111101111010001111--------") + +//INST(thumb32_MRS_banked, "MRS (banked)", "11110011111-----10-0------1-----") +INST(thumb32_MRS_reg, "MRS (reg)", "11110011111R11111000dddd00000000") +//INST(thumb32_HVC, "HVC", "111101111110----1000------------") +//INST(thumb32_SMC, "SMC", "111101111111----1000000000000000") +INST(thumb32_UDF, "UDF", "111101111111----1010------------") // v6T2 + +// Branch instructions +INST(thumb32_BL_imm, "BL (imm)", "11110Svvvvvvvvvv11j1jvvvvvvvvvvv") // v4T +INST(thumb32_BLX_imm, "BLX (imm)", "11110Svvvvvvvvvv11j0jvvvvvvvvvvv") // v5T +INST(thumb32_B, "B", "11110Svvvvvvvvvv10j1jvvvvvvvvvvv") +INST(thumb32_UDF, "Invalid decoding", "11110-111-------10-0------------") +INST(thumb32_B_cond, "B (cond)", "11110Sccccvvvvvv10i0ivvvvvvvvvvv") + +// Store Single Data Item +INST(thumb32_STRB_imm_1, "STRB (imm)", "111110000000nnnntttt1PU1iiiiiiii") +INST(thumb32_STRB_imm_2, "STRB (imm)", "111110000000nnnntttt1100iiiiiiii") +INST(thumb32_STRB_imm_3, "STRB (imm)", "111110001000nnnnttttiiiiiiiiiiii") +INST(thumb32_STRBT, "STRBT", "111110000000nnnntttt1110iiiiiiii") +INST(thumb32_STRB, "STRB (reg)", "111110000000nnnntttt000000iimmmm") +INST(thumb32_STRH_imm_1, "STRH (imm)", "111110000010nnnntttt1PU1iiiiiiii") +INST(thumb32_STRH_imm_2, "STRH (imm)", "111110000010nnnntttt1100iiiiiiii") +INST(thumb32_STRH_imm_3, "STRH (imm)", "111110001010nnnnttttiiiiiiiiiiii") +INST(thumb32_STRHT, "STRHT", "111110000010nnnntttt1110iiiiiiii") +INST(thumb32_STRH, "STRH (reg)", "111110000010nnnntttt000000iimmmm") +INST(thumb32_STR_imm_1, "STR (imm)", "111110000100nnnntttt1PU1iiiiiiii") +INST(thumb32_STR_imm_2, "STR (imm)", "111110000100nnnntttt1100iiiiiiii") +INST(thumb32_STR_imm_3, "STR (imm)", "111110001100nnnnttttiiiiiiiiiiii") +INST(thumb32_STRT, "STRT", "111110000100nnnntttt1110iiiiiiii") +INST(thumb32_STR_reg, "STR (reg)", "111110000100nnnntttt000000iimmmm") + +// Load Byte and Memory Hints +INST(thumb32_PLD_lit, "PLD (lit)", "11111000U00111111111iiiiiiiiiiii") +INST(thumb32_PLD_lit, "PLD (lit)", "11111000U01111111111iiiiiiiiiiii") +INST(thumb32_PLD_reg, "PLD (reg)", "1111100000W1nnnn1111000000iimmmm") +INST(thumb32_PLD_imm8, "PLD (imm8)", "1111100000W1nnnn11111100iiiiiiii") +INST(thumb32_PLD_imm12, "PLD (imm12)", "1111100010W1nnnn1111iiiiiiiiiiii") +INST(thumb32_PLI_lit, "PLI (lit)", "11111001U00111111111iiiiiiiiiiii") +INST(thumb32_PLI_reg, "PLI (reg)", "111110010001nnnn1111000000iimmmm") 
+INST(thumb32_PLI_imm8, "PLI (imm8)", "111110010001nnnn11111100iiiiiiii") +INST(thumb32_PLI_imm12, "PLI (imm12)", "111110011001nnnn1111iiiiiiiiiiii") +INST(thumb32_LDRB_lit, "LDRB (lit)", "11111000U0011111ttttiiiiiiiiiiii") +INST(thumb32_LDRB_reg, "LDRB (reg)", "111110000001nnnntttt000000iimmmm") +INST(thumb32_LDRBT, "LDRBT", "111110000001nnnntttt1110iiiiiiii") +INST(thumb32_LDRB_imm8, "LDRB (imm8)", "111110000001nnnntttt1PUWiiiiiiii") +INST(thumb32_LDRB_imm12, "LDRB (imm12)", "111110001001nnnnttttiiiiiiiiiiii") +INST(thumb32_LDRSB_lit, "LDRSB (lit)", "11111001U0011111ttttiiiiiiiiiiii") +INST(thumb32_LDRSB_reg, "LDRSB (reg)", "111110010001nnnntttt000000iimmmm") +INST(thumb32_LDRSBT, "LDRSBT", "111110010001nnnntttt1110iiiiiiii") +INST(thumb32_LDRSB_imm8, "LDRSB (imm8)", "111110010001nnnntttt1PUWiiiiiiii") +INST(thumb32_LDRSB_imm12, "LDRSB (imm12)", "111110011001nnnnttttiiiiiiiiiiii") + +// Load Halfword and Memory Hints +INST(thumb32_LDRH_lit, "LDRH (lit)", "11111000U0111111ttttiiiiiiiiiiii") +INST(thumb32_LDRH_reg, "LDRH (reg)", "111110000011nnnntttt000000iimmmm") +INST(thumb32_LDRHT, "LDRHT", "111110000011nnnntttt1110iiiiiiii") +INST(thumb32_LDRH_imm8, "LDRH (imm8)", "111110000011nnnntttt1PUWiiiiiiii") +INST(thumb32_LDRH_imm12, "LDRH (imm12)", "111110001011nnnnttttiiiiiiiiiiii") +INST(thumb32_NOP, "NOP", "11111001-01111111111------------") +INST(thumb32_LDRSH_lit, "LDRSH (lit)", "11111001U0111111ttttiiiiiiiiiiii") +INST(thumb32_NOP, "NOP", "111110010011----1111000000------") +INST(thumb32_LDRSH_reg, "LDRSH (reg)", "111110010011nnnntttt000000iimmmm") +INST(thumb32_LDRSHT, "LDRSHT", "111110010011nnnntttt1110iiiiiiii") +INST(thumb32_NOP, "NOP", "111110010011----11111100--------") +INST(thumb32_NOP, "NOP", "111110011011----1111------------") +INST(thumb32_LDRSH_imm8, "LDRSH (imm8)", "111110010011nnnntttt1PUWiiiiiiii") +INST(thumb32_LDRSH_imm12, "LDRSH (imm12)", "111110011011nnnnttttiiiiiiiiiiii") + +// Load Word +INST(thumb32_LDR_lit, "LDR (lit)", "11111000U1011111ttttiiiiiiiiiiii") +INST(thumb32_LDRT, "LDRT", "111110000101nnnntttt1110iiiiiiii") +INST(thumb32_LDR_reg, "LDR (reg)", "111110000101nnnntttt000000iimmmm") +INST(thumb32_LDR_imm8, "LDR (imm8)", "111110000101nnnntttt1PUWiiiiiiii") +INST(thumb32_LDR_imm12, "LDR (imm12)", "111110001101nnnnttttiiiiiiiiiiii") + +// Data Processing (register) +INST(thumb32_LSL_reg, "LSL (reg)", "11111010000Smmmm1111dddd0000ssss") +INST(thumb32_LSR_reg, "LSR (reg)", "11111010001Smmmm1111dddd0000ssss") +INST(thumb32_ASR_reg, "ASR (reg)", "11111010010Smmmm1111dddd0000ssss") +INST(thumb32_ROR_reg, "ROR (reg)", "11111010011Smmmm1111dddd0000ssss") +INST(thumb32_SXTH, "SXTH", "11111010000011111111dddd10rrmmmm") +INST(thumb32_SXTAH, "SXTAH", "111110100000nnnn1111dddd10rrmmmm") +INST(thumb32_UXTH, "UXTH", "11111010000111111111dddd10rrmmmm") +INST(thumb32_UXTAH, "UXTAH", "111110100001nnnn1111dddd10rrmmmm") +INST(thumb32_SXTB16, "SXTB16", "11111010001011111111dddd10rrmmmm") +INST(thumb32_SXTAB16, "SXTAB16", "111110100010nnnn1111dddd10rrmmmm") +INST(thumb32_UXTB16, "UXTB16", "11111010001111111111dddd10rrmmmm") +INST(thumb32_UXTAB16, "UXTAB16", "111110100011nnnn1111dddd10rrmmmm") +INST(thumb32_SXTB, "SXTB", "11111010010011111111dddd10rrmmmm") +INST(thumb32_SXTAB, "SXTAB", "111110100100nnnn1111dddd10rrmmmm") +INST(thumb32_UXTB, "UXTB", "11111010010111111111dddd10rrmmmm") +INST(thumb32_UXTAB, "UXTAB", "111110100101nnnn1111dddd10rrmmmm") + +// Parallel Addition and Subtraction (signed) +INST(thumb32_SADD16, "SADD16", "111110101001nnnn1111dddd0000mmmm") +INST(thumb32_SASX, 
"SASX", "111110101010nnnn1111dddd0000mmmm") +INST(thumb32_SSAX, "SSAX", "111110101110nnnn1111dddd0000mmmm") +INST(thumb32_SSUB16, "SSUB16", "111110101101nnnn1111dddd0000mmmm") +INST(thumb32_SADD8, "SADD8", "111110101000nnnn1111dddd0000mmmm") +INST(thumb32_SSUB8, "SSUB8", "111110101100nnnn1111dddd0000mmmm") +INST(thumb32_QADD16, "QADD16", "111110101001nnnn1111dddd0001mmmm") +INST(thumb32_QASX, "QASX", "111110101010nnnn1111dddd0001mmmm") +INST(thumb32_QSAX, "QSAX", "111110101110nnnn1111dddd0001mmmm") +INST(thumb32_QSUB16, "QSUB16", "111110101101nnnn1111dddd0001mmmm") +INST(thumb32_QADD8, "QADD8", "111110101000nnnn1111dddd0001mmmm") +INST(thumb32_QSUB8, "QSUB8", "111110101100nnnn1111dddd0001mmmm") +INST(thumb32_SHADD16, "SHADD16", "111110101001nnnn1111dddd0010mmmm") +INST(thumb32_SHASX, "SHASX", "111110101010nnnn1111dddd0010mmmm") +INST(thumb32_SHSAX, "SHSAX", "111110101110nnnn1111dddd0010mmmm") +INST(thumb32_SHSUB16, "SHSUB16", "111110101101nnnn1111dddd0010mmmm") +INST(thumb32_SHADD8, "SHADD8", "111110101000nnnn1111dddd0010mmmm") +INST(thumb32_SHSUB8, "SHSUB8", "111110101100nnnn1111dddd0010mmmm") + +// Parallel Addition and Subtraction (unsigned) +INST(thumb32_UADD16, "UADD16", "111110101001nnnn1111dddd0100mmmm") +INST(thumb32_UASX, "UASX", "111110101010nnnn1111dddd0100mmmm") +INST(thumb32_USAX, "USAX", "111110101110nnnn1111dddd0100mmmm") +INST(thumb32_USUB16, "USUB16", "111110101101nnnn1111dddd0100mmmm") +INST(thumb32_UADD8, "UADD8", "111110101000nnnn1111dddd0100mmmm") +INST(thumb32_USUB8, "USUB8", "111110101100nnnn1111dddd0100mmmm") +INST(thumb32_UQADD16, "UQADD16", "111110101001nnnn1111dddd0101mmmm") +INST(thumb32_UQASX, "UQASX", "111110101010nnnn1111dddd0101mmmm") +INST(thumb32_UQSAX, "UQSAX", "111110101110nnnn1111dddd0101mmmm") +INST(thumb32_UQSUB16, "UQSUB16", "111110101101nnnn1111dddd0101mmmm") +INST(thumb32_UQADD8, "UQADD8", "111110101000nnnn1111dddd0101mmmm") +INST(thumb32_UQSUB8, "UQSUB8", "111110101100nnnn1111dddd0101mmmm") +INST(thumb32_UHADD16, "UHADD16", "111110101001nnnn1111dddd0110mmmm") +INST(thumb32_UHASX, "UHASX", "111110101010nnnn1111dddd0110mmmm") +INST(thumb32_UHSAX, "UHSAX", "111110101110nnnn1111dddd0110mmmm") +INST(thumb32_UHSUB16, "UHSUB16", "111110101101nnnn1111dddd0110mmmm") +INST(thumb32_UHADD8, "UHADD8", "111110101000nnnn1111dddd0110mmmm") +INST(thumb32_UHSUB8, "UHSUB8", "111110101100nnnn1111dddd0110mmmm") + +// Miscellaneous Operations +INST(thumb32_QADD, "QADD", "111110101000nnnn1111dddd1000mmmm") +INST(thumb32_QDADD, "QDADD", "111110101000nnnn1111dddd1001mmmm") +INST(thumb32_QSUB, "QSUB", "111110101000nnnn1111dddd1010mmmm") +INST(thumb32_QDSUB, "QDSUB", "111110101000nnnn1111dddd1011mmmm") +INST(thumb32_REV, "REV", "111110101001nnnn1111dddd1000mmmm") +INST(thumb32_REV16, "REV16", "111110101001nnnn1111dddd1001mmmm") +INST(thumb32_RBIT, "RBIT", "111110101001nnnn1111dddd1010mmmm") +INST(thumb32_REVSH, "REVSH", "111110101001nnnn1111dddd1011mmmm") +INST(thumb32_SEL, "SEL", "111110101010nnnn1111dddd1000mmmm") +INST(thumb32_CLZ, "CLZ", "111110101011nnnn1111dddd1000mmmm") + +// Multiply, Multiply Accumulate, and Absolute Difference +INST(thumb32_MUL, "MUL", "111110110000nnnn1111dddd0000mmmm") +INST(thumb32_MLA, "MLA", "111110110000nnnnaaaadddd0000mmmm") +INST(thumb32_MLS, "MLS", "111110110000nnnnaaaadddd0001mmmm") +INST(thumb32_SMULXY, "SMULXY", "111110110001nnnn1111dddd00NMmmmm") +INST(thumb32_SMLAXY, "SMLAXY", "111110110001nnnnaaaadddd00NMmmmm") +INST(thumb32_SMUAD, "SMUAD", "111110110010nnnn1111dddd000Mmmmm") +INST(thumb32_SMLAD, "SMLAD", 
"111110110010nnnnaaaadddd000Xmmmm") +INST(thumb32_SMULWY, "SMULWY", "111110110011nnnn1111dddd000Mmmmm") +INST(thumb32_SMLAWY, "SMLAWY", "111110110011nnnnaaaadddd000Mmmmm") +INST(thumb32_SMUSD, "SMUSD", "111110110100nnnn1111dddd000Mmmmm") +INST(thumb32_SMLSD, "SMLSD", "111110110100nnnnaaaadddd000Xmmmm") +INST(thumb32_SMMUL, "SMMUL", "111110110101nnnn1111dddd000Rmmmm") +INST(thumb32_SMMLA, "SMMLA", "111110110101nnnnaaaadddd000Rmmmm") +INST(thumb32_SMMLS, "SMMLS", "111110110110nnnnaaaadddd000Rmmmm") +INST(thumb32_USAD8, "USAD8", "111110110111nnnn1111dddd0000mmmm") +INST(thumb32_USADA8, "USADA8", "111110110111nnnnaaaadddd0000mmmm") + +// Long Multiply, Long Multiply Accumulate, and Divide +INST(thumb32_SMULL, "SMULL", "111110111000nnnnllllhhhh0000mmmm") +INST(thumb32_SDIV, "SDIV", "111110111001nnnn1111dddd1111mmmm") +INST(thumb32_UMULL, "UMULL", "111110111010nnnnllllhhhh0000mmmm") +INST(thumb32_UDIV, "UDIV", "111110111011nnnn1111dddd1111mmmm") +INST(thumb32_SMLAL, "SMLAL", "111110111100nnnnllllhhhh0000mmmm") +INST(thumb32_SMLALXY, "SMLALXY", "111110111100nnnnllllhhhh10NMmmmm") +INST(thumb32_SMLALD, "SMLALD", "111110111100nnnnllllhhhh110Mmmmm") +INST(thumb32_SMLSLD, "SMLSLD", "111110111101nnnnllllhhhh110Mmmmm") +INST(thumb32_UMLAL, "UMLAL", "111110111110nnnnllllhhhh0000mmmm") +INST(thumb32_UMAAL, "UMAAL", "111110111110nnnnllllhhhh0110mmmm") + +// Coprocessor +INST(thumb32_MCRR, "MCRR", "111o11000100uuuuttttppppooooMMMM") +INST(thumb32_MRRC, "MRRC", "111o11000101uuuuttttppppooooMMMM") +INST(thumb32_STC, "STC", "111o110pudw0nnnnDDDDppppvvvvvvvv") +INST(thumb32_LDC, "LDC", "111o110pudw1nnnnDDDDppppvvvvvvvv") +INST(thumb32_CDP, "CDP", "111o1110ooooNNNNDDDDppppooo0MMMM") +INST(thumb32_MCR, "MCR", "111o1110ooo0NNNNttttppppooo1MMMM") +INST(thumb32_MRC, "MRC", "111o1110ooo1NNNNttttppppooo1MMMM") diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/decoder/vfp.h b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/vfp.h new file mode 100644 index 0000000000..f79a859bf7 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/vfp.h @@ -0,0 +1,58 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2032 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <algorithm> +#include <functional> +#include <optional> +#include <vector> + +#include <mcl/stdint.hpp> + +#include "dynarmic/frontend/decoder/decoder_detail.h" +#include "dynarmic/frontend/decoder/matcher.h" + +namespace Dynarmic::A32 { + +template<typename Visitor> +using VFPMatcher = Decoder::Matcher<Visitor, u32>; + +template<typename V> +std::optional<std::reference_wrapper<const VFPMatcher<V>>> DecodeVFP(u32 instruction) { + using Table = std::vector<VFPMatcher<V>>; + + static const struct Tables { + Table unconditional; + Table conditional; + } tables = [] { + Table list = { + +#define INST(fn, name, bitstring) DYNARMIC_DECODER_GET_MATCHER(VFPMatcher, fn, name, Decoder::detail::StringToArray<32>(bitstring)), +#include "./vfp.inc" +#undef INST + + }; + + const auto division = std::stable_partition(list.begin(), list.end(), [&](const auto& matcher) { + return (matcher.GetMask() & 0xF0000000) == 0xF0000000; + }); + + return Tables{ + Table{list.begin(), division}, + Table{division, list.end()}, + }; + }(); + + const bool is_unconditional = (instruction & 0xF0000000) == 0xF0000000; + const Table& table = is_unconditional ? 
tables.unconditional : tables.conditional; + + const auto matches_instruction = [instruction](const auto& matcher) { return matcher.Matches(instruction); }; + + auto iter = std::find_if(table.begin(), table.end(), matches_instruction); + return iter != table.end() ? std::optional<std::reference_wrapper<const VFPMatcher<V>>>(*iter) : std::nullopt; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/decoder/vfp.inc b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/vfp.inc new file mode 100644 index 0000000000..21c9b78d79 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/vfp.inc @@ -0,0 +1,71 @@ +// Floating-point three-register data processing instructions +INST(vfp_VMLA, "VMLA", "cccc11100D00nnnndddd101zN0M0mmmm") // VFPv2 +INST(vfp_VMLS, "VMLS", "cccc11100D00nnnndddd101zN1M0mmmm") // VFPv2 +INST(vfp_VNMLS, "VNMLS", "cccc11100D01nnnndddd101zN0M0mmmm") // VFPv2 +INST(vfp_VNMLA, "VNMLA", "cccc11100D01nnnndddd101zN1M0mmmm") // VFPv2 +INST(vfp_VMUL, "VMUL", "cccc11100D10nnnndddd101zN0M0mmmm") // VFPv2 +INST(vfp_VNMUL, "VNMUL", "cccc11100D10nnnndddd101zN1M0mmmm") // VFPv2 +INST(vfp_VADD, "VADD", "cccc11100D11nnnndddd101zN0M0mmmm") // VFPv2 +INST(vfp_VSUB, "VSUB", "cccc11100D11nnnndddd101zN1M0mmmm") // VFPv2 +INST(vfp_VDIV, "VDIV", "cccc11101D00nnnndddd101zN0M0mmmm") // VFPv2 +INST(vfp_VFNMS, "VFNMS", "cccc11101D01nnnndddd101zN0M0mmmm") // VFPv4 +INST(vfp_VFNMA, "VFNMA", "cccc11101D01nnnndddd101zN1M0mmmm") // VFPv4 +INST(vfp_VFMA, "VFMA", "cccc11101D10nnnndddd101zN0M0mmmm") // VFPv4 +INST(vfp_VFMS, "VFMS", "cccc11101D10nnnndddd101zN1M0mmmm") // VFPv4 +INST(vfp_VSEL, "VSEL", "111111100Dccnnnndddd101zN0M0mmmm") // VFPv5 +INST(vfp_VMAXNM, "VMAXNNM", "111111101D00nnnndddd101zN0M0mmmm") // VFPv5 +INST(vfp_VMINNM, "VMINNM", "111111101D00nnnndddd101zN1M0mmmm") // VFPv5 + +// Other floating-point data-processing instructions +INST(vfp_VMOV_imm, "VMOV (immediate)", "cccc11101D11vvvvdddd101z0000vvvv") // VFPv3 +INST(vfp_VMOV_reg, "VMOV (reg)", "cccc11101D110000dddd101z01M0mmmm") // VFPv2 +INST(vfp_VABS, "VABS", "cccc11101D110000dddd101z11M0mmmm") // VFPv2 +INST(vfp_VNEG, "VNEG", "cccc11101D110001dddd101z01M0mmmm") // VFPv2 +INST(vfp_VSQRT, "VSQRT", "cccc11101D110001dddd101z11M0mmmm") // VFPv2 +INST(vfp_VCVTB, "VCVTB", "cccc11101D11001odddd101z01M0mmmm") // VFPv3HP +INST(vfp_VCVTT, "VCVTT", "cccc11101D11001odddd101z11M0mmmm") // VFPv3HP +INST(vfp_VCMP, "VCMP", "cccc11101D110100dddd101zE1M0mmmm") // VFPv2 +INST(vfp_VCMP_zero, "VCMP (with zero)", "cccc11101D110101dddd101zE1000000") // VFPv2 +INST(vfp_VRINTR, "VRINTR", "cccc11101D110110dddd101z01M0mmmm") // VFPv5 +INST(vfp_VRINTZ, "VRINTZ", "cccc11101D110110dddd101z11M0mmmm") // VFPv5 +INST(vfp_VRINTX, "VRINTX", "cccc11101D110111dddd101z01M0mmmm") // VFPv5 +INST(vfp_VCVT_f_to_f, "VCVT (f32<->f64)", "cccc11101D110111dddd101z11M0mmmm") // VFPv2 +INST(vfp_VCVT_from_int, "VCVT (from int)", "cccc11101D111000dddd101zs1M0mmmm") // VFPv2 +INST(vfp_VCVT_from_fixed, "VCVT (from fixed)", "cccc11101D11101Udddd101zx1i0vvvv") // VFPv3 +INST(vfp_VCVT_to_u32, "VCVT (to u32)", "cccc11101D111100dddd101zr1M0mmmm") // VFPv2 +INST(vfp_VCVT_to_s32, "VCVT (to s32)", "cccc11101D111101dddd101zr1M0mmmm") // VFPv2 +INST(vfp_VCVT_to_fixed, "VCVT (to fixed)", "cccc11101D11111Udddd101zx1i0vvvv") // VFPv3 +INST(vfp_VRINT_rm, "VRINT{A,N,P,M}", "111111101D1110mmdddd101z01M0mmmm") // VFPv5 +INST(vfp_VCVT_rm, "VCVT{A,N,P,M}", "111111101D1111mmdddd101zU1M0mmmm") // VFPv5 + +// Floating-point move instructions 
+INST(vfp_VMOV_u32_f64, "VMOV (core to f64)", "cccc11100000ddddtttt1011D0010000") // VFPv2 +INST(vfp_VMOV_f64_u32, "VMOV (f64 to core)", "cccc11100001nnnntttt1011N0010000") // VFPv2 +INST(vfp_VMOV_u32_f32, "VMOV (core to f32)", "cccc11100000nnnntttt1010N0010000") // VFPv2 +INST(vfp_VMOV_f32_u32, "VMOV (f32 to core)", "cccc11100001nnnntttt1010N0010000") // VFPv2 +INST(vfp_VMOV_2u32_2f32, "VMOV (2xcore to 2xf32)", "cccc11000100uuuutttt101000M1mmmm") // VFPv2 +INST(vfp_VMOV_2f32_2u32, "VMOV (2xf32 to 2xcore)", "cccc11000101uuuutttt101000M1mmmm") // VFPv2 +INST(vfp_VMOV_2u32_f64, "VMOV (2xcore to f64)", "cccc11000100uuuutttt101100M1mmmm") // VFPv2 +INST(vfp_VMOV_f64_2u32, "VMOV (f64 to 2xcore)", "cccc11000101uuuutttt101100M1mmmm") // VFPv2 +INST(vfp_VMOV_from_i32, "VMOV (core to i32)" , "cccc111000i0nnnntttt1011N0010000") // VFPv4 +INST(vfp_VMOV_from_i16, "VMOV (core to i16)" , "cccc111000i0nnnntttt1011Ni110000") // ASIMD +INST(vfp_VMOV_from_i8, "VMOV (core to i8)", "cccc111001i0nnnntttt1011Nii10000") // ASIMD +INST(vfp_VMOV_to_i32, "VMOV (i32 to core)" , "cccc111000i1nnnntttt1011N0010000") // VFPv4 +INST(vfp_VMOV_to_i16, "VMOV (i16 to core)" , "cccc1110U0i1nnnntttt1011Ni110000") // ASIMD +INST(vfp_VMOV_to_i8, "VMOV (i8 to core)", "cccc1110U1i1nnnntttt1011Nii10000") // ASIMD +INST(vfp_VDUP, "VDUP (from core)", "cccc11101BQ0ddddtttt1011D0E10000") // ASIMD + +// Floating-point system register access +INST(vfp_VMSR, "VMSR", "cccc111011100001tttt101000010000") // VFPv2 +INST(vfp_VMRS, "VMRS", "cccc111011110001tttt101000010000") // VFPv2 + +// Extension register load-store instructions +INST(vfp_VPUSH, "VPUSH", "cccc11010D101101dddd101zvvvvvvvv") // VFPv2 +INST(vfp_VPOP, "VPOP", "cccc11001D111101dddd101zvvvvvvvv") // VFPv2 +INST(vfp_VLDR, "VLDR", "cccc1101UD01nnnndddd101zvvvvvvvv") // VFPv2 +INST(vfp_VSTR, "VSTR", "cccc1101UD00nnnndddd101zvvvvvvvv") // VFPv2 +INST(arm_UDF, "Undefined VSTM/VLDM", "----11000-0---------101---------") // VFPv2 +INST(vfp_VSTM_a1, "VSTM (A1)", "cccc110puDw0nnnndddd1011vvvvvvvv") // VFPv2 +INST(vfp_VSTM_a2, "VSTM (A2)", "cccc110puDw0nnnndddd1010vvvvvvvv") // VFPv2 +INST(vfp_VLDM_a1, "VLDM (A1)", "cccc110puDw1nnnndddd1011vvvvvvvv") // VFPv2 +INST(vfp_VLDM_a2, "VLDM (A2)", "cccc110puDw1nnnndddd1010vvvvvvvv") // VFPv2 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/disassembler/disassembler.h b/externals/dynarmic/src/dynarmic/frontend/A32/disassembler/disassembler.h new file mode 100644 index 0000000000..6a61afdefa --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/disassembler/disassembler.h @@ -0,0 +1,17 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <string> + +#include <mcl/stdint.hpp> + +namespace Dynarmic::A32 { + +std::string DisassembleArm(u32 instruction); +std::string DisassembleThumb16(u16 instruction); + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/disassembler/disassembler_arm.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/disassembler/disassembler_arm.cpp new file mode 100644 index 0000000000..ff0c48a1ce --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/disassembler/disassembler_arm.cpp @@ -0,0 +1,1584 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <cstdlib> +#include <string> + +#include <fmt/format.h> +#include <fmt/ostream.h> +#include <mcl/bit/bit_field.hpp> +#include <mcl/bit/rotate.hpp> + +#include "dynarmic/common/string_util.h" +#include "dynarmic/frontend/A32/a32_types.h" +#include "dynarmic/frontend/A32/decoder/arm.h" +#include "dynarmic/frontend/A32/decoder/vfp.h" +#include "dynarmic/frontend/A32/disassembler/disassembler.h" +#include "dynarmic/frontend/imm.h" + +namespace Dynarmic::A32 { + +class DisassemblerVisitor { +public: + using instruction_return_type = std::string; + + static u32 ArmExpandImm(int rotate, Imm<8> imm8) { + return mcl::bit::rotate_right(static_cast<u32>(imm8.ZeroExtend()), rotate * 2); + } + + static std::string ShiftStr(ShiftType shift, Imm<5> imm5) { + switch (shift) { + case ShiftType::LSL: + if (imm5 == 0) + return ""; + return fmt::format(", lsl #{}", imm5.ZeroExtend()); + case ShiftType::LSR: + if (imm5 == 0) + return ", lsr #32"; + return fmt::format(", lsr #{}", imm5.ZeroExtend()); + case ShiftType::ASR: + if (imm5 == 0) + return ", asr #32"; + return fmt::format(", asr #{}", imm5.ZeroExtend()); + case ShiftType::ROR: + if (imm5 == 0) + return ", rrx"; + return fmt::format(", ror #{}", imm5.ZeroExtend()); + } + ASSERT(false); + return "<internal error>"; + } + + static std::string RsrStr(Reg s, ShiftType shift, Reg m) { + switch (shift) { + case ShiftType::LSL: + return fmt::format("{}, lsl {}", m, s); + case ShiftType::LSR: + return fmt::format("{}, lsr {}", m, s); + case ShiftType::ASR: + return fmt::format("{}, asr {}", m, s); + case ShiftType::ROR: + return fmt::format("{}, ror {}", m, s); + } + ASSERT(false); + return "<internal error>"; + } + + static std::string RorStr(Reg m, SignExtendRotation rotate) { + switch (rotate) { + case SignExtendRotation::ROR_0: + return RegToString(m); + case SignExtendRotation::ROR_8: + return fmt::format("{}, ror #8", m); + case SignExtendRotation::ROR_16: + return fmt::format("{}, ror #16", m); + case SignExtendRotation::ROR_24: + return fmt::format("{}, ror #24", m); + } + ASSERT(false); + return "<internal error>"; + } + + static const char* BarrierOptionStr(Imm<4> option) { + switch (option.ZeroExtend()) { + case 0b0010: + return " oshst"; + case 0b0011: + return " osh"; + case 0b0110: + return " nshst"; + case 0b0111: + return " nsh"; + case 0b1010: + return " ishst"; + case 0b1011: + return " ish"; + case 0b1110: + return " st"; + case 0b1111: // SY can be omitted. + return ""; + default: + return " unknown"; + } + } + + static std::string FPRegStr(bool dp_operation, size_t base, bool bit) { + size_t reg_num; + if (dp_operation) { + reg_num = base + (bit ? 16 : 0); + } else { + reg_num = (base << 1) + (bit ? 1 : 0); + } + return fmt::format("{}{}", dp_operation ? 'd' : 's', reg_num); + } + + static std::string FPNextRegStr(bool dp_operation, size_t base, bool bit) { + size_t reg_num; + if (dp_operation) { + reg_num = base + (bit ? 16 : 0); + } else { + reg_num = (base << 1) + (bit ? 1 : 0); + } + return fmt::format("{}{}", dp_operation ? 'd' : 's', reg_num + 1); + } + + static std::string VectorStr(bool Q, size_t base, bool bit) { + size_t reg_num; + if (Q) { + reg_num = (base >> 1) + (bit ? 8 : 0); + } else { + reg_num = base + (bit ? 16 : 0); + } + return fmt::format("{}{}", Q ? 'q' : 'd', reg_num); + } + + static std::string CondOrTwo(Cond cond) { + return cond == Cond::NV ? 
"2" : CondToString(cond); + } + + // Barrier instructions + std::string arm_DMB(Imm<4> option) { + return fmt::format("dmb{}", BarrierOptionStr(option)); + } + std::string arm_DSB(Imm<4> option) { + return fmt::format("dsb{}", BarrierOptionStr(option)); + } + std::string arm_ISB([[maybe_unused]] Imm<4> option) { + return "isb"; + } + + // Branch instructions + std::string arm_B(Cond cond, Imm<24> imm24) { + const s32 offset = static_cast<s32>(mcl::bit::sign_extend<26, u32>(imm24.ZeroExtend() << 2) + 8); + return fmt::format("b{} {}#{}", CondToString(cond), Common::SignToChar(offset), abs(offset)); + } + std::string arm_BL(Cond cond, Imm<24> imm24) { + const s32 offset = static_cast<s32>(mcl::bit::sign_extend<26, u32>(imm24.ZeroExtend() << 2) + 8); + return fmt::format("bl{} {}#{}", CondToString(cond), Common::SignToChar(offset), abs(offset)); + } + std::string arm_BLX_imm(bool H, Imm<24> imm24) { + const s32 offset = static_cast<s32>(mcl::bit::sign_extend<26, u32>(imm24.ZeroExtend() << 2) + 8 + (H ? 2 : 0)); + return fmt::format("blx {}#{}", Common::SignToChar(offset), abs(offset)); + } + std::string arm_BLX_reg(Cond cond, Reg m) { + return fmt::format("blx{} {}", CondToString(cond), m); + } + std::string arm_BX(Cond cond, Reg m) { + return fmt::format("bx{} {}", CondToString(cond), m); + } + std::string arm_BXJ(Cond cond, Reg m) { + return fmt::format("bxj{} {}", CondToString(cond), m); + } + + // Coprocessor instructions + std::string arm_CDP(Cond cond, size_t opc1, CoprocReg CRn, CoprocReg CRd, size_t coproc_no, size_t opc2, CoprocReg CRm) { + return fmt::format("cdp{} p{}, #{}, {}, {}, {}, #{}", CondToString(cond), coproc_no, opc1, CRd, CRn, CRm, opc2); + } + + std::string arm_LDC(Cond cond, bool p, bool u, bool d, bool w, Reg n, CoprocReg CRd, size_t coproc_no, Imm<8> imm8) { + const u32 imm32 = static_cast<u32>(imm8.ZeroExtend()) << 2; + if (!p && !u && !d && !w) { + return "<undefined>"; + } + if (p) { + return fmt::format("ldc{}{} {}, {}, [{}, #{}{}]{}", d ? "l" : "", + CondOrTwo(cond), coproc_no, CRd, n, u ? "+" : "-", imm32, + w ? "!" : ""); + } + if (!p && w) { + return fmt::format("ldc{}{} {}, {}, [{}], #{}{}", d ? "l" : "", + CondOrTwo(cond), coproc_no, CRd, n, u ? "+" : "-", imm32); + } + if (!p && !w && u) { + return fmt::format("ldc{}{} {}, {}, [{}], {}", d ? 
"l" : "", + CondOrTwo(cond), coproc_no, CRd, n, imm8.ZeroExtend()); + } + UNREACHABLE(); + } + + std::string arm_MCR(Cond cond, size_t opc1, CoprocReg CRn, Reg t, size_t coproc_no, size_t opc2, CoprocReg CRm) { + return fmt::format("mcr{} p{}, #{}, {}, {}, {}, #{}", CondOrTwo(cond), coproc_no, opc1, t, CRn, CRm, opc2); + } + + std::string arm_MCRR(Cond cond, Reg t2, Reg t, size_t coproc_no, size_t opc, CoprocReg CRm) { + return fmt::format("mcr{} p{}, #{}, {}, {}, {}", CondOrTwo(cond), coproc_no, opc, t, t2, CRm); + } + + std::string arm_MRC(Cond cond, size_t opc1, CoprocReg CRn, Reg t, size_t coproc_no, size_t opc2, CoprocReg CRm) { + return fmt::format("mrc{} p{}, #{}, {}, {}, {}, #{}", CondOrTwo(cond), coproc_no, opc1, t, CRn, CRm, opc2); + } + + std::string arm_MRRC(Cond cond, Reg t2, Reg t, size_t coproc_no, size_t opc, CoprocReg CRm) { + return fmt::format("mrrc{} p{}, #{}, {}, {}, {}", CondOrTwo(cond), coproc_no, opc, t, t2, CRm); + } + + std::string arm_STC(Cond cond, bool p, bool u, bool d, bool w, Reg n, CoprocReg CRd, size_t coproc_no, Imm<8> imm8) { + const u32 imm32 = static_cast<u32>(imm8.ZeroExtend()) << 2; + if (!p && !u && !d && !w) { + return "<undefined>"; + } + if (p) { + return fmt::format("stc{}{} {}, {}, [{}, #{}{}]{}", d ? "l" : "", + CondOrTwo(cond), coproc_no, CRd, n, + u ? "+" : "-", imm32, w ? "!" : ""); + } + if (!p && w) { + return fmt::format("stc{}{} {}, {}, [{}], #{}{}", d ? "l" : "", + CondOrTwo(cond), coproc_no, CRd, n, + u ? "+" : "-", imm32); + } + if (!p && !w && u) { + return fmt::format("stc{}{} {}, {}, [{}], {}", d ? "l" : "", + CondOrTwo(cond), coproc_no, CRd, n, imm8.ZeroExtend()); + } + UNREACHABLE(); + } + + // CRC32 instructions + std::string arm_CRC32([[maybe_unused]] Cond cond, Imm<2> sz, Reg n, Reg d, Reg m) { + static constexpr std::array data_type{ + "b", + "h", + "w", + "invalid", + }; + + return fmt::format("crc32{} {}, {}, {}", data_type[sz.ZeroExtend()], d, n, m); + } + std::string arm_CRC32C([[maybe_unused]] Cond cond, Imm<2> sz, Reg n, Reg d, Reg m) { + static constexpr std::array data_type{ + "b", + "h", + "w", + "invalid", + }; + + return fmt::format("crc32c{} {}, {}, {}", data_type[sz.ZeroExtend()], d, n, m); + } + + // Data processing instructions + std::string arm_ADC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) { + return fmt::format("adc{}{} {}, {}, #{}", CondToString(cond), S ? "s" : "", d, n, ArmExpandImm(rotate, imm8)); + } + std::string arm_ADC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) { + return fmt::format("adc{}{} {}, {}, {}{}", CondToString(cond), S ? "s" : "", d, n, m, ShiftStr(shift, imm5)); + } + std::string arm_ADC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) { + return fmt::format("adc{}{} {}, {}, {}", CondToString(cond), S ? "s" : "", d, n, RsrStr(s, shift, m)); + } + std::string arm_ADD_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) { + return fmt::format("add{}{} {}, {}, #{}", CondToString(cond), S ? "s" : "", d, n, ArmExpandImm(rotate, imm8)); + } + std::string arm_ADD_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) { + return fmt::format("add{}{} {}, {}, {}{}", CondToString(cond), S ? "s" : "", d, n, m, ShiftStr(shift, imm5)); + } + std::string arm_ADD_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) { + return fmt::format("add{}{} {}, {}, {}", CondToString(cond), S ? 
"s" : "", d, n, RsrStr(s, shift, m)); + } + std::string arm_AND_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) { + return fmt::format("and{}{} {}, {}, #{}", CondToString(cond), S ? "s" : "", d, n, ArmExpandImm(rotate, imm8)); + } + std::string arm_AND_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) { + return fmt::format("and{}{} {}, {}, {}{}", CondToString(cond), S ? "s" : "", d, n, m, ShiftStr(shift, imm5)); + } + std::string arm_AND_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) { + return fmt::format("and{}{} {}, {}, {}", CondToString(cond), S ? "s" : "", d, n, RsrStr(s, shift, m)); + } + std::string arm_BIC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) { + return fmt::format("bic{}{} {}, {}, #{}", CondToString(cond), S ? "s" : "", d, n, ArmExpandImm(rotate, imm8)); + } + std::string arm_BIC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) { + return fmt::format("bic{}{} {}, {}, {}{}", CondToString(cond), S ? "s" : "", d, n, m, ShiftStr(shift, imm5)); + } + std::string arm_BIC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) { + return fmt::format("bic{}{} {}, {}, {}", CondToString(cond), S ? "s" : "", d, n, RsrStr(s, shift, m)); + } + std::string arm_CMN_imm(Cond cond, Reg n, int rotate, Imm<8> imm8) { + return fmt::format("cmn{} {}, #{}", CondToString(cond), n, ArmExpandImm(rotate, imm8)); + } + std::string arm_CMN_reg(Cond cond, Reg n, Imm<5> imm5, ShiftType shift, Reg m) { + return fmt::format("cmn{} {}, {}{}", CondToString(cond), n, m, ShiftStr(shift, imm5)); + } + std::string arm_CMN_rsr(Cond cond, Reg n, Reg s, ShiftType shift, Reg m) { + return fmt::format("cmn{} {}, {}", CondToString(cond), n, RsrStr(s, shift, m)); + } + std::string arm_CMP_imm(Cond cond, Reg n, int rotate, Imm<8> imm8) { + return fmt::format("cmp{} {}, #{}", CondToString(cond), n, ArmExpandImm(rotate, imm8)); + } + std::string arm_CMP_reg(Cond cond, Reg n, Imm<5> imm5, ShiftType shift, Reg m) { + return fmt::format("cmp{} {}, {}{}", CondToString(cond), n, m, ShiftStr(shift, imm5)); + } + std::string arm_CMP_rsr(Cond cond, Reg n, Reg s, ShiftType shift, Reg m) { + return fmt::format("cmp{} {}, {}", CondToString(cond), n, RsrStr(s, shift, m)); + } + std::string arm_EOR_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) { + return fmt::format("eor{}{} {}, {}, #{}", CondToString(cond), S ? "s" : "", d, n, ArmExpandImm(rotate, imm8)); + } + std::string arm_EOR_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) { + return fmt::format("eor{}{} {}, {}, {}{}", CondToString(cond), S ? "s" : "", d, n, m, ShiftStr(shift, imm5)); + } + std::string arm_EOR_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) { + return fmt::format("eor{}{} {}, {}, {}", CondToString(cond), S ? "s" : "", d, n, RsrStr(s, shift, m)); + } + std::string arm_MOV_imm(Cond cond, bool S, Reg d, int rotate, Imm<8> imm8) { + return fmt::format("mov{}{} {}, #{}", CondToString(cond), S ? "s" : "", d, ArmExpandImm(rotate, imm8)); + } + std::string arm_MOV_reg(Cond cond, bool S, Reg d, Imm<5> imm5, ShiftType shift, Reg m) { + return fmt::format("mov{}{} {}, {}{}", CondToString(cond), S ? "s" : "", d, m, ShiftStr(shift, imm5)); + } + std::string arm_MOV_rsr(Cond cond, bool S, Reg d, Reg s, ShiftType shift, Reg m) { + return fmt::format("mov{}{} {}, {}", CondToString(cond), S ? 
"s" : "", d, RsrStr(s, shift, m)); + } + std::string arm_MVN_imm(Cond cond, bool S, Reg d, int rotate, Imm<8> imm8) { + return fmt::format("mvn{}{} {}, #{}", CondToString(cond), S ? "s" : "", d, ArmExpandImm(rotate, imm8)); + } + std::string arm_MVN_reg(Cond cond, bool S, Reg d, Imm<5> imm5, ShiftType shift, Reg m) { + return fmt::format("mvn{}{} {}, {}{}", CondToString(cond), S ? "s" : "", d, m, ShiftStr(shift, imm5)); + } + std::string arm_MVN_rsr(Cond cond, bool S, Reg d, Reg s, ShiftType shift, Reg m) { + return fmt::format("mvn{}{} {}, {}", CondToString(cond), S ? "s" : "", d, RsrStr(s, shift, m)); + } + std::string arm_ORR_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) { + return fmt::format("orr{}{} {}, {}, #{}", CondToString(cond), S ? "s" : "", d, n, ArmExpandImm(rotate, imm8)); + } + std::string arm_ORR_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) { + return fmt::format("orr{}{} {}, {}, {}{}", CondToString(cond), S ? "s" : "", d, n, m, ShiftStr(shift, imm5)); + } + std::string arm_ORR_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) { + return fmt::format("orr{}{} {}, {}, {}", CondToString(cond), S ? "s" : "", d, n, RsrStr(s, shift, m)); + } + std::string arm_RSB_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) { + return fmt::format("rsb{}{} {}, {}, #{}", CondToString(cond), S ? "s" : "", d, n, ArmExpandImm(rotate, imm8)); + } + std::string arm_RSB_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) { + return fmt::format("rsb{}{} {}, {}, {}{}", CondToString(cond), S ? "s" : "", d, n, m, ShiftStr(shift, imm5)); + } + std::string arm_RSB_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) { + return fmt::format("rsb{}{} {}, {}, {}", CondToString(cond), S ? "s" : "", d, n, RsrStr(s, shift, m)); + } + std::string arm_RSC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) { + return fmt::format("rsc{}{} {}, {}, #{}", CondToString(cond), S ? "s" : "", d, n, ArmExpandImm(rotate, imm8)); + } + std::string arm_RSC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) { + return fmt::format("rsc{}{} {}, {}, {}{}", CondToString(cond), S ? "s" : "", d, n, m, ShiftStr(shift, imm5)); + } + std::string arm_RSC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) { + return fmt::format("rsc{}{} {}, {}, {}", CondToString(cond), S ? "s" : "", d, n, RsrStr(s, shift, m)); + } + std::string arm_SBC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) { + return fmt::format("sbc{}{} {}, {}, #{}", CondToString(cond), S ? "s" : "", d, n, ArmExpandImm(rotate, imm8)); + } + std::string arm_SBC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) { + return fmt::format("sbc{}{} {}, {}, {}{}", CondToString(cond), S ? "s" : "", d, n, m, ShiftStr(shift, imm5)); + } + std::string arm_SBC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) { + return fmt::format("sbc{}{} {}, {}, {}", CondToString(cond), S ? "s" : "", d, n, RsrStr(s, shift, m)); + } + std::string arm_SUB_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) { + return fmt::format("sub{}{} {}, {}, #{}", CondToString(cond), S ? "s" : "", d, n, ArmExpandImm(rotate, imm8)); + } + std::string arm_SUB_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) { + return fmt::format("sub{}{} {}, {}, {}{}", CondToString(cond), S ? 
"s" : "", d, n, m, ShiftStr(shift, imm5)); + } + std::string arm_SUB_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) { + return fmt::format("sub{}{} {}, {}, {}", CondToString(cond), S ? "s" : "", d, n, RsrStr(s, shift, m)); + } + std::string arm_TEQ_imm(Cond cond, Reg n, int rotate, Imm<8> imm8) { + return fmt::format("teq{} {}, #{}", CondToString(cond), n, ArmExpandImm(rotate, imm8)); + } + std::string arm_TEQ_reg(Cond cond, Reg n, Imm<5> imm5, ShiftType shift, Reg m) { + return fmt::format("teq{} {}, {}{}", CondToString(cond), n, m, ShiftStr(shift, imm5)); + } + std::string arm_TEQ_rsr(Cond cond, Reg n, Reg s, ShiftType shift, Reg m) { + return fmt::format("teq{} {}, {}", CondToString(cond), n, RsrStr(s, shift, m)); + } + std::string arm_TST_imm(Cond cond, Reg n, int rotate, Imm<8> imm8) { + return fmt::format("tst{} {}, #{}", CondToString(cond), n, ArmExpandImm(rotate, imm8)); + } + std::string arm_TST_reg(Cond cond, Reg n, Imm<5> imm5, ShiftType shift, Reg m) { + return fmt::format("tst{} {}, {}{}", CondToString(cond), n, m, ShiftStr(shift, imm5)); + } + std::string arm_TST_rsr(Cond cond, Reg n, Reg s, ShiftType shift, Reg m) { + return fmt::format("tst{} {}, {}", CondToString(cond), n, RsrStr(s, shift, m)); + } + + // Exception generation instructions + std::string arm_BKPT(Cond cond, Imm<12> imm12, Imm<4> imm4) { + return fmt::format("bkpt{} #{}", CondToString(cond), concatenate(imm12, imm4).ZeroExtend()); + } + std::string arm_SVC(Cond cond, Imm<24> imm24) { + return fmt::format("svc{} #{}", CondToString(cond), imm24.ZeroExtend()); + } + std::string arm_UDF() { + return fmt::format("udf"); + } + + // Extension functions + std::string arm_SXTAB(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m) { + return fmt::format("sxtab{} {}, {}, {}", CondToString(cond), d, n, RorStr(m, rotate)); + } + std::string arm_SXTAB16(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m) { + return fmt::format("sxtab16{} {}, {}, {}", CondToString(cond), d, n, RorStr(m, rotate)); + } + std::string arm_SXTAH(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m) { + return fmt::format("sxtah{} {}, {}, {}", CondToString(cond), d, n, RorStr(m, rotate)); + } + std::string arm_SXTB(Cond cond, Reg d, SignExtendRotation rotate, Reg m) { + return fmt::format("sxtb{} {}, {}", CondToString(cond), d, RorStr(m, rotate)); + } + std::string arm_SXTB16(Cond cond, Reg d, SignExtendRotation rotate, Reg m) { + return fmt::format("sxtb16{} {}, {}", CondToString(cond), d, RorStr(m, rotate)); + } + std::string arm_SXTH(Cond cond, Reg d, SignExtendRotation rotate, Reg m) { + return fmt::format("sxth{} {}, {}", CondToString(cond), d, RorStr(m, rotate)); + } + std::string arm_UXTAB(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m) { + return fmt::format("uxtab{} {}, {}, {}", CondToString(cond), d, n, RorStr(m, rotate)); + } + std::string arm_UXTAB16(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m) { + return fmt::format("uxtab16{} {}, {}, {}", CondToString(cond), d, n, RorStr(m, rotate)); + } + std::string arm_UXTAH(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m) { + return fmt::format("uxtah{} {}, {}, {}", CondToString(cond), d, n, RorStr(m, rotate)); + } + std::string arm_UXTB(Cond cond, Reg d, SignExtendRotation rotate, Reg m) { + return fmt::format("uxtb{} {}, {}", CondToString(cond), d, RorStr(m, rotate)); + } + std::string arm_UXTB16(Cond cond, Reg d, SignExtendRotation rotate, Reg m) { + return fmt::format("uxtb16{} {}, {}", CondToString(cond), d, RorStr(m, 
rotate)); + } + std::string arm_UXTH(Cond cond, Reg d, SignExtendRotation rotate, Reg m) { + return fmt::format("uxth{} {}, {}", CondToString(cond), d, RorStr(m, rotate)); + } + + // Hint instructions + std::string arm_PLD_imm(bool add, bool R, Reg n, Imm<12> imm12) { + const char sign = add ? '+' : '-'; + const char* const w = R ? "" : "w"; + + return fmt::format("pld{} [{}, #{}{:x}]", w, n, sign, imm12.ZeroExtend()); + } + std::string arm_PLD_reg(bool add, bool R, Reg n, Imm<5> imm5, ShiftType shift, Reg m) { + const char sign = add ? '+' : '-'; + const char* const w = R ? "" : "w"; + + return fmt::format("pld{} [{}, {}{}{}]", w, n, sign, m, ShiftStr(shift, imm5)); + } + std::string arm_SEV() { + return "sev"; + } + std::string arm_SEVL() { + return "sevl"; + } + std::string arm_WFE() { + return "wfe"; + } + std::string arm_WFI() { + return "wfi"; + } + std::string arm_YIELD() { + return "yield"; + } + + // Load/Store instructions + std::string arm_LDR_lit(Cond cond, bool U, Reg t, Imm<12> imm12) { + const bool P = true; + const bool W = false; + return arm_LDR_imm(cond, P, U, W, Reg::PC, t, imm12); + } + std::string arm_LDR_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<12> imm12) { + const u32 imm12_value = imm12.ZeroExtend(); + const char sign = U ? '+' : '-'; + + if (P) { + return fmt::format("ldr{} {}, [{}, #{}{}]{}", + CondToString(cond), t, n, sign, + imm12_value, W ? "!" : ""); + } else { + return fmt::format("ldr{} {}, [{}], #{}{}{}", + CondToString(cond), t, n, sign, + imm12_value, W ? " (err: W == 1!!!)" : ""); + } + } + std::string arm_LDR_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<5> imm5, ShiftType shift, Reg m) { + const char sign = U ? '+' : '-'; + + if (P) { + return fmt::format("ldr{} {}, [{}, {}{}{}]{}", + CondToString(cond), t, n, sign, m, + ShiftStr(shift, imm5), W ? "!" : ""); + } else { + return fmt::format("ldr{} {}, [{}], {}{}{}{}", + CondToString(cond), t, n, sign, m, + ShiftStr(shift, imm5), W ? " (err: W == 1!!!)" : ""); + } + } + std::string arm_LDRB_lit(Cond cond, bool U, Reg t, Imm<12> imm12) { + const bool P = true; + const bool W = false; + return arm_LDRB_imm(cond, P, U, W, Reg::PC, t, imm12); + } + std::string arm_LDRB_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<12> imm12) { + const u32 imm12_value = imm12.ZeroExtend(); + const char sign = U ? '+' : '-'; + + if (P) { + return fmt::format("ldrb{} {}, [{}, #{}{}]{}", + CondToString(cond), t, n, sign, imm12_value, + W ? "!" : ""); + } else { + return fmt::format("ldrb{} {}, [{}], #{}{}{}", + CondToString(cond), t, n, sign, imm12_value, + W ? " (err: W == 1!!!)" : ""); + } + } + std::string arm_LDRB_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<5> imm5, ShiftType shift, Reg m) { + const char sign = U ? '+' : '-'; + + if (P) { + return fmt::format("ldrb{} {}, [{}, {}{}{}]{}", + CondToString(cond), t, n, sign, m, + ShiftStr(shift, imm5), W ? "!" : ""); + } else { + return fmt::format("ldrb{} {}, [{}], {}{}{}{}", + CondToString(cond), t, n, sign, m, + ShiftStr(shift, imm5), W ? " (err: W == 1!!!)" : ""); + } + } + std::string arm_LDRBT() { return "ice"; } + std::string arm_LDRD_lit(Cond cond, bool U, Reg t, Imm<4> imm8a, Imm<4> imm8b) { + const bool P = true; + const bool W = false; + return arm_LDRD_imm(cond, P, U, W, Reg::PC, t, imm8a, imm8b); + } + std::string arm_LDRD_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b) { + const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend(); + const char sign = U ? 
'+' : '-'; + + if (P) { + return fmt::format("ldrd{} {}, {}, [{}, #{}{}]{}", + CondToString(cond), t, t + 1, n, sign, imm32, + W ? "!" : ""); + } else { + return fmt::format("ldrd{} {}, {}, [{}], #{}{}{}", + CondToString(cond), t, t + 1, n, sign, imm32, + W ? " (err: W == 1!!!)" : ""); + } + } + std::string arm_LDRD_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m) { + const char sign = U ? '+' : '-'; + + if (P) { + return fmt::format("ldrd{} {}, {}, [{}, {}{}]{}", + CondToString(cond), t, t + 1, n, sign, m, + W ? "!" : ""); + } else { + return fmt::format("ldrd{} {}, {}, [{}], {}{}{}", + CondToString(cond), t, t + 1, n, sign, m, + W ? " (err: W == 1!!!)" : ""); + } + } + std::string arm_LDRH_lit(Cond cond, bool P, bool U, bool W, Reg t, Imm<4> imm8a, Imm<4> imm8b) { + return arm_LDRH_imm(cond, P, U, W, Reg::PC, t, imm8a, imm8b); + } + std::string arm_LDRH_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b) { + const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend(); + const char sign = U ? '+' : '-'; + + if (P) { + return fmt::format("ldrh{} {}, [{}, #{}{}]{}", + CondToString(cond), t, n, sign, imm32, + W ? "!" : ""); + } else { + return fmt::format("ldrh{} {}, [{}], #{}{}{}", + CondToString(cond), t, n, sign, imm32, + W ? " (err: W == 1!!!)" : ""); + } + } + std::string arm_LDRH_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m) { + const char sign = U ? '+' : '-'; + + if (P) { + return fmt::format("ldrh{} {}, [{}, {}{}]{}", + CondToString(cond), t, n, sign, m, + W ? "!" : ""); + } else { + return fmt::format("ldrh{} {}, [{}], {}{}{}", + CondToString(cond), t, n, sign, m, + W ? " (err: W == 1!!!)" : ""); + } + } + std::string arm_LDRHT() { return "ice"; } + std::string arm_LDRSB_lit(Cond cond, bool U, Reg t, Imm<4> imm8a, Imm<4> imm8b) { + const bool P = true; + const bool W = false; + return arm_LDRSB_imm(cond, P, U, W, Reg::PC, t, imm8a, imm8b); + } + std::string arm_LDRSB_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b) { + const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend(); + const char sign = U ? '+' : '-'; + + if (P) { + return fmt::format("ldrsb{} {}, [{}, #{}{}]{}", + CondToString(cond), t, n, sign, imm32, + W ? "!" : ""); + } else { + return fmt::format("ldrsb{} {}, [{}], #{}{}{}", + CondToString(cond), t, n, sign, imm32, + W ? " (err: W == 1!!!)" : ""); + } + } + std::string arm_LDRSB_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m) { + const char sign = U ? '+' : '-'; + + if (P) { + return fmt::format("ldrsb{} {}, [{}, {}{}]{}", + CondToString(cond), t, n, sign, m, + W ? "!" : ""); + } else { + return fmt::format("ldrsb{} {}, [{}], {}{}{}", + CondToString(cond), t, n, sign, m, + W ? " (err: W == 1!!!)" : ""); + } + } + std::string arm_LDRSBT() { return "ice"; } + std::string arm_LDRSH_lit(Cond cond, bool U, Reg t, Imm<4> imm8a, Imm<4> imm8b) { + const bool P = true; + const bool W = false; + return arm_LDRSH_imm(cond, P, U, W, Reg::PC, t, imm8a, imm8b); + } + std::string arm_LDRSH_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b) { + const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend(); + const char sign = U ? '+' : '-'; + + if (P) { + return fmt::format("ldrsh{} {}, [{}, #{}{}]{}", + CondToString(cond), t, n, sign, imm32, + W ? "!" : ""); + } else { + return fmt::format("ldrsh{} {}, [{}], #{}{}{}", + CondToString(cond), t, n, sign, imm32, + W ? 
" (err: W == 1!!!)" : ""); + } + } + std::string arm_LDRSH_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m) { + const char sign = U ? '+' : '-'; + + if (P) { + return fmt::format("ldrsh{} {}, [{}, {}{}]{}", + CondToString(cond), t, n, sign, m, + W ? "!" : ""); + } else { + return fmt::format("ldrsh{} {}, [{}], {}{}{}", + CondToString(cond), t, n, sign, m, + W ? " (err: W == 1!!!)" : ""); + } + } + std::string arm_LDRSHT() { return "ice"; } + std::string arm_LDRT() { return "ice"; } + std::string arm_STR_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<12> imm12) { + const u32 imm12_value = imm12.ZeroExtend(); + const char sign = U ? '+' : '-'; + + if (P) { + return fmt::format("str{} {}, [{}, #{}{}]{}", + CondToString(cond), t, n, sign, imm12_value, + W ? "!" : ""); + } else { + return fmt::format("str{} {}, [{}], #{}{}{}", + CondToString(cond), t, n, sign, imm12_value, + W ? " (err: W == 1!!!)" : ""); + } + } + std::string arm_STR_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<5> imm5, ShiftType shift, Reg m) { + const char sign = U ? '+' : '-'; + + if (P) { + return fmt::format("str{} {}, [{}, {}{}{}]{}", + CondToString(cond), t, n, sign, m, + ShiftStr(shift, imm5), W ? "!" : ""); + } else { + return fmt::format("str{} {}, [{}], {}{}{}{}", + CondToString(cond), t, n, sign, m, + ShiftStr(shift, imm5), W ? " (err: W == 1!!!)" : ""); + } + } + std::string arm_STRB_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<12> imm12) { + const u32 imm12_value = imm12.ZeroExtend(); + const char sign = U ? '+' : '-'; + + if (P) { + return fmt::format("strb{} {}, [{}, #{}{}]{}", + CondToString(cond), t, n, sign, imm12_value, + W ? "!" : ""); + } else { + return fmt::format("strb{} {}, [{}], #{}{}{}", + CondToString(cond), t, n, sign, imm12_value, + W ? " (err: W == 1!!!)" : ""); + } + } + std::string arm_STRB_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<5> imm5, ShiftType shift, Reg m) { + const char sign = U ? '+' : '-'; + + if (P) { + return fmt::format("strb{} {}, [{}, {}{}{}]{}", + CondToString(cond), t, n, sign, m, + ShiftStr(shift, imm5), W ? "!" : ""); + } else { + return fmt::format("strb{} {}, [{}], {}{}{}{}", + CondToString(cond), t, n, sign, m, + ShiftStr(shift, imm5), W ? " (err: W == 1!!!)" : ""); + } + } + std::string arm_STRBT() { return "ice"; } + std::string arm_STRD_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b) { + const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend(); + const char sign = U ? '+' : '-'; + + if (P) { + return fmt::format("strd{} {}, {}, [{}, #{}{}]{}", + CondToString(cond), t, t + 1, n, sign, imm32, + W ? "!" : ""); + } else { + return fmt::format("strd{} {}, {}, [{}], #{}{}{}", + CondToString(cond), t, t + 1, n, sign, imm32, + W ? " (err: W == 1!!!)" : ""); + } + } + std::string arm_STRD_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m) { + const char sign = U ? '+' : '-'; + + if (P) { + return fmt::format("strd{} {}, {}, [{}, {}{}]{}", + CondToString(cond), t, t + 1, n, sign, m, + W ? "!" : ""); + } else { + return fmt::format("strd{} {}, {}, [{}], {}{}{}", + CondToString(cond), t, t + 1, n, sign, m, + W ? " (err: W == 1!!!)" : ""); + } + } + std::string arm_STRH_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b) { + const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend(); + const char sign = U ? '+' : '-'; + + if (P) { + return fmt::format("strh{} {}, [{}, #{}{}]{}", + CondToString(cond), t, n, sign, imm32, + W ? "!" 
: ""); + } else { + return fmt::format("strh{} {}, [{}], #{}{}{}", + CondToString(cond), t, n, sign, imm32, + W ? " (err: W == 1!!!)" : ""); + } + } + std::string arm_STRH_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m) { + const char sign = U ? '+' : '-'; + + if (P) { + return fmt::format("strd{} {}, [{}, {}{}]{}", + CondToString(cond), t, n, sign, m, + W ? "!" : ""); + } else { + return fmt::format("strd{} {}, [{}], {}{}{}", + CondToString(cond), t, n, sign, m, + W ? " (err: W == 1!!!)" : ""); + } + } + std::string arm_STRHT() { return "ice"; } + std::string arm_STRT() { return "ice"; } + + // Load/Store multiple instructions + std::string arm_LDM(Cond cond, bool W, Reg n, RegList list) { + return fmt::format("ldm{} {}{}, {{{}}}", CondToString(cond), n, W ? "!" : "", RegListToString(list)); + } + std::string arm_LDMDA(Cond cond, bool W, Reg n, RegList list) { + return fmt::format("ldmda{} {}{}, {{{}}}", CondToString(cond), n, W ? "!" : "", RegListToString(list)); + } + std::string arm_LDMDB(Cond cond, bool W, Reg n, RegList list) { + return fmt::format("ldmdb{} {}{}, {{{}}}", CondToString(cond), n, W ? "!" : "", RegListToString(list)); + } + std::string arm_LDMIB(Cond cond, bool W, Reg n, RegList list) { + return fmt::format("ldmib{} {}{}, {{{}}}", CondToString(cond), n, W ? "!" : "", RegListToString(list)); + } + std::string arm_LDM_usr() { return "ice"; } + std::string arm_LDM_eret() { return "ice"; } + std::string arm_STM(Cond cond, bool W, Reg n, RegList list) { + return fmt::format("stm{} {}{}, {{{}}}", CondToString(cond), n, W ? "!" : "", RegListToString(list)); + } + std::string arm_STMDA(Cond cond, bool W, Reg n, RegList list) { + return fmt::format("stmda{} {}{}, {{{}}}", CondToString(cond), n, W ? "!" : "", RegListToString(list)); + } + std::string arm_STMDB(Cond cond, bool W, Reg n, RegList list) { + return fmt::format("stmdb{} {}{}, {{{}}}", CondToString(cond), n, W ? "!" : "", RegListToString(list)); + } + std::string arm_STMIB(Cond cond, bool W, Reg n, RegList list) { + return fmt::format("stmib{} {}{}, {{{}}}", CondToString(cond), n, W ? "!" 
: "", RegListToString(list)); + } + std::string arm_STM_usr() { return "ice"; } + + // Miscellaneous instructions + std::string arm_BFC(Cond cond, Imm<5> msb, Reg d, Imm<5> lsb) { + const u32 lsb_value = lsb.ZeroExtend(); + const u32 width = msb.ZeroExtend() - lsb_value + 1; + return fmt::format("bfc{} {}, #{}, #{}", + CondToString(cond), d, lsb_value, width); + } + std::string arm_BFI(Cond cond, Imm<5> msb, Reg d, Imm<5> lsb, Reg n) { + const u32 lsb_value = lsb.ZeroExtend(); + const u32 width = msb.ZeroExtend() - lsb_value + 1; + return fmt::format("bfi{} {}, {}, #{}, #{}", + CondToString(cond), d, n, lsb_value, width); + } + std::string arm_CLZ(Cond cond, Reg d, Reg m) { + return fmt::format("clz{} {}, {}", CondToString(cond), d, m); + } + std::string arm_MOVT(Cond cond, Imm<4> imm4, Reg d, Imm<12> imm12) { + const u32 imm = concatenate(imm4, imm12).ZeroExtend(); + return fmt::format("movt{} {}, #{}", CondToString(cond), d, imm); + } + std::string arm_MOVW(Cond cond, Imm<4> imm4, Reg d, Imm<12> imm12) { + const u32 imm = concatenate(imm4, imm12).ZeroExtend(); + return fmt::format("movw{}, {}, #{}", CondToString(cond), d, imm); + } + std::string arm_NOP() { + return "nop"; + } + std::string arm_RBIT(Cond cond, Reg d, Reg m) { + return fmt::format("rbit{} {}, {}", CondToString(cond), d, m); + } + std::string arm_SBFX(Cond cond, Imm<5> widthm1, Reg d, Imm<5> lsb, Reg n) { + const u32 lsb_value = lsb.ZeroExtend(); + const u32 width = widthm1.ZeroExtend() + 1; + return fmt::format("sbfx{} {}, {}, #{}, #{}", + CondToString(cond), d, n, lsb_value, width); + } + std::string arm_SEL(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("sel{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_UBFX(Cond cond, Imm<5> widthm1, Reg d, Imm<5> lsb, Reg n) { + const u32 lsb_value = lsb.ZeroExtend(); + const u32 width = widthm1.ZeroExtend() + 1; + return fmt::format("ubfx{} {}, {}, #{}, #{}", + CondToString(cond), d, n, lsb_value, width); + } + + // Unsigned sum of absolute difference functions + std::string arm_USAD8(Cond cond, Reg d, Reg m, Reg n) { + return fmt::format("usad8{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_USADA8(Cond cond, Reg d, Reg a, Reg m, Reg n) { + return fmt::format("usad8a{} {}, {}, {}, {}", CondToString(cond), d, n, m, a); + } + + // Packing instructions + std::string arm_PKHBT(Cond cond, Reg n, Reg d, Imm<5> imm5, Reg m) { + return fmt::format("pkhbt{} {}, {}, {}{}", CondToString(cond), d, n, m, ShiftStr(ShiftType::LSL, imm5)); + } + std::string arm_PKHTB(Cond cond, Reg n, Reg d, Imm<5> imm5, Reg m) { + return fmt::format("pkhtb{} {}, {}, {}{}", CondToString(cond), d, n, m, ShiftStr(ShiftType::ASR, imm5)); + } + + // Reversal instructions + std::string arm_REV(Cond cond, Reg d, Reg m) { + return fmt::format("rev{} {}, {}", CondToString(cond), d, m); + } + std::string arm_REV16(Cond cond, Reg d, Reg m) { + return fmt::format("rev16{} {}, {}", CondToString(cond), d, m); + } + std::string arm_REVSH(Cond cond, Reg d, Reg m) { + return fmt::format("revsh{} {}, {}", CondToString(cond), d, m); + } + + // Saturation instructions + std::string arm_SSAT(Cond cond, Imm<5> sat_imm, Reg d, Imm<5> imm5, bool sh, Reg n) { + const u32 bit_position = sat_imm.ZeroExtend() + 1; + return fmt::format("ssat{} {}, #{}, {}{}", + CondToString(cond), d, bit_position, n, + ShiftStr(ShiftType(sh << 1), imm5)); + } + std::string arm_SSAT16(Cond cond, Imm<4> sat_imm, Reg d, Reg n) { + const u32 bit_position = sat_imm.ZeroExtend() + 1; + return fmt::format("ssat16{} {}, 
#{}, {}", + CondToString(cond), d, bit_position, n); + } + std::string arm_USAT(Cond cond, Imm<5> sat_imm, Reg d, Imm<5> imm5, bool sh, Reg n) { + return fmt::format("usat{} {}, #{}, {}{}", + CondToString(cond), d, sat_imm.ZeroExtend(), n, + ShiftStr(ShiftType(sh << 1), imm5)); + } + std::string arm_USAT16(Cond cond, Imm<4> sat_imm, Reg d, Reg n) { + return fmt::format("usat16{} {}, #{}, {}", + CondToString(cond), d, sat_imm.ZeroExtend(), n); + } + + // Divide instructions + std::string arm_SDIV(Cond cond, Reg d, Reg m, Reg n) { + return fmt::format("sdiv{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_UDIV(Cond cond, Reg d, Reg m, Reg n) { + return fmt::format("udiv{} {}, {}, {}", CondToString(cond), d, n, m); + } + + // Multiply (Normal) instructions + std::string arm_MLA(Cond cond, bool S, Reg d, Reg a, Reg m, Reg n) { + return fmt::format("mla{}{} {}, {}, {}, {}", S ? "s" : "", CondToString(cond), d, n, m, a); + } + std::string arm_MLS(Cond cond, Reg d, Reg a, Reg m, Reg n) { + return fmt::format("mls{} {}, {}, {}, {}", CondToString(cond), d, n, m, a); + } + std::string arm_MUL(Cond cond, bool S, Reg d, Reg m, Reg n) { + return fmt::format("mul{}{} {}, {}, {}", S ? "s" : "", CondToString(cond), d, n, m); + } + + // Multiply (Long) instructions + std::string arm_SMLAL(Cond cond, bool S, Reg dHi, Reg dLo, Reg m, Reg n) { + return fmt::format("smlal{}{} {}, {}, {}, {}", S ? "s" : "", CondToString(cond), dLo, dHi, n, m); + } + std::string arm_SMULL(Cond cond, bool S, Reg dHi, Reg dLo, Reg m, Reg n) { + return fmt::format("smull{}{} {}, {}, {}, {}", S ? "s" : "", CondToString(cond), dLo, dHi, n, m); + } + std::string arm_UMAAL(Cond cond, Reg dHi, Reg dLo, Reg m, Reg n) { + return fmt::format("umaal{} {}, {}, {}, {}", CondToString(cond), dLo, dHi, n, m); + } + std::string arm_UMLAL(Cond cond, bool S, Reg dHi, Reg dLo, Reg m, Reg n) { + return fmt::format("umlal{}{} {}, {}, {}, {}", S ? "s" : "", CondToString(cond), dLo, dHi, n, m); + } + std::string arm_UMULL(Cond cond, bool S, Reg dHi, Reg dLo, Reg m, Reg n) { + return fmt::format("umull{}{} {}, {}, {}, {}", S ? "s" : "", CondToString(cond), dLo, dHi, n, m); + } + + // Multiply (Halfword) instructions + std::string arm_SMLALxy(Cond cond, Reg dHi, Reg dLo, Reg m, bool M, bool N, Reg n) { + return fmt::format("smlal{}{}{} {}, {}, {}, {}", N ? 't' : 'b', M ? 't' : 'b', CondToString(cond), dLo, dHi, n, m); + } + std::string arm_SMLAxy(Cond cond, Reg d, Reg a, Reg m, bool M, bool N, Reg n) { + return fmt::format("smla{}{}{} {}, {}, {}, {}", N ? 't' : 'b', M ? 't' : 'b', CondToString(cond), d, n, m, a); + } + std::string arm_SMULxy(Cond cond, Reg d, Reg m, bool M, bool N, Reg n) { + return fmt::format("smul{}{}{} {}, {}, {}", N ? 't' : 'b', M ? 't' : 'b', CondToString(cond), d, n, m); + } + + // Multiply (word by halfword) instructions + std::string arm_SMLAWy(Cond cond, Reg d, Reg a, Reg m, bool M, Reg n) { + return fmt::format("smlaw{}{} {}, {}, {}, {}", M ? 't' : 'b', CondToString(cond), d, n, m, a); + } + std::string arm_SMULWy(Cond cond, Reg d, Reg m, bool M, Reg n) { + return fmt::format("smulw{}{} {}, {}, {}", M ? 't' : 'b', CondToString(cond), d, n, m); + } + + // Multiply (Most significant word) instructions + std::string arm_SMMLA(Cond cond, Reg d, Reg a, Reg m, bool R, Reg n) { + return fmt::format("smmla{}{} {}, {}, {}, {}", R ? "r" : "", CondToString(cond), d, n, m, a); + } + std::string arm_SMMLS(Cond cond, Reg d, Reg a, Reg m, bool R, Reg n) { + return fmt::format("smmls{}{} {}, {}, {}, {}", R ? 
"r" : "", CondToString(cond), d, n, m, a); + } + std::string arm_SMMUL(Cond cond, Reg d, Reg m, bool R, Reg n) { + return fmt::format("smmul{}{} {}, {}, {}", R ? "r" : "", CondToString(cond), d, n, m); + } + + // Multiply (Dual) instructions + std::string arm_SMLAD(Cond cond, Reg d, Reg a, Reg m, bool M, Reg n) { + return fmt::format("smlad{}{} {}, {}, {}, {}", M ? "x" : "", CondToString(cond), d, n, m, a); + } + std::string arm_SMLALD(Cond cond, Reg dHi, Reg dLo, Reg m, bool M, Reg n) { + return fmt::format("smlald{}{} {}, {}, {}, {}", M ? "x" : "", CondToString(cond), dLo, dHi, n, m); + } + std::string arm_SMLSD(Cond cond, Reg d, Reg a, Reg m, bool M, Reg n) { + return fmt::format("smlsd{}{} {}, {}, {}, {}", M ? "x" : "", CondToString(cond), d, n, m, a); + } + std::string arm_SMLSLD(Cond cond, Reg dHi, Reg dLo, Reg m, bool M, Reg n) { + return fmt::format("smlsld{}{} {}, {}, {}, {}", M ? "x" : "", CondToString(cond), dLo, dHi, n, m); + } + std::string arm_SMUAD(Cond cond, Reg d, Reg m, bool M, Reg n) { + return fmt::format("smuad{}{} {}, {}, {}", M ? "x" : "", CondToString(cond), d, n, m); + } + std::string arm_SMUSD(Cond cond, Reg d, Reg m, bool M, Reg n) { + return fmt::format("smusd{}{} {}, {}, {}", M ? "x" : "", CondToString(cond), d, n, m); + } + + // Parallel Add/Subtract (Modulo arithmetic) instructions + std::string arm_SADD8(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("sadd8{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_SADD16(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("sadd16{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_SASX(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("sasx{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_SSAX(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("ssax{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_SSUB8(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("ssub8{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_SSUB16(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("ssub16{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_UADD8(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("uadd8{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_UADD16(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("uadd16{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_UASX(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("uasx{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_USAX(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("usax{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_USUB8(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("usub8{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_USUB16(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("usub16{} {}, {}, {}", CondToString(cond), d, n, m); + } + + // Parallel Add/Subtract (Saturating) instructions + std::string arm_QADD8(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("qadd8{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_QADD16(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("qadd16{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_QASX(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("qasx{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_QSAX(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("qsax{} {}, {}, {}", CondToString(cond), d, n, m); + } + 
std::string arm_QSUB8(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("qsub8{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_QSUB16(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("qsub16{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_UQADD8(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("uqadd8{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_UQADD16(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("uqadd16{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_UQASX(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("uqasx{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_UQSAX(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("uqsax{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_UQSUB8(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("uqsub8{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_UQSUB16(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("uqsub16{} {}, {}, {}", CondToString(cond), d, n, m); + } + + // Parallel Add/Subtract (Halving) instructions + std::string arm_SHADD8(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("shadd8{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_SHADD16(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("shadd16{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_SHASX(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("shasx{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_SHSAX(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("shsax{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_SHSUB8(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("shsub8{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_SHSUB16(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("shsub16{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_UHADD8(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("uhadd8{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_UHADD16(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("uhadd16{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_UHASX(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("uhasx{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_UHSAX(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("uhsax{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_UHSUB8(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("uhsub8{} {}, {}, {}", CondToString(cond), d, n, m); + } + std::string arm_UHSUB16(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("uhsub16{} {}, {}, {}", CondToString(cond), d, n, m); + } + + // Saturated Add/Subtract instructions + std::string arm_QADD(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("qadd{} {}, {}, {}", CondToString(cond), d, m, n); + } + std::string arm_QSUB(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("qsub{} {}, {}, {}", CondToString(cond), d, m, n); + } + std::string arm_QDADD(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("qdadd{} {}, {}, {}", CondToString(cond), d, m, n); + } + std::string arm_QDSUB(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("qdsub{} {}, {}, {}", CondToString(cond), d, m, n); + } + + // Synchronization Primitive instructions + std::string arm_CLREX() { + return "clrex"; + } + std::string arm_SWP(Cond cond, Reg n, Reg t, Reg t2) { + return fmt::format("swp{} {}, 
{}, [{}]", CondToString(cond), t, t2, n); + } + std::string arm_SWPB(Cond cond, Reg n, Reg t, Reg t2) { + return fmt::format("swpb{} {}, {}, [{}]", CondToString(cond), t, t2, n); + } + std::string arm_LDA(Cond cond, Reg n, Reg t) { + return fmt::format("lda{} {}, [{}]", CondToString(cond), t, n); + } + std::string arm_LDAB(Cond cond, Reg n, Reg t) { + return fmt::format("ldab{} {}, [{}]", CondToString(cond), t, n); + } + std::string arm_LDAH(Cond cond, Reg n, Reg t) { + return fmt::format("ldah{} {}, [{}]", CondToString(cond), t, n); + } + std::string arm_LDAEX(Cond cond, Reg n, Reg t) { + return fmt::format("ldaex{} {}, [{}]", CondToString(cond), t, n); + } + std::string arm_LDAEXB(Cond cond, Reg n, Reg t) { + return fmt::format("ldaexb{} {}, [{}]", CondToString(cond), t, n); + } + std::string arm_LDAEXD(Cond cond, Reg n, Reg t) { + return fmt::format("ldaexd{} {}, {}, [{}]", CondToString(cond), t, t + 1, n); + } + std::string arm_LDAEXH(Cond cond, Reg n, Reg t) { + return fmt::format("ldaexh{} {}, [{}]", CondToString(cond), t, n); + } + std::string arm_STL(Cond cond, Reg n, Reg t) { + return fmt::format("stl{} {}, [{}]", CondToString(cond), t, n); + } + std::string arm_STLB(Cond cond, Reg n, Reg t) { + return fmt::format("stlb{} {}, [{}]", CondToString(cond), t, n); + } + std::string arm_STLH(Cond cond, Reg n, Reg t) { + return fmt::format("stlh{} {}, [{}]", CondToString(cond), t, n); + } + std::string arm_STLEX(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("stlex{} {}, {}, [{}]", CondToString(cond), d, m, n); + } + std::string arm_STLEXB(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("stlexb{} {}, {}, [{}]", CondToString(cond), d, m, n); + } + std::string arm_STLEXD(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("stlexd{} {}, {}, {}, [{}]", CondToString(cond), d, m, m + 1, n); + } + std::string arm_STLEXH(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("stlexh{} {}, {}, [{}]", CondToString(cond), d, m, n); + } + std::string arm_LDREX(Cond cond, Reg n, Reg d) { + return fmt::format("ldrex{} {}, [{}]", CondToString(cond), d, n); + } + std::string arm_LDREXB(Cond cond, Reg n, Reg d) { + return fmt::format("ldrexb{} {}, [{}]", CondToString(cond), d, n); + } + std::string arm_LDREXD(Cond cond, Reg n, Reg d) { + return fmt::format("ldrexd{} {}, {}, [{}]", CondToString(cond), d, d + 1, n); + } + std::string arm_LDREXH(Cond cond, Reg n, Reg d) { + return fmt::format("ldrexh{} {}, [{}]", CondToString(cond), d, n); + } + std::string arm_STREX(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("strex{} {}, {}, [{}]", CondToString(cond), d, m, n); + } + std::string arm_STREXB(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("strexb{} {}, {}, [{}]", CondToString(cond), d, m, n); + } + std::string arm_STREXD(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("strexd{} {}, {}, {}, [{}]", CondToString(cond), d, m, m + 1, n); + } + std::string arm_STREXH(Cond cond, Reg n, Reg d, Reg m) { + return fmt::format("strexh{} {}, {}, [{}]", CondToString(cond), d, m, n); + } + + // Status register access instructions + std::string arm_CPS() { return "ice"; } + std::string arm_MRS(Cond cond, Reg d) { + return fmt::format("mrs{} {}, apsr", CondToString(cond), d); + } + std::string arm_MSR_imm(Cond cond, unsigned mask, int rotate, Imm<8> imm8) { + const bool write_c = mcl::bit::get_bit<0>(mask); + const bool write_x = mcl::bit::get_bit<1>(mask); + const bool write_s = mcl::bit::get_bit<2>(mask); + const bool write_f = mcl::bit::get_bit<3>(mask); + return 
fmt::format("msr{} cpsr_{}{}{}{}, #{}", + CondToString(cond), + write_c ? "c" : "", + write_x ? "x" : "", + write_s ? "s" : "", + write_f ? "f" : "", + ArmExpandImm(rotate, imm8)); + } + std::string arm_MSR_reg(Cond cond, unsigned mask, Reg n) { + const bool write_c = mcl::bit::get_bit<0>(mask); + const bool write_x = mcl::bit::get_bit<1>(mask); + const bool write_s = mcl::bit::get_bit<2>(mask); + const bool write_f = mcl::bit::get_bit<3>(mask); + return fmt::format("msr{} cpsr_{}{}{}{}, {}", + CondToString(cond), + write_c ? "c" : "", + write_x ? "x" : "", + write_s ? "s" : "", + write_f ? "f" : "", + n); + } + std::string arm_RFE() { return "ice"; } + std::string arm_SETEND(bool E) { + return E ? "setend be" : "setend le"; + } + std::string arm_SRS() { return "ice"; } + + // Floating point arithmetic instructions + std::string vfp_VADD(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + return fmt::format("vadd{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M)); + } + + std::string vfp_VSUB(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + return fmt::format("vsub{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M)); + } + + std::string vfp_VMUL(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + return fmt::format("vmul{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M)); + } + + std::string vfp_VMLA(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + return fmt::format("vmla{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M)); + } + + std::string vfp_VMLS(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + return fmt::format("vmls{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M)); + } + + std::string vfp_VNMUL(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + return fmt::format("vnmul{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M)); + } + + std::string vfp_VNMLA(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + return fmt::format("vnmla{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M)); + } + + std::string vfp_VNMLS(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + return fmt::format("vnmls{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M)); + } + + std::string vfp_VDIV(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + return fmt::format("vdiv{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M)); + } + + std::string vfp_VFNMS(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + return fmt::format("vfnms{}.{} {}, {}, {}", CondToString(cond), sz ? 
"f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M)); + } + + std::string vfp_VFNMA(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + return fmt::format("vfnma{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M)); + } + + std::string vfp_VFMS(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + return fmt::format("vfms{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M)); + } + + std::string vfp_VFMA(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + return fmt::format("vfma{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M)); + } + + std::string vfp_VSEL(bool D, Imm<2> cc, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + const Cond cond = concatenate(cc, Imm<1>{cc.Bit<0>() != cc.Bit<1>()}, Imm<1>{0}).ZeroExtend<Cond>(); + return fmt::format("vsel{}.{} {}, {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M)); + } + + std::string vfp_VMAXNM(bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + return fmt::format("vmaxnm.{} {}, {}, {}", sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M)); + } + + std::string vfp_VMINNM(bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + return fmt::format("vminnm.{} {}, {}, {}", sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vn, N), FPRegStr(sz, Vm, M)); + } + + std::string vfp_VMOV_imm(Cond cond, bool D, Imm<4> imm4H, size_t Vd, bool sz, Imm<4> imm4L) { + const auto imm8 = concatenate(imm4H, imm4L); + + if (sz) { + const u64 sign = static_cast<u64>(imm8.Bit<7>()); + const u64 exp = (imm8.Bit<6>() ? 0x3FC : 0x400) | imm8.Bits<4, 5, u64>(); + const u64 fract = imm8.Bits<0, 3, u64>() << 48; + const u64 immediate = (sign << 63) | (exp << 52) | fract; + return fmt::format("vmov{}.f64 {}, #0x{:016x}", CondToString(cond), FPRegStr(sz, Vd, D), immediate); + } else { + const u32 sign = static_cast<u32>(imm8.Bit<7>()); + const u32 exp = (imm8.Bit<6>() ? 
0x7C : 0x80) | imm8.Bits<4, 5>(); + const u32 fract = imm8.Bits<0, 3>() << 19; + const u32 immediate = (sign << 31) | (exp << 23) | fract; + return fmt::format("vmov{}.f32 {}, #0x{:08x}", CondToString(cond), FPRegStr(sz, Vd, D), immediate); + } + } + + std::string vfp_VMOV_u32_f64(Cond cond, size_t Vd, Reg t, bool D) { + return fmt::format("vmov{}.32 {}, {}", CondToString(cond), FPRegStr(true, Vd, D), t); + } + + std::string vfp_VMOV_f64_u32(Cond cond, size_t Vn, Reg t, bool N) { + return fmt::format("vmov{}.32 {}, {}", CondToString(cond), t, FPRegStr(true, Vn, N)); + } + + std::string vfp_VMOV_u32_f32(Cond cond, size_t Vn, Reg t, bool N) { + return fmt::format("vmov{}.32 {}, {}", CondToString(cond), FPRegStr(false, Vn, N), t); + } + + std::string vfp_VMOV_f32_u32(Cond cond, size_t Vn, Reg t, bool N) { + return fmt::format("vmov{}.32 {}, {}", CondToString(cond), t, FPRegStr(false, Vn, N)); + } + + std::string vfp_VMOV_2u32_2f32(Cond cond, Reg t2, Reg t, bool M, size_t Vm) { + return fmt::format("vmov{} {}, {}, {}, {}", CondToString(cond), FPRegStr(false, Vm, M), FPNextRegStr(false, Vm, M), t, t2); + } + + std::string vfp_VMOV_2f32_2u32(Cond cond, Reg t2, Reg t, bool M, size_t Vm) { + return fmt::format("vmov{} {}, {}, {}, {}", CondToString(cond), t, t2, FPRegStr(false, Vm, M), FPNextRegStr(false, Vm, M)); + } + + std::string vfp_VMOV_2u32_f64(Cond cond, Reg t2, Reg t, bool M, size_t Vm) { + return fmt::format("vmov{} {}, {}, {}", CondToString(cond), FPRegStr(true, Vm, M), t, t2); + } + + std::string vfp_VMOV_f64_2u32(Cond cond, Reg t2, Reg t, bool M, size_t Vm) { + return fmt::format("vmov{} {}, {}, {}", CondToString(cond), t, t2, FPRegStr(true, Vm, M)); + } + + std::string vfp_VMOV_from_i32(Cond cond, Imm<1> i, size_t Vd, Reg t, bool D) { + const size_t index = i.ZeroExtend(); + return fmt::format("vmov{}.32 {}[{}], {}", CondToString(cond), FPRegStr(true, Vd, D), index, t); + } + + std::string vfp_VMOV_from_i16(Cond cond, Imm<1> i1, size_t Vd, Reg t, bool D, Imm<1> i2) { + const size_t index = concatenate(i1, i2).ZeroExtend(); + return fmt::format("vmov{}.16 {}[{}], {}", CondToString(cond), FPRegStr(true, Vd, D), index, t); + } + + std::string vfp_VMOV_from_i8(Cond cond, Imm<1> i1, size_t Vd, Reg t, bool D, Imm<2> i2) { + const size_t index = concatenate(i1, i2).ZeroExtend(); + return fmt::format("vmov{}.8 {}[{}], {}", CondToString(cond), FPRegStr(true, Vd, D), index, t); + } + + std::string vfp_VMOV_to_i32(Cond cond, Imm<1> i, size_t Vn, Reg t, bool N) { + const size_t index = i.ZeroExtend(); + return fmt::format("vmov{}.32 {}, {}[{}]", CondToString(cond), t, FPRegStr(true, Vn, N), index); + } + + std::string vfp_VMOV_to_i16(Cond cond, bool U, Imm<1> i1, size_t Vn, Reg t, bool N, Imm<1> i2) { + const size_t index = concatenate(i1, i2).ZeroExtend(); + return fmt::format("vmov{}.{}16 {}, {}[{}]", CondToString(cond), U ? 'u' : 's', t, FPRegStr(true, Vn, N), index); + } + + std::string vfp_VMOV_to_i8(Cond cond, bool U, Imm<1> i1, size_t Vn, Reg t, bool N, Imm<2> i2) { + const size_t index = concatenate(i1, i2).ZeroExtend(); + return fmt::format("vmov{}.{}8 {}, {}[{}]", CondToString(cond), U ? 
'u' : 's', t, FPRegStr(true, Vn, N), index);
+ }
+
+ std::string vfp_VDUP(Cond cond, Imm<1> B, bool Q, size_t Vd, Reg t, bool D, Imm<1> E) {
+ const size_t esize = 32u >> concatenate(B, E).ZeroExtend();
+ return fmt::format("vdup{}.{} {}, {}", CondToString(cond), esize, VectorStr(Q, Vd, D), t);
+ }
+
+ std::string vfp_VMOV_reg(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
+ return fmt::format("vmov{}.{} {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VABS(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
+ return fmt::format("vabs{}.{} {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VNEG(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
+ return fmt::format("vneg{}.{} {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VSQRT(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
+ return fmt::format("vsqrt{}.{} {}, {}", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VCVTB(Cond cond, bool D, bool op, size_t Vd, bool sz, bool M, size_t Vm) {
+ const bool convert_from_half = !op;
+ const char* const to = convert_from_half ? (sz ? "f64" : "f32") : "f16";
+ const char* const from = convert_from_half ? "f16" : (sz ? "f64" : "f32");
+ return fmt::format("vcvtb{}.{}.{} {}, {}", CondToString(cond), to, from, FPRegStr(convert_from_half ? sz : false, Vd, D), FPRegStr(convert_from_half ? false : sz, Vm, M));
+ }
+
+ std::string vfp_VCVTT(Cond cond, bool D, bool op, size_t Vd, bool sz, bool M, size_t Vm) {
+ const bool convert_from_half = !op;
+ const char* const to = convert_from_half ? (sz ? "f64" : "f32") : "f16";
+ const char* const from = convert_from_half ? "f16" : (sz ? "f64" : "f32");
+ return fmt::format("vcvtt{}.{}.{} {}, {}", CondToString(cond), to, from, FPRegStr(convert_from_half ? sz : false, Vd, D), FPRegStr(convert_from_half ? false : sz, Vm, M));
+ }
+
+ std::string vfp_VCMP(Cond cond, bool D, size_t Vd, bool sz, bool E, bool M, size_t Vm) {
+ return fmt::format("vcmp{}{}.{} {}, {}", E ? "e" : "", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VCMP_zero(Cond cond, bool D, size_t Vd, bool sz, bool E) {
+ return fmt::format("vcmp{}{}.{} {}, #0.0", E ? "e" : "", CondToString(cond), sz ? "f64" : "f32", FPRegStr(sz, Vd, D));
+ }
+
+ std::string vfp_VRINTR(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
+ return fmt::format("vrintr{} {}, {}", CondToString(cond), FPRegStr(sz, Vd, D), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VRINTZ(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
+ return fmt::format("vrintz{} {}, {}", CondToString(cond), FPRegStr(sz, Vd, D), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VRINTX(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
+ return fmt::format("vrintx{} {}, {}", CondToString(cond), FPRegStr(sz, Vd, D), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VCVT_f_to_f(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) {
+ return fmt::format("vcvt{}.{}.{} {}, {}", CondToString(cond), !sz ? "f64" : "f32", sz ? "f64" : "f32", FPRegStr(!sz, Vd, D), FPRegStr(sz, Vm, M));
+ }
+
+ std::string vfp_VCVT_from_int(Cond cond, bool D, size_t Vd, bool sz, bool is_signed, bool M, size_t Vm) {
+ return fmt::format("vcvt{}.{}.{} {}, {}", CondToString(cond), sz ?
"f64" : "f32", is_signed ? "s32" : "u32", FPRegStr(sz, Vd, D), FPRegStr(false, Vm, M)); + } + + std::string vfp_VCVT_from_fixed(Cond cond, bool D, bool U, size_t Vd, bool sz, bool sx, Imm<1> i, Imm<4> imm4) { + const size_t size = sx ? 32 : 16; + const size_t fbits = size - concatenate(imm4, i).ZeroExtend(); + return fmt::format("vcvt{}.{}.{}{} {}, {}, #{}", CondToString(cond), sz ? "f64" : "f32", U ? 'u' : 's', size, FPRegStr(sz, Vd, D), FPRegStr(sz, Vd, D), fbits); + } + + std::string vfp_VCVT_to_u32(Cond cond, bool D, size_t Vd, bool sz, bool round_towards_zero, bool M, size_t Vm) { + return fmt::format("vcvt{}{}.u32.{} {}, {}", round_towards_zero ? "" : "r", CondToString(cond), sz ? "f64" : "f32", FPRegStr(false, Vd, D), FPRegStr(sz, Vm, M)); + } + + std::string vfp_VCVT_to_s32(Cond cond, bool D, size_t Vd, bool sz, bool round_towards_zero, bool M, size_t Vm) { + return fmt::format("vcvt{}{}.s32.{} {}, {}", round_towards_zero ? "" : "r", CondToString(cond), sz ? "f64" : "f32", FPRegStr(false, Vd, D), FPRegStr(sz, Vm, M)); + } + + std::string vfp_VCVT_to_fixed(Cond cond, bool D, bool U, size_t Vd, bool sz, bool sx, Imm<1> i, Imm<4> imm4) { + const size_t size = sx ? 32 : 16; + const size_t fbits = size - concatenate(imm4, i).ZeroExtend(); + return fmt::format("vcvt{}.{}{}.{} {}, {}, #{}", CondToString(cond), U ? 'u' : 's', size, sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vd, D), fbits); + } + + std::string vfp_VRINT_rm(bool D, size_t rm, size_t Vd, bool sz, bool M, size_t Vm) { + return fmt::format("vrint{}.{} {}, {}", "anpm"[rm], sz ? "f64" : "f32", FPRegStr(sz, Vd, D), FPRegStr(sz, Vm, M)); + } + + std::string vfp_VCVT_rm(bool D, size_t rm, size_t Vd, bool sz, bool U, bool M, size_t Vm) { + return fmt::format("vcvt{}.{}.{} {}, {}", "anpm"[rm], U ? "u32" : "s32", sz ? "f64" : "f32", FPRegStr(false, Vd, D), FPRegStr(sz, Vm, M)); + } + + std::string vfp_VMSR(Cond cond, Reg t) { + return fmt::format("vmsr{} fpscr, {}", CondToString(cond), t); + } + + std::string vfp_VMRS(Cond cond, Reg t) { + if (t == Reg::R15) { + return fmt::format("vmrs{} apsr_nzcv, fpscr", CondToString(cond)); + } else { + return fmt::format("vmrs{} {}, fpscr", CondToString(cond), t); + } + } + + std::string vfp_VPOP(Cond cond, bool D, size_t Vd, bool sz, Imm<8> imm8) { + return fmt::format("vpop{} {}(+{})", + CondToString(cond), FPRegStr(sz, Vd, D), + imm8.ZeroExtend() >> (sz ? 1 : 0)); + } + + std::string vfp_VPUSH(Cond cond, bool D, size_t Vd, bool sz, Imm<8> imm8) { + return fmt::format("vpush{} {}(+{})", + CondToString(cond), FPRegStr(sz, Vd, D), + imm8.ZeroExtend() >> (sz ? 1 : 0)); + } + + std::string vfp_VLDR(Cond cond, bool U, bool D, Reg n, size_t Vd, bool sz, Imm<8> imm8) { + const u32 imm32 = imm8.ZeroExtend() << 2; + const char sign = U ? '+' : '-'; + return fmt::format("vldr{} {}, [{}, #{}{}]", + CondToString(cond), FPRegStr(sz, Vd, D), n, sign, imm32); + } + + std::string vfp_VSTR(Cond cond, bool U, bool D, Reg n, size_t Vd, bool sz, Imm<8> imm8) { + const u32 imm32 = imm8.ZeroExtend() << 2; + const char sign = U ? '+' : '-'; + return fmt::format("vstr{} {}, [{}, #{}{}]", + CondToString(cond), FPRegStr(sz, Vd, D), n, sign, imm32); + } + + std::string vfp_VSTM_a1(Cond cond, bool p, bool u, bool D, bool w, Reg n, size_t Vd, Imm<8> imm8) { + const char* mode = "<invalid mode>"; + if (!p && u) { + mode = "ia"; + } + if (p && !u) { + mode = "db"; + } + return fmt::format("vstm{}{}.f64 {}{}, {}(+{})", mode, + CondToString(cond), n, w ? "!" 
: "", + FPRegStr(true, Vd, D), imm8.ZeroExtend()); + } + + std::string vfp_VSTM_a2(Cond cond, bool p, bool u, bool D, bool w, Reg n, size_t Vd, Imm<8> imm8) { + const char* mode = "<invalid mode>"; + if (!p && u) { + mode = "ia"; + } + if (p && !u) { + mode = "db"; + } + return fmt::format("vstm{}{}.f32 {}{}, {}(+{})", mode, + CondToString(cond), n, w ? "!" : "", + FPRegStr(false, Vd, D), imm8.ZeroExtend()); + } + + std::string vfp_VLDM_a1(Cond cond, bool p, bool u, bool D, bool w, Reg n, size_t Vd, Imm<8> imm8) { + const char* mode = "<invalid mode>"; + if (!p && u) { + mode = "ia"; + } + if (p && !u) { + mode = "db"; + } + return fmt::format("vldm{}{}.f64 {}{}, {}(+{})", mode, + CondToString(cond), n, w ? "!" : "", + FPRegStr(true, Vd, D), imm8.ZeroExtend()); + } + + std::string vfp_VLDM_a2(Cond cond, bool p, bool u, bool D, bool w, Reg n, size_t Vd, Imm<8> imm8) { + const char* mode = "<invalid mode>"; + if (!p && u) { + mode = "ia"; + } + if (p && !u) { + mode = "db"; + } + return fmt::format("vldm{}{}.f32 {}{}, {}(+{})", mode, + CondToString(cond), n, w ? "!" : "", + FPRegStr(false, Vd, D), imm8.ZeroExtend()); + } +}; + +std::string DisassembleArm(u32 instruction) { + DisassemblerVisitor visitor; + if (auto vfp_decoder = DecodeVFP<DisassemblerVisitor>(instruction)) { + return vfp_decoder->get().call(visitor, instruction); + } else if (auto decoder = DecodeArm<DisassemblerVisitor>(instruction)) { + return decoder->get().call(visitor, instruction); + } else { + return fmt::format("UNKNOWN: {:x}", instruction); + } +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/disassembler/disassembler_thumb.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/disassembler/disassembler_thumb.cpp new file mode 100644 index 0000000000..860181311b --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/disassembler/disassembler_thumb.cpp @@ -0,0 +1,397 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <cstdlib> +#include <string> + +#include <fmt/format.h> +#include <fmt/ostream.h> +#include <mcl/bit/bit_field.hpp> + +#include "dynarmic/common/string_util.h" +#include "dynarmic/frontend/A32/a32_types.h" +#include "dynarmic/frontend/A32/decoder/thumb16.h" +#include "dynarmic/frontend/A32/disassembler/disassembler.h" +#include "dynarmic/frontend/imm.h" + +namespace Dynarmic::A32 { + +class DisassemblerVisitor { +public: + using instruction_return_type = std::string; + + std::string thumb16_LSL_imm(Imm<5> imm5, Reg m, Reg d) { + return fmt::format("lsls {}, {}, #{}", d, m, imm5.ZeroExtend()); + } + + std::string thumb16_LSR_imm(Imm<5> imm5, Reg m, Reg d) { + const u32 shift = imm5 != 0 ? imm5.ZeroExtend() : 32U; + return fmt::format("lsrs {}, {}, #{}", d, m, shift); + } + + std::string thumb16_ASR_imm(Imm<5> imm5, Reg m, Reg d) { + const u32 shift = imm5 != 0 ? 
imm5.ZeroExtend() : 32U; + return fmt::format("asrs {}, {}, #{}", d, m, shift); + } + + std::string thumb16_ADD_reg_t1(Reg m, Reg n, Reg d) { + return fmt::format("adds {}, {}, {}", d, n, m); + } + + std::string thumb16_SUB_reg(Reg m, Reg n, Reg d) { + return fmt::format("subs {}, {}, {}", d, n, m); + } + + std::string thumb16_ADD_imm_t1(Imm<3> imm3, Reg n, Reg d) { + return fmt::format("adds {}, {}, #{}", d, n, imm3.ZeroExtend()); + } + + std::string thumb16_SUB_imm_t1(Imm<3> imm3, Reg n, Reg d) { + return fmt::format("subs {}, {}, #{}", d, n, imm3.ZeroExtend()); + } + + std::string thumb16_MOV_imm(Reg d, Imm<8> imm8) { + return fmt::format("movs {}, #{}", d, imm8.ZeroExtend()); + } + + std::string thumb16_CMP_imm(Reg n, Imm<8> imm8) { + return fmt::format("cmp {}, #{}", n, imm8.ZeroExtend()); + } + + std::string thumb16_ADD_imm_t2(Reg d_n, Imm<8> imm8) { + return fmt::format("adds {}, #{}", d_n, imm8.ZeroExtend()); + } + + std::string thumb16_SUB_imm_t2(Reg d_n, Imm<8> imm8) { + return fmt::format("subs {}, #{}", d_n, imm8.ZeroExtend()); + } + + std::string thumb16_AND_reg(Reg m, Reg d_n) { + return fmt::format("ands {}, {}", d_n, m); + } + + std::string thumb16_EOR_reg(Reg m, Reg d_n) { + return fmt::format("eors {}, {}", d_n, m); + } + + std::string thumb16_LSL_reg(Reg m, Reg d_n) { + return fmt::format("lsls {}, {}", d_n, m); + } + + std::string thumb16_LSR_reg(Reg m, Reg d_n) { + return fmt::format("lsrs {}, {}", d_n, m); + } + + std::string thumb16_ASR_reg(Reg m, Reg d_n) { + return fmt::format("asrs {}, {}", d_n, m); + } + + std::string thumb16_ADC_reg(Reg m, Reg d_n) { + return fmt::format("adcs {}, {}", d_n, m); + } + + std::string thumb16_SBC_reg(Reg m, Reg d_n) { + return fmt::format("sbcs {}, {}", d_n, m); + } + + std::string thumb16_ROR_reg(Reg m, Reg d_n) { + return fmt::format("rors {}, {}", d_n, m); + } + + std::string thumb16_TST_reg(Reg m, Reg n) { + return fmt::format("tst {}, {}", n, m); + } + + std::string thumb16_RSB_imm(Reg n, Reg d) { + // Pre-UAL syntax: NEGS <Rd>, <Rn> + return fmt::format("rsbs {}, {}, #0", d, n); + } + + std::string thumb16_CMP_reg_t1(Reg m, Reg n) { + return fmt::format("cmp {}, {}", n, m); + } + + std::string thumb16_CMN_reg(Reg m, Reg n) { + return fmt::format("cmn {}, {}", n, m); + } + + std::string thumb16_ORR_reg(Reg m, Reg d_n) { + return fmt::format("orrs {}, {}", d_n, m); + } + + std::string thumb16_MUL_reg(Reg n, Reg d_m) { + return fmt::format("muls {}, {}, {}", d_m, n, d_m); + } + + std::string thumb16_BIC_reg(Reg m, Reg d_n) { + return fmt::format("bics {}, {}", d_n, m); + } + + std::string thumb16_MVN_reg(Reg m, Reg d) { + return fmt::format("mvns {}, {}", d, m); + } + + std::string thumb16_ADD_reg_t2(bool d_n_hi, Reg m, Reg d_n_lo) { + const Reg d_n = d_n_hi ? (d_n_lo + 8) : d_n_lo; + return fmt::format("add {}, {}", d_n, m); + } + + std::string thumb16_CMP_reg_t2(bool n_hi, Reg m, Reg n_lo) { + const Reg n = n_hi ? (n_lo + 8) : n_lo; + return fmt::format("cmp {}, {}", n, m); + } + + std::string thumb16_MOV_reg(bool d_hi, Reg m, Reg d_lo) { + const Reg d = d_hi ? 
(d_lo + 8) : d_lo;
+ return fmt::format("mov {}, {}", d, m);
+ }
+
+ std::string thumb16_LDR_literal(Reg t, Imm<8> imm8) {
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ return fmt::format("ldr {}, [pc, #{}]", t, imm32);
+ }
+
+ std::string thumb16_STR_reg(Reg m, Reg n, Reg t) {
+ return fmt::format("str {}, [{}, {}]", t, n, m);
+ }
+
+ std::string thumb16_STRH_reg(Reg m, Reg n, Reg t) {
+ return fmt::format("strh {}, [{}, {}]", t, n, m);
+ }
+
+ std::string thumb16_STRB_reg(Reg m, Reg n, Reg t) {
+ return fmt::format("strb {}, [{}, {}]", t, n, m);
+ }
+
+ std::string thumb16_LDRSB_reg(Reg m, Reg n, Reg t) {
+ return fmt::format("ldrsb {}, [{}, {}]", t, n, m);
+ }
+
+ std::string thumb16_LDR_reg(Reg m, Reg n, Reg t) {
+ return fmt::format("ldr {}, [{}, {}]", t, n, m);
+ }
+
+ std::string thumb16_LDRH_reg(Reg m, Reg n, Reg t) {
+ return fmt::format("ldrh {}, [{}, {}]", t, n, m);
+ }
+
+ std::string thumb16_LDRB_reg(Reg m, Reg n, Reg t) {
+ return fmt::format("ldrb {}, [{}, {}]", t, n, m);
+ }
+
+ std::string thumb16_LDRSH_reg(Reg m, Reg n, Reg t) {
+ return fmt::format("ldrsh {}, [{}, {}]", t, n, m);
+ }
+
+ std::string thumb16_STR_imm_t1(Imm<5> imm5, Reg n, Reg t) {
+ const u32 imm32 = imm5.ZeroExtend() << 2;
+ return fmt::format("str {}, [{}, #{}]", t, n, imm32);
+ }
+
+ std::string thumb16_LDR_imm_t1(Imm<5> imm5, Reg n, Reg t) {
+ const u32 imm32 = imm5.ZeroExtend() << 2;
+ return fmt::format("ldr {}, [{}, #{}]", t, n, imm32);
+ }
+
+ std::string thumb16_STRB_imm(Imm<5> imm5, Reg n, Reg t) {
+ const u32 imm32 = imm5.ZeroExtend();
+ return fmt::format("strb {}, [{}, #{}]", t, n, imm32);
+ }
+
+ std::string thumb16_LDRB_imm(Imm<5> imm5, Reg n, Reg t) {
+ const u32 imm32 = imm5.ZeroExtend();
+ return fmt::format("ldrb {}, [{}, #{}]", t, n, imm32);
+ }
+
+ std::string thumb16_STRH_imm(Imm<5> imm5, Reg n, Reg t) {
+ const u32 imm32 = imm5.ZeroExtend() << 1;
+ return fmt::format("strh {}, [{}, #{}]", t, n, imm32);
+ }
+
+ std::string thumb16_LDRH_imm(Imm<5> imm5, Reg n, Reg t) {
+ const u32 imm32 = imm5.ZeroExtend() << 1;
+ return fmt::format("ldrh {}, [{}, #{}]", t, n, imm32);
+ }
+
+ std::string thumb16_STR_imm_t2(Reg t, Imm<8> imm8) {
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ return fmt::format("str {}, [sp, #{}]", t, imm32);
+ }
+
+ std::string thumb16_LDR_imm_t2(Reg t, Imm<8> imm8) {
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ return fmt::format("ldr {}, [sp, #{}]", t, imm32);
+ }
+
+ std::string thumb16_ADR(Reg d, Imm<8> imm8) {
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ return fmt::format("adr {}, +#{}", d, imm32);
+ }
+
+ std::string thumb16_ADD_sp_t1(Reg d, Imm<8> imm8) {
+ const u32 imm32 = imm8.ZeroExtend() << 2;
+ return fmt::format("add {}, sp, #{}", d, imm32);
+ }
+
+ std::string thumb16_ADD_sp_t2(Imm<7> imm7) {
+ const u32 imm32 = imm7.ZeroExtend() << 2;
+ return fmt::format("add sp, sp, #{}", imm32);
+ }
+
+ std::string thumb16_SUB_sp(Imm<7> imm7) {
+ const u32 imm32 = imm7.ZeroExtend() << 2;
+ return fmt::format("sub sp, sp, #{}", imm32);
+ }
+
+ std::string thumb16_NOP() {
+ return "nop";
+ }
+
+ std::string thumb16_SEV() {
+ return "sev";
+ }
+
+ std::string thumb16_SEVL() {
+ return "sevl";
+ }
+
+ std::string thumb16_WFE() {
+ return "wfe";
+ }
+
+ std::string thumb16_WFI() {
+ return "wfi";
+ }
+
+ std::string thumb16_YIELD() {
+ return "yield";
+ }
+
+ std::string thumb16_IT(Imm<8> imm8) {
+ const Cond firstcond = imm8.Bits<4, 7, Cond>();
+ const bool firstcond0 = imm8.Bit<4>();
+ const auto [x, y, z] = [&] {
+ if (imm8.Bits<0, 3>() == 0b1000) {
+ return
std::make_tuple("", "", ""); + } + if (imm8.Bits<0, 2>() == 0b100) { + return std::make_tuple(imm8.Bit<3>() == firstcond0 ? "t" : "e", "", ""); + } + if (imm8.Bits<0, 1>() == 0b10) { + return std::make_tuple(imm8.Bit<3>() == firstcond0 ? "t" : "e", imm8.Bit<2>() == firstcond0 ? "t" : "e", ""); + } + // Sanity note: Here imm8.Bit<0>() is guaranteed to be == 1. (imm8 can never be 0bxxxx0000) + return std::make_tuple(imm8.Bit<3>() == firstcond0 ? "t" : "e", imm8.Bit<2>() == firstcond0 ? "t" : "e", imm8.Bit<1>() == firstcond0 ? "t" : "e"); + }(); + return fmt::format("it{}{}{} {}", x, y, z, CondToString(firstcond)); + } + + std::string thumb16_SXTH(Reg m, Reg d) { + return fmt::format("sxth {}, {}", d, m); + } + + std::string thumb16_SXTB(Reg m, Reg d) { + return fmt::format("sxtb {}, {}", d, m); + } + + std::string thumb16_UXTH(Reg m, Reg d) { + return fmt::format("uxth {}, {}", d, m); + } + + std::string thumb16_UXTB(Reg m, Reg d) { + return fmt::format("uxtb {}, {}", d, m); + } + + std::string thumb16_PUSH(bool M, RegList reg_list) { + if (M) + reg_list |= 1 << 14; + return fmt::format("push {{{}}}", RegListToString(reg_list)); + } + + std::string thumb16_POP(bool P, RegList reg_list) { + if (P) + reg_list |= 1 << 15; + return fmt::format("pop {{{}}}", RegListToString(reg_list)); + } + + std::string thumb16_SETEND(bool E) { + return fmt::format("setend {}", E ? "BE" : "LE"); + } + + std::string thumb16_CPS(bool im, bool a, bool i, bool f) { + return fmt::format("cps{} {}{}{}", im ? "id" : "ie", a ? "a" : "", i ? "i" : "", f ? "f" : ""); + } + + std::string thumb16_REV(Reg m, Reg d) { + return fmt::format("rev {}, {}", d, m); + } + + std::string thumb16_REV16(Reg m, Reg d) { + return fmt::format("rev16 {}, {}", d, m); + } + + std::string thumb16_REVSH(Reg m, Reg d) { + return fmt::format("revsh {}, {}", d, m); + } + + std::string thumb16_BKPT(Imm<8> imm8) { + return fmt::format("bkpt #{}", imm8.ZeroExtend()); + } + + std::string thumb16_STMIA(Reg n, RegList reg_list) { + return fmt::format("stm {}!, {{{}}}", n, RegListToString(reg_list)); + } + + std::string thumb16_LDMIA(Reg n, RegList reg_list) { + const bool write_back = !mcl::bit::get_bit(static_cast<size_t>(n), reg_list); + return fmt::format("ldm {}{}, {{{}}}", n, write_back ? "!" : "", RegListToString(reg_list)); + } + + std::string thumb16_BX(Reg m) { + return fmt::format("bx {}", m); + } + + std::string thumb16_BLX_reg(Reg m) { + return fmt::format("blx {}", m); + } + + std::string thumb16_CBZ_CBNZ(bool nonzero, Imm<1> i, Imm<5> imm5, Reg n) { + const char* const name = nonzero ? "cbnz" : "cbz"; + const u32 imm = concatenate(i, imm5, Imm<1>{0}).ZeroExtend(); + + return fmt::format("{} {}, #{}", name, n, imm); + } + + std::string thumb16_UDF() { + return fmt::format("udf"); + } + + std::string thumb16_SVC(Imm<8> imm8) { + return fmt::format("svc #{}", imm8.ZeroExtend()); + } + + std::string thumb16_B_t1(Cond cond, Imm<8> imm8) { + const s32 imm32 = static_cast<s32>((imm8.SignExtend<u32>() << 1) + 4); + return fmt::format("b{} {}#{}", + CondToString(cond), + Common::SignToChar(imm32), + abs(imm32)); + } + + std::string thumb16_B_t2(Imm<11> imm11) { + const s32 imm32 = static_cast<s32>((imm11.SignExtend<u32>() << 1) + 4); + return fmt::format("b {}#{}", + Common::SignToChar(imm32), + abs(imm32)); + } +}; + +std::string DisassembleThumb16(u16 instruction) { + DisassemblerVisitor visitor; + auto decoder = DecodeThumb16<DisassemblerVisitor>(instruction); + return !decoder ? 
fmt::format("UNKNOWN: {:x}", instruction) : decoder->get().call(visitor, instruction); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/a32_translate.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/a32_translate.cpp new file mode 100644 index 0000000000..97a7f11adf --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/a32_translate.cpp @@ -0,0 +1,27 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/a32_translate.h" + +#include "dynarmic/frontend/A32/a32_location_descriptor.h" +#include "dynarmic/ir/basic_block.h" + +namespace Dynarmic::A32 { + +IR::Block TranslateArm(LocationDescriptor descriptor, TranslateCallbacks* tcb, const TranslationOptions& options); +IR::Block TranslateThumb(LocationDescriptor descriptor, TranslateCallbacks* tcb, const TranslationOptions& options); + +IR::Block Translate(LocationDescriptor descriptor, TranslateCallbacks* tcb, const TranslationOptions& options) { + return (descriptor.TFlag() ? TranslateThumb : TranslateArm)(descriptor, tcb, options); +} + +bool TranslateSingleArmInstruction(IR::Block& block, LocationDescriptor descriptor, u32 instruction); +bool TranslateSingleThumbInstruction(IR::Block& block, LocationDescriptor descriptor, u32 instruction); + +bool TranslateSingleInstruction(IR::Block& block, LocationDescriptor descriptor, u32 instruction) { + return (descriptor.TFlag() ? TranslateSingleThumbInstruction : TranslateSingleArmInstruction)(block, descriptor, instruction); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/a32_translate.h b/externals/dynarmic/src/dynarmic/frontend/A32/translate/a32_translate.h new file mode 100644 index 0000000000..0f2c3a121f --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/a32_translate.h @@ -0,0 +1,52 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ +#pragma once + +#include <mcl/stdint.hpp> + +#include "dynarmic/interface/A32/arch_version.h" + +namespace Dynarmic::IR { +class Block; +} // namespace Dynarmic::IR + +namespace Dynarmic::A32 { + +class LocationDescriptor; +struct TranslateCallbacks; + +struct TranslationOptions { + ArchVersion arch_version; + + /// This changes what IR we emit when we translate an unpredictable instruction. + /// If this is false, the ExceptionRaised IR instruction is emitted. + /// If this is true, we define some behaviour for some instructions. + bool define_unpredictable_behaviour = false; + + /// This changes what IR we emit when we translate a hint instruction. + /// If this is false, we treat the instruction as a NOP. + /// If this is true, we emit an ExceptionRaised instruction. + bool hook_hint_instructions = true; +}; + +/** + * This function translates instructions in memory into our intermediate representation. + * @param descriptor The starting location of the basic block. Includes information like PC, Thumb state, &c. + * @param tcb The callbacks we should use to read emulated memory. + * @param options Configures how certain instructions are translated. + * @return A translated basic block in the intermediate representation. 
+ */ +IR::Block Translate(LocationDescriptor descriptor, TranslateCallbacks* tcb, const TranslationOptions& options); + +/** + * This function translates a single provided instruction into our intermediate representation. + * @param block The block to append the IR for the instruction to. + * @param descriptor The location of the instruction. Includes information like PC, Thumb state, &c. + * @param instruction The instruction to translate. + * @return The translated instruction translated to the intermediate representation. + */ +bool TranslateSingleInstruction(IR::Block& block, LocationDescriptor descriptor, u32 instruction); + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/conditional_state.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/conditional_state.cpp new file mode 100644 index 0000000000..8bd875c7d1 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/conditional_state.cpp @@ -0,0 +1,82 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2020 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/conditional_state.h" + +#include <algorithm> + +#include <mcl/assert.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/frontend/A32/a32_ir_emitter.h" +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" +#include "dynarmic/interface/A32/config.h" +#include "dynarmic/ir/cond.h" + +namespace Dynarmic::A32 { + +bool CondCanContinue(ConditionalState cond_state, const A32::IREmitter& ir) { + ASSERT_MSG(cond_state != ConditionalState::Break, "Should never happen."); + + if (cond_state == ConditionalState::None) + return true; + + // TODO: This is more conservative than necessary. + return std::all_of(ir.block.begin(), ir.block.end(), [](const IR::Inst& inst) { return !inst.WritesToCPSR(); }); +} + +bool IsConditionPassed(TranslatorVisitor& v, IR::Cond cond) { + ASSERT_MSG(v.cond_state != ConditionalState::Break, + "This should never happen. We requested a break but that wasn't honored."); + + if (cond == IR::Cond::NV) { + // NV conditional is obsolete + v.cond_state = ConditionalState::Break; + v.RaiseException(Exception::UnpredictableInstruction); + return false; + } + + if (v.cond_state == ConditionalState::Translating) { + if (v.ir.block.ConditionFailedLocation() != v.ir.current_location || cond == IR::Cond::AL) { + v.cond_state = ConditionalState::Trailing; + } else { + if (cond == v.ir.block.GetCondition()) { + v.ir.block.SetConditionFailedLocation(v.ir.current_location.AdvancePC(static_cast<int>(v.current_instruction_size)).AdvanceIT()); + v.ir.block.ConditionFailedCycleCount()++; + return true; + } + + // cond has changed, abort + v.cond_state = ConditionalState::Break; + v.ir.SetTerm(IR::Term::LinkBlockFast{v.ir.current_location}); + return false; + } + } + + if (cond == IR::Cond::AL) { + // Everything is fine with the world + return true; + } + + // non-AL cond + + if (!v.ir.block.empty()) { + // We've already emitted instructions. Quit for now, we'll make a new block here later. + v.cond_state = ConditionalState::Break; + v.ir.SetTerm(IR::Term::LinkBlockFast{v.ir.current_location}); + return false; + } + + // We've not emitted instructions yet. + // We'll emit one instruction, and set the block-entry conditional appropriately. 
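+ // State machine summary (see ConditionalState in conditional_state.h):
+ //   None        -> Translating : first conditional instruction while the block is still empty.
+ //   Translating -> Translating : the next instruction carries the same condition; the run is extended.
+ //   Translating -> Trailing    : an AL instruction follows (or translation has moved past the
+ //                                condition-failed location); the rest of the block is unconditional.
+ //   Translating -> Break       : the condition changes; the block is terminated with LinkBlockFast.
+ //   None/Trailing -> Break     : a conditional instruction arrives after other IR has been emitted.
+ //   Any state   -> Break       : an NV-conditioned instruction raises UnpredictableInstruction.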
+ + v.cond_state = ConditionalState::Translating; + v.ir.block.SetCondition(cond); + v.ir.block.SetConditionFailedLocation(v.ir.current_location.AdvancePC(static_cast<int>(v.current_instruction_size)).AdvanceIT()); + v.ir.block.ConditionFailedCycleCount() = v.ir.block.CycleCount() + 1; + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/conditional_state.h b/externals/dynarmic/src/dynarmic/frontend/A32/translate/conditional_state.h new file mode 100644 index 0000000000..18c8b1cccb --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/conditional_state.h @@ -0,0 +1,33 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2020 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <mcl/stdint.hpp> + +namespace Dynarmic::IR { +enum class Cond; +} // namespace Dynarmic::IR + +namespace Dynarmic::A32 { + +class IREmitter; +struct TranslatorVisitor; + +enum class ConditionalState { + /// We haven't met any conditional instructions yet. + None, + /// Current instruction is a conditional. This marks the end of this basic block. + Break, + /// This basic block is made up solely of conditional instructions. + Translating, + /// This basic block is made up of conditional instructions followed by unconditional instructions. + Trailing, +}; + +bool CondCanContinue(ConditionalState cond_state, const A32::IREmitter& ir); +bool IsConditionPassed(TranslatorVisitor& v, IR::Cond cond); + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_branch.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_branch.cpp new file mode 100644 index 0000000000..d87cfcfe82 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_branch.cpp @@ -0,0 +1,88 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/bit/bit_field.hpp> + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +// B <label> +bool TranslatorVisitor::arm_B(Cond cond, Imm<24> imm24) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const u32 imm32 = mcl::bit::sign_extend<26, u32>(imm24.ZeroExtend() << 2) + 8; + const auto new_location = ir.current_location.AdvancePC(imm32); + ir.SetTerm(IR::Term::LinkBlock{new_location}); + return false; +} + +// BL <label> +bool TranslatorVisitor::arm_BL(Cond cond, Imm<24> imm24) { + if (!ArmConditionPassed(cond)) { + return true; + } + + ir.PushRSB(ir.current_location.AdvancePC(4)); + ir.SetRegister(Reg::LR, ir.Imm32(ir.current_location.PC() + 4)); + + const u32 imm32 = mcl::bit::sign_extend<26, u32>(imm24.ZeroExtend() << 2) + 8; + const auto new_location = ir.current_location.AdvancePC(imm32); + ir.SetTerm(IR::Term::LinkBlock{new_location}); + return false; +} + +// BLX <label> +bool TranslatorVisitor::arm_BLX_imm(bool H, Imm<24> imm24) { + ir.PushRSB(ir.current_location.AdvancePC(4)); + ir.SetRegister(Reg::LR, ir.Imm32(ir.current_location.PC() + 4)); + + const u32 imm32 = mcl::bit::sign_extend<26, u32>((imm24.ZeroExtend() << 2)) + (H ? 
2 : 0) + 8; + const auto new_location = ir.current_location.AdvancePC(imm32).SetTFlag(true); + ir.SetTerm(IR::Term::LinkBlock{new_location}); + return false; +} + +// BLX <Rm> +bool TranslatorVisitor::arm_BLX_reg(Cond cond, Reg m) { + if (m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + ir.PushRSB(ir.current_location.AdvancePC(4)); + ir.BXWritePC(ir.GetRegister(m)); + ir.SetRegister(Reg::LR, ir.Imm32(ir.current_location.PC() + 4)); + ir.SetTerm(IR::Term::FastDispatchHint{}); + return false; +} + +// BX <Rm> +bool TranslatorVisitor::arm_BX(Cond cond, Reg m) { + if (!ArmConditionPassed(cond)) { + return true; + } + + ir.BXWritePC(ir.GetRegister(m)); + if (m == Reg::R14) { + ir.SetTerm(IR::Term::PopRSBHint{}); + } else { + ir.SetTerm(IR::Term::FastDispatchHint{}); + } + + return false; +} + +bool TranslatorVisitor::arm_BXJ(Cond cond, Reg m) { + // Jazelle not supported + return arm_BX(cond, m); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_crc32.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_crc32.cpp new file mode 100644 index 0000000000..8ee5441013 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_crc32.cpp @@ -0,0 +1,95 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2019 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +// It's considered constrained UNPREDICTABLE behavior if either +// CRC32 instruction variant is executed with a condition code +// that is *not* 0xE (Always execute). ARM defines one of the following +// as being a requirement in this case. Either: +// +// 1. The instruction is undefined. +// 2. The instruction executes as a NOP. +// 3. The instruction executes unconditionally. +// 4. The instruction executes conditionally. +// +// It's also considered constrained UNPREDICTABLE behavior if +// either CRC32 instruction variant is executed with a size specifier +// of 64-bit (sz -> 0b11) +// +// In this case, either: +// +// 1. The instruction is undefined +// 2. The instruction executes as a NOP. +// 3. The instruction executes with the additional decode: size = 32. +// +// In both cases, we treat as unpredictable, to allow +// library users to provide their own intended behavior +// in the unpredictable exception handler. 
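+// Concretely, CRC32Variant below treats a non-AL condition or sz == 0b11 as
+// unpredictable (UnpredictableInstruction()), and maps the remaining size
+// encodings 0b00/0b01/0b10 to the 8-, 16- and 32-bit CRC32 IR operations,
+// using the ISO polynomial for CRC32{B,H,W} and the Castagnoli polynomial
+// for CRC32C{B,H,W}.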
+ +namespace { +enum class CRCType { + Castagnoli, + ISO, +}; + +bool CRC32Variant(TranslatorVisitor& v, Cond cond, Imm<2> sz, Reg n, Reg d, Reg m, CRCType type) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return v.UnpredictableInstruction(); + } + + if (sz == 0b11) { + return v.UnpredictableInstruction(); + } + + if (cond != Cond::AL) { + return v.UnpredictableInstruction(); + } + + const IR::U32 result = [m, n, sz, type, &v] { + const IR::U32 accumulator = v.ir.GetRegister(n); + const IR::U32 data = v.ir.GetRegister(m); + + if (type == CRCType::ISO) { + switch (sz.ZeroExtend()) { + case 0b00: + return v.ir.CRC32ISO8(accumulator, data); + case 0b01: + return v.ir.CRC32ISO16(accumulator, data); + case 0b10: + return v.ir.CRC32ISO32(accumulator, data); + } + } else { + switch (sz.ZeroExtend()) { + case 0b00: + return v.ir.CRC32Castagnoli8(accumulator, data); + case 0b01: + return v.ir.CRC32Castagnoli16(accumulator, data); + case 0b10: + return v.ir.CRC32Castagnoli32(accumulator, data); + } + } + + UNREACHABLE(); + }(); + + v.ir.SetRegister(d, result); + return true; +} +} // Anonymous namespace + +// CRC32{B,H,W}{<q>} <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_CRC32(Cond cond, Imm<2> sz, Reg n, Reg d, Reg m) { + return CRC32Variant(*this, cond, sz, n, d, m, CRCType::ISO); +} + +// CRC32C{B,H,W}{<q>} <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_CRC32C(Cond cond, Imm<2> sz, Reg n, Reg d, Reg m) { + return CRC32Variant(*this, cond, sz, n, d, m, CRCType::Castagnoli); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_exception_generating.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_exception_generating.cpp new file mode 100644 index 0000000000..c9616b0871 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_exception_generating.cpp @@ -0,0 +1,44 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" +#include "dynarmic/interface/A32/config.h" + +namespace Dynarmic::A32 { + +// BKPT #<imm16> +bool TranslatorVisitor::arm_BKPT(Cond cond, Imm<12> /*imm12*/, Imm<4> /*imm4*/) { + if (cond != Cond::AL && !options.define_unpredictable_behaviour) { + return UnpredictableInstruction(); + } + // UNPREDICTABLE: The instruction executes conditionally. + + if (!ArmConditionPassed(cond)) { + return true; + } + + return RaiseException(Exception::Breakpoint); +} + +// SVC<c> #<imm24> +bool TranslatorVisitor::arm_SVC(Cond cond, Imm<24> imm24) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const u32 imm32 = imm24.ZeroExtend(); + ir.PushRSB(ir.current_location.AdvancePC(4)); + ir.BranchWritePC(ir.Imm32(ir.current_location.PC() + 4)); + ir.CallSupervisor(ir.Imm32(imm32)); + ir.SetTerm(IR::Term::CheckHalt{IR::Term::PopRSBHint{}}); + return false; +} + +// UDF<c> #<imm16> +bool TranslatorVisitor::arm_UDF() { + return UndefinedInstruction(); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_translate_impl.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_translate_impl.cpp new file mode 100644 index 0000000000..276f8384e7 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_translate_impl.cpp @@ -0,0 +1,110 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +#include <mcl/assert.hpp> + +#include "dynarmic/interface/A32/config.h" + +namespace Dynarmic::A32 { + +bool TranslatorVisitor::ArmConditionPassed(Cond cond) { + return IsConditionPassed(*this, cond); +} + +bool TranslatorVisitor::ThumbConditionPassed() { + const Cond cond = ir.current_location.IT().Cond(); + return IsConditionPassed(*this, cond); +} + +bool TranslatorVisitor::VFPConditionPassed(Cond cond) { + if (ir.current_location.TFlag()) { + ASSERT(cond == Cond::AL); + return true; + } + return ArmConditionPassed(cond); +} + +bool TranslatorVisitor::InterpretThisInstruction() { + ir.SetTerm(IR::Term::Interpret(ir.current_location)); + return false; +} + +bool TranslatorVisitor::UnpredictableInstruction() { + return RaiseException(Exception::UnpredictableInstruction); +} + +bool TranslatorVisitor::UndefinedInstruction() { + return RaiseException(Exception::UndefinedInstruction); +} + +bool TranslatorVisitor::DecodeError() { + return RaiseException(Exception::DecodeError); +} + +bool TranslatorVisitor::RaiseException(Exception exception) { + ir.UpdateUpperLocationDescriptor(); + ir.BranchWritePC(ir.Imm32(ir.current_location.PC() + static_cast<u32>(current_instruction_size))); + ir.ExceptionRaised(exception); + ir.SetTerm(IR::Term::CheckHalt{IR::Term::ReturnToDispatch{}}); + return false; +} + +IR::UAny TranslatorVisitor::I(size_t bitsize, u64 value) { + switch (bitsize) { + case 8: + return ir.Imm8(static_cast<u8>(value)); + case 16: + return ir.Imm16(static_cast<u16>(value)); + case 32: + return ir.Imm32(static_cast<u32>(value)); + case 64: + return ir.Imm64(value); + default: + ASSERT_FALSE("Imm - get: Invalid bitsize"); + } +} + +IR::ResultAndCarry<IR::U32> TranslatorVisitor::EmitImmShift(IR::U32 value, ShiftType type, Imm<3> imm3, Imm<2> imm2, IR::U1 carry_in) { + return EmitImmShift(value, type, concatenate(imm3, imm2), carry_in); +} + +IR::ResultAndCarry<IR::U32> TranslatorVisitor::EmitImmShift(IR::U32 value, ShiftType type, Imm<5> imm5, IR::U1 carry_in) { + u8 imm5_value = imm5.ZeroExtend<u8>(); + switch (type) { + case ShiftType::LSL: + return ir.LogicalShiftLeft(value, ir.Imm8(imm5_value), carry_in); + case ShiftType::LSR: + imm5_value = imm5_value ? imm5_value : 32; + return ir.LogicalShiftRight(value, ir.Imm8(imm5_value), carry_in); + case ShiftType::ASR: + imm5_value = imm5_value ? 
imm5_value : 32; + return ir.ArithmeticShiftRight(value, ir.Imm8(imm5_value), carry_in); + case ShiftType::ROR: + if (imm5_value) { + return ir.RotateRight(value, ir.Imm8(imm5_value), carry_in); + } else { + return ir.RotateRightExtended(value, carry_in); + } + } + UNREACHABLE(); +} + +IR::ResultAndCarry<IR::U32> TranslatorVisitor::EmitRegShift(IR::U32 value, ShiftType type, IR::U8 amount, IR::U1 carry_in) { + switch (type) { + case ShiftType::LSL: + return ir.LogicalShiftLeft(value, amount, carry_in); + case ShiftType::LSR: + return ir.LogicalShiftRight(value, amount, carry_in); + case ShiftType::ASR: + return ir.ArithmeticShiftRight(value, amount, carry_in); + case ShiftType::ROR: + return ir.RotateRight(value, amount, carry_in); + } + UNREACHABLE(); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_translate_impl.h b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_translate_impl.h new file mode 100644 index 0000000000..44ac24503c --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_translate_impl.h @@ -0,0 +1,982 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <mcl/assert.hpp> +#include <mcl/bit/bit_field.hpp> +#include <mcl/bit/rotate.hpp> + +#include "dynarmic/frontend/A32/a32_ir_emitter.h" +#include "dynarmic/frontend/A32/a32_location_descriptor.h" +#include "dynarmic/frontend/A32/a32_types.h" +#include "dynarmic/frontend/A32/translate/a32_translate.h" +#include "dynarmic/frontend/A32/translate/conditional_state.h" +#include "dynarmic/frontend/imm.h" + +namespace Dynarmic::A32 { + +enum class Exception; + +struct TranslatorVisitor final { + using instruction_return_type = bool; + + explicit TranslatorVisitor(IR::Block& block, LocationDescriptor descriptor, const TranslationOptions& options) + : ir(block, descriptor, options.arch_version), options(options) {} + + A32::IREmitter ir; + ConditionalState cond_state = ConditionalState::None; + TranslationOptions options; + + size_t current_instruction_size; + + bool ArmConditionPassed(Cond cond); + bool ThumbConditionPassed(); + bool VFPConditionPassed(Cond cond); + + bool InterpretThisInstruction(); + bool UnpredictableInstruction(); + bool UndefinedInstruction(); + bool DecodeError(); + bool RaiseException(Exception exception); + + struct ImmAndCarry { + u32 imm32; + IR::U1 carry; + }; + + ImmAndCarry ArmExpandImm_C(int rotate, Imm<8> imm8, IR::U1 carry_in) { + u32 imm32 = imm8.ZeroExtend(); + auto carry_out = carry_in; + if (rotate) { + imm32 = mcl::bit::rotate_right<u32>(imm8.ZeroExtend(), rotate * 2); + carry_out = ir.Imm1(mcl::bit::get_bit<31>(imm32)); + } + return {imm32, carry_out}; + } + + u32 ArmExpandImm(int rotate, Imm<8> imm8) { + return ArmExpandImm_C(rotate, imm8, ir.Imm1(0)).imm32; + } + + ImmAndCarry ThumbExpandImm_C(Imm<1> i, Imm<3> imm3, Imm<8> imm8, IR::U1 carry_in) { + const Imm<12> imm12 = concatenate(i, imm3, imm8); + if (imm12.Bits<10, 11>() == 0) { + const u32 imm32 = [&] { + const u32 imm8 = imm12.Bits<0, 7>(); + switch (imm12.Bits<8, 9>()) { + case 0b00: + return imm8; + case 0b01: + return mcl::bit::replicate_element<u16, u32>(imm8); + case 0b10: + return mcl::bit::replicate_element<u16, u32>(imm8 << 8); + case 0b11: + return mcl::bit::replicate_element<u8, u32>(imm8); + } + UNREACHABLE(); + }(); + return {imm32, carry_in}; + } + const u32 imm32 = mcl::bit::rotate_right<u32>((1 << 7) | imm12.Bits<0, 6>(), 
imm12.Bits<7, 11>()); + return {imm32, ir.Imm1(mcl::bit::get_bit<31>(imm32))}; + } + + u32 ThumbExpandImm(Imm<1> i, Imm<3> imm3, Imm<8> imm8) { + return ThumbExpandImm_C(i, imm3, imm8, ir.Imm1(0)).imm32; + } + + // Creates an immediate of the given value + IR::UAny I(size_t bitsize, u64 value); + + IR::ResultAndCarry<IR::U32> EmitImmShift(IR::U32 value, ShiftType type, Imm<3> imm3, Imm<2> imm2, IR::U1 carry_in); + IR::ResultAndCarry<IR::U32> EmitImmShift(IR::U32 value, ShiftType type, Imm<5> imm5, IR::U1 carry_in); + IR::ResultAndCarry<IR::U32> EmitRegShift(IR::U32 value, ShiftType type, IR::U8 amount, IR::U1 carry_in); + template<typename FnT> + bool EmitVfpVectorOperation(bool sz, ExtReg d, ExtReg n, ExtReg m, const FnT& fn); + template<typename FnT> + bool EmitVfpVectorOperation(bool sz, ExtReg d, ExtReg m, const FnT& fn); + + // Barrier instructions + bool arm_DMB(Imm<4> option); + bool arm_DSB(Imm<4> option); + bool arm_ISB(Imm<4> option); + + // Branch instructions + bool arm_B(Cond cond, Imm<24> imm24); + bool arm_BL(Cond cond, Imm<24> imm24); + bool arm_BLX_imm(bool H, Imm<24> imm24); + bool arm_BLX_reg(Cond cond, Reg m); + bool arm_BX(Cond cond, Reg m); + bool arm_BXJ(Cond cond, Reg m); + + // Coprocessor instructions + bool arm_CDP(Cond cond, size_t opc1, CoprocReg CRn, CoprocReg CRd, size_t coproc_no, size_t opc2, CoprocReg CRm); + bool arm_LDC(Cond cond, bool p, bool u, bool d, bool w, Reg n, CoprocReg CRd, size_t coproc_no, Imm<8> imm8); + bool arm_MCR(Cond cond, size_t opc1, CoprocReg CRn, Reg t, size_t coproc_no, size_t opc2, CoprocReg CRm); + bool arm_MCRR(Cond cond, Reg t2, Reg t, size_t coproc_no, size_t opc, CoprocReg CRm); + bool arm_MRC(Cond cond, size_t opc1, CoprocReg CRn, Reg t, size_t coproc_no, size_t opc2, CoprocReg CRm); + bool arm_MRRC(Cond cond, Reg t2, Reg t, size_t coproc_no, size_t opc, CoprocReg CRm); + bool arm_STC(Cond cond, bool p, bool u, bool d, bool w, Reg n, CoprocReg CRd, size_t coproc_no, Imm<8> imm8); + + // CRC32 instructions + bool arm_CRC32(Cond cond, Imm<2> sz, Reg n, Reg d, Reg m); + bool arm_CRC32C(Cond cond, Imm<2> sz, Reg n, Reg d, Reg m); + + // Data processing instructions + bool arm_ADC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8); + bool arm_ADC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m); + bool arm_ADC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m); + bool arm_ADD_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8); + bool arm_ADD_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m); + bool arm_ADD_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m); + bool arm_AND_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8); + bool arm_AND_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m); + bool arm_AND_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m); + bool arm_BIC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8); + bool arm_BIC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m); + bool arm_BIC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m); + bool arm_CMN_imm(Cond cond, Reg n, int rotate, Imm<8> imm8); + bool arm_CMN_reg(Cond cond, Reg n, Imm<5> imm5, ShiftType shift, Reg m); + bool arm_CMN_rsr(Cond cond, Reg n, Reg s, ShiftType shift, Reg m); + bool arm_CMP_imm(Cond cond, Reg n, int rotate, Imm<8> imm8); + bool arm_CMP_reg(Cond cond, Reg n, Imm<5> imm5, ShiftType shift, Reg m); + bool arm_CMP_rsr(Cond cond, Reg n, 
Reg s, ShiftType shift, Reg m); + bool arm_EOR_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8); + bool arm_EOR_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m); + bool arm_EOR_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m); + bool arm_MOV_imm(Cond cond, bool S, Reg d, int rotate, Imm<8> imm8); + bool arm_MOV_reg(Cond cond, bool S, Reg d, Imm<5> imm5, ShiftType shift, Reg m); + bool arm_MOV_rsr(Cond cond, bool S, Reg d, Reg s, ShiftType shift, Reg m); + bool arm_MVN_imm(Cond cond, bool S, Reg d, int rotate, Imm<8> imm8); + bool arm_MVN_reg(Cond cond, bool S, Reg d, Imm<5> imm5, ShiftType shift, Reg m); + bool arm_MVN_rsr(Cond cond, bool S, Reg d, Reg s, ShiftType shift, Reg m); + bool arm_ORR_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8); + bool arm_ORR_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m); + bool arm_ORR_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m); + bool arm_RSB_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8); + bool arm_RSB_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m); + bool arm_RSB_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m); + bool arm_RSC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8); + bool arm_RSC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m); + bool arm_RSC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m); + bool arm_SBC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8); + bool arm_SBC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m); + bool arm_SBC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m); + bool arm_SUB_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8); + bool arm_SUB_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m); + bool arm_SUB_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m); + bool arm_TEQ_imm(Cond cond, Reg n, int rotate, Imm<8> imm8); + bool arm_TEQ_reg(Cond cond, Reg n, Imm<5> imm5, ShiftType shift, Reg m); + bool arm_TEQ_rsr(Cond cond, Reg n, Reg s, ShiftType shift, Reg m); + bool arm_TST_imm(Cond cond, Reg n, int rotate, Imm<8> imm8); + bool arm_TST_reg(Cond cond, Reg n, Imm<5> imm5, ShiftType shift, Reg m); + bool arm_TST_rsr(Cond cond, Reg n, Reg s, ShiftType shift, Reg m); + + // Exception generating instructions + bool arm_BKPT(Cond cond, Imm<12> imm12, Imm<4> imm4); + bool arm_SVC(Cond cond, Imm<24> imm24); + bool arm_UDF(); + + // Extension instructions + bool arm_SXTAB(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m); + bool arm_SXTAB16(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m); + bool arm_SXTAH(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m); + bool arm_SXTB(Cond cond, Reg d, SignExtendRotation rotate, Reg m); + bool arm_SXTB16(Cond cond, Reg d, SignExtendRotation rotate, Reg m); + bool arm_SXTH(Cond cond, Reg d, SignExtendRotation rotate, Reg m); + bool arm_UXTAB(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m); + bool arm_UXTAB16(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m); + bool arm_UXTAH(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m); + bool arm_UXTB(Cond cond, Reg d, SignExtendRotation rotate, Reg m); + bool arm_UXTB16(Cond cond, Reg d, SignExtendRotation rotate, Reg m); + bool arm_UXTH(Cond cond, Reg d, SignExtendRotation rotate, Reg m); + + // Hint instructions + bool arm_PLD_imm(bool add, bool R, 
Reg n, Imm<12> imm12); + bool arm_PLD_reg(bool add, bool R, Reg n, Imm<5> imm5, ShiftType shift, Reg m); + bool arm_SEV(); + bool arm_SEVL(); + bool arm_WFE(); + bool arm_WFI(); + bool arm_YIELD(); + + // Load/Store + bool arm_LDRBT(); + bool arm_LDRHT(); + bool arm_LDRSBT(); + bool arm_LDRSHT(); + bool arm_LDRT(); + bool arm_STRBT(); + bool arm_STRHT(); + bool arm_STRT(); + bool arm_LDR_lit(Cond cond, bool U, Reg t, Imm<12> imm12); + bool arm_LDR_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg d, Imm<12> imm12); + bool arm_LDR_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m); + bool arm_LDRB_lit(Cond cond, bool U, Reg t, Imm<12> imm12); + bool arm_LDRB_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<12> imm12); + bool arm_LDRB_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<5> imm5, ShiftType shift, Reg m); + bool arm_LDRD_lit(Cond cond, bool U, Reg t, Imm<4> imm8a, Imm<4> imm8b); + bool arm_LDRD_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b); + bool arm_LDRD_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg d, Reg m); + bool arm_LDRH_lit(Cond cond, bool P, bool U, bool W, Reg t, Imm<4> imm8a, Imm<4> imm8b); + bool arm_LDRH_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b); + bool arm_LDRH_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg d, Reg m); + bool arm_LDRSB_lit(Cond cond, bool U, Reg t, Imm<4> imm8a, Imm<4> imm8b); + bool arm_LDRSB_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b); + bool arm_LDRSB_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m); + bool arm_LDRSH_lit(Cond cond, bool U, Reg t, Imm<4> imm8a, Imm<4> imm8b); + bool arm_LDRSH_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b); + bool arm_LDRSH_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m); + bool arm_STR_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<12> imm12); + bool arm_STR_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<5> imm5, ShiftType shift, Reg m); + bool arm_STRB_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<12> imm12); + bool arm_STRB_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<5> imm5, ShiftType shift, Reg m); + bool arm_STRD_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b); + bool arm_STRD_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m); + bool arm_STRH_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b); + bool arm_STRH_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m); + + // Load/Store multiple instructions + bool arm_LDM(Cond cond, bool W, Reg n, RegList list); + bool arm_LDMDA(Cond cond, bool W, Reg n, RegList list); + bool arm_LDMDB(Cond cond, bool W, Reg n, RegList list); + bool arm_LDMIB(Cond cond, bool W, Reg n, RegList list); + bool arm_LDM_usr(); + bool arm_LDM_eret(); + bool arm_STM(Cond cond, bool W, Reg n, RegList list); + bool arm_STMDA(Cond cond, bool W, Reg n, RegList list); + bool arm_STMDB(Cond cond, bool W, Reg n, RegList list); + bool arm_STMIB(Cond cond, bool W, Reg n, RegList list); + bool arm_STM_usr(); + + // Miscellaneous instructions + bool arm_BFC(Cond cond, Imm<5> msb, Reg d, Imm<5> lsb); + bool arm_BFI(Cond cond, Imm<5> msb, Reg d, Imm<5> lsb, Reg n); + bool arm_CLZ(Cond cond, Reg d, Reg m); + bool arm_MOVT(Cond cond, Imm<4> imm4, Reg d, Imm<12> imm12); + bool arm_MOVW(Cond cond, Imm<4> imm4, Reg d, Imm<12> imm12); + bool arm_NOP() { return 
true; } + bool arm_RBIT(Cond cond, Reg d, Reg m); + bool arm_SBFX(Cond cond, Imm<5> widthm1, Reg d, Imm<5> lsb, Reg n); + bool arm_SEL(Cond cond, Reg n, Reg d, Reg m); + bool arm_UBFX(Cond cond, Imm<5> widthm1, Reg d, Imm<5> lsb, Reg n); + + // Unsigned sum of absolute difference functions + bool arm_USAD8(Cond cond, Reg d, Reg m, Reg n); + bool arm_USADA8(Cond cond, Reg d, Reg a, Reg m, Reg n); + + // Packing instructions + bool arm_PKHBT(Cond cond, Reg n, Reg d, Imm<5> imm5, Reg m); + bool arm_PKHTB(Cond cond, Reg n, Reg d, Imm<5> imm5, Reg m); + + // Reversal instructions + bool arm_REV(Cond cond, Reg d, Reg m); + bool arm_REV16(Cond cond, Reg d, Reg m); + bool arm_REVSH(Cond cond, Reg d, Reg m); + + // Saturation instructions + bool arm_SSAT(Cond cond, Imm<5> sat_imm, Reg d, Imm<5> imm5, bool sh, Reg n); + bool arm_SSAT16(Cond cond, Imm<4> sat_imm, Reg d, Reg n); + bool arm_USAT(Cond cond, Imm<5> sat_imm, Reg d, Imm<5> imm5, bool sh, Reg n); + bool arm_USAT16(Cond cond, Imm<4> sat_imm, Reg d, Reg n); + + // Divide instructions + bool arm_SDIV(Cond cond, Reg d, Reg m, Reg n); + bool arm_UDIV(Cond cond, Reg d, Reg m, Reg n); + + // Multiply (Normal) instructions + bool arm_MLA(Cond cond, bool S, Reg d, Reg a, Reg m, Reg n); + bool arm_MLS(Cond cond, Reg d, Reg a, Reg m, Reg n); + bool arm_MUL(Cond cond, bool S, Reg d, Reg m, Reg n); + + // Multiply (Long) instructions + bool arm_SMLAL(Cond cond, bool S, Reg dHi, Reg dLo, Reg m, Reg n); + bool arm_SMULL(Cond cond, bool S, Reg dHi, Reg dLo, Reg m, Reg n); + bool arm_UMAAL(Cond cond, Reg dHi, Reg dLo, Reg m, Reg n); + bool arm_UMLAL(Cond cond, bool S, Reg dHi, Reg dLo, Reg m, Reg n); + bool arm_UMULL(Cond cond, bool S, Reg dHi, Reg dLo, Reg m, Reg n); + + // Multiply (Halfword) instructions + bool arm_SMLALxy(Cond cond, Reg dHi, Reg dLo, Reg m, bool M, bool N, Reg n); + bool arm_SMLAxy(Cond cond, Reg d, Reg a, Reg m, bool M, bool N, Reg n); + bool arm_SMULxy(Cond cond, Reg d, Reg m, bool M, bool N, Reg n); + + // Multiply (word by halfword) instructions + bool arm_SMLAWy(Cond cond, Reg d, Reg a, Reg m, bool M, Reg n); + bool arm_SMULWy(Cond cond, Reg d, Reg m, bool M, Reg n); + + // Multiply (Most significant word) instructions + bool arm_SMMLA(Cond cond, Reg d, Reg a, Reg m, bool R, Reg n); + bool arm_SMMLS(Cond cond, Reg d, Reg a, Reg m, bool R, Reg n); + bool arm_SMMUL(Cond cond, Reg d, Reg m, bool R, Reg n); + + // Multiply (Dual) instructions + bool arm_SMLAD(Cond cond, Reg d, Reg a, Reg m, bool M, Reg n); + bool arm_SMLALD(Cond cond, Reg dHi, Reg dLo, Reg m, bool M, Reg n); + bool arm_SMLSD(Cond cond, Reg d, Reg a, Reg m, bool M, Reg n); + bool arm_SMLSLD(Cond cond, Reg dHi, Reg dLo, Reg m, bool M, Reg n); + bool arm_SMUAD(Cond cond, Reg d, Reg m, bool M, Reg n); + bool arm_SMUSD(Cond cond, Reg d, Reg m, bool M, Reg n); + + // Parallel Add/Subtract (Modulo arithmetic) instructions + bool arm_SADD8(Cond cond, Reg n, Reg d, Reg m); + bool arm_SADD16(Cond cond, Reg n, Reg d, Reg m); + bool arm_SASX(Cond cond, Reg n, Reg d, Reg m); + bool arm_SSAX(Cond cond, Reg n, Reg d, Reg m); + bool arm_SSUB8(Cond cond, Reg n, Reg d, Reg m); + bool arm_SSUB16(Cond cond, Reg n, Reg d, Reg m); + bool arm_UADD8(Cond cond, Reg n, Reg d, Reg m); + bool arm_UADD16(Cond cond, Reg n, Reg d, Reg m); + bool arm_UASX(Cond cond, Reg n, Reg d, Reg m); + bool arm_USAX(Cond cond, Reg n, Reg d, Reg m); + bool arm_USUB8(Cond cond, Reg n, Reg d, Reg m); + bool arm_USUB16(Cond cond, Reg n, Reg d, Reg m); + + // Parallel Add/Subtract (Saturating) instructions + bool 
arm_QADD8(Cond cond, Reg n, Reg d, Reg m); + bool arm_QADD16(Cond cond, Reg n, Reg d, Reg m); + bool arm_QASX(Cond cond, Reg n, Reg d, Reg m); + bool arm_QSAX(Cond cond, Reg n, Reg d, Reg m); + bool arm_QSUB8(Cond cond, Reg n, Reg d, Reg m); + bool arm_QSUB16(Cond cond, Reg n, Reg d, Reg m); + bool arm_UQADD8(Cond cond, Reg n, Reg d, Reg m); + bool arm_UQADD16(Cond cond, Reg n, Reg d, Reg m); + bool arm_UQASX(Cond cond, Reg n, Reg d, Reg m); + bool arm_UQSAX(Cond cond, Reg n, Reg d, Reg m); + bool arm_UQSUB8(Cond cond, Reg n, Reg d, Reg m); + bool arm_UQSUB16(Cond cond, Reg n, Reg d, Reg m); + + // Parallel Add/Subtract (Halving) instructions + bool arm_SHADD8(Cond cond, Reg n, Reg d, Reg m); + bool arm_SHADD16(Cond cond, Reg n, Reg d, Reg m); + bool arm_SHASX(Cond cond, Reg n, Reg d, Reg m); + bool arm_SHSAX(Cond cond, Reg n, Reg d, Reg m); + bool arm_SHSUB8(Cond cond, Reg n, Reg d, Reg m); + bool arm_SHSUB16(Cond cond, Reg n, Reg d, Reg m); + bool arm_UHADD8(Cond cond, Reg n, Reg d, Reg m); + bool arm_UHADD16(Cond cond, Reg n, Reg d, Reg m); + bool arm_UHASX(Cond cond, Reg n, Reg d, Reg m); + bool arm_UHSAX(Cond cond, Reg n, Reg d, Reg m); + bool arm_UHSUB8(Cond cond, Reg n, Reg d, Reg m); + bool arm_UHSUB16(Cond cond, Reg n, Reg d, Reg m); + + // Saturated Add/Subtract instructions + bool arm_QADD(Cond cond, Reg n, Reg d, Reg m); + bool arm_QSUB(Cond cond, Reg n, Reg d, Reg m); + bool arm_QDADD(Cond cond, Reg n, Reg d, Reg m); + bool arm_QDSUB(Cond cond, Reg n, Reg d, Reg m); + + // Synchronization Primitive instructions + bool arm_CLREX(); + bool arm_SWP(Cond cond, Reg n, Reg t, Reg t2); + bool arm_SWPB(Cond cond, Reg n, Reg t, Reg t2); + bool arm_STL(Cond cond, Reg n, Reg t); + bool arm_STLEX(Cond cond, Reg n, Reg d, Reg t); + bool arm_STREX(Cond cond, Reg n, Reg d, Reg t); + bool arm_LDA(Cond cond, Reg n, Reg t); + bool arm_LDAEX(Cond cond, Reg n, Reg t); + bool arm_LDREX(Cond cond, Reg n, Reg t); + bool arm_STLEXD(Cond cond, Reg n, Reg d, Reg t); + bool arm_STREXD(Cond cond, Reg n, Reg d, Reg t); + bool arm_LDAEXD(Cond cond, Reg n, Reg t); + bool arm_LDREXD(Cond cond, Reg n, Reg t); + bool arm_STLB(Cond cond, Reg n, Reg t); + bool arm_STLEXB(Cond cond, Reg n, Reg d, Reg t); + bool arm_STREXB(Cond cond, Reg n, Reg d, Reg t); + bool arm_LDAB(Cond cond, Reg n, Reg t); + bool arm_LDAEXB(Cond cond, Reg n, Reg t); + bool arm_LDREXB(Cond cond, Reg n, Reg t); + bool arm_STLH(Cond cond, Reg n, Reg t); + bool arm_STLEXH(Cond cond, Reg n, Reg d, Reg t); + bool arm_STREXH(Cond cond, Reg n, Reg d, Reg t); + bool arm_LDAH(Cond cond, Reg n, Reg t); + bool arm_LDAEXH(Cond cond, Reg n, Reg t); + bool arm_LDREXH(Cond cond, Reg n, Reg t); + + // Status register access instructions + bool arm_CPS(); + bool arm_MRS(Cond cond, Reg d); + bool arm_MSR_imm(Cond cond, unsigned mask, int rotate, Imm<8> imm8); + bool arm_MSR_reg(Cond cond, unsigned mask, Reg n); + bool arm_RFE(); + bool arm_SETEND(bool E); + bool arm_SRS(); + + // thumb16 + bool thumb16_LSL_imm(Imm<5> imm5, Reg m, Reg d); + bool thumb16_LSR_imm(Imm<5> imm5, Reg m, Reg d); + bool thumb16_ASR_imm(Imm<5> imm5, Reg m, Reg d); + bool thumb16_ADD_reg_t1(Reg m, Reg n, Reg d); + bool thumb16_SUB_reg(Reg m, Reg n, Reg d); + bool thumb16_ADD_imm_t1(Imm<3> imm3, Reg n, Reg d); + bool thumb16_SUB_imm_t1(Imm<3> imm3, Reg n, Reg d); + bool thumb16_MOV_imm(Reg d, Imm<8> imm8); + bool thumb16_CMP_imm(Reg n, Imm<8> imm8); + bool thumb16_ADD_imm_t2(Reg d_n, Imm<8> imm8); + bool thumb16_SUB_imm_t2(Reg d_n, Imm<8> imm8); + bool thumb16_AND_reg(Reg m, Reg d_n); + 
bool thumb16_EOR_reg(Reg m, Reg d_n); + bool thumb16_LSL_reg(Reg m, Reg d_n); + bool thumb16_LSR_reg(Reg m, Reg d_n); + bool thumb16_ASR_reg(Reg m, Reg d_n); + bool thumb16_ADC_reg(Reg m, Reg d_n); + bool thumb16_SBC_reg(Reg m, Reg d_n); + bool thumb16_ROR_reg(Reg m, Reg d_n); + bool thumb16_TST_reg(Reg m, Reg n); + bool thumb16_RSB_imm(Reg n, Reg d); + bool thumb16_CMP_reg_t1(Reg m, Reg n); + bool thumb16_CMN_reg(Reg m, Reg n); + bool thumb16_ORR_reg(Reg m, Reg d_n); + bool thumb16_MUL_reg(Reg n, Reg d_m); + bool thumb16_BIC_reg(Reg m, Reg d_n); + bool thumb16_MVN_reg(Reg m, Reg d); + bool thumb16_ADD_reg_t2(bool d_n_hi, Reg m, Reg d_n_lo); + bool thumb16_CMP_reg_t2(bool n_hi, Reg m, Reg n_lo); + bool thumb16_MOV_reg(bool d_hi, Reg m, Reg d_lo); + bool thumb16_LDR_literal(Reg t, Imm<8> imm8); + bool thumb16_STR_reg(Reg m, Reg n, Reg t); + bool thumb16_STRH_reg(Reg m, Reg n, Reg t); + bool thumb16_STRB_reg(Reg m, Reg n, Reg t); + bool thumb16_LDRSB_reg(Reg m, Reg n, Reg t); + bool thumb16_LDR_reg(Reg m, Reg n, Reg t); + bool thumb16_LDRH_reg(Reg m, Reg n, Reg t); + bool thumb16_LDRB_reg(Reg m, Reg n, Reg t); + bool thumb16_LDRSH_reg(Reg m, Reg n, Reg t); + bool thumb16_STR_imm_t1(Imm<5> imm5, Reg n, Reg t); + bool thumb16_LDR_imm_t1(Imm<5> imm5, Reg n, Reg t); + bool thumb16_STRB_imm(Imm<5> imm5, Reg n, Reg t); + bool thumb16_LDRB_imm(Imm<5> imm5, Reg n, Reg t); + bool thumb16_STRH_imm(Imm<5> imm5, Reg n, Reg t); + bool thumb16_LDRH_imm(Imm<5> imm5, Reg n, Reg t); + bool thumb16_STR_imm_t2(Reg t, Imm<8> imm8); + bool thumb16_LDR_imm_t2(Reg t, Imm<8> imm8); + bool thumb16_ADR(Reg d, Imm<8> imm8); + bool thumb16_ADD_sp_t1(Reg d, Imm<8> imm8); + bool thumb16_ADD_sp_t2(Imm<7> imm7); + bool thumb16_SUB_sp(Imm<7> imm7); + bool thumb16_SEV(); + bool thumb16_SEVL(); + bool thumb16_WFE(); + bool thumb16_WFI(); + bool thumb16_YIELD(); + bool thumb16_NOP(); + bool thumb16_IT(Imm<8> imm8); + bool thumb16_SXTH(Reg m, Reg d); + bool thumb16_SXTB(Reg m, Reg d); + bool thumb16_UXTH(Reg m, Reg d); + bool thumb16_UXTB(Reg m, Reg d); + bool thumb16_PUSH(bool M, RegList reg_list); + bool thumb16_POP(bool P, RegList reg_list); + bool thumb16_SETEND(bool E); + bool thumb16_CPS(bool, bool, bool, bool); + bool thumb16_REV(Reg m, Reg d); + bool thumb16_REV16(Reg m, Reg d); + bool thumb16_REVSH(Reg m, Reg d); + bool thumb16_BKPT(Imm<8> imm8); + bool thumb16_STMIA(Reg n, RegList reg_list); + bool thumb16_LDMIA(Reg n, RegList reg_list); + bool thumb16_CBZ_CBNZ(bool nonzero, Imm<1> i, Imm<5> imm5, Reg n); + bool thumb16_UDF(); + bool thumb16_BX(Reg m); + bool thumb16_BLX_reg(Reg m); + bool thumb16_SVC(Imm<8> imm8); + bool thumb16_B_t1(Cond cond, Imm<8> imm8); + bool thumb16_B_t2(Imm<11> imm11); + + // thumb32 load/store multiple instructions + bool thumb32_LDMDB(bool W, Reg n, Imm<16> reg_list); + bool thumb32_LDMIA(bool W, Reg n, Imm<16> reg_list); + bool thumb32_POP(Imm<16> reg_list); + bool thumb32_PUSH(Imm<15> reg_list); + bool thumb32_STMIA(bool W, Reg n, Imm<15> reg_list); + bool thumb32_STMDB(bool W, Reg n, Imm<15> reg_list); + + // thumb32 load/store dual, load/store exclusive, table branch instructions + bool thumb32_LDA(Reg n, Reg t); + bool thumb32_LDRD_imm_1(bool U, Reg n, Reg t, Reg t2, Imm<8> imm8); + bool thumb32_LDRD_imm_2(bool U, bool W, Reg n, Reg t, Reg t2, Imm<8> imm8); + bool thumb32_LDRD_lit_1(bool U, Reg t, Reg t2, Imm<8> imm8); + bool thumb32_LDRD_lit_2(bool U, bool W, Reg t, Reg t2, Imm<8> imm8); + bool thumb32_STRD_imm_1(bool U, Reg n, Reg t, Reg t2, Imm<8> imm8); + bool 
thumb32_STRD_imm_2(bool U, bool W, Reg n, Reg t, Reg t2, Imm<8> imm8); + bool thumb32_LDREX(Reg n, Reg t, Imm<8> imm8); + bool thumb32_LDREXD(Reg n, Reg t, Reg t2); + bool thumb32_LDREXB(Reg n, Reg t); + bool thumb32_LDREXH(Reg n, Reg t); + bool thumb32_STL(Reg n, Reg t); + bool thumb32_STREX(Reg n, Reg t, Reg d, Imm<8> imm8); + bool thumb32_STREXB(Reg n, Reg t, Reg d); + bool thumb32_STREXD(Reg n, Reg t, Reg t2, Reg d); + bool thumb32_STREXH(Reg n, Reg t, Reg d); + bool thumb32_TBB(Reg n, Reg m); + bool thumb32_TBH(Reg n, Reg m); + + // thumb32 data processing (shifted register) instructions + bool thumb32_TST_reg(Reg n, Imm<3> imm3, Imm<2> imm2, ShiftType type, Reg m); + bool thumb32_AND_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m); + bool thumb32_BIC_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m); + bool thumb32_MOV_reg(bool S, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m); + bool thumb32_ORR_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m); + bool thumb32_MVN_reg(bool S, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m); + bool thumb32_ORN_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m); + bool thumb32_TEQ_reg(Reg n, Imm<3> imm3, Imm<2> imm2, ShiftType type, Reg m); + bool thumb32_EOR_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m); + bool thumb32_PKH(Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, Imm<1> tb, Reg m); + bool thumb32_CMN_reg(Reg n, Imm<3> imm3, Imm<2> imm2, ShiftType type, Reg m); + bool thumb32_ADD_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m); + bool thumb32_ADC_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m); + bool thumb32_SBC_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m); + bool thumb32_CMP_reg(Reg n, Imm<3> imm3, Imm<2> imm2, ShiftType type, Reg m); + bool thumb32_SUB_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m); + bool thumb32_RSB_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m); + + // thumb32 data processing (modified immediate) instructions + bool thumb32_TST_imm(Imm<1> i, Reg n, Imm<3> imm3, Imm<8> imm8); + bool thumb32_AND_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8); + bool thumb32_BIC_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8); + bool thumb32_MOV_imm(Imm<1> i, bool S, Imm<3> imm3, Reg d, Imm<8> imm8); + bool thumb32_ORR_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8); + bool thumb32_MVN_imm(Imm<1> i, bool S, Imm<3> imm3, Reg d, Imm<8> imm8); + bool thumb32_ORN_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8); + bool thumb32_TEQ_imm(Imm<1> i, Reg n, Imm<3> imm3, Imm<8> imm8); + bool thumb32_EOR_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8); + bool thumb32_CMN_imm(Imm<1> i, Reg n, Imm<3> imm3, Imm<8> imm8); + bool thumb32_ADD_imm_1(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8); + bool thumb32_ADC_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8); + bool thumb32_SBC_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8); + bool thumb32_CMP_imm(Imm<1> i, Reg n, Imm<3> imm3, Imm<8> imm8); + bool thumb32_SUB_imm_1(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8); + bool thumb32_RSB_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8); + + // thumb32 data processing (plain binary immediate) instructions. 
+ bool thumb32_ADR_t2(Imm<1> imm1, Imm<3> imm3, Reg d, Imm<8> imm8); + bool thumb32_ADR_t3(Imm<1> imm1, Imm<3> imm3, Reg d, Imm<8> imm8); + bool thumb32_ADD_imm_2(Imm<1> imm1, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8); + bool thumb32_BFC(Imm<3> imm3, Reg d, Imm<2> imm2, Imm<5> msb); + bool thumb32_BFI(Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, Imm<5> msb); + bool thumb32_MOVT(Imm<1> imm1, Imm<4> imm4, Imm<3> imm3, Reg d, Imm<8> imm8); + bool thumb32_MOVW_imm(Imm<1> imm1, Imm<4> imm4, Imm<3> imm3, Reg d, Imm<8> imm8); + bool thumb32_SBFX(Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, Imm<5> widthm1); + bool thumb32_SSAT(bool sh, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, Imm<5> sat_imm); + bool thumb32_SSAT16(Reg n, Reg d, Imm<4> sat_imm); + bool thumb32_SUB_imm_2(Imm<1> imm1, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8); + bool thumb32_UBFX(Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, Imm<5> widthm1); + bool thumb32_USAT(bool sh, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, Imm<5> sat_imm); + bool thumb32_USAT16(Reg n, Reg d, Imm<4> sat_imm); + + // thumb32 miscellaneous control instructions + bool thumb32_BXJ(Reg m); + bool thumb32_CLREX(); + bool thumb32_DMB(Imm<4> option); + bool thumb32_DSB(Imm<4> option); + bool thumb32_ISB(Imm<4> option); + bool thumb32_NOP(); + bool thumb32_SEV(); + bool thumb32_SEVL(); + bool thumb32_UDF(); + bool thumb32_WFE(); + bool thumb32_WFI(); + bool thumb32_YIELD(); + bool thumb32_MSR_reg(bool R, Reg n, Imm<4> mask); + bool thumb32_MRS_reg(bool R, Reg d); + + // thumb32 branch instructions + bool thumb32_BL_imm(Imm<1> S, Imm<10> hi, Imm<1> j1, Imm<1> j2, Imm<11> lo); + bool thumb32_BLX_imm(Imm<1> S, Imm<10> hi, Imm<1> j1, Imm<1> j2, Imm<11> lo); + bool thumb32_B(Imm<1> S, Imm<10> hi, Imm<1> j1, Imm<1> j2, Imm<11> lo); + bool thumb32_B_cond(Imm<1> S, Cond cond, Imm<6> hi, Imm<1> j1, Imm<1> j2, Imm<11> lo); + + // thumb32 store single data item instructions + bool thumb32_STRB_imm_1(Reg n, Reg t, bool P, bool U, Imm<8> imm8); + bool thumb32_STRB_imm_2(Reg n, Reg t, Imm<8> imm8); + bool thumb32_STRB_imm_3(Reg n, Reg t, Imm<12> imm12); + bool thumb32_STRBT(Reg n, Reg t, Imm<8> imm8); + bool thumb32_STRB(Reg n, Reg t, Imm<2> imm2, Reg m); + bool thumb32_STRH_imm_1(Reg n, Reg t, bool P, bool U, Imm<8> imm8); + bool thumb32_STRH_imm_2(Reg n, Reg t, Imm<8> imm8); + bool thumb32_STRH_imm_3(Reg n, Reg t, Imm<12> imm12); + bool thumb32_STRHT(Reg n, Reg t, Imm<8> imm8); + bool thumb32_STRH(Reg n, Reg t, Imm<2> imm2, Reg m); + bool thumb32_STR_imm_1(Reg n, Reg t, bool P, bool U, Imm<8> imm8); + bool thumb32_STR_imm_2(Reg n, Reg t, Imm<8> imm8); + bool thumb32_STR_imm_3(Reg n, Reg t, Imm<12> imm12); + bool thumb32_STRT(Reg n, Reg t, Imm<8> imm8); + bool thumb32_STR_reg(Reg n, Reg t, Imm<2> imm2, Reg m); + + // thumb32 load byte and memory hints + bool thumb32_PLD_lit(bool U, Imm<12> imm12); + bool thumb32_PLD_imm8(bool W, Reg n, Imm<8> imm8); + bool thumb32_PLD_imm12(bool W, Reg n, Imm<12> imm12); + bool thumb32_PLD_reg(bool W, Reg n, Imm<2> imm2, Reg m); + bool thumb32_PLI_lit(bool U, Imm<12> imm12); + bool thumb32_PLI_imm8(Reg n, Imm<8> imm8); + bool thumb32_PLI_imm12(Reg n, Imm<12> imm12); + bool thumb32_PLI_reg(Reg n, Imm<2> imm2, Reg m); + bool thumb32_LDRB_lit(bool U, Reg t, Imm<12> imm12); + bool thumb32_LDRB_reg(Reg n, Reg t, Imm<2> imm2, Reg m); + bool thumb32_LDRB_imm8(Reg n, Reg t, bool P, bool U, bool W, Imm<8> imm8); + bool thumb32_LDRB_imm12(Reg n, Reg t, Imm<12> imm12); + bool thumb32_LDRBT(Reg n, Reg t, Imm<8> imm8); + bool thumb32_LDRSB_lit(bool U, Reg t, Imm<12> imm12); + bool 
thumb32_LDRSB_reg(Reg n, Reg t, Imm<2> imm2, Reg m); + bool thumb32_LDRSB_imm8(Reg n, Reg t, bool P, bool U, bool W, Imm<8> imm8); + bool thumb32_LDRSB_imm12(Reg n, Reg t, Imm<12> imm12); + bool thumb32_LDRSBT(Reg n, Reg t, Imm<8> imm8); + + // thumb32 load halfword instructions + bool thumb32_LDRH_lit(bool U, Reg t, Imm<12> imm12); + bool thumb32_LDRH_reg(Reg n, Reg t, Imm<2> imm2, Reg m); + bool thumb32_LDRH_imm8(Reg n, Reg t, bool P, bool U, bool W, Imm<8> imm8); + bool thumb32_LDRH_imm12(Reg n, Reg t, Imm<12> imm12); + bool thumb32_LDRHT(Reg n, Reg t, Imm<8> imm8); + bool thumb32_LDRSH_lit(bool U, Reg t, Imm<12> imm12); + bool thumb32_LDRSH_reg(Reg n, Reg t, Imm<2> imm2, Reg m); + bool thumb32_LDRSH_imm8(Reg n, Reg t, bool P, bool U, bool W, Imm<8> imm8); + bool thumb32_LDRSH_imm12(Reg n, Reg t, Imm<12> imm12); + bool thumb32_LDRSHT(Reg n, Reg t, Imm<8> imm8); + + // thumb32 load word instructions + bool thumb32_LDR_lit(bool U, Reg t, Imm<12> imm12); + bool thumb32_LDR_reg(Reg n, Reg t, Imm<2> imm2, Reg m); + bool thumb32_LDR_imm8(Reg n, Reg t, bool P, bool U, bool W, Imm<8> imm8); + bool thumb32_LDR_imm12(Reg n, Reg t, Imm<12> imm12); + bool thumb32_LDRT(Reg n, Reg t, Imm<8> imm8); + + // thumb32 data processing (register) instructions + bool thumb32_ASR_reg(bool S, Reg m, Reg d, Reg s); + bool thumb32_LSL_reg(bool S, Reg m, Reg d, Reg s); + bool thumb32_LSR_reg(bool S, Reg m, Reg d, Reg s); + bool thumb32_ROR_reg(bool S, Reg m, Reg d, Reg s); + bool thumb32_SXTB(Reg d, SignExtendRotation rotate, Reg m); + bool thumb32_SXTB16(Reg d, SignExtendRotation rotate, Reg m); + bool thumb32_SXTAB(Reg n, Reg d, SignExtendRotation rotate, Reg m); + bool thumb32_SXTAB16(Reg n, Reg d, SignExtendRotation rotate, Reg m); + bool thumb32_SXTH(Reg d, SignExtendRotation rotate, Reg m); + bool thumb32_SXTAH(Reg n, Reg d, SignExtendRotation rotate, Reg m); + bool thumb32_UXTB(Reg d, SignExtendRotation rotate, Reg m); + bool thumb32_UXTB16(Reg d, SignExtendRotation rotate, Reg m); + bool thumb32_UXTAB(Reg n, Reg d, SignExtendRotation rotate, Reg m); + bool thumb32_UXTAB16(Reg n, Reg d, SignExtendRotation rotate, Reg m); + bool thumb32_UXTH(Reg d, SignExtendRotation rotate, Reg m); + bool thumb32_UXTAH(Reg n, Reg d, SignExtendRotation rotate, Reg m); + + // thumb32 long multiply, long multiply accumulate, and divide instructions + bool thumb32_SDIV(Reg n, Reg d, Reg m); + bool thumb32_SMLAL(Reg n, Reg dLo, Reg dHi, Reg m); + bool thumb32_SMLALD(Reg n, Reg dLo, Reg dHi, bool M, Reg m); + bool thumb32_SMLALXY(Reg n, Reg dLo, Reg dHi, bool N, bool M, Reg m); + bool thumb32_SMLSLD(Reg n, Reg dLo, Reg dHi, bool M, Reg m); + bool thumb32_SMULL(Reg n, Reg dLo, Reg dHi, Reg m); + bool thumb32_UDIV(Reg n, Reg d, Reg m); + bool thumb32_UMAAL(Reg n, Reg dLo, Reg dHi, Reg m); + bool thumb32_UMLAL(Reg n, Reg dLo, Reg dHi, Reg m); + bool thumb32_UMULL(Reg n, Reg dLo, Reg dHi, Reg m); + + // thumb32 miscellaneous instructions + bool thumb32_CLZ(Reg n, Reg d, Reg m); + bool thumb32_QADD(Reg n, Reg d, Reg m); + bool thumb32_QDADD(Reg n, Reg d, Reg m); + bool thumb32_QDSUB(Reg n, Reg d, Reg m); + bool thumb32_QSUB(Reg n, Reg d, Reg m); + bool thumb32_RBIT(Reg n, Reg d, Reg m); + bool thumb32_REV(Reg n, Reg d, Reg m); + bool thumb32_REV16(Reg n, Reg d, Reg m); + bool thumb32_REVSH(Reg n, Reg d, Reg m); + bool thumb32_SEL(Reg n, Reg d, Reg m); + + // thumb32 multiply instructions + bool thumb32_MLA(Reg n, Reg a, Reg d, Reg m); + bool thumb32_MLS(Reg n, Reg a, Reg d, Reg m); + bool thumb32_MUL(Reg n, Reg d, Reg m); + bool 
thumb32_SMLAD(Reg n, Reg a, Reg d, bool X, Reg m); + bool thumb32_SMLAXY(Reg n, Reg a, Reg d, bool N, bool M, Reg m); + bool thumb32_SMLAWY(Reg n, Reg a, Reg d, bool M, Reg m); + bool thumb32_SMLSD(Reg n, Reg a, Reg d, bool X, Reg m); + bool thumb32_SMMLA(Reg n, Reg a, Reg d, bool R, Reg m); + bool thumb32_SMMLS(Reg n, Reg a, Reg d, bool R, Reg m); + bool thumb32_SMMUL(Reg n, Reg d, bool R, Reg m); + bool thumb32_SMUAD(Reg n, Reg d, bool M, Reg m); + bool thumb32_SMUSD(Reg n, Reg d, bool M, Reg m); + bool thumb32_SMULXY(Reg n, Reg d, bool N, bool M, Reg m); + bool thumb32_SMULWY(Reg n, Reg d, bool M, Reg m); + bool thumb32_USAD8(Reg n, Reg d, Reg m); + bool thumb32_USADA8(Reg n, Reg a, Reg d, Reg m); + + // thumb32 parallel add/sub instructions + bool thumb32_SADD8(Reg n, Reg d, Reg m); + bool thumb32_SADD16(Reg n, Reg d, Reg m); + bool thumb32_SASX(Reg n, Reg d, Reg m); + bool thumb32_SSAX(Reg n, Reg d, Reg m); + bool thumb32_SSUB8(Reg n, Reg d, Reg m); + bool thumb32_SSUB16(Reg n, Reg d, Reg m); + bool thumb32_UADD8(Reg n, Reg d, Reg m); + bool thumb32_UADD16(Reg n, Reg d, Reg m); + bool thumb32_UASX(Reg n, Reg d, Reg m); + bool thumb32_USAX(Reg n, Reg d, Reg m); + bool thumb32_USUB8(Reg n, Reg d, Reg m); + bool thumb32_USUB16(Reg n, Reg d, Reg m); + + bool thumb32_QADD8(Reg n, Reg d, Reg m); + bool thumb32_QADD16(Reg n, Reg d, Reg m); + bool thumb32_QASX(Reg n, Reg d, Reg m); + bool thumb32_QSAX(Reg n, Reg d, Reg m); + bool thumb32_QSUB8(Reg n, Reg d, Reg m); + bool thumb32_QSUB16(Reg n, Reg d, Reg m); + bool thumb32_UQADD8(Reg n, Reg d, Reg m); + bool thumb32_UQADD16(Reg n, Reg d, Reg m); + bool thumb32_UQASX(Reg n, Reg d, Reg m); + bool thumb32_UQSAX(Reg n, Reg d, Reg m); + bool thumb32_UQSUB8(Reg n, Reg d, Reg m); + bool thumb32_UQSUB16(Reg n, Reg d, Reg m); + + bool thumb32_SHADD8(Reg n, Reg d, Reg m); + bool thumb32_SHADD16(Reg n, Reg d, Reg m); + bool thumb32_SHASX(Reg n, Reg d, Reg m); + bool thumb32_SHSAX(Reg n, Reg d, Reg m); + bool thumb32_SHSUB8(Reg n, Reg d, Reg m); + bool thumb32_SHSUB16(Reg n, Reg d, Reg m); + bool thumb32_UHADD8(Reg n, Reg d, Reg m); + bool thumb32_UHADD16(Reg n, Reg d, Reg m); + bool thumb32_UHASX(Reg n, Reg d, Reg m); + bool thumb32_UHSAX(Reg n, Reg d, Reg m); + bool thumb32_UHSUB8(Reg n, Reg d, Reg m); + bool thumb32_UHSUB16(Reg n, Reg d, Reg m); + + // thumb32 coprocessor instructions + bool thumb32_MCRR(bool two, Reg t2, Reg t, size_t coproc_no, size_t opc, CoprocReg CRm); + bool thumb32_MRRC(bool two, Reg t2, Reg t, size_t coproc_no, size_t opc, CoprocReg CRm); + bool thumb32_STC(bool two, bool p, bool u, bool d, bool w, Reg n, CoprocReg CRd, size_t coproc_no, Imm<8> imm8); + bool thumb32_LDC(bool two, bool p, bool u, bool d, bool w, Reg n, CoprocReg CRd, size_t coproc_no, Imm<8> imm8); + bool thumb32_CDP(bool two, size_t opc1, CoprocReg CRn, CoprocReg CRd, size_t coproc_no, size_t opc2, CoprocReg CRm); + bool thumb32_MCR(bool two, size_t opc1, CoprocReg CRn, Reg t, size_t coproc_no, size_t opc2, CoprocReg CRm); + bool thumb32_MRC(bool two, size_t opc1, CoprocReg CRn, Reg t, size_t coproc_no, size_t opc2, CoprocReg CRm); + + // Floating-point three-register data processing instructions + bool vfp_VADD(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm); + bool vfp_VSUB(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm); + bool vfp_VMUL(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm); + bool vfp_VMLA(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t
Vm); + bool vfp_VMLS(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm); + bool vfp_VNMUL(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm); + bool vfp_VNMLA(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm); + bool vfp_VNMLS(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm); + bool vfp_VDIV(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm); + bool vfp_VFNMS(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm); + bool vfp_VFNMA(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm); + bool vfp_VFMA(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm); + bool vfp_VFMS(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm); + bool vfp_VSEL(bool D, Imm<2> cc, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm); + bool vfp_VMAXNM(bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm); + bool vfp_VMINNM(bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm); + + // Floating-point move instructions + bool vfp_VMOV_u32_f64(Cond cond, size_t Vd, Reg t, bool D); + bool vfp_VMOV_f64_u32(Cond cond, size_t Vn, Reg t, bool N); + bool vfp_VMOV_u32_f32(Cond cond, size_t Vn, Reg t, bool N); + bool vfp_VMOV_f32_u32(Cond cond, size_t Vn, Reg t, bool N); + bool vfp_VMOV_2u32_2f32(Cond cond, Reg t2, Reg t, bool M, size_t Vm); + bool vfp_VMOV_2f32_2u32(Cond cond, Reg t2, Reg t, bool M, size_t Vm); + bool vfp_VMOV_2u32_f64(Cond cond, Reg t2, Reg t, bool M, size_t Vm); + bool vfp_VMOV_f64_2u32(Cond cond, Reg t2, Reg t, bool M, size_t Vm); + bool vfp_VMOV_from_i32(Cond cond, Imm<1> i, size_t Vd, Reg t, bool D); + bool vfp_VMOV_from_i16(Cond cond, Imm<1> i1, size_t Vd, Reg t, bool D, Imm<1> i2); + bool vfp_VMOV_from_i8(Cond cond, Imm<1> i1, size_t Vd, Reg t, bool D, Imm<2> i2); + bool vfp_VMOV_to_i32(Cond cond, Imm<1> i, size_t Vn, Reg t, bool N); + bool vfp_VMOV_to_i16(Cond cond, bool U, Imm<1> i1, size_t Vn, Reg t, bool N, Imm<1> i2); + bool vfp_VMOV_to_i8(Cond cond, bool U, Imm<1> i1, size_t Vn, Reg t, bool N, Imm<2> i2); + bool vfp_VDUP(Cond cond, Imm<1> B, bool Q, size_t Vd, Reg t, bool D, Imm<1> E); + bool vfp_VMOV_imm(Cond cond, bool D, Imm<4> imm4H, size_t Vd, bool sz, Imm<4> imm4L); + bool vfp_VMOV_reg(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm); + + // Floating-point misc instructions + bool vfp_VABS(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm); + bool vfp_VNEG(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm); + bool vfp_VSQRT(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm); + bool vfp_VCVTB(Cond cond, bool D, bool op, size_t Vd, bool sz, bool M, size_t Vm); + bool vfp_VCVTT(Cond cond, bool D, bool op, size_t Vd, bool sz, bool M, size_t Vm); + bool vfp_VCMP(Cond cond, bool D, size_t Vd, bool sz, bool E, bool M, size_t Vm); + bool vfp_VCMP_zero(Cond cond, bool D, size_t Vd, bool sz, bool E); + bool vfp_VRINTR(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm); + bool vfp_VRINTZ(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm); + bool vfp_VRINTX(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm); + bool vfp_VCVT_f_to_f(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm); + bool vfp_VCVT_from_int(Cond cond, bool D, size_t Vd, bool sz, bool is_signed, bool M, size_t Vm); + bool vfp_VCVT_from_fixed(Cond cond, bool D, bool U, size_t Vd, bool sz, bool sx, Imm<1> i, Imm<4> imm4); + bool 
vfp_VCVT_to_u32(Cond cond, bool D, size_t Vd, bool sz, bool round_towards_zero, bool M, size_t Vm); + bool vfp_VCVT_to_s32(Cond cond, bool D, size_t Vd, bool sz, bool round_towards_zero, bool M, size_t Vm); + bool vfp_VCVT_to_fixed(Cond cond, bool D, bool U, size_t Vd, bool sz, bool sx, Imm<1> i, Imm<4> imm4); + bool vfp_VRINT_rm(bool D, size_t rm, size_t Vd, bool sz, bool M, size_t Vm); + bool vfp_VCVT_rm(bool D, size_t rm, size_t Vd, bool sz, bool U, bool M, size_t Vm); + + // Floating-point system register access + bool vfp_VMSR(Cond cond, Reg t); + bool vfp_VMRS(Cond cond, Reg t); + + // Floating-point load-store instructions + bool vfp_VLDR(Cond cond, bool U, bool D, Reg n, size_t Vd, bool sz, Imm<8> imm8); + bool vfp_VSTR(Cond cond, bool U, bool D, Reg n, size_t Vd, bool sz, Imm<8> imm8); + bool vfp_VPOP(Cond cond, bool D, size_t Vd, bool sz, Imm<8> imm8); + bool vfp_VPUSH(Cond cond, bool D, size_t Vd, bool sz, Imm<8> imm8); + bool vfp_VSTM_a1(Cond cond, bool p, bool u, bool D, bool w, Reg n, size_t Vd, Imm<8> imm8); + bool vfp_VSTM_a2(Cond cond, bool p, bool u, bool D, bool w, Reg n, size_t Vd, Imm<8> imm8); + bool vfp_VLDM_a1(Cond cond, bool p, bool u, bool D, bool w, Reg n, size_t Vd, Imm<8> imm8); + bool vfp_VLDM_a2(Cond cond, bool p, bool u, bool D, bool w, Reg n, size_t Vd, Imm<8> imm8); + + // Advanced SIMD one register, modified immediate + bool asimd_VMOV_imm(Imm<1> a, bool D, Imm<1> b, Imm<1> c, Imm<1> d, size_t Vd, Imm<4> cmode, bool Q, bool op, Imm<1> e, Imm<1> f, Imm<1> g, Imm<1> h); + + // Advanced SIMD three register with same length + bool asimd_VHADD(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VQADD(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VRHADD(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VAND_reg(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VBIC_reg(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VORR_reg(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VORN_reg(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VEOR_reg(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VBSL(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VBIT(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VBIF(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VHSUB(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VQSUB(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VCGT_reg(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VCGE_reg(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VABD(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VABA(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VADD_int(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VSUB_int(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VSHL_reg(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VQSHL_reg(bool U, bool 
D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VRSHL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VMAX(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, bool op, size_t Vm); + bool asimd_VTST(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VCEQ_reg(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VMLA(bool op, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VMUL(bool P, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VPMAX_int(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, bool op, size_t Vm); + bool v8_VMAXNM(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool v8_VMINNM(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VQDMULH(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VQRDMULH(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VPADD(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VFMA(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VFMS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VADD_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VSUB_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VPADD_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VABD_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VMLA_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VMLS_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VMUL_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VCEQ_reg_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VCGE_reg_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VCGT_reg_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VACGE(bool D, bool op, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VMAX_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VMIN_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VPMAX_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VPMIN_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VRECPS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool asimd_VRSQRTS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool v8_SHA256H(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool v8_SHA256H2(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + bool v8_SHA256SU1(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); + + // Advanced SIMD three registers with different lengths + bool asimd_VADDL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool 
op, bool N, bool M, size_t Vm); + bool asimd_VSUBL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool N, bool M, size_t Vm); + bool asimd_VABAL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm); + bool asimd_VABDL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm); + bool asimd_VMLAL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool N, bool M, size_t Vm); + bool asimd_VMULL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool P, bool N, bool M, size_t Vm); + + // Advanced SIMD two registers and a scalar + bool asimd_VMLA_scalar(bool Q, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool F, bool N, bool M, size_t Vm); + bool asimd_VMLAL_scalar(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool N, bool M, size_t Vm); + bool asimd_VMUL_scalar(bool Q, bool D, size_t sz, size_t Vn, size_t Vd, bool F, bool N, bool M, size_t Vm); + bool asimd_VMULL_scalar(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm); + bool asimd_VQDMULL_scalar(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm); + bool asimd_VQDMULH_scalar(bool Q, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm); + bool asimd_VQRDMULH_scalar(bool Q, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm); + + // Two registers and a shift amount + bool asimd_SHR(bool U, bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm); + bool asimd_SRA(bool U, bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm); + bool asimd_VRSHR(bool U, bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm); + bool asimd_VRSRA(bool U, bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm); + bool asimd_VSRI(bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm); + bool asimd_VSHL(bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm); + bool asimd_VSLI(bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm); + bool asimd_VQSHL(bool U, bool D, size_t imm6, size_t Vd, bool op, bool L, bool Q, bool M, size_t Vm); + bool asimd_VSHRN(bool D, size_t imm6, size_t Vd, bool M, size_t Vm); + bool asimd_VRSHRN(bool D, size_t imm6, size_t Vd, bool M, size_t Vm); + bool asimd_VQSHRUN(bool D, size_t imm6, size_t Vd, bool M, size_t Vm); + bool asimd_VQRSHRUN(bool D, size_t imm6, size_t Vd, bool M, size_t Vm); + bool asimd_VQSHRN(bool U, bool D, size_t imm6, size_t Vd, bool M, size_t Vm); + bool asimd_VQRSHRN(bool U, bool D, size_t imm6, size_t Vd, bool M, size_t Vm); + bool asimd_VSHLL(bool U, bool D, size_t imm6, size_t Vd, bool M, size_t Vm); + bool asimd_VCVT_fixed(bool U, bool D, size_t imm6, size_t Vd, bool to_fixed, bool Q, bool M, size_t Vm); + + // Advanced SIMD two register, miscellaneous + bool asimd_VREV(bool D, size_t sz, size_t Vd, size_t op, bool Q, bool M, size_t Vm); + bool asimd_VPADDL(bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm); + bool v8_AESD(bool D, size_t sz, size_t Vd, bool M, size_t Vm); + bool v8_AESE(bool D, size_t sz, size_t Vd, bool M, size_t Vm); + bool v8_AESIMC(bool D, size_t sz, size_t Vd, bool M, size_t Vm); + bool v8_AESMC(bool D, size_t sz, size_t Vd, bool M, size_t Vm); + bool v8_SHA256SU0(bool D, size_t sz, size_t Vd, bool M, size_t Vm); + bool asimd_VCLS(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm); + bool asimd_VCLZ(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm); + bool asimd_VCNT(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm); + bool 
asimd_VMVN_reg(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm); + bool asimd_VPADAL(bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm); + bool asimd_VQABS(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm); + bool asimd_VQNEG(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm); + bool asimd_VCGT_zero(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm); + bool asimd_VCGE_zero(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm); + bool asimd_VCEQ_zero(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm); + bool asimd_VCLE_zero(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm); + bool asimd_VCLT_zero(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm); + bool asimd_VABS(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm); + bool asimd_VNEG(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm); + bool asimd_VSWP(bool D, size_t Vd, bool Q, bool M, size_t Vm); + bool asimd_VTRN(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm); + bool asimd_VUZP(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm); + bool asimd_VZIP(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm); + bool asimd_VMOVN(bool D, size_t sz, size_t Vd, bool M, size_t Vm); + bool asimd_VQMOVUN(bool D, size_t sz, size_t Vd, bool M, size_t Vm); + bool asimd_VQMOVN(bool D, size_t sz, size_t Vd, bool op, bool M, size_t Vm); + bool asimd_VSHLL_max(bool D, size_t sz, size_t Vd, bool M, size_t Vm); + bool v8_VRINTN(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm); + bool v8_VRINTX(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm); + bool v8_VRINTA(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm); + bool v8_VRINTZ(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm); + bool v8_VRINTM(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm); + bool v8_VRINTP(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm); + bool asimd_VCVT_half(bool D, size_t sz, size_t Vd, bool op, bool M, size_t Vm); + bool v8_VCVTA(bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm); + bool v8_VCVTN(bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm); + bool v8_VCVTP(bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm); + bool v8_VCVTM(bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm); + bool asimd_VRECPE(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm); + bool asimd_VRSQRTE(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm); + bool asimd_VCVT_integer(bool D, size_t sz, size_t Vd, bool op, bool U, bool Q, bool M, size_t Vm); + + // Advanced SIMD miscellaneous + bool asimd_VEXT(bool D, size_t Vn, size_t Vd, Imm<4> imm4, bool N, bool Q, bool M, size_t Vm); + bool asimd_VTBL(bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm); + bool asimd_VTBX(bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm); + bool asimd_VDUP_scalar(bool D, Imm<4> imm4, size_t Vd, bool Q, bool M, size_t Vm); + + // Advanced SIMD load/store structures + bool v8_VST_multiple(bool D, Reg n, size_t Vd, Imm<4> type, size_t sz, size_t align, Reg m); + bool v8_VLD_multiple(bool D, Reg n, size_t Vd, Imm<4> type, size_t sz, size_t align, Reg m); + bool v8_VLD_all_lanes(bool D, Reg n, size_t Vd, size_t nn, size_t sz, bool T, bool a, Reg m); + bool v8_VST_single(bool D, Reg n, size_t Vd, size_t sz, size_t nn, size_t index_align, Reg m); + bool v8_VLD_single(bool D, Reg n, size_t Vd, size_t sz, size_t nn, size_t 
index_align, Reg m); +}; + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_load_store_structures.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_load_store_structures.cpp new file mode 100644 index 0000000000..d444e023e9 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_load_store_structures.cpp @@ -0,0 +1,375 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2020 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <optional> +#include <tuple> + +#include <mcl/bit/bit_field.hpp> + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +namespace { + +std::optional<std::tuple<size_t, size_t, size_t>> DecodeType(Imm<4> type, size_t size, size_t align) { + switch (type.ZeroExtend()) { + case 0b0111: // VST1 A1 / VLD1 A1 + if (mcl::bit::get_bit<1>(align)) { + return std::nullopt; + } + return std::tuple<size_t, size_t, size_t>{1, 1, 0}; + case 0b1010: // VST1 A2 / VLD1 A2 + if (align == 0b11) { + return std::nullopt; + } + return std::tuple<size_t, size_t, size_t>{1, 2, 0}; + case 0b0110: // VST1 A3 / VLD1 A3 + if (mcl::bit::get_bit<1>(align)) { + return std::nullopt; + } + return std::tuple<size_t, size_t, size_t>{1, 3, 0}; + case 0b0010: // VST1 A4 / VLD1 A4 + return std::tuple<size_t, size_t, size_t>{1, 4, 0}; + case 0b1000: // VST2 A1 / VLD2 A1 + if (size == 0b11 || align == 0b11) { + return std::nullopt; + } + return std::tuple<size_t, size_t, size_t>{2, 1, 1}; + case 0b1001: // VST2 A1 / VLD2 A1 + if (size == 0b11 || align == 0b11) { + return std::nullopt; + } + return std::tuple<size_t, size_t, size_t>{2, 1, 2}; + case 0b0011: // VST2 A2 / VLD2 A2 + if (size == 0b11) { + return std::nullopt; + } + return std::tuple<size_t, size_t, size_t>{2, 2, 2}; + case 0b0100: // VST3 / VLD3 + if (size == 0b11 || mcl::bit::get_bit<1>(align)) { + return std::nullopt; + } + return std::tuple<size_t, size_t, size_t>{3, 1, 1}; + case 0b0101: // VST3 / VLD3 + if (size == 0b11 || mcl::bit::get_bit<1>(align)) { + return std::nullopt; + } + return std::tuple<size_t, size_t, size_t>{3, 1, 2}; + case 0b0000: // VST4 / VLD4 + if (size == 0b11) { + return std::nullopt; + } + return std::tuple<size_t, size_t, size_t>{4, 1, 1}; + case 0b0001: // VST4 / VLD4 + if (size == 0b11) { + return std::nullopt; + } + return std::tuple<size_t, size_t, size_t>{4, 1, 2}; + } + ASSERT_FALSE("Decode error"); +} +} // namespace + +bool TranslatorVisitor::v8_VST_multiple(bool D, Reg n, size_t Vd, Imm<4> type, size_t size, size_t align, Reg m) { + if (type == 0b1011 || type.Bits<2, 3>() == 0b11) { + return DecodeError(); + } + + const auto decoded_type = DecodeType(type, size, align); + if (!decoded_type) { + return UndefinedInstruction(); + } + const auto [nelem, regs, inc] = *decoded_type; + + const ExtReg d = ToExtRegD(Vd, D); + const size_t d_last = RegNumber(d) + inc * (nelem - 1); + if (n == Reg::R15 || d_last + regs > 32) { + return UnpredictableInstruction(); + } + + [[maybe_unused]] const size_t alignment = align == 0 ? 
1 : 4 << align; + const size_t ebytes = static_cast<size_t>(1) << size; + const size_t elements = 8 / ebytes; + + const bool wback = m != Reg::R15; + const bool register_index = m != Reg::R15 && m != Reg::R13; + + IR::U32 address = ir.GetRegister(n); + for (size_t r = 0; r < regs; r++) { + for (size_t e = 0; e < elements; e++) { + for (size_t i = 0; i < nelem; i++) { + const ExtReg ext_reg = d + i * inc + r; + const IR::U64 shifted_element = ir.LogicalShiftRight(ir.GetExtendedRegister(ext_reg), ir.Imm8(static_cast<u8>(e * ebytes * 8))); + const IR::UAny element = ir.LeastSignificant(8 * ebytes, shifted_element); + ir.WriteMemory(8 * ebytes, address, element, IR::AccType::NORMAL); + + address = ir.Add(address, ir.Imm32(static_cast<u32>(ebytes))); + } + } + } + + if (wback) { + if (register_index) { + ir.SetRegister(n, ir.Add(ir.GetRegister(n), ir.GetRegister(m))); + } else { + ir.SetRegister(n, ir.Add(ir.GetRegister(n), ir.Imm32(static_cast<u32>(8 * nelem * regs)))); + } + } + + return true; +} + +bool TranslatorVisitor::v8_VLD_multiple(bool D, Reg n, size_t Vd, Imm<4> type, size_t size, size_t align, Reg m) { + if (type == 0b1011 || type.Bits<2, 3>() == 0b11) { + return DecodeError(); + } + + const auto decoded_type = DecodeType(type, size, align); + if (!decoded_type) { + return UndefinedInstruction(); + } + const auto [nelem, regs, inc] = *decoded_type; + + const ExtReg d = ToExtRegD(Vd, D); + const size_t d_last = RegNumber(d) + inc * (nelem - 1); + if (n == Reg::R15 || d_last + regs > 32) { + return UnpredictableInstruction(); + } + + [[maybe_unused]] const size_t alignment = align == 0 ? 1 : 4 << align; + const size_t ebytes = static_cast<size_t>(1) << size; + const size_t elements = 8 / ebytes; + + const bool wback = m != Reg::R15; + const bool register_index = m != Reg::R15 && m != Reg::R13; + + for (size_t r = 0; r < regs; r++) { + for (size_t i = 0; i < nelem; i++) { + const ExtReg ext_reg = d + i * inc + r; + ir.SetExtendedRegister(ext_reg, ir.Imm64(0)); + } + } + + IR::U32 address = ir.GetRegister(n); + for (size_t r = 0; r < regs; r++) { + for (size_t e = 0; e < elements; e++) { + for (size_t i = 0; i < nelem; i++) { + const IR::U64 element = ir.ZeroExtendToLong(ir.ReadMemory(ebytes * 8, address, IR::AccType::NORMAL)); + const IR::U64 shifted_element = ir.LogicalShiftLeft(element, ir.Imm8(static_cast<u8>(e * ebytes * 8))); + + const ExtReg ext_reg = d + i * inc + r; + ir.SetExtendedRegister(ext_reg, ir.Or(ir.GetExtendedRegister(ext_reg), shifted_element)); + + address = ir.Add(address, ir.Imm32(static_cast<u32>(ebytes))); + } + } + } + + if (wback) { + if (register_index) { + ir.SetRegister(n, ir.Add(ir.GetRegister(n), ir.GetRegister(m))); + } else { + ir.SetRegister(n, ir.Add(ir.GetRegister(n), ir.Imm32(static_cast<u32>(8 * nelem * regs)))); + } + } + + return true; +} + +bool TranslatorVisitor::v8_VLD_all_lanes(bool D, Reg n, size_t Vd, size_t nn, size_t sz, bool T, bool a, Reg m) { + const size_t nelem = nn + 1; + + if (nelem == 1 && (sz == 0b11 || (sz == 0b00 && a))) { + return UndefinedInstruction(); + } + if (nelem == 2 && sz == 0b11) { + return UndefinedInstruction(); + } + if (nelem == 3 && (sz == 0b11 || a)) { + return UndefinedInstruction(); + } + if (nelem == 4 && (sz == 0b11 && !a)) { + return UndefinedInstruction(); + } + + const size_t ebytes = sz == 0b11 ? 4 : (1 << sz); + const size_t inc = T ? 2 : 1; + const size_t regs = nelem == 1 ? 
inc : 1; + [[maybe_unused]] const size_t alignment = [&]() -> size_t { + if (a && nelem == 1) { + return ebytes; + } + if (a && nelem == 2) { + return ebytes * 2; + } + if (a && nelem == 4) { + return sz >= 0b10 ? 2 * ebytes : 4 * ebytes; + } + return 1; + }(); + + const ExtReg d = ToExtRegD(Vd, D); + const size_t d_last = RegNumber(d) + inc * (nelem - 1); + if (n == Reg::R15 || d_last + regs > 32) { + return UnpredictableInstruction(); + } + + const bool wback = m != Reg::R15; + const bool register_index = m != Reg::R15 && m != Reg::R13; + + auto address = ir.GetRegister(n); + for (size_t i = 0; i < nelem; i++) { + const auto element = ir.ReadMemory(ebytes * 8, address, IR::AccType::NORMAL); + const auto replicated_element = ir.VectorBroadcast(ebytes * 8, element); + + for (size_t r = 0; r < regs; r++) { + const ExtReg ext_reg = d + i * inc + r; + ir.SetVector(ext_reg, replicated_element); + } + + address = ir.Add(address, ir.Imm32(static_cast<u32>(ebytes))); + } + + if (wback) { + if (register_index) { + ir.SetRegister(n, ir.Add(ir.GetRegister(n), ir.GetRegister(m))); + } else { + ir.SetRegister(n, ir.Add(ir.GetRegister(n), ir.Imm32(static_cast<u32>(nelem * ebytes)))); + } + } + + return true; +} + +bool TranslatorVisitor::v8_VST_single(bool D, Reg n, size_t Vd, size_t sz, size_t nn, size_t index_align, Reg m) { + const size_t nelem = nn + 1; + + if (sz == 0b11) { + return DecodeError(); + } + + if (nelem == 1 && mcl::bit::get_bit(sz, index_align)) { + return UndefinedInstruction(); + } + + const size_t ebytes = size_t(1) << sz; + const size_t index = mcl::bit::get_bits(sz + 1, 3, index_align); + const size_t inc = (sz != 0 && mcl::bit::get_bit(sz, index_align)) ? 2 : 1; + const size_t a = mcl::bit::get_bits(0, sz ? sz - 1 : 0, index_align); + + if (nelem == 1 && inc == 2) { + return UndefinedInstruction(); + } + if (nelem == 1 && sz == 2 && (a != 0b00 && a != 0b11)) { + return UndefinedInstruction(); + } + if (nelem == 2 && mcl::bit::get_bit<1>(a)) { + return UndefinedInstruction(); + } + if (nelem == 3 && a != 0b00) { + return UndefinedInstruction(); + } + if (nelem == 4 && a == 0b11) { + return UndefinedInstruction(); + } + + // TODO: alignment + + const ExtReg d = ToExtRegD(Vd, D); + const size_t d_last = RegNumber(d) + inc * (nelem - 1); + if (n == Reg::R15 || d_last + 1 > 32) { + return UnpredictableInstruction(); + } + + const bool wback = m != Reg::R15; + const bool register_index = m != Reg::R15 && m != Reg::R13; + + auto address = ir.GetRegister(n); + for (size_t i = 0; i < nelem; i++) { + const ExtReg ext_reg = d + i * inc; + const auto element = ir.VectorGetElement(ebytes * 8, ir.GetVector(ext_reg), index); + + ir.WriteMemory(ebytes * 8, address, element, IR::AccType::NORMAL); + + address = ir.Add(address, ir.Imm32(static_cast<u32>(ebytes))); + } + + if (wback) { + if (register_index) { + ir.SetRegister(n, ir.Add(ir.GetRegister(n), ir.GetRegister(m))); + } else { + ir.SetRegister(n, ir.Add(ir.GetRegister(n), ir.Imm32(static_cast<u32>(nelem * ebytes)))); + } + } + + return true; +} + +bool TranslatorVisitor::v8_VLD_single(bool D, Reg n, size_t Vd, size_t sz, size_t nn, size_t index_align, Reg m) { + const size_t nelem = nn + 1; + + if (sz == 0b11) { + return DecodeError(); + } + + if (nelem == 1 && mcl::bit::get_bit(sz, index_align)) { + return UndefinedInstruction(); + } + + const size_t ebytes = size_t(1) << sz; + const size_t index = mcl::bit::get_bits(sz + 1, 3, index_align); + const size_t inc = (sz != 0 && mcl::bit::get_bit(sz, index_align)) ? 
2 : 1; + const size_t a = mcl::bit::get_bits(0, sz ? sz - 1 : 0, index_align); + + if (nelem == 1 && inc == 2) { + return UndefinedInstruction(); + } + if (nelem == 1 && sz == 2 && (a != 0b00 && a != 0b11)) { + return UndefinedInstruction(); + } + if (nelem == 2 && mcl::bit::get_bit<1>(a)) { + return UndefinedInstruction(); + } + if (nelem == 3 && a != 0b00) { + return UndefinedInstruction(); + } + if (nelem == 4 && a == 0b11) { + return UndefinedInstruction(); + } + + // TODO: alignment + + const ExtReg d = ToExtRegD(Vd, D); + const size_t d_last = RegNumber(d) + inc * (nelem - 1); + if (n == Reg::R15 || d_last + 1 > 32) { + return UnpredictableInstruction(); + } + + const bool wback = m != Reg::R15; + const bool register_index = m != Reg::R15 && m != Reg::R13; + + auto address = ir.GetRegister(n); + for (size_t i = 0; i < nelem; i++) { + const auto element = ir.ReadMemory(ebytes * 8, address, IR::AccType::NORMAL); + + const ExtReg ext_reg = d + i * inc; + const auto new_reg = ir.VectorSetElement(ebytes * 8, ir.GetVector(ext_reg), index, element); + ir.SetVector(ext_reg, new_reg); + + address = ir.Add(address, ir.Imm32(static_cast<u32>(ebytes))); + } + + if (wback) { + if (register_index) { + ir.SetRegister(n, ir.Add(ir.GetRegister(n), ir.GetRegister(m))); + } else { + ir.SetRegister(n, ir.Add(ir.GetRegister(n), ir.Imm32(static_cast<u32>(nelem * ebytes)))); + } + } + + return true; +} +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_misc.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_misc.cpp new file mode 100644 index 0000000000..e30c438eff --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_misc.cpp @@ -0,0 +1,93 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2020 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <vector> + +#include <mcl/assert.hpp> +#include <mcl/bit/bit_count.hpp> +#include <mcl/bit/bit_field.hpp> + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +static bool TableLookup(TranslatorVisitor& v, bool is_vtbl, bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm) { + const size_t length = len + 1; + const auto d = ToVector(false, Vd, D); + const auto m = ToVector(false, Vm, M); + const auto n = ToVector(false, Vn, N); + + if (RegNumber(n) + length > 32) { + return v.UnpredictableInstruction(); + } + + const IR::Table table = v.ir.VectorTable([&] { + std::vector<IR::U64> result; + for (size_t i = 0; i < length; ++i) { + result.emplace_back(v.ir.GetExtendedRegister(n + i)); + } + return result; + }()); + const IR::U64 indicies = v.ir.GetExtendedRegister(m); + const IR::U64 defaults = is_vtbl ? v.ir.Imm64(0) : IR::U64{v.ir.GetExtendedRegister(d)}; + const IR::U64 result = v.ir.VectorTableLookup(defaults, table, indicies); + + v.ir.SetExtendedRegister(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VEXT(bool D, size_t Vn, size_t Vd, Imm<4> imm4, bool N, bool Q, bool M, size_t Vm) { + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + if (!Q && imm4.Bit<3>()) { + return UndefinedInstruction(); + } + + const u8 position = 8 * imm4.ZeroExtend<u8>(); + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto n = ToVector(Q, Vn, N); + + const auto reg_n = ir.GetVector(n); + const auto reg_m = ir.GetVector(m); + const auto result = Q ? 
ir.VectorExtract(reg_n, reg_m, position) : ir.VectorExtractLower(reg_n, reg_m, position); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VTBL(bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm) { + return TableLookup(*this, true, D, Vn, Vd, len, N, M, Vm); +} + +bool TranslatorVisitor::asimd_VTBX(bool D, size_t Vn, size_t Vd, size_t len, bool N, bool M, size_t Vm) { + return TableLookup(*this, false, D, Vn, Vd, len, N, M, Vm); +} + +bool TranslatorVisitor::asimd_VDUP_scalar(bool D, Imm<4> imm4, size_t Vd, bool Q, bool M, size_t Vm) { + if (Q && mcl::bit::get_bit<0>(Vd)) { + return UndefinedInstruction(); + } + + if (imm4.Bits<0, 2>() == 0b000) { + return UndefinedInstruction(); + } + + const size_t imm4_lsb = mcl::bit::lowest_set_bit(imm4.ZeroExtend()); + const size_t esize = 8u << imm4_lsb; + const size_t index = imm4.ZeroExtend() >> (imm4_lsb + 1); + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(false, Vm, M); + + const auto reg_m = ir.GetVector(m); + const auto result = ir.VectorBroadcastElement(esize, reg_m, index); + + ir.SetVector(d, result); + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_one_reg_modified_immediate.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_one_reg_modified_immediate.cpp new file mode 100644 index 0000000000..94e3e841e8 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_one_reg_modified_immediate.cpp @@ -0,0 +1,113 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2020 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/assert.hpp> +#include <mcl/bit/bit_field.hpp> + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +bool TranslatorVisitor::asimd_VMOV_imm(Imm<1> a, bool D, Imm<1> b, Imm<1> c, Imm<1> d, size_t Vd, Imm<4> cmode, bool Q, bool op, Imm<1> e, Imm<1> f, Imm<1> g, Imm<1> h) { + if (Q && mcl::bit::get_bit<0>(Vd)) { + return UndefinedInstruction(); + } + + const auto d_reg = ToVector(Q, Vd, D); + const auto imm = AdvSIMDExpandImm(op, cmode, concatenate(a, b, c, d, e, f, g, h)); + + // VMOV + const auto mov = [&] { + const auto imm64 = ir.Imm64(imm); + if (Q) { + ir.SetVector(d_reg, ir.VectorBroadcast(64, imm64)); + } else { + ir.SetExtendedRegister(d_reg, imm64); + } + return true; + }; + + // VMVN + // mvn is a predefined macro in arm64 MSVC + const auto mvn_ = [&] { + const auto imm64 = ir.Imm64(~imm); + if (Q) { + ir.SetVector(d_reg, ir.VectorBroadcast(64, imm64)); + } else { + ir.SetExtendedRegister(d_reg, imm64); + } + return true; + }; + + // VORR + const auto orr = [&] { + const auto imm64 = ir.Imm64(imm); + if (Q) { + const auto reg_value = ir.GetVector(d_reg); + ir.SetVector(d_reg, ir.VectorOr(reg_value, ir.VectorBroadcast(64, imm64))); + } else { + const auto reg_value = ir.GetExtendedRegister(d_reg); + ir.SetExtendedRegister(d_reg, ir.Or(reg_value, imm64)); + } + return true; + }; + + // VBIC + const auto bic = [&] { + const auto imm64 = ir.Imm64(~imm); + if (Q) { + const auto reg_value = ir.GetVector(d_reg); + ir.SetVector(d_reg, ir.VectorAnd(reg_value, ir.VectorBroadcast(64, imm64))); + } else { + const auto reg_value = ir.GetExtendedRegister(d_reg); + ir.SetExtendedRegister(d_reg, ir.And(reg_value, imm64)); + } + return true; + }; + + switch (concatenate(cmode, Imm<1>{op}).ZeroExtend()) { + case 0b00000: + case 0b00100: + case 0b01000: + case 0b01100: + case 
0b10000: + case 0b10100: + case 0b11000: + case 0b11010: + case 0b11100: + case 0b11101: + case 0b11110: + return mov(); + case 0b11111: + return UndefinedInstruction(); + case 0b00001: + case 0b00101: + case 0b01001: + case 0b01101: + case 0b10001: + case 0b10101: + case 0b11001: + case 0b11011: + return mvn_(); + case 0b00010: + case 0b00110: + case 0b01010: + case 0b01110: + case 0b10010: + case 0b10110: + return orr(); + case 0b00011: + case 0b00111: + case 0b01011: + case 0b01111: + case 0b10011: + case 0b10111: + return bic(); + } + + UNREACHABLE(); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_three_regs.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_three_regs.cpp new file mode 100644 index 0000000000..a69f39bfb6 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_three_regs.cpp @@ -0,0 +1,973 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2020 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/bit/bit_field.hpp> + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { +namespace { +enum class Comparison { + GE, + GT, + EQ, + AbsoluteGE, + AbsoluteGT, +}; + +enum class AccumulateBehavior { + None, + Accumulate, +}; + +enum class WidenBehaviour { + Second, + Both, +}; + +template<bool WithDst, typename Callable> +bool BitwiseInstruction(TranslatorVisitor& v, bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm, Callable fn) { + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) { + return v.UndefinedInstruction(); + } + + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto n = ToVector(Q, Vn, N); + + if constexpr (WithDst) { + const IR::U128 reg_d = v.ir.GetVector(d); + const IR::U128 reg_m = v.ir.GetVector(m); + const IR::U128 reg_n = v.ir.GetVector(n); + const IR::U128 result = fn(reg_d, reg_n, reg_m); + v.ir.SetVector(d, result); + } else { + const IR::U128 reg_m = v.ir.GetVector(m); + const IR::U128 reg_n = v.ir.GetVector(n); + const IR::U128 result = fn(reg_n, reg_m); + v.ir.SetVector(d, result); + } + + return true; +} + +template<typename Callable> +bool FloatingPointInstruction(TranslatorVisitor& v, bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm, Callable fn) { + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) { + return v.UndefinedInstruction(); + } + + if (sz == 0b1) { + return v.UndefinedInstruction(); + } + + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto n = ToVector(Q, Vn, N); + + const auto reg_d = v.ir.GetVector(d); + const auto reg_n = v.ir.GetVector(n); + const auto reg_m = v.ir.GetVector(m); + const auto result = fn(reg_d, reg_n, reg_m); + + v.ir.SetVector(d, result); + return true; +} + +bool IntegerComparison(TranslatorVisitor& v, bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm, Comparison comparison) { + if (sz == 0b11) { + return v.UndefinedInstruction(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) { + return v.UndefinedInstruction(); + } + + const size_t esize = 8 << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto n = ToVector(Q, Vn, N); + + const auto reg_n = v.ir.GetVector(n); + const auto reg_m = v.ir.GetVector(m); + const auto 
result = [&] { + switch (comparison) { + case Comparison::GT: + return U ? v.ir.VectorGreaterUnsigned(esize, reg_n, reg_m) + : v.ir.VectorGreaterSigned(esize, reg_n, reg_m); + case Comparison::GE: + return U ? v.ir.VectorGreaterEqualUnsigned(esize, reg_n, reg_m) + : v.ir.VectorGreaterEqualSigned(esize, reg_n, reg_m); + case Comparison::EQ: + return v.ir.VectorEqual(esize, reg_n, reg_m); + default: + return IR::U128{}; + } + }(); + + v.ir.SetVector(d, result); + return true; +} + +bool FloatComparison(TranslatorVisitor& v, bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm, Comparison comparison) { + if (sz) { + return v.UndefinedInstruction(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) { + return v.UndefinedInstruction(); + } + + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto n = ToVector(Q, Vn, N); + + const auto reg_n = v.ir.GetVector(n); + const auto reg_m = v.ir.GetVector(m); + const auto result = [&] { + switch (comparison) { + case Comparison::GE: + return v.ir.FPVectorGreaterEqual(32, reg_n, reg_m, false); + case Comparison::GT: + return v.ir.FPVectorGreater(32, reg_n, reg_m, false); + case Comparison::EQ: + return v.ir.FPVectorEqual(32, reg_n, reg_m, false); + case Comparison::AbsoluteGE: + return v.ir.FPVectorGreaterEqual(32, v.ir.FPVectorAbs(32, reg_n), v.ir.FPVectorAbs(32, reg_m), false); + case Comparison::AbsoluteGT: + return v.ir.FPVectorGreater(32, v.ir.FPVectorAbs(32, reg_n), v.ir.FPVectorAbs(32, reg_m), false); + default: + return IR::U128{}; + } + }(); + + v.ir.SetVector(d, result); + return true; +} + +bool AbsoluteDifference(TranslatorVisitor& v, bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm, AccumulateBehavior accumulate) { + if (sz == 0b11) { + return v.UndefinedInstruction(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) { + return v.UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto n = ToVector(Q, Vn, N); + + const auto reg_m = v.ir.GetVector(m); + const auto reg_n = v.ir.GetVector(n); + const auto result = [&] { + const auto absdiff = U ? v.ir.VectorUnsignedAbsoluteDifference(esize, reg_n, reg_m) + : v.ir.VectorSignedAbsoluteDifference(esize, reg_n, reg_m); + + if (accumulate == AccumulateBehavior::Accumulate) { + const auto reg_d = v.ir.GetVector(d); + return v.ir.VectorAdd(esize, reg_d, absdiff); + } + + return absdiff; + }(); + + v.ir.SetVector(d, result); + return true; +} + +bool AbsoluteDifferenceLong(TranslatorVisitor& v, bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm, AccumulateBehavior accumulate) { + if (sz == 0b11) { + return v.DecodeError(); + } + + if (mcl::bit::get_bit<0>(Vd)) { + return v.UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(true, Vd, D); + const auto m = ToVector(false, Vm, M); + const auto n = ToVector(false, Vn, N); + + const auto reg_m = v.ir.GetVector(m); + const auto reg_n = v.ir.GetVector(n); + const auto operand_m = v.ir.VectorZeroExtend(esize, v.ir.ZeroExtendToQuad(v.ir.VectorGetElement(64, reg_m, 0))); + const auto operand_n = v.ir.VectorZeroExtend(esize, v.ir.ZeroExtendToQuad(v.ir.VectorGetElement(64, reg_n, 0))); + const auto result = [&] { + const auto absdiff = U ? 
v.ir.VectorUnsignedAbsoluteDifference(esize, operand_m, operand_n) + : v.ir.VectorSignedAbsoluteDifference(esize, operand_m, operand_n); + + if (accumulate == AccumulateBehavior::Accumulate) { + const auto reg_d = v.ir.GetVector(d); + return v.ir.VectorAdd(2 * esize, reg_d, absdiff); + } + + return absdiff; + }(); + + v.ir.SetVector(d, result); + return true; +} + +template<typename Callable> +bool WideInstruction(TranslatorVisitor& v, bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm, WidenBehaviour widen_behaviour, Callable fn) { + const size_t esize = 8U << sz; + const bool widen_first = widen_behaviour == WidenBehaviour::Both; + + if (sz == 0b11) { + return v.DecodeError(); + } + + if (mcl::bit::get_bit<0>(Vd) || (!widen_first && mcl::bit::get_bit<0>(Vn))) { + return v.UndefinedInstruction(); + } + + const auto d = ToVector(true, Vd, D); + const auto m = ToVector(false, Vm, M); + const auto n = ToVector(!widen_first, Vn, N); + + const auto reg_d = v.ir.GetVector(d); + const auto reg_m = v.ir.GetVector(m); + const auto reg_n = v.ir.GetVector(n); + const auto wide_n = U ? v.ir.VectorZeroExtend(esize, reg_n) : v.ir.VectorSignExtend(esize, reg_n); + const auto wide_m = U ? v.ir.VectorZeroExtend(esize, reg_m) : v.ir.VectorSignExtend(esize, reg_m); + const auto result = fn(esize * 2, reg_d, widen_first ? wide_n : reg_n, wide_m); + + v.ir.SetVector(d, result); + return true; +} + +} // Anonymous namespace + +// ASIMD Three registers of the same length + +bool TranslatorVisitor::asimd_VHADD(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + if (sz == 0b11) { + return UndefinedInstruction(); + } + + const size_t esize = 8 << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto n = ToVector(Q, Vn, N); + + const IR::U128 reg_n = ir.GetVector(n); + const IR::U128 reg_m = ir.GetVector(m); + const IR::U128 result = U ? ir.VectorHalvingAddUnsigned(esize, reg_n, reg_m) : ir.VectorHalvingAddSigned(esize, reg_n, reg_m); + ir.SetVector(d, result); + + return true; +} + +bool TranslatorVisitor::asimd_VQADD(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + if (sz == 0b11) { + return UndefinedInstruction(); + } + + const size_t esize = 8 << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto n = ToVector(Q, Vn, N); + + const IR::U128 reg_n = ir.GetVector(n); + const IR::U128 reg_m = ir.GetVector(m); + const IR::U128 result = U ? ir.VectorUnsignedSaturatedAdd(esize, reg_n, reg_m) : ir.VectorSignedSaturatedAdd(esize, reg_n, reg_m); + ir.SetVector(d, result); + + return true; +} + +bool TranslatorVisitor::asimd_VRHADD(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + if (sz == 0b11) { + return UndefinedInstruction(); + } + + const size_t esize = 8 << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto n = ToVector(Q, Vn, N); + + const IR::U128 reg_n = ir.GetVector(n); + const IR::U128 reg_m = ir.GetVector(m); + const IR::U128 result = U ? 
ir.VectorRoundingHalvingAddUnsigned(esize, reg_n, reg_m) : ir.VectorRoundingHalvingAddSigned(esize, reg_n, reg_m); + ir.SetVector(d, result); + + return true; +} + +bool TranslatorVisitor::asimd_VAND_reg(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return BitwiseInstruction<false>(*this, D, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_n, const auto& reg_m) { + return ir.VectorAnd(reg_n, reg_m); + }); +} + +bool TranslatorVisitor::asimd_VBIC_reg(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return BitwiseInstruction<false>(*this, D, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_n, const auto& reg_m) { + return ir.VectorAndNot(reg_n, reg_m); + }); +} + +bool TranslatorVisitor::asimd_VORR_reg(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return BitwiseInstruction<false>(*this, D, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_n, const auto& reg_m) { + return ir.VectorOr(reg_n, reg_m); + }); +} + +bool TranslatorVisitor::asimd_VORN_reg(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return BitwiseInstruction<false>(*this, D, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_n, const auto& reg_m) { + return ir.VectorOr(reg_n, ir.VectorNot(reg_m)); + }); +} + +bool TranslatorVisitor::asimd_VEOR_reg(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return BitwiseInstruction<false>(*this, D, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_n, const auto& reg_m) { + return ir.VectorEor(reg_n, reg_m); + }); +} + +bool TranslatorVisitor::asimd_VBSL(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return BitwiseInstruction<true>(*this, D, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_d, const auto& reg_n, const auto& reg_m) { + return ir.VectorOr(ir.VectorAnd(reg_n, reg_d), ir.VectorAndNot(reg_m, reg_d)); + }); +} + +bool TranslatorVisitor::asimd_VBIT(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return BitwiseInstruction<true>(*this, D, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_d, const auto& reg_n, const auto& reg_m) { + return ir.VectorOr(ir.VectorAnd(reg_n, reg_m), ir.VectorAndNot(reg_d, reg_m)); + }); +} + +bool TranslatorVisitor::asimd_VBIF(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return BitwiseInstruction<true>(*this, D, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_d, const auto& reg_n, const auto& reg_m) { + return ir.VectorOr(ir.VectorAnd(reg_d, reg_m), ir.VectorAndNot(reg_n, reg_m)); + }); +} + +bool TranslatorVisitor::asimd_VHSUB(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + if (sz == 0b11) { + return UndefinedInstruction(); + } + + const size_t esize = 8 << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto n = ToVector(Q, Vn, N); + + const IR::U128 reg_n = ir.GetVector(n); + const IR::U128 reg_m = ir.GetVector(m); + const IR::U128 result = U ? 
ir.VectorHalvingSubUnsigned(esize, reg_n, reg_m) : ir.VectorHalvingSubSigned(esize, reg_n, reg_m); + ir.SetVector(d, result); + + return true; +} + +bool TranslatorVisitor::asimd_VQSUB(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + if (sz == 0b11) { + return UndefinedInstruction(); + } + + const size_t esize = 8 << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto n = ToVector(Q, Vn, N); + + const IR::U128 reg_n = ir.GetVector(n); + const IR::U128 reg_m = ir.GetVector(m); + const IR::U128 result = U ? ir.VectorUnsignedSaturatedSub(esize, reg_n, reg_m) : ir.VectorSignedSaturatedSub(esize, reg_n, reg_m); + ir.SetVector(d, result); + + return true; +} + +bool TranslatorVisitor::asimd_VCGT_reg(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return IntegerComparison(*this, U, D, sz, Vn, Vd, N, Q, M, Vm, Comparison::GT); +} + +bool TranslatorVisitor::asimd_VCGE_reg(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return IntegerComparison(*this, U, D, sz, Vn, Vd, N, Q, M, Vm, Comparison::GE); +} + +bool TranslatorVisitor::asimd_VABD(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return AbsoluteDifference(*this, U, D, sz, Vn, Vd, N, Q, M, Vm, AccumulateBehavior::None); +} + +bool TranslatorVisitor::asimd_VABA(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return AbsoluteDifference(*this, U, D, sz, Vn, Vd, N, Q, M, Vm, AccumulateBehavior::Accumulate); +} + +bool TranslatorVisitor::asimd_VADD_int(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto n = ToVector(Q, Vn, N); + + const auto reg_m = ir.GetVector(m); + const auto reg_n = ir.GetVector(n); + const auto result = ir.VectorAdd(esize, reg_n, reg_m); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VSUB_int(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto n = ToVector(Q, Vn, N); + + const auto reg_m = ir.GetVector(m); + const auto reg_n = ir.GetVector(n); + const auto result = ir.VectorSub(esize, reg_n, reg_m); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VSHL_reg(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto n = ToVector(Q, Vn, N); + + const auto reg_m = ir.GetVector(m); + const auto reg_n = ir.GetVector(n); + const auto result = U ? 
ir.VectorLogicalVShift(esize, reg_m, reg_n) + : ir.VectorArithmeticVShift(esize, reg_m, reg_n); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VQSHL_reg(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto n = ToVector(Q, Vn, N); + + const auto reg_m = ir.GetVector(m); + const auto reg_n = ir.GetVector(n); + const auto result = U ? ir.VectorUnsignedSaturatedShiftLeft(esize, reg_m, reg_n) + : ir.VectorSignedSaturatedShiftLeft(esize, reg_m, reg_n); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VRSHL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto n = ToVector(Q, Vn, N); + + const auto reg_m = ir.GetVector(m); + const auto reg_n = ir.GetVector(n); + const auto result = U ? ir.VectorRoundingShiftLeftUnsigned(esize, reg_m, reg_n) + : ir.VectorRoundingShiftLeftSigned(esize, reg_m, reg_n); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VMAX(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, bool op, size_t Vm) { + if (sz == 0b11) { + return UndefinedInstruction(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto n = ToVector(Q, Vn, N); + + const auto reg_m = ir.GetVector(m); + const auto reg_n = ir.GetVector(n); + const auto result = [&] { + if (op) { + return U ? ir.VectorMinUnsigned(esize, reg_n, reg_m) + : ir.VectorMinSigned(esize, reg_n, reg_m); + } else { + return U ? 
ir.VectorMaxUnsigned(esize, reg_n, reg_m) + : ir.VectorMaxSigned(esize, reg_n, reg_m); + } + }(); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VTST(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + if (sz == 0b11) { + return UndefinedInstruction(); + } + + const size_t esize = 8 << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto n = ToVector(Q, Vn, N); + + const auto reg_n = ir.GetVector(n); + const auto reg_m = ir.GetVector(m); + const auto anded = ir.VectorAnd(reg_n, reg_m); + const auto result = ir.VectorNot(ir.VectorEqual(esize, anded, ir.ZeroVector())); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VCEQ_reg(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return IntegerComparison(*this, false, D, sz, Vn, Vd, N, Q, M, Vm, Comparison::EQ); +} + +bool TranslatorVisitor::asimd_VMLA(bool op, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + if (sz == 0b11) { + return UndefinedInstruction(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto n = ToVector(Q, Vn, N); + + const auto reg_n = ir.GetVector(n); + const auto reg_m = ir.GetVector(m); + const auto reg_d = ir.GetVector(d); + const auto multiply = ir.VectorMultiply(esize, reg_n, reg_m); + const auto result = op ? ir.VectorSub(esize, reg_d, multiply) + : ir.VectorAdd(esize, reg_d, multiply); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VMUL(bool P, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + if (sz == 0b11 || (P && sz != 0b00)) { + return UndefinedInstruction(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto n = ToVector(Q, Vn, N); + + const auto reg_n = ir.GetVector(n); + const auto reg_m = ir.GetVector(m); + const auto result = P ? ir.VectorPolynomialMultiply(reg_n, reg_m) + : ir.VectorMultiply(esize, reg_n, reg_m); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VPMAX_int(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, bool op, size_t Vm) { + if (sz == 0b11 || Q) { + return UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(false, Vd, D); + const auto m = ToVector(false, Vm, M); + const auto n = ToVector(false, Vn, N); + + const auto reg_m = ir.GetVector(m); + const auto reg_n = ir.GetVector(n); + + const auto bottom = ir.VectorDeinterleaveEvenLower(esize, reg_n, reg_m); + const auto top = ir.VectorDeinterleaveOddLower(esize, reg_n, reg_m); + + const auto result = [&] { + if (op) { + return U ? ir.VectorMinUnsigned(esize, bottom, top) + : ir.VectorMinSigned(esize, bottom, top); + } else { + return U ? 
ir.VectorMaxUnsigned(esize, bottom, top) + : ir.VectorMaxSigned(esize, bottom, top); + } + }(); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::v8_VMAXNM(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) { + return ir.FPVectorMaxNumeric(32, reg_n, reg_m, false); + }); +} + +bool TranslatorVisitor::v8_VMINNM(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) { + return ir.FPVectorMinNumeric(32, reg_n, reg_m, false); + }); +} + +bool TranslatorVisitor::asimd_VQDMULH(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + if (sz == 0b00 || sz == 0b11) { + return UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto n = ToVector(Q, Vn, N); + + const auto reg_n = ir.GetVector(n); + const auto reg_m = ir.GetVector(m); + const auto result = ir.VectorSignedSaturatedDoublingMultiplyHigh(esize, reg_n, reg_m); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VQRDMULH(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + if (sz == 0b00 || sz == 0b11) { + return UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto n = ToVector(Q, Vn, N); + + const auto reg_n = ir.GetVector(n); + const auto reg_m = ir.GetVector(m); + const auto result = ir.VectorSignedSaturatedDoublingMultiplyHighRounding(esize, reg_n, reg_m); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VPADD(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + if (Q || sz == 0b11) { + return UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto n = ToVector(Q, Vn, N); + + const auto reg_n = ir.GetVector(n); + const auto reg_m = ir.GetVector(m); + const auto result = ir.VectorPairedAddLower(esize, reg_n, reg_m); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VFMA(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_d, const auto& reg_n, const auto& reg_m) { + return ir.FPVectorMulAdd(32, reg_d, reg_n, reg_m, false); + }); +} + +bool TranslatorVisitor::asimd_VFMS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_d, const auto& reg_n, const auto& reg_m) { + return ir.FPVectorMulAdd(32, reg_d, ir.FPVectorNeg(32, reg_n), reg_m, false); + }); +} + +bool TranslatorVisitor::asimd_VADD_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& 
reg_m) { + return ir.FPVectorAdd(32, reg_n, reg_m, false); + }); +} + +bool TranslatorVisitor::asimd_VSUB_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) { + return ir.FPVectorSub(32, reg_n, reg_m, false); + }); +} + +bool TranslatorVisitor::asimd_VPADD_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + if (Q) { + return UndefinedInstruction(); + } + return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) { + return ir.FPVectorPairedAddLower(32, reg_n, reg_m, false); + }); +} + +bool TranslatorVisitor::asimd_VABD_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) { + return ir.FPVectorAbs(32, ir.FPVectorSub(32, reg_n, reg_m, false)); + }); +} + +bool TranslatorVisitor::asimd_VMLA_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_d, const auto& reg_n, const auto& reg_m) { + const auto product = ir.FPVectorMul(32, reg_n, reg_m, false); + return ir.FPVectorAdd(32, reg_d, product, false); + }); +} + +bool TranslatorVisitor::asimd_VMLS_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_d, const auto& reg_n, const auto& reg_m) { + const auto product = ir.FPVectorMul(32, reg_n, reg_m, false); + return ir.FPVectorAdd(32, reg_d, ir.FPVectorNeg(32, product), false); + }); +} + +bool TranslatorVisitor::asimd_VMUL_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) { + return ir.FPVectorMul(32, reg_n, reg_m, false); + }); +} + +bool TranslatorVisitor::asimd_VCEQ_reg_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return FloatComparison(*this, D, sz, Vn, Vd, N, Q, M, Vm, Comparison::EQ); +} + +bool TranslatorVisitor::asimd_VCGE_reg_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return FloatComparison(*this, D, sz, Vn, Vd, N, Q, M, Vm, Comparison::GE); +} + +bool TranslatorVisitor::asimd_VCGT_reg_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return FloatComparison(*this, D, sz, Vn, Vd, N, Q, M, Vm, Comparison::GT); +} + +bool TranslatorVisitor::asimd_VACGE(bool D, bool op, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + const auto comparison = op ? 
Comparison::AbsoluteGT : Comparison::AbsoluteGE; + return FloatComparison(*this, D, sz, Vn, Vd, N, Q, M, Vm, comparison); +} + +bool TranslatorVisitor::asimd_VMAX_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) { + return ir.FPVectorMax(32, reg_n, reg_m, false); + }); +} + +bool TranslatorVisitor::asimd_VMIN_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) { + return ir.FPVectorMin(32, reg_n, reg_m, false); + }); +} + +bool TranslatorVisitor::asimd_VPMAX_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + if (Q) { + return UndefinedInstruction(); + } + return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) { + const auto bottom = ir.VectorDeinterleaveEvenLower(32, reg_n, reg_m); + const auto top = ir.VectorDeinterleaveOddLower(32, reg_n, reg_m); + return ir.FPVectorMax(32, bottom, top, false); + }); +} + +bool TranslatorVisitor::asimd_VPMIN_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + if (Q) { + return UndefinedInstruction(); + } + return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) { + const auto bottom = ir.VectorDeinterleaveEvenLower(32, reg_n, reg_m); + const auto top = ir.VectorDeinterleaveOddLower(32, reg_n, reg_m); + return ir.FPVectorMin(32, bottom, top, false); + }); +} + +bool TranslatorVisitor::asimd_VRECPS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) { + return ir.FPVectorRecipStepFused(32, reg_n, reg_m, false); + }); +} + +bool TranslatorVisitor::asimd_VRSQRTS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) { + return ir.FPVectorRSqrtStepFused(32, reg_n, reg_m, false); + }); +} + +bool TranslatorVisitor::v8_SHA256H(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + if (!Q || mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm)) { + return UndefinedInstruction(); + } + + const auto d = ToVector(Q, Vd, D); + const auto n = ToVector(Q, Vn, N); + const auto m = ToVector(Q, Vm, M); + + const auto x = ir.GetVector(d); + const auto y = ir.GetVector(n); + const auto w = ir.GetVector(m); + const auto result = ir.SHA256Hash(x, y, w, true); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::v8_SHA256H2(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) { + if (!Q || mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm)) { + return UndefinedInstruction(); + } + + const auto n = ToVector(Q, Vn, N); + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + + const auto x = ir.GetVector(n); + const auto y = ir.GetVector(d); + const auto w = ir.GetVector(m); + const auto result = ir.SHA256Hash(x, y, w, false); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::v8_SHA256SU1(bool D, size_t Vn, size_t Vd, bool N, 
bool Q, bool M, size_t Vm) { + if (!Q || mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn) || mcl::bit::get_bit<0>(Vm)) { + return UndefinedInstruction(); + } + + const auto d = ToVector(Q, Vd, D); + const auto n = ToVector(Q, Vn, N); + const auto m = ToVector(Q, Vm, M); + + const auto x = ir.GetVector(d); + const auto y = ir.GetVector(n); + const auto z = ir.GetVector(m); + const auto result = ir.SHA256MessageSchedule1(x, y, z); + + ir.SetVector(d, result); + return true; +} + +// ASIMD Three registers of different length + +bool TranslatorVisitor::asimd_VADDL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool N, bool M, size_t Vm) { + return WideInstruction(*this, U, D, sz, Vn, Vd, N, M, Vm, op ? WidenBehaviour::Second : WidenBehaviour::Both, [this](size_t esize, const auto&, const auto& reg_n, const auto& reg_m) { + return ir.VectorAdd(esize, reg_n, reg_m); + }); +} + +bool TranslatorVisitor::asimd_VSUBL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool N, bool M, size_t Vm) { + return WideInstruction(*this, U, D, sz, Vn, Vd, N, M, Vm, op ? WidenBehaviour::Second : WidenBehaviour::Both, [this](size_t esize, const auto&, const auto& reg_n, const auto& reg_m) { + return ir.VectorSub(esize, reg_n, reg_m); + }); +} + +bool TranslatorVisitor::asimd_VABAL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm) { + return AbsoluteDifferenceLong(*this, U, D, sz, Vn, Vd, N, M, Vm, AccumulateBehavior::Accumulate); +} + +bool TranslatorVisitor::asimd_VABDL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm) { + return AbsoluteDifferenceLong(*this, U, D, sz, Vn, Vd, N, M, Vm, AccumulateBehavior::None); +} + +bool TranslatorVisitor::asimd_VMLAL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool N, bool M, size_t Vm) { + const size_t esize = 8U << sz; + + if (sz == 0b11) { + return DecodeError(); + } + + if (mcl::bit::get_bit<0>(Vd)) { + return UndefinedInstruction(); + } + + const auto d = ToVector(true, Vd, D); + const auto m = ToVector(false, Vm, M); + const auto n = ToVector(false, Vn, N); + + const auto reg_d = ir.GetVector(d); + const auto reg_m = ir.GetVector(m); + const auto reg_n = ir.GetVector(n); + const auto multiply = U ? ir.VectorMultiplyUnsignedWiden(esize, reg_n, reg_m) + : ir.VectorMultiplySignedWiden(esize, reg_n, reg_m); + const auto result = op ? ir.VectorSub(esize * 2, reg_d, multiply) + : ir.VectorAdd(esize * 2, reg_d, multiply); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VMULL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool P, bool N, bool M, size_t Vm) { + if (sz == 0b11) { + return DecodeError(); + } + + if ((P & (U || sz == 0b10)) || mcl::bit::get_bit<0>(Vd)) { + return UndefinedInstruction(); + } + + const size_t esize = P ? (sz == 0b00 ? 8 : 64) : 8U << sz; + const auto d = ToVector(true, Vd, D); + const auto m = ToVector(false, Vm, M); + const auto n = ToVector(false, Vn, N); + + const auto reg_n = ir.GetVector(n); + const auto reg_m = ir.GetVector(m); + const auto result = P ? ir.VectorPolynomialMultiplyLong(esize, reg_n, reg_m) + : U ? 
ir.VectorMultiplyUnsignedWiden(esize, reg_n, reg_m) + : ir.VectorMultiplySignedWiden(esize, reg_n, reg_m); + + ir.SetVector(d, result); + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_misc.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_misc.cpp new file mode 100644 index 0000000000..62b9af55a5 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_misc.cpp @@ -0,0 +1,767 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2020 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <array> + +#include <mcl/bit/bit_field.hpp> + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { +namespace { +enum class Comparison { + EQ, + GE, + GT, + LE, + LT, +}; + +bool CompareWithZero(TranslatorVisitor& v, bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm, Comparison type) { + if (sz == 0b11 || (F && sz != 0b10)) { + return v.UndefinedInstruction(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return v.UndefinedInstruction(); + } + + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto result = [&] { + const auto reg_m = v.ir.GetVector(m); + const auto zero = v.ir.ZeroVector(); + + if (F) { + switch (type) { + case Comparison::EQ: + return v.ir.FPVectorEqual(32, reg_m, zero, false); + case Comparison::GE: + return v.ir.FPVectorGreaterEqual(32, reg_m, zero, false); + case Comparison::GT: + return v.ir.FPVectorGreater(32, reg_m, zero, false); + case Comparison::LE: + return v.ir.FPVectorGreaterEqual(32, zero, reg_m, false); + case Comparison::LT: + return v.ir.FPVectorGreater(32, zero, reg_m, false); + } + + return IR::U128{}; + } else { + static constexpr std::array fns{ + &IREmitter::VectorEqual, + &IREmitter::VectorGreaterEqualSigned, + &IREmitter::VectorGreaterSigned, + &IREmitter::VectorLessEqualSigned, + &IREmitter::VectorLessSigned, + }; + + const size_t esize = 8U << sz; + return (v.ir.*fns[static_cast<size_t>(type)])(esize, reg_m, zero); + } + }(); + + v.ir.SetVector(d, result); + return true; +} + +enum class AccumulateBehavior { + None, + Accumulate, +}; + +bool PairedAddOperation(TranslatorVisitor& v, bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm, AccumulateBehavior accumulate) { + if (sz == 0b11) { + return v.UndefinedInstruction(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return v.UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + + const auto reg_m = v.ir.GetVector(m); + const auto result = [&] { + const auto tmp = op ? 
v.ir.VectorPairedAddUnsignedWiden(esize, reg_m) + : v.ir.VectorPairedAddSignedWiden(esize, reg_m); + + if (accumulate == AccumulateBehavior::Accumulate) { + const auto reg_d = v.ir.GetVector(d); + return v.ir.VectorAdd(esize * 2, reg_d, tmp); + } + + return tmp; + }(); + + v.ir.SetVector(d, result); + return true; +} + +bool RoundFloatToInteger(TranslatorVisitor& v, bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm, bool exact, FP::RoundingMode rounding_mode) { + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return v.UndefinedInstruction(); + } + + if (sz != 0b10) { + return v.UndefinedInstruction(); // TODO: FP16 + } + + const size_t esize = 8 << sz; + + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + + const auto reg_m = v.ir.GetVector(m); + const auto result = v.ir.FPVectorRoundInt(esize, reg_m, rounding_mode, exact, false); + + v.ir.SetVector(d, result); + return true; +} + +bool ConvertFloatToInteger(TranslatorVisitor& v, bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm, FP::RoundingMode rounding_mode) { + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return v.UndefinedInstruction(); + } + + if (sz != 0b10) { + return v.UndefinedInstruction(); // TODO: FP16 + } + + const bool unsigned_ = op; + const size_t esize = 8 << sz; + + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + + const auto reg_m = v.ir.GetVector(m); + const auto result = unsigned_ + ? v.ir.FPVectorToUnsignedFixed(esize, reg_m, 0, rounding_mode, false) + : v.ir.FPVectorToSignedFixed(esize, reg_m, 0, rounding_mode, false); + + v.ir.SetVector(d, result); + return true; +} + +} // Anonymous namespace + +bool TranslatorVisitor::asimd_VREV(bool D, size_t sz, size_t Vd, size_t op, bool Q, bool M, size_t Vm) { + if (op + sz >= 3) { + return UndefinedInstruction(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto result = [this, m, op, sz] { + const auto reg_m = ir.GetVector(m); + const size_t esize = 8 << sz; + + switch (op) { + case 0b00: + return ir.VectorReverseElementsInLongGroups(esize, reg_m); + case 0b01: + return ir.VectorReverseElementsInWordGroups(esize, reg_m); + case 0b10: + return ir.VectorReverseElementsInHalfGroups(esize, reg_m); + } + + UNREACHABLE(); + }(); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VPADDL(bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm) { + return PairedAddOperation(*this, D, sz, Vd, op, Q, M, Vm, AccumulateBehavior::None); +} + +bool TranslatorVisitor::v8_AESD(bool D, size_t sz, size_t Vd, bool M, size_t Vm) { + if (sz != 0b00 || mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm)) { + return UndefinedInstruction(); + } + + const auto d = ToVector(true, Vd, D); + const auto m = ToVector(true, Vm, M); + const auto reg_d = ir.GetVector(d); + const auto reg_m = ir.GetVector(m); + const auto result = ir.AESDecryptSingleRound(ir.VectorEor(reg_d, reg_m)); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::v8_AESE(bool D, size_t sz, size_t Vd, bool M, size_t Vm) { + if (sz != 0b00 || mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm)) { + return UndefinedInstruction(); + } + + const auto d = ToVector(true, Vd, D); + const auto m = ToVector(true, Vm, M); + const auto reg_d = ir.GetVector(d); + const auto reg_m = ir.GetVector(m); + const auto result = 
ir.AESEncryptSingleRound(ir.VectorEor(reg_d, reg_m)); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::v8_AESIMC(bool D, size_t sz, size_t Vd, bool M, size_t Vm) { + if (sz != 0b00 || mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm)) { + return UndefinedInstruction(); + } + + const auto d = ToVector(true, Vd, D); + const auto m = ToVector(true, Vm, M); + const auto reg_m = ir.GetVector(m); + const auto result = ir.AESInverseMixColumns(reg_m); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::v8_AESMC(bool D, size_t sz, size_t Vd, bool M, size_t Vm) { + if (sz != 0b00 || mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm)) { + return UndefinedInstruction(); + } + + const auto d = ToVector(true, Vd, D); + const auto m = ToVector(true, Vm, M); + const auto reg_m = ir.GetVector(m); + const auto result = ir.AESMixColumns(reg_m); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::v8_SHA256SU0(bool D, size_t sz, size_t Vd, bool M, size_t Vm) { + if (sz != 0b10 || mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm)) { + return UndefinedInstruction(); + } + + const auto d = ToVector(true, Vd, D); + const auto m = ToVector(true, Vm, M); + const auto x = ir.GetVector(d); + const auto y = ir.GetVector(m); + const auto result = ir.SHA256MessageSchedule0(x, y); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VCLS(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) { + if (sz == 0b11) { + return UndefinedInstruction(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto result = [this, m, sz] { + const auto reg_m = ir.GetVector(m); + const size_t esize = 8U << sz; + const auto shifted = ir.VectorArithmeticShiftRight(esize, reg_m, static_cast<u8>(esize)); + const auto xored = ir.VectorEor(reg_m, shifted); + const auto clz = ir.VectorCountLeadingZeros(esize, xored); + return ir.VectorSub(esize, clz, ir.VectorBroadcast(esize, I(esize, 1))); + }(); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VCLZ(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) { + if (sz == 0b11) { + return UndefinedInstruction(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto result = [this, m, sz] { + const auto reg_m = ir.GetVector(m); + const size_t esize = 8U << sz; + + return ir.VectorCountLeadingZeros(esize, reg_m); + }(); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VCNT(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) { + if (sz != 0b00) { + return UndefinedInstruction(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto reg_m = ir.GetVector(m); + const auto result = ir.VectorPopulationCount(reg_m); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VMVN_reg(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) { + if (sz != 0b00) { + return UndefinedInstruction(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + + 
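// VMVN (register): sz is constrained to 0b00 above and a bitwise complement is element-size agnostic, so a single 128-bit NOT of the source register suffices. +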
const auto reg_m = ir.GetVector(m); + const auto result = ir.VectorNot(reg_m); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VPADAL(bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm) { + return PairedAddOperation(*this, D, sz, Vd, op, Q, M, Vm, AccumulateBehavior::Accumulate); +} + +bool TranslatorVisitor::asimd_VQABS(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) { + if (sz == 0b11) { + return UndefinedInstruction(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + + const auto reg_m = ir.GetVector(m); + const auto result = ir.VectorSignedSaturatedAbs(esize, reg_m); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VQNEG(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) { + if (sz == 0b11) { + return UndefinedInstruction(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + + const auto reg_m = ir.GetVector(m); + const auto result = ir.VectorSignedSaturatedNeg(esize, reg_m); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VCGT_zero(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm) { + return CompareWithZero(*this, D, sz, Vd, F, Q, M, Vm, Comparison::GT); +} + +bool TranslatorVisitor::asimd_VCGE_zero(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm) { + return CompareWithZero(*this, D, sz, Vd, F, Q, M, Vm, Comparison::GE); +} + +bool TranslatorVisitor::asimd_VCEQ_zero(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm) { + return CompareWithZero(*this, D, sz, Vd, F, Q, M, Vm, Comparison::EQ); +} + +bool TranslatorVisitor::asimd_VCLE_zero(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm) { + return CompareWithZero(*this, D, sz, Vd, F, Q, M, Vm, Comparison::LE); +} + +bool TranslatorVisitor::asimd_VCLT_zero(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm) { + return CompareWithZero(*this, D, sz, Vd, F, Q, M, Vm, Comparison::LT); +} + +bool TranslatorVisitor::asimd_VABS(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm) { + if (sz == 0b11 || (F && sz != 0b10)) { + return UndefinedInstruction(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto result = [this, F, m, sz] { + const auto reg_m = ir.GetVector(m); + + if (F) { + return ir.FPVectorAbs(32, reg_m); + } + + const size_t esize = 8U << sz; + return ir.VectorAbs(esize, reg_m); + }(); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VNEG(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm) { + if (sz == 0b11 || (F && sz != 0b10)) { + return UndefinedInstruction(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto result = [this, F, m, sz] { + const auto reg_m = ir.GetVector(m); + + if (F) { + return ir.FPVectorNeg(32, reg_m); + } + + const size_t esize = 8U << sz; + return ir.VectorSub(esize, ir.ZeroVector(), reg_m); + }(); + + ir.SetVector(d, 
result); + return true; +} + +bool TranslatorVisitor::asimd_VSWP(bool D, size_t Vd, bool Q, bool M, size_t Vm) { + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + // Swapping the same register results in the same contents. + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + if (d == m) { + return true; + } + + if (Q) { + const auto reg_d = ir.GetVector(d); + const auto reg_m = ir.GetVector(m); + + ir.SetVector(m, reg_d); + ir.SetVector(d, reg_m); + } else { + const auto reg_d = ir.GetExtendedRegister(d); + const auto reg_m = ir.GetExtendedRegister(m); + + ir.SetExtendedRegister(m, reg_d); + ir.SetExtendedRegister(d, reg_m); + } + + return true; +} + +bool TranslatorVisitor::asimd_VTRN(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) { + if (sz == 0b11) { + return UndefinedInstruction(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + + if (d == m) { + return UnpredictableInstruction(); + } + + const auto reg_d = ir.GetVector(d); + const auto reg_m = ir.GetVector(m); + const auto result_d = ir.VectorTranspose(esize, reg_d, reg_m, false); + const auto result_m = ir.VectorTranspose(esize, reg_d, reg_m, true); + + ir.SetVector(d, result_d); + ir.SetVector(m, result_m); + return true; +} + +bool TranslatorVisitor::asimd_VUZP(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) { + if (sz == 0b11 || (!Q && sz == 0b10)) { + return UndefinedInstruction(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + + if (d == m) { + return UnpredictableInstruction(); + } + + const auto reg_d = ir.GetVector(d); + const auto reg_m = ir.GetVector(m); + auto result_d = Q ? ir.VectorDeinterleaveEven(esize, reg_d, reg_m) : ir.VectorDeinterleaveEvenLower(esize, reg_d, reg_m); + auto result_m = Q ? 
ir.VectorDeinterleaveOdd(esize, reg_d, reg_m) : ir.VectorDeinterleaveOddLower(esize, reg_d, reg_m); + + ir.SetVector(d, result_d); + ir.SetVector(m, result_m); + return true; +} + +bool TranslatorVisitor::asimd_VZIP(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) { + if (sz == 0b11 || (!Q && sz == 0b10)) { + return UndefinedInstruction(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + + if (d == m) { + return UnpredictableInstruction(); + } + + const auto reg_d = ir.GetVector(d); + const auto reg_m = ir.GetVector(m); + + if (Q) { + const auto result_d = ir.VectorInterleaveLower(esize, reg_d, reg_m); + const auto result_m = ir.VectorInterleaveUpper(esize, reg_d, reg_m); + + ir.SetVector(d, result_d); + ir.SetVector(m, result_m); + } else { + const auto result = ir.VectorInterleaveLower(esize, reg_d, reg_m); + + ir.SetExtendedRegister(d, ir.VectorGetElement(64, result, 0)); + ir.SetExtendedRegister(m, ir.VectorGetElement(64, result, 1)); + } + return true; +} + +bool TranslatorVisitor::asimd_VMOVN(bool D, size_t sz, size_t Vd, bool M, size_t Vm) { + if (sz == 0b11 || mcl::bit::get_bit<0>(Vm)) { + return UndefinedInstruction(); + } + const size_t esize = 8U << sz; + const auto d = ToVector(false, Vd, D); + const auto m = ToVector(true, Vm, M); + + const auto reg_m = ir.GetVector(m); + const auto result = ir.VectorNarrow(2 * esize, reg_m); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VQMOVUN(bool D, size_t sz, size_t Vd, bool M, size_t Vm) { + if (sz == 0b11 || mcl::bit::get_bit<0>(Vm)) { + return UndefinedInstruction(); + } + const size_t esize = 8U << sz; + const auto d = ToVector(false, Vd, D); + const auto m = ToVector(true, Vm, M); + + const auto reg_m = ir.GetVector(m); + const auto result = ir.VectorSignedSaturatedNarrowToUnsigned(2 * esize, reg_m); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VQMOVN(bool D, size_t sz, size_t Vd, bool op, bool M, size_t Vm) { + if (sz == 0b11 || mcl::bit::get_bit<0>(Vm)) { + return UndefinedInstruction(); + } + const size_t esize = 8U << sz; + const auto d = ToVector(false, Vd, D); + const auto m = ToVector(true, Vm, M); + + const auto reg_m = ir.GetVector(m); + const auto result = op ? 
ir.VectorUnsignedSaturatedNarrow(2 * esize, reg_m) + : ir.VectorSignedSaturatedNarrowToSigned(2 * esize, reg_m); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VSHLL_max(bool D, size_t sz, size_t Vd, bool M, size_t Vm) { + if (sz == 0b11 || mcl::bit::get_bit<0>(Vd)) { + return UndefinedInstruction(); + } + const size_t esize = 8U << sz; + const auto d = ToVector(true, Vd, D); + const auto m = ToVector(false, Vm, M); + + const auto reg_m = ir.GetVector(m); + const auto result = ir.VectorLogicalShiftLeft(2 * esize, ir.VectorZeroExtend(esize, reg_m), static_cast<u8>(esize)); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::v8_VRINTN(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) { + return RoundFloatToInteger(*this, D, sz, Vd, Q, M, Vm, false, FP::RoundingMode::ToNearest_TieEven); +} +bool TranslatorVisitor::v8_VRINTX(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) { + return RoundFloatToInteger(*this, D, sz, Vd, Q, M, Vm, true, FP::RoundingMode::ToNearest_TieEven); +} +bool TranslatorVisitor::v8_VRINTA(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) { + return RoundFloatToInteger(*this, D, sz, Vd, Q, M, Vm, false, FP::RoundingMode::ToNearest_TieAwayFromZero); +} +bool TranslatorVisitor::v8_VRINTZ(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) { + return RoundFloatToInteger(*this, D, sz, Vd, Q, M, Vm, false, FP::RoundingMode::TowardsZero); +} +bool TranslatorVisitor::v8_VRINTM(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) { + return RoundFloatToInteger(*this, D, sz, Vd, Q, M, Vm, false, FP::RoundingMode::TowardsMinusInfinity); +} +bool TranslatorVisitor::v8_VRINTP(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) { + return RoundFloatToInteger(*this, D, sz, Vd, Q, M, Vm, false, FP::RoundingMode::TowardsPlusInfinity); +} + +bool TranslatorVisitor::asimd_VCVT_half(bool D, size_t sz, size_t Vd, bool half_to_single, bool M, size_t Vm) { + if (sz != 0b01) { + return UndefinedInstruction(); + } + if (half_to_single && mcl::bit::get_bit<0>(Vd)) { + return UndefinedInstruction(); + } + if (!half_to_single && mcl::bit::get_bit<0>(Vm)) { + return UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto rounding_mode = FP::RoundingMode::ToNearest_TieEven; // StandardFPSCRValue().RMode + const auto d = ToVector(half_to_single, Vd, D); + const auto m = ToVector(!half_to_single, Vm, M); + + const auto operand = ir.GetVector(m); + const IR::U128 result = half_to_single ? 
ir.FPVectorFromHalf(esize * 2, operand, rounding_mode, false) + : ir.FPVectorToHalf(esize * 2, operand, rounding_mode, false); + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::v8_VCVTA(bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm) { + return ConvertFloatToInteger(*this, D, sz, Vd, op, Q, M, Vm, FP::RoundingMode::ToNearest_TieAwayFromZero); +} +bool TranslatorVisitor::v8_VCVTN(bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm) { + return ConvertFloatToInteger(*this, D, sz, Vd, op, Q, M, Vm, FP::RoundingMode::ToNearest_TieEven); +} +bool TranslatorVisitor::v8_VCVTP(bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm) { + return ConvertFloatToInteger(*this, D, sz, Vd, op, Q, M, Vm, FP::RoundingMode::TowardsPlusInfinity); +} +bool TranslatorVisitor::v8_VCVTM(bool D, size_t sz, size_t Vd, bool op, bool Q, bool M, size_t Vm) { + return ConvertFloatToInteger(*this, D, sz, Vd, op, Q, M, Vm, FP::RoundingMode::TowardsMinusInfinity); +} + +bool TranslatorVisitor::asimd_VRECPE(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm) { + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + if (sz == 0b00 || sz == 0b11) { + return UndefinedInstruction(); + } + + if (!F && sz == 0b01) { + // TODO: Implement 16-bit VectorUnsignedRecipEstimate + return UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto reg_m = ir.GetVector(m); + const auto result = F ? ir.FPVectorRecipEstimate(esize, reg_m, false) + : ir.VectorUnsignedRecipEstimate(reg_m); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VRSQRTE(bool D, size_t sz, size_t Vd, bool F, bool Q, bool M, size_t Vm) { + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + if (sz == 0b00 || sz == 0b11) { + return UndefinedInstruction(); + } + + if (!F && sz == 0b01) { + // TODO: Implement 16-bit VectorUnsignedRecipEstimate + return UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto reg_m = ir.GetVector(m); + const auto result = F ? ir.FPVectorRSqrtEstimate(esize, reg_m, false) + : ir.VectorUnsignedRecipSqrtEstimate(reg_m); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VCVT_integer(bool D, size_t sz, size_t Vd, bool op, bool U, bool Q, bool M, size_t Vm) { + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + if (sz != 0b10) { + return UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto reg_m = ir.GetVector(m); + const auto result = op ? (U ? ir.FPVectorToUnsignedFixed(esize, reg_m, 0, FP::RoundingMode::TowardsZero, false) + : ir.FPVectorToSignedFixed(esize, reg_m, 0, FP::RoundingMode::TowardsZero, false)) + : (U ? 
ir.FPVectorFromUnsignedFixed(esize, reg_m, 0, FP::RoundingMode::ToNearest_TieEven, false) + : ir.FPVectorFromSignedFixed(esize, reg_m, 0, FP::RoundingMode::ToNearest_TieEven, false)); + + ir.SetVector(d, result); + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_scalar.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_scalar.cpp new file mode 100644 index 0000000000..8d1876a05d --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_scalar.cpp @@ -0,0 +1,188 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2020 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <utility> + +#include <mcl/assert.hpp> +#include <mcl/bit/bit_field.hpp> + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { +namespace { +std::pair<ExtReg, size_t> GetScalarLocation(size_t esize, bool M, size_t Vm) { + const ExtReg m = ExtReg::Q0 + ((Vm >> 1) & (esize == 16 ? 0b11 : 0b111)); + const size_t index = concatenate(Imm<1>{mcl::bit::get_bit<0>(Vm)}, Imm<1>{M}, Imm<1>{mcl::bit::get_bit<3>(Vm)}).ZeroExtend() >> (esize == 16 ? 0 : 1); + return std::make_pair(m, index); +} + +enum class MultiplyBehavior { + Multiply, + MultiplyAccumulate, + MultiplySubtract, +}; + +enum class Rounding { + None, + Round, +}; + +bool ScalarMultiply(TranslatorVisitor& v, bool Q, bool D, size_t sz, size_t Vn, size_t Vd, bool F, bool N, bool M, size_t Vm, MultiplyBehavior multiply) { + if (sz == 0b11) { + return v.DecodeError(); + } + + if (sz == 0b00 || (F && sz == 0b01)) { + return v.UndefinedInstruction(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn))) { + return v.UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(Q, Vd, D); + const auto n = ToVector(Q, Vn, N); + const auto [m, index] = GetScalarLocation(esize, M, Vm); + + const auto reg_n = v.ir.GetVector(n); + const auto reg_m = v.ir.VectorBroadcastElement(esize, v.ir.GetVector(m), index); + const auto addend = F ? v.ir.FPVectorMul(esize, reg_n, reg_m, false) + : v.ir.VectorMultiply(esize, reg_n, reg_m); + const auto result = [&] { + switch (multiply) { + case MultiplyBehavior::Multiply: + return addend; + case MultiplyBehavior::MultiplyAccumulate: + return F ? v.ir.FPVectorAdd(esize, v.ir.GetVector(d), addend, false) + : v.ir.VectorAdd(esize, v.ir.GetVector(d), addend); + case MultiplyBehavior::MultiplySubtract: + return F ? v.ir.FPVectorSub(esize, v.ir.GetVector(d), addend, false) + : v.ir.VectorSub(esize, v.ir.GetVector(d), addend); + default: + return IR::U128{}; + } + }(); + + v.ir.SetVector(d, result); + return true; +} + +bool ScalarMultiplyLong(TranslatorVisitor& v, bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm, MultiplyBehavior multiply) { + if (sz == 0b11) { + return v.DecodeError(); + } + + if (sz == 0b00 || mcl::bit::get_bit<0>(Vd)) { + return v.UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(true, Vd, D); + const auto n = ToVector(false, Vn, N); + const auto [m, index] = GetScalarLocation(esize, M, Vm); + + const auto scalar = v.ir.VectorGetElement(esize, v.ir.GetVector(m), index); + const auto reg_n = v.ir.GetVector(n); + const auto reg_m = v.ir.VectorBroadcast(esize, scalar); + const auto addend = U ? 
v.ir.VectorMultiplyUnsignedWiden(esize, reg_n, reg_m) + : v.ir.VectorMultiplySignedWiden(esize, reg_n, reg_m); + const auto result = [&] { + switch (multiply) { + case MultiplyBehavior::Multiply: + return addend; + case MultiplyBehavior::MultiplyAccumulate: + return v.ir.VectorAdd(esize * 2, v.ir.GetVector(d), addend); + case MultiplyBehavior::MultiplySubtract: + return v.ir.VectorSub(esize * 2, v.ir.GetVector(d), addend); + default: + return IR::U128{}; + } + }(); + + v.ir.SetVector(d, result); + return true; +} + +bool ScalarMultiplyDoublingReturnHigh(TranslatorVisitor& v, bool Q, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm, Rounding round) { + if (sz == 0b11) { + return v.DecodeError(); + } + + if (sz == 0b00) { + return v.UndefinedInstruction(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vn))) { + return v.UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(Q, Vd, D); + const auto n = ToVector(Q, Vn, N); + const auto [m, index] = GetScalarLocation(esize, M, Vm); + + const auto reg_n = v.ir.GetVector(n); + const auto reg_m = v.ir.VectorBroadcastElement(esize, v.ir.GetVector(m), index); + const auto result = round == Rounding::None + ? v.ir.VectorSignedSaturatedDoublingMultiplyHigh(esize, reg_n, reg_m) + : v.ir.VectorSignedSaturatedDoublingMultiplyHighRounding(esize, reg_n, reg_m); + + v.ir.SetVector(d, result); + return true; +} +} // Anonymous namespace + +bool TranslatorVisitor::asimd_VMLA_scalar(bool Q, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool F, bool N, bool M, size_t Vm) { + const auto behavior = op ? MultiplyBehavior::MultiplySubtract + : MultiplyBehavior::MultiplyAccumulate; + return ScalarMultiply(*this, Q, D, sz, Vn, Vd, F, N, M, Vm, behavior); +} + +bool TranslatorVisitor::asimd_VMLAL_scalar(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool N, bool M, size_t Vm) { + const auto behavior = op ? 
MultiplyBehavior::MultiplySubtract + : MultiplyBehavior::MultiplyAccumulate; + return ScalarMultiplyLong(*this, U, D, sz, Vn, Vd, N, M, Vm, behavior); +} + +bool TranslatorVisitor::asimd_VMUL_scalar(bool Q, bool D, size_t sz, size_t Vn, size_t Vd, bool F, bool N, bool M, size_t Vm) { + return ScalarMultiply(*this, Q, D, sz, Vn, Vd, F, N, M, Vm, MultiplyBehavior::Multiply); +} + +bool TranslatorVisitor::asimd_VMULL_scalar(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm) { + return ScalarMultiplyLong(*this, U, D, sz, Vn, Vd, N, M, Vm, MultiplyBehavior::Multiply); +} + +bool TranslatorVisitor::asimd_VQDMULL_scalar(bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm) { + if (sz == 0b11) { + return DecodeError(); + } + + if (sz == 0b00 || mcl::bit::get_bit<0>(Vd)) { + return UndefinedInstruction(); + } + + const size_t esize = 8U << sz; + const auto d = ToVector(true, Vd, D); + const auto n = ToVector(false, Vn, N); + const auto [m, index] = GetScalarLocation(esize, M, Vm); + + const auto reg_n = ir.GetVector(n); + const auto reg_m = ir.VectorBroadcastElement(esize, ir.GetVector(m), index); + const auto result = ir.VectorSignedSaturatedDoublingMultiplyLong(esize, reg_n, reg_m); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VQDMULH_scalar(bool Q, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm) { + return ScalarMultiplyDoublingReturnHigh(*this, Q, D, sz, Vn, Vd, N, M, Vm, Rounding::None); +} + +bool TranslatorVisitor::asimd_VQRDMULH_scalar(bool Q, bool D, size_t sz, size_t Vn, size_t Vd, bool N, bool M, size_t Vm) { + return ScalarMultiplyDoublingReturnHigh(*this, Q, D, sz, Vn, Vd, N, M, Vm, Rounding::Round); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_shift.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_shift.cpp new file mode 100644 index 0000000000..b7300f91e4 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_shift.cpp @@ -0,0 +1,349 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2020 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/assert.hpp> +#include <mcl/bit/bit_count.hpp> +#include <mcl/bit/bit_field.hpp> + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { +namespace { +enum class Accumulating { + None, + Accumulate +}; + +enum class Rounding { + None, + Round, +}; + +enum class Narrowing { + Truncation, + SaturateToUnsigned, + SaturateToSigned, +}; + +enum class Signedness { + Signed, + Unsigned +}; + +IR::U128 PerformRoundingCorrection(TranslatorVisitor& v, size_t esize, u64 round_value, IR::U128 original, IR::U128 shifted) { + const auto round_const = v.ir.VectorBroadcast(esize, v.I(esize, round_value)); + const auto round_correction = v.ir.VectorEqual(esize, v.ir.VectorAnd(original, round_const), round_const); + return v.ir.VectorSub(esize, shifted, round_correction); +} + +std::pair<size_t, size_t> ElementSizeAndShiftAmount(bool right_shift, bool L, size_t imm6) { + if (right_shift) { + if (L) { + return {64, 64 - imm6}; + } + + const size_t esize = 8U << mcl::bit::highest_set_bit(imm6 >> 3); + const size_t shift_amount = (esize * 2) - imm6; + return {esize, shift_amount}; + } else { + if (L) { + return {64, imm6}; + } + + const size_t esize = 8U << mcl::bit::highest_set_bit(imm6 >> 3); + const size_t shift_amount = imm6 - esize; + return {esize, shift_amount}; + } +} + +bool ShiftRight(TranslatorVisitor& v, bool U, bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm, Accumulating accumulate, Rounding rounding) { + if (!L && mcl::bit::get_bits<3, 5>(imm6) == 0) { + return v.DecodeError(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return v.UndefinedInstruction(); + } + + const auto [esize, shift_amount] = ElementSizeAndShiftAmount(true, L, imm6); + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + + const auto reg_m = v.ir.GetVector(m); + auto result = U ? 
v.ir.VectorLogicalShiftRight(esize, reg_m, static_cast<u8>(shift_amount)) + : v.ir.VectorArithmeticShiftRight(esize, reg_m, static_cast<u8>(shift_amount)); + + if (rounding == Rounding::Round) { + const u64 round_value = 1ULL << (shift_amount - 1); + result = PerformRoundingCorrection(v, esize, round_value, reg_m, result); + } + + if (accumulate == Accumulating::Accumulate) { + const auto reg_d = v.ir.GetVector(d); + result = v.ir.VectorAdd(esize, result, reg_d); + } + + v.ir.SetVector(d, result); + return true; +} + +bool ShiftRightNarrowing(TranslatorVisitor& v, bool D, size_t imm6, size_t Vd, bool M, size_t Vm, Rounding rounding, Narrowing narrowing, Signedness signedness) { + if (mcl::bit::get_bits<3, 5>(imm6) == 0) { + return v.DecodeError(); + } + + if (mcl::bit::get_bit<0>(Vm)) { + return v.UndefinedInstruction(); + } + + const auto [esize, shift_amount_] = ElementSizeAndShiftAmount(true, false, imm6); + const auto source_esize = 2 * esize; + const auto shift_amount = static_cast<u8>(shift_amount_); + + const auto d = ToVector(false, Vd, D); + const auto m = ToVector(true, Vm, M); + + const auto reg_m = v.ir.GetVector(m); + auto wide_result = [&] { + if (signedness == Signedness::Signed) { + return v.ir.VectorArithmeticShiftRight(source_esize, reg_m, shift_amount); + } + return v.ir.VectorLogicalShiftRight(source_esize, reg_m, shift_amount); + }(); + + if (rounding == Rounding::Round) { + const u64 round_value = 1ULL << (shift_amount - 1); + wide_result = PerformRoundingCorrection(v, source_esize, round_value, reg_m, wide_result); + } + + const auto result = [&] { + switch (narrowing) { + case Narrowing::Truncation: + return v.ir.VectorNarrow(source_esize, wide_result); + case Narrowing::SaturateToUnsigned: + if (signedness == Signedness::Signed) { + return v.ir.VectorSignedSaturatedNarrowToUnsigned(source_esize, wide_result); + } + return v.ir.VectorUnsignedSaturatedNarrow(source_esize, wide_result); + case Narrowing::SaturateToSigned: + ASSERT(signedness == Signedness::Signed); + return v.ir.VectorSignedSaturatedNarrowToSigned(source_esize, wide_result); + } + UNREACHABLE(); + }(); + + v.ir.SetVector(d, result); + return true; +} +} // Anonymous namespace + +bool TranslatorVisitor::asimd_SHR(bool U, bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm) { + return ShiftRight(*this, U, D, imm6, Vd, L, Q, M, Vm, + Accumulating::None, Rounding::None); +} + +bool TranslatorVisitor::asimd_SRA(bool U, bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm) { + return ShiftRight(*this, U, D, imm6, Vd, L, Q, M, Vm, + Accumulating::Accumulate, Rounding::None); +} + +bool TranslatorVisitor::asimd_VRSHR(bool U, bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm) { + return ShiftRight(*this, U, D, imm6, Vd, L, Q, M, Vm, + Accumulating::None, Rounding::Round); +} + +bool TranslatorVisitor::asimd_VRSRA(bool U, bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm) { + return ShiftRight(*this, U, D, imm6, Vd, L, Q, M, Vm, + Accumulating::Accumulate, Rounding::Round); +} + +bool TranslatorVisitor::asimd_VSRI(bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm) { + if (!L && mcl::bit::get_bits<3, 5>(imm6) == 0) { + return DecodeError(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + const auto [esize, shift_amount] = ElementSizeAndShiftAmount(true, L, imm6); + const u64 mask = shift_amount == esize ? 
0 : mcl::bit::ones<u64>(esize) >> shift_amount; + + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + + const auto reg_m = ir.GetVector(m); + const auto reg_d = ir.GetVector(d); + + const auto shifted = ir.VectorLogicalShiftRight(esize, reg_m, static_cast<u8>(shift_amount)); + const auto mask_vec = ir.VectorBroadcast(esize, I(esize, mask)); + const auto result = ir.VectorOr(ir.VectorAndNot(reg_d, mask_vec), shifted); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VSLI(bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm) { + if (!L && mcl::bit::get_bits<3, 5>(imm6) == 0) { + return DecodeError(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + const auto [esize, shift_amount] = ElementSizeAndShiftAmount(false, L, imm6); + const u64 mask = mcl::bit::ones<u64>(esize) << shift_amount; + + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + + const auto reg_m = ir.GetVector(m); + const auto reg_d = ir.GetVector(d); + + const auto shifted = ir.VectorLogicalShiftLeft(esize, reg_m, static_cast<u8>(shift_amount)); + const auto mask_vec = ir.VectorBroadcast(esize, I(esize, mask)); + const auto result = ir.VectorOr(ir.VectorAndNot(reg_d, mask_vec), shifted); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VQSHL(bool U, bool D, size_t imm6, size_t Vd, bool op, bool L, bool Q, bool M, size_t Vm) { + if (!L && mcl::bit::get_bits<3, 5>(imm6) == 0) { + return DecodeError(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + if (!U && !op) { + return UndefinedInstruction(); + } + + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + const auto result = [&] { + const auto reg_m = ir.GetVector(m); + const auto [esize, shift_amount] = ElementSizeAndShiftAmount(false, L, imm6); + const IR::U128 shift_vec = ir.VectorBroadcast(esize, I(esize, shift_amount)); + + if (U) { + if (op) { + return ir.VectorUnsignedSaturatedShiftLeft(esize, reg_m, shift_vec); + } + + return ir.VectorSignedSaturatedShiftLeftUnsigned(esize, reg_m, static_cast<u8>(shift_amount)); + } + if (op) { + return ir.VectorSignedSaturatedShiftLeft(esize, reg_m, shift_vec); + } + + return IR::U128{}; + }(); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VSHL(bool D, size_t imm6, size_t Vd, bool L, bool Q, bool M, size_t Vm) { + if (!L && mcl::bit::get_bits<3, 5>(imm6) == 0) { + return DecodeError(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + const auto [esize, shift_amount] = ElementSizeAndShiftAmount(false, L, imm6); + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + + const auto reg_m = ir.GetVector(m); + const auto result = ir.VectorLogicalShiftLeft(esize, reg_m, static_cast<u8>(shift_amount)); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VSHRN(bool D, size_t imm6, size_t Vd, bool M, size_t Vm) { + return ShiftRightNarrowing(*this, D, imm6, Vd, M, Vm, + Rounding::None, Narrowing::Truncation, Signedness::Unsigned); +} + +bool TranslatorVisitor::asimd_VRSHRN(bool D, size_t imm6, size_t Vd, bool M, size_t Vm) { + return ShiftRightNarrowing(*this, D, imm6, Vd, M, Vm, + Rounding::Round, Narrowing::Truncation, Signedness::Unsigned); +} + +bool TranslatorVisitor::asimd_VQRSHRUN(bool D, size_t imm6, size_t Vd, bool 
M, size_t Vm) { + return ShiftRightNarrowing(*this, D, imm6, Vd, M, Vm, + Rounding::Round, Narrowing::SaturateToUnsigned, Signedness::Signed); +} + +bool TranslatorVisitor::asimd_VQSHRUN(bool D, size_t imm6, size_t Vd, bool M, size_t Vm) { + return ShiftRightNarrowing(*this, D, imm6, Vd, M, Vm, + Rounding::None, Narrowing::SaturateToUnsigned, Signedness::Signed); +} + +bool TranslatorVisitor::asimd_VQSHRN(bool U, bool D, size_t imm6, size_t Vd, bool M, size_t Vm) { + return ShiftRightNarrowing(*this, D, imm6, Vd, M, Vm, + Rounding::None, U ? Narrowing::SaturateToUnsigned : Narrowing::SaturateToSigned, U ? Signedness::Unsigned : Signedness::Signed); +} + +bool TranslatorVisitor::asimd_VQRSHRN(bool U, bool D, size_t imm6, size_t Vd, bool M, size_t Vm) { + return ShiftRightNarrowing(*this, D, imm6, Vd, M, Vm, + Rounding::Round, U ? Narrowing::SaturateToUnsigned : Narrowing::SaturateToSigned, U ? Signedness::Unsigned : Signedness::Signed); +} + +bool TranslatorVisitor::asimd_VSHLL(bool U, bool D, size_t imm6, size_t Vd, bool M, size_t Vm) { + if (mcl::bit::get_bits<3, 5>(imm6) == 0) { + return DecodeError(); + } + + if (mcl::bit::get_bit<0>(Vd)) { + return UndefinedInstruction(); + } + + const auto [esize, shift_amount] = ElementSizeAndShiftAmount(false, false, imm6); + + const auto d = ToVector(true, Vd, D); + const auto m = ToVector(false, Vm, M); + + const auto reg_m = ir.GetVector(m); + const auto ext_vec = U ? ir.VectorZeroExtend(esize, reg_m) : ir.VectorSignExtend(esize, reg_m); + const auto result = ir.VectorLogicalShiftLeft(esize * 2, ext_vec, static_cast<u8>(shift_amount)); + + ir.SetVector(d, result); + return true; +} + +bool TranslatorVisitor::asimd_VCVT_fixed(bool U, bool D, size_t imm6, size_t Vd, bool to_fixed, bool Q, bool M, size_t Vm) { + if (mcl::bit::get_bits<3, 5>(imm6) == 0) { + return DecodeError(); + } + + if (Q && (mcl::bit::get_bit<0>(Vd) || mcl::bit::get_bit<0>(Vm))) { + return UndefinedInstruction(); + } + + if (!mcl::bit::get_bit<5>(imm6)) { + return UndefinedInstruction(); + } + + const size_t fbits = 64 - imm6; + const auto d = ToVector(Q, Vd, D); + const auto m = ToVector(Q, Vm, M); + + const auto reg_m = ir.GetVector(m); + const auto result = to_fixed ? (U ? ir.FPVectorToUnsignedFixed(32, reg_m, fbits, FP::RoundingMode::TowardsZero, false) : ir.FPVectorToSignedFixed(32, reg_m, fbits, FP::RoundingMode::TowardsZero, false)) + : (U ? ir.FPVectorFromUnsignedFixed(32, reg_m, fbits, FP::RoundingMode::ToNearest_TieEven, false) : ir.FPVectorFromSignedFixed(32, reg_m, fbits, FP::RoundingMode::ToNearest_TieEven, false)); + + ir.SetVector(d, result); + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/barrier.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/barrier.cpp new file mode 100644 index 0000000000..7d9e7459f1 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/barrier.cpp @@ -0,0 +1,27 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2019 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +bool TranslatorVisitor::arm_DMB(Imm<4> /*option*/) { + ir.DataMemoryBarrier(); + return true; +} + +bool TranslatorVisitor::arm_DSB(Imm<4> /*option*/) { + ir.DataSynchronizationBarrier(); + return true; +} + +bool TranslatorVisitor::arm_ISB(Imm<4> /*option*/) { + ir.InstructionSynchronizationBarrier(); + ir.BranchWritePC(ir.Imm32(ir.current_location.PC() + 4)); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/coprocessor.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/coprocessor.cpp new file mode 100644 index 0000000000..29748a6eba --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/coprocessor.cpp @@ -0,0 +1,166 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +// CDP{2} <coproc_no>, #<opc1>, <CRd>, <CRn>, <CRm>, #<opc2> +bool TranslatorVisitor::arm_CDP(Cond cond, size_t opc1, CoprocReg CRn, CoprocReg CRd, size_t coproc_no, size_t opc2, CoprocReg CRm) { + if ((coproc_no & 0b1110) == 0b1010) { + return arm_UDF(); + } + + const bool two = cond == Cond::NV; + + if (two || ArmConditionPassed(cond)) { + ir.CoprocInternalOperation(coproc_no, two, opc1, CRd, CRn, CRm, opc2); + } + return true; +} + +// LDC{2}{L}<c> <coproc_no>, <CRd>, [<Rn>, #+/-<imm32>]{!} +// LDC{2}{L}<c> <coproc_no>, <CRd>, [<Rn>], #+/-<imm32> +// LDC{2}{L}<c> <coproc_no>, <CRd>, [<Rn>], <imm8> +bool TranslatorVisitor::arm_LDC(Cond cond, bool p, bool u, bool d, bool w, Reg n, CoprocReg CRd, size_t coproc_no, Imm<8> imm8) { + if (!p && !u && !d && !w) { + return arm_UDF(); + } + + if ((coproc_no & 0b1110) == 0b1010) { + return arm_UDF(); + } + + const bool two = cond == Cond::NV; + + if (two || ArmConditionPassed(cond)) { + const u32 imm32 = imm8.ZeroExtend() << 2; + const bool index = p; + const bool add = u; + const bool wback = w; + const bool has_option = !p && !w && u; + const IR::U32 reg_n = ir.GetRegister(n); + const IR::U32 offset_address = add ? ir.Add(reg_n, ir.Imm32(imm32)) : ir.Sub(reg_n, ir.Imm32(imm32)); + const IR::U32 address = index ? 
offset_address : reg_n; + ir.CoprocLoadWords(coproc_no, two, d, CRd, address, has_option, imm8.ZeroExtend<u8>()); + if (wback) { + ir.SetRegister(n, offset_address); + } + } + return true; +} + +// MCR{2}<c> <coproc_no>, #<opc1>, <Rt>, <CRn>, <CRm>, #<opc2> +bool TranslatorVisitor::arm_MCR(Cond cond, size_t opc1, CoprocReg CRn, Reg t, size_t coproc_no, size_t opc2, CoprocReg CRm) { + if ((coproc_no & 0b1110) == 0b1010) { + return arm_UDF(); + } + + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + + const bool two = cond == Cond::NV; + + if (two || ArmConditionPassed(cond)) { + ir.CoprocSendOneWord(coproc_no, two, opc1, CRn, CRm, opc2, ir.GetRegister(t)); + } + return true; +} + +// MCRR{2}<c> <coproc_no>, #<opc>, <Rt>, <Rt2>, <CRm> +bool TranslatorVisitor::arm_MCRR(Cond cond, Reg t2, Reg t, size_t coproc_no, size_t opc, CoprocReg CRm) { + if ((coproc_no & 0b1110) == 0b1010) { + return arm_UDF(); + } + + if (t == Reg::PC || t2 == Reg::PC) { + return UnpredictableInstruction(); + } + + const bool two = cond == Cond::NV; + + if (two || ArmConditionPassed(cond)) { + ir.CoprocSendTwoWords(coproc_no, two, opc, CRm, ir.GetRegister(t), ir.GetRegister(t2)); + } + return true; +} + +// MRC{2}<c> <coproc_no>, #<opc1>, <Rt>, <CRn>, <CRm>, #<opc2> +bool TranslatorVisitor::arm_MRC(Cond cond, size_t opc1, CoprocReg CRn, Reg t, size_t coproc_no, size_t opc2, CoprocReg CRm) { + if ((coproc_no & 0b1110) == 0b1010) { + return arm_UDF(); + } + + const bool two = cond == Cond::NV; + + if (two || ArmConditionPassed(cond)) { + const auto word = ir.CoprocGetOneWord(coproc_no, two, opc1, CRn, CRm, opc2); + if (t != Reg::PC) { + ir.SetRegister(t, word); + } else { + const auto new_cpsr_nzcv = ir.And(word, ir.Imm32(0xF0000000)); + ir.SetCpsrNZCVRaw(new_cpsr_nzcv); + } + } + return true; +} + +// MRRC{2}<c> <coproc_no>, #<opc>, <Rt>, <Rt2>, <CRm> +bool TranslatorVisitor::arm_MRRC(Cond cond, Reg t2, Reg t, size_t coproc_no, size_t opc, CoprocReg CRm) { + if ((coproc_no & 0b1110) == 0b1010) { + return arm_UDF(); + } + + if (t == Reg::PC || t2 == Reg::PC || t == t2) { + return UnpredictableInstruction(); + } + + const bool two = cond == Cond::NV; + + if (two || ArmConditionPassed(cond)) { + const auto two_words = ir.CoprocGetTwoWords(coproc_no, two, opc, CRm); + ir.SetRegister(t, ir.LeastSignificantWord(two_words)); + ir.SetRegister(t2, ir.MostSignificantWord(two_words).result); + } + return true; +} + +// STC{2}{L}<c> <coproc>, <CRd>, [<Rn>, #+/-<imm32>]{!} +// STC{2}{L}<c> <coproc>, <CRd>, [<Rn>], #+/-<imm32> +// STC{2}{L}<c> <coproc>, <CRd>, [<Rn>], <imm8> +bool TranslatorVisitor::arm_STC(Cond cond, bool p, bool u, bool d, bool w, Reg n, CoprocReg CRd, size_t coproc_no, Imm<8> imm8) { + if ((coproc_no & 0b1110) == 0b1010) { + return arm_UDF(); + } + + if (!p && !u && !d && !w) { + return arm_UDF(); + } + + if (n == Reg::PC && w) { + return UnpredictableInstruction(); + } + + const bool two = cond == Cond::NV; + + if (two || ArmConditionPassed(cond)) { + const u32 imm32 = imm8.ZeroExtend() << 2; + const bool index = p; + const bool add = u; + const bool wback = w; + const bool has_option = !p && !w && u; + const IR::U32 reg_n = ir.GetRegister(n); + const IR::U32 offset_address = add ? ir.Add(reg_n, ir.Imm32(imm32)) : ir.Sub(reg_n, ir.Imm32(imm32)); + const IR::U32 address = index ? 
offset_address : reg_n; + ir.CoprocStoreWords(coproc_no, two, d, CRd, address, has_option, imm8.ZeroExtend<u8>()); + if (wback) { + ir.SetRegister(n, offset_address); + } + } + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/data_processing.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/data_processing.cpp new file mode 100644 index 0000000000..9d7ea6a170 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/data_processing.cpp @@ -0,0 +1,1116 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +// ADC{S}<c> <Rd>, <Rn>, #<imm> +bool TranslatorVisitor::arm_ADC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const u32 imm32 = ArmExpandImm(rotate, imm8); + const auto result = ir.AddWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.GetCFlag()); + if (d == Reg::PC) { + if (S) { + // This is UNPREDICTABLE when in user-mode. + return UnpredictableInstruction(); + } + + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; + } + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + return true; +} + +// ADC{S}<c> <Rd>, <Rn>, <Rm>{, <shift>} +bool TranslatorVisitor::arm_ADC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, ir.GetCFlag()); + const auto result = ir.AddWithCarry(ir.GetRegister(n), shifted.result, ir.GetCFlag()); + if (d == Reg::PC) { + if (S) { + // This is UNPREDICTABLE when in user-mode. + return UnpredictableInstruction(); + } + + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; + } + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + + return true; +} + +// ADC{S}<c> <Rd>, <Rn>, <Rm>, <type> <Rs> +bool TranslatorVisitor::arm_ADC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC || s == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s)); + const auto carry_in = ir.GetCFlag(); + const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in); + const auto result = ir.AddWithCarry(ir.GetRegister(n), shifted.result, ir.GetCFlag()); + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + + return true; +} + +// ADD{S}<c> <Rd>, <Rn>, #<const> +bool TranslatorVisitor::arm_ADD_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const u32 imm32 = ArmExpandImm(rotate, imm8); + const auto result = ir.AddWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(0)); + if (d == Reg::PC) { + if (S) { + // This is UNPREDICTABLE when in user-mode. 
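+            // (Flag-setting writes to the PC imply an SPSR-to-CPSR copy, and user mode has no SPSR.)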
+ return UnpredictableInstruction(); + } + + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; + } + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + + return true; +} + +// ADD{S}<c> <Rd>, <Rn>, <Rm>{, <shift>} +bool TranslatorVisitor::arm_ADD_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, ir.GetCFlag()); + const auto result = ir.AddWithCarry(ir.GetRegister(n), shifted.result, ir.Imm1(0)); + if (d == Reg::PC) { + if (S) { + // This is UNPREDICTABLE when in user-mode. + return UnpredictableInstruction(); + } + + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; + } + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + + return true; +} + +// ADD{S}<c> <Rd>, <Rn>, <Rm>, <type> <Rs> +bool TranslatorVisitor::arm_ADD_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC || s == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s)); + const auto carry_in = ir.GetCFlag(); + const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in); + const auto result = ir.AddWithCarry(ir.GetRegister(n), shifted.result, ir.Imm1(0)); + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + + return true; +} + +// AND{S}<c> <Rd>, <Rn>, #<const> +bool TranslatorVisitor::arm_AND_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto imm_carry = ArmExpandImm_C(rotate, imm8, ir.GetCFlag()); + const auto result = ir.And(ir.GetRegister(n), ir.Imm32(imm_carry.imm32)); + if (d == Reg::PC) { + if (S) { + // This is UNPREDICTABLE when in user-mode. + return UnpredictableInstruction(); + } + + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; + } + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry); + } + + return true; +} + +// AND{S}<c> <Rd>, <Rn>, <Rm>{, <shift>} +bool TranslatorVisitor::arm_AND_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto carry_in = ir.GetCFlag(); + const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, carry_in); + const auto result = ir.And(ir.GetRegister(n), shifted.result); + if (d == Reg::PC) { + if (S) { + // This is UNPREDICTABLE when in user-mode. 
+ return UnpredictableInstruction(); + } + + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; + } + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + } + + return true; +} + +// AND{S}<c> <Rd>, <Rn>, <Rm>, <type> <Rs> +bool TranslatorVisitor::arm_AND_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC || s == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s)); + const auto carry_in = ir.GetCFlag(); + const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in); + const auto result = ir.And(ir.GetRegister(n), shifted.result); + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + } + + return true; +} + +// BIC{S}<c> <Rd>, <Rn>, #<const> +bool TranslatorVisitor::arm_BIC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto imm_carry = ArmExpandImm_C(rotate, imm8, ir.GetCFlag()); + const auto result = ir.AndNot(ir.GetRegister(n), ir.Imm32(imm_carry.imm32)); + if (d == Reg::PC) { + if (S) { + // This is UNPREDICTABLE when in user-mode. + return UnpredictableInstruction(); + } + + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; + } + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry); + } + + return true; +} + +// BIC{S}<c> <Rd>, <Rn>, <Rm>{, <shift>} +bool TranslatorVisitor::arm_BIC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto carry_in = ir.GetCFlag(); + const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, carry_in); + const auto result = ir.AndNot(ir.GetRegister(n), shifted.result); + if (d == Reg::PC) { + if (S) { + // This is UNPREDICTABLE when in user-mode. 
+ return UnpredictableInstruction(); + } + + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; + } + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + } + + return true; +} + +// BIC{S}<c> <Rd>, <Rn>, <Rm>, <type> <Rs> +bool TranslatorVisitor::arm_BIC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC || s == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s)); + const auto carry_in = ir.GetCFlag(); + const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in); + const auto result = ir.AndNot(ir.GetRegister(n), shifted.result); + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + } + + return true; +} + +// CMN<c> <Rn>, #<const> +bool TranslatorVisitor::arm_CMN_imm(Cond cond, Reg n, int rotate, Imm<8> imm8) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const u32 imm32 = ArmExpandImm(rotate, imm8); + const auto result = ir.AddWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(0)); + + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + return true; +} + +// CMN<c> <Rn>, <Rm>{, <shift>} +bool TranslatorVisitor::arm_CMN_reg(Cond cond, Reg n, Imm<5> imm5, ShiftType shift, Reg m) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, ir.GetCFlag()); + const auto result = ir.AddWithCarry(ir.GetRegister(n), shifted.result, ir.Imm1(0)); + + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + return true; +} + +// CMN<c> <Rn>, <Rm>, <type> <Rs> +bool TranslatorVisitor::arm_CMN_rsr(Cond cond, Reg n, Reg s, ShiftType shift, Reg m) { + if (n == Reg::PC || m == Reg::PC || s == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s)); + const auto carry_in = ir.GetCFlag(); + const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in); + const auto result = ir.AddWithCarry(ir.GetRegister(n), shifted.result, ir.Imm1(0)); + + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + return true; +} + +// CMP<c> <Rn>, #<imm> +bool TranslatorVisitor::arm_CMP_imm(Cond cond, Reg n, int rotate, Imm<8> imm8) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const u32 imm32 = ArmExpandImm(rotate, imm8); + const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(1)); + + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + return true; +} + +// CMP<c> <Rn>, <Rm>{, <shift>} +bool TranslatorVisitor::arm_CMP_reg(Cond cond, Reg n, Imm<5> imm5, ShiftType shift, Reg m) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, ir.GetCFlag()); + const auto result = ir.SubWithCarry(ir.GetRegister(n), shifted.result, ir.Imm1(1)); + + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + return true; +} + +// CMP<c> <Rn>, <Rm>, <type> <Rs> +bool TranslatorVisitor::arm_CMP_rsr(Cond cond, Reg n, Reg s, ShiftType shift, Reg m) { + if (n == Reg::PC || m == Reg::PC || s == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s)); + const auto carry_in = ir.GetCFlag(); + const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, 
carry_in); + const auto result = ir.SubWithCarry(ir.GetRegister(n), shifted.result, ir.Imm1(1)); + + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + return true; +} + +// EOR{S}<c> <Rd>, <Rn>, #<const> +bool TranslatorVisitor::arm_EOR_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto imm_carry = ArmExpandImm_C(rotate, imm8, ir.GetCFlag()); + const auto result = ir.Eor(ir.GetRegister(n), ir.Imm32(imm_carry.imm32)); + if (d == Reg::PC) { + if (S) { + // This is UNPREDICTABLE when in user-mode. + return UnpredictableInstruction(); + } + + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; + } + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry); + } + + return true; +} + +// EOR{S}<c> <Rd>, <Rn>, <Rm>{, <shift>} +bool TranslatorVisitor::arm_EOR_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto carry_in = ir.GetCFlag(); + const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, carry_in); + const auto result = ir.Eor(ir.GetRegister(n), shifted.result); + if (d == Reg::PC) { + if (S) { + // This is UNPREDICTABLE when in user-mode. + return UnpredictableInstruction(); + } + + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; + } + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + } + + return true; +} + +// EOR{S}<c> <Rd>, <Rn>, <Rm>, <type> <Rs> +bool TranslatorVisitor::arm_EOR_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC || s == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s)); + const auto carry_in = ir.GetCFlag(); + const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in); + const auto result = ir.Eor(ir.GetRegister(n), shifted.result); + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + } + + return true; +} + +// MOV{S}<c> <Rd>, #<const> +bool TranslatorVisitor::arm_MOV_imm(Cond cond, bool S, Reg d, int rotate, Imm<8> imm8) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto imm_carry = ArmExpandImm_C(rotate, imm8, ir.GetCFlag()); + const auto result = ir.Imm32(imm_carry.imm32); + if (d == Reg::PC) { + if (S) { + // This is UNPREDICTABLE when in user-mode. + return UnpredictableInstruction(); + } + + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; + } + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry); + } + + return true; +} + +// MOV{S}<c> <Rd>, <Rm> +bool TranslatorVisitor::arm_MOV_reg(Cond cond, bool S, Reg d, Imm<5> imm5, ShiftType shift, Reg m) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto carry_in = ir.GetCFlag(); + const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, carry_in); + const auto result = shifted.result; + if (d == Reg::PC) { + if (S) { + // This is UNPREDICTABLE when in user-mode. 
+ return UnpredictableInstruction(); + } + + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; + } + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + } + + return true; +} + +bool TranslatorVisitor::arm_MOV_rsr(Cond cond, bool S, Reg d, Reg s, ShiftType shift, Reg m) { + if (d == Reg::PC || m == Reg::PC || s == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s)); + const auto carry_in = ir.GetCFlag(); + const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in); + const auto result = shifted.result; + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + } + + return true; +} + +// MVN{S}<c> <Rd>, #<const> +bool TranslatorVisitor::arm_MVN_imm(Cond cond, bool S, Reg d, int rotate, Imm<8> imm8) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto imm_carry = ArmExpandImm_C(rotate, imm8, ir.GetCFlag()); + const auto result = ir.Not(ir.Imm32(imm_carry.imm32)); + if (d == Reg::PC) { + if (S) { + // This is UNPREDICTABLE when in user-mode. + return UnpredictableInstruction(); + } + + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; + } + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry); + } + + return true; +} + +// MVN{S}<c> <Rd>, <Rm>{, <shift>} +bool TranslatorVisitor::arm_MVN_reg(Cond cond, bool S, Reg d, Imm<5> imm5, ShiftType shift, Reg m) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto carry_in = ir.GetCFlag(); + const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, carry_in); + const auto result = ir.Not(shifted.result); + if (d == Reg::PC) { + if (S) { + // This is UNPREDICTABLE when in user-mode. + return UnpredictableInstruction(); + } + + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; + } + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + } + + return true; +} + +// MVN{S}<c> <Rd>, <Rm>, <type> <Rs> +bool TranslatorVisitor::arm_MVN_rsr(Cond cond, bool S, Reg d, Reg s, ShiftType shift, Reg m) { + if (d == Reg::PC || m == Reg::PC || s == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s)); + const auto carry_in = ir.GetCFlag(); + const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in); + const auto result = ir.Not(shifted.result); + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + } + + return true; +} + +// ORR{S}<c> <Rd>, <Rn>, #<const> +bool TranslatorVisitor::arm_ORR_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto imm_carry = ArmExpandImm_C(rotate, imm8, ir.GetCFlag()); + const auto result = ir.Or(ir.GetRegister(n), ir.Imm32(imm_carry.imm32)); + if (d == Reg::PC) { + if (S) { + // This is UNPREDICTABLE when in user-mode. 
+ return UnpredictableInstruction(); + } + + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; + } + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry); + } + + return true; +} + +// ORR{S}<c> <Rd>, <Rn>, <Rm>{, <shift>} +bool TranslatorVisitor::arm_ORR_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto carry_in = ir.GetCFlag(); + const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, carry_in); + const auto result = ir.Or(ir.GetRegister(n), shifted.result); + if (d == Reg::PC) { + if (S) { + // This is UNPREDICTABLE when in user-mode. + return UnpredictableInstruction(); + } + + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; + } + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + } + + return true; +} + +// ORR{S}<c> <Rd>, <Rn>, <Rm>, <type> <Rs> +bool TranslatorVisitor::arm_ORR_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) { + if (n == Reg::PC || m == Reg::PC || s == Reg::PC || d == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s)); + const auto carry_in = ir.GetCFlag(); + const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in); + const auto result = ir.Or(ir.GetRegister(n), shifted.result); + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + } + + return true; +} + +// RSB{S}<c> <Rd>, <Rn>, #<const> +bool TranslatorVisitor::arm_RSB_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const u32 imm32 = ArmExpandImm(rotate, imm8); + const auto result = ir.SubWithCarry(ir.Imm32(imm32), ir.GetRegister(n), ir.Imm1(1)); + if (d == Reg::PC) { + if (S) { + // This is UNPREDICTABLE when in user-mode. + return UnpredictableInstruction(); + } + + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; + } + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + + return true; +} + +// RSB{S}<c> <Rd>, <Rn>, <Rm>{, <shift>} +bool TranslatorVisitor::arm_RSB_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, ir.GetCFlag()); + const auto result = ir.SubWithCarry(shifted.result, ir.GetRegister(n), ir.Imm1(1)); + if (d == Reg::PC) { + if (S) { + // This is UNPREDICTABLE when in user-mode. 
+ return UnpredictableInstruction(); + } + + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; + } + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + + return true; +} + +// RSB{S}<c> <Rd>, <Rn>, <Rm>, <type> <Rs> +bool TranslatorVisitor::arm_RSB_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) { + if (n == Reg::PC || m == Reg::PC || s == Reg::PC || d == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s)); + const auto carry_in = ir.GetCFlag(); + const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in); + const auto result = ir.SubWithCarry(shifted.result, ir.GetRegister(n), ir.Imm1(1)); + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + + return true; +} + +// RSC{S}<c> <Rd>, <Rn>, #<const> +bool TranslatorVisitor::arm_RSC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const u32 imm32 = ArmExpandImm(rotate, imm8); + const auto result = ir.SubWithCarry(ir.Imm32(imm32), ir.GetRegister(n), ir.GetCFlag()); + if (d == Reg::PC) { + if (S) { + // This is UNPREDICTABLE when in user-mode. + return UnpredictableInstruction(); + } + + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; + } + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + + return true; +} + +bool TranslatorVisitor::arm_RSC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, ir.GetCFlag()); + const auto result = ir.SubWithCarry(shifted.result, ir.GetRegister(n), ir.GetCFlag()); + if (d == Reg::PC) { + if (S) { + // This is UNPREDICTABLE when in user-mode. + return UnpredictableInstruction(); + } + + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; + } + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + + return true; +} + +// RSC{S}<c> <Rd>, <Rn>, <Rm>{, <shift>} +bool TranslatorVisitor::arm_RSC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) { + if (n == Reg::PC || m == Reg::PC || s == Reg::PC || d == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s)); + const auto carry_in = ir.GetCFlag(); + const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in); + const auto result = ir.SubWithCarry(shifted.result, ir.GetRegister(n), ir.GetCFlag()); + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + + return true; +} + +// SBC{S}<c> <Rd>, <Rn>, #<const> +bool TranslatorVisitor::arm_SBC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const u32 imm32 = ArmExpandImm(rotate, imm8); + const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.GetCFlag()); + if (d == Reg::PC) { + if (S) { + // This is UNPREDICTABLE when in user-mode. 
+ return UnpredictableInstruction(); + } + + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; + } + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + + return true; +} + +// SBC{S}<c> <Rd>, <Rn>, <Rm>{, <shift>} +bool TranslatorVisitor::arm_SBC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, ir.GetCFlag()); + const auto result = ir.SubWithCarry(ir.GetRegister(n), shifted.result, ir.GetCFlag()); + if (d == Reg::PC) { + if (S) { + // This is UNPREDICTABLE when in user-mode. + return UnpredictableInstruction(); + } + + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; + } + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + + return true; +} + +// SBC{S}<c> <Rd>, <Rn>, <Rm>, <type> <Rs> +bool TranslatorVisitor::arm_SBC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) { + if (n == Reg::PC || m == Reg::PC || s == Reg::PC || d == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s)); + const auto carry_in = ir.GetCFlag(); + const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in); + const auto result = ir.SubWithCarry(ir.GetRegister(n), shifted.result, ir.GetCFlag()); + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + + return true; +} + +// SUB{S}<c> <Rd>, <Rn>, #<const> +bool TranslatorVisitor::arm_SUB_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm<8> imm8) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const u32 imm32 = ArmExpandImm(rotate, imm8); + const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(1)); + if (d == Reg::PC) { + if (S) { + // This is UNPREDICTABLE when in user-mode. + return UnpredictableInstruction(); + } + + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; + } + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + + return true; +} + +// SUB{S}<c> <Rd>, <Rn>, <Rm>{, <shift>} +bool TranslatorVisitor::arm_SUB_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5, ShiftType shift, Reg m) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, ir.GetCFlag()); + const auto result = ir.SubWithCarry(ir.GetRegister(n), shifted.result, ir.Imm1(1)); + if (d == Reg::PC) { + if (S) { + // This is UNPREDICTABLE when in user-mode. 
+ return UnpredictableInstruction(); + } + + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; + } + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + + return true; +} + +bool TranslatorVisitor::arm_SUB_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, ShiftType shift, Reg m) { + if (n == Reg::PC || m == Reg::PC || s == Reg::PC || d == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s)); + const auto carry_in = ir.GetCFlag(); + const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in); + const auto result = ir.SubWithCarry(ir.GetRegister(n), shifted.result, ir.Imm1(1)); + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + + return true; +} + +// TEQ<c> <Rn>, #<const> +bool TranslatorVisitor::arm_TEQ_imm(Cond cond, Reg n, int rotate, Imm<8> imm8) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto imm_carry = ArmExpandImm_C(rotate, imm8, ir.GetCFlag()); + const auto result = ir.Eor(ir.GetRegister(n), ir.Imm32(imm_carry.imm32)); + + ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry); + return true; +} + +// TEQ<c> <Rn>, <Rm>{, <shift>} +bool TranslatorVisitor::arm_TEQ_reg(Cond cond, Reg n, Imm<5> imm5, ShiftType shift, Reg m) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto carry_in = ir.GetCFlag(); + const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, carry_in); + const auto result = ir.Eor(ir.GetRegister(n), shifted.result); + + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + return true; +} + +// TEQ<c> <Rn>, <Rm>, <type> <Rs> +bool TranslatorVisitor::arm_TEQ_rsr(Cond cond, Reg n, Reg s, ShiftType shift, Reg m) { + if (n == Reg::PC || m == Reg::PC || s == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s)); + const auto carry_in = ir.GetCFlag(); + const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in); + const auto result = ir.Eor(ir.GetRegister(n), shifted.result); + + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + return true; +} + +// TST<c> <Rn>, #<const> +bool TranslatorVisitor::arm_TST_imm(Cond cond, Reg n, int rotate, Imm<8> imm8) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto imm_carry = ArmExpandImm_C(rotate, imm8, ir.GetCFlag()); + const auto result = ir.And(ir.GetRegister(n), ir.Imm32(imm_carry.imm32)); + + ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry); + return true; +} + +// TST<c> <Rn>, <Rm>{, <shift>} +bool TranslatorVisitor::arm_TST_reg(Cond cond, Reg n, Imm<5> imm5, ShiftType shift, Reg m) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto carry_in = ir.GetCFlag(); + const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, carry_in); + const auto result = ir.And(ir.GetRegister(n), shifted.result); + + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + return true; +} + +// TST<c> <Rn>, <Rm>, <type> <Rs> +bool TranslatorVisitor::arm_TST_rsr(Cond cond, Reg n, Reg s, ShiftType shift, Reg m) { + if (n == Reg::PC || m == Reg::PC || s == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s)); + const auto carry_in = ir.GetCFlag(); + const auto shifted = 
EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in); + const auto result = ir.And(ir.GetRegister(n), shifted.result); + + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/divide.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/divide.cpp new file mode 100644 index 0000000000..f13a257d21 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/divide.cpp @@ -0,0 +1,40 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2019 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { +namespace { +using DivideFunction = IR::U32U64 (IREmitter::*)(const IR::U32U64&, const IR::U32U64&); + +bool DivideOperation(TranslatorVisitor& v, Cond cond, Reg d, Reg m, Reg n, DivideFunction fn) { + if (d == Reg::PC || m == Reg::PC || n == Reg::PC) { + return v.UnpredictableInstruction(); + } + + if (!v.ArmConditionPassed(cond)) { + return true; + } + + const IR::U32 operand1 = v.ir.GetRegister(n); + const IR::U32 operand2 = v.ir.GetRegister(m); + const IR::U32 result = (v.ir.*fn)(operand1, operand2); + + v.ir.SetRegister(d, result); + return true; +} +} // Anonymous namespace + +// SDIV<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_SDIV(Cond cond, Reg d, Reg m, Reg n) { + return DivideOperation(*this, cond, d, m, n, &IREmitter::SignedDiv); +} + +// UDIV<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_UDIV(Cond cond, Reg d, Reg m, Reg n) { + return DivideOperation(*this, cond, d, m, n, &IREmitter::UnsignedDiv); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/extension.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/extension.cpp new file mode 100644 index 0000000000..3ee6a670f0 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/extension.cpp @@ -0,0 +1,229 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +static IR::U32 Rotate(A32::IREmitter& ir, Reg m, SignExtendRotation rotate) { + const u8 rotate_by = static_cast<u8>(static_cast<size_t>(rotate) * 8); + return ir.RotateRight(ir.GetRegister(m), ir.Imm8(rotate_by), ir.Imm1(0)).result; +} + +// SXTAB<c> <Rd>, <Rn>, <Rm>{, <rotation>} +bool TranslatorVisitor::arm_SXTAB(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto rotated = Rotate(ir, m, rotate); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.Add(reg_n, ir.SignExtendByteToWord(ir.LeastSignificantByte(rotated))); + + ir.SetRegister(d, result); + return true; +} + +// SXTAB16<c> <Rd>, <Rn>, <Rm>{, <rotation>} +bool TranslatorVisitor::arm_SXTAB16(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto rotated = Rotate(ir, m, rotate); + const auto low_byte = ir.And(rotated, ir.Imm32(0x00FF00FF)); + const auto sign_bit = ir.And(rotated, ir.Imm32(0x00800080)); + const auto addend = ir.Or(low_byte, ir.Mul(sign_bit, ir.Imm32(0x1FE))); + const auto result = ir.PackedAddU16(addend, ir.GetRegister(n)).result; + + ir.SetRegister(d, result); + return true; +} + +// SXTAH<c> <Rd>, <Rn>, <Rm>{, <rotation>} +bool TranslatorVisitor::arm_SXTAH(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto rotated = Rotate(ir, m, rotate); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.Add(reg_n, ir.SignExtendHalfToWord(ir.LeastSignificantHalf(rotated))); + + ir.SetRegister(d, result); + return true; +} + +// SXTB<c> <Rd>, <Rm>{, <rotation>} +bool TranslatorVisitor::arm_SXTB(Cond cond, Reg d, SignExtendRotation rotate, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto rotated = Rotate(ir, m, rotate); + const auto result = ir.SignExtendByteToWord(ir.LeastSignificantByte(rotated)); + + ir.SetRegister(d, result); + return true; +} + +// SXTB16<c> <Rd>, <Rm>{, <rotation>} +bool TranslatorVisitor::arm_SXTB16(Cond cond, Reg d, SignExtendRotation rotate, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto rotated = Rotate(ir, m, rotate); + const auto low_byte = ir.And(rotated, ir.Imm32(0x00FF00FF)); + const auto sign_bit = ir.And(rotated, ir.Imm32(0x00800080)); + const auto result = ir.Or(low_byte, ir.Mul(sign_bit, ir.Imm32(0x1FE))); + + ir.SetRegister(d, result); + return true; +} + +// SXTH<c> <Rd>, <Rm>{, <rotation>} +bool TranslatorVisitor::arm_SXTH(Cond cond, Reg d, SignExtendRotation rotate, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto rotated = Rotate(ir, m, rotate); + const auto result = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(rotated)); + + ir.SetRegister(d, result); + return true; +} + +// UXTAB<c> <Rd>, <Rn>, 
<Rm>{, <rotation>} +bool TranslatorVisitor::arm_UXTAB(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto rotated = Rotate(ir, m, rotate); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.Add(reg_n, ir.ZeroExtendByteToWord(ir.LeastSignificantByte(rotated))); + + ir.SetRegister(d, result); + return true; +} + +// UXTAB16<c> <Rd>, <Rn>, <Rm>{, <rotation>} +bool TranslatorVisitor::arm_UXTAB16(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m) { + if (d == Reg::PC || m == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto rotated = Rotate(ir, m, rotate); + auto result = ir.And(rotated, ir.Imm32(0x00FF00FF)); + const auto reg_n = ir.GetRegister(n); + result = ir.PackedAddU16(reg_n, result).result; + + ir.SetRegister(d, result); + return true; +} + +// UXTAH<c> <Rd>, <Rn>, <Rm>{, <rotation>} +bool TranslatorVisitor::arm_UXTAH(Cond cond, Reg n, Reg d, SignExtendRotation rotate, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto rotated = Rotate(ir, m, rotate); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.Add(reg_n, ir.ZeroExtendHalfToWord(ir.LeastSignificantHalf(rotated))); + + ir.SetRegister(d, result); + return true; +} + +// UXTB<c> <Rd>, <Rm>{, <rotation>} +bool TranslatorVisitor::arm_UXTB(Cond cond, Reg d, SignExtendRotation rotate, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto rotated = Rotate(ir, m, rotate); + const auto result = ir.ZeroExtendByteToWord(ir.LeastSignificantByte(rotated)); + ir.SetRegister(d, result); + return true; +} + +// UXTB16<c> <Rd>, <Rm>{, <rotation>} +bool TranslatorVisitor::arm_UXTB16(Cond cond, Reg d, SignExtendRotation rotate, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto rotated = Rotate(ir, m, rotate); + const auto result = ir.And(rotated, ir.Imm32(0x00FF00FF)); + + ir.SetRegister(d, result); + return true; +} + +// UXTH<c> <Rd>, <Rm>{, <rotation>} +bool TranslatorVisitor::arm_UXTH(Cond cond, Reg d, SignExtendRotation rotate, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto rotated = Rotate(ir, m, rotate); + const auto result = ir.ZeroExtendHalfToWord(ir.LeastSignificantHalf(rotated)); + + ir.SetRegister(d, result); + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/hint.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/hint.cpp new file mode 100644 index 0000000000..0ec6aeff9a --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/hint.cpp @@ -0,0 +1,69 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2019 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" +#include "dynarmic/interface/A32/config.h" + +namespace Dynarmic::A32 { + +bool TranslatorVisitor::arm_PLD_imm(bool /*add*/, bool R, Reg /*n*/, Imm<12> /*imm12*/) { + if (!options.hook_hint_instructions) { + return true; + } + + const auto exception = R ? Exception::PreloadData : Exception::PreloadDataWithIntentToWrite; + return RaiseException(exception); +} + +bool TranslatorVisitor::arm_PLD_reg(bool /*add*/, bool R, Reg /*n*/, Imm<5> /*imm5*/, ShiftType /*shift*/, Reg /*m*/) { + if (!options.hook_hint_instructions) { + return true; + } + + const auto exception = R ? Exception::PreloadData : Exception::PreloadDataWithIntentToWrite; + return RaiseException(exception); +} + +bool TranslatorVisitor::arm_SEV() { + if (!options.hook_hint_instructions) { + return true; + } + + return RaiseException(Exception::SendEvent); +} + +bool TranslatorVisitor::arm_SEVL() { + if (!options.hook_hint_instructions) { + return true; + } + + return RaiseException(Exception::SendEventLocal); +} + +bool TranslatorVisitor::arm_WFE() { + if (!options.hook_hint_instructions) { + return true; + } + + return RaiseException(Exception::WaitForEvent); +} + +bool TranslatorVisitor::arm_WFI() { + if (!options.hook_hint_instructions) { + return true; + } + + return RaiseException(Exception::WaitForInterrupt); +} + +bool TranslatorVisitor::arm_YIELD() { + if (!options.hook_hint_instructions) { + return true; + } + + return RaiseException(Exception::Yield); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/load_store.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/load_store.cpp new file mode 100644 index 0000000000..7ef8b7e890 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/load_store.cpp @@ -0,0 +1,957 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/bit/bit_count.hpp> +#include <mcl/bit/bit_field.hpp> + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +bool TranslatorVisitor::arm_LDRBT() { + // System instructions unimplemented + return UndefinedInstruction(); +} + +bool TranslatorVisitor::arm_LDRHT() { + // System instructions unimplemented + return UndefinedInstruction(); +} + +bool TranslatorVisitor::arm_LDRSBT() { + // System instructions unimplemented + return UndefinedInstruction(); +} + +bool TranslatorVisitor::arm_LDRSHT() { + // System instructions unimplemented + return UndefinedInstruction(); +} + +bool TranslatorVisitor::arm_LDRT() { + // System instructions unimplemented + return UndefinedInstruction(); +} + +bool TranslatorVisitor::arm_STRBT() { + // System instructions unimplemented + return UndefinedInstruction(); +} + +bool TranslatorVisitor::arm_STRHT() { + // System instructions unimplemented + return UndefinedInstruction(); +} + +bool TranslatorVisitor::arm_STRT() { + // System instructions unimplemented + return UndefinedInstruction(); +} + +static IR::U32 GetAddress(A32::IREmitter& ir, bool P, bool U, bool W, Reg n, IR::U32 offset) { + const bool index = P; + const bool add = U; + const bool wback = !P || W; + + const IR::U32 offset_addr = add ? ir.Add(ir.GetRegister(n), offset) : ir.Sub(ir.GetRegister(n), offset); + const IR::U32 address = index ? 
offset_addr : ir.GetRegister(n); + + if (wback) { + ir.SetRegister(n, offset_addr); + } + + return address; +} + +// LDR <Rt>, [PC, #+/-<imm>] +bool TranslatorVisitor::arm_LDR_lit(Cond cond, bool U, Reg t, Imm<12> imm12) { + if (!ArmConditionPassed(cond)) { + return true; + } + + const bool add = U; + const u32 base = ir.AlignPC(4); + const u32 address = add ? (base + imm12.ZeroExtend()) : (base - imm12.ZeroExtend()); + const auto data = ir.ReadMemory32(ir.Imm32(address), IR::AccType::NORMAL); + + if (t == Reg::PC) { + ir.LoadWritePC(data); + ir.SetTerm(IR::Term::FastDispatchHint{}); + return false; + } + + ir.SetRegister(t, data); + return true; +} + +// LDR <Rt>, [<Rn>, #+/-<imm>]{!} +// LDR <Rt>, [<Rn>], #+/-<imm> +bool TranslatorVisitor::arm_LDR_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<12> imm12) { + if (n == Reg::PC) { + return UnpredictableInstruction(); + } + + ASSERT_MSG(!(!P && W), "T form of instruction unimplemented"); + if ((!P || W) && n == t) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const u32 imm32 = imm12.ZeroExtend(); + const auto offset = ir.Imm32(imm32); + const auto address = GetAddress(ir, P, U, W, n, offset); + const auto data = ir.ReadMemory32(address, IR::AccType::NORMAL); + + if (t == Reg::PC) { + ir.LoadWritePC(data); + + if (!P && W && n == Reg::R13) { + ir.SetTerm(IR::Term::PopRSBHint{}); + } else { + ir.SetTerm(IR::Term::FastDispatchHint{}); + } + + return false; + } + + ir.SetRegister(t, data); + return true; +} + +// LDR <Rt>, [<Rn>, #+/-<Rm>]{!} +// LDR <Rt>, [<Rn>], #+/-<Rm> +bool TranslatorVisitor::arm_LDR_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<5> imm5, ShiftType shift, Reg m) { + ASSERT_MSG(!(!P && W), "T form of instruction unimplemented"); + if (m == Reg::PC) { + return UnpredictableInstruction(); + } + + if ((!P || W) && (n == Reg::PC || n == t)) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto offset = EmitImmShift(ir.GetRegister(m), shift, imm5, ir.GetCFlag()).result; + const auto address = GetAddress(ir, P, U, W, n, offset); + const auto data = ir.ReadMemory32(address, IR::AccType::NORMAL); + + if (t == Reg::PC) { + ir.LoadWritePC(data); + ir.SetTerm(IR::Term::FastDispatchHint{}); + return false; + } + + ir.SetRegister(t, data); + return true; +} + +// LDRB <Rt>, [PC, #+/-<imm>] +bool TranslatorVisitor::arm_LDRB_lit(Cond cond, bool U, Reg t, Imm<12> imm12) { + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const u32 imm32 = imm12.ZeroExtend(); + const bool add = U; + const u32 base = ir.AlignPC(4); + const u32 address = add ? 
(base + imm32) : (base - imm32); + const auto data = ir.ZeroExtendByteToWord(ir.ReadMemory8(ir.Imm32(address), IR::AccType::NORMAL)); + + ir.SetRegister(t, data); + return true; +} + +// LDRB <Rt>, [<Rn>, #+/-<imm>]{!} +// LDRB <Rt>, [<Rn>], #+/-<imm> +bool TranslatorVisitor::arm_LDRB_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<12> imm12) { + if (n == Reg::PC) { + return UnpredictableInstruction(); + } + + ASSERT_MSG(!(!P && W), "T form of instruction unimplemented"); + if ((!P || W) && n == t) { + return UnpredictableInstruction(); + } + + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const u32 imm32 = imm12.ZeroExtend(); + const auto offset = ir.Imm32(imm32); + const auto address = GetAddress(ir, P, U, W, n, offset); + const auto data = ir.ZeroExtendByteToWord(ir.ReadMemory8(address, IR::AccType::NORMAL)); + + ir.SetRegister(t, data); + return true; +} + +// LDRB <Rt>, [<Rn>, #+/-<Rm>]{!} +// LDRB <Rt>, [<Rn>], #+/-<Rm> +bool TranslatorVisitor::arm_LDRB_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<5> imm5, ShiftType shift, Reg m) { + ASSERT_MSG(!(!P && W), "T form of instruction unimplemented"); + if (t == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if ((!P || W) && (n == Reg::PC || n == t)) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto offset = EmitImmShift(ir.GetRegister(m), shift, imm5, ir.GetCFlag()).result; + const auto address = GetAddress(ir, P, U, W, n, offset); + const auto data = ir.ZeroExtendByteToWord(ir.ReadMemory8(address, IR::AccType::NORMAL)); + + ir.SetRegister(t, data); + return true; +} + +// LDRD <Rt>, <Rt2>, [PC, #+/-<imm>] +bool TranslatorVisitor::arm_LDRD_lit(Cond cond, bool U, Reg t, Imm<4> imm8a, Imm<4> imm8b) { + if (RegNumber(t) % 2 == 1) { + return UnpredictableInstruction(); + } + + if (t + 1 == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const Reg t2 = t + 1; + const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend(); + const bool add = U; + + const u32 base = ir.AlignPC(4); + const u32 address = add ? (base + imm32) : (base - imm32); + + // NOTE: If alignment is exactly off by 4, each word is an atomic access. 
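+    // The 64-bit load below is split into Rt/Rt2 according to the current data endianness (E flag).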
+ const IR::U64 data = ir.ReadMemory64(ir.Imm32(address), IR::AccType::ATOMIC); + + if (ir.current_location.EFlag()) { + ir.SetRegister(t, ir.MostSignificantWord(data).result); + ir.SetRegister(t2, ir.LeastSignificantWord(data)); + } else { + ir.SetRegister(t, ir.LeastSignificantWord(data)); + ir.SetRegister(t2, ir.MostSignificantWord(data).result); + } + return true; +} + +// LDRD <Rt>, [<Rn>, #+/-<imm>]{!} +// LDRD <Rt>, [<Rn>], #+/-<imm> +bool TranslatorVisitor::arm_LDRD_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b) { + if (n == Reg::PC) { + return UnpredictableInstruction(); + } + + if (RegNumber(t) % 2 == 1) { + return UnpredictableInstruction(); + } + + if (!P && W) { + return UnpredictableInstruction(); + } + + if ((!P || W) && (n == t || n == t + 1)) { + return UnpredictableInstruction(); + } + + if (t + 1 == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const Reg t2 = t + 1; + const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend(); + + const auto offset = ir.Imm32(imm32); + const auto address = GetAddress(ir, P, U, W, n, offset); + + // NOTE: If alignment is exactly off by 4, each word is an atomic access. + const IR::U64 data = ir.ReadMemory64(address, IR::AccType::ATOMIC); + + if (ir.current_location.EFlag()) { + ir.SetRegister(t, ir.MostSignificantWord(data).result); + ir.SetRegister(t2, ir.LeastSignificantWord(data)); + } else { + ir.SetRegister(t, ir.LeastSignificantWord(data)); + ir.SetRegister(t2, ir.MostSignificantWord(data).result); + } + return true; +} + +// LDRD <Rt>, [<Rn>, #+/-<Rm>]{!} +// LDRD <Rt>, [<Rn>], #+/-<Rm> +bool TranslatorVisitor::arm_LDRD_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m) { + if (RegNumber(t) % 2 == 1) { + return UnpredictableInstruction(); + } + + if (!P && W) { + return UnpredictableInstruction(); + } + + if (t + 1 == Reg::PC || m == Reg::PC || m == t || m == t + 1) { + return UnpredictableInstruction(); + } + + if ((!P || W) && (n == Reg::PC || n == t || n == t + 1)) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const Reg t2 = t + 1; + const auto offset = ir.GetRegister(m); + const auto address = GetAddress(ir, P, U, W, n, offset); + + // NOTE: If alignment is exactly off by 4, each word is an atomic access. + const IR::U64 data = ir.ReadMemory64(address, IR::AccType::ATOMIC); + + if (ir.current_location.EFlag()) { + ir.SetRegister(t, ir.MostSignificantWord(data).result); + ir.SetRegister(t2, ir.LeastSignificantWord(data)); + } else { + ir.SetRegister(t, ir.LeastSignificantWord(data)); + ir.SetRegister(t2, ir.MostSignificantWord(data).result); + } + return true; +} + +// LDRH <Rt>, [PC, #-/+<imm>] +bool TranslatorVisitor::arm_LDRH_lit(Cond cond, bool P, bool U, bool W, Reg t, Imm<4> imm8a, Imm<4> imm8b) { + ASSERT_MSG(!(!P && W), "T form of instruction unimplemented"); + if (P == W) { + return UnpredictableInstruction(); + } + + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend(); + const bool add = U; + const u32 base = ir.AlignPC(4); + const u32 address = add ? 
(base + imm32) : (base - imm32); + const auto data = ir.ZeroExtendHalfToWord(ir.ReadMemory16(ir.Imm32(address), IR::AccType::NORMAL)); + + ir.SetRegister(t, data); + return true; +} + +// LDRH <Rt>, [<Rn>, #+/-<imm>]{!} +// LDRH <Rt>, [<Rn>], #+/-<imm> +bool TranslatorVisitor::arm_LDRH_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b) { + if (n == Reg::PC) { + return UnpredictableInstruction(); + } + + ASSERT_MSG(!(!P && W), "T form of instruction unimplemented"); + if ((!P || W) && n == t) { + return UnpredictableInstruction(); + } + + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend(); + const auto offset = ir.Imm32(imm32); + const auto address = GetAddress(ir, P, U, W, n, offset); + const auto data = ir.ZeroExtendHalfToWord(ir.ReadMemory16(address, IR::AccType::NORMAL)); + + ir.SetRegister(t, data); + return true; +} + +// LDRH <Rt>, [<Rn>, #+/-<Rm>]{!} +// LDRH <Rt>, [<Rn>], #+/-<Rm> +bool TranslatorVisitor::arm_LDRH_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m) { + ASSERT_MSG(!(!P && W), "T form of instruction unimplemented"); + if (t == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if ((!P || W) && (n == Reg::PC || n == t)) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto offset = ir.GetRegister(m); + const auto address = GetAddress(ir, P, U, W, n, offset); + const auto data = ir.ZeroExtendHalfToWord(ir.ReadMemory16(address, IR::AccType::NORMAL)); + + ir.SetRegister(t, data); + return true; +} + +// LDRSB <Rt>, [PC, #+/-<imm>] +bool TranslatorVisitor::arm_LDRSB_lit(Cond cond, bool U, Reg t, Imm<4> imm8a, Imm<4> imm8b) { + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend(); + const bool add = U; + + const u32 base = ir.AlignPC(4); + const u32 address = add ? 
(base + imm32) : (base - imm32); + const auto data = ir.SignExtendByteToWord(ir.ReadMemory8(ir.Imm32(address), IR::AccType::NORMAL)); + + ir.SetRegister(t, data); + return true; +} + +// LDRSB <Rt>, [<Rn>, #+/-<imm>]{!} +// LDRSB <Rt>, [<Rn>], #+/-<imm> +bool TranslatorVisitor::arm_LDRSB_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b) { + if (n == Reg::PC) { + return UnpredictableInstruction(); + } + + ASSERT_MSG(!(!P && W), "T form of instruction unimplemented"); + if ((!P || W) && n == t) { + return UnpredictableInstruction(); + } + + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend(); + const auto offset = ir.Imm32(imm32); + const auto address = GetAddress(ir, P, U, W, n, offset); + const auto data = ir.SignExtendByteToWord(ir.ReadMemory8(address, IR::AccType::NORMAL)); + + ir.SetRegister(t, data); + return true; +} + +// LDRSB <Rt>, [<Rn>, #+/-<Rm>]{!} +// LDRSB <Rt>, [<Rn>], #+/-<Rm> +bool TranslatorVisitor::arm_LDRSB_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m) { + ASSERT_MSG(!(!P && W), "T form of instruction unimplemented"); + if (t == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if ((!P || W) && (n == Reg::PC || n == t)) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto offset = ir.GetRegister(m); + const auto address = GetAddress(ir, P, U, W, n, offset); + const auto data = ir.SignExtendByteToWord(ir.ReadMemory8(address, IR::AccType::NORMAL)); + + ir.SetRegister(t, data); + return true; +} + +// LDRSH <Rt>, [PC, #-/+<imm>] +bool TranslatorVisitor::arm_LDRSH_lit(Cond cond, bool U, Reg t, Imm<4> imm8a, Imm<4> imm8b) { + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend(); + const bool add = U; + const u32 base = ir.AlignPC(4); + const u32 address = add ? 
(base + imm32) : (base - imm32); + const auto data = ir.SignExtendHalfToWord(ir.ReadMemory16(ir.Imm32(address), IR::AccType::NORMAL)); + + ir.SetRegister(t, data); + return true; +} + +// LDRSH <Rt>, [<Rn>, #+/-<imm>]{!} +// LDRSH <Rt>, [<Rn>], #+/-<imm> +bool TranslatorVisitor::arm_LDRSH_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b) { + if (n == Reg::PC) { + return UnpredictableInstruction(); + } + + ASSERT_MSG(!(!P && W), "T form of instruction unimplemented"); + if ((!P || W) && n == t) { + return UnpredictableInstruction(); + } + + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend(); + const auto offset = ir.Imm32(imm32); + const auto address = GetAddress(ir, P, U, W, n, offset); + const auto data = ir.SignExtendHalfToWord(ir.ReadMemory16(address, IR::AccType::NORMAL)); + + ir.SetRegister(t, data); + return true; +} + +// LDRSH <Rt>, [<Rn>, #+/-<Rm>]{!} +// LDRSH <Rt>, [<Rn>], #+/-<Rm> +bool TranslatorVisitor::arm_LDRSH_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m) { + ASSERT_MSG(!(!P && W), "T form of instruction unimplemented"); + if (t == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if ((!P || W) && (n == Reg::PC || n == t)) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto offset = ir.GetRegister(m); + const auto address = GetAddress(ir, P, U, W, n, offset); + const auto data = ir.SignExtendHalfToWord(ir.ReadMemory16(address, IR::AccType::NORMAL)); + + ir.SetRegister(t, data); + return true; +} + +// STR <Rt>, [<Rn>, #+/-<imm>]{!} +// STR <Rt>, [<Rn>], #+/-<imm> +bool TranslatorVisitor::arm_STR_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<12> imm12) { + if ((!P || W) && (n == Reg::PC || n == t)) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto offset = ir.Imm32(imm12.ZeroExtend()); + const auto address = GetAddress(ir, P, U, W, n, offset); + ir.WriteMemory32(address, ir.GetRegister(t), IR::AccType::NORMAL); + return true; +} + +// STR <Rt>, [<Rn>, #+/-<Rm>]{!} +// STR <Rt>, [<Rn>], #+/-<Rm> +bool TranslatorVisitor::arm_STR_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<5> imm5, ShiftType shift, Reg m) { + if (m == Reg::PC) { + return UnpredictableInstruction(); + } + + if ((!P || W) && (n == Reg::PC || n == t)) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto offset = EmitImmShift(ir.GetRegister(m), shift, imm5, ir.GetCFlag()).result; + const auto address = GetAddress(ir, P, U, W, n, offset); + ir.WriteMemory32(address, ir.GetRegister(t), IR::AccType::NORMAL); + return true; +} + +// STRB <Rt>, [<Rn>, #+/-<imm>]{!} +// STRB <Rt>, [<Rn>], #+/-<imm> +bool TranslatorVisitor::arm_STRB_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<12> imm12) { + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + + if ((!P || W) && (n == Reg::PC || n == t)) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto offset = ir.Imm32(imm12.ZeroExtend()); + const auto address = GetAddress(ir, P, U, W, n, offset); + ir.WriteMemory8(address, ir.LeastSignificantByte(ir.GetRegister(t)), IR::AccType::NORMAL); + return true; +} + +// STRB <Rt>, [<Rn>, #+/-<Rm>]{!} +// STRB <Rt>, [<Rn>], #+/-<Rm> +bool 
TranslatorVisitor::arm_STRB_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<5> imm5, ShiftType shift, Reg m) { + if (t == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if ((!P || W) && (n == Reg::PC || n == t)) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto offset = EmitImmShift(ir.GetRegister(m), shift, imm5, ir.GetCFlag()).result; + const auto address = GetAddress(ir, P, U, W, n, offset); + ir.WriteMemory8(address, ir.LeastSignificantByte(ir.GetRegister(t)), IR::AccType::NORMAL); + return true; +} + +// STRD <Rt>, [<Rn>, #+/-<imm>]{!} +// STRD <Rt>, [<Rn>], #+/-<imm> +bool TranslatorVisitor::arm_STRD_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b) { + if (size_t(t) % 2 != 0) { + return UnpredictableInstruction(); + } + + if (!P && W) { + return UnpredictableInstruction(); + } + + const Reg t2 = t + 1; + if ((!P || W) && (n == Reg::PC || n == t || n == t2)) { + return UnpredictableInstruction(); + } + + if (t2 == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend(); + const auto offset = ir.Imm32(imm32); + const auto address = GetAddress(ir, P, U, W, n, offset); + const auto value_a = ir.GetRegister(t); + const auto value_b = ir.GetRegister(t2); + + const IR::U64 data = ir.current_location.EFlag() ? ir.Pack2x32To1x64(value_b, value_a) + : ir.Pack2x32To1x64(value_a, value_b); + + // NOTE: If alignment is exactly off by 4, each word is an atomic access. + ir.WriteMemory64(address, data, IR::AccType::ATOMIC); + return true; +} + +// STRD <Rt>, [<Rn>, #+/-<Rm>]{!} +// STRD <Rt>, [<Rn>], #+/-<Rm> +bool TranslatorVisitor::arm_STRD_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m) { + if (size_t(t) % 2 != 0) { + return UnpredictableInstruction(); + } + + if (!P && W) { + return UnpredictableInstruction(); + } + + const Reg t2 = t + 1; + if (t2 == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if ((!P || W) && (n == Reg::PC || n == t || n == t2)) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto offset = ir.GetRegister(m); + const auto address = GetAddress(ir, P, U, W, n, offset); + const auto value_a = ir.GetRegister(t); + const auto value_b = ir.GetRegister(t2); + + const IR::U64 data = ir.current_location.EFlag() ? ir.Pack2x32To1x64(value_b, value_a) + : ir.Pack2x32To1x64(value_a, value_b); + + // NOTE: If alignment is exactly off by 4, each word is an atomic access. 
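// The word order inside the packed doubleword follows CPSR.E (the EFlag of the
// location descriptor): with big-endian data the two registers are packed in
// swapped order so that each one still lands at its architectural word address.
// A rough sketch of the intended layout, assuming a hypothetical write32() helper:
//
//     write32(address + 0, rt);    // first register of the even/odd pair
//     write32(address + 4, rt2);   // its odd-numbered partner
//
// Only the byte order within each 32-bit word differs between the two settings.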
+ ir.WriteMemory64(address, data, IR::AccType::ATOMIC); + return true; +} + +// STRH <Rt>, [<Rn>, #+/-<imm>]{!} +// STRH <Rt>, [<Rn>], #+/-<imm> +bool TranslatorVisitor::arm_STRH_imm(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Imm<4> imm8a, Imm<4> imm8b) { + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + + if ((!P || W) && (n == Reg::PC || n == t)) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const u32 imm32 = concatenate(imm8a, imm8b).ZeroExtend(); + const auto offset = ir.Imm32(imm32); + const auto address = GetAddress(ir, P, U, W, n, offset); + + ir.WriteMemory16(address, ir.LeastSignificantHalf(ir.GetRegister(t)), IR::AccType::NORMAL); + return true; +} + +// STRH <Rt>, [<Rn>, #+/-<Rm>]{!} +// STRH <Rt>, [<Rn>], #+/-<Rm> +bool TranslatorVisitor::arm_STRH_reg(Cond cond, bool P, bool U, bool W, Reg n, Reg t, Reg m) { + if (t == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if ((!P || W) && (n == Reg::PC || n == t)) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto offset = ir.GetRegister(m); + const auto address = GetAddress(ir, P, U, W, n, offset); + + ir.WriteMemory16(address, ir.LeastSignificantHalf(ir.GetRegister(t)), IR::AccType::NORMAL); + return true; +} + +static bool LDMHelper(A32::IREmitter& ir, bool W, Reg n, RegList list, IR::U32 start_address, IR::U32 writeback_address) { + auto address = start_address; + for (size_t i = 0; i <= 14; i++) { + if (mcl::bit::get_bit(i, list)) { + ir.SetRegister(static_cast<Reg>(i), ir.ReadMemory32(address, IR::AccType::ATOMIC)); + address = ir.Add(address, ir.Imm32(4)); + } + } + if (W && !mcl::bit::get_bit(RegNumber(n), list)) { + ir.SetRegister(n, writeback_address); + } + if (mcl::bit::get_bit<15>(list)) { + ir.LoadWritePC(ir.ReadMemory32(address, IR::AccType::ATOMIC)); + if (n == Reg::R13) + ir.SetTerm(IR::Term::PopRSBHint{}); + else + ir.SetTerm(IR::Term::FastDispatchHint{}); + return false; + } + return true; +} + +// LDM <Rn>{!}, <reg_list> +bool TranslatorVisitor::arm_LDM(Cond cond, bool W, Reg n, RegList list) { + if (n == Reg::PC || mcl::bit::count_ones(list) < 1) { + return UnpredictableInstruction(); + } + if (W && mcl::bit::get_bit(static_cast<size_t>(n), list)) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto start_address = ir.GetRegister(n); + const auto writeback_address = ir.Add(start_address, ir.Imm32(u32(mcl::bit::count_ones(list) * 4))); + return LDMHelper(ir, W, n, list, start_address, writeback_address); +} + +// LDMDA <Rn>{!}, <reg_list> +bool TranslatorVisitor::arm_LDMDA(Cond cond, bool W, Reg n, RegList list) { + if (n == Reg::PC || mcl::bit::count_ones(list) < 1) { + return UnpredictableInstruction(); + } + if (W && mcl::bit::get_bit(static_cast<size_t>(n), list)) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto start_address = ir.Sub(ir.GetRegister(n), ir.Imm32(u32(4 * mcl::bit::count_ones(list) - 4))); + const auto writeback_address = ir.Sub(start_address, ir.Imm32(4)); + return LDMHelper(ir, W, n, list, start_address, writeback_address); +} + +// LDMDB <Rn>{!}, <reg_list> +bool TranslatorVisitor::arm_LDMDB(Cond cond, bool W, Reg n, RegList list) { + if (n == Reg::PC || mcl::bit::count_ones(list) < 1) { + return UnpredictableInstruction(); + } + if (W && mcl::bit::get_bit(static_cast<size_t>(n), list)) { + return 
UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto start_address = ir.Sub(ir.GetRegister(n), ir.Imm32(u32(4 * mcl::bit::count_ones(list)))); + const auto writeback_address = start_address; + return LDMHelper(ir, W, n, list, start_address, writeback_address); +} + +// LDMIB <Rn>{!}, <reg_list> +bool TranslatorVisitor::arm_LDMIB(Cond cond, bool W, Reg n, RegList list) { + if (n == Reg::PC || mcl::bit::count_ones(list) < 1) { + return UnpredictableInstruction(); + } + if (W && mcl::bit::get_bit(static_cast<size_t>(n), list)) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto start_address = ir.Add(ir.GetRegister(n), ir.Imm32(4)); + const auto writeback_address = ir.Add(ir.GetRegister(n), ir.Imm32(u32(4 * mcl::bit::count_ones(list)))); + return LDMHelper(ir, W, n, list, start_address, writeback_address); +} + +bool TranslatorVisitor::arm_LDM_usr() { + return InterpretThisInstruction(); +} + +bool TranslatorVisitor::arm_LDM_eret() { + return InterpretThisInstruction(); +} + +static bool STMHelper(A32::IREmitter& ir, bool W, Reg n, RegList list, IR::U32 start_address, IR::U32 writeback_address) { + auto address = start_address; + for (size_t i = 0; i <= 14; i++) { + if (mcl::bit::get_bit(i, list)) { + ir.WriteMemory32(address, ir.GetRegister(static_cast<Reg>(i)), IR::AccType::ATOMIC); + address = ir.Add(address, ir.Imm32(4)); + } + } + if (W) { + ir.SetRegister(n, writeback_address); + } + if (mcl::bit::get_bit<15>(list)) { + ir.WriteMemory32(address, ir.Imm32(ir.PC()), IR::AccType::ATOMIC); + } + return true; +} + +// STM <Rn>{!}, <reg_list> +bool TranslatorVisitor::arm_STM(Cond cond, bool W, Reg n, RegList list) { + if (n == Reg::PC || mcl::bit::count_ones(list) < 1) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto start_address = ir.GetRegister(n); + const auto writeback_address = ir.Add(start_address, ir.Imm32(u32(mcl::bit::count_ones(list) * 4))); + return STMHelper(ir, W, n, list, start_address, writeback_address); +} + +// STMDA <Rn>{!}, <reg_list> +bool TranslatorVisitor::arm_STMDA(Cond cond, bool W, Reg n, RegList list) { + if (n == Reg::PC || mcl::bit::count_ones(list) < 1) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto start_address = ir.Sub(ir.GetRegister(n), ir.Imm32(u32(4 * mcl::bit::count_ones(list) - 4))); + const auto writeback_address = ir.Sub(start_address, ir.Imm32(4)); + return STMHelper(ir, W, n, list, start_address, writeback_address); +} + +// STMDB <Rn>{!}, <reg_list> +bool TranslatorVisitor::arm_STMDB(Cond cond, bool W, Reg n, RegList list) { + if (n == Reg::PC || mcl::bit::count_ones(list) < 1) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto start_address = ir.Sub(ir.GetRegister(n), ir.Imm32(u32(4 * mcl::bit::count_ones(list)))); + const auto writeback_address = start_address; + return STMHelper(ir, W, n, list, start_address, writeback_address); +} + +// STMIB <Rn>{!}, <reg_list> +bool TranslatorVisitor::arm_STMIB(Cond cond, bool W, Reg n, RegList list) { + if (n == Reg::PC || mcl::bit::count_ones(list) < 1) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto start_address = ir.Add(ir.GetRegister(n), ir.Imm32(4)); + const auto writeback_address = ir.Add(ir.GetRegister(n), ir.Imm32(u32(4 * 
mcl::bit::count_ones(list)))); + return STMHelper(ir, W, n, list, start_address, writeback_address); +} + +bool TranslatorVisitor::arm_STM_usr() { + return InterpretThisInstruction(); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/misc.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/misc.cpp new file mode 100644 index 0000000000..ef54b66827 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/misc.cpp @@ -0,0 +1,179 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/bitsizeof.hpp> + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +// BFC<c> <Rd>, #<lsb>, #<width> +bool TranslatorVisitor::arm_BFC(Cond cond, Imm<5> msb, Reg d, Imm<5> lsb) { + if (d == Reg::PC) { + return UnpredictableInstruction(); + } + if (msb < lsb) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const u32 lsb_value = lsb.ZeroExtend(); + const u32 msb_value = msb.ZeroExtend(); + const u32 mask = ~(mcl::bit::ones<u32>(msb_value - lsb_value + 1) << lsb_value); + const IR::U32 operand = ir.GetRegister(d); + const IR::U32 result = ir.And(operand, ir.Imm32(mask)); + + ir.SetRegister(d, result); + return true; +} + +// BFI<c> <Rd>, <Rn>, #<lsb>, #<width> +bool TranslatorVisitor::arm_BFI(Cond cond, Imm<5> msb, Reg d, Imm<5> lsb, Reg n) { + if (d == Reg::PC) { + return UnpredictableInstruction(); + } + if (msb < lsb) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const u32 lsb_value = lsb.ZeroExtend(); + const u32 msb_value = msb.ZeroExtend(); + const u32 inclusion_mask = mcl::bit::ones<u32>(msb_value - lsb_value + 1) << lsb_value; + const u32 exclusion_mask = ~inclusion_mask; + const IR::U32 operand1 = ir.And(ir.GetRegister(d), ir.Imm32(exclusion_mask)); + const IR::U32 operand2 = ir.And(ir.LogicalShiftLeft(ir.GetRegister(n), ir.Imm8(u8(lsb_value))), ir.Imm32(inclusion_mask)); + const IR::U32 result = ir.Or(operand1, operand2); + + ir.SetRegister(d, result); + return true; +} + +// CLZ<c> <Rd>, <Rm> +bool TranslatorVisitor::arm_CLZ(Cond cond, Reg d, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + ir.SetRegister(d, ir.CountLeadingZeros(ir.GetRegister(m))); + return true; +} + +// MOVT<c> <Rd>, #<imm16> +bool TranslatorVisitor::arm_MOVT(Cond cond, Imm<4> imm4, Reg d, Imm<12> imm12) { + if (d == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const IR::U32 imm16 = ir.Imm32(concatenate(imm4, imm12).ZeroExtend() << 16); + const IR::U32 operand = ir.GetRegister(d); + const IR::U32 result = ir.Or(ir.And(operand, ir.Imm32(0x0000FFFFU)), imm16); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::arm_MOVW(Cond cond, Imm<4> imm4, Reg d, Imm<12> imm12) { + if (d == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const IR::U32 imm = ir.Imm32(concatenate(imm4, imm12).ZeroExtend()); + + ir.SetRegister(d, imm); + return true; +} + +// SBFX<c> <Rd>, <Rn>, #<lsb>, #<width> +bool TranslatorVisitor::arm_SBFX(Cond cond, Imm<5> widthm1, Reg d, Imm<5> lsb, Reg n) { + if (d == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + const 
u32 lsb_value = lsb.ZeroExtend(); + const u32 widthm1_value = widthm1.ZeroExtend(); + const u32 msb = lsb_value + widthm1_value; + if (msb >= mcl::bitsizeof<u32>) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + constexpr size_t max_width = mcl::bitsizeof<u32>; + const u32 width = widthm1_value + 1; + const u8 left_shift_amount = static_cast<u8>(max_width - width - lsb_value); + const u8 right_shift_amount = static_cast<u8>(max_width - width); + const IR::U32 operand = ir.GetRegister(n); + const IR::U32 tmp = ir.LogicalShiftLeft(operand, ir.Imm8(left_shift_amount)); + const IR::U32 result = ir.ArithmeticShiftRight(tmp, ir.Imm8(right_shift_amount)); + + ir.SetRegister(d, result); + return true; +} + +// SEL<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_SEL(Cond cond, Reg n, Reg d, Reg m) { + if (n == Reg::PC || d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto to = ir.GetRegister(m); + const auto from = ir.GetRegister(n); + const auto result = ir.PackedSelect(ir.GetGEFlags(), to, from); + + ir.SetRegister(d, result); + return true; +} + +// UBFX<c> <Rd>, <Rn>, #<lsb>, #<width> +bool TranslatorVisitor::arm_UBFX(Cond cond, Imm<5> widthm1, Reg d, Imm<5> lsb, Reg n) { + if (d == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + const u32 lsb_value = lsb.ZeroExtend(); + const u32 widthm1_value = widthm1.ZeroExtend(); + const u32 msb = lsb_value + widthm1_value; + if (msb >= mcl::bitsizeof<u32>) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const IR::U32 operand = ir.GetRegister(n); + const IR::U32 mask = ir.Imm32(mcl::bit::ones<u32>(widthm1_value + 1)); + const IR::U32 result = ir.And(ir.LogicalShiftRight(operand, ir.Imm8(u8(lsb_value))), mask); + + ir.SetRegister(d, result); + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/multiply.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/multiply.cpp new file mode 100644 index 0000000000..2bda56a961 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/multiply.cpp @@ -0,0 +1,604 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +// MLA{S}<c> <Rd>, <Rn>, <Rm>, <Ra> +bool TranslatorVisitor::arm_MLA(Cond cond, bool S, Reg d, Reg a, Reg m, Reg n) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.Add(ir.Mul(ir.GetRegister(n), ir.GetRegister(m)), ir.GetRegister(a)); + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZ(ir.NZFrom(result)); + } + + return true; +} + +// MLS<c> <Rd>, <Rn>, <Rm>, <Ra> +bool TranslatorVisitor::arm_MLS(Cond cond, Reg d, Reg a, Reg m, Reg n) { + if (d == Reg::PC || a == Reg::PC || m == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const IR::U32 operand1 = ir.GetRegister(n); + const IR::U32 operand2 = ir.GetRegister(m); + const IR::U32 operand3 = ir.GetRegister(a); + const IR::U32 result = ir.Sub(operand3, ir.Mul(operand1, operand2)); + + ir.SetRegister(d, result); + return true; +} + +// MUL{S}<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_MUL(Cond cond, bool S, Reg d, Reg m, Reg n) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.Mul(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZ(ir.NZFrom(result)); + } + + return true; +} + +// SMLAL{S}<c> <RdLo>, <RdHi>, <Rn>, <Rm> +bool TranslatorVisitor::arm_SMLAL(Cond cond, bool S, Reg dHi, Reg dLo, Reg m, Reg n) { + if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (dLo == dHi) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto n64 = ir.SignExtendWordToLong(ir.GetRegister(n)); + const auto m64 = ir.SignExtendWordToLong(ir.GetRegister(m)); + const auto product = ir.Mul(n64, m64); + const auto addend = ir.Pack2x32To1x64(ir.GetRegister(dLo), ir.GetRegister(dHi)); + const auto result = ir.Add(product, addend); + const auto lo = ir.LeastSignificantWord(result); + const auto hi = ir.MostSignificantWord(result).result; + + ir.SetRegister(dLo, lo); + ir.SetRegister(dHi, hi); + if (S) { + ir.SetCpsrNZ(ir.NZFrom(result)); + } + + return true; +} + +// SMULL{S}<c> <RdLo>, <RdHi>, <Rn>, <Rm> +bool TranslatorVisitor::arm_SMULL(Cond cond, bool S, Reg dHi, Reg dLo, Reg m, Reg n) { + if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (dLo == dHi) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto n64 = ir.SignExtendWordToLong(ir.GetRegister(n)); + const auto m64 = ir.SignExtendWordToLong(ir.GetRegister(m)); + const auto result = ir.Mul(n64, m64); + const auto lo = ir.LeastSignificantWord(result); + const auto hi = ir.MostSignificantWord(result).result; + + ir.SetRegister(dLo, lo); + ir.SetRegister(dHi, hi); + if (S) { + ir.SetCpsrNZ(ir.NZFrom(result)); + } + + return true; +} + +// UMAAL<c> <RdLo>, <RdHi>, <Rn>, <Rm> +bool TranslatorVisitor::arm_UMAAL(Cond cond, Reg dHi, Reg dLo, Reg m, Reg n) { + if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (dLo == dHi) { + 
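// With dLo == dHi the two halves of the 64-bit result would target the same
// register, so the architecture declares the result UNPREDICTABLE. Note that the
// accumulation performed below can never overflow 64 bits:
//     (2^32 - 1)^2 + 2*(2^32 - 1) = 2^64 - 1,
// so a plain 64-bit Add chain is sufficient and no carry-out handling is needed.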
return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto lo64 = ir.ZeroExtendWordToLong(ir.GetRegister(dLo)); + const auto hi64 = ir.ZeroExtendWordToLong(ir.GetRegister(dHi)); + const auto n64 = ir.ZeroExtendWordToLong(ir.GetRegister(n)); + const auto m64 = ir.ZeroExtendWordToLong(ir.GetRegister(m)); + const auto result = ir.Add(ir.Add(ir.Mul(n64, m64), hi64), lo64); + + ir.SetRegister(dLo, ir.LeastSignificantWord(result)); + ir.SetRegister(dHi, ir.MostSignificantWord(result).result); + return true; +} + +// UMLAL{S}<c> <RdLo>, <RdHi>, <Rn>, <Rm> +bool TranslatorVisitor::arm_UMLAL(Cond cond, bool S, Reg dHi, Reg dLo, Reg m, Reg n) { + if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (dLo == dHi) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto addend = ir.Pack2x32To1x64(ir.GetRegister(dLo), ir.GetRegister(dHi)); + const auto n64 = ir.ZeroExtendWordToLong(ir.GetRegister(n)); + const auto m64 = ir.ZeroExtendWordToLong(ir.GetRegister(m)); + const auto result = ir.Add(ir.Mul(n64, m64), addend); + const auto lo = ir.LeastSignificantWord(result); + const auto hi = ir.MostSignificantWord(result).result; + + ir.SetRegister(dLo, lo); + ir.SetRegister(dHi, hi); + if (S) { + ir.SetCpsrNZ(ir.NZFrom(result)); + } + + return true; +} + +// UMULL{S}<c> <RdLo>, <RdHi>, <Rn>, <Rm> +bool TranslatorVisitor::arm_UMULL(Cond cond, bool S, Reg dHi, Reg dLo, Reg m, Reg n) { + if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (dLo == dHi) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto n64 = ir.ZeroExtendWordToLong(ir.GetRegister(n)); + const auto m64 = ir.ZeroExtendWordToLong(ir.GetRegister(m)); + const auto result = ir.Mul(n64, m64); + const auto lo = ir.LeastSignificantWord(result); + const auto hi = ir.MostSignificantWord(result).result; + + ir.SetRegister(dLo, lo); + ir.SetRegister(dHi, hi); + if (S) { + ir.SetCpsrNZ(ir.NZFrom(result)); + } + + return true; +} + +// SMLAL<x><y><c> <RdLo>, <RdHi>, <Rn>, <Rm> +bool TranslatorVisitor::arm_SMLALxy(Cond cond, Reg dHi, Reg dLo, Reg m, bool M, bool N, Reg n) { + if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (dLo == dHi) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const IR::U32 n32 = ir.GetRegister(n); + const IR::U32 m32 = ir.GetRegister(m); + const IR::U32 n16 = N ? ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result + : ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32)); + const IR::U32 m16 = M ? 
ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result + : ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32)); + const IR::U64 product = ir.SignExtendWordToLong(ir.Mul(n16, m16)); + const auto addend = ir.Pack2x32To1x64(ir.GetRegister(dLo), ir.GetRegister(dHi)); + const auto result = ir.Add(product, addend); + + ir.SetRegister(dLo, ir.LeastSignificantWord(result)); + ir.SetRegister(dHi, ir.MostSignificantWord(result).result); + return true; +} + +// SMLA<x><y><c> <Rd>, <Rn>, <Rm>, <Ra> +bool TranslatorVisitor::arm_SMLAxy(Cond cond, Reg d, Reg a, Reg m, bool M, bool N, Reg n) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const IR::U32 n32 = ir.GetRegister(n); + const IR::U32 m32 = ir.GetRegister(m); + const IR::U32 n16 = N ? ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result + : ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32)); + const IR::U32 m16 = M ? ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result + : ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32)); + const IR::U32 product = ir.Mul(n16, m16); + const auto result = ir.AddWithCarry(product, ir.GetRegister(a), ir.Imm1(0)); + + ir.SetRegister(d, result); + ir.OrQFlag(ir.GetOverflowFrom(result)); + return true; +} + +// SMUL<x><y><c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_SMULxy(Cond cond, Reg d, Reg m, bool M, bool N, Reg n) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const IR::U32 n32 = ir.GetRegister(n); + const IR::U32 m32 = ir.GetRegister(m); + const IR::U32 n16 = N ? ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result + : ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32)); + const IR::U32 m16 = M ? 
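// N and M select which halfword of each operand feeds the multiply: an arithmetic
// shift right by 16 yields the sign-extended top half, otherwise the bottom half
// is sign-extended directly. A minimal scalar sketch of the SMUL<x><y> family,
// assuming plain integer types rather than IR values:
//
//     int32_t smulxy(uint32_t rn, uint32_t rm, bool N, bool M) {
//         const int32_t n16 = int16_t(N ? rn >> 16 : rn);   // select halfword, sign-extend
//         const int32_t m16 = int16_t(M ? rm >> 16 : rm);
//         return n16 * m16;                                 // 16 x 16 -> 32 bit, cannot overflow
//     }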
ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result + : ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32)); + const IR::U32 result = ir.Mul(n16, m16); + + ir.SetRegister(d, result); + return true; +} + +// SMLAW<y><c> <Rd>, <Rn>, <Rm>, <Ra> +bool TranslatorVisitor::arm_SMLAWy(Cond cond, Reg d, Reg a, Reg m, bool M, Reg n) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const IR::U64 n32 = ir.SignExtendWordToLong(ir.GetRegister(n)); + IR::U32 m32 = ir.GetRegister(m); + if (M) { + m32 = ir.LogicalShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result; + } + const IR::U64 m16 = ir.SignExtendWordToLong(ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32))); + const auto product = ir.LeastSignificantWord(ir.LogicalShiftRight(ir.Mul(n32, m16), ir.Imm8(16))); + const auto result = ir.AddWithCarry(product, ir.GetRegister(a), ir.Imm1(0)); + + ir.SetRegister(d, result); + ir.OrQFlag(ir.GetOverflowFrom(result)); + return true; +} + +// SMULW<y><c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_SMULWy(Cond cond, Reg d, Reg m, bool M, Reg n) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const IR::U64 n32 = ir.SignExtendWordToLong(ir.GetRegister(n)); + IR::U32 m32 = ir.GetRegister(m); + if (M) { + m32 = ir.LogicalShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result; + } + const IR::U64 m16 = ir.SignExtendWordToLong(ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32))); + const auto result = ir.LogicalShiftRight(ir.Mul(n32, m16), ir.Imm8(16)); + + ir.SetRegister(d, ir.LeastSignificantWord(result)); + return true; +} + +// SMMLA{R}<c> <Rd>, <Rn>, <Rm>, <Ra> +bool TranslatorVisitor::arm_SMMLA(Cond cond, Reg d, Reg a, Reg m, bool R, Reg n) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC /* no check for a */) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto n64 = ir.SignExtendWordToLong(ir.GetRegister(n)); + const auto m64 = ir.SignExtendWordToLong(ir.GetRegister(m)); + const auto a64 = ir.Pack2x32To1x64(ir.Imm32(0), ir.GetRegister(a)); + const auto temp = ir.Add(a64, ir.Mul(n64, m64)); + const auto result_carry = ir.MostSignificantWord(temp); + auto result = result_carry.result; + if (R) { + result = ir.AddWithCarry(result, ir.Imm32(0), result_carry.carry); + } + + ir.SetRegister(d, result); + return true; +} + +// SMMLS{R}<c> <Rd>, <Rn>, <Rm>, <Ra> +bool TranslatorVisitor::arm_SMMLS(Cond cond, Reg d, Reg a, Reg m, bool R, Reg n) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto n64 = ir.SignExtendWordToLong(ir.GetRegister(n)); + const auto m64 = ir.SignExtendWordToLong(ir.GetRegister(m)); + const auto a64 = ir.Pack2x32To1x64(ir.Imm32(0), ir.GetRegister(a)); + const auto temp = ir.Sub(a64, ir.Mul(n64, m64)); + const auto result_carry = ir.MostSignificantWord(temp); + auto result = result_carry.result; + if (R) { + result = ir.AddWithCarry(result, ir.Imm32(0), result_carry.carry); + } + + ir.SetRegister(d, result); + return true; +} + +// SMMUL{R}<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_SMMUL(Cond cond, Reg d, Reg m, bool R, Reg n) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + 
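// A failed condition check still consumes the instruction as a NOP: the visitor
// returns true so translation of the basic block continues. Returning false is
// reserved for cases that end the block, such as the PC-loading path in LDMHelper
// above, which sets a terminal before returning false.
//
//     // Hypothetical driver loop illustrating how the return value is consumed:
//     bool should_continue = true;
//     while (should_continue) {
//         should_continue = DecodeAndVisit(visitor, FetchInstruction(pc));  // assumed helpers
//         pc += 4;
//     }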
return true; + } + + const auto n64 = ir.SignExtendWordToLong(ir.GetRegister(n)); + const auto m64 = ir.SignExtendWordToLong(ir.GetRegister(m)); + const auto product = ir.Mul(n64, m64); + const auto result_carry = ir.MostSignificantWord(product); + auto result = result_carry.result; + if (R) { + result = ir.AddWithCarry(result, ir.Imm32(0), result_carry.carry); + } + + ir.SetRegister(d, result); + return true; +} + +// SMLAD{X}<c> <Rd>, <Rn>, <Rm>, <Ra> +bool TranslatorVisitor::arm_SMLAD(Cond cond, Reg d, Reg a, Reg m, bool M, Reg n) { + if (a == Reg::PC) { + return arm_SMUAD(cond, d, m, M, n); + } + + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const IR::U32 n32 = ir.GetRegister(n); + const IR::U32 m32 = ir.GetRegister(m); + const IR::U32 n_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32)); + const IR::U32 n_hi = ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result; + + IR::U32 m_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32)); + IR::U32 m_hi = ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result; + if (M) { + std::swap(m_lo, m_hi); + } + + const IR::U32 product_lo = ir.Mul(n_lo, m_lo); + const IR::U32 product_hi = ir.Mul(n_hi, m_hi); + const IR::U32 addend = ir.GetRegister(a); + + auto result = ir.AddWithCarry(product_lo, product_hi, ir.Imm1(0)); + ir.OrQFlag(ir.GetOverflowFrom(result)); + result = ir.AddWithCarry(result, addend, ir.Imm1(0)); + ir.SetRegister(d, result); + ir.OrQFlag(ir.GetOverflowFrom(result)); + return true; +} + +// SMLALD{X}<c> <RdLo>, <RdHi>, <Rn>, <Rm> +bool TranslatorVisitor::arm_SMLALD(Cond cond, Reg dHi, Reg dLo, Reg m, bool M, Reg n) { + if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (dLo == dHi) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const IR::U32 n32 = ir.GetRegister(n); + const IR::U32 m32 = ir.GetRegister(m); + const IR::U32 n_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32)); + const IR::U32 n_hi = ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result; + + IR::U32 m_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32)); + IR::U32 m_hi = ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result; + if (M) { + std::swap(m_lo, m_hi); + } + + const IR::U64 product_lo = ir.SignExtendWordToLong(ir.Mul(n_lo, m_lo)); + const IR::U64 product_hi = ir.SignExtendWordToLong(ir.Mul(n_hi, m_hi)); + const auto addend = ir.Pack2x32To1x64(ir.GetRegister(dLo), ir.GetRegister(dHi)); + const auto result = ir.Add(ir.Add(product_lo, product_hi), addend); + + ir.SetRegister(dLo, ir.LeastSignificantWord(result)); + ir.SetRegister(dHi, ir.MostSignificantWord(result).result); + return true; +} + +// SMLSD{X}<c> <Rd>, <Rn>, <Rm>, <Ra> +bool TranslatorVisitor::arm_SMLSD(Cond cond, Reg d, Reg a, Reg m, bool M, Reg n) { + if (a == Reg::PC) { + return arm_SMUSD(cond, d, m, M, n); + } + + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const IR::U32 n32 = ir.GetRegister(n); + const IR::U32 m32 = ir.GetRegister(m); + const IR::U32 n_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32)); + const IR::U32 n_hi = ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result; + + IR::U32 m_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32)); + IR::U32 m_hi = 
ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result; + if (M) { + std::swap(m_lo, m_hi); + } + + const IR::U32 product_lo = ir.Mul(n_lo, m_lo); + const IR::U32 product_hi = ir.Mul(n_hi, m_hi); + const IR::U32 addend = ir.GetRegister(a); + const IR::U32 product = ir.Sub(product_lo, product_hi); + auto result = ir.AddWithCarry(product, addend, ir.Imm1(0)); + + ir.SetRegister(d, result); + ir.OrQFlag(ir.GetOverflowFrom(result)); + return true; +} + +// SMLSLD{X}<c> <RdLo>, <RdHi>, <Rn>, <Rm> +bool TranslatorVisitor::arm_SMLSLD(Cond cond, Reg dHi, Reg dLo, Reg m, bool M, Reg n) { + if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (dLo == dHi) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const IR::U32 n32 = ir.GetRegister(n); + const IR::U32 m32 = ir.GetRegister(m); + const IR::U32 n_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32)); + const IR::U32 n_hi = ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result; + + IR::U32 m_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32)); + IR::U32 m_hi = ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result; + if (M) { + std::swap(m_lo, m_hi); + } + + const IR::U64 product_lo = ir.SignExtendWordToLong(ir.Mul(n_lo, m_lo)); + const IR::U64 product_hi = ir.SignExtendWordToLong(ir.Mul(n_hi, m_hi)); + const auto addend = ir.Pack2x32To1x64(ir.GetRegister(dLo), ir.GetRegister(dHi)); + const auto result = ir.Add(ir.Sub(product_lo, product_hi), addend); + + ir.SetRegister(dLo, ir.LeastSignificantWord(result)); + ir.SetRegister(dHi, ir.MostSignificantWord(result).result); + return true; +} + +// SMUAD{X}<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_SMUAD(Cond cond, Reg d, Reg m, bool M, Reg n) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const IR::U32 n32 = ir.GetRegister(n); + const IR::U32 m32 = ir.GetRegister(m); + const IR::U32 n_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32)); + const IR::U32 n_hi = ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result; + + IR::U32 m_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32)); + IR::U32 m_hi = ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result; + if (M) { + std::swap(m_lo, m_hi); + } + + const IR::U32 product_lo = ir.Mul(n_lo, m_lo); + const IR::U32 product_hi = ir.Mul(n_hi, m_hi); + const auto result = ir.AddWithCarry(product_lo, product_hi, ir.Imm1(0)); + + ir.SetRegister(d, result); + ir.OrQFlag(ir.GetOverflowFrom(result)); + return true; +} + +// SMUSD{X}<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_SMUSD(Cond cond, Reg d, Reg m, bool M, Reg n) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const IR::U32 n32 = ir.GetRegister(n); + const IR::U32 m32 = ir.GetRegister(m); + const IR::U32 n_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32)); + const IR::U32 n_hi = ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result; + + IR::U32 m_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32)); + IR::U32 m_hi = ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result; + if (M) { + std::swap(m_lo, m_hi); + } + + const IR::U32 product_lo = ir.Mul(n_lo, m_lo); + const IR::U32 product_hi = ir.Mul(n_hi, m_hi); + auto result = ir.Sub(product_lo, product_hi); + ir.SetRegister(d, 
result); + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/packing.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/packing.cpp new file mode 100644 index 0000000000..5cb88fe47d --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/packing.cpp @@ -0,0 +1,46 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +// PKHBT<c> <Rd>, <Rn>, <Rm>{, LSL #<imm>} +bool TranslatorVisitor::arm_PKHBT(Cond cond, Reg n, Reg d, Imm<5> imm5, Reg m) { + if (n == Reg::PC || d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), ShiftType::LSL, imm5, ir.Imm1(false)).result; + const auto lower_half = ir.And(ir.GetRegister(n), ir.Imm32(0x0000FFFF)); + const auto upper_half = ir.And(shifted, ir.Imm32(0xFFFF0000)); + + ir.SetRegister(d, ir.Or(lower_half, upper_half)); + return true; +} + +// PKHTB<c> <Rd>, <Rn>, <Rm>{, ASR #<imm>} +bool TranslatorVisitor::arm_PKHTB(Cond cond, Reg n, Reg d, Imm<5> imm5, Reg m) { + if (n == Reg::PC || d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), ShiftType::ASR, imm5, ir.Imm1(false)).result; + const auto lower_half = ir.And(shifted, ir.Imm32(0x0000FFFF)); + const auto upper_half = ir.And(ir.GetRegister(n), ir.Imm32(0xFFFF0000)); + + ir.SetRegister(d, ir.Or(lower_half, upper_half)); + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/parallel.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/parallel.cpp new file mode 100644 index 0000000000..666cee2c33 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/parallel.cpp @@ -0,0 +1,537 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +// SADD8<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_SADD8(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedAddS8(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + return true; +} + +// SADD16<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_SADD16(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedAddS16(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + return true; +} + +// SASX<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_SASX(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedAddSubS16(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + return true; +} + +// SSAX<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_SSAX(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedSubAddS16(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + return true; +} + +// SSUB8<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_SSUB8(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedSubS8(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + return true; +} + +// SSUB16<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_SSUB16(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedSubS16(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + return true; +} + +// UADD8<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_UADD8(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedAddU8(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + return true; +} + +// UADD16<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_UADD16(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedAddU16(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + return true; +} + +// UASX<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_UASX(Cond 
cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedAddSubU16(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + return true; +} + +// USAX<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_USAX(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedSubAddU16(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + return true; +} + +// USAD8<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_USAD8(Cond cond, Reg d, Reg m, Reg n) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedAbsDiffSumU8(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result); + return true; +} + +// USADA8<c> <Rd>, <Rn>, <Rm>, <Ra> +bool TranslatorVisitor::arm_USADA8(Cond cond, Reg d, Reg a, Reg m, Reg n) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto tmp = ir.PackedAbsDiffSumU8(ir.GetRegister(n), ir.GetRegister(m)); + const auto result = ir.AddWithCarry(ir.GetRegister(a), tmp, ir.Imm1(0)); + ir.SetRegister(d, result); + return true; +} + +// USUB8<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_USUB8(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedSubU8(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + return true; +} + +// USUB16<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_USUB16(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedSubU16(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + return true; +} + +// Parallel Add/Subtract (Saturating) instructions + +// QADD8<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_QADD8(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedSaturatedAddS8(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result); + return true; +} + +// QADD16<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_QADD16(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedSaturatedAddS16(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result); + return true; +} + +// QSUB8<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_QSUB8(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = 
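// Each byte lane is clamped independently to the signed 8-bit range; unlike the
// scalar QADD/QSUB further down in saturated.cpp, the packed Q* forms do not set
// the Q (sticky saturation) flag. One lane, sketched with plain integers:
//
//     int8_t qsub8_lane(int8_t a, int8_t b) {
//         const int32_t d = int32_t(a) - int32_t(b);
//         return int8_t(d < -128 ? -128 : (d > 127 ? 127 : d));   // clamp to [-128, 127]
//     }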
ir.PackedSaturatedSubS8(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result); + return true; +} + +// QSUB16<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_QSUB16(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedSaturatedSubS16(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result); + return true; +} + +// UQADD8<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_UQADD8(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedSaturatedAddU8(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result); + return true; +} + +// UQADD16<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_UQADD16(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedSaturatedAddU16(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result); + return true; +} + +// UQSUB8<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_UQSUB8(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedSaturatedSubU8(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result); + return true; +} + +// UQSUB16<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_UQSUB16(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedSaturatedSubU16(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result); + return true; +} + +// Parallel Add/Subtract (Halving) instructions + +// SHADD8<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_SHADD8(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedHalvingAddS8(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result); + return true; +} + +// SHADD16<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_SHADD16(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedHalvingAddS16(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result); + return true; +} + +// SHASX<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_SHASX(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedHalvingAddSubS16(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result); + return true; +} + +// SHSAX<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_SHSAX(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = 
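// The halving forms compute each 16-bit lane in a widened domain and then shift
// right by one, so they can neither overflow nor saturate and, unlike the
// SADD16/SSAX family above, they leave the GE flags untouched. One lane pair of
// SHSAX, sketched with plain integers (>> acts as an arithmetic shift here):
//
//     int16_t hsub(int16_t a, int16_t b) { return int16_t((int32_t(a) - int32_t(b)) >> 1); }
//     int16_t hadd(int16_t a, int16_t b) { return int16_t((int32_t(a) + int32_t(b)) >> 1); }
//     // result: high half = hsub(n_hi, m_lo), low half = hadd(n_lo, m_hi)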
ir.PackedHalvingSubAddS16(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result); + return true; +} + +// SHSUB8<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_SHSUB8(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedHalvingSubS8(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result); + return true; +} + +// SHSUB16<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_SHSUB16(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedHalvingSubS16(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result); + return true; +} + +// UHADD8<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_UHADD8(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedHalvingAddU8(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result); + return true; +} + +// UHADD16<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_UHADD16(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedHalvingAddU16(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result); + return true; +} + +// UHASX<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_UHASX(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedHalvingAddSubU16(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result); + return true; +} + +// UHSAX<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_UHSAX(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedHalvingSubAddU16(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result); + return true; +} + +// UHSUB8<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_UHSUB8(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedHalvingSubU8(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result); + return true; +} + +// UHSUB16<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_UHSUB16(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.PackedHalvingSubU16(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result); + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/reversal.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/reversal.cpp new file mode 100644 index 0000000000..4bf594919e --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/reversal.cpp @@ 
-0,0 +1,89 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +// RBIT<c> <Rd>, <Rm> +bool TranslatorVisitor::arm_RBIT(Cond cond, Reg d, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const IR::U32 swapped = ir.ByteReverseWord(ir.GetRegister(m)); + + // ((x & 0xF0F0F0F0) >> 4) | ((x & 0x0F0F0F0F) << 4) + const IR::U32 first_lsr = ir.LogicalShiftRight(ir.And(swapped, ir.Imm32(0xF0F0F0F0)), ir.Imm8(4)); + const IR::U32 first_lsl = ir.LogicalShiftLeft(ir.And(swapped, ir.Imm32(0x0F0F0F0F)), ir.Imm8(4)); + const IR::U32 corrected = ir.Or(first_lsl, first_lsr); + + // ((x & 0x88888888) >> 3) | ((x & 0x44444444) >> 1) | + // ((x & 0x22222222) << 1) | ((x & 0x11111111) << 3) + const IR::U32 second_lsr = ir.LogicalShiftRight(ir.And(corrected, ir.Imm32(0x88888888)), ir.Imm8(3)); + const IR::U32 third_lsr = ir.LogicalShiftRight(ir.And(corrected, ir.Imm32(0x44444444)), ir.Imm8(1)); + const IR::U32 second_lsl = ir.LogicalShiftLeft(ir.And(corrected, ir.Imm32(0x22222222)), ir.Imm8(1)); + const IR::U32 third_lsl = ir.LogicalShiftLeft(ir.And(corrected, ir.Imm32(0x11111111)), ir.Imm8(3)); + + const IR::U32 result = ir.Or(ir.Or(ir.Or(second_lsr, third_lsr), second_lsl), third_lsl); + + ir.SetRegister(d, result); + return true; +} + +// REV<c> <Rd>, <Rm> +bool TranslatorVisitor::arm_REV(Cond cond, Reg d, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto result = ir.ByteReverseWord(ir.GetRegister(m)); + ir.SetRegister(d, result); + return true; +} + +// REV16<c> <Rd>, <Rm> +bool TranslatorVisitor::arm_REV16(Cond cond, Reg d, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto reg_m = ir.GetRegister(m); + const auto lo = ir.And(ir.LogicalShiftRight(reg_m, ir.Imm8(8), ir.Imm1(0)).result, ir.Imm32(0x00FF00FF)); + const auto hi = ir.And(ir.LogicalShiftLeft(reg_m, ir.Imm8(8), ir.Imm1(0)).result, ir.Imm32(0xFF00FF00)); + const auto result = ir.Or(lo, hi); + + ir.SetRegister(d, result); + return true; +} + +// REVSH<c> <Rd>, <Rm> +bool TranslatorVisitor::arm_REVSH(Cond cond, Reg d, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto rev_half = ir.ByteReverseHalf(ir.LeastSignificantHalf(ir.GetRegister(m))); + ir.SetRegister(d, ir.SignExtendHalfToWord(rev_half)); + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/saturated.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/saturated.cpp new file mode 100644 index 0000000000..41db115044 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/saturated.cpp @@ -0,0 +1,285 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +static IR::U32 Pack2x16To1x32(A32::IREmitter& ir, IR::U32 lo, IR::U32 hi) { + return ir.Or(ir.And(lo, ir.Imm32(0xFFFF)), ir.LogicalShiftLeft(hi, ir.Imm8(16), ir.Imm1(0)).result); +} + +static IR::U16 MostSignificantHalf(A32::IREmitter& ir, IR::U32 value) { + return ir.LeastSignificantHalf(ir.LogicalShiftRight(value, ir.Imm8(16), ir.Imm1(0)).result); +} + +// Saturation instructions + +// SSAT<c> <Rd>, #<imm>, <Rn>{, <shift>} +bool TranslatorVisitor::arm_SSAT(Cond cond, Imm<5> sat_imm, Reg d, Imm<5> imm5, bool sh, Reg n) { + if (d == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto saturate_to = static_cast<size_t>(sat_imm.ZeroExtend()) + 1; + const auto shift = !sh ? ShiftType::LSL : ShiftType::ASR; + const auto operand = EmitImmShift(ir.GetRegister(n), shift, imm5, ir.GetCFlag()); + const auto result = ir.SignedSaturation(operand.result, saturate_to); + + ir.SetRegister(d, result.result); + ir.OrQFlag(result.overflow); + return true; +} + +// SSAT16<c> <Rd>, #<imm>, <Rn> +bool TranslatorVisitor::arm_SSAT16(Cond cond, Imm<4> sat_imm, Reg d, Reg n) { + if (d == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto saturate_to = static_cast<size_t>(sat_imm.ZeroExtend()) + 1; + const auto lo_operand = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(ir.GetRegister(n))); + const auto hi_operand = ir.SignExtendHalfToWord(MostSignificantHalf(ir, ir.GetRegister(n))); + const auto lo_result = ir.SignedSaturation(lo_operand, saturate_to); + const auto hi_result = ir.SignedSaturation(hi_operand, saturate_to); + + ir.SetRegister(d, Pack2x16To1x32(ir, lo_result.result, hi_result.result)); + ir.OrQFlag(lo_result.overflow); + ir.OrQFlag(hi_result.overflow); + return true; +} + +// USAT<c> <Rd>, #<imm5>, <Rn>{, <shift>} +bool TranslatorVisitor::arm_USAT(Cond cond, Imm<5> sat_imm, Reg d, Imm<5> imm5, bool sh, Reg n) { + if (d == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto saturate_to = static_cast<size_t>(sat_imm.ZeroExtend()); + const auto shift = !sh ? ShiftType::LSL : ShiftType::ASR; + const auto operand = EmitImmShift(ir.GetRegister(n), shift, imm5, ir.GetCFlag()); + const auto result = ir.UnsignedSaturation(operand.result, saturate_to); + + ir.SetRegister(d, result.result); + ir.OrQFlag(result.overflow); + return true; +} + +// USAT16<c> <Rd>, #<imm4>, <Rn> +bool TranslatorVisitor::arm_USAT16(Cond cond, Imm<4> sat_imm, Reg d, Reg n) { + if (d == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + // UnsignedSaturation takes a *signed* value as input, hence sign extension is required. 
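
// A minimal standalone sketch (plain C++, not the dynarmic IR) of the
// unsigned-saturation semantics relied on here: the input is treated as a
// *signed* value and clamped to [0, 2^n - 1]; the "clamped" flag is what
// feeds the Q flag.  The helper name is illustrative only.
#include <cstdint>
#include <utility>

std::pair<std::uint32_t, bool> UnsignedSatSketch(std::int32_t value, unsigned n) {
    const std::int64_t max = (std::int64_t{1} << n) - 1;   // 2^n - 1, n <= 31 as in USAT/USAT16
    if (value < 0) {
        return {0u, true};                                  // clamped low -> Q would be set
    }
    if (value > max) {
        return {static_cast<std::uint32_t>(max), true};     // clamped high -> Q would be set
    }
    return {static_cast<std::uint32_t>(value), false};
}
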
+ const auto saturate_to = static_cast<size_t>(sat_imm.ZeroExtend()); + const auto lo_operand = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(ir.GetRegister(n))); + const auto hi_operand = ir.SignExtendHalfToWord(MostSignificantHalf(ir, ir.GetRegister(n))); + const auto lo_result = ir.UnsignedSaturation(lo_operand, saturate_to); + const auto hi_result = ir.UnsignedSaturation(hi_operand, saturate_to); + + ir.SetRegister(d, Pack2x16To1x32(ir, lo_result.result, hi_result.result)); + ir.OrQFlag(lo_result.overflow); + ir.OrQFlag(hi_result.overflow); + return true; +} + +// Saturated Add/Subtract instructions + +// QADD<c> <Rd>, <Rm>, <Rn> +bool TranslatorVisitor::arm_QADD(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto a = ir.GetRegister(m); + const auto b = ir.GetRegister(n); + const auto result = ir.SignedSaturatedAddWithFlag(a, b); + + ir.SetRegister(d, result.result); + ir.OrQFlag(result.overflow); + return true; +} + +// QSUB<c> <Rd>, <Rm>, <Rn> +bool TranslatorVisitor::arm_QSUB(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto a = ir.GetRegister(m); + const auto b = ir.GetRegister(n); + const auto result = ir.SignedSaturatedSubWithFlag(a, b); + + ir.SetRegister(d, result.result); + ir.OrQFlag(result.overflow); + return true; +} + +// QDADD<c> <Rd>, <Rm>, <Rn> +bool TranslatorVisitor::arm_QDADD(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto a = ir.GetRegister(m); + const auto b = ir.GetRegister(n); + const auto doubled = ir.SignedSaturatedAddWithFlag(b, b); + ir.OrQFlag(doubled.overflow); + + const auto result = ir.SignedSaturatedAddWithFlag(a, doubled.result); + ir.SetRegister(d, result.result); + ir.OrQFlag(result.overflow); + return true; +} + +// QDSUB<c> <Rd>, <Rm>, <Rn> +bool TranslatorVisitor::arm_QDSUB(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto a = ir.GetRegister(m); + const auto b = ir.GetRegister(n); + const auto doubled = ir.SignedSaturatedAddWithFlag(b, b); + ir.OrQFlag(doubled.overflow); + + const auto result = ir.SignedSaturatedSubWithFlag(a, doubled.result); + ir.SetRegister(d, result.result); + ir.OrQFlag(result.overflow); + return true; +} + +// Parallel saturated instructions + +// QASX<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_QASX(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto Rn = ir.GetRegister(n); + const auto Rm = ir.GetRegister(m); + const auto Rn_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(Rn)); + const auto Rn_hi = ir.SignExtendHalfToWord(MostSignificantHalf(ir, Rn)); + const auto Rm_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(Rm)); + const auto Rm_hi = ir.SignExtendHalfToWord(MostSignificantHalf(ir, Rm)); + const auto diff = ir.SignedSaturation(ir.Sub(Rn_lo, Rm_hi), 16).result; + const auto sum = ir.SignedSaturation(ir.Add(Rn_hi, Rm_lo), 16).result; + const auto result = 
Pack2x16To1x32(ir, diff, sum); + + ir.SetRegister(d, result); + return true; +} + +// QSAX<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_QSAX(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto Rn = ir.GetRegister(n); + const auto Rm = ir.GetRegister(m); + const auto Rn_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(Rn)); + const auto Rn_hi = ir.SignExtendHalfToWord(MostSignificantHalf(ir, Rn)); + const auto Rm_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(Rm)); + const auto Rm_hi = ir.SignExtendHalfToWord(MostSignificantHalf(ir, Rm)); + const auto sum = ir.SignedSaturation(ir.Add(Rn_lo, Rm_hi), 16).result; + const auto diff = ir.SignedSaturation(ir.Sub(Rn_hi, Rm_lo), 16).result; + const auto result = Pack2x16To1x32(ir, sum, diff); + + ir.SetRegister(d, result); + return true; +} + +// UQASX<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_UQASX(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto Rn = ir.GetRegister(n); + const auto Rm = ir.GetRegister(m); + const auto Rn_lo = ir.ZeroExtendHalfToWord(ir.LeastSignificantHalf(Rn)); + const auto Rn_hi = ir.ZeroExtendHalfToWord(MostSignificantHalf(ir, Rn)); + const auto Rm_lo = ir.ZeroExtendHalfToWord(ir.LeastSignificantHalf(Rm)); + const auto Rm_hi = ir.ZeroExtendHalfToWord(MostSignificantHalf(ir, Rm)); + const auto diff = ir.UnsignedSaturation(ir.Sub(Rn_lo, Rm_hi), 16).result; + const auto sum = ir.UnsignedSaturation(ir.Add(Rn_hi, Rm_lo), 16).result; + const auto result = Pack2x16To1x32(ir, diff, sum); + + ir.SetRegister(d, result); + return true; +} + +// UQSAX<c> <Rd>, <Rn>, <Rm> +bool TranslatorVisitor::arm_UQSAX(Cond cond, Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto Rn = ir.GetRegister(n); + const auto Rm = ir.GetRegister(m); + const auto Rn_lo = ir.ZeroExtendHalfToWord(ir.LeastSignificantHalf(Rn)); + const auto Rn_hi = ir.ZeroExtendHalfToWord(MostSignificantHalf(ir, Rn)); + const auto Rm_lo = ir.ZeroExtendHalfToWord(ir.LeastSignificantHalf(Rm)); + const auto Rm_hi = ir.ZeroExtendHalfToWord(MostSignificantHalf(ir, Rm)); + const auto sum = ir.UnsignedSaturation(ir.Add(Rn_lo, Rm_hi), 16).result; + const auto diff = ir.UnsignedSaturation(ir.Sub(Rn_hi, Rm_lo), 16).result; + const auto result = Pack2x16To1x32(ir, sum, diff); + + ir.SetRegister(d, result); + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/status_register_access.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/status_register_access.cpp new file mode 100644 index 0000000000..60110df891 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/status_register_access.cpp @@ -0,0 +1,121 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/bit/bit_field.hpp> + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +// CPS<effect> <iflags>{, #<mode>} +// CPS #<mode> +bool TranslatorVisitor::arm_CPS() { + return InterpretThisInstruction(); +} + +// MRS<c> <Rd>, <spec_reg> +bool TranslatorVisitor::arm_MRS(Cond cond, Reg d) { + if (d == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + ir.SetRegister(d, ir.GetCpsr()); + return true; +} + +// MSR<c> <spec_reg>, #<const> +bool TranslatorVisitor::arm_MSR_imm(Cond cond, unsigned mask, int rotate, Imm<8> imm8) { + ASSERT_MSG(mask != 0, "Decode error"); + + if (!ArmConditionPassed(cond)) { + return true; + } + + const bool write_nzcvq = mcl::bit::get_bit<3>(mask); + const bool write_g = mcl::bit::get_bit<2>(mask); + const bool write_e = mcl::bit::get_bit<1>(mask); + const u32 imm32 = ArmExpandImm(rotate, imm8); + + if (write_nzcvq) { + ir.SetCpsrNZCVQ(ir.Imm32(imm32 & 0xF8000000)); + } + + if (write_g) { + ir.SetGEFlagsCompressed(ir.Imm32(imm32 & 0x000F0000)); + } + + if (write_e) { + const bool E = (imm32 & 0x00000200) != 0; + if (E != ir.current_location.EFlag()) { + ir.SetTerm(IR::Term::LinkBlock{ir.current_location.AdvancePC(4).SetEFlag(E)}); + return false; + } + } + + return true; +} + +// MSR<c> <spec_reg>, <Rn> +bool TranslatorVisitor::arm_MSR_reg(Cond cond, unsigned mask, Reg n) { + if (mask == 0) { + return UnpredictableInstruction(); + } + + if (n == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const bool write_nzcvq = mcl::bit::get_bit<3>(mask); + const bool write_g = mcl::bit::get_bit<2>(mask); + const bool write_e = mcl::bit::get_bit<1>(mask); + const auto value = ir.GetRegister(n); + + if (!write_e) { + if (write_nzcvq) { + ir.SetCpsrNZCVQ(ir.And(value, ir.Imm32(0xF8000000))); + } + + if (write_g) { + ir.SetGEFlagsCompressed(ir.And(value, ir.Imm32(0x000F0000))); + } + } else { + const u32 cpsr_mask = (write_nzcvq ? 0xF8000000 : 0) | (write_g ? 0x000F0000 : 0) | 0x00000200; + const auto old_cpsr = ir.And(ir.GetCpsr(), ir.Imm32(~cpsr_mask)); + const auto new_cpsr = ir.And(value, ir.Imm32(cpsr_mask)); + ir.SetCpsr(ir.Or(old_cpsr, new_cpsr)); + ir.PushRSB(ir.current_location.AdvancePC(4)); + ir.BranchWritePC(ir.Imm32(ir.current_location.PC() + 4)); + ir.SetTerm(IR::Term::CheckHalt{IR::Term::PopRSBHint{}}); + return false; + } + + return true; +} + +// RFE{<amode>} <Rn>{!} +bool TranslatorVisitor::arm_RFE() { + return InterpretThisInstruction(); +} + +// SETEND <endian_specifier> +bool TranslatorVisitor::arm_SETEND(bool E) { + ir.SetTerm(IR::Term::LinkBlock{ir.current_location.AdvancePC(4).SetEFlag(E)}); + return false; +} + +// SRS{<amode>} SP{!}, #<mode> +bool TranslatorVisitor::arm_SRS() { + return InterpretThisInstruction(); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/synchronization.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/synchronization.cpp new file mode 100644 index 0000000000..e9f04b70e5 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/synchronization.cpp @@ -0,0 +1,439 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +// CLREX +bool TranslatorVisitor::arm_CLREX() { + ir.ClearExclusive(); + return true; +} + +// SWP<c> <Rt>, <Rt2>, [<Rn>] +// TODO: UNDEFINED if current mode is Hypervisor +bool TranslatorVisitor::arm_SWP(Cond cond, Reg n, Reg t, Reg t2) { + if (t == Reg::PC || t2 == Reg::PC || n == Reg::PC || n == t || n == t2) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + // TODO (HACK): Implement bus locking here + const auto data = ir.ReadMemory32(ir.GetRegister(n), IR::AccType::SWAP); + ir.WriteMemory32(ir.GetRegister(n), ir.GetRegister(t2), IR::AccType::SWAP); + // TODO: Alignment check + ir.SetRegister(t, data); + return true; +} + +// SWPB<c> <Rt>, <Rt2>, [<Rn>] +// TODO: UNDEFINED if current mode is Hypervisor +bool TranslatorVisitor::arm_SWPB(Cond cond, Reg n, Reg t, Reg t2) { + if (t == Reg::PC || t2 == Reg::PC || n == Reg::PC || n == t || n == t2) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + // TODO (HACK): Implement bus locking here + const auto data = ir.ReadMemory8(ir.GetRegister(n), IR::AccType::SWAP); + ir.WriteMemory8(ir.GetRegister(n), ir.LeastSignificantByte(ir.GetRegister(t2)), IR::AccType::SWAP); + // TODO: Alignment check + ir.SetRegister(t, ir.ZeroExtendByteToWord(data)); + return true; +} + +// LDA<c> <Rt>, [<Rn>] +bool TranslatorVisitor::arm_LDA(Cond cond, Reg n, Reg t) { + if (t == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto address = ir.GetRegister(n); + ir.SetRegister(t, ir.ReadMemory32(address, IR::AccType::ORDERED)); + return true; +} +// LDAB<c> <Rt>, [<Rn>] +bool TranslatorVisitor::arm_LDAB(Cond cond, Reg n, Reg t) { + if (t == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto address = ir.GetRegister(n); + ir.SetRegister(t, ir.ZeroExtendToWord(ir.ReadMemory8(address, IR::AccType::ORDERED))); + return true; +} +// LDAH<c> <Rt>, [<Rn>] +bool TranslatorVisitor::arm_LDAH(Cond cond, Reg n, Reg t) { + if (t == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto address = ir.GetRegister(n); + ir.SetRegister(t, ir.ZeroExtendToWord(ir.ReadMemory16(address, IR::AccType::ORDERED))); + return true; +} + +// LDAEX<c> <Rt>, [<Rn>] +bool TranslatorVisitor::arm_LDAEX(Cond cond, Reg n, Reg t) { + if (t == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto address = ir.GetRegister(n); + ir.SetRegister(t, ir.ExclusiveReadMemory32(address, IR::AccType::ORDERED)); + return true; +} + +// LDAEXB<c> <Rt>, [<Rn>] +bool TranslatorVisitor::arm_LDAEXB(Cond cond, Reg n, Reg t) { + if (t == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto address = ir.GetRegister(n); + ir.SetRegister(t, ir.ZeroExtendByteToWord(ir.ExclusiveReadMemory8(address, IR::AccType::ORDERED))); + return true; +} + +// LDAEXD<c> <Rt>, <Rt2>, [<Rn>] +bool TranslatorVisitor::arm_LDAEXD(Cond cond, Reg n, Reg t) { + if (t == Reg::LR || t == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } 
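
// A minimal single-threaded sketch of the exclusive-monitor contract the
// LDREX/LDAEX and STREX/STLEX translations in this file depend on: a
// load-exclusive takes a reservation, and a store-exclusive only writes if
// the reservation is still held, returning 0 on success and 1 on failure
// (the same value the translations above write into Rd).  The type and
// member names are illustrative, not dynarmic's actual monitor.
#include <cstdint>
#include <optional>
#include <unordered_map>

struct ExclusiveMonitorSketch {
    std::optional<std::uint32_t> marked_address;

    std::uint32_t LoadExclusive(std::uint32_t addr, std::unordered_map<std::uint32_t, std::uint32_t>& mem) {
        marked_address = addr;            // take the reservation
        return mem[addr];
    }

    std::uint32_t StoreExclusive(std::uint32_t addr, std::uint32_t value, std::unordered_map<std::uint32_t, std::uint32_t>& mem) {
        if (marked_address != addr) {
            return 1;                     // reservation lost or never taken
        }
        mem[addr] = value;
        marked_address.reset();           // a successful store clears the reservation
        return 0;
    }

    void ClearExclusive() { marked_address.reset(); }   // what CLREX models
};
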
+ + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto address = ir.GetRegister(n); + const auto [lo, hi] = ir.ExclusiveReadMemory64(address, IR::AccType::ORDERED); + // DO NOT SWAP hi AND lo IN BIG ENDIAN MODE, THIS IS CORRECT BEHAVIOUR + ir.SetRegister(t, lo); + ir.SetRegister(t + 1, hi); + return true; +} + +// LDAEXH<c> <Rt>, [<Rn>] +bool TranslatorVisitor::arm_LDAEXH(Cond cond, Reg n, Reg t) { + if (t == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto address = ir.GetRegister(n); + ir.SetRegister(t, ir.ZeroExtendHalfToWord(ir.ExclusiveReadMemory16(address, IR::AccType::ORDERED))); + return true; +} + +// STL<c> <Rt>, [<Rn>] +bool TranslatorVisitor::arm_STL(Cond cond, Reg n, Reg t) { + if (t == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto address = ir.GetRegister(n); + ir.WriteMemory32(address, ir.GetRegister(t), IR::AccType::ORDERED); + return true; +} + +// STLB<c> <Rt>, [<Rn>] +bool TranslatorVisitor::arm_STLB(Cond cond, Reg n, Reg t) { + if (t == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto address = ir.GetRegister(n); + ir.WriteMemory8(address, ir.LeastSignificantByte(ir.GetRegister(t)), IR::AccType::ORDERED); + return true; +} + +// STLH<c> <Rd>, <Rt>, [<Rn>] +bool TranslatorVisitor::arm_STLH(Cond cond, Reg n, Reg t) { + if (t == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto address = ir.GetRegister(n); + ir.WriteMemory16(address, ir.LeastSignificantHalf(ir.GetRegister(t)), IR::AccType::ORDERED); + return true; +} + +// STLEXB<c> <Rd>, <Rt>, [<Rn>] +bool TranslatorVisitor::arm_STLEXB(Cond cond, Reg n, Reg d, Reg t) { + if (n == Reg::PC || d == Reg::PC || t == Reg::PC) { + return UnpredictableInstruction(); + } + + if (d == n || d == t) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto address = ir.GetRegister(n); + const auto value = ir.LeastSignificantByte(ir.GetRegister(t)); + const auto passed = ir.ExclusiveWriteMemory8(address, value, IR::AccType::ORDERED); + ir.SetRegister(d, passed); + return true; +} +// STLEXD<c> <Rd>, <Rt>, <Rt2>, [<Rn>] +bool TranslatorVisitor::arm_STLEXD(Cond cond, Reg n, Reg d, Reg t) { + if (n == Reg::PC || d == Reg::PC || t == Reg::LR || static_cast<size_t>(t) % 2 == 1) { + return UnpredictableInstruction(); + } + + if (d == n || d == t || d == t + 1) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const Reg t2 = t + 1; + const auto address = ir.GetRegister(n); + const auto value_lo = ir.GetRegister(t); + const auto value_hi = ir.GetRegister(t2); + const auto passed = ir.ExclusiveWriteMemory64(address, value_lo, value_hi, IR::AccType::ORDERED); + ir.SetRegister(d, passed); + return true; +} + +// STLEXH<c> <Rd>, <Rt>, [<Rn>] +bool TranslatorVisitor::arm_STLEXH(Cond cond, Reg n, Reg d, Reg t) { + if (n == Reg::PC || d == Reg::PC || t == Reg::PC) { + return UnpredictableInstruction(); + } + + if (d == n || d == t) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto address = ir.GetRegister(n); + const auto value = ir.LeastSignificantHalf(ir.GetRegister(t)); + const auto passed = 
ir.ExclusiveWriteMemory16(address, value, IR::AccType::ORDERED); + ir.SetRegister(d, passed); + return true; +} + +// STLEX<c> <Rd>, <Rt>, [<Rn>] +bool TranslatorVisitor::arm_STLEX(Cond cond, Reg n, Reg d, Reg t) { + if (n == Reg::PC || d == Reg::PC || t == Reg::PC) { + return UnpredictableInstruction(); + } + + if (d == n || d == t) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto address = ir.GetRegister(n); + const auto value = ir.GetRegister(t); + const auto passed = ir.ExclusiveWriteMemory32(address, value, IR::AccType::ORDERED); + ir.SetRegister(d, passed); + return true; +} + +// LDREX<c> <Rt>, [<Rn>] +bool TranslatorVisitor::arm_LDREX(Cond cond, Reg n, Reg t) { + if (t == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto address = ir.GetRegister(n); + ir.SetRegister(t, ir.ExclusiveReadMemory32(address, IR::AccType::ATOMIC)); + return true; +} + +// LDREXB<c> <Rt>, [<Rn>] +bool TranslatorVisitor::arm_LDREXB(Cond cond, Reg n, Reg t) { + if (t == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto address = ir.GetRegister(n); + ir.SetRegister(t, ir.ZeroExtendByteToWord(ir.ExclusiveReadMemory8(address, IR::AccType::ATOMIC))); + return true; +} + +// LDREXD<c> <Rt>, <Rt2>, [<Rn>] +bool TranslatorVisitor::arm_LDREXD(Cond cond, Reg n, Reg t) { + if (t == Reg::LR || t == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto address = ir.GetRegister(n); + const auto [lo, hi] = ir.ExclusiveReadMemory64(address, IR::AccType::ATOMIC); + // DO NOT SWAP hi AND lo IN BIG ENDIAN MODE, THIS IS CORRECT BEHAVIOUR + ir.SetRegister(t, lo); + ir.SetRegister(t + 1, hi); + return true; +} + +// LDREXH<c> <Rt>, [<Rn>] +bool TranslatorVisitor::arm_LDREXH(Cond cond, Reg n, Reg t) { + if (t == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto address = ir.GetRegister(n); + ir.SetRegister(t, ir.ZeroExtendHalfToWord(ir.ExclusiveReadMemory16(address, IR::AccType::ATOMIC))); + return true; +} + +// STREX<c> <Rd>, <Rt>, [<Rn>] +bool TranslatorVisitor::arm_STREX(Cond cond, Reg n, Reg d, Reg t) { + if (n == Reg::PC || d == Reg::PC || t == Reg::PC) { + return UnpredictableInstruction(); + } + + if (d == n || d == t) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto address = ir.GetRegister(n); + const auto value = ir.GetRegister(t); + const auto passed = ir.ExclusiveWriteMemory32(address, value, IR::AccType::ATOMIC); + ir.SetRegister(d, passed); + return true; +} + +// STREXB<c> <Rd>, <Rt>, [<Rn>] +bool TranslatorVisitor::arm_STREXB(Cond cond, Reg n, Reg d, Reg t) { + if (n == Reg::PC || d == Reg::PC || t == Reg::PC) { + return UnpredictableInstruction(); + } + + if (d == n || d == t) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto address = ir.GetRegister(n); + const auto value = ir.LeastSignificantByte(ir.GetRegister(t)); + const auto passed = ir.ExclusiveWriteMemory8(address, value, IR::AccType::ATOMIC); + ir.SetRegister(d, passed); + return true; +} + +// STREXD<c> <Rd>, <Rt>, <Rt2>, [<Rn>] +bool TranslatorVisitor::arm_STREXD(Cond cond, Reg n, Reg d, Reg t) { + if (n == Reg::PC 
|| d == Reg::PC || t == Reg::LR || static_cast<size_t>(t) % 2 == 1) { + return UnpredictableInstruction(); + } + + if (d == n || d == t || d == t + 1) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const Reg t2 = t + 1; + const auto address = ir.GetRegister(n); + const auto value_lo = ir.GetRegister(t); + const auto value_hi = ir.GetRegister(t2); + const auto passed = ir.ExclusiveWriteMemory64(address, value_lo, value_hi, IR::AccType::ATOMIC); + ir.SetRegister(d, passed); + return true; +} + +// STREXH<c> <Rd>, <Rt>, [<Rn>] +bool TranslatorVisitor::arm_STREXH(Cond cond, Reg n, Reg d, Reg t) { + if (n == Reg::PC || d == Reg::PC || t == Reg::PC) { + return UnpredictableInstruction(); + } + + if (d == n || d == t) { + return UnpredictableInstruction(); + } + + if (!ArmConditionPassed(cond)) { + return true; + } + + const auto address = ir.GetRegister(n); + const auto value = ir.LeastSignificantHalf(ir.GetRegister(t)); + const auto passed = ir.ExclusiveWriteMemory16(address, value, IR::AccType::ATOMIC); + ir.SetRegister(d, passed); + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb16.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb16.cpp new file mode 100644 index 0000000000..67a94a0e80 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb16.cpp @@ -0,0 +1,995 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/bit/bit_count.hpp> + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" +#include "dynarmic/interface/A32/config.h" + +namespace Dynarmic::A32 { + +// LSLS <Rd>, <Rm>, #<imm5> +bool TranslatorVisitor::thumb16_LSL_imm(Imm<5> imm5, Reg m, Reg d) { + const u8 shift_n = imm5.ZeroExtend<u8>(); + if (shift_n == 0 && ir.current_location.IT().IsInITBlock()) { + return UnpredictableInstruction(); + } + + const auto cpsr_c = ir.GetCFlag(); + const auto result = ir.LogicalShiftLeft(ir.GetRegister(m), ir.Imm8(shift_n), cpsr_c); + + ir.SetRegister(d, result.result); + if (!ir.current_location.IT().IsInITBlock()) { + ir.SetCpsrNZC(ir.NZFrom(result.result), result.carry); + } + return true; +} + +// LSRS <Rd>, <Rm>, #<imm5> +bool TranslatorVisitor::thumb16_LSR_imm(Imm<5> imm5, Reg m, Reg d) { + const u8 shift_n = imm5 != 0 ? imm5.ZeroExtend<u8>() : u8(32); + const auto cpsr_c = ir.GetCFlag(); + const auto result = ir.LogicalShiftRight(ir.GetRegister(m), ir.Imm8(shift_n), cpsr_c); + + ir.SetRegister(d, result.result); + if (!ir.current_location.IT().IsInITBlock()) { + ir.SetCpsrNZC(ir.NZFrom(result.result), result.carry); + } + return true; +} + +// ASRS <Rd>, <Rm>, #<imm5> +bool TranslatorVisitor::thumb16_ASR_imm(Imm<5> imm5, Reg m, Reg d) { + const u8 shift_n = imm5 != 0 ? imm5.ZeroExtend<u8>() : u8(32); + const auto cpsr_c = ir.GetCFlag(); + const auto result = ir.ArithmeticShiftRight(ir.GetRegister(m), ir.Imm8(shift_n), cpsr_c); + + ir.SetRegister(d, result.result); + if (!ir.current_location.IT().IsInITBlock()) { + ir.SetCpsrNZC(ir.NZFrom(result.result), result.carry); + } + return true; +} + +// ADDS <Rd>, <Rn>, <Rm> +// Note that it is not possible to encode Rd == R15. 
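
// Sketch of the T16 immediate-shift decode used by the LSR/ASR translations
// above (plain integers, not the IR emitter): an encoded imm5 of 0 means a
// shift amount of 32, and the carry out is the last bit shifted past bit 0.
// Names are illustrative only.
#include <cstdint>

struct ShiftResultSketch {
    std::uint32_t result;
    bool carry_out;
};

ShiftResultSketch LsrImmSketch(std::uint32_t rm, std::uint32_t imm5) {
    const std::uint32_t shift_n = (imm5 == 0) ? 32u : imm5;   // imm5 == 0 encodes a shift of 32
    if (shift_n >= 32) {
        return {0u, (rm >> 31) != 0};                         // bit 31 is the last bit shifted out
    }
    const bool carry = ((rm >> (shift_n - 1)) & 1u) != 0;
    return {rm >> shift_n, carry};
}
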
+bool TranslatorVisitor::thumb16_ADD_reg_t1(Reg m, Reg n, Reg d) { + const auto result = ir.AddWithCarry(ir.GetRegister(n), ir.GetRegister(m), ir.Imm1(0)); + + ir.SetRegister(d, result); + if (!ir.current_location.IT().IsInITBlock()) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + return true; +} + +// SUBS <Rd>, <Rn>, <Rm> +// Note that it is not possible to encode Rd == R15. +bool TranslatorVisitor::thumb16_SUB_reg(Reg m, Reg n, Reg d) { + const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.GetRegister(m), ir.Imm1(1)); + + ir.SetRegister(d, result); + if (!ir.current_location.IT().IsInITBlock()) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + return true; +} + +// ADDS <Rd>, <Rn>, #<imm3> +// Rd can never encode R15. +bool TranslatorVisitor::thumb16_ADD_imm_t1(Imm<3> imm3, Reg n, Reg d) { + const u32 imm32 = imm3.ZeroExtend(); + const auto result = ir.AddWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(0)); + + ir.SetRegister(d, result); + if (!ir.current_location.IT().IsInITBlock()) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + return true; +} + +// SUBS <Rd>, <Rn>, #<imm3> +// Rd can never encode R15. +bool TranslatorVisitor::thumb16_SUB_imm_t1(Imm<3> imm3, Reg n, Reg d) { + const u32 imm32 = imm3.ZeroExtend(); + const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(1)); + + ir.SetRegister(d, result); + if (!ir.current_location.IT().IsInITBlock()) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + return true; +} + +// MOVS <Rd>, #<imm8> +// Rd can never encode R15. +bool TranslatorVisitor::thumb16_MOV_imm(Reg d, Imm<8> imm8) { + const u32 imm32 = imm8.ZeroExtend(); + const auto result = ir.Imm32(imm32); + + ir.SetRegister(d, result); + if (!ir.current_location.IT().IsInITBlock()) { + ir.SetCpsrNZ(ir.NZFrom(result)); + } + return true; +} + +// CMP <Rn>, #<imm8> +bool TranslatorVisitor::thumb16_CMP_imm(Reg n, Imm<8> imm8) { + const u32 imm32 = imm8.ZeroExtend(); + const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(1)); + + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + return true; +} + +// ADDS <Rdn>, #<imm8> +// Rd can never encode R15. +bool TranslatorVisitor::thumb16_ADD_imm_t2(Reg d_n, Imm<8> imm8) { + const u32 imm32 = imm8.ZeroExtend(); + const Reg d = d_n; + const Reg n = d_n; + const auto result = ir.AddWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(0)); + + ir.SetRegister(d, result); + if (!ir.current_location.IT().IsInITBlock()) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + return true; +} + +// SUBS <Rd>, <Rn>, #<imm3> +// Rd can never encode R15. +bool TranslatorVisitor::thumb16_SUB_imm_t2(Reg d_n, Imm<8> imm8) { + const u32 imm32 = imm8.ZeroExtend(); + const Reg d = d_n; + const Reg n = d_n; + const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(1)); + + ir.SetRegister(d, result); + if (!ir.current_location.IT().IsInITBlock()) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + return true; +} + +// ANDS <Rdn>, <Rm> +// Note that it is not possible to encode Rdn == R15. +bool TranslatorVisitor::thumb16_AND_reg(Reg m, Reg d_n) { + const Reg d = d_n; + const Reg n = d_n; + const auto result = ir.And(ir.GetRegister(n), ir.GetRegister(m)); + + ir.SetRegister(d, result); + if (!ir.current_location.IT().IsInITBlock()) { + ir.SetCpsrNZ(ir.NZFrom(result)); + } + return true; +} + +// EORS <Rdn>, <Rm> +// Note that it is not possible to encode Rdn == R15. 
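
// Sketch of the AddWithCarry relationship the ADD/SUB/CMP translations above
// rely on: a - b is evaluated as a + ~b + 1, which is why the SUB and CMP
// paths pass a carry-in of 1.  Plain integers, not the IR emitter; names are
// illustrative only.
#include <cstdint>

struct NZCVSketch {
    bool n, z, c, v;
};

NZCVSketch AddWithCarrySketch(std::uint32_t a, std::uint32_t b, bool carry_in, std::uint32_t& result) {
    const std::uint64_t unsigned_sum = std::uint64_t{a} + std::uint64_t{b} + (carry_in ? 1 : 0);
    const std::int64_t signed_sum = std::int64_t{std::int32_t(a)} + std::int64_t{std::int32_t(b)} + (carry_in ? 1 : 0);
    result = static_cast<std::uint32_t>(unsigned_sum);
    return {
        (result >> 31) != 0,                            // N
        result == 0,                                    // Z
        unsigned_sum != result,                         // C: unsigned overflow
        signed_sum != std::int64_t{std::int32_t(result)},  // V: signed overflow
    };
}
// SUBS-style flags for a - b: AddWithCarrySketch(a, ~b, true, out).
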
+bool TranslatorVisitor::thumb16_EOR_reg(Reg m, Reg d_n) { + const Reg d = d_n; + const Reg n = d_n; + const auto result = ir.Eor(ir.GetRegister(n), ir.GetRegister(m)); + + ir.SetRegister(d, result); + if (!ir.current_location.IT().IsInITBlock()) { + ir.SetCpsrNZ(ir.NZFrom(result)); + } + return true; +} + +// LSLS <Rdn>, <Rm> +bool TranslatorVisitor::thumb16_LSL_reg(Reg m, Reg d_n) { + const Reg d = d_n; + const Reg n = d_n; + const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(m)); + const auto apsr_c = ir.GetCFlag(); + const auto result_carry = ir.LogicalShiftLeft(ir.GetRegister(n), shift_n, apsr_c); + + ir.SetRegister(d, result_carry.result); + if (!ir.current_location.IT().IsInITBlock()) { + ir.SetCpsrNZC(ir.NZFrom(result_carry.result), result_carry.carry); + } + return true; +} + +// LSRS <Rdn>, <Rm> +bool TranslatorVisitor::thumb16_LSR_reg(Reg m, Reg d_n) { + const Reg d = d_n; + const Reg n = d_n; + const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(m)); + const auto cpsr_c = ir.GetCFlag(); + const auto result = ir.LogicalShiftRight(ir.GetRegister(n), shift_n, cpsr_c); + + ir.SetRegister(d, result.result); + if (!ir.current_location.IT().IsInITBlock()) { + ir.SetCpsrNZC(ir.NZFrom(result.result), result.carry); + } + return true; +} + +// ASRS <Rdn>, <Rm> +bool TranslatorVisitor::thumb16_ASR_reg(Reg m, Reg d_n) { + const Reg d = d_n; + const Reg n = d_n; + const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(m)); + const auto cpsr_c = ir.GetCFlag(); + const auto result = ir.ArithmeticShiftRight(ir.GetRegister(n), shift_n, cpsr_c); + + ir.SetRegister(d, result.result); + if (!ir.current_location.IT().IsInITBlock()) { + ir.SetCpsrNZC(ir.NZFrom(result.result), result.carry); + } + return true; +} + +// ADCS <Rdn>, <Rm> +// Note that it is not possible to encode Rd == R15. +bool TranslatorVisitor::thumb16_ADC_reg(Reg m, Reg d_n) { + const Reg d = d_n; + const Reg n = d_n; + const auto aspr_c = ir.GetCFlag(); + const auto result = ir.AddWithCarry(ir.GetRegister(n), ir.GetRegister(m), aspr_c); + + ir.SetRegister(d, result); + if (!ir.current_location.IT().IsInITBlock()) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + return true; +} + +// SBCS <Rdn>, <Rm> +// Note that it is not possible to encode Rd == R15. +bool TranslatorVisitor::thumb16_SBC_reg(Reg m, Reg d_n) { + const Reg d = d_n; + const Reg n = d_n; + const auto aspr_c = ir.GetCFlag(); + const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.GetRegister(m), aspr_c); + + ir.SetRegister(d, result); + if (!ir.current_location.IT().IsInITBlock()) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + return true; +} + +// RORS <Rdn>, <Rm> +bool TranslatorVisitor::thumb16_ROR_reg(Reg m, Reg d_n) { + const Reg d = d_n; + const Reg n = d_n; + const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(m)); + const auto cpsr_c = ir.GetCFlag(); + const auto result = ir.RotateRight(ir.GetRegister(n), shift_n, cpsr_c); + + ir.SetRegister(d, result.result); + if (!ir.current_location.IT().IsInITBlock()) { + ir.SetCpsrNZC(ir.NZFrom(result.result), result.carry); + } + return true; +} + +// TST <Rn>, <Rm> +bool TranslatorVisitor::thumb16_TST_reg(Reg m, Reg n) { + const auto result = ir.And(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetCpsrNZ(ir.NZFrom(result)); + return true; +} + +// RSBS <Rd>, <Rn>, #0 +// Rd can never encode R15. 
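
// Sketch of the register-controlled shift semantics behind LSLS <Rdn>, <Rm>
// above: only the bottom byte of Rm supplies the shift amount, a shift of 0
// leaves the carry unchanged, and shifts of 32 or more clear the result.
// Plain integers, not the IR emitter; names are illustrative only.
#include <cstdint>

struct ShiftOutSketch {
    std::uint32_t result;
    bool carry;
};

ShiftOutSketch LslRegSketch(std::uint32_t rn, std::uint32_t rm, bool carry_in) {
    const std::uint32_t shift_n = rm & 0xFF;        // LeastSignificantByte(Rm)
    if (shift_n == 0) {
        return {rn, carry_in};                      // carry unchanged
    }
    if (shift_n < 32) {
        const bool carry = ((rn >> (32 - shift_n)) & 1u) != 0;
        return {rn << shift_n, carry};
    }
    if (shift_n == 32) {
        return {0u, (rn & 1u) != 0};                // bit 0 is the last bit shifted out
    }
    return {0u, false};                             // shift > 32: result and carry are zero
}
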
+bool TranslatorVisitor::thumb16_RSB_imm(Reg n, Reg d) { + const auto result = ir.SubWithCarry(ir.Imm32(0), ir.GetRegister(n), ir.Imm1(1)); + ir.SetRegister(d, result); + if (!ir.current_location.IT().IsInITBlock()) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + return true; +} + +// CMP <Rn>, <Rm> +bool TranslatorVisitor::thumb16_CMP_reg_t1(Reg m, Reg n) { + const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.GetRegister(m), ir.Imm1(1)); + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + return true; +} + +// CMN <Rn>, <Rm> +bool TranslatorVisitor::thumb16_CMN_reg(Reg m, Reg n) { + const auto result = ir.AddWithCarry(ir.GetRegister(n), ir.GetRegister(m), ir.Imm1(0)); + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + return true; +} + +// ORRS <Rdn>, <Rm> +// Rd cannot encode R15. +bool TranslatorVisitor::thumb16_ORR_reg(Reg m, Reg d_n) { + const Reg d = d_n; + const Reg n = d_n; + const auto result = ir.Or(ir.GetRegister(m), ir.GetRegister(n)); + + ir.SetRegister(d, result); + if (!ir.current_location.IT().IsInITBlock()) { + ir.SetCpsrNZ(ir.NZFrom(result)); + } + return true; +} + +// MULS <Rdn>, <Rm>, <Rdn> +// Rd cannot encode R15. +bool TranslatorVisitor::thumb16_MUL_reg(Reg n, Reg d_m) { + const Reg d = d_m; + const Reg m = d_m; + const auto result = ir.Mul(ir.GetRegister(m), ir.GetRegister(n)); + + ir.SetRegister(d, result); + if (!ir.current_location.IT().IsInITBlock()) { + ir.SetCpsrNZ(ir.NZFrom(result)); + } + return true; +} + +// BICS <Rdn>, <Rm> +// Rd cannot encode R15. +bool TranslatorVisitor::thumb16_BIC_reg(Reg m, Reg d_n) { + const Reg d = d_n; + const Reg n = d_n; + const auto result = ir.AndNot(ir.GetRegister(n), ir.GetRegister(m)); + + ir.SetRegister(d, result); + if (!ir.current_location.IT().IsInITBlock()) { + ir.SetCpsrNZ(ir.NZFrom(result)); + } + return true; +} + +// MVNS <Rd>, <Rm> +// Rd cannot encode R15. +bool TranslatorVisitor::thumb16_MVN_reg(Reg m, Reg d) { + const auto result = ir.Not(ir.GetRegister(m)); + + ir.SetRegister(d, result); + if (!ir.current_location.IT().IsInITBlock()) { + ir.SetCpsrNZ(ir.NZFrom(result)); + } + return true; +} + +// ADD <Rdn>, <Rm> +bool TranslatorVisitor::thumb16_ADD_reg_t2(bool d_n_hi, Reg m, Reg d_n_lo) { + const Reg d_n = d_n_hi ? (d_n_lo + 8) : d_n_lo; + const Reg n = d_n; + const Reg d = d_n; + if (n == Reg::PC && m == Reg::PC) { + return UnpredictableInstruction(); + } + if (d == Reg::PC && ir.current_location.IT().IsInITBlock() && !ir.current_location.IT().IsLastInITBlock()) { + return UnpredictableInstruction(); + } + + const auto result = ir.AddWithCarry(ir.GetRegister(n), ir.GetRegister(m), ir.Imm1(0)); + if (d == Reg::PC) { + ir.UpdateUpperLocationDescriptor(); + ir.ALUWritePC(result); + // Return to dispatch as we can't predict what PC is going to be. Stop compilation. + ir.SetTerm(IR::Term::FastDispatchHint{}); + return false; + } else { + ir.SetRegister(d, result); + return true; + } +} + +// CMP <Rn>, <Rm> +bool TranslatorVisitor::thumb16_CMP_reg_t2(bool n_hi, Reg m, Reg n_lo) { + const Reg n = n_hi ? (n_lo + 8) : n_lo; + if (n < Reg::R8 && m < Reg::R8) { + return UnpredictableInstruction(); + } + if (n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.GetRegister(m), ir.Imm1(1)); + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + return true; +} + +// MOV <Rd>, <Rm> +bool TranslatorVisitor::thumb16_MOV_reg(bool d_hi, Reg m, Reg d_lo) { + const Reg d = d_hi ? 
(d_lo + 8) : d_lo; + if (d == Reg::PC && ir.current_location.IT().IsInITBlock() && !ir.current_location.IT().IsLastInITBlock()) { + return UnpredictableInstruction(); + } + + const auto result = ir.GetRegister(m); + + if (d == Reg::PC) { + ir.UpdateUpperLocationDescriptor(); + ir.ALUWritePC(result); + ir.SetTerm(IR::Term::FastDispatchHint{}); + return false; + } else { + ir.SetRegister(d, result); + return true; + } +} + +// LDR <Rt>, <label> +// Rt cannot encode R15. +bool TranslatorVisitor::thumb16_LDR_literal(Reg t, Imm<8> imm8) { + const u32 imm32 = imm8.ZeroExtend() << 2; + const u32 address = ir.AlignPC(4) + imm32; + const auto data = ir.ReadMemory32(ir.Imm32(address), IR::AccType::NORMAL); + + ir.SetRegister(t, data); + return true; +} + +// STR <Rt>, [<Rn>, <Rm>] +// Rt cannot encode R15. +bool TranslatorVisitor::thumb16_STR_reg(Reg m, Reg n, Reg t) { + const auto address = ir.Add(ir.GetRegister(n), ir.GetRegister(m)); + const auto data = ir.GetRegister(t); + + ir.WriteMemory32(address, data, IR::AccType::NORMAL); + return true; +} + +// STRH <Rt>, [<Rn>, <Rm>] +// Rt cannot encode R15. +bool TranslatorVisitor::thumb16_STRH_reg(Reg m, Reg n, Reg t) { + const auto address = ir.Add(ir.GetRegister(n), ir.GetRegister(m)); + const auto data = ir.LeastSignificantHalf(ir.GetRegister(t)); + + ir.WriteMemory16(address, data, IR::AccType::NORMAL); + return true; +} + +// STRB <Rt>, [<Rn>, <Rm>] +// Rt cannot encode R15. +bool TranslatorVisitor::thumb16_STRB_reg(Reg m, Reg n, Reg t) { + const auto address = ir.Add(ir.GetRegister(n), ir.GetRegister(m)); + const auto data = ir.LeastSignificantByte(ir.GetRegister(t)); + + ir.WriteMemory8(address, data, IR::AccType::NORMAL); + return true; +} + +// LDRSB <Rt>, [<Rn>, <Rm>] +// Rt cannot encode R15. +bool TranslatorVisitor::thumb16_LDRSB_reg(Reg m, Reg n, Reg t) { + const auto address = ir.Add(ir.GetRegister(n), ir.GetRegister(m)); + const auto data = ir.SignExtendByteToWord(ir.ReadMemory8(address, IR::AccType::NORMAL)); + + ir.SetRegister(t, data); + return true; +} + +// LDR <Rt>, [<Rn>, <Rm>] +// Rt cannot encode R15. +bool TranslatorVisitor::thumb16_LDR_reg(Reg m, Reg n, Reg t) { + const auto address = ir.Add(ir.GetRegister(n), ir.GetRegister(m)); + const auto data = ir.ReadMemory32(address, IR::AccType::NORMAL); + + ir.SetRegister(t, data); + return true; +} + +// LDRH <Rt>, [<Rn>, <Rm>] +// Rt cannot encode R15. +bool TranslatorVisitor::thumb16_LDRH_reg(Reg m, Reg n, Reg t) { + const auto address = ir.Add(ir.GetRegister(n), ir.GetRegister(m)); + const auto data = ir.ZeroExtendHalfToWord(ir.ReadMemory16(address, IR::AccType::NORMAL)); + + ir.SetRegister(t, data); + return true; +} + +// LDRB <Rt>, [<Rn>, <Rm>] +// Rt cannot encode R15. +bool TranslatorVisitor::thumb16_LDRB_reg(Reg m, Reg n, Reg t) { + const auto address = ir.Add(ir.GetRegister(n), ir.GetRegister(m)); + const auto data = ir.ZeroExtendByteToWord(ir.ReadMemory8(address, IR::AccType::NORMAL)); + + ir.SetRegister(t, data); + return true; +} + +// LDRH <Rt>, [<Rn>, <Rm>] +// Rt cannot encode R15. +bool TranslatorVisitor::thumb16_LDRSH_reg(Reg m, Reg n, Reg t) { + const auto address = ir.Add(ir.GetRegister(n), ir.GetRegister(m)); + const auto data = ir.SignExtendHalfToWord(ir.ReadMemory16(address, IR::AccType::NORMAL)); + + ir.SetRegister(t, data); + return true; +} + +// STR <Rt>, [<Rn>, #<imm>] +// Rt cannot encode R15. 
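
// Sketch of the PC-relative address computed by thumb16_LDR_literal above:
// in Thumb state the PC reads as the instruction address plus 4, aligned down
// to a word boundary, before the scaled imm8 is added.  Plain integers, not
// the IR emitter; the helper name is illustrative only.
#include <cstdint>

std::uint32_t LdrLiteralAddressSketch(std::uint32_t instruction_address, std::uint32_t imm8) {
    const std::uint32_t pc = instruction_address + 4;   // Thumb PC value
    const std::uint32_t base = pc & ~std::uint32_t{3};  // AlignPC(4)
    return base + (imm8 << 2);                          // imm8 scaled by the word size
}
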
+bool TranslatorVisitor::thumb16_STR_imm_t1(Imm<5> imm5, Reg n, Reg t) { + const u32 imm32 = imm5.ZeroExtend() << 2; + const auto address = ir.Add(ir.GetRegister(n), ir.Imm32(imm32)); + const auto data = ir.GetRegister(t); + + ir.WriteMemory32(address, data, IR::AccType::NORMAL); + return true; +} + +// LDR <Rt>, [<Rn>, #<imm>] +// Rt cannot encode R15. +bool TranslatorVisitor::thumb16_LDR_imm_t1(Imm<5> imm5, Reg n, Reg t) { + const u32 imm32 = imm5.ZeroExtend() << 2; + const auto address = ir.Add(ir.GetRegister(n), ir.Imm32(imm32)); + const auto data = ir.ReadMemory32(address, IR::AccType::NORMAL); + + ir.SetRegister(t, data); + return true; +} + +// STRB <Rt>, [<Rn>, #<imm>] +// Rt cannot encode R15. +bool TranslatorVisitor::thumb16_STRB_imm(Imm<5> imm5, Reg n, Reg t) { + const u32 imm32 = imm5.ZeroExtend(); + const auto address = ir.Add(ir.GetRegister(n), ir.Imm32(imm32)); + const auto data = ir.LeastSignificantByte(ir.GetRegister(t)); + + ir.WriteMemory8(address, data, IR::AccType::NORMAL); + return true; +} + +// LDRB <Rt>, [<Rn>, #<imm>] +// Rt cannot encode R15. +bool TranslatorVisitor::thumb16_LDRB_imm(Imm<5> imm5, Reg n, Reg t) { + const u32 imm32 = imm5.ZeroExtend(); + const auto address = ir.Add(ir.GetRegister(n), ir.Imm32(imm32)); + const auto data = ir.ZeroExtendByteToWord(ir.ReadMemory8(address, IR::AccType::NORMAL)); + + ir.SetRegister(t, data); + return true; +} + +// STRH <Rt>, [<Rn>, #<imm5>] +bool TranslatorVisitor::thumb16_STRH_imm(Imm<5> imm5, Reg n, Reg t) { + const u32 imm32 = imm5.ZeroExtend() << 1; + const auto address = ir.Add(ir.GetRegister(n), ir.Imm32(imm32)); + const auto data = ir.LeastSignificantHalf(ir.GetRegister(t)); + + ir.WriteMemory16(address, data, IR::AccType::NORMAL); + return true; +} + +// LDRH <Rt>, [<Rn>, #<imm5>] +bool TranslatorVisitor::thumb16_LDRH_imm(Imm<5> imm5, Reg n, Reg t) { + const u32 imm32 = imm5.ZeroExtend() << 1; + const auto address = ir.Add(ir.GetRegister(n), ir.Imm32(imm32)); + const auto data = ir.ZeroExtendHalfToWord(ir.ReadMemory16(address, IR::AccType::NORMAL)); + + ir.SetRegister(t, data); + return true; +} + +// STR <Rt>, [<Rn>, #<imm>] +// Rt cannot encode R15. +bool TranslatorVisitor::thumb16_STR_imm_t2(Reg t, Imm<8> imm8) { + const u32 imm32 = imm8.ZeroExtend() << 2; + const Reg n = Reg::SP; + const auto address = ir.Add(ir.GetRegister(n), ir.Imm32(imm32)); + const auto data = ir.GetRegister(t); + + ir.WriteMemory32(address, data, IR::AccType::NORMAL); + return true; +} + +// LDR <Rt>, [<Rn>, #<imm>] +// Rt cannot encode R15. +bool TranslatorVisitor::thumb16_LDR_imm_t2(Reg t, Imm<8> imm8) { + const u32 imm32 = imm8.ZeroExtend() << 2; + const Reg n = Reg::SP; + const auto address = ir.Add(ir.GetRegister(n), ir.Imm32(imm32)); + const auto data = ir.ReadMemory32(address, IR::AccType::NORMAL); + + ir.SetRegister(t, data); + return true; +} + +// ADR <Rd>, <label> +// Rd cannot encode R15. 
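
// Sketch of the immediate scaling used by the T16 immediate-offset load/store
// forms above: the 5-bit offset is scaled by the access size, so the
// reachable range depends on whether a word, halfword, or byte is accessed.
// The helper name is illustrative only.
#include <cstdint>

std::uint32_t T16OffsetSketch(std::uint32_t imm5, unsigned access_size_bytes) {
    switch (access_size_bytes) {
    case 4:  return imm5 << 2;   // LDR/STR:   0..124 in steps of 4
    case 2:  return imm5 << 1;   // LDRH/STRH: 0..62  in steps of 2
    default: return imm5;        // LDRB/STRB: 0..31
    }
}
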
+bool TranslatorVisitor::thumb16_ADR(Reg d, Imm<8> imm8) { + const u32 imm32 = imm8.ZeroExtend() << 2; + const auto result = ir.Imm32(ir.AlignPC(4) + imm32); + + ir.SetRegister(d, result); + return true; +} + +// ADD <Rd>, SP, #<imm> +bool TranslatorVisitor::thumb16_ADD_sp_t1(Reg d, Imm<8> imm8) { + const u32 imm32 = imm8.ZeroExtend() << 2; + const auto result = ir.AddWithCarry(ir.GetRegister(Reg::SP), ir.Imm32(imm32), ir.Imm1(0)); + + ir.SetRegister(d, result); + return true; +} + +// ADD SP, SP, #<imm> +bool TranslatorVisitor::thumb16_ADD_sp_t2(Imm<7> imm7) { + const u32 imm32 = imm7.ZeroExtend() << 2; + const Reg d = Reg::SP; + const auto result = ir.AddWithCarry(ir.GetRegister(Reg::SP), ir.Imm32(imm32), ir.Imm1(0)); + + ir.SetRegister(d, result); + return true; +} + +// SUB SP, SP, #<imm> +bool TranslatorVisitor::thumb16_SUB_sp(Imm<7> imm7) { + const u32 imm32 = imm7.ZeroExtend() << 2; + const Reg d = Reg::SP; + const auto result = ir.SubWithCarry(ir.GetRegister(Reg::SP), ir.Imm32(imm32), ir.Imm1(1)); + + ir.SetRegister(d, result); + return true; +} + +// SEV<c> +bool TranslatorVisitor::thumb16_SEV() { + if (!options.hook_hint_instructions) { + return true; + } + return RaiseException(Exception::SendEvent); +} + +// SEVL<c> +bool TranslatorVisitor::thumb16_SEVL() { + if (!options.hook_hint_instructions) { + return true; + } + return RaiseException(Exception::SendEventLocal); +} + +// WFE<c> +bool TranslatorVisitor::thumb16_WFE() { + if (!options.hook_hint_instructions) { + return true; + } + return RaiseException(Exception::WaitForEvent); +} + +// WFI<c> +bool TranslatorVisitor::thumb16_WFI() { + if (!options.hook_hint_instructions) { + return true; + } + return RaiseException(Exception::WaitForInterrupt); +} + +// YIELD<c> +bool TranslatorVisitor::thumb16_YIELD() { + if (!options.hook_hint_instructions) { + return true; + } + return RaiseException(Exception::Yield); +} + +// NOP<c> +bool TranslatorVisitor::thumb16_NOP() { + return true; +} + +// IT{<x>{<y>{<z>}}} <cond> +bool TranslatorVisitor::thumb16_IT(Imm<8> imm8) { + ASSERT_MSG((imm8.Bits<0, 3>() != 0b0000), "Decode Error"); + if (imm8.Bits<4, 7>() == 0b1111 || (imm8.Bits<4, 7>() == 0b1110 && mcl::bit::count_ones(imm8.Bits<0, 3>()) != 1)) { + return UnpredictableInstruction(); + } + if (ir.current_location.IT().IsInITBlock()) { + return UnpredictableInstruction(); + } + + const auto next_location = ir.current_location.AdvancePC(2).SetIT(ITState{imm8.ZeroExtend<u8>()}); + ir.SetTerm(IR::Term::LinkBlockFast{next_location}); + return false; +} + +// SXTH <Rd>, <Rm> +// Rd cannot encode R15. +bool TranslatorVisitor::thumb16_SXTH(Reg m, Reg d) { + const auto half = ir.LeastSignificantHalf(ir.GetRegister(m)); + ir.SetRegister(d, ir.SignExtendHalfToWord(half)); + return true; +} + +// SXTB <Rd>, <Rm> +// Rd cannot encode R15. +bool TranslatorVisitor::thumb16_SXTB(Reg m, Reg d) { + const auto byte = ir.LeastSignificantByte(ir.GetRegister(m)); + ir.SetRegister(d, ir.SignExtendByteToWord(byte)); + return true; +} + +// UXTH <Rd>, <Rm> +// Rd cannot encode R15. +bool TranslatorVisitor::thumb16_UXTH(Reg m, Reg d) { + const auto half = ir.LeastSignificantHalf(ir.GetRegister(m)); + ir.SetRegister(d, ir.ZeroExtendHalfToWord(half)); + return true; +} + +// UXTB <Rd>, <Rm> +// Rd cannot encode R15. +bool TranslatorVisitor::thumb16_UXTB(Reg m, Reg d) { + const auto byte = ir.LeastSignificantByte(ir.GetRegister(m)); + ir.SetRegister(d, ir.ZeroExtendByteToWord(byte)); + return true; +} + +// PUSH <reg_list> +// reg_list cannot encode for R15. 
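
// Sketch of the ITSTATE bookkeeping behind thumb16_IT and the IsInITBlock()
// checks above, following the ARM ARM ITAdvance() pseudocode: bits [7:5] hold
// the base condition, bits [4:0] the mask, and the state shifts left by one
// after each instruction in the block.  Plain integers; names are
// illustrative only.
#include <cstdint>

bool InITBlockSketch(std::uint8_t itstate) {
    return (itstate & 0x0F) != 0;
}

bool LastInITBlockSketch(std::uint8_t itstate) {
    return (itstate & 0x0F) == 0x08;
}

std::uint8_t ITAdvanceSketch(std::uint8_t itstate) {
    if ((itstate & 0x07) == 0) {
        return 0;                                                    // block finished
    }
    return std::uint8_t((itstate & 0xE0) | ((itstate << 1) & 0x1F)); // keep cond, shift mask
}
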
+bool TranslatorVisitor::thumb16_PUSH(bool M, RegList reg_list) { + if (M) { + reg_list |= 1 << 14; + } + if (mcl::bit::count_ones(reg_list) < 1) { + return UnpredictableInstruction(); + } + + const u32 num_bytes_to_push = static_cast<u32>(4 * mcl::bit::count_ones(reg_list)); + const auto final_address = ir.Sub(ir.GetRegister(Reg::SP), ir.Imm32(num_bytes_to_push)); + auto address = final_address; + for (size_t i = 0; i < 16; i++) { + if (mcl::bit::get_bit(i, reg_list)) { + // TODO: Deal with alignment + const auto Ri = ir.GetRegister(static_cast<Reg>(i)); + ir.WriteMemory32(address, Ri, IR::AccType::ATOMIC); + address = ir.Add(address, ir.Imm32(4)); + } + } + + ir.SetRegister(Reg::SP, final_address); + // TODO(optimization): Possible location for an RSB push. + return true; +} + +// POP <reg_list> +bool TranslatorVisitor::thumb16_POP(bool P, RegList reg_list) { + if (P) { + reg_list |= 1 << 15; + } + if (mcl::bit::count_ones(reg_list) < 1) { + return UnpredictableInstruction(); + } + + auto address = ir.GetRegister(Reg::SP); + for (size_t i = 0; i < 15; i++) { + if (mcl::bit::get_bit(i, reg_list)) { + // TODO: Deal with alignment + const auto data = ir.ReadMemory32(address, IR::AccType::ATOMIC); + ir.SetRegister(static_cast<Reg>(i), data); + address = ir.Add(address, ir.Imm32(4)); + } + } + + if (mcl::bit::get_bit<15>(reg_list)) { + // TODO(optimization): Possible location for an RSB pop. + const auto data = ir.ReadMemory32(address, IR::AccType::ATOMIC); + ir.UpdateUpperLocationDescriptor(); + ir.LoadWritePC(data); + address = ir.Add(address, ir.Imm32(4)); + ir.SetRegister(Reg::SP, address); + ir.SetTerm(IR::Term::PopRSBHint{}); + return false; + } else { + ir.SetRegister(Reg::SP, address); + return true; + } +} + +// SETEND <endianness> +bool TranslatorVisitor::thumb16_SETEND(bool E) { + if (ir.current_location.IT().IsInITBlock()) { + return UnpredictableInstruction(); + } + + if (E == ir.current_location.EFlag()) { + return true; + } + + ir.SetTerm(IR::Term::LinkBlock{ir.current_location.AdvancePC(2).SetEFlag(E).AdvanceIT()}); + return false; +} + +// CPS{IE,ID} <a,i,f> +// A CPS is treated as a NOP in User mode. +bool TranslatorVisitor::thumb16_CPS(bool, bool, bool, bool) { + return true; +} + +// REV <Rd>, <Rm> +// Rd cannot encode R15. +bool TranslatorVisitor::thumb16_REV(Reg m, Reg d) { + ir.SetRegister(d, ir.ByteReverseWord(ir.GetRegister(m))); + return true; +} + +// REV16 <Rd>, <Rm> +// Rd cannot encode R15. +// TODO: Consider optimizing +bool TranslatorVisitor::thumb16_REV16(Reg m, Reg d) { + const auto Rm = ir.GetRegister(m); + const auto upper_half = ir.LeastSignificantHalf(ir.LogicalShiftRight(Rm, ir.Imm8(16), ir.Imm1(0)).result); + const auto lower_half = ir.LeastSignificantHalf(Rm); + const auto rev_upper_half = ir.ZeroExtendHalfToWord(ir.ByteReverseHalf(upper_half)); + const auto rev_lower_half = ir.ZeroExtendHalfToWord(ir.ByteReverseHalf(lower_half)); + const auto result = ir.Or(ir.LogicalShiftLeft(rev_upper_half, ir.Imm8(16), ir.Imm1(0)).result, + rev_lower_half); + + ir.SetRegister(d, result); + return true; +} + +// REVSH <Rd>, <Rm> +// Rd cannot encode R15. 
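
// Scalar sketch of the REV16 halfword byte-reversal implemented just above:
// each 16-bit half of the word has its two bytes swapped independently.
// Plain integers, not the IR emitter.
#include <cstdint>

std::uint32_t Rev16Sketch(std::uint32_t x) {
    const std::uint32_t swapped_even = (x & 0x00FF00FFu) << 8;   // low byte of each half -> high
    const std::uint32_t swapped_odd  = (x & 0xFF00FF00u) >> 8;   // high byte of each half -> low
    return swapped_even | swapped_odd;
}
// Rev16Sketch(0x11223344) == 0x22114433
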
+bool TranslatorVisitor::thumb16_REVSH(Reg m, Reg d) { + const auto rev_half = ir.ByteReverseHalf(ir.LeastSignificantHalf(ir.GetRegister(m))); + ir.SetRegister(d, ir.SignExtendHalfToWord(rev_half)); + return true; +} + +// BKPT #<imm8> +bool TranslatorVisitor::thumb16_BKPT(Imm<8> /*imm8*/) { + return RaiseException(Exception::Breakpoint); +} + +// STM <Rn>!, <reg_list> +bool TranslatorVisitor::thumb16_STMIA(Reg n, RegList reg_list) { + if (mcl::bit::count_ones(reg_list) == 0) { + return UnpredictableInstruction(); + } + if (mcl::bit::get_bit(static_cast<size_t>(n), reg_list) && n != static_cast<Reg>(mcl::bit::lowest_set_bit(reg_list))) { + return UnpredictableInstruction(); + } + + auto address = ir.GetRegister(n); + for (size_t i = 0; i < 8; i++) { + if (mcl::bit::get_bit(i, reg_list)) { + const auto Ri = ir.GetRegister(static_cast<Reg>(i)); + ir.WriteMemory32(address, Ri, IR::AccType::ATOMIC); + address = ir.Add(address, ir.Imm32(4)); + } + } + + ir.SetRegister(n, address); + return true; +} + +// LDM <Rn>!, <reg_list> +bool TranslatorVisitor::thumb16_LDMIA(Reg n, RegList reg_list) { + if (mcl::bit::count_ones(reg_list) == 0) { + return UnpredictableInstruction(); + } + + const bool write_back = !mcl::bit::get_bit(static_cast<size_t>(n), reg_list); + auto address = ir.GetRegister(n); + + for (size_t i = 0; i < 8; i++) { + if (mcl::bit::get_bit(i, reg_list)) { + const auto data = ir.ReadMemory32(address, IR::AccType::ATOMIC); + ir.SetRegister(static_cast<Reg>(i), data); + address = ir.Add(address, ir.Imm32(4)); + } + } + + if (write_back) { + ir.SetRegister(n, address); + } + return true; +} + +// CB{N}Z <Rn>, <label> +bool TranslatorVisitor::thumb16_CBZ_CBNZ(bool nonzero, Imm<1> i, Imm<5> imm5, Reg n) { + if (ir.current_location.IT().IsInITBlock()) { + return UnpredictableInstruction(); + } + + const u32 imm = concatenate(i, imm5, Imm<1>{0}).ZeroExtend(); + const IR::U32 rn = ir.GetRegister(n); + + ir.SetCheckBit(ir.IsZero(rn)); + + const auto [cond_pass, cond_fail] = [this, imm, nonzero] { + const auto skip = IR::Term::LinkBlock{ir.current_location.AdvancePC(2).AdvanceIT()}; + const auto branch = IR::Term::LinkBlock{ir.current_location.AdvancePC(imm + 4).AdvanceIT()}; + + if (nonzero) { + return std::make_pair(skip, branch); + } else { + return std::make_pair(branch, skip); + } + }(); + + ir.SetTerm(IR::Term::CheckBit{cond_pass, cond_fail}); + return false; +} + +bool TranslatorVisitor::thumb16_UDF() { + return UndefinedInstruction(); +} + +// BX <Rm> +bool TranslatorVisitor::thumb16_BX(Reg m) { + if (ir.current_location.IT().IsInITBlock() && !ir.current_location.IT().IsLastInITBlock()) { + return UnpredictableInstruction(); + } + + ir.UpdateUpperLocationDescriptor(); + ir.BXWritePC(ir.GetRegister(m)); + if (m == Reg::R14) + ir.SetTerm(IR::Term::PopRSBHint{}); + else + ir.SetTerm(IR::Term::FastDispatchHint{}); + return false; +} + +// BLX <Rm> +bool TranslatorVisitor::thumb16_BLX_reg(Reg m) { + if (ir.current_location.IT().IsInITBlock() && !ir.current_location.IT().IsLastInITBlock()) { + return UnpredictableInstruction(); + } + + ir.PushRSB(ir.current_location.AdvancePC(2).AdvanceIT()); + ir.UpdateUpperLocationDescriptor(); + ir.BXWritePC(ir.GetRegister(m)); + ir.SetRegister(Reg::LR, ir.Imm32((ir.current_location.PC() + 2) | 1)); + ir.SetTerm(IR::Term::FastDispatchHint{}); + return false; +} + +// SVC #<imm8> +bool TranslatorVisitor::thumb16_SVC(Imm<8> imm8) { + const u32 imm32 = imm8.ZeroExtend(); + ir.PushRSB(ir.current_location.AdvancePC(2).AdvanceIT()); + 
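
// Sketch of the CB{N}Z branch-offset assembly used in thumb16_CBZ_CBNZ above:
// the encoded i:imm5 pair is zero-extended, shifted left by one, and applied
// relative to the Thumb PC (instruction address + 4), matching the
// AdvancePC(imm + 4) call in the translation.  Plain integers; the helper
// name is illustrative only.
#include <cstdint>

std::uint32_t CbzTargetSketch(std::uint32_t instruction_address, std::uint32_t i, std::uint32_t imm5) {
    const std::uint32_t imm = ((i << 5) | imm5) << 1;   // concatenate(i, imm5, '0')
    return instruction_address + 4 + imm;
}
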
ir.UpdateUpperLocationDescriptor(); + ir.BranchWritePC(ir.Imm32(ir.current_location.PC() + 2)); + ir.CallSupervisor(ir.Imm32(imm32)); + ir.SetTerm(IR::Term::CheckHalt{IR::Term::PopRSBHint{}}); + return false; +} + +// B<cond> <label> +bool TranslatorVisitor::thumb16_B_t1(Cond cond, Imm<8> imm8) { + if (ir.current_location.IT().IsInITBlock()) { + return UnpredictableInstruction(); + } + + if (cond == Cond::AL) { + return thumb16_UDF(); + } + + const s32 imm32 = static_cast<s32>((imm8.SignExtend<u32>() << 1) + 4); + const auto then_location = ir.current_location.AdvancePC(imm32).AdvanceIT(); + const auto else_location = ir.current_location.AdvancePC(2).AdvanceIT(); + + ir.SetTerm(IR::Term::If{cond, IR::Term::LinkBlock{then_location}, IR::Term::LinkBlock{else_location}}); + return false; +} + +// B <label> +bool TranslatorVisitor::thumb16_B_t2(Imm<11> imm11) { + if (ir.current_location.IT().IsInITBlock() && !ir.current_location.IT().IsLastInITBlock()) { + return UnpredictableInstruction(); + } + + const s32 imm32 = static_cast<s32>((imm11.SignExtend<u32>() << 1) + 4); + const auto next_location = ir.current_location.AdvancePC(imm32).AdvanceIT(); + + ir.SetTerm(IR::Term::LinkBlock{next_location}); + return false; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_branch.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_branch.cpp new file mode 100644 index 0000000000..83734f5c70 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_branch.cpp @@ -0,0 +1,88 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +// BL <label> +bool TranslatorVisitor::thumb32_BL_imm(Imm<1> S, Imm<10> hi, Imm<1> j1, Imm<1> j2, Imm<11> lo) { + const Imm<1> i1{j1 == S}; + const Imm<1> i2{j2 == S}; + + if (ir.current_location.IT().IsInITBlock() && !ir.current_location.IT().IsLastInITBlock()) { + return UnpredictableInstruction(); + } + + ir.PushRSB(ir.current_location.AdvancePC(4).AdvanceIT()); + ir.SetRegister(Reg::LR, ir.Imm32((ir.current_location.PC() + 4) | 1)); + + const s32 imm32 = static_cast<s32>((concatenate(S, i1, i2, hi, lo).SignExtend<u32>() << 1) + 4); + const auto new_location = ir.current_location + .AdvancePC(imm32) + .AdvanceIT(); + ir.SetTerm(IR::Term::LinkBlock{new_location}); + return false; +} + +// BLX <label> +bool TranslatorVisitor::thumb32_BLX_imm(Imm<1> S, Imm<10> hi, Imm<1> j1, Imm<1> j2, Imm<11> lo) { + const Imm<1> i1{j1 == S}; + const Imm<1> i2{j2 == S}; + + if (ir.current_location.IT().IsInITBlock() && !ir.current_location.IT().IsLastInITBlock()) { + return UnpredictableInstruction(); + } + + if (lo.Bit<0>()) { + return UnpredictableInstruction(); + } + + ir.PushRSB(ir.current_location.AdvancePC(4).AdvanceIT()); + ir.SetRegister(Reg::LR, ir.Imm32((ir.current_location.PC() + 4) | 1)); + + const s32 imm32 = static_cast<s32>(concatenate(S, i1, i2, hi, lo).SignExtend<u32>() << 1); + const auto new_location = ir.current_location + .SetPC(ir.AlignPC(4) + imm32) + .SetTFlag(false) + .AdvanceIT(); + ir.SetTerm(IR::Term::LinkBlock{new_location}); + return false; +} + +bool TranslatorVisitor::thumb32_B(Imm<1> S, Imm<10> hi, Imm<1> j1, Imm<1> j2, Imm<11> lo) { + const Imm<1> i1{j1 == S}; + const Imm<1> i2{j2 == S}; + + if (ir.current_location.IT().IsInITBlock() && !ir.current_location.IT().IsLastInITBlock()) 
{ + return UnpredictableInstruction(); + } + + const s32 imm32 = static_cast<s32>((concatenate(S, i1, i2, hi, lo).SignExtend<u32>() << 1) + 4); + const auto new_location = ir.current_location + .AdvancePC(imm32) + .AdvanceIT(); + ir.SetTerm(IR::Term::LinkBlock{new_location}); + return false; +} + +bool TranslatorVisitor::thumb32_B_cond(Imm<1> S, Cond cond, Imm<6> hi, Imm<1> i1, Imm<1> i2, Imm<11> lo) { + if (ir.current_location.IT().IsInITBlock()) { + return UnpredictableInstruction(); + } + + // Note: i1 and i2 were not inverted from encoding and are opposite compared to the other B instructions. + const s32 imm32 = static_cast<s32>((concatenate(S, i2, i1, hi, lo).SignExtend<u32>() << 1) + 4); + const auto then_location = ir.current_location + .AdvancePC(imm32) + .AdvanceIT(); + const auto else_location = ir.current_location + .AdvancePC(4) + .AdvanceIT(); + ir.SetTerm(IR::Term::If{cond, IR::Term::LinkBlock{then_location}, IR::Term::LinkBlock{else_location}}); + return false; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_control.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_control.cpp new file mode 100644 index 0000000000..aa03937301 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_control.cpp @@ -0,0 +1,126 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +bool TranslatorVisitor::thumb32_BXJ(Reg m) { + if (m == Reg::PC) { + return UnpredictableInstruction(); + } + + return thumb16_BX(m); +} + +bool TranslatorVisitor::thumb32_CLREX() { + ir.ClearExclusive(); + return true; +} + +bool TranslatorVisitor::thumb32_DMB(Imm<4> /*option*/) { + ir.DataMemoryBarrier(); + return true; +} + +bool TranslatorVisitor::thumb32_DSB(Imm<4> /*option*/) { + ir.DataSynchronizationBarrier(); + return true; +} + +bool TranslatorVisitor::thumb32_ISB(Imm<4> /*option*/) { + ir.InstructionSynchronizationBarrier(); + ir.UpdateUpperLocationDescriptor(); + ir.BranchWritePC(ir.Imm32(ir.current_location.PC() + 4)); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; +} + +bool TranslatorVisitor::thumb32_NOP() { + return thumb16_NOP(); +} + +bool TranslatorVisitor::thumb32_SEV() { + return thumb16_SEV(); +} + +bool TranslatorVisitor::thumb32_SEVL() { + return thumb16_SEVL(); +} + +bool TranslatorVisitor::thumb32_UDF() { + return thumb16_UDF(); +} + +bool TranslatorVisitor::thumb32_WFE() { + return thumb16_WFE(); +} + +bool TranslatorVisitor::thumb32_WFI() { + return thumb16_WFI(); +} + +bool TranslatorVisitor::thumb32_YIELD() { + return thumb16_YIELD(); +} + +bool TranslatorVisitor::thumb32_MSR_reg(bool write_spsr, Reg n, Imm<4> mask) { + if (mask == 0) { + return UnpredictableInstruction(); + } + + if (n == Reg::PC) { + return UnpredictableInstruction(); + } + + if (write_spsr) { + return UndefinedInstruction(); + } + + const bool write_nzcvq = mask.Bit<3>(); + const bool write_g = mask.Bit<2>(); + const bool write_e = mask.Bit<1>(); + const auto value = ir.GetRegister(n); + + if (!write_e) { + if (write_nzcvq) { + ir.SetCpsrNZCVQ(ir.And(value, ir.Imm32(0xF8000000))); + } + + if (write_g) { + ir.SetGEFlagsCompressed(ir.And(value, ir.Imm32(0x000F0000))); + } + } else { + ir.UpdateUpperLocationDescriptor(); + + const u32 cpsr_mask = (write_nzcvq ? 0xF8000000 : 0) | (write_g ? 
0x000F0000 : 0) | 0x00000200; + const auto old_cpsr = ir.And(ir.GetCpsr(), ir.Imm32(~cpsr_mask)); + const auto new_cpsr = ir.And(value, ir.Imm32(cpsr_mask)); + ir.SetCpsr(ir.Or(old_cpsr, new_cpsr)); + ir.PushRSB(ir.current_location.AdvancePC(4).AdvanceIT()); + ir.BranchWritePC(ir.Imm32(ir.current_location.PC() + 4)); + ir.SetTerm(IR::Term::CheckHalt{IR::Term::PopRSBHint{}}); + return false; + } + + return true; +} + +bool TranslatorVisitor::thumb32_MRS_reg(bool read_spsr, Reg d) { + if (d == Reg::R15) { + return UnpredictableInstruction(); + } + + // TODO: Revisit when implementing more than user mode. + + if (read_spsr) { + return UndefinedInstruction(); + } + + ir.SetRegister(d, ir.GetCpsr()); + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_coprocessor.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_coprocessor.cpp new file mode 100644 index 0000000000..ab7c539932 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_coprocessor.cpp @@ -0,0 +1,75 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2021 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +bool TranslatorVisitor::thumb32_MCRR(bool two, Reg t2, Reg t, size_t coproc_no, size_t opc, CoprocReg CRm) { + ir.CoprocSendTwoWords(coproc_no, two, opc, CRm, ir.GetRegister(t), ir.GetRegister(t2)); + return true; +} + +bool TranslatorVisitor::thumb32_MRRC(bool two, Reg t2, Reg t, size_t coproc_no, size_t opc, CoprocReg CRm) { + const auto two_words = ir.CoprocGetTwoWords(coproc_no, two, opc, CRm); + ir.SetRegister(t, ir.LeastSignificantWord(two_words)); + ir.SetRegister(t2, ir.MostSignificantWord(two_words).result); + return true; +} + +bool TranslatorVisitor::thumb32_STC(bool two, bool p, bool u, bool d, bool w, Reg n, CoprocReg CRd, size_t coproc_no, Imm<8> imm8) { + const u32 imm32 = imm8.ZeroExtend() << 2; + const bool index = p; + const bool add = u; + const bool wback = w; + const bool has_option = !p && !w && u; + const IR::U32 reg_n = ir.GetRegister(n); + const IR::U32 offset_address = add ? ir.Add(reg_n, ir.Imm32(imm32)) : ir.Sub(reg_n, ir.Imm32(imm32)); + const IR::U32 address = index ? offset_address : reg_n; + ir.CoprocStoreWords(coproc_no, two, d, CRd, address, has_option, imm8.ZeroExtend<u8>()); + if (wback) { + ir.SetRegister(n, offset_address); + } + return true; +} + +bool TranslatorVisitor::thumb32_LDC(bool two, bool p, bool u, bool d, bool w, Reg n, CoprocReg CRd, size_t coproc_no, Imm<8> imm8) { + const u32 imm32 = imm8.ZeroExtend() << 2; + const bool index = p; + const bool add = u; + const bool wback = w; + const bool has_option = !p && !w && u; + const IR::U32 reg_n = ir.GetRegister(n); + const IR::U32 offset_address = add ? ir.Add(reg_n, ir.Imm32(imm32)) : ir.Sub(reg_n, ir.Imm32(imm32)); + const IR::U32 address = index ? 
offset_address : reg_n; + ir.CoprocLoadWords(coproc_no, two, d, CRd, address, has_option, imm8.ZeroExtend<u8>()); + if (wback) { + ir.SetRegister(n, offset_address); + } + return true; +} + +bool TranslatorVisitor::thumb32_CDP(bool two, size_t opc1, CoprocReg CRn, CoprocReg CRd, size_t coproc_no, size_t opc2, CoprocReg CRm) { + ir.CoprocInternalOperation(coproc_no, two, opc1, CRd, CRn, CRm, opc2); + return true; +} + +bool TranslatorVisitor::thumb32_MCR(bool two, size_t opc1, CoprocReg CRn, Reg t, size_t coproc_no, size_t opc2, CoprocReg CRm) { + ir.CoprocSendOneWord(coproc_no, two, opc1, CRn, CRm, opc2, ir.GetRegister(t)); + return true; +} + +bool TranslatorVisitor::thumb32_MRC(bool two, size_t opc1, CoprocReg CRn, Reg t, size_t coproc_no, size_t opc2, CoprocReg CRm) { + const auto word = ir.CoprocGetOneWord(coproc_no, two, opc1, CRn, CRm, opc2); + if (t != Reg::PC) { + ir.SetRegister(t, word); + } else { + const auto new_cpsr_nzcv = ir.And(word, ir.Imm32(0xF0000000)); + ir.SetCpsrNZCVRaw(new_cpsr_nzcv); + } + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_modified_immediate.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_modified_immediate.cpp new file mode 100644 index 0000000000..0c0114f84c --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_modified_immediate.cpp @@ -0,0 +1,244 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2021 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +bool TranslatorVisitor::thumb32_TST_imm(Imm<1> i, Reg n, Imm<3> imm3, Imm<8> imm8) { + if (n == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto imm_carry = ThumbExpandImm_C(i, imm3, imm8, ir.GetCFlag()); + const auto result = ir.And(ir.GetRegister(n), ir.Imm32(imm_carry.imm32)); + + ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry); + return true; +} + +bool TranslatorVisitor::thumb32_AND_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8) { + ASSERT_MSG(!(d == Reg::PC && S), "Decode error"); + if ((d == Reg::PC && !S) || n == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto imm_carry = ThumbExpandImm_C(i, imm3, imm8, ir.GetCFlag()); + const auto result = ir.And(ir.GetRegister(n), ir.Imm32(imm_carry.imm32)); + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry); + } + return true; +} + +bool TranslatorVisitor::thumb32_BIC_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8) { + if (d == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto imm_carry = ThumbExpandImm_C(i, imm3, imm8, ir.GetCFlag()); + const auto result = ir.AndNot(ir.GetRegister(n), ir.Imm32(imm_carry.imm32)); + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry); + } + return true; +} + +bool TranslatorVisitor::thumb32_MOV_imm(Imm<1> i, bool S, Imm<3> imm3, Reg d, Imm<8> imm8) { + if (d == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto imm_carry = ThumbExpandImm_C(i, imm3, imm8, ir.GetCFlag()); + const auto result = ir.Imm32(imm_carry.imm32); + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry); + } + return true; +} + +bool TranslatorVisitor::thumb32_ORR_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> 
imm8) { + ASSERT_MSG(n != Reg::PC, "Decode error"); + if (d == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto imm_carry = ThumbExpandImm_C(i, imm3, imm8, ir.GetCFlag()); + const auto result = ir.Or(ir.GetRegister(n), ir.Imm32(imm_carry.imm32)); + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry); + } + return true; +} + +bool TranslatorVisitor::thumb32_MVN_imm(Imm<1> i, bool S, Imm<3> imm3, Reg d, Imm<8> imm8) { + if (d == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto imm_carry = ThumbExpandImm_C(i, imm3, imm8, ir.GetCFlag()); + const auto result = ir.Imm32(~imm_carry.imm32); + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry); + } + return true; +} + +bool TranslatorVisitor::thumb32_ORN_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8) { + ASSERT_MSG(n != Reg::PC, "Decode error"); + if (d == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto imm_carry = ThumbExpandImm_C(i, imm3, imm8, ir.GetCFlag()); + const auto result = ir.Or(ir.GetRegister(n), ir.Imm32(~imm_carry.imm32)); + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry); + } + return true; +} + +bool TranslatorVisitor::thumb32_TEQ_imm(Imm<1> i, Reg n, Imm<3> imm3, Imm<8> imm8) { + if (n == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto imm_carry = ThumbExpandImm_C(i, imm3, imm8, ir.GetCFlag()); + const auto result = ir.Eor(ir.GetRegister(n), ir.Imm32(imm_carry.imm32)); + + ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry); + return true; +} + +bool TranslatorVisitor::thumb32_EOR_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8) { + ASSERT_MSG(!(d == Reg::PC && S), "Decode error"); + if ((d == Reg::PC && !S) || n == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto imm_carry = ThumbExpandImm_C(i, imm3, imm8, ir.GetCFlag()); + const auto result = ir.Eor(ir.GetRegister(n), ir.Imm32(imm_carry.imm32)); + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), imm_carry.carry); + } + return true; +} + +bool TranslatorVisitor::thumb32_CMN_imm(Imm<1> i, Reg n, Imm<3> imm3, Imm<8> imm8) { + if (n == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto imm32 = ThumbExpandImm(i, imm3, imm8); + const auto result = ir.AddWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(0)); + + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + return true; +} + +bool TranslatorVisitor::thumb32_ADD_imm_1(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8) { + ASSERT_MSG(!(d == Reg::PC && S), "Decode error"); + if ((d == Reg::PC && !S) || n == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto imm32 = ThumbExpandImm(i, imm3, imm8); + const auto result = ir.AddWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(0)); + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + return true; +} + +bool TranslatorVisitor::thumb32_ADC_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8) { + if (d == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto imm32 = ThumbExpandImm(i, imm3, imm8); + const auto result = ir.AddWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.GetCFlag()); + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + return true; +} + +bool TranslatorVisitor::thumb32_SBC_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8) { + if (d == Reg::PC || n == Reg::PC) { 
+ return UnpredictableInstruction(); + } + + const auto imm32 = ThumbExpandImm(i, imm3, imm8); + const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.GetCFlag()); + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + return true; +} + +bool TranslatorVisitor::thumb32_CMP_imm(Imm<1> i, Reg n, Imm<3> imm3, Imm<8> imm8) { + if (n == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto imm32 = ThumbExpandImm(i, imm3, imm8); + const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(1)); + + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + return true; +} + +bool TranslatorVisitor::thumb32_SUB_imm_1(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8) { + ASSERT_MSG(!(d == Reg::PC && S), "Decode error"); + if ((d == Reg::PC && !S) || n == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto imm32 = ThumbExpandImm(i, imm3, imm8); + const auto result = ir.SubWithCarry(ir.GetRegister(n), ir.Imm32(imm32), ir.Imm1(1)); + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + return true; +} + +bool TranslatorVisitor::thumb32_RSB_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8) { + if (d == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto imm32 = ThumbExpandImm(i, imm3, imm8); + const auto result = ir.SubWithCarry(ir.Imm32(imm32), ir.GetRegister(n), ir.Imm1(1)); + + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_plain_binary_immediate.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_plain_binary_immediate.cpp new file mode 100644 index 0000000000..1a6767dbee --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_plain_binary_immediate.cpp @@ -0,0 +1,232 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2021 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/assert.hpp> +#include <mcl/bitsizeof.hpp> + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { +static IR::U32 Pack2x16To1x32(A32::IREmitter& ir, IR::U32 lo, IR::U32 hi) { + return ir.Or(ir.And(lo, ir.Imm32(0xFFFF)), ir.LogicalShiftLeft(hi, ir.Imm8(16), ir.Imm1(0)).result); +} + +static IR::U16 MostSignificantHalf(A32::IREmitter& ir, IR::U32 value) { + return ir.LeastSignificantHalf(ir.LogicalShiftRight(value, ir.Imm8(16), ir.Imm1(0)).result); +} + +using SaturationFunction = IR::ResultAndOverflow<IR::U32> (IREmitter::*)(const IR::U32&, size_t); + +static bool Saturation(TranslatorVisitor& v, bool sh, Reg n, Reg d, Imm<5> shift_amount, size_t saturate_to, SaturationFunction sat_fn) { + ASSERT_MSG(!(sh && shift_amount == 0), "Invalid decode"); + + if (d == Reg::PC || n == Reg::PC) { + return v.UnpredictableInstruction(); + } + + const auto shift = sh ? 
ShiftType::ASR : ShiftType::LSL; + const auto operand = v.EmitImmShift(v.ir.GetRegister(n), shift, shift_amount, v.ir.GetCFlag()); + const auto result = (v.ir.*sat_fn)(operand.result, saturate_to); + + v.ir.SetRegister(d, result.result); + v.ir.OrQFlag(result.overflow); + return true; +} + +static bool Saturation16(TranslatorVisitor& v, Reg n, Reg d, size_t saturate_to, SaturationFunction sat_fn) { + if (d == Reg::PC || n == Reg::PC) { + return v.UnpredictableInstruction(); + } + + const auto reg_n = v.ir.GetRegister(n); + + const auto lo_operand = v.ir.SignExtendHalfToWord(v.ir.LeastSignificantHalf(reg_n)); + const auto hi_operand = v.ir.SignExtendHalfToWord(MostSignificantHalf(v.ir, reg_n)); + const auto lo_result = (v.ir.*sat_fn)(lo_operand, saturate_to); + const auto hi_result = (v.ir.*sat_fn)(hi_operand, saturate_to); + + v.ir.SetRegister(d, Pack2x16To1x32(v.ir, lo_result.result, hi_result.result)); + v.ir.OrQFlag(lo_result.overflow); + v.ir.OrQFlag(hi_result.overflow); + return true; +} + +bool TranslatorVisitor::thumb32_ADR_t2(Imm<1> imm1, Imm<3> imm3, Reg d, Imm<8> imm8) { + if (d == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto imm32 = concatenate(imm1, imm3, imm8).ZeroExtend(); + const auto result = ir.AlignPC(4) - imm32; + + ir.SetRegister(d, ir.Imm32(result)); + return true; +} + +bool TranslatorVisitor::thumb32_ADR_t3(Imm<1> imm1, Imm<3> imm3, Reg d, Imm<8> imm8) { + if (d == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto imm32 = concatenate(imm1, imm3, imm8).ZeroExtend(); + const auto result = ir.AlignPC(4) + imm32; + + ir.SetRegister(d, ir.Imm32(result)); + return true; +} + +bool TranslatorVisitor::thumb32_ADD_imm_2(Imm<1> imm1, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8) { + if (d == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + const u32 imm = concatenate(imm1, imm3, imm8).ZeroExtend(); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.AddWithCarry(reg_n, ir.Imm32(imm), ir.Imm1(0)); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_BFC(Imm<3> imm3, Reg d, Imm<2> imm2, Imm<5> msb) { + if (d == Reg::PC) { + return UnpredictableInstruction(); + } + + const u32 lsbit = concatenate(imm3, imm2).ZeroExtend(); + const u32 msbit = msb.ZeroExtend(); + + if (msbit < lsbit) { + return UnpredictableInstruction(); + } + + const u32 mask = ~(mcl::bit::ones<u32>(msbit - lsbit + 1) << lsbit); + const auto reg_d = ir.GetRegister(d); + const auto result = ir.And(reg_d, ir.Imm32(mask)); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_BFI(Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, Imm<5> msb) { + if (d == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + const u32 lsbit = concatenate(imm3, imm2).ZeroExtend(); + const u32 msbit = msb.ZeroExtend(); + + if (msbit < lsbit) { + return UnpredictableInstruction(); + } + + const u32 inclusion_mask = mcl::bit::ones<u32>(msbit - lsbit + 1) << lsbit; + const u32 exclusion_mask = ~inclusion_mask; + const IR::U32 operand1 = ir.And(ir.GetRegister(d), ir.Imm32(exclusion_mask)); + const IR::U32 operand2 = ir.And(ir.LogicalShiftLeft(ir.GetRegister(n), ir.Imm8(u8(lsbit))), ir.Imm32(inclusion_mask)); + const IR::U32 result = ir.Or(operand1, operand2); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_MOVT(Imm<1> imm1, Imm<4> imm4, Imm<3> imm3, Reg d, Imm<8> imm8) { + if (d == Reg::PC) { + return UnpredictableInstruction(); + } + + const IR::U32 imm16 = 
ir.Imm32(concatenate(imm4, imm1, imm3, imm8).ZeroExtend() << 16); + const IR::U32 operand = ir.GetRegister(d); + const IR::U32 result = ir.Or(ir.And(operand, ir.Imm32(0x0000FFFFU)), imm16); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_MOVW_imm(Imm<1> imm1, Imm<4> imm4, Imm<3> imm3, Reg d, Imm<8> imm8) { + if (d == Reg::PC) { + return UnpredictableInstruction(); + } + + const IR::U32 imm = ir.Imm32(concatenate(imm4, imm1, imm3, imm8).ZeroExtend()); + + ir.SetRegister(d, imm); + return true; +} + +bool TranslatorVisitor::thumb32_SBFX(Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, Imm<5> widthm1) { + if (d == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + const u32 lsbit = concatenate(imm3, imm2).ZeroExtend(); + const u32 widthm1_value = widthm1.ZeroExtend(); + const u32 msb = lsbit + widthm1_value; + if (msb >= mcl::bitsizeof<u32>) { + return UnpredictableInstruction(); + } + + constexpr size_t max_width = mcl::bitsizeof<u32>; + const auto width = widthm1_value + 1; + const auto left_shift_amount = static_cast<u8>(max_width - width - lsbit); + const auto right_shift_amount = static_cast<u8>(max_width - width); + const auto operand = ir.GetRegister(n); + const auto tmp = ir.LogicalShiftLeft(operand, ir.Imm8(left_shift_amount)); + const auto result = ir.ArithmeticShiftRight(tmp, ir.Imm8(right_shift_amount)); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_SSAT(bool sh, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, Imm<5> sat_imm) { + return Saturation(*this, sh, n, d, concatenate(imm3, imm2), sat_imm.ZeroExtend() + 1, &IREmitter::SignedSaturation); +} + +bool TranslatorVisitor::thumb32_SSAT16(Reg n, Reg d, Imm<4> sat_imm) { + return Saturation16(*this, n, d, sat_imm.ZeroExtend() + 1, &IREmitter::SignedSaturation); +} + +bool TranslatorVisitor::thumb32_SUB_imm_2(Imm<1> imm1, Reg n, Imm<3> imm3, Reg d, Imm<8> imm8) { + if (d == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + const u32 imm = concatenate(imm1, imm3, imm8).ZeroExtend(); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.SubWithCarry(reg_n, ir.Imm32(imm), ir.Imm1(1)); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_UBFX(Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, Imm<5> widthm1) { + if (d == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + const u32 lsbit = concatenate(imm3, imm2).ZeroExtend(); + const u32 widthm1_value = widthm1.ZeroExtend(); + const u32 msb = lsbit + widthm1_value; + if (msb >= mcl::bitsizeof<u32>) { + return UnpredictableInstruction(); + } + + const auto operand = ir.GetRegister(n); + const auto mask = ir.Imm32(mcl::bit::ones<u32>(widthm1_value + 1)); + const auto result = ir.And(ir.LogicalShiftRight(operand, ir.Imm8(u8(lsbit))), mask); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_USAT(bool sh, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, Imm<5> sat_imm) { + return Saturation(*this, sh, n, d, concatenate(imm3, imm2), sat_imm.ZeroExtend(), &IREmitter::UnsignedSaturation); +} + +bool TranslatorVisitor::thumb32_USAT16(Reg n, Reg d, Imm<4> sat_imm) { + return Saturation16(*this, n, d, sat_imm.ZeroExtend(), &IREmitter::UnsignedSaturation); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_register.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_register.cpp new file mode 100644 index 
0000000000..1dda532d92 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_register.cpp @@ -0,0 +1,206 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2021 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { +namespace { +IR::U32 Rotate(A32::IREmitter& ir, Reg m, SignExtendRotation rotate) { + const u8 rotate_by = static_cast<u8>(static_cast<size_t>(rotate) * 8); + return ir.RotateRight(ir.GetRegister(m), ir.Imm8(rotate_by), ir.Imm1(0)).result; +} + +using ShiftFunction = IR::ResultAndCarry<IR::U32> (IREmitter::*)(const IR::U32&, const IR::U8&, const IR::U1&); + +bool ShiftInstruction(TranslatorVisitor& v, Reg m, Reg d, Reg s, bool S, ShiftFunction shift_fn) { + if (d == Reg::PC || m == Reg::PC || s == Reg::PC) { + return v.UnpredictableInstruction(); + } + + const auto shift_s = v.ir.LeastSignificantByte(v.ir.GetRegister(s)); + const auto apsr_c = v.ir.GetCFlag(); + const auto result_carry = (v.ir.*shift_fn)(v.ir.GetRegister(m), shift_s, apsr_c); + + if (S) { + v.ir.SetCpsrNZC(v.ir.NZFrom(result_carry.result), result_carry.carry); + } + + v.ir.SetRegister(d, result_carry.result); + return true; +} +} // Anonymous namespace + +bool TranslatorVisitor::thumb32_ASR_reg(bool S, Reg m, Reg d, Reg s) { + return ShiftInstruction(*this, m, d, s, S, &IREmitter::ArithmeticShiftRight); +} + +bool TranslatorVisitor::thumb32_LSL_reg(bool S, Reg m, Reg d, Reg s) { + return ShiftInstruction(*this, m, d, s, S, &IREmitter::LogicalShiftLeft); +} + +bool TranslatorVisitor::thumb32_LSR_reg(bool S, Reg m, Reg d, Reg s) { + return ShiftInstruction(*this, m, d, s, S, &IREmitter::LogicalShiftRight); +} + +bool TranslatorVisitor::thumb32_ROR_reg(bool S, Reg m, Reg d, Reg s) { + return ShiftInstruction(*this, m, d, s, S, &IREmitter::RotateRight); +} + +bool TranslatorVisitor::thumb32_SXTB(Reg d, SignExtendRotation rotate, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto rotated = Rotate(ir, m, rotate); + const auto result = ir.SignExtendByteToWord(ir.LeastSignificantByte(rotated)); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_SXTB16(Reg d, SignExtendRotation rotate, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto rotated = Rotate(ir, m, rotate); + const auto low_byte = ir.And(rotated, ir.Imm32(0x00FF00FF)); + const auto sign_bit = ir.And(rotated, ir.Imm32(0x00800080)); + const auto result = ir.Or(low_byte, ir.Mul(sign_bit, ir.Imm32(0x1FE))); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_SXTAB(Reg n, Reg d, SignExtendRotation rotate, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto rotated = Rotate(ir, m, rotate); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.Add(reg_n, ir.SignExtendByteToWord(ir.LeastSignificantByte(rotated))); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_SXTAB16(Reg n, Reg d, SignExtendRotation rotate, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto rotated = Rotate(ir, m, rotate); + const auto low_byte = ir.And(rotated, ir.Imm32(0x00FF00FF)); + const auto sign_bit = ir.And(rotated, ir.Imm32(0x00800080)); + const auto addend = ir.Or(low_byte, ir.Mul(sign_bit, ir.Imm32(0x1FE))); + 
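// Sign-extension trick: each sign bit kept above (0x80 within its halfword) times 0x1FE yields 0xFF00 in that halfword, so addend holds both bytes sign-extended to 16 bits. +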
const auto result = ir.PackedAddU16(addend, ir.GetRegister(n)).result; + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_SXTH(Reg d, SignExtendRotation rotate, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto rotated = Rotate(ir, m, rotate); + const auto result = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(rotated)); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_SXTAH(Reg n, Reg d, SignExtendRotation rotate, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto rotated = Rotate(ir, m, rotate); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.Add(reg_n, ir.SignExtendHalfToWord(ir.LeastSignificantHalf(rotated))); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_UXTB(Reg d, SignExtendRotation rotate, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto rotated = Rotate(ir, m, rotate); + const auto result = ir.ZeroExtendByteToWord(ir.LeastSignificantByte(rotated)); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_UXTB16(Reg d, SignExtendRotation rotate, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto rotated = Rotate(ir, m, rotate); + const auto result = ir.And(rotated, ir.Imm32(0x00FF00FF)); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_UXTAB(Reg n, Reg d, SignExtendRotation rotate, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto rotated = Rotate(ir, m, rotate); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.Add(reg_n, ir.ZeroExtendByteToWord(ir.LeastSignificantByte(rotated))); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_UXTAB16(Reg n, Reg d, SignExtendRotation rotate, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto rotated = Rotate(ir, m, rotate); + auto result = ir.And(rotated, ir.Imm32(0x00FF00FF)); + const auto reg_n = ir.GetRegister(n); + result = ir.PackedAddU16(reg_n, result).result; + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_UXTH(Reg d, SignExtendRotation rotate, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto rotated = Rotate(ir, m, rotate); + const auto result = ir.ZeroExtendHalfToWord(ir.LeastSignificantHalf(rotated)); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_UXTAH(Reg n, Reg d, SignExtendRotation rotate, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto rotated = Rotate(ir, m, rotate); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.Add(reg_n, ir.ZeroExtendHalfToWord(ir.LeastSignificantHalf(rotated))); + + ir.SetRegister(d, result); + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_shifted_register.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_shifted_register.cpp new file mode 100644 index 0000000000..3058350043 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_data_processing_shifted_register.cpp @@ -0,0 +1,253 @@ +/* This file is 
part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +bool TranslatorVisitor::thumb32_TST_reg(Reg n, Imm<3> imm3, Imm<2> imm2, ShiftType type, Reg m) { + if (n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag()); + const auto result = ir.And(ir.GetRegister(n), shifted.result); + + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + return true; +} + +bool TranslatorVisitor::thumb32_AND_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m) { + ASSERT_MSG(!(d == Reg::PC && S), "Decode error"); + + if ((d == Reg::PC && !S) || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag()); + const auto result = ir.And(ir.GetRegister(n), shifted.result); + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + } + return true; +} + +bool TranslatorVisitor::thumb32_BIC_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag()); + const auto result = ir.AndNot(ir.GetRegister(n), shifted.result); + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + } + return true; +} + +bool TranslatorVisitor::thumb32_MOV_reg(bool S, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag()); + const auto result = shifted.result; + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + } + return true; +} + +bool TranslatorVisitor::thumb32_ORR_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m) { + ASSERT_MSG(n != Reg::PC, "Decode error"); + + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag()); + const auto result = ir.Or(ir.GetRegister(n), shifted.result); + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + } + return true; +} + +bool TranslatorVisitor::thumb32_MVN_reg(bool S, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m) { + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag()); + const auto result = ir.Not(shifted.result); + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + } + return true; +} + +bool TranslatorVisitor::thumb32_ORN_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m) { + ASSERT_MSG(n != Reg::PC, "Decode error"); + + if (d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag()); + const auto result = ir.Or(ir.GetRegister(n), ir.Not(shifted.result)); + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + } + return true; +} + +bool 
TranslatorVisitor::thumb32_TEQ_reg(Reg n, Imm<3> imm3, Imm<2> imm2, ShiftType type, Reg m) { + if (n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag()); + const auto result = ir.Eor(ir.GetRegister(n), shifted.result); + + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + return true; +} + +bool TranslatorVisitor::thumb32_EOR_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m) { + ASSERT_MSG(!(d == Reg::PC && S), "Decode error"); + + if ((d == Reg::PC && !S) || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag()); + const auto result = ir.Eor(ir.GetRegister(n), shifted.result); + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZC(ir.NZFrom(result), shifted.carry); + } + return true; +} + +bool TranslatorVisitor::thumb32_PKH(Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, Imm<1> tb, Reg m) { + const ShiftType type = concatenate(tb, Imm<1>{0}).ZeroExtend<ShiftType>(); + + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto operand2 = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag()).result; + const auto lower = ir.And((tb == 1) ? operand2 : ir.GetRegister(n), ir.Imm32(0x0000FFFF)); + const auto upper = ir.And((tb == 1) ? ir.GetRegister(n) : operand2, ir.Imm32(0xFFFF0000)); + const auto result = ir.Or(upper, lower); + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_CMN_reg(Reg n, Imm<3> imm3, Imm<2> imm2, ShiftType type, Reg m) { + if (n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag()); + const auto result = ir.AddWithCarry(ir.GetRegister(n), shifted.result, ir.Imm1(0)); + + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + return true; +} + +bool TranslatorVisitor::thumb32_ADD_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m) { + ASSERT_MSG(!(d == Reg::PC && S), "Decode error"); + + if ((d == Reg::PC && !S) || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag()); + const auto result = ir.AddWithCarry(ir.GetRegister(n), shifted.result, ir.Imm1(0)); + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + return true; +} + +bool TranslatorVisitor::thumb32_ADC_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag()); + const auto result = ir.AddWithCarry(ir.GetRegister(n), shifted.result, ir.GetCFlag()); + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + return true; +} + +bool TranslatorVisitor::thumb32_SBC_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag()); + const auto result = ir.SubWithCarry(ir.GetRegister(n), shifted.result, ir.GetCFlag()); + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + return true; +} + +bool 
TranslatorVisitor::thumb32_CMP_reg(Reg n, Imm<3> imm3, Imm<2> imm2, ShiftType type, Reg m) { + if (n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag()); + const auto result = ir.SubWithCarry(ir.GetRegister(n), shifted.result, ir.Imm1(1)); + + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + return true; +} + +bool TranslatorVisitor::thumb32_SUB_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m) { + ASSERT_MSG(!(d == Reg::PC && S), "Decode error"); + + if ((d == Reg::PC && !S) || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag()); + const auto result = ir.SubWithCarry(ir.GetRegister(n), shifted.result, ir.Imm1(1)); + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + return true; +} + +bool TranslatorVisitor::thumb32_RSB_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag()); + const auto result = ir.SubWithCarry(shifted.result, ir.GetRegister(n), ir.Imm1(1)); + ir.SetRegister(d, result); + if (S) { + ir.SetCpsrNZCV(ir.NZCVFrom(result)); + } + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_byte.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_byte.cpp new file mode 100644 index 0000000000..42b2ebf4aa --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_byte.cpp @@ -0,0 +1,198 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2021 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" +#include "dynarmic/interface/A32/config.h" + +namespace Dynarmic::A32 { +static bool PLDHandler(TranslatorVisitor& v, bool W) { + if (!v.options.hook_hint_instructions) { + return true; + } + + const auto exception = W ? Exception::PreloadDataWithIntentToWrite + : Exception::PreloadData; + return v.RaiseException(exception); +} + +static bool PLIHandler(TranslatorVisitor& v) { + if (!v.options.hook_hint_instructions) { + return true; + } + + return v.RaiseException(Exception::PreloadInstruction); +} + +using ExtensionFunction = IR::U32 (IREmitter::*)(const IR::U8&); + +static bool LoadByteLiteral(TranslatorVisitor& v, bool U, Reg t, Imm<12> imm12, ExtensionFunction ext_fn) { + const u32 imm32 = imm12.ZeroExtend(); + const u32 base = v.ir.AlignPC(4); + const u32 address = U ? 
(base + imm32) : (base - imm32); + const auto data = (v.ir.*ext_fn)(v.ir.ReadMemory8(v.ir.Imm32(address), IR::AccType::NORMAL)); + + v.ir.SetRegister(t, data); + return true; +} + +static bool LoadByteRegister(TranslatorVisitor& v, Reg n, Reg t, Imm<2> imm2, Reg m, ExtensionFunction ext_fn) { + if (m == Reg::PC) { + return v.UnpredictableInstruction(); + } + + const auto reg_n = v.ir.GetRegister(n); + const auto reg_m = v.ir.GetRegister(m); + const auto offset = v.ir.LogicalShiftLeft(reg_m, v.ir.Imm8(imm2.ZeroExtend<u8>())); + const auto address = v.ir.Add(reg_n, offset); + const auto data = (v.ir.*ext_fn)(v.ir.ReadMemory8(address, IR::AccType::NORMAL)); + + v.ir.SetRegister(t, data); + return true; +} + +static bool LoadByteImmediate(TranslatorVisitor& v, Reg n, Reg t, bool P, bool U, bool W, Imm<12> imm12, ExtensionFunction ext_fn) { + const u32 imm32 = imm12.ZeroExtend(); + const IR::U32 reg_n = v.ir.GetRegister(n); + const IR::U32 offset_address = U ? v.ir.Add(reg_n, v.ir.Imm32(imm32)) + : v.ir.Sub(reg_n, v.ir.Imm32(imm32)); + const IR::U32 address = P ? offset_address : reg_n; + const IR::U32 data = (v.ir.*ext_fn)(v.ir.ReadMemory8(address, IR::AccType::NORMAL)); + + v.ir.SetRegister(t, data); + if (W) { + v.ir.SetRegister(n, offset_address); + } + return true; +} + +bool TranslatorVisitor::thumb32_PLD_lit(bool /*U*/, Imm<12> /*imm12*/) { + return PLDHandler(*this, false); +} + +bool TranslatorVisitor::thumb32_PLD_imm8(bool W, Reg /*n*/, Imm<8> /*imm8*/) { + return PLDHandler(*this, W); +} + +bool TranslatorVisitor::thumb32_PLD_imm12(bool W, Reg /*n*/, Imm<12> /*imm12*/) { + return PLDHandler(*this, W); +} + +bool TranslatorVisitor::thumb32_PLD_reg(bool W, Reg /*n*/, Imm<2> /*imm2*/, Reg m) { + if (m == Reg::PC) { + return UnpredictableInstruction(); + } + + return PLDHandler(*this, W); +} + +bool TranslatorVisitor::thumb32_PLI_lit(bool /*U*/, Imm<12> /*imm12*/) { + return PLIHandler(*this); +} + +bool TranslatorVisitor::thumb32_PLI_imm8(Reg /*n*/, Imm<8> /*imm8*/) { + return PLIHandler(*this); +} + +bool TranslatorVisitor::thumb32_PLI_imm12(Reg /*n*/, Imm<12> /*imm12*/) { + return PLIHandler(*this); +} + +bool TranslatorVisitor::thumb32_PLI_reg(Reg /*n*/, Imm<2> /*imm2*/, Reg m) { + if (m == Reg::PC) { + return UnpredictableInstruction(); + } + + return PLIHandler(*this); +} + +bool TranslatorVisitor::thumb32_LDRB_lit(bool U, Reg t, Imm<12> imm12) { + return LoadByteLiteral(*this, U, t, imm12, + &IREmitter::ZeroExtendByteToWord); +} + +bool TranslatorVisitor::thumb32_LDRB_imm8(Reg n, Reg t, bool P, bool U, bool W, Imm<8> imm8) { + if (t == Reg::PC && W) { + return UnpredictableInstruction(); + } + if (W && n == t) { + return UnpredictableInstruction(); + } + if (!P && !W) { + return UndefinedInstruction(); + } + + return LoadByteImmediate(*this, n, t, P, U, W, Imm<12>{imm8.ZeroExtend()}, + &IREmitter::ZeroExtendByteToWord); +} + +bool TranslatorVisitor::thumb32_LDRB_imm12(Reg n, Reg t, Imm<12> imm12) { + return LoadByteImmediate(*this, n, t, true, true, false, imm12, + &IREmitter::ZeroExtendByteToWord); +} + +bool TranslatorVisitor::thumb32_LDRB_reg(Reg n, Reg t, Imm<2> imm2, Reg m) { + return LoadByteRegister(*this, n, t, imm2, m, + &IREmitter::ZeroExtendByteToWord); +} + +bool TranslatorVisitor::thumb32_LDRBT(Reg n, Reg t, Imm<8> imm8) { + // TODO: Add an unpredictable instruction path if this + // is executed in hypervisor mode if we ever support + // privileged execution modes. 
+ + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + + // Treat it as a normal LDRB, given we don't support + // execution levels other than EL0 currently. + return thumb32_LDRB_imm8(n, t, true, true, false, imm8); +} + +bool TranslatorVisitor::thumb32_LDRSB_lit(bool U, Reg t, Imm<12> imm12) { + return LoadByteLiteral(*this, U, t, imm12, + &IREmitter::SignExtendByteToWord); +} + +bool TranslatorVisitor::thumb32_LDRSB_imm8(Reg n, Reg t, bool P, bool U, bool W, Imm<8> imm8) { + if (t == Reg::PC && W) { + return UnpredictableInstruction(); + } + if (W && n == t) { + return UnpredictableInstruction(); + } + if (!P && !W) { + return UndefinedInstruction(); + } + + return LoadByteImmediate(*this, n, t, P, U, W, Imm<12>{imm8.ZeroExtend()}, + &IREmitter::SignExtendByteToWord); +} + +bool TranslatorVisitor::thumb32_LDRSB_imm12(Reg n, Reg t, Imm<12> imm12) { + return LoadByteImmediate(*this, n, t, true, true, false, imm12, + &IREmitter::SignExtendByteToWord); +} + +bool TranslatorVisitor::thumb32_LDRSB_reg(Reg n, Reg t, Imm<2> imm2, Reg m) { + return LoadByteRegister(*this, n, t, imm2, m, + &IREmitter::SignExtendByteToWord); +} + +bool TranslatorVisitor::thumb32_LDRSBT(Reg n, Reg t, Imm<8> imm8) { + // TODO: Add an unpredictable instruction path if this + // is executed in hypervisor mode if we ever support + // privileged execution modes. + + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + + // Treat it as a normal LDRSB, given we don't support + // execution levels other than EL0 currently. + return thumb32_LDRSB_imm8(n, t, true, true, false, imm8); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_halfword.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_halfword.cpp new file mode 100644 index 0000000000..5b9f1639af --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_halfword.cpp @@ -0,0 +1,138 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2021 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +using ExtensionFunction = IR::U32 (IREmitter::*)(const IR::U16&); + +static bool LoadHalfLiteral(TranslatorVisitor& v, bool U, Reg t, Imm<12> imm12, ExtensionFunction ext_fn) { + const auto imm32 = imm12.ZeroExtend(); + const auto base = v.ir.AlignPC(4); + const auto address = U ? (base + imm32) : (base - imm32); + const auto data = (v.ir.*ext_fn)(v.ir.ReadMemory16(v.ir.Imm32(address), IR::AccType::NORMAL)); + + v.ir.SetRegister(t, data); + return true; +} + +static bool LoadHalfRegister(TranslatorVisitor& v, Reg n, Reg t, Imm<2> imm2, Reg m, ExtensionFunction ext_fn) { + if (m == Reg::PC) { + return v.UnpredictableInstruction(); + } + + const IR::U32 reg_m = v.ir.GetRegister(m); + const IR::U32 reg_n = v.ir.GetRegister(n); + const IR::U32 offset = v.ir.LogicalShiftLeft(reg_m, v.ir.Imm8(imm2.ZeroExtend<u8>())); + const IR::U32 address = v.ir.Add(reg_n, offset); + const IR::U32 data = (v.ir.*ext_fn)(v.ir.ReadMemory16(address, IR::AccType::NORMAL)); + + v.ir.SetRegister(t, data); + return true; +} + +static bool LoadHalfImmediate(TranslatorVisitor& v, Reg n, Reg t, bool P, bool U, bool W, Imm<12> imm12, ExtensionFunction ext_fn) { + const u32 imm32 = imm12.ZeroExtend(); + const IR::U32 reg_n = v.ir.GetRegister(n); + const IR::U32 offset_address = U ? 
v.ir.Add(reg_n, v.ir.Imm32(imm32)) + : v.ir.Sub(reg_n, v.ir.Imm32(imm32)); + const IR::U32 address = P ? offset_address + : reg_n; + const IR::U32 data = (v.ir.*ext_fn)(v.ir.ReadMemory16(address, IR::AccType::NORMAL)); + + if (W) { + v.ir.SetRegister(n, offset_address); + } + + v.ir.SetRegister(t, data); + return true; +} + +bool TranslatorVisitor::thumb32_LDRH_lit(bool U, Reg t, Imm<12> imm12) { + return LoadHalfLiteral(*this, U, t, imm12, &IREmitter::ZeroExtendHalfToWord); +} + +bool TranslatorVisitor::thumb32_LDRH_reg(Reg n, Reg t, Imm<2> imm2, Reg m) { + return LoadHalfRegister(*this, n, t, imm2, m, &IREmitter::ZeroExtendHalfToWord); +} + +bool TranslatorVisitor::thumb32_LDRH_imm8(Reg n, Reg t, bool P, bool U, bool W, Imm<8> imm8) { + if (!P && !W) { + return UndefinedInstruction(); + } + if (t == Reg::PC && W) { + return UnpredictableInstruction(); + } + if (W && n == t) { + return UnpredictableInstruction(); + } + + return LoadHalfImmediate(*this, n, t, P, U, W, Imm<12>{imm8.ZeroExtend()}, + &IREmitter::ZeroExtendHalfToWord); +} + +bool TranslatorVisitor::thumb32_LDRH_imm12(Reg n, Reg t, Imm<12> imm12) { + return LoadHalfImmediate(*this, n, t, true, true, false, imm12, + &IREmitter::ZeroExtendHalfToWord); +} + +bool TranslatorVisitor::thumb32_LDRHT(Reg n, Reg t, Imm<8> imm8) { + // TODO: Add an unpredictable instruction path if this + // is executed in hypervisor mode if we ever support + // privileged execution levels. + + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + + // Treat it as a normal LDRH, given we don't support + // execution levels other than EL0 currently. + return thumb32_LDRH_imm8(n, t, true, true, false, imm8); +} + +bool TranslatorVisitor::thumb32_LDRSH_lit(bool U, Reg t, Imm<12> imm12) { + return LoadHalfLiteral(*this, U, t, imm12, &IREmitter::SignExtendHalfToWord); +} + +bool TranslatorVisitor::thumb32_LDRSH_reg(Reg n, Reg t, Imm<2> imm2, Reg m) { + return LoadHalfRegister(*this, n, t, imm2, m, &IREmitter::SignExtendHalfToWord); +} + +bool TranslatorVisitor::thumb32_LDRSH_imm8(Reg n, Reg t, bool P, bool U, bool W, Imm<8> imm8) { + if (!P && !W) { + return UndefinedInstruction(); + } + if (t == Reg::PC && W) { + return UnpredictableInstruction(); + } + if (W && n == t) { + return UnpredictableInstruction(); + } + + return LoadHalfImmediate(*this, n, t, P, U, W, Imm<12>{imm8.ZeroExtend()}, + &IREmitter::SignExtendHalfToWord); +} + +bool TranslatorVisitor::thumb32_LDRSH_imm12(Reg n, Reg t, Imm<12> imm12) { + return LoadHalfImmediate(*this, n, t, true, true, false, imm12, + &IREmitter::SignExtendHalfToWord); +} + +bool TranslatorVisitor::thumb32_LDRSHT(Reg n, Reg t, Imm<8> imm8) { + // TODO: Add an unpredictable instruction path if this + // is executed in hypervisor mode if we ever support + // privileged execution levels. + + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + + // Treat it as a normal LDRSH, given we don't support + // execution levels other than EL0 currently. + return thumb32_LDRSH_imm8(n, t, true, true, false, imm8); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_store_dual.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_store_dual.cpp new file mode 100644 index 0000000000..17d4285c23 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_store_dual.cpp @@ -0,0 +1,292 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2021 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/bit/bit_field.hpp> + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { +static bool ITBlockCheck(const A32::IREmitter& ir) { + return ir.current_location.IT().IsInITBlock() && !ir.current_location.IT().IsLastInITBlock(); +} + +static bool TableBranch(TranslatorVisitor& v, Reg n, Reg m, bool half) { + if (m == Reg::PC) { + return v.UnpredictableInstruction(); + } + if (ITBlockCheck(v.ir)) { + return v.UnpredictableInstruction(); + } + + const auto reg_m = v.ir.GetRegister(m); + const auto reg_n = v.ir.GetRegister(n); + + IR::U32 halfwords; + if (half) { + const auto data = v.ir.ReadMemory16(v.ir.Add(reg_n, v.ir.LogicalShiftLeft(reg_m, v.ir.Imm8(1))), IR::AccType::NORMAL); + halfwords = v.ir.ZeroExtendToWord(data); + } else { + halfwords = v.ir.ZeroExtendToWord(v.ir.ReadMemory8(v.ir.Add(reg_n, reg_m), IR::AccType::NORMAL)); + } + + const auto current_pc = v.ir.Imm32(v.ir.PC()); + const auto branch_value = v.ir.Add(current_pc, v.ir.Add(halfwords, halfwords)); + + v.ir.UpdateUpperLocationDescriptor(); + v.ir.BranchWritePC(branch_value); + v.ir.SetTerm(IR::Term::FastDispatchHint{}); + return false; +} + +static bool LoadDualImmediate(TranslatorVisitor& v, bool P, bool U, bool W, Reg n, Reg t, Reg t2, Imm<8> imm8) { + if (W && (n == t || n == t2)) { + return v.UnpredictableInstruction(); + } + if (t == Reg::PC || t2 == Reg::PC || t == t2) { + return v.UnpredictableInstruction(); + } + + const u32 imm = imm8.ZeroExtend() << 2; + const IR::U32 reg_n = v.ir.GetRegister(n); + const IR::U32 offset_address = U ? v.ir.Add(reg_n, v.ir.Imm32(imm)) + : v.ir.Sub(reg_n, v.ir.Imm32(imm)); + const IR::U32 address = P ? offset_address : reg_n; + + // NOTE: If alignment is exactly off by 4, each word is an atomic access. + const IR::U64 data = v.ir.ReadMemory64(address, IR::AccType::ATOMIC); + + if (v.ir.current_location.EFlag()) { + v.ir.SetRegister(t, v.ir.MostSignificantWord(data).result); + v.ir.SetRegister(t2, v.ir.LeastSignificantWord(data)); + } else { + v.ir.SetRegister(t, v.ir.LeastSignificantWord(data)); + v.ir.SetRegister(t2, v.ir.MostSignificantWord(data).result); + } + + if (W) { + v.ir.SetRegister(n, offset_address); + } + return true; +} + +static bool LoadDualLiteral(TranslatorVisitor& v, bool U, bool W, Reg t, Reg t2, Imm<8> imm8) { + if (t == Reg::PC || t2 == Reg::PC || t == t2) { + return v.UnpredictableInstruction(); + } + if (W) { + return v.UnpredictableInstruction(); + } + + const auto imm = imm8.ZeroExtend() << 2; + const auto address = U ? v.ir.Add(v.ir.Imm32(v.ir.AlignPC(4)), v.ir.Imm32(imm)) + : v.ir.Sub(v.ir.Imm32(v.ir.AlignPC(4)), v.ir.Imm32(imm)); + + // NOTE: If alignment is exactly off by 4, each word is an atomic access. 
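+ // The doubleword is read with a single 64-bit access; the EFlag() check below selects which half is written to t and which to t2.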
+ const IR::U64 data = v.ir.ReadMemory64(address, IR::AccType::ATOMIC); + + if (v.ir.current_location.EFlag()) { + v.ir.SetRegister(t, v.ir.MostSignificantWord(data).result); + v.ir.SetRegister(t2, v.ir.LeastSignificantWord(data)); + } else { + v.ir.SetRegister(t, v.ir.LeastSignificantWord(data)); + v.ir.SetRegister(t2, v.ir.MostSignificantWord(data).result); + } + + return true; +} + +static bool StoreDual(TranslatorVisitor& v, bool P, bool U, bool W, Reg n, Reg t, Reg t2, Imm<8> imm8) { + if (W && (n == t || n == t2)) { + return v.UnpredictableInstruction(); + } + if (n == Reg::PC || t == Reg::PC || t2 == Reg::PC) { + return v.UnpredictableInstruction(); + } + + const u32 imm = imm8.ZeroExtend() << 2; + const IR::U32 reg_n = v.ir.GetRegister(n); + const IR::U32 reg_t = v.ir.GetRegister(t); + const IR::U32 reg_t2 = v.ir.GetRegister(t2); + + const IR::U32 offset_address = U ? v.ir.Add(reg_n, v.ir.Imm32(imm)) + : v.ir.Sub(reg_n, v.ir.Imm32(imm)); + const IR::U32 address = P ? offset_address : reg_n; + + const IR::U64 data = v.ir.current_location.EFlag() ? v.ir.Pack2x32To1x64(reg_t2, reg_t) + : v.ir.Pack2x32To1x64(reg_t, reg_t2); + + // NOTE: If alignment is exactly off by 4, each word is an atomic access. + v.ir.WriteMemory64(address, data, IR::AccType::ATOMIC); + + if (W) { + v.ir.SetRegister(n, offset_address); + } + return true; +} + +bool TranslatorVisitor::thumb32_LDA(Reg n, Reg t) { + if (t == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto address = ir.GetRegister(n); + ir.SetRegister(t, ir.ReadMemory32(address, IR::AccType::ORDERED)); + return true; +} + +bool TranslatorVisitor::thumb32_LDRD_imm_1(bool U, Reg n, Reg t, Reg t2, Imm<8> imm8) { + return LoadDualImmediate(*this, false, U, true, n, t, t2, imm8); +} + +bool TranslatorVisitor::thumb32_LDRD_imm_2(bool U, bool W, Reg n, Reg t, Reg t2, Imm<8> imm8) { + return LoadDualImmediate(*this, true, U, W, n, t, t2, imm8); +} + +bool TranslatorVisitor::thumb32_LDRD_lit_1(bool U, Reg t, Reg t2, Imm<8> imm8) { + return LoadDualLiteral(*this, U, true, t, t2, imm8); +} + +bool TranslatorVisitor::thumb32_LDRD_lit_2(bool U, bool W, Reg t, Reg t2, Imm<8> imm8) { + return LoadDualLiteral(*this, U, W, t, t2, imm8); +} + +bool TranslatorVisitor::thumb32_STRD_imm_1(bool U, Reg n, Reg t, Reg t2, Imm<8> imm8) { + return StoreDual(*this, false, U, true, n, t, t2, imm8); +} + +bool TranslatorVisitor::thumb32_STRD_imm_2(bool U, bool W, Reg n, Reg t, Reg t2, Imm<8> imm8) { + return StoreDual(*this, true, U, W, n, t, t2, imm8); +} + +bool TranslatorVisitor::thumb32_LDREX(Reg n, Reg t, Imm<8> imm8) { + if (t == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto address = ir.Add(ir.GetRegister(n), ir.Imm32(imm8.ZeroExtend() << 2)); + const auto value = ir.ExclusiveReadMemory32(address, IR::AccType::ATOMIC); + + ir.SetRegister(t, value); + return true; +} + +bool TranslatorVisitor::thumb32_LDREXB(Reg n, Reg t) { + if (t == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto address = ir.GetRegister(n); + const auto value = ir.ZeroExtendToWord(ir.ExclusiveReadMemory8(address, IR::AccType::ATOMIC)); + + ir.SetRegister(t, value); + return true; +} + +bool TranslatorVisitor::thumb32_LDREXD(Reg n, Reg t, Reg t2) { + if (t == Reg::PC || t2 == Reg::PC || t == t2 || n == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto address = ir.GetRegister(n); + const auto [lo, hi] = ir.ExclusiveReadMemory64(address, IR::AccType::ATOMIC); + + // DO NOT SWAP hi AND 
lo IN BIG ENDIAN MODE, THIS IS CORRECT BEHAVIOUR + ir.SetRegister(t, lo); + ir.SetRegister(t2, hi); + return true; +} + +bool TranslatorVisitor::thumb32_LDREXH(Reg n, Reg t) { + if (t == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto address = ir.GetRegister(n); + const auto value = ir.ZeroExtendToWord(ir.ExclusiveReadMemory16(address, IR::AccType::ATOMIC)); + + ir.SetRegister(t, value); + return true; +} + +bool TranslatorVisitor::thumb32_STL(Reg n, Reg t) { + if (t == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto address = ir.GetRegister(n); + ir.WriteMemory32(address, ir.GetRegister(t), IR::AccType::ORDERED); + return true; +} + +bool TranslatorVisitor::thumb32_STREX(Reg n, Reg t, Reg d, Imm<8> imm8) { + if (d == Reg::PC || t == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + if (d == n || d == t) { + return UnpredictableInstruction(); + } + + const auto address = ir.Add(ir.GetRegister(n), ir.Imm32(imm8.ZeroExtend() << 2)); + const auto value = ir.GetRegister(t); + const auto passed = ir.ExclusiveWriteMemory32(address, value, IR::AccType::ATOMIC); + ir.SetRegister(d, passed); + return true; +} + +bool TranslatorVisitor::thumb32_STREXB(Reg n, Reg t, Reg d) { + if (d == Reg::PC || t == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + if (d == n || d == t) { + return UnpredictableInstruction(); + } + + const auto address = ir.GetRegister(n); + const auto value = ir.LeastSignificantByte(ir.GetRegister(t)); + const auto passed = ir.ExclusiveWriteMemory8(address, value, IR::AccType::ATOMIC); + ir.SetRegister(d, passed); + return true; +} + +bool TranslatorVisitor::thumb32_STREXD(Reg n, Reg t, Reg t2, Reg d) { + if (d == Reg::PC || t == Reg::PC || t2 == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + if (d == n || d == t || d == t2) { + return UnpredictableInstruction(); + } + + const auto address = ir.GetRegister(n); + const auto value_lo = ir.GetRegister(t); + const auto value_hi = ir.GetRegister(t2); + const auto passed = ir.ExclusiveWriteMemory64(address, value_lo, value_hi, IR::AccType::ATOMIC); + ir.SetRegister(d, passed); + return true; +} + +bool TranslatorVisitor::thumb32_STREXH(Reg n, Reg t, Reg d) { + if (d == Reg::PC || t == Reg::PC || n == Reg::PC) { + return UnpredictableInstruction(); + } + if (d == n || d == t) { + return UnpredictableInstruction(); + } + + const auto address = ir.GetRegister(n); + const auto value = ir.LeastSignificantHalf(ir.GetRegister(t)); + const auto passed = ir.ExclusiveWriteMemory16(address, value, IR::AccType::ATOMIC); + ir.SetRegister(d, passed); + return true; +} + +bool TranslatorVisitor::thumb32_TBB(Reg n, Reg m) { + return TableBranch(*this, n, m, false); +} + +bool TranslatorVisitor::thumb32_TBH(Reg n, Reg m) { + return TableBranch(*this, n, m, true); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_store_multiple.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_store_multiple.cpp new file mode 100644 index 0000000000..2bc782b973 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_store_multiple.cpp @@ -0,0 +1,149 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2021 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/bit/bit_count.hpp> + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { +static bool ITBlockCheck(const A32::IREmitter& ir) { + return ir.current_location.IT().IsInITBlock() && !ir.current_location.IT().IsLastInITBlock(); +} + +static bool LDMHelper(A32::IREmitter& ir, bool W, Reg n, u32 list, const IR::U32& start_address, const IR::U32& writeback_address) { + auto address = start_address; + for (size_t i = 0; i <= 14; i++) { + if (mcl::bit::get_bit(i, list)) { + ir.SetRegister(static_cast<Reg>(i), ir.ReadMemory32(address, IR::AccType::ATOMIC)); + address = ir.Add(address, ir.Imm32(4)); + } + } + if (W && !mcl::bit::get_bit(RegNumber(n), list)) { + ir.SetRegister(n, writeback_address); + } + if (mcl::bit::get_bit<15>(list)) { + ir.UpdateUpperLocationDescriptor(); + ir.LoadWritePC(ir.ReadMemory32(address, IR::AccType::ATOMIC)); + if (n == Reg::R13) { + ir.SetTerm(IR::Term::PopRSBHint{}); + } else { + ir.SetTerm(IR::Term::FastDispatchHint{}); + } + return false; + } + return true; +} + +static bool STMHelper(A32::IREmitter& ir, bool W, Reg n, u32 list, const IR::U32& start_address, const IR::U32& writeback_address) { + auto address = start_address; + for (size_t i = 0; i <= 14; i++) { + if (mcl::bit::get_bit(i, list)) { + ir.WriteMemory32(address, ir.GetRegister(static_cast<Reg>(i)), IR::AccType::ATOMIC); + address = ir.Add(address, ir.Imm32(4)); + } + } + if (W) { + ir.SetRegister(n, writeback_address); + } + return true; +} + +bool TranslatorVisitor::thumb32_LDMDB(bool W, Reg n, Imm<16> reg_list) { + const auto regs_imm = reg_list.ZeroExtend(); + const auto num_regs = static_cast<u32>(mcl::bit::count_ones(regs_imm)); + + if (n == Reg::PC || num_regs < 2) { + return UnpredictableInstruction(); + } + if (reg_list.Bit<15>() && reg_list.Bit<14>()) { + return UnpredictableInstruction(); + } + if (W && mcl::bit::get_bit(static_cast<size_t>(n), regs_imm)) { + return UnpredictableInstruction(); + } + if (reg_list.Bit<13>()) { + return UnpredictableInstruction(); + } + if (reg_list.Bit<15>() && ITBlockCheck(ir)) { + return UnpredictableInstruction(); + } + + // Start address is the same as the writeback address. 
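    // Worked example with hypothetical values: if Rn = 0x1000 and the register
    // list is {R4, R5, R6} (num_regs == 3), then start_address = 0x1000 - 12 =
    // 0x0FF4; the registers are loaded from 0x0FF4, 0x0FF8 and 0x0FFC in
    // ascending order, and writeback (when W is set) leaves Rn = 0x0FF4,
    // the same value as start_address.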
+ const IR::U32 start_address = ir.Sub(ir.GetRegister(n), ir.Imm32(4 * num_regs)); + return LDMHelper(ir, W, n, regs_imm, start_address, start_address); +} + +bool TranslatorVisitor::thumb32_LDMIA(bool W, Reg n, Imm<16> reg_list) { + const auto regs_imm = reg_list.ZeroExtend(); + const auto num_regs = static_cast<u32>(mcl::bit::count_ones(regs_imm)); + + if (n == Reg::PC || num_regs < 2) { + return UnpredictableInstruction(); + } + if (reg_list.Bit<15>() && reg_list.Bit<14>()) { + return UnpredictableInstruction(); + } + if (W && mcl::bit::get_bit(static_cast<size_t>(n), regs_imm)) { + return UnpredictableInstruction(); + } + if (reg_list.Bit<13>()) { + return UnpredictableInstruction(); + } + if (reg_list.Bit<15>() && ITBlockCheck(ir)) { + return UnpredictableInstruction(); + } + + const auto start_address = ir.GetRegister(n); + const auto writeback_address = ir.Add(start_address, ir.Imm32(num_regs * 4)); + return LDMHelper(ir, W, n, regs_imm, start_address, writeback_address); +} + +bool TranslatorVisitor::thumb32_POP(Imm<16> reg_list) { + return thumb32_LDMIA(true, Reg::SP, reg_list); +} + +bool TranslatorVisitor::thumb32_PUSH(Imm<15> reg_list) { + return thumb32_STMDB(true, Reg::SP, reg_list); +} + +bool TranslatorVisitor::thumb32_STMIA(bool W, Reg n, Imm<15> reg_list) { + const auto regs_imm = reg_list.ZeroExtend(); + const auto num_regs = static_cast<u32>(mcl::bit::count_ones(regs_imm)); + + if (n == Reg::PC || num_regs < 2) { + return UnpredictableInstruction(); + } + if (W && mcl::bit::get_bit(static_cast<size_t>(n), regs_imm)) { + return UnpredictableInstruction(); + } + if (reg_list.Bit<13>()) { + return UnpredictableInstruction(); + } + + const auto start_address = ir.GetRegister(n); + const auto writeback_address = ir.Add(start_address, ir.Imm32(num_regs * 4)); + return STMHelper(ir, W, n, regs_imm, start_address, writeback_address); +} + +bool TranslatorVisitor::thumb32_STMDB(bool W, Reg n, Imm<15> reg_list) { + const auto regs_imm = reg_list.ZeroExtend(); + const auto num_regs = static_cast<u32>(mcl::bit::count_ones(regs_imm)); + + if (n == Reg::PC || num_regs < 2) { + return UnpredictableInstruction(); + } + if (W && mcl::bit::get_bit(static_cast<size_t>(n), regs_imm)) { + return UnpredictableInstruction(); + } + if (reg_list.Bit<13>()) { + return UnpredictableInstruction(); + } + + // Start address is the same as the writeback address. + const IR::U32 start_address = ir.Sub(ir.GetRegister(n), ir.Imm32(4 * num_regs)); + return STMHelper(ir, W, n, regs_imm, start_address, start_address); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_word.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_word.cpp new file mode 100644 index 0000000000..b92e27fc66 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_word.cpp @@ -0,0 +1,134 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2021 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { +static bool ITBlockCheck(const A32::IREmitter& ir) { + return ir.current_location.IT().IsInITBlock() && !ir.current_location.IT().IsLastInITBlock(); +} + +bool TranslatorVisitor::thumb32_LDR_lit(bool U, Reg t, Imm<12> imm12) { + if (t == Reg::PC && ITBlockCheck(ir)) { + return UnpredictableInstruction(); + } + + const u32 imm32 = imm12.ZeroExtend(); + const u32 base = ir.AlignPC(4); + const u32 address = U ? base + imm32 : base - imm32; + const auto data = ir.ReadMemory32(ir.Imm32(address), IR::AccType::NORMAL); + + if (t == Reg::PC) { + ir.UpdateUpperLocationDescriptor(); + ir.LoadWritePC(data); + ir.SetTerm(IR::Term::FastDispatchHint{}); + return false; + } + + ir.SetRegister(t, data); + return true; +} + +bool TranslatorVisitor::thumb32_LDR_imm8(Reg n, Reg t, bool P, bool U, bool W, Imm<8> imm8) { + if (!P && !W) { + return UndefinedInstruction(); + } + if (W && n == t) { + return UnpredictableInstruction(); + } + if (t == Reg::PC && ITBlockCheck(ir)) { + return UnpredictableInstruction(); + } + + const u32 imm32 = imm8.ZeroExtend(); + const IR::U32 reg_n = ir.GetRegister(n); + const IR::U32 offset_address = U ? ir.Add(reg_n, ir.Imm32(imm32)) + : ir.Sub(reg_n, ir.Imm32(imm32)); + const IR::U32 address = P ? offset_address + : reg_n; + const IR::U32 data = ir.ReadMemory32(address, IR::AccType::NORMAL); + + if (W) { + ir.SetRegister(n, offset_address); + } + + if (t == Reg::PC) { + ir.UpdateUpperLocationDescriptor(); + ir.LoadWritePC(data); + + if (!P && W && n == Reg::R13) { + ir.SetTerm(IR::Term::PopRSBHint{}); + } else { + ir.SetTerm(IR::Term::FastDispatchHint{}); + } + + return false; + } + + ir.SetRegister(t, data); + return true; +} + +bool TranslatorVisitor::thumb32_LDR_imm12(Reg n, Reg t, Imm<12> imm12) { + if (t == Reg::PC && ITBlockCheck(ir)) { + return UnpredictableInstruction(); + } + + const auto imm32 = imm12.ZeroExtend(); + const auto reg_n = ir.GetRegister(n); + const auto address = ir.Add(reg_n, ir.Imm32(imm32)); + const auto data = ir.ReadMemory32(address, IR::AccType::NORMAL); + + if (t == Reg::PC) { + ir.UpdateUpperLocationDescriptor(); + ir.LoadWritePC(data); + ir.SetTerm(IR::Term::FastDispatchHint{}); + return false; + } + + ir.SetRegister(t, data); + return true; +} + +bool TranslatorVisitor::thumb32_LDR_reg(Reg n, Reg t, Imm<2> imm2, Reg m) { + if (m == Reg::PC) { + return UnpredictableInstruction(); + } + if (t == Reg::PC && ITBlockCheck(ir)) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto offset = ir.LogicalShiftLeft(reg_m, ir.Imm8(imm2.ZeroExtend<u8>())); + const auto address = ir.Add(reg_n, offset); + const auto data = ir.ReadMemory32(address, IR::AccType::NORMAL); + + if (t == Reg::PC) { + ir.UpdateUpperLocationDescriptor(); + ir.LoadWritePC(data); + ir.SetTerm(IR::Term::FastDispatchHint{}); + return false; + } + + ir.SetRegister(t, data); + return true; +} + +bool TranslatorVisitor::thumb32_LDRT(Reg n, Reg t, Imm<8> imm8) { + // TODO: Add an unpredictable instruction path if this + // is executed in hypervisor mode if we ever support + // privileged execution levels. + + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + + // Treat it as a normal LDR, given we don't support + // execution levels other than EL0 currently. 
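    // The arguments below map to P = true (use the offset address), U = true
    // (add the immediate) and W = false (no writeback), so the helper computes
    // address = Rn + imm8 and behaves exactly like LDR <Rt>, [<Rn>, #imm8].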
+ return thumb32_LDR_imm8(n, t, true, true, false, imm8); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_long_multiply.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_long_multiply.cpp new file mode 100644 index 0000000000..0a0cdd9f2e --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_long_multiply.cpp @@ -0,0 +1,222 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2021 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { +namespace { +using DivideFunction = IR::U32U64 (IREmitter::*)(const IR::U32U64&, const IR::U32U64&); + +bool DivideOperation(TranslatorVisitor& v, Reg d, Reg m, Reg n, DivideFunction fn) { + if (d == Reg::PC || m == Reg::PC || n == Reg::PC) { + return v.UnpredictableInstruction(); + } + + const IR::U32 operand1 = v.ir.GetRegister(n); + const IR::U32 operand2 = v.ir.GetRegister(m); + const IR::U32 result = (v.ir.*fn)(operand1, operand2); + + v.ir.SetRegister(d, result); + return true; +} +} // Anonymous namespace + +bool TranslatorVisitor::thumb32_SDIV(Reg n, Reg d, Reg m) { + return DivideOperation(*this, d, m, n, &IREmitter::SignedDiv); +} + +bool TranslatorVisitor::thumb32_SMLAL(Reg n, Reg dLo, Reg dHi, Reg m) { + if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (dHi == dLo) { + return UnpredictableInstruction(); + } + + const auto n64 = ir.SignExtendWordToLong(ir.GetRegister(n)); + const auto m64 = ir.SignExtendWordToLong(ir.GetRegister(m)); + const auto product = ir.Mul(n64, m64); + const auto addend = ir.Pack2x32To1x64(ir.GetRegister(dLo), ir.GetRegister(dHi)); + const auto result = ir.Add(product, addend); + const auto lo = ir.LeastSignificantWord(result); + const auto hi = ir.MostSignificantWord(result).result; + + ir.SetRegister(dLo, lo); + ir.SetRegister(dHi, hi); + return true; +} + +bool TranslatorVisitor::thumb32_SMLALD(Reg n, Reg dLo, Reg dHi, bool M, Reg m) { + if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (dHi == dLo) { + return UnpredictableInstruction(); + } + + const IR::U32 n32 = ir.GetRegister(n); + const IR::U32 m32 = ir.GetRegister(m); + const IR::U32 n_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32)); + const IR::U32 n_hi = ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result; + + IR::U32 m_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32)); + IR::U32 m_hi = ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result; + if (M) { + std::swap(m_lo, m_hi); + } + + const IR::U64 product_lo = ir.SignExtendWordToLong(ir.Mul(n_lo, m_lo)); + const IR::U64 product_hi = ir.SignExtendWordToLong(ir.Mul(n_hi, m_hi)); + const auto addend = ir.Pack2x32To1x64(ir.GetRegister(dLo), ir.GetRegister(dHi)); + const auto result = ir.Add(ir.Add(product_lo, product_hi), addend); + + ir.SetRegister(dLo, ir.LeastSignificantWord(result)); + ir.SetRegister(dHi, ir.MostSignificantWord(result).result); + return true; +} + +bool TranslatorVisitor::thumb32_SMLALXY(Reg n, Reg dLo, Reg dHi, bool N, bool M, Reg m) { + if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (dHi == dLo) { + return UnpredictableInstruction(); + } + + const IR::U32 n32 = ir.GetRegister(n); + const IR::U32 m32 = 
ir.GetRegister(m); + const IR::U32 n16 = N ? ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result + : ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32)); + const IR::U32 m16 = M ? ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result + : ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32)); + const IR::U64 product = ir.SignExtendWordToLong(ir.Mul(n16, m16)); + const auto addend = ir.Pack2x32To1x64(ir.GetRegister(dLo), ir.GetRegister(dHi)); + const auto result = ir.Add(product, addend); + + ir.SetRegister(dLo, ir.LeastSignificantWord(result)); + ir.SetRegister(dHi, ir.MostSignificantWord(result).result); + return true; +} + +bool TranslatorVisitor::thumb32_SMLSLD(Reg n, Reg dLo, Reg dHi, bool M, Reg m) { + if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (dHi == dLo) { + return UnpredictableInstruction(); + } + + const IR::U32 n32 = ir.GetRegister(n); + const IR::U32 m32 = ir.GetRegister(m); + const IR::U32 n_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32)); + const IR::U32 n_hi = ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result; + + IR::U32 m_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32)); + IR::U32 m_hi = ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result; + if (M) { + std::swap(m_lo, m_hi); + } + + const IR::U64 product_lo = ir.SignExtendWordToLong(ir.Mul(n_lo, m_lo)); + const IR::U64 product_hi = ir.SignExtendWordToLong(ir.Mul(n_hi, m_hi)); + const auto addend = ir.Pack2x32To1x64(ir.GetRegister(dLo), ir.GetRegister(dHi)); + const auto result = ir.Add(ir.Sub(product_lo, product_hi), addend); + + ir.SetRegister(dLo, ir.LeastSignificantWord(result)); + ir.SetRegister(dHi, ir.MostSignificantWord(result).result); + return true; +} + +bool TranslatorVisitor::thumb32_SMULL(Reg n, Reg dLo, Reg dHi, Reg m) { + if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (dHi == dLo) { + return UnpredictableInstruction(); + } + + const auto n64 = ir.SignExtendWordToLong(ir.GetRegister(n)); + const auto m64 = ir.SignExtendWordToLong(ir.GetRegister(m)); + const auto result = ir.Mul(n64, m64); + const auto lo = ir.LeastSignificantWord(result); + const auto hi = ir.MostSignificantWord(result).result; + + ir.SetRegister(dLo, lo); + ir.SetRegister(dHi, hi); + return true; +} + +bool TranslatorVisitor::thumb32_UDIV(Reg n, Reg d, Reg m) { + return DivideOperation(*this, d, m, n, &IREmitter::UnsignedDiv); +} + +bool TranslatorVisitor::thumb32_UMLAL(Reg n, Reg dLo, Reg dHi, Reg m) { + if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (dHi == dLo) { + return UnpredictableInstruction(); + } + + const auto n64 = ir.ZeroExtendWordToLong(ir.GetRegister(n)); + const auto m64 = ir.ZeroExtendWordToLong(ir.GetRegister(m)); + const auto product = ir.Mul(n64, m64); + const auto addend = ir.Pack2x32To1x64(ir.GetRegister(dLo), ir.GetRegister(dHi)); + const auto result = ir.Add(product, addend); + const auto lo = ir.LeastSignificantWord(result); + const auto hi = ir.MostSignificantWord(result).result; + + ir.SetRegister(dLo, lo); + ir.SetRegister(dHi, hi); + return true; +} + +bool TranslatorVisitor::thumb32_UMULL(Reg n, Reg dLo, Reg dHi, Reg m) { + if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (dHi == dLo) { + return UnpredictableInstruction(); + } + + const auto n64 = 
ir.ZeroExtendWordToLong(ir.GetRegister(n)); + const auto m64 = ir.ZeroExtendWordToLong(ir.GetRegister(m)); + const auto result = ir.Mul(n64, m64); + const auto lo = ir.LeastSignificantWord(result); + const auto hi = ir.MostSignificantWord(result).result; + + ir.SetRegister(dLo, lo); + ir.SetRegister(dHi, hi); + return true; +} + +bool TranslatorVisitor::thumb32_UMAAL(Reg n, Reg dLo, Reg dHi, Reg m) { + if (dLo == Reg::PC || dHi == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + if (dHi == dLo) { + return UnpredictableInstruction(); + } + + const auto lo64 = ir.ZeroExtendWordToLong(ir.GetRegister(dLo)); + const auto hi64 = ir.ZeroExtendWordToLong(ir.GetRegister(dHi)); + const auto n64 = ir.ZeroExtendWordToLong(ir.GetRegister(n)); + const auto m64 = ir.ZeroExtendWordToLong(ir.GetRegister(m)); + const auto result = ir.Add(ir.Add(ir.Mul(n64, m64), hi64), lo64); + + ir.SetRegister(dLo, ir.LeastSignificantWord(result)); + ir.SetRegister(dHi, ir.MostSignificantWord(result).result); + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_misc.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_misc.cpp new file mode 100644 index 0000000000..9edb2310b9 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_misc.cpp @@ -0,0 +1,157 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +bool TranslatorVisitor::thumb32_CLZ(Reg n, Reg d, Reg m) { + if (m != n || d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto result = ir.CountLeadingZeros(reg_m); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_QADD(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.SignedSaturatedAddWithFlag(reg_m, reg_n); + + ir.SetRegister(d, result.result); + ir.OrQFlag(result.overflow); + return true; +} + +bool TranslatorVisitor::thumb32_QDADD(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto doubled_n = ir.SignedSaturatedAddWithFlag(reg_n, reg_n); + ir.OrQFlag(doubled_n.overflow); + + const auto result = ir.SignedSaturatedAddWithFlag(reg_m, doubled_n.result); + ir.SetRegister(d, result.result); + ir.OrQFlag(result.overflow); + return true; +} + +bool TranslatorVisitor::thumb32_QDSUB(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto doubled_n = ir.SignedSaturatedAddWithFlag(reg_n, reg_n); + ir.OrQFlag(doubled_n.overflow); + + const auto result = ir.SignedSaturatedSubWithFlag(reg_m, doubled_n.result); + ir.SetRegister(d, result.result); + ir.OrQFlag(result.overflow); + return true; +} + +bool TranslatorVisitor::thumb32_QSUB(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto 
reg_n = ir.GetRegister(n); + const auto result = ir.SignedSaturatedSubWithFlag(reg_m, reg_n); + + ir.SetRegister(d, result.result); + ir.OrQFlag(result.overflow); + return true; +} + +bool TranslatorVisitor::thumb32_RBIT(Reg n, Reg d, Reg m) { + if (m != n || d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const IR::U32 swapped = ir.ByteReverseWord(ir.GetRegister(m)); + + // ((x & 0xF0F0F0F0) >> 4) | ((x & 0x0F0F0F0F) << 4) + const IR::U32 first_lsr = ir.LogicalShiftRight(ir.And(swapped, ir.Imm32(0xF0F0F0F0)), ir.Imm8(4)); + const IR::U32 first_lsl = ir.LogicalShiftLeft(ir.And(swapped, ir.Imm32(0x0F0F0F0F)), ir.Imm8(4)); + const IR::U32 corrected = ir.Or(first_lsl, first_lsr); + + // ((x & 0x88888888) >> 3) | ((x & 0x44444444) >> 1) | + // ((x & 0x22222222) << 1) | ((x & 0x11111111) << 3) + const IR::U32 second_lsr = ir.LogicalShiftRight(ir.And(corrected, ir.Imm32(0x88888888)), ir.Imm8(3)); + const IR::U32 third_lsr = ir.LogicalShiftRight(ir.And(corrected, ir.Imm32(0x44444444)), ir.Imm8(1)); + const IR::U32 second_lsl = ir.LogicalShiftLeft(ir.And(corrected, ir.Imm32(0x22222222)), ir.Imm8(1)); + const IR::U32 third_lsl = ir.LogicalShiftLeft(ir.And(corrected, ir.Imm32(0x11111111)), ir.Imm8(3)); + + const IR::U32 result = ir.Or(ir.Or(ir.Or(second_lsr, third_lsr), second_lsl), third_lsl); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_REV(Reg n, Reg d, Reg m) { + if (m != n || d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto result = ir.ByteReverseWord(ir.GetRegister(m)); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_REV16(Reg n, Reg d, Reg m) { + if (m != n || d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto lo = ir.And(ir.LogicalShiftRight(reg_m, ir.Imm8(8), ir.Imm1(0)).result, ir.Imm32(0x00FF00FF)); + const auto hi = ir.And(ir.LogicalShiftLeft(reg_m, ir.Imm8(8), ir.Imm1(0)).result, ir.Imm32(0xFF00FF00)); + const auto result = ir.Or(lo, hi); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_REVSH(Reg n, Reg d, Reg m) { + if (m != n || d == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto rev_half = ir.ByteReverseHalf(ir.LeastSignificantHalf(reg_m)); + + ir.SetRegister(d, ir.SignExtendHalfToWord(rev_half)); + return true; +} + +bool TranslatorVisitor::thumb32_SEL(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedSelect(ir.GetGEFlags(), reg_m, reg_n); + + ir.SetRegister(d, result); + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_multiply.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_multiply.cpp new file mode 100644 index 0000000000..587cbaafdb --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_multiply.cpp @@ -0,0 +1,312 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2021 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +bool TranslatorVisitor::thumb32_MLA(Reg n, Reg a, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_a = ir.GetRegister(a); + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.Add(ir.Mul(reg_n, reg_m), reg_a); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_MLS(Reg n, Reg a, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_a = ir.GetRegister(a); + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.Sub(reg_a, ir.Mul(reg_n, reg_m)); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_MUL(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.Mul(reg_n, reg_m); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_SMLAD(Reg n, Reg a, Reg d, bool X, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) { + return UnpredictableInstruction(); + } + + const IR::U32 n32 = ir.GetRegister(n); + const IR::U32 m32 = ir.GetRegister(m); + const IR::U32 n_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32)); + const IR::U32 n_hi = ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result; + + IR::U32 m_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32)); + IR::U32 m_hi = ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result; + if (X) { + std::swap(m_lo, m_hi); + } + + const IR::U32 product_lo = ir.Mul(n_lo, m_lo); + const IR::U32 product_hi = ir.Mul(n_hi, m_hi); + const IR::U32 addend = ir.GetRegister(a); + + auto result = ir.AddWithCarry(product_lo, product_hi, ir.Imm1(0)); + ir.OrQFlag(ir.GetOverflowFrom(result)); + result = ir.AddWithCarry(result, addend, ir.Imm1(0)); + + ir.SetRegister(d, result); + ir.OrQFlag(ir.GetOverflowFrom(result)); + return true; +} + +bool TranslatorVisitor::thumb32_SMLSD(Reg n, Reg a, Reg d, bool X, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) { + return UnpredictableInstruction(); + } + + const IR::U32 n32 = ir.GetRegister(n); + const IR::U32 m32 = ir.GetRegister(m); + const IR::U32 n_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32)); + const IR::U32 n_hi = ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result; + + IR::U32 m_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32)); + IR::U32 m_hi = ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result; + if (X) { + std::swap(m_lo, m_hi); + } + + const IR::U32 product_lo = ir.Mul(n_lo, m_lo); + const IR::U32 product_hi = ir.Mul(n_hi, m_hi); + const IR::U32 addend = ir.GetRegister(a); + const IR::U32 product = ir.Sub(product_lo, product_hi); + auto result = ir.AddWithCarry(product, addend, ir.Imm1(0)); + + ir.SetRegister(d, result); + ir.OrQFlag(ir.GetOverflowFrom(result)); + return true; +} + +bool TranslatorVisitor::thumb32_SMLAXY(Reg n, Reg a, Reg d, bool N, bool M, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) { + return UnpredictableInstruction(); + } + + const 
IR::U32 n32 = ir.GetRegister(n); + const IR::U32 m32 = ir.GetRegister(m); + const IR::U32 n16 = N ? ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result + : ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32)); + const IR::U32 m16 = M ? ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result + : ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32)); + const IR::U32 product = ir.Mul(n16, m16); + const auto result = ir.AddWithCarry(product, ir.GetRegister(a), ir.Imm1(0)); + + ir.SetRegister(d, result); + ir.OrQFlag(ir.GetOverflowFrom(result)); + return true; +} + +bool TranslatorVisitor::thumb32_SMMLA(Reg n, Reg a, Reg d, bool R, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto n64 = ir.SignExtendWordToLong(ir.GetRegister(n)); + const auto m64 = ir.SignExtendWordToLong(ir.GetRegister(m)); + const auto a64 = ir.Pack2x32To1x64(ir.Imm32(0), ir.GetRegister(a)); + const auto temp = ir.Add(a64, ir.Mul(n64, m64)); + const auto result_carry = ir.MostSignificantWord(temp); + auto result = result_carry.result; + if (R) { + result = ir.AddWithCarry(result, ir.Imm32(0), result_carry.carry); + } + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_SMMLS(Reg n, Reg a, Reg d, bool R, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto n64 = ir.SignExtendWordToLong(ir.GetRegister(n)); + const auto m64 = ir.SignExtendWordToLong(ir.GetRegister(m)); + const auto a64 = ir.Pack2x32To1x64(ir.Imm32(0), ir.GetRegister(a)); + const auto temp = ir.Sub(a64, ir.Mul(n64, m64)); + const auto result_carry = ir.MostSignificantWord(temp); + auto result = result_carry.result; + if (R) { + result = ir.AddWithCarry(result, ir.Imm32(0), result_carry.carry); + } + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_SMMUL(Reg n, Reg d, bool R, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto n64 = ir.SignExtendWordToLong(ir.GetRegister(n)); + const auto m64 = ir.SignExtendWordToLong(ir.GetRegister(m)); + const auto product = ir.Mul(n64, m64); + const auto result_carry = ir.MostSignificantWord(product); + auto result = result_carry.result; + if (R) { + result = ir.AddWithCarry(result, ir.Imm32(0), result_carry.carry); + } + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_SMUAD(Reg n, Reg d, bool M, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const IR::U32 n32 = ir.GetRegister(n); + const IR::U32 m32 = ir.GetRegister(m); + const IR::U32 n_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32)); + const IR::U32 n_hi = ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result; + + IR::U32 m_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32)); + IR::U32 m_hi = ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result; + if (M) { + std::swap(m_lo, m_hi); + } + + const IR::U32 product_lo = ir.Mul(n_lo, m_lo); + const IR::U32 product_hi = ir.Mul(n_hi, m_hi); + const auto result = ir.AddWithCarry(product_lo, product_hi, ir.Imm1(0)); + + ir.SetRegister(d, result); + ir.OrQFlag(ir.GetOverflowFrom(result)); + return true; +} + +bool TranslatorVisitor::thumb32_SMUSD(Reg n, Reg d, bool M, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const IR::U32 n32 = 
ir.GetRegister(n); + const IR::U32 m32 = ir.GetRegister(m); + const IR::U32 n_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32)); + const IR::U32 n_hi = ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result; + + IR::U32 m_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32)); + IR::U32 m_hi = ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result; + if (M) { + std::swap(m_lo, m_hi); + } + + const IR::U32 product_lo = ir.Mul(n_lo, m_lo); + const IR::U32 product_hi = ir.Mul(n_hi, m_hi); + const IR::U32 result = ir.Sub(product_lo, product_hi); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_SMULXY(Reg n, Reg d, bool N, bool M, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto n32 = ir.GetRegister(n); + const auto m32 = ir.GetRegister(m); + const auto n16 = N ? ir.ArithmeticShiftRight(n32, ir.Imm8(16), ir.Imm1(0)).result + : ir.SignExtendHalfToWord(ir.LeastSignificantHalf(n32)); + const auto m16 = M ? ir.ArithmeticShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result + : ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32)); + const auto result = ir.Mul(n16, m16); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_SMLAWY(Reg n, Reg a, Reg d, bool M, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) { + return UnpredictableInstruction(); + } + + const IR::U64 n32 = ir.SignExtendWordToLong(ir.GetRegister(n)); + IR::U32 m32 = ir.GetRegister(m); + if (M) { + m32 = ir.LogicalShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result; + } + const IR::U64 m16 = ir.SignExtendWordToLong(ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32))); + const auto product = ir.LeastSignificantWord(ir.LogicalShiftRight(ir.Mul(n32, m16), ir.Imm8(16))); + const auto result = ir.AddWithCarry(product, ir.GetRegister(a), ir.Imm1(0)); + + ir.SetRegister(d, result); + ir.OrQFlag(ir.GetOverflowFrom(result)); + return true; +} + +bool TranslatorVisitor::thumb32_SMULWY(Reg n, Reg d, bool M, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const IR::U64 n32 = ir.SignExtendWordToLong(ir.GetRegister(n)); + IR::U32 m32 = ir.GetRegister(m); + if (M) { + m32 = ir.LogicalShiftRight(m32, ir.Imm8(16), ir.Imm1(0)).result; + } + const IR::U64 m16 = ir.SignExtendWordToLong(ir.SignExtendHalfToWord(ir.LeastSignificantHalf(m32))); + const auto result = ir.LogicalShiftRight(ir.Mul(n32, m16), ir.Imm8(16)); + + ir.SetRegister(d, ir.LeastSignificantWord(result)); + return true; +} + +bool TranslatorVisitor::thumb32_USAD8(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedAbsDiffSumU8(reg_n, reg_m); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_USADA8(Reg n, Reg a, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC || a == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_a = ir.GetRegister(a); + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto tmp = ir.PackedAbsDiffSumU8(reg_n, reg_m); + const auto result = ir.AddWithCarry(reg_a, tmp, ir.Imm1(0)); + + ir.SetRegister(d, result); + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_parallel.cpp 
b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_parallel.cpp new file mode 100644 index 0000000000..654940967d --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_parallel.cpp @@ -0,0 +1,521 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { +static IR::U32 Pack2x16To1x32(A32::IREmitter& ir, IR::U32 lo, IR::U32 hi) { + return ir.Or(ir.And(lo, ir.Imm32(0xFFFF)), ir.LogicalShiftLeft(hi, ir.Imm8(16), ir.Imm1(0)).result); +} + +static IR::U16 MostSignificantHalf(A32::IREmitter& ir, IR::U32 value) { + return ir.LeastSignificantHalf(ir.LogicalShiftRight(value, ir.Imm8(16), ir.Imm1(0)).result); +} + +bool TranslatorVisitor::thumb32_SADD8(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedAddS8(reg_n, reg_m); + + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + return true; +} + +bool TranslatorVisitor::thumb32_SADD16(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedAddS16(reg_n, reg_m); + + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + return true; +} + +bool TranslatorVisitor::thumb32_SASX(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedAddSubS16(reg_n, reg_m); + + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + return true; +} + +bool TranslatorVisitor::thumb32_SSAX(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedSubAddS16(reg_n, reg_m); + + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + return true; +} + +bool TranslatorVisitor::thumb32_SSUB8(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedSubS8(reg_n, reg_m); + + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + return true; +} + +bool TranslatorVisitor::thumb32_SSUB16(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedSubS16(reg_n, reg_m); + + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + return true; +} + +bool TranslatorVisitor::thumb32_UADD8(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedAddU8(reg_n, reg_m); + + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + return true; +} + +bool TranslatorVisitor::thumb32_UADD16(Reg n, Reg d, Reg m) { + if (d == 
Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedAddU16(reg_n, reg_m); + + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + return true; +} + +bool TranslatorVisitor::thumb32_UASX(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedAddSubU16(reg_n, reg_m); + + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + return true; +} + +bool TranslatorVisitor::thumb32_USAX(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedSubAddU16(reg_n, reg_m); + + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + return true; +} + +bool TranslatorVisitor::thumb32_USUB8(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedSubU8(reg_n, reg_m); + + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + return true; +} + +bool TranslatorVisitor::thumb32_USUB16(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedSubU16(reg_n, reg_m); + + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + return true; +} + +bool TranslatorVisitor::thumb32_QADD8(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedSaturatedAddS8(reg_n, reg_m); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_QADD16(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedSaturatedAddS16(reg_n, reg_m); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_QASX(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto Rn = ir.GetRegister(n); + const auto Rm = ir.GetRegister(m); + const auto Rn_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(Rn)); + const auto Rn_hi = ir.SignExtendHalfToWord(MostSignificantHalf(ir, Rn)); + const auto Rm_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(Rm)); + const auto Rm_hi = ir.SignExtendHalfToWord(MostSignificantHalf(ir, Rm)); + const auto diff = ir.SignedSaturation(ir.Sub(Rn_lo, Rm_hi), 16).result; + const auto sum = ir.SignedSaturation(ir.Add(Rn_hi, Rm_lo), 16).result; + const auto result = Pack2x16To1x32(ir, diff, sum); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_QSAX(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto Rn = ir.GetRegister(n); + const auto Rm = ir.GetRegister(m); + const auto 
Rn_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(Rn)); + const auto Rn_hi = ir.SignExtendHalfToWord(MostSignificantHalf(ir, Rn)); + const auto Rm_lo = ir.SignExtendHalfToWord(ir.LeastSignificantHalf(Rm)); + const auto Rm_hi = ir.SignExtendHalfToWord(MostSignificantHalf(ir, Rm)); + const auto sum = ir.SignedSaturation(ir.Add(Rn_lo, Rm_hi), 16).result; + const auto diff = ir.SignedSaturation(ir.Sub(Rn_hi, Rm_lo), 16).result; + const auto result = Pack2x16To1x32(ir, sum, diff); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_QSUB8(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedSaturatedSubS8(reg_n, reg_m); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_QSUB16(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedSaturatedSubS16(reg_n, reg_m); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_UQADD8(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedSaturatedAddU8(reg_n, reg_m); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_UQADD16(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedSaturatedAddU16(reg_n, reg_m); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_UQASX(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto Rn = ir.GetRegister(n); + const auto Rm = ir.GetRegister(m); + const auto Rn_lo = ir.ZeroExtendHalfToWord(ir.LeastSignificantHalf(Rn)); + const auto Rn_hi = ir.ZeroExtendHalfToWord(MostSignificantHalf(ir, Rn)); + const auto Rm_lo = ir.ZeroExtendHalfToWord(ir.LeastSignificantHalf(Rm)); + const auto Rm_hi = ir.ZeroExtendHalfToWord(MostSignificantHalf(ir, Rm)); + const auto diff = ir.UnsignedSaturation(ir.Sub(Rn_lo, Rm_hi), 16).result; + const auto sum = ir.UnsignedSaturation(ir.Add(Rn_hi, Rm_lo), 16).result; + const auto result = Pack2x16To1x32(ir, diff, sum); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_UQSAX(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto Rn = ir.GetRegister(n); + const auto Rm = ir.GetRegister(m); + const auto Rn_lo = ir.ZeroExtendHalfToWord(ir.LeastSignificantHalf(Rn)); + const auto Rn_hi = ir.ZeroExtendHalfToWord(MostSignificantHalf(ir, Rn)); + const auto Rm_lo = ir.ZeroExtendHalfToWord(ir.LeastSignificantHalf(Rm)); + const auto Rm_hi = ir.ZeroExtendHalfToWord(MostSignificantHalf(ir, Rm)); + const auto sum = ir.UnsignedSaturation(ir.Add(Rn_lo, Rm_hi), 16).result; + const auto diff = ir.UnsignedSaturation(ir.Sub(Rn_hi, Rm_lo), 16).result; + const auto result = Pack2x16To1x32(ir, sum, diff); + + ir.SetRegister(d, result); + return true; +} + +bool 
TranslatorVisitor::thumb32_UQSUB8(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedSaturatedSubU8(reg_n, reg_m); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_UQSUB16(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedSaturatedSubU16(reg_n, reg_m); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_SHADD8(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedHalvingAddS8(reg_n, reg_m); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_SHADD16(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedHalvingAddS16(reg_n, reg_m); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_SHASX(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedHalvingAddSubS16(reg_n, reg_m); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_SHSAX(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedHalvingSubAddS16(reg_n, reg_m); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_SHSUB8(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedHalvingSubS8(reg_n, reg_m); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_SHSUB16(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedHalvingSubS16(reg_n, reg_m); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_UHADD8(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedHalvingAddU8(reg_n, reg_m); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_UHADD16(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedHalvingAddU16(reg_n, reg_m); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_UHASX(Reg 
n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedHalvingAddSubU16(reg_n, reg_m); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_UHSAX(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedHalvingSubAddU16(reg_n, reg_m); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_UHSUB8(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedHalvingSubU8(reg_n, reg_m); + + ir.SetRegister(d, result); + return true; +} + +bool TranslatorVisitor::thumb32_UHSUB16(Reg n, Reg d, Reg m) { + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) { + return UnpredictableInstruction(); + } + + const auto reg_m = ir.GetRegister(m); + const auto reg_n = ir.GetRegister(n); + const auto result = ir.PackedHalvingSubU16(reg_n, reg_m); + + ir.SetRegister(d, result); + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_store_single_data_item.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_store_single_data_item.cpp new file mode 100644 index 0000000000..93c383757e --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_store_single_data_item.cpp @@ -0,0 +1,223 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2021 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +template<typename StoreRegFn> +static bool StoreRegister(TranslatorVisitor& v, Reg n, Reg t, Imm<2> imm2, Reg m, StoreRegFn store_fn) { + if (n == Reg::PC) { + return v.UndefinedInstruction(); + } + + if (t == Reg::PC || m == Reg::PC) { + return v.UnpredictableInstruction(); + } + + const auto reg_m = v.ir.GetRegister(m); + const auto reg_n = v.ir.GetRegister(n); + const auto reg_t = v.ir.GetRegister(t); + + const auto shift_amount = v.ir.Imm8(static_cast<u8>(imm2.ZeroExtend())); + const auto offset = v.ir.LogicalShiftLeft(reg_m, shift_amount); + const auto offset_address = v.ir.Add(reg_n, offset); + + store_fn(offset_address, reg_t); + return true; +} + +using StoreImmFn = void (*)(TranslatorVisitor&, const IR::U32&, const IR::U32&); + +static void StoreImmByteFn(TranslatorVisitor& v, const IR::U32& address, const IR::U32& data) { + v.ir.WriteMemory8(address, v.ir.LeastSignificantByte(data), IR::AccType::NORMAL); +} + +static void StoreImmHalfFn(TranslatorVisitor& v, const IR::U32& address, const IR::U32& data) { + v.ir.WriteMemory16(address, v.ir.LeastSignificantHalf(data), IR::AccType::NORMAL); +} + +static void StoreImmWordFn(TranslatorVisitor& v, const IR::U32& address, const IR::U32& data) { + v.ir.WriteMemory32(address, data, IR::AccType::NORMAL); +} + +static bool StoreImmediate(TranslatorVisitor& v, Reg n, Reg t, bool P, bool U, bool W, Imm<12> imm12, StoreImmFn store_fn) { + const auto imm32 = imm12.ZeroExtend(); + const auto reg_n = v.ir.GetRegister(n); + const auto reg_t = v.ir.GetRegister(t); + + const IR::U32 offset_address = U ? v.ir.Add(reg_n, v.ir.Imm32(imm32)) + : v.ir.Sub(reg_n, v.ir.Imm32(imm32)); + const IR::U32 address = P ? offset_address + : reg_n; + + store_fn(v, address, reg_t); + if (W) { + v.ir.SetRegister(n, offset_address); + } + + return true; +} + +bool TranslatorVisitor::thumb32_STRB_imm_1(Reg n, Reg t, bool P, bool U, Imm<8> imm8) { + if (n == Reg::PC) { + return UndefinedInstruction(); + } + if (t == Reg::PC || n == t) { + return UnpredictableInstruction(); + } + return StoreImmediate(*this, n, t, P, U, true, Imm<12>{imm8.ZeroExtend()}, StoreImmByteFn); +} + +bool TranslatorVisitor::thumb32_STRB_imm_2(Reg n, Reg t, Imm<8> imm8) { + if (n == Reg::PC) { + return UndefinedInstruction(); + } + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + return StoreImmediate(*this, n, t, true, false, false, Imm<12>{imm8.ZeroExtend()}, StoreImmByteFn); +} + +bool TranslatorVisitor::thumb32_STRB_imm_3(Reg n, Reg t, Imm<12> imm12) { + if (n == Reg::PC) { + return UndefinedInstruction(); + } + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + return StoreImmediate(*this, n, t, true, true, false, imm12, StoreImmByteFn); +} + +bool TranslatorVisitor::thumb32_STRBT(Reg n, Reg t, Imm<8> imm8) { + // TODO: Add an unpredictable instruction path if this + // is executed in hypervisor mode if we ever support + // privileged execution levels. + + if (n == Reg::PC) { + return UndefinedInstruction(); + } + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + + // Treat this as a normal STRB, given we don't support + // execution levels other than EL0 currently. 
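    // StoreImmediate with P = true, U = true and W = false computes
    // address = Rn + imm8 and performs no writeback, and StoreImmByteFn stores
    // the least significant byte of Rt, i.e. an ordinary STRB <Rt>, [<Rn>, #imm8].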
+ return StoreImmediate(*this, n, t, true, true, false, Imm<12>{imm8.ZeroExtend()}, StoreImmByteFn); +} + +bool TranslatorVisitor::thumb32_STRB(Reg n, Reg t, Imm<2> imm2, Reg m) { + return StoreRegister(*this, n, t, imm2, m, [this](const IR::U32& offset_address, const IR::U32& data) { + ir.WriteMemory8(offset_address, ir.LeastSignificantByte(data), IR::AccType::NORMAL); + }); +} + +bool TranslatorVisitor::thumb32_STRH_imm_1(Reg n, Reg t, bool P, bool U, Imm<8> imm8) { + if (n == Reg::PC) { + return UndefinedInstruction(); + } + if (t == Reg::PC || n == t) { + return UnpredictableInstruction(); + } + return StoreImmediate(*this, n, t, P, U, true, Imm<12>{imm8.ZeroExtend()}, StoreImmHalfFn); +} + +bool TranslatorVisitor::thumb32_STRH_imm_2(Reg n, Reg t, Imm<8> imm8) { + if (n == Reg::PC) { + return UndefinedInstruction(); + } + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + return StoreImmediate(*this, n, t, true, false, false, Imm<12>{imm8.ZeroExtend()}, StoreImmHalfFn); +} + +bool TranslatorVisitor::thumb32_STRH_imm_3(Reg n, Reg t, Imm<12> imm12) { + if (n == Reg::PC) { + return UndefinedInstruction(); + } + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + return StoreImmediate(*this, n, t, true, true, false, imm12, StoreImmHalfFn); +} + +bool TranslatorVisitor::thumb32_STRHT(Reg n, Reg t, Imm<8> imm8) { + // TODO: Add an unpredictable instruction path if this + // is executed in hypervisor mode if we ever support + // privileged execution levels. + + if (n == Reg::PC) { + return UndefinedInstruction(); + } + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + + // Treat this as a normal STRH, given we don't support + // execution levels other than EL0 currently. + return StoreImmediate(*this, n, t, true, true, false, Imm<12>{imm8.ZeroExtend()}, StoreImmHalfFn); +} + +bool TranslatorVisitor::thumb32_STRH(Reg n, Reg t, Imm<2> imm2, Reg m) { + return StoreRegister(*this, n, t, imm2, m, [this](const IR::U32& offset_address, const IR::U32& data) { + ir.WriteMemory16(offset_address, ir.LeastSignificantHalf(data), IR::AccType::NORMAL); + }); +} + +bool TranslatorVisitor::thumb32_STR_imm_1(Reg n, Reg t, bool P, bool U, Imm<8> imm8) { + if (n == Reg::PC) { + return UndefinedInstruction(); + } + if (t == Reg::PC || n == t) { + return UnpredictableInstruction(); + } + return StoreImmediate(*this, n, t, P, U, true, Imm<12>{imm8.ZeroExtend()}, StoreImmWordFn); +} + +bool TranslatorVisitor::thumb32_STR_imm_2(Reg n, Reg t, Imm<8> imm8) { + if (n == Reg::PC) { + return UndefinedInstruction(); + } + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + return StoreImmediate(*this, n, t, true, false, false, Imm<12>{imm8.ZeroExtend()}, StoreImmWordFn); +} + +bool TranslatorVisitor::thumb32_STR_imm_3(Reg n, Reg t, Imm<12> imm12) { + if (n == Reg::PC) { + return UndefinedInstruction(); + } + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + return StoreImmediate(*this, n, t, true, true, false, imm12, StoreImmWordFn); +} + +bool TranslatorVisitor::thumb32_STRT(Reg n, Reg t, Imm<8> imm8) { + // TODO: Add an unpredictable instruction path if this + // is executed in hypervisor mode if we ever support + // privileged execution levels. + + if (n == Reg::PC) { + return UndefinedInstruction(); + } + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + + // Treat this as a normal STR, given we don't support + // execution levels other than EL0 currently. 
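    // As with STRBT and STRHT above, (P, U, W) = (true, true, false) reduces
    // this to a plain STR <Rt>, [<Rn>, #imm8] with no writeback.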
+ return StoreImmediate(*this, n, t, true, true, false, Imm<12>{imm8.ZeroExtend()}, StoreImmWordFn); +} + +bool TranslatorVisitor::thumb32_STR_reg(Reg n, Reg t, Imm<2> imm2, Reg m) { + return StoreRegister(*this, n, t, imm2, m, [this](const IR::U32& offset_address, const IR::U32& data) { + ir.WriteMemory32(offset_address, data, IR::AccType::NORMAL); + }); +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/vfp.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/vfp.cpp new file mode 100644 index 0000000000..a4e37f747a --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/vfp.cpp @@ -0,0 +1,1489 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <array> + +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" + +namespace Dynarmic::A32 { + +template<typename FnT> +bool TranslatorVisitor::EmitVfpVectorOperation(bool sz, ExtReg d, ExtReg n, ExtReg m, const FnT& fn) { + if (!ir.current_location.FPSCR().Stride()) { + return UnpredictableInstruction(); + } + + // VFP register banks are 8 single-precision registers in size. + const size_t register_bank_size = sz ? 4 : 8; + size_t vector_length = ir.current_location.FPSCR().Len(); + const size_t vector_stride = *ir.current_location.FPSCR().Stride(); + + // Unpredictable case + if (vector_stride * vector_length > register_bank_size) { + return UnpredictableInstruction(); + } + + // Scalar case + if (vector_length == 1) { + if (vector_stride != 1) { + return UnpredictableInstruction(); + } + + fn(d, n, m); + return true; + } + + // The VFP register file is divided into banks each containing: + // * eight single-precision registers, or + // * four double-precision registers. + // VFP vector instructions access these registers in a circular manner. + const auto bank_increment = [register_bank_size](ExtReg reg, size_t stride) -> ExtReg { + const auto reg_number = static_cast<size_t>(reg); + const auto bank_index = reg_number % register_bank_size; + const auto bank_start = reg_number - bank_index; + const auto next_reg_number = bank_start + ((bank_index + stride) % register_bank_size); + return static_cast<ExtReg>(next_reg_number); + }; + + // The first and fifth banks in the register file are scalar banks. + // All the other banks are vector banks. + const auto belongs_to_scalar_bank = [](ExtReg reg) -> bool { + return (reg >= ExtReg::D0 && reg <= ExtReg::D3) + || (reg >= ExtReg::D16 && reg <= ExtReg::D19) + || (reg >= ExtReg::S0 && reg <= ExtReg::S7); + }; + + const bool d_is_scalar = belongs_to_scalar_bank(d); + const bool m_is_scalar = belongs_to_scalar_bank(m); + + if (d_is_scalar) { + // If destination register is in a scalar bank, the operands and results are all scalars. 
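+        // Forcing the length to 1 degrades the loop below to a single scalar
+        // operation. (When only m lies in a scalar bank, the loop instead keeps
+        // m fixed while d and n step through their banks by the vector stride.)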
+ vector_length = 1; + } + + for (size_t i = 0; i < vector_length; i++) { + fn(d, n, m); + + d = bank_increment(d, vector_stride); + n = bank_increment(n, vector_stride); + if (!m_is_scalar) { + m = bank_increment(m, vector_stride); + } + } + + return true; +} + +template<typename FnT> +bool TranslatorVisitor::EmitVfpVectorOperation(bool sz, ExtReg d, ExtReg m, const FnT& fn) { + return EmitVfpVectorOperation(sz, d, ExtReg::S0, m, [fn](ExtReg d, ExtReg, ExtReg m) { + fn(d, m); + }); +} + +// VADD<c>.F64 <Dd>, <Dn>, <Dm> +// VADD<c>.F32 <Sd>, <Sn>, <Sm> +bool TranslatorVisitor::vfp_VADD(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(sz, Vd, D); + const auto n = ToExtReg(sz, Vn, N); + const auto m = ToExtReg(sz, Vm, M); + + return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) { + const auto reg_n = ir.GetExtendedRegister(n); + const auto reg_m = ir.GetExtendedRegister(m); + const auto result = ir.FPAdd(reg_n, reg_m); + ir.SetExtendedRegister(d, result); + }); +} + +// VSUB<c>.F64 <Dd>, <Dn>, <Dm> +// VSUB<c>.F32 <Sd>, <Sn>, <Sm> +bool TranslatorVisitor::vfp_VSUB(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(sz, Vd, D); + const auto n = ToExtReg(sz, Vn, N); + const auto m = ToExtReg(sz, Vm, M); + + return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) { + const auto reg_n = ir.GetExtendedRegister(n); + const auto reg_m = ir.GetExtendedRegister(m); + const auto result = ir.FPSub(reg_n, reg_m); + ir.SetExtendedRegister(d, result); + }); +} + +// VMUL<c>.F64 <Dd>, <Dn>, <Dm> +// VMUL<c>.F32 <Sd>, <Sn>, <Sm> +bool TranslatorVisitor::vfp_VMUL(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(sz, Vd, D); + const auto n = ToExtReg(sz, Vn, N); + const auto m = ToExtReg(sz, Vm, M); + + return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) { + const auto reg_n = ir.GetExtendedRegister(n); + const auto reg_m = ir.GetExtendedRegister(m); + const auto result = ir.FPMul(reg_n, reg_m); + ir.SetExtendedRegister(d, result); + }); +} + +// VMLA<c>.F64 <Dd>, <Dn>, <Dm> +// VMLA<c>.F32 <Sd>, <Sn>, <Sm> +bool TranslatorVisitor::vfp_VMLA(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(sz, Vd, D); + const auto n = ToExtReg(sz, Vn, N); + const auto m = ToExtReg(sz, Vm, M); + + return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) { + const auto reg_n = ir.GetExtendedRegister(n); + const auto reg_m = ir.GetExtendedRegister(m); + const auto reg_d = ir.GetExtendedRegister(d); + const auto result = ir.FPAdd(reg_d, ir.FPMul(reg_n, reg_m)); + ir.SetExtendedRegister(d, result); + }); +} + +// VMLS<c>.F64 <Dd>, <Dn>, <Dm> +// VMLS<c>.F32 <Sd>, <Sn>, <Sm> +bool TranslatorVisitor::vfp_VMLS(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(sz, Vd, D); + const auto n = ToExtReg(sz, Vn, N); + const auto m = ToExtReg(sz, Vm, M); + + return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) { + const auto reg_n = ir.GetExtendedRegister(n); + const auto 
reg_m = ir.GetExtendedRegister(m); + const auto reg_d = ir.GetExtendedRegister(d); + const auto result = ir.FPAdd(reg_d, ir.FPNeg(ir.FPMul(reg_n, reg_m))); + ir.SetExtendedRegister(d, result); + }); +} + +// VNMUL<c>.F64 <Dd>, <Dn>, <Dm> +// VNMUL<c>.F32 <Sd>, <Sn>, <Sm> +bool TranslatorVisitor::vfp_VNMUL(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(sz, Vd, D); + const auto n = ToExtReg(sz, Vn, N); + const auto m = ToExtReg(sz, Vm, M); + + return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) { + const auto reg_n = ir.GetExtendedRegister(n); + const auto reg_m = ir.GetExtendedRegister(m); + const auto result = ir.FPNeg(ir.FPMul(reg_n, reg_m)); + ir.SetExtendedRegister(d, result); + }); +} + +// VNMLA<c>.F64 <Dd>, <Dn>, <Dm> +// VNMLA<c>.F32 <Sd>, <Sn>, <Sm> +bool TranslatorVisitor::vfp_VNMLA(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(sz, Vd, D); + const auto n = ToExtReg(sz, Vn, N); + const auto m = ToExtReg(sz, Vm, M); + + return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) { + const auto reg_n = ir.GetExtendedRegister(n); + const auto reg_m = ir.GetExtendedRegister(m); + const auto reg_d = ir.GetExtendedRegister(d); + const auto result = ir.FPAdd(ir.FPNeg(reg_d), ir.FPNeg(ir.FPMul(reg_n, reg_m))); + ir.SetExtendedRegister(d, result); + }); +} + +// VNMLS<c>.F64 <Dd>, <Dn>, <Dm> +// VNMLS<c>.F32 <Sd>, <Sn>, <Sm> +bool TranslatorVisitor::vfp_VNMLS(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(sz, Vd, D); + const auto n = ToExtReg(sz, Vn, N); + const auto m = ToExtReg(sz, Vm, M); + + return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) { + const auto reg_n = ir.GetExtendedRegister(n); + const auto reg_m = ir.GetExtendedRegister(m); + const auto reg_d = ir.GetExtendedRegister(d); + const auto result = ir.FPAdd(ir.FPNeg(reg_d), ir.FPMul(reg_n, reg_m)); + ir.SetExtendedRegister(d, result); + }); +} + +// VDIV<c>.F64 <Dd>, <Dn>, <Dm> +// VDIV<c>.F32 <Sd>, <Sn>, <Sm> +bool TranslatorVisitor::vfp_VDIV(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(sz, Vd, D); + const auto n = ToExtReg(sz, Vn, N); + const auto m = ToExtReg(sz, Vm, M); + + return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) { + const auto reg_n = ir.GetExtendedRegister(n); + const auto reg_m = ir.GetExtendedRegister(m); + const auto result = ir.FPDiv(reg_n, reg_m); + ir.SetExtendedRegister(d, result); + }); +} + +// VFNMS<c>.F64 <Dd>, <Dn>, <Dm> +// VFNMS<c>.F32 <Sd>, <Sn>, <Sm> +bool TranslatorVisitor::vfp_VFNMS(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(sz, Vd, D); + const auto n = ToExtReg(sz, Vn, N); + const auto m = ToExtReg(sz, Vm, M); + + return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) { + const auto reg_n = ir.GetExtendedRegister(n); + const auto reg_m = ir.GetExtendedRegister(m); + const auto reg_d = ir.GetExtendedRegister(d); + const auto result = ir.FPMulAdd(ir.FPNeg(reg_d), reg_n, reg_m); + 
ir.SetExtendedRegister(d, result); + }); +} + +// VFNMA<c>.F64 <Dd>, <Dn>, <Dm> +// VFNMA<c>.F32 <Sd>, <Sn>, <Sm> +bool TranslatorVisitor::vfp_VFNMA(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(sz, Vd, D); + const auto n = ToExtReg(sz, Vn, N); + const auto m = ToExtReg(sz, Vm, M); + + return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) { + const auto reg_n = ir.GetExtendedRegister(n); + const auto reg_m = ir.GetExtendedRegister(m); + const auto reg_d = ir.GetExtendedRegister(d); + const auto result = ir.FPMulSub(ir.FPNeg(reg_d), reg_n, reg_m); + ir.SetExtendedRegister(d, result); + }); +} + +// VFMA<c>.F64 <Dd>, <Dn>, <Dm> +// VFMA<c>.F32 <Sd>, <Sn>, <Sm> +bool TranslatorVisitor::vfp_VFMA(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(sz, Vd, D); + const auto n = ToExtReg(sz, Vn, N); + const auto m = ToExtReg(sz, Vm, M); + + return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) { + const auto reg_n = ir.GetExtendedRegister(n); + const auto reg_m = ir.GetExtendedRegister(m); + const auto reg_d = ir.GetExtendedRegister(d); + const auto result = ir.FPMulAdd(reg_d, reg_n, reg_m); + ir.SetExtendedRegister(d, result); + }); +} + +// VFMS<c>.F64 <Dd>, <Dn>, <Dm> +// VFMS<c>.F32 <Sd>, <Sn>, <Sm> +bool TranslatorVisitor::vfp_VFMS(Cond cond, bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(sz, Vd, D); + const auto n = ToExtReg(sz, Vn, N); + const auto m = ToExtReg(sz, Vm, M); + + return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) { + const auto reg_n = ir.GetExtendedRegister(n); + const auto reg_m = ir.GetExtendedRegister(m); + const auto reg_d = ir.GetExtendedRegister(d); + const auto result = ir.FPMulSub(reg_d, reg_n, reg_m); + ir.SetExtendedRegister(d, result); + }); +} + +// VSEL<c>.F64 <Dd>, <Dn>, <Dm> +// VSEL<c>.F32 <Sd>, <Sn>, <Sm> +bool TranslatorVisitor::vfp_VSEL(bool D, Imm<2> cc, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + const Cond cond = concatenate(cc, Imm<1>{cc.Bit<0>() != cc.Bit<1>()}, Imm<1>{0}).ZeroExtend<Cond>(); + + const auto d = ToExtReg(sz, Vd, D); + const auto n = ToExtReg(sz, Vn, N); + const auto m = ToExtReg(sz, Vm, M); + + return EmitVfpVectorOperation(sz, d, n, m, [this, cond](ExtReg d, ExtReg n, ExtReg m) { + const auto reg_n = ir.GetExtendedRegister(n); + const auto reg_m = ir.GetExtendedRegister(m); + const auto result = ir.ConditionalSelect(cond, reg_n, reg_m); + ir.SetExtendedRegister(d, result); + }); +} + +// VMAXNM.F64 <Dd>, <Dn>, <Dm> +// VMAXNM.F32 <Sd>, <Sn>, <Sm> +bool TranslatorVisitor::vfp_VMAXNM(bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + const auto d = ToExtReg(sz, Vd, D); + const auto n = ToExtReg(sz, Vn, N); + const auto m = ToExtReg(sz, Vm, M); + + return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) { + const auto reg_n = ir.GetExtendedRegister(n); + const auto reg_m = ir.GetExtendedRegister(m); + const auto result = ir.FPMaxNumeric(reg_n, reg_m); + ir.SetExtendedRegister(d, result); + }); +} + +// VMINNM.F64 <Dd>, <Dn>, <Dm> +// VMINNM.F32 <Sd>, <Sn>, <Sm> +bool TranslatorVisitor::vfp_VMINNM(bool D, size_t Vn, size_t Vd, bool sz, bool N, bool M, size_t Vm) { + const auto d 
= ToExtReg(sz, Vd, D); + const auto n = ToExtReg(sz, Vn, N); + const auto m = ToExtReg(sz, Vm, M); + + return EmitVfpVectorOperation(sz, d, n, m, [this](ExtReg d, ExtReg n, ExtReg m) { + const auto reg_n = ir.GetExtendedRegister(n); + const auto reg_m = ir.GetExtendedRegister(m); + const auto result = ir.FPMinNumeric(reg_n, reg_m); + ir.SetExtendedRegister(d, result); + }); +} + +// VMOV<c>.32 <Dd[0]>, <Rt> +bool TranslatorVisitor::vfp_VMOV_u32_f64(Cond cond, size_t Vd, Reg t, bool D) { + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(true, Vd, D); + const auto reg_d = ir.GetExtendedRegister(d); + const auto reg_t = ir.GetRegister(t); + const auto result = ir.Pack2x32To1x64(reg_t, ir.MostSignificantWord(reg_d).result); + + ir.SetExtendedRegister(d, result); + return true; +} + +// VMOV<c>.32 <Rt>, <Dn[0]> +bool TranslatorVisitor::vfp_VMOV_f64_u32(Cond cond, size_t Vn, Reg t, bool N) { + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto n = ToExtReg(true, Vn, N); + const auto reg_n = ir.GetExtendedRegister(n); + ir.SetRegister(t, ir.LeastSignificantWord(reg_n)); + return true; +} + +// VMOV<c> <Sn>, <Rt> +bool TranslatorVisitor::vfp_VMOV_u32_f32(Cond cond, size_t Vn, Reg t, bool N) { + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto n = ToExtReg(false, Vn, N); + ir.SetExtendedRegister(n, ir.GetRegister(t)); + return true; +} + +// VMOV<c> <Rt>, <Sn> +bool TranslatorVisitor::vfp_VMOV_f32_u32(Cond cond, size_t Vn, Reg t, bool N) { + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto n = ToExtReg(false, Vn, N); + ir.SetRegister(t, ir.GetExtendedRegister(n)); + return true; +} + +// VMOV<c> <Sm>, <Sm1>, <Rt>, <Rt2> +bool TranslatorVisitor::vfp_VMOV_2u32_2f32(Cond cond, Reg t2, Reg t, bool M, size_t Vm) { + const auto m = ToExtReg(false, Vm, M); + if (t == Reg::PC || t2 == Reg::PC || m == ExtReg::S31) { + return UnpredictableInstruction(); + } + + if (!VFPConditionPassed(cond)) { + return true; + } + + ir.SetExtendedRegister(m, ir.GetRegister(t)); + ir.SetExtendedRegister(m + 1, ir.GetRegister(t2)); + return true; +} + +// VMOV<c> <Rt>, <Rt2>, <Sm>, <Sm1> +bool TranslatorVisitor::vfp_VMOV_2f32_2u32(Cond cond, Reg t2, Reg t, bool M, size_t Vm) { + const auto m = ToExtReg(false, Vm, M); + if (t == Reg::PC || t2 == Reg::PC || m == ExtReg::S31) { + return UnpredictableInstruction(); + } + + if (t == t2) { + return UnpredictableInstruction(); + } + + if (!VFPConditionPassed(cond)) { + return true; + } + + ir.SetRegister(t, ir.GetExtendedRegister(m)); + ir.SetRegister(t2, ir.GetExtendedRegister(m + 1)); + return true; +} + +// VMOV<c> <Dm>, <Rt>, <Rt2> +bool TranslatorVisitor::vfp_VMOV_2u32_f64(Cond cond, Reg t2, Reg t, bool M, size_t Vm) { + const auto m = ToExtReg(true, Vm, M); + if (t == Reg::PC || t2 == Reg::PC || m == ExtReg::S31) { + return UnpredictableInstruction(); + } + + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto value = ir.Pack2x32To1x64(ir.GetRegister(t), ir.GetRegister(t2)); + ir.SetExtendedRegister(m, value); + return true; +} + +// VMOV<c> <Rt>, <Rt2>, <Dm> +bool TranslatorVisitor::vfp_VMOV_f64_2u32(Cond cond, Reg t2, Reg t, bool M, size_t Vm) { + const auto m = ToExtReg(true, Vm, M); + if (t == Reg::PC || t2 == Reg::PC || 
m == ExtReg::S31) { + return UnpredictableInstruction(); + } + + if (t == t2) { + return UnpredictableInstruction(); + } + + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto value = ir.GetExtendedRegister(m); + ir.SetRegister(t, ir.LeastSignificantWord(value)); + ir.SetRegister(t2, ir.MostSignificantWord(value).result); + return true; +} + +// VMOV<c>.32 <Dn[x]>, <Rt> +bool TranslatorVisitor::vfp_VMOV_from_i32(Cond cond, Imm<1> i, size_t Vd, Reg t, bool D) { + if (!VFPConditionPassed(cond)) { + return true; + } + + if (t == Reg::R15) { + // TODO: v8 removes UPREDICTABLE for R13 + return UnpredictableInstruction(); + } + + const size_t index = i.ZeroExtend(); + const auto d = ToVector(false, Vd, D); + + const auto reg_d = ir.GetVector(d); + const auto scalar = ir.GetRegister(t); + const auto result = ir.VectorSetElement(32, reg_d, index, scalar); + + ir.SetVector(d, result); + return true; +} + +// VMOV<c>.16 <Dn[x]>, <Rt> +bool TranslatorVisitor::vfp_VMOV_from_i16(Cond cond, Imm<1> i1, size_t Vd, Reg t, bool D, Imm<1> i2) { + if (!VFPConditionPassed(cond)) { + return true; + } + + if (t == Reg::R15) { + // TODO: v8 removes UPREDICTABLE for R13 + return UnpredictableInstruction(); + } + + const size_t index = concatenate(i1, i2).ZeroExtend(); + const auto d = ToVector(false, Vd, D); + + const auto reg_d = ir.GetVector(d); + const auto scalar = ir.LeastSignificantHalf(ir.GetRegister(t)); + const auto result = ir.VectorSetElement(16, reg_d, index, scalar); + + ir.SetVector(d, result); + return true; +} + +// VMOV<c>.8 <Dn[x]>, <Rt> +bool TranslatorVisitor::vfp_VMOV_from_i8(Cond cond, Imm<1> i1, size_t Vd, Reg t, bool D, Imm<2> i2) { + if (!VFPConditionPassed(cond)) { + return true; + } + + if (t == Reg::R15) { + // TODO: v8 removes UPREDICTABLE for R13 + return UnpredictableInstruction(); + } + + const size_t index = concatenate(i1, i2).ZeroExtend(); + const auto d = ToVector(false, Vd, D); + + const auto reg_d = ir.GetVector(d); + const auto scalar = ir.LeastSignificantByte(ir.GetRegister(t)); + const auto result = ir.VectorSetElement(8, reg_d, index, scalar); + + ir.SetVector(d, result); + return true; +} + +// VMOV<c>.32 <Rt>, <Dn[x]> +bool TranslatorVisitor::vfp_VMOV_to_i32(Cond cond, Imm<1> i, size_t Vn, Reg t, bool N) { + if (!VFPConditionPassed(cond)) { + return true; + } + + if (t == Reg::R15) { + // TODO: v8 removes UPREDICTABLE for R13 + return UnpredictableInstruction(); + } + + const size_t index = i.ZeroExtend(); + const auto n = ToVector(false, Vn, N); + + const auto reg_n = ir.GetVector(n); + const auto result = ir.VectorGetElement(32, reg_n, index); + + ir.SetRegister(t, result); + return true; +} + +// VMOV<c>.{U16,S16} <Rt>, <Dn[x]> +bool TranslatorVisitor::vfp_VMOV_to_i16(Cond cond, bool U, Imm<1> i1, size_t Vn, Reg t, bool N, Imm<1> i2) { + if (!VFPConditionPassed(cond)) { + return true; + } + + if (t == Reg::R15) { + // TODO: v8 removes UPREDICTABLE for R13 + return UnpredictableInstruction(); + } + + const size_t index = concatenate(i1, i2).ZeroExtend(); + const auto n = ToVector(false, Vn, N); + + const auto reg_n = ir.GetVector(n); + const auto scalar = ir.VectorGetElement(16, reg_n, index); + const auto result = U ? 
ir.ZeroExtendToWord(scalar) : ir.SignExtendToWord(scalar); + + ir.SetRegister(t, result); + return true; +} + +// VMOV<c>.{U8,S8} <Rt>, <Dn[x]> +bool TranslatorVisitor::vfp_VMOV_to_i8(Cond cond, bool U, Imm<1> i1, size_t Vn, Reg t, bool N, Imm<2> i2) { + if (!VFPConditionPassed(cond)) { + return true; + } + + if (t == Reg::R15) { + // TODO: v8 removes UPREDICTABLE for R13 + return UnpredictableInstruction(); + } + + const size_t index = concatenate(i1, i2).ZeroExtend(); + const auto n = ToVector(false, Vn, N); + + const auto reg_n = ir.GetVector(n); + const auto scalar = ir.VectorGetElement(8, reg_n, index); + const auto result = U ? ir.ZeroExtendToWord(scalar) : ir.SignExtendToWord(scalar); + + ir.SetRegister(t, result); + return true; +} + +// VDUP<c>.{8,16,32} <Qd>, <Rt> +// VDUP<c>.{8,16,32} <Dd>, <Rt> +bool TranslatorVisitor::vfp_VDUP(Cond cond, Imm<1> B, bool Q, size_t Vd, Reg t, bool D, Imm<1> E) { + if (!VFPConditionPassed(cond)) { + return true; + } + + if (Q && mcl::bit::get_bit<0>(Vd)) { + return UndefinedInstruction(); + } + if (t == Reg::R15) { + return UnpredictableInstruction(); + } + + const auto d = ToVector(Q, Vd, D); + const size_t BE = concatenate(B, E).ZeroExtend(); + const size_t esize = 32u >> BE; + + if (BE == 0b11) { + return UndefinedInstruction(); + } + + const auto scalar = ir.LeastSignificant(esize, ir.GetRegister(t)); + const auto result = ir.VectorBroadcast(esize, scalar); + ir.SetVector(d, result); + return true; +} + +// VMOV<c>.F64 <Dd>, #<imm> +// VMOV<c>.F32 <Sd>, #<imm> +bool TranslatorVisitor::vfp_VMOV_imm(Cond cond, bool D, Imm<4> imm4H, size_t Vd, bool sz, Imm<4> imm4L) { + if (!VFPConditionPassed(cond)) { + return true; + } + + if (ir.current_location.FPSCR().Stride() != 1 || ir.current_location.FPSCR().Len() != 1) { + return UndefinedInstruction(); + } + + const auto d = ToExtReg(sz, Vd, D); + const auto imm8 = concatenate(imm4H, imm4L); + + if (sz) { + const u64 sign = static_cast<u64>(imm8.Bit<7>()); + const u64 exp = (imm8.Bit<6>() ? 0x3FC : 0x400) | imm8.Bits<4, 5, u64>(); + const u64 fract = imm8.Bits<0, 3, u64>() << 48; + const u64 immediate = (sign << 63) | (exp << 52) | fract; + ir.SetExtendedRegister(d, ir.Imm64(immediate)); + } else { + const u32 sign = static_cast<u32>(imm8.Bit<7>()); + const u32 exp = (imm8.Bit<6>() ? 
0x7C : 0x80) | imm8.Bits<4, 5>(); + const u32 fract = imm8.Bits<0, 3>() << 19; + const u32 immediate = (sign << 31) | (exp << 23) | fract; + ir.SetExtendedRegister(d, ir.Imm32(immediate)); + } + return true; +} + +// VMOV<c>.F64 <Dd>, <Dm> +// VMOV<c>.F32 <Sd>, <Sm> +bool TranslatorVisitor::vfp_VMOV_reg(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(sz, Vd, D); + const auto m = ToExtReg(sz, Vm, M); + + return EmitVfpVectorOperation(sz, d, m, [this](ExtReg d, ExtReg m) { + ir.SetExtendedRegister(d, ir.GetExtendedRegister(m)); + }); +} + +// VABS<c>.F64 <Dd>, <Dm> +// VABS<c>.F32 <Sd>, <Sm> +bool TranslatorVisitor::vfp_VABS(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(sz, Vd, D); + const auto m = ToExtReg(sz, Vm, M); + + return EmitVfpVectorOperation(sz, d, m, [this](ExtReg d, ExtReg m) { + const auto reg_m = ir.GetExtendedRegister(m); + const auto result = ir.FPAbs(reg_m); + ir.SetExtendedRegister(d, result); + }); +} + +// VNEG<c>.F64 <Dd>, <Dm> +// VNEG<c>.F32 <Sd>, <Sm> +bool TranslatorVisitor::vfp_VNEG(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(sz, Vd, D); + const auto m = ToExtReg(sz, Vm, M); + + return EmitVfpVectorOperation(sz, d, m, [this](ExtReg d, ExtReg m) { + const auto reg_m = ir.GetExtendedRegister(m); + const auto result = ir.FPNeg(reg_m); + ir.SetExtendedRegister(d, result); + }); +} + +// VSQRT<c>.F64 <Dd>, <Dm> +// VSQRT<c>.F32 <Sd>, <Sm> +bool TranslatorVisitor::vfp_VSQRT(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(sz, Vd, D); + const auto m = ToExtReg(sz, Vm, M); + + return EmitVfpVectorOperation(sz, d, m, [this](ExtReg d, ExtReg m) { + const auto reg_m = ir.GetExtendedRegister(m); + const auto result = ir.FPSqrt(reg_m); + ir.SetExtendedRegister(d, result); + }); +} + +// VCVTB<c>.f32.f16 <Dd>, <Dm> +// VCVTB<c>.f64.f16 <Dd>, <Dm> +// VCVTB<c>.f16.f32 <Dd>, <Dm> +// VCVTB<c>.f16.f64 <Dd>, <Dm> +bool TranslatorVisitor::vfp_VCVTB(Cond cond, bool D, bool op, size_t Vd, bool sz, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const bool convert_from_half = !op; + const auto rounding_mode = ir.current_location.FPSCR().RMode(); + if (convert_from_half) { + const auto d = ToExtReg(sz, Vd, D); + const auto m = ToExtReg(false, Vm, M); + + return EmitVfpVectorOperation(sz, d, m, [this, sz, rounding_mode](ExtReg d, ExtReg m) { + const auto reg_m = ir.LeastSignificantHalf(ir.GetExtendedRegister(m)); + const auto result = sz ? IR::U32U64{ir.FPHalfToDouble(reg_m, rounding_mode)} : IR::U32U64{ir.FPHalfToSingle(reg_m, rounding_mode)}; + ir.SetExtendedRegister(d, result); + }); + } else { + const auto d = ToExtReg(false, Vd, D); + const auto m = ToExtReg(sz, Vm, M); + + return EmitVfpVectorOperation(sz, d, m, [this, sz, rounding_mode](ExtReg d, ExtReg m) { + const auto reg_m = ir.GetExtendedRegister(m); + const auto result = sz ? 
ir.FPDoubleToHalf(reg_m, rounding_mode) : ir.FPSingleToHalf(reg_m, rounding_mode); + ir.SetExtendedRegister(d, ir.Or(ir.And(ir.GetExtendedRegister(d), ir.Imm32(0xFFFF0000)), ir.ZeroExtendToWord(result))); + }); + } +} + +// VCVTT<c>.f32.f16 <Dd>, <Dm> +// VCVTT<c>.f64.f16 <Dd>, <Dm> +// VCVTT<c>.f16.f32 <Dd>, <Dm> +// VCVTT<c>.f16.f64 <Dd>, <Dm> +bool TranslatorVisitor::vfp_VCVTT(Cond cond, bool D, bool op, size_t Vd, bool sz, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const bool convert_from_half = !op; + const auto rounding_mode = ir.current_location.FPSCR().RMode(); + if (convert_from_half) { + const auto d = ToExtReg(sz, Vd, D); + const auto m = ToExtReg(false, Vm, M); + + return EmitVfpVectorOperation(sz, d, m, [this, sz, rounding_mode](ExtReg d, ExtReg m) { + const auto reg_m = ir.LeastSignificantHalf(ir.LogicalShiftRight(ir.GetExtendedRegister(m), ir.Imm8(16))); + const auto result = sz ? IR::U32U64{ir.FPHalfToDouble(reg_m, rounding_mode)} : IR::U32U64{ir.FPHalfToSingle(reg_m, rounding_mode)}; + ir.SetExtendedRegister(d, result); + }); + } else { + const auto d = ToExtReg(false, Vd, D); + const auto m = ToExtReg(sz, Vm, M); + + return EmitVfpVectorOperation(sz, d, m, [this, sz, rounding_mode](ExtReg d, ExtReg m) { + const auto reg_m = ir.GetExtendedRegister(m); + const auto result = sz ? ir.FPDoubleToHalf(reg_m, rounding_mode) : ir.FPSingleToHalf(reg_m, rounding_mode); + ir.SetExtendedRegister(d, ir.Or(ir.And(ir.GetExtendedRegister(d), ir.Imm32(0x0000FFFF)), ir.LogicalShiftLeft(ir.ZeroExtendToWord(result), ir.Imm8(16)))); + }); + } +} + +// VCMP{E}.F32 <Sd>, <Sm> +// VCMP{E}.F64 <Dd>, <Dm> +bool TranslatorVisitor::vfp_VCMP(Cond cond, bool D, size_t Vd, bool sz, bool E, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(sz, Vd, D); + const auto m = ToExtReg(sz, Vm, M); + const auto exc_on_qnan = E; + const auto reg_d = ir.GetExtendedRegister(d); + const auto reg_m = ir.GetExtendedRegister(m); + const auto nzcv = ir.FPCompare(reg_d, reg_m, exc_on_qnan); + + ir.SetFpscrNZCV(nzcv); + return true; +} + +// VCMP{E}.F32 <Sd>, #0.0 +// VCMP{E}.F64 <Dd>, #0.0 +bool TranslatorVisitor::vfp_VCMP_zero(Cond cond, bool D, size_t Vd, bool sz, bool E) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(sz, Vd, D); + const auto exc_on_qnan = E; + const auto reg_d = ir.GetExtendedRegister(d); + + if (sz) { + const auto nzcv = ir.FPCompare(reg_d, ir.Imm64(0), exc_on_qnan); + ir.SetFpscrNZCV(nzcv); + } else { + const auto nzcv = ir.FPCompare(reg_d, ir.Imm32(0), exc_on_qnan); + ir.SetFpscrNZCV(nzcv); + } + + return true; +} + +// VRINTR.{F16,F32} <Sd>, <Sm> +// VRINTR.F64 <Dd>, <Dm> +bool TranslatorVisitor::vfp_VRINTR(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(sz, Vd, D); + const auto m = ToExtReg(sz, Vm, M); + const auto reg_m = ir.GetExtendedRegister(m); + const auto rounding_mode = ir.current_location.FPSCR().RMode(); + + const auto result = ir.FPRoundInt(reg_m, rounding_mode, false); + ir.SetExtendedRegister(d, result); + return true; +} + +// VRINTZ.{F16,F32} <Sd>, <Sm> +// VRINTZ.F64 <Dd>, <Dm> +bool TranslatorVisitor::vfp_VRINTZ(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(sz, Vd, D); + const auto m = ToExtReg(sz, Vm, M); + const auto reg_m = ir.GetExtendedRegister(m); + const auto 
rounding_mode = FP::RoundingMode::TowardsZero; + + const auto result = ir.FPRoundInt(reg_m, rounding_mode, false); + ir.SetExtendedRegister(d, result); + return true; +} + +// VRINTX.{F16,F32} <Sd>, <Sm> +// VRINTX.F64 <Dd>, <Dm> +bool TranslatorVisitor::vfp_VRINTX(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(sz, Vd, D); + const auto m = ToExtReg(sz, Vm, M); + const auto reg_m = ir.GetExtendedRegister(m); + const auto rounding_mode = ir.current_location.FPSCR().RMode(); + + const auto result = ir.FPRoundInt(reg_m, rounding_mode, true); + ir.SetExtendedRegister(d, result); + return true; +} + +// VCVT<c>.F64.F32 <Dd>, <Sm> +// VCVT<c>.F32.F64 <Sd>, <Dm> +bool TranslatorVisitor::vfp_VCVT_f_to_f(Cond cond, bool D, size_t Vd, bool sz, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(!sz, Vd, D); // Destination is of opposite size to source + const auto m = ToExtReg(sz, Vm, M); + const auto reg_m = ir.GetExtendedRegister(m); + const auto rounding_mode = ir.current_location.FPSCR().RMode(); + + if (sz) { + const auto result = ir.FPDoubleToSingle(reg_m, rounding_mode); + ir.SetExtendedRegister(d, result); + } else { + const auto result = ir.FPSingleToDouble(reg_m, rounding_mode); + ir.SetExtendedRegister(d, result); + } + + return true; +} + +// VCVT.F32.{S32,U32} <Sd>, <Sm> +// VCVT.F64.{S32,U32} <Sd>, <Dm> +bool TranslatorVisitor::vfp_VCVT_from_int(Cond cond, bool D, size_t Vd, bool sz, bool is_signed, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(sz, Vd, D); + const auto m = ToExtReg(false, Vm, M); + const auto rounding_mode = ir.current_location.FPSCR().RMode(); + const auto reg_m = ir.GetExtendedRegister(m); + + if (sz) { + const auto result = is_signed + ? ir.FPSignedFixedToDouble(reg_m, 0, rounding_mode) + : ir.FPUnsignedFixedToDouble(reg_m, 0, rounding_mode); + ir.SetExtendedRegister(d, result); + } else { + const auto result = is_signed + ? ir.FPSignedFixedToSingle(reg_m, 0, rounding_mode) + : ir.FPUnsignedFixedToSingle(reg_m, 0, rounding_mode); + ir.SetExtendedRegister(d, result); + } + + return true; +} + +// VCVT.F32.{S16,U16,S32,U32} <Sdm>, <Sdm> +// VCVT.F64.{S16,U16,S32,U32} <Ddm>, <Ddm> +bool TranslatorVisitor::vfp_VCVT_from_fixed(Cond cond, bool D, bool U, size_t Vd, bool sz, bool sx, Imm<1> i, Imm<4> imm4) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const size_t size = sx ? 32 : 16; + const size_t fbits = size - concatenate(imm4, i).ZeroExtend(); + + if (fbits > size) { + return UnpredictableInstruction(); + } + + const auto d = ToExtReg(sz, Vd, D); + const auto rounding_mode = FP::RoundingMode::ToNearest_TieEven; + const auto reg_d = ir.GetExtendedRegister(d); + const auto source = ir.LeastSignificant(size, reg_d); + + if (sz) { + const auto result = U ? ir.FPUnsignedFixedToDouble(source, fbits, rounding_mode) + : ir.FPSignedFixedToDouble(source, fbits, rounding_mode); + ir.SetExtendedRegister(d, result); + } else { + const auto result = U ? 
ir.FPUnsignedFixedToSingle(source, fbits, rounding_mode) + : ir.FPSignedFixedToSingle(source, fbits, rounding_mode); + ir.SetExtendedRegister(d, result); + } + + return true; +} + +// VCVT{,R}.U32.F32 <Sd>, <Sm> +// VCVT{,R}.U32.F64 <Sd>, <Dm> +bool TranslatorVisitor::vfp_VCVT_to_u32(Cond cond, bool D, size_t Vd, bool sz, bool round_towards_zero, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const ExtReg d = ToExtReg(false, Vd, D); + const ExtReg m = ToExtReg(sz, Vm, M); + const auto reg_m = ir.GetExtendedRegister(m); + const auto result = ir.FPToFixedU32(reg_m, 0, round_towards_zero ? FP::RoundingMode::TowardsZero : ir.current_location.FPSCR().RMode()); + + ir.SetExtendedRegister(d, result); + return true; +} + +// VCVT{,R}.S32.F32 <Sd>, <Sm> +// VCVT{,R}.S32.F64 <Sd>, <Dm> +bool TranslatorVisitor::vfp_VCVT_to_s32(Cond cond, bool D, size_t Vd, bool sz, bool round_towards_zero, bool M, size_t Vm) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const auto d = ToExtReg(false, Vd, D); + const auto m = ToExtReg(sz, Vm, M); + const auto reg_m = ir.GetExtendedRegister(m); + const auto result = ir.FPToFixedS32(reg_m, 0, round_towards_zero ? FP::RoundingMode::TowardsZero : ir.current_location.FPSCR().RMode()); + + ir.SetExtendedRegister(d, result); + return true; +} + +// VCVT.{S16,U16,S32,U32}.F32 <Sdm>, <Sdm> +// VCVT.{S16,U16,S32,U32}.F64 <Ddm>, <Ddm> +bool TranslatorVisitor::vfp_VCVT_to_fixed(Cond cond, bool D, bool U, size_t Vd, bool sz, bool sx, Imm<1> i, Imm<4> imm4) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const size_t size = sx ? 32 : 16; + const size_t fbits = size - concatenate(imm4, i).ZeroExtend(); + + if (fbits > size) { + return UnpredictableInstruction(); + } + + const auto d = ToExtReg(sz, Vd, D); + const auto rounding_mode = FP::RoundingMode::TowardsZero; + const auto reg_d = ir.GetExtendedRegister(d); + + const auto result = [&]() -> IR::U16U32U64 { + if (sx) { + return U ? ir.FPToFixedU32(reg_d, fbits, rounding_mode) + : ir.FPToFixedS32(reg_d, fbits, rounding_mode); + } else { + return U ? ir.FPToFixedU16(reg_d, fbits, rounding_mode) + : ir.FPToFixedS16(reg_d, fbits, rounding_mode); + } + }(); + + if (sz) { + ir.SetExtendedRegister(d, U ? ir.ZeroExtendToLong(result) : ir.SignExtendToLong(result)); + } else { + ir.SetExtendedRegister(d, U ? 
ir.ZeroExtendToWord(result) : ir.SignExtendToWord(result)); + } + return true; +} + +// VRINT{A,N,P,M}.F32 <Sd>, <Sm> +// VRINT{A,N,P,M}.F64 <Dd>, <Dm> +bool TranslatorVisitor::vfp_VRINT_rm(bool D, size_t rm, size_t Vd, bool sz, bool M, size_t Vm) { + const std::array rm_lookup{ + FP::RoundingMode::ToNearest_TieAwayFromZero, + FP::RoundingMode::ToNearest_TieEven, + FP::RoundingMode::TowardsPlusInfinity, + FP::RoundingMode::TowardsMinusInfinity, + }; + const FP::RoundingMode rounding_mode = rm_lookup[rm]; + + const auto d = ToExtReg(sz, Vd, D); + const auto m = ToExtReg(sz, Vm, M); + + return EmitVfpVectorOperation(sz, d, m, [this, rounding_mode](ExtReg d, ExtReg m) { + const auto reg_m = ir.GetExtendedRegister(m); + const auto result = ir.FPRoundInt(reg_m, rounding_mode, false); + ir.SetExtendedRegister(d, result); + }); +} + +// VCVT{A,N,P,M}.F32 <Sd>, <Sm> +// VCVT{A,N,P,M}.F64 <Sd>, <Dm> +bool TranslatorVisitor::vfp_VCVT_rm(bool D, size_t rm, size_t Vd, bool sz, bool U, bool M, size_t Vm) { + const std::array rm_lookup{ + FP::RoundingMode::ToNearest_TieAwayFromZero, + FP::RoundingMode::ToNearest_TieEven, + FP::RoundingMode::TowardsPlusInfinity, + FP::RoundingMode::TowardsMinusInfinity, + }; + const FP::RoundingMode rounding_mode = rm_lookup[rm]; + const bool unsigned_ = !U; + + const auto d = ToExtReg(false, Vd, D); + const auto m = ToExtReg(sz, Vm, M); + + return EmitVfpVectorOperation(sz, d, m, [this, rounding_mode, unsigned_](ExtReg d, ExtReg m) { + const auto reg_m = ir.GetExtendedRegister(m); + const auto result = unsigned_ ? ir.FPToFixedU32(reg_m, 0, rounding_mode) : ir.FPToFixedS32(reg_m, 0, rounding_mode); + ir.SetExtendedRegister(d, result); + }); +} + +// VMSR FPSCR, <Rt> +bool TranslatorVisitor::vfp_VMSR(Cond cond, Reg t) { + if (t == Reg::PC) { + return UnpredictableInstruction(); + } + + if (!VFPConditionPassed(cond)) { + return true; + } + + // TODO: Replace this with a local cache. + ir.PushRSB(ir.current_location.AdvancePC(4).AdvanceIT()); + + ir.UpdateUpperLocationDescriptor(); + ir.SetFpscr(ir.GetRegister(t)); + ir.BranchWritePC(ir.Imm32(ir.current_location.PC() + 4)); + ir.SetTerm(IR::Term::PopRSBHint{}); + return false; +} + +// VMRS <Rt>, FPSCR +bool TranslatorVisitor::vfp_VMRS(Cond cond, Reg t) { + if (!VFPConditionPassed(cond)) { + return true; + } + + if (t == Reg::R15) { + // This encodes ASPR_nzcv access + const auto nzcv = ir.GetFpscrNZCV(); + ir.SetCpsrNZCVRaw(nzcv); + } else { + ir.SetRegister(t, ir.GetFpscr()); + } + + return true; +} + +// VPOP.{F32,F64} <list> +bool TranslatorVisitor::vfp_VPOP(Cond cond, bool D, size_t Vd, bool sz, Imm<8> imm8) { + const ExtReg d = ToExtReg(sz, Vd, D); + const size_t regs = sz ? 
imm8.ZeroExtend() >> 1 : imm8.ZeroExtend(); + + if (regs == 0 || RegNumber(d) + regs > 32) { + return UnpredictableInstruction(); + } + + if (sz && regs > 16) { + return UnpredictableInstruction(); + } + + if (!VFPConditionPassed(cond)) { + return true; + } + + const u32 imm32 = imm8.ZeroExtend() << 2; + auto address = ir.GetRegister(Reg::SP); + ir.SetRegister(Reg::SP, ir.Add(address, ir.Imm32(imm32))); + + for (size_t i = 0; i < regs; ++i) { + if (sz) { + auto lo = ir.ReadMemory32(address, IR::AccType::ATOMIC); + address = ir.Add(address, ir.Imm32(4)); + auto hi = ir.ReadMemory32(address, IR::AccType::ATOMIC); + address = ir.Add(address, ir.Imm32(4)); + if (ir.current_location.EFlag()) { + std::swap(lo, hi); + } + ir.SetExtendedRegister(d + i, ir.Pack2x32To1x64(lo, hi)); + } else { + const auto res = ir.ReadMemory32(address, IR::AccType::ATOMIC); + ir.SetExtendedRegister(d + i, res); + address = ir.Add(address, ir.Imm32(4)); + } + } + + return true; +} + +// VPUSH.{F32,F64} <list> +bool TranslatorVisitor::vfp_VPUSH(Cond cond, bool D, size_t Vd, bool sz, Imm<8> imm8) { + const ExtReg d = ToExtReg(sz, Vd, D); + const size_t regs = sz ? imm8.ZeroExtend() >> 1 : imm8.ZeroExtend(); + + if (regs == 0 || RegNumber(d) + regs > 32) { + return UnpredictableInstruction(); + } + + if (sz && regs > 16) { + return UnpredictableInstruction(); + } + + if (!VFPConditionPassed(cond)) { + return true; + } + + const u32 imm32 = imm8.ZeroExtend() << 2; + auto address = ir.Sub(ir.GetRegister(Reg::SP), ir.Imm32(imm32)); + ir.SetRegister(Reg::SP, address); + + for (size_t i = 0; i < regs; ++i) { + if (sz) { + const auto reg_d = ir.GetExtendedRegister(d + i); + auto lo = ir.LeastSignificantWord(reg_d); + auto hi = ir.MostSignificantWord(reg_d).result; + if (ir.current_location.EFlag()) + std::swap(lo, hi); + ir.WriteMemory32(address, lo, IR::AccType::ATOMIC); + address = ir.Add(address, ir.Imm32(4)); + ir.WriteMemory32(address, hi, IR::AccType::ATOMIC); + address = ir.Add(address, ir.Imm32(4)); + } else { + ir.WriteMemory32(address, ir.GetExtendedRegister(d + i), IR::AccType::ATOMIC); + address = ir.Add(address, ir.Imm32(4)); + } + } + + return true; +} + +// VLDR<c> <Dd>, [<Rn>{, #+/-<imm>}] +// VLDR<c> <Sd>, [<Rn>{, #+/-<imm>}] +bool TranslatorVisitor::vfp_VLDR(Cond cond, bool U, bool D, Reg n, size_t Vd, bool sz, Imm<8> imm8) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const u32 imm32 = imm8.ZeroExtend() << 2; + const auto d = ToExtReg(sz, Vd, D); + const auto base = n == Reg::PC ? ir.Imm32(ir.AlignPC(4)) : ir.GetRegister(n); + const auto address = U ? ir.Add(base, ir.Imm32(imm32)) : ir.Sub(base, ir.Imm32(imm32)); + + if (sz) { + auto lo = ir.ReadMemory32(address, IR::AccType::ATOMIC); + auto hi = ir.ReadMemory32(ir.Add(address, ir.Imm32(4)), IR::AccType::ATOMIC); + if (ir.current_location.EFlag()) { + std::swap(lo, hi); + } + ir.SetExtendedRegister(d, ir.Pack2x32To1x64(lo, hi)); + } else { + ir.SetExtendedRegister(d, ir.ReadMemory32(address, IR::AccType::ATOMIC)); + } + + return true; +} + +// VSTR<c> <Dd>, [<Rn>{, #+/-<imm>}] +// VSTR<c> <Sd>, [<Rn>{, #+/-<imm>}] +bool TranslatorVisitor::vfp_VSTR(Cond cond, bool U, bool D, Reg n, size_t Vd, bool sz, Imm<8> imm8) { + if (!VFPConditionPassed(cond)) { + return true; + } + + const u32 imm32 = imm8.ZeroExtend() << 2; + const auto d = ToExtReg(sz, Vd, D); + const auto base = n == Reg::PC ? ir.Imm32(ir.AlignPC(4)) : ir.GetRegister(n); + const auto address = U ? 
ir.Add(base, ir.Imm32(imm32)) : ir.Sub(base, ir.Imm32(imm32)); + if (sz) { + const auto reg_d = ir.GetExtendedRegister(d); + auto lo = ir.LeastSignificantWord(reg_d); + auto hi = ir.MostSignificantWord(reg_d).result; + if (ir.current_location.EFlag()) { + std::swap(lo, hi); + } + ir.WriteMemory32(address, lo, IR::AccType::ATOMIC); + ir.WriteMemory32(ir.Add(address, ir.Imm32(4)), hi, IR::AccType::ATOMIC); + } else { + ir.WriteMemory32(address, ir.GetExtendedRegister(d), IR::AccType::ATOMIC); + } + + return true; +} + +// VSTM{mode}<c> <Rn>{!}, <list of double registers> +bool TranslatorVisitor::vfp_VSTM_a1(Cond cond, bool p, bool u, bool D, bool w, Reg n, size_t Vd, Imm<8> imm8) { + if (!p && !u && !w) { + ASSERT_MSG(false, "Decode error"); + } + + if (p && !w) { + ASSERT_MSG(false, "Decode error"); + } + + if (p == u && w) { + return arm_UDF(); + } + + if (n == Reg::PC && w) { + return UnpredictableInstruction(); + } + + const auto d = ToExtReg(true, Vd, D); + const size_t regs = imm8.ZeroExtend() / 2; + + if (regs == 0 || regs > 16 || A32::RegNumber(d) + regs > 32) { + return UnpredictableInstruction(); + } + + if (!VFPConditionPassed(cond)) { + return true; + } + + const u32 imm32 = imm8.ZeroExtend() << 2; + auto address = u ? ir.GetRegister(n) : IR::U32(ir.Sub(ir.GetRegister(n), ir.Imm32(imm32))); + if (w) { + ir.SetRegister(n, u ? IR::U32(ir.Add(address, ir.Imm32(imm32))) : address); + } + for (size_t i = 0; i < regs; i++) { + const auto value = ir.GetExtendedRegister(d + i); + auto word1 = ir.LeastSignificantWord(value); + auto word2 = ir.MostSignificantWord(value).result; + + if (ir.current_location.EFlag()) { + std::swap(word1, word2); + } + + ir.WriteMemory32(address, word1, IR::AccType::ATOMIC); + address = ir.Add(address, ir.Imm32(4)); + ir.WriteMemory32(address, word2, IR::AccType::ATOMIC); + address = ir.Add(address, ir.Imm32(4)); + } + + return true; +} + +// VSTM{mode}<c> <Rn>{!}, <list of single registers> +bool TranslatorVisitor::vfp_VSTM_a2(Cond cond, bool p, bool u, bool D, bool w, Reg n, size_t Vd, Imm<8> imm8) { + if (!p && !u && !w) { + ASSERT_MSG(false, "Decode error"); + } + + if (p && !w) { + ASSERT_MSG(false, "Decode error"); + } + + if (p == u && w) { + return arm_UDF(); + } + + if (n == Reg::PC && w) { + return UnpredictableInstruction(); + } + + const auto d = ToExtReg(false, Vd, D); + const size_t regs = imm8.ZeroExtend(); + + if (regs == 0 || A32::RegNumber(d) + regs > 32) { + return UnpredictableInstruction(); + } + + if (!VFPConditionPassed(cond)) { + return true; + } + + const u32 imm32 = imm8.ZeroExtend() << 2; + auto address = u ? ir.GetRegister(n) : IR::U32(ir.Sub(ir.GetRegister(n), ir.Imm32(imm32))); + if (w) { + ir.SetRegister(n, u ? 
IR::U32(ir.Add(address, ir.Imm32(imm32))) : address); + } + for (size_t i = 0; i < regs; i++) { + const auto word = ir.GetExtendedRegister(d + i); + ir.WriteMemory32(address, word, IR::AccType::ATOMIC); + address = ir.Add(address, ir.Imm32(4)); + } + + return true; +} + +// VLDM{mode}<c> <Rn>{!}, <list of double registers> +bool TranslatorVisitor::vfp_VLDM_a1(Cond cond, bool p, bool u, bool D, bool w, Reg n, size_t Vd, Imm<8> imm8) { + if (!p && !u && !w) { + ASSERT_MSG(false, "Decode error"); + } + + if (p && !w) { + ASSERT_MSG(false, "Decode error"); + } + + if (p == u && w) { + return arm_UDF(); + } + + if (n == Reg::PC && (w || ir.current_location.TFlag())) { + return UnpredictableInstruction(); + } + + const auto d = ToExtReg(true, Vd, D); + const size_t regs = imm8.ZeroExtend() / 2; + + if (regs == 0 || regs > 16 || A32::RegNumber(d) + regs > 32) { + return UnpredictableInstruction(); + } + + if (!VFPConditionPassed(cond)) { + return true; + } + + const u32 imm32 = imm8.ZeroExtend() << 2; + auto address = u ? ir.GetRegister(n) : IR::U32(ir.Sub(ir.GetRegister(n), ir.Imm32(imm32))); + if (w) { + ir.SetRegister(n, u ? IR::U32(ir.Add(address, ir.Imm32(imm32))) : address); + } + for (size_t i = 0; i < regs; i++) { + auto word1 = ir.ReadMemory32(address, IR::AccType::ATOMIC); + address = ir.Add(address, ir.Imm32(4)); + auto word2 = ir.ReadMemory32(address, IR::AccType::ATOMIC); + address = ir.Add(address, ir.Imm32(4)); + + if (ir.current_location.EFlag()) { + std::swap(word1, word2); + } + + ir.SetExtendedRegister(d + i, ir.Pack2x32To1x64(word1, word2)); + } + + return true; +} + +// VLDM{mode}<c> <Rn>{!}, <list of single registers> +bool TranslatorVisitor::vfp_VLDM_a2(Cond cond, bool p, bool u, bool D, bool w, Reg n, size_t Vd, Imm<8> imm8) { + if (!p && !u && !w) { + ASSERT_MSG(false, "Decode error"); + } + + if (p && !w) { + ASSERT_MSG(false, "Decode error"); + } + + if (p == u && w) { + return arm_UDF(); + } + + if (n == Reg::PC && (w || ir.current_location.TFlag())) { + return UnpredictableInstruction(); + } + + const auto d = ToExtReg(false, Vd, D); + const size_t regs = imm8.ZeroExtend(); + + if (regs == 0 || A32::RegNumber(d) + regs > 32) { + return UnpredictableInstruction(); + } + + if (!VFPConditionPassed(cond)) { + return true; + } + + const u32 imm32 = imm8.ZeroExtend() << 2; + auto address = u ? ir.GetRegister(n) : IR::U32(ir.Sub(ir.GetRegister(n), ir.Imm32(imm32))); + if (w) { + ir.SetRegister(n, u ? IR::U32(ir.Add(address, ir.Imm32(imm32))) : address); + } + for (size_t i = 0; i < regs; i++) { + const auto word = ir.ReadMemory32(address, IR::AccType::ATOMIC); + address = ir.Add(address, ir.Imm32(4)); + ir.SetExtendedRegister(d + i, word); + } + + return true; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/translate_arm.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/translate_arm.cpp new file mode 100644 index 0000000000..24b5797de8 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/translate_arm.cpp @@ -0,0 +1,115 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/assert.hpp> + +#include "dynarmic/frontend/A32/a32_location_descriptor.h" +#include "dynarmic/frontend/A32/a32_types.h" +#include "dynarmic/frontend/A32/decoder/arm.h" +#include "dynarmic/frontend/A32/decoder/asimd.h" +#include "dynarmic/frontend/A32/decoder/vfp.h" +#include "dynarmic/frontend/A32/translate/a32_translate.h" +#include "dynarmic/frontend/A32/translate/conditional_state.h" +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" +#include "dynarmic/frontend/A32/translate/translate_callbacks.h" +#include "dynarmic/interface/A32/config.h" +#include "dynarmic/ir/basic_block.h" + +namespace Dynarmic::A32 { + +IR::Block TranslateArm(LocationDescriptor descriptor, TranslateCallbacks* tcb, const TranslationOptions& options) { + const bool single_step = descriptor.SingleStepping(); + + IR::Block block{descriptor}; + TranslatorVisitor visitor{block, descriptor, options}; + + bool should_continue = true; + do { + const u32 arm_pc = visitor.ir.current_location.PC(); + u64 ticks_for_instruction = 1; + + if (!tcb->PreCodeReadHook(false, arm_pc, visitor.ir)) { + should_continue = false; + break; + } + + if (const auto arm_instruction = tcb->MemoryReadCode(arm_pc)) { + visitor.current_instruction_size = 4; + + tcb->PreCodeTranslationHook(false, arm_pc, visitor.ir); + ticks_for_instruction = tcb->GetTicksForCode(false, arm_pc, *arm_instruction); + + if (const auto vfp_decoder = DecodeVFP<TranslatorVisitor>(*arm_instruction)) { + should_continue = vfp_decoder->get().call(visitor, *arm_instruction); + } else if (const auto asimd_decoder = DecodeASIMD<TranslatorVisitor>(*arm_instruction)) { + should_continue = asimd_decoder->get().call(visitor, *arm_instruction); + } else if (const auto decoder = DecodeArm<TranslatorVisitor>(*arm_instruction)) { + should_continue = decoder->get().call(visitor, *arm_instruction); + } else { + should_continue = visitor.arm_UDF(); + } + } else { + visitor.current_instruction_size = 4; + + should_continue = visitor.RaiseException(Exception::NoExecuteFault); + } + + if (visitor.cond_state == ConditionalState::Break) { + break; + } + + visitor.ir.current_location = visitor.ir.current_location.AdvancePC(4); + block.CycleCount() += ticks_for_instruction; + } while (should_continue && CondCanContinue(visitor.cond_state, visitor.ir) && !single_step); + + if (visitor.cond_state == ConditionalState::Translating || visitor.cond_state == ConditionalState::Trailing || single_step) { + if (should_continue) { + if (single_step) { + visitor.ir.SetTerm(IR::Term::LinkBlock{visitor.ir.current_location}); + } else { + visitor.ir.SetTerm(IR::Term::LinkBlockFast{visitor.ir.current_location}); + } + } + } + + ASSERT_MSG(block.HasTerminal(), "Terminal has not been set"); + + block.SetEndLocation(visitor.ir.current_location); + + return block; +} + +bool TranslateSingleArmInstruction(IR::Block& block, LocationDescriptor descriptor, u32 arm_instruction) { + TranslatorVisitor visitor{block, descriptor, {}}; + + bool should_continue = true; + + // TODO: Proper cond handling + + visitor.current_instruction_size = 4; + + const u64 ticks_for_instruction = 1; + + if (const auto vfp_decoder = DecodeVFP<TranslatorVisitor>(arm_instruction)) { + should_continue = vfp_decoder->get().call(visitor, arm_instruction); + } else if (const auto asimd_decoder = DecodeASIMD<TranslatorVisitor>(arm_instruction)) { + should_continue = asimd_decoder->get().call(visitor, arm_instruction); + } else if (const auto 
decoder = DecodeArm<TranslatorVisitor>(arm_instruction)) { + should_continue = decoder->get().call(visitor, arm_instruction); + } else { + should_continue = visitor.arm_UDF(); + } + + // TODO: Feedback resulting cond status to caller somehow. + + visitor.ir.current_location = visitor.ir.current_location.AdvancePC(4); + block.CycleCount() += ticks_for_instruction; + + block.SetEndLocation(visitor.ir.current_location); + + return should_continue; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/translate_callbacks.h b/externals/dynarmic/src/dynarmic/frontend/A32/translate/translate_callbacks.h new file mode 100644 index 0000000000..e0ad48ea5c --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/translate_callbacks.h @@ -0,0 +1,35 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2021 MerryMage + * SPDX-License-Identifier: 0BSD + */ +#pragma once + +#include <cstdint> +#include <optional> + +namespace Dynarmic::A32 { + +using VAddr = std::uint32_t; + +class IREmitter; + +struct TranslateCallbacks { + // All reads through this callback are 4-byte aligned. + // Memory must be interpreted as little endian. + virtual std::optional<std::uint32_t> MemoryReadCode(VAddr vaddr) = 0; + + // This function is called before the instruction at pc is read. + // IR code can be emitted by the callee prior to instruction handling. + // By returning false the callee precludes the translation of the instruction; + // in such case the callee is responsible for setting the terminal. + virtual bool PreCodeReadHook(bool is_thumb, VAddr pc, A32::IREmitter& ir) = 0; + + // This function is called before the instruction at pc is interpreted. + // IR code can be emitted by the callee prior to translation of the instruction. + virtual void PreCodeTranslationHook(bool is_thumb, VAddr pc, A32::IREmitter& ir) = 0; + + // How many ticks should this instruction take to execute? + virtual std::uint64_t GetTicksForCode(bool is_thumb, VAddr vaddr, std::uint32_t instruction) = 0; +}; + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A32/translate/translate_thumb.cpp b/externals/dynarmic/src/dynarmic/frontend/A32/translate/translate_thumb.cpp new file mode 100644 index 0000000000..d61d8ccbff --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/translate_thumb.cpp @@ -0,0 +1,227 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <tuple> + +#include <mcl/assert.hpp> +#include <mcl/bit/bit_field.hpp> +#include <mcl/bit/swap.hpp> + +#include "dynarmic/frontend/A32/a32_ir_emitter.h" +#include "dynarmic/frontend/A32/a32_location_descriptor.h" +#include "dynarmic/frontend/A32/decoder/asimd.h" +#include "dynarmic/frontend/A32/decoder/thumb16.h" +#include "dynarmic/frontend/A32/decoder/thumb32.h" +#include "dynarmic/frontend/A32/decoder/vfp.h" +#include "dynarmic/frontend/A32/translate/a32_translate.h" +#include "dynarmic/frontend/A32/translate/conditional_state.h" +#include "dynarmic/frontend/A32/translate/impl/a32_translate_impl.h" +#include "dynarmic/frontend/A32/translate/translate_callbacks.h" +#include "dynarmic/frontend/imm.h" +#include "dynarmic/interface/A32/config.h" + +namespace Dynarmic::A32 { +namespace { + +enum class ThumbInstSize { + Thumb16, + Thumb32 +}; + +bool IsThumb16(u16 first_part) { + return first_part < 0xE800; +} + +bool IsUnconditionalInstruction(bool is_thumb_16, u32 instruction) { + if (!is_thumb_16) + return false; + if ((instruction & 0xFF00) == 0b10111110'00000000) // BKPT + return true; + if ((instruction & 0xFFC0) == 0b10111010'10000000) // HLT + return true; + return false; +} + +std::optional<std::tuple<u32, ThumbInstSize>> ReadThumbInstruction(u32 arm_pc, TranslateCallbacks* tcb) { + u32 instruction; + + const std::optional<u32> first_part = tcb->MemoryReadCode(arm_pc & 0xFFFFFFFC); + if (!first_part) + return std::nullopt; + + if ((arm_pc & 0x2) != 0) { + instruction = *first_part >> 16; + } else { + instruction = *first_part & 0xFFFF; + } + + if (IsThumb16(static_cast<u16>(instruction))) { + // 16-bit thumb instruction + return std::make_tuple(instruction, ThumbInstSize::Thumb16); + } + + // 32-bit thumb instruction + // These always start with 0b11101, 0b11110 or 0b11111. + + instruction <<= 16; + + const std::optional<u32> second_part = tcb->MemoryReadCode((arm_pc + 2) & 0xFFFFFFFC); + if (!second_part) + return std::nullopt; + + if (((arm_pc + 2) & 0x2) != 0) { + instruction |= *second_part >> 16; + } else { + instruction |= *second_part & 0xFFFF; + } + + return std::make_tuple(instruction, ThumbInstSize::Thumb32); +} + +// Convert from thumb ASIMD format to ARM ASIMD format. 
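+// A sketch of the mapping implemented below: data-processing encodings
+// (top byte 0xEF or 0xFF) move to the A32 0xF2/0xF3 space with the U bit
+// relocated from bit 28 to bit 24; advanced SIMD load/store encodings
+// (top byte 0xF9) move to the A32 0xF4 space; any other pattern is rewritten
+// to an encoding the A32 decoder treats as UDF.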
+u32 ConvertASIMDInstruction(u32 thumb_instruction) { + if ((thumb_instruction & 0xEF000000) == 0xEF000000) { + const bool U = mcl::bit::get_bit<28>(thumb_instruction); + return 0xF2000000 | (U << 24) | (thumb_instruction & 0x00FFFFFF); + } + + if ((thumb_instruction & 0xFF000000) == 0xF9000000) { + return 0xF4000000 | (thumb_instruction & 0x00FFFFFF); + } + + return 0xF7F0A000; // UDF +} + +bool MaybeVFPOrASIMDInstruction(u32 thumb_instruction) { + return (thumb_instruction & 0xEC000000) == 0xEC000000 || (thumb_instruction & 0xFF100000) == 0xF9000000; +} + +} // namespace + +IR::Block TranslateThumb(LocationDescriptor descriptor, TranslateCallbacks* tcb, const TranslationOptions& options) { + const bool single_step = descriptor.SingleStepping(); + + IR::Block block{descriptor}; + TranslatorVisitor visitor{block, descriptor, options}; + + bool should_continue = true; + do { + const u32 arm_pc = visitor.ir.current_location.PC(); + u64 ticks_for_instruction = 1; + + if (!tcb->PreCodeReadHook(true, arm_pc, visitor.ir)) { + should_continue = false; + break; + } + + if (const auto maybe_instruction = ReadThumbInstruction(arm_pc, tcb)) { + const auto [thumb_instruction, inst_size] = *maybe_instruction; + const bool is_thumb_16 = inst_size == ThumbInstSize::Thumb16; + visitor.current_instruction_size = is_thumb_16 ? 2 : 4; + + tcb->PreCodeTranslationHook(true, arm_pc, visitor.ir); + ticks_for_instruction = tcb->GetTicksForCode(true, arm_pc, thumb_instruction); + + if (IsUnconditionalInstruction(is_thumb_16, thumb_instruction) || visitor.ThumbConditionPassed()) { + if (is_thumb_16) { + if (const auto decoder = DecodeThumb16<TranslatorVisitor>(static_cast<u16>(thumb_instruction))) { + should_continue = decoder->get().call(visitor, static_cast<u16>(thumb_instruction)); + } else { + should_continue = visitor.thumb16_UDF(); + } + } else { + if (MaybeVFPOrASIMDInstruction(thumb_instruction)) { + if (const auto vfp_decoder = DecodeVFP<TranslatorVisitor>(thumb_instruction)) { + should_continue = vfp_decoder->get().call(visitor, thumb_instruction); + } else if (const auto asimd_decoder = DecodeASIMD<TranslatorVisitor>(ConvertASIMDInstruction(thumb_instruction))) { + should_continue = asimd_decoder->get().call(visitor, ConvertASIMDInstruction(thumb_instruction)); + } else if (const auto decoder = DecodeThumb32<TranslatorVisitor>(thumb_instruction)) { + should_continue = decoder->get().call(visitor, thumb_instruction); + } else { + should_continue = visitor.thumb32_UDF(); + } + } else if (const auto decoder = DecodeThumb32<TranslatorVisitor>(thumb_instruction)) { + should_continue = decoder->get().call(visitor, thumb_instruction); + } else { + should_continue = visitor.thumb32_UDF(); + } + } + } + } else { + visitor.current_instruction_size = 2; + + should_continue = visitor.RaiseException(Exception::NoExecuteFault); + } + + if (visitor.cond_state == ConditionalState::Break) { + break; + } + + visitor.ir.current_location = visitor.ir.current_location.AdvancePC(static_cast<int>(visitor.current_instruction_size)).AdvanceIT(); + block.CycleCount() += ticks_for_instruction; + } while (should_continue && CondCanContinue(visitor.cond_state, visitor.ir) && !single_step); + + if (visitor.cond_state == ConditionalState::Translating || visitor.cond_state == ConditionalState::Trailing || single_step) { + if (should_continue) { + if (single_step) { + visitor.ir.SetTerm(IR::Term::LinkBlock{visitor.ir.current_location}); + } else { + visitor.ir.SetTerm(IR::Term::LinkBlockFast{visitor.ir.current_location}); + } + } + } + 
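+    // Whichever path ended translation is responsible for having set a terminal
+    // by now; the assertion below guards against decoder paths that forget to.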
+ ASSERT_MSG(block.HasTerminal(), "Terminal has not been set"); + + block.SetEndLocation(visitor.ir.current_location); + + return block; +} + +bool TranslateSingleThumbInstruction(IR::Block& block, LocationDescriptor descriptor, u32 thumb_instruction) { + TranslatorVisitor visitor{block, descriptor, {}}; + + bool should_continue = true; + + const bool is_thumb_16 = IsThumb16(static_cast<u16>(thumb_instruction)); + visitor.current_instruction_size = is_thumb_16 ? 2 : 4; + + const u64 ticks_for_instruction = 1; + + if (is_thumb_16) { + if (const auto decoder = DecodeThumb16<TranslatorVisitor>(static_cast<u16>(thumb_instruction))) { + should_continue = decoder->get().call(visitor, static_cast<u16>(thumb_instruction)); + } else { + should_continue = visitor.thumb16_UDF(); + } + } else { + thumb_instruction = mcl::bit::swap_halves_32(thumb_instruction); + if (MaybeVFPOrASIMDInstruction(thumb_instruction)) { + if (const auto vfp_decoder = DecodeVFP<TranslatorVisitor>(thumb_instruction)) { + should_continue = vfp_decoder->get().call(visitor, thumb_instruction); + } else if (const auto asimd_decoder = DecodeASIMD<TranslatorVisitor>(ConvertASIMDInstruction(thumb_instruction))) { + should_continue = asimd_decoder->get().call(visitor, ConvertASIMDInstruction(thumb_instruction)); + } else if (const auto decoder = DecodeThumb32<TranslatorVisitor>(thumb_instruction)) { + should_continue = decoder->get().call(visitor, thumb_instruction); + } else { + should_continue = visitor.thumb32_UDF(); + } + } else if (const auto decoder = DecodeThumb32<TranslatorVisitor>(thumb_instruction)) { + should_continue = decoder->get().call(visitor, thumb_instruction); + } else { + should_continue = visitor.thumb32_UDF(); + } + } + + const s32 advance_pc = is_thumb_16 ? 2 : 4; + visitor.ir.current_location = visitor.ir.current_location.AdvancePC(advance_pc); + block.CycleCount() += ticks_for_instruction; + + block.SetEndLocation(visitor.ir.current_location); + + return should_continue; +} + +} // namespace Dynarmic::A32 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/a64_ir_emitter.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/a64_ir_emitter.cpp new file mode 100644 index 0000000000..3f5a70bdc0 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/a64_ir_emitter.cpp @@ -0,0 +1,265 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/a64_ir_emitter.h" + +#include <mcl/assert.hpp> + +#include "dynarmic/ir/opcodes.h" + +namespace Dynarmic::A64 { + +using Opcode = IR::Opcode; + +u64 IREmitter::PC() const { + return current_location->PC(); +} + +u64 IREmitter::AlignPC(size_t alignment) const { + const u64 pc = PC(); + return static_cast<u64>(pc - pc % alignment); +} + +void IREmitter::SetCheckBit(const IR::U1& value) { + Inst(Opcode::A64SetCheckBit, value); +} + +IR::U1 IREmitter::GetCFlag() { + return Inst<IR::U1>(Opcode::A64GetCFlag); +} + +IR::U32 IREmitter::GetNZCVRaw() { + return Inst<IR::U32>(Opcode::A64GetNZCVRaw); +} + +void IREmitter::SetNZCVRaw(IR::U32 value) { + Inst(Opcode::A64SetNZCVRaw, value); +} + +void IREmitter::SetNZCV(const IR::NZCV& nzcv) { + Inst(Opcode::A64SetNZCV, nzcv); +} + +void IREmitter::CallSupervisor(u32 imm) { + Inst(Opcode::A64CallSupervisor, Imm32(imm)); +} + +void IREmitter::ExceptionRaised(Exception exception) { + Inst(Opcode::A64ExceptionRaised, Imm64(PC()), Imm64(static_cast<u64>(exception))); +} + +void IREmitter::DataCacheOperationRaised(DataCacheOperation op, const IR::U64& value) { + Inst(Opcode::A64DataCacheOperationRaised, ImmCurrentLocationDescriptor(), Imm64(static_cast<u64>(op)), value); +} + +void IREmitter::InstructionCacheOperationRaised(InstructionCacheOperation op, const IR::U64& value) { + Inst(Opcode::A64InstructionCacheOperationRaised, Imm64(static_cast<u64>(op)), value); +} + +void IREmitter::DataSynchronizationBarrier() { + Inst(Opcode::A64DataSynchronizationBarrier); +} + +void IREmitter::DataMemoryBarrier() { + Inst(Opcode::A64DataMemoryBarrier); +} + +void IREmitter::InstructionSynchronizationBarrier() { + Inst(Opcode::A64InstructionSynchronizationBarrier); +} + +IR::U32 IREmitter::GetCNTFRQ() { + return Inst<IR::U32>(Opcode::A64GetCNTFRQ); +} + +IR::U64 IREmitter::GetCNTPCT() { + return Inst<IR::U64>(Opcode::A64GetCNTPCT); +} + +IR::U32 IREmitter::GetCTR() { + return Inst<IR::U32>(Opcode::A64GetCTR); +} + +IR::U32 IREmitter::GetDCZID() { + return Inst<IR::U32>(Opcode::A64GetDCZID); +} + +IR::U64 IREmitter::GetTPIDR() { + return Inst<IR::U64>(Opcode::A64GetTPIDR); +} + +void IREmitter::SetTPIDR(const IR::U64& value) { + Inst(Opcode::A64SetTPIDR, value); +} + +IR::U64 IREmitter::GetTPIDRRO() { + return Inst<IR::U64>(Opcode::A64GetTPIDRRO); +} + +void IREmitter::ClearExclusive() { + Inst(Opcode::A64ClearExclusive); +} + +IR::U8 IREmitter::ReadMemory8(const IR::U64& vaddr, IR::AccType acc_type) { + return Inst<IR::U8>(Opcode::A64ReadMemory8, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type}); +} + +IR::U16 IREmitter::ReadMemory16(const IR::U64& vaddr, IR::AccType acc_type) { + return Inst<IR::U16>(Opcode::A64ReadMemory16, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type}); +} + +IR::U32 IREmitter::ReadMemory32(const IR::U64& vaddr, IR::AccType acc_type) { + return Inst<IR::U32>(Opcode::A64ReadMemory32, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type}); +} + +IR::U64 IREmitter::ReadMemory64(const IR::U64& vaddr, IR::AccType acc_type) { + return Inst<IR::U64>(Opcode::A64ReadMemory64, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type}); +} + +IR::U128 IREmitter::ReadMemory128(const IR::U64& vaddr, IR::AccType acc_type) { + return Inst<IR::U128>(Opcode::A64ReadMemory128, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type}); +} + +IR::U8 IREmitter::ExclusiveReadMemory8(const IR::U64& vaddr, IR::AccType acc_type) { + return 
Inst<IR::U8>(Opcode::A64ExclusiveReadMemory8, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type}); +} + +IR::U16 IREmitter::ExclusiveReadMemory16(const IR::U64& vaddr, IR::AccType acc_type) { + return Inst<IR::U16>(Opcode::A64ExclusiveReadMemory16, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type}); +} + +IR::U32 IREmitter::ExclusiveReadMemory32(const IR::U64& vaddr, IR::AccType acc_type) { + return Inst<IR::U32>(Opcode::A64ExclusiveReadMemory32, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type}); +} + +IR::U64 IREmitter::ExclusiveReadMemory64(const IR::U64& vaddr, IR::AccType acc_type) { + return Inst<IR::U64>(Opcode::A64ExclusiveReadMemory64, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type}); +} + +IR::U128 IREmitter::ExclusiveReadMemory128(const IR::U64& vaddr, IR::AccType acc_type) { + return Inst<IR::U128>(Opcode::A64ExclusiveReadMemory128, ImmCurrentLocationDescriptor(), vaddr, IR::Value{acc_type}); +} + +void IREmitter::WriteMemory8(const IR::U64& vaddr, const IR::U8& value, IR::AccType acc_type) { + Inst(Opcode::A64WriteMemory8, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type}); +} + +void IREmitter::WriteMemory16(const IR::U64& vaddr, const IR::U16& value, IR::AccType acc_type) { + Inst(Opcode::A64WriteMemory16, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type}); +} + +void IREmitter::WriteMemory32(const IR::U64& vaddr, const IR::U32& value, IR::AccType acc_type) { + Inst(Opcode::A64WriteMemory32, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type}); +} + +void IREmitter::WriteMemory64(const IR::U64& vaddr, const IR::U64& value, IR::AccType acc_type) { + Inst(Opcode::A64WriteMemory64, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type}); +} + +void IREmitter::WriteMemory128(const IR::U64& vaddr, const IR::U128& value, IR::AccType acc_type) { + Inst(Opcode::A64WriteMemory128, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type}); +} + +IR::U32 IREmitter::ExclusiveWriteMemory8(const IR::U64& vaddr, const IR::U8& value, IR::AccType acc_type) { + return Inst<IR::U32>(Opcode::A64ExclusiveWriteMemory8, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type}); +} + +IR::U32 IREmitter::ExclusiveWriteMemory16(const IR::U64& vaddr, const IR::U16& value, IR::AccType acc_type) { + return Inst<IR::U32>(Opcode::A64ExclusiveWriteMemory16, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type}); +} + +IR::U32 IREmitter::ExclusiveWriteMemory32(const IR::U64& vaddr, const IR::U32& value, IR::AccType acc_type) { + return Inst<IR::U32>(Opcode::A64ExclusiveWriteMemory32, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type}); +} + +IR::U32 IREmitter::ExclusiveWriteMemory64(const IR::U64& vaddr, const IR::U64& value, IR::AccType acc_type) { + return Inst<IR::U32>(Opcode::A64ExclusiveWriteMemory64, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type}); +} + +IR::U32 IREmitter::ExclusiveWriteMemory128(const IR::U64& vaddr, const IR::U128& value, IR::AccType acc_type) { + return Inst<IR::U32>(Opcode::A64ExclusiveWriteMemory128, ImmCurrentLocationDescriptor(), vaddr, value, IR::Value{acc_type}); +} + +IR::U32 IREmitter::GetW(Reg reg) { + if (reg == Reg::ZR) + return Imm32(0); + return Inst<IR::U32>(Opcode::A64GetW, IR::Value(reg)); +} + +IR::U64 IREmitter::GetX(Reg reg) { + if (reg == Reg::ZR) + return Imm64(0); + return Inst<IR::U64>(Opcode::A64GetX, IR::Value(reg)); +} + +IR::U128 IREmitter::GetS(Vec vec) { + return Inst<IR::U128>(Opcode::A64GetS, 
IR::Value(vec)); +} + +IR::U128 IREmitter::GetD(Vec vec) { + return Inst<IR::U128>(Opcode::A64GetD, IR::Value(vec)); +} + +IR::U128 IREmitter::GetQ(Vec vec) { + return Inst<IR::U128>(Opcode::A64GetQ, IR::Value(vec)); +} + +IR::U64 IREmitter::GetSP() { + return Inst<IR::U64>(Opcode::A64GetSP); +} + +IR::U32 IREmitter::GetFPCR() { + return Inst<IR::U32>(Opcode::A64GetFPCR); +} + +IR::U32 IREmitter::GetFPSR() { + return Inst<IR::U32>(Opcode::A64GetFPSR); +} + +void IREmitter::SetW(const Reg reg, const IR::U32& value) { + if (reg == Reg::ZR) + return; + Inst(Opcode::A64SetW, IR::Value(reg), value); +} + +void IREmitter::SetX(const Reg reg, const IR::U64& value) { + if (reg == Reg::ZR) + return; + Inst(Opcode::A64SetX, IR::Value(reg), value); +} + +void IREmitter::SetS(const Vec vec, const IR::U128& value) { + Inst(Opcode::A64SetS, IR::Value(vec), value); +} + +void IREmitter::SetD(const Vec vec, const IR::U128& value) { + Inst(Opcode::A64SetD, IR::Value(vec), value); +} + +void IREmitter::SetQ(const Vec vec, const IR::U128& value) { + Inst(Opcode::A64SetQ, IR::Value(vec), value); +} + +void IREmitter::SetSP(const IR::U64& value) { + Inst(Opcode::A64SetSP, value); +} + +void IREmitter::SetFPCR(const IR::U32& value) { + Inst(Opcode::A64SetFPCR, value); +} + +void IREmitter::SetFPSR(const IR::U32& value) { + Inst(Opcode::A64SetFPSR, value); +} + +void IREmitter::SetPC(const IR::U64& value) { + Inst(Opcode::A64SetPC, value); +} + +IR::U64 IREmitter::ImmCurrentLocationDescriptor() { + return Imm64(IR::LocationDescriptor{*current_location}.Value()); +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/a64_ir_emitter.h b/externals/dynarmic/src/dynarmic/frontend/A64/a64_ir_emitter.h new file mode 100644 index 0000000000..7fc8bea7c4 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/a64_ir_emitter.h @@ -0,0 +1,102 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <optional> + +#include <mcl/stdint.hpp> + +#include "dynarmic/frontend/A64/a64_location_descriptor.h" +#include "dynarmic/frontend/A64/a64_types.h" +#include "dynarmic/interface/A64/config.h" +#include "dynarmic/ir/ir_emitter.h" +#include "dynarmic/ir/value.h" + +namespace Dynarmic::A64 { + +/** + * Convenience class to construct a basic block of the intermediate representation. + * `block` is the resulting block. + * The user of this class updates `current_location` as appropriate. + */ +class IREmitter : public IR::IREmitter { +public: + explicit IREmitter(IR::Block& block) + : IR::IREmitter(block) {} + explicit IREmitter(IR::Block& block, LocationDescriptor descriptor) + : IR::IREmitter(block), current_location(descriptor) {} + + std::optional<LocationDescriptor> current_location; + + u64 PC() const; + u64 AlignPC(size_t alignment) const; + + void SetCheckBit(const IR::U1& value); + IR::U1 GetCFlag(); + IR::U32 GetNZCVRaw(); + void SetNZCVRaw(IR::U32 value); + void SetNZCV(const IR::NZCV& nzcv); + + void CallSupervisor(u32 imm); + void ExceptionRaised(Exception exception); + void DataCacheOperationRaised(DataCacheOperation op, const IR::U64& value); + void InstructionCacheOperationRaised(InstructionCacheOperation op, const IR::U64& value); + void DataSynchronizationBarrier(); + void DataMemoryBarrier(); + void InstructionSynchronizationBarrier(); + IR::U32 GetCNTFRQ(); + IR::U64 GetCNTPCT(); // TODO: Ensure sub-basic-block cycle counts are updated before this. 
+ IR::U32 GetCTR(); + IR::U32 GetDCZID(); + IR::U64 GetTPIDR(); + IR::U64 GetTPIDRRO(); + void SetTPIDR(const IR::U64& value); + + void ClearExclusive(); + IR::U8 ReadMemory8(const IR::U64& vaddr, IR::AccType acc_type); + IR::U16 ReadMemory16(const IR::U64& vaddr, IR::AccType acc_type); + IR::U32 ReadMemory32(const IR::U64& vaddr, IR::AccType acc_type); + IR::U64 ReadMemory64(const IR::U64& vaddr, IR::AccType acc_type); + IR::U128 ReadMemory128(const IR::U64& vaddr, IR::AccType acc_type); + IR::U8 ExclusiveReadMemory8(const IR::U64& vaddr, IR::AccType acc_type); + IR::U16 ExclusiveReadMemory16(const IR::U64& vaddr, IR::AccType acc_type); + IR::U32 ExclusiveReadMemory32(const IR::U64& vaddr, IR::AccType acc_type); + IR::U64 ExclusiveReadMemory64(const IR::U64& vaddr, IR::AccType acc_type); + IR::U128 ExclusiveReadMemory128(const IR::U64& vaddr, IR::AccType acc_type); + void WriteMemory8(const IR::U64& vaddr, const IR::U8& value, IR::AccType acc_type); + void WriteMemory16(const IR::U64& vaddr, const IR::U16& value, IR::AccType acc_type); + void WriteMemory32(const IR::U64& vaddr, const IR::U32& value, IR::AccType acc_type); + void WriteMemory64(const IR::U64& vaddr, const IR::U64& value, IR::AccType acc_type); + void WriteMemory128(const IR::U64& vaddr, const IR::U128& value, IR::AccType acc_type); + IR::U32 ExclusiveWriteMemory8(const IR::U64& vaddr, const IR::U8& value, IR::AccType acc_type); + IR::U32 ExclusiveWriteMemory16(const IR::U64& vaddr, const IR::U16& value, IR::AccType acc_type); + IR::U32 ExclusiveWriteMemory32(const IR::U64& vaddr, const IR::U32& value, IR::AccType acc_type); + IR::U32 ExclusiveWriteMemory64(const IR::U64& vaddr, const IR::U64& value, IR::AccType acc_type); + IR::U32 ExclusiveWriteMemory128(const IR::U64& vaddr, const IR::U128& value, IR::AccType acc_type); + + IR::U32 GetW(Reg source_reg); + IR::U64 GetX(Reg source_reg); + IR::U128 GetS(Vec source_vec); + IR::U128 GetD(Vec source_vec); + IR::U128 GetQ(Vec source_vec); + IR::U64 GetSP(); + IR::U32 GetFPCR(); + IR::U32 GetFPSR(); + void SetW(Reg dest_reg, const IR::U32& value); + void SetX(Reg dest_reg, const IR::U64& value); + void SetS(Vec dest_vec, const IR::U128& value); + void SetD(Vec dest_vec, const IR::U128& value); + void SetQ(Vec dest_vec, const IR::U128& value); + void SetSP(const IR::U64& value); + void SetFPCR(const IR::U32& value); + void SetFPSR(const IR::U32& value); + void SetPC(const IR::U64& value); + +private: + IR::U64 ImmCurrentLocationDescriptor(); +}; + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/a64_location_descriptor.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/a64_location_descriptor.cpp new file mode 100644 index 0000000000..83a931ff40 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/a64_location_descriptor.cpp @@ -0,0 +1,16 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/a64_location_descriptor.h" + +#include <fmt/format.h> + +namespace Dynarmic::A64 { + +std::string ToString(const LocationDescriptor& descriptor) { + return fmt::format("{{{}, {}{}}}", descriptor.PC(), descriptor.FPCR().Value(), descriptor.SingleStepping() ? 
", step" : ""); +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/a64_location_descriptor.h b/externals/dynarmic/src/dynarmic/frontend/A64/a64_location_descriptor.h new file mode 100644 index 0000000000..122bebbcb7 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/a64_location_descriptor.h @@ -0,0 +1,115 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <functional> +#include <string> +#include <tuple> + +#include <fmt/format.h> +#include <mcl/bit/bit_field.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/common/fp/fpcr.h" +#include "dynarmic/ir/location_descriptor.h" + +namespace Dynarmic::A64 { + +/** + * LocationDescriptor describes the location of a basic block. + * The location is not solely based on the PC because other flags influence the way + * instructions should be translated. + */ +class LocationDescriptor { +public: + static constexpr size_t pc_bit_count = 56; + static constexpr u64 pc_mask = mcl::bit::ones<u64>(pc_bit_count); + static constexpr u32 fpcr_mask = 0x07C8'0000; + static constexpr size_t fpcr_shift = 37; + static constexpr size_t single_stepping_bit = 57; + static_assert((pc_mask & (u64(fpcr_mask) << fpcr_shift) & (u64(1) << single_stepping_bit)) == 0); + + LocationDescriptor(u64 pc, FP::FPCR fpcr, bool single_stepping = false) + : pc(pc & pc_mask), fpcr(fpcr.Value() & fpcr_mask), single_stepping(single_stepping) {} + + explicit LocationDescriptor(const IR::LocationDescriptor& o) + : pc(o.Value() & pc_mask) + , fpcr((o.Value() >> fpcr_shift) & fpcr_mask) + , single_stepping(mcl::bit::get_bit<single_stepping_bit>(o.Value())) {} + + u64 PC() const { return mcl::bit::sign_extend<pc_bit_count>(pc); } + FP::FPCR FPCR() const { return fpcr; } + bool SingleStepping() const { return single_stepping; } + + bool operator==(const LocationDescriptor& o) const { + return std::tie(pc, fpcr, single_stepping) == std::tie(o.pc, o.fpcr, o.single_stepping); + } + + bool operator!=(const LocationDescriptor& o) const { + return !operator==(o); + } + + LocationDescriptor SetPC(u64 new_pc) const { + return LocationDescriptor(new_pc, fpcr, single_stepping); + } + + LocationDescriptor AdvancePC(int amount) const { + return LocationDescriptor(static_cast<u64>(pc + amount), fpcr, single_stepping); + } + + LocationDescriptor SetSingleStepping(bool new_single_stepping) const { + return LocationDescriptor(pc, fpcr, new_single_stepping); + } + + u64 UniqueHash() const noexcept { + // This value MUST BE UNIQUE. + // This calculation has to match up with EmitTerminalPopRSBHint + const u64 fpcr_u64 = static_cast<u64>(fpcr.Value()) << fpcr_shift; + const u64 single_stepping_u64 = static_cast<u64>(single_stepping) << single_stepping_bit; + return pc | fpcr_u64 | single_stepping_u64; + } + + operator IR::LocationDescriptor() const { + return IR::LocationDescriptor{UniqueHash()}; + } + +private: + u64 pc; ///< Current program counter value. + FP::FPCR fpcr; ///< Floating point control register. + bool single_stepping; +}; + +/** + * Provides a string representation of a LocationDescriptor. 
+ * + * @param descriptor The descriptor to get a string representation of + */ +std::string ToString(const LocationDescriptor& descriptor); + +} // namespace Dynarmic::A64 + +namespace std { +template<> +struct less<Dynarmic::A64::LocationDescriptor> { + bool operator()(const Dynarmic::A64::LocationDescriptor& x, const Dynarmic::A64::LocationDescriptor& y) const noexcept { + return x.UniqueHash() < y.UniqueHash(); + } +}; +template<> +struct hash<Dynarmic::A64::LocationDescriptor> { + size_t operator()(const Dynarmic::A64::LocationDescriptor& x) const noexcept { + return std::hash<u64>()(x.UniqueHash()); + } +}; +} // namespace std + +template<> +struct fmt::formatter<Dynarmic::A64::LocationDescriptor> : fmt::formatter<std::string> { + template<typename FormatContext> + auto format(Dynarmic::A64::LocationDescriptor descriptor, FormatContext& ctx) const { + return formatter<std::string>::format(Dynarmic::A64::ToString(descriptor), ctx); + } +}; diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/a64_types.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/a64_types.cpp new file mode 100644 index 0000000000..ca1c59ab36 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/a64_types.cpp @@ -0,0 +1,33 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/a64_types.h" + +#include <array> +#include <ostream> + +#include <fmt/format.h> + +namespace Dynarmic::A64 { + +const char* CondToString(Cond cond) { + static constexpr std::array cond_strs = { + "eq", "ne", "hs", "lo", "mi", "pl", "vs", "vc", + "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"}; + return cond_strs.at(static_cast<size_t>(cond)); +} + +std::string RegToString(Reg reg) { + if (reg == Reg::R31) { + return "sp|zr"; + } + return fmt::format("r{}", static_cast<size_t>(reg)); +} + +std::string VecToString(Vec vec) { + return fmt::format("v{}", static_cast<size_t>(vec)); +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/a64_types.h b/externals/dynarmic/src/dynarmic/frontend/A64/a64_types.h new file mode 100644 index 0000000000..6bc7a24536 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/a64_types.h @@ -0,0 +1,142 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <string> + +#include <fmt/format.h> +#include <mcl/assert.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/ir/cond.h" + +namespace Dynarmic::A64 { + +using Cond = IR::Cond; + +enum class Reg { + R0, + R1, + R2, + R3, + R4, + R5, + R6, + R7, + R8, + R9, + R10, + R11, + R12, + R13, + R14, + R15, + R16, + R17, + R18, + R19, + R20, + R21, + R22, + R23, + R24, + R25, + R26, + R27, + R28, + R29, + R30, + R31, + LR = R30, + SP = R31, + ZR = R31, +}; + +enum class Vec { + V0, + V1, + V2, + V3, + V4, + V5, + V6, + V7, + V8, + V9, + V10, + V11, + V12, + V13, + V14, + V15, + V16, + V17, + V18, + V19, + V20, + V21, + V22, + V23, + V24, + V25, + V26, + V27, + V28, + V29, + V30, + V31, +}; + +enum class ShiftType { + LSL, + LSR, + ASR, + ROR, +}; + +const char* CondToString(Cond cond); +std::string RegToString(Reg reg); +std::string VecToString(Vec vec); + +constexpr size_t RegNumber(Reg reg) { + return static_cast<size_t>(reg); +} + +constexpr size_t VecNumber(Vec vec) { + return static_cast<size_t>(vec); +} + +inline Reg operator+(Reg reg, size_t number) { + const size_t new_reg = RegNumber(reg) + number; + ASSERT(new_reg <= 31); + + return static_cast<Reg>(new_reg); +} + +inline Vec operator+(Vec vec, size_t number) { + const size_t new_vec = VecNumber(vec) + number; + ASSERT(new_vec <= 31); + + return static_cast<Vec>(new_vec); +} + +} // namespace Dynarmic::A64 + +template<> +struct fmt::formatter<Dynarmic::A64::Reg> : fmt::formatter<std::string> { + template<typename FormatContext> + auto format(Dynarmic::A64::Reg reg, FormatContext& ctx) const { + return formatter<std::string>::format(Dynarmic::A64::RegToString(reg), ctx); + } +}; + +template<> +struct fmt::formatter<Dynarmic::A64::Vec> : fmt::formatter<std::string> { + template<typename FormatContext> + auto format(Dynarmic::A64::Vec vec, FormatContext& ctx) const { + return formatter<std::string>::format(Dynarmic::A64::VecToString(vec), ctx); + } +}; diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/decoder/a64.h b/externals/dynarmic/src/dynarmic/frontend/A64/decoder/a64.h new file mode 100644 index 0000000000..d6f447e575 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/decoder/a64.h @@ -0,0 +1,83 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <algorithm> +#include <functional> +#include <optional> +#include <set> +#include <string> +#include <vector> + +#include <mcl/bit/bit_count.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/frontend/decoder/decoder_detail.h" +#include "dynarmic/frontend/decoder/matcher.h" + +namespace Dynarmic::A64 { + +template<typename Visitor> +using Matcher = Decoder::Matcher<Visitor, u32>; + +template<typename Visitor> +using DecodeTable = std::array<std::vector<Matcher<Visitor>>, 0x1000>; + +namespace detail { +inline size_t ToFastLookupIndex(u32 instruction) { + return ((instruction >> 10) & 0x00F) | ((instruction >> 18) & 0xFF0); +} +} // namespace detail + +template<typename V> +DecodeTable<V> GetDecodeTable() { + std::vector<Matcher<V>> list = { +#define INST(fn, name, bitstring) DYNARMIC_DECODER_GET_MATCHER(Matcher, fn, name, Decoder::detail::StringToArray<32>(bitstring)), +#include "./a64.inc" +#undef INST + }; + + std::stable_sort(list.begin(), list.end(), [](const auto& matcher1, const auto& matcher2) { + // If a matcher has more bits in its mask it is more specific, so it should come first. + return mcl::bit::count_ones(matcher1.GetMask()) > mcl::bit::count_ones(matcher2.GetMask()); + }); + + // Exceptions to the above rule of thumb. + const std::set<std::string> comes_first{ + "MOVI, MVNI, ORR, BIC (vector, immediate)", + "FMOV (vector, immediate)", + "Unallocated SIMD modified immediate", + }; + + std::stable_partition(list.begin(), list.end(), [&](const auto& matcher) { + return comes_first.count(matcher.GetName()) > 0; + }); + + DecodeTable<V> table{}; + for (size_t i = 0; i < table.size(); ++i) { + for (auto matcher : list) { + const auto expect = detail::ToFastLookupIndex(matcher.GetExpected()); + const auto mask = detail::ToFastLookupIndex(matcher.GetMask()); + if ((i & mask) == expect) { + table[i].push_back(matcher); + } + } + } + return table; +} + +template<typename V> +std::optional<std::reference_wrapper<const Matcher<V>>> Decode(u32 instruction) { + static const auto table = GetDecodeTable<V>(); + + const auto matches_instruction = [instruction](const auto& matcher) { return matcher.Matches(instruction); }; + + const auto& subtable = table[detail::ToFastLookupIndex(instruction)]; + auto iter = std::find_if(subtable.begin(), subtable.end(), matches_instruction); + return iter != subtable.end() ? 
std::optional<std::reference_wrapper<const Matcher<V>>>(*iter) : std::nullopt; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/decoder/a64.inc b/externals/dynarmic/src/dynarmic/frontend/A64/decoder/a64.inc new file mode 100644 index 0000000000..23f8b71933 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/decoder/a64.inc @@ -0,0 +1,1029 @@ +// Data processing - Immediate - PC relative addressing +INST(ADR, "ADR", "0ii10000iiiiiiiiiiiiiiiiiiiddddd") +INST(ADRP, "ADRP", "1ii10000iiiiiiiiiiiiiiiiiiiddddd") + +// Data processing - Immediate - Add/Sub (with tags) +//INST(ADDG, "ADDG", "1001000110iiiiii00IIIInnnnnddddd") // ARMv8.5 +//INST(SUBG, "SUBG", "1101000110iiiiii00IIIInnnnnddddd") // ARMv8.5 + +// Data processing - Immediate - Add/Sub +INST(ADD_imm, "ADD (immediate)", "z0010001ssiiiiiiiiiiiinnnnnddddd") +INST(ADDS_imm, "ADDS (immediate)", "z0110001ssiiiiiiiiiiiinnnnnddddd") +INST(SUB_imm, "SUB (immediate)", "z1010001ssiiiiiiiiiiiinnnnnddddd") +INST(SUBS_imm, "SUBS (immediate)", "z1110001ssiiiiiiiiiiiinnnnnddddd") + +// Data processing - Immediate - Logical +INST(AND_imm, "AND (immediate)", "z00100100Nrrrrrrssssssnnnnnddddd") +INST(ORR_imm, "ORR (immediate)", "z01100100Nrrrrrrssssssnnnnnddddd") +INST(EOR_imm, "EOR (immediate)", "z10100100Nrrrrrrssssssnnnnnddddd") +INST(ANDS_imm, "ANDS (immediate)", "z11100100Nrrrrrrssssssnnnnnddddd") + +// Data processing - Immediate - Move Wide +INST(MOVN, "MOVN", "z00100101ssiiiiiiiiiiiiiiiiddddd") +INST(MOVZ, "MOVZ", "z10100101ssiiiiiiiiiiiiiiiiddddd") +INST(MOVK, "MOVK", "z11100101ssiiiiiiiiiiiiiiiiddddd") + +// Data processing - Immediate - Bitfield +INST(SBFM, "SBFM", "z00100110Nrrrrrrssssssnnnnnddddd") +INST(BFM, "BFM", "z01100110Nrrrrrrssssssnnnnnddddd") +INST(UBFM, "UBFM", "z10100110Nrrrrrrssssssnnnnnddddd") +INST(ASR_1, "ASR (immediate, 32-bit)", "00010011000rrrrr011111nnnnnddddd") +INST(ASR_2, "ASR (immediate, 64-bit)", "1001001101rrrrrr111111nnnnnddddd") +INST(SXTB_1, "SXTB (32-bit)", "0001001100000000000111nnnnnddddd") +INST(SXTB_2, "SXTB (64-bit)", "1001001101000000000111nnnnnddddd") +INST(SXTH_1, "SXTH (32-bit)", "0001001100000000001111nnnnnddddd") +INST(SXTH_2, "SXTH (64-bit)", "1001001101000000001111nnnnnddddd") +INST(SXTW, "SXTW", "1001001101000000011111nnnnnddddd") + +// Data processing - Immediate - Extract +INST(EXTR, "EXTR", "z00100111N0mmmmmssssssnnnnnddddd") + +// Conditional branch +INST(B_cond, "B.cond", "01010100iiiiiiiiiiiiiiiiiii0cccc") + +// Exception generation +INST(SVC, "SVC", "11010100000iiiiiiiiiiiiiiii00001") +//INST(HVC, "HVC", "11010100000iiiiiiiiiiiiiiii00010") +//INST(SMC, "SMC", "11010100000iiiiiiiiiiiiiiii00011") +INST(BRK, "BRK", "11010100001iiiiiiiiiiiiiiii00000") +//INST(HLT, "HLT", "11010100010iiiiiiiiiiiiiiii00000") +//INST(DCPS1, "DCPS1", "11010100101iiiiiiiiiiiiiiii00001") +//INST(DCPS2, "DCPS2", "11010100101iiiiiiiiiiiiiiii00010") +//INST(DCPS3, "DCPS3", "11010100101iiiiiiiiiiiiiiii00011") + +// System +//INST(MSR_imm, "MSR (immediate)", "1101010100000ooo0100MMMMooo11111") +INST(HINT, "HINT", "11010101000000110010MMMMooo11111") +INST(NOP, "NOP", "11010101000000110010000000011111") +INST(YIELD, "YIELD", "11010101000000110010000000111111") +INST(WFE, "WFE", "11010101000000110010000001011111") +INST(WFI, "WFI", "11010101000000110010000001111111") +INST(SEV, "SEV", "11010101000000110010000010011111") +INST(SEVL, "SEVL", "11010101000000110010000010111111") +//INST(DGH, "DGH", "11010101000000110010000011011111") // v8.6 +//INST(WFET, "WFET", 
"110101010000001100010000000ddddd") // v8.7 +//INST(WFIT, "WFIT", "110101010000001100010000001ddddd") // v8.7 +//INST(XPAC_1, "XPACD, XPACI, XPACLRI", "110110101100000101000D11111ddddd") +//INST(XPAC_2, "XPACD, XPACI, XPACLRI", "11010101000000110010000011111111") +//INST(PACIA_1, "PACIA, PACIA1716, PACIASP, PACIAZ, PACIZA", "110110101100000100Z000nnnnnddddd") +//INST(PACIA_2, "PACIA, PACIA1716, PACIASP, PACIAZ, PACIZA", "1101010100000011001000-100-11111") +//INST(PACIB_1, "PACIB, PACIB1716, PACIBSP, PACIBZ, PACIZB", "110110101100000100Z001nnnnnddddd") +//INST(PACIB_2, "PACIB, PACIB1716, PACIBSP, PACIBZ, PACIZB", "1101010100000011001000-101-11111") +//INST(AUTIA_1, "AUTIA, AUTIA1716, AUTIASP, AUTIAZ, AUTIZA", "110110101100000100Z100nnnnnddddd") +//INST(AUTIA_2, "AUTIA, AUTIA1716, AUTIASP, AUTIAZ, AUTIZA", "1101010100000011001000-110-11111") +//INST(AUTIB_1, "AUTIB, AUTIB1716, AUTIBSP, AUTIBZ, AUTIZB", "110110101100000100Z101nnnnnddddd") +//INST(AUTIB_2, "AUTIB, AUTIB1716, AUTIBSP, AUTIBZ, AUTIZB", "1101010100000011001000-111-11111") +//INST(BTI, "BTI", "110101010000001100100100ii011111") // ARMv8.5 +//INST(ESB, "ESB", "11010101000000110010001000011111") +//INST(PSB, "PSB CSYNC", "11010101000000110010001000111111") +//INST(TSB, "TSB CSYNC", "11010101000000110010001001011111") // ARMv8.5 +//INST(CSDB, "CSDB", "11010101000000110010001010011111") +INST(CLREX, "CLREX", "11010101000000110011MMMM01011111") +INST(DSB, "DSB", "11010101000000110011MMMM10011111") +//INST(SSBB, "SSBB", "11010101000000110011000010011111") +//INST(PSSBB, "PSSBB", "11010101000000110011010010011111") +INST(DMB, "DMB", "11010101000000110011MMMM10111111") +INST(ISB, "ISB", "11010101000000110011MMMM11011111") +//INST(SB, "SB", "11010101000000110011000011111111") +//INST(SYS, "SYS", "1101010100001oooNNNNMMMMooottttt") +INST(MSR_reg, "MSR (register)", "110101010001poooNNNNMMMMooottttt") +//INST(SYSL, "SYSL", "1101010100101oooNNNNMMMMooottttt") +INST(MRS, "MRS", "110101010011poooNNNNMMMMooottttt") + +// System - Flag manipulation instructions +INST(CFINV, "CFINV", "11010101000000000100000000011111") // ARMv8.4 +INST(RMIF, "RMIF", "10111010000iiiiii00001nnnnn0IIII") // ARMv8.4 +//INST(SETF8, "SETF8", "0011101000000000000010nnnnn01101") // ARMv8.4 +//INST(SETF16, "SETF16", "0011101000000000010010nnnnn01101") // ARMv8.4 + +// System - Flag format instructions +INST(XAFlag, "XAFlag", "11010101000000000100000000111111") // ARMv8.5 +INST(AXFlag, "AXFlag", "11010101000000000100000001011111") // ARMv8.5 + +// SYS: Data Cache +INST(DC_IVAC, "DC IVAC", "110101010000100001110110001ttttt") +INST(DC_ISW, "DC ISW", "110101010000100001110110010ttttt") +INST(DC_CSW, "DC CSW", "110101010000100001111010010ttttt") +INST(DC_CISW, "DC CISW", "110101010000100001111110010ttttt") +INST(DC_ZVA, "DC ZVA", "110101010000101101110100001ttttt") +INST(DC_CVAC, "DC CVAC", "110101010000101101111010001ttttt") +INST(DC_CVAU, "DC CVAU", "110101010000101101111011001ttttt") +INST(DC_CVAP, "DC CVAP", "110101010000101101111100001ttttt") +INST(DC_CIVAC, "DC CIVAC", "110101010000101101111110001ttttt") + +// SYS: Instruction Cache +INST(IC_IALLU, "IC IALLU", "11010101000010000111010100011111") +INST(IC_IALLUIS, "IC IALLUIS", "11010101000010000111000100011111") +INST(IC_IVAU, "IC IVAU", "110101010000101101110101001ttttt") + +// Unconditional branch (Register) +INST(BLR, "BLR", "1101011000111111000000nnnnn00000") +INST(BR, "BR", "1101011000011111000000nnnnn00000") +//INST(DRPS, "DRPS", "11010110101111110000001111100000") +//INST(ERET, "ERET", 
"11010110100111110000001111100000") +INST(RET, "RET", "1101011001011111000000nnnnn00000") +//INST(BLRA, "BLRAA, BLRAAZ, BLRAB, BLRABZ", "1101011Z0011111100001Mnnnnnmmmmm") // ARMv8.3 +//INST(BRA, "BRAA, BRAAZ, BRAB, BRABZ", "1101011Z0001111100001Mnnnnnmmmmm") // ARMv8.3 +//INST(ERETA, "ERETAA, ERETAB", "110101101001111100001M1111111111") // ARMv8.3 +//INST(RETA, "RETAA, RETAB", "110101100101111100001M1111111111") // ARMv8.3 + +// Unconditional branch (immediate) +INST(B_uncond, "B", "000101iiiiiiiiiiiiiiiiiiiiiiiiii") +INST(BL, "BL", "100101iiiiiiiiiiiiiiiiiiiiiiiiii") + +// Compare and branch (immediate) +INST(CBZ, "CBZ", "z0110100iiiiiiiiiiiiiiiiiiittttt") +INST(CBNZ, "CBNZ", "z0110101iiiiiiiiiiiiiiiiiiittttt") +INST(TBZ, "TBZ", "b0110110bbbbbiiiiiiiiiiiiiittttt") +INST(TBNZ, "TBNZ", "b0110111bbbbbiiiiiiiiiiiiiittttt") + +// Loads and stores - Advanced SIMD Load/Store multiple structures +INST(STx_mult_1, "STx (multiple structures)", "0Q00110000000000oooozznnnnnttttt") +INST(STx_mult_2, "STx (multiple structures)", "0Q001100100mmmmmoooozznnnnnttttt") +INST(LDx_mult_1, "LDx (multiple structures)", "0Q00110001000000oooozznnnnnttttt") +INST(LDx_mult_2, "LDx (multiple structures)", "0Q001100110mmmmmoooozznnnnnttttt") + +// Loads and stores - Advanced SIMD Load/Store single structures +INST(ST1_sngl_1, "ST1 (single structure)", "0Q00110100000000oo0Szznnnnnttttt") +INST(ST1_sngl_2, "ST1 (single structure)", "0Q001101100mmmmmoo0Szznnnnnttttt") +INST(ST3_sngl_1, "ST3 (single structure)", "0Q00110100000000oo1Szznnnnnttttt") +INST(ST3_sngl_2, "ST3 (single structure)", "0Q001101100mmmmmoo1Szznnnnnttttt") +INST(ST2_sngl_1, "ST2 (single structure)", "0Q00110100100000oo0Szznnnnnttttt") +INST(ST2_sngl_2, "ST2 (single structure)", "0Q001101101mmmmmoo0Szznnnnnttttt") +INST(ST4_sngl_1, "ST4 (single structure)", "0Q00110100100000oo1Szznnnnnttttt") +INST(ST4_sngl_2, "ST4 (single structure)", "0Q001101101mmmmmoo1Szznnnnnttttt") +INST(LD1_sngl_1, "LD1 (single structure)", "0Q00110101000000oo0Szznnnnnttttt") +INST(LD1_sngl_2, "LD1 (single structure)", "0Q001101110mmmmmoo0Szznnnnnttttt") +INST(LD3_sngl_1, "LD3 (single structure)", "0Q00110101000000oo1Szznnnnnttttt") +INST(LD3_sngl_2, "LD3 (single structure)", "0Q001101110mmmmmoo1Szznnnnnttttt") +INST(LD1R_1, "LD1R", "0Q001101010000001100zznnnnnttttt") +INST(LD1R_2, "LD1R", "0Q001101110mmmmm1100zznnnnnttttt") +INST(LD3R_1, "LD3R", "0Q001101010000001110zznnnnnttttt") +INST(LD3R_2, "LD3R", "0Q001101110mmmmm1110zznnnnnttttt") +INST(LD2_sngl_1, "LD2 (single structure)", "0Q00110101100000oo0Szznnnnnttttt") +INST(LD2_sngl_2, "LD2 (single structure)", "0Q001101111mmmmmoo0Szznnnnnttttt") +INST(LD4_sngl_1, "LD4 (single structure)", "0Q00110101100000oo1Szznnnnnttttt") +INST(LD4_sngl_2, "LD4 (single structure)", "0Q001101111mmmmmoo1Szznnnnnttttt") +INST(LD2R_1, "LD2R", "0Q001101011000001100zznnnnnttttt") +INST(LD2R_2, "LD2R", "0Q001101111mmmmm1100zznnnnnttttt") +INST(LD4R_1, "LD4R", "0Q001101011000001110zznnnnnttttt") +INST(LD4R_2, "LD4R", "0Q001101111mmmmm1110zznnnnnttttt") + +// Loads and stores - Load/Store Exclusive +INST(STXR, "STXRB, STXRH, STXR", "zz001000000sssss011111nnnnnttttt") +INST(STLXR, "STLXRB, STLXRH, STLXR", "zz001000000sssss111111nnnnnttttt") +INST(STXP, "STXP", "1z001000001sssss0uuuuunnnnnttttt") +INST(STLXP, "STLXP", "1z001000001sssss1uuuuunnnnnttttt") +INST(LDXR, "LDXRB, LDXRH, LDXR", "zz00100001011111011111nnnnnttttt") +INST(LDAXR, "LDAXRB, LDAXRH, LDAXR", "zz00100001011111111111nnnnnttttt") +INST(LDXP, "LDXP", "1z001000011111110uuuuunnnnnttttt") 
+INST(LDAXP, "LDAXP", "1z001000011111111uuuuunnnnnttttt") +INST(STLLR, "STLLRB, STLLRH, STLLR", "zz00100010011111011111nnnnnttttt") +INST(STLR, "STLRB, STLRH, STLR", "zz00100010011111111111nnnnnttttt") +INST(LDLAR, "LDLARB, LDLARH, LDLAR", "zz00100011011111011111nnnnnttttt") +INST(LDAR, "LDARB, LDARH, LDAR", "zz00100011011111111111nnnnnttttt") +//INST(CASP, "CASP, CASPA, CASPAL, CASPL", "0z0010000L1sssssp11111nnnnnttttt") // ARMv8.1 +//INST(CASB, "CASB, CASAB, CASALB, CASLB", "000010001L1sssssp11111nnnnnttttt") // ARMv8.1 +//INST(CASH, "CASH, CASAH, CASALH, CASLH", "010010001L1sssssp11111nnnnnttttt") // ARMv8.1 +//INST(CAS, "CAS, CASA, CASAL, CASL", "1z0010001L1sssssp11111nnnnnttttt") // ARMv8.1 + +// Loads and stores - Load register (literal) +INST(LDR_lit_gen, "LDR (literal)", "0z011000iiiiiiiiiiiiiiiiiiittttt") +INST(LDRSW_lit, "LDRSW (literal)", "10011000iiiiiiiiiiiiiiiiiiittttt") +INST(PRFM_lit, "PRFM (literal)", "11011000iiiiiiiiiiiiiiiiiiittttt") +INST(LDR_lit_fpsimd, "LDR (literal, SIMD&FP)", "oo011100iiiiiiiiiiiiiiiiiiittttt") + +// Loads and stores - Load/Store no-allocate pair +INST(STNP_LDNP_gen, "STNP/LDNP", "o01010000Liiiiiiiuuuuunnnnnttttt") +INST(STNP_LDNP_fpsimd, "STNP/LDNP (SIMD&FP)", "oo1011000Liiiiiiiuuuuunnnnnttttt") + +// Loads and stores - Load/Store register pair +INST(STP_LDP_gen, "STP/LDP", "oo10100pwLiiiiiiiuuuuunnnnnttttt") +INST(UnallocatedEncoding, "", "--1010000-----------------------") +INST(STP_LDP_fpsimd, "STP/LDP (SIMD&FP)", "oo10110pwLiiiiiiiuuuuunnnnnttttt") +INST(UnallocatedEncoding, "", "--1011000-----------------------") + +// Loads and stores - Load/Store register (unscaled immediate) +INST(STURx_LDURx, "STURx/LDURx", "zz111000oo0iiiiiiiii00nnnnnttttt") +INST(UnallocatedEncoding, "", "111110001-0---------00----------") +INST(UnallocatedEncoding, "", "10111000110---------00----------") +INST(PRFM_imm, "PRFM (immediate)", "1111100110iiiiiiiiiiiinnnnnttttt") +INST(PRFM_unscaled_imm, "PRFM (unscaled offset)", "11111000100iiiiiiiii00nnnnnttttt") +INST(STUR_fpsimd, "STUR (SIMD&FP)", "zz111100o00iiiiiiiii00nnnnnttttt") +INST(LDUR_fpsimd, "LDUR (SIMD&FP)", "zz111100o10iiiiiiiii00nnnnnttttt") + +// Loads and stores - Load/Store register (immediate pre/post-indexed) +INST(STRx_LDRx_imm_1, "STRx/LDRx (immediate)", "zz111000oo0iiiiiiiiip1nnnnnttttt") +INST(STRx_LDRx_imm_2, "STRx/LDRx (immediate)", "zz111001ooiiiiiiiiiiiinnnnnttttt") +INST(UnallocatedEncoding, "", "111110001-0----------1----------") +INST(UnallocatedEncoding, "", "10111000110----------1----------") +INST(UnallocatedEncoding, "", "1111100111----------------------") +INST(UnallocatedEncoding, "", "1011100111----------------------") +INST(STR_imm_fpsimd_1, "STR (immediate, SIMD&FP)", "zz111100o00iiiiiiiiip1nnnnnttttt") +INST(STR_imm_fpsimd_2, "STR (immediate, SIMD&FP)", "zz111101o0iiiiiiiiiiiinnnnnttttt") +INST(LDR_imm_fpsimd_1, "LDR (immediate, SIMD&FP)", "zz111100o10iiiiiiiiip1nnnnnttttt") +INST(LDR_imm_fpsimd_2, "LDR (immediate, SIMD&FP)", "zz111101o1iiiiiiiiiiiinnnnnttttt") +//INST(STGP_1, "STGP (post-index)", "0110100010iiiiiiimmmmmnnnnnttttt") // ARMv8.5 +//INST(STGP_2, "STGP (pre-index)", "0110100110iiiiiiimmmmmnnnnnttttt") // ARMv8.5 +//INST(STGP_3, "STGP (signed-offset)", "0110100100iiiiiiimmmmmnnnnnttttt") // ARMv8.5 + +// Loads and stores - Load/Store register (unprivileged) +INST(STTRB, "STTRB", "00111000000iiiiiiiii10nnnnnttttt") +INST(LDTRB, "LDTRB", "00111000010iiiiiiiii10nnnnnttttt") +INST(LDTRSB, "LDTRSB", "00111000oo0iiiiiiiii10nnnnnttttt") +INST(STTRH, "STTRH", 
"01111000000iiiiiiiii10nnnnnttttt") +INST(LDTRH, "LDTRH", "01111000010iiiiiiiii10nnnnnttttt") +INST(LDTRSH, "LDTRSH", "01111000oo0iiiiiiiii10nnnnnttttt") +INST(STTR, "STTR", "zz111000000iiiiiiiii10nnnnnttttt") +INST(LDTR, "LDTR", "zz111000010iiiiiiiii10nnnnnttttt") +INST(LDTRSW, "LDTRSW", "10111000100iiiiiiiii10nnnnnttttt") + +// Loads and stores - Atomic memory options +//INST(LDADDB, "LDADDB, LDADDAB, LDADDALB, LDADDLB", "00111000AR1sssss000000nnnnnttttt") +//INST(LDCLRB, "LDCLRB, LDCLRAB, LDCLRALB, LDCLRLB", "00111000AR1sssss000100nnnnnttttt") +//INST(LDEORB, "LDEORB, LDEORAB, LDEORALB, LDEORLB", "00111000AR1sssss001000nnnnnttttt") +//INST(LDSETB, "LDSETB, LDSETAB, LDSETALB, LDSETLB", "00111000AR1sssss001100nnnnnttttt") +//INST(LDSMAXB, "LDSMAXB, LDSMAXAB, LDSMAXALB, LDSMAXLB", "00111000AR1sssss010000nnnnnttttt") +//INST(LDSMINB, "LDSMINB, LDSMINAB, LDSMINALB, LDSMINLB", "00111000AR1sssss010100nnnnnttttt") +//INST(LDUMAXB, "LDUMAXB, LDUMAXAB, LDUMAXALB, LDUMAXLB", "00111000AR1sssss011000nnnnnttttt") +//INST(LDUMINB, "LDUMINB, LDUMINAB, LDUMINALB, LDUMINLB", "00111000AR1sssss011100nnnnnttttt") +//INST(SWPB, "SWPB, SWPAB, SWPALB, SWPLB", "00111000AR1sssss100000nnnnnttttt") +//INST(LDAPRB, "LDAPRB", "0011100010111111110000nnnnnttttt") +//INST(LDADDH, "LDADDH, LDADDAH, LDADDALH, LDADDLH", "01111000AR1sssss000000nnnnnttttt") +//INST(LDCLRH, "LDCLRH, LDCLRAH, LDCLRALH, LDCLRLH", "01111000AR1sssss000100nnnnnttttt") +//INST(LDEORH, "LDEORH, LDEORAH, LDEORALH, LDEORLH", "01111000AR1sssss001000nnnnnttttt") +//INST(LDSETH, "LDSETH, LDSETAH, LDSETALH, LDSETLH", "01111000AR1sssss001100nnnnnttttt") +//INST(LDSMAXH, "LDSMAXH, LDSMAXAH, LDSMAXALH, LDSMAXLH", "01111000AR1sssss010000nnnnnttttt") +//INST(LDSMINH, "LDSMINH, LDSMINAH, LDSMINALH, LDSMINLH", "01111000AR1sssss010100nnnnnttttt") +//INST(LDUMAXH, "LDUMAXH, LDUMAXAH, LDUMAXALH, LDUMAXLH", "01111000AR1sssss011000nnnnnttttt") +//INST(LDUMINH, "LDUMINH, LDUMINAH, LDUMINALH, LDUMINLH", "01111000AR1sssss011100nnnnnttttt") +//INST(SWPH, "SWPH, SWPAH, SWPALH, SWPLH", "01111000AR1sssss100000nnnnnttttt") +//INST(LDAPRH, "LDAPRH", "0111100010111111110000nnnnnttttt") +//INST(LDADD, "LDADD, LDADDA, LDADDAL, LDADDL", "1-111000AR1sssss000000nnnnnttttt") +//INST(LDCLR, "LDCLR, LDCLRA, LDCLRAL, LDCLRL", "1-111000AR1sssss000100nnnnnttttt") +//INST(LDEOR, "LDEOR, LDEORA, LDEORAL, LDEORL", "1-111000AR1sssss001000nnnnnttttt") +//INST(LDSET, "LDSET, LDSETA, LDSETAL, LDSETL", "1-111000AR1sssss001100nnnnnttttt") +//INST(LDSMAX, "LDSMAX, LDSMAXA, LDSMAXAL, LDSMAXL", "1-111000AR1sssss010000nnnnnttttt") +//INST(LDSMIN, "LDSMIN, LDSMINA, LDSMINAL, LDSMINL", "1-111000AR1sssss010100nnnnnttttt") +//INST(LDUMAX, "LDUMAX, LDUMAXA, LDUMAXAL, LDUMAXL", "1-111000AR1sssss011000nnnnnttttt") +//INST(LDUMIN, "LDUMIN, LDUMINA, LDUMINAL, LDUMINL", "1-111000AR1sssss011100nnnnnttttt") +//INST(SWP, "SWP, SWPA, SWPAL, SWPL", "1-111000AR1sssss100000nnnnnttttt") +//INST(LDAPR, "LDAPR", "1-11100010111111110000nnnnnttttt") +//INST(LD64B, "LD64B", "1111100000111111110100nnnnnttttt") // v8.7 +//INST(ST64B, "ST64B", "1111100000111111100100nnnnnttttt") // v8.7 +//INST(ST64BV, "ST64BV", "11111000001sssss101100nnnnnttttt") // v8.7 +//INST(ST64BV0, "ST64BV0", "11111000001sssss101000nnnnnttttt") // v8.7 + +// Loads and stores - Load/Store register (register offset) +INST(STRx_reg, "STRx (register)", "zz111000o01mmmmmxxxS10nnnnnttttt") +INST(LDRx_reg, "LDRx (register)", "zz111000o11mmmmmxxxS10nnnnnttttt") +INST(STR_reg_fpsimd, "STR (register, SIMD&FP)", "zz111100o01mmmmmxxxS10nnnnnttttt") 
+INST(LDR_reg_fpsimd, "LDR (register, SIMD&FP)", "zz111100o11mmmmmxxxS10nnnnnttttt") + +// Loads and stores - Load/Store memory tags +//INST(STG_1, "STG (post-index)", "11011001001iiiiiiiii01nnnnn11111") // ARMv8.5 +//INST(STG_2, "STG (pre-index)", "11011001001iiiiiiiii11nnnnn11111") // ARMv8.5 +//INST(STG_3, "STG (signed-offset)", "11011001001iiiiiiiii10nnnnn11111") // ARMv8.5 +//INST(LDG, "LDG", "11011001011iiiiiiiii00nnnnnttttt") // ARMv8.5 +//INST(STZG_1, "STZG (post-index)", "11011001011iiiiiiiii01nnnnn11111") // ARMv8.5 +//INST(STZG_2, "STZG (pre-index)", "11011001011iiiiiiiii11nnnnn11111") // ARMv8.5 +//INST(STZG_3, "STZG (signed-offset)", "11011001011iiiiiiiii10nnnnn11111") // ARMv8.5 +//INST(ST2G_1, "ST2G (post-index)", "11011001101iiiiiiiii01nnnnn11111") // ARMv8.5 +//INST(ST2G_2, "ST2G (pre-index)", "11011001101iiiiiiiii11nnnnn11111") // ARMv8.5 +//INST(ST2G_3, "ST2G (signed-offset)", "11011001101iiiiiiiii10nnnnn11111") // ARMv8.5 +//INST(STGV, "STGV", "1101100110100000000000nnnnnttttt") // ARMv8.5 +//INST(STZ2G_1, "STZ2G (post-index)", "11011001111iiiiiiiii01nnnnn11111") // ARMv8.5 +//INST(STZ2G_2, "STZ2G (pre-index)", "11011001111iiiiiiiii11nnnnn11111") // ARMv8.5 +//INST(STZ2G_3, "STZ2G (signed-offset)", "11011001111iiiiiiiii10nnnnn11111") // ARMv8.5 +//INST(LDGV, "LDGV", "1101100111100000000000nnnnnttttt") // ARMv8.5 + +// Loads and stores - Load/Store register (pointer authentication) +//INST(LDRA, "LDRAA, LDRAB", "11111000MS1iiiiiiiiiW1nnnnnttttt") + +// Data Processing - Register - 2 source +INST(UDIV, "UDIV", "z0011010110mmmmm000010nnnnnddddd") +INST(SDIV, "SDIV", "z0011010110mmmmm000011nnnnnddddd") +INST(LSLV, "LSLV", "z0011010110mmmmm001000nnnnnddddd") +INST(LSRV, "LSRV", "z0011010110mmmmm001001nnnnnddddd") +INST(ASRV, "ASRV", "z0011010110mmmmm001010nnnnnddddd") +INST(RORV, "RORV", "z0011010110mmmmm001011nnnnnddddd") +INST(CRC32, "CRC32B, CRC32H, CRC32W, CRC32X", "z0011010110mmmmm0100zznnnnnddddd") +INST(CRC32C, "CRC32CB, CRC32CH, CRC32CW, CRC32CX", "z0011010110mmmmm0101zznnnnnddddd") +//INST(PACGA, "PACGA", "10011010110mmmmm001100nnnnnddddd") +//INST(SUBP, "SUBP", "10011010110mmmmm000000nnnnnddddd") // ARMv8.5 +//INST(IRG, "IRG", "10011010110mmmmm000100nnnnnddddd") // ARMv8.5 +//INST(GMI, "GMI", "10011010110mmmmm000101nnnnnddddd") // ARMv8.5 +//INST(SUBPS, "SUBPS", "10111010110mmmmm000000nnnnnddddd") // ARMv8.5 + +// Data Processing - Register - 1 source +INST(RBIT_int, "RBIT", "z101101011000000000000nnnnnddddd") +INST(REV16_int, "REV16", "z101101011000000000001nnnnnddddd") +INST(REV, "REV", "z10110101100000000001onnnnnddddd") +INST(CLZ_int, "CLZ", "z101101011000000000100nnnnnddddd") +INST(CLS_int, "CLS", "z101101011000000000101nnnnnddddd") +INST(REV32_int, "REV32", "1101101011000000000010nnnnnddddd") +//INST(PACDA, "PACDA, PACDZA", "110110101100000100Z010nnnnnddddd") +//INST(PACDB, "PACDB, PACDZB", "110110101100000100Z011nnnnnddddd") +//INST(AUTDA, "AUTDA, AUTDZA", "110110101100000100Z110nnnnnddddd") +//INST(AUTDB, "AUTDB, AUTDZB", "110110101100000100Z111nnnnnddddd") + +// Data Processing - Register - Logical (shifted register) +INST(AND_shift, "AND (shifted register)", "z0001010ss0mmmmmiiiiiinnnnnddddd") +INST(BIC_shift, "BIC (shifted register)", "z0001010ss1mmmmmiiiiiinnnnnddddd") +INST(ORR_shift, "ORR (shifted register)", "z0101010ss0mmmmmiiiiiinnnnnddddd") +INST(ORN_shift, "ORN (shifted register)", "z0101010ss1mmmmmiiiiiinnnnnddddd") +INST(EOR_shift, "EOR (shifted register)", "z1001010ss0mmmmmiiiiiinnnnnddddd") +INST(EON, "EON (shifted register)", 
"z1001010ss1mmmmmiiiiiinnnnnddddd") +INST(ANDS_shift, "ANDS (shifted register)", "z1101010ss0mmmmmiiiiiinnnnnddddd") +INST(BICS, "BICS (shifted register)", "z1101010ss1mmmmmiiiiiinnnnnddddd") + +// Data Processing - Register - Add/Sub (shifted register) +INST(ADD_shift, "ADD (shifted register)", "z0001011ss0mmmmmiiiiiinnnnnddddd") +INST(ADDS_shift, "ADDS (shifted register)", "z0101011ss0mmmmmiiiiiinnnnnddddd") +INST(SUB_shift, "SUB (shifted register)", "z1001011ss0mmmmmiiiiiinnnnnddddd") +INST(SUBS_shift, "SUBS (shifted register)", "z1101011ss0mmmmmiiiiiinnnnnddddd") + +// Data Processing - Register - Add/Sub (shifted register) +INST(ADD_ext, "ADD (extended register)", "z0001011001mmmmmxxxiiinnnnnddddd") +INST(ADDS_ext, "ADDS (extended register)", "z0101011001mmmmmxxxiiinnnnnddddd") +INST(SUB_ext, "SUB (extended register)", "z1001011001mmmmmxxxiiinnnnnddddd") +INST(SUBS_ext, "SUBS (extended register)", "z1101011001mmmmmxxxiiinnnnnddddd") + +// Data Processing - Register - Add/Sub (with carry) +INST(ADC, "ADC", "z0011010000mmmmm000000nnnnnddddd") +INST(ADCS, "ADCS", "z0111010000mmmmm000000nnnnnddddd") +INST(SBC, "SBC", "z1011010000mmmmm000000nnnnnddddd") +INST(SBCS, "SBCS", "z1111010000mmmmm000000nnnnnddddd") + +// Data Processing - Register - Conditional compare +INST(CCMN_reg, "CCMN (register)", "z0111010010mmmmmcccc00nnnnn0ffff") +INST(CCMP_reg, "CCMP (register)", "z1111010010mmmmmcccc00nnnnn0ffff") +INST(CCMN_imm, "CCMN (immediate)", "z0111010010iiiiicccc10nnnnn0ffff") +INST(CCMP_imm, "CCMP (immediate)", "z1111010010iiiiicccc10nnnnn0ffff") + +// Data Processing - Register - Conditional select +INST(CSEL, "CSEL", "z0011010100mmmmmcccc00nnnnnddddd") +INST(CSINC, "CSINC", "z0011010100mmmmmcccc01nnnnnddddd") +INST(CSINV, "CSINV", "z1011010100mmmmmcccc00nnnnnddddd") +INST(CSNEG, "CSNEG", "z1011010100mmmmmcccc01nnnnnddddd") + +// Data Processing - Register - 3 source +INST(MADD, "MADD", "z0011011000mmmmm0aaaaannnnnddddd") +INST(MSUB, "MSUB", "z0011011000mmmmm1aaaaannnnnddddd") +INST(SMADDL, "SMADDL", "10011011001mmmmm0aaaaannnnnddddd") +INST(SMSUBL, "SMSUBL", "10011011001mmmmm1aaaaannnnnddddd") +INST(SMULH, "SMULH", "10011011010mmmmm011111nnnnnddddd") +INST(UMADDL, "UMADDL", "10011011101mmmmm0aaaaannnnnddddd") +INST(UMSUBL, "UMSUBL", "10011011101mmmmm1aaaaannnnnddddd") +INST(UMULH, "UMULH", "10011011110mmmmm011111nnnnnddddd") + +// Data Processing - FP and SIMD - AES +INST(AESE, "AESE", "0100111000101000010010nnnnnddddd") +INST(AESD, "AESD", "0100111000101000010110nnnnnddddd") +INST(AESMC, "AESMC", "0100111000101000011010nnnnnddddd") +INST(AESIMC, "AESIMC", "0100111000101000011110nnnnnddddd") + +// Data Processing - FP and SIMD - SHA +INST(SHA1C, "SHA1C", "01011110000mmmmm000000nnnnnddddd") +INST(SHA1P, "SHA1P", "01011110000mmmmm000100nnnnnddddd") +INST(SHA1M, "SHA1M", "01011110000mmmmm001000nnnnnddddd") +INST(SHA1SU0, "SHA1SU0", "01011110000mmmmm001100nnnnnddddd") +INST(SHA256H, "SHA256H", "01011110000mmmmm010000nnnnnddddd") +INST(SHA256H2, "SHA256H2", "01011110000mmmmm010100nnnnnddddd") +INST(SHA256SU1, "SHA256SU1", "01011110000mmmmm011000nnnnnddddd") +INST(SHA1H, "SHA1H", "0101111000101000000010nnnnnddddd") +INST(SHA1SU1, "SHA1SU1", "0101111000101000000110nnnnnddddd") +INST(SHA256SU0, "SHA256SU0", "0101111000101000001010nnnnnddddd") + +// Data Processing - FP and SIMD - Scalar copy +INST(DUP_elt_1, "DUP (element)", "01011110000iiiii000001nnnnnddddd") + +// Data Processing - FP and SIMD - Scalar three +//INST(FMULX_vec_1, "FMULX", "01011110010mmmmm000111nnnnnddddd") +INST(FMULX_vec_2, 
"FMULX", "010111100z1mmmmm110111nnnnnddddd") +INST(FCMEQ_reg_1, "FCMEQ (register)", "01011110010mmmmm001001nnnnnddddd") +INST(FCMEQ_reg_2, "FCMEQ (register)", "010111100z1mmmmm111001nnnnnddddd") +INST(FRECPS_1, "FRECPS", "01011110010mmmmm001111nnnnnddddd") +INST(FRECPS_2, "FRECPS", "010111100z1mmmmm111111nnnnnddddd") +INST(FRSQRTS_1, "FRSQRTS", "01011110110mmmmm001111nnnnnddddd") +INST(FRSQRTS_2, "FRSQRTS", "010111101z1mmmmm111111nnnnnddddd") +//INST(FCMGE_reg_1, "FCMGE (register)", "01111110010mmmmm001001nnnnnddddd") +INST(FCMGE_reg_2, "FCMGE (register)", "011111100z1mmmmm111001nnnnnddddd") +//INST(FACGE_1, "FACGE", "01111110010mmmmm001011nnnnnddddd") +INST(FACGE_2, "FACGE", "011111100z1mmmmm111011nnnnnddddd") +//INST(FABD_1, "FABD", "01111110110mmmmm000101nnnnnddddd") +INST(FABD_2, "FABD", "011111101z1mmmmm110101nnnnnddddd") +//INST(FCMGT_reg_1, "FCMGT (register)", "01111110110mmmmm001001nnnnnddddd") +INST(FCMGT_reg_2, "FCMGT (register)", "011111101z1mmmmm111001nnnnnddddd") +//INST(FACGT_1, "FACGT", "01111110110mmmmm001011nnnnnddddd") +INST(FACGT_2, "FACGT", "011111101z1mmmmm111011nnnnnddddd") + +// Data Processing - FP and SIMD - Scalar two register misc +//INST(FCVTNS_1, "FCVTNS (vector)", "0101111001111001101010nnnnnddddd") +INST(FCVTNS_2, "FCVTNS (vector)", "010111100z100001101010nnnnnddddd") +//INST(FCVTMS_1, "FCVTMS (vector)", "0101111001111001101110nnnnnddddd") +INST(FCVTMS_2, "FCVTMS (vector)", "010111100z100001101110nnnnnddddd") +//INST(FCVTAS_1, "FCVTAS (vector)", "0101111001111001110010nnnnnddddd") +INST(FCVTAS_2, "FCVTAS (vector)", "010111100z100001110010nnnnnddddd") +//INST(SCVTF_int_1, "SCVTF (vector, integer)", "0101111001111001110110nnnnnddddd") +INST(SCVTF_int_2, "SCVTF (vector, integer)", "010111100z100001110110nnnnnddddd") +//INST(FCMGT_zero_1, "FCMGT (zero)", "0101111011111000110010nnnnnddddd") +INST(FCMGT_zero_2, "FCMGT (zero)", "010111101z100000110010nnnnnddddd") +INST(FCMEQ_zero_1, "FCMEQ (zero)", "0101111011111000110110nnnnnddddd") +INST(FCMEQ_zero_2, "FCMEQ (zero)", "010111101z100000110110nnnnnddddd") +//INST(FCMLT_1, "FCMLT (zero)", "0101111011111000111010nnnnnddddd") +INST(FCMLT_2, "FCMLT (zero)", "010111101z100000111010nnnnnddddd") +//INST(FCVTPS_1, "FCVTPS (vector)", "0101111011111001101010nnnnnddddd") +INST(FCVTPS_2, "FCVTPS (vector)", "010111101z100001101010nnnnnddddd") +//INST(FCVTZS_int_1, "FCVTZS (vector, integer)", "0101111011111001101110nnnnnddddd") +INST(FCVTZS_int_2, "FCVTZS (vector, integer)", "010111101z100001101110nnnnnddddd") +INST(FRECPE_1, "FRECPE", "0101111011111001110110nnnnnddddd") +INST(FRECPE_2, "FRECPE", "010111101z100001110110nnnnnddddd") +INST(FRECPX_1, "FRECPX", "0101111011111001111110nnnnnddddd") +INST(FRECPX_2, "FRECPX", "010111101z100001111110nnnnnddddd") +//INST(FCVTNU_1, "FCVTNU (vector)", "0111111001111001101010nnnnnddddd") +INST(FCVTNU_2, "FCVTNU (vector)", "011111100z100001101010nnnnnddddd") +//INST(FCVTMU_1, "FCVTMU (vector)", "0111111001111001101110nnnnnddddd") +INST(FCVTMU_2, "FCVTMU (vector)", "011111100z100001101110nnnnnddddd") +//INST(FCVTAU_1, "FCVTAU (vector)", "0111111001111001110010nnnnnddddd") +INST(FCVTAU_2, "FCVTAU (vector)", "011111100z100001110010nnnnnddddd") +//INST(UCVTF_int_1, "UCVTF (vector, integer)", "0111111001111001110110nnnnnddddd") +INST(UCVTF_int_2, "UCVTF (vector, integer)", "011111100z100001110110nnnnnddddd") +//INST(FCMGE_zero_1, "FCMGE (zero)", "0111111011111000110010nnnnnddddd") +INST(FCMGE_zero_2, "FCMGE (zero)", "011111101z100000110010nnnnnddddd") +//INST(FCMLE_1, "FCMLE (zero)", 
"0111111011111000110110nnnnnddddd") +INST(FCMLE_2, "FCMLE (zero)", "011111101z100000110110nnnnnddddd") +//INST(FCVTPU_1, "FCVTPU (vector)", "0111111011111001101010nnnnnddddd") +INST(FCVTPU_2, "FCVTPU (vector)", "011111101z100001101010nnnnnddddd") +//INST(FCVTZU_int_1, "FCVTZU (vector, integer)", "0111111011111001101110nnnnnddddd") +INST(FCVTZU_int_2, "FCVTZU (vector, integer)", "011111101z100001101110nnnnnddddd") +INST(FRSQRTE_1, "FRSQRTE", "0111111011111001110110nnnnnddddd") +INST(FRSQRTE_2, "FRSQRTE", "011111101z100001110110nnnnnddddd") + +// Data Processing - FP and SIMD - Scalar three same extra +//INST(SQRDMLAH_vec_1, "SQRDMLAH (vector)", "01111110zz0mmmmm100001nnnnnddddd") +//INST(SQRDMLAH_vec_2, "SQRDMLAH (vector)", "0Q101110zz0mmmmm100001nnnnnddddd") +//INST(SQRDMLSH_vec_1, "SQRDMLSH (vector)", "01111110zz0mmmmm100011nnnnnddddd") +//INST(SQRDMLSH_vec_2, "SQRDMLSH (vector)", "0Q101110zz0mmmmm100011nnnnnddddd") + +// Data Processing - FP and SIMD - Scalar two-register misc +INST(SUQADD_1, "SUQADD", "01011110zz100000001110nnnnnddddd") +INST(SQABS_1, "SQABS", "01011110zz100000011110nnnnnddddd") +INST(CMGT_zero_1, "CMGT (zero)", "01011110zz100000100010nnnnnddddd") +INST(CMEQ_zero_1, "CMEQ (zero)", "01011110zz100000100110nnnnnddddd") +INST(CMLT_1, "CMLT (zero)", "01011110zz100000101010nnnnnddddd") +INST(ABS_1, "ABS", "01011110zz100000101110nnnnnddddd") +INST(SQXTN_1, "SQXTN, SQXTN2", "01011110zz100001010010nnnnnddddd") +INST(USQADD_1, "USQADD", "01111110zz100000001110nnnnnddddd") +INST(SQNEG_1, "SQNEG", "01111110zz100000011110nnnnnddddd") +INST(CMGE_zero_1, "CMGE (zero)", "01111110zz100000100010nnnnnddddd") +INST(CMLE_1, "CMLE (zero)", "01111110zz100000100110nnnnnddddd") +INST(NEG_1, "NEG (vector)", "01111110zz100000101110nnnnnddddd") +INST(SQXTUN_1, "SQXTUN, SQXTUN2", "01111110zz100001001010nnnnnddddd") +INST(UQXTN_1, "UQXTN, UQXTN2", "01111110zz100001010010nnnnnddddd") +INST(FCVTXN_1, "FCVTXN, FCVTXN2", "011111100z100001011010nnnnnddddd") + +// Data Processing - FP and SIMD - SIMD Scalar pairwise +INST(ADDP_pair, "ADDP (scalar)", "01011110zz110001101110nnnnnddddd") +//INST(FMAXNMP_pair_1, "FMAXNMP (scalar)", "0101111000110000110010nnnnnddddd") +INST(FMAXNMP_pair_2, "FMAXNMP (scalar)", "011111100z110000110010nnnnnddddd") +//INST(FADDP_pair_1, "FADDP (scalar)", "0101111000110000110110nnnnnddddd") +INST(FADDP_pair_2, "FADDP (scalar)", "011111100z110000110110nnnnnddddd") +//INST(FMAXP_pair_1, "FMAXP (scalar)", "0101111000110000111110nnnnnddddd") +INST(FMAXP_pair_2, "FMAXP (scalar)", "011111100z110000111110nnnnnddddd") +//INST(FMINNMP_pair_1, "FMINNMP (scalar)", "0101111010110000110010nnnnnddddd") +INST(FMINNMP_pair_2, "FMINNMP (scalar)", "011111101z110000110010nnnnnddddd") +//INST(FMINP_pair_1, "FMINP (scalar)", "0101111010110000111110nnnnnddddd") +INST(FMINP_pair_2, "FMINP (scalar)", "011111101z110000111110nnnnnddddd") + +// Data Processing - FP and SIMD - SIMD Scalar three different +//INST(SQDMLAL_vec_1, "SQDMLAL, SQDMLAL2 (vector)", "01011110zz1mmmmm100100nnnnnddddd") +//INST(SQDMLSL_vec_1, "SQDMLSL, SQDMLSL2 (vector)", "01011110zz1mmmmm101100nnnnnddddd") +//INST(SQDMULL_vec_1, "SQDMULL, SQDMULL2 (vector)", "01011110zz1mmmmm110100nnnnnddddd") + +// Data Processing - FP and SIMD - SIMD Scalar three same +INST(SQADD_1, "SQADD", "01011110zz1mmmmm000011nnnnnddddd") +INST(SQSUB_1, "SQSUB", "01011110zz1mmmmm001011nnnnnddddd") +INST(CMGT_reg_1, "CMGT (register)", "01011110zz1mmmmm001101nnnnnddddd") +INST(CMGE_reg_1, "CMGE (register)", "01011110zz1mmmmm001111nnnnnddddd") +INST(SSHL_1, "SSHL", 
"01011110zz1mmmmm010001nnnnnddddd") +INST(SQSHL_reg_1, "SQSHL (register)", "01011110zz1mmmmm010011nnnnnddddd") +INST(SRSHL_1, "SRSHL", "01011110zz1mmmmm010101nnnnnddddd") +//INST(SQRSHL_1, "SQRSHL", "01011110zz1mmmmm010111nnnnnddddd") +INST(ADD_1, "ADD (vector)", "01011110zz1mmmmm100001nnnnnddddd") +INST(CMTST_1, "CMTST", "01011110zz1mmmmm100011nnnnnddddd") +INST(SQDMULH_vec_1, "SQDMULH (vector)", "01011110zz1mmmmm101101nnnnnddddd") +INST(UQADD_1, "UQADD", "01111110zz1mmmmm000011nnnnnddddd") +INST(UQSUB_1, "UQSUB", "01111110zz1mmmmm001011nnnnnddddd") +INST(CMHI_1, "CMHI (register)", "01111110zz1mmmmm001101nnnnnddddd") +INST(CMHS_1, "CMHS (register)", "01111110zz1mmmmm001111nnnnnddddd") +INST(USHL_1, "USHL", "01111110zz1mmmmm010001nnnnnddddd") +INST(UQSHL_reg_1, "UQSHL (register)", "01111110zz1mmmmm010011nnnnnddddd") +INST(URSHL_1, "URSHL", "01111110zz1mmmmm010101nnnnnddddd") +//INST(UQRSHL_1, "UQRSHL", "01111110zz1mmmmm010111nnnnnddddd") +INST(SUB_1, "SUB (vector)", "01111110zz1mmmmm100001nnnnnddddd") +INST(CMEQ_reg_1, "CMEQ (register)", "01111110zz1mmmmm100011nnnnnddddd") +INST(SQRDMULH_vec_1, "SQRDMULH (vector)", "01111110zz1mmmmm101101nnnnnddddd") + +// Data Processing - FP and SIMD - SIMD Scalar shift by immediate +INST(SSHR_1, "SSHR", "010111110IIIIiii000001nnnnnddddd") +INST(SSRA_1, "SSRA", "010111110IIIIiii000101nnnnnddddd") +INST(SRSHR_1, "SRSHR", "010111110IIIIiii001001nnnnnddddd") +INST(SRSRA_1, "SRSRA", "010111110IIIIiii001101nnnnnddddd") +INST(SHL_1, "SHL", "010111110IIIIiii010101nnnnnddddd") +INST(SQSHL_imm_1, "SQSHL (immediate)", "010111110IIIIiii011101nnnnnddddd") +INST(SQSHRN_1, "SQSHRN, SQSHRN2", "010111110IIIIiii100101nnnnnddddd") +//INST(SQRSHRN_1, "SQRSHRN, SQRSHRN2", "010111110IIIIiii100111nnnnnddddd") +INST(SCVTF_fix_1, "SCVTF (vector, fixed-point)", "010111110IIIIiii111001nnnnnddddd") +INST(FCVTZS_fix_1, "FCVTZS (vector, fixed-point)", "010111110IIIIiii111111nnnnnddddd") +INST(USHR_1, "USHR", "011111110IIIIiii000001nnnnnddddd") +INST(USRA_1, "USRA", "011111110IIIIiii000101nnnnnddddd") +INST(URSHR_1, "URSHR", "011111110IIIIiii001001nnnnnddddd") +INST(URSRA_1, "URSRA", "011111110IIIIiii001101nnnnnddddd") +INST(SRI_1, "SRI", "011111110IIIIiii010001nnnnnddddd") +INST(SLI_1, "SLI", "011111110IIIIiii010101nnnnnddddd") +INST(SQSHLU_1, "SQSHLU", "011111110IIIIiii011001nnnnnddddd") +INST(UQSHL_imm_1, "UQSHL (immediate)", "011111110IIIIiii011101nnnnnddddd") +INST(SQSHRUN_1, "SQSHRUN, SQSHRUN2", "011111110IIIIiii100001nnnnnddddd") +//INST(SQRSHRUN_1, "SQRSHRUN, SQRSHRUN2", "011111110IIIIiii100011nnnnnddddd") +INST(UQSHRN_1, "UQSHRN, UQSHRN2", "011111110IIIIiii100101nnnnnddddd") +//INST(UQRSHRN_1, "UQRSHRN, UQRSHRN2", "011111110IIIIiii100111nnnnnddddd") +INST(UCVTF_fix_1, "UCVTF (vector, fixed-point)", "011111110IIIIiii111001nnnnnddddd") +INST(FCVTZU_fix_1, "FCVTZU (vector, fixed-point)", "011111110IIIIiii111111nnnnnddddd") + +// Data Processing - FP and SIMD - SIMD Scalar x indexed element +//INST(SQDMLAL_elt_1, "SQDMLAL, SQDMLAL2 (by element)", "01011111zzLMmmmm0011H0nnnnnddddd") +//INST(SQDMLSL_elt_1, "SQDMLSL, SQDMLSL2 (by element)", "01011111zzLMmmmm0111H0nnnnnddddd") +INST(SQDMULL_elt_1, "SQDMULL, SQDMULL2 (by element)", "01011111zzLMmmmm1011H0nnnnnddddd") +INST(SQDMULH_elt_1, "SQDMULH (by element)", "01011111zzLMmmmm1100H0nnnnnddddd") +INST(SQRDMULH_elt_1, "SQRDMULH (by element)", "01011111zzLMmmmm1101H0nnnnnddddd") +INST(FMLA_elt_1, "FMLA (by element)", "0101111100LMmmmm0001H0nnnnnddddd") +INST(FMLA_elt_2, "FMLA (by element)", "010111111zLMmmmm0001H0nnnnnddddd") 
+INST(FMLS_elt_1, "FMLS (by element)", "0101111100LMmmmm0101H0nnnnnddddd") +INST(FMLS_elt_2, "FMLS (by element)", "010111111zLMmmmm0101H0nnnnnddddd") +//INST(FMUL_elt_1, "FMUL (by element)", "0101111100LMmmmm1001H0nnnnnddddd") +INST(FMUL_elt_2, "FMUL (by element)", "010111111zLMmmmm1001H0nnnnnddddd") +//INST(SQRDMLAH_elt_1, "SQRDMLAH (by element)", "01111111zzLMmmmm1101H0nnnnnddddd") +//INST(SQRDMLSH_elt_1, "SQRDMLSH (by element)", "01111111zzLMmmmm1111H0nnnnnddddd") +//INST(FMULX_elt_1, "FMULX (by element)", "0111111100LMmmmm1001H0nnnnnddddd") +INST(FMULX_elt_2, "FMULX (by element)", "011111111zLMmmmm1001H0nnnnnddddd") + +// Data Processing - FP and SIMD - SIMD Table Lookup +INST(TBL, "TBL", "0Q001110000mmmmm0LL000nnnnnddddd") +INST(TBX, "TBX", "0Q001110000mmmmm0LL100nnnnnddddd") + +// Data Processing - FP and SIMD - SIMD Permute +INST(UZP1, "UZP1", "0Q001110zz0mmmmm000110nnnnnddddd") +INST(TRN1, "TRN1", "0Q001110zz0mmmmm001010nnnnnddddd") +INST(ZIP1, "ZIP1", "0Q001110zz0mmmmm001110nnnnnddddd") +INST(UZP2, "UZP2", "0Q001110zz0mmmmm010110nnnnnddddd") +INST(TRN2, "TRN2", "0Q001110zz0mmmmm011010nnnnnddddd") +INST(ZIP2, "ZIP2", "0Q001110zz0mmmmm011110nnnnnddddd") + +// Data Processing - FP and SIMD - SIMD Extract +INST(EXT, "EXT", "0Q101110000mmmmm0iiii0nnnnnddddd") + +// Data Processing - FP and SIMD - SIMD Copy +INST(DUP_elt_2, "DUP (element)", "0Q001110000iiiii000001nnnnnddddd") +INST(DUP_gen, "DUP (general)", "0Q001110000iiiii000011nnnnnddddd") +INST(SMOV, "SMOV", "0Q001110000iiiii001011nnnnnddddd") +INST(UMOV, "UMOV", "0Q001110000iiiii001111nnnnnddddd") +INST(INS_gen, "INS (general)", "01001110000iiiii000111nnnnnddddd") +INST(INS_elt, "INS (element)", "01101110000iiiii0iiii1nnnnnddddd") + +// Data Processing - FP and SIMD - SIMD Three same +//INST(FMULX_vec_3, "FMULX", "0Q001110010mmmmm000111nnnnnddddd") +INST(FCMEQ_reg_3, "FCMEQ (register)", "0Q001110010mmmmm001001nnnnnddddd") +INST(FRECPS_3, "FRECPS", "0Q001110010mmmmm001111nnnnnddddd") +INST(FRSQRTS_3, "FRSQRTS", "0Q001110110mmmmm001111nnnnnddddd") +//INST(FCMGE_reg_3, "FCMGE (register)", "0Q101110010mmmmm001001nnnnnddddd") +//INST(FACGE_3, "FACGE", "0Q101110010mmmmm001011nnnnnddddd") +//INST(FABD_3, "FABD", "0Q101110110mmmmm000101nnnnnddddd") +//INST(FCMGT_reg_3, "FCMGT (register)", "0Q101110110mmmmm001001nnnnnddddd") +//INST(FACGT_3, "FACGT", "0Q101110110mmmmm001011nnnnnddddd") +//INST(FMAXNM_1, "FMAXNM (vector)", "0Q001110010mmmmm000001nnnnnddddd") +INST(FMLA_vec_1, "FMLA (vector)", "0Q001110010mmmmm000011nnnnnddddd") +//INST(FADD_1, "FADD (vector)", "0Q001110010mmmmm000101nnnnnddddd") +//INST(FMAX_1, "FMAX (vector)", "0Q001110010mmmmm001101nnnnnddddd") +//INST(FMINNM_1, "FMINNM (vector)", "0Q001110110mmmmm000001nnnnnddddd") +INST(FMLS_vec_1, "FMLS (vector)", "0Q001110110mmmmm000011nnnnnddddd") +//INST(FSUB_1, "FSUB (vector)", "0Q001110110mmmmm000101nnnnnddddd") +//INST(FMIN_1, "FMIN (vector)", "0Q001110110mmmmm001101nnnnnddddd") +//INST(FMAXNMP_vec_1, "FMAXNMP (vector)", "0Q101110010mmmmm000001nnnnnddddd") +//INST(FADDP_vec_1, "FADDP (vector)", "0Q101110010mmmmm000101nnnnnddddd") +//INST(FMUL_vec_1, "FMUL (vector)", "0Q101110010mmmmm000111nnnnnddddd") +//INST(FMAXP_vec_1, "FMAXP (vector)", "0Q101110010mmmmm001101nnnnnddddd") +//INST(FDIV_1, "FDIV (vector)", "0Q101110010mmmmm001111nnnnnddddd") +//INST(FMINNMP_vec_1, "FMINNMP (vector)", "0Q101110110mmmmm000001nnnnnddddd") +//INST(FMINP_vec_1, "FMINP (vector)", "0Q101110110mmmmm001101nnnnnddddd") + +// Data Processing - FP and SIMD - SIMD Three same extra +//INST(SMMLA_vec, "SMMLA", 
"01001110100mmmmm101001nnnnnddddd") // v8.6 +//INST(UMMLA_vec, "UMMLA", "01101110100mmmmm101001nnnnnddddd") // v8.6 +//INST(USMMLA_vec, "USMMLA", "01001110100mmmmm101011nnnnnddddd") // v8.6 +//INST(SUDOT_element, "SUDOT (by element)", "0Q00111100LMmmmm1111H0nnnnnddddd") // v8.6 +//INST(USDOT_element, "USDOT (by_element)", "0Q00111110LMmmmm1111H0nnnnnddddd") // v8.6 +//INST(USDOT_vec, "USDOT (vector)", "0Q001110100mmmmm100111nnnnnddddd") // v8.6 +INST(SDOT_vec, "SDOT (vector)", "0Q001110zz0mmmmm100101nnnnnddddd") +INST(UDOT_vec, "UDOT (vector)", "0Q101110zz0mmmmm100101nnnnnddddd") +INST(FCMLA_vec, "FCMLA", "0Q101110zz0mmmmm110rr1nnnnnddddd") +INST(FCADD_vec, "FCADD", "0Q101110zz0mmmmm111r01nnnnnddddd") + +// Data Processing - FP and SIMD - SIMD Two-register misc +INST(REV64_asimd, "REV64", "0Q001110zz100000000010nnnnnddddd") +INST(REV16_asimd, "REV16 (vector)", "0Q001110zz100000000110nnnnnddddd") +INST(SADDLP, "SADDLP", "0Q001110zz100000001010nnnnnddddd") +INST(SUQADD_2, "SUQADD", "0Q001110zz100000001110nnnnnddddd") +INST(CLS_asimd, "CLS (vector)", "0Q001110zz100000010010nnnnnddddd") +INST(CNT, "CNT", "0Q001110zz100000010110nnnnnddddd") +INST(SADALP, "SADALP", "0Q001110zz100000011010nnnnnddddd") +INST(SQABS_2, "SQABS", "0Q001110zz100000011110nnnnnddddd") +INST(CMGT_zero_2, "CMGT (zero)", "0Q001110zz100000100010nnnnnddddd") +INST(CMEQ_zero_2, "CMEQ (zero)", "0Q001110zz100000100110nnnnnddddd") +INST(CMLT_2, "CMLT (zero)", "0Q001110zz100000101010nnnnnddddd") +INST(ABS_2, "ABS", "0Q001110zz100000101110nnnnnddddd") +INST(XTN, "XTN, XTN2", "0Q001110zz100001001010nnnnnddddd") +INST(SQXTN_2, "SQXTN, SQXTN2", "0Q001110zz100001010010nnnnnddddd") +INST(FCVTN, "FCVTN, FCVTN2", "0Q0011100z100001011010nnnnnddddd") +INST(FCVTL, "FCVTL, FCVTL2", "0Q0011100z100001011110nnnnnddddd") +INST(FRINTN_1, "FRINTN (vector)", "0Q00111001111001100010nnnnnddddd") +INST(FRINTN_2, "FRINTN (vector)", "0Q0011100z100001100010nnnnnddddd") +INST(FRINTM_1, "FRINTM (vector)", "0Q00111001111001100110nnnnnddddd") +INST(FRINTM_2, "FRINTM (vector)", "0Q0011100z100001100110nnnnnddddd") +//INST(FCVTNS_3, "FCVTNS (vector)", "0Q00111001111001101010nnnnnddddd") +INST(FCVTNS_4, "FCVTNS (vector)", "0Q0011100z100001101010nnnnnddddd") +//INST(FCVTMS_3, "FCVTMS (vector)", "0Q00111001111001101110nnnnnddddd") +INST(FCVTMS_4, "FCVTMS (vector)", "0Q0011100z100001101110nnnnnddddd") +//INST(FCVTAS_3, "FCVTAS (vector)", "0Q00111001111001110010nnnnnddddd") +INST(FCVTAS_4, "FCVTAS (vector)", "0Q0011100z100001110010nnnnnddddd") +//INST(SCVTF_int_3, "SCVTF (vector, integer)", "0Q00111001111001110110nnnnnddddd") +INST(SCVTF_int_4, "SCVTF (vector, integer)", "0Q0011100z100001110110nnnnnddddd") +//INST(FCMGT_zero_3, "FCMGT (zero)", "0Q00111011111000110010nnnnnddddd") +INST(FCMGT_zero_4, "FCMGT (zero)", "0Q0011101z100000110010nnnnnddddd") +INST(FCMEQ_zero_3, "FCMEQ (zero)", "0Q00111011111000110110nnnnnddddd") +INST(FCMEQ_zero_4, "FCMEQ (zero)", "0Q0011101z100000110110nnnnnddddd") +//INST(FCMLT_3, "FCMLT (zero)", "0Q00111011111000111010nnnnnddddd") +INST(FCMLT_4, "FCMLT (zero)", "0Q0011101z100000111010nnnnnddddd") +INST(FABS_1, "FABS (vector)", "0Q00111011111000111110nnnnnddddd") +INST(FABS_2, "FABS (vector)", "0Q0011101z100000111110nnnnnddddd") +INST(FRINTP_1, "FRINTP (vector)", "0Q00111011111001100010nnnnnddddd") +INST(FRINTP_2, "FRINTP (vector)", "0Q0011101z100001100010nnnnnddddd") +INST(FRINTZ_1, "FRINTZ (vector)", "0Q00111011111001100110nnnnnddddd") +INST(FRINTZ_2, "FRINTZ (vector)", "0Q0011101z100001100110nnnnnddddd") +//INST(FCVTPS_3, "FCVTPS 
(vector)", "0Q00111011111001101010nnnnnddddd") +INST(FCVTPS_4, "FCVTPS (vector)", "0Q0011101z100001101010nnnnnddddd") +//INST(FCVTZS_int_3, "FCVTZS (vector, integer)", "0Q00111011111001101110nnnnnddddd") +INST(FCVTZS_int_4, "FCVTZS (vector, integer)", "0Q0011101z100001101110nnnnnddddd") +INST(URECPE, "URECPE", "0Q0011101z100001110010nnnnnddddd") +INST(FRECPE_3, "FRECPE", "0Q00111011111001110110nnnnnddddd") +INST(FRECPE_4, "FRECPE", "0Q0011101z100001110110nnnnnddddd") +INST(REV32_asimd, "REV32 (vector)", "0Q101110zz100000000010nnnnnddddd") +INST(UADDLP, "UADDLP", "0Q101110zz100000001010nnnnnddddd") +INST(USQADD_2, "USQADD", "0Q101110zz100000001110nnnnnddddd") +INST(CLZ_asimd, "CLZ (vector)", "0Q101110zz100000010010nnnnnddddd") +INST(UADALP, "UADALP", "0Q101110zz100000011010nnnnnddddd") +INST(SQNEG_2, "SQNEG", "0Q101110zz100000011110nnnnnddddd") +INST(CMGE_zero_2, "CMGE (zero)", "0Q101110zz100000100010nnnnnddddd") +INST(CMLE_2, "CMLE (zero)", "0Q101110zz100000100110nnnnnddddd") +INST(NEG_2, "NEG (vector)", "0Q101110zz100000101110nnnnnddddd") +INST(SQXTUN_2, "SQXTUN, SQXTUN2", "0Q101110zz100001001010nnnnnddddd") +INST(SHLL, "SHLL, SHLL2", "0Q101110zz100001001110nnnnnddddd") +INST(UQXTN_2, "UQXTN, UQXTN2", "0Q101110zz100001010010nnnnnddddd") +INST(FCVTXN_2, "FCVTXN, FCVTXN2", "0Q1011100z100001011010nnnnnddddd") +INST(FRINTA_1, "FRINTA (vector)", "0Q10111001111001100010nnnnnddddd") +INST(FRINTA_2, "FRINTA (vector)", "0Q1011100z100001100010nnnnnddddd") +INST(FRINTX_1, "FRINTX (vector)", "0Q10111001111001100110nnnnnddddd") +INST(FRINTX_2, "FRINTX (vector)", "0Q1011100z100001100110nnnnnddddd") +//INST(FCVTNU_3, "FCVTNU (vector)", "0Q10111001111001101010nnnnnddddd") +INST(FCVTNU_4, "FCVTNU (vector)", "0Q1011100z100001101010nnnnnddddd") +//INST(FCVTMU_3, "FCVTMU (vector)", "0Q10111001111001101110nnnnnddddd") +INST(FCVTMU_4, "FCVTMU (vector)", "0Q1011100z100001101110nnnnnddddd") +//INST(FCVTAU_3, "FCVTAU (vector)", "0Q10111001111001110010nnnnnddddd") +INST(FCVTAU_4, "FCVTAU (vector)", "0Q1011100z100001110010nnnnnddddd") +//INST(UCVTF_int_3, "UCVTF (vector, integer)", "0Q10111001111001110110nnnnnddddd") +INST(UCVTF_int_4, "UCVTF (vector, integer)", "0Q1011100z100001110110nnnnnddddd") +INST(NOT, "NOT", "0Q10111000100000010110nnnnnddddd") +INST(RBIT_asimd, "RBIT (vector)", "0Q10111001100000010110nnnnnddddd") +INST(FNEG_1, "FNEG (vector)", "0Q10111011111000111110nnnnnddddd") +INST(FNEG_2, "FNEG (vector)", "0Q1011101z100000111110nnnnnddddd") +INST(FRINTI_1, "FRINTI (vector)", "0Q10111011111001100110nnnnnddddd") +INST(FRINTI_2, "FRINTI (vector)", "0Q1011101z100001100110nnnnnddddd") +//INST(FCMGE_zero_3, "FCMGE (zero)", "0Q10111011111000110010nnnnnddddd") +INST(FCMGE_zero_4, "FCMGE (zero)", "0Q1011101z100000110010nnnnnddddd") +//INST(FCMLE_3, "FCMLE (zero)", "0Q10111011111000110110nnnnnddddd") +INST(FCMLE_4, "FCMLE (zero)", "0Q1011101z100000110110nnnnnddddd") +//INST(FCVTPU_3, "FCVTPU (vector)", "0Q10111011111001101010nnnnnddddd") +INST(FCVTPU_4, "FCVTPU (vector)", "0Q1011101z100001101010nnnnnddddd") +//INST(FCVTZU_int_3, "FCVTZU (vector, integer)", "0Q10111011111001101110nnnnnddddd") +INST(FCVTZU_int_4, "FCVTZU (vector, integer)", "0Q1011101z100001101110nnnnnddddd") +INST(URSQRTE, "URSQRTE", "0Q1011101z100001110010nnnnnddddd") +INST(FRSQRTE_3, "FRSQRTE", "0Q10111011111001110110nnnnnddddd") +INST(FRSQRTE_4, "FRSQRTE", "0Q1011101z100001110110nnnnnddddd") +//INST(FSQRT_1, "FSQRT (vector)", "0Q10111011111001111110nnnnnddddd") +INST(FSQRT_2, "FSQRT (vector)", "0Q1011101z100001111110nnnnnddddd") 
+//INST(FRINT32X_1, "FRINT32X (vector)", "0Q1011100z100001111110nnnnnddddd") // ARMv8.5 +//INST(FRINT64X_1, "FRINT64X (vector)", "0Q1011100z100001111010nnnnnddddd") // ARMv8.5 +//INST(FRINT32Z_1, "FRINT32Z (vector)", "0Q0011100z100001111010nnnnnddddd") // ARMv8.5 +//INST(FRINT64Z_1, "FRINT64Z (vector)", "0Q0011100z100001111110nnnnnddddd") // ARMv8.5 + +// Data Processing - FP and SIMD - SIMD across lanes +INST(SADDLV, "SADDLV", "0Q001110zz110000001110nnnnnddddd") +INST(SMAXV, "SMAXV", "0Q001110zz110000101010nnnnnddddd") +INST(SMINV, "SMINV", "0Q001110zz110001101010nnnnnddddd") +INST(ADDV, "ADDV", "0Q001110zz110001101110nnnnnddddd") +//INST(FMAXNMV_1, "FMAXNMV", "0Q00111000110000110010nnnnnddddd") +INST(FMAXNMV_2, "FMAXNMV", "0Q1011100z110000110010nnnnnddddd") +//INST(FMAXV_1, "FMAXV", "0Q00111000110000111110nnnnnddddd") +INST(FMAXV_2, "FMAXV", "0Q1011100z110000111110nnnnnddddd") +//INST(FMINNMV_1, "FMINNMV", "0Q00111010110000110010nnnnnddddd") +INST(FMINNMV_2, "FMINNMV", "0Q1011101z110000110010nnnnnddddd") +//INST(FMINV_1, "FMINV", "0Q00111010110000111110nnnnnddddd") +INST(FMINV_2, "FMINV", "0Q1011101z110000111110nnnnnddddd") +INST(UADDLV, "UADDLV", "0Q101110zz110000001110nnnnnddddd") +INST(UMAXV, "UMAXV", "0Q101110zz110000101010nnnnnddddd") +INST(UMINV, "UMINV", "0Q101110zz110001101010nnnnnddddd") + +// Data Processing - FP and SIMD - SIMD three different +INST(SADDL, "SADDL, SADDL2", "0Q001110zz1mmmmm000000nnnnnddddd") +INST(SADDW, "SADDW, SADDW2", "0Q001110zz1mmmmm000100nnnnnddddd") +INST(SSUBL, "SSUBL, SSUBL2", "0Q001110zz1mmmmm001000nnnnnddddd") +INST(SSUBW, "SSUBW, SSUBW2", "0Q001110zz1mmmmm001100nnnnnddddd") +INST(ADDHN, "ADDHN, ADDHN2", "0Q001110zz1mmmmm010000nnnnnddddd") +INST(SABAL, "SABAL, SABAL2", "0Q001110zz1mmmmm010100nnnnnddddd") +INST(SUBHN, "SUBHN, SUBHN2", "0Q001110zz1mmmmm011000nnnnnddddd") +INST(SABDL, "SABDL, SABDL2", "0Q001110zz1mmmmm011100nnnnnddddd") +INST(SMLAL_vec, "SMLAL, SMLAL2 (vector)", "0Q001110zz1mmmmm100000nnnnnddddd") +INST(SMLSL_vec, "SMLSL, SMLSL2 (vector)", "0Q001110zz1mmmmm101000nnnnnddddd") +INST(SMULL_vec, "SMULL, SMULL2 (vector)", "0Q001110zz1mmmmm110000nnnnnddddd") +INST(PMULL, "PMULL, PMULL2", "0Q001110zz1mmmmm111000nnnnnddddd") +INST(UADDL, "UADDL, UADDL2", "0Q101110zz1mmmmm000000nnnnnddddd") +INST(UADDW, "UADDW, UADDW2", "0Q101110zz1mmmmm000100nnnnnddddd") +INST(USUBL, "USUBL, USUBL2", "0Q101110zz1mmmmm001000nnnnnddddd") +INST(USUBW, "USUBW, USUBW2", "0Q101110zz1mmmmm001100nnnnnddddd") +INST(RADDHN, "RADDHN, RADDHN2", "0Q101110zz1mmmmm010000nnnnnddddd") +INST(UABAL, "UABAL, UABAL2", "0Q101110zz1mmmmm010100nnnnnddddd") +INST(RSUBHN, "RSUBHN, RSUBHN2", "0Q101110zz1mmmmm011000nnnnnddddd") +INST(UABDL, "UABDL, UABDL2", "0Q101110zz1mmmmm011100nnnnnddddd") +INST(UMLAL_vec, "UMLAL, UMLAL2 (vector)", "0Q101110zz1mmmmm100000nnnnnddddd") +INST(UMLSL_vec, "UMLSL, UMLSL2 (vector)", "0Q101110zz1mmmmm101000nnnnnddddd") +INST(UMULL_vec, "UMULL, UMULL2 (vector)", "0Q101110zz1mmmmm110000nnnnnddddd") +//INST(SQDMLAL_vec_2, "SQDMLAL, SQDMLAL2 (vector)", "0Q001110zz1mmmmm100100nnnnnddddd") +//INST(SQDMLSL_vec_2, "SQDMLSL, SQDMLSL2 (vector)", "0Q001110zz1mmmmm101100nnnnnddddd") +INST(SQDMULL_vec_2, "SQDMULL, SQDMULL2 (vector)", "0Q001110zz1mmmmm110100nnnnnddddd") + +// Data Processing - FP and SIMD - SIMD three same +INST(SHADD, "SHADD", "0Q001110zz1mmmmm000001nnnnnddddd") +INST(SQADD_2, "SQADD", "0Q001110zz1mmmmm000011nnnnnddddd") +INST(SRHADD, "SRHADD", "0Q001110zz1mmmmm000101nnnnnddddd") +INST(SHSUB, "SHSUB", "0Q001110zz1mmmmm001001nnnnnddddd") +INST(SQSUB_2, 
"SQSUB", "0Q001110zz1mmmmm001011nnnnnddddd") +INST(CMGT_reg_2, "CMGT (register)", "0Q001110zz1mmmmm001101nnnnnddddd") +INST(CMGE_reg_2, "CMGE (register)", "0Q001110zz1mmmmm001111nnnnnddddd") +INST(SSHL_2, "SSHL", "0Q001110zz1mmmmm010001nnnnnddddd") +INST(SQSHL_reg_2, "SQSHL (register)", "0Q001110zz1mmmmm010011nnnnnddddd") +INST(SRSHL_2, "SRSHL", "0Q001110zz1mmmmm010101nnnnnddddd") +//INST(SQRSHL_2, "SQRSHL", "0Q001110zz1mmmmm010111nnnnnddddd") +INST(SMAX, "SMAX", "0Q001110zz1mmmmm011001nnnnnddddd") +INST(SMIN, "SMIN", "0Q001110zz1mmmmm011011nnnnnddddd") +INST(SABD, "SABD", "0Q001110zz1mmmmm011101nnnnnddddd") +INST(SABA, "SABA", "0Q001110zz1mmmmm011111nnnnnddddd") +INST(ADD_vector, "ADD (vector)", "0Q001110zz1mmmmm100001nnnnnddddd") +INST(CMTST_2, "CMTST", "0Q001110zz1mmmmm100011nnnnnddddd") +INST(MLA_vec, "MLA (vector)", "0Q001110zz1mmmmm100101nnnnnddddd") +INST(MUL_vec, "MUL (vector)", "0Q001110zz1mmmmm100111nnnnnddddd") +INST(SMAXP, "SMAXP", "0Q001110zz1mmmmm101001nnnnnddddd") +INST(SMINP, "SMINP", "0Q001110zz1mmmmm101011nnnnnddddd") +INST(SQDMULH_vec_2, "SQDMULH (vector)", "0Q001110zz1mmmmm101101nnnnnddddd") +INST(ADDP_vec, "ADDP (vector)", "0Q001110zz1mmmmm101111nnnnnddddd") +INST(FMAXNM_2, "FMAXNM (vector)", "0Q0011100z1mmmmm110001nnnnnddddd") +INST(FMLA_vec_2, "FMLA (vector)", "0Q0011100z1mmmmm110011nnnnnddddd") +INST(FADD_2, "FADD (vector)", "0Q0011100z1mmmmm110101nnnnnddddd") +INST(FMAX_2, "FMAX (vector)", "0Q0011100z1mmmmm111101nnnnnddddd") +INST(FMULX_vec_4, "FMULX", "0Q0011100z1mmmmm110111nnnnnddddd") +INST(FCMEQ_reg_4, "FCMEQ (register)", "0Q0011100z1mmmmm111001nnnnnddddd") +//INST(FMLAL_vec_1, "FMLAL, FMLAL2 (vector)", "0Q0011100z1mmmmm111011nnnnnddddd") +INST(FRECPS_4, "FRECPS", "0Q0011100z1mmmmm111111nnnnnddddd") +INST(AND_asimd, "AND (vector)", "0Q001110001mmmmm000111nnnnnddddd") +INST(BIC_asimd_reg, "BIC (vector, register)", "0Q001110011mmmmm000111nnnnnddddd") +INST(FMINNM_2, "FMINNM (vector)", "0Q0011101z1mmmmm110001nnnnnddddd") +INST(FMLS_vec_2, "FMLS (vector)", "0Q0011101z1mmmmm110011nnnnnddddd") +INST(FSUB_2, "FSUB (vector)", "0Q0011101z1mmmmm110101nnnnnddddd") +//INST(FMLSL_vec_1, "FMLSL, FMLSL2 (vector)", "0Q0011101z1mmmmm111011nnnnnddddd") +INST(FMIN_2, "FMIN (vector)", "0Q0011101z1mmmmm111101nnnnnddddd") +INST(FRSQRTS_4, "FRSQRTS", "0Q0011101z1mmmmm111111nnnnnddddd") +INST(ORR_asimd_reg, "ORR (vector, register)", "0Q001110101mmmmm000111nnnnnddddd") +INST(ORN_asimd, "ORN (vector)", "0Q001110111mmmmm000111nnnnnddddd") +INST(UHADD, "UHADD", "0Q101110zz1mmmmm000001nnnnnddddd") +INST(UQADD_2, "UQADD", "0Q101110zz1mmmmm000011nnnnnddddd") +INST(URHADD, "URHADD", "0Q101110zz1mmmmm000101nnnnnddddd") +INST(UHSUB, "UHSUB", "0Q101110zz1mmmmm001001nnnnnddddd") +INST(UQSUB_2, "UQSUB", "0Q101110zz1mmmmm001011nnnnnddddd") +INST(CMHI_2, "CMHI (register)", "0Q101110zz1mmmmm001101nnnnnddddd") +INST(CMHS_2, "CMHS (register)", "0Q101110zz1mmmmm001111nnnnnddddd") +INST(USHL_2, "USHL", "0Q101110zz1mmmmm010001nnnnnddddd") +INST(UQSHL_reg_2, "UQSHL (register)", "0Q101110zz1mmmmm010011nnnnnddddd") +INST(URSHL_2, "URSHL", "0Q101110zz1mmmmm010101nnnnnddddd") +//INST(UQRSHL_2, "UQRSHL", "0Q101110zz1mmmmm010111nnnnnddddd") +INST(UMAX, "UMAX", "0Q101110zz1mmmmm011001nnnnnddddd") +INST(UMIN, "UMIN", "0Q101110zz1mmmmm011011nnnnnddddd") +INST(UABD, "UABD", "0Q101110zz1mmmmm011101nnnnnddddd") +INST(UABA, "UABA", "0Q101110zz1mmmmm011111nnnnnddddd") +INST(SUB_2, "SUB (vector)", "0Q101110zz1mmmmm100001nnnnnddddd") +INST(CMEQ_reg_2, "CMEQ (register)", "0Q101110zz1mmmmm100011nnnnnddddd") +INST(MLS_vec, 
"MLS (vector)", "0Q101110zz1mmmmm100101nnnnnddddd") +INST(PMUL, "PMUL", "0Q101110zz1mmmmm100111nnnnnddddd") +INST(UMAXP, "UMAXP", "0Q101110zz1mmmmm101001nnnnnddddd") +INST(UMINP, "UMINP", "0Q101110zz1mmmmm101011nnnnnddddd") +INST(SQRDMULH_vec_2, "SQRDMULH (vector)", "0Q101110zz1mmmmm101101nnnnnddddd") +INST(FMAXNMP_vec_2, "FMAXNMP (vector)", "0Q1011100z1mmmmm110001nnnnnddddd") +//INST(FMLAL_vec_2, "FMLAL, FMLAL2 (vector)", "0Q1011100z1mmmmm110011nnnnnddddd") +INST(FADDP_vec_2, "FADDP (vector)", "0Q1011100z1mmmmm110101nnnnnddddd") +INST(FMUL_vec_2, "FMUL (vector)", "0Q1011100z1mmmmm110111nnnnnddddd") +INST(FCMGE_reg_4, "FCMGE (register)", "0Q1011100z1mmmmm111001nnnnnddddd") +INST(FACGE_4, "FACGE", "0Q1011100z1mmmmm111011nnnnnddddd") +INST(FMAXP_vec_2, "FMAXP (vector)", "0Q1011100z1mmmmm111101nnnnnddddd") +INST(FDIV_2, "FDIV (vector)", "0Q1011100z1mmmmm111111nnnnnddddd") +INST(EOR_asimd, "EOR (vector)", "0Q101110001mmmmm000111nnnnnddddd") +INST(BSL, "BSL", "0Q101110011mmmmm000111nnnnnddddd") +INST(FMINNMP_vec_2, "FMINNMP (vector)", "0Q1011101z1mmmmm110001nnnnnddddd") +//INST(FMLSL_vec_2, "FMLSL, FMLSL2 (vector)", "0Q1011101z1mmmmm110011nnnnnddddd") +INST(FABD_4, "FABD", "0Q1011101z1mmmmm110101nnnnnddddd") +INST(FCMGT_reg_4, "FCMGT (register)", "0Q1011101z1mmmmm111001nnnnnddddd") +INST(FACGT_4, "FACGT", "0Q1011101z1mmmmm111011nnnnnddddd") +INST(FMINP_vec_2, "FMINP (vector)", "0Q1011101z1mmmmm111101nnnnnddddd") +INST(BIT, "BIT", "0Q101110101mmmmm000111nnnnnddddd") +INST(BIF, "BIF", "0Q101110111mmmmm000111nnnnnddddd") + +// Data Processing - FP and SIMD - SIMD modified immediate +INST(MOVI, "MOVI, MVNI, ORR, BIC (vector, immediate)", "0Qo0111100000abcmmmm01defghddddd") +INST(FMOV_2, "FMOV (vector, immediate)", "0Qo0111100000abc111101defghddddd") +INST(FMOV_3, "FMOV (vector, immediate)", "0Q00111100000abc111111defghddddd") +INST(UnallocatedEncoding, "Unallocated SIMD modified immediate", "0--0111100000-------11----------") + +// Data Processing - FP and SIMD - SIMD Shift by immediate +INST(SSHR_2, "SSHR", "0Q0011110IIIIiii000001nnnnnddddd") +INST(SSRA_2, "SSRA", "0Q0011110IIIIiii000101nnnnnddddd") +INST(SRSHR_2, "SRSHR", "0Q0011110IIIIiii001001nnnnnddddd") +INST(SRSRA_2, "SRSRA", "0Q0011110IIIIiii001101nnnnnddddd") +INST(SHL_2, "SHL", "0Q0011110IIIIiii010101nnnnnddddd") +INST(SQSHL_imm_2, "SQSHL (immediate)", "0Q0011110IIIIiii011101nnnnnddddd") +INST(SHRN, "SHRN, SHRN2", "0Q0011110IIIIiii100001nnnnnddddd") +INST(RSHRN, "RSHRN, RSHRN2", "0Q0011110IIIIiii100011nnnnnddddd") +INST(SQSHRN_2, "SQSHRN, SQSHRN2", "0Q0011110IIIIiii100101nnnnnddddd") +INST(SQRSHRN_2, "SQRSHRN, SQRSHRN2", "0Q0011110IIIIiii100111nnnnnddddd") +INST(SSHLL, "SSHLL, SSHLL2", "0Q0011110IIIIiii101001nnnnnddddd") +INST(SCVTF_fix_2, "SCVTF (vector, fixed-point)", "0Q0011110IIIIiii111001nnnnnddddd") +INST(FCVTZS_fix_2, "FCVTZS (vector, fixed-point)", "0Q0011110IIIIiii111111nnnnnddddd") +INST(USHR_2, "USHR", "0Q1011110IIIIiii000001nnnnnddddd") +INST(USRA_2, "USRA", "0Q1011110IIIIiii000101nnnnnddddd") +INST(URSHR_2, "URSHR", "0Q1011110IIIIiii001001nnnnnddddd") +INST(URSRA_2, "URSRA", "0Q1011110IIIIiii001101nnnnnddddd") +INST(SRI_2, "SRI", "0Q1011110IIIIiii010001nnnnnddddd") +INST(SLI_2, "SLI", "0Q1011110IIIIiii010101nnnnnddddd") +INST(SQSHLU_2, "SQSHLU", "0Q1011110IIIIiii011001nnnnnddddd") +INST(UQSHL_imm_2, "UQSHL (immediate)", "0Q1011110IIIIiii011101nnnnnddddd") +INST(SQSHRUN_2, "SQSHRUN, SQSHRUN2", "0Q1011110IIIIiii100001nnnnnddddd") +INST(SQRSHRUN_2, "SQRSHRUN, SQRSHRUN2", "0Q1011110IIIIiii100011nnnnnddddd") +INST(UQSHRN_2, 
"UQSHRN, UQSHRN2", "0Q1011110IIIIiii100101nnnnnddddd") +INST(UQRSHRN_2, "UQRSHRN, UQRSHRN2", "0Q1011110IIIIiii100111nnnnnddddd") +INST(USHLL, "USHLL, USHLL2", "0Q1011110IIIIiii101001nnnnnddddd") +INST(UCVTF_fix_2, "UCVTF (vector, fixed-point)", "0Q1011110IIIIiii111001nnnnnddddd") +INST(FCVTZU_fix_2, "FCVTZU (vector, fixed-point)", "0Q1011110IIIIiii111111nnnnnddddd") + +// Data Processing - FP and SIMD - SIMD vector x indexed element +INST(SMLAL_elt, "SMLAL, SMLAL2 (by element)", "0Q001111zzLMmmmm0010H0nnnnnddddd") +//INST(SQDMLAL_elt_2, "SQDMLAL, SQDMLAL2 (by element)", "0Q001111zzLMmmmm0011H0nnnnnddddd") +INST(SMLSL_elt, "SMLSL, SMLSL2 (by element)", "0Q001111zzLMmmmm0110H0nnnnnddddd") +//INST(SQDMLSL_elt_2, "SQDMLSL, SQDMLSL2 (by element)", "0Q001111zzLMmmmm0111H0nnnnnddddd") +INST(MUL_elt, "MUL (by element)", "0Q001111zzLMmmmm1000H0nnnnnddddd") +INST(SMULL_elt, "SMULL, SMULL2 (by element)", "0Q001111zzLMmmmm1010H0nnnnnddddd") +INST(SQDMULL_elt_2, "SQDMULL, SQDMULL2 (by element)", "0Q001111zzLMmmmm1011H0nnnnnddddd") +INST(SQDMULH_elt_2, "SQDMULH (by element)", "0Q001111zzLMmmmm1100H0nnnnnddddd") +INST(SQRDMULH_elt_2, "SQRDMULH (by element)", "0Q001111zzLMmmmm1101H0nnnnnddddd") +INST(SDOT_elt, "SDOT (by element)", "0Q001111zzLMmmmm1110H0nnnnnddddd") +INST(FMLA_elt_3, "FMLA (by element)", "0Q00111100LMmmmm0001H0nnnnnddddd") +INST(FMLA_elt_4, "FMLA (by element)", "0Q0011111zLMmmmm0001H0nnnnnddddd") +INST(FMLS_elt_3, "FMLS (by element)", "0Q00111100LMmmmm0101H0nnnnnddddd") +INST(FMLS_elt_4, "FMLS (by element)", "0Q0011111zLMmmmm0101H0nnnnnddddd") +//INST(FMUL_elt_3, "FMUL (by element)", "0Q00111100LMmmmm1001H0nnnnnddddd") +INST(FMUL_elt_4, "FMUL (by element)", "0Q0011111zLMmmmm1001H0nnnnnddddd") +//INST(FMLAL_elt_1, "FMLAL, FMLAL2 (by element)", "0Q0011111zLMmmmm0000H0nnnnnddddd") +//INST(FMLAL_elt_2, "FMLAL, FMLAL2 (by element)", "0Q1011111zLMmmmm1000H0nnnnnddddd") +//INST(FMLSL_elt_1, "FMLSL, FMLSL2 (by element)", "0Q0011111zLMmmmm0100H0nnnnnddddd") +//INST(FMLSL_elt_2, "FMLSL, FMLSL2 (by element)", "0Q1011111zLMmmmm1100H0nnnnnddddd") +INST(MLA_elt, "MLA (by element)", "0Q101111zzLMmmmm0000H0nnnnnddddd") +INST(UMLAL_elt, "UMLAL, UMLAL2 (by element)", "0Q101111zzLMmmmm0010H0nnnnnddddd") +INST(MLS_elt, "MLS (by element)", "0Q101111zzLMmmmm0100H0nnnnnddddd") +INST(UMLSL_elt, "UMLSL, UMLSL2 (by element)", "0Q101111zzLMmmmm0110H0nnnnnddddd") +INST(UMULL_elt, "UMULL, UMULL2 (by element)", "0Q101111zzLMmmmm1010H0nnnnnddddd") +//INST(SQRDMLAH_elt_2, "SQRDMLAH (by element)", "0Q101111zzLMmmmm1101H0nnnnnddddd") +INST(UDOT_elt, "UDOT (by element)", "0Q101111zzLMmmmm1110H0nnnnnddddd") +//INST(SQRDMLSH_elt_2, "SQRDMLSH (by element)", "0Q101111zzLMmmmm1111H0nnnnnddddd") +//INST(FMULX_elt_3, "FMULX (by element)", "0Q10111100LMmmmm1001H0nnnnnddddd") +INST(FMULX_elt_4, "FMULX (by element)", "0Q1011111zLMmmmm1001H0nnnnnddddd") +INST(FCMLA_elt, "FCMLA (by element)", "0Q101111zzLMmmmm0rr1H0nnnnnddddd") + +// Data Processing - FP and SIMD - Cryptographic three register +INST(SM3TT1A, "SM3TT1A", "11001110010mmmmm10ii00nnnnnddddd") +INST(SM3TT1B, "SM3TT1B", "11001110010mmmmm10ii01nnnnnddddd") +INST(SM3TT2A, "SM3TT2A", "11001110010mmmmm10ii10nnnnnddddd") +INST(SM3TT2B, "SM3TT2B", "11001110010mmmmm10ii11nnnnnddddd") + +// Data Processing - FP and SIMD - SHA512 three register +INST(SHA512H, "SHA512H", "11001110011mmmmm100000nnnnnddddd") +INST(SHA512H2, "SHA512H2", "11001110011mmmmm100001nnnnnddddd") +INST(SHA512SU1, "SHA512SU1", "11001110011mmmmm100010nnnnnddddd") +INST(RAX1, "RAX1", 
"11001110011mmmmm100011nnnnnddddd") +INST(SM3PARTW1, "SM3PARTW1", "11001110011mmmmm110000nnnnnddddd") +INST(SM3PARTW2, "SM3PARTW2", "11001110011mmmmm110001nnnnnddddd") +INST(SM4EKEY, "SM4EKEY", "11001110011mmmmm110010nnnnnddddd") +INST(XAR, "XAR", "11001110100mmmmmiiiiiinnnnnddddd") + +// Data Processing - FP and SIMD - Cryptographic four register +INST(EOR3, "EOR3", "11001110000mmmmm0aaaaannnnnddddd") +INST(BCAX, "BCAX", "11001110001mmmmm0aaaaannnnnddddd") +INST(SM3SS1, "SM3SS1", "11001110010mmmmm0aaaaannnnnddddd") + +// Data Processing - FP and SIMD - SHA512 two register +INST(SHA512SU0, "SHA512SU0", "1100111011000000100000nnnnnddddd") +INST(SM4E, "SM4E", "1100111011000000100001nnnnnddddd") + +// Data Processing - FP and SIMD - Conversion between floating point and fixed point +INST(SCVTF_float_fix, "SCVTF (scalar, fixed-point)", "z0011110yy000010ppppppnnnnnddddd") +INST(UCVTF_float_fix, "UCVTF (scalar, fixed-point)", "z0011110yy000011ppppppnnnnnddddd") +INST(FCVTZS_float_fix, "FCVTZS (scalar, fixed-point)", "z0011110yy011000ppppppnnnnnddddd") +INST(FCVTZU_float_fix, "FCVTZU (scalar, fixed-point)", "z0011110yy011001ppppppnnnnnddddd") + +// Data Processing - FP and SIMD - Conversion between floating point and integer +INST(FCVTNS_float, "FCVTNS (scalar)", "z0011110yy100000000000nnnnnddddd") +INST(FCVTNU_float, "FCVTNU (scalar)", "z0011110yy100001000000nnnnnddddd") +INST(SCVTF_float_int, "SCVTF (scalar, integer)", "z0011110yy100010000000nnnnnddddd") +INST(UCVTF_float_int, "UCVTF (scalar, integer)", "z0011110yy100011000000nnnnnddddd") +INST(FCVTAS_float, "FCVTAS (scalar)", "z0011110yy100100000000nnnnnddddd") +INST(FCVTAU_float, "FCVTAU (scalar)", "z0011110yy100101000000nnnnnddddd") +INST(FMOV_float_gen, "FMOV (general)", "z0011110yy10r11o000000nnnnnddddd") +INST(FCVTPS_float, "FCVTPS (scalar)", "z0011110yy101000000000nnnnnddddd") +INST(FCVTPU_float, "FCVTPU (scalar)", "z0011110yy101001000000nnnnnddddd") +INST(FCVTMS_float, "FCVTMS (scalar)", "z0011110yy110000000000nnnnnddddd") +INST(FCVTMU_float, "FCVTMU (scalar)", "z0011110yy110001000000nnnnnddddd") +INST(FCVTZS_float_int, "FCVTZS (scalar, integer)", "z0011110yy111000000000nnnnnddddd") +INST(FCVTZU_float_int, "FCVTZU (scalar, integer)", "z0011110yy111001000000nnnnnddddd") +//INST(FJCVTZS, "FJCVTZS", "0001111001111110000000nnnnnddddd") + +// Data Processing - FP and SIMD - Floating point data processing +INST(FMOV_float, "FMOV (register)", "00011110yy100000010000nnnnnddddd") +INST(FABS_float, "FABS (scalar)", "00011110yy100000110000nnnnnddddd") +INST(FNEG_float, "FNEG (scalar)", "00011110yy100001010000nnnnnddddd") +INST(FSQRT_float, "FSQRT (scalar)", "00011110yy100001110000nnnnnddddd") +INST(FCVT_float, "FCVT", "00011110yy10001oo10000nnnnnddddd") +INST(FRINTN_float, "FRINTN (scalar)", "00011110yy100100010000nnnnnddddd") +INST(FRINTP_float, "FRINTP (scalar)", "00011110yy100100110000nnnnnddddd") +INST(FRINTM_float, "FRINTM (scalar)", "00011110yy100101010000nnnnnddddd") +INST(FRINTZ_float, "FRINTZ (scalar)", "00011110yy100101110000nnnnnddddd") +INST(FRINTA_float, "FRINTA (scalar)", "00011110yy100110010000nnnnnddddd") +INST(FRINTX_float, "FRINTX (scalar)", "00011110yy100111010000nnnnnddddd") +INST(FRINTI_float, "FRINTI (scalar)", "00011110yy100111110000nnnnnddddd") +//INST(FRINT32X_float, "FRINT32X (scalar)", "00011110yy101000110000nnnnnddddd") // ARMv8.5 +//INST(FRINT64X_float, "FRINT64X (scalar)", "00011110yy101001110000nnnnnddddd") // ARMv8.5 +//INST(FRINT32Z_float, "FRINT32Z (scalar)", "00011110yy101000010000nnnnnddddd") // ARMv8.5 
+//INST(FRINT64Z_float, "FRINT64Z (scalar)", "00011110yy101001010000nnnnnddddd") // ARMv8.5 + +// Data Processing - FP and SIMD - Floating point compare +INST(FCMP_float, "FCMP", "00011110yy1mmmmm001000nnnnn0o000") +INST(FCMPE_float, "FCMPE", "00011110yy1mmmmm001000nnnnn1o000") + +// Data Processing - FP and SIMD - Floating point immediate +INST(FMOV_float_imm, "FMOV (scalar, immediate)", "00011110yy1iiiiiiii10000000ddddd") + +// Data Processing - FP and SIMD - Floating point conditional compare +INST(FCCMP_float, "FCCMP", "00011110yy1mmmmmcccc01nnnnn0ffff") +INST(FCCMPE_float, "FCCMPE", "00011110yy1mmmmmcccc01nnnnn1ffff") + +// Data Processing - FP and SIMD - Floating point data processing two register +INST(FMUL_float, "FMUL (scalar)", "00011110yy1mmmmm000010nnnnnddddd") +INST(FDIV_float, "FDIV (scalar)", "00011110yy1mmmmm000110nnnnnddddd") +INST(FADD_float, "FADD (scalar)", "00011110yy1mmmmm001010nnnnnddddd") +INST(FSUB_float, "FSUB (scalar)", "00011110yy1mmmmm001110nnnnnddddd") +INST(FMAX_float, "FMAX (scalar)", "00011110yy1mmmmm010010nnnnnddddd") +INST(FMIN_float, "FMIN (scalar)", "00011110yy1mmmmm010110nnnnnddddd") +INST(FMAXNM_float, "FMAXNM (scalar)", "00011110yy1mmmmm011010nnnnnddddd") +INST(FMINNM_float, "FMINNM (scalar)", "00011110yy1mmmmm011110nnnnnddddd") +INST(FNMUL_float, "FNMUL (scalar)", "00011110yy1mmmmm100010nnnnnddddd") + +// Data Processing - FP and SIMD - Floating point conditional select +INST(FCSEL_float, "FCSEL", "00011110yy1mmmmmcccc11nnnnnddddd") + +// Data Processing - FP and SIMD - Floating point data processing three register +INST(FMADD_float, "FMADD", "00011111yy0mmmmm0aaaaannnnnddddd") +INST(FMSUB_float, "FMSUB", "00011111yy0mmmmm1aaaaannnnnddddd") +INST(FNMADD_float, "FNMADD", "00011111yy1mmmmm0aaaaannnnnddddd") +INST(FNMSUB_float, "FNMSUB", "00011111yy1mmmmm1aaaaannnnnddddd") + +// BFloat16 +//INST(BFCVT, "BFCVT", "0001111001100011010000nnnnnddddd") // v8.6 +//INST(BFCVTN, "BFCVTN{2}", "0Q00111010100001011010nnnnnddddd") // v8.6 +//INST(BFDOT_element, "BFDOT (by element)", "0Q00111101LMmmmm1111H0nnnnnddddd") // v8.6 +//INST(BFDOT_vec, "BFDOT (vector)", "0Q101110010mmmmm111111nnnnnddddd") // v8.6 +//INST(BFMLALX_element, "BFMLALX (by element)", "0Q00111111LMmmmm1111H0nnnnnddddd") // v8.6 +//INST(BFMLALX_vector, "BFMLALX (vector)", "0Q101110110mmmmm111111nnnnnddddd") // v8.6 +//INST(BFMMLA, "BFMMLA", "01101110010mmmmm111011nnnnnddddd") // v8.6 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/a64_translate.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/a64_translate.cpp new file mode 100644 index 0000000000..05996aeb64 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/a64_translate.cpp @@ -0,0 +1,69 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/a64_translate.h" + +#include "dynarmic/frontend/A64/a64_location_descriptor.h" +#include "dynarmic/frontend/A64/decoder/a64.h" +#include "dynarmic/frontend/A64/translate/impl/impl.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/terminal.h" + +namespace Dynarmic::A64 { + +IR::Block Translate(LocationDescriptor descriptor, MemoryReadCodeFuncType memory_read_code, TranslationOptions options) { + const bool single_step = descriptor.SingleStepping(); + + IR::Block block{descriptor}; + TranslatorVisitor visitor{block, descriptor, std::move(options)}; + + bool should_continue = true; + do { + const u64 pc = visitor.ir.current_location->PC(); + + if (const auto instruction = memory_read_code(pc)) { + if (auto decoder = Decode<TranslatorVisitor>(*instruction)) { + should_continue = decoder->get().call(visitor, *instruction); + } else { + should_continue = visitor.InterpretThisInstruction(); + } + } else { + should_continue = visitor.RaiseException(Exception::NoExecuteFault); + } + + visitor.ir.current_location = visitor.ir.current_location->AdvancePC(4); + block.CycleCount()++; + } while (should_continue && !single_step); + + if (single_step && should_continue) { + visitor.ir.SetTerm(IR::Term::LinkBlock{*visitor.ir.current_location}); + } + + ASSERT_MSG(block.HasTerminal(), "Terminal has not been set"); + + block.SetEndLocation(*visitor.ir.current_location); + + return block; +} + +bool TranslateSingleInstruction(IR::Block& block, LocationDescriptor descriptor, u32 instruction) { + TranslatorVisitor visitor{block, descriptor, {}}; + + bool should_continue = true; + if (auto decoder = Decode<TranslatorVisitor>(instruction)) { + should_continue = decoder->get().call(visitor, instruction); + } else { + should_continue = visitor.InterpretThisInstruction(); + } + + visitor.ir.current_location = visitor.ir.current_location->AdvancePC(4); + block.CycleCount()++; + + block.SetEndLocation(*visitor.ir.current_location); + + return should_continue; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/a64_translate.h b/externals/dynarmic/src/dynarmic/frontend/A64/translate/a64_translate.h new file mode 100644 index 0000000000..2f62ac6e50 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/a64_translate.h @@ -0,0 +1,59 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ +#pragma once + +#include <functional> +#include <optional> + +#include <mcl/stdint.hpp> + +namespace Dynarmic { + +namespace IR { +class Block; +} // namespace IR + +namespace A64 { + +class LocationDescriptor; + +using MemoryReadCodeFuncType = std::function<std::optional<u32>(u64 vaddr)>; + +struct TranslationOptions { + /// This changes what IR we emit when we translate an unpredictable instruction. + /// If this is false, the ExceptionRaised IR instruction is emitted. + /// If this is true, we define some behaviour for some instructions. + bool define_unpredictable_behaviour = false; + + /// This tells the translator a wall clock will be used, thus allowing it + /// to avoid writing certain unnecessary code only needed for cycle timers. + bool wall_clock_cntpct = false; + + /// This changes what IR we emit when we translate a hint instruction. + /// If this is false, we treat the instruction as a NOP. + /// If this is true, we emit an ExceptionRaised instruction.
+ bool hook_hint_instructions = true; +}; + +/** + * This function translates instructions in memory into our intermediate representation. + * @param descriptor The starting location of the basic block. Includes information like PC, FPCR state, &c. + * @param memory_read_code The function we should use to read emulated memory. + * @param options Configures how certain instructions are translated. + * @return A translated basic block in the intermediate representation. + */ +IR::Block Translate(LocationDescriptor descriptor, MemoryReadCodeFuncType memory_read_code, TranslationOptions options); + +/** + * This function translates a single provided instruction into our intermediate representation. + * @param block The block to append the IR for the instruction to. + * @param descriptor The location of the instruction. Includes information like PC, FPCR state, &c. + * @param instruction The instruction to translate. + * @return The translated instruction translated to the intermediate representation. + */ +bool TranslateSingleInstruction(IR::Block& block, LocationDescriptor descriptor, u32 instruction); + +} // namespace A64 +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/a64_branch.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/a64_branch.cpp new file mode 100644 index 0000000000..01cc1390c7 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/a64_branch.cpp @@ -0,0 +1,128 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::B_cond(Imm<19> imm19, Cond cond) { + const s64 offset = concatenate(imm19, Imm<2>{0}).SignExtend<s64>(); + const u64 target = ir.PC() + offset; + + const auto cond_pass = IR::Term::LinkBlock{ir.current_location->SetPC(target)}; + const auto cond_fail = IR::Term::LinkBlock{ir.current_location->AdvancePC(4)}; + ir.SetTerm(IR::Term::If{cond, cond_pass, cond_fail}); + return false; +} + +bool TranslatorVisitor::B_uncond(Imm<26> imm26) { + const s64 offset = concatenate(imm26, Imm<2>{0}).SignExtend<s64>(); + const u64 target = ir.PC() + offset; + + ir.SetTerm(IR::Term::LinkBlock{ir.current_location->SetPC(target)}); + return false; +} + +bool TranslatorVisitor::BL(Imm<26> imm26) { + const s64 offset = concatenate(imm26, Imm<2>{0}).SignExtend<s64>(); + + X(64, Reg::R30, ir.Imm64(ir.PC() + 4)); + ir.PushRSB(ir.current_location->AdvancePC(4)); + + const u64 target = ir.PC() + offset; + ir.SetTerm(IR::Term::LinkBlock{ir.current_location->SetPC(target)}); + return false; +} + +bool TranslatorVisitor::BLR(Reg Rn) { + const auto target = X(64, Rn); + + X(64, Reg::R30, ir.Imm64(ir.PC() + 4)); + ir.PushRSB(ir.current_location->AdvancePC(4)); + + ir.SetPC(target); + ir.SetTerm(IR::Term::FastDispatchHint{}); + return false; +} + +bool TranslatorVisitor::BR(Reg Rn) { + const auto target = X(64, Rn); + + ir.SetPC(target); + ir.SetTerm(IR::Term::FastDispatchHint{}); + return false; +} + +bool TranslatorVisitor::RET(Reg Rn) { + const auto target = X(64, Rn); + + ir.SetPC(target); + ir.SetTerm(IR::Term::PopRSBHint{}); + return false; +} + +bool TranslatorVisitor::CBZ(bool sf, Imm<19> imm19, Reg Rt) { + const size_t datasize = sf ? 
64 : 32; + const s64 offset = concatenate(imm19, Imm<2>{0}).SignExtend<s64>(); + + const IR::U32U64 operand1 = X(datasize, Rt); + + ir.SetCheckBit(ir.IsZero(operand1)); + + const u64 target = ir.PC() + offset; + const auto cond_pass = IR::Term::LinkBlock{ir.current_location->SetPC(target)}; + const auto cond_fail = IR::Term::LinkBlock{ir.current_location->AdvancePC(4)}; + ir.SetTerm(IR::Term::CheckBit{cond_pass, cond_fail}); + return false; +} + +bool TranslatorVisitor::CBNZ(bool sf, Imm<19> imm19, Reg Rt) { + const size_t datasize = sf ? 64 : 32; + const s64 offset = concatenate(imm19, Imm<2>{0}).SignExtend<s64>(); + + const IR::U32U64 operand1 = X(datasize, Rt); + + ir.SetCheckBit(ir.IsZero(operand1)); + + const u64 target = ir.PC() + offset; + const auto cond_pass = IR::Term::LinkBlock{ir.current_location->AdvancePC(4)}; + const auto cond_fail = IR::Term::LinkBlock{ir.current_location->SetPC(target)}; + ir.SetTerm(IR::Term::CheckBit{cond_pass, cond_fail}); + return false; +} + +bool TranslatorVisitor::TBZ(Imm<1> b5, Imm<5> b40, Imm<14> imm14, Reg Rt) { + const size_t datasize = b5 == 1 ? 64 : 32; + const u8 bit_pos = concatenate(b5, b40).ZeroExtend<u8>(); + const s64 offset = concatenate(imm14, Imm<2>{0}).SignExtend<s64>(); + + const auto operand = X(datasize, Rt); + + ir.SetCheckBit(ir.TestBit(operand, ir.Imm8(bit_pos))); + + const u64 target = ir.PC() + offset; + const auto cond_1 = IR::Term::LinkBlock{ir.current_location->AdvancePC(4)}; + const auto cond_0 = IR::Term::LinkBlock{ir.current_location->SetPC(target)}; + ir.SetTerm(IR::Term::CheckBit{cond_1, cond_0}); + return false; +} + +bool TranslatorVisitor::TBNZ(Imm<1> b5, Imm<5> b40, Imm<14> imm14, Reg Rt) { + const size_t datasize = b5 == 1 ? 64 : 32; + const u8 bit_pos = concatenate(b5, b40).ZeroExtend<u8>(); + const s64 offset = concatenate(imm14, Imm<2>{0}).SignExtend<s64>(); + + const auto operand = X(datasize, Rt); + + ir.SetCheckBit(ir.TestBit(operand, ir.Imm8(bit_pos))); + + const u64 target = ir.PC() + offset; + const auto cond_1 = IR::Term::LinkBlock{ir.current_location->SetPC(target)}; + const auto cond_0 = IR::Term::LinkBlock{ir.current_location->AdvancePC(4)}; + ir.SetTerm(IR::Term::CheckBit{cond_1, cond_0}); + return false; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/a64_exception_generating.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/a64_exception_generating.cpp new file mode 100644 index 0000000000..460ebfa6f8 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/a64_exception_generating.cpp @@ -0,0 +1,22 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::BRK(Imm<16> /*imm16*/) { + return RaiseException(Exception::Breakpoint); +} + +bool TranslatorVisitor::SVC(Imm<16> imm16) { + ir.PushRSB(ir.current_location->AdvancePC(4)); + ir.SetPC(ir.Imm64(ir.current_location->PC() + 4)); + ir.CallSupervisor(imm16.ZeroExtend()); + ir.SetTerm(IR::Term::CheckHalt{IR::Term::PopRSBHint{}}); + return false; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_addsub.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_addsub.cpp new file mode 100644 index 0000000000..5df4fe1ce9 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_addsub.cpp @@ -0,0 +1,330 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::ADD_imm(bool sf, Imm<2> shift, Imm<12> imm12, Reg Rn, Reg Rd) { + u64 imm; + switch (shift.ZeroExtend()) { + case 0b00: + imm = imm12.ZeroExtend<u64>(); + break; + case 0b01: + imm = imm12.ZeroExtend<u64>() << 12; + break; + default: + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + const auto operand1 = Rn == Reg::SP ? SP(datasize) : IR::U32U64(X(datasize, Rn)); + + const auto result = ir.Add(operand1, I(datasize, imm)); + + if (Rd == Reg::SP) { + SP(datasize, result); + } else { + X(datasize, Rd, result); + } + + return true; +} + +bool TranslatorVisitor::ADDS_imm(bool sf, Imm<2> shift, Imm<12> imm12, Reg Rn, Reg Rd) { + u64 imm; + switch (shift.ZeroExtend()) { + case 0b00: + imm = imm12.ZeroExtend<u64>(); + break; + case 0b01: + imm = imm12.ZeroExtend<u64>() << 12; + break; + default: + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + const auto operand1 = Rn == Reg::SP ? SP(datasize) : IR::U32U64(X(datasize, Rn)); + + const auto result = ir.Add(operand1, I(datasize, imm)); + + ir.SetNZCV(ir.NZCVFrom(result)); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::SUB_imm(bool sf, Imm<2> shift, Imm<12> imm12, Reg Rn, Reg Rd) { + u64 imm; + switch (shift.ZeroExtend()) { + case 0b00: + imm = imm12.ZeroExtend<u64>(); + break; + case 0b01: + imm = imm12.ZeroExtend<u64>() << 12; + break; + default: + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + const auto operand1 = Rn == Reg::SP ? SP(datasize) : IR::U32U64(X(datasize, Rn)); + + const auto result = ir.Sub(operand1, I(datasize, imm)); + + if (Rd == Reg::SP) { + SP(datasize, result); + } else { + X(datasize, Rd, result); + } + + return true; +} + +bool TranslatorVisitor::SUBS_imm(bool sf, Imm<2> shift, Imm<12> imm12, Reg Rn, Reg Rd) { + u64 imm; + switch (shift.ZeroExtend()) { + case 0b00: + imm = imm12.ZeroExtend<u64>(); + break; + case 0b01: + imm = imm12.ZeroExtend<u64>() << 12; + break; + default: + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + const auto operand1 = Rn == Reg::SP ? 
SP(datasize) : IR::U32U64(X(datasize, Rn)); + + const auto result = ir.Sub(operand1, I(datasize, imm)); + + ir.SetNZCV(ir.NZCVFrom(result)); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::ADD_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd) { + if (shift == 0b11) { + return ReservedValue(); + } + + if (!sf && imm6.Bit<5>()) { + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + const u8 shift_amount = imm6.ZeroExtend<u8>(); + + const auto operand1 = X(datasize, Rn); + const auto operand2 = ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount)); + + const auto result = ir.Add(operand1, operand2); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::ADDS_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd) { + if (shift == 0b11) { + return ReservedValue(); + } + + if (!sf && imm6.Bit<5>()) { + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + const u8 shift_amount = imm6.ZeroExtend<u8>(); + + const auto operand1 = X(datasize, Rn); + const auto operand2 = ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount)); + + const auto result = ir.Add(operand1, operand2); + + ir.SetNZCV(ir.NZCVFrom(result)); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::SUB_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd) { + if (shift == 0b11) { + return ReservedValue(); + } + + if (!sf && imm6.Bit<5>()) { + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + const u8 shift_amount = imm6.ZeroExtend<u8>(); + + const auto operand1 = X(datasize, Rn); + const auto operand2 = ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount)); + + const auto result = ir.Sub(operand1, operand2); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::SUBS_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd) { + if (shift == 0b11) { + return ReservedValue(); + } + + if (!sf && imm6.Bit<5>()) { + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + const u8 shift_amount = imm6.ZeroExtend<u8>(); + + const auto operand1 = X(datasize, Rn); + const auto operand2 = ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount)); + + const auto result = ir.Sub(operand1, operand2); + + ir.SetNZCV(ir.NZCVFrom(result)); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::ADD_ext(bool sf, Reg Rm, Imm<3> option, Imm<3> imm3, Reg Rn, Reg Rd) { + const u8 shift = imm3.ZeroExtend<u8>(); + if (shift > 4) { + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + const auto operand1 = Rn == Reg::SP ? SP(datasize) : IR::U32U64(X(datasize, Rn)); + const auto operand2 = ExtendReg(datasize, Rm, option, shift); + + const auto result = ir.Add(operand1, operand2); + + if (Rd == Reg::SP) { + SP(datasize, result); + } else { + X(datasize, Rd, result); + } + + return true; +} + +bool TranslatorVisitor::ADDS_ext(bool sf, Reg Rm, Imm<3> option, Imm<3> imm3, Reg Rn, Reg Rd) { + const u8 shift = imm3.ZeroExtend<u8>(); + if (shift > 4) { + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + const auto operand1 = Rn == Reg::SP ? 
SP(datasize) : IR::U32U64(X(datasize, Rn)); + const auto operand2 = ExtendReg(datasize, Rm, option, shift); + + const auto result = ir.Add(operand1, operand2); + + ir.SetNZCV(ir.NZCVFrom(result)); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::SUB_ext(bool sf, Reg Rm, Imm<3> option, Imm<3> imm3, Reg Rn, Reg Rd) { + const u8 shift = imm3.ZeroExtend<u8>(); + if (shift > 4) { + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + const auto operand1 = Rn == Reg::SP ? SP(datasize) : IR::U32U64(X(datasize, Rn)); + const auto operand2 = ExtendReg(datasize, Rm, option, shift); + + const auto result = ir.Sub(operand1, operand2); + + if (Rd == Reg::SP) { + SP(datasize, result); + } else { + X(datasize, Rd, result); + } + + return true; +} + +bool TranslatorVisitor::SUBS_ext(bool sf, Reg Rm, Imm<3> option, Imm<3> imm3, Reg Rn, Reg Rd) { + const u8 shift = imm3.ZeroExtend<u8>(); + if (shift > 4) { + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + const auto operand1 = Rn == Reg::SP ? SP(datasize) : IR::U32U64(X(datasize, Rn)); + const auto operand2 = ExtendReg(datasize, Rm, option, shift); + + const auto result = ir.Sub(operand1, operand2); + + ir.SetNZCV(ir.NZCVFrom(result)); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::ADC(bool sf, Reg Rm, Reg Rn, Reg Rd) { + const size_t datasize = sf ? 64 : 32; + + const IR::U32U64 operand1 = X(datasize, Rn); + const IR::U32U64 operand2 = X(datasize, Rm); + + const auto result = ir.AddWithCarry(operand1, operand2, ir.GetCFlag()); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::ADCS(bool sf, Reg Rm, Reg Rn, Reg Rd) { + const size_t datasize = sf ? 64 : 32; + + const IR::U32U64 operand1 = X(datasize, Rn); + const IR::U32U64 operand2 = X(datasize, Rm); + + const auto result = ir.AddWithCarry(operand1, operand2, ir.GetCFlag()); + + ir.SetNZCV(ir.NZCVFrom(result)); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::SBC(bool sf, Reg Rm, Reg Rn, Reg Rd) { + const size_t datasize = sf ? 64 : 32; + + const IR::U32U64 operand1 = X(datasize, Rn); + const IR::U32U64 operand2 = X(datasize, Rm); + + const auto result = ir.SubWithCarry(operand1, operand2, ir.GetCFlag()); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::SBCS(bool sf, Reg Rm, Reg Rn, Reg Rd) { + const size_t datasize = sf ? 64 : 32; + + const IR::U32U64 operand1 = X(datasize, Rn); + const IR::U32U64 operand2 = X(datasize, Rm); + + const auto result = ir.SubWithCarry(operand1, operand2, ir.GetCFlag()); + + ir.SetNZCV(ir.NZCVFrom(result)); + + X(datasize, Rd, result); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_bitfield.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_bitfield.cpp new file mode 100644 index 0000000000..aec6729ad8 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_bitfield.cpp @@ -0,0 +1,155 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::SBFM(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd) { + if (sf && !N) { + return ReservedValue(); + } + + if (!sf && (N || immr.Bit<5>() || imms.Bit<5>())) { + return ReservedValue(); + } + + const u8 R = immr.ZeroExtend<u8>(); + const u8 S = imms.ZeroExtend<u8>(); + const auto masks = DecodeBitMasks(N, imms, immr, false); + if (!masks) { + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + const auto src = X(datasize, Rn); + + auto bot = ir.And(ir.RotateRight(src, ir.Imm8(R)), I(datasize, masks->wmask)); + auto top = ir.ReplicateBit(src, S); + + top = ir.And(top, I(datasize, ~masks->tmask)); + bot = ir.And(bot, I(datasize, masks->tmask)); + + X(datasize, Rd, ir.Or(top, bot)); + return true; +} + +bool TranslatorVisitor::BFM(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd) { + if (sf && !N) { + return ReservedValue(); + } + + if (!sf && (N || immr.Bit<5>() || imms.Bit<5>())) { + return ReservedValue(); + } + + const u8 R = immr.ZeroExtend<u8>(); + const auto masks = DecodeBitMasks(N, imms, immr, false); + if (!masks) { + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + const auto dst = X(datasize, Rd); + const auto src = X(datasize, Rn); + + const auto bot = ir.Or(ir.And(dst, I(datasize, ~masks->wmask)), ir.And(ir.RotateRight(src, ir.Imm8(R)), I(datasize, masks->wmask))); + + X(datasize, Rd, ir.Or(ir.And(dst, I(datasize, ~masks->tmask)), ir.And(bot, I(datasize, masks->tmask)))); + return true; +} + +bool TranslatorVisitor::UBFM(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd) { + if (sf && !N) { + return ReservedValue(); + } + + if (!sf && (N || immr.Bit<5>() || imms.Bit<5>())) { + return ReservedValue(); + } + + const u8 R = immr.ZeroExtend<u8>(); + const auto masks = DecodeBitMasks(N, imms, immr, false); + if (!masks) { + return ReservedValue(); + } + + const size_t datasize = sf ? 
64 : 32; + const auto src = X(datasize, Rn); + const auto bot = ir.And(ir.RotateRight(src, ir.Imm8(R)), I(datasize, masks->wmask)); + + X(datasize, Rd, ir.And(bot, I(datasize, masks->tmask))); + return true; +} + +bool TranslatorVisitor::ASR_1(Imm<5> immr, Reg Rn, Reg Rd) { + const auto src = X(32, Rn); + const auto result = ir.ArithmeticShiftRightMasked(src, ir.Imm32(immr.ZeroExtend<u32>())); + X(32, Rd, result); + return true; +} + +bool TranslatorVisitor::ASR_2(Imm<6> immr, Reg Rn, Reg Rd) { + const auto src = X(64, Rn); + const auto result = ir.ArithmeticShiftRightMasked(src, ir.Imm64(immr.ZeroExtend<u64>())); + X(64, Rd, result); + return true; +} + +bool TranslatorVisitor::SXTB_1(Reg Rn, Reg Rd) { + const auto src = X(32, Rn); + const auto result = ir.SignExtendToWord(ir.LeastSignificantByte(src)); + X(32, Rd, result); + return true; +} + +bool TranslatorVisitor::SXTB_2(Reg Rn, Reg Rd) { + const auto src = X(64, Rn); + const auto result = ir.SignExtendToLong(ir.LeastSignificantByte(src)); + X(64, Rd, result); + return true; +} + +bool TranslatorVisitor::SXTH_1(Reg Rn, Reg Rd) { + const auto src = X(32, Rn); + const auto result = ir.SignExtendToWord(ir.LeastSignificantHalf(src)); + X(32, Rd, result); + return true; +} + +bool TranslatorVisitor::SXTH_2(Reg Rn, Reg Rd) { + const auto src = X(64, Rn); + const auto result = ir.SignExtendToLong(ir.LeastSignificantHalf(src)); + X(64, Rd, result); + return true; +} + +bool TranslatorVisitor::SXTW(Reg Rn, Reg Rd) { + const auto src = X(64, Rn); + const auto result = ir.SignExtendToLong(ir.LeastSignificantWord(src)); + X(64, Rd, result); + return true; +} + +bool TranslatorVisitor::EXTR(bool sf, bool N, Reg Rm, Imm<6> imms, Reg Rn, Reg Rd) { + if (N != sf) { + return UnallocatedEncoding(); + } + + if (!sf && imms.Bit<5>()) { + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + + const IR::U32U64 m = X(datasize, Rm); + const IR::U32U64 n = X(datasize, Rn); + const IR::U32U64 result = ir.ExtractRegister(m, n, ir.Imm8(imms.ZeroExtend<u8>())); + + X(datasize, Rd, result); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_conditional_compare.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_conditional_compare.cpp new file mode 100644 index 0000000000..03725c262d --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_conditional_compare.cpp @@ -0,0 +1,62 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::CCMN_reg(bool sf, Reg Rm, Cond cond, Reg Rn, Imm<4> nzcv) { + const size_t datasize = sf ? 64 : 32; + const u32 flags = nzcv.ZeroExtend<u32>() << 28; + + const IR::U32U64 operand1 = X(datasize, Rn); + const IR::U32U64 operand2 = X(datasize, Rm); + + const IR::NZCV then_flags = ir.NZCVFrom(ir.AddWithCarry(operand1, operand2, ir.Imm1(0))); + const IR::NZCV else_flags = ir.NZCVFromPackedFlags(ir.Imm32(flags)); + ir.SetNZCV(ir.ConditionalSelect(cond, then_flags, else_flags)); + return true; +} + +bool TranslatorVisitor::CCMP_reg(bool sf, Reg Rm, Cond cond, Reg Rn, Imm<4> nzcv) { + const size_t datasize = sf ? 
64 : 32; + const u32 flags = nzcv.ZeroExtend<u32>() << 28; + + const IR::U32U64 operand1 = X(datasize, Rn); + const IR::U32U64 operand2 = X(datasize, Rm); + + const IR::NZCV then_flags = ir.NZCVFrom(ir.SubWithCarry(operand1, operand2, ir.Imm1(1))); + const IR::NZCV else_flags = ir.NZCVFromPackedFlags(ir.Imm32(flags)); + ir.SetNZCV(ir.ConditionalSelect(cond, then_flags, else_flags)); + return true; +} + +bool TranslatorVisitor::CCMN_imm(bool sf, Imm<5> imm5, Cond cond, Reg Rn, Imm<4> nzcv) { + const size_t datasize = sf ? 64 : 32; + const u32 flags = nzcv.ZeroExtend<u32>() << 28; + + const IR::U32U64 operand1 = X(datasize, Rn); + const IR::U32U64 operand2 = I(datasize, imm5.ZeroExtend<u32>()); + + const IR::NZCV then_flags = ir.NZCVFrom(ir.AddWithCarry(operand1, operand2, ir.Imm1(0))); + const IR::NZCV else_flags = ir.NZCVFromPackedFlags(ir.Imm32(flags)); + ir.SetNZCV(ir.ConditionalSelect(cond, then_flags, else_flags)); + return true; +} + +bool TranslatorVisitor::CCMP_imm(bool sf, Imm<5> imm5, Cond cond, Reg Rn, Imm<4> nzcv) { + const size_t datasize = sf ? 64 : 32; + const u32 flags = nzcv.ZeroExtend<u32>() << 28; + + const IR::U32U64 operand1 = X(datasize, Rn); + const IR::U32U64 operand2 = I(datasize, imm5.ZeroExtend<u32>()); + + const IR::NZCV then_flags = ir.NZCVFrom(ir.SubWithCarry(operand1, operand2, ir.Imm1(1))); + const IR::NZCV else_flags = ir.NZCVFromPackedFlags(ir.Imm32(flags)); + ir.SetNZCV(ir.ConditionalSelect(cond, then_flags, else_flags)); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_conditional_select.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_conditional_select.cpp new file mode 100644 index 0000000000..49dadaa3be --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_conditional_select.cpp @@ -0,0 +1,58 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::CSEL(bool sf, Reg Rm, Cond cond, Reg Rn, Reg Rd) { + const size_t datasize = sf ? 64 : 32; + + const IR::U32U64 operand1 = X(datasize, Rn); + const IR::U32U64 operand2 = X(datasize, Rm); + + const IR::U32U64 result = ir.ConditionalSelect(cond, operand1, operand2); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::CSINC(bool sf, Reg Rm, Cond cond, Reg Rn, Reg Rd) { + const size_t datasize = sf ? 64 : 32; + + const IR::U32U64 operand1 = X(datasize, Rn); + const IR::U32U64 operand2 = X(datasize, Rm); + + const IR::U32U64 result = ir.ConditionalSelect(cond, operand1, ir.Add(operand2, I(datasize, 1))); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::CSINV(bool sf, Reg Rm, Cond cond, Reg Rn, Reg Rd) { + const size_t datasize = sf ? 64 : 32; + + const IR::U32U64 operand1 = X(datasize, Rn); + const IR::U32U64 operand2 = X(datasize, Rm); + + const IR::U32U64 result = ir.ConditionalSelect(cond, operand1, ir.Not(operand2)); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::CSNEG(bool sf, Reg Rm, Cond cond, Reg Rn, Reg Rd) { + const size_t datasize = sf ? 
64 : 32; + + const IR::U32U64 operand1 = X(datasize, Rn); + const IR::U32U64 operand2 = X(datasize, Rm); + + const IR::U32U64 result = ir.ConditionalSelect(cond, operand1, ir.Add(ir.Not(operand2), I(datasize, 1))); + + X(datasize, Rd, result); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_crc32.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_crc32.cpp new file mode 100644 index 0000000000..a0bf932a2a --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_crc32.cpp @@ -0,0 +1,76 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::CRC32(bool sf, Reg Rm, Imm<2> sz, Reg Rn, Reg Rd) { + const u32 integral_size = sz.ZeroExtend(); + + if (sf && integral_size != 0b11) { + return UnallocatedEncoding(); + } + + if (!sf && integral_size == 0b11) { + return UnallocatedEncoding(); + } + + const IR::U32 result = [&] { + const size_t datasize = sf ? 64 : 32; + const IR::U32 accumulator = ir.GetW(Rn); + const IR::U32U64 data = X(datasize, Rm); + + switch (integral_size) { + case 0b00: + return ir.CRC32ISO8(accumulator, data); + case 0b01: + return ir.CRC32ISO16(accumulator, data); + case 0b10: + return ir.CRC32ISO32(accumulator, data); + case 0b11: + default: + return ir.CRC32ISO64(accumulator, data); + } + }(); + + X(32, Rd, result); + return true; +} + +bool TranslatorVisitor::CRC32C(bool sf, Reg Rm, Imm<2> sz, Reg Rn, Reg Rd) { + const u32 integral_size = sz.ZeroExtend(); + + if (sf && integral_size != 0b11) { + return UnallocatedEncoding(); + } + + if (!sf && integral_size == 0b11) { + return UnallocatedEncoding(); + } + + const IR::U32 result = [&] { + const size_t datasize = sf ? 64 : 32; + const IR::U32 accumulator = ir.GetW(Rn); + const IR::U32U64 data = X(datasize, Rm); + + switch (integral_size) { + case 0b00: + return ir.CRC32Castagnoli8(accumulator, data); + case 0b01: + return ir.CRC32Castagnoli16(accumulator, data); + case 0b10: + return ir.CRC32Castagnoli32(accumulator, data); + case 0b11: + default: + return ir.CRC32Castagnoli64(accumulator, data); + } + }(); + + X(32, Rd, result); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_logical.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_logical.cpp new file mode 100644 index 0000000000..d280e37ee8 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_logical.cpp @@ -0,0 +1,236 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::AND_imm(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd) { + if (!sf && N) { + return ReservedValue(); + } + + u64 imm; + if (auto masks = DecodeBitMasks(N, imms, immr, true)) { + imm = masks->wmask; + } else { + return ReservedValue(); + } + + const size_t datasize = sf ? 
64 : 32; + const auto operand1 = X(datasize, Rn); + + const auto result = ir.And(operand1, I(datasize, imm)); + if (Rd == Reg::SP) { + SP(datasize, result); + } else { + X(datasize, Rd, result); + } + + return true; +} + +bool TranslatorVisitor::ORR_imm(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd) { + if (!sf && N) { + return ReservedValue(); + } + + u64 imm; + if (auto masks = DecodeBitMasks(N, imms, immr, true)) { + imm = masks->wmask; + } else { + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + const auto operand1 = X(datasize, Rn); + + const auto result = ir.Or(operand1, I(datasize, imm)); + if (Rd == Reg::SP) { + SP(datasize, result); + } else { + X(datasize, Rd, result); + } + + return true; +} + +bool TranslatorVisitor::EOR_imm(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd) { + if (!sf && N) { + return ReservedValue(); + } + + u64 imm; + if (auto masks = DecodeBitMasks(N, imms, immr, true)) { + imm = masks->wmask; + } else { + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + const auto operand1 = X(datasize, Rn); + + const auto result = ir.Eor(operand1, I(datasize, imm)); + if (Rd == Reg::SP) { + SP(datasize, result); + } else { + X(datasize, Rd, result); + } + + return true; +} + +bool TranslatorVisitor::ANDS_imm(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd) { + if (!sf && N) { + return ReservedValue(); + } + + u64 imm; + if (auto masks = DecodeBitMasks(N, imms, immr, true)) { + imm = masks->wmask; + } else { + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + const auto operand1 = X(datasize, Rn); + const auto result = ir.And(operand1, I(datasize, imm)); + + ir.SetNZCV(ir.NZCVFrom(result)); + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::AND_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd) { + if (!sf && imm6.Bit<5>()) { + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + const u8 shift_amount = imm6.ZeroExtend<u8>(); + + const auto operand1 = X(datasize, Rn); + const auto operand2 = ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount)); + const auto result = ir.And(operand1, operand2); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::BIC_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd) { + if (!sf && imm6.Bit<5>()) { + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + const u8 shift_amount = imm6.ZeroExtend<u8>(); + + const auto operand1 = X(datasize, Rn); + const auto operand2 = ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount)); + const auto result = ir.AndNot(operand1, operand2); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::ORR_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd) { + if (!sf && imm6.Bit<5>()) { + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + const u8 shift_amount = imm6.ZeroExtend<u8>(); + + const auto operand1 = X(datasize, Rn); + const auto operand2 = ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount)); + const auto result = ir.Or(operand1, operand2); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::ORN_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd) { + if (!sf && imm6.Bit<5>()) { + return ReservedValue(); + } + + const size_t datasize = sf ? 
64 : 32; + const u8 shift_amount = imm6.ZeroExtend<u8>(); + + const auto operand1 = X(datasize, Rn); + const auto operand2 = ir.Not(ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount))); + const auto result = ir.Or(operand1, operand2); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::EOR_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd) { + if (!sf && imm6.Bit<5>()) { + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + const u8 shift_amount = imm6.ZeroExtend<u8>(); + + const auto operand1 = X(datasize, Rn); + const auto operand2 = ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount)); + const auto result = ir.Eor(operand1, operand2); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::EON(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd) { + if (!sf && imm6.Bit<5>()) { + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + const u8 shift_amount = imm6.ZeroExtend<u8>(); + + const auto operand1 = X(datasize, Rn); + const auto operand2 = ir.Not(ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount))); + const auto result = ir.Eor(operand1, operand2); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::ANDS_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd) { + if (!sf && imm6.Bit<5>()) { + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + const u8 shift_amount = imm6.ZeroExtend<u8>(); + + const auto operand1 = X(datasize, Rn); + const auto operand2 = ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount)); + const auto result = ir.And(operand1, operand2); + + ir.SetNZCV(ir.NZCVFrom(result)); + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::BICS(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd) { + if (!sf && imm6.Bit<5>()) { + return ReservedValue(); + } + + const size_t datasize = sf ? 64 : 32; + const u8 shift_amount = imm6.ZeroExtend<u8>(); + + const auto operand1 = X(datasize, Rn); + const auto operand2 = ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount)); + const auto result = ir.AndNot(operand1, operand2); + + ir.SetNZCV(ir.NZCVFrom(result)); + X(datasize, Rd, result); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_multiply.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_multiply.cpp new file mode 100644 index 0000000000..60910803f0 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_multiply.cpp @@ -0,0 +1,100 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::MADD(bool sf, Reg Rm, Reg Ra, Reg Rn, Reg Rd) { + const size_t datasize = sf ? 64 : 32; + + const IR::U32U64 a = X(datasize, Ra); + const IR::U32U64 m = X(datasize, Rm); + const IR::U32U64 n = X(datasize, Rn); + + const IR::U32U64 result = ir.Add(a, ir.Mul(n, m)); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::MSUB(bool sf, Reg Rm, Reg Ra, Reg Rn, Reg Rd) { + const size_t datasize = sf ? 
64 : 32; + + const IR::U32U64 a = X(datasize, Ra); + const IR::U32U64 m = X(datasize, Rm); + const IR::U32U64 n = X(datasize, Rn); + + const IR::U32U64 result = ir.Sub(a, ir.Mul(n, m)); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::SMADDL(Reg Rm, Reg Ra, Reg Rn, Reg Rd) { + const IR::U64 a = X(64, Ra); + const IR::U64 m = ir.SignExtendToLong(X(32, Rm)); + const IR::U64 n = ir.SignExtendToLong(X(32, Rn)); + + const IR::U64 result = ir.Add(a, ir.Mul(n, m)); + + X(64, Rd, result); + return true; +} + +bool TranslatorVisitor::SMSUBL(Reg Rm, Reg Ra, Reg Rn, Reg Rd) { + const IR::U64 a = X(64, Ra); + const IR::U64 m = ir.SignExtendToLong(X(32, Rm)); + const IR::U64 n = ir.SignExtendToLong(X(32, Rn)); + + const IR::U64 result = ir.Sub(a, ir.Mul(n, m)); + + X(64, Rd, result); + return true; +} + +bool TranslatorVisitor::SMULH(Reg Rm, Reg Rn, Reg Rd) { + const IR::U64 m = X(64, Rm); + const IR::U64 n = X(64, Rn); + + const IR::U64 result = ir.SignedMultiplyHigh(n, m); + + X(64, Rd, result); + return true; +} + +bool TranslatorVisitor::UMADDL(Reg Rm, Reg Ra, Reg Rn, Reg Rd) { + const IR::U64 a = X(64, Ra); + const IR::U64 m = ir.ZeroExtendToLong(X(32, Rm)); + const IR::U64 n = ir.ZeroExtendToLong(X(32, Rn)); + + const IR::U64 result = ir.Add(a, ir.Mul(n, m)); + + X(64, Rd, result); + return true; +} + +bool TranslatorVisitor::UMSUBL(Reg Rm, Reg Ra, Reg Rn, Reg Rd) { + const IR::U64 a = X(64, Ra); + const IR::U64 m = ir.ZeroExtendToLong(X(32, Rm)); + const IR::U64 n = ir.ZeroExtendToLong(X(32, Rn)); + + const IR::U64 result = ir.Sub(a, ir.Mul(n, m)); + + X(64, Rd, result); + return true; +} + +bool TranslatorVisitor::UMULH(Reg Rm, Reg Rn, Reg Rd) { + const IR::U64 m = X(64, Rm); + const IR::U64 n = X(64, Rn); + + const IR::U64 result = ir.UnsignedMultiplyHigh(n, m); + + X(64, Rd, result); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_pcrel.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_pcrel.cpp new file mode 100644 index 0000000000..48b860a091 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_pcrel.cpp @@ -0,0 +1,24 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::ADR(Imm<2> immlo, Imm<19> immhi, Reg Rd) { + const u64 imm = concatenate(immhi, immlo).SignExtend<u64>(); + const u64 base = ir.PC(); + X(64, Rd, ir.Imm64(base + imm)); + return true; +} + +bool TranslatorVisitor::ADRP(Imm<2> immlo, Imm<19> immhi, Reg Rd) { + const u64 imm = concatenate(immhi, immlo).SignExtend<u64>() << 12; + const u64 base = ir.PC() & ~u64(0xFFF); + X(64, Rd, ir.Imm64(base + imm)); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_register.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_register.cpp new file mode 100644 index 0000000000..3484a1e3d8 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_register.cpp @@ -0,0 +1,139 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::CLZ_int(bool sf, Reg Rn, Reg Rd) { + const size_t datasize = sf ? 64 : 32; + + const IR::U32U64 operand = X(datasize, Rn); + const IR::U32U64 result = ir.CountLeadingZeros(operand); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::CLS_int(bool sf, Reg Rn, Reg Rd) { + const size_t datasize = sf ? 64 : 32; + + const IR::U32U64 operand = X(datasize, Rn); + const IR::U32U64 result = ir.Sub(ir.CountLeadingZeros(ir.Eor(operand, ir.ArithmeticShiftRight(operand, ir.Imm8(u8(datasize))))), I(datasize, 1)); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::RBIT_int(bool sf, Reg Rn, Reg Rd) { + const auto rbit32 = [this](const IR::U32& operand) { + // x = (x & 0x55555555) << 1 | ((x >> 1) & 0x55555555); + const IR::U32 first_lsl = ir.LogicalShiftLeft(ir.And(operand, ir.Imm32(0x55555555)), ir.Imm8(1)); + const IR::U32 first_lsr = ir.And(ir.LogicalShiftRight(operand, ir.Imm8(1)), ir.Imm32(0x55555555)); + const IR::U32 first = ir.Or(first_lsl, first_lsr); + + // x = (x & 0x33333333) << 2 | ((x >> 2) & 0x33333333); + const IR::U32 second_lsl = ir.LogicalShiftLeft(ir.And(first, ir.Imm32(0x33333333)), ir.Imm8(2)); + const IR::U32 second_lsr = ir.And(ir.LogicalShiftRight(first, ir.Imm8(2)), ir.Imm32(0x33333333)); + const IR::U32 second = ir.Or(second_lsl, second_lsr); + + // x = (x & 0x0F0F0F0F) << 4 | ((x >> 4) & 0x0F0F0F0F); + const IR::U32 third_lsl = ir.LogicalShiftLeft(ir.And(second, ir.Imm32(0x0F0F0F0F)), ir.Imm8(4)); + const IR::U32 third_lsr = ir.And(ir.LogicalShiftRight(second, ir.Imm8(4)), ir.Imm32(0x0F0F0F0F)); + const IR::U32 third = ir.Or(third_lsl, third_lsr); + + // x = (x << 24) | ((x & 0xFF00) << 8) | ((x >> 8) & 0xFF00) | (x >> 24); + const IR::U32 fourth_lsl = ir.Or(ir.LogicalShiftLeft(third, ir.Imm8(24)), + ir.LogicalShiftLeft(ir.And(third, ir.Imm32(0xFF00)), ir.Imm8(8))); + const IR::U32 fourth_lsr = ir.Or(ir.And(ir.LogicalShiftRight(third, ir.Imm8(8)), ir.Imm32(0xFF00)), + ir.LogicalShiftRight(third, ir.Imm8(24))); + return ir.Or(fourth_lsl, fourth_lsr); + }; + + const size_t datasize = sf ? 64 : 32; + const IR::U32U64 operand = X(datasize, Rn); + + if (sf) { + const IR::U32 lsw = rbit32(ir.LeastSignificantWord(operand)); + const IR::U32 msw = rbit32(ir.MostSignificantWord(operand).result); + const IR::U64 result = ir.Pack2x32To1x64(msw, lsw); + + X(datasize, Rd, result); + } else { + X(datasize, Rd, rbit32(operand)); + } + + return true; +} + +bool TranslatorVisitor::REV(bool sf, bool opc_0, Reg Rn, Reg Rd) { + const size_t datasize = sf ? 64 : 32; + + if (!sf && opc_0) + return UnallocatedEncoding(); + + const IR::U32U64 operand = X(datasize, Rn); + + if (sf) { + X(datasize, Rd, ir.ByteReverseDual(operand)); + } else { + X(datasize, Rd, ir.ByteReverseWord(operand)); + } + return true; +} + +bool TranslatorVisitor::REV32_int(Reg Rn, Reg Rd) { + const IR::U64 operand = ir.GetX(Rn); + const IR::U32 lo = ir.ByteReverseWord(ir.LeastSignificantWord(operand)); + const IR::U32 hi = ir.ByteReverseWord(ir.MostSignificantWord(operand).result); + const IR::U64 result = ir.Pack2x32To1x64(lo, hi); + X(64, Rd, result); + return true; +} + +bool TranslatorVisitor::REV16_int(bool sf, Reg Rn, Reg Rd) { + const size_t datasize = sf ? 
64 : 32; + + if (sf) { + const IR::U64 operand = X(datasize, Rn); + const IR::U64 hihalf = ir.And(ir.LogicalShiftRight(operand, ir.Imm8(8)), ir.Imm64(0x00FF00FF00FF00FF)); + const IR::U64 lohalf = ir.And(ir.LogicalShiftLeft(operand, ir.Imm8(8)), ir.Imm64(0xFF00FF00FF00FF00)); + const IR::U64 result = ir.Or(hihalf, lohalf); + X(datasize, Rd, result); + } else { + const IR::U32 operand = X(datasize, Rn); + const IR::U32 hihalf = ir.And(ir.LogicalShiftRight(operand, ir.Imm8(8)), ir.Imm32(0x00FF00FF)); + const IR::U32 lohalf = ir.And(ir.LogicalShiftLeft(operand, ir.Imm8(8)), ir.Imm32(0xFF00FF00)); + const IR::U32 result = ir.Or(hihalf, lohalf); + X(datasize, Rd, result); + } + return true; +} + +bool TranslatorVisitor::UDIV(bool sf, Reg Rm, Reg Rn, Reg Rd) { + const size_t datasize = sf ? 64 : 32; + + const IR::U32U64 m = X(datasize, Rm); + const IR::U32U64 n = X(datasize, Rn); + + const IR::U32U64 result = ir.UnsignedDiv(n, m); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::SDIV(bool sf, Reg Rm, Reg Rn, Reg Rd) { + const size_t datasize = sf ? 64 : 32; + + const IR::U32U64 m = X(datasize, Rm); + const IR::U32U64 n = X(datasize, Rn); + + const IR::U32U64 result = ir.SignedDiv(n, m); + + X(datasize, Rd, result); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_shift.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_shift.cpp new file mode 100644 index 0000000000..1b48abd7e3 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/data_processing_shift.cpp @@ -0,0 +1,58 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::LSLV(bool sf, Reg Rm, Reg Rn, Reg Rd) { + const size_t datasize = sf ? 64 : 32; + + const IR::U32U64 operand = X(datasize, Rn); + const IR::U32U64 shift_amount = X(datasize, Rm); + + const IR::U32U64 result = ir.LogicalShiftLeftMasked(operand, shift_amount); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::LSRV(bool sf, Reg Rm, Reg Rn, Reg Rd) { + const size_t datasize = sf ? 64 : 32; + + const IR::U32U64 operand = X(datasize, Rn); + const IR::U32U64 shift_amount = X(datasize, Rm); + + const IR::U32U64 result = ir.LogicalShiftRightMasked(operand, shift_amount); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::ASRV(bool sf, Reg Rm, Reg Rn, Reg Rd) { + const size_t datasize = sf ? 64 : 32; + + const IR::U32U64 operand = X(datasize, Rn); + const IR::U32U64 shift_amount = X(datasize, Rm); + + const IR::U32U64 result = ir.ArithmeticShiftRightMasked(operand, shift_amount); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::RORV(bool sf, Reg Rm, Reg Rn, Reg Rd) { + const size_t datasize = sf ? 
64 : 32; + + const IR::U32U64 operand = X(datasize, Rn); + const IR::U32U64 shift_amount = X(datasize, Rm); + + const IR::U32U64 result = ir.RotateRightMasked(operand, shift_amount); + + X(datasize, Rd, result); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_compare.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_compare.cpp new file mode 100644 index 0000000000..c1835b815d --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_compare.cpp @@ -0,0 +1,38 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { +namespace { +bool FPCompare(TranslatorVisitor& v, Imm<2> type, Vec Vm, Vec Vn, bool exc_on_qnan, bool cmp_with_zero) { + const auto datasize = FPGetDataSize(type); + if (!datasize || *datasize == 16) { + return v.UnallocatedEncoding(); + } + + const IR::U32U64 operand1 = v.V_scalar(*datasize, Vn); + IR::U32U64 operand2; + if (cmp_with_zero) { + operand2 = v.I(*datasize, 0); + } else { + operand2 = v.V_scalar(*datasize, Vm); + } + + const auto nzcv = v.ir.FPCompare(operand1, operand2, exc_on_qnan); + v.ir.SetNZCV(nzcv); + return true; +} +} // Anonymous namespace + +bool TranslatorVisitor::FCMP_float(Imm<2> type, Vec Vm, Vec Vn, bool cmp_with_zero) { + return FPCompare(*this, type, Vm, Vn, false, cmp_with_zero); +} + +bool TranslatorVisitor::FCMPE_float(Imm<2> type, Vec Vm, Vec Vn, bool cmp_with_zero) { + return FPCompare(*this, type, Vm, Vn, true, cmp_with_zero); +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conditional_compare.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conditional_compare.cpp new file mode 100644 index 0000000000..a0b1022410 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conditional_compare.cpp @@ -0,0 +1,35 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { +namespace { +bool FPCompare(TranslatorVisitor& v, Imm<2> type, Vec Vm, Cond cond, Vec Vn, Imm<4> nzcv, bool exc_on_qnan) { + const auto datasize = FPGetDataSize(type); + if (!datasize || *datasize == 16) { + return v.UnallocatedEncoding(); + } + const u32 flags = nzcv.ZeroExtend<u32>() << 28; + + const IR::U32U64 operand1 = v.V_scalar(*datasize, Vn); + const IR::U32U64 operand2 = v.V_scalar(*datasize, Vm); + + const IR::NZCV then_flags = v.ir.FPCompare(operand1, operand2, exc_on_qnan); + const IR::NZCV else_flags = v.ir.NZCVFromPackedFlags(v.ir.Imm32(flags)); + v.ir.SetNZCV(v.ir.ConditionalSelect(cond, then_flags, else_flags)); + return true; +} +} // Anonymous namespace + +bool TranslatorVisitor::FCCMP_float(Imm<2> type, Vec Vm, Cond cond, Vec Vn, Imm<4> nzcv) { + return FPCompare(*this, type, Vm, cond, Vn, nzcv, false); +} + +bool TranslatorVisitor::FCCMPE_float(Imm<2> type, Vec Vm, Cond cond, Vec Vn, Imm<4> nzcv) { + return FPCompare(*this, type, Vm, cond, Vn, nzcv, true); +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conditional_select.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conditional_select.cpp new file mode 100644 index 0000000000..11ce6f623a --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conditional_select.cpp @@ -0,0 +1,24 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::FCSEL_float(Imm<2> type, Vec Vm, Cond cond, Vec Vn, Vec Vd) { + const auto datasize = FPGetDataSize(type); + if (!datasize || *datasize == 16) { + return UnallocatedEncoding(); + } + + const IR::U32U64 operand1 = V_scalar(*datasize, Vn); + const IR::U32U64 operand2 = V_scalar(*datasize, Vm); + const IR::U32U64 result = ir.ConditionalSelect(cond, operand1, operand2); + V_scalar(*datasize, Vd, result); + + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conversion_fixed_point.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conversion_fixed_point.cpp new file mode 100644 index 0000000000..5fa7ac1f98 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conversion_fixed_point.cpp @@ -0,0 +1,114 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::SCVTF_float_fix(bool sf, Imm<2> type, Imm<6> scale, Reg Rn, Vec Vd) { + const size_t intsize = sf ? 
64 : 32; + const auto fltsize = FPGetDataSize(type); + if (!fltsize || *fltsize == 16) { + return UnallocatedEncoding(); + } + if (!sf && !scale.Bit<5>()) { + return UnallocatedEncoding(); + } + const u8 fracbits = 64 - scale.ZeroExtend<u8>(); + const FP::RoundingMode rounding_mode = ir.current_location->FPCR().RMode(); + + const IR::U32U64 intval = X(intsize, Rn); + const IR::U32U64 fltval = [&]() -> IR::U32U64 { + switch (*fltsize) { + case 32: + return ir.FPSignedFixedToSingle(intval, fracbits, rounding_mode); + case 64: + return ir.FPSignedFixedToDouble(intval, fracbits, rounding_mode); + } + UNREACHABLE(); + }(); + + V_scalar(*fltsize, Vd, fltval); + return true; +} + +bool TranslatorVisitor::UCVTF_float_fix(bool sf, Imm<2> type, Imm<6> scale, Reg Rn, Vec Vd) { + const size_t intsize = sf ? 64 : 32; + const auto fltsize = FPGetDataSize(type); + if (!fltsize || *fltsize == 16) { + return UnallocatedEncoding(); + } + if (!sf && !scale.Bit<5>()) { + return UnallocatedEncoding(); + } + const u8 fracbits = 64 - scale.ZeroExtend<u8>(); + const FP::RoundingMode rounding_mode = ir.current_location->FPCR().RMode(); + + const IR::U32U64 intval = X(intsize, Rn); + const IR::U32U64 fltval = [&]() -> IR::U32U64 { + switch (*fltsize) { + case 32: + return ir.FPUnsignedFixedToSingle(intval, fracbits, rounding_mode); + case 64: + return ir.FPUnsignedFixedToDouble(intval, fracbits, rounding_mode); + } + UNREACHABLE(); + }(); + + V_scalar(*fltsize, Vd, fltval); + return true; +} + +bool TranslatorVisitor::FCVTZS_float_fix(bool sf, Imm<2> type, Imm<6> scale, Vec Vn, Reg Rd) { + const size_t intsize = sf ? 64 : 32; + const auto fltsize = FPGetDataSize(type); + if (!fltsize) { + return UnallocatedEncoding(); + } + if (!sf && !scale.Bit<5>()) { + return UnallocatedEncoding(); + } + const u8 fracbits = 64 - scale.ZeroExtend<u8>(); + + const IR::U16U32U64 fltval = V_scalar(*fltsize, Vn); + IR::U32U64 intval; + if (intsize == 32) { + intval = ir.FPToFixedS32(fltval, fracbits, FP::RoundingMode::TowardsZero); + } else if (intsize == 64) { + intval = ir.FPToFixedS64(fltval, fracbits, FP::RoundingMode::TowardsZero); + } else { + UNREACHABLE(); + } + + X(intsize, Rd, intval); + return true; +} + +bool TranslatorVisitor::FCVTZU_float_fix(bool sf, Imm<2> type, Imm<6> scale, Vec Vn, Reg Rd) { + const size_t intsize = sf ? 64 : 32; + const auto fltsize = FPGetDataSize(type); + if (!fltsize) { + return UnallocatedEncoding(); + } + if (!sf && !scale.Bit<5>()) { + return UnallocatedEncoding(); + } + const u8 fracbits = 64 - scale.ZeroExtend<u8>(); + + const IR::U16U32U64 fltval = V_scalar(*fltsize, Vn); + IR::U32U64 intval; + if (intsize == 32) { + intval = ir.FPToFixedU32(fltval, fracbits, FP::RoundingMode::TowardsZero); + } else if (intsize == 64) { + intval = ir.FPToFixedU64(fltval, fracbits, FP::RoundingMode::TowardsZero); + } else { + UNREACHABLE(); + } + + X(intsize, Rd, intval); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conversion_integer.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conversion_integer.cpp new file mode 100644 index 0000000000..2f98ebfb13 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_conversion_integer.cpp @@ -0,0 +1,199 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/common/fp/rounding_mode.h" +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::SCVTF_float_int(bool sf, Imm<2> type, Reg Rn, Vec Vd) { + const size_t intsize = sf ? 64 : 32; + const auto fltsize = FPGetDataSize(type); + if (!fltsize || *fltsize == 16) { + return UnallocatedEncoding(); + } + + const IR::U32U64 intval = X(intsize, Rn); + IR::U32U64 fltval; + + if (*fltsize == 32) { + fltval = ir.FPSignedFixedToSingle(intval, 0, ir.current_location->FPCR().RMode()); + } else if (*fltsize == 64) { + fltval = ir.FPSignedFixedToDouble(intval, 0, ir.current_location->FPCR().RMode()); + } else { + UNREACHABLE(); + } + + V_scalar(*fltsize, Vd, fltval); + + return true; +} + +bool TranslatorVisitor::UCVTF_float_int(bool sf, Imm<2> type, Reg Rn, Vec Vd) { + const size_t intsize = sf ? 64 : 32; + const auto fltsize = FPGetDataSize(type); + if (!fltsize || *fltsize == 16) { + return UnallocatedEncoding(); + } + + const IR::U32U64 intval = X(intsize, Rn); + IR::U32U64 fltval; + + if (*fltsize == 32) { + fltval = ir.FPUnsignedFixedToSingle(intval, 0, ir.current_location->FPCR().RMode()); + } else if (*fltsize == 64) { + fltval = ir.FPUnsignedFixedToDouble(intval, 0, ir.current_location->FPCR().RMode()); + } else { + UNREACHABLE(); + } + + V_scalar(*fltsize, Vd, fltval); + + return true; +} + +bool TranslatorVisitor::FMOV_float_gen(bool sf, Imm<2> type, Imm<1> rmode_0, Imm<1> opc_0, size_t n, size_t d) { + // NOTE: + // opcode<2:1> == 0b11 + // rmode<1> == 0b0 + + if (type == 0b10 && rmode_0 != 1) { + return UnallocatedEncoding(); + } + + const size_t intsize = sf ? 64 : 32; + size_t fltsize = [type] { + switch (type.ZeroExtend()) { + case 0b00: + return 32; + case 0b01: + return 64; + case 0b10: + return 128; + case 0b11: + return 16; + default: + UNREACHABLE(); + } + }(); + + bool integer_to_float; + size_t part; + switch (rmode_0.ZeroExtend()) { + case 0b0: + if (fltsize != 16 && fltsize != intsize) { + return UnallocatedEncoding(); + } + integer_to_float = opc_0 == 0b1; + part = 0; + break; + default: + case 0b1: + if (intsize != 64 || fltsize != 128) { + return UnallocatedEncoding(); + } + integer_to_float = opc_0 == 0b1; + part = 1; + fltsize = 64; + break; + } + + if (integer_to_float) { + const IR::U16U32U64 intval = X(fltsize, static_cast<Reg>(n)); + Vpart_scalar(fltsize, static_cast<Vec>(d), part, intval); + } else { + const IR::UAny fltval = Vpart_scalar(fltsize, static_cast<Vec>(n), part); + const IR::U32U64 intval = ZeroExtend(fltval, intsize); + X(intsize, static_cast<Reg>(d), intval); + } + + return true; +} + +static bool FloaingPointConvertSignedInteger(TranslatorVisitor& v, bool sf, Imm<2> type, Vec Vn, Reg Rd, FP::RoundingMode rounding_mode) { + const size_t intsize = sf ? 64 : 32; + const auto fltsize = FPGetDataSize(type); + if (!fltsize) { + return v.UnallocatedEncoding(); + } + + const IR::U16U32U64 fltval = v.V_scalar(*fltsize, Vn); + IR::U32U64 intval; + + if (intsize == 32) { + intval = v.ir.FPToFixedS32(fltval, 0, rounding_mode); + } else if (intsize == 64) { + intval = v.ir.FPToFixedS64(fltval, 0, rounding_mode); + } else { + UNREACHABLE(); + } + + v.X(intsize, Rd, intval); + return true; +} + +static bool FloaingPointConvertUnsignedInteger(TranslatorVisitor& v, bool sf, Imm<2> type, Vec Vn, Reg Rd, FP::RoundingMode rounding_mode) { + const size_t intsize = sf ? 
64 : 32; + const auto fltsize = FPGetDataSize(type); + if (!fltsize) { + return v.UnallocatedEncoding(); + } + + const IR::U16U32U64 fltval = v.V_scalar(*fltsize, Vn); + IR::U32U64 intval; + + if (intsize == 32) { + intval = v.ir.FPToFixedU32(fltval, 0, rounding_mode); + } else if (intsize == 64) { + intval = v.ir.FPToFixedU64(fltval, 0, rounding_mode); + } else { + UNREACHABLE(); + } + + v.X(intsize, Rd, intval); + return true; +} + +bool TranslatorVisitor::FCVTNS_float(bool sf, Imm<2> type, Vec Vn, Reg Rd) { + return FloaingPointConvertSignedInteger(*this, sf, type, Vn, Rd, FP::RoundingMode::ToNearest_TieEven); +} + +bool TranslatorVisitor::FCVTNU_float(bool sf, Imm<2> type, Vec Vn, Reg Rd) { + return FloaingPointConvertUnsignedInteger(*this, sf, type, Vn, Rd, FP::RoundingMode::ToNearest_TieEven); +} + +bool TranslatorVisitor::FCVTZS_float_int(bool sf, Imm<2> type, Vec Vn, Reg Rd) { + return FloaingPointConvertSignedInteger(*this, sf, type, Vn, Rd, FP::RoundingMode::TowardsZero); +} + +bool TranslatorVisitor::FCVTZU_float_int(bool sf, Imm<2> type, Vec Vn, Reg Rd) { + return FloaingPointConvertUnsignedInteger(*this, sf, type, Vn, Rd, FP::RoundingMode::TowardsZero); +} + +bool TranslatorVisitor::FCVTAS_float(bool sf, Imm<2> type, Vec Vn, Reg Rd) { + return FloaingPointConvertSignedInteger(*this, sf, type, Vn, Rd, FP::RoundingMode::ToNearest_TieAwayFromZero); +} + +bool TranslatorVisitor::FCVTAU_float(bool sf, Imm<2> type, Vec Vn, Reg Rd) { + return FloaingPointConvertUnsignedInteger(*this, sf, type, Vn, Rd, FP::RoundingMode::ToNearest_TieAwayFromZero); +} + +bool TranslatorVisitor::FCVTPS_float(bool sf, Imm<2> type, Vec Vn, Reg Rd) { + return FloaingPointConvertSignedInteger(*this, sf, type, Vn, Rd, FP::RoundingMode::TowardsPlusInfinity); +} + +bool TranslatorVisitor::FCVTPU_float(bool sf, Imm<2> type, Vec Vn, Reg Rd) { + return FloaingPointConvertUnsignedInteger(*this, sf, type, Vn, Rd, FP::RoundingMode::TowardsPlusInfinity); +} + +bool TranslatorVisitor::FCVTMS_float(bool sf, Imm<2> type, Vec Vn, Reg Rd) { + return FloaingPointConvertSignedInteger(*this, sf, type, Vn, Rd, FP::RoundingMode::TowardsMinusInfinity); +} + +bool TranslatorVisitor::FCVTMU_float(bool sf, Imm<2> type, Vec Vn, Reg Rd) { + return FloaingPointConvertUnsignedInteger(*this, sf, type, Vn, Rd, FP::RoundingMode::TowardsMinusInfinity); +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_one_register.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_one_register.cpp new file mode 100644 index 0000000000..187f0a8908 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_one_register.cpp @@ -0,0 +1,186 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::FMOV_float(Imm<2> type, Vec Vn, Vec Vd) { + const auto datasize = FPGetDataSize(type); + if (!datasize) { + return UnallocatedEncoding(); + } + + const IR::U16U32U64 operand = V_scalar(*datasize, Vn); + + V_scalar(*datasize, Vd, operand); + return true; +} + +bool TranslatorVisitor::FABS_float(Imm<2> type, Vec Vn, Vec Vd) { + const auto datasize = FPGetDataSize(type); + if (!datasize) { + return UnallocatedEncoding(); + } + + const IR::U16U32U64 operand = V_scalar(*datasize, Vn); + const IR::U16U32U64 result = ir.FPAbs(operand); + V_scalar(*datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FNEG_float(Imm<2> type, Vec Vn, Vec Vd) { + const auto datasize = FPGetDataSize(type); + if (!datasize) { + return UnallocatedEncoding(); + } + + const IR::U16U32U64 operand = V_scalar(*datasize, Vn); + const IR::U16U32U64 result = ir.FPNeg(operand); + V_scalar(*datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FSQRT_float(Imm<2> type, Vec Vn, Vec Vd) { + const auto datasize = FPGetDataSize(type); + if (!datasize || *datasize == 16) { + return UnallocatedEncoding(); + } + + const IR::U32U64 operand = V_scalar(*datasize, Vn); + const IR::U32U64 result = ir.FPSqrt(operand); + V_scalar(*datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FMOV_float_imm(Imm<2> type, Imm<8> imm8, Vec Vd) { + const auto datasize = FPGetDataSize(type); + if (!datasize) { + return UnallocatedEncoding(); + } + + IR::UAny result = [&]() -> IR::UAny { + switch (*datasize) { + case 16: { + const u16 sign = imm8.Bit<7>() ? 1 : 0; + const u16 exp = (imm8.Bit<6>() ? 0b0'1100 : 0b1'0000) | imm8.Bits<4, 5, u16>(); + const u16 fract = imm8.Bits<0, 3, u16>() << 6; + return ir.Imm16((sign << 15) | (exp << 10) | fract); + } + case 32: { + const u32 sign = imm8.Bit<7>() ? 1 : 0; + const u32 exp = (imm8.Bit<6>() ? 0b0111'1100 : 0b1000'0000) | imm8.Bits<4, 5, u32>(); + const u32 fract = imm8.Bits<0, 3, u32>() << 19; + return ir.Imm32((sign << 31) | (exp << 23) | fract); + } + case 64: + default: { + const u64 sign = imm8.Bit<7>() ? 1 : 0; + const u64 exp = (imm8.Bit<6>() ? 
0b011'1111'1100 : 0b100'0000'0000) | imm8.Bits<4, 5, u64>(); + const u64 fract = imm8.Bits<0, 3, u64>() << 48; + return ir.Imm64((sign << 63) | (exp << 52) | fract); + } + } + }(); + + V_scalar(*datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FCVT_float(Imm<2> type, Imm<2> opc, Vec Vn, Vec Vd) { + if (type == opc) { + return UnallocatedEncoding(); + } + + const auto srcsize = FPGetDataSize(type); + const auto dstsize = FPGetDataSize(opc); + + if (!srcsize || !dstsize) { + return UnallocatedEncoding(); + } + + const IR::UAny operand = V_scalar(*srcsize, Vn); + const auto rounding_mode = ir.current_location->FPCR().RMode(); + + IR::UAny result; + switch (*srcsize) { + case 16: + switch (*dstsize) { + case 32: + result = ir.FPHalfToSingle(operand, rounding_mode); + break; + case 64: + result = ir.FPHalfToDouble(operand, rounding_mode); + break; + } + break; + case 32: + switch (*dstsize) { + case 16: + result = ir.FPSingleToHalf(operand, rounding_mode); + break; + case 64: + result = ir.FPSingleToDouble(operand, rounding_mode); + break; + } + break; + case 64: + switch (*dstsize) { + case 16: + result = ir.FPDoubleToHalf(operand, rounding_mode); + break; + case 32: + result = ir.FPDoubleToSingle(operand, rounding_mode); + break; + } + break; + } + + V_scalar(*dstsize, Vd, result); + + return true; +} + +static bool FloatingPointRoundToIntegral(TranslatorVisitor& v, Imm<2> type, Vec Vn, Vec Vd, FP::RoundingMode rounding_mode, bool exact) { + const auto datasize = FPGetDataSize(type); + if (!datasize) { + return v.UnallocatedEncoding(); + } + + const IR::U16U32U64 operand = v.V_scalar(*datasize, Vn); + const IR::U16U32U64 result = v.ir.FPRoundInt(operand, rounding_mode, exact); + v.V_scalar(*datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FRINTN_float(Imm<2> type, Vec Vn, Vec Vd) { + return FloatingPointRoundToIntegral(*this, type, Vn, Vd, FP::RoundingMode::ToNearest_TieEven, false); +} + +bool TranslatorVisitor::FRINTP_float(Imm<2> type, Vec Vn, Vec Vd) { + return FloatingPointRoundToIntegral(*this, type, Vn, Vd, FP::RoundingMode::TowardsPlusInfinity, false); +} + +bool TranslatorVisitor::FRINTM_float(Imm<2> type, Vec Vn, Vec Vd) { + return FloatingPointRoundToIntegral(*this, type, Vn, Vd, FP::RoundingMode::TowardsMinusInfinity, false); +} + +bool TranslatorVisitor::FRINTZ_float(Imm<2> type, Vec Vn, Vec Vd) { + return FloatingPointRoundToIntegral(*this, type, Vn, Vd, FP::RoundingMode::TowardsZero, false); +} + +bool TranslatorVisitor::FRINTA_float(Imm<2> type, Vec Vn, Vec Vd) { + return FloatingPointRoundToIntegral(*this, type, Vn, Vd, FP::RoundingMode::ToNearest_TieAwayFromZero, false); +} + +bool TranslatorVisitor::FRINTX_float(Imm<2> type, Vec Vn, Vec Vd) { + return FloatingPointRoundToIntegral(*this, type, Vn, Vd, ir.current_location->FPCR().RMode(), true); +} + +bool TranslatorVisitor::FRINTI_float(Imm<2> type, Vec Vn, Vec Vd) { + return FloatingPointRoundToIntegral(*this, type, Vn, Vd, ir.current_location->FPCR().RMode(), false); +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_three_register.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_three_register.cpp new file mode 100644 index 0000000000..75c38c012b --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_three_register.cpp @@ -0,0 +1,66 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::FMADD_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd) { + const auto datasize = FPGetDataSize(type); + if (!datasize) { + return UnallocatedEncoding(); + } + + const IR::U16U32U64 operanda = V_scalar(*datasize, Va); + const IR::U16U32U64 operand1 = V_scalar(*datasize, Vn); + const IR::U16U32U64 operand2 = V_scalar(*datasize, Vm); + const IR::U16U32U64 result = ir.FPMulAdd(operanda, operand1, operand2); + V_scalar(*datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FMSUB_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd) { + const auto datasize = FPGetDataSize(type); + if (!datasize) { + return UnallocatedEncoding(); + } + + const IR::U16U32U64 operanda = V_scalar(*datasize, Va); + const IR::U16U32U64 operand1 = V_scalar(*datasize, Vn); + const IR::U16U32U64 operand2 = V_scalar(*datasize, Vm); + const IR::U16U32U64 result = ir.FPMulSub(operanda, operand1, operand2); + V_scalar(*datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FNMADD_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd) { + const auto datasize = FPGetDataSize(type); + if (!datasize) { + return UnallocatedEncoding(); + } + + const IR::U16U32U64 operanda = V_scalar(*datasize, Va); + const IR::U16U32U64 operand1 = V_scalar(*datasize, Vn); + const IR::U16U32U64 operand2 = V_scalar(*datasize, Vm); + const IR::U16U32U64 result = ir.FPMulSub(ir.FPNeg(operanda), operand1, operand2); + V_scalar(*datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FNMSUB_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd) { + const auto datasize = FPGetDataSize(type); + if (!datasize) { + return UnallocatedEncoding(); + } + + const IR::U16U32U64 operanda = V_scalar(*datasize, Va); + const IR::U16U32U64 operand1 = V_scalar(*datasize, Vn); + const IR::U16U32U64 operand2 = V_scalar(*datasize, Vm); + const IR::U16U32U64 result = ir.FPMulAdd(ir.FPNeg(operanda), operand1, operand2); + V_scalar(*datasize, Vd, result); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_two_register.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_two_register.cpp new file mode 100644 index 0000000000..a4dbda9cce --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/floating_point_data_processing_two_register.cpp @@ -0,0 +1,145 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::FMUL_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd) { + const auto datasize = FPGetDataSize(type); + if (!datasize || *datasize == 16) { + return UnallocatedEncoding(); + } + + const IR::U32U64 operand1 = V_scalar(*datasize, Vn); + const IR::U32U64 operand2 = V_scalar(*datasize, Vm); + + const IR::U32U64 result = ir.FPMul(operand1, operand2); + + V_scalar(*datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FDIV_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd) { + const auto datasize = FPGetDataSize(type); + if (!datasize || *datasize == 16) { + return UnallocatedEncoding(); + } + + const IR::U32U64 operand1 = V_scalar(*datasize, Vn); + const IR::U32U64 operand2 = V_scalar(*datasize, Vm); + + const IR::U32U64 result = ir.FPDiv(operand1, operand2); + + V_scalar(*datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FADD_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd) { + const auto datasize = FPGetDataSize(type); + if (!datasize || *datasize == 16) { + return UnallocatedEncoding(); + } + + const IR::U32U64 operand1 = V_scalar(*datasize, Vn); + const IR::U32U64 operand2 = V_scalar(*datasize, Vm); + + const IR::U32U64 result = ir.FPAdd(operand1, operand2); + + V_scalar(*datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FSUB_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd) { + const auto datasize = FPGetDataSize(type); + if (!datasize || *datasize == 16) { + return UnallocatedEncoding(); + } + + const IR::U32U64 operand1 = V_scalar(*datasize, Vn); + const IR::U32U64 operand2 = V_scalar(*datasize, Vm); + + const IR::U32U64 result = ir.FPSub(operand1, operand2); + + V_scalar(*datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FMAX_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd) { + const auto datasize = FPGetDataSize(type); + if (!datasize || *datasize == 16) { + return UnallocatedEncoding(); + } + + const IR::U32U64 operand1 = V_scalar(*datasize, Vn); + const IR::U32U64 operand2 = V_scalar(*datasize, Vm); + + const IR::U32U64 result = ir.FPMax(operand1, operand2); + + V_scalar(*datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FMIN_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd) { + const auto datasize = FPGetDataSize(type); + if (!datasize || *datasize == 16) { + return UnallocatedEncoding(); + } + + const IR::U32U64 operand1 = V_scalar(*datasize, Vn); + const IR::U32U64 operand2 = V_scalar(*datasize, Vm); + + const IR::U32U64 result = ir.FPMin(operand1, operand2); + + V_scalar(*datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FMAXNM_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd) { + const auto datasize = FPGetDataSize(type); + if (!datasize || *datasize == 16) { + return UnallocatedEncoding(); + } + + const IR::U32U64 operand1 = V_scalar(*datasize, Vn); + const IR::U32U64 operand2 = V_scalar(*datasize, Vm); + + const IR::U32U64 result = ir.FPMaxNumeric(operand1, operand2); + + V_scalar(*datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FMINNM_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd) { + const auto datasize = FPGetDataSize(type); + if (!datasize || *datasize == 16) { + return UnallocatedEncoding(); + } + + const IR::U32U64 operand1 = V_scalar(*datasize, Vn); + const IR::U32U64 operand2 = V_scalar(*datasize, Vm); + + const IR::U32U64 result = ir.FPMinNumeric(operand1, operand2); + + V_scalar(*datasize, Vd, result); + return 
true; +} + +bool TranslatorVisitor::FNMUL_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd) { + const auto datasize = FPGetDataSize(type); + if (!datasize || *datasize == 16) { + return UnallocatedEncoding(); + } + + const IR::U32U64 operand1 = V_scalar(*datasize, Vn); + const IR::U32U64 operand2 = V_scalar(*datasize, Vm); + + const IR::U32U64 result = ir.FPNeg(ir.FPMul(operand1, operand2)); + + V_scalar(*datasize, Vd, result); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/impl.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/impl.cpp new file mode 100644 index 0000000000..d0fd93867c --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/impl.cpp @@ -0,0 +1,409 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +#include <mcl/bit/bit_count.hpp> +#include <mcl/bit/bit_field.hpp> +#include <mcl/bit/rotate.hpp> + +#include "dynarmic/ir/terminal.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::InterpretThisInstruction() { + ir.SetTerm(IR::Term::Interpret(*ir.current_location)); + return false; +} + +bool TranslatorVisitor::UnpredictableInstruction() { + return RaiseException(Exception::UnpredictableInstruction); +} + +bool TranslatorVisitor::DecodeError() { + UNREACHABLE(); +} + +bool TranslatorVisitor::ReservedValue() { + return RaiseException(Exception::ReservedValue); +} + +bool TranslatorVisitor::UnallocatedEncoding() { + return RaiseException(Exception::UnallocatedEncoding); +} + +bool TranslatorVisitor::RaiseException(Exception exception) { + ir.SetPC(ir.Imm64(ir.current_location->PC() + 4)); + ir.ExceptionRaised(exception); + ir.SetTerm(IR::Term::CheckHalt{IR::Term::ReturnToDispatch{}}); + return false; +} + +std::optional<TranslatorVisitor::BitMasks> TranslatorVisitor::DecodeBitMasks(bool immN, Imm<6> imms, Imm<6> immr, bool immediate) { + const int len = mcl::bit::highest_set_bit((immN ? 
1 << 6 : 0) | (imms.ZeroExtend() ^ 0b111111)); + if (len < 1) { + return std::nullopt; + } + + const size_t levels = mcl::bit::ones<size_t>(len); + if (immediate && (imms.ZeroExtend() & levels) == levels) { + return std::nullopt; + } + + const s32 S = s32(imms.ZeroExtend() & levels); + const s32 R = s32(immr.ZeroExtend() & levels); + const u64 d = u64(S - R) & levels; + + const size_t esize = size_t{1} << len; + const u64 welem = mcl::bit::ones<u64>(S + 1); + const u64 telem = mcl::bit::ones<u64>(d + 1); + const u64 wmask = mcl::bit::rotate_right(mcl::bit::replicate_element<u64>(esize, welem), R); + const u64 tmask = mcl::bit::replicate_element<u64>(esize, telem); + + return BitMasks{wmask, tmask}; +} + +IR::UAny TranslatorVisitor::I(size_t bitsize, u64 value) { + switch (bitsize) { + case 8: + return ir.Imm8(static_cast<u8>(value)); + case 16: + return ir.Imm16(static_cast<u16>(value)); + case 32: + return ir.Imm32(static_cast<u32>(value)); + case 64: + return ir.Imm64(value); + default: + ASSERT_FALSE("Imm - get: Invalid bitsize"); + } +} + +IR::UAny TranslatorVisitor::X(size_t bitsize, Reg reg) { + switch (bitsize) { + case 8: + return ir.LeastSignificantByte(ir.GetW(reg)); + case 16: + return ir.LeastSignificantHalf(ir.GetW(reg)); + case 32: + return ir.GetW(reg); + case 64: + return ir.GetX(reg); + default: + ASSERT_FALSE("X - get: Invalid bitsize"); + } +} + +void TranslatorVisitor::X(size_t bitsize, Reg reg, IR::U32U64 value) { + switch (bitsize) { + case 32: + ir.SetW(reg, value); + return; + case 64: + ir.SetX(reg, value); + return; + default: + ASSERT_FALSE("X - set: Invalid bitsize"); + } +} + +IR::U32U64 TranslatorVisitor::SP(size_t bitsize) { + switch (bitsize) { + case 32: + return ir.LeastSignificantWord(ir.GetSP()); + case 64: + return ir.GetSP(); + default: + ASSERT_FALSE("SP - get : Invalid bitsize"); + } +} + +void TranslatorVisitor::SP(size_t bitsize, IR::U32U64 value) { + switch (bitsize) { + case 32: + ir.SetSP(ir.ZeroExtendWordToLong(value)); + break; + case 64: + ir.SetSP(value); + break; + default: + ASSERT_FALSE("SP - set : Invalid bitsize"); + } +} + +IR::U128 TranslatorVisitor::V(size_t bitsize, Vec vec) { + switch (bitsize) { + case 32: + return ir.GetS(vec); + case 64: + return ir.GetD(vec); + case 128: + return ir.GetQ(vec); + default: + ASSERT_FALSE("V - get : Invalid bitsize"); + } +} + +void TranslatorVisitor::V(size_t bitsize, Vec vec, IR::U128 value) { + switch (bitsize) { + case 32: + ir.SetS(vec, value); + return; + case 64: + // TODO: Remove VectorZeroUpper when possible. 
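+        // Architecturally, a write to a 64-bit D register zeroes the upper 64 bits of the
+        // corresponding 128-bit Q register, hence the explicit VectorZeroUpper before SetD below.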
+ ir.SetD(vec, ir.VectorZeroUpper(value)); + return; + case 128: + ir.SetQ(vec, value); + return; + default: + ASSERT_FALSE("V - Set : Invalid bitsize"); + } +} + +IR::UAnyU128 TranslatorVisitor::V_scalar(size_t bitsize, Vec vec) { + if (bitsize == 128) { + return V(128, vec); + } + // TODO: Optimize + return ir.VectorGetElement(bitsize, ir.GetQ(vec), 0); +} + +void TranslatorVisitor::V_scalar(size_t bitsize, Vec vec, IR::UAnyU128 value) { + if (bitsize == 128) { + V(128, vec, value); + return; + } + // TODO: Optimize + ir.SetQ(vec, ir.ZeroExtendToQuad(value)); +} + +IR::U128 TranslatorVisitor::Vpart(size_t bitsize, Vec vec, size_t part) { + ASSERT(part == 0 || part == 1); + ASSERT(bitsize == 64); + if (part == 0) { + return V(64, vec); + } + return ir.ZeroExtendToQuad(ir.VectorGetElement(bitsize, V(128, vec), part)); +} + +void TranslatorVisitor::Vpart(size_t bitsize, Vec vec, size_t part, IR::U128 value) { + ASSERT(part == 0 || part == 1); + if (part == 0) { + ASSERT(bitsize == 64); + V(128, vec, ir.VectorZeroExtend(bitsize, value)); + } else { + ASSERT(bitsize == 64); + V(128, vec, ir.VectorInterleaveLower(64, V(128, vec), value)); + } +} + +IR::UAny TranslatorVisitor::Vpart_scalar(size_t bitsize, Vec vec, size_t part) { + ASSERT(part == 0 || part == 1); + if (part == 0) { + ASSERT(bitsize == 8 || bitsize == 16 || bitsize == 32 || bitsize == 64); + } else { + ASSERT(bitsize == 64); + } + return ir.VectorGetElement(bitsize, V(128, vec), part); +} + +void TranslatorVisitor::Vpart_scalar(size_t bitsize, Vec vec, size_t part, IR::UAny value) { + ASSERT(part == 0 || part == 1); + if (part == 0) { + ASSERT(bitsize == 8 || bitsize == 16 || bitsize == 32 || bitsize == 64); + V(128, vec, ir.ZeroExtendToQuad(value)); + } else { + ASSERT(bitsize == 64); + V(128, vec, ir.VectorSetElement(64, V(128, vec), 1, value)); + } +} + +IR::UAnyU128 TranslatorVisitor::Mem(IR::U64 address, size_t bytesize, IR::AccType acc_type) { + switch (bytesize) { + case 1: + return ir.ReadMemory8(address, acc_type); + case 2: + return ir.ReadMemory16(address, acc_type); + case 4: + return ir.ReadMemory32(address, acc_type); + case 8: + return ir.ReadMemory64(address, acc_type); + case 16: + return ir.ReadMemory128(address, acc_type); + default: + ASSERT_FALSE("Invalid bytesize parameter {}", bytesize); + } +} + +void TranslatorVisitor::Mem(IR::U64 address, size_t bytesize, IR::AccType acc_type, IR::UAnyU128 value) { + switch (bytesize) { + case 1: + ir.WriteMemory8(address, value, acc_type); + return; + case 2: + ir.WriteMemory16(address, value, acc_type); + return; + case 4: + ir.WriteMemory32(address, value, acc_type); + return; + case 8: + ir.WriteMemory64(address, value, acc_type); + return; + case 16: + ir.WriteMemory128(address, value, acc_type); + return; + default: + ASSERT_FALSE("Invalid bytesize parameter {}", bytesize); + } +} + +IR::UAnyU128 TranslatorVisitor::ExclusiveMem(IR::U64 address, size_t bytesize, IR::AccType acc_type) { + switch (bytesize) { + case 1: + return ir.ExclusiveReadMemory8(address, acc_type); + case 2: + return ir.ExclusiveReadMemory16(address, acc_type); + case 4: + return ir.ExclusiveReadMemory32(address, acc_type); + case 8: + return ir.ExclusiveReadMemory64(address, acc_type); + case 16: + return ir.ExclusiveReadMemory128(address, acc_type); + default: + ASSERT_FALSE("Invalid bytesize parameter {}", bytesize); + } +} + +IR::U32 TranslatorVisitor::ExclusiveMem(IR::U64 address, size_t bytesize, IR::AccType acc_type, IR::UAnyU128 value) { + switch (bytesize) { + case 1: + return 
ir.ExclusiveWriteMemory8(address, value, acc_type); + case 2: + return ir.ExclusiveWriteMemory16(address, value, acc_type); + case 4: + return ir.ExclusiveWriteMemory32(address, value, acc_type); + case 8: + return ir.ExclusiveWriteMemory64(address, value, acc_type); + case 16: + return ir.ExclusiveWriteMemory128(address, value, acc_type); + default: + ASSERT_FALSE("Invalid bytesize parameter {}", bytesize); + } +} + +IR::U32U64 TranslatorVisitor::SignExtend(IR::UAny value, size_t to_size) { + switch (to_size) { + case 32: + return ir.SignExtendToWord(value); + case 64: + return ir.SignExtendToLong(value); + default: + ASSERT_FALSE("Invalid size parameter {}", to_size); + } +} + +IR::U32U64 TranslatorVisitor::ZeroExtend(IR::UAny value, size_t to_size) { + switch (to_size) { + case 32: + return ir.ZeroExtendToWord(value); + case 64: + return ir.ZeroExtendToLong(value); + default: + ASSERT_FALSE("Invalid size parameter {}", to_size); + } +} + +IR::U32U64 TranslatorVisitor::ShiftReg(size_t bitsize, Reg reg, Imm<2> shift, IR::U8 amount) { + IR::U32U64 result = X(bitsize, reg); + switch (shift.ZeroExtend()) { + case 0b00: + return ir.LogicalShiftLeft(result, amount); + case 0b01: + return ir.LogicalShiftRight(result, amount); + case 0b10: + return ir.ArithmeticShiftRight(result, amount); + case 0b11: + return ir.RotateRight(result, amount); + } + UNREACHABLE(); +} + +IR::U32U64 TranslatorVisitor::ExtendReg(size_t bitsize, Reg reg, Imm<3> option, u8 shift) { + ASSERT(shift <= 4); + ASSERT(bitsize == 32 || bitsize == 64); + IR::UAny val = X(bitsize, reg); + size_t len; + IR::U32U64 extended; + bool signed_extend; + + switch (option.ZeroExtend()) { + case 0b000: { // UXTB + val = ir.LeastSignificantByte(val); + len = 8; + signed_extend = false; + break; + } + case 0b001: { // UXTH + val = ir.LeastSignificantHalf(val); + len = 16; + signed_extend = false; + break; + } + case 0b010: { // UXTW + if (bitsize != 32) { + val = ir.LeastSignificantWord(val); + } + len = 32; + signed_extend = false; + break; + } + case 0b011: { // UXTX + len = 64; + signed_extend = false; + break; + } + case 0b100: { // SXTB + val = ir.LeastSignificantByte(val); + len = 8; + signed_extend = true; + break; + } + case 0b101: { // SXTH + val = ir.LeastSignificantHalf(val); + len = 16; + signed_extend = true; + break; + } + case 0b110: { // SXTW + if (bitsize != 32) { + val = ir.LeastSignificantWord(val); + } + len = 32; + signed_extend = true; + break; + } + case 0b111: { // SXTX + len = 64; + signed_extend = true; + break; + } + default: + UNREACHABLE(); + } + + if (len < bitsize) { + if (bitsize == 32) { + extended = signed_extend ? ir.SignExtendToWord(val) : ir.ZeroExtendToWord(val); + } else { + extended = signed_extend ? ir.SignExtendToLong(val) : ir.ZeroExtendToLong(val); + } + } else { + extended = val; + } + + return ir.LogicalShiftLeft(extended, ir.Imm8(shift)); +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/impl.h b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/impl.h new file mode 100644 index 0000000000..86bc534d5e --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/impl.h @@ -0,0 +1,1084 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <optional> + +#include "dynarmic/frontend/A64/a64_ir_emitter.h" +#include "dynarmic/frontend/A64/a64_location_descriptor.h" +#include "dynarmic/frontend/A64/a64_types.h" +#include "dynarmic/frontend/A64/translate/a64_translate.h" +#include "dynarmic/frontend/imm.h" + +namespace Dynarmic::A64 { + +struct TranslatorVisitor final { + using instruction_return_type = bool; + + explicit TranslatorVisitor(IR::Block& block, LocationDescriptor descriptor, TranslationOptions options) + : ir(block, descriptor), options(std::move(options)) {} + + A64::IREmitter ir; + TranslationOptions options; + + bool InterpretThisInstruction(); + bool UnpredictableInstruction(); + bool DecodeError(); + bool ReservedValue(); + bool UnallocatedEncoding(); + bool RaiseException(Exception exception); + + struct BitMasks { + u64 wmask, tmask; + }; + + static std::optional<BitMasks> DecodeBitMasks(bool immN, Imm<6> imms, Imm<6> immr, bool immediate); + + IR::UAny I(size_t bitsize, u64 value); + IR::UAny X(size_t bitsize, Reg reg); + void X(size_t bitsize, Reg reg, IR::U32U64 value); + IR::U32U64 SP(size_t bitsize); + void SP(size_t bitsize, IR::U32U64 value); + + IR::U128 V(size_t bitsize, Vec vec); + void V(size_t bitsize, Vec vec, IR::U128 value); + + IR::UAnyU128 V_scalar(size_t bitsize, Vec vec); + void V_scalar(size_t bitsize, Vec vec, IR::UAnyU128 value); + + IR::U128 Vpart(size_t bitsize, Vec vec, size_t part); + void Vpart(size_t bitsize, Vec vec, size_t part, IR::U128 value); + + IR::UAny Vpart_scalar(size_t bitsize, Vec vec, size_t part); + void Vpart_scalar(size_t bitsize, Vec vec, size_t part, IR::UAny value); + + IR::UAnyU128 Mem(IR::U64 address, size_t size, IR::AccType acctype); + void Mem(IR::U64 address, size_t size, IR::AccType acctype, IR::UAnyU128 value); + IR::UAnyU128 ExclusiveMem(IR::U64 address, size_t size, IR::AccType acctype); + IR::U32 ExclusiveMem(IR::U64 address, size_t size, IR::AccType acctype, IR::UAnyU128 value); + + IR::U32U64 SignExtend(IR::UAny value, size_t to_size); + IR::U32U64 ZeroExtend(IR::UAny value, size_t to_size); + IR::U32U64 ShiftReg(size_t bitsize, Reg reg, Imm<2> shift, IR::U8 amount); + IR::U32U64 ExtendReg(size_t bitsize, Reg reg, Imm<3> option, u8 shift); + + // Data processing - Immediate - PC relative addressing + bool ADR(Imm<2> immlo, Imm<19> immhi, Reg Rd); + bool ADRP(Imm<2> immlo, Imm<19> immhi, Reg Rd); + + // Data processing - Immediate - Add/Sub (with tag) + bool ADDG(Imm<6> offset_imm, Imm<4> tag_offset, Reg Rn, Reg Rd); + bool SUBG(Imm<6> offset_imm, Imm<4> tag_offset, Reg Rn, Reg Rd); + + // Data processing - Immediate - Add/Sub + bool ADD_imm(bool sf, Imm<2> shift, Imm<12> imm12, Reg Rn, Reg Rd); + bool ADDS_imm(bool sf, Imm<2> shift, Imm<12> imm12, Reg Rn, Reg Rd); + bool SUB_imm(bool sf, Imm<2> shift, Imm<12> imm12, Reg Rn, Reg Rd); + bool SUBS_imm(bool sf, Imm<2> shift, Imm<12> imm12, Reg Rn, Reg Rd); + + // Data processing - Immediate - Logical + bool AND_imm(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd); + bool ORR_imm(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd); + bool EOR_imm(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd); + bool ANDS_imm(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd); + + // Data processing - Immediate - Move Wide + bool MOVN(bool sf, Imm<2> hw, Imm<16> imm16, Reg Rd); + bool MOVZ(bool sf, Imm<2> hw, Imm<16> imm16, Reg Rd); + bool MOVK(bool sf, Imm<2> hw, Imm<16> imm16, Reg Rd); + 
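+ // Note: MOV (wide immediate) and MOV (inverted wide immediate) are assembler aliases of MOVZ and MOVN, so they are covered by the declarations above.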
+ // Data processing - Immediate - Bitfield + bool SBFM(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd); + bool BFM(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd); + bool UBFM(bool sf, bool N, Imm<6> immr, Imm<6> imms, Reg Rn, Reg Rd); + bool ASR_1(Imm<5> immr, Reg Rn, Reg Rd); + bool ASR_2(Imm<6> immr, Reg Rn, Reg Rd); + bool SXTB_1(Reg Rn, Reg Rd); + bool SXTB_2(Reg Rn, Reg Rd); + bool SXTH_1(Reg Rn, Reg Rd); + bool SXTH_2(Reg Rn, Reg Rd); + bool SXTW(Reg Rn, Reg Rd); + + // Data processing - Immediate - Extract + bool EXTR(bool sf, bool N, Reg Rm, Imm<6> imms, Reg Rn, Reg Rd); + + // Conditional branch + bool B_cond(Imm<19> imm19, Cond cond); + + // Exception generation + bool SVC(Imm<16> imm16); + bool HVC(Imm<16> imm16); + bool SMC(Imm<16> imm16); + bool BRK(Imm<16> imm16); + bool HLT(Imm<16> imm16); + bool DCPS1(Imm<16> imm16); + bool DCPS2(Imm<16> imm16); + bool DCPS3(Imm<16> imm16); + + // System + bool MSR_imm(Imm<3> op1, Imm<4> CRm, Imm<3> op2); + bool HINT(Imm<4> CRm, Imm<3> op2); + bool NOP(); + bool YIELD(); + bool WFE(); + bool WFI(); + bool SEV(); + bool SEVL(); + bool XPAC_1(bool D, Reg Rd); + bool XPAC_2(); + bool PACIA_1(bool Z, Reg Rn, Reg Rd); + bool PACIA_2(); + bool PACIB_1(bool Z, Reg Rn, Reg Rd); + bool PACIB_2(); + bool AUTIA_1(bool Z, Reg Rn, Reg Rd); + bool AUTIA_2(); + bool AUTIB_1(bool Z, Reg Rn, Reg Rd); + bool AUTIB_2(); + bool BTI(Imm<2> upper_op2); + bool ESB(); + bool PSB(); + bool TSB(); + bool CSDB(); + bool CLREX(Imm<4> CRm); + bool DSB(Imm<4> CRm); + bool SSBB(); + bool PSSBB(); + bool DMB(Imm<4> CRm); + bool ISB(Imm<4> CRm); + bool SYS(Imm<3> op1, Imm<4> CRn, Imm<4> CRm, Imm<3> op2, Reg Rt); + bool SB(); + bool MSR_reg(Imm<1> o0, Imm<3> op1, Imm<4> CRn, Imm<4> CRm, Imm<3> op2, Reg Rt); + bool SYSL(Imm<3> op1, Imm<4> CRn, Imm<4> CRm, Imm<3> op2, Reg Rt); + bool MRS(Imm<1> o0, Imm<3> op1, Imm<4> CRn, Imm<4> CRm, Imm<3> op2, Reg Rt); + + // System - Flag manipulation instructions + bool CFINV(); + bool RMIF(Imm<6> lsb, Reg Rn, Imm<4> mask); + bool SETF8(Reg Rn); + bool SETF16(Reg Rn); + + // System - Flag format instructions + bool XAFlag(); + bool AXFlag(); + + // SYS: Data Cache + bool DC_IVAC(Reg Rt); + bool DC_ISW(Reg Rt); + bool DC_CSW(Reg Rt); + bool DC_CISW(Reg Rt); + bool DC_ZVA(Reg Rt); + bool DC_CVAC(Reg Rt); + bool DC_CVAU(Reg Rt); + bool DC_CVAP(Reg Rt); + bool DC_CIVAC(Reg Rt); + + // SYS: Instruction Cache + bool IC_IALLU(); + bool IC_IALLUIS(); + bool IC_IVAU(Reg Rt); + + // Unconditional branch (Register) + bool BR(Reg Rn); + bool BRA(bool Z, bool M, Reg Rn, Reg Rm); + bool BLR(Reg Rn); + bool BLRA(bool Z, bool M, Reg Rn, Reg Rm); + bool RET(Reg Rn); + bool RETA(bool M); + bool ERET(); + bool ERETA(bool M); + bool DRPS(); + + // Unconditional branch (immediate) + bool B_uncond(Imm<26> imm26); + bool BL(Imm<26> imm26); + + // Compare and branch (immediate) + bool CBZ(bool sf, Imm<19> imm19, Reg Rt); + bool CBNZ(bool sf, Imm<19> imm19, Reg Rt); + bool TBZ(Imm<1> b5, Imm<5> b40, Imm<14> imm14, Reg Rt); + bool TBNZ(Imm<1> b5, Imm<5> b40, Imm<14> imm14, Reg Rt); + + // Loads and stores - Advanced SIMD Load/Store multiple structures + bool STx_mult_1(bool Q, Imm<4> opcode, Imm<2> size, Reg Rn, Vec Vt); + bool STx_mult_2(bool Q, Reg Rm, Imm<4> opcode, Imm<2> size, Reg Rn, Vec Vt); + bool LDx_mult_1(bool Q, Imm<4> opcode, Imm<2> size, Reg Rn, Vec Vt); + bool LDx_mult_2(bool Q, Reg Rm, Imm<4> opcode, Imm<2> size, Reg Rn, Vec Vt); + + // Loads and stores - Advanced SIMD Load/Store single structures + bool ST1_sngl_1(bool Q, 
Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt); + bool ST1_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt); + bool ST3_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt); + bool ST3_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt); + bool ST2_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt); + bool ST2_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt); + bool ST4_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt); + bool ST4_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt); + bool LD1_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt); + bool LD1_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt); + bool LD3_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt); + bool LD3_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt); + bool LD1R_1(bool Q, Imm<2> size, Reg Rn, Vec Vt); + bool LD1R_2(bool Q, Reg Rm, Imm<2> size, Reg Rn, Vec Vt); + bool LD3R_1(bool Q, Imm<2> size, Reg Rn, Vec Vt); + bool LD3R_2(bool Q, Reg Rm, Imm<2> size, Reg Rn, Vec Vt); + bool LD2_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt); + bool LD2_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt); + bool LD4_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt); + bool LD4_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt); + bool LD2R_1(bool Q, Imm<2> size, Reg Rn, Vec Vt); + bool LD2R_2(bool Q, Reg Rm, Imm<2> size, Reg Rn, Vec Vt); + bool LD4R_1(bool Q, Imm<2> size, Reg Rn, Vec Vt); + bool LD4R_2(bool Q, Reg Rm, Imm<2> size, Reg Rn, Vec Vt); + + // Loads and stores - Load/Store Exclusive + bool STXR(Imm<2> size, Reg Rs, Reg Rn, Reg Rt); + bool STLXR(Imm<2> size, Reg Rs, Reg Rn, Reg Rt); + bool STXP(Imm<1> size, Reg Rs, Reg Rt2, Reg Rn, Reg Rt); + bool STLXP(Imm<1> size, Reg Rs, Reg Rt2, Reg Rn, Reg Rt); + bool LDXR(Imm<2> size, Reg Rn, Reg Rt); + bool LDAXR(Imm<2> size, Reg Rn, Reg Rt); + bool LDXP(Imm<1> size, Reg Rt2, Reg Rn, Reg Rt); + bool LDAXP(Imm<1> size, Reg Rt2, Reg Rn, Reg Rt); + bool STLLR(Imm<2> size, Reg Rn, Reg Rt); + bool STLR(Imm<2> size, Reg Rn, Reg Rt); + bool LDLAR(Imm<2> size, Reg Rn, Reg Rt); + bool LDAR(Imm<2> size, Reg Rn, Reg Rt); + bool CASP(bool sz, bool L, Reg Rs, bool o0, Reg Rn, Reg Rt); + bool CASB(bool L, Reg Rs, bool o0, Reg Rn, Reg Rt); + bool CASH(bool L, Reg Rs, bool o0, Reg Rn, Reg Rt); + bool CAS(bool sz, bool L, Reg Rs, bool o0, Reg Rn, Reg Rt); + + // Loads and stores - Load register (literal) + bool LDR_lit_gen(bool opc_0, Imm<19> imm19, Reg Rt); + bool LDR_lit_fpsimd(Imm<2> opc, Imm<19> imm19, Vec Vt); + bool LDRSW_lit(Imm<19> imm19, Reg Rt); + bool PRFM_lit(Imm<19> imm19, Imm<5> prfop); + + // Loads and stores - Load/Store no-allocate pair + bool STNP_LDNP_gen(Imm<1> upper_opc, Imm<1> L, Imm<7> imm7, Reg Rt2, Reg Rn, Reg Rt); + bool STNP_LDNP_fpsimd(Imm<2> opc, Imm<1> L, Imm<7> imm7, Vec Vt2, Reg Rn, Vec Vt); + + // Loads and stores - Load/Store register pair + bool STP_LDP_gen(Imm<2> opc, bool not_postindex, bool wback, Imm<1> L, Imm<7> imm7, Reg Rt2, Reg Rn, Reg Rt); + bool STP_LDP_fpsimd(Imm<2> opc, bool not_postindex, bool wback, Imm<1> L, Imm<7> imm7, Vec Vt2, Reg Rn, Vec Vt); + bool STGP_1(Imm<7> offset_imm, Reg Rt2, Reg Rn, Reg Rt); + bool STGP_2(Imm<7> offset_imm, Reg 
Rt2, Reg Rn, Reg Rt); + bool STGP_3(Imm<7> offset_imm, Reg Rt2, Reg Rn, Reg Rt); + + // Loads and stores - Load/Store register (immediate) + bool STRx_LDRx_imm_1(Imm<2> size, Imm<2> opc, Imm<9> imm9, bool not_postindex, Reg Rn, Reg Rt); + bool STRx_LDRx_imm_2(Imm<2> size, Imm<2> opc, Imm<12> imm12, Reg Rn, Reg Rt); + bool STURx_LDURx(Imm<2> size, Imm<2> opc, Imm<9> imm9, Reg Rn, Reg Rt); + bool PRFM_imm(Imm<12> imm12, Reg Rn, Reg Rt); + bool PRFM_unscaled_imm(Imm<9> imm9, Reg Rn, Reg Rt); + bool STR_imm_fpsimd_1(Imm<2> size, Imm<1> opc_1, Imm<9> imm9, bool not_postindex, Reg Rn, Vec Vt); + bool STR_imm_fpsimd_2(Imm<2> size, Imm<1> opc_1, Imm<12> imm12, Reg Rn, Vec Vt); + bool LDR_imm_fpsimd_1(Imm<2> size, Imm<1> opc_1, Imm<9> imm9, bool not_postindex, Reg Rn, Vec Vt); + bool LDR_imm_fpsimd_2(Imm<2> size, Imm<1> opc_1, Imm<12> imm12, Reg Rn, Vec Vt); + bool STUR_fpsimd(Imm<2> size, Imm<1> opc_1, Imm<9> imm9, Reg Rn, Vec Vt); + bool LDUR_fpsimd(Imm<2> size, Imm<1> opc_1, Imm<9> imm9, Reg Rn, Vec Vt); + + // Loads and stores - Load/Store register (unprivileged) + bool STTRB(Imm<9> imm9, Reg Rn, Reg Rt); + bool LDTRB(Imm<9> imm9, Reg Rn, Reg Rt); + bool LDTRSB(Imm<2> opc, Imm<9> imm9, Reg Rn, Reg Rt); + bool STTRH(Imm<9> imm9, Reg Rn, Reg Rt); + bool LDTRH(Imm<9> imm9, Reg Rn, Reg Rt); + bool LDTRSH(Imm<2> opc, Imm<9> imm9, Reg Rn, Reg Rt); + bool STTR(Imm<2> size, Imm<9> imm9, Reg Rn, Reg Rt); + bool LDTR(Imm<2> size, Imm<9> imm9, Reg Rn, Reg Rt); + bool LDTRSW(Imm<9> imm9, Reg Rn, Reg Rt); + + // Loads and stores - Atomic memory options + bool LDADDB(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool LDCLRB(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool LDEORB(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool LDSETB(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool LDSMAXB(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool LDSMINB(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool LDUMAXB(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool LDUMINB(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool SWPB(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool LDAPRB(Reg Rn, Reg Rt); + bool LDADDH(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool LDCLRH(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool LDEORH(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool LDSETH(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool LDSMAXH(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool LDSMINH(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool LDUMAXH(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool LDUMINH(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool SWPH(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool LDAPRH(Reg Rn, Reg Rt); + bool LDADD(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool LDCLR(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool LDEOR(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool LDSET(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool LDSMAX(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool LDSMIN(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool LDUMAX(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool LDUMIN(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool SWP(bool A, bool R, Reg Rs, Reg Rn, Reg Rt); + bool LDAPR(Reg Rn, Reg Rt); + + // Loads and stores - Load/Store register (register offset) + bool STRx_reg(Imm<2> size, Imm<1> opc_1, Reg Rm, Imm<3> option, bool S, Reg Rn, Reg Rt); + bool LDRx_reg(Imm<2> size, Imm<1> opc_1, Reg Rm, Imm<3> option, bool S, Reg Rn, Reg Rt); + bool STR_reg_fpsimd(Imm<2> size, Imm<1> opc_1, Reg Rm, Imm<3> option, bool S, Reg Rn, Vec Vt); + bool LDR_reg_fpsimd(Imm<2> size, Imm<1> opc_1, Reg Rm, Imm<3> option, bool 
S, Reg Rn, Vec Vt); + + // Loads and stores - Load/Store memory tags + bool STG_1(Imm<9> imm9, Reg Rn); + bool STG_2(Imm<9> imm9, Reg Rn); + bool STG_3(Imm<9> imm9, Reg Rn); + bool LDG(Imm<9> offset_imm, Reg Rn, Reg Rt); + bool STZG_1(Imm<9> offset_imm, Reg Rn); + bool STZG_2(Imm<9> offset_imm, Reg Rn); + bool STZG_3(Imm<9> offset_imm, Reg Rn); + bool ST2G_1(Imm<9> offset_imm, Reg Rn); + bool ST2G_2(Imm<9> offset_imm, Reg Rn); + bool ST2G_3(Imm<9> offset_imm, Reg Rn); + bool STGV(Reg Rn, Reg Rt); + bool STZ2G_1(Imm<9> offset_imm, Reg Rn); + bool STZ2G_2(Imm<9> offset_imm, Reg Rn); + bool STZ2G_3(Imm<9> offset_imm, Reg Rn); + bool LDGV(Reg Rn, Reg Rt); + + // Loads and stores - Load/Store register (pointer authentication) + bool LDRA(bool M, bool S, Imm<9> imm9, bool W, Reg Rn, Reg Rt); + + // Data Processing - Register - 2 source + bool UDIV(bool sf, Reg Rm, Reg Rn, Reg Rd); + bool SDIV(bool sf, Reg Rm, Reg Rn, Reg Rd); + bool LSLV(bool sf, Reg Rm, Reg Rn, Reg Rd); + bool LSRV(bool sf, Reg Rm, Reg Rn, Reg Rd); + bool ASRV(bool sf, Reg Rm, Reg Rn, Reg Rd); + bool RORV(bool sf, Reg Rm, Reg Rn, Reg Rd); + bool CRC32(bool sf, Reg Rm, Imm<2> sz, Reg Rn, Reg Rd); + bool CRC32C(bool sf, Reg Rm, Imm<2> sz, Reg Rn, Reg Rd); + bool PACGA(Reg Rm, Reg Rn, Reg Rd); + bool SUBP(Reg Rm, Reg Rn, Reg Rd); + bool IRG(Reg Rm, Reg Rn, Reg Rd); + bool GMI(Reg Rm, Reg Rn, Reg Rd); + bool SUBPS(Reg Rm, Reg Rn, Reg Rd); + + // Data Processing - Register - 1 source + bool RBIT_int(bool sf, Reg Rn, Reg Rd); + bool REV16_int(bool sf, Reg Rn, Reg Rd); + bool REV(bool sf, bool opc_0, Reg Rn, Reg Rd); + bool CLZ_int(bool sf, Reg Rn, Reg Rd); + bool CLS_int(bool sf, Reg Rn, Reg Rd); + bool REV32_int(Reg Rn, Reg Rd); + bool PACDA(bool Z, Reg Rn, Reg Rd); + bool PACDB(bool Z, Reg Rn, Reg Rd); + bool AUTDA(bool Z, Reg Rn, Reg Rd); + bool AUTDB(bool Z, Reg Rn, Reg Rd); + + // Data Processing - Register - Logical (shifted register) + bool AND_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd); + bool BIC_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd); + bool ORR_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd); + bool ORN_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd); + bool EOR_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd); + bool EON(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd); + bool ANDS_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd); + bool BICS(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd); + + // Data Processing - Register - Add/Sub (shifted register) + bool ADD_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd); + bool ADDS_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd); + bool SUB_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd); + bool SUBS_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn, Reg Rd); + + // Data Processing - Register - Add/Sub (extended register) + bool ADD_ext(bool sf, Reg Rm, Imm<3> option, Imm<3> imm3, Reg Rn, Reg Rd); + bool ADDS_ext(bool sf, Reg Rm, Imm<3> option, Imm<3> imm3, Reg Rn, Reg Rd); + bool SUB_ext(bool sf, Reg Rm, Imm<3> option, Imm<3> imm3, Reg Rn, Reg Rd); + bool SUBS_ext(bool sf, Reg Rm, Imm<3> option, Imm<3> imm3, Reg Rn, Reg Rd); + + // Data Processing - Register - Add/Sub (with carry) + bool ADC(bool sf, Reg Rm, Reg Rn, Reg Rd); + bool ADCS(bool sf, Reg Rm, Reg Rn, Reg Rd); + bool SBC(bool sf, Reg Rm, Reg Rn, Reg Rd); + bool SBCS(bool sf, Reg Rm, Reg Rn, Reg Rd); + + // Data 
Processing - Register - Conditional compare + bool CCMN_reg(bool sf, Reg Rm, Cond cond, Reg Rn, Imm<4> nzcv); + bool CCMP_reg(bool sf, Reg Rm, Cond cond, Reg Rn, Imm<4> nzcv); + bool CCMN_imm(bool sf, Imm<5> imm5, Cond cond, Reg Rn, Imm<4> nzcv); + bool CCMP_imm(bool sf, Imm<5> imm5, Cond cond, Reg Rn, Imm<4> nzcv); + + // Data Processing - Register - Conditional select + bool CSEL(bool sf, Reg Rm, Cond cond, Reg Rn, Reg Rd); + bool CSINC(bool sf, Reg Rm, Cond cond, Reg Rn, Reg Rd); + bool CSINV(bool sf, Reg Rm, Cond cond, Reg Rn, Reg Rd); + bool CSNEG(bool sf, Reg Rm, Cond cond, Reg Rn, Reg Rd); + + // Data Processing - Register - 3 source + bool MADD(bool sf, Reg Rm, Reg Ra, Reg Rn, Reg Rd); + bool MSUB(bool sf, Reg Rm, Reg Ra, Reg Rn, Reg Rd); + bool SMADDL(Reg Rm, Reg Ra, Reg Rn, Reg Rd); + bool SMSUBL(Reg Rm, Reg Ra, Reg Rn, Reg Rd); + bool SMULH(Reg Rm, Reg Rn, Reg Rd); + bool UMADDL(Reg Rm, Reg Ra, Reg Rn, Reg Rd); + bool UMSUBL(Reg Rm, Reg Ra, Reg Rn, Reg Rd); + bool UMULH(Reg Rm, Reg Rn, Reg Rd); + + // Data Processing - FP and SIMD - AES + bool AESE(Vec Vn, Vec Vd); + bool AESD(Vec Vn, Vec Vd); + bool AESMC(Vec Vn, Vec Vd); + bool AESIMC(Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - SHA + bool SHA1C(Vec Vm, Vec Vn, Vec Vd); + bool SHA1P(Vec Vm, Vec Vn, Vec Vd); + bool SHA1M(Vec Vm, Vec Vn, Vec Vd); + bool SHA1SU0(Vec Vm, Vec Vn, Vec Vd); + bool SHA256H(Vec Vm, Vec Vn, Vec Vd); + bool SHA256H2(Vec Vm, Vec Vn, Vec Vd); + bool SHA256SU1(Vec Vm, Vec Vn, Vec Vd); + bool SHA1H(Vec Vn, Vec Vd); + bool SHA1SU1(Vec Vn, Vec Vd); + bool SHA256SU0(Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - Scalar copy + bool DUP_elt_1(Imm<5> imm5, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - Scalar three + bool FMULX_vec_1(Vec Vm, Vec Vn, Vec Vd); + bool FMULX_vec_2(bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FCMEQ_reg_1(Vec Vm, Vec Vn, Vec Vd); + bool FCMEQ_reg_2(bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FRECPS_1(Vec Vm, Vec Vn, Vec Vd); + bool FRECPS_2(bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FRSQRTS_1(Vec Vm, Vec Vn, Vec Vd); + bool FRSQRTS_2(bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FCMGE_reg_1(Vec Vm, Vec Vn, Vec Vd); + bool FCMGE_reg_2(bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FACGE_1(Vec Vm, Vec Vn, Vec Vd); + bool FACGE_2(bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FABD_1(Vec Vm, Vec Vn, Vec Vd); + bool FABD_2(bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FCMGT_reg_1(Vec Vm, Vec Vn, Vec Vd); + bool FCMGT_reg_2(bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FACGT_1(Vec Vm, Vec Vn, Vec Vd); + bool FACGT_2(bool sz, Vec Vm, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - Two register misc FP16 + bool FCVTNS_1(Vec Vn, Vec Vd); + bool FCVTMS_1(Vec Vn, Vec Vd); + bool FCVTAS_1(Vec Vn, Vec Vd); + bool SCVTF_int_1(Vec Vn, Vec Vd); + bool FCMGT_zero_1(Vec Vn, Vec Vd); + bool FCMEQ_zero_1(Vec Vn, Vec Vd); + bool FCMLT_1(Vec Vn, Vec Vd); + bool FCVTPS_1(Vec Vn, Vec Vd); + bool FCVTZS_int_1(Vec Vn, Vec Vd); + bool FRECPE_1(Vec Vn, Vec Vd); + bool FRECPX_1(Vec Vn, Vec Vd); + bool FCVTNU_1(Vec Vn, Vec Vd); + bool FCVTMU_1(Vec Vn, Vec Vd); + bool FCVTAU_1(Vec Vn, Vec Vd); + bool UCVTF_int_1(Vec Vn, Vec Vd); + bool FCMGE_zero_1(Vec Vn, Vec Vd); + bool FCMLE_1(Vec Vn, Vec Vd); + bool FCVTPU_1(Vec Vn, Vec Vd); + bool FCVTZU_int_1(Vec Vn, Vec Vd); + bool FRSQRTE_1(Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - Two register misc + bool FCVTNS_2(bool sz, Vec Vn, Vec Vd); + bool FCVTMS_2(bool sz, Vec Vn, Vec Vd); + bool FCVTAS_2(bool sz, Vec Vn, Vec Vd); + bool SCVTF_int_2(bool sz, Vec Vn, 
Vec Vd); + bool FCMGT_zero_2(bool sz, Vec Vn, Vec Vd); + bool FCMEQ_zero_2(bool sz, Vec Vn, Vec Vd); + bool FCMLT_2(bool sz, Vec Vn, Vec Vd); + bool FCVTPS_2(bool sz, Vec Vn, Vec Vd); + bool FCVTZS_int_2(bool sz, Vec Vn, Vec Vd); + bool FRECPE_2(bool sz, Vec Vn, Vec Vd); + bool FRECPX_2(bool sz, Vec Vn, Vec Vd); + bool FCVTNU_2(bool sz, Vec Vn, Vec Vd); + bool FCVTMU_2(bool sz, Vec Vn, Vec Vd); + bool FCVTAU_2(bool sz, Vec Vn, Vec Vd); + bool UCVTF_int_2(bool sz, Vec Vn, Vec Vd); + bool FCMGE_zero_2(bool sz, Vec Vn, Vec Vd); + bool FCMLE_2(bool sz, Vec Vn, Vec Vd); + bool FCVTPU_2(bool sz, Vec Vn, Vec Vd); + bool FCVTZU_int_2(bool sz, Vec Vn, Vec Vd); + bool FRSQRTE_2(bool sz, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - Scalar two register misc FP16 + bool FCVTNS_3(bool Q, Vec Vn, Vec Vd); + bool FCVTMS_3(bool Q, Vec Vn, Vec Vd); + bool FCVTAS_3(bool Q, Vec Vn, Vec Vd); + bool SCVTF_int_3(bool Q, Vec Vn, Vec Vd); + bool FCMGT_zero_3(bool Q, Vec Vn, Vec Vd); + bool FCMEQ_zero_3(bool Q, Vec Vn, Vec Vd); + bool FCMLT_3(bool Q, Vec Vn, Vec Vd); + bool FCVTPS_3(bool Q, Vec Vn, Vec Vd); + bool FCVTZS_int_3(bool Q, Vec Vn, Vec Vd); + bool FRECPE_3(bool Q, Vec Vn, Vec Vd); + bool FCVTNU_3(bool Q, Vec Vn, Vec Vd); + bool FCVTMU_3(bool Q, Vec Vn, Vec Vd); + bool FCVTAU_3(bool Q, Vec Vn, Vec Vd); + bool UCVTF_int_3(bool Q, Vec Vn, Vec Vd); + bool FCMGE_zero_3(bool Q, Vec Vn, Vec Vd); + bool FCMLE_3(bool Q, Vec Vn, Vec Vd); + bool FCVTPU_3(bool Q, Vec Vn, Vec Vd); + bool FCVTZU_int_3(bool Q, Vec Vn, Vec Vd); + bool FRSQRTE_3(bool Q, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - Scalar two register misc + bool FCVTNS_4(bool Q, bool sz, Vec Vn, Vec Vd); + bool FCVTMS_4(bool Q, bool sz, Vec Vn, Vec Vd); + bool FCVTAS_4(bool Q, bool sz, Vec Vn, Vec Vd); + bool SCVTF_int_4(bool Q, bool sz, Vec Vn, Vec Vd); + bool FCMGT_zero_4(bool Q, bool sz, Vec Vn, Vec Vd); + bool FCMEQ_zero_4(bool Q, bool sz, Vec Vn, Vec Vd); + bool FCMLT_4(bool Q, bool sz, Vec Vn, Vec Vd); + bool FCVTPS_4(bool Q, bool sz, Vec Vn, Vec Vd); + bool FCVTZS_int_4(bool Q, bool sz, Vec Vn, Vec Vd); + bool FRECPE_4(bool Q, bool sz, Vec Vn, Vec Vd); + bool FCVTNU_4(bool Q, bool sz, Vec Vn, Vec Vd); + bool FCVTMU_4(bool Q, bool sz, Vec Vn, Vec Vd); + bool FCVTAU_4(bool Q, bool sz, Vec Vn, Vec Vd); + bool UCVTF_int_4(bool Q, bool sz, Vec Vn, Vec Vd); + bool FCMGE_zero_4(bool Q, bool sz, Vec Vn, Vec Vd); + bool FCMLE_4(bool Q, bool sz, Vec Vn, Vec Vd); + bool FCVTPU_4(bool Q, bool sz, Vec Vn, Vec Vd); + bool FCVTZU_int_4(bool Q, bool sz, Vec Vn, Vec Vd); + bool FRSQRTE_4(bool Q, bool sz, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - Scalar three same extra + bool SQRDMLAH_vec_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SQRDMLAH_vec_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SQRDMLSH_vec_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SQRDMLSH_vec_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - Scalar two-register misc + bool SUQADD_1(Imm<2> size, Vec Vn, Vec Vd); + bool SQABS_1(Imm<2> size, Vec Vn, Vec Vd); + bool CMGT_zero_1(Imm<2> size, Vec Vn, Vec Vd); + bool CMEQ_zero_1(Imm<2> size, Vec Vn, Vec Vd); + bool CMLT_1(Imm<2> size, Vec Vn, Vec Vd); + bool ABS_1(Imm<2> size, Vec Vn, Vec Vd); + bool SQXTN_1(Imm<2> size, Vec Vn, Vec Vd); + bool USQADD_1(Imm<2> size, Vec Vn, Vec Vd); + bool SQNEG_1(Imm<2> size, Vec Vn, Vec Vd); + bool CMGE_zero_1(Imm<2> size, Vec Vn, Vec Vd); + bool CMLE_1(Imm<2> size, Vec Vn, Vec Vd); + bool NEG_1(Imm<2> size, Vec Vn, Vec Vd); + bool 
SQXTUN_1(Imm<2> size, Vec Vn, Vec Vd); + bool UQXTN_1(Imm<2> size, Vec Vn, Vec Vd); + bool FCVTXN_1(bool sz, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - SIMD Scalar pairwise + bool ADDP_pair(Imm<2> size, Vec Vn, Vec Vd); + bool FMAXNMP_pair_1(Vec Vn, Vec Vd); + bool FMAXNMP_pair_2(bool sz, Vec Vn, Vec Vd); + bool FADDP_pair_1(Vec Vn, Vec Vd); + bool FADDP_pair_2(bool sz, Vec Vn, Vec Vd); + bool FMAXP_pair_1(Vec Vn, Vec Vd); + bool FMAXP_pair_2(bool sz, Vec Vn, Vec Vd); + bool FMINNMP_pair_1(Vec Vn, Vec Vd); + bool FMINNMP_pair_2(bool sz, Vec Vn, Vec Vd); + bool FMINP_pair_1(Vec Vn, Vec Vd); + bool FMINP_pair_2(bool sz, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - SIMD Scalar three different + bool SQDMLAL_vec_1(Imm<2> size, Reg Rm, Reg Rn, Vec Vd); + bool SQDMLSL_vec_1(Imm<2> size, Reg Rm, Reg Rn, Vec Vd); + bool SQDMULL_vec_1(Imm<2> size, Reg Rm, Reg Rn, Vec Vd); + + // Data Processing - FP and SIMD - SIMD Scalar three same + bool SQADD_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SQSUB_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool CMGT_reg_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool CMGE_reg_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SSHL_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SQSHL_reg_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SRSHL_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SQRSHL_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool ADD_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool CMTST_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SQDMULH_vec_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool UQADD_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool UQSUB_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool CMHI_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool CMHS_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool USHL_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool UQSHL_reg_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool URSHL_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool UQRSHL_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SUB_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool CMEQ_reg_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SQRDMULH_vec_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - SIMD Scalar shift by immediate + bool SSHR_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SSRA_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SRSHR_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SRSRA_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SHL_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SQSHL_imm_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SQSHRN_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SQRSHRN_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SCVTF_fix_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool FCVTZS_fix_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool USHR_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool USRA_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool URSHR_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool URSRA_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SRI_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SLI_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SQSHLU_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool UQSHL_imm_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SQSHRUN_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SQRSHRUN_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool UQSHRN_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool UQRSHRN_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool UCVTF_fix_1(Imm<4> immh, 
Imm<3> immb, Vec Vn, Vec Vd); + bool FCVTZU_fix_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - SIMD Scalar x indexed element + bool SQDMLAL_elt_1(Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool SQDMLSL_elt_1(Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool SQDMULL_elt_1(Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool SQDMULH_elt_1(Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool SQRDMULH_elt_1(Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool FMLA_elt_1(Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool FMLA_elt_2(bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool FMLS_elt_1(Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool FMLS_elt_2(bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool FMUL_elt_1(Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool FMUL_elt_2(bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool SQRDMLAH_elt_1(Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool SQRDMLSH_elt_1(Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool FMULX_elt_1(Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool FMULX_elt_2(bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - SIMD Table Lookup + bool TBL(bool Q, Vec Vm, Imm<2> len, size_t Vn, Vec Vd); + bool TBX(bool Q, Vec Vm, Imm<2> len, size_t Vn, Vec Vd); + + // Data Processing - FP and SIMD - SIMD Permute + bool UZP1(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool TRN1(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool ZIP1(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool UZP2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool TRN2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool ZIP2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - SIMD Extract + bool EXT(bool Q, Vec Vm, Imm<4> imm4, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - SIMD Copy + bool DUP_elt_2(bool Q, Imm<5> imm5, Vec Vn, Vec Vd); + bool DUP_gen(bool Q, Imm<5> imm5, Reg Rn, Vec Vd); + bool SMOV(bool Q, Imm<5> imm5, Vec Vn, Reg Rd); + bool UMOV(bool Q, Imm<5> imm5, Vec Vn, Reg Rd); + bool INS_gen(Imm<5> imm5, Reg Rn, Vec Vd); + bool INS_elt(Imm<5> imm5, Imm<4> imm4, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - SIMD Three same + bool FMULX_vec_3(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool FCMEQ_reg_3(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool FRECPS_3(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool FRSQRTS_3(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool FCMGE_reg_3(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool FACGE_3(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool FABD_3(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool FCMGT_reg_3(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool FACGT_3(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool FMAXNM_1(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool FMLA_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool FADD_1(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool FMAX_1(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool FMINNM_1(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool FMLS_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool FSUB_1(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool FMIN_1(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool FMAXNMP_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool FADDP_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool FMUL_vec_1(bool Q, 
Vec Vm, Vec Vn, Vec Vd); + bool FMAXP_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool FDIV_1(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool FMINNMP_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool FMINP_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - SIMD Three same extra + bool SDOT_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool UDOT_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool FCMLA_vec(bool Q, Imm<2> size, Vec Vm, Imm<2> rot, Vec Vn, Vec Vd); + bool FCADD_vec(bool Q, Imm<2> size, Vec Vm, Imm<1> rot, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - SIMD Two register misc + bool REV64_asimd(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool REV16_asimd(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool SADDLP(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool CLS_asimd(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool CNT(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool SADALP(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool XTN(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool FCVTN(bool Q, bool sz, Vec Vn, Vec Vd); + bool FCVTL(bool Q, bool sz, Vec Vn, Vec Vd); + bool URECPE(bool Q, bool sz, Vec Vn, Vec Vd); + bool REV32_asimd(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool UADDLP(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool CLZ_asimd(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool UADALP(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool SHLL(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool NOT(bool Q, Vec Vn, Vec Vd); + bool RBIT_asimd(bool Q, Vec Vn, Vec Vd); + bool URSQRTE(bool Q, bool sz, Vec Vn, Vec Vd); + bool SUQADD_2(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool SQABS_2(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool CMGT_zero_2(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool CMEQ_zero_2(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool CMLT_2(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool ABS_2(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool SQXTN_2(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool USQADD_2(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool SQNEG_2(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool CMGE_zero_2(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool CMLE_2(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool NEG_2(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool SQXTUN_2(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool UQXTN_2(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool FCVTXN_2(bool Q, bool sz, Vec Vn, Vec Vd); + bool FRINTN_1(bool Q, Vec Vn, Vec Vd); + bool FRINTN_2(bool Q, bool sz, Vec Vn, Vec Vd); + bool FRINTM_1(bool Q, Vec Vn, Vec Vd); + bool FRINTM_2(bool Q, bool sz, Vec Vn, Vec Vd); + bool FABS_1(bool Q, Vec Vn, Vec Vd); + bool FABS_2(bool Q, bool sz, Vec Vn, Vec Vd); + bool FRINTP_1(bool Q, Vec Vn, Vec Vd); + bool FRINTP_2(bool Q, bool sz, Vec Vn, Vec Vd); + bool FRINTZ_1(bool Q, Vec Vn, Vec Vd); + bool FRINTZ_2(bool Q, bool sz, Vec Vn, Vec Vd); + bool FRINTA_1(bool Q, Vec Vn, Vec Vd); + bool FRINTA_2(bool Q, bool sz, Vec Vn, Vec Vd); + bool FRINTX_1(bool Q, Vec Vn, Vec Vd); + bool FRINTX_2(bool Q, bool sz, Vec Vn, Vec Vd); + bool FNEG_1(bool Q, Vec Vn, Vec Vd); + bool FNEG_2(bool Q, bool sz, Vec Vn, Vec Vd); + bool FRINTI_1(bool Q, Vec Vn, Vec Vd); + bool FRINTI_2(bool Q, bool sz, Vec Vn, Vec Vd); + bool FSQRT_1(bool Q, Vec Vn, Vec Vd); + bool FSQRT_2(bool Q, bool sz, Vec Vn, Vec Vd); + bool FRINT32X_1(bool Q, bool sz, Vec Vn, Vec Vd); + bool FRINT64X_1(bool Q, bool sz, Vec Vn, Vec Vd); + bool FRINT32Z_1(bool Q, bool sz, Vec Vn, Vec Vd); + bool FRINT64Z_1(bool Q, bool sz, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - SIMD across lanes + bool SADDLV(bool Q, Imm<2> size, Vec Vn, Vec Vd); + 
bool SMAXV(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool SMINV(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool ADDV(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool FMAXNMV_1(bool Q, Vec Vn, Vec Vd); + bool FMAXNMV_2(bool Q, bool sz, Vec Vn, Vec Vd); + bool FMAXV_1(bool Q, Vec Vn, Vec Vd); + bool FMAXV_2(bool Q, bool sz, Vec Vn, Vec Vd); + bool FMINNMV_1(bool Q, Vec Vn, Vec Vd); + bool FMINNMV_2(bool Q, bool sz, Vec Vn, Vec Vd); + bool FMINV_1(bool Q, Vec Vn, Vec Vd); + bool FMINV_2(bool Q, bool sz, Vec Vn, Vec Vd); + bool UADDLV(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool UMAXV(bool Q, Imm<2> size, Vec Vn, Vec Vd); + bool UMINV(bool Q, Imm<2> size, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - SIMD three different + bool SADDL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SADDW(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SSUBL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SSUBW(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool ADDHN(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SABAL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SUBHN(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SABDL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SMLAL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SMLSL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SMULL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool PMULL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool UADDL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool UADDW(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool USUBL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool USUBW(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool RADDHN(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool UABAL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool RSUBHN(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool UABDL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool UMLAL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool UMLSL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool UMULL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SQDMLAL_vec_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SQDMLSL_vec_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SQDMULL_vec_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - SIMD three same + bool SHADD(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SRHADD(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SHSUB(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SMAX(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SMIN(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SABD(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SABA(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool MLA_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool MUL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SMAXP(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SMINP(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool ADDP_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool FMLAL_vec_1(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FMLAL_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool AND_asimd(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool BIC_asimd_reg(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool FMLSL_vec_1(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FMLSL_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool ORR_asimd_reg(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool ORN_asimd(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool UHADD(bool Q, 
Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool URHADD(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool UHSUB(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool UMAX(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool UMIN(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool UABD(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool UABA(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool MLS_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool PMUL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool UMAXP(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool UMINP(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool EOR_asimd(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool BSL(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool BIT(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool BIF(bool Q, Vec Vm, Vec Vn, Vec Vd); + bool FMAXNM_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FMLA_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FADD_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FMAX_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FMINNM_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FMLS_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FSUB_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FMIN_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FMAXNMP_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FADDP_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FMUL_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FMAXP_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FDIV_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FMINNMP_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FMINP_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FMULX_vec_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FCMEQ_reg_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FRECPS_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FRSQRTS_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FCMGE_reg_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FACGE_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FABD_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FCMGT_reg_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool FACGT_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd); + bool SQADD_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SQSUB_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool CMGT_reg_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool CMGE_reg_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SSHL_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SQSHL_reg_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SRSHL_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SQRSHL_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool ADD_vector(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool CMTST_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SQDMULH_vec_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool UQADD_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool UQSUB_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool CMHI_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool CMHS_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool USHL_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool UQSHL_reg_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool URSHL_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool UQRSHL_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SUB_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool CMEQ_reg_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + bool SQRDMULH_vec_2(bool 
Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - SIMD modified immediate + bool MOVI(bool Q, bool op, Imm<1> a, Imm<1> b, Imm<1> c, Imm<4> cmode, Imm<1> d, Imm<1> e, Imm<1> f, Imm<1> g, Imm<1> h, Vec Vd); + bool FMOV_2(bool Q, bool op, Imm<1> a, Imm<1> b, Imm<1> c, Imm<1> d, Imm<1> e, Imm<1> f, Imm<1> g, Imm<1> h, Vec Vd); + bool FMOV_3(bool Q, Imm<1> a, Imm<1> b, Imm<1> c, Imm<1> d, Imm<1> e, Imm<1> f, Imm<1> g, Imm<1> h, Vec Vd); + + // Data Processing - FP and SIMD - SIMD Shift by immediate + bool SSHR_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SSRA_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SRSHR_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SRSRA_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SHL_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SQSHL_imm_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SHRN(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool RSHRN(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SQSHRN_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SQRSHRN_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SSHLL(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SCVTF_fix_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool FCVTZS_fix_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool USHR_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool USRA_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool URSHR_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool URSRA_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SRI_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SLI_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SQSHLU_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool UQSHL_imm_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SQSHRUN_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool SQRSHRUN_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool UQSHRN_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool UQRSHRN_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool USHLL(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool UCVTF_fix_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + bool FCVTZU_fix_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - SIMD vector x indexed element + bool SMLAL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool SQDMLAL_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool SMLSL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool SQDMLSL_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool MUL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool SMULL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vm, Imm<1> H, Vec Vn, Vec Vd); + bool SQDMULL_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool SQDMULH_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool SQRDMULH_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool SDOT_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool FMLA_elt_3(bool Q, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool 
FMLA_elt_4(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool FMLS_elt_3(bool Q, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool FMLS_elt_4(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool FMUL_elt_3(bool Q, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool FMUL_elt_4(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool FMLAL_elt_1(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool FMLAL_elt_2(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool FMLSL_elt_1(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool FMLSL_elt_2(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool MLA_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool UMLAL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool MLS_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool UMLSL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool UMULL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool SQRDMLAH_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool UDOT_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool SQRDMLSH_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool FMULX_elt_3(bool Q, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool FMULX_elt_4(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd); + bool FCMLA_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<2> rot, Imm<1> H, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - Cryptographic three register + bool SM3TT1A(Vec Vm, Imm<2> imm2, Vec Vn, Vec Vd); + bool SM3TT1B(Vec Vm, Imm<2> imm2, Vec Vn, Vec Vd); + bool SM3TT2A(Vec Vm, Imm<2> imm2, Vec Vn, Vec Vd); + bool SM3TT2B(Vec Vm, Imm<2> imm2, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - SHA512 three register + bool SHA512H(Vec Vm, Vec Vn, Vec Vd); + bool SHA512H2(Vec Vm, Vec Vn, Vec Vd); + bool SHA512SU1(Vec Vm, Vec Vn, Vec Vd); + bool RAX1(Vec Vm, Vec Vn, Vec Vd); + bool XAR(Vec Vm, Imm<6> imm6, Vec Vn, Vec Vd); + bool SM3PARTW1(Vec Vm, Vec Vn, Vec Vd); + bool SM3PARTW2(Vec Vm, Vec Vn, Vec Vd); + bool SM4EKEY(Vec Vm, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - Cryptographic four register + bool EOR3(Vec Vm, Vec Va, Vec Vn, Vec Vd); + bool BCAX(Vec Vm, Vec Va, Vec Vn, Vec Vd); + bool SM3SS1(Vec Vm, Vec Va, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - SHA512 two register + bool SHA512SU0(Vec Vn, Vec Vd); + bool SM4E(Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - Conversion between floating point and fixed point + bool SCVTF_float_fix(bool sf, Imm<2> type, Imm<6> scale, Reg Rn, Vec Vd); + bool UCVTF_float_fix(bool sf, Imm<2> type, Imm<6> scale, Reg Rn, Vec Vd); + bool FCVTZS_float_fix(bool sf, Imm<2> type, Imm<6> scale, Vec Vn, Reg Rd); + bool FCVTZU_float_fix(bool sf, Imm<2> type, Imm<6> scale, Vec Vn, Reg Rd); + + // Data Processing - FP and SIMD - Conversion between floating point and integer + bool FCVTNS_float(bool sf, Imm<2> type, Vec Vn, Reg Rd); + bool FCVTNU_float(bool sf, Imm<2> type, Vec Vn, Reg Rd); + bool SCVTF_float_int(bool sf, Imm<2> 
type, Reg Rn, Vec Vd); + bool UCVTF_float_int(bool sf, Imm<2> type, Reg Rn, Vec Vd); + bool FCVTAS_float(bool sf, Imm<2> type, Vec Vn, Reg Rd); + bool FCVTAU_float(bool sf, Imm<2> type, Vec Vn, Reg Rd); + bool FMOV_float_gen(bool sf, Imm<2> type, Imm<1> rmode_0, Imm<1> opc_0, size_t n, size_t d); + bool FCVTPS_float(bool sf, Imm<2> type, Vec Vn, Reg Rd); + bool FCVTPU_float(bool sf, Imm<2> type, Vec Vn, Reg Rd); + bool FCVTMS_float(bool sf, Imm<2> type, Vec Vn, Reg Rd); + bool FCVTMU_float(bool sf, Imm<2> type, Vec Vn, Reg Rd); + bool FCVTZS_float_int(bool sf, Imm<2> type, Vec Vn, Reg Rd); + bool FCVTZU_float_int(bool sf, Imm<2> type, Vec Vn, Reg Rd); + bool FJCVTZS(Vec Vn, Reg Rd); + + // Data Processing - FP and SIMD - Floating point data processing + bool FMOV_float(Imm<2> type, Vec Vn, Vec Vd); + bool FABS_float(Imm<2> type, Vec Vn, Vec Vd); + bool FNEG_float(Imm<2> type, Vec Vn, Vec Vd); + bool FSQRT_float(Imm<2> type, Vec Vn, Vec Vd); + bool FCVT_float(Imm<2> type, Imm<2> opc, Vec Vn, Vec Vd); + bool FRINTN_float(Imm<2> type, Vec Vn, Vec Vd); + bool FRINTP_float(Imm<2> type, Vec Vn, Vec Vd); + bool FRINTM_float(Imm<2> type, Vec Vn, Vec Vd); + bool FRINTZ_float(Imm<2> type, Vec Vn, Vec Vd); + bool FRINTA_float(Imm<2> type, Vec Vn, Vec Vd); + bool FRINTX_float(Imm<2> type, Vec Vn, Vec Vd); + bool FRINTI_float(Imm<2> type, Vec Vn, Vec Vd); + bool FRINT32X_float(Imm<2> type, Vec Vn, Vec Vd); + bool FRINT64X_float(Imm<2> type, Vec Vn, Vec Vd); + bool FRINT32Z_float(Imm<2> type, Vec Vn, Vec Vd); + bool FRINT64Z_float(Imm<2> type, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - Floating point compare + bool FCMP_float(Imm<2> type, Vec Vm, Vec Vn, bool cmp_with_zero); + bool FCMPE_float(Imm<2> type, Vec Vm, Vec Vn, bool cmp_with_zero); + + // Data Processing - FP and SIMD - Floating point immediate + bool FMOV_float_imm(Imm<2> type, Imm<8> imm8, Vec Vd); + + // Data Processing - FP and SIMD - Floating point conditional compare + bool FCCMP_float(Imm<2> type, Vec Vm, Cond cond, Vec Vn, Imm<4> nzcv); + bool FCCMPE_float(Imm<2> type, Vec Vm, Cond cond, Vec Vn, Imm<4> nzcv); + + // Data Processing - FP and SIMD - Floating point data processing two register + bool FMUL_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd); + bool FDIV_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd); + bool FADD_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd); + bool FSUB_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd); + bool FMAX_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd); + bool FMIN_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd); + bool FMAXNM_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd); + bool FMINNM_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd); + bool FNMUL_float(Imm<2> type, Vec Vm, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - Floating point conditional select + bool FCSEL_float(Imm<2> type, Vec Vm, Cond cond, Vec Vn, Vec Vd); + + // Data Processing - FP and SIMD - Floating point data processing three register + bool FMADD_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd); + bool FMSUB_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd); + bool FNMADD_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd); + bool FNMSUB_float(Imm<2> type, Vec Vm, Vec Va, Vec Vn, Vec Vd); +}; + +inline std::optional<size_t> FPGetDataSize(Imm<2> type) { + switch (type.ZeroExtend()) { + case 0b00: + return 32; + case 0b01: + return 64; + case 0b11: + return 16; + } + return std::nullopt; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_exclusive.cpp 
b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_exclusive.cpp new file mode 100644 index 0000000000..0e67e11b4b --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_exclusive.cpp @@ -0,0 +1,209 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <optional> + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +static bool ExclusiveSharedDecodeAndOperation(TranslatorVisitor& v, bool pair, size_t size, bool L, bool o0, std::optional<Reg> Rs, std::optional<Reg> Rt2, Reg Rn, Reg Rt) { + // Shared Decode + + const auto acctype = o0 ? IR::AccType::ORDERED : IR::AccType::ATOMIC; + const auto memop = L ? IR::MemOp::LOAD : IR::MemOp::STORE; + const size_t elsize = 8 << size; + const size_t regsize = elsize == 64 ? 64 : 32; + const size_t datasize = pair ? elsize * 2 : elsize; + + // Operation + + const size_t dbytes = datasize / 8; + + if (memop == IR::MemOp::LOAD && pair && Rt == *Rt2) { + return v.UnpredictableInstruction(); + } else if (memop == IR::MemOp::STORE && (*Rs == Rt || (pair && *Rs == *Rt2))) { + if (!v.options.define_unpredictable_behaviour) { + return v.UnpredictableInstruction(); + } + // UNPREDICTABLE: The Constraint_NONE case is executed. + } else if (memop == IR::MemOp::STORE && *Rs == Rn && Rn != Reg::R31) { + return v.UnpredictableInstruction(); + } + + IR::U64 address; + if (Rn == Reg::SP) { + // TODO: Check SP Alignment + address = v.SP(64); + } else { + address = v.X(64, Rn); + } + + switch (memop) { + case IR::MemOp::STORE: { + IR::UAnyU128 data; + if (pair && elsize == 64) { + data = v.ir.Pack2x64To1x128(v.X(64, Rt), v.X(64, *Rt2)); + } else if (pair && elsize == 32) { + data = v.ir.Pack2x32To1x64(v.X(32, Rt), v.X(32, *Rt2)); + } else { + data = v.X(elsize, Rt); + } + const IR::U32 status = v.ExclusiveMem(address, dbytes, acctype, data); + v.X(32, *Rs, status); + break; + } + case IR::MemOp::LOAD: { + const IR::UAnyU128 data = v.ExclusiveMem(address, dbytes, acctype); + if (pair && elsize == 64) { + v.X(64, Rt, v.ir.VectorGetElement(64, data, 0)); + v.X(64, *Rt2, v.ir.VectorGetElement(64, data, 1)); + } else if (pair && elsize == 32) { + v.X(32, Rt, v.ir.LeastSignificantWord(data)); + v.X(32, *Rt2, v.ir.MostSignificantWord(data).result); + } else { + v.X(regsize, Rt, v.ZeroExtend(data, regsize)); + } + break; + } + default: + UNREACHABLE(); + } + + return true; +} + +bool TranslatorVisitor::STXR(Imm<2> sz, Reg Rs, Reg Rn, Reg Rt) { + const bool pair = false; + const size_t size = sz.ZeroExtend<size_t>(); + const bool L = 0; + const bool o0 = 0; + return ExclusiveSharedDecodeAndOperation(*this, pair, size, L, o0, Rs, {}, Rn, Rt); +} + +bool TranslatorVisitor::STLXR(Imm<2> sz, Reg Rs, Reg Rn, Reg Rt) { + const bool pair = false; + const size_t size = sz.ZeroExtend<size_t>(); + const bool L = 0; + const bool o0 = 1; + return ExclusiveSharedDecodeAndOperation(*this, pair, size, L, o0, Rs, {}, Rn, Rt); +} + +bool TranslatorVisitor::STXP(Imm<1> sz, Reg Rs, Reg Rt2, Reg Rn, Reg Rt) { + const bool pair = true; + const size_t size = concatenate(Imm<1>{1}, sz).ZeroExtend<size_t>(); + const bool L = 0; + const bool o0 = 0; + return ExclusiveSharedDecodeAndOperation(*this, pair, size, L, o0, Rs, Rt2, Rn, Rt); +} + +bool TranslatorVisitor::STLXP(Imm<1> sz, Reg Rs, Reg Rt2, Reg Rn, Reg Rt) { + const bool pair = true; + const size_t size = concatenate(Imm<1>{1}, sz).ZeroExtend<size_t>(); + const bool L = 0; + 
const bool o0 = 1; + return ExclusiveSharedDecodeAndOperation(*this, pair, size, L, o0, Rs, Rt2, Rn, Rt); +} + +bool TranslatorVisitor::LDXR(Imm<2> sz, Reg Rn, Reg Rt) { + const bool pair = false; + const size_t size = sz.ZeroExtend<size_t>(); + const bool L = 1; + const bool o0 = 0; + return ExclusiveSharedDecodeAndOperation(*this, pair, size, L, o0, {}, {}, Rn, Rt); +} + +bool TranslatorVisitor::LDAXR(Imm<2> sz, Reg Rn, Reg Rt) { + const bool pair = false; + const size_t size = sz.ZeroExtend<size_t>(); + const bool L = 1; + const bool o0 = 1; + return ExclusiveSharedDecodeAndOperation(*this, pair, size, L, o0, {}, {}, Rn, Rt); +} + +bool TranslatorVisitor::LDXP(Imm<1> sz, Reg Rt2, Reg Rn, Reg Rt) { + const bool pair = true; + const size_t size = concatenate(Imm<1>{1}, sz).ZeroExtend<size_t>(); + const bool L = 1; + const bool o0 = 0; + return ExclusiveSharedDecodeAndOperation(*this, pair, size, L, o0, {}, Rt2, Rn, Rt); +} + +bool TranslatorVisitor::LDAXP(Imm<1> sz, Reg Rt2, Reg Rn, Reg Rt) { + const bool pair = true; + const size_t size = concatenate(Imm<1>{1}, sz).ZeroExtend<size_t>(); + const bool L = 1; + const bool o0 = 1; + return ExclusiveSharedDecodeAndOperation(*this, pair, size, L, o0, {}, Rt2, Rn, Rt); +} + +static bool OrderedSharedDecodeAndOperation(TranslatorVisitor& v, size_t size, bool L, bool o0, Reg Rn, Reg Rt) { + // Shared Decode + + const auto acctype = !o0 ? IR::AccType::LIMITEDORDERED : IR::AccType::ORDERED; + const auto memop = L ? IR::MemOp::LOAD : IR::MemOp::STORE; + const size_t elsize = 8 << size; + const size_t regsize = elsize == 64 ? 64 : 32; + const size_t datasize = elsize; + + // Operation + + const size_t dbytes = datasize / 8; + + IR::U64 address; + if (Rn == Reg::SP) { + // TODO: Check SP Alignment + address = v.SP(64); + } else { + address = v.X(64, Rn); + } + + switch (memop) { + case IR::MemOp::STORE: { + const IR::UAny data = v.X(datasize, Rt); + v.Mem(address, dbytes, acctype, data); + break; + } + case IR::MemOp::LOAD: { + const IR::UAny data = v.Mem(address, dbytes, acctype); + v.X(regsize, Rt, v.ZeroExtend(data, regsize)); + break; + } + default: + UNREACHABLE(); + } + + return true; +} + +bool TranslatorVisitor::STLLR(Imm<2> sz, Reg Rn, Reg Rt) { + const size_t size = sz.ZeroExtend<size_t>(); + const bool L = 0; + const bool o0 = 0; + return OrderedSharedDecodeAndOperation(*this, size, L, o0, Rn, Rt); +} + +bool TranslatorVisitor::STLR(Imm<2> sz, Reg Rn, Reg Rt) { + const size_t size = sz.ZeroExtend<size_t>(); + const bool L = 0; + const bool o0 = 1; + return OrderedSharedDecodeAndOperation(*this, size, L, o0, Rn, Rt); +} + +bool TranslatorVisitor::LDLAR(Imm<2> sz, Reg Rn, Reg Rt) { + const size_t size = sz.ZeroExtend<size_t>(); + const bool L = 1; + const bool o0 = 0; + return OrderedSharedDecodeAndOperation(*this, size, L, o0, Rn, Rt); +} + +bool TranslatorVisitor::LDAR(Imm<2> sz, Reg Rn, Reg Rt) { + const size_t size = sz.ZeroExtend<size_t>(); + const bool L = 1; + const bool o0 = 1; + return OrderedSharedDecodeAndOperation(*this, size, L, o0, Rn, Rt); +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_load_literal.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_load_literal.cpp new file mode 100644 index 0000000000..995aba4252 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_load_literal.cpp @@ -0,0 +1,55 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::LDR_lit_gen(bool opc_0, Imm<19> imm19, Reg Rt) { + const size_t size = opc_0 == 0 ? 4 : 8; + const s64 offset = concatenate(imm19, Imm<2>{0}).SignExtend<s64>(); + + const u64 address = ir.PC() + offset; + const auto data = Mem(ir.Imm64(address), size, IR::AccType::NORMAL); + + X(8 * size, Rt, data); + return true; +} + +bool TranslatorVisitor::LDR_lit_fpsimd(Imm<2> opc, Imm<19> imm19, Vec Vt) { + if (opc == 0b11) { + return UnallocatedEncoding(); + } + + const u64 size = 4 << opc.ZeroExtend(); + const u64 offset = imm19.SignExtend<u64>() << 2; + const IR::U64 address = ir.Imm64(ir.PC() + offset); + const IR::UAnyU128 data = Mem(address, size, IR::AccType::VEC); + + if (size == 16) { + V(128, Vt, data); + } else { + V(128, Vt, ir.ZeroExtendToQuad(data)); + } + return true; +} + +bool TranslatorVisitor::LDRSW_lit(Imm<19> imm19, Reg Rt) { + const s64 offset = concatenate(imm19, Imm<2>{0}).SignExtend<s64>(); + const u64 address = ir.PC() + offset; + const auto data = Mem(ir.Imm64(address), 4, IR::AccType::NORMAL); + + X(64, Rt, ir.SignExtendWordToLong(data)); + return true; +} + +bool TranslatorVisitor::PRFM_lit(Imm<19> /*imm19*/, Imm<5> /*prfop*/) { + // s64 offset = concatenate(imm19, Imm<2>{0}).SignExtend<s64>(); + // u64 address = ir.PC() + offset; + // Prefetch(address, prfop); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_multiple_structures.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_multiple_structures.cpp new file mode 100644 index 0000000000..77c57a9659 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_multiple_structures.cpp @@ -0,0 +1,134 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <optional> + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +static bool SharedDecodeAndOperation(TranslatorVisitor& v, bool wback, IR::MemOp memop, bool Q, std::optional<Reg> Rm, Imm<4> opcode, Imm<2> size, Reg Rn, Vec Vt) { + const size_t datasize = Q ? 
128 : 64; + const size_t esize = 8 << size.ZeroExtend<size_t>(); + const size_t elements = datasize / esize; + const size_t ebytes = esize / 8; + + size_t rpt, selem; + switch (opcode.ZeroExtend()) { + case 0b0000: + rpt = 1; + selem = 4; + break; + case 0b0010: + rpt = 4; + selem = 1; + break; + case 0b0100: + rpt = 1; + selem = 3; + break; + case 0b0110: + rpt = 3; + selem = 1; + break; + case 0b0111: + rpt = 1; + selem = 1; + break; + case 0b1000: + rpt = 1; + selem = 2; + break; + case 0b1010: + rpt = 2; + selem = 1; + break; + default: + return v.UnallocatedEncoding(); + } + ASSERT(rpt == 1 || selem == 1); + + if ((size == 0b11 && !Q) && selem != 1) { + return v.ReservedValue(); + } + + IR::U64 address; + if (Rn == Reg::SP) { + // TODO: Check SP Alignment + address = v.SP(64); + } else { + address = v.X(64, Rn); + } + + IR::U64 offs = v.ir.Imm64(0); + if (selem == 1) { + for (size_t r = 0; r < rpt; r++) { + const Vec tt = static_cast<Vec>((VecNumber(Vt) + r) % 32); + if (memop == IR::MemOp::LOAD) { + const IR::UAnyU128 vec = v.Mem(v.ir.Add(address, offs), ebytes * elements, IR::AccType::VEC); + v.V_scalar(datasize, tt, vec); + } else { + const IR::UAnyU128 vec = v.V_scalar(datasize, tt); + v.Mem(v.ir.Add(address, offs), ebytes * elements, IR::AccType::VEC, vec); + } + offs = v.ir.Add(offs, v.ir.Imm64(ebytes * elements)); + } + } else { + for (size_t e = 0; e < elements; e++) { + for (size_t s = 0; s < selem; s++) { + const Vec tt = static_cast<Vec>((VecNumber(Vt) + s) % 32); + if (memop == IR::MemOp::LOAD) { + const IR::UAny elem = v.Mem(v.ir.Add(address, offs), ebytes, IR::AccType::VEC); + const IR::U128 vec = v.ir.VectorSetElement(esize, v.V(datasize, tt), e, elem); + v.V(datasize, tt, vec); + } else { + const IR::UAny elem = v.ir.VectorGetElement(esize, v.V(datasize, tt), e); + v.Mem(v.ir.Add(address, offs), ebytes, IR::AccType::VEC, elem); + } + offs = v.ir.Add(offs, v.ir.Imm64(ebytes)); + } + } + } + + if (wback) { + if (*Rm != Reg::SP) { + offs = v.X(64, *Rm); + } + + if (Rn == Reg::SP) { + v.SP(64, v.ir.Add(address, offs)); + } else { + v.X(64, Rn, v.ir.Add(address, offs)); + } + } + + return true; +} + +bool TranslatorVisitor::STx_mult_1(bool Q, Imm<4> opcode, Imm<2> size, Reg Rn, Vec Vt) { + const bool wback = false; + const auto memop = IR::MemOp::STORE; + return SharedDecodeAndOperation(*this, wback, memop, Q, {}, opcode, size, Rn, Vt); +} + +bool TranslatorVisitor::STx_mult_2(bool Q, Reg Rm, Imm<4> opcode, Imm<2> size, Reg Rn, Vec Vt) { + const bool wback = true; + const auto memop = IR::MemOp::STORE; + return SharedDecodeAndOperation(*this, wback, memop, Q, Rm, opcode, size, Rn, Vt); +} + +bool TranslatorVisitor::LDx_mult_1(bool Q, Imm<4> opcode, Imm<2> size, Reg Rn, Vec Vt) { + const bool wback = false; + const auto memop = IR::MemOp::LOAD; + return SharedDecodeAndOperation(*this, wback, memop, Q, {}, opcode, size, Rn, Vt); +} + +bool TranslatorVisitor::LDx_mult_2(bool Q, Reg Rm, Imm<4> opcode, Imm<2> size, Reg Rn, Vec Vt) { + const bool wback = true; + const auto memop = IR::MemOp::LOAD; + return SharedDecodeAndOperation(*this, wback, memop, Q, Rm, opcode, size, Rn, Vt); +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_no_allocate_pair.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_no_allocate_pair.cpp new file mode 100644 index 0000000000..c85cc55241 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_no_allocate_pair.cpp @@ -0,0 
+1,22 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2019 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +// Given the LDNP and STNP instructions are simply hinting at non-temporal +// accesses, we can treat them as regular LDP and STP instructions for the +// time being since we don't perform data caching. + +bool TranslatorVisitor::STNP_LDNP_gen(Imm<1> upper_opc, Imm<1> L, Imm<7> imm7, Reg Rt2, Reg Rn, Reg Rt) { + return STP_LDP_gen(Imm<2>{upper_opc.ZeroExtend() << 1}, true, false, L, imm7, Rt2, Rn, Rt); +} + +bool TranslatorVisitor::STNP_LDNP_fpsimd(Imm<2> opc, Imm<1> L, Imm<7> imm7, Vec Vt2, Reg Rn, Vec Vt) { + return STP_LDP_fpsimd(opc, true, false, L, imm7, Vt2, Rn, Vt); +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_immediate.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_immediate.cpp new file mode 100644 index 0000000000..d939c45f2c --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_immediate.cpp @@ -0,0 +1,249 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +static bool LoadStoreRegisterImmediate(TranslatorVisitor& v, bool wback, bool postindex, size_t scale, u64 offset, Imm<2> size, Imm<2> opc, Reg Rn, Reg Rt) { + IR::MemOp memop; + bool signed_ = false; + size_t regsize = 0; + + if (opc.Bit<1>() == 0) { + memop = opc.Bit<0>() ? IR::MemOp::LOAD : IR::MemOp::STORE; + regsize = size == 0b11 ? 64 : 32; + signed_ = false; + } else if (size == 0b11) { + memop = IR::MemOp::PREFETCH; + ASSERT(!opc.Bit<0>()); + } else { + memop = IR::MemOp::LOAD; + ASSERT(!(size == 0b10 && opc.Bit<0>() == 1)); + regsize = opc.Bit<0>() ? 32 : 64; + signed_ = true; + } + + if (memop == IR::MemOp::LOAD && wback && Rn == Rt && Rn != Reg::R31) { + return v.UnpredictableInstruction(); + } + if (memop == IR::MemOp::STORE && wback && Rn == Rt && Rn != Reg::R31) { + return v.UnpredictableInstruction(); + } + + // TODO: Check SP alignment + IR::U64 address = Rn == Reg::SP ? 
IR::U64(v.SP(64)) : IR::U64(v.X(64, Rn)); + if (!postindex) { + address = v.ir.Add(address, v.ir.Imm64(offset)); + } + + const size_t datasize = 8 << scale; + switch (memop) { + case IR::MemOp::STORE: { + const auto data = v.X(datasize, Rt); + v.Mem(address, datasize / 8, IR::AccType::NORMAL, data); + break; + } + case IR::MemOp::LOAD: { + const auto data = v.Mem(address, datasize / 8, IR::AccType::NORMAL); + if (signed_) { + v.X(regsize, Rt, v.SignExtend(data, regsize)); + } else { + v.X(regsize, Rt, v.ZeroExtend(data, regsize)); + } + break; + } + case IR::MemOp::PREFETCH: + // Prefetch(address, Rt) + break; + } + + if (wback) { + if (postindex) { + address = v.ir.Add(address, v.ir.Imm64(offset)); + } + + if (Rn == Reg::SP) { + v.SP(64, address); + } else { + v.X(64, Rn, address); + } + } + + return true; +} + +bool TranslatorVisitor::STRx_LDRx_imm_1(Imm<2> size, Imm<2> opc, Imm<9> imm9, bool not_postindex, Reg Rn, Reg Rt) { + const bool wback = true; + const bool postindex = !not_postindex; + const size_t scale = size.ZeroExtend<size_t>(); + const u64 offset = imm9.SignExtend<u64>(); + + return LoadStoreRegisterImmediate(*this, wback, postindex, scale, offset, size, opc, Rn, Rt); +} + +bool TranslatorVisitor::STRx_LDRx_imm_2(Imm<2> size, Imm<2> opc, Imm<12> imm12, Reg Rn, Reg Rt) { + const bool wback = false; + const bool postindex = false; + const size_t scale = size.ZeroExtend<size_t>(); + const u64 offset = imm12.ZeroExtend<u64>() << scale; + + return LoadStoreRegisterImmediate(*this, wback, postindex, scale, offset, size, opc, Rn, Rt); +} + +bool TranslatorVisitor::STURx_LDURx(Imm<2> size, Imm<2> opc, Imm<9> imm9, Reg Rn, Reg Rt) { + const bool wback = false; + const bool postindex = false; + const size_t scale = size.ZeroExtend<size_t>(); + const u64 offset = imm9.SignExtend<u64>(); + + return LoadStoreRegisterImmediate(*this, wback, postindex, scale, offset, size, opc, Rn, Rt); +} + +bool TranslatorVisitor::PRFM_imm([[maybe_unused]] Imm<12> imm12, [[maybe_unused]] Reg Rn, [[maybe_unused]] Reg Rt) { + // Currently a NOP (which is valid behavior, as indicated by + // the ARMv8 architecture reference manual) + return true; +} + +bool TranslatorVisitor::PRFM_unscaled_imm([[maybe_unused]] Imm<9> imm9, [[maybe_unused]] Reg Rn, [[maybe_unused]] Reg Rt) { + // Currently a NOP (which is valid behavior, as indicated by + // the ARMv8 architecture reference manual) + return true; +} + +static bool LoadStoreSIMD(TranslatorVisitor& v, bool wback, bool postindex, size_t scale, u64 offset, IR::MemOp memop, Reg Rn, Vec Vt) { + const auto acctype = IR::AccType::VEC; + const size_t datasize = 8 << scale; + + IR::U64 address; + if (Rn == Reg::SP) { + // TODO: Check SP Alignment + address = v.SP(64); + } else { + address = v.X(64, Rn); + } + + if (!postindex) { + address = v.ir.Add(address, v.ir.Imm64(offset)); + } + + switch (memop) { + case IR::MemOp::STORE: + if (datasize == 128) { + const IR::U128 data = v.V(128, Vt); + v.Mem(address, 16, acctype, data); + } else { + const IR::UAny data = v.ir.VectorGetElement(datasize, v.V(128, Vt), 0); + v.Mem(address, datasize / 8, acctype, data); + } + break; + case IR::MemOp::LOAD: + if (datasize == 128) { + const IR::U128 data = v.Mem(address, 16, acctype); + v.V(128, Vt, data); + } else { + const IR::UAny data = v.Mem(address, datasize / 8, acctype); + v.V(128, Vt, v.ir.ZeroExtendToQuad(data)); + } + break; + default: + UNREACHABLE(); + } + + if (wback) { + if (postindex) { + address = v.ir.Add(address, v.ir.Imm64(offset)); + } + + if (Rn == Reg::SP) { + 
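// Base-register writeback: encoding 31 denotes SP for the base here, so the updated address goes through the SP accessor rather than X(). +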
v.SP(64, address); + } else { + v.X(64, Rn, address); + } + } + + return true; +} + +bool TranslatorVisitor::STR_imm_fpsimd_1(Imm<2> size, Imm<1> opc_1, Imm<9> imm9, bool not_postindex, Reg Rn, Vec Vt) { + const size_t scale = concatenate(opc_1, size).ZeroExtend<size_t>(); + if (scale > 4) { + return UnallocatedEncoding(); + } + + const bool wback = true; + const bool postindex = !not_postindex; + const u64 offset = imm9.SignExtend<u64>(); + + return LoadStoreSIMD(*this, wback, postindex, scale, offset, IR::MemOp::STORE, Rn, Vt); +} + +bool TranslatorVisitor::STR_imm_fpsimd_2(Imm<2> size, Imm<1> opc_1, Imm<12> imm12, Reg Rn, Vec Vt) { + const size_t scale = concatenate(opc_1, size).ZeroExtend<size_t>(); + if (scale > 4) { + return UnallocatedEncoding(); + } + + const bool wback = false; + const bool postindex = false; + const u64 offset = imm12.ZeroExtend<u64>() << scale; + + return LoadStoreSIMD(*this, wback, postindex, scale, offset, IR::MemOp::STORE, Rn, Vt); +} + +bool TranslatorVisitor::LDR_imm_fpsimd_1(Imm<2> size, Imm<1> opc_1, Imm<9> imm9, bool not_postindex, Reg Rn, Vec Vt) { + const size_t scale = concatenate(opc_1, size).ZeroExtend<size_t>(); + if (scale > 4) { + return UnallocatedEncoding(); + } + + const bool wback = true; + const bool postindex = !not_postindex; + const u64 offset = imm9.SignExtend<u64>(); + + return LoadStoreSIMD(*this, wback, postindex, scale, offset, IR::MemOp::LOAD, Rn, Vt); +} + +bool TranslatorVisitor::LDR_imm_fpsimd_2(Imm<2> size, Imm<1> opc_1, Imm<12> imm12, Reg Rn, Vec Vt) { + const size_t scale = concatenate(opc_1, size).ZeroExtend<size_t>(); + if (scale > 4) { + return UnallocatedEncoding(); + } + + const bool wback = false; + const bool postindex = false; + const u64 offset = imm12.ZeroExtend<u64>() << scale; + + return LoadStoreSIMD(*this, wback, postindex, scale, offset, IR::MemOp::LOAD, Rn, Vt); +} + +bool TranslatorVisitor::STUR_fpsimd(Imm<2> size, Imm<1> opc_1, Imm<9> imm9, Reg Rn, Vec Vt) { + const size_t scale = concatenate(opc_1, size).ZeroExtend<size_t>(); + if (scale > 4) { + return UnallocatedEncoding(); + } + + const bool wback = false; + const bool postindex = false; + const u64 offset = imm9.SignExtend<u64>(); + + return LoadStoreSIMD(*this, wback, postindex, scale, offset, IR::MemOp::STORE, Rn, Vt); +} + +bool TranslatorVisitor::LDUR_fpsimd(Imm<2> size, Imm<1> opc_1, Imm<9> imm9, Reg Rn, Vec Vt) { + const size_t scale = concatenate(opc_1, size).ZeroExtend<size_t>(); + if (scale > 4) { + return UnallocatedEncoding(); + } + + const bool wback = false; + const bool postindex = false; + const u64 offset = imm9.SignExtend<u64>(); + + return LoadStoreSIMD(*this, wback, postindex, scale, offset, IR::MemOp::LOAD, Rn, Vt); +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_pair.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_pair.cpp new file mode 100644 index 0000000000..4c81564f7c --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_pair.cpp @@ -0,0 +1,154 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::STP_LDP_gen(Imm<2> opc, bool not_postindex, bool wback, Imm<1> L, Imm<7> imm7, Reg Rt2, Reg Rn, Reg Rt) { + if ((L == 0 && opc.Bit<0>() == 1) || opc == 0b11) { + return UnallocatedEncoding(); + } + + const auto memop = L == 1 ? IR::MemOp::LOAD : IR::MemOp::STORE; + if (memop == IR::MemOp::LOAD && wback && (Rt == Rn || Rt2 == Rn) && Rn != Reg::R31) { + return UnpredictableInstruction(); + } + if (memop == IR::MemOp::STORE && wback && (Rt == Rn || Rt2 == Rn) && Rn != Reg::R31) { + return UnpredictableInstruction(); + } + if (memop == IR::MemOp::LOAD && Rt == Rt2) { + return UnpredictableInstruction(); + } + + IR::U64 address; + if (Rn == Reg::SP) { + // TODO: Check SP Alignment + address = SP(64); + } else { + address = X(64, Rn); + } + + const bool postindex = !not_postindex; + const bool signed_ = opc.Bit<0>() != 0; + const size_t scale = 2 + opc.Bit<1>(); + const size_t datasize = 8 << scale; + const u64 offset = imm7.SignExtend<u64>() << scale; + + if (!postindex) { + address = ir.Add(address, ir.Imm64(offset)); + } + + const size_t dbytes = datasize / 8; + switch (memop) { + case IR::MemOp::STORE: { + const IR::U32U64 data1 = X(datasize, Rt); + const IR::U32U64 data2 = X(datasize, Rt2); + Mem(address, dbytes, IR::AccType::NORMAL, data1); + Mem(ir.Add(address, ir.Imm64(dbytes)), dbytes, IR::AccType::NORMAL, data2); + break; + } + case IR::MemOp::LOAD: { + const IR::U32U64 data1 = Mem(address, dbytes, IR::AccType::NORMAL); + const IR::U32U64 data2 = Mem(ir.Add(address, ir.Imm64(dbytes)), dbytes, IR::AccType::NORMAL); + if (signed_) { + X(64, Rt, SignExtend(data1, 64)); + X(64, Rt2, SignExtend(data2, 64)); + } else { + X(datasize, Rt, data1); + X(datasize, Rt2, data2); + } + break; + } + case IR::MemOp::PREFETCH: + UNREACHABLE(); + } + + if (wback) { + if (postindex) { + address = ir.Add(address, ir.Imm64(offset)); + } + + if (Rn == Reg::SP) { + SP(64, address); + } else { + X(64, Rn, address); + } + } + + return true; +} + +bool TranslatorVisitor::STP_LDP_fpsimd(Imm<2> opc, bool not_postindex, bool wback, Imm<1> L, Imm<7> imm7, Vec Vt2, Reg Rn, Vec Vt) { + if (opc == 0b11) { + return UnallocatedEncoding(); + } + + const auto memop = L == 1 ? 
IR::MemOp::LOAD : IR::MemOp::STORE; + if (memop == IR::MemOp::LOAD && Vt == Vt2) { + return UnpredictableInstruction(); + } + + IR::U64 address; + if (Rn == Reg::SP) { + // TODO: Check SP Alignment + address = SP(64); + } else { + address = X(64, Rn); + } + + const bool postindex = !not_postindex; + const size_t scale = 2 + opc.ZeroExtend<size_t>(); + const size_t datasize = 8 << scale; + const u64 offset = imm7.SignExtend<u64>() << scale; + const size_t dbytes = datasize / 8; + + if (!postindex) { + address = ir.Add(address, ir.Imm64(offset)); + } + + switch (memop) { + case IR::MemOp::STORE: { + IR::UAnyU128 data1 = V(datasize, Vt); + IR::UAnyU128 data2 = V(datasize, Vt2); + if (datasize != 128) { + data1 = ir.VectorGetElement(datasize, data1, 0); + data2 = ir.VectorGetElement(datasize, data2, 0); + } + Mem(address, dbytes, IR::AccType::VEC, data1); + Mem(ir.Add(address, ir.Imm64(dbytes)), dbytes, IR::AccType::VEC, data2); + break; + } + case IR::MemOp::LOAD: { + IR::UAnyU128 data1 = Mem(address, dbytes, IR::AccType::VEC); + IR::UAnyU128 data2 = Mem(ir.Add(address, ir.Imm64(dbytes)), dbytes, IR::AccType::VEC); + if (datasize != 128) { + data1 = ir.ZeroExtendToQuad(data1); + data2 = ir.ZeroExtendToQuad(data2); + } + V(datasize, Vt, data1); + V(datasize, Vt2, data2); + break; + } + case IR::MemOp::PREFETCH: + UNREACHABLE(); + } + + if (wback) { + if (postindex) { + address = ir.Add(address, ir.Imm64(offset)); + } + + if (Rn == Reg::SP) { + SP(64, address); + } else { + X(64, Rn, address); + } + } + + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_register_offset.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_register_offset.cpp new file mode 100644 index 0000000000..9791095868 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_register_offset.cpp @@ -0,0 +1,160 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +static bool RegSharedDecodeAndOperation(TranslatorVisitor& v, size_t scale, u8 shift, Imm<2> size, Imm<1> opc_1, Imm<1> opc_0, Reg Rm, Imm<3> option, Reg Rn, Reg Rt) { + // Shared Decode + + const auto acctype = IR::AccType::NORMAL; + IR::MemOp memop; + size_t regsize = 64; + bool signed_ = false; + + if (opc_1 == 0) { + memop = opc_0 == 1 ? IR::MemOp::LOAD : IR::MemOp::STORE; + regsize = size == 0b11 ? 64 : 32; + signed_ = false; + } else if (size == 0b11) { + memop = IR::MemOp::PREFETCH; + if (opc_0 == 1) { + return v.UnallocatedEncoding(); + } + } else { + memop = IR::MemOp::LOAD; + if (size == 0b10 && opc_0 == 1) { + return v.UnallocatedEncoding(); + } + regsize = opc_0 == 1 ? 
32 : 64; + signed_ = true; + } + + const size_t datasize = 8 << scale; + + // Operation + + const IR::U64 offset = v.ExtendReg(64, Rm, option, shift); + + IR::U64 address; + if (Rn == Reg::SP) { + // TODO: Check SP alignment + address = v.SP(64); + } else { + address = v.X(64, Rn); + } + address = v.ir.Add(address, offset); + + switch (memop) { + case IR::MemOp::STORE: { + const IR::UAny data = v.X(datasize, Rt); + v.Mem(address, datasize / 8, acctype, data); + break; + } + case IR::MemOp::LOAD: { + const IR::UAny data = v.Mem(address, datasize / 8, acctype); + if (signed_) { + v.X(regsize, Rt, v.SignExtend(data, regsize)); + } else { + v.X(regsize, Rt, v.ZeroExtend(data, regsize)); + } + break; + } + case IR::MemOp::PREFETCH: + // TODO: Prefetch + break; + default: + UNREACHABLE(); + } + + return true; +} + +bool TranslatorVisitor::STRx_reg(Imm<2> size, Imm<1> opc_1, Reg Rm, Imm<3> option, bool S, Reg Rn, Reg Rt) { + const Imm<1> opc_0{0}; + const size_t scale = size.ZeroExtend<size_t>(); + const u8 shift = S ? static_cast<u8>(scale) : 0; + if (!option.Bit<1>()) { + return UnallocatedEncoding(); + } + return RegSharedDecodeAndOperation(*this, scale, shift, size, opc_1, opc_0, Rm, option, Rn, Rt); +} + +bool TranslatorVisitor::LDRx_reg(Imm<2> size, Imm<1> opc_1, Reg Rm, Imm<3> option, bool S, Reg Rn, Reg Rt) { + const Imm<1> opc_0{1}; + const size_t scale = size.ZeroExtend<size_t>(); + const u8 shift = S ? static_cast<u8>(scale) : 0; + if (!option.Bit<1>()) { + return UnallocatedEncoding(); + } + return RegSharedDecodeAndOperation(*this, scale, shift, size, opc_1, opc_0, Rm, option, Rn, Rt); +} + +static bool VecSharedDecodeAndOperation(TranslatorVisitor& v, size_t scale, u8 shift, Imm<1> opc_0, Reg Rm, Imm<3> option, Reg Rn, Vec Vt) { + // Shared Decode + + const auto acctype = IR::AccType::VEC; + const auto memop = opc_0 == 1 ? IR::MemOp::LOAD : IR::MemOp::STORE; + const size_t datasize = 8 << scale; + + // Operation + + const IR::U64 offset = v.ExtendReg(64, Rm, option, shift); + + IR::U64 address; + if (Rn == Reg::SP) { + // TODO: Check SP alignment + address = v.SP(64); + } else { + address = v.X(64, Rn); + } + address = v.ir.Add(address, offset); + + switch (memop) { + case IR::MemOp::STORE: { + const IR::UAnyU128 data = v.V_scalar(datasize, Vt); + v.Mem(address, datasize / 8, acctype, data); + break; + } + case IR::MemOp::LOAD: { + const IR::UAnyU128 data = v.Mem(address, datasize / 8, acctype); + v.V_scalar(datasize, Vt, data); + break; + } + default: + UNREACHABLE(); + } + + return true; +} + +bool TranslatorVisitor::STR_reg_fpsimd(Imm<2> size, Imm<1> opc_1, Reg Rm, Imm<3> option, bool S, Reg Rn, Vec Vt) { + const Imm<1> opc_0{0}; + const size_t scale = concatenate(opc_1, size).ZeroExtend<size_t>(); + if (scale > 4) { + return UnallocatedEncoding(); + } + const u8 shift = S ? static_cast<u8>(scale) : 0; + if (!option.Bit<1>()) { + return UnallocatedEncoding(); + } + return VecSharedDecodeAndOperation(*this, scale, shift, opc_0, Rm, option, Rn, Vt); +} + +bool TranslatorVisitor::LDR_reg_fpsimd(Imm<2> size, Imm<1> opc_1, Reg Rm, Imm<3> option, bool S, Reg Rn, Vec Vt) { + const Imm<1> opc_0{1}; + const size_t scale = concatenate(opc_1, size).ZeroExtend<size_t>(); + if (scale > 4) { + return UnallocatedEncoding(); + } + const u8 shift = S ? 
static_cast<u8>(scale) : 0; + if (!option.Bit<1>()) { + return UnallocatedEncoding(); + } + return VecSharedDecodeAndOperation(*this, scale, shift, opc_0, Rm, option, Rn, Vt); +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_unprivileged.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_unprivileged.cpp new file mode 100644 index 0000000000..d3f4194ff2 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_register_unprivileged.cpp @@ -0,0 +1,149 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +static bool StoreRegister(TranslatorVisitor& v, const size_t datasize, const Imm<9> imm9, const Reg Rn, const Reg Rt) { + const u64 offset = imm9.SignExtend<u64>(); + const auto acctype = IR::AccType::UNPRIV; + + IR::U64 address; + if (Rn == Reg::SP) { + // TODO: Check Stack Alignment + address = v.SP(64); + } else { + address = v.X(64, Rn); + } + address = v.ir.Add(address, v.ir.Imm64(offset)); + + const IR::UAny data = v.X(datasize, Rt); + v.Mem(address, datasize / 8, acctype, data); + return true; +} + +static bool LoadRegister(TranslatorVisitor& v, const size_t datasize, const Imm<9> imm9, const Reg Rn, const Reg Rt) { + const u64 offset = imm9.SignExtend<u64>(); + const auto acctype = IR::AccType::UNPRIV; + + IR::U64 address; + if (Rn == Reg::SP) { + // TODO: Check Stack Alignment + address = v.SP(64); + } else { + address = v.X(64, Rn); + } + address = v.ir.Add(address, v.ir.Imm64(offset)); + + const IR::UAny data = v.Mem(address, datasize / 8, acctype); + // max is used to zeroextend < 32 to 32, and > 32 to 64 + const size_t extended_size = std::max<size_t>(32, datasize); + v.X(extended_size, Rt, v.ZeroExtend(data, extended_size)); + return true; +} + +static bool LoadRegisterSigned(TranslatorVisitor& v, const size_t datasize, const Imm<2> opc, const Imm<9> imm9, const Reg Rn, const Reg Rt) { + const u64 offset = imm9.SignExtend<u64>(); + const auto acctype = IR::AccType::UNPRIV; + + IR::MemOp memop; + bool is_signed; + size_t regsize; + if (opc.Bit<1>() == 0) { + // store or zero-extending load + memop = opc.Bit<0>() ? IR::MemOp::LOAD : IR::MemOp::STORE; + regsize = 32; + is_signed = false; + } else { + // sign-extending load + memop = IR::MemOp::LOAD; + regsize = opc.Bit<0>() ? 
32 : 64; + is_signed = true; + } + + IR::U64 address; + if (Rn == Reg::SP) { + // TODO: Check Stack Alignment + address = v.SP(64); + } else { + address = v.X(64, Rn); + } + address = v.ir.Add(address, v.ir.Imm64(offset)); + + switch (memop) { + case IR::MemOp::STORE: + v.Mem(address, datasize / 8, acctype, v.X(datasize, Rt)); + break; + case IR::MemOp::LOAD: { + const IR::UAny data = v.Mem(address, datasize / 8, acctype); + if (is_signed) { + v.X(regsize, Rt, v.SignExtend(data, regsize)); + } else { + v.X(regsize, Rt, v.ZeroExtend(data, regsize)); + } + break; + } + case IR::MemOp::PREFETCH: + // Prefetch(address, Rt); + break; + } + return true; +} + +bool TranslatorVisitor::STTRB(Imm<9> imm9, Reg Rn, Reg Rt) { + return StoreRegister(*this, 8, imm9, Rn, Rt); +} + +bool TranslatorVisitor::STTRH(Imm<9> imm9, Reg Rn, Reg Rt) { + return StoreRegister(*this, 16, imm9, Rn, Rt); +} + +bool TranslatorVisitor::STTR(Imm<2> size, Imm<9> imm9, Reg Rn, Reg Rt) { + const size_t scale = size.ZeroExtend<size_t>(); + const size_t datasize = 8 << scale; + return StoreRegister(*this, datasize, imm9, Rn, Rt); +} + +bool TranslatorVisitor::LDTRB(Imm<9> imm9, Reg Rn, Reg Rt) { + return LoadRegister(*this, 8, imm9, Rn, Rt); +} + +bool TranslatorVisitor::LDTRH(Imm<9> imm9, Reg Rn, Reg Rt) { + return LoadRegister(*this, 16, imm9, Rn, Rt); +} + +bool TranslatorVisitor::LDTR(Imm<2> size, Imm<9> imm9, Reg Rn, Reg Rt) { + const size_t scale = size.ZeroExtend<size_t>(); + const size_t datasize = 8 << scale; + return LoadRegister(*this, datasize, imm9, Rn, Rt); +} + +bool TranslatorVisitor::LDTRSB(Imm<2> opc, Imm<9> imm9, Reg Rn, Reg Rt) { + return LoadRegisterSigned(*this, 8, opc, imm9, Rn, Rt); +} + +bool TranslatorVisitor::LDTRSH(Imm<2> opc, Imm<9> imm9, Reg Rn, Reg Rt) { + return LoadRegisterSigned(*this, 16, opc, imm9, Rn, Rt); +} + +bool TranslatorVisitor::LDTRSW(Imm<9> imm9, Reg Rn, Reg Rt) { + const u64 offset = imm9.SignExtend<u64>(); + const auto acctype = IR::AccType::UNPRIV; + + IR::U64 address; + if (Rn == Reg::SP) { + // TODO: Check Stack Alignment + address = SP(64); + } else { + address = X(64, Rn); + } + address = ir.Add(address, ir.Imm64(offset)); + + const IR::UAny data = Mem(address, 4, acctype); + X(64, Rt, SignExtend(data, 64)); + return true; +} +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_single_structure.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_single_structure.cpp new file mode 100644 index 0000000000..b4bc842942 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_single_structure.cpp @@ -0,0 +1,224 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <optional> + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +static bool SharedDecodeAndOperation(TranslatorVisitor& v, bool wback, IR::MemOp memop, bool Q, bool S, bool R, bool replicate, std::optional<Reg> Rm, Imm<3> opcode, Imm<2> size, Reg Rn, Vec Vt) { + const size_t selem = (opcode.Bit<0>() << 1 | u32{R}) + 1; + size_t scale = opcode.Bits<1, 2>(); + size_t index = 0; + + switch (scale) { + case 0: + index = Q << 3 | S << 2 | size.ZeroExtend(); + break; + case 1: + if (size.Bit<0>()) { + return v.UnallocatedEncoding(); + } + index = Q << 2 | S << 1 | u32{size.Bit<1>()}; + break; + case 2: + if (size.Bit<1>()) { + return v.UnallocatedEncoding(); + } + if (size.Bit<0>()) { + if (S) { + return v.UnallocatedEncoding(); + } + index = Q; + scale = 3; + } else { + index = Q << 1 | u32{S}; + } + break; + case 3: + if (memop == IR::MemOp::STORE || S) { + return v.UnallocatedEncoding(); + } + scale = size.ZeroExtend(); + break; + } + + const size_t datasize = Q ? 128 : 64; + const size_t esize = 8 << scale; + const size_t ebytes = esize / 8; + + IR::U64 address; + if (Rn == Reg::SP) { + // TODO: Check SP Alignment + address = v.SP(64); + } else { + address = v.X(64, Rn); + } + + IR::U64 offs = v.ir.Imm64(0); + if (replicate) { + for (size_t s = 0; s < selem; s++) { + const Vec tt = static_cast<Vec>((VecNumber(Vt) + s) % 32); + const IR::UAnyU128 element = v.Mem(v.ir.Add(address, offs), ebytes, IR::AccType::VEC); + const IR::U128 broadcasted_element = v.ir.VectorBroadcast(esize, element); + + v.V(datasize, tt, broadcasted_element); + + offs = v.ir.Add(offs, v.ir.Imm64(ebytes)); + } + } else { + for (size_t s = 0; s < selem; s++) { + const Vec tt = static_cast<Vec>((VecNumber(Vt) + s) % 32); + const IR::U128 rval = v.V(128, tt); + + if (memop == IR::MemOp::LOAD) { + const IR::UAny elem = v.Mem(v.ir.Add(address, offs), ebytes, IR::AccType::VEC); + const IR::U128 vec = v.ir.VectorSetElement(esize, rval, index, elem); + v.V(128, tt, vec); + } else { + const IR::UAny elem = v.ir.VectorGetElement(esize, rval, index); + v.Mem(v.ir.Add(address, offs), ebytes, IR::AccType::VEC, elem); + } + offs = v.ir.Add(offs, v.ir.Imm64(ebytes)); + } + } + + if (wback) { + if (*Rm != Reg::SP) { + offs = v.X(64, *Rm); + } + + if (Rn == Reg::SP) { + v.SP(64, v.ir.Add(address, offs)); + } else { + v.X(64, Rn, v.ir.Add(address, offs)); + } + } + + return true; +} + +bool TranslatorVisitor::LD1_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) { + return SharedDecodeAndOperation(*this, false, IR::MemOp::LOAD, Q, S, false, false, {}, + Imm<3>{upper_opcode.ZeroExtend() << 1}, size, Rn, Vt); +} + +bool TranslatorVisitor::LD1_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) { + return SharedDecodeAndOperation(*this, true, IR::MemOp::LOAD, Q, S, false, false, Rm, + Imm<3>{upper_opcode.ZeroExtend() << 1}, size, Rn, Vt); +} + +bool TranslatorVisitor::LD1R_1(bool Q, Imm<2> size, Reg Rn, Vec Vt) { + return SharedDecodeAndOperation(*this, false, IR::MemOp::LOAD, Q, false, false, true, + {}, Imm<3>{0b110}, size, Rn, Vt); +} + +bool TranslatorVisitor::LD1R_2(bool Q, Reg Rm, Imm<2> size, Reg Rn, Vec Vt) { + return SharedDecodeAndOperation(*this, true, IR::MemOp::LOAD, Q, false, false, true, + Rm, Imm<3>{0b110}, size, Rn, Vt); +} + +bool TranslatorVisitor::LD2_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) { + return 
SharedDecodeAndOperation(*this, false, IR::MemOp::LOAD, Q, S, true, false, {}, + Imm<3>{upper_opcode.ZeroExtend() << 1}, size, Rn, Vt); +} + +bool TranslatorVisitor::LD2_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) { + return SharedDecodeAndOperation(*this, true, IR::MemOp::LOAD, Q, S, true, false, Rm, + Imm<3>{upper_opcode.ZeroExtend() << 1}, size, Rn, Vt); +} + +bool TranslatorVisitor::LD2R_1(bool Q, Imm<2> size, Reg Rn, Vec Vt) { + return SharedDecodeAndOperation(*this, false, IR::MemOp::LOAD, Q, false, true, true, + {}, Imm<3>{0b110}, size, Rn, Vt); +} + +bool TranslatorVisitor::LD2R_2(bool Q, Reg Rm, Imm<2> size, Reg Rn, Vec Vt) { + return SharedDecodeAndOperation(*this, true, IR::MemOp::LOAD, Q, false, true, true, + Rm, Imm<3>{0b110}, size, Rn, Vt); +} + +bool TranslatorVisitor::LD3_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) { + return SharedDecodeAndOperation(*this, false, IR::MemOp::LOAD, Q, S, false, false, {}, + Imm<3>{(upper_opcode.ZeroExtend() << 1) | 1}, size, Rn, Vt); +} + +bool TranslatorVisitor::LD3_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) { + return SharedDecodeAndOperation(*this, true, IR::MemOp::LOAD, Q, S, false, false, Rm, + Imm<3>{(upper_opcode.ZeroExtend() << 1) | 1}, size, Rn, Vt); +} + +bool TranslatorVisitor::LD3R_1(bool Q, Imm<2> size, Reg Rn, Vec Vt) { + return SharedDecodeAndOperation(*this, false, IR::MemOp::LOAD, Q, false, false, true, + {}, Imm<3>{0b111}, size, Rn, Vt); +} + +bool TranslatorVisitor::LD3R_2(bool Q, Reg Rm, Imm<2> size, Reg Rn, Vec Vt) { + return SharedDecodeAndOperation(*this, true, IR::MemOp::LOAD, Q, false, false, true, + Rm, Imm<3>{0b111}, size, Rn, Vt); +} + +bool TranslatorVisitor::LD4_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) { + return SharedDecodeAndOperation(*this, false, IR::MemOp::LOAD, Q, S, true, false, {}, + Imm<3>{(upper_opcode.ZeroExtend() << 1) | 1}, size, Rn, Vt); +} + +bool TranslatorVisitor::LD4_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) { + return SharedDecodeAndOperation(*this, true, IR::MemOp::LOAD, Q, S, true, false, Rm, + Imm<3>{(upper_opcode.ZeroExtend() << 1) | 1}, size, Rn, Vt); +} + +bool TranslatorVisitor::LD4R_1(bool Q, Imm<2> size, Reg Rn, Vec Vt) { + return SharedDecodeAndOperation(*this, false, IR::MemOp::LOAD, Q, false, true, true, + {}, Imm<3>{0b111}, size, Rn, Vt); +} + +bool TranslatorVisitor::LD4R_2(bool Q, Reg Rm, Imm<2> size, Reg Rn, Vec Vt) { + return SharedDecodeAndOperation(*this, true, IR::MemOp::LOAD, Q, false, true, true, + Rm, Imm<3>{0b111}, size, Rn, Vt); +} + +bool TranslatorVisitor::ST1_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) { + return SharedDecodeAndOperation(*this, false, IR::MemOp::STORE, Q, S, false, false, {}, + Imm<3>{upper_opcode.ZeroExtend() << 1}, size, Rn, Vt); +} + +bool TranslatorVisitor::ST1_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) { + return SharedDecodeAndOperation(*this, true, IR::MemOp::STORE, Q, S, false, false, Rm, + Imm<3>{upper_opcode.ZeroExtend() << 1}, size, Rn, Vt); +} + +bool TranslatorVisitor::ST2_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) { + return SharedDecodeAndOperation(*this, false, IR::MemOp::STORE, Q, S, true, false, {}, + Imm<3>{upper_opcode.ZeroExtend() << 1}, size, Rn, Vt); +} + +bool TranslatorVisitor::ST2_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec 
Vt) { + return SharedDecodeAndOperation(*this, true, IR::MemOp::STORE, Q, S, true, false, Rm, + Imm<3>{upper_opcode.ZeroExtend() << 1}, size, Rn, Vt); +} + +bool TranslatorVisitor::ST3_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) { + return SharedDecodeAndOperation(*this, false, IR::MemOp::STORE, Q, S, false, false, {}, + Imm<3>{(upper_opcode.ZeroExtend() << 1) | 1}, size, Rn, Vt); +} + +bool TranslatorVisitor::ST3_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) { + return SharedDecodeAndOperation(*this, true, IR::MemOp::STORE, Q, S, false, false, Rm, + Imm<3>{(upper_opcode.ZeroExtend() << 1) | 1}, size, Rn, Vt); +} + +bool TranslatorVisitor::ST4_sngl_1(bool Q, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) { + return SharedDecodeAndOperation(*this, false, IR::MemOp::STORE, Q, S, true, false, {}, + Imm<3>{(upper_opcode.ZeroExtend() << 1) | 1}, size, Rn, Vt); +} + +bool TranslatorVisitor::ST4_sngl_2(bool Q, Reg Rm, Imm<2> upper_opcode, bool S, Imm<2> size, Reg Rn, Vec Vt) { + return SharedDecodeAndOperation(*this, true, IR::MemOp::STORE, Q, S, true, false, Rm, + Imm<3>{(upper_opcode.ZeroExtend() << 1) | 1}, size, Rn, Vt); +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/move_wide.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/move_wide.cpp new file mode 100644 index 0000000000..7bd2e6e356 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/move_wide.cpp @@ -0,0 +1,59 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::MOVN(bool sf, Imm<2> hw, Imm<16> imm16, Reg Rd) { + if (!sf && hw.Bit<1>()) { + return UnallocatedEncoding(); + } + + const size_t datasize = sf ? 64 : 32; + const size_t pos = hw.ZeroExtend<size_t>() << 4; + + u64 value = imm16.ZeroExtend<u64>() << pos; + value = ~value; + + const auto result = I(datasize, value); + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::MOVZ(bool sf, Imm<2> hw, Imm<16> imm16, Reg Rd) { + if (!sf && hw.Bit<1>()) { + return UnallocatedEncoding(); + } + + const size_t datasize = sf ? 64 : 32; + const size_t pos = hw.ZeroExtend<size_t>() << 4; + + const u64 value = imm16.ZeroExtend<u64>() << pos; + const auto result = I(datasize, value); + + X(datasize, Rd, result); + return true; +} + +bool TranslatorVisitor::MOVK(bool sf, Imm<2> hw, Imm<16> imm16, Reg Rd) { + if (!sf && hw.Bit<1>()) { + return UnallocatedEncoding(); + } + + const size_t datasize = sf ? 64 : 32; + const size_t pos = hw.ZeroExtend<size_t>() << 4; + + const u64 mask = u64(0xFFFF) << pos; + const u64 value = imm16.ZeroExtend<u64>() << pos; + + auto result = X(datasize, Rd); + result = ir.And(result, I(datasize, ~mask)); + result = ir.Or(result, I(datasize, value)); + X(datasize, Rd, result); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_across_lanes.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_across_lanes.cpp new file mode 100644 index 0000000000..6147a54a02 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_across_lanes.cpp @@ -0,0 +1,220 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { +namespace { +enum class Signedness { + Signed, + Unsigned +}; + +bool LongAdd(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vn, Vec Vd, Signedness sign) { + if ((size == 0b10 && !Q) || size == 0b11) { + return v.ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 128 : 64; + const size_t elements = datasize / esize; + + const IR::U128 operand = v.V(datasize, Vn); + + const auto get_element = [&](IR::U128 vec, size_t element) { + const auto vec_element = v.ir.VectorGetElement(esize, vec, element); + + if (sign == Signedness::Signed) { + return v.ir.SignExtendToLong(vec_element); + } + + return v.ir.ZeroExtendToLong(vec_element); + }; + + IR::U64 sum = get_element(operand, 0); + for (size_t i = 1; i < elements; i++) { + sum = v.ir.Add(sum, get_element(operand, i)); + } + + if (size == 0b00) { + v.V(datasize, Vd, v.ir.ZeroExtendToQuad(v.ir.LeastSignificantHalf(sum))); + } else if (size == 0b01) { + v.V(datasize, Vd, v.ir.ZeroExtendToQuad(v.ir.LeastSignificantWord(sum))); + } else { + v.V(datasize, Vd, v.ir.ZeroExtendToQuad(sum)); + } + + return true; +} + +enum class MinMaxOperation { + Max, + MaxNumeric, + Min, + MinNumeric, +}; + +bool FPMinMax(TranslatorVisitor& v, bool Q, bool sz, Vec Vn, Vec Vd, MinMaxOperation operation) { + if (!Q || sz) { + return v.ReservedValue(); + } + + const size_t esize = 32; + const size_t datasize = 128; + const size_t elements = datasize / esize; + + const IR::U128 operand = v.V(datasize, Vn); + + const auto op = [&](const IR::U32U64& lhs, const IR::U32U64& rhs) { + switch (operation) { + case MinMaxOperation::Max: + return v.ir.FPMax(lhs, rhs); + case MinMaxOperation::MaxNumeric: + return v.ir.FPMaxNumeric(lhs, rhs); + case MinMaxOperation::Min: + return v.ir.FPMin(lhs, rhs); + case MinMaxOperation::MinNumeric: + return v.ir.FPMinNumeric(lhs, rhs); + default: + UNREACHABLE(); + } + }; + + const auto reduce = [&](size_t start, size_t end) { + IR::U32U64 result = v.ir.VectorGetElement(esize, operand, start); + + for (size_t i = start + 1; i < end; i++) { + const IR::U32U64 element = v.ir.VectorGetElement(esize, operand, i); + + result = op(result, element); + } + + return result; + }; + + const IR::U32U64 hi = reduce(elements / 2, elements); + const IR::U32U64 lo = reduce(0, elements / 2); + const IR::U32U64 result = op(lo, hi); + + v.V_scalar(esize, Vd, result); + return true; +} + +enum class ScalarMinMaxOperation { + Max, + Min, +}; + +bool ScalarMinMax(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vn, Vec Vd, ScalarMinMaxOperation operation, Signedness sign) { + if ((size == 0b10 && !Q) || size == 0b11) { + return v.ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 
128 : 64; + const size_t elements = datasize / esize; + + const auto get_element = [&](IR::U128 vec, size_t element) { + const auto vec_element = v.ir.VectorGetElement(esize, vec, element); + + if (sign == Signedness::Signed) { + return v.ir.SignExtendToWord(vec_element); + } + + return v.ir.ZeroExtendToWord(vec_element); + }; + + const auto op_func = [&](const auto& a, const auto& b) { + switch (operation) { + case ScalarMinMaxOperation::Max: + if (sign == Signedness::Signed) { + return v.ir.MaxSigned(a, b); + } + return v.ir.MaxUnsigned(a, b); + + case ScalarMinMaxOperation::Min: + if (sign == Signedness::Signed) { + return v.ir.MinSigned(a, b); + } + return v.ir.MinUnsigned(a, b); + + default: + UNREACHABLE(); + } + }; + + const IR::U128 operand = v.V(datasize, Vn); + + IR::U32 value = get_element(operand, 0); + for (size_t i = 1; i < elements; i++) { + value = op_func(value, get_element(operand, i)); + } + + if (size == 0b00) { + v.V(datasize, Vd, v.ir.ZeroExtendToQuad(v.ir.LeastSignificantByte(value))); + } else if (size == 0b01) { + v.V(datasize, Vd, v.ir.ZeroExtendToQuad(v.ir.LeastSignificantHalf(value))); + } else { + v.V(datasize, Vd, v.ir.ZeroExtendToQuad(value)); + } + + return true; +} +} // Anonymous namespace + +bool TranslatorVisitor::ADDV(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + if ((size == 0b10 && !Q) || size == 0b11) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand = V(datasize, Vn); + + V(128, Vd, ir.VectorReduceAdd(esize, operand)); + return true; +} + +bool TranslatorVisitor::FMAXNMV_2(bool Q, bool sz, Vec Vn, Vec Vd) { + return FPMinMax(*this, Q, sz, Vn, Vd, MinMaxOperation::MaxNumeric); +} + +bool TranslatorVisitor::FMAXV_2(bool Q, bool sz, Vec Vn, Vec Vd) { + return FPMinMax(*this, Q, sz, Vn, Vd, MinMaxOperation::Max); +} + +bool TranslatorVisitor::FMINNMV_2(bool Q, bool sz, Vec Vn, Vec Vd) { + return FPMinMax(*this, Q, sz, Vn, Vd, MinMaxOperation::MinNumeric); +} + +bool TranslatorVisitor::FMINV_2(bool Q, bool sz, Vec Vn, Vec Vd) { + return FPMinMax(*this, Q, sz, Vn, Vd, MinMaxOperation::Min); +} + +bool TranslatorVisitor::SADDLV(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + return LongAdd(*this, Q, size, Vn, Vd, Signedness::Signed); +} + +bool TranslatorVisitor::SMAXV(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + return ScalarMinMax(*this, Q, size, Vn, Vd, ScalarMinMaxOperation::Max, Signedness::Signed); +} + +bool TranslatorVisitor::SMINV(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + return ScalarMinMax(*this, Q, size, Vn, Vd, ScalarMinMaxOperation::Min, Signedness::Signed); +} + +bool TranslatorVisitor::UADDLV(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + return LongAdd(*this, Q, size, Vn, Vd, Signedness::Unsigned); +} + +bool TranslatorVisitor::UMAXV(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + return ScalarMinMax(*this, Q, size, Vn, Vd, ScalarMinMaxOperation::Max, Signedness::Unsigned); +} + +bool TranslatorVisitor::UMINV(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + return ScalarMinMax(*this, Q, size, Vn, Vd, ScalarMinMaxOperation::Min, Signedness::Unsigned); +} +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_aes.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_aes.cpp new file mode 100644 index 0000000000..d604703ef2 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_aes.cpp @@ -0,0 +1,46 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::AESD(Vec Vn, Vec Vd) { + const IR::U128 operand1 = ir.GetQ(Vd); + const IR::U128 operand2 = ir.GetQ(Vn); + + const IR::U128 result = ir.AESDecryptSingleRound(ir.VectorEor(operand1, operand2)); + + ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::AESE(Vec Vn, Vec Vd) { + const IR::U128 operand1 = ir.GetQ(Vd); + const IR::U128 operand2 = ir.GetQ(Vn); + + const IR::U128 result = ir.AESEncryptSingleRound(ir.VectorEor(operand1, operand2)); + + ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::AESIMC(Vec Vn, Vec Vd) { + const IR::U128 operand = ir.GetQ(Vn); + const IR::U128 result = ir.AESInverseMixColumns(operand); + + ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::AESMC(Vec Vn, Vec Vd) { + const IR::U128 operand = ir.GetQ(Vn); + const IR::U128 result = ir.AESMixColumns(operand); + + ir.SetQ(Vd, result); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_copy.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_copy.cpp new file mode 100644 index 0000000000..b33bc8f5ad --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_copy.cpp @@ -0,0 +1,158 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/bit/bit_count.hpp> + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::DUP_elt_1(Imm<5> imm5, Vec Vn, Vec Vd) { + const size_t size = mcl::bit::lowest_set_bit(imm5.ZeroExtend()); + if (size > 3) { + return ReservedValue(); + } + + const size_t index = imm5.ZeroExtend<size_t>() >> (size + 1); + const size_t idxdsize = imm5.Bit<4>() ? 128 : 64; + const size_t esize = 8 << size; + + const IR::U128 operand = V(idxdsize, Vn); + const IR::UAny element = ir.VectorGetElement(esize, operand, index); + const IR::U128 result = ir.ZeroExtendToQuad(element); + V(128, Vd, result); + return true; +} + +bool TranslatorVisitor::DUP_elt_2(bool Q, Imm<5> imm5, Vec Vn, Vec Vd) { + const size_t size = mcl::bit::lowest_set_bit(imm5.ZeroExtend()); + if (size > 3) { + return ReservedValue(); + } + + if (size == 3 && !Q) { + return ReservedValue(); + } + + const size_t index = imm5.ZeroExtend<size_t>() >> (size + 1); + const size_t idxdsize = imm5.Bit<4>() ? 128 : 64; + const size_t esize = 8 << size; + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand = V(idxdsize, Vn); + const IR::U128 result = Q ? ir.VectorBroadcastElement(esize, operand, index) : ir.VectorBroadcastElementLower(esize, operand, index); + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::DUP_gen(bool Q, Imm<5> imm5, Reg Rn, Vec Vd) { + const size_t size = mcl::bit::lowest_set_bit(imm5.ZeroExtend()); + if (size > 3) { + return ReservedValue(); + } + + if (size == 3 && !Q) { + return ReservedValue(); + } + + const size_t esize = 8 << size; + const size_t datasize = Q ? 128 : 64; + + const IR::UAny element = X(esize, Rn); + + const IR::U128 result = Q ? 
ir.VectorBroadcast(esize, element) : ir.VectorBroadcastLower(esize, element); + + V(datasize, Vd, result); + + return true; +} + +bool TranslatorVisitor::SMOV(bool Q, Imm<5> imm5, Vec Vn, Reg Rd) { + const size_t size = mcl::bit::lowest_set_bit(imm5.ZeroExtend()); + if (size == 2 && !Q) { + return UnallocatedEncoding(); + } + + if (size > 2) { + return ReservedValue(); + } + + const size_t idxdsize = imm5.Bit<4>() ? 128 : 64; + const size_t index = imm5.ZeroExtend<size_t>() >> (size + 1); + const size_t esize = 8 << size; + const size_t datasize = Q ? 64 : 32; + + const IR::U128 operand = V(idxdsize, Vn); + + const IR::UAny elem = ir.VectorGetElement(esize, operand, index); + X(datasize, Rd, SignExtend(elem, datasize)); + + return true; +} + +bool TranslatorVisitor::UMOV(bool Q, Imm<5> imm5, Vec Vn, Reg Rd) { + const size_t size = mcl::bit::lowest_set_bit(imm5.ZeroExtend()); + if (size < 3 && Q) { + return UnallocatedEncoding(); + } + + if (size == 3 && !Q) { + return UnallocatedEncoding(); + } + + if (size > 3) { + return ReservedValue(); + } + + const size_t idxdsize = imm5.Bit<4>() ? 128 : 64; + const size_t index = imm5.ZeroExtend<size_t>() >> (size + 1); + const size_t esize = 8 << size; + const size_t datasize = Q ? 64 : 32; + + const IR::U128 operand = V(idxdsize, Vn); + + const IR::UAny elem = ir.VectorGetElement(esize, operand, index); + X(datasize, Rd, ZeroExtend(elem, datasize)); + + return true; +} + +bool TranslatorVisitor::INS_gen(Imm<5> imm5, Reg Rn, Vec Vd) { + const size_t size = mcl::bit::lowest_set_bit(imm5.ZeroExtend()); + if (size > 3) { + return ReservedValue(); + } + + const size_t index = imm5.ZeroExtend<size_t>() >> (size + 1); + const size_t esize = 8 << size; + const size_t datasize = 128; + + const IR::UAny element = X(esize, Rn); + const IR::U128 result = ir.VectorSetElement(esize, V(datasize, Vd), index, element); + V(datasize, Vd, result); + + return true; +} + +bool TranslatorVisitor::INS_elt(Imm<5> imm5, Imm<4> imm4, Vec Vn, Vec Vd) { + const size_t size = mcl::bit::lowest_set_bit(imm5.ZeroExtend()); + if (size > 3) { + return ReservedValue(); + } + + const size_t dst_index = imm5.ZeroExtend<size_t>() >> (size + 1); + const size_t src_index = imm4.ZeroExtend<size_t>() >> size; + const size_t idxdsize = imm4.Bit<3>() ? 128 : 64; + const size_t esize = 8 << size; + + const IR::U128 operand = V(idxdsize, Vn); + const IR::UAny elem = ir.VectorGetElement(esize, operand, src_index); + const IR::U128 result = ir.VectorSetElement(esize, V(128, Vd), dst_index, elem); + V(128, Vd, result); + + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_crypto_four_register.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_crypto_four_register.cpp new file mode 100644 index 0000000000..8464bba5d7 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_crypto_four_register.cpp @@ -0,0 +1,52 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::EOR3(Vec Vm, Vec Va, Vec Vn, Vec Vd) { + const IR::U128 a = ir.GetQ(Va); + const IR::U128 m = ir.GetQ(Vm); + const IR::U128 n = ir.GetQ(Vn); + + const IR::U128 result = ir.VectorEor(ir.VectorEor(n, m), a); + + ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::BCAX(Vec Vm, Vec Va, Vec Vn, Vec Vd) { + const IR::U128 a = ir.GetQ(Va); + const IR::U128 m = ir.GetQ(Vm); + const IR::U128 n = ir.GetQ(Vn); + + const IR::U128 result = ir.VectorEor(n, ir.VectorAndNot(m, a)); + + ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::SM3SS1(Vec Vm, Vec Va, Vec Vn, Vec Vd) { + const IR::U128 a = ir.GetQ(Va); + const IR::U128 m = ir.GetQ(Vm); + const IR::U128 n = ir.GetQ(Vn); + + const IR::U32 top_a = ir.VectorGetElement(32, a, 3); + const IR::U32 top_m = ir.VectorGetElement(32, m, 3); + const IR::U32 top_n = ir.VectorGetElement(32, n, 3); + + const IR::U32 rotated_n = ir.RotateRight(top_n, ir.Imm8(20)); + const IR::U32 sum = ir.Add(ir.Add(rotated_n, top_m), top_a); + const IR::U32 result = ir.RotateRight(sum, ir.Imm8(25)); + + const IR::U128 zero_vector = ir.ZeroVector(); + const IR::U128 vector_result = ir.VectorSetElement(32, zero_vector, 3, result); + + ir.SetQ(Vd, vector_result); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_crypto_three_register.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_crypto_three_register.cpp new file mode 100644 index 0000000000..f21570f90e --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_crypto_three_register.cpp @@ -0,0 +1,102 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { +namespace { +enum class SM3TTVariant { + A, + B, +}; + +bool SM3TT1(TranslatorVisitor& v, Vec Vm, Imm<2> imm2, Vec Vn, Vec Vd, SM3TTVariant behavior) { + const IR::U128 d = v.ir.GetQ(Vd); + const IR::U128 m = v.ir.GetQ(Vm); + const IR::U128 n = v.ir.GetQ(Vn); + const u32 index = imm2.ZeroExtend(); + + const IR::U32 top_d = v.ir.VectorGetElement(32, d, 3); + const IR::U32 before_top_d = v.ir.VectorGetElement(32, d, 2); + const IR::U32 after_low_d = v.ir.VectorGetElement(32, d, 1); + const IR::U32 low_d = v.ir.VectorGetElement(32, d, 0); + const IR::U32 top_n = v.ir.VectorGetElement(32, n, 3); + + const IR::U32 wj_prime = v.ir.VectorGetElement(32, m, index); + const IR::U32 ss2 = v.ir.Eor(top_n, v.ir.RotateRight(top_d, v.ir.Imm8(20))); + const IR::U32 tt1 = [&] { + if (behavior == SM3TTVariant::A) { + return v.ir.Eor(after_low_d, v.ir.Eor(top_d, before_top_d)); + } + const IR::U32 tmp1 = v.ir.And(top_d, after_low_d); + const IR::U32 tmp2 = v.ir.And(top_d, before_top_d); + const IR::U32 tmp3 = v.ir.And(after_low_d, before_top_d); + return v.ir.Or(v.ir.Or(tmp1, tmp2), tmp3); + }(); + const IR::U32 final_tt1 = v.ir.Add(tt1, v.ir.Add(low_d, v.ir.Add(ss2, wj_prime))); + + const IR::U128 zero_vector = v.ir.ZeroVector(); + const IR::U128 tmp1 = v.ir.VectorSetElement(32, zero_vector, 0, after_low_d); + const IR::U128 tmp2 = v.ir.VectorSetElement(32, tmp1, 1, v.ir.RotateRight(before_top_d, v.ir.Imm8(23))); + const IR::U128 tmp3 = v.ir.VectorSetElement(32, tmp2, 2, top_d); + const IR::U128 result = v.ir.VectorSetElement(32, tmp3, 3, final_tt1); + + v.ir.SetQ(Vd, result); + return true; +} + +bool SM3TT2(TranslatorVisitor& v, Vec Vm, Imm<2> imm2, Vec Vn, Vec Vd, SM3TTVariant behavior) { + const IR::U128 d = v.ir.GetQ(Vd); + const IR::U128 m = v.ir.GetQ(Vm); + const IR::U128 n = v.ir.GetQ(Vn); + const u32 index = imm2.ZeroExtend(); + + const IR::U32 top_d = v.ir.VectorGetElement(32, d, 3); + const IR::U32 before_top_d = v.ir.VectorGetElement(32, d, 2); + const IR::U32 after_low_d = v.ir.VectorGetElement(32, d, 1); + const IR::U32 low_d = v.ir.VectorGetElement(32, d, 0); + const IR::U32 top_n = v.ir.VectorGetElement(32, n, 3); + + const IR::U32 wj = v.ir.VectorGetElement(32, m, index); + const IR::U32 tt2 = [&] { + if (behavior == SM3TTVariant::A) { + return v.ir.Eor(after_low_d, v.ir.Eor(top_d, before_top_d)); + } + const IR::U32 tmp1 = v.ir.And(top_d, before_top_d); + const IR::U32 tmp2 = v.ir.AndNot(after_low_d, top_d); + return v.ir.Or(tmp1, tmp2); + }(); + const IR::U32 final_tt2 = v.ir.Add(tt2, v.ir.Add(low_d, v.ir.Add(top_n, wj))); + const IR::U32 top_result = v.ir.Eor(final_tt2, v.ir.Eor(v.ir.RotateRight(final_tt2, v.ir.Imm8(23)), + v.ir.RotateRight(final_tt2, v.ir.Imm8(15)))); + + const IR::U128 zero_vector = v.ir.ZeroVector(); + const IR::U128 tmp1 = v.ir.VectorSetElement(32, zero_vector, 0, after_low_d); + const IR::U128 tmp2 = v.ir.VectorSetElement(32, tmp1, 1, v.ir.RotateRight(before_top_d, v.ir.Imm8(13))); + const IR::U128 tmp3 = v.ir.VectorSetElement(32, tmp2, 2, top_d); + const IR::U128 result = v.ir.VectorSetElement(32, tmp3, 3, top_result); + + v.ir.SetQ(Vd, result); + return true; +} +} // Anonymous namespace + +bool TranslatorVisitor::SM3TT1A(Vec Vm, Imm<2> imm2, Vec Vn, Vec Vd) { + return SM3TT1(*this, Vm, imm2, Vn, Vd, SM3TTVariant::A); +} + +bool TranslatorVisitor::SM3TT1B(Vec Vm, Imm<2> imm2, Vec Vn, Vec Vd) { + return 
SM3TT1(*this, Vm, imm2, Vn, Vd, SM3TTVariant::B); +} + +bool TranslatorVisitor::SM3TT2A(Vec Vm, Imm<2> imm2, Vec Vn, Vec Vd) { + return SM3TT2(*this, Vm, imm2, Vn, Vd, SM3TTVariant::A); +} + +bool TranslatorVisitor::SM3TT2B(Vec Vm, Imm<2> imm2, Vec Vn, Vec Vd) { + return SM3TT2(*this, Vm, imm2, Vn, Vd, SM3TTVariant::B); +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_extract.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_extract.cpp new file mode 100644 index 0000000000..bb5dcbe93e --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_extract.cpp @@ -0,0 +1,27 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::EXT(bool Q, Vec Vm, Imm<4> imm4, Vec Vn, Vec Vd) { + if (!Q && imm4.Bit<3>()) { + return ReservedValue(); + } + + const size_t datasize = Q ? 128 : 64; + const size_t position = imm4.ZeroExtend<size_t>() << 3; + + const IR::U128 lo = V(datasize, Vn); + const IR::U128 hi = V(datasize, Vm); + const IR::U128 result = datasize == 64 ? ir.VectorExtractLower(lo, hi, position) : ir.VectorExtract(lo, hi, position); + + V(datasize, Vd, result); + + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_modified_immediate.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_modified_immediate.cpp new file mode 100644 index 0000000000..db8e83631a --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_modified_immediate.cpp @@ -0,0 +1,130 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/bit/bit_field.hpp> + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::MOVI(bool Q, bool op, Imm<1> a, Imm<1> b, Imm<1> c, Imm<4> cmode, Imm<1> d, Imm<1> e, Imm<1> f, Imm<1> g, Imm<1> h, Vec Vd) { + const size_t datasize = Q ? 128 : 64; + + // MOVI + // also FMOV (vector, immediate) when cmode == 0b1111 + const auto movi = [&] { + const u64 imm64 = AdvSIMDExpandImm(op, cmode, concatenate(a, b, c, d, e, f, g, h)); + const IR::U128 imm = datasize == 64 ? ir.ZeroExtendToQuad(ir.Imm64(imm64)) : ir.VectorBroadcast(64, ir.Imm64(imm64)); + V(128, Vd, imm); + return true; + }; + + // MVNI + const auto mvni = [&] { + const u64 imm64 = ~AdvSIMDExpandImm(op, cmode, concatenate(a, b, c, d, e, f, g, h)); + const IR::U128 imm = datasize == 64 ? ir.ZeroExtendToQuad(ir.Imm64(imm64)) : ir.VectorBroadcast(64, ir.Imm64(imm64)); + V(128, Vd, imm); + return true; + }; + + // ORR (vector, immediate) + const auto orr = [&] { + const u64 imm64 = AdvSIMDExpandImm(op, cmode, concatenate(a, b, c, d, e, f, g, h)); + const IR::U128 imm = datasize == 64 ? ir.ZeroExtendToQuad(ir.Imm64(imm64)) : ir.VectorBroadcast(64, ir.Imm64(imm64)); + const IR::U128 operand = V(datasize, Vd); + const IR::U128 result = ir.VectorOr(operand, imm); + V(datasize, Vd, result); + return true; + }; + + // BIC (vector, immediate) + const auto bic = [&] { + const u64 imm64 = ~AdvSIMDExpandImm(op, cmode, concatenate(a, b, c, d, e, f, g, h)); + const IR::U128 imm = datasize == 64 ? 
ir.ZeroExtendToQuad(ir.Imm64(imm64)) : ir.VectorBroadcast(64, ir.Imm64(imm64)); + const IR::U128 operand = V(datasize, Vd); + const IR::U128 result = ir.VectorAnd(operand, imm); + V(datasize, Vd, result); + return true; + }; + + switch (concatenate(cmode, Imm<1>{op}).ZeroExtend()) { + case 0b00000: + case 0b00100: + case 0b01000: + case 0b01100: + case 0b10000: + case 0b10100: + case 0b11000: + case 0b11010: + case 0b11100: + case 0b11101: + case 0b11110: + return movi(); + case 0b11111: + if (!Q) { + return UnallocatedEncoding(); + } + return movi(); + case 0b00001: + case 0b00101: + case 0b01001: + case 0b01101: + case 0b10001: + case 0b10101: + case 0b11001: + case 0b11011: + return mvni(); + case 0b00010: + case 0b00110: + case 0b01010: + case 0b01110: + case 0b10010: + case 0b10110: + return orr(); + case 0b00011: + case 0b00111: + case 0b01011: + case 0b01111: + case 0b10011: + case 0b10111: + return bic(); + } + + UNREACHABLE(); +} + +bool TranslatorVisitor::FMOV_2(bool Q, bool op, Imm<1> a, Imm<1> b, Imm<1> c, Imm<1> d, Imm<1> e, Imm<1> f, Imm<1> g, Imm<1> h, Vec Vd) { + const size_t datasize = Q ? 128 : 64; + + if (op && !Q) { + return UnallocatedEncoding(); + } + + const u64 imm64 = AdvSIMDExpandImm(op, Imm<4>{0b1111}, concatenate(a, b, c, d, e, f, g, h)); + + const IR::U128 imm = datasize == 64 ? ir.ZeroExtendToQuad(ir.Imm64(imm64)) : ir.VectorBroadcast(64, ir.Imm64(imm64)); + V(128, Vd, imm); + return true; +} + +bool TranslatorVisitor::FMOV_3(bool Q, Imm<1> a, Imm<1> b, Imm<1> c, Imm<1> d, Imm<1> e, Imm<1> f, Imm<1> g, Imm<1> h, Vec Vd) { + const size_t datasize = Q ? 128 : 64; + + const Imm<8> imm8 = concatenate(a, b, c, d, e, f, g, h); + const u16 imm16 = [&imm8] { + u16 imm16 = 0; + imm16 |= imm8.Bit<7>() ? 0x8000 : 0; + imm16 |= imm8.Bit<6>() ? 0x3000 : 0x4000; + imm16 |= imm8.Bits<0, 5, u16>() << 6; + return imm16; + }(); + const u64 imm64 = mcl::bit::replicate_element<u16, u64>(imm16); + + const IR::U128 imm = datasize == 64 ? ir.ZeroExtendToQuad(ir.Imm64(imm64)) : ir.VectorBroadcast(64, ir.Imm64(imm64)); + V(128, Vd, imm); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_permute.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_permute.cpp new file mode 100644 index 0000000000..06e15273cf --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_permute.cpp @@ -0,0 +1,113 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <tuple> + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { +namespace { +enum class Transposition { + TRN1, + TRN2, +}; + +bool VectorTranspose(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, Transposition type) { + if (!Q && size == 0b11) { + return v.ReservedValue(); + } + + const size_t datasize = Q ? 128 : 64; + const u8 esize = static_cast<u8>(8 << size.ZeroExtend()); + + const IR::U128 m = v.V(datasize, Vm); + const IR::U128 n = v.V(datasize, Vn); + const IR::U128 result = v.ir.VectorTranspose(esize, n, m, type == Transposition::TRN2); + + v.V(datasize, Vd, result); + return true; +} + +enum class UnzipType { + Even, + Odd, +}; + +bool VectorUnzip(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, UnzipType type) { + if (size == 0b11 && !Q) { + return v.ReservedValue(); + } + + const size_t datasize = Q ? 
128 : 64; + const size_t esize = 8 << size.ZeroExtend(); + + const IR::U128 n = v.V(datasize, Vn); + const IR::U128 m = v.V(datasize, Vm); + const IR::U128 result = type == UnzipType::Even + ? (Q ? v.ir.VectorDeinterleaveEven(esize, n, m) : v.ir.VectorDeinterleaveEvenLower(esize, n, m)) + : (Q ? v.ir.VectorDeinterleaveOdd(esize, n, m) : v.ir.VectorDeinterleaveOddLower(esize, n, m)); + + v.V(datasize, Vd, result); + return true; +} +} // Anonymous namespace + +bool TranslatorVisitor::TRN1(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return VectorTranspose(*this, Q, size, Vm, Vn, Vd, Transposition::TRN1); +} + +bool TranslatorVisitor::TRN2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return VectorTranspose(*this, Q, size, Vm, Vn, Vd, Transposition::TRN2); +} + +bool TranslatorVisitor::UZP1(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return VectorUnzip(*this, Q, size, Vm, Vn, Vd, UnzipType::Even); +} + +bool TranslatorVisitor::UZP2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return VectorUnzip(*this, Q, size, Vm, Vn, Vd, UnzipType::Odd); +} + +bool TranslatorVisitor::ZIP1(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b11 && !Q) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend<size_t>(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = ir.VectorInterleaveLower(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::ZIP2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b11 && !Q) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend<size_t>(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = [&] { + if (Q) { + return ir.VectorInterleaveUpper(esize, operand1, operand2); + } + + // TODO: Urgh. + const IR::U128 interleaved = ir.VectorInterleaveLower(esize, operand1, operand2); + return ir.VectorZeroUpper(ir.VectorRotateWholeVectorRight(interleaved, 64)); + }(); + + V(datasize, Vd, result); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_pairwise.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_pairwise.cpp new file mode 100644 index 0000000000..63615b0f9a --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_pairwise.cpp @@ -0,0 +1,80 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { +namespace { +enum class MinMaxOperation { + Max, + MaxNumeric, + Min, + MinNumeric, +}; + +bool FPPairwiseMinMax(TranslatorVisitor& v, bool sz, Vec Vn, Vec Vd, MinMaxOperation operation) { + const size_t esize = sz ? 
64 : 32; + + const IR::U128 operand = v.V(128, Vn); + const IR::U32U64 element1 = v.ir.VectorGetElement(esize, operand, 0); + const IR::U32U64 element2 = v.ir.VectorGetElement(esize, operand, 1); + const IR::U32U64 result = [&] { + switch (operation) { + case MinMaxOperation::Max: + return v.ir.FPMax(element1, element2); + case MinMaxOperation::MaxNumeric: + return v.ir.FPMaxNumeric(element1, element2); + case MinMaxOperation::Min: + return v.ir.FPMin(element1, element2); + case MinMaxOperation::MinNumeric: + return v.ir.FPMinNumeric(element1, element2); + default: + UNREACHABLE(); + } + }(); + + v.V(128, Vd, v.ir.ZeroExtendToQuad(result)); + return true; +} +} // Anonymous namespace + +bool TranslatorVisitor::ADDP_pair(Imm<2> size, Vec Vn, Vec Vd) { + if (size != 0b11) { + return ReservedValue(); + } + + const IR::U64 operand1 = ir.VectorGetElement(64, V(128, Vn), 0); + const IR::U64 operand2 = ir.VectorGetElement(64, V(128, Vn), 1); + const IR::U128 result = ir.ZeroExtendToQuad(ir.Add(operand1, operand2)); + V(128, Vd, result); + return true; +} + +bool TranslatorVisitor::FADDP_pair_2(bool size, Vec Vn, Vec Vd) { + const size_t esize = size ? 64 : 32; + + const IR::U32U64 operand1 = ir.VectorGetElement(esize, V(128, Vn), 0); + const IR::U32U64 operand2 = ir.VectorGetElement(esize, V(128, Vn), 1); + const IR::U128 result = ir.ZeroExtendToQuad(ir.FPAdd(operand1, operand2)); + V(128, Vd, result); + return true; +} + +bool TranslatorVisitor::FMAXNMP_pair_2(bool sz, Vec Vn, Vec Vd) { + return FPPairwiseMinMax(*this, sz, Vn, Vd, MinMaxOperation::MaxNumeric); +} + +bool TranslatorVisitor::FMAXP_pair_2(bool sz, Vec Vn, Vec Vd) { + return FPPairwiseMinMax(*this, sz, Vn, Vd, MinMaxOperation::Max); +} + +bool TranslatorVisitor::FMINNMP_pair_2(bool sz, Vec Vn, Vec Vd) { + return FPPairwiseMinMax(*this, sz, Vn, Vd, MinMaxOperation::MinNumeric); +} + +bool TranslatorVisitor::FMINP_pair_2(bool sz, Vec Vn, Vec Vd) { + return FPPairwiseMinMax(*this, sz, Vn, Vd, MinMaxOperation::Min); +} +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_shift_by_immediate.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_shift_by_immediate.cpp new file mode 100644 index 0000000000..a0570edc7e --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_shift_by_immediate.cpp @@ -0,0 +1,354 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/bit/bit_count.hpp> + +#include "dynarmic/common/fp/rounding_mode.h" +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { +namespace { +enum class Narrowing { + Truncation, + SaturateToUnsigned, + SaturateToSigned, +}; + +enum class SaturatingShiftLeftType { + Signed, + Unsigned, + SignedWithUnsignedSaturation, +}; + +enum class ShiftExtraBehavior { + None, + Accumulate, +}; + +enum class Signedness { + Signed, + Unsigned, +}; + +enum class FloatConversionDirection { + FixedToFloat, + FloatToFixed, +}; + +bool SaturatingShiftLeft(TranslatorVisitor& v, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd, SaturatingShiftLeftType type) { + if (immh == 0b0000) { + return v.ReservedValue(); + } + + const size_t esize = 8U << mcl::bit::highest_set_bit(immh.ZeroExtend()); + const size_t shift_amount = concatenate(immh, immb).ZeroExtend() - esize; + + const IR::U128 operand = v.ir.ZeroExtendToQuad(v.V_scalar(esize, Vn)); + const IR::U128 shift = v.ir.ZeroExtendToQuad(v.I(esize, shift_amount)); + const IR::U128 result = [&v, esize, operand, shift, type, shift_amount] { + if (type == SaturatingShiftLeftType::Signed) { + return v.ir.VectorSignedSaturatedShiftLeft(esize, operand, shift); + } + + if (type == SaturatingShiftLeftType::Unsigned) { + return v.ir.VectorUnsignedSaturatedShiftLeft(esize, operand, shift); + } + + return v.ir.VectorSignedSaturatedShiftLeftUnsigned(esize, operand, static_cast<u8>(shift_amount)); + }(); + + v.ir.SetQ(Vd, result); + return true; +} + +bool ShiftRight(TranslatorVisitor& v, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd, ShiftExtraBehavior behavior, Signedness signedness) { + if (!immh.Bit<3>()) { + return v.ReservedValue(); + } + + const size_t esize = 64; + const u8 shift_amount = static_cast<u8>((esize * 2) - concatenate(immh, immb).ZeroExtend()); + + const IR::U64 operand = v.V_scalar(esize, Vn); + IR::U64 result = [&]() -> IR::U64 { + if (signedness == Signedness::Signed) { + return v.ir.ArithmeticShiftRight(operand, v.ir.Imm8(shift_amount)); + } + return v.ir.LogicalShiftRight(operand, v.ir.Imm8(shift_amount)); + }(); + + if (behavior == ShiftExtraBehavior::Accumulate) { + const IR::U64 addend = v.V_scalar(esize, Vd); + result = v.ir.Add(result, addend); + } + + v.V_scalar(esize, Vd, result); + return true; +} + +bool RoundingShiftRight(TranslatorVisitor& v, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd, ShiftExtraBehavior behavior, Signedness signedness) { + if (!immh.Bit<3>()) { + return v.ReservedValue(); + } + + const size_t esize = 64; + const u8 shift_amount = static_cast<u8>((esize * 2) - concatenate(immh, immb).ZeroExtend()); + + const IR::U64 operand = v.V_scalar(esize, Vn); + const IR::U64 round_bit = v.ir.LogicalShiftRight(v.ir.LogicalShiftLeft(operand, v.ir.Imm8(64 - shift_amount)), v.ir.Imm8(63)); + const IR::U64 result = [&] { + const IR::U64 shifted = [&]() -> IR::U64 { + if (signedness == Signedness::Signed) { + return v.ir.ArithmeticShiftRight(operand, v.ir.Imm8(shift_amount)); + } + return v.ir.LogicalShiftRight(operand, v.ir.Imm8(shift_amount)); + }(); + + IR::U64 tmp = v.ir.Add(shifted, round_bit); + + if (behavior == ShiftExtraBehavior::Accumulate) { + tmp = v.ir.Add(tmp, v.V_scalar(esize, Vd)); + } + + return tmp; + }(); + + v.V_scalar(esize, Vd, result); + return true; +} + +enum class ShiftDirection { + Left, + Right, +}; + +bool ShiftAndInsert(TranslatorVisitor& v, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd, ShiftDirection direction) { + if 
(!immh.Bit<3>()) { + return v.ReservedValue(); + } + + const size_t esize = 64; + + const u8 shift_amount = [&] { + if (direction == ShiftDirection::Right) { + return static_cast<u8>((esize * 2) - concatenate(immh, immb).ZeroExtend()); + } + + return static_cast<u8>(concatenate(immh, immb).ZeroExtend() - esize); + }(); + + const u64 mask = [&] { + if (direction == ShiftDirection::Right) { + return shift_amount == esize ? 0 : mcl::bit::ones<u64>(esize) >> shift_amount; + } + + return mcl::bit::ones<u64>(esize) << shift_amount; + }(); + + const IR::U64 operand1 = v.V_scalar(esize, Vn); + const IR::U64 operand2 = v.V_scalar(esize, Vd); + + const IR::U64 shifted = [&] { + if (direction == ShiftDirection::Right) { + return v.ir.LogicalShiftRight(operand1, v.ir.Imm8(shift_amount)); + } + + return v.ir.LogicalShiftLeft(operand1, v.ir.Imm8(shift_amount)); + }(); + + const IR::U64 result = v.ir.Or(v.ir.AndNot(operand2, v.ir.Imm64(mask)), shifted); + v.V_scalar(esize, Vd, result); + return true; +} + +bool ShiftRightNarrowing(TranslatorVisitor& v, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd, Narrowing narrowing, Signedness signedness) { + if (immh == 0b0000) { + return v.ReservedValue(); + } + + if (immh.Bit<3>()) { + return v.ReservedValue(); + } + + const size_t esize = 8 << mcl::bit::highest_set_bit(immh.ZeroExtend()); + const size_t source_esize = 2 * esize; + const u8 shift_amount = static_cast<u8>(source_esize - concatenate(immh, immb).ZeroExtend()); + + const IR::U128 operand = v.ir.ZeroExtendToQuad(v.ir.VectorGetElement(source_esize, v.V(128, Vn), 0)); + + IR::U128 wide_result = [&] { + if (signedness == Signedness::Signed) { + return v.ir.VectorArithmeticShiftRight(source_esize, operand, shift_amount); + } + return v.ir.VectorLogicalShiftRight(source_esize, operand, shift_amount); + }(); + + const IR::U128 result = [&] { + switch (narrowing) { + case Narrowing::Truncation: + return v.ir.VectorNarrow(source_esize, wide_result); + case Narrowing::SaturateToUnsigned: + if (signedness == Signedness::Signed) { + return v.ir.VectorSignedSaturatedNarrowToUnsigned(source_esize, wide_result); + } + return v.ir.VectorUnsignedSaturatedNarrow(source_esize, wide_result); + case Narrowing::SaturateToSigned: + ASSERT(signedness == Signedness::Signed); + return v.ir.VectorSignedSaturatedNarrowToSigned(source_esize, wide_result); + } + UNREACHABLE(); + }(); + + const IR::UAny segment = v.ir.VectorGetElement(esize, result, 0); + v.V_scalar(esize, Vd, segment); + return true; +} + +bool ScalarFPConvertWithRound(TranslatorVisitor& v, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd, Signedness sign, FloatConversionDirection direction, FP::RoundingMode rounding_mode) { + const u32 immh_value = immh.ZeroExtend(); + + if ((immh_value & 0b1110) == 0b0000) { + return v.ReservedValue(); + } + + // TODO: We currently don't handle FP16, so bail like the ARM reference manual allows. + if ((immh_value & 0b1110) == 0b0010) { + return v.ReservedValue(); + } + + const size_t esize = (immh_value & 0b1000) != 0 ? 64 : 32; + const size_t concat = concatenate(immh, immb).ZeroExtend(); + const size_t fbits = (esize * 2) - concat; + + const IR::U32U64 operand = v.V_scalar(esize, Vn); + const IR::U32U64 result = [&]() -> IR::U32U64 { + switch (direction) { + case FloatConversionDirection::FloatToFixed: + if (esize == 64) { + return sign == Signedness::Signed + ? v.ir.FPToFixedS64(operand, fbits, rounding_mode) + : v.ir.FPToFixedU64(operand, fbits, rounding_mode); + } + + return sign == Signedness::Signed + ? 
v.ir.FPToFixedS32(operand, fbits, rounding_mode) + : v.ir.FPToFixedU32(operand, fbits, rounding_mode); + + case FloatConversionDirection::FixedToFloat: + if (esize == 64) { + return sign == Signedness::Signed + ? v.ir.FPSignedFixedToDouble(operand, fbits, rounding_mode) + : v.ir.FPUnsignedFixedToDouble(operand, fbits, rounding_mode); + } + + return sign == Signedness::Signed + ? v.ir.FPSignedFixedToSingle(operand, fbits, rounding_mode) + : v.ir.FPUnsignedFixedToSingle(operand, fbits, rounding_mode); + } + + UNREACHABLE(); + }(); + + v.V_scalar(esize, Vd, result); + return true; +} +} // Anonymous namespace + +bool TranslatorVisitor::FCVTZS_fix_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ScalarFPConvertWithRound(*this, immh, immb, Vn, Vd, Signedness::Signed, FloatConversionDirection::FloatToFixed, FP::RoundingMode::TowardsZero); +} + +bool TranslatorVisitor::FCVTZU_fix_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ScalarFPConvertWithRound(*this, immh, immb, Vn, Vd, Signedness::Unsigned, FloatConversionDirection::FloatToFixed, FP::RoundingMode::TowardsZero); +} + +bool TranslatorVisitor::SCVTF_fix_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ScalarFPConvertWithRound(*this, immh, immb, Vn, Vd, Signedness::Signed, FloatConversionDirection::FixedToFloat, ir.current_location->FPCR().RMode()); +} + +bool TranslatorVisitor::UCVTF_fix_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ScalarFPConvertWithRound(*this, immh, immb, Vn, Vd, Signedness::Unsigned, FloatConversionDirection::FixedToFloat, ir.current_location->FPCR().RMode()); +} + +bool TranslatorVisitor::SLI_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftAndInsert(*this, immh, immb, Vn, Vd, ShiftDirection::Left); +} + +bool TranslatorVisitor::SRI_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftAndInsert(*this, immh, immb, Vn, Vd, ShiftDirection::Right); +} + +bool TranslatorVisitor::SQSHL_imm_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return SaturatingShiftLeft(*this, immh, immb, Vn, Vd, SaturatingShiftLeftType::Signed); +} + +bool TranslatorVisitor::SQSHLU_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return SaturatingShiftLeft(*this, immh, immb, Vn, Vd, SaturatingShiftLeftType::SignedWithUnsignedSaturation); +} + +bool TranslatorVisitor::SQSHRN_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftRightNarrowing(*this, immh, immb, Vn, Vd, Narrowing::SaturateToSigned, Signedness::Signed); +} + +bool TranslatorVisitor::SQSHRUN_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftRightNarrowing(*this, immh, immb, Vn, Vd, Narrowing::SaturateToUnsigned, Signedness::Signed); +} + +bool TranslatorVisitor::SRSHR_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return RoundingShiftRight(*this, immh, immb, Vn, Vd, ShiftExtraBehavior::None, Signedness::Signed); +} + +bool TranslatorVisitor::SRSRA_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return RoundingShiftRight(*this, immh, immb, Vn, Vd, ShiftExtraBehavior::Accumulate, Signedness::Signed); +} + +bool TranslatorVisitor::SSHR_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftRight(*this, immh, immb, Vn, Vd, ShiftExtraBehavior::None, Signedness::Signed); +} + +bool TranslatorVisitor::SSRA_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftRight(*this, immh, immb, Vn, Vd, ShiftExtraBehavior::Accumulate, Signedness::Signed); +} + +bool TranslatorVisitor::SHL_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + if (!immh.Bit<3>()) { + return ReservedValue(); + } + + const size_t esize = 64; + const u8 
shift_amount = static_cast<u8>(concatenate(immh, immb).ZeroExtend() - esize); + + const IR::U64 operand = V_scalar(esize, Vn); + const IR::U64 result = ir.LogicalShiftLeft(operand, ir.Imm8(shift_amount)); + + V_scalar(esize, Vd, result); + return true; +} + +bool TranslatorVisitor::UQSHL_imm_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return SaturatingShiftLeft(*this, immh, immb, Vn, Vd, SaturatingShiftLeftType::Unsigned); +} + +bool TranslatorVisitor::UQSHRN_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftRightNarrowing(*this, immh, immb, Vn, Vd, Narrowing::SaturateToUnsigned, Signedness::Unsigned); +} + +bool TranslatorVisitor::URSHR_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return RoundingShiftRight(*this, immh, immb, Vn, Vd, ShiftExtraBehavior::None, Signedness::Unsigned); +} + +bool TranslatorVisitor::URSRA_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return RoundingShiftRight(*this, immh, immb, Vn, Vd, ShiftExtraBehavior::Accumulate, Signedness::Unsigned); +} + +bool TranslatorVisitor::USHR_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftRight(*this, immh, immb, Vn, Vd, ShiftExtraBehavior::None, Signedness::Unsigned); +} + +bool TranslatorVisitor::USRA_1(Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftRight(*this, immh, immb, Vn, Vd, ShiftExtraBehavior::Accumulate, Signedness::Unsigned); +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_three_same.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_three_same.cpp new file mode 100644 index 0000000000..fb9ae9d141 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_three_same.cpp @@ -0,0 +1,430 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <optional> + +#include <mcl/bit/bit_field.hpp> + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { +namespace { +enum class ComparisonType { + EQ, + GE, + GT, + HI, + HS, + LE, + LT, +}; + +enum class ComparisonVariant { + Register, + Zero, +}; + +enum class Signedness { + Signed, + Unsigned, +}; + +bool RoundingShiftLeft(TranslatorVisitor& v, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, Signedness sign) { + if (size != 0b11) { + return v.ReservedValue(); + } + + const IR::U128 operand1 = v.V(64, Vn); + const IR::U128 operand2 = v.V(64, Vm); + const IR::U128 result = [&] { + if (sign == Signedness::Signed) { + return v.ir.VectorRoundingShiftLeftSigned(64, operand1, operand2); + } + + return v.ir.VectorRoundingShiftLeftUnsigned(64, operand1, operand2); + }(); + + v.V(64, Vd, result); + return true; +} + +bool ScalarCompare(TranslatorVisitor& v, Imm<2> size, std::optional<Vec> Vm, Vec Vn, Vec Vd, ComparisonType type, ComparisonVariant variant) { + if (size != 0b11) { + return v.ReservedValue(); + } + + const size_t esize = 64; + const size_t datasize = 64; + + const IR::U128 operand1 = v.V(datasize, Vn); + const IR::U128 operand2 = variant == ComparisonVariant::Register ? 
v.V(datasize, *Vm) : v.ir.ZeroVector(); + + const IR::U128 result = [&] { + switch (type) { + case ComparisonType::EQ: + return v.ir.VectorEqual(esize, operand1, operand2); + case ComparisonType::GE: + return v.ir.VectorGreaterEqualSigned(esize, operand1, operand2); + case ComparisonType::GT: + return v.ir.VectorGreaterSigned(esize, operand1, operand2); + case ComparisonType::HI: + return v.ir.VectorGreaterUnsigned(esize, operand1, operand2); + case ComparisonType::HS: + return v.ir.VectorGreaterEqualUnsigned(esize, operand1, operand2); + case ComparisonType::LE: + return v.ir.VectorLessEqualSigned(esize, operand1, operand2); + case ComparisonType::LT: + default: + return v.ir.VectorLessSigned(esize, operand1, operand2); + } + }(); + + v.V_scalar(datasize, Vd, v.ir.VectorGetElement(esize, result, 0)); + return true; +} + +enum class FPComparisonType { + EQ, + GE, + AbsoluteGE, + GT, + AbsoluteGT +}; + +bool ScalarFPCompareRegister(TranslatorVisitor& v, bool sz, Vec Vm, Vec Vn, Vec Vd, FPComparisonType type) { + const size_t esize = sz ? 64 : 32; + const size_t datasize = esize; + + const IR::U128 operand1 = v.V(datasize, Vn); + const IR::U128 operand2 = v.V(datasize, Vm); + const IR::U128 result = [&] { + switch (type) { + case FPComparisonType::EQ: + return v.ir.FPVectorEqual(esize, operand1, operand2); + case FPComparisonType::GE: + return v.ir.FPVectorGreaterEqual(esize, operand1, operand2); + case FPComparisonType::AbsoluteGE: + return v.ir.FPVectorGreaterEqual(esize, + v.ir.FPVectorAbs(esize, operand1), + v.ir.FPVectorAbs(esize, operand2)); + case FPComparisonType::GT: + return v.ir.FPVectorGreater(esize, operand1, operand2); + case FPComparisonType::AbsoluteGT: + return v.ir.FPVectorGreater(esize, + v.ir.FPVectorAbs(esize, operand1), + v.ir.FPVectorAbs(esize, operand2)); + } + + UNREACHABLE(); + }(); + + v.V_scalar(datasize, Vd, v.ir.VectorGetElement(esize, result, 0)); + return true; +} +} // Anonymous namespace + +bool TranslatorVisitor::SQADD_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + const size_t esize = 8 << size.ZeroExtend<size_t>(); + + const IR::UAny operand1 = V_scalar(esize, Vn); + const IR::UAny operand2 = V_scalar(esize, Vm); + const auto result = ir.SignedSaturatedAdd(operand1, operand2); + V_scalar(esize, Vd, result); + return true; +} + +bool TranslatorVisitor::SQDMULH_vec_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b00 || size == 0b11) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + + const IR::UAny operand1 = V_scalar(esize, Vn); + const IR::UAny operand2 = V_scalar(esize, Vm); + const auto result = ir.SignedSaturatedDoublingMultiplyReturnHigh(operand1, operand2); + V_scalar(esize, Vd, result); + return true; +} + +bool TranslatorVisitor::SQRDMULH_vec_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b00 || size == 0b11) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + + const IR::U128 operand1 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(128, Vn), 0)); + const IR::U128 operand2 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(128, Vm), 0)); + const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiplyHighRounding(esize, operand1, operand2); + + V_scalar(esize, Vd, ir.VectorGetElement(esize, result, 0)); + return true; +} + +bool TranslatorVisitor::SQSUB_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + const size_t esize = 8 << size.ZeroExtend<size_t>(); + + const IR::UAny operand1 = V_scalar(esize, Vn); + const IR::UAny operand2 = V_scalar(esize, Vm); + const auto result = 
ir.SignedSaturatedSub(operand1, operand2); + V_scalar(esize, Vd, result); + return true; +} + +bool TranslatorVisitor::UQADD_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + const size_t esize = 8 << size.ZeroExtend(); + + const IR::UAny operand1 = V_scalar(esize, Vn); + const IR::UAny operand2 = V_scalar(esize, Vm); + const auto result = ir.UnsignedSaturatedAdd(operand1, operand2); + V_scalar(esize, Vd, result); + return true; +} + +bool TranslatorVisitor::UQSUB_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + const size_t esize = 8 << size.ZeroExtend(); + + const IR::UAny operand1 = V_scalar(esize, Vn); + const IR::UAny operand2 = V_scalar(esize, Vm); + const auto result = ir.UnsignedSaturatedSub(operand1, operand2); + V_scalar(esize, Vd, result); + return true; +} + +bool TranslatorVisitor::ADD_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size != 0b11) { + return ReservedValue(); + } + const size_t esize = 8 << size.ZeroExtend<size_t>(); + const size_t datasize = esize; + + const IR::U64 operand1 = V_scalar(datasize, Vn); + const IR::U64 operand2 = V_scalar(datasize, Vm); + const IR::U64 result = ir.Add(operand1, operand2); + V_scalar(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::CMEQ_reg_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return ScalarCompare(*this, size, Vm, Vn, Vd, ComparisonType::EQ, ComparisonVariant::Register); +} + +bool TranslatorVisitor::CMEQ_zero_1(Imm<2> size, Vec Vn, Vec Vd) { + return ScalarCompare(*this, size, {}, Vn, Vd, ComparisonType::EQ, ComparisonVariant::Zero); +} + +bool TranslatorVisitor::CMGE_reg_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return ScalarCompare(*this, size, Vm, Vn, Vd, ComparisonType::GE, ComparisonVariant::Register); +} + +bool TranslatorVisitor::CMGE_zero_1(Imm<2> size, Vec Vn, Vec Vd) { + return ScalarCompare(*this, size, {}, Vn, Vd, ComparisonType::GE, ComparisonVariant::Zero); +} + +bool TranslatorVisitor::CMGT_reg_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return ScalarCompare(*this, size, Vm, Vn, Vd, ComparisonType::GT, ComparisonVariant::Register); +} + +bool TranslatorVisitor::CMGT_zero_1(Imm<2> size, Vec Vn, Vec Vd) { + return ScalarCompare(*this, size, {}, Vn, Vd, ComparisonType::GT, ComparisonVariant::Zero); +} + +bool TranslatorVisitor::CMLE_1(Imm<2> size, Vec Vn, Vec Vd) { + return ScalarCompare(*this, size, {}, Vn, Vd, ComparisonType::LE, ComparisonVariant::Zero); +} + +bool TranslatorVisitor::CMLT_1(Imm<2> size, Vec Vn, Vec Vd) { + return ScalarCompare(*this, size, {}, Vn, Vd, ComparisonType::LT, ComparisonVariant::Zero); +} + +bool TranslatorVisitor::CMHI_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return ScalarCompare(*this, size, Vm, Vn, Vd, ComparisonType::HI, ComparisonVariant::Register); +} + +bool TranslatorVisitor::CMHS_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return ScalarCompare(*this, size, Vm, Vn, Vd, ComparisonType::HS, ComparisonVariant::Register); +} + +bool TranslatorVisitor::CMTST_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size != 0b11) { + return ReservedValue(); + } + + const IR::U128 operand1 = V(64, Vn); + const IR::U128 operand2 = V(64, Vm); + const IR::U128 anded = ir.VectorAnd(operand1, operand2); + const IR::U128 result = ir.VectorNot(ir.VectorEqual(64, anded, ir.ZeroVector())); + + V(64, Vd, result); + return true; +} + +bool TranslatorVisitor::FABD_2(bool sz, Vec Vm, Vec Vn, Vec Vd) { + const size_t esize = sz ? 
64 : 32; + + const IR::U32U64 operand1 = V_scalar(esize, Vn); + const IR::U32U64 operand2 = V_scalar(esize, Vm); + const IR::U32U64 result = ir.FPAbs(ir.FPSub(operand1, operand2)); + + V_scalar(esize, Vd, result); + return true; +} + +bool TranslatorVisitor::FMULX_vec_2(bool sz, Vec Vm, Vec Vn, Vec Vd) { + const size_t esize = sz ? 64 : 32; + + const IR::U32U64 operand1 = V_scalar(esize, Vn); + const IR::U32U64 operand2 = V_scalar(esize, Vm); + const IR::U32U64 result = ir.FPMulX(operand1, operand2); + + V_scalar(esize, Vd, result); + return true; +} + +bool TranslatorVisitor::FRECPS_1(Vec Vm, Vec Vn, Vec Vd) { + const size_t esize = 16; + + const IR::U16 operand1 = V_scalar(esize, Vn); + const IR::U16 operand2 = V_scalar(esize, Vm); + const IR::U16 result = ir.FPRecipStepFused(operand1, operand2); + + V_scalar(esize, Vd, result); + return true; +} + +bool TranslatorVisitor::FRECPS_2(bool sz, Vec Vm, Vec Vn, Vec Vd) { + const size_t esize = sz ? 64 : 32; + + const IR::U32U64 operand1 = V_scalar(esize, Vn); + const IR::U32U64 operand2 = V_scalar(esize, Vm); + const IR::U32U64 result = ir.FPRecipStepFused(operand1, operand2); + + V_scalar(esize, Vd, result); + return true; +} + +bool TranslatorVisitor::FRSQRTS_1(Vec Vm, Vec Vn, Vec Vd) { + const size_t esize = 16; + + const IR::U16 operand1 = V_scalar(esize, Vn); + const IR::U16 operand2 = V_scalar(esize, Vm); + const IR::U16 result = ir.FPRSqrtStepFused(operand1, operand2); + + V_scalar(esize, Vd, result); + return true; +} + +bool TranslatorVisitor::FRSQRTS_2(bool sz, Vec Vm, Vec Vn, Vec Vd) { + const size_t esize = sz ? 64 : 32; + + const IR::U32U64 operand1 = V_scalar(esize, Vn); + const IR::U32U64 operand2 = V_scalar(esize, Vm); + const IR::U32U64 result = ir.FPRSqrtStepFused(operand1, operand2); + + V_scalar(esize, Vd, result); + return true; +} + +bool TranslatorVisitor::FACGE_2(bool sz, Vec Vm, Vec Vn, Vec Vd) { + return ScalarFPCompareRegister(*this, sz, Vm, Vn, Vd, FPComparisonType::AbsoluteGE); +} + +bool TranslatorVisitor::FACGT_2(bool sz, Vec Vm, Vec Vn, Vec Vd) { + return ScalarFPCompareRegister(*this, sz, Vm, Vn, Vd, FPComparisonType::AbsoluteGT); +} + +bool TranslatorVisitor::FCMEQ_reg_1(Vec Vm, Vec Vn, Vec Vd) { + const IR::U128 lhs = V(128, Vn); + const IR::U128 rhs = V(128, Vm); + const IR::U128 result = ir.FPVectorEqual(16, lhs, rhs); + + V_scalar(16, Vd, ir.VectorGetElement(16, result, 0)); + return true; +} + +bool TranslatorVisitor::FCMEQ_reg_2(bool sz, Vec Vm, Vec Vn, Vec Vd) { + return ScalarFPCompareRegister(*this, sz, Vm, Vn, Vd, FPComparisonType::EQ); +} + +bool TranslatorVisitor::FCMGE_reg_2(bool sz, Vec Vm, Vec Vn, Vec Vd) { + return ScalarFPCompareRegister(*this, sz, Vm, Vn, Vd, FPComparisonType::GE); +} + +bool TranslatorVisitor::FCMGT_reg_2(bool sz, Vec Vm, Vec Vn, Vec Vd) { + return ScalarFPCompareRegister(*this, sz, Vm, Vn, Vd, FPComparisonType::GT); +} + +bool TranslatorVisitor::SQSHL_reg_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + const size_t esize = 8U << size.ZeroExtend(); + + const IR::U128 operand1 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(128, Vn), 0)); + const IR::U128 operand2 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(128, Vm), 0)); + const IR::U128 result = ir.VectorSignedSaturatedShiftLeft(esize, operand1, operand2); + + ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::SRSHL_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return RoundingShiftLeft(*this, size, Vm, Vn, Vd, Signedness::Signed); +} + +bool TranslatorVisitor::SSHL_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { 
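+ // Note (added annotation): scalar SSHL is defined only for the 64-bit element size (size == 0b11). The shift amount is the signed value in the low byte of Vm's element, and negative amounts shift right, which is why a variable arithmetic shift is used below.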
+ if (size != 0b11) { + return ReservedValue(); + } + + const IR::U128 operand1 = V(64, Vn); + const IR::U128 operand2 = V(64, Vm); + const IR::U128 result = ir.VectorArithmeticVShift(64, operand1, operand2); + + V(64, Vd, result); + return true; +} + +bool TranslatorVisitor::SUB_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size != 0b11) { + return ReservedValue(); + } + const size_t esize = 8 << size.ZeroExtend<size_t>(); + const size_t datasize = esize; + + const IR::U64 operand1 = V_scalar(datasize, Vn); + const IR::U64 operand2 = V_scalar(datasize, Vm); + const IR::U64 result = ir.Sub(operand1, operand2); + V_scalar(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::UQSHL_reg_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + const size_t esize = 8U << size.ZeroExtend(); + + const IR::U128 operand1 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(128, Vn), 0)); + const IR::U128 operand2 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(128, Vm), 0)); + const IR::U128 result = ir.VectorUnsignedSaturatedShiftLeft(esize, operand1, operand2); + + ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::URSHL_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return RoundingShiftLeft(*this, size, Vm, Vn, Vd, Signedness::Unsigned); +} + +bool TranslatorVisitor::USHL_1(Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size != 0b11) { + return ReservedValue(); + } + + const IR::U128 operand1 = V(64, Vn); + const IR::U128 operand2 = V(64, Vm); + const IR::U128 result = ir.VectorLogicalVShift(64, operand1, operand2); + + V(64, Vd, result); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_two_register_misc.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_two_register_misc.cpp new file mode 100644 index 0000000000..2289f5cbd1 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_two_register_misc.cpp @@ -0,0 +1,331 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { +namespace { +enum class ComparisonType { + EQ, + GE, + GT, + LE, + LT +}; + +enum class Signedness { + Signed, + Unsigned +}; + +bool ScalarFPCompareAgainstZero(TranslatorVisitor& v, bool sz, Vec Vn, Vec Vd, ComparisonType type) { + const size_t esize = sz ? 64 : 32; + const size_t datasize = esize; + + const IR::U128 operand = v.V(datasize, Vn); + const IR::U128 zero = v.ir.ZeroVector(); + const IR::U128 result = [&] { + switch (type) { + case ComparisonType::EQ: + return v.ir.FPVectorEqual(esize, operand, zero); + case ComparisonType::GE: + return v.ir.FPVectorGreaterEqual(esize, operand, zero); + case ComparisonType::GT: + return v.ir.FPVectorGreater(esize, operand, zero); + case ComparisonType::LE: + return v.ir.FPVectorGreaterEqual(esize, zero, operand); + case ComparisonType::LT: + return v.ir.FPVectorGreater(esize, zero, operand); + } + + UNREACHABLE(); + }(); + + v.V_scalar(datasize, Vd, v.ir.VectorGetElement(esize, result, 0)); + return true; +} + +bool ScalarFPConvertWithRound(TranslatorVisitor& v, bool sz, Vec Vn, Vec Vd, FP::RoundingMode rmode, Signedness sign) { + const size_t esize = sz ? 64 : 32; + + const IR::U32U64 operand = v.V_scalar(esize, Vn); + const IR::U32U64 result = [&]() -> IR::U32U64 { + if (sz) { + return sign == Signedness::Signed + ? 
v.ir.FPToFixedS64(operand, 0, rmode) + : v.ir.FPToFixedU64(operand, 0, rmode); + } + + return sign == Signedness::Signed + ? v.ir.FPToFixedS32(operand, 0, rmode) + : v.ir.FPToFixedU32(operand, 0, rmode); + }(); + + v.V_scalar(esize, Vd, result); + return true; +} + +using NarrowingFn = IR::U128 (IR::IREmitter::*)(size_t, const IR::U128&); + +bool SaturatedNarrow(TranslatorVisitor& v, Imm<2> size, Vec Vn, Vec Vd, NarrowingFn fn) { + if (size == 0b11) { + return v.ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + + const IR::U128 operand = v.ir.ZeroExtendToQuad(v.V_scalar(2 * esize, Vn)); + const IR::U128 result = (v.ir.*fn)(2 * esize, operand); + + v.V_scalar(64, Vd, v.ir.VectorGetElement(64, result, 0)); + return true; +} +} // Anonymous namespace + +bool TranslatorVisitor::ABS_1(Imm<2> size, Vec Vn, Vec Vd) { + if (size != 0b11) { + return ReservedValue(); + } + + const IR::U64 operand1 = V_scalar(64, Vn); + const IR::U64 operand2 = ir.ArithmeticShiftRight(operand1, ir.Imm8(63)); + const IR::U64 result = ir.Sub(ir.Eor(operand1, operand2), operand2); + + V_scalar(64, Vd, result); + return true; +} + +bool TranslatorVisitor::FCMEQ_zero_1(Vec Vn, Vec Vd) { + const IR::U128 operand = ir.ZeroExtendToQuad(V_scalar(16, Vn)); + const IR::U128 zero = ir.ZeroVector(); + const IR::U128 result = ir.FPVectorEqual(16, operand, zero); + + V_scalar(16, Vd, ir.VectorGetElement(16, result, 0)); + return true; +} + +bool TranslatorVisitor::FCMEQ_zero_2(bool sz, Vec Vn, Vec Vd) { + return ScalarFPCompareAgainstZero(*this, sz, Vn, Vd, ComparisonType::EQ); +} + +bool TranslatorVisitor::FCMGE_zero_2(bool sz, Vec Vn, Vec Vd) { + return ScalarFPCompareAgainstZero(*this, sz, Vn, Vd, ComparisonType::GE); +} + +bool TranslatorVisitor::FCMGT_zero_2(bool sz, Vec Vn, Vec Vd) { + return ScalarFPCompareAgainstZero(*this, sz, Vn, Vd, ComparisonType::GT); +} + +bool TranslatorVisitor::FCMLE_2(bool sz, Vec Vn, Vec Vd) { + return ScalarFPCompareAgainstZero(*this, sz, Vn, Vd, ComparisonType::LE); +} + +bool TranslatorVisitor::FCMLT_2(bool sz, Vec Vn, Vec Vd) { + return ScalarFPCompareAgainstZero(*this, sz, Vn, Vd, ComparisonType::LT); +} + +bool TranslatorVisitor::FCVTAS_2(bool sz, Vec Vn, Vec Vd) { + return ScalarFPConvertWithRound(*this, sz, Vn, Vd, FP::RoundingMode::ToNearest_TieAwayFromZero, Signedness::Signed); +} + +bool TranslatorVisitor::FCVTAU_2(bool sz, Vec Vn, Vec Vd) { + return ScalarFPConvertWithRound(*this, sz, Vn, Vd, FP::RoundingMode::ToNearest_TieAwayFromZero, Signedness::Unsigned); +} + +bool TranslatorVisitor::FCVTMS_2(bool sz, Vec Vn, Vec Vd) { + return ScalarFPConvertWithRound(*this, sz, Vn, Vd, FP::RoundingMode::TowardsMinusInfinity, Signedness::Signed); +} + +bool TranslatorVisitor::FCVTMU_2(bool sz, Vec Vn, Vec Vd) { + return ScalarFPConvertWithRound(*this, sz, Vn, Vd, FP::RoundingMode::TowardsMinusInfinity, Signedness::Unsigned); +} + +bool TranslatorVisitor::FCVTNS_2(bool sz, Vec Vn, Vec Vd) { + return ScalarFPConvertWithRound(*this, sz, Vn, Vd, FP::RoundingMode::ToNearest_TieEven, Signedness::Signed); +} + +bool TranslatorVisitor::FCVTNU_2(bool sz, Vec Vn, Vec Vd) { + return ScalarFPConvertWithRound(*this, sz, Vn, Vd, FP::RoundingMode::ToNearest_TieEven, Signedness::Unsigned); +} + +bool TranslatorVisitor::FCVTPS_2(bool sz, Vec Vn, Vec Vd) { + return ScalarFPConvertWithRound(*this, sz, Vn, Vd, FP::RoundingMode::TowardsPlusInfinity, Signedness::Signed); +} + +bool TranslatorVisitor::FCVTPU_2(bool sz, Vec Vn, Vec Vd) { + return ScalarFPConvertWithRound(*this, sz, Vn, Vd, 
FP::RoundingMode::TowardsPlusInfinity, Signedness::Unsigned); +} + +bool TranslatorVisitor::FCVTXN_1(bool sz, Vec Vn, Vec Vd) { + if (!sz) { + return ReservedValue(); + } + + const IR::U64 element = V_scalar(64, Vn); + const IR::U32 result = ir.FPDoubleToSingle(element, FP::RoundingMode::ToOdd); + + V_scalar(32, Vd, result); + return true; +} + +bool TranslatorVisitor::FCVTZS_int_2(bool sz, Vec Vn, Vec Vd) { + return ScalarFPConvertWithRound(*this, sz, Vn, Vd, FP::RoundingMode::TowardsZero, Signedness::Signed); +} + +bool TranslatorVisitor::FCVTZU_int_2(bool sz, Vec Vn, Vec Vd) { + return ScalarFPConvertWithRound(*this, sz, Vn, Vd, FP::RoundingMode::TowardsZero, Signedness::Unsigned); +} + +bool TranslatorVisitor::FRECPE_1(Vec Vn, Vec Vd) { + const size_t esize = 16; + + const IR::U16 operand = V_scalar(esize, Vn); + const IR::U16 result = ir.FPRecipEstimate(operand); + + V_scalar(esize, Vd, result); + return true; +} + +bool TranslatorVisitor::FRECPE_2(bool sz, Vec Vn, Vec Vd) { + const size_t esize = sz ? 64 : 32; + + const IR::U32U64 operand = V_scalar(esize, Vn); + const IR::U32U64 result = ir.FPRecipEstimate(operand); + + V_scalar(esize, Vd, result); + return true; +} + +bool TranslatorVisitor::FRECPX_1(Vec Vn, Vec Vd) { + const IR::U16 operand = V_scalar(16, Vn); + const IR::U16 result = ir.FPRecipExponent(operand); + + V_scalar(16, Vd, result); + return true; +} + +bool TranslatorVisitor::FRECPX_2(bool sz, Vec Vn, Vec Vd) { + const size_t esize = sz ? 64 : 32; + + const IR::U32U64 operand = V_scalar(esize, Vn); + const IR::U32U64 result = ir.FPRecipExponent(operand); + + V_scalar(esize, Vd, result); + return true; +} + +bool TranslatorVisitor::FRSQRTE_1(Vec Vn, Vec Vd) { + const size_t esize = 16; + + const IR::U16 operand = V_scalar(esize, Vn); + const IR::U16 result = ir.FPRSqrtEstimate(operand); + + V_scalar(esize, Vd, result); + return true; +} + +bool TranslatorVisitor::FRSQRTE_2(bool sz, Vec Vn, Vec Vd) { + const size_t esize = sz ? 64 : 32; + + const IR::U32U64 operand = V_scalar(esize, Vn); + const IR::U32U64 result = ir.FPRSqrtEstimate(operand); + + V_scalar(esize, Vd, result); + return true; +} + +bool TranslatorVisitor::NEG_1(Imm<2> size, Vec Vn, Vec Vd) { + if (size != 0b11) { + return ReservedValue(); + } + + const IR::U64 operand = V_scalar(64, Vn); + const IR::U64 result = ir.Sub(ir.Imm64(0), operand); + + V_scalar(64, Vd, result); + return true; +} + +bool TranslatorVisitor::SCVTF_int_2(bool sz, Vec Vn, Vec Vd) { + const auto esize = sz ? 64 : 32; + + const IR::U32U64 element = V_scalar(esize, Vn); + const IR::U32U64 result = esize == 32 + ? 
IR::U32U64(ir.FPSignedFixedToSingle(element, 0, ir.current_location->FPCR().RMode())) + : IR::U32U64(ir.FPSignedFixedToDouble(element, 0, ir.current_location->FPCR().RMode())); + + V_scalar(esize, Vd, result); + return true; +} + +bool TranslatorVisitor::SQABS_1(Imm<2> size, Vec Vn, Vec Vd) { + const size_t esize = 8 << size.ZeroExtend(); + + const IR::U128 operand = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(128, Vn), 0)); + const IR::U128 result = ir.VectorSignedSaturatedAbs(esize, operand); + + V(128, Vd, result); + return true; +} + +bool TranslatorVisitor::SQNEG_1(Imm<2> size, Vec Vn, Vec Vd) { + const size_t esize = 8 << size.ZeroExtend(); + + const IR::U128 operand = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(128, Vn), 0)); + const IR::U128 result = ir.VectorSignedSaturatedNeg(esize, operand); + + V(128, Vd, result); + return true; +} + +bool TranslatorVisitor::SQXTN_1(Imm<2> size, Vec Vn, Vec Vd) { + return SaturatedNarrow(*this, size, Vn, Vd, &IREmitter::VectorSignedSaturatedNarrowToSigned); +} + +bool TranslatorVisitor::SQXTUN_1(Imm<2> size, Vec Vn, Vec Vd) { + return SaturatedNarrow(*this, size, Vn, Vd, &IREmitter::VectorSignedSaturatedNarrowToUnsigned); +} + +bool TranslatorVisitor::SUQADD_1(Imm<2> size, Vec Vn, Vec Vd) { + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = 64; + + const IR::U128 operand1 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(datasize, Vn), 0)); + const IR::U128 operand2 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(datasize, Vd), 0)); + const IR::U128 result = ir.VectorSignedSaturatedAccumulateUnsigned(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::UCVTF_int_2(bool sz, Vec Vn, Vec Vd) { + const auto esize = sz ? 64 : 32; + + const IR::U32U64 element = V_scalar(esize, Vn); + const IR::U32U64 result = esize == 32 + ? IR::U32U64(ir.FPUnsignedFixedToSingle(element, 0, ir.current_location->FPCR().RMode())) + : IR::U32U64(ir.FPUnsignedFixedToDouble(element, 0, ir.current_location->FPCR().RMode())); + + V_scalar(esize, Vd, result); + return true; +} + +bool TranslatorVisitor::UQXTN_1(Imm<2> size, Vec Vn, Vec Vd) { + return SaturatedNarrow(*this, size, Vn, Vd, &IREmitter::VectorUnsignedSaturatedNarrow); +} + +bool TranslatorVisitor::USQADD_1(Imm<2> size, Vec Vn, Vec Vd) { + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = 64; + + const IR::U128 operand1 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(datasize, Vn), 0)); + const IR::U128 operand2 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(datasize, Vd), 0)); + const IR::U128 result = ir.VectorUnsignedSaturatedAccumulateSigned(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_x_indexed_element.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_x_indexed_element.cpp new file mode 100644 index 0000000000..dbbc4ce12c --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_scalar_x_indexed_element.cpp @@ -0,0 +1,168 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <utility> + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { +namespace { +std::pair<size_t, Vec> Combine(Imm<2> size, Imm<1> H, Imm<1> L, Imm<1> M, Imm<4> Vmlo) { + if (size == 0b01) { + return {concatenate(H, L, M).ZeroExtend(), Vmlo.ZeroExtend<Vec>()}; + } + + return {concatenate(H, L).ZeroExtend(), concatenate(M, Vmlo).ZeroExtend<Vec>()}; +} + +enum class ExtraBehavior { + None, + Accumulate, + Subtract, + MultiplyExtended, +}; + +bool MultiplyByElement(TranslatorVisitor& v, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd, ExtraBehavior extra_behavior) { + if (sz && L == 1) { + return v.ReservedValue(); + } + + const size_t idxdsize = H == 1 ? 128 : 64; + const size_t index = sz ? H.ZeroExtend() : concatenate(H, L).ZeroExtend(); + const Vec Vm = concatenate(M, Vmlo).ZeroExtend<Vec>(); + const size_t esize = sz ? 64 : 32; + + const IR::U32U64 element = v.ir.VectorGetElement(esize, v.V(idxdsize, Vm), index); + const IR::U32U64 result = [&]() -> IR::U32U64 { + IR::U32U64 operand1 = v.V_scalar(esize, Vn); + + if (extra_behavior == ExtraBehavior::None) { + return v.ir.FPMul(operand1, element); + } + + if (extra_behavior == ExtraBehavior::MultiplyExtended) { + return v.ir.FPMulX(operand1, element); + } + + if (extra_behavior == ExtraBehavior::Subtract) { + operand1 = v.ir.FPNeg(operand1); + } + + const IR::U32U64 operand2 = v.V_scalar(esize, Vd); + return v.ir.FPMulAdd(operand2, operand1, element); + }(); + + v.V_scalar(esize, Vd, result); + return true; +} + +bool MultiplyByElementHalfPrecision(TranslatorVisitor& v, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd, ExtraBehavior extra_behavior) { + const size_t esize = 16; + const size_t idxsize = H == 1 ? 128 : 64; + const size_t index = concatenate(H, L, M).ZeroExtend(); + + const auto Vm = Vmlo.ZeroExtend<Vec>(); + const IR::U16 element = v.ir.VectorGetElement(esize, v.V(idxsize, Vm), index); + const IR::U16 result = [&]() -> IR::U16 { + IR::U16 operand1 = v.V_scalar(esize, Vn); + + // TODO: Currently we don't implement half-precision paths + // for regular multiplication and extended multiplication. 
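+ // Note (added annotation): of the half-precision by-element forms handled here, only the fused accumulate/subtract paths reach a real implementation (via FPMulAdd below); the plain and extended multiply cases assert.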
+ + if (extra_behavior == ExtraBehavior::None) { + ASSERT_FALSE("half-precision option unimplemented"); + } + + if (extra_behavior == ExtraBehavior::MultiplyExtended) { + ASSERT_FALSE("half-precision option unimplemented"); + } + + if (extra_behavior == ExtraBehavior::Subtract) { + operand1 = v.ir.FPNeg(operand1); + } + + const IR::U16 operand2 = v.V_scalar(esize, Vd); + return v.ir.FPMulAdd(operand2, operand1, element); + }(); + + v.V_scalar(esize, Vd, result); + return true; +} +} // Anonymous namespace + +bool TranslatorVisitor::FMLA_elt_1(Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + return MultiplyByElementHalfPrecision(*this, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Accumulate); +} + +bool TranslatorVisitor::FMLA_elt_2(bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + return MultiplyByElement(*this, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Accumulate); +} + +bool TranslatorVisitor::FMLS_elt_1(Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + return MultiplyByElementHalfPrecision(*this, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Subtract); +} + +bool TranslatorVisitor::FMLS_elt_2(bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + return MultiplyByElement(*this, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Subtract); +} + +bool TranslatorVisitor::FMUL_elt_2(bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + return MultiplyByElement(*this, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::None); +} + +bool TranslatorVisitor::FMULX_elt_2(bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + return MultiplyByElement(*this, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::MultiplyExtended); +} + +bool TranslatorVisitor::SQDMULH_elt_1(Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + if (size == 0b00 || size == 0b11) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const auto [index, Vm] = Combine(size, H, L, M, Vmlo); + + const IR::UAny operand1 = V_scalar(esize, Vn); + const IR::UAny operand2 = ir.VectorGetElement(esize, V(128, Vm), index); + const auto result = ir.SignedSaturatedDoublingMultiplyReturnHigh(operand1, operand2); + V_scalar(esize, Vd, result); + return true; +} + +bool TranslatorVisitor::SQRDMULH_elt_1(Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + if (size == 0b00 || size == 0b11) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const auto [index, Vm] = Combine(size, H, L, M, Vmlo); + + const IR::U128 operand1 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(128, Vn), 0)); + const IR::U128 operand2 = V(128, Vm); + const IR::U128 broadcast = ir.VectorBroadcastElement(esize, operand2, index); + const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiplyHighRounding(esize, operand1, broadcast); + + V(128, Vd, result); + return true; +} + +bool TranslatorVisitor::SQDMULL_elt_1(Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + if (size == 0b00 || size == 0b11) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const auto [index, Vm] = Combine(size, H, L, M, Vmlo); + + const IR::U128 operand1 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(128, Vn), 0)); + const IR::U128 operand2 = V(128, Vm); + const IR::U128 broadcast = ir.VectorBroadcastElement(esize, operand2, index); + const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiplyLong(esize, operand1, broadcast); + + V(128, Vd, result); + return 
true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_sha.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_sha.cpp new file mode 100644 index 0000000000..795bfcdb8c --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_sha.cpp @@ -0,0 +1,149 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { +namespace { +IR::U32 SHAchoose(IREmitter& ir, IR::U32 x, IR::U32 y, IR::U32 z) { + return ir.Eor(ir.And(ir.Eor(y, z), x), z); +} + +IR::U32 SHAmajority(IREmitter& ir, IR::U32 x, IR::U32 y, IR::U32 z) { + return ir.Or(ir.And(x, y), ir.And(ir.Or(x, y), z)); +} + +IR::U32 SHAparity(IREmitter& ir, IR::U32 x, IR::U32 y, IR::U32 z) { + return ir.Eor(ir.Eor(y, z), x); +} + +using SHA1HashUpdateFunction = IR::U32(IREmitter&, IR::U32, IR::U32, IR::U32); + +IR::U128 SHA1HashUpdate(IREmitter& ir, Vec Vm, Vec Vn, Vec Vd, SHA1HashUpdateFunction fn) { + IR::U128 x = ir.GetQ(Vd); + IR::U32 y = ir.VectorGetElement(32, ir.GetQ(Vn), 0); + const IR::U128 w = ir.GetQ(Vm); + + for (size_t i = 0; i < 4; i++) { + const IR::U32 low_x = ir.VectorGetElement(32, x, 0); + const IR::U32 after_low_x = ir.VectorGetElement(32, x, 1); + const IR::U32 before_high_x = ir.VectorGetElement(32, x, 2); + const IR::U32 high_x = ir.VectorGetElement(32, x, 3); + const IR::U32 t = fn(ir, after_low_x, before_high_x, high_x); + const IR::U32 w_segment = ir.VectorGetElement(32, w, i); + + y = ir.Add(ir.Add(ir.Add(y, ir.RotateRight(low_x, ir.Imm8(27))), t), w_segment); + x = ir.VectorSetElement(32, x, 1, ir.RotateRight(after_low_x, ir.Imm8(2))); + + // Move each 32-bit element to the left once + // e.g. 
[3, 2, 1, 0], becomes [2, 1, 0, 3] + const IR::U128 shuffled_x = ir.VectorRotateWholeVectorRight(x, 96); + x = ir.VectorSetElement(32, shuffled_x, 0, y); + y = high_x; + } + + return x; +} +} // Anonymous namespace + +bool TranslatorVisitor::SHA1C(Vec Vm, Vec Vn, Vec Vd) { + const IR::U128 result = SHA1HashUpdate(ir, Vm, Vn, Vd, SHAchoose); + ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::SHA1M(Vec Vm, Vec Vn, Vec Vd) { + const IR::U128 result = SHA1HashUpdate(ir, Vm, Vn, Vd, SHAmajority); + ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::SHA1P(Vec Vm, Vec Vn, Vec Vd) { + const IR::U128 result = SHA1HashUpdate(ir, Vm, Vn, Vd, SHAparity); + ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::SHA1SU0(Vec Vm, Vec Vn, Vec Vd) { + const IR::U128 d = ir.GetQ(Vd); + const IR::U128 m = ir.GetQ(Vm); + const IR::U128 n = ir.GetQ(Vn); + + IR::U128 result = [&] { + const IR::U64 d_high = ir.VectorGetElement(64, d, 1); + const IR::U64 n_low = ir.VectorGetElement(64, n, 0); + const IR::U128 zero = ir.ZeroVector(); + + const IR::U128 tmp1 = ir.VectorSetElement(64, zero, 0, d_high); + return ir.VectorSetElement(64, tmp1, 1, n_low); + }(); + + result = ir.VectorEor(ir.VectorEor(result, d), m); + + ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::SHA1SU1(Vec Vn, Vec Vd) { + const IR::U128 d = ir.GetQ(Vd); + const IR::U128 n = ir.GetQ(Vn); + + // Shuffle down the whole vector and zero out the top 32 bits + const IR::U128 shuffled_n = ir.VectorSetElement(32, ir.VectorRotateWholeVectorRight(n, 32), 3, ir.Imm32(0)); + const IR::U128 t = ir.VectorEor(d, shuffled_n); + const IR::U128 rotated_t = ir.VectorRotateLeft(32, t, 1); + + const IR::U32 low_rotated_t = ir.RotateRight(ir.VectorGetElement(32, rotated_t, 0), ir.Imm8(31)); + const IR::U32 high_t = ir.VectorGetElement(32, rotated_t, 3); + const IR::U128 result = ir.VectorSetElement(32, rotated_t, 3, ir.Eor(low_rotated_t, high_t)); + + ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::SHA1H(Vec Vn, Vec Vd) { + const IR::U128 data = ir.GetS(Vn); + + const IR::U128 result = ir.VectorOr(ir.VectorLogicalShiftLeft(32, data, 30), + ir.VectorLogicalShiftRight(32, data, 2)); + + ir.SetS(Vd, result); + return true; +} + +bool TranslatorVisitor::SHA256SU0(Vec Vn, Vec Vd) { + const IR::U128 x = ir.GetQ(Vd); + const IR::U128 y = ir.GetQ(Vn); + + const IR::U128 result = ir.SHA256MessageSchedule0(x, y); + + ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::SHA256SU1(Vec Vm, Vec Vn, Vec Vd) { + const IR::U128 x = ir.GetQ(Vd); + const IR::U128 y = ir.GetQ(Vn); + const IR::U128 z = ir.GetQ(Vm); + + const IR::U128 result = ir.SHA256MessageSchedule1(x, y, z); + + ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::SHA256H(Vec Vm, Vec Vn, Vec Vd) { + const IR::U128 result = ir.SHA256Hash(ir.GetQ(Vd), ir.GetQ(Vn), ir.GetQ(Vm), true); + ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::SHA256H2(Vec Vm, Vec Vn, Vec Vd) { + const IR::U128 result = ir.SHA256Hash(ir.GetQ(Vn), ir.GetQ(Vd), ir.GetQ(Vm), false); + ir.SetQ(Vd, result); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_sha512.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_sha512.cpp new file mode 100644 index 0000000000..766109bfee --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_sha512.cpp @@ -0,0 +1,300 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { +namespace { +IR::U64 MakeSig(IREmitter& ir, IR::U64 data, u8 first_rot_amount, u8 second_rot_amount, u8 shift_amount) { + const IR::U64 tmp1 = ir.RotateRight(data, ir.Imm8(first_rot_amount)); + const IR::U64 tmp2 = ir.RotateRight(data, ir.Imm8(second_rot_amount)); + const IR::U64 tmp3 = ir.LogicalShiftRight(data, ir.Imm8(shift_amount)); + + return ir.Eor(tmp1, ir.Eor(tmp2, tmp3)); +} + +IR::U64 MakeMNSig(IREmitter& ir, IR::U64 data, u8 first_rot_amount, u8 second_rot_amount, u8 third_rot_amount) { + const IR::U64 tmp1 = ir.RotateRight(data, ir.Imm8(first_rot_amount)); + const IR::U64 tmp2 = ir.RotateRight(data, ir.Imm8(second_rot_amount)); + const IR::U64 tmp3 = ir.RotateRight(data, ir.Imm8(third_rot_amount)); + + return ir.Eor(tmp1, ir.Eor(tmp2, tmp3)); +} + +enum class SHA512HashPart { + Part1, + Part2, +}; + +IR::U128 SHA512Hash(IREmitter& ir, Vec Vm, Vec Vn, Vec Vd, SHA512HashPart part) { + const IR::U128 x = ir.GetQ(Vn); + const IR::U128 y = ir.GetQ(Vm); + const IR::U128 w = ir.GetQ(Vd); + + const IR::U64 lower_x = ir.VectorGetElement(64, x, 0); + const IR::U64 upper_x = ir.VectorGetElement(64, x, 1); + + const IR::U64 lower_y = ir.VectorGetElement(64, y, 0); + const IR::U64 upper_y = ir.VectorGetElement(64, y, 1); + + const auto make_sigma = [&](IR::U64 data) { + if (part == SHA512HashPart::Part1) { + return MakeMNSig(ir, data, 14, 18, 41); + } + return MakeMNSig(ir, data, 28, 34, 39); + }; + + const auto make_partial_half = [&](IR::U64 a, IR::U64 b, IR::U64 c) { + const IR::U64 tmp1 = ir.And(a, b); + + if (part == SHA512HashPart::Part1) { + const IR::U64 tmp2 = ir.AndNot(c, a); + return ir.Eor(tmp1, tmp2); + } + + const IR::U64 tmp2 = ir.And(a, c); + const IR::U64 tmp3 = ir.And(upper_y, lower_y); + return ir.Eor(tmp1, ir.Eor(tmp2, tmp3)); + }; + + const IR::U64 Vtmp = [&] { + const IR::U64 partial = [&] { + if (part == SHA512HashPart::Part1) { + return make_partial_half(upper_y, lower_x, upper_x); + } + return make_partial_half(lower_x, upper_y, lower_y); + }(); + const IR::U64 upper_w = ir.VectorGetElement(64, w, 1); + const IR::U64 sig = [&] { + if (part == SHA512HashPart::Part1) { + return make_sigma(upper_y); + } + return make_sigma(lower_y); + }(); + + return ir.Add(partial, ir.Add(sig, upper_w)); + }(); + + const IR::U128 low_result = [&] { + const IR::U64 tmp = [&]() -> IR::U64 { + if (part == SHA512HashPart::Part1) { + return ir.Add(Vtmp, lower_y); + } + return Vtmp; + }(); + const IR::U64 partial = [&] { + if (part == SHA512HashPart::Part1) { + return make_partial_half(tmp, upper_y, lower_x); + } + return make_partial_half(Vtmp, lower_y, upper_y); + }(); + const IR::U64 sig = make_sigma(tmp); + const IR::U64 lower_w = ir.VectorGetElement(64, w, 0); + + return ir.ZeroExtendToQuad(ir.Add(partial, ir.Add(sig, lower_w))); + }(); + + return ir.VectorSetElement(64, low_result, 1, Vtmp); +} + +enum class SM4RotationType { + SM4E, + SM4EKEY +}; + +IR::U32 SM4Rotation(IREmitter& ir, IR::U32 intval, IR::U32 round_result_low_word, SM4RotationType type) { + if (type == SM4RotationType::SM4E) { + const IR::U32 tmp1 = ir.RotateRight(intval, ir.Imm8(30)); + const IR::U32 tmp2 = ir.RotateRight(intval, ir.Imm8(22)); + const IR::U32 tmp3 = ir.RotateRight(intval, ir.Imm8(14)); + const IR::U32 tmp4 = ir.RotateRight(intval, ir.Imm8(8)); + const IR::U32 tmp5 = ir.Eor(intval, ir.Eor(tmp1, ir.Eor(tmp2, ir.Eor(tmp3, tmp4)))); + + return 
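+        // tmp5 above is the SM4 encryption linear transform L(B) = B ^ rol(B,2) ^ rol(B,10)
+        // ^ rol(B,18) ^ rol(B,24), written with the equivalent RotateRight amounts 30/22/14/8.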
ir.Eor(tmp5, round_result_low_word); + } + + const IR::U32 tmp1 = ir.RotateRight(intval, ir.Imm8(19)); + const IR::U32 tmp2 = ir.RotateRight(intval, ir.Imm8(9)); + return ir.Eor(round_result_low_word, ir.Eor(intval, ir.Eor(tmp1, tmp2))); +} + +IR::U128 SM4Hash(IREmitter& ir, Vec Vn, Vec Vd, SM4RotationType type) { + const IR::U128 n = ir.GetQ(Vn); + IR::U128 roundresult = ir.GetQ(Vd); + + for (size_t i = 0; i < 4; i++) { + const IR::U32 round_key = ir.VectorGetElement(32, n, i); + + const IR::U32 upper_round = ir.VectorGetElement(32, roundresult, 3); + const IR::U32 before_upper_round = ir.VectorGetElement(32, roundresult, 2); + const IR::U32 after_lower_round = ir.VectorGetElement(32, roundresult, 1); + + IR::U128 intval_vec = ir.ZeroExtendToQuad(ir.Eor(upper_round, ir.Eor(before_upper_round, ir.Eor(after_lower_round, round_key)))); + + for (size_t j = 0; j < 4; j++) { + const IR::U8 byte_element = ir.VectorGetElement(8, intval_vec, j); + intval_vec = ir.VectorSetElement(8, intval_vec, j, ir.SM4AccessSubstitutionBox(byte_element)); + } + + const IR::U32 intval_low_word = ir.VectorGetElement(32, intval_vec, 0); + const IR::U32 round_result_low_word = ir.VectorGetElement(32, roundresult, 0); + const IR::U32 intval = SM4Rotation(ir, intval_low_word, round_result_low_word, type); + roundresult = ir.VectorRotateWholeVectorRight(roundresult, 32); + roundresult = ir.VectorSetElement(32, roundresult, 3, intval); + } + + return roundresult; +} +} // Anonymous namespace + +bool TranslatorVisitor::SHA512SU0(Vec Vn, Vec Vd) { + const IR::U128 x = ir.GetQ(Vn); + const IR::U128 w = ir.GetQ(Vd); + + const IR::U64 lower_x = ir.VectorGetElement(64, x, 0); + const IR::U64 lower_w = ir.VectorGetElement(64, w, 0); + const IR::U64 upper_w = ir.VectorGetElement(64, w, 1); + + const auto make_sig0 = [&](IR::U64 data) { + return MakeSig(ir, data, 1, 8, 7); + }; + + const IR::U128 low_result = ir.ZeroExtendToQuad(ir.Add(lower_w, make_sig0(upper_w))); + const IR::U64 high_result = ir.Add(upper_w, make_sig0(lower_x)); + const IR::U128 result = ir.VectorSetElement(64, low_result, 1, high_result); + + ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::SHA512SU1(Vec Vm, Vec Vn, Vec Vd) { + const IR::U128 x = ir.GetQ(Vn); + const IR::U128 y = ir.GetQ(Vm); + const IR::U128 w = ir.GetQ(Vd); + + const auto make_sig1 = [&](IR::U64 data) { + return MakeSig(ir, data, 19, 61, 6); + }; + + const IR::U128 sig_vector = [&] { + const IR::U64 lower_x = ir.VectorGetElement(64, x, 0); + const IR::U64 upper_x = ir.VectorGetElement(64, x, 1); + + const IR::U128 low_result = ir.ZeroExtendToQuad(make_sig1(lower_x)); + return ir.VectorSetElement(64, low_result, 1, make_sig1(upper_x)); + }(); + + const IR::U128 result = ir.VectorAdd(64, w, ir.VectorAdd(64, y, sig_vector)); + + ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::SHA512H(Vec Vm, Vec Vn, Vec Vd) { + const IR::U128 result = SHA512Hash(ir, Vm, Vn, Vd, SHA512HashPart::Part1); + ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::SHA512H2(Vec Vm, Vec Vn, Vec Vd) { + const IR::U128 result = SHA512Hash(ir, Vm, Vn, Vd, SHA512HashPart::Part2); + ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::RAX1(Vec Vm, Vec Vn, Vec Vd) { + const IR::U128 m = ir.GetQ(Vm); + const IR::U128 n = ir.GetQ(Vn); + + const IR::U128 rotated_m = ir.VectorRotateLeft(64, m, 1); + const IR::U128 result = ir.VectorEor(n, rotated_m); + + ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::XAR(Vec Vm, Imm<6> imm6, Vec Vn, Vec Vd) { + const 
IR::U128 m = ir.GetQ(Vm); + const IR::U128 n = ir.GetQ(Vn); + + const IR::U128 tmp = ir.VectorEor(m, n); + const IR::U128 result = ir.VectorRotateRight(64, tmp, imm6.ZeroExtend<u8>()); + + ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::SM3PARTW1(Vec Vm, Vec Vn, Vec Vd) { + const IR::U128 d = ir.GetQ(Vd); + const IR::U128 m = ir.GetQ(Vm); + const IR::U128 n = ir.GetQ(Vn); + + const IR::U128 eor_d_n = ir.VectorEor(d, n); + + const IR::U128 result_low_three_words = [&] { + // Move the top-most 3 words down one element (i.e. [3, 2, 1, 0] -> [0, 3, 2, 1]) + const IR::U128 shuffled_m = ir.VectorRotateWholeVectorRight(m, 32); + + // We treat the uppermost word as junk data and don't touch/use it explicitly for now. + // Given we don't do anything with it yet, the fact we EOR into it doesn't matter. + return ir.VectorEor(eor_d_n, ir.VectorRotateLeft(32, shuffled_m, 15)); + }(); + + IR::U128 result = result_low_three_words; + for (size_t i = 0; i < 4; i++) { + if (i == 3) { + const IR::U32 top_eor_d_n = ir.VectorGetElement(32, eor_d_n, 3); + const IR::U32 low_result_word = ir.VectorGetElement(32, result, 0); + const IR::U32 top_result_word = ir.Eor(top_eor_d_n, ir.RotateRight(low_result_word, ir.Imm8(17))); + + // Now the uppermost word is well-defined data. + result = ir.VectorSetElement(32, result, 3, top_result_word); + } + + const IR::U32 word = ir.VectorGetElement(32, result, i); + const IR::U32 modified = ir.Eor(word, ir.Eor(ir.RotateRight(word, ir.Imm8(17)), + ir.RotateRight(word, ir.Imm8(9)))); + + result = ir.VectorSetElement(32, result, i, modified); + } + + ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::SM3PARTW2(Vec Vm, Vec Vn, Vec Vd) { + const IR::U128 d = ir.GetQ(Vd); + const IR::U128 m = ir.GetQ(Vm); + const IR::U128 n = ir.GetQ(Vn); + + const IR::U128 temp = ir.VectorEor(n, ir.VectorRotateLeft(32, m, 7)); + const IR::U128 temp_result = ir.VectorEor(d, temp); + const IR::U32 temp2 = [&] { + const IR::U32 rotate1 = ir.RotateRight(ir.VectorGetElement(32, temp, 0), ir.Imm8(17)); + const IR::U32 rotate2 = ir.RotateRight(rotate1, ir.Imm8(17)); + const IR::U32 rotate3 = ir.RotateRight(rotate1, ir.Imm8(9)); + + return ir.Eor(rotate1, ir.Eor(rotate2, rotate3)); + }(); + + const IR::U32 high_temp_result = ir.VectorGetElement(32, temp_result, 3); + const IR::U32 replacement = ir.Eor(high_temp_result, temp2); + const IR::U128 result = ir.VectorSetElement(32, temp_result, 3, replacement); + + ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::SM4E(Vec Vn, Vec Vd) { + ir.SetQ(Vd, SM4Hash(ir, Vn, Vd, SM4RotationType::SM4E)); + return true; +} + +bool TranslatorVisitor::SM4EKEY(Vec Vm, Vec Vn, Vec Vd) { + ir.SetQ(Vd, SM4Hash(ir, Vm, Vn, SM4RotationType::SM4EKEY)); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_shift_by_immediate.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_shift_by_immediate.cpp new file mode 100644 index 0000000000..41b0952249 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_shift_by_immediate.cpp @@ -0,0 +1,402 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/bit/bit_count.hpp> + +#include "dynarmic/common/fp/rounding_mode.h" +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { +namespace { +enum class Rounding { + None, + Round +}; + +enum class Accumulating { + None, + Accumulate +}; + +enum class Signedness { + Signed, + Unsigned +}; + +enum class Narrowing { + Truncation, + SaturateToUnsigned, + SaturateToSigned, +}; + +enum class SaturatingShiftLeftType { + Signed, + Unsigned, + SignedWithUnsignedSaturation, +}; + +enum class FloatConversionDirection { + FixedToFloat, + FloatToFixed, +}; + +IR::U128 PerformRoundingCorrection(TranslatorVisitor& v, size_t esize, u64 round_value, IR::U128 original, IR::U128 shifted) { + const IR::U128 round_const = v.ir.VectorBroadcast(esize, v.I(esize, round_value)); + const IR::U128 round_correction = v.ir.VectorEqual(esize, v.ir.VectorAnd(original, round_const), round_const); + return v.ir.VectorSub(esize, shifted, round_correction); +} + +bool ShiftRight(TranslatorVisitor& v, bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd, Rounding rounding, Accumulating accumulating, Signedness signedness) { + if (immh == 0b0000) { + return v.DecodeError(); + } + + if (immh.Bit<3>() && !Q) { + return v.ReservedValue(); + } + + const size_t esize = 8 << mcl::bit::highest_set_bit(immh.ZeroExtend()); + const size_t datasize = Q ? 128 : 64; + + const u8 shift_amount = static_cast<u8>(2 * esize) - concatenate(immh, immb).ZeroExtend<u8>(); + + const IR::U128 operand = v.V(datasize, Vn); + + IR::U128 result = [&] { + if (signedness == Signedness::Signed) { + return v.ir.VectorArithmeticShiftRight(esize, operand, shift_amount); + } + return v.ir.VectorLogicalShiftRight(esize, operand, shift_amount); + }(); + + if (rounding == Rounding::Round) { + const u64 round_value = 1ULL << (shift_amount - 1); + result = PerformRoundingCorrection(v, esize, round_value, operand, result); + } + + if (accumulating == Accumulating::Accumulate) { + const IR::U128 accumulator = v.V(datasize, Vd); + result = v.ir.VectorAdd(esize, result, accumulator); + } + + v.V(datasize, Vd, result); + return true; +} + +bool ShiftRightNarrowing(TranslatorVisitor& v, bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd, Rounding rounding, Narrowing narrowing, Signedness signedness) { + if (immh == 0b0000) { + return v.DecodeError(); + } + + if (immh.Bit<3>()) { + return v.ReservedValue(); + } + + const size_t esize = 8 << mcl::bit::highest_set_bit(immh.ZeroExtend()); + const size_t source_esize = 2 * esize; + const size_t part = Q ? 
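+    // part selects which 64-bit half of Vd receives the narrowed result: the Q form
+    // (the "...2" mnemonics) writes the upper half, the non-Q form the lower half.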
1 : 0; + + const u8 shift_amount = static_cast<u8>(source_esize - concatenate(immh, immb).ZeroExtend()); + + const IR::U128 operand = v.V(128, Vn); + + IR::U128 wide_result = [&] { + if (signedness == Signedness::Signed) { + return v.ir.VectorArithmeticShiftRight(source_esize, operand, shift_amount); + } + return v.ir.VectorLogicalShiftRight(source_esize, operand, shift_amount); + }(); + + if (rounding == Rounding::Round) { + const u64 round_value = 1ULL << (shift_amount - 1); + wide_result = PerformRoundingCorrection(v, source_esize, round_value, operand, wide_result); + } + + const IR::U128 result = [&] { + switch (narrowing) { + case Narrowing::Truncation: + return v.ir.VectorNarrow(source_esize, wide_result); + case Narrowing::SaturateToUnsigned: + if (signedness == Signedness::Signed) { + return v.ir.VectorSignedSaturatedNarrowToUnsigned(source_esize, wide_result); + } + return v.ir.VectorUnsignedSaturatedNarrow(source_esize, wide_result); + case Narrowing::SaturateToSigned: + ASSERT(signedness == Signedness::Signed); + return v.ir.VectorSignedSaturatedNarrowToSigned(source_esize, wide_result); + } + UNREACHABLE(); + }(); + + v.Vpart(64, Vd, part, result); + return true; +} + +bool ShiftLeftLong(TranslatorVisitor& v, bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd, Signedness signedness) { + if (immh == 0b0000) { + return v.DecodeError(); + } + + if (immh.Bit<3>()) { + return v.ReservedValue(); + } + + const size_t esize = 8 << mcl::bit::highest_set_bit(immh.ZeroExtend()); + const size_t datasize = 64; + const size_t part = Q ? 1 : 0; + + const u8 shift_amount = concatenate(immh, immb).ZeroExtend<u8>() - static_cast<u8>(esize); + + const IR::U128 operand = v.Vpart(datasize, Vn, part); + const IR::U128 expanded_operand = [&] { + if (signedness == Signedness::Signed) { + return v.ir.VectorSignExtend(esize, operand); + } + return v.ir.VectorZeroExtend(esize, operand); + }(); + const IR::U128 result = v.ir.VectorLogicalShiftLeft(2 * esize, expanded_operand, shift_amount); + + v.V(2 * datasize, Vd, result); + return true; +} + +bool SaturatingShiftLeft(TranslatorVisitor& v, bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd, SaturatingShiftLeftType type) { + if (!Q && immh.Bit<3>()) { + return v.ReservedValue(); + } + + const size_t esize = 8 << mcl::bit::highest_set_bit(immh.ZeroExtend()); + const size_t datasize = Q ? 128 : 64; + const size_t shift = concatenate(immh, immb).ZeroExtend() - esize; + + const IR::U128 operand = v.V(datasize, Vn); + const IR::U128 shift_vec = v.ir.VectorBroadcast(esize, v.I(esize, shift)); + const IR::U128 result = [&] { + if (type == SaturatingShiftLeftType::Signed) { + return v.ir.VectorSignedSaturatedShiftLeft(esize, operand, shift_vec); + } + + if (type == SaturatingShiftLeftType::Unsigned) { + return v.ir.VectorUnsignedSaturatedShiftLeft(esize, operand, shift_vec); + } + + return v.ir.VectorSignedSaturatedShiftLeftUnsigned(esize, operand, static_cast<u8>(shift)); + }(); + + v.V(datasize, Vd, result); + return true; +} + +bool ConvertFloat(TranslatorVisitor& v, bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd, Signedness signedness, FloatConversionDirection direction, FP::RoundingMode rounding_mode) { + if (immh == 0b0000) { + return v.DecodeError(); + } + + if (immh == 0b0001 || immh == 0b0010 || immh == 0b0011) { + return v.ReservedValue(); + } + + if (immh.Bit<3>() && !Q) { + return v.ReservedValue(); + } + + const size_t esize = 8 << mcl::bit::highest_set_bit(immh.ZeroExtend()); + const size_t datasize = Q ? 
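+    // fbits below is the number of fractional bits encoded in the immediate,
+    // i.e. (2 * esize) - UInt(immh:immb).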
128 : 64; + + const u8 fbits = static_cast<u8>(esize * 2) - concatenate(immh, immb).ZeroExtend<u8>(); + + const IR::U128 operand = v.V(datasize, Vn); + const IR::U128 result = [&] { + switch (direction) { + case FloatConversionDirection::FixedToFloat: + return signedness == Signedness::Signed + ? v.ir.FPVectorFromSignedFixed(esize, operand, fbits, rounding_mode) + : v.ir.FPVectorFromUnsignedFixed(esize, operand, fbits, rounding_mode); + case FloatConversionDirection::FloatToFixed: + return signedness == Signedness::Signed + ? v.ir.FPVectorToSignedFixed(esize, operand, fbits, rounding_mode) + : v.ir.FPVectorToUnsignedFixed(esize, operand, fbits, rounding_mode); + } + UNREACHABLE(); + }(); + + v.V(datasize, Vd, result); + return true; +} + +} // Anonymous namespace + +bool TranslatorVisitor::SSHR_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftRight(*this, Q, immh, immb, Vn, Vd, Rounding::None, Accumulating::None, Signedness::Signed); +} + +bool TranslatorVisitor::SRSHR_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftRight(*this, Q, immh, immb, Vn, Vd, Rounding::Round, Accumulating::None, Signedness::Signed); +} + +bool TranslatorVisitor::SRSRA_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftRight(*this, Q, immh, immb, Vn, Vd, Rounding::Round, Accumulating::Accumulate, Signedness::Signed); +} + +bool TranslatorVisitor::SSRA_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftRight(*this, Q, immh, immb, Vn, Vd, Rounding::None, Accumulating::Accumulate, Signedness::Signed); +} + +bool TranslatorVisitor::SHL_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + if (immh == 0b0000) { + return DecodeError(); + } + if (immh.Bit<3>() && !Q) { + return ReservedValue(); + } + const size_t esize = 8 << mcl::bit::highest_set_bit(immh.ZeroExtend()); + const size_t datasize = Q ? 
128 : 64; + + const u8 shift_amount = concatenate(immh, immb).ZeroExtend<u8>() - static_cast<u8>(esize); + + const IR::U128 operand = V(datasize, Vn); + const IR::U128 result = ir.VectorLogicalShiftLeft(esize, operand, shift_amount); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::SHRN(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftRightNarrowing(*this, Q, immh, immb, Vn, Vd, Rounding::None, Narrowing::Truncation, Signedness::Unsigned); +} + +bool TranslatorVisitor::RSHRN(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftRightNarrowing(*this, Q, immh, immb, Vn, Vd, Rounding::Round, Narrowing::Truncation, Signedness::Unsigned); +} + +bool TranslatorVisitor::SQSHL_imm_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return SaturatingShiftLeft(*this, Q, immh, immb, Vn, Vd, SaturatingShiftLeftType::Signed); +} + +bool TranslatorVisitor::SQSHLU_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return SaturatingShiftLeft(*this, Q, immh, immb, Vn, Vd, SaturatingShiftLeftType::SignedWithUnsignedSaturation); +} + +bool TranslatorVisitor::SQSHRN_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftRightNarrowing(*this, Q, immh, immb, Vn, Vd, Rounding::None, Narrowing::SaturateToSigned, Signedness::Signed); +} + +bool TranslatorVisitor::SQRSHRN_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftRightNarrowing(*this, Q, immh, immb, Vn, Vd, Rounding::Round, Narrowing::SaturateToSigned, Signedness::Signed); +} + +bool TranslatorVisitor::SQSHRUN_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftRightNarrowing(*this, Q, immh, immb, Vn, Vd, Rounding::None, Narrowing::SaturateToUnsigned, Signedness::Signed); +} + +bool TranslatorVisitor::SQRSHRUN_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftRightNarrowing(*this, Q, immh, immb, Vn, Vd, Rounding::Round, Narrowing::SaturateToUnsigned, Signedness::Signed); +} + +bool TranslatorVisitor::UQSHL_imm_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return SaturatingShiftLeft(*this, Q, immh, immb, Vn, Vd, SaturatingShiftLeftType::Unsigned); +} + +bool TranslatorVisitor::UQSHRN_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftRightNarrowing(*this, Q, immh, immb, Vn, Vd, Rounding::None, Narrowing::SaturateToUnsigned, Signedness::Unsigned); +} + +bool TranslatorVisitor::UQRSHRN_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftRightNarrowing(*this, Q, immh, immb, Vn, Vd, Rounding::Round, Narrowing::SaturateToUnsigned, Signedness::Unsigned); +} + +bool TranslatorVisitor::SSHLL(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftLeftLong(*this, Q, immh, immb, Vn, Vd, Signedness::Signed); +} + +bool TranslatorVisitor::URSHR_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftRight(*this, Q, immh, immb, Vn, Vd, Rounding::Round, Accumulating::None, Signedness::Unsigned); +} + +bool TranslatorVisitor::URSRA_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftRight(*this, Q, immh, immb, Vn, Vd, Rounding::Round, Accumulating::Accumulate, Signedness::Unsigned); +} + +bool TranslatorVisitor::USHR_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftRight(*this, Q, immh, immb, Vn, Vd, Rounding::None, Accumulating::None, Signedness::Unsigned); +} + +bool TranslatorVisitor::USRA_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftRight(*this, Q, immh, immb, Vn, Vd, Rounding::None, Accumulating::Accumulate, 
Signedness::Unsigned); +} + +bool TranslatorVisitor::USHLL(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ShiftLeftLong(*this, Q, immh, immb, Vn, Vd, Signedness::Unsigned); +} + +bool TranslatorVisitor::SRI_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + if (immh == 0b0000) { + return DecodeError(); + } + + if (!Q && immh.Bit<3>()) { + return ReservedValue(); + } + + const size_t esize = 8 << mcl::bit::highest_set_bit(immh.ZeroExtend()); + const size_t datasize = Q ? 128 : 64; + + const u8 shift_amount = static_cast<u8>((esize * 2) - concatenate(immh, immb).ZeroExtend<u8>()); + const u64 mask = shift_amount == esize ? 0 : mcl::bit::ones<u64>(esize) >> shift_amount; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vd); + + const IR::U128 shifted = ir.VectorLogicalShiftRight(esize, operand1, shift_amount); + const IR::U128 mask_vec = ir.VectorBroadcast(esize, I(esize, mask)); + const IR::U128 result = ir.VectorOr(ir.VectorAndNot(operand2, mask_vec), shifted); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::SLI_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + if (immh == 0b0000) { + return DecodeError(); + } + + if (!Q && immh.Bit<3>()) { + return ReservedValue(); + } + + const size_t esize = 8 << mcl::bit::highest_set_bit(immh.ZeroExtend()); + const size_t datasize = Q ? 128 : 64; + + const u8 shift_amount = concatenate(immh, immb).ZeroExtend<u8>() - static_cast<u8>(esize); + const u64 mask = mcl::bit::ones<u64>(esize) << shift_amount; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vd); + + const IR::U128 shifted = ir.VectorLogicalShiftLeft(esize, operand1, shift_amount); + const IR::U128 mask_vec = ir.VectorBroadcast(esize, I(esize, mask)); + const IR::U128 result = ir.VectorOr(ir.VectorAndNot(operand2, mask_vec), shifted); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::SCVTF_fix_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ConvertFloat(*this, Q, immh, immb, Vn, Vd, Signedness::Signed, FloatConversionDirection::FixedToFloat, ir.current_location->FPCR().RMode()); +} + +bool TranslatorVisitor::UCVTF_fix_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ConvertFloat(*this, Q, immh, immb, Vn, Vd, Signedness::Unsigned, FloatConversionDirection::FixedToFloat, ir.current_location->FPCR().RMode()); +} + +bool TranslatorVisitor::FCVTZS_fix_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ConvertFloat(*this, Q, immh, immb, Vn, Vd, Signedness::Signed, FloatConversionDirection::FloatToFixed, FP::RoundingMode::TowardsZero); +} + +bool TranslatorVisitor::FCVTZU_fix_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd) { + return ConvertFloat(*this, Q, immh, immb, Vn, Vd, Signedness::Unsigned, FloatConversionDirection::FloatToFixed, FP::RoundingMode::TowardsZero); +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_table_lookup.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_table_lookup.cpp new file mode 100644 index 0000000000..f267c13581 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_table_lookup.cpp @@ -0,0 +1,40 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <vector> + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +static bool TableLookup(TranslatorVisitor& v, bool Q, Vec Vm, Imm<2> len, bool is_tbl, size_t Vn, Vec Vd) { + const size_t datasize = Q ? 128 : 64; + + const IR::Table table = v.ir.VectorTable([&] { + std::vector<IR::U128> result; + for (size_t i = 0; i < len.ZeroExtend<size_t>() + 1; ++i) { + result.emplace_back(v.ir.GetQ(static_cast<Vec>((Vn + i) % 32))); + } + return result; + }()); + + const IR::U128 indicies = v.ir.GetQ(Vm); + const IR::U128 defaults = is_tbl ? v.ir.ZeroVector() : v.ir.GetQ(Vd); + + const IR::U128 result = v.ir.VectorTableLookup(defaults, table, indicies); + + v.V(datasize, Vd, datasize == 128 ? result : v.ir.VectorZeroUpper(result)); + return true; +} + +bool TranslatorVisitor::TBL(bool Q, Vec Vm, Imm<2> len, size_t Vn, Vec Vd) { + return TableLookup(*this, Q, Vm, len, true, Vn, Vd); +} + +bool TranslatorVisitor::TBX(bool Q, Vec Vm, Imm<2> len, size_t Vn, Vec Vd) { + return TableLookup(*this, Q, Vm, len, false, Vn, Vd); +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_different.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_different.cpp new file mode 100644 index 0000000000..8cc677b652 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_different.cpp @@ -0,0 +1,256 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { +namespace { +enum class AbsoluteDifferenceBehavior { + None, + Accumulate +}; + +enum class Signedness { + Signed, + Unsigned +}; + +bool AbsoluteDifferenceLong(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, AbsoluteDifferenceBehavior behavior, Signedness sign) { + if (size == 0b11) { + return v.ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = 64; + + const IR::U128 operand1 = v.ir.VectorZeroExtend(esize, v.Vpart(datasize, Vn, Q)); + const IR::U128 operand2 = v.ir.VectorZeroExtend(esize, v.Vpart(datasize, Vm, Q)); + IR::U128 result = sign == Signedness::Signed ? v.ir.VectorSignedAbsoluteDifference(esize, operand1, operand2) + : v.ir.VectorUnsignedAbsoluteDifference(esize, operand1, operand2); + + if (behavior == AbsoluteDifferenceBehavior::Accumulate) { + const IR::U128 data = v.V(2 * datasize, Vd); + result = v.ir.VectorAdd(2 * esize, result, data); + } + + v.V(2 * datasize, Vd, result); + return true; +} + +enum class MultiplyLongBehavior { + None, + Accumulate, + Subtract +}; + +bool MultiplyLong(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, MultiplyLongBehavior behavior, Signedness sign) { + if (size == 0b11) { + return v.ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t doubled_esize = 2 * esize; + const size_t datasize = 64; + const size_t doubled_datasize = datasize * 2; + + IR::U128 result = [&] { + const auto reg_n = v.Vpart(datasize, Vn, Q); + const auto reg_m = v.Vpart(datasize, Vm, Q); + + return sign == Signedness::Signed + ? 
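+            // Each esize-bit lane of the selected 64-bit half is multiplied into a
+            // 2*esize-bit lane, producing the full 128-bit destination.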
v.ir.VectorMultiplySignedWiden(esize, reg_n, reg_m) + : v.ir.VectorMultiplyUnsignedWiden(esize, reg_n, reg_m); + }(); + + if (behavior == MultiplyLongBehavior::Accumulate) { + const IR::U128 addend = v.V(doubled_datasize, Vd); + result = v.ir.VectorAdd(doubled_esize, addend, result); + } else if (behavior == MultiplyLongBehavior::Subtract) { + const IR::U128 minuend = v.V(doubled_datasize, Vd); + result = v.ir.VectorSub(doubled_esize, minuend, result); + } + + v.V(doubled_datasize, Vd, result); + return true; +} + +enum class LongOperationBehavior { + Addition, + Subtraction +}; + +bool LongOperation(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, LongOperationBehavior behavior, Signedness sign) { + if (size == 0b11) { + return v.ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t part = Q ? 1 : 0; + + const auto get_operand = [&](Vec vec) { + const IR::U128 tmp = v.Vpart(64, vec, part); + + if (sign == Signedness::Signed) { + return v.ir.VectorSignExtend(esize, tmp); + } + + return v.ir.VectorZeroExtend(esize, tmp); + }; + + const IR::U128 operand1 = get_operand(Vn); + const IR::U128 operand2 = get_operand(Vm); + const IR::U128 result = [&] { + if (behavior == LongOperationBehavior::Addition) { + return v.ir.VectorAdd(esize * 2, operand1, operand2); + } + + return v.ir.VectorSub(esize * 2, operand1, operand2); + }(); + + v.V(128, Vd, result); + return true; +} + +enum class WideOperationBehavior { + Addition, + Subtraction +}; + +bool WideOperation(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, WideOperationBehavior behavior, Signedness sign) { + if (size == 0b11) { + return v.ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t part = Q ? 1 : 0; + + const IR::U128 operand1 = v.V(128, Vn); + const IR::U128 operand2 = [&] { + const IR::U128 tmp = v.Vpart(64, Vm, part); + + if (sign == Signedness::Signed) { + return v.ir.VectorSignExtend(esize, tmp); + } + + return v.ir.VectorZeroExtend(esize, tmp); + }(); + const IR::U128 result = [&] { + if (behavior == WideOperationBehavior::Addition) { + return v.ir.VectorAdd(esize * 2, operand1, operand2); + } + + return v.ir.VectorSub(esize * 2, operand1, operand2); + }(); + + v.V(128, Vd, result); + return true; +} +} // Anonymous namespace + +bool TranslatorVisitor::PMULL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b01 || size == 0b10) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = 64; + + const IR::U128 operand1 = Vpart(datasize, Vn, Q); + const IR::U128 operand2 = Vpart(datasize, Vm, Q); + const IR::U128 result = ir.VectorPolynomialMultiplyLong(esize, operand1, operand2); + + V(128, Vd, result); + return true; +} + +bool TranslatorVisitor::SABAL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return AbsoluteDifferenceLong(*this, Q, size, Vm, Vn, Vd, AbsoluteDifferenceBehavior::Accumulate, Signedness::Signed); +} + +bool TranslatorVisitor::SABDL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return AbsoluteDifferenceLong(*this, Q, size, Vm, Vn, Vd, AbsoluteDifferenceBehavior::None, Signedness::Signed); +} + +bool TranslatorVisitor::SADDL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return LongOperation(*this, Q, size, Vm, Vn, Vd, LongOperationBehavior::Addition, Signedness::Signed); +} + +bool TranslatorVisitor::SADDW(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return WideOperation(*this, Q, size, Vm, Vn, Vd, WideOperationBehavior::Addition, 
Signedness::Signed); +} + +bool TranslatorVisitor::SMLAL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return MultiplyLong(*this, Q, size, Vm, Vn, Vd, MultiplyLongBehavior::Accumulate, Signedness::Signed); +} + +bool TranslatorVisitor::SMLSL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return MultiplyLong(*this, Q, size, Vm, Vn, Vd, MultiplyLongBehavior::Subtract, Signedness::Signed); +} + +bool TranslatorVisitor::SMULL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return MultiplyLong(*this, Q, size, Vm, Vn, Vd, MultiplyLongBehavior::None, Signedness::Signed); +} + +bool TranslatorVisitor::SSUBW(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return WideOperation(*this, Q, size, Vm, Vn, Vd, WideOperationBehavior::Subtraction, Signedness::Signed); +} + +bool TranslatorVisitor::SSUBL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return LongOperation(*this, Q, size, Vm, Vn, Vd, LongOperationBehavior::Subtraction, Signedness::Signed); +} + +bool TranslatorVisitor::UADDL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return LongOperation(*this, Q, size, Vm, Vn, Vd, LongOperationBehavior::Addition, Signedness::Unsigned); +} + +bool TranslatorVisitor::UABAL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return AbsoluteDifferenceLong(*this, Q, size, Vm, Vn, Vd, AbsoluteDifferenceBehavior::Accumulate, Signedness::Unsigned); +} + +bool TranslatorVisitor::UABDL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return AbsoluteDifferenceLong(*this, Q, size, Vm, Vn, Vd, AbsoluteDifferenceBehavior::None, Signedness::Unsigned); +} + +bool TranslatorVisitor::UADDW(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return WideOperation(*this, Q, size, Vm, Vn, Vd, WideOperationBehavior::Addition, Signedness::Unsigned); +} + +bool TranslatorVisitor::UMLAL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return MultiplyLong(*this, Q, size, Vm, Vn, Vd, MultiplyLongBehavior::Accumulate, Signedness::Unsigned); +} + +bool TranslatorVisitor::UMLSL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return MultiplyLong(*this, Q, size, Vm, Vn, Vd, MultiplyLongBehavior::Subtract, Signedness::Unsigned); +} + +bool TranslatorVisitor::UMULL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return MultiplyLong(*this, Q, size, Vm, Vn, Vd, MultiplyLongBehavior::None, Signedness::Unsigned); +} + +bool TranslatorVisitor::USUBW(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return WideOperation(*this, Q, size, Vm, Vn, Vd, WideOperationBehavior::Subtraction, Signedness::Unsigned); +} + +bool TranslatorVisitor::USUBL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return LongOperation(*this, Q, size, Vm, Vn, Vd, LongOperationBehavior::Subtraction, Signedness::Unsigned); +} + +bool TranslatorVisitor::SQDMULL_vec_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b00 || size == 0b11) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t part = Q ? 
1 : 0; + + const IR::U128 operand1 = Vpart(64, Vn, part); + const IR::U128 operand2 = Vpart(64, Vm, part); + const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiplyLong(esize, operand1, operand2); + + V(128, Vd, result); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_same.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_same.cpp new file mode 100644 index 0000000000..5c8bf13aeb --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_same.cpp @@ -0,0 +1,1240 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { +namespace { +enum class Operation { + Add, + Subtract, +}; + +enum class ExtraBehavior { + None, + Round +}; + +bool HighNarrowingOperation(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, Operation op, ExtraBehavior behavior) { + if (size == 0b11) { + return v.ReservedValue(); + } + + const size_t part = Q; + const size_t esize = 8 << size.ZeroExtend(); + const size_t doubled_esize = 2 * esize; + + const IR::U128 operand1 = v.ir.GetQ(Vn); + const IR::U128 operand2 = v.ir.GetQ(Vm); + IR::U128 wide = [&] { + if (op == Operation::Add) { + return v.ir.VectorAdd(doubled_esize, operand1, operand2); + } + return v.ir.VectorSub(doubled_esize, operand1, operand2); + }(); + + if (behavior == ExtraBehavior::Round) { + const u64 round_const = 1ULL << (esize - 1); + const IR::U128 round_operand = v.ir.VectorBroadcast(doubled_esize, v.I(doubled_esize, round_const)); + wide = v.ir.VectorAdd(doubled_esize, wide, round_operand); + } + + const IR::U128 result = v.ir.VectorNarrow(doubled_esize, + v.ir.VectorLogicalShiftRight(doubled_esize, wide, static_cast<u8>(esize))); + + v.Vpart(64, Vd, part, result); + return true; +} + +enum class AbsDiffExtraBehavior { + None, + Accumulate +}; + +bool SignedAbsoluteDifference(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, AbsDiffExtraBehavior behavior) { + if (size == 0b11) { + return v.ReservedValue(); + } + + const size_t datasize = Q ? 128 : 64; + const size_t esize = 8 << size.ZeroExtend(); + + const IR::U128 operand1 = v.V(datasize, Vn); + const IR::U128 operand2 = v.V(datasize, Vm); + const IR::U128 result = [&] { + const IR::U128 tmp = v.ir.VectorSignedAbsoluteDifference(esize, operand1, operand2); + + if (behavior == AbsDiffExtraBehavior::Accumulate) { + const IR::U128 d = v.V(datasize, Vd); + return v.ir.VectorAdd(esize, d, tmp); + } + + return tmp; + }(); + + v.V(datasize, Vd, result); + return true; +} + +enum class Signedness { + Signed, + Unsigned +}; + +bool RoundingHalvingAdd(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, Signedness sign) { + if (size == 0b11) { + return v.ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = v.V(datasize, Vm); + const IR::U128 operand2 = v.V(datasize, Vn); + const IR::U128 result = sign == Signedness::Signed ? 
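+    // SRHADD/URHADD: rounding halving add, i.e. (a + b + 1) >> 1 per element.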
v.ir.VectorRoundingHalvingAddSigned(esize, operand1, operand2) + : v.ir.VectorRoundingHalvingAddUnsigned(esize, operand1, operand2); + + v.V(datasize, Vd, result); + return true; +} + +bool RoundingShiftLeft(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, Signedness sign) { + if (size == 0b11 && !Q) { + return v.ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = v.V(datasize, Vn); + const IR::U128 operand2 = v.V(datasize, Vm); + const IR::U128 result = [&] { + if (sign == Signedness::Signed) { + return v.ir.VectorRoundingShiftLeftSigned(esize, operand1, operand2); + } + + return v.ir.VectorRoundingShiftLeftUnsigned(esize, operand1, operand2); + }(); + + v.V(datasize, Vd, result); + return true; +} + +enum class ComparisonType { + EQ, + GE, + AbsoluteGE, + GT, + AbsoluteGT +}; + +bool FPCompareRegister(TranslatorVisitor& v, bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd, ComparisonType type) { + if (sz && !Q) { + return v.ReservedValue(); + } + + const size_t esize = sz ? 64 : 32; + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = v.V(datasize, Vn); + const IR::U128 operand2 = v.V(datasize, Vm); + const IR::U128 result = [&] { + switch (type) { + case ComparisonType::EQ: + return v.ir.FPVectorEqual(esize, operand1, operand2); + case ComparisonType::GE: + return v.ir.FPVectorGreaterEqual(esize, operand1, operand2); + case ComparisonType::AbsoluteGE: + return v.ir.FPVectorGreaterEqual(esize, + v.ir.FPVectorAbs(esize, operand1), + v.ir.FPVectorAbs(esize, operand2)); + case ComparisonType::GT: + return v.ir.FPVectorGreater(esize, operand1, operand2); + case ComparisonType::AbsoluteGT: + return v.ir.FPVectorGreater(esize, + v.ir.FPVectorAbs(esize, operand1), + v.ir.FPVectorAbs(esize, operand2)); + } + + UNREACHABLE(); + }(); + + v.V(datasize, Vd, result); + return true; +} + +enum class MinMaxOperation { + Min, + Max, +}; + +bool VectorMinMaxOperation(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, MinMaxOperation operation, Signedness sign) { + if (size == 0b11) { + return v.ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = v.V(datasize, Vn); + const IR::U128 operand2 = v.V(datasize, Vm); + const IR::U128 result = [&] { + switch (operation) { + case MinMaxOperation::Max: + if (sign == Signedness::Signed) { + return v.ir.VectorMaxSigned(esize, operand1, operand2); + } + return v.ir.VectorMaxUnsigned(esize, operand1, operand2); + + case MinMaxOperation::Min: + if (sign == Signedness::Signed) { + return v.ir.VectorMinSigned(esize, operand1, operand2); + } + return v.ir.VectorMinUnsigned(esize, operand1, operand2); + + default: + UNREACHABLE(); + } + }(); + + v.V(datasize, Vd, result); + return true; +} + +bool FPMinMaxOperation(TranslatorVisitor& v, bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd, MinMaxOperation operation) { + if (sz && !Q) { + return v.ReservedValue(); + } + + const size_t esize = sz ? 64 : 32; + const size_t datasize = Q ? 
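+    // This helper implements the NaN-propagating FMAX/FMIN behaviour; the numeric
+    // (FMAXNM/FMINNM-style) helper below prefers the numeric operand when exactly
+    // one input is a quiet NaN.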
128 : 64; + + const IR::U128 operand1 = v.V(datasize, Vn); + const IR::U128 operand2 = v.V(datasize, Vm); + const IR::U128 result = [&] { + if (operation == MinMaxOperation::Min) { + return v.ir.FPVectorMin(esize, operand1, operand2); + } + + return v.ir.FPVectorMax(esize, operand1, operand2); + }(); + + v.V(datasize, Vd, result); + return true; +} + +bool FPMinMaxNumericOperation(TranslatorVisitor& v, bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd, MinMaxOperation operation) { + if (sz && !Q) { + return v.ReservedValue(); + } + + const size_t esize = sz ? 64 : 32; + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = v.V(datasize, Vn); + const IR::U128 operand2 = v.V(datasize, Vm); + const IR::U128 result = [&] { + if (operation == MinMaxOperation::Min) { + return v.ir.FPVectorMinNumeric(esize, operand1, operand2); + } + + return v.ir.FPVectorMaxNumeric(esize, operand1, operand2); + }(); + + v.V(datasize, Vd, result); + return true; +} + +bool PairedMinMaxOperation(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, MinMaxOperation operation, Signedness sign) { + if (size == 0b11) { + return v.ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = v.V(datasize, Vn); + const IR::U128 operand2 = v.V(datasize, Vm); + IR::U128 result = [&] { + switch (operation) { + case MinMaxOperation::Max: + if (sign == Signedness::Signed) { + return Q ? v.ir.VectorPairedMaxSigned(esize, operand1, operand2) : v.ir.VectorPairedMaxSignedLower(esize, operand1, operand2); + } + return Q ? v.ir.VectorPairedMaxUnsigned(esize, operand1, operand2) : v.ir.VectorPairedMaxUnsignedLower(esize, operand1, operand2); + + case MinMaxOperation::Min: + if (sign == Signedness::Signed) { + return Q ? v.ir.VectorPairedMinSigned(esize, operand1, operand2) : v.ir.VectorPairedMinSignedLower(esize, operand1, operand2); + } + return Q ? v.ir.VectorPairedMinUnsigned(esize, operand1, operand2) : v.ir.VectorPairedMinUnsignedLower(esize, operand1, operand2); + + default: + UNREACHABLE(); + } + }(); + + v.V(datasize, Vd, result); + return true; +} + +bool FPPairedMinMax(TranslatorVisitor& v, bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd, IR::U32U64 (IREmitter::*fn)(const IR::U32U64&, const IR::U32U64&)) { + if (sz && !Q) { + return v.ReservedValue(); + } + + const size_t esize = sz ? 64 : 32; + const size_t datasize = Q ? 128 : 64; + const size_t elements = datasize / esize; + const size_t boundary = elements / 2; + + const IR::U128 operand1 = v.V(datasize, Vn); + const IR::U128 operand2 = v.V(datasize, Vm); + IR::U128 result = v.ir.ZeroVector(); + + const auto operation = [&](IR::U128 operand, size_t result_start_index) { + for (size_t i = 0; i < elements; i += 2, result_start_index++) { + const IR::UAny elem1 = v.ir.VectorGetElement(esize, operand, i); + const IR::UAny elem2 = v.ir.VectorGetElement(esize, operand, i + 1); + const IR::UAny result_elem = (v.ir.*fn)(elem1, elem2); + + result = v.ir.VectorSetElement(esize, result, result_start_index, result_elem); + } + }; + + operation(operand1, 0); + operation(operand2, boundary); + + v.V(datasize, Vd, result); + return true; +} + +bool SaturatingArithmeticOperation(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, Operation op, Signedness sign) { + if (size == 0b11 && !Q) { + return v.ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 
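+    // SQADD/UQADD and SQSUB/UQSUB: element-wise add/subtract that saturates to the
+    // element's signed/unsigned range instead of wrapping.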
128 : 64; + + const IR::U128 operand1 = v.V(datasize, Vn); + const IR::U128 operand2 = v.V(datasize, Vm); + + const IR::U128 result = [&] { + if (sign == Signedness::Signed) { + if (op == Operation::Add) { + return v.ir.VectorSignedSaturatedAdd(esize, operand1, operand2); + } + + return v.ir.VectorSignedSaturatedSub(esize, operand1, operand2); + } + + if (op == Operation::Add) { + return v.ir.VectorUnsignedSaturatedAdd(esize, operand1, operand2); + } + + return v.ir.VectorUnsignedSaturatedSub(esize, operand1, operand2); + }(); + + v.V(datasize, Vd, result); + return true; +} + +bool SaturatingShiftLeft(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, Signedness sign) { + if (size == 0b11 && !Q) { + return v.ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = v.V(datasize, Vn); + const IR::U128 operand2 = v.V(datasize, Vm); + const IR::U128 result = [&] { + if (sign == Signedness::Signed) { + return v.ir.VectorSignedSaturatedShiftLeft(esize, operand1, operand2); + } + + return v.ir.VectorUnsignedSaturatedShiftLeft(esize, operand1, operand2); + }(); + + v.V(datasize, Vd, result); + return true; +} + +} // Anonymous namespace + +bool TranslatorVisitor::CMGT_reg_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b11 && !Q) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend<size_t>(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = ir.VectorGreaterSigned(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::CMGE_reg_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b11 && !Q) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend<size_t>(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + IR::U128 result = ir.VectorGreaterEqualSigned(esize, operand1, operand2); + if (datasize == 64) { + result = ir.VectorZeroUpper(result); + } + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::SABA(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return SignedAbsoluteDifference(*this, Q, size, Vm, Vn, Vd, AbsDiffExtraBehavior::Accumulate); +} + +bool TranslatorVisitor::SABD(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return SignedAbsoluteDifference(*this, Q, size, Vm, Vn, Vd, AbsDiffExtraBehavior::None); +} + +bool TranslatorVisitor::SMAX(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return VectorMinMaxOperation(*this, Q, size, Vm, Vn, Vd, MinMaxOperation::Max, Signedness::Signed); +} + +bool TranslatorVisitor::SMAXP(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return PairedMinMaxOperation(*this, Q, size, Vm, Vn, Vd, MinMaxOperation::Max, Signedness::Signed); +} + +bool TranslatorVisitor::SMIN(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return VectorMinMaxOperation(*this, Q, size, Vm, Vn, Vd, MinMaxOperation::Min, Signedness::Signed); +} + +bool TranslatorVisitor::SMINP(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return PairedMinMaxOperation(*this, Q, size, Vm, Vn, Vd, MinMaxOperation::Min, Signedness::Signed); +} + +bool TranslatorVisitor::SQDMULH_vec_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b00 || size == 0b11) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 
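+    // SQDMULH: per element, the saturated high half of the doubled product,
+    // i.e. sat((2 * a * b) >> esize).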
128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiplyHigh(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::SQRDMULH_vec_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b00 || size == 0b11) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiplyHighRounding(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::ADD_vector(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b11 && !Q) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend<size_t>(); + const size_t datasize = Q ? 128 : 64; + + const auto operand1 = V(datasize, Vn); + const auto operand2 = V(datasize, Vm); + const auto result = ir.VectorAdd(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::MLA_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b11) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend<size_t>(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 operand3 = V(datasize, Vd); + const IR::U128 result = ir.VectorAdd(esize, ir.VectorMultiply(esize, operand1, operand2), operand3); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::MUL_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b11) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend<size_t>(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = ir.VectorMultiply(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::ADDHN(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return HighNarrowingOperation(*this, Q, size, Vm, Vn, Vd, Operation::Add, ExtraBehavior::None); +} + +bool TranslatorVisitor::RADDHN(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return HighNarrowingOperation(*this, Q, size, Vm, Vn, Vd, Operation::Add, ExtraBehavior::Round); +} + +bool TranslatorVisitor::SUBHN(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return HighNarrowingOperation(*this, Q, size, Vm, Vn, Vd, Operation::Subtract, ExtraBehavior::None); +} + +bool TranslatorVisitor::RSUBHN(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return HighNarrowingOperation(*this, Q, size, Vm, Vn, Vd, Operation::Subtract, ExtraBehavior::Round); +} + +bool TranslatorVisitor::SHADD(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b11) { + return ReservedValue(); + } + + const size_t datasize = Q ? 128 : 64; + const size_t esize = 8 << size.ZeroExtend(); + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = ir.VectorHalvingAddSigned(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::SHSUB(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b11) { + return ReservedValue(); + } + + const size_t datasize = Q ? 
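+    // SHSUB: signed halving subtract, (a - b) >> 1 per element with no rounding.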
128 : 64; + const size_t esize = 8 << size.ZeroExtend(); + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = ir.VectorHalvingSubSigned(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::SQADD_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return SaturatingArithmeticOperation(*this, Q, size, Vm, Vn, Vd, Operation::Add, Signedness::Signed); +} + +bool TranslatorVisitor::SQSUB_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return SaturatingArithmeticOperation(*this, Q, size, Vm, Vn, Vd, Operation::Subtract, Signedness::Signed); +} + +bool TranslatorVisitor::SRHADD(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return RoundingHalvingAdd(*this, Q, size, Vm, Vn, Vd, Signedness::Signed); +} + +bool TranslatorVisitor::UHADD(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b11) { + return ReservedValue(); + } + + const size_t datasize = Q ? 128 : 64; + const size_t esize = 8 << size.ZeroExtend(); + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = ir.VectorHalvingAddUnsigned(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::UHSUB(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b11) { + return ReservedValue(); + } + + const size_t datasize = Q ? 128 : 64; + const size_t esize = 8 << size.ZeroExtend(); + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = ir.VectorHalvingSubUnsigned(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::UQADD_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return SaturatingArithmeticOperation(*this, Q, size, Vm, Vn, Vd, Operation::Add, Signedness::Unsigned); +} + +bool TranslatorVisitor::UQSUB_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return SaturatingArithmeticOperation(*this, Q, size, Vm, Vn, Vd, Operation::Subtract, Signedness::Unsigned); +} + +bool TranslatorVisitor::URHADD(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return RoundingHalvingAdd(*this, Q, size, Vm, Vn, Vd, Signedness::Unsigned); +} + +bool TranslatorVisitor::ADDP_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b11 && !Q) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend<size_t>(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = Q ? ir.VectorPairedAdd(esize, operand1, operand2) : ir.VectorPairedAddLower(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FABD_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { + if (sz && !Q) { + return ReservedValue(); + } + + const size_t datasize = Q ? 128 : 64; + const size_t esize = sz ? 
64 : 32; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = ir.FPVectorAbs(esize, ir.FPVectorSub(esize, operand1, operand2)); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FACGE_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { + return FPCompareRegister(*this, Q, sz, Vm, Vn, Vd, ComparisonType::AbsoluteGE); +} + +bool TranslatorVisitor::FACGT_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { + return FPCompareRegister(*this, Q, sz, Vm, Vn, Vd, ComparisonType::AbsoluteGT); +} + +bool TranslatorVisitor::FADD_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { + if (sz && !Q) { + return ReservedValue(); + } + + const size_t esize = sz ? 64 : 32; + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = ir.FPVectorAdd(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FMLA_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd) { + const size_t datasize = Q ? 128 : 64; + const size_t esize = 16; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 operand3 = V(datasize, Vd); + const IR::U128 result = ir.FPVectorMulAdd(esize, operand3, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FMLA_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { + if (sz && !Q) { + return ReservedValue(); + } + + const size_t esize = sz ? 64 : 32; + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 operand3 = V(datasize, Vd); + const IR::U128 result = ir.FPVectorMulAdd(esize, operand3, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FMLS_vec_1(bool Q, Vec Vm, Vec Vn, Vec Vd) { + const size_t datasize = Q ? 128 : 64; + const size_t esize = 16; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 operand3 = V(datasize, Vd); + const IR::U128 result = ir.FPVectorMulAdd(esize, operand3, ir.FPVectorNeg(esize, operand1), operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FMLS_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { + if (sz && !Q) { + return ReservedValue(); + } + + const size_t esize = sz ? 64 : 32; + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 operand3 = V(datasize, Vd); + const IR::U128 result = ir.FPVectorMulAdd(esize, operand3, ir.FPVectorNeg(esize, operand1), operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FCMEQ_reg_3(bool Q, Vec Vm, Vec Vn, Vec Vd) { + const size_t datasize = Q ? 
128 : 64; + + const IR::U128 lhs = V(datasize, Vn); + const IR::U128 rhs = V(datasize, Vm); + const IR::U128 result = ir.FPVectorEqual(16, lhs, rhs); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FCMEQ_reg_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { + return FPCompareRegister(*this, Q, sz, Vm, Vn, Vd, ComparisonType::EQ); +} + +bool TranslatorVisitor::FCMGE_reg_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { + return FPCompareRegister(*this, Q, sz, Vm, Vn, Vd, ComparisonType::GE); +} + +bool TranslatorVisitor::FCMGT_reg_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { + return FPCompareRegister(*this, Q, sz, Vm, Vn, Vd, ComparisonType::GT); +} + +bool TranslatorVisitor::AND_asimd(bool Q, Vec Vm, Vec Vn, Vec Vd) { + const size_t datasize = Q ? 128 : 64; + + const auto operand1 = V(datasize, Vn); + const auto operand2 = V(datasize, Vm); + const auto result = ir.VectorAnd(operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::BIC_asimd_reg(bool Q, Vec Vm, Vec Vn, Vec Vd) { + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + + IR::U128 result = ir.VectorAndNot(operand1, operand2); + if (datasize == 64) { + result = ir.VectorZeroUpper(result); + } + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::CMHI_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b11 && !Q) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend<size_t>(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = ir.VectorGreaterUnsigned(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::CMHS_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b11 && !Q) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend<size_t>(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + IR::U128 result = ir.VectorGreaterEqualUnsigned(esize, operand1, operand2); + if (datasize == 64) { + result = ir.VectorZeroUpper(result); + } + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::CMTST_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b11 && !Q) { + return ReservedValue(); + } + + const size_t datasize = Q ? 128 : 64; + const size_t esize = 8 << size.ZeroExtend(); + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 anded = ir.VectorAnd(operand1, operand2); + const IR::U128 result = ir.VectorNot(ir.VectorEqual(esize, anded, ir.ZeroVector())); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::SQSHL_reg_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return SaturatingShiftLeft(*this, Q, size, Vm, Vn, Vd, Signedness::Signed); +} + +bool TranslatorVisitor::SRSHL_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return RoundingShiftLeft(*this, Q, size, Vm, Vn, Vd, Signedness::Signed); +} + +bool TranslatorVisitor::SSHL_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b11 && !Q) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 
128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = ir.VectorArithmeticVShift(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::UQSHL_reg_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return SaturatingShiftLeft(*this, Q, size, Vm, Vn, Vd, Signedness::Unsigned); +} + +bool TranslatorVisitor::URSHL_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return RoundingShiftLeft(*this, Q, size, Vm, Vn, Vd, Signedness::Unsigned); +} + +bool TranslatorVisitor::USHL_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b11 && !Q) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend<size_t>(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = ir.VectorLogicalVShift(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::UMAX(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return VectorMinMaxOperation(*this, Q, size, Vm, Vn, Vd, MinMaxOperation::Max, Signedness::Unsigned); +} + +bool TranslatorVisitor::UMAXP(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return PairedMinMaxOperation(*this, Q, size, Vm, Vn, Vd, MinMaxOperation::Max, Signedness::Unsigned); +} + +bool TranslatorVisitor::UABA(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b11) { + return ReservedValue(); + } + + const size_t datasize = Q ? 128 : 64; + const size_t esize = 8 << size.ZeroExtend(); + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 initial_dest = V(datasize, Vd); + + const IR::U128 result = ir.VectorAdd(esize, initial_dest, + ir.VectorUnsignedAbsoluteDifference(esize, operand1, operand2)); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::UABD(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b11) { + return ReservedValue(); + } + + const size_t datasize = Q ? 128 : 64; + const size_t esize = 8 << size.ZeroExtend(); + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = ir.VectorUnsignedAbsoluteDifference(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::UMIN(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return VectorMinMaxOperation(*this, Q, size, Vm, Vn, Vd, MinMaxOperation::Min, Signedness::Unsigned); +} + +bool TranslatorVisitor::UMINP(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return PairedMinMaxOperation(*this, Q, size, Vm, Vn, Vd, MinMaxOperation::Min, Signedness::Unsigned); +} + +bool TranslatorVisitor::FSUB_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { + if (sz && !Q) { + return ReservedValue(); + } + + const size_t esize = sz ? 64 : 32; + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = ir.FPVectorSub(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FRECPS_3(bool Q, Vec Vm, Vec Vn, Vec Vd) { + const size_t esize = 16; + const size_t datasize = Q ? 
128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = ir.FPVectorRecipStepFused(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FRECPS_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { + if (sz && !Q) { + return ReservedValue(); + } + + const size_t esize = sz ? 64 : 32; + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = ir.FPVectorRecipStepFused(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FRSQRTS_3(bool Q, Vec Vm, Vec Vn, Vec Vd) { + const size_t esize = 16; + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = ir.FPVectorRSqrtStepFused(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FRSQRTS_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { + if (sz && !Q) { + return ReservedValue(); + } + + const size_t esize = sz ? 64 : 32; + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = ir.FPVectorRSqrtStepFused(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::ORR_asimd_reg(bool Q, Vec Vm, Vec Vn, Vec Vd) { + const size_t datasize = Q ? 128 : 64; + + const auto operand1 = V(datasize, Vn); + const auto operand2 = V(datasize, Vm); + const auto result = ir.VectorOr(operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::ORN_asimd(bool Q, Vec Vm, Vec Vn, Vec Vd) { + const size_t datasize = Q ? 128 : 64; + + const auto operand1 = V(datasize, Vn); + const auto operand2 = V(datasize, Vm); + + auto result = ir.VectorOr(operand1, ir.VectorNot(operand2)); + if (datasize == 64) { + result = ir.VectorZeroUpper(result); + } + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::PMUL(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size != 0b00) { + return ReservedValue(); + } + + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = ir.VectorPolynomialMultiply(operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::SUB_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b11 && !Q) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend<size_t>(); + const size_t datasize = Q ? 128 : 64; + + const auto operand1 = V(datasize, Vn); + const auto operand2 = V(datasize, Vm); + const auto result = ir.VectorSub(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::CMEQ_reg_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b11 && !Q) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend<size_t>(); + const size_t datasize = Q ? 
128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + + IR::U128 result = ir.VectorEqual(esize, operand1, operand2); + if (datasize == 64) { + result = ir.VectorZeroUpper(result); + } + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::MLS_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + if (size == 0b11) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend<size_t>(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 operand3 = V(datasize, Vd); + + const IR::U128 result = ir.VectorSub(esize, operand3, ir.VectorMultiply(esize, operand1, operand2)); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::EOR_asimd(bool Q, Vec Vm, Vec Vn, Vec Vd) { + const size_t datasize = Q ? 128 : 64; + + const auto operand1 = V(datasize, Vn); + const auto operand2 = V(datasize, Vm); + const auto result = ir.VectorEor(operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FMAX_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { + return FPMinMaxOperation(*this, Q, sz, Vm, Vn, Vd, MinMaxOperation::Max); +} + +bool TranslatorVisitor::FMAXNM_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { + return FPMinMaxNumericOperation(*this, Q, sz, Vm, Vn, Vd, MinMaxOperation::Max); +} + +bool TranslatorVisitor::FMAXNMP_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { + return FPPairedMinMax(*this, Q, sz, Vm, Vn, Vd, &IREmitter::FPMaxNumeric); +} + +bool TranslatorVisitor::FMAXP_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { + return FPPairedMinMax(*this, Q, sz, Vm, Vn, Vd, &IREmitter::FPMax); +} + +bool TranslatorVisitor::FMIN_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { + return FPMinMaxOperation(*this, Q, sz, Vm, Vn, Vd, MinMaxOperation::Min); +} + +bool TranslatorVisitor::FMINNM_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { + return FPMinMaxNumericOperation(*this, Q, sz, Vm, Vn, Vd, MinMaxOperation::Min); +} + +bool TranslatorVisitor::FMINNMP_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { + return FPPairedMinMax(*this, Q, sz, Vm, Vn, Vd, &IREmitter::FPMinNumeric); +} + +bool TranslatorVisitor::FMINP_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { + return FPPairedMinMax(*this, Q, sz, Vm, Vn, Vd, &IREmitter::FPMin); +} + +bool TranslatorVisitor::FADDP_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { + if (sz && !Q) { + return ReservedValue(); + } + + const size_t esize = sz ? 64 : 32; + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = Q ? ir.FPVectorPairedAdd(esize, operand1, operand2) : ir.FPVectorPairedAddLower(esize, operand1, operand2); + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FMUL_vec_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { + if (sz && !Q) { + return ReservedValue(); + } + + const size_t esize = sz ? 64 : 32; + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = ir.FPVectorMul(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FMULX_vec_4(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { + if (sz && !Q) { + return ReservedValue(); + } + + const size_t esize = sz ? 64 : 32; + const size_t datasize = Q ? 
128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 result = ir.FPVectorMulX(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FDIV_2(bool Q, bool sz, Vec Vm, Vec Vn, Vec Vd) { + if (sz && !Q) { + return ReservedValue(); + } + + const size_t esize = sz ? 64 : 32; + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + IR::U128 result = ir.FPVectorDiv(esize, operand1, operand2); + if (datasize == 64) { + result = ir.VectorZeroUpper(result); + } + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::BIF(bool Q, Vec Vm, Vec Vn, Vec Vd) { + const size_t datasize = Q ? 128 : 64; + + const auto operand1 = V(datasize, Vd); + const auto operand4 = V(datasize, Vn); + const auto operand3 = ir.VectorNot(V(datasize, Vm)); + const auto result = ir.VectorEor(operand1, ir.VectorAnd(ir.VectorEor(operand1, operand4), operand3)); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::BIT(bool Q, Vec Vm, Vec Vn, Vec Vd) { + const size_t datasize = Q ? 128 : 64; + + const auto operand1 = V(datasize, Vd); + const auto operand4 = V(datasize, Vn); + const auto operand3 = V(datasize, Vm); + const auto result = ir.VectorEor(operand1, ir.VectorAnd(ir.VectorEor(operand1, operand4), operand3)); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::BSL(bool Q, Vec Vm, Vec Vn, Vec Vd) { + const size_t datasize = Q ? 128 : 64; + + const auto operand4 = V(datasize, Vn); + const auto operand1 = V(datasize, Vm); + const auto operand3 = V(datasize, Vd); + const auto result = ir.VectorEor(operand1, ir.VectorAnd(ir.VectorEor(operand1, operand4), operand3)); + + V(datasize, Vd, result); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_same_extra.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_same_extra.cpp new file mode 100644 index 0000000000..d18f4b10cb --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_same_extra.cpp @@ -0,0 +1,174 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { +namespace { + +using ExtensionFunction = IR::U32 (IREmitter::*)(const IR::UAny&); + +bool DotProduct(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd, ExtensionFunction extension) { + if (size != 0b10) { + return v.ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 
128 : 64; + const size_t elements = datasize / esize; + + const IR::U128 operand1 = v.V(datasize, Vn); + const IR::U128 operand2 = v.V(datasize, Vm); + IR::U128 result = v.V(datasize, Vd); + + for (size_t i = 0; i < elements; i++) { + IR::U32 res_element = v.ir.Imm32(0); + + for (size_t j = 0; j < 4; j++) { + const IR::U32 elem1 = (v.ir.*extension)(v.ir.VectorGetElement(8, operand1, 4 * i + j)); + const IR::U32 elem2 = (v.ir.*extension)(v.ir.VectorGetElement(8, operand2, 4 * i + j)); + + res_element = v.ir.Add(res_element, v.ir.Mul(elem1, elem2)); + } + + res_element = v.ir.Add(v.ir.VectorGetElement(32, result, i), res_element); + result = v.ir.VectorSetElement(32, result, i, res_element); + } + + v.V(datasize, Vd, result); + return true; +} + +} // Anonymous namespace + +bool TranslatorVisitor::SDOT_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return DotProduct(*this, Q, size, Vm, Vn, Vd, &IREmitter::SignExtendToWord); +} + +bool TranslatorVisitor::UDOT_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) { + return DotProduct(*this, Q, size, Vm, Vn, Vd, &IREmitter::ZeroExtendToWord); +} + +bool TranslatorVisitor::FCMLA_vec(bool Q, Imm<2> size, Vec Vm, Imm<2> rot, Vec Vn, Vec Vd) { + if (size == 0) { + return ReservedValue(); + } + + if (!Q && size == 0b11) { + return ReservedValue(); + } + + const size_t esize = 8U << size.ZeroExtend(); + + // TODO: Currently we don't support half-precision floating point + if (esize == 16) { + return InterpretThisInstruction(); + } + + const size_t datasize = Q ? 128 : 64; + const size_t num_elements = datasize / esize; + const size_t num_iterations = num_elements / 2; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 operand3 = V(datasize, Vd); + IR::U128 result = ir.ZeroVector(); + + IR::U32U64 element1; + IR::U32U64 element2; + IR::U32U64 element3; + IR::U32U64 element4; + for (size_t e = 0; e < num_iterations; ++e) { + const size_t first = e * 2; + const size_t second = first + 1; + + switch (rot.ZeroExtend()) { + case 0b00: // 0 degrees + element1 = ir.VectorGetElement(esize, operand2, first); + element2 = ir.VectorGetElement(esize, operand1, first); + element3 = ir.VectorGetElement(esize, operand2, second); + element4 = ir.VectorGetElement(esize, operand1, first); + break; + case 0b01: // 90 degrees + element1 = ir.FPNeg(ir.VectorGetElement(esize, operand2, second)); + element2 = ir.VectorGetElement(esize, operand1, second); + element3 = ir.VectorGetElement(esize, operand2, first); + element4 = ir.VectorGetElement(esize, operand1, second); + break; + case 0b10: // 180 degrees + element1 = ir.FPNeg(ir.VectorGetElement(esize, operand2, first)); + element2 = ir.VectorGetElement(esize, operand1, first); + element3 = ir.FPNeg(ir.VectorGetElement(esize, operand2, second)); + element4 = ir.VectorGetElement(esize, operand1, first); + break; + case 0b11: // 270 degrees + element1 = ir.VectorGetElement(esize, operand2, second); + element2 = ir.VectorGetElement(esize, operand1, second); + element3 = ir.FPNeg(ir.VectorGetElement(esize, operand2, first)); + element4 = ir.VectorGetElement(esize, operand1, second); + break; + } + + const IR::U32U64 operand3_elem1 = ir.VectorGetElement(esize, operand3, first); + const IR::U32U64 operand3_elem2 = ir.VectorGetElement(esize, operand3, second); + + result = ir.VectorSetElement(esize, result, first, ir.FPMulAdd(operand3_elem1, element2, element1)); + result = ir.VectorSetElement(esize, result, second, ir.FPMulAdd(operand3_elem2, element4, element3)); + } + 
+ ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::FCADD_vec(bool Q, Imm<2> size, Vec Vm, Imm<1> rot, Vec Vn, Vec Vd) { + if (size == 0) { + return ReservedValue(); + } + + if (!Q && size == 0b11) { + return ReservedValue(); + } + + const size_t esize = 8U << size.ZeroExtend(); + + // TODO: Currently we don't support half-precision floating point + if (esize == 16) { + return InterpretThisInstruction(); + } + + const size_t datasize = Q ? 128 : 64; + const size_t num_elements = datasize / esize; + const size_t num_iterations = num_elements / 2; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + IR::U128 result = ir.ZeroVector(); + + IR::U32U64 element1; + IR::U32U64 element3; + for (size_t e = 0; e < num_iterations; ++e) { + const size_t first = e * 2; + const size_t second = first + 1; + + if (rot == 0) { + element1 = ir.FPNeg(ir.VectorGetElement(esize, operand2, second)); + element3 = ir.VectorGetElement(esize, operand2, first); + } else if (rot == 1) { + element1 = ir.VectorGetElement(esize, operand2, second); + element3 = ir.FPNeg(ir.VectorGetElement(esize, operand2, first)); + } + + const IR::U32U64 operand1_elem1 = ir.VectorGetElement(esize, operand1, first); + const IR::U32U64 operand1_elem3 = ir.VectorGetElement(esize, operand1, second); + + result = ir.VectorSetElement(esize, result, first, ir.FPAdd(operand1_elem1, element1)); + result = ir.VectorSetElement(esize, result, second, ir.FPAdd(operand1_elem3, element3)); + } + + ir.SetQ(Vd, result); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_two_register_misc.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_two_register_misc.cpp new file mode 100644 index 0000000000..ca3e3b9591 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_two_register_misc.cpp @@ -0,0 +1,848 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/common/fp/rounding_mode.h" +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { +namespace { +enum class ComparisonType { + EQ, + GE, + GT, + LE, + LT, +}; + +bool CompareAgainstZero(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vn, Vec Vd, ComparisonType type) { + if (size == 0b11 && !Q) { + return v.ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand = v.V(datasize, Vn); + const IR::U128 zero = v.ir.ZeroVector(); + IR::U128 result = [&] { + switch (type) { + case ComparisonType::EQ: + return v.ir.VectorEqual(esize, operand, zero); + case ComparisonType::GE: + return v.ir.VectorGreaterEqualSigned(esize, operand, zero); + case ComparisonType::GT: + return v.ir.VectorGreaterSigned(esize, operand, zero); + case ComparisonType::LE: + return v.ir.VectorLessEqualSigned(esize, operand, zero); + case ComparisonType::LT: + default: + return v.ir.VectorLessSigned(esize, operand, zero); + } + }(); + + if (datasize == 64) { + result = v.ir.VectorZeroUpper(result); + } + + v.V(datasize, Vd, result); + return true; +} + +bool FPCompareAgainstZero(TranslatorVisitor& v, bool Q, bool sz, Vec Vn, Vec Vd, ComparisonType type) { + if (sz && !Q) { + return v.ReservedValue(); + } + + const size_t esize = sz ? 64 : 32; + const size_t datasize = Q ? 
128 : 64; + + const IR::U128 operand = v.V(datasize, Vn); + const IR::U128 zero = v.ir.ZeroVector(); + const IR::U128 result = [&] { + switch (type) { + case ComparisonType::EQ: + return v.ir.FPVectorEqual(esize, operand, zero); + case ComparisonType::GE: + return v.ir.FPVectorGreaterEqual(esize, operand, zero); + case ComparisonType::GT: + return v.ir.FPVectorGreater(esize, operand, zero); + case ComparisonType::LE: + return v.ir.FPVectorGreaterEqual(esize, zero, operand); + case ComparisonType::LT: + return v.ir.FPVectorGreater(esize, zero, operand); + } + + UNREACHABLE(); + }(); + + v.V(datasize, Vd, result); + return true; +} + +enum class Signedness { + Signed, + Unsigned +}; + +bool IntegerConvertToFloat(TranslatorVisitor& v, bool Q, bool sz, Vec Vn, Vec Vd, Signedness signedness) { + if (sz && !Q) { + return v.ReservedValue(); + } + + const size_t datasize = Q ? 128 : 64; + const size_t esize = sz ? 64 : 32; + const FP::RoundingMode rounding_mode = v.ir.current_location->FPCR().RMode(); + + const IR::U128 operand = v.V(datasize, Vn); + const IR::U128 result = signedness == Signedness::Signed + ? v.ir.FPVectorFromSignedFixed(esize, operand, 0, rounding_mode) + : v.ir.FPVectorFromUnsignedFixed(esize, operand, 0, rounding_mode); + + v.V(datasize, Vd, result); + return true; +} + +bool FloatConvertToInteger(TranslatorVisitor& v, bool Q, bool sz, Vec Vn, Vec Vd, Signedness signedness, FP::RoundingMode rounding_mode) { + if (sz && !Q) { + return v.ReservedValue(); + } + + const size_t datasize = Q ? 128 : 64; + const size_t esize = sz ? 64 : 32; + + const IR::U128 operand = v.V(datasize, Vn); + const IR::U128 result = signedness == Signedness::Signed + ? v.ir.FPVectorToSignedFixed(esize, operand, 0, rounding_mode) + : v.ir.FPVectorToUnsignedFixed(esize, operand, 0, rounding_mode); + + v.V(datasize, Vd, result); + return true; +} + +bool FloatRoundToIntegral(TranslatorVisitor& v, bool Q, bool sz, Vec Vn, Vec Vd, FP::RoundingMode rounding_mode, bool exact) { + if (sz && !Q) { + return v.ReservedValue(); + } + + const size_t datasize = Q ? 128 : 64; + const size_t esize = sz ? 64 : 32; + + const IR::U128 operand = v.V(datasize, Vn); + const IR::U128 result = v.ir.FPVectorRoundInt(esize, operand, rounding_mode, exact); + + v.V(datasize, Vd, result); + return true; +} + +bool FloatRoundToIntegralHalfPrecision(TranslatorVisitor& v, bool Q, Vec Vn, Vec Vd, FP::RoundingMode rounding_mode, bool exact) { + const size_t datasize = Q ? 128 : 64; + const size_t esize = 16; + + const IR::U128 operand = v.V(datasize, Vn); + const IR::U128 result = v.ir.FPVectorRoundInt(esize, operand, rounding_mode, exact); + + v.V(datasize, Vd, result); + return true; +} + +bool SaturatedNarrow(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vn, Vec Vd, IR::U128 (IR::IREmitter::*fn)(size_t, const IR::U128&)) { + if (size == 0b11) { + return v.ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend<size_t>(); + const size_t datasize = 64; + const size_t part = Q ? 1 : 0; + + const IR::U128 operand = v.V(2 * datasize, Vn); + const IR::U128 result = (v.ir.*fn)(2 * esize, operand); + + v.Vpart(datasize, Vd, part, result); + return true; +} + +enum class PairedAddLongExtraBehavior { + None, + Accumulate, +}; + +bool PairedAddLong(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vn, Vec Vd, Signedness sign, PairedAddLongExtraBehavior behavior) { + if (size == 0b11) { + return v.ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 
128 : 64; + + const IR::U128 operand = v.V(datasize, Vn); + IR::U128 result = [&] { + if (sign == Signedness::Signed) { + return v.ir.VectorPairedAddSignedWiden(esize, operand); + } + + return v.ir.VectorPairedAddUnsignedWiden(esize, operand); + }(); + + if (behavior == PairedAddLongExtraBehavior::Accumulate) { + result = v.ir.VectorAdd(esize * 2, v.V(datasize, Vd), result); + } + + if (datasize == 64) { + result = v.ir.VectorZeroUpper(result); + } + + v.V(datasize, Vd, result); + return true; +} + +} // Anonymous namespace + +bool TranslatorVisitor::CLS_asimd(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + if (size == 0b11) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand = V(datasize, Vn); + const IR::U128 shifted = ir.VectorArithmeticShiftRight(esize, operand, static_cast<u8>(esize)); + const IR::U128 xored = ir.VectorEor(operand, shifted); + const IR::U128 clz = ir.VectorCountLeadingZeros(esize, xored); + IR::U128 result = ir.VectorSub(esize, clz, ir.VectorBroadcast(esize, I(esize, 1))); + + if (datasize == 64) { + result = ir.VectorZeroUpper(result); + } + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::CLZ_asimd(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + if (size == 0b11) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand = V(datasize, Vn); + IR::U128 result = ir.VectorCountLeadingZeros(esize, operand); + + if (datasize == 64) { + result = ir.VectorZeroUpper(result); + } + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::CNT(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + if (size != 0b00) { + return ReservedValue(); + } + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand = V(datasize, Vn); + const IR::U128 result = ir.VectorPopulationCount(operand); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::CMGE_zero_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + return CompareAgainstZero(*this, Q, size, Vn, Vd, ComparisonType::GE); +} + +bool TranslatorVisitor::CMGT_zero_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + return CompareAgainstZero(*this, Q, size, Vn, Vd, ComparisonType::GT); +} + +bool TranslatorVisitor::CMEQ_zero_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + return CompareAgainstZero(*this, Q, size, Vn, Vd, ComparisonType::EQ); +} + +bool TranslatorVisitor::CMLE_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + return CompareAgainstZero(*this, Q, size, Vn, Vd, ComparisonType::LE); +} + +bool TranslatorVisitor::CMLT_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + return CompareAgainstZero(*this, Q, size, Vn, Vd, ComparisonType::LT); +} + +bool TranslatorVisitor::ABS_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + if (!Q && size == 0b11) { + return ReservedValue(); + } + + const size_t datasize = Q ? 128 : 64; + const size_t esize = 8 << size.ZeroExtend(); + + const IR::U128 data = V(datasize, Vn); + const IR::U128 result = ir.VectorAbs(esize, data); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::XTN(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + if (size == 0b11) { + return ReservedValue(); + } + const size_t esize = 8 << size.ZeroExtend<size_t>(); + const size_t datasize = 64; + const size_t part = Q ? 
1 : 0; + + const IR::U128 operand = V(2 * datasize, Vn); + const IR::U128 result = ir.VectorNarrow(2 * esize, operand); + + Vpart(datasize, Vd, part, result); + return true; +} + +bool TranslatorVisitor::FABS_1(bool Q, Vec Vn, Vec Vd) { + const size_t datasize = Q ? 128 : 64; + const size_t esize = 16; + + const IR::U128 operand = V(datasize, Vn); + const IR::U128 result = ir.FPVectorAbs(esize, operand); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FABS_2(bool Q, bool sz, Vec Vn, Vec Vd) { + if (sz && !Q) { + return ReservedValue(); + } + + const size_t datasize = Q ? 128 : 64; + const size_t esize = sz ? 64 : 32; + + const IR::U128 operand = V(datasize, Vn); + const IR::U128 result = ir.FPVectorAbs(esize, operand); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FCMEQ_zero_3(bool Q, Vec Vn, Vec Vd) { + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand = V(datasize, Vn); + const IR::U128 zero = ir.ZeroVector(); + const IR::U128 result = ir.FPVectorEqual(16, operand, zero); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FCMEQ_zero_4(bool Q, bool sz, Vec Vn, Vec Vd) { + return FPCompareAgainstZero(*this, Q, sz, Vn, Vd, ComparisonType::EQ); +} + +bool TranslatorVisitor::FCMGE_zero_4(bool Q, bool sz, Vec Vn, Vec Vd) { + return FPCompareAgainstZero(*this, Q, sz, Vn, Vd, ComparisonType::GE); +} + +bool TranslatorVisitor::FCMGT_zero_4(bool Q, bool sz, Vec Vn, Vec Vd) { + return FPCompareAgainstZero(*this, Q, sz, Vn, Vd, ComparisonType::GT); +} + +bool TranslatorVisitor::FCMLE_4(bool Q, bool sz, Vec Vn, Vec Vd) { + return FPCompareAgainstZero(*this, Q, sz, Vn, Vd, ComparisonType::LE); +} + +bool TranslatorVisitor::FCMLT_4(bool Q, bool sz, Vec Vn, Vec Vd) { + return FPCompareAgainstZero(*this, Q, sz, Vn, Vd, ComparisonType::LT); +} + +bool TranslatorVisitor::FCVTL(bool Q, bool sz, Vec Vn, Vec Vd) { + const size_t esize = sz ? 32 : 16; + const size_t datasize = 64; + const size_t num_elements = datasize / esize; + + const IR::U128 part = Vpart(64, Vn, Q); + const auto rounding_mode = ir.current_location->FPCR().RMode(); + IR::U128 result = ir.ZeroVector(); + + for (size_t i = 0; i < num_elements; i++) { + IR::U16U32U64 element = ir.VectorGetElement(esize, part, i); + + if (esize == 16) { + element = ir.FPHalfToSingle(element, rounding_mode); + } else if (esize == 32) { + element = ir.FPSingleToDouble(element, rounding_mode); + } + + result = ir.VectorSetElement(2 * esize, result, i, element); + } + + V(128, Vd, result); + return true; +} + +bool TranslatorVisitor::FCVTN(bool Q, bool sz, Vec Vn, Vec Vd) { + const size_t datasize = 64; + const size_t esize = sz ? 
32 : 16; + const size_t num_elements = datasize / esize; + + const IR::U128 operand = V(128, Vn); + const auto rounding_mode = ir.current_location->FPCR().RMode(); + IR::U128 result = ir.ZeroVector(); + + for (size_t i = 0; i < num_elements; i++) { + IR::U16U32U64 element = ir.VectorGetElement(2 * esize, operand, i); + + if (esize == 16) { + element = ir.FPSingleToHalf(element, rounding_mode); + } else if (esize == 32) { + element = ir.FPDoubleToSingle(element, rounding_mode); + } + + result = ir.VectorSetElement(esize, result, i, element); + } + + Vpart(datasize, Vd, Q, result); + return true; +} + +bool TranslatorVisitor::FCVTNS_4(bool Q, bool sz, Vec Vn, Vec Vd) { + return FloatConvertToInteger(*this, Q, sz, Vn, Vd, Signedness::Signed, FP::RoundingMode::ToNearest_TieEven); +} + +bool TranslatorVisitor::FCVTMS_4(bool Q, bool sz, Vec Vn, Vec Vd) { + return FloatConvertToInteger(*this, Q, sz, Vn, Vd, Signedness::Signed, FP::RoundingMode::TowardsMinusInfinity); +} + +bool TranslatorVisitor::FCVTAS_4(bool Q, bool sz, Vec Vn, Vec Vd) { + return FloatConvertToInteger(*this, Q, sz, Vn, Vd, Signedness::Signed, FP::RoundingMode::ToNearest_TieAwayFromZero); +} + +bool TranslatorVisitor::FCVTPS_4(bool Q, bool sz, Vec Vn, Vec Vd) { + return FloatConvertToInteger(*this, Q, sz, Vn, Vd, Signedness::Signed, FP::RoundingMode::TowardsPlusInfinity); +} + +bool TranslatorVisitor::FCVTXN_2(bool Q, bool sz, Vec Vn, Vec Vd) { + if (!sz) { + return UnallocatedEncoding(); + } + + const size_t part = Q ? 1 : 0; + const auto operand = ir.GetQ(Vn); + auto result = ir.ZeroVector(); + + for (size_t e = 0; e < 2; ++e) { + const IR::U64 element = ir.VectorGetElement(64, operand, e); + const IR::U32 converted = ir.FPDoubleToSingle(element, FP::RoundingMode::ToOdd); + + result = ir.VectorSetElement(32, result, e, converted); + } + + Vpart(64, Vd, part, result); + return true; +} + +bool TranslatorVisitor::FCVTZS_int_4(bool Q, bool sz, Vec Vn, Vec Vd) { + return FloatConvertToInteger(*this, Q, sz, Vn, Vd, Signedness::Signed, FP::RoundingMode::TowardsZero); +} + +bool TranslatorVisitor::FCVTNU_4(bool Q, bool sz, Vec Vn, Vec Vd) { + return FloatConvertToInteger(*this, Q, sz, Vn, Vd, Signedness::Unsigned, FP::RoundingMode::ToNearest_TieEven); +} + +bool TranslatorVisitor::FCVTMU_4(bool Q, bool sz, Vec Vn, Vec Vd) { + return FloatConvertToInteger(*this, Q, sz, Vn, Vd, Signedness::Unsigned, FP::RoundingMode::TowardsMinusInfinity); +} + +bool TranslatorVisitor::FCVTAU_4(bool Q, bool sz, Vec Vn, Vec Vd) { + return FloatConvertToInteger(*this, Q, sz, Vn, Vd, Signedness::Unsigned, FP::RoundingMode::ToNearest_TieAwayFromZero); +} + +bool TranslatorVisitor::FCVTPU_4(bool Q, bool sz, Vec Vn, Vec Vd) { + return FloatConvertToInteger(*this, Q, sz, Vn, Vd, Signedness::Unsigned, FP::RoundingMode::TowardsPlusInfinity); +} + +bool TranslatorVisitor::FCVTZU_int_4(bool Q, bool sz, Vec Vn, Vec Vd) { + return FloatConvertToInteger(*this, Q, sz, Vn, Vd, Signedness::Unsigned, FP::RoundingMode::TowardsZero); +} + +bool TranslatorVisitor::FRINTN_1(bool Q, Vec Vn, Vec Vd) { + return FloatRoundToIntegralHalfPrecision(*this, Q, Vn, Vd, FP::RoundingMode::ToNearest_TieEven, false); +} + +bool TranslatorVisitor::FRINTN_2(bool Q, bool sz, Vec Vn, Vec Vd) { + return FloatRoundToIntegral(*this, Q, sz, Vn, Vd, FP::RoundingMode::ToNearest_TieEven, false); +} + +bool TranslatorVisitor::FRINTM_1(bool Q, Vec Vn, Vec Vd) { + return FloatRoundToIntegralHalfPrecision(*this, Q, Vn, Vd, FP::RoundingMode::TowardsMinusInfinity, false); +} + +bool 
TranslatorVisitor::FRINTM_2(bool Q, bool sz, Vec Vn, Vec Vd) { + return FloatRoundToIntegral(*this, Q, sz, Vn, Vd, FP::RoundingMode::TowardsMinusInfinity, false); +} + +bool TranslatorVisitor::FRINTP_1(bool Q, Vec Vn, Vec Vd) { + return FloatRoundToIntegralHalfPrecision(*this, Q, Vn, Vd, FP::RoundingMode::TowardsPlusInfinity, false); +} + +bool TranslatorVisitor::FRINTP_2(bool Q, bool sz, Vec Vn, Vec Vd) { + return FloatRoundToIntegral(*this, Q, sz, Vn, Vd, FP::RoundingMode::TowardsPlusInfinity, false); +} + +bool TranslatorVisitor::FRINTZ_1(bool Q, Vec Vn, Vec Vd) { + return FloatRoundToIntegralHalfPrecision(*this, Q, Vn, Vd, FP::RoundingMode::TowardsZero, false); +} + +bool TranslatorVisitor::FRINTZ_2(bool Q, bool sz, Vec Vn, Vec Vd) { + return FloatRoundToIntegral(*this, Q, sz, Vn, Vd, FP::RoundingMode::TowardsZero, false); +} + +bool TranslatorVisitor::FRINTA_1(bool Q, Vec Vn, Vec Vd) { + return FloatRoundToIntegralHalfPrecision(*this, Q, Vn, Vd, FP::RoundingMode::ToNearest_TieAwayFromZero, false); +} + +bool TranslatorVisitor::FRINTA_2(bool Q, bool sz, Vec Vn, Vec Vd) { + return FloatRoundToIntegral(*this, Q, sz, Vn, Vd, FP::RoundingMode::ToNearest_TieAwayFromZero, false); +} + +bool TranslatorVisitor::FRINTX_1(bool Q, Vec Vn, Vec Vd) { + return FloatRoundToIntegralHalfPrecision(*this, Q, Vn, Vd, ir.current_location->FPCR().RMode(), true); +} + +bool TranslatorVisitor::FRINTX_2(bool Q, bool sz, Vec Vn, Vec Vd) { + return FloatRoundToIntegral(*this, Q, sz, Vn, Vd, ir.current_location->FPCR().RMode(), true); +} + +bool TranslatorVisitor::FRINTI_1(bool Q, Vec Vn, Vec Vd) { + return FloatRoundToIntegralHalfPrecision(*this, Q, Vn, Vd, ir.current_location->FPCR().RMode(), false); +} + +bool TranslatorVisitor::FRINTI_2(bool Q, bool sz, Vec Vn, Vec Vd) { + return FloatRoundToIntegral(*this, Q, sz, Vn, Vd, ir.current_location->FPCR().RMode(), false); +} + +bool TranslatorVisitor::FRECPE_3(bool Q, Vec Vn, Vec Vd) { + const size_t datasize = Q ? 128 : 64; + const size_t esize = 16; + + const IR::U128 operand = V(datasize, Vn); + const IR::U128 result = ir.FPVectorRecipEstimate(esize, operand); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FRECPE_4(bool Q, bool sz, Vec Vn, Vec Vd) { + if (sz && !Q) { + return ReservedValue(); + } + + const size_t datasize = Q ? 128 : 64; + const size_t esize = sz ? 64 : 32; + + const IR::U128 operand = V(datasize, Vn); + const IR::U128 result = ir.FPVectorRecipEstimate(esize, operand); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FSQRT_2(bool Q, bool sz, Vec Vn, Vec Vd) { + if (sz && !Q) { + return ReservedValue(); + } + + const size_t datasize = Q ? 128 : 64; + const size_t esize = sz ? 64 : 32; + + const IR::U128 operand = V(datasize, Vn); + const IR::U128 result = ir.FPVectorSqrt(esize, operand); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FRSQRTE_3(bool Q, Vec Vn, Vec Vd) { + const size_t datasize = Q ? 128 : 64; + const size_t esize = 16; + + const IR::U128 operand = V(datasize, Vn); + const IR::U128 result = ir.FPVectorRSqrtEstimate(esize, operand); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FRSQRTE_4(bool Q, bool sz, Vec Vn, Vec Vd) { + if (sz && !Q) { + return ReservedValue(); + } + + const size_t datasize = Q ? 128 : 64; + const size_t esize = sz ? 
64 : 32; + + const IR::U128 operand = V(datasize, Vn); + const IR::U128 result = ir.FPVectorRSqrtEstimate(esize, operand); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FNEG_1(bool Q, Vec Vn, Vec Vd) { + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand = V(datasize, Vn); + const IR::U128 mask = ir.VectorBroadcast(64, I(64, 0x8000800080008000)); + const IR::U128 result = ir.VectorEor(operand, mask); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::FNEG_2(bool Q, bool sz, Vec Vn, Vec Vd) { + if (sz && !Q) { + return ReservedValue(); + } + + const size_t datasize = Q ? 128 : 64; + const size_t esize = sz ? 64 : 32; + const size_t mask_value = esize == 64 ? 0x8000000000000000 : 0x8000000080000000; + + const IR::U128 operand = V(datasize, Vn); + const IR::U128 mask = Q ? ir.VectorBroadcast(esize, I(esize, mask_value)) : ir.VectorBroadcastLower(esize, I(esize, mask_value)); + const IR::U128 result = ir.VectorEor(operand, mask); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::NEG_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + if (size == 0b11 && !Q) { + return ReservedValue(); + } + const size_t esize = 8 << size.ZeroExtend<size_t>(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand = V(datasize, Vn); + const IR::U128 zero = ir.ZeroVector(); + const IR::U128 result = ir.VectorSub(esize, zero, operand); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::SQXTUN_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + return SaturatedNarrow(*this, Q, size, Vn, Vd, &IR::IREmitter::VectorSignedSaturatedNarrowToUnsigned); +} + +bool TranslatorVisitor::SQXTN_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + return SaturatedNarrow(*this, Q, size, Vn, Vd, &IR::IREmitter::VectorSignedSaturatedNarrowToSigned); +} + +bool TranslatorVisitor::UQXTN_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + return SaturatedNarrow(*this, Q, size, Vn, Vd, &IR::IREmitter::VectorUnsignedSaturatedNarrow); +} + +bool TranslatorVisitor::NOT(bool Q, Vec Vn, Vec Vd) { + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand = V(datasize, Vn); + IR::U128 result = ir.VectorNot(operand); + + if (datasize == 64) { + result = ir.VectorZeroUpper(result); + } + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::RBIT_asimd(bool Q, Vec Vn, Vec Vd) { + const size_t datasize = Q ? 128 : 64; + + const IR::U128 data = V(datasize, Vn); + const IR::U128 result = ir.VectorReverseBits(data); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::REV16_asimd(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + if (size > 0) { + return UnallocatedEncoding(); + } + + const size_t datasize = Q ? 128 : 64; + constexpr size_t esize = 8; + + const IR::U128 data = V(datasize, Vn); + const IR::U128 result = ir.VectorReverseElementsInHalfGroups(esize, data); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::REV32_asimd(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + if (size > 1) { + return UnallocatedEncoding(); + } + + const size_t datasize = Q ? 128 : 64; + const size_t esize = 8 << size.ZeroExtend(); + + const IR::U128 data = V(datasize, Vn); + const IR::U128 result = ir.VectorReverseElementsInWordGroups(esize, data); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::REV64_asimd(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + if (size > 2) { + return UnallocatedEncoding(); + } + + const size_t datasize = Q ? 
128 : 64; + const size_t esize = 8 << size.ZeroExtend(); + + const IR::U128 data = V(datasize, Vn); + const IR::U128 result = ir.VectorReverseElementsInLongGroups(esize, data); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::SQABS_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + if (size == 0b11 && !Q) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand = V(datasize, Vn); + const IR::U128 result = ir.VectorSignedSaturatedAbs(esize, operand); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::SQNEG_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + if (size == 0b11 && !Q) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand = V(datasize, Vn); + const IR::U128 result = ir.VectorSignedSaturatedNeg(esize, operand); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::SUQADD_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + if (size == 0b11 && !Q) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vd); + const IR::U128 result = ir.VectorSignedSaturatedAccumulateUnsigned(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::USQADD_2(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + if (size == 0b11 && !Q) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vd); + const IR::U128 result = ir.VectorUnsignedSaturatedAccumulateSigned(esize, operand1, operand2); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::SADALP(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + return PairedAddLong(*this, Q, size, Vn, Vd, Signedness::Signed, PairedAddLongExtraBehavior::Accumulate); +} + +bool TranslatorVisitor::SADDLP(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + return PairedAddLong(*this, Q, size, Vn, Vd, Signedness::Signed, PairedAddLongExtraBehavior::None); +} + +bool TranslatorVisitor::UADALP(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + return PairedAddLong(*this, Q, size, Vn, Vd, Signedness::Unsigned, PairedAddLongExtraBehavior::Accumulate); +} + +bool TranslatorVisitor::UADDLP(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + return PairedAddLong(*this, Q, size, Vn, Vd, Signedness::Unsigned, PairedAddLongExtraBehavior::None); +} + +bool TranslatorVisitor::URECPE(bool Q, bool sz, Vec Vn, Vec Vd) { + if (sz) { + return ReservedValue(); + } + + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand = V(datasize, Vn); + const IR::U128 result = ir.VectorUnsignedRecipEstimate(operand); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::URSQRTE(bool Q, bool sz, Vec Vn, Vec Vd) { + if (sz) { + return ReservedValue(); + } + + const size_t datasize = Q ? 
128 : 64; + + const IR::U128 operand = V(datasize, Vn); + const IR::U128 result = ir.VectorUnsignedRecipSqrtEstimate(operand); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::SCVTF_int_4(bool Q, bool sz, Vec Vn, Vec Vd) { + return IntegerConvertToFloat(*this, Q, sz, Vn, Vd, Signedness::Signed); +} + +bool TranslatorVisitor::UCVTF_int_4(bool Q, bool sz, Vec Vn, Vec Vd) { + return IntegerConvertToFloat(*this, Q, sz, Vn, Vd, Signedness::Unsigned); +} + +bool TranslatorVisitor::SHLL(bool Q, Imm<2> size, Vec Vn, Vec Vd) { + if (size == 0b11) { + return ReservedValue(); + } + + const size_t esize = 8 << size.ZeroExtend(); + + const IR::U128 operand = ir.VectorZeroExtend(esize, Vpart(64, Vn, Q)); + const IR::U128 result = ir.VectorLogicalShiftLeft(esize * 2, operand, static_cast<u8>(esize)); + + V(128, Vd, result); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_vector_x_indexed_element.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_vector_x_indexed_element.cpp new file mode 100644 index 0000000000..07234fc61b --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_vector_x_indexed_element.cpp @@ -0,0 +1,408 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <utility> + +#include <mcl/assert.hpp> + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { +namespace { +std::pair<size_t, Vec> Combine(Imm<2> size, Imm<1> H, Imm<1> L, Imm<1> M, Imm<4> Vmlo) { + if (size == 0b01) { + return {concatenate(H, L, M).ZeroExtend(), Vmlo.ZeroExtend<Vec>()}; + } + + return {concatenate(H, L).ZeroExtend(), concatenate(M, Vmlo).ZeroExtend<Vec>()}; +} + +enum class ExtraBehavior { + None, + Extended, + Accumulate, + Subtract, +}; + +bool MultiplyByElement(TranslatorVisitor& v, bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd, ExtraBehavior extra_behavior) { + if (size != 0b01 && size != 0b10) { + return v.ReservedValue(); + } + + const auto [index, Vm] = Combine(size, H, L, M, Vmlo); + const size_t idxdsize = H == 1 ? 128 : 64; + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = v.V(datasize, Vn); + const IR::U128 operand2 = v.ir.VectorBroadcastElement(esize, v.V(idxdsize, Vm), index); + const IR::U128 operand3 = v.V(datasize, Vd); + + IR::U128 result = v.ir.VectorMultiply(esize, operand1, operand2); + if (extra_behavior == ExtraBehavior::Accumulate) { + result = v.ir.VectorAdd(esize, operand3, result); + } else if (extra_behavior == ExtraBehavior::Subtract) { + result = v.ir.VectorSub(esize, operand3, result); + } + + v.V(datasize, Vd, result); + return true; +} + +bool FPMultiplyByElement(TranslatorVisitor& v, bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd, ExtraBehavior extra_behavior) { + if (sz && L == 1) { + return v.ReservedValue(); + } + if (sz && !Q) { + return v.ReservedValue(); + } + + const size_t idxdsize = H == 1 ? 128 : 64; + const size_t index = sz ? H.ZeroExtend() : concatenate(H, L).ZeroExtend(); + const Vec Vm = concatenate(M, Vmlo).ZeroExtend<Vec>(); + const size_t esize = sz ? 64 : 32; + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = v.V(datasize, Vn); + const IR::U128 operand2 = Q ? 
v.ir.VectorBroadcastElement(esize, v.V(idxdsize, Vm), index) : v.ir.VectorBroadcastElementLower(esize, v.V(idxdsize, Vm), index); + const IR::U128 operand3 = v.V(datasize, Vd); + + const IR::U128 result = [&] { + switch (extra_behavior) { + case ExtraBehavior::None: + return v.ir.FPVectorMul(esize, operand1, operand2); + case ExtraBehavior::Extended: + return v.ir.FPVectorMulX(esize, operand1, operand2); + case ExtraBehavior::Accumulate: + return v.ir.FPVectorMulAdd(esize, operand3, operand1, operand2); + case ExtraBehavior::Subtract: + return v.ir.FPVectorMulAdd(esize, operand3, v.ir.FPVectorNeg(esize, operand1), operand2); + } + UNREACHABLE(); + }(); + v.V(datasize, Vd, result); + return true; +} + +bool FPMultiplyByElementHalfPrecision(TranslatorVisitor& v, bool Q, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd, ExtraBehavior extra_behavior) { + const size_t idxdsize = H == 1 ? 128 : 64; + const size_t index = concatenate(H, L, M).ZeroExtend(); + const Vec Vm = Vmlo.ZeroExtend<Vec>(); + const size_t esize = 16; + const size_t datasize = Q ? 128 : 64; + + const IR::U128 operand1 = v.V(datasize, Vn); + const IR::U128 operand2 = Q ? v.ir.VectorBroadcastElement(esize, v.V(idxdsize, Vm), index) : v.ir.VectorBroadcastElementLower(esize, v.V(idxdsize, Vm), index); + const IR::U128 operand3 = v.V(datasize, Vd); + + // TODO: We currently don't implement half-precision paths for + // regular multiplies and extended multiplies. + const IR::U128 result = [&] { + switch (extra_behavior) { + case ExtraBehavior::None: + break; + case ExtraBehavior::Extended: + break; + case ExtraBehavior::Accumulate: + return v.ir.FPVectorMulAdd(esize, operand3, operand1, operand2); + case ExtraBehavior::Subtract: + return v.ir.FPVectorMulAdd(esize, operand3, v.ir.FPVectorNeg(esize, operand1), operand2); + } + UNREACHABLE(); + }(); + v.V(datasize, Vd, result); + return true; +} + +using ExtensionFunction = IR::U32 (IREmitter::*)(const IR::UAny&); + +bool DotProduct(TranslatorVisitor& v, bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd, ExtensionFunction extension) { + if (size != 0b10) { + return v.ReservedValue(); + } + + const Vec Vm = concatenate(M, Vmlo).ZeroExtend<Vec>(); + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 128 : 64; + const size_t elements = datasize / esize; + const size_t index = concatenate(H, L).ZeroExtend(); + + const IR::U128 operand1 = v.V(datasize, Vn); + const IR::U128 operand2 = v.V(128, Vm); + IR::U128 result = v.V(datasize, Vd); + + for (size_t i = 0; i < elements; i++) { + IR::U32 res_element = v.ir.Imm32(0); + + for (size_t j = 0; j < 4; j++) { + const IR::U32 elem1 = (v.ir.*extension)(v.ir.VectorGetElement(8, operand1, 4 * i + j)); + const IR::U32 elem2 = (v.ir.*extension)(v.ir.VectorGetElement(8, operand2, 4 * index + j)); + + res_element = v.ir.Add(res_element, v.ir.Mul(elem1, elem2)); + } + + res_element = v.ir.Add(v.ir.VectorGetElement(32, result, i), res_element); + result = v.ir.VectorSetElement(32, result, i, res_element); + } + + v.V(datasize, Vd, result); + return true; +} + +enum class Signedness { + Signed, + Unsigned +}; + +bool MultiplyLong(TranslatorVisitor& v, bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd, ExtraBehavior extra_behavior, Signedness sign) { + if (size == 0b00 || size == 0b11) { + return v.ReservedValue(); + } + + const size_t idxsize = H == 1 ? 
128 : 64; + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = 64; + const auto [index, Vm] = Combine(size, H, L, M, Vmlo); + + const IR::U128 operand1 = v.Vpart(datasize, Vn, Q); + const IR::U128 operand2 = v.V(idxsize, Vm); + const IR::U128 index_vector = v.ir.VectorBroadcastElement(esize, operand2, index); + + const IR::U128 result = [&] { + const IR::U128 product = sign == Signedness::Signed + ? v.ir.VectorMultiplySignedWiden(esize, operand1, index_vector) + : v.ir.VectorMultiplyUnsignedWiden(esize, operand1, index_vector); + + if (extra_behavior == ExtraBehavior::None) { + return product; + } + + const IR::U128 operand3 = v.V(2 * datasize, Vd); + if (extra_behavior == ExtraBehavior::Accumulate) { + return v.ir.VectorAdd(2 * esize, operand3, product); + } + + return v.ir.VectorSub(2 * esize, operand3, product); + }(); + + v.V(2 * datasize, Vd, result); + return true; +} +} // Anonymous namespace + +bool TranslatorVisitor::MLA_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + return MultiplyByElement(*this, Q, size, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Accumulate); +} + +bool TranslatorVisitor::MLS_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + return MultiplyByElement(*this, Q, size, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Subtract); +} + +bool TranslatorVisitor::MUL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + return MultiplyByElement(*this, Q, size, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::None); +} + +bool TranslatorVisitor::FCMLA_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<2> rot, Imm<1> H, Vec Vn, Vec Vd) { + if (size == 0b00 || size == 0b11) { + return ReservedValue(); + } + + if (size == 0b01 && H == 1 && Q == 0) { + return ReservedValue(); + } + + if (size == 0b10 && (L == 1 || Q == 0)) { + return ReservedValue(); + } + + const size_t esize = 8U << size.ZeroExtend(); + + // TODO: We don't support the half-precision floating point variant yet. + if (esize == 16) { + return InterpretThisInstruction(); + } + + const size_t index = [=] { + if (size == 0b01) { + return concatenate(H, L).ZeroExtend(); + } + return H.ZeroExtend(); + }(); + + const Vec Vm = concatenate(M, Vmlo).ZeroExtend<Vec>(); + + const size_t datasize = Q ? 
128 : 64; + const size_t num_elements = datasize / esize; + const size_t num_iterations = num_elements / 2; + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(datasize, Vm); + const IR::U128 operand3 = V(datasize, Vd); + IR::U128 result = ir.ZeroVector(); + + IR::U32U64 element1; + IR::U32U64 element2; + IR::U32U64 element3; + IR::U32U64 element4; + for (size_t e = 0; e < num_iterations; ++e) { + const size_t first = e * 2; + const size_t second = first + 1; + + const size_t index_first = index * 2; + const size_t index_second = index_first + 1; + + switch (rot.ZeroExtend()) { + case 0b00: // 0 degrees + element1 = ir.VectorGetElement(esize, operand2, index_first); + element2 = ir.VectorGetElement(esize, operand1, first); + element3 = ir.VectorGetElement(esize, operand2, index_second); + element4 = ir.VectorGetElement(esize, operand1, first); + break; + case 0b01: // 90 degrees + element1 = ir.FPNeg(ir.VectorGetElement(esize, operand2, index_second)); + element2 = ir.VectorGetElement(esize, operand1, second); + element3 = ir.VectorGetElement(esize, operand2, index_first); + element4 = ir.VectorGetElement(esize, operand1, second); + break; + case 0b10: // 180 degrees + element1 = ir.FPNeg(ir.VectorGetElement(esize, operand2, index_first)); + element2 = ir.VectorGetElement(esize, operand1, first); + element3 = ir.FPNeg(ir.VectorGetElement(esize, operand2, index_second)); + element4 = ir.VectorGetElement(esize, operand1, first); + break; + case 0b11: // 270 degrees + element1 = ir.VectorGetElement(esize, operand2, index_second); + element2 = ir.VectorGetElement(esize, operand1, second); + element3 = ir.FPNeg(ir.VectorGetElement(esize, operand2, index_first)); + element4 = ir.VectorGetElement(esize, operand1, second); + break; + } + + const IR::U32U64 operand3_elem1 = ir.VectorGetElement(esize, operand3, first); + const IR::U32U64 operand3_elem2 = ir.VectorGetElement(esize, operand3, second); + + result = ir.VectorSetElement(esize, result, first, ir.FPMulAdd(operand3_elem1, element2, element1)); + result = ir.VectorSetElement(esize, result, second, ir.FPMulAdd(operand3_elem2, element4, element3)); + } + + ir.SetQ(Vd, result); + return true; +} + +bool TranslatorVisitor::FMLA_elt_3(bool Q, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + return FPMultiplyByElementHalfPrecision(*this, Q, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Accumulate); +} + +bool TranslatorVisitor::FMLA_elt_4(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + return FPMultiplyByElement(*this, Q, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Accumulate); +} + +bool TranslatorVisitor::FMLS_elt_3(bool Q, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + return FPMultiplyByElementHalfPrecision(*this, Q, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Subtract); +} + +bool TranslatorVisitor::FMLS_elt_4(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + return FPMultiplyByElement(*this, Q, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Subtract); +} + +bool TranslatorVisitor::FMUL_elt_4(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + return FPMultiplyByElement(*this, Q, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::None); +} + +bool TranslatorVisitor::FMULX_elt_4(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + return FPMultiplyByElement(*this, Q, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Extended); +} + +bool TranslatorVisitor::SMLAL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, 
Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + return MultiplyLong(*this, Q, size, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Accumulate, Signedness::Signed); +} + +bool TranslatorVisitor::SMLSL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + return MultiplyLong(*this, Q, size, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Subtract, Signedness::Signed); +} + +bool TranslatorVisitor::SMULL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + return MultiplyLong(*this, Q, size, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::None, Signedness::Signed); +} + +bool TranslatorVisitor::SQDMULL_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + if (size == 0b00 || size == 0b11) { + return ReservedValue(); + } + + const size_t part = Q ? 1 : 0; + const size_t idxsize = H == 1 ? 128 : 64; + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = 64; + const auto [index, Vm] = Combine(size, H, L, M, Vmlo); + + const IR::U128 operand1 = Vpart(datasize, Vn, part); + const IR::U128 operand2 = V(idxsize, Vm); + const IR::U128 index_vector = ir.VectorBroadcastElement(esize, operand2, index); + const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiplyLong(esize, operand1, index_vector); + + V(128, Vd, result); + return true; +} + +bool TranslatorVisitor::SQDMULH_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + if (size == 0b00 || size == 0b11) { + return ReservedValue(); + } + + const size_t idxsize = H == 1 ? 128 : 64; + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 128 : 64; + const auto [index, Vm] = Combine(size, H, L, M, Vmlo); + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(idxsize, Vm); + const IR::U128 index_vector = ir.VectorBroadcastElement(esize, operand2, index); + const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiplyHigh(esize, operand1, index_vector); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::SQRDMULH_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + if (size == 0b00 || size == 0b11) { + return ReservedValue(); + } + + const size_t idxsize = H == 1 ? 128 : 64; + const size_t esize = 8 << size.ZeroExtend(); + const size_t datasize = Q ? 
128 : 64; + const auto [index, Vm] = Combine(size, H, L, M, Vmlo); + + const IR::U128 operand1 = V(datasize, Vn); + const IR::U128 operand2 = V(idxsize, Vm); + const IR::U128 index_vector = ir.VectorBroadcastElement(esize, operand2, index); + const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiplyHighRounding(esize, operand1, index_vector); + + V(datasize, Vd, result); + return true; +} + +bool TranslatorVisitor::SDOT_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + return DotProduct(*this, Q, size, L, M, Vmlo, H, Vn, Vd, &IREmitter::SignExtendToWord); +} + +bool TranslatorVisitor::UDOT_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + return DotProduct(*this, Q, size, L, M, Vmlo, H, Vn, Vd, &IREmitter::ZeroExtendToWord); +} + +bool TranslatorVisitor::UMLAL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + return MultiplyLong(*this, Q, size, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Accumulate, Signedness::Unsigned); +} + +bool TranslatorVisitor::UMLSL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + return MultiplyLong(*this, Q, size, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Subtract, Signedness::Unsigned); +} + +bool TranslatorVisitor::UMULL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) { + return MultiplyLong(*this, Q, size, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::None, Signedness::Unsigned); +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/sys_dc.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/sys_dc.cpp new file mode 100644 index 0000000000..aea3b5baf6 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/sys_dc.cpp @@ -0,0 +1,51 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +static bool DataCacheInstruction(TranslatorVisitor& v, DataCacheOperation op, const Reg Rt) { + v.ir.DataCacheOperationRaised(op, v.X(64, Rt)); + return true; +} + +bool TranslatorVisitor::DC_IVAC(Reg Rt) { + return DataCacheInstruction(*this, DataCacheOperation::InvalidateByVAToPoC, Rt); +} + +bool TranslatorVisitor::DC_ISW(Reg Rt) { + return DataCacheInstruction(*this, DataCacheOperation::InvalidateBySetWay, Rt); +} + +bool TranslatorVisitor::DC_CSW(Reg Rt) { + return DataCacheInstruction(*this, DataCacheOperation::CleanBySetWay, Rt); +} + +bool TranslatorVisitor::DC_CISW(Reg Rt) { + return DataCacheInstruction(*this, DataCacheOperation::CleanAndInvalidateBySetWay, Rt); +} + +bool TranslatorVisitor::DC_ZVA(Reg Rt) { + return DataCacheInstruction(*this, DataCacheOperation::ZeroByVA, Rt); +} + +bool TranslatorVisitor::DC_CVAC(Reg Rt) { + return DataCacheInstruction(*this, DataCacheOperation::CleanByVAToPoC, Rt); +} + +bool TranslatorVisitor::DC_CVAU(Reg Rt) { + return DataCacheInstruction(*this, DataCacheOperation::CleanByVAToPoU, Rt); +} + +bool TranslatorVisitor::DC_CVAP(Reg Rt) { + return DataCacheInstruction(*this, DataCacheOperation::CleanByVAToPoP, Rt); +} + +bool TranslatorVisitor::DC_CIVAC(Reg Rt) { + return DataCacheInstruction(*this, DataCacheOperation::CleanAndInvalidateByVAToPoC, Rt); +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/sys_ic.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/sys_ic.cpp new file mode 100644 index 0000000000..3c91168cab --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/sys_ic.cpp @@ -0,0 +1,31 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::IC_IALLU() { + ir.InstructionCacheOperationRaised(InstructionCacheOperation::InvalidateAllToPoU, ir.Imm64(0)); + ir.SetPC(ir.Imm64(ir.current_location->PC() + 4)); + ir.SetTerm(IR::Term::CheckHalt{IR::Term::ReturnToDispatch{}}); + return false; +} + +bool TranslatorVisitor::IC_IALLUIS() { + ir.InstructionCacheOperationRaised(InstructionCacheOperation::InvalidateAllToPoUInnerSharable, ir.Imm64(0)); + ir.SetPC(ir.Imm64(ir.current_location->PC() + 4)); + ir.SetTerm(IR::Term::CheckHalt{IR::Term::ReturnToDispatch{}}); + return false; +} + +bool TranslatorVisitor::IC_IVAU(Reg Rt) { + ir.InstructionCacheOperationRaised(InstructionCacheOperation::InvalidateByVAToPoU, X(64, Rt)); + ir.SetPC(ir.Imm64(ir.current_location->PC() + 4)); + ir.SetTerm(IR::Term::CheckHalt{IR::Term::ReturnToDispatch{}}); + return false; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/system.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/system.cpp new file mode 100644 index 0000000000..9c35a8f364 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/system.cpp @@ -0,0 +1,161 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +// Register encodings used by MRS and MSR. +// Order of fields: op0, CRn, op1, op2, CRm. 
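+// For example (illustrative, not part of the original source): NZCV below is op0=0b11, CRn=0b0100, op1=0b011, op2=0b000, CRm=0b0010, +// which concatenates in the field order above to 0b11'0100'011'000'0010.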
+enum class SystemRegisterEncoding : u32 { + // Counter-timer Frequency register + CNTFRQ_EL0 = 0b11'1110'011'000'0000, + // Counter-timer Physical Count register + CNTPCT_EL0 = 0b11'1110'011'001'0000, + // Cache Type Register + CTR_EL0 = 0b11'0000'011'001'0000, + // Data Cache Zero ID register + DCZID_EL0 = 0b11'0000'011'111'0000, + // Floating-point Control Register + FPCR = 0b11'0100'011'000'0100, + // Floating-point Status Register + FPSR = 0b11'0100'011'001'0100, + // NZCV, Condition Flags + NZCV = 0b11'0100'011'000'0010, + // Read/Write Software Thread ID Register + TPIDR_EL0 = 0b11'1101'011'010'0000, + // Read-Only Software Thread ID Register + TPIDRRO_EL0 = 0b11'1101'011'011'0000, +}; + +bool TranslatorVisitor::HINT([[maybe_unused]] Imm<4> CRm, [[maybe_unused]] Imm<3> op2) { + return true; +} + +bool TranslatorVisitor::NOP() { + return true; +} + +bool TranslatorVisitor::YIELD() { + if (!options.hook_hint_instructions) { + return true; + } + return RaiseException(Exception::Yield); +} + +bool TranslatorVisitor::WFE() { + if (!options.hook_hint_instructions) { + return true; + } + return RaiseException(Exception::WaitForEvent); +} + +bool TranslatorVisitor::WFI() { + if (!options.hook_hint_instructions) { + return true; + } + return RaiseException(Exception::WaitForInterrupt); +} + +bool TranslatorVisitor::SEV() { + if (!options.hook_hint_instructions) { + return true; + } + return RaiseException(Exception::SendEvent); +} + +bool TranslatorVisitor::SEVL() { + if (!options.hook_hint_instructions) { + return true; + } + return RaiseException(Exception::SendEventLocal); +} + +bool TranslatorVisitor::CLREX(Imm<4> /*CRm*/) { + ir.ClearExclusive(); + return true; +} + +bool TranslatorVisitor::DSB(Imm<4> /*CRm*/) { + ir.DataSynchronizationBarrier(); + return true; +} + +bool TranslatorVisitor::DMB(Imm<4> /*CRm*/) { + ir.DataMemoryBarrier(); + return true; +} + +bool TranslatorVisitor::ISB(Imm<4> /*CRm*/) { + ir.InstructionSynchronizationBarrier(); + ir.SetPC(ir.Imm64(ir.current_location->PC() + 4)); + ir.SetTerm(IR::Term::ReturnToDispatch{}); + return false; +} + +bool TranslatorVisitor::MSR_reg(Imm<1> o0, Imm<3> op1, Imm<4> CRn, Imm<4> CRm, Imm<3> op2, Reg Rt) { + const auto sys_reg = concatenate(Imm<1>{1}, o0, CRn, op1, op2, CRm).ZeroExtend<SystemRegisterEncoding>(); + switch (sys_reg) { + case SystemRegisterEncoding::FPCR: + ir.SetFPCR(X(32, Rt)); + ir.SetPC(ir.Imm64(ir.current_location->PC() + 4)); + ir.SetTerm(IR::Term::FastDispatchHint{}); + return false; + case SystemRegisterEncoding::FPSR: + ir.SetFPSR(X(32, Rt)); + return true; + case SystemRegisterEncoding::NZCV: + ir.SetNZCVRaw(X(32, Rt)); + return true; + case SystemRegisterEncoding::TPIDR_EL0: + ir.SetTPIDR(X(64, Rt)); + return true; + default: + break; + } + return InterpretThisInstruction(); +} + +bool TranslatorVisitor::MRS(Imm<1> o0, Imm<3> op1, Imm<4> CRn, Imm<4> CRm, Imm<3> op2, Reg Rt) { + const auto sys_reg = concatenate(Imm<1>{1}, o0, CRn, op1, op2, CRm).ZeroExtend<SystemRegisterEncoding>(); + switch (sys_reg) { + case SystemRegisterEncoding::CNTFRQ_EL0: + X(32, Rt, ir.GetCNTFRQ()); + return true; + case SystemRegisterEncoding::CNTPCT_EL0: + // HACK: Ensure that this is the first instruction in the block it's emitted in, so the cycle count is most up-to-date. 
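+ // (Illustrative note: when a wall clock is not in use, if other instructions precede this read the block is + // terminated here and linked back to this same location, so CNTPCT is re-read at the start of a fresh block.)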
+ if (!ir.block.empty() && !options.wall_clock_cntpct) { + ir.block.CycleCount()--; + ir.SetTerm(IR::Term::LinkBlock{*ir.current_location}); + return false; + } + X(64, Rt, ir.GetCNTPCT()); + return true; + case SystemRegisterEncoding::CTR_EL0: + X(32, Rt, ir.GetCTR()); + return true; + case SystemRegisterEncoding::DCZID_EL0: + X(32, Rt, ir.GetDCZID()); + return true; + case SystemRegisterEncoding::FPCR: + X(32, Rt, ir.GetFPCR()); + return true; + case SystemRegisterEncoding::FPSR: + X(32, Rt, ir.GetFPSR()); + return true; + case SystemRegisterEncoding::NZCV: + X(32, Rt, ir.GetNZCVRaw()); + return true; + case SystemRegisterEncoding::TPIDR_EL0: + X(64, Rt, ir.GetTPIDR()); + return true; + case SystemRegisterEncoding::TPIDRRO_EL0: + X(64, Rt, ir.GetTPIDRRO()); + return true; + } + return InterpretThisInstruction(); +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/system_flag_format.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/system_flag_format.cpp new file mode 100644 index 0000000000..cbed3c34ed --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/system_flag_format.cpp @@ -0,0 +1,46 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2019 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::AXFlag() { + const IR::U32 nzcv = ir.GetNZCVRaw(); + + const IR::U32 z = ir.And(nzcv, ir.Imm32(0x40000000)); + const IR::U32 c = ir.And(nzcv, ir.Imm32(0x20000000)); + const IR::U32 v = ir.And(nzcv, ir.Imm32(0x10000000)); + + const IR::U32 new_z = ir.Or(ir.LogicalShiftLeft(v, ir.Imm8(2)), z); + const IR::U32 new_c = ir.And(ir.AndNot(c, ir.LogicalShiftLeft(v, ir.Imm8(1))), ir.Imm32(0x20000000)); + + ir.SetNZCVRaw(ir.Or(new_z, new_c)); + return true; +} + +bool TranslatorVisitor::XAFlag() { + const IR::U32 nzcv = ir.GetNZCVRaw(); + + const IR::U32 z = ir.And(nzcv, ir.Imm32(0x40000000)); + const IR::U32 c = ir.And(nzcv, ir.Imm32(0x20000000)); + + const IR::U32 not_z = ir.AndNot(ir.Imm32(0x40000000), z); + const IR::U32 not_c = ir.AndNot(ir.Imm32(0x20000000), c); + + const IR::U32 new_n = ir.And(ir.LogicalShiftLeft(not_c, ir.Imm8(2)), + ir.LogicalShiftLeft(not_z, ir.Imm8(1))); + const IR::U32 new_z = ir.And(z, ir.LogicalShiftLeft(c, ir.Imm8(1))); + const IR::U32 new_c = ir.Or(c, ir.LogicalShiftRight(z, ir.Imm8(1))); + const IR::U32 new_v = ir.And(ir.LogicalShiftRight(not_c, ir.Imm8(1)), + ir.LogicalShiftRight(z, ir.Imm8(2))); + + const IR::U32 result = ir.Or(ir.Or(ir.Or(new_n, new_z), new_c), new_v); + + ir.SetNZCVRaw(result); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/system_flag_manipulation.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/system_flag_manipulation.cpp new file mode 100644 index 0000000000..65ec23e71c --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/system_flag_manipulation.cpp @@ -0,0 +1,62 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2019 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/translate/impl/impl.h" + +namespace Dynarmic::A64 { + +bool TranslatorVisitor::CFINV() { + const IR::U32 nzcv = ir.GetNZCVRaw(); + const IR::U32 result = ir.Eor(nzcv, ir.Imm32(0x20000000)); + + ir.SetNZCVRaw(result); + return true; +} + +bool TranslatorVisitor::RMIF(Imm<6> lsb, Reg Rn, Imm<4> mask) { + const u32 mask_value = mask.ZeroExtend(); + + // If no bits are to be moved into the NZCV bits, then we + // just preserve the bits and do no extra work. + if (mask_value == 0) { + ir.SetNZCVRaw(ir.GetNZCVRaw()); + return true; + } + + const IR::U64 tmp_reg = ir.GetX(Rn); + const IR::U64 rotated = ir.RotateRight(tmp_reg, ir.Imm8(lsb.ZeroExtend<u8>())); + const IR::U32 shifted = ir.LeastSignificantWord(ir.LogicalShiftLeft(rotated, ir.Imm8(28))); + + // On the other hand, if all mask bits are set, then we move all four + // relevant bits in the source register to the NZCV bits. + if (mask_value == 0b1111) { + ir.SetNZCVRaw(shifted); + return true; + } + + // Determine which bits from the PSTATE will be preserved during the operation. + u32 preservation_mask = 0; + if ((mask_value & 0b1000) == 0) { + preservation_mask |= 1U << 31; + } + if ((mask_value & 0b0100) == 0) { + preservation_mask |= 1U << 30; + } + if ((mask_value & 0b0010) == 0) { + preservation_mask |= 1U << 29; + } + if ((mask_value & 0b0001) == 0) { + preservation_mask |= 1U << 28; + } + + const IR::U32 masked = ir.And(shifted, ir.Imm32(~preservation_mask)); + const IR::U32 nzcv = ir.And(ir.GetNZCVRaw(), ir.Imm32(preservation_mask)); + const IR::U32 result = ir.Or(nzcv, masked); + + ir.SetNZCVRaw(result); + return true; +} + +} // namespace Dynarmic::A64 diff --git a/externals/dynarmic/src/dynarmic/frontend/decoder/decoder_detail.h b/externals/dynarmic/src/dynarmic/frontend/decoder/decoder_detail.h new file mode 100644 index 0000000000..cf7d0e64bc --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/decoder/decoder_detail.h @@ -0,0 +1,190 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <algorithm> +#include <array> +#include <tuple> + +#include <mcl/assert.hpp> +#include <mcl/bitsizeof.hpp> +#include <mcl/type_traits/function_info.hpp> + +namespace Dynarmic::Decoder { +namespace detail { + +template<size_t N> +inline consteval std::array<char, N> StringToArray(const char (&str)[N + 1]) { + std::array<char, N> result{}; + for (size_t i = 0; i < N; i++) { + result[i] = str[i]; + } + return result; +} + +/** + * Helper functions for the decoders. + * + * @tparam MatcherT The type of the Matcher to use. + */ +template<class MatcherT> +struct detail { + using opcode_type = typename MatcherT::opcode_type; + using visitor_type = typename MatcherT::visitor_type; + + static constexpr size_t opcode_bitsize = mcl::bitsizeof<opcode_type>; + + /** + * Generates the mask and the expected value after masking from a given bitstring. + * A '0' in a bitstring indicates that a zero must be present at that bit position. + * A '1' in a bitstring indicates that a one must be present at that bit position. 
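+ * For example (illustrative): the 3-bit bitstring "10-" yields mask 0b110 and expect 0b100; + * any character other than '0' or '1' is treated as "don't care".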
+ */ +#ifdef __clang__ + static constexpr auto GetMaskAndExpect(std::array<char, opcode_bitsize> bitstring) { +#else + static consteval auto GetMaskAndExpect(std::array<char, opcode_bitsize> bitstring) { +#endif + const auto one = static_cast<opcode_type>(1); + opcode_type mask = 0, expect = 0; + for (size_t i = 0; i < opcode_bitsize; i++) { + const size_t bit_position = opcode_bitsize - i - 1; + switch (bitstring[i]) { + case '0': + mask |= one << bit_position; + break; + case '1': + expect |= one << bit_position; + mask |= one << bit_position; + break; + default: + // Ignore + break; + } + } + return std::make_tuple(mask, expect); + } + + /** + * Generates the masks and shifts for each argument. + * A '-' in a bitstring indicates that we don't care about that value. + * An argument is specified by a continuous string of the same character. + */ + template<size_t N> + static consteval auto GetArgInfo(std::array<char, opcode_bitsize> bitstring) { + std::array<opcode_type, N> masks = {}; + std::array<size_t, N> shifts = {}; + size_t arg_index = 0; + char ch = 0; + + for (size_t i = 0; i < opcode_bitsize; i++) { + if (bitstring[i] == '0' || bitstring[i] == '1' || bitstring[i] == '-') { + if (ch != 0) { + ch = 0; + arg_index++; + } + } else { + if (ch == 0) { + ch = bitstring[i]; + } else if (ch != bitstring[i]) { + ch = bitstring[i]; + arg_index++; + } + + if constexpr (N > 0) { + const size_t bit_position = opcode_bitsize - i - 1; + + if (arg_index >= N) + throw std::out_of_range("Unexpected field"); + + masks[arg_index] |= static_cast<opcode_type>(1) << bit_position; + shifts[arg_index] = bit_position; + } else { + throw std::out_of_range("Unexpected field"); + } + } + } + +#if !defined(DYNARMIC_IGNORE_ASSERTS) && !defined(__ANDROID__) + // Avoids a MSVC ICE, and avoids Android NDK issue. + ASSERT(std::all_of(masks.begin(), masks.end(), [](auto m) { return m != 0; })); +#endif + + return std::make_tuple(masks, shifts); + } + + /** + * This struct's Make member function generates a lambda which decodes an instruction based on + * the provided arg_masks and arg_shifts. The Visitor member function to call is provided as a + * template argument. + */ + template<typename FnT> + struct VisitorCaller; + +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable : 4800) // forcing value to bool 'true' or 'false' (performance warning) +#endif + template<typename Visitor, typename... Args, typename CallRetT> + struct VisitorCaller<CallRetT (Visitor::*)(Args...)> { + template<size_t... iota> + static auto Make(std::integer_sequence<size_t, iota...>, + CallRetT (Visitor::*const fn)(Args...), + const std::array<opcode_type, sizeof...(iota)> arg_masks, + const std::array<size_t, sizeof...(iota)> arg_shifts) { + static_assert(std::is_same_v<visitor_type, Visitor>, "Member function is not from Matcher's Visitor"); + return [fn, arg_masks, arg_shifts](Visitor& v, opcode_type instruction) { + (void)instruction; + (void)arg_masks; + (void)arg_shifts; + return (v.*fn)(static_cast<Args>((instruction & arg_masks[iota]) >> arg_shifts[iota])...); + }; + } + }; + + template<typename Visitor, typename... Args, typename CallRetT> + struct VisitorCaller<CallRetT (Visitor::*)(Args...) const> { + template<size_t... iota> + static auto Make(std::integer_sequence<size_t, iota...>, + CallRetT (Visitor::*const fn)(Args...) 
const, + const std::array<opcode_type, sizeof...(iota)> arg_masks, + const std::array<size_t, sizeof...(iota)> arg_shifts) { + static_assert(std::is_same_v<visitor_type, const Visitor>, "Member function is not from Matcher's Visitor"); + return [fn, arg_masks, arg_shifts](const Visitor& v, opcode_type instruction) { + (void)instruction; + (void)arg_masks; + (void)arg_shifts; + return (v.*fn)(static_cast<Args>((instruction & arg_masks[iota]) >> arg_shifts[iota])...); + }; + } + }; +#ifdef _MSC_VER +# pragma warning(pop) +#endif + + /** + * Creates a matcher that can match and parse instructions based on bitstring. + * See also: GetMaskAndExpect and GetArgInfo for format of bitstring. + */ + template<auto bitstring, typename FnT> + static auto GetMatcher(FnT fn, const char* const name) { + constexpr size_t args_count = mcl::parameter_count_v<FnT>; + + constexpr auto mask = std::get<0>(GetMaskAndExpect(bitstring)); + constexpr auto expect = std::get<1>(GetMaskAndExpect(bitstring)); + constexpr auto arg_masks = std::get<0>(GetArgInfo<args_count>(bitstring)); + constexpr auto arg_shifts = std::get<1>(GetArgInfo<args_count>(bitstring)); + + using Iota = std::make_index_sequence<args_count>; + + const auto proxy_fn = VisitorCaller<FnT>::Make(Iota(), fn, arg_masks, arg_shifts); + return MatcherT(name, mask, expect, proxy_fn); + } +}; + +#define DYNARMIC_DECODER_GET_MATCHER(MatcherT, fn, name, bitstring) Decoder::detail::detail<MatcherT<V>>::template GetMatcher<bitstring>(&V::fn, name) + +} // namespace detail +} // namespace Dynarmic::Decoder diff --git a/externals/dynarmic/src/dynarmic/frontend/decoder/matcher.h b/externals/dynarmic/src/dynarmic/frontend/decoder/matcher.h new file mode 100644 index 0000000000..adf9556dd4 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/decoder/matcher.h @@ -0,0 +1,76 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <functional> + +#include <mcl/assert.hpp> + +namespace Dynarmic::Decoder { + +/** + * Generic instruction handling construct. + * + * @tparam Visitor An arbitrary visitor type that will be passed through + * to the function being handled. This type must be the + * type of the first parameter in a handler function. + * + * @tparam OpcodeType Type representing an opcode. This must be the + * type of the second parameter in a handler function. + */ +template<typename Visitor, typename OpcodeType> +class Matcher { +public: + using opcode_type = OpcodeType; + using visitor_type = Visitor; + using handler_return_type = typename Visitor::instruction_return_type; + using handler_function = std::function<handler_return_type(Visitor&, opcode_type)>; + + Matcher(const char* const name, opcode_type mask, opcode_type expected, handler_function func) + : name{name}, mask{mask}, expected{expected}, fn{std::move(func)} {} + + /// Gets the name of this type of instruction. + const char* GetName() const { + return name; + } + + /// Gets the mask for this instruction. + opcode_type GetMask() const { + return mask; + } + + /// Gets the expected value after masking for this instruction. + opcode_type GetExpected() const { + return expected; + } + + /** + * Tests to see if the given instruction is the instruction this matcher represents. + * @param instruction The instruction to test + * @returns true if the given instruction matches. 
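+ * For example (illustrative): with mask 0xF0 and expected 0x90, instruction 0x93 matches + * while 0x83 does not, since only the bits selected by the mask are compared.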
+ */ + bool Matches(opcode_type instruction) const { + return (instruction & mask) == expected; + } + + /** + * Calls the corresponding instruction handler on visitor for this type of instruction. + * @param v The visitor to use + * @param instruction The instruction to decode. + */ + handler_return_type call(Visitor& v, opcode_type instruction) const { + ASSERT(Matches(instruction)); + return fn(v, instruction); + } + +private: + const char* name; + opcode_type mask; + opcode_type expected; + handler_function fn; +}; + +} // namespace Dynarmic::Decoder diff --git a/externals/dynarmic/src/dynarmic/frontend/imm.cpp b/externals/dynarmic/src/dynarmic/frontend/imm.cpp new file mode 100644 index 0000000000..c802864df9 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/imm.cpp @@ -0,0 +1,67 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/imm.h" + +#include <mcl/assert.hpp> +#include <mcl/bit/bit_field.hpp> +#include <mcl/stdint.hpp> + +namespace Dynarmic { + +u64 AdvSIMDExpandImm(bool op, Imm<4> cmode, Imm<8> imm8) { + switch (cmode.Bits<1, 3>()) { + case 0b000: + return mcl::bit::replicate_element<u32, u64>(imm8.ZeroExtend<u64>()); + case 0b001: + return mcl::bit::replicate_element<u32, u64>(imm8.ZeroExtend<u64>() << 8); + case 0b010: + return mcl::bit::replicate_element<u32, u64>(imm8.ZeroExtend<u64>() << 16); + case 0b011: + return mcl::bit::replicate_element<u32, u64>(imm8.ZeroExtend<u64>() << 24); + case 0b100: + return mcl::bit::replicate_element<u16, u64>(imm8.ZeroExtend<u64>()); + case 0b101: + return mcl::bit::replicate_element<u16, u64>(imm8.ZeroExtend<u64>() << 8); + case 0b110: + if (!cmode.Bit<0>()) { + return mcl::bit::replicate_element<u32, u64>((imm8.ZeroExtend<u64>() << 8) | mcl::bit::ones<u64>(8)); + } + return mcl::bit::replicate_element<u32, u64>((imm8.ZeroExtend<u64>() << 16) | mcl::bit::ones<u64>(16)); + case 0b111: + if (!cmode.Bit<0>() && !op) { + return mcl::bit::replicate_element<u8, u64>(imm8.ZeroExtend<u64>()); + } + if (!cmode.Bit<0>() && op) { + u64 result = 0; + result |= imm8.Bit<0>() ? mcl::bit::ones<u64>(8) << (0 * 8) : 0; + result |= imm8.Bit<1>() ? mcl::bit::ones<u64>(8) << (1 * 8) : 0; + result |= imm8.Bit<2>() ? mcl::bit::ones<u64>(8) << (2 * 8) : 0; + result |= imm8.Bit<3>() ? mcl::bit::ones<u64>(8) << (3 * 8) : 0; + result |= imm8.Bit<4>() ? mcl::bit::ones<u64>(8) << (4 * 8) : 0; + result |= imm8.Bit<5>() ? mcl::bit::ones<u64>(8) << (5 * 8) : 0; + result |= imm8.Bit<6>() ? mcl::bit::ones<u64>(8) << (6 * 8) : 0; + result |= imm8.Bit<7>() ? mcl::bit::ones<u64>(8) << (7 * 8) : 0; + return result; + } + if (cmode.Bit<0>() && !op) { + u64 result = 0; + result |= imm8.Bit<7>() ? 0x80000000 : 0; + result |= imm8.Bit<6>() ? 0x3E000000 : 0x40000000; + result |= imm8.Bits<0, 5, u64>() << 19; + return mcl::bit::replicate_element<u32, u64>(result); + } + if (cmode.Bit<0>() && op) { + u64 result = 0; + result |= imm8.Bit<7>() ? 0x80000000'00000000 : 0; + result |= imm8.Bit<6>() ? 0x3FC00000'00000000 : 0x40000000'00000000; + result |= imm8.Bits<0, 5, u64>() << 48; + return result; + } + } + UNREACHABLE(); +} + +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/frontend/imm.h b/externals/dynarmic/src/dynarmic/frontend/imm.h new file mode 100644 index 0000000000..7d86abbb61 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/frontend/imm.h @@ -0,0 +1,168 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <type_traits> + +#include <mcl/assert.hpp> +#include <mcl/bit/bit_field.hpp> +#include <mcl/bitsizeof.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/common/math_util.h" + +namespace Dynarmic { + +/** + * Imm represents an immediate value in an AArch32/AArch64 instruction. + * Imm is used during translation as a typesafe way of passing around immediates of fixed sizes. + */ +template<size_t bit_size_> +class Imm { +public: + static constexpr size_t bit_size = bit_size_; + + explicit Imm(u32 value) + : value(value) { + ASSERT_MSG((mcl::bit::get_bits<0, bit_size - 1>(value) == value), "More bits in value than expected"); + } + + template<typename T = u32> + T ZeroExtend() const { + static_assert(mcl::bitsizeof<T> >= bit_size); + return static_cast<T>(value); + } + + template<typename T = s32> + T SignExtend() const { + static_assert(mcl::bitsizeof<T> >= bit_size); + return static_cast<T>(mcl::bit::sign_extend<bit_size, std::make_unsigned_t<T>>(value)); + } + + template<size_t bit> + bool Bit() const { + static_assert(bit < bit_size); + return mcl::bit::get_bit<bit>(value); + } + + template<size_t begin_bit, size_t end_bit, typename T = u32> + T Bits() const { + static_assert(begin_bit <= end_bit && end_bit < bit_size); + static_assert(mcl::bitsizeof<T> >= end_bit - begin_bit + 1); + return static_cast<T>(mcl::bit::get_bits<begin_bit, end_bit>(value)); + } + + bool operator==(Imm other) const { + return value == other.value; + } + + bool operator!=(Imm other) const { + return !operator==(other); + } + + bool operator<(Imm other) const { + return value < other.value; + } + + bool operator<=(Imm other) const { + return value <= other.value; + } + + bool operator>(Imm other) const { + return value > other.value; + } + + bool operator>=(Imm other) const { + return value >= other.value; + } + +private: + static_assert(bit_size != 0, "Cannot have a zero-sized immediate"); + static_assert(bit_size <= 32, "Cannot have an immediate larger than the instruction size"); + + u32 value; +}; + +template<size_t bit_size> +bool operator==(u32 a, Imm<bit_size> b) { + return Imm<bit_size>{a} == b; +} + +template<size_t bit_size> +bool operator==(Imm<bit_size> a, u32 b) { + return Imm<bit_size>{b} == a; +} + +template<size_t bit_size> +bool operator!=(u32 a, Imm<bit_size> b) { + return !operator==(a, b); +} + +template<size_t bit_size> +bool operator!=(Imm<bit_size> a, u32 b) { + return !operator==(a, b); +} + +template<size_t bit_size> +bool operator<(u32 a, Imm<bit_size> b) { + return Imm<bit_size>{a} < b; +} + +template<size_t bit_size> +bool operator<(Imm<bit_size> a, u32 b) { + return a < Imm<bit_size>{b}; +} + +template<size_t bit_size> +bool operator<=(u32 a, Imm<bit_size> b) { + return !operator<(b, a); +} + +template<size_t bit_size> +bool operator<=(Imm<bit_size> a, u32 b) { + return !operator<(b, a); +} + +template<size_t bit_size> +bool operator>(u32 a, Imm<bit_size> b) { + return operator<(b, a); +} + +template<size_t bit_size> +bool operator>(Imm<bit_size> a, u32 b) { + return operator<(b, a); +} + +template<size_t bit_size> +bool operator>=(u32 a, Imm<bit_size> b) { + return !operator<(a, b); +} + +template<size_t bit_size> +bool operator>=(Imm<bit_size> a, u32 b) { + return !operator<(a, b); +} + +/** + * Concatenate immediates together. + * Left to right corresponds to most significant imm to least significant imm. + * This is equivalent to a:b:...:z in ASL. 
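+ * For example (illustrative): concatenate(Imm<1>{1}, Imm<2>{0b01}) yields Imm<3>{0b101}.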
+ */ +template<size_t first_bit_size, size_t... rest_bit_sizes> +auto concatenate(Imm<first_bit_size> first, Imm<rest_bit_sizes>... rest) { + if constexpr (sizeof...(rest) == 0) { + return first; + } else { + const auto concat_rest = concatenate(rest...); + const u32 value = (first.ZeroExtend() << concat_rest.bit_size) | concat_rest.ZeroExtend(); + return Imm<Common::Sum(first_bit_size, rest_bit_sizes...)>{value}; + } +} + +/// Expands an Advanced SIMD modified immediate. +u64 AdvSIMDExpandImm(bool op, Imm<4> cmode, Imm<8> imm8); + +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/interface/A32/a32.h b/externals/dynarmic/src/dynarmic/interface/A32/a32.h new file mode 100644 index 0000000000..e1c5b40fd3 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/interface/A32/a32.h @@ -0,0 +1,109 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <cstddef> +#include <cstdint> +#include <memory> +#include <string> +#include <vector> + +#include "dynarmic/interface/A32/config.h" +#include "dynarmic/interface/halt_reason.h" + +namespace Dynarmic { +namespace A32 { + +class Jit final { +public: + explicit Jit(UserConfig conf); + ~Jit(); + + /** + * Runs the emulated CPU. + * Cannot be recursively called. + */ + HaltReason Run(); + + /** + * Steps the emulated CPU. + * Cannot be recursively called. + */ + HaltReason Step(); + + /** + * Clears the code cache of all compiled code. + * Can be called at any time. Halts execution if called within a callback. + */ + void ClearCache(); + + /** + * Invalidate the code cache at a range of addresses. + * @param start_address The starting address of the range to invalidate. + * @param length The length (in bytes) of the range to invalidate. + */ + void InvalidateCacheRange(std::uint32_t start_address, std::size_t length); + + /** + * Reset CPU state to state at startup. Does not clear code cache. + * Cannot be called from a callback. + */ + void Reset(); + + /** + * Stops execution in Jit::Run. + */ + void HaltExecution(HaltReason hr = HaltReason::UserDefined1); + + /** + * Clears a halt reason from flags. + * Warning: Only use this if you're sure this won't introduce races. + */ + void ClearHalt(HaltReason hr = HaltReason::UserDefined1); + + /// View and modify registers. + std::array<std::uint32_t, 16>& Regs(); + const std::array<std::uint32_t, 16>& Regs() const; + std::array<std::uint32_t, 64>& ExtRegs(); + const std::array<std::uint32_t, 64>& ExtRegs() const; + + /// View and modify CPSR. + std::uint32_t Cpsr() const; + void SetCpsr(std::uint32_t value); + + /// View and modify FPSCR. + std::uint32_t Fpscr() const; + void SetFpscr(std::uint32_t value); + + /// Clears exclusive state for this core. + void ClearExclusiveState(); + + /** + * Returns true if Jit::Run was called but hasn't returned yet. + * i.e.: We're in a callback. + */ + bool IsExecuting() const { + return is_executing; + } + + /// Debugging: Dump a disassembly of all compiled code to the console. + void DumpDisassembly() const; + + /** + * Disassemble the instructions following the current pc and return + * the resulting instructions as a vector of their string representations.
+ */ + std::vector<std::string> Disassemble() const; + +private: + bool is_executing = false; + + struct Impl; + std::unique_ptr<Impl> impl; +}; + +} // namespace A32 +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/interface/A32/arch_version.h b/externals/dynarmic/src/dynarmic/interface/A32/arch_version.h new file mode 100644 index 0000000000..240e40ee4c --- /dev/null +++ b/externals/dynarmic/src/dynarmic/interface/A32/arch_version.h @@ -0,0 +1,23 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2020 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +namespace Dynarmic { +namespace A32 { + +enum class ArchVersion { + v3, + v4, + v4T, + v5TE, + v6K, + v6T2, + v7, + v8, +}; + +} // namespace A32 +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/interface/A32/config.h b/externals/dynarmic/src/dynarmic/interface/A32/config.h new file mode 100644 index 0000000000..a3c2aa159c --- /dev/null +++ b/externals/dynarmic/src/dynarmic/interface/A32/config.h @@ -0,0 +1,246 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <array> +#include <cstddef> +#include <cstdint> +#include <memory> +#include <optional> + +#include "dynarmic/frontend/A32/translate/translate_callbacks.h" +#include "dynarmic/interface/A32/arch_version.h" +#include "dynarmic/interface/optimization_flags.h" + +namespace Dynarmic { +class ExclusiveMonitor; +} // namespace Dynarmic + +namespace Dynarmic { +namespace A32 { + +using VAddr = std::uint32_t; + +class Coprocessor; + +enum class Exception { + /// An UndefinedFault occurred due to executing an instruction with an unallocated encoding + UndefinedInstruction, + /// An unpredictable instruction is to be executed. Implementation-defined behaviour should now happen. + /// This behaviour is up to the user of this library to define. + UnpredictableInstruction, + /// A decode error occurred when decoding this instruction. This should never happen. + DecodeError, + /// A SEV instruction was executed. The event register of all PEs should be set. (Hint instruction.) + SendEvent, + /// A SEVL instruction was executed. The event register of the current PE should be set. (Hint instruction.) + SendEventLocal, + /// A WFI instruction was executed. You may now enter a low-power state. (Hint instruction.) + WaitForInterrupt, + /// A WFE instruction was executed. You may now enter a low-power state if the event register is clear. (Hint instruction.) + WaitForEvent, + /// A YIELD instruction was executed. (Hint instruction.) + Yield, + /// A BKPT instruction was executed. + Breakpoint, + /// A PLD instruction was executed. (Hint instruction.) + PreloadData, + /// A PLDW instruction was executed. (Hint instruction.) + PreloadDataWithIntentToWrite, + /// A PLI instruction was executed. (Hint instruction.) + PreloadInstruction, + /// Attempted to execute a code block at an address for which MemoryReadCode returned std::nullopt. + /// (Intended to be used to emulate memory protection faults.) + NoExecuteFault, +}; + +/// These function pointers may be inserted into compiled code. +struct UserCallbacks : public TranslateCallbacks { + virtual ~UserCallbacks() = default; + + // All reads through this callback are 4-byte aligned. + // Memory must be interpreted as little endian.
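+ // Returning std::nullopt from this callback signals Exception::NoExecuteFault for the block + // (see the Exception enum above); the default implementation simply forwards to MemoryRead32.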
+ std::optional<std::uint32_t> MemoryReadCode(VAddr vaddr) override { return MemoryRead32(vaddr); } + + // This function is called before the instruction at pc is read. + // IR code can be emitted by the callee prior to instruction handling. + // By returning false the callee precludes the translation of the instruction; + // in such a case the callee is responsible for setting the terminal. + bool PreCodeReadHook(bool /*is_thumb*/, VAddr /*pc*/, A32::IREmitter& /*ir*/) override { return true; } + + // This function is called before the instruction at pc is interpreted. + // IR code can be emitted by the callee prior to translation of the instruction. + void PreCodeTranslationHook(bool /*is_thumb*/, VAddr /*pc*/, A32::IREmitter& /*ir*/) override {} + + // Reads through these callbacks may not be aligned. + // Memory must be interpreted as if ENDIANSTATE == 0; endianness will be corrected by the JIT. + virtual std::uint8_t MemoryRead8(VAddr vaddr) = 0; + virtual std::uint16_t MemoryRead16(VAddr vaddr) = 0; + virtual std::uint32_t MemoryRead32(VAddr vaddr) = 0; + virtual std::uint64_t MemoryRead64(VAddr vaddr) = 0; + + // Writes through these callbacks may not be aligned. + virtual void MemoryWrite8(VAddr vaddr, std::uint8_t value) = 0; + virtual void MemoryWrite16(VAddr vaddr, std::uint16_t value) = 0; + virtual void MemoryWrite32(VAddr vaddr, std::uint32_t value) = 0; + virtual void MemoryWrite64(VAddr vaddr, std::uint64_t value) = 0; + + // Writes through these callbacks may not be aligned. + virtual bool MemoryWriteExclusive8(VAddr /*vaddr*/, std::uint8_t /*value*/, std::uint8_t /*expected*/) { return false; } + virtual bool MemoryWriteExclusive16(VAddr /*vaddr*/, std::uint16_t /*value*/, std::uint16_t /*expected*/) { return false; } + virtual bool MemoryWriteExclusive32(VAddr /*vaddr*/, std::uint32_t /*value*/, std::uint32_t /*expected*/) { return false; } + virtual bool MemoryWriteExclusive64(VAddr /*vaddr*/, std::uint64_t /*value*/, std::uint64_t /*expected*/) { return false; } + + // If this callback returns true, the JIT will assume MemoryRead* callbacks will always + // return the same value at any point in time for this vaddr. The JIT may use this information + // in optimizations. + // A conservative implementation that always returns false is safe. + virtual bool IsReadOnlyMemory(VAddr /*vaddr*/) { return false; } + + /// The interpreter must execute exactly num_instructions starting from PC. + virtual void InterpreterFallback(VAddr pc, size_t num_instructions) = 0; + + // This callback is called whenever an SVC instruction is executed. + virtual void CallSVC(std::uint32_t swi) = 0; + + virtual void ExceptionRaised(VAddr pc, Exception exception) = 0; + + virtual void InstructionSynchronizationBarrierRaised() {} + + // Timing-related callbacks + // 'ticks' ticks have passed + virtual void AddTicks(std::uint64_t ticks) = 0; + // How many more ticks am I allowed to execute? + virtual std::uint64_t GetTicksRemaining() = 0; + // How many ticks should this instruction take to execute? + std::uint64_t GetTicksForCode(bool /*is_thumb*/, VAddr /*vaddr*/, std::uint32_t /*instruction*/) override { return 1; } +}; + +struct UserConfig { + UserCallbacks* callbacks; + + size_t processor_id = 0; + ExclusiveMonitor* global_monitor = nullptr; + + /// Select the architecture version to use. + /// There are minor behavioural differences between versions.
+ ArchVersion arch_version = ArchVersion::v8; + + /// This selects other optimizations that can't otherwise be disabled by setting other + /// configuration options. This includes: + /// - IR optimizations + /// - Block linking optimizations + /// - RSB optimizations + /// This is intended to be used for debugging. + OptimizationFlag optimizations = all_safe_optimizations; + + bool HasOptimization(OptimizationFlag f) const { + if (!unsafe_optimizations) { + f &= all_safe_optimizations; + } + return (f & optimizations) != no_optimizations; + } + + /// This enables unsafe optimizations that reduce emulation accuracy in favour of speed. + /// For safety, in order to enable unsafe optimizations you have to set BOTH this flag + /// AND the appropriate flag bits above. + /// The preferred and tested mode for this library is with unsafe optimizations disabled. + bool unsafe_optimizations = false; + + // Page Table + // The page table is used for faster memory access. If an entry in the table is nullptr, + // the JIT will fall back to calling the MemoryRead*/MemoryWrite* callbacks. + static constexpr std::size_t PAGE_BITS = 12; + static constexpr std::size_t NUM_PAGE_TABLE_ENTRIES = 1 << (32 - PAGE_BITS); + std::array<std::uint8_t*, NUM_PAGE_TABLE_ENTRIES>* page_table = nullptr; + /// Determines if the pointer in the page_table shall be offset locally or globally. + /// 'false' will access page_table[addr >> bits][addr & mask] + /// 'true' will access page_table[addr >> bits][addr] + /// Note: page_table[addr >> bits] will still be checked to verify active pages. + /// So there might be wrongly faulted pages which map to nullptr. + /// This can be avoided by carefully allocating the memory region. + bool absolute_offset_page_table = false; + /// Masks out the first N bits in host pointers from the page table. + /// The intention behind this is to allow users of Dynarmic to pack attributes in the + /// same integer and update the pointer attribute pair atomically. + /// If the configured value is 3, all pointers will be forcefully aligned to 8 bytes. + int page_table_pointer_mask_bits = 0; + /// Determines if we should detect memory accesses via page_table that are + /// misaligned. Accesses that straddle page boundaries will fall back to the relevant + /// memory callback. + /// This value should be the required access sizes this applies to ORed together. + /// To detect any access, use: 8 | 16 | 32 | 64. + std::uint8_t detect_misaligned_access_via_page_table = 0; + /// Determines if the above option only triggers when the misalignment straddles a + /// page boundary. + bool only_detect_misalignment_via_page_table_on_page_boundary = false; + + // Fastmem Pointer + // This should point to the beginning of a 4GB address space which is arranged just like + // what you wish for emulated memory to be. If the host page faults on an address, the JIT + // will fall back to calling the MemoryRead*/MemoryWrite* callbacks. + void* fastmem_pointer = nullptr; + /// Determines if instructions that pagefault should cause recompilation of that block + /// with fastmem disabled. + /// Recompiled code will use the page_table if this is available; otherwise memory + /// accesses will hit the memory callbacks. + bool recompile_on_fastmem_failure = true; + + /// Determines if we should use the above fastmem_pointer for exclusive reads and + /// writes. On x64, dynarmic currently relies on x64 cmpxchg semantics which may not + /// provide fully accurate emulation.
+ bool fastmem_exclusive_access = false; + /// Determines if exclusive access instructions that pagefault should cause + /// recompilation of that block with fastmem disabled. Recompiled code will use memory + /// callbacks. + bool recompile_on_exclusive_fastmem_failure = true; + + // Coprocessors + std::array<std::shared_ptr<Coprocessor>, 16> coprocessors{}; + + /// When set to true, UserCallbacks::InstructionSynchronizationBarrierRaised will be + /// called when an ISB instruction is executed. + /// When set to false, ISB will be treated as a NOP instruction. + bool hook_isb = false; + + /// Hint instructions would cause ExceptionRaised to be called with the appropriate + /// argument. + bool hook_hint_instructions = false; + + /// This option relates to translation. Generally when we run into an unpredictable + /// instruction the ExceptionRaised callback is called. If this is true, we define + /// definite behaviour for some unpredictable instructions. + bool define_unpredictable_behaviour = false; + + /// HACK: + /// This tells the translator a wall clock will be used, thus allowing it + /// to avoid writing certain unnecessary code only needed for cycle timers. + bool wall_clock_cntpct = false; + + /// This allows accurately emulating protection fault handlers. If true, we check + /// for exit after every data memory access by the emulated program. + bool check_halt_on_memory_access = false; + + /// This option allows you to disable cycle counting. If this is set to false, + /// AddTicks and GetTicksRemaining are never called, and no cycle counting is done. + bool enable_cycle_counting = true; + + /// This option relates to the CPSR.E flag. Enabling this option disables modification + /// of CPSR.E by the emulated program, forcing it to 0. + /// NOTE: Calling Jit::SetCpsr with CPSR.E=1 while this option is enabled may result + /// in unusual behavior. + bool always_little_endian = false; + + // Minimum size is about 8MiB. Maximum size is about 128MiB (arm64 host) or 2GiB (x64 host). + // Maximum size is limited by the maximum length of an x86_64 / arm64 jump. + size_t code_cache_size = 128 * 1024 * 1024; // bytes + + /// Internal use only + bool very_verbose_debugging_output = false; +}; + +} // namespace A32 +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/interface/A32/coprocessor.h b/externals/dynarmic/src/dynarmic/interface/A32/coprocessor.h new file mode 100644 index 0000000000..7ee2e96ee3 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/interface/A32/coprocessor.h @@ -0,0 +1,110 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <cstdint> +#include <optional> +#include <variant> + +#include "dynarmic/interface/A32/coprocessor_util.h" + +namespace Dynarmic { +namespace A32 { + +class Jit; + +class Coprocessor { +public: + virtual ~Coprocessor() = default; + + struct Callback { + /** + * @param jit CPU state + * @param user_arg Set to Callback::user_arg at runtime + * @param arg0 Purpose of this argument depends on type of callback. + * @param arg1 Purpose of this argument depends on type of callback. + * @return Purpose of return value depends on type of callback. + */ + std::uint64_t (*function)(void* user_arg, std::uint32_t arg0, std::uint32_t arg1); + /// If std::nullopt, function will be called with a user_arg parameter containing garbage.
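+ /// Example (illustrative, hypothetical names): with Callback{&MyMcrHandler, this}, compiled code + /// calls MyMcrHandler(this, arg0, arg1), with arg0/arg1 as described by each Compile* function below.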
+ std::optional<void*> user_arg; + }; + + /** + * std::monostate: coprocessor exception will be compiled + * Callback: a call to the Callback will be compiled + * std::uint32_t*: a write/read to that memory address will be compiled + */ + using CallbackOrAccessOneWord = std::variant<std::monostate, Callback, std::uint32_t*>; + + /** + * std::monostate: coprocessor exception will be compiled + * Callback: a call to the Callback will be compiled + * std::array<std::uint32_t*, 2>: a write/read to those memory addresses will be compiled + */ + using CallbackOrAccessTwoWords = std::variant<std::monostate, Callback, std::array<std::uint32_t*, 2>>; + + /** + * Called when compiling CDP or CDP2 for this coprocessor. + * A return value of std::nullopt will cause a coprocessor exception to be compiled. + * arg0, arg1 and return value of callback are ignored. + */ + virtual std::optional<Callback> CompileInternalOperation(bool two, unsigned opc1, CoprocReg CRd, CoprocReg CRn, CoprocReg CRm, unsigned opc2) = 0; + + /** + * Called when compiling MCR or MCR2 for this coprocessor. + * A return value of std::monostate will cause a coprocessor exception to be compiled. + * arg0 of the callback will contain the word sent to the coprocessor. + * arg1 and return value of the callback are ignored. + */ + virtual CallbackOrAccessOneWord CompileSendOneWord(bool two, unsigned opc1, CoprocReg CRn, CoprocReg CRm, unsigned opc2) = 0; + + /** + * Called when compiling MCRR or MCRR2 for this coprocessor. + * A return value of std::monostate will cause a coprocessor exception to be compiled. + * arg0 and arg1 of the callback will contain the words sent to the coprocessor. + * The return value of the callback is ignored. + */ + virtual CallbackOrAccessTwoWords CompileSendTwoWords(bool two, unsigned opc, CoprocReg CRm) = 0; + + /** + * Called when compiling MRC or MRC2 for this coprocessor. + * A return value of std::monostate will cause a coprocessor exception to be compiled. + * The return value of the callback should contain word from coprocessor. + * The low word of the return value will be stored in Rt. + * arg0 and arg1 of the callback are ignored. + */ + virtual CallbackOrAccessOneWord CompileGetOneWord(bool two, unsigned opc1, CoprocReg CRn, CoprocReg CRm, unsigned opc2) = 0; + + /** + * Called when compiling MRRC or MRRC2 for this coprocessor. + * A return value of std::monostate will cause a coprocessor exception to be compiled. + * The return value of the callback should contain words from coprocessor. + * The low word of the return value will be stored in Rt. + * The high word of the return value will be stored in Rt2. + * arg0 and arg1 of the callback are ignored. + */ + virtual CallbackOrAccessTwoWords CompileGetTwoWords(bool two, unsigned opc, CoprocReg CRm) = 0; + + /** + * Called when compiling LDC or LDC2 for this coprocessor. + * A return value of std::nullopt will cause a coprocessor exception to be compiled. + * arg0 of the callback will contain the start address. + * arg1 and return value of the callback are ignored. + */ + virtual std::optional<Callback> CompileLoadWords(bool two, bool long_transfer, CoprocReg CRd, std::optional<std::uint8_t> option) = 0; + + /** + * Called when compiling STC or STC2 for this coprocessor. + * A return value of std::nullopt will cause a coprocessor exception to be compiled. + * arg0 of the callback will contain the start address. + * arg1 and return value of the callback are ignored. 
+ */ + virtual std::optional<Callback> CompileStoreWords(bool two, bool long_transfer, CoprocReg CRd, std::optional<std::uint8_t> option) = 0; +}; + +} // namespace A32 +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/interface/A32/coprocessor_util.h b/externals/dynarmic/src/dynarmic/interface/A32/coprocessor_util.h new file mode 100644 index 0000000000..ed695d17e9 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/interface/A32/coprocessor_util.h @@ -0,0 +1,31 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +namespace Dynarmic { +namespace A32 { + +enum class CoprocReg { + C0, + C1, + C2, + C3, + C4, + C5, + C6, + C7, + C8, + C9, + C10, + C11, + C12, + C13, + C14, + C15 +}; + +} // namespace A32 +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/interface/A32/disassembler.h b/externals/dynarmic/src/dynarmic/interface/A32/disassembler.h new file mode 100644 index 0000000000..54058bf5ee --- /dev/null +++ b/externals/dynarmic/src/dynarmic/interface/A32/disassembler.h @@ -0,0 +1,18 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <cstdint> +#include <string> + +namespace Dynarmic { +namespace A32 { + +std::string DisassembleArm(std::uint32_t instruction); +std::string DisassembleThumb16(std::uint16_t instruction); + +} // namespace A32 +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/interface/A64/a64.h b/externals/dynarmic/src/dynarmic/interface/A64/a64.h new file mode 100644 index 0000000000..a150da8448 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/interface/A64/a64.h @@ -0,0 +1,137 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <array> +#include <cstddef> +#include <cstdint> +#include <memory> +#include <string> +#include <vector> + +#include "dynarmic/interface/A64/config.h" +#include "dynarmic/interface/halt_reason.h" + +namespace Dynarmic { +namespace A64 { + +class Jit final { +public: + explicit Jit(UserConfig conf); + ~Jit(); + + /** + * Runs the emulated CPU. + * Cannot be recursively called. + */ + HaltReason Run(); + + /** + * Step the emulated CPU for one instruction. + * Cannot be recursively called. + */ + HaltReason Step(); + + /** + * Clears the code cache of all compiled code. + * Can be called at any time. Halts execution if called within a callback. + */ + void ClearCache(); + + /** + * Invalidate the code cache at a range of addresses. + * @param start_address The starting address of the range to invalidate. + * @param length The length (in bytes) of the range to invalidate. + */ + void InvalidateCacheRange(std::uint64_t start_address, std::size_t length); + + /** + * Reset CPU state to state at startup. Does not clear code cache. + * Cannot be called from a callback. + */ + void Reset(); + + /** + * Stops execution in Jit::Run. + */ + void HaltExecution(HaltReason hr = HaltReason::UserDefined1); + + /** + * Clears a halt reason from flags. + * Warning: Only use this if you're sure this won't introduce races. 
+ */ + void ClearHalt(HaltReason hr = HaltReason::UserDefined1); + + /// Read Stack Pointer + std::uint64_t GetSP() const; + /// Modify Stack Pointer + void SetSP(std::uint64_t value); + + /// Read Program Counter + std::uint64_t GetPC() const; + /// Modify Program Counter + void SetPC(std::uint64_t value); + + /// Read general-purpose register. + std::uint64_t GetRegister(std::size_t index) const; + /// Modify general-purpose register. + void SetRegister(size_t index, std::uint64_t value); + + /// Read all general-purpose registers. + std::array<std::uint64_t, 31> GetRegisters() const; + /// Modify all general-purpose registers. + void SetRegisters(const std::array<std::uint64_t, 31>& value); + + /// Read floating point and SIMD register. + Vector GetVector(std::size_t index) const; + /// Modify floating point and SIMD register. + void SetVector(std::size_t index, Vector value); + + /// Read all floating point and SIMD registers. + std::array<Vector, 32> GetVectors() const; + /// Modify all floating point and SIMD registers. + void SetVectors(const std::array<Vector, 32>& value); + + /// View FPCR. + std::uint32_t GetFpcr() const; + /// Modify FPCR. + void SetFpcr(std::uint32_t value); + + /// View FPSR. + std::uint32_t GetFpsr() const; + /// Modify FPSR. + void SetFpsr(std::uint32_t value); + + /// View PSTATE + std::uint32_t GetPstate() const; + /// Modify PSTATE + void SetPstate(std::uint32_t value); + + /// Clears exclusive state for this core. + void ClearExclusiveState(); + + /** + * Returns true if Jit::Run was called but hasn't returned yet. + * i.e.: We're in a callback. + */ + bool IsExecuting() const; + + /// Debugging: Dump a disassembly all of compiled code to the console. + void DumpDisassembly() const; + + /* + * Disassemble the instructions following the current pc and return + * the resulting instructions as a vector of their string representations. + */ + std::vector<std::string> Disassemble() const; + +private: + struct Impl; + std::unique_ptr<Impl> impl; +}; + +} // namespace A64 +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/interface/A64/config.h b/externals/dynarmic/src/dynarmic/interface/A64/config.h new file mode 100644 index 0000000000..b10c65bc6f --- /dev/null +++ b/externals/dynarmic/src/dynarmic/interface/A64/config.h @@ -0,0 +1,297 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <array> +#include <cstddef> +#include <cstdint> +#include <memory> +#include <optional> + +#include "dynarmic/interface/optimization_flags.h" + +namespace Dynarmic { +class ExclusiveMonitor; +} // namespace Dynarmic + +namespace Dynarmic { +namespace A64 { + +using VAddr = std::uint64_t; + +using Vector = std::array<std::uint64_t, 2>; +static_assert(sizeof(Vector) == sizeof(std::uint64_t) * 2, "Vector must be 128 bits in size"); + +enum class Exception { + /// An UndefinedFault occured due to executing instruction with an unallocated encoding + UnallocatedEncoding, + /// An UndefinedFault occured due to executing instruction containing a reserved value + ReservedValue, + /// An unpredictable instruction is to be executed. Implementation-defined behaviour should now happen. + /// This behaviour is up to the user of this library to define. + /// Note: Constraints on unpredictable behaviour are specified in the ARMv8 ARM. + UnpredictableInstruction, + /// A WFI instruction was executed. You may now enter a low-power state. (Hint instruction.) 
+ WaitForInterrupt, + /// A WFE instruction was executed. You may now enter a low-power state if the event register is clear. (Hint instruction.) + WaitForEvent, + /// A SEV instruction was executed. The event register of all PEs should be set. (Hint instruction.) + SendEvent, + /// A SEVL instruction was executed. The event register of the current PE should be set. (Hint instruction.) + SendEventLocal, + /// A YIELD instruction was executed. (Hint instruction.) + Yield, + /// A BRK instruction was executed. (Hint instruction.) + Breakpoint, + /// Attempted to execute a code block at an address for which MemoryReadCode returned std::nullopt. + /// (Intended to be used to emulate memory protection faults.) + NoExecuteFault, +}; + +enum class DataCacheOperation { + /// DC CISW + CleanAndInvalidateBySetWay, + /// DC CIVAC + CleanAndInvalidateByVAToPoC, + /// DC CSW + CleanBySetWay, + /// DC CVAC + CleanByVAToPoC, + /// DC CVAU + CleanByVAToPoU, + /// DC CVAP + CleanByVAToPoP, + /// DC ISW + InvalidateBySetWay, + /// DC IVAC + InvalidateByVAToPoC, + /// DC ZVA + ZeroByVA, +}; + +enum class InstructionCacheOperation { + /// IC IVAU + InvalidateByVAToPoU, + /// IC IALLU + InvalidateAllToPoU, + /// IC IALLUIS + InvalidateAllToPoUInnerSharable +}; + +struct UserCallbacks { + virtual ~UserCallbacks() = default; + + // All reads through this callback are 4-byte aligned. + // Memory must be interpreted as little endian. + virtual std::optional<std::uint32_t> MemoryReadCode(VAddr vaddr) { return MemoryRead32(vaddr); } + + // Reads through these callbacks may not be aligned. + virtual std::uint8_t MemoryRead8(VAddr vaddr) = 0; + virtual std::uint16_t MemoryRead16(VAddr vaddr) = 0; + virtual std::uint32_t MemoryRead32(VAddr vaddr) = 0; + virtual std::uint64_t MemoryRead64(VAddr vaddr) = 0; + virtual Vector MemoryRead128(VAddr vaddr) = 0; + + // Writes through these callbacks may not be aligned. + virtual void MemoryWrite8(VAddr vaddr, std::uint8_t value) = 0; + virtual void MemoryWrite16(VAddr vaddr, std::uint16_t value) = 0; + virtual void MemoryWrite32(VAddr vaddr, std::uint32_t value) = 0; + virtual void MemoryWrite64(VAddr vaddr, std::uint64_t value) = 0; + virtual void MemoryWrite128(VAddr vaddr, Vector value) = 0; + + // Writes through these callbacks may not be aligned. + virtual bool MemoryWriteExclusive8(VAddr /*vaddr*/, std::uint8_t /*value*/, std::uint8_t /*expected*/) { return false; } + virtual bool MemoryWriteExclusive16(VAddr /*vaddr*/, std::uint16_t /*value*/, std::uint16_t /*expected*/) { return false; } + virtual bool MemoryWriteExclusive32(VAddr /*vaddr*/, std::uint32_t /*value*/, std::uint32_t /*expected*/) { return false; } + virtual bool MemoryWriteExclusive64(VAddr /*vaddr*/, std::uint64_t /*value*/, std::uint64_t /*expected*/) { return false; } + virtual bool MemoryWriteExclusive128(VAddr /*vaddr*/, Vector /*value*/, Vector /*expected*/) { return false; } + + // If this callback returns true, the JIT will assume MemoryRead* callbacks will always + // return the same value at any point in time for this vaddr. The JIT may use this information + // in optimizations. + // A conservative implementation that always returns false is safe. + virtual bool IsReadOnlyMemory(VAddr /*vaddr*/) { return false; } + + /// The interpreter must execute exactly num_instructions starting from PC. + virtual void InterpreterFallback(VAddr pc, size_t num_instructions) = 0; + + // This callback is called whenever a SVC instruction is executed. 
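    /*
     * A minimal sketch of how an embedder might implement this interface, assuming a
     * hypothetical flat `memory` buffer and a simple tick budget; the names below are
     * illustrative and not part of this header:
     *
     *   struct MyCallbacks final : Dynarmic::A64::UserCallbacks {
     *       std::uint8_t* memory = nullptr;
     *       std::uint64_t ticks_left = 0;
     *
     *       std::uint32_t MemoryRead32(Dynarmic::A64::VAddr vaddr) override {
     *           std::uint32_t v;
     *           std::memcpy(&v, memory + vaddr, sizeof(v));  // little-endian host assumed
     *           return v;
     *       }
     *       void MemoryWrite32(Dynarmic::A64::VAddr vaddr, std::uint32_t value) override {
     *           std::memcpy(memory + vaddr, &value, sizeof(value));
     *       }
     *       // The remaining pure-virtual callbacks (other reads/writes, CallSVC,
     *       // ExceptionRaised, InterpreterFallback, GetCNTPCT, ...) must also be overridden.
     *
     *       void AddTicks(std::uint64_t ticks) override { ticks_left -= std::min(ticks, ticks_left); }
     *       std::uint64_t GetTicksRemaining() override { return ticks_left; }
     *   };
     */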
+ virtual void CallSVC(std::uint32_t swi) = 0; + + virtual void ExceptionRaised(VAddr pc, Exception exception) = 0; + virtual void DataCacheOperationRaised(DataCacheOperation /*op*/, VAddr /*value*/) {} + virtual void InstructionCacheOperationRaised(InstructionCacheOperation /*op*/, VAddr /*value*/) {} + virtual void InstructionSynchronizationBarrierRaised() {} + + // Timing-related callbacks + // ticks ticks have passed + virtual void AddTicks(std::uint64_t ticks) = 0; + // How many more ticks am I allowed to execute? + virtual std::uint64_t GetTicksRemaining() = 0; + // Get value in the emulated counter-timer physical count register. + virtual std::uint64_t GetCNTPCT() = 0; +}; + +struct UserConfig { + UserCallbacks* callbacks; + + size_t processor_id = 0; + ExclusiveMonitor* global_monitor = nullptr; + + /// This selects other optimizations than can't otherwise be disabled by setting other + /// configuration options. This includes: + /// - IR optimizations + /// - Block linking optimizations + /// - RSB optimizations + /// This is intended to be used for debugging. + OptimizationFlag optimizations = all_safe_optimizations; + + bool HasOptimization(OptimizationFlag f) const { + if (!unsafe_optimizations) { + f &= all_safe_optimizations; + } + return (f & optimizations) != no_optimizations; + } + + /// This enables unsafe optimizations that reduce emulation accuracy in favour of speed. + /// For safety, in order to enable unsafe optimizations you have to set BOTH this flag + /// AND the appropriate flag bits above. + /// The prefered and tested mode for this library is with unsafe optimizations disabled. + bool unsafe_optimizations = false; + + /// When set to true, UserCallbacks::DataCacheOperationRaised will be called when any + /// data cache instruction is executed. Notably DC ZVA will not implicitly do anything. + /// When set to false, UserCallbacks::DataCacheOperationRaised will never be called. + /// Executing DC ZVA in this mode will result in zeros being written to memory. + bool hook_data_cache_operations = false; + + /// When set to true, UserCallbacks::InstructionSynchronizationBarrierRaised will be + /// called when an ISB instruction is executed. + /// When set to false, ISB will be treated as a NOP instruction. + bool hook_isb = false; + + /// When set to true, UserCallbacks::ExceptionRaised will be called when any hint + /// instruction is executed. + bool hook_hint_instructions = false; + + /// Counter-timer frequency register. The value of the register is not interpreted by + /// dynarmic. + std::uint32_t cntfrq_el0 = 600000000; + + /// CTR_EL0<27:24> is log2 of the cache writeback granule in words. + /// CTR_EL0<23:20> is log2 of the exclusives reservation granule in words. + /// CTR_EL0<19:16> is log2 of the smallest data/unified cacheline in words. + /// CTR_EL0<15:14> is the level 1 instruction cache policy. + /// CTR_EL0<3:0> is log2 of the smallest instruction cacheline in words. + std::uint32_t ctr_el0 = 0x8444c004; + + /// DCZID_EL0<3:0> is log2 of the block size in words + /// DCZID_EL0<4> is 0 if the DC ZVA instruction is permitted. + std::uint32_t dczid_el0 = 4; + + /// Pointer to where TPIDRRO_EL0 is stored. This pointer will be inserted into + /// emitted code. + const std::uint64_t* tpidrro_el0 = nullptr; + + /// Pointer to where TPIDR_EL0 is stored. This pointer will be inserted into + /// emitted code. + std::uint64_t* tpidr_el0 = nullptr; + + /// Pointer to the page table which we can use for direct page table access. 
+ /// If an entry in page_table is null, the relevant memory callback will be called. + /// If page_table is nullptr, all memory accesses hit the memory callbacks. + void** page_table = nullptr; + /// Declares how many valid address bits are there in virtual addresses. + /// Determines the size of page_table. Valid values are between 12 and 64 inclusive. + /// This is only used if page_table is not nullptr. + size_t page_table_address_space_bits = 36; + /// Masks out the first N bits in host pointers from the page table. + /// The intention behind this is to allow users of Dynarmic to pack attributes in the + /// same integer and update the pointer attribute pair atomically. + /// If the configured value is 3, all pointers will be forcefully aligned to 8 bytes. + int page_table_pointer_mask_bits = 0; + /// Determines what happens if the guest accesses an entry that is off the end of the + /// page table. If true, Dynarmic will silently mirror page_table's address space. If + /// false, accessing memory outside of page_table bounds will result in a call to the + /// relevant memory callback. + /// This is only used if page_table is not nullptr. + bool silently_mirror_page_table = true; + /// Determines if the pointer in the page_table shall be offseted locally or globally. + /// 'false' will access page_table[addr >> bits][addr & mask] + /// 'true' will access page_table[addr >> bits][addr] + /// Note: page_table[addr >> bits] will still be checked to verify active pages. + /// So there might be wrongly faulted pages which maps to nullptr. + /// This can be avoided by carefully allocating the memory region. + bool absolute_offset_page_table = false; + /// Determines if we should detect memory accesses via page_table that straddle are + /// misaligned. Accesses that straddle page boundaries will fallback to the relevant + /// memory callback. + /// This value should be the required access sizes this applies to ORed together. + /// To detect any access, use: 8 | 16 | 32 | 64 | 128. + std::uint8_t detect_misaligned_access_via_page_table = 0; + /// Determines if the above option only triggers when the misalignment straddles a + /// page boundary. + bool only_detect_misalignment_via_page_table_on_page_boundary = false; + + /// Fastmem Pointer + /// This should point to the beginning of a 2^page_table_address_space_bits bytes + /// address space which is in arranged just like what you wish for emulated memory to + /// be. If the host page faults on an address, the JIT will fallback to calling the + /// MemoryRead*/MemoryWrite* callbacks. + void* fastmem_pointer = nullptr; + /// Determines if instructions that pagefault should cause recompilation of that block + /// with fastmem disabled. + /// Recompiled code will use the page_table if this is available, otherwise memory + /// accesses will hit the memory callbacks. + bool recompile_on_fastmem_failure = true; + /// Declares how many valid address bits are there in virtual addresses. + /// Determines the size of fastmem arena. Valid values are between 12 and 64 inclusive. + /// This is only used if fastmem_pointer is not nullptr. + size_t fastmem_address_space_bits = 36; + /// Determines what happens if the guest accesses an entry that is off the end of the + /// fastmem arena. If true, Dynarmic will silently mirror fastmem's address space. If + /// false, accessing memory outside of fastmem bounds will result in a call to the + /// relevant memory callback. + /// This is only used if fastmem_pointer is not nullptr. 
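    /*
     * A minimal configuration sketch, assuming 4 KiB guest pages and a 36-bit address
     * space; `my_callbacks` and `my_page_table` are hypothetical embedder-owned objects:
     *
     *   Dynarmic::A64::UserConfig cfg{};
     *   cfg.callbacks = &my_callbacks;               // UserCallbacks implementation
     *   cfg.page_table = my_page_table.data();       // one host pointer per guest page
     *   cfg.page_table_address_space_bits = 36;
     *   cfg.silently_mirror_page_table = false;      // out-of-range accesses hit the callbacks
     *   Dynarmic::A64::Jit jit{cfg};
     */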
+ bool silently_mirror_fastmem = true; + + /// Determines if we should use the above fastmem_pointer for exclusive reads and + /// writes. On x64, dynarmic currently relies on x64 cmpxchg semantics which may not + /// provide fully accurate emulation. + bool fastmem_exclusive_access = false; + /// Determines if exclusive access instructions that pagefault should cause + /// recompilation of that block with fastmem disabled. Recompiled code will use memory + /// callbacks. + bool recompile_on_exclusive_fastmem_failure = true; + + /// This option relates to translation. Generally when we run into an unpredictable + /// instruction the ExceptionRaised callback is called. If this is true, we define + /// definite behaviour for some unpredictable instructions. + bool define_unpredictable_behaviour = false; + + /// HACK: + /// This tells the translator a wall clock will be used, thus allowing it + /// to avoid writting certain unnecessary code only needed for cycle timers. + bool wall_clock_cntpct = false; + + /// This allows accurately emulating protection fault handlers. If true, we check + /// for exit after every data memory access by the emulated program. + bool check_halt_on_memory_access = false; + + /// This option allows you to disable cycle counting. If this is set to false, + /// AddTicks and GetTicksRemaining are never called, and no cycle counting is done. + bool enable_cycle_counting = true; + + // Minimum size is about 8MiB. Maximum size is about 128MiB (arm64 host) or 2GiB (x64 host). + // Maximum size is limited by the maximum length of a x86_64 / arm64 jump. + size_t code_cache_size = 128 * 1024 * 1024; // bytes + + /// Internal use only + bool very_verbose_debugging_output = false; +}; + +} // namespace A64 +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/interface/exclusive_monitor.h b/externals/dynarmic/src/dynarmic/interface/exclusive_monitor.h new file mode 100644 index 0000000000..4813675873 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/interface/exclusive_monitor.h @@ -0,0 +1,88 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <array> +#include <atomic> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <vector> + +#include <dynarmic/common/spin_lock.h> + +namespace Dynarmic { + +using VAddr = std::uint64_t; +using Vector = std::array<std::uint64_t, 2>; + +class ExclusiveMonitor { +public: + /// @param processor_count Maximum number of processors using this global + /// exclusive monitor. Each processor must have a + /// unique id. + explicit ExclusiveMonitor(size_t processor_count); + + size_t GetProcessorCount() const; + + /// Marks a region containing [address, address+size) to be exclusive to + /// processor processor_id. + template<typename T, typename Function> + T ReadAndMark(size_t processor_id, VAddr address, Function op) { + static_assert(std::is_trivially_copyable_v<T>); + const VAddr masked_address = address & RESERVATION_GRANULE_MASK; + + Lock(); + exclusive_addresses[processor_id] = masked_address; + const T value = op(); + std::memcpy(exclusive_values[processor_id].data(), &value, sizeof(T)); + Unlock(); + return value; + } + + /// Checks to see if processor processor_id has exclusive access to the + /// specified region. If it does, executes the operation then clears + /// the exclusive state for processors if their exclusive region(s) + /// contain [address, address+size). 
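    /*
     * Illustrative pairing of ReadAndMark with DoExclusiveOperation for a 32-bit
     * exclusive load/store; `monitor`, `processor_id`, `vaddr`, `new_value`, `read32`
     * and `try_write32` are hypothetical embedder-side names:
     *
     *   const std::uint32_t loaded = monitor.ReadAndMark<std::uint32_t>(
     *       processor_id, vaddr, [&] { return read32(vaddr); });
     *   // ... compute new_value from loaded ...
     *   const bool success = monitor.DoExclusiveOperation<std::uint32_t>(
     *       processor_id, vaddr, [&](std::uint32_t expected) {
     *           // try_write32 is assumed to store new_value only if memory still holds expected
     *           return try_write32(vaddr, expected, new_value);
     *       });
     */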
+ template<typename T, typename Function> + bool DoExclusiveOperation(size_t processor_id, VAddr address, Function op) { + static_assert(std::is_trivially_copyable_v<T>); + if (!CheckAndClear(processor_id, address)) { + return false; + } + + T saved_value; + std::memcpy(&saved_value, exclusive_values[processor_id].data(), sizeof(T)); + const bool result = op(saved_value); + + Unlock(); + return result; + } + + /// Unmark everything. + void Clear(); + /// Unmark processor id + void ClearProcessor(size_t processor_id); + +private: + bool CheckAndClear(size_t processor_id, VAddr address); + + void Lock(); + void Unlock(); + + friend volatile int* GetExclusiveMonitorLockPointer(ExclusiveMonitor*); + friend size_t GetExclusiveMonitorProcessorCount(ExclusiveMonitor*); + friend VAddr* GetExclusiveMonitorAddressPointer(ExclusiveMonitor*, size_t index); + friend Vector* GetExclusiveMonitorValuePointer(ExclusiveMonitor*, size_t index); + + static constexpr VAddr RESERVATION_GRANULE_MASK = 0xFFFF'FFFF'FFFF'FFFFull; + static constexpr VAddr INVALID_EXCLUSIVE_ADDRESS = 0xDEAD'DEAD'DEAD'DEADull; + SpinLock lock; + std::vector<VAddr> exclusive_addresses; + std::vector<Vector> exclusive_values; +}; + +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/interface/halt_reason.h b/externals/dynarmic/src/dynarmic/interface/halt_reason.h new file mode 100644 index 0000000000..121a8455cc --- /dev/null +++ b/externals/dynarmic/src/dynarmic/interface/halt_reason.h @@ -0,0 +1,54 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2020 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <cstdint> + +namespace Dynarmic { + +enum class HaltReason : std::uint32_t { + Step = 0x00000001, + CacheInvalidation = 0x00000002, + MemoryAbort = 0x00000004, + UserDefined1 = 0x01000000, + UserDefined2 = 0x02000000, + UserDefined3 = 0x04000000, + UserDefined4 = 0x08000000, + UserDefined5 = 0x10000000, + UserDefined6 = 0x20000000, + UserDefined7 = 0x40000000, + UserDefined8 = 0x80000000, +}; + +constexpr HaltReason operator~(HaltReason hr) { + return static_cast<HaltReason>(~static_cast<std::uint32_t>(hr)); +} + +constexpr HaltReason operator|(HaltReason hr1, HaltReason hr2) { + return static_cast<HaltReason>(static_cast<std::uint32_t>(hr1) | static_cast<std::uint32_t>(hr2)); +} + +constexpr HaltReason operator&(HaltReason hr1, HaltReason hr2) { + return static_cast<HaltReason>(static_cast<std::uint32_t>(hr1) & static_cast<std::uint32_t>(hr2)); +} + +constexpr HaltReason operator|=(HaltReason& result, HaltReason hr) { + return result = (result | hr); +} + +constexpr HaltReason operator&=(HaltReason& result, HaltReason hr) { + return result = (result & hr); +} + +constexpr bool operator!(HaltReason hr) { + return static_cast<std::uint32_t>(hr) == 0; +} + +constexpr bool Has(HaltReason hr1, HaltReason hr2) { + return (static_cast<std::uint32_t>(hr1) & static_cast<std::uint32_t>(hr2)) != 0; +} + +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/interface/optimization_flags.h b/externals/dynarmic/src/dynarmic/interface/optimization_flags.h new file mode 100644 index 0000000000..2f65f0bfa4 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/interface/optimization_flags.h @@ -0,0 +1,81 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2020 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <cstdint> + +namespace Dynarmic { + +enum class OptimizationFlag : std::uint32_t { + /// This optimization avoids dispatcher lookups by allowing emitted basic blocks to jump + /// directly to other basic blocks if the destination PC is predictable at JIT-time. + /// This is a safe optimization. + BlockLinking = 0x00000001, + /// This optimization avoids dispatcher lookups by emulating a return stack buffer. This + /// allows for function returns and syscall returns to be predicted at runtime. + /// This is a safe optimization. + ReturnStackBuffer = 0x00000002, + /// This optimization enables a two-tiered dispatch system. + /// A fast dispatcher (written in assembly) first does a look-up in a small MRU cache. + /// If this fails, it falls back to the usual slower dispatcher. + /// This is a safe optimization. + FastDispatch = 0x00000004, + /// This is an IR optimization. This optimization eliminates unnecessary emulated CPU state + /// context lookups. + /// This is a safe optimization. + GetSetElimination = 0x00000008, + /// This is an IR optimization. This optimization does constant propagation. + /// This is a safe optimization. + ConstProp = 0x00000010, + /// This is enables miscellaneous safe IR optimizations. + MiscIROpt = 0x00000020, + + /// This is an UNSAFE optimization that reduces accuracy of fused multiply-add operations. + /// This unfuses fused instructions to improve performance on host CPUs without FMA support. + Unsafe_UnfuseFMA = 0x00010000, + /// This is an UNSAFE optimization that reduces accuracy of certain floating-point instructions. + /// This allows results of FRECPE and FRSQRTE to have **less** error than spec allows. + Unsafe_ReducedErrorFP = 0x00020000, + /// This is an UNSAFE optimization that causes floating-point instructions to not produce correct NaNs. + /// This may also result in inaccurate results when instructions are given certain special values. + Unsafe_InaccurateNaN = 0x00040000, + /// This is an UNSAFE optimization that causes ASIMD floating-point instructions to be run with incorrect + /// rounding modes. This may result in inaccurate results with all floating-point ASIMD instructions. + Unsafe_IgnoreStandardFPCRValue = 0x00080000, + /// This is an UNSAFE optimization that causes the global monitor to be ignored. This may + /// result in unexpected behaviour in multithreaded scenarios, including but not limited + /// to data races and deadlocks. 
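    /*
     * Flags combine with the bitwise operators declared below. An illustrative way to
     * opt into a single unsafe optimization on top of the safe set, where `config` is a
     * hypothetical A64::UserConfig instance:
     *
     *   config.optimizations = Dynarmic::all_safe_optimizations
     *                        | Dynarmic::OptimizationFlag::Unsafe_InaccurateNaN;
     *   config.unsafe_optimizations = true;  // the separate unsafe switch must also be set
     */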
+ Unsafe_IgnoreGlobalMonitor = 0x00100000, +}; + +constexpr OptimizationFlag no_optimizations = static_cast<OptimizationFlag>(0); +constexpr OptimizationFlag all_safe_optimizations = static_cast<OptimizationFlag>(0x0000FFFF); + +constexpr OptimizationFlag operator~(OptimizationFlag f) { + return static_cast<OptimizationFlag>(~static_cast<std::uint32_t>(f)); +} + +constexpr OptimizationFlag operator|(OptimizationFlag f1, OptimizationFlag f2) { + return static_cast<OptimizationFlag>(static_cast<std::uint32_t>(f1) | static_cast<std::uint32_t>(f2)); +} + +constexpr OptimizationFlag operator&(OptimizationFlag f1, OptimizationFlag f2) { + return static_cast<OptimizationFlag>(static_cast<std::uint32_t>(f1) & static_cast<std::uint32_t>(f2)); +} + +constexpr OptimizationFlag operator|=(OptimizationFlag& result, OptimizationFlag f) { + return result = (result | f); +} + +constexpr OptimizationFlag operator&=(OptimizationFlag& result, OptimizationFlag f) { + return result = (result & f); +} + +constexpr bool operator!(OptimizationFlag f) { + return f == no_optimizations; +} + +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/ir/acc_type.h b/externals/dynarmic/src/dynarmic/ir/acc_type.h new file mode 100644 index 0000000000..f259660b68 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/acc_type.h @@ -0,0 +1,29 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +namespace Dynarmic::IR { + +enum class AccType { + NORMAL, + VEC, + STREAM, + VECSTREAM, + ATOMIC, + ORDERED, + ORDEREDRW, + LIMITEDORDERED, + UNPRIV, + IFETCH, + PTW, + DC, + IC, + DCZVA, + AT, + SWAP, // TODO: Remove +}; + +} // namespace Dynarmic::IR diff --git a/externals/dynarmic/src/dynarmic/ir/basic_block.cpp b/externals/dynarmic/src/dynarmic/ir/basic_block.cpp new file mode 100644 index 0000000000..cc7eb48ef6 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/basic_block.cpp @@ -0,0 +1,244 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/ir/basic_block.h" + +#include <algorithm> +#include <initializer_list> +#include <map> +#include <string> + +#include <fmt/format.h> +#include <mcl/assert.hpp> + +#include "dynarmic/common/memory_pool.h" +#include "dynarmic/frontend/A32/a32_types.h" +#include "dynarmic/frontend/A64/a64_types.h" +#include "dynarmic/ir/cond.h" +#include "dynarmic/ir/opcodes.h" + +namespace Dynarmic::IR { + +Block::Block(const LocationDescriptor& location) + : location{location}, end_location{location}, cond{Cond::AL}, instruction_alloc_pool{std::make_unique<Common::Pool>(sizeof(Inst), 4096)} {} + +Block::~Block() = default; + +Block::Block(Block&&) = default; + +Block& Block::operator=(Block&&) = default; + +void Block::AppendNewInst(Opcode opcode, std::initializer_list<IR::Value> args) { + PrependNewInst(end(), opcode, args); +} + +Block::iterator Block::PrependNewInst(iterator insertion_point, Opcode opcode, std::initializer_list<Value> args) { + IR::Inst* inst = new (instruction_alloc_pool->Alloc()) IR::Inst(opcode); + ASSERT(args.size() == inst->NumArgs()); + + std::for_each(args.begin(), args.end(), [&inst, index = size_t(0)](const auto& arg) mutable { + inst->SetArg(index, arg); + index++; + }); + + return instructions.insert_before(insertion_point, inst); +} + +LocationDescriptor Block::Location() const { + return location; +} + +LocationDescriptor Block::EndLocation() const { + return end_location; +} + +void Block::SetEndLocation(const LocationDescriptor& descriptor) { + end_location = descriptor; +} + +Cond Block::GetCondition() const { + return cond; +} + +void Block::SetCondition(Cond condition) { + cond = condition; +} + +LocationDescriptor Block::ConditionFailedLocation() const { + return *cond_failed; +} + +void Block::SetConditionFailedLocation(LocationDescriptor fail_location) { + cond_failed = fail_location; +} + +size_t& Block::ConditionFailedCycleCount() { + return cond_failed_cycle_count; +} + +const size_t& Block::ConditionFailedCycleCount() const { + return cond_failed_cycle_count; +} + +bool Block::HasConditionFailedLocation() const { + return cond_failed.has_value(); +} + +Block::InstructionList& Block::Instructions() { + return instructions; +} + +const Block::InstructionList& Block::Instructions() const { + return instructions; +} + +Terminal Block::GetTerminal() const { + return terminal; +} + +void Block::SetTerminal(Terminal term) { + ASSERT_MSG(!HasTerminal(), "Terminal has already been set."); + terminal = std::move(term); +} + +void Block::ReplaceTerminal(Terminal term) { + ASSERT_MSG(HasTerminal(), "Terminal has not been set."); + terminal = std::move(term); +} + +bool Block::HasTerminal() const { + return terminal.which() != 0; +} + +size_t& Block::CycleCount() { + return cycle_count; +} + +const size_t& Block::CycleCount() const { + return cycle_count; +} + +static std::string TerminalToString(const Terminal& terminal_variant) { + struct : boost::static_visitor<std::string> { + std::string operator()(const Term::Invalid&) const { + return "<invalid terminal>"; + } + std::string operator()(const Term::Interpret& terminal) const { + return fmt::format("Interpret{{{}}}", terminal.next); + } + std::string operator()(const Term::ReturnToDispatch&) const { + return "ReturnToDispatch{}"; + } + std::string operator()(const Term::LinkBlock& terminal) const { + return fmt::format("LinkBlock{{{}}}", terminal.next); + } + std::string operator()(const Term::LinkBlockFast& terminal) const { + 
return fmt::format("LinkBlockFast{{{}}}", terminal.next); + } + std::string operator()(const Term::PopRSBHint&) const { + return "PopRSBHint{}"; + } + std::string operator()(const Term::FastDispatchHint&) const { + return "FastDispatchHint{}"; + } + std::string operator()(const Term::If& terminal) const { + return fmt::format("If{{{}, {}, {}}}", A64::CondToString(terminal.if_), TerminalToString(terminal.then_), TerminalToString(terminal.else_)); + } + std::string operator()(const Term::CheckBit& terminal) const { + return fmt::format("CheckBit{{{}, {}}}", TerminalToString(terminal.then_), TerminalToString(terminal.else_)); + } + std::string operator()(const Term::CheckHalt& terminal) const { + return fmt::format("CheckHalt{{{}}}", TerminalToString(terminal.else_)); + } + } visitor; + + return boost::apply_visitor(visitor, terminal_variant); +} + +std::string DumpBlock(const IR::Block& block) { + std::string ret; + + ret += fmt::format("Block: location={}\n", block.Location()); + ret += fmt::format("cycles={}", block.CycleCount()); + ret += fmt::format(", entry_cond={}", A64::CondToString(block.GetCondition())); + if (block.GetCondition() != Cond::AL) { + ret += fmt::format(", cond_fail={}", block.ConditionFailedLocation()); + } + ret += '\n'; + + const auto arg_to_string = [](const IR::Value& arg) -> std::string { + if (arg.IsEmpty()) { + return "<null>"; + } else if (!arg.IsImmediate()) { + if (const unsigned name = arg.GetInst()->GetName()) { + return fmt::format("%{}", name); + } + return fmt::format("%<unnamed inst {:016x}>", reinterpret_cast<u64>(arg.GetInst())); + } + switch (arg.GetType()) { + case Type::U1: + return fmt::format("#{}", arg.GetU1() ? '1' : '0'); + case Type::U8: + return fmt::format("#{}", arg.GetU8()); + case Type::U16: + return fmt::format("#{:#x}", arg.GetU16()); + case Type::U32: + return fmt::format("#{:#x}", arg.GetU32()); + case Type::U64: + return fmt::format("#{:#x}", arg.GetU64()); + case Type::A32Reg: + return A32::RegToString(arg.GetA32RegRef()); + case Type::A32ExtReg: + return A32::ExtRegToString(arg.GetA32ExtRegRef()); + case Type::A64Reg: + return A64::RegToString(arg.GetA64RegRef()); + case Type::A64Vec: + return A64::VecToString(arg.GetA64VecRef()); + default: + return "<unknown immediate type>"; + } + }; + + for (const auto& inst : block) { + const Opcode op = inst.GetOpcode(); + + ret += fmt::format("[{:016x}] ", reinterpret_cast<u64>(&inst)); + if (GetTypeOf(op) != Type::Void) { + if (inst.GetName()) { + ret += fmt::format("%{:<5} = ", inst.GetName()); + } else { + ret += "noname = "; + } + } else { + ret += " "; // '%00000 = ' -> 1 + 5 + 3 = 9 spaces + } + + ret += GetNameOf(op); + + const size_t arg_count = GetNumArgsOf(op); + for (size_t arg_index = 0; arg_index < arg_count; arg_index++) { + const Value arg = inst.GetArg(arg_index); + + ret += arg_index != 0 ? 
", " : " "; + ret += arg_to_string(arg); + + Type actual_type = arg.GetType(); + Type expected_type = GetArgTypeOf(op, arg_index); + if (!AreTypesCompatible(actual_type, expected_type)) { + ret += fmt::format("<type error: {} != {}>", GetNameOf(actual_type), GetNameOf(expected_type)); + } + } + + ret += fmt::format(" (uses: {})", inst.UseCount()); + + ret += '\n'; + } + + ret += "terminal = " + TerminalToString(block.GetTerminal()) + '\n'; + + return ret; +} + +} // namespace Dynarmic::IR diff --git a/externals/dynarmic/src/dynarmic/ir/basic_block.h b/externals/dynarmic/src/dynarmic/ir/basic_block.h new file mode 100644 index 0000000000..18d7fadae5 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/basic_block.h @@ -0,0 +1,168 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <initializer_list> +#include <memory> +#include <optional> +#include <string> + +#include <mcl/container/intrusive_list.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/ir/location_descriptor.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/terminal.h" +#include "dynarmic/ir/value.h" + +namespace Dynarmic::Common { +class Pool; +} + +namespace Dynarmic::IR { + +enum class Cond; +enum class Opcode; + +/** + * A basic block. It consists of zero or more instructions followed by exactly one terminal. + * Note that this is a linear IR and not a pure tree-based IR: i.e.: there is an ordering to + * the microinstructions. This only matters before chaining is done in order to correctly + * order memory accesses. + */ +class Block final { +public: + using InstructionList = mcl::intrusive_list<Inst>; + using size_type = InstructionList::size_type; + using iterator = InstructionList::iterator; + using const_iterator = InstructionList::const_iterator; + using reverse_iterator = InstructionList::reverse_iterator; + using const_reverse_iterator = InstructionList::const_reverse_iterator; + + explicit Block(const LocationDescriptor& location); + ~Block(); + + Block(const Block&) = delete; + Block& operator=(const Block&) = delete; + + Block(Block&&); + Block& operator=(Block&&); + + bool empty() const { return instructions.empty(); } + size_type size() const { return instructions.size(); } + + Inst& front() { return instructions.front(); } + const Inst& front() const { return instructions.front(); } + + Inst& back() { return instructions.back(); } + const Inst& back() const { return instructions.back(); } + + iterator begin() { return instructions.begin(); } + const_iterator begin() const { return instructions.begin(); } + iterator end() { return instructions.end(); } + const_iterator end() const { return instructions.end(); } + + reverse_iterator rbegin() { return instructions.rbegin(); } + const_reverse_iterator rbegin() const { return instructions.rbegin(); } + reverse_iterator rend() { return instructions.rend(); } + const_reverse_iterator rend() const { return instructions.rend(); } + + const_iterator cbegin() const { return instructions.cbegin(); } + const_iterator cend() const { return instructions.cend(); } + + const_reverse_iterator crbegin() const { return instructions.crbegin(); } + const_reverse_iterator crend() const { return instructions.crend(); } + + /** + * Appends a new instruction to the end of this basic block, + * handling any allocations necessary to do so. + * + * @param op Opcode representing the instruction to add. 
+ * @param args A sequence of Value instances used as arguments for the instruction. + */ + void AppendNewInst(Opcode op, std::initializer_list<Value> args); + + /** + * Prepends a new instruction to this basic block before the insertion point, + * handling any allocations necessary to do so. + * + * @param insertion_point Where to insert the new instruction. + * @param op Opcode representing the instruction to add. + * @param args A sequence of Value instances used as arguments for the instruction. + * @returns Iterator to the newly created instruction. + */ + iterator PrependNewInst(iterator insertion_point, Opcode op, std::initializer_list<Value> args); + + /// Gets the starting location for this basic block. + LocationDescriptor Location() const; + /// Gets the end location for this basic block. + LocationDescriptor EndLocation() const; + /// Sets the end location for this basic block. + void SetEndLocation(const LocationDescriptor& descriptor); + + /// Gets the condition required to pass in order to execute this block. + Cond GetCondition() const; + /// Sets the condition required to pass in order to execute this block. + void SetCondition(Cond condition); + + /// Gets the location of the block to execute if the predicated condition fails. + LocationDescriptor ConditionFailedLocation() const; + /// Sets the location of the block to execute if the predicated condition fails. + void SetConditionFailedLocation(LocationDescriptor fail_location); + /// Determines whether or not a predicated condition failure block is present. + bool HasConditionFailedLocation() const; + + /// Gets a mutable reference to the condition failed cycle count. + size_t& ConditionFailedCycleCount(); + /// Gets an immutable reference to the condition failed cycle count. + const size_t& ConditionFailedCycleCount() const; + + /// Gets a mutable reference to the instruction list for this basic block. + InstructionList& Instructions(); + /// Gets an immutable reference to the instruction list for this basic block. + const InstructionList& Instructions() const; + + /// Gets the terminal instruction for this basic block. + Terminal GetTerminal() const; + /// Sets the terminal instruction for this basic block. + void SetTerminal(Terminal term); + /// Replaces the terminal instruction for this basic block. + void ReplaceTerminal(Terminal term); + /// Determines whether or not this basic block has a terminal instruction. + bool HasTerminal() const; + + /// Gets a mutable reference to the cycle count for this basic block. + size_t& CycleCount(); + /// Gets an immutable reference to the cycle count for this basic block. + const size_t& CycleCount() const; + +private: + /// Description of the starting location of this block + LocationDescriptor location; + /// Description of the end location of this block + LocationDescriptor end_location; + /// Conditional to pass in order to execute this block + Cond cond; + /// Block to execute next if `cond` did not pass. + std::optional<LocationDescriptor> cond_failed = {}; + /// Number of cycles this block takes to execute if the conditional fails. + size_t cond_failed_cycle_count = 0; + + /// List of instructions in this block. + InstructionList instructions; + /// Memory pool for instruction list + std::unique_ptr<Common::Pool> instruction_alloc_pool; + /// Terminal instruction of this block. + Terminal terminal = Term::Invalid{}; + + /// Number of cycles this block takes to execute. + size_t cycle_count = 0; +}; + +/// Returns a string representation of the contents of block. 
Intended for debugging. +std::string DumpBlock(const IR::Block& block); + +} // namespace Dynarmic::IR diff --git a/externals/dynarmic/src/dynarmic/ir/cond.h b/externals/dynarmic/src/dynarmic/ir/cond.h new file mode 100644 index 0000000000..5bdd05eddc --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/cond.h @@ -0,0 +1,31 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +namespace Dynarmic::IR { + +enum class Cond { + EQ, + NE, + CS, + CC, + MI, + PL, + VS, + VC, + HI, + LS, + GE, + LT, + GT, + LE, + AL, + NV, + HS = CS, + LO = CC, +}; + +} // namespace Dynarmic::IR diff --git a/externals/dynarmic/src/dynarmic/ir/ir_emitter.cpp b/externals/dynarmic/src/dynarmic/ir/ir_emitter.cpp new file mode 100644 index 0000000000..fc4f69b3e0 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/ir_emitter.cpp @@ -0,0 +1,2890 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/ir/ir_emitter.h" + +#include <vector> + +#include <mcl/assert.hpp> +#include <mcl/bit_cast.hpp> + +#include "dynarmic/ir/opcodes.h" + +namespace Dynarmic::IR { + +U1 IREmitter::Imm1(bool imm1) const { + return U1(Value(imm1)); +} + +U8 IREmitter::Imm8(u8 imm8) const { + return U8(Value(imm8)); +} + +U16 IREmitter::Imm16(u16 imm16) const { + return U16(Value(imm16)); +} + +U32 IREmitter::Imm32(u32 imm32) const { + return U32(Value(imm32)); +} + +U64 IREmitter::Imm64(u64 imm64) const { + return U64(Value(imm64)); +} + +void IREmitter::PushRSB(const LocationDescriptor& return_location) { + Inst(Opcode::PushRSB, IR::Value(return_location.Value())); +} + +U64 IREmitter::Pack2x32To1x64(const U32& lo, const U32& hi) { + return Inst<U64>(Opcode::Pack2x32To1x64, lo, hi); +} + +U128 IREmitter::Pack2x64To1x128(const U64& lo, const U64& hi) { + return Inst<U128>(Opcode::Pack2x64To1x128, lo, hi); +} + +UAny IREmitter::LeastSignificant(size_t bitsize, const U32U64& value) { + switch (bitsize) { + case 8: + return LeastSignificantByte(value); + case 16: + return LeastSignificantHalf(value); + case 32: + if (value.GetType() == Type::U32) { + return value; + } + return LeastSignificantWord(value); + case 64: + ASSERT(value.GetType() == Type::U64); + return value; + } + ASSERT_FALSE("Invalid bitsize"); +} + +U32 IREmitter::LeastSignificantWord(const U64& value) { + return Inst<U32>(Opcode::LeastSignificantWord, value); +} + +U16 IREmitter::LeastSignificantHalf(U32U64 value) { + if (value.GetType() == Type::U64) { + value = LeastSignificantWord(value); + } + return Inst<U16>(Opcode::LeastSignificantHalf, value); +} + +U8 IREmitter::LeastSignificantByte(U32U64 value) { + if (value.GetType() == Type::U64) { + value = LeastSignificantWord(value); + } + return Inst<U8>(Opcode::LeastSignificantByte, value); +} + +ResultAndCarry<U32> IREmitter::MostSignificantWord(const U64& value) { + const auto result = Inst<U32>(Opcode::MostSignificantWord, value); + const auto carry_out = Inst<U1>(Opcode::GetCarryFromOp, result); + return {result, carry_out}; +} + +U1 IREmitter::MostSignificantBit(const U32& value) { + return Inst<U1>(Opcode::MostSignificantBit, value); +} + +U1 IREmitter::IsZero(const U32& value) { + return Inst<U1>(Opcode::IsZero32, value); +} + +U1 IREmitter::IsZero(const U64& value) { + return Inst<U1>(Opcode::IsZero64, value); +} + +U1 IREmitter::IsZero(const U32U64& value) { + if (value.GetType() == Type::U32) { + return Inst<U1>(Opcode::IsZero32, value); + } 
else { + return Inst<U1>(Opcode::IsZero64, value); + } +} + +U1 IREmitter::TestBit(const U32U64& value, const U8& bit) { + if (value.GetType() == Type::U32) { + return Inst<U1>(Opcode::TestBit, IndeterminateExtendToLong(value), bit); + } else { + return Inst<U1>(Opcode::TestBit, value, bit); + } +} + +U32 IREmitter::ConditionalSelect(Cond cond, const U32& a, const U32& b) { + return Inst<U32>(Opcode::ConditionalSelect32, Value{cond}, a, b); +} + +U64 IREmitter::ConditionalSelect(Cond cond, const U64& a, const U64& b) { + return Inst<U64>(Opcode::ConditionalSelect64, Value{cond}, a, b); +} + +NZCV IREmitter::ConditionalSelect(Cond cond, const NZCV& a, const NZCV& b) { + return Inst<NZCV>(Opcode::ConditionalSelectNZCV, Value{cond}, a, b); +} + +U32U64 IREmitter::ConditionalSelect(Cond cond, const U32U64& a, const U32U64& b) { + ASSERT(a.GetType() == b.GetType()); + if (a.GetType() == Type::U32) { + return Inst<U32>(Opcode::ConditionalSelect32, Value{cond}, a, b); + } else { + return Inst<U64>(Opcode::ConditionalSelect64, Value{cond}, a, b); + } +} + +U1 IREmitter::GetCFlagFromNZCV(const NZCV& nzcv) { + return Inst<U1>(Opcode::GetCFlagFromNZCV, nzcv); +} + +NZCV IREmitter::NZCVFromPackedFlags(const U32& a) { + return Inst<NZCV>(Opcode::NZCVFromPackedFlags, a); +} + +NZCV IREmitter::NZCVFrom(const Value& value) { + return Inst<NZCV>(Opcode::GetNZCVFromOp, value); +} + +ResultAndCarry<U32> IREmitter::LogicalShiftLeft(const U32& value_in, const U8& shift_amount, const U1& carry_in) { + const auto result = Inst<U32>(Opcode::LogicalShiftLeft32, value_in, shift_amount, carry_in); + const auto carry_out = Inst<U1>(Opcode::GetCarryFromOp, result); + return {result, carry_out}; +} + +ResultAndCarry<U32> IREmitter::LogicalShiftRight(const U32& value_in, const U8& shift_amount, const U1& carry_in) { + const auto result = Inst<U32>(Opcode::LogicalShiftRight32, value_in, shift_amount, carry_in); + const auto carry_out = Inst<U1>(Opcode::GetCarryFromOp, result); + return {result, carry_out}; +} + +ResultAndCarry<U32> IREmitter::ArithmeticShiftRight(const U32& value_in, const U8& shift_amount, const U1& carry_in) { + const auto result = Inst<U32>(Opcode::ArithmeticShiftRight32, value_in, shift_amount, carry_in); + const auto carry_out = Inst<U1>(Opcode::GetCarryFromOp, result); + return {result, carry_out}; +} + +ResultAndCarry<U32> IREmitter::RotateRight(const U32& value_in, const U8& shift_amount, const U1& carry_in) { + const auto result = Inst<U32>(Opcode::RotateRight32, value_in, shift_amount, carry_in); + const auto carry_out = Inst<U1>(Opcode::GetCarryFromOp, result); + return {result, carry_out}; +} + +ResultAndCarry<U32> IREmitter::RotateRightExtended(const U32& value_in, const U1& carry_in) { + const auto result = Inst<U32>(Opcode::RotateRightExtended, value_in, carry_in); + const auto carry_out = Inst<U1>(Opcode::GetCarryFromOp, result); + return {result, carry_out}; +} + +U32U64 IREmitter::LogicalShiftLeft(const U32U64& value_in, const U8& shift_amount) { + if (value_in.GetType() == Type::U32) { + return Inst<U32>(Opcode::LogicalShiftLeft32, value_in, shift_amount, Imm1(0)); + } else { + return Inst<U64>(Opcode::LogicalShiftLeft64, value_in, shift_amount); + } +} + +U32U64 IREmitter::LogicalShiftRight(const U32U64& value_in, const U8& shift_amount) { + if (value_in.GetType() == Type::U32) { + return Inst<U32>(Opcode::LogicalShiftRight32, value_in, shift_amount, Imm1(0)); + } else { + return Inst<U64>(Opcode::LogicalShiftRight64, value_in, shift_amount); + } +} + +U32U64 
IREmitter::ArithmeticShiftRight(const U32U64& value_in, const U8& shift_amount) { + if (value_in.GetType() == Type::U32) { + return Inst<U32>(Opcode::ArithmeticShiftRight32, value_in, shift_amount, Imm1(0)); + } else { + return Inst<U64>(Opcode::ArithmeticShiftRight64, value_in, shift_amount); + } +} + +U32U64 IREmitter::RotateRight(const U32U64& value_in, const U8& shift_amount) { + if (value_in.GetType() == Type::U32) { + return Inst<U32>(Opcode::RotateRight32, value_in, shift_amount, Imm1(0)); + } else { + return Inst<U64>(Opcode::RotateRight64, value_in, shift_amount); + } +} + +U32U64 IREmitter::LogicalShiftLeftMasked(const U32U64& value_in, const U32U64& shift_amount) { + ASSERT(value_in.GetType() == shift_amount.GetType()); + if (value_in.GetType() == Type::U32) { + return Inst<U32>(Opcode::LogicalShiftLeftMasked32, value_in, shift_amount); + } else { + return Inst<U64>(Opcode::LogicalShiftLeftMasked64, value_in, shift_amount); + } +} + +U32U64 IREmitter::LogicalShiftRightMasked(const U32U64& value_in, const U32U64& shift_amount) { + ASSERT(value_in.GetType() == shift_amount.GetType()); + if (value_in.GetType() == Type::U32) { + return Inst<U32>(Opcode::LogicalShiftRightMasked32, value_in, shift_amount); + } else { + return Inst<U64>(Opcode::LogicalShiftRightMasked64, value_in, shift_amount); + } +} + +U32U64 IREmitter::ArithmeticShiftRightMasked(const U32U64& value_in, const U32U64& shift_amount) { + ASSERT(value_in.GetType() == shift_amount.GetType()); + if (value_in.GetType() == Type::U32) { + return Inst<U32>(Opcode::ArithmeticShiftRightMasked32, value_in, shift_amount); + } else { + return Inst<U64>(Opcode::ArithmeticShiftRightMasked64, value_in, shift_amount); + } +} + +U32U64 IREmitter::RotateRightMasked(const U32U64& value_in, const U32U64& shift_amount) { + ASSERT(value_in.GetType() == shift_amount.GetType()); + if (value_in.GetType() == Type::U32) { + return Inst<U32>(Opcode::RotateRightMasked32, value_in, shift_amount); + } else { + return Inst<U64>(Opcode::RotateRightMasked64, value_in, shift_amount); + } +} + +U32U64 IREmitter::AddWithCarry(const U32U64& a, const U32U64& b, const U1& carry_in) { + ASSERT(a.GetType() == b.GetType()); + if (a.GetType() == Type::U32) { + return Inst<U32>(Opcode::Add32, a, b, carry_in); + } else { + return Inst<U64>(Opcode::Add64, a, b, carry_in); + } +} + +U32U64 IREmitter::Add(const U32U64& a, const U32U64& b) { + ASSERT(a.GetType() == b.GetType()); + if (a.GetType() == Type::U32) { + return Inst<U32>(Opcode::Add32, a, b, Imm1(0)); + } else { + return Inst<U64>(Opcode::Add64, a, b, Imm1(0)); + } +} + +U32U64 IREmitter::SubWithCarry(const U32U64& a, const U32U64& b, const U1& carry_in) { + ASSERT(a.GetType() == b.GetType()); + if (a.GetType() == Type::U32) { + return Inst<U32>(Opcode::Sub32, a, b, carry_in); + } else { + return Inst<U64>(Opcode::Sub64, a, b, carry_in); + } +} + +U32U64 IREmitter::Sub(const U32U64& a, const U32U64& b) { + ASSERT(a.GetType() == b.GetType()); + if (a.GetType() == Type::U32) { + return Inst<U32>(Opcode::Sub32, a, b, Imm1(1)); + } else { + return Inst<U64>(Opcode::Sub64, a, b, Imm1(1)); + } +} + +U32U64 IREmitter::Mul(const U32U64& a, const U32U64& b) { + if (a.GetType() == Type::U32) { + return Inst<U32>(Opcode::Mul32, a, b); + } + + return Inst<U64>(Opcode::Mul64, a, b); +} + +U64 IREmitter::UnsignedMultiplyHigh(const U64& a, const U64& b) { + return Inst<U64>(Opcode::UnsignedMultiplyHigh64, a, b); +} + +U64 IREmitter::SignedMultiplyHigh(const U64& a, const U64& b) { + return 
Inst<U64>(Opcode::SignedMultiplyHigh64, a, b); +} + +U32U64 IREmitter::UnsignedDiv(const U32U64& a, const U32U64& b) { + if (a.GetType() == Type::U32) { + return Inst<U32>(Opcode::UnsignedDiv32, a, b); + } + + return Inst<U64>(Opcode::UnsignedDiv64, a, b); +} + +U32U64 IREmitter::SignedDiv(const U32U64& a, const U32U64& b) { + if (a.GetType() == Type::U32) { + return Inst<U32>(Opcode::SignedDiv32, a, b); + } + + return Inst<U64>(Opcode::SignedDiv64, a, b); +} + +U32U64 IREmitter::And(const U32U64& a, const U32U64& b) { + ASSERT(a.GetType() == b.GetType()); + if (a.GetType() == Type::U32) { + return Inst<U32>(Opcode::And32, a, b); + } else { + return Inst<U64>(Opcode::And64, a, b); + } +} + +U32U64 IREmitter::AndNot(const U32U64& a, const U32U64& b) { + ASSERT(a.GetType() == b.GetType()); + if (a.GetType() == Type::U32) { + return Inst<U32>(Opcode::AndNot32, a, b); + } else { + return Inst<U64>(Opcode::AndNot64, a, b); + } +} + +U32U64 IREmitter::Eor(const U32U64& a, const U32U64& b) { + ASSERT(a.GetType() == b.GetType()); + if (a.GetType() == Type::U32) { + return Inst<U32>(Opcode::Eor32, a, b); + } else { + return Inst<U64>(Opcode::Eor64, a, b); + } +} + +U32U64 IREmitter::Or(const U32U64& a, const U32U64& b) { + ASSERT(a.GetType() == b.GetType()); + if (a.GetType() == Type::U32) { + return Inst<U32>(Opcode::Or32, a, b); + } else { + return Inst<U64>(Opcode::Or64, a, b); + } +} + +U32U64 IREmitter::Not(const U32U64& a) { + if (a.GetType() == Type::U32) { + return Inst<U32>(Opcode::Not32, a); + } else { + return Inst<U64>(Opcode::Not64, a); + } +} + +U64 IREmitter::SignExtendToLong(const UAny& a) { + switch (a.GetType()) { + case Type::U8: + return Inst<U64>(Opcode::SignExtendByteToLong, a); + case Type::U16: + return Inst<U64>(Opcode::SignExtendHalfToLong, a); + case Type::U32: + return Inst<U64>(Opcode::SignExtendWordToLong, a); + case Type::U64: + return U64(a); + default: + UNREACHABLE(); + } +} + +U32 IREmitter::SignExtendToWord(const UAny& a) { + switch (a.GetType()) { + case Type::U8: + return Inst<U32>(Opcode::SignExtendByteToWord, a); + case Type::U16: + return Inst<U32>(Opcode::SignExtendHalfToWord, a); + case Type::U32: + return U32(a); + case Type::U64: + return Inst<U32>(Opcode::LeastSignificantWord, a); + default: + UNREACHABLE(); + } +} + +U64 IREmitter::SignExtendWordToLong(const U32& a) { + return Inst<U64>(Opcode::SignExtendWordToLong, a); +} + +U32 IREmitter::SignExtendHalfToWord(const U16& a) { + return Inst<U32>(Opcode::SignExtendHalfToWord, a); +} + +U32 IREmitter::SignExtendByteToWord(const U8& a) { + return Inst<U32>(Opcode::SignExtendByteToWord, a); +} + +U64 IREmitter::ZeroExtendToLong(const UAny& a) { + switch (a.GetType()) { + case Type::U8: + return Inst<U64>(Opcode::ZeroExtendByteToLong, a); + case Type::U16: + return Inst<U64>(Opcode::ZeroExtendHalfToLong, a); + case Type::U32: + return Inst<U64>(Opcode::ZeroExtendWordToLong, a); + case Type::U64: + return U64(a); + default: + UNREACHABLE(); + } +} + +U32 IREmitter::ZeroExtendToWord(const UAny& a) { + switch (a.GetType()) { + case Type::U8: + return Inst<U32>(Opcode::ZeroExtendByteToWord, a); + case Type::U16: + return Inst<U32>(Opcode::ZeroExtendHalfToWord, a); + case Type::U32: + return U32(a); + case Type::U64: + return Inst<U32>(Opcode::LeastSignificantWord, a); + default: + UNREACHABLE(); + } +} + +U128 IREmitter::ZeroExtendToQuad(const UAny& a) { + return Inst<U128>(Opcode::ZeroExtendLongToQuad, ZeroExtendToLong(a)); +} + +U64 IREmitter::ZeroExtendWordToLong(const U32& a) { + return 
Inst<U64>(Opcode::ZeroExtendWordToLong, a); +} + +U32 IREmitter::ZeroExtendHalfToWord(const U16& a) { + return Inst<U32>(Opcode::ZeroExtendHalfToWord, a); +} + +U32 IREmitter::ZeroExtendByteToWord(const U8& a) { + return Inst<U32>(Opcode::ZeroExtendByteToWord, a); +} + +U32 IREmitter::IndeterminateExtendToWord(const UAny& a) { + // TODO: Implement properly + return ZeroExtendToWord(a); +} + +U64 IREmitter::IndeterminateExtendToLong(const UAny& a) { + // TODO: Implement properly + return ZeroExtendToLong(a); +} + +U32 IREmitter::ByteReverseWord(const U32& a) { + return Inst<U32>(Opcode::ByteReverseWord, a); +} + +U16 IREmitter::ByteReverseHalf(const U16& a) { + return Inst<U16>(Opcode::ByteReverseHalf, a); +} + +U64 IREmitter::ByteReverseDual(const U64& a) { + return Inst<U64>(Opcode::ByteReverseDual, a); +} + +U32U64 IREmitter::CountLeadingZeros(const U32U64& a) { + if (a.GetType() == IR::Type::U32) { + return Inst<U32>(Opcode::CountLeadingZeros32, a); + } + + return Inst<U64>(Opcode::CountLeadingZeros64, a); +} + +U32U64 IREmitter::ExtractRegister(const U32U64& a, const U32U64& b, const U8& lsb) { + if (a.GetType() == IR::Type::U32) { + return Inst<U32>(Opcode::ExtractRegister32, a, b, lsb); + } + + return Inst<U64>(Opcode::ExtractRegister64, a, b, lsb); +} + +U32U64 IREmitter::ReplicateBit(const U32U64& a, u8 bit) { + if (a.GetType() == IR::Type::U32) { + ASSERT(bit < 32); + return Inst<U32>(Opcode::ReplicateBit32, a, Imm8(bit)); + } + + ASSERT(bit < 64); + return Inst<U64>(Opcode::ReplicateBit64, a, Imm8(bit)); +} + +U32U64 IREmitter::MaxSigned(const U32U64& a, const U32U64& b) { + if (a.GetType() == IR::Type::U32) { + return Inst<U32>(Opcode::MaxSigned32, a, b); + } + + return Inst<U64>(Opcode::MaxSigned64, a, b); +} + +U32U64 IREmitter::MaxUnsigned(const U32U64& a, const U32U64& b) { + if (a.GetType() == IR::Type::U32) { + return Inst<U32>(Opcode::MaxUnsigned32, a, b); + } + + return Inst<U64>(Opcode::MaxUnsigned64, a, b); +} + +U32U64 IREmitter::MinSigned(const U32U64& a, const U32U64& b) { + if (a.GetType() == IR::Type::U32) { + return Inst<U32>(Opcode::MinSigned32, a, b); + } + + return Inst<U64>(Opcode::MinSigned64, a, b); +} + +U32U64 IREmitter::MinUnsigned(const U32U64& a, const U32U64& b) { + if (a.GetType() == IR::Type::U32) { + return Inst<U32>(Opcode::MinUnsigned32, a, b); + } + + return Inst<U64>(Opcode::MinUnsigned64, a, b); +} + +ResultAndOverflow<U32> IREmitter::SignedSaturatedAddWithFlag(const U32& a, const U32& b) { + const auto result = Inst<U32>(Opcode::SignedSaturatedAddWithFlag32, a, b); + const auto overflow = Inst<U1>(Opcode::GetOverflowFromOp, result); + return {result, overflow}; +} + +ResultAndOverflow<U32> IREmitter::SignedSaturatedSubWithFlag(const U32& a, const U32& b) { + const auto result = Inst<U32>(Opcode::SignedSaturatedSubWithFlag32, a, b); + const auto overflow = Inst<U1>(Opcode::GetOverflowFromOp, result); + return {result, overflow}; +} + +ResultAndOverflow<U32> IREmitter::SignedSaturation(const U32& a, size_t bit_size_to_saturate_to) { + ASSERT(bit_size_to_saturate_to >= 1 && bit_size_to_saturate_to <= 32); + const auto result = Inst<U32>(Opcode::SignedSaturation, a, Imm8(static_cast<u8>(bit_size_to_saturate_to))); + const auto overflow = Inst<U1>(Opcode::GetOverflowFromOp, result); + return {result, overflow}; +} + +ResultAndOverflow<U32> IREmitter::UnsignedSaturation(const U32& a, size_t bit_size_to_saturate_to) { + ASSERT(bit_size_to_saturate_to <= 31); + const auto result = Inst<U32>(Opcode::UnsignedSaturation, a, 
Imm8(static_cast<u8>(bit_size_to_saturate_to))); + const auto overflow = Inst<U1>(Opcode::GetOverflowFromOp, result); + return {result, overflow}; +} + +UAny IREmitter::SignedSaturatedAdd(const UAny& a, const UAny& b) { + ASSERT(a.GetType() == b.GetType()); + const auto result = [&]() -> IR::UAny { + switch (a.GetType()) { + case IR::Type::U8: + return Inst<U8>(Opcode::SignedSaturatedAdd8, a, b); + case IR::Type::U16: + return Inst<U16>(Opcode::SignedSaturatedAdd16, a, b); + case IR::Type::U32: + return Inst<U32>(Opcode::SignedSaturatedAdd32, a, b); + case IR::Type::U64: + return Inst<U64>(Opcode::SignedSaturatedAdd64, a, b); + default: + return IR::UAny{}; + } + }(); + return result; +} + +UAny IREmitter::SignedSaturatedDoublingMultiplyReturnHigh(const UAny& a, const UAny& b) { + ASSERT(a.GetType() == b.GetType()); + const auto result = [&]() -> IR::UAny { + switch (a.GetType()) { + case IR::Type::U16: + return Inst<U16>(Opcode::SignedSaturatedDoublingMultiplyReturnHigh16, a, b); + case IR::Type::U32: + return Inst<U32>(Opcode::SignedSaturatedDoublingMultiplyReturnHigh32, a, b); + default: + UNREACHABLE(); + } + }(); + return result; +} + +UAny IREmitter::SignedSaturatedSub(const UAny& a, const UAny& b) { + ASSERT(a.GetType() == b.GetType()); + const auto result = [&]() -> IR::UAny { + switch (a.GetType()) { + case IR::Type::U8: + return Inst<U8>(Opcode::SignedSaturatedSub8, a, b); + case IR::Type::U16: + return Inst<U16>(Opcode::SignedSaturatedSub16, a, b); + case IR::Type::U32: + return Inst<U32>(Opcode::SignedSaturatedSub32, a, b); + case IR::Type::U64: + return Inst<U64>(Opcode::SignedSaturatedSub64, a, b); + default: + return IR::UAny{}; + } + }(); + return result; +} + +UAny IREmitter::UnsignedSaturatedAdd(const UAny& a, const UAny& b) { + ASSERT(a.GetType() == b.GetType()); + const auto result = [&]() -> IR::UAny { + switch (a.GetType()) { + case IR::Type::U8: + return Inst<U8>(Opcode::UnsignedSaturatedAdd8, a, b); + case IR::Type::U16: + return Inst<U16>(Opcode::UnsignedSaturatedAdd16, a, b); + case IR::Type::U32: + return Inst<U32>(Opcode::UnsignedSaturatedAdd32, a, b); + case IR::Type::U64: + return Inst<U64>(Opcode::UnsignedSaturatedAdd64, a, b); + default: + return IR::UAny{}; + } + }(); + return result; +} + +UAny IREmitter::UnsignedSaturatedSub(const UAny& a, const UAny& b) { + ASSERT(a.GetType() == b.GetType()); + const auto result = [&]() -> IR::UAny { + switch (a.GetType()) { + case IR::Type::U8: + return Inst<U8>(Opcode::UnsignedSaturatedSub8, a, b); + case IR::Type::U16: + return Inst<U16>(Opcode::UnsignedSaturatedSub16, a, b); + case IR::Type::U32: + return Inst<U32>(Opcode::UnsignedSaturatedSub32, a, b); + case IR::Type::U64: + return Inst<U64>(Opcode::UnsignedSaturatedSub64, a, b); + default: + return IR::UAny{}; + } + }(); + return result; +} + +U128 IREmitter::VectorSignedSaturatedAdd(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorSignedSaturatedAdd8, a, b); + case 16: + return Inst<U128>(Opcode::VectorSignedSaturatedAdd16, a, b); + case 32: + return Inst<U128>(Opcode::VectorSignedSaturatedAdd32, a, b); + case 64: + return Inst<U128>(Opcode::VectorSignedSaturatedAdd64, a, b); + default: + UNREACHABLE(); + } +} + +U128 IREmitter::VectorSignedSaturatedSub(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorSignedSaturatedSub8, a, b); + case 16: + return Inst<U128>(Opcode::VectorSignedSaturatedSub16, a, b); + case 32: + return 
Inst<U128>(Opcode::VectorSignedSaturatedSub32, a, b); + case 64: + return Inst<U128>(Opcode::VectorSignedSaturatedSub64, a, b); + default: + UNREACHABLE(); + } +} + +U128 IREmitter::VectorUnsignedSaturatedAdd(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorUnsignedSaturatedAdd8, a, b); + case 16: + return Inst<U128>(Opcode::VectorUnsignedSaturatedAdd16, a, b); + case 32: + return Inst<U128>(Opcode::VectorUnsignedSaturatedAdd32, a, b); + case 64: + return Inst<U128>(Opcode::VectorUnsignedSaturatedAdd64, a, b); + default: + UNREACHABLE(); + } +} + +U128 IREmitter::VectorUnsignedSaturatedSub(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorUnsignedSaturatedSub8, a, b); + case 16: + return Inst<U128>(Opcode::VectorUnsignedSaturatedSub16, a, b); + case 32: + return Inst<U128>(Opcode::VectorUnsignedSaturatedSub32, a, b); + case 64: + return Inst<U128>(Opcode::VectorUnsignedSaturatedSub64, a, b); + default: + UNREACHABLE(); + } +} + +ResultAndGE<U32> IREmitter::PackedAddU8(const U32& a, const U32& b) { + const auto result = Inst<U32>(Opcode::PackedAddU8, a, b); + const auto ge = Inst<U32>(Opcode::GetGEFromOp, result); + return {result, ge}; +} + +ResultAndGE<U32> IREmitter::PackedAddS8(const U32& a, const U32& b) { + const auto result = Inst<U32>(Opcode::PackedAddS8, a, b); + const auto ge = Inst<U32>(Opcode::GetGEFromOp, result); + return {result, ge}; +} + +ResultAndGE<U32> IREmitter::PackedAddU16(const U32& a, const U32& b) { + const auto result = Inst<U32>(Opcode::PackedAddU16, a, b); + const auto ge = Inst<U32>(Opcode::GetGEFromOp, result); + return {result, ge}; +} + +ResultAndGE<U32> IREmitter::PackedAddS16(const U32& a, const U32& b) { + const auto result = Inst<U32>(Opcode::PackedAddS16, a, b); + const auto ge = Inst<U32>(Opcode::GetGEFromOp, result); + return {result, ge}; +} + +ResultAndGE<U32> IREmitter::PackedSubU8(const U32& a, const U32& b) { + const auto result = Inst<U32>(Opcode::PackedSubU8, a, b); + const auto ge = Inst<U32>(Opcode::GetGEFromOp, result); + return {result, ge}; +} + +ResultAndGE<U32> IREmitter::PackedSubS8(const U32& a, const U32& b) { + const auto result = Inst<U32>(Opcode::PackedSubS8, a, b); + const auto ge = Inst<U32>(Opcode::GetGEFromOp, result); + return {result, ge}; +} + +ResultAndGE<U32> IREmitter::PackedSubU16(const U32& a, const U32& b) { + const auto result = Inst<U32>(Opcode::PackedSubU16, a, b); + const auto ge = Inst<U32>(Opcode::GetGEFromOp, result); + return {result, ge}; +} + +ResultAndGE<U32> IREmitter::PackedSubS16(const U32& a, const U32& b) { + const auto result = Inst<U32>(Opcode::PackedSubS16, a, b); + const auto ge = Inst<U32>(Opcode::GetGEFromOp, result); + return {result, ge}; +} + +ResultAndGE<U32> IREmitter::PackedAddSubU16(const U32& a, const U32& b) { + const auto result = Inst<U32>(Opcode::PackedAddSubU16, a, b); + const auto ge = Inst<U32>(Opcode::GetGEFromOp, result); + return {result, ge}; +} + +ResultAndGE<U32> IREmitter::PackedAddSubS16(const U32& a, const U32& b) { + const auto result = Inst<U32>(Opcode::PackedAddSubS16, a, b); + const auto ge = Inst<U32>(Opcode::GetGEFromOp, result); + return {result, ge}; +} + +ResultAndGE<U32> IREmitter::PackedSubAddU16(const U32& a, const U32& b) { + const auto result = Inst<U32>(Opcode::PackedSubAddU16, a, b); + const auto ge = Inst<U32>(Opcode::GetGEFromOp, result); + return {result, ge}; +} + +ResultAndGE<U32> IREmitter::PackedSubAddS16(const U32& a, const U32& b) { 
+ const auto result = Inst<U32>(Opcode::PackedSubAddS16, a, b); + const auto ge = Inst<U32>(Opcode::GetGEFromOp, result); + return {result, ge}; +} + +U32 IREmitter::PackedHalvingAddU8(const U32& a, const U32& b) { + return Inst<U32>(Opcode::PackedHalvingAddU8, a, b); +} + +U32 IREmitter::PackedHalvingAddS8(const U32& a, const U32& b) { + return Inst<U32>(Opcode::PackedHalvingAddS8, a, b); +} + +U32 IREmitter::PackedHalvingSubU8(const U32& a, const U32& b) { + return Inst<U32>(Opcode::PackedHalvingSubU8, a, b); +} + +U32 IREmitter::PackedHalvingSubS8(const U32& a, const U32& b) { + return Inst<U32>(Opcode::PackedHalvingSubS8, a, b); +} + +U32 IREmitter::PackedHalvingAddU16(const U32& a, const U32& b) { + return Inst<U32>(Opcode::PackedHalvingAddU16, a, b); +} + +U32 IREmitter::PackedHalvingAddS16(const U32& a, const U32& b) { + return Inst<U32>(Opcode::PackedHalvingAddS16, a, b); +} + +U32 IREmitter::PackedHalvingSubU16(const U32& a, const U32& b) { + return Inst<U32>(Opcode::PackedHalvingSubU16, a, b); +} + +U32 IREmitter::PackedHalvingSubS16(const U32& a, const U32& b) { + return Inst<U32>(Opcode::PackedHalvingSubS16, a, b); +} + +U32 IREmitter::PackedHalvingAddSubU16(const U32& a, const U32& b) { + return Inst<U32>(Opcode::PackedHalvingAddSubU16, a, b); +} + +U32 IREmitter::PackedHalvingAddSubS16(const U32& a, const U32& b) { + return Inst<U32>(Opcode::PackedHalvingAddSubS16, a, b); +} + +U32 IREmitter::PackedHalvingSubAddU16(const U32& a, const U32& b) { + return Inst<U32>(Opcode::PackedHalvingSubAddU16, a, b); +} + +U32 IREmitter::PackedHalvingSubAddS16(const U32& a, const U32& b) { + return Inst<U32>(Opcode::PackedHalvingSubAddS16, a, b); +} + +U32 IREmitter::PackedSaturatedAddU8(const U32& a, const U32& b) { + return Inst<U32>(Opcode::PackedSaturatedAddU8, a, b); +} + +U32 IREmitter::PackedSaturatedAddS8(const U32& a, const U32& b) { + return Inst<U32>(Opcode::PackedSaturatedAddS8, a, b); +} + +U32 IREmitter::PackedSaturatedSubU8(const U32& a, const U32& b) { + return Inst<U32>(Opcode::PackedSaturatedSubU8, a, b); +} + +U32 IREmitter::PackedSaturatedSubS8(const U32& a, const U32& b) { + return Inst<U32>(Opcode::PackedSaturatedSubS8, a, b); +} + +U32 IREmitter::PackedSaturatedAddU16(const U32& a, const U32& b) { + return Inst<U32>(Opcode::PackedSaturatedAddU16, a, b); +} + +U32 IREmitter::PackedSaturatedAddS16(const U32& a, const U32& b) { + return Inst<U32>(Opcode::PackedSaturatedAddS16, a, b); +} + +U32 IREmitter::PackedSaturatedSubU16(const U32& a, const U32& b) { + return Inst<U32>(Opcode::PackedSaturatedSubU16, a, b); +} + +U32 IREmitter::PackedSaturatedSubS16(const U32& a, const U32& b) { + return Inst<U32>(Opcode::PackedSaturatedSubS16, a, b); +} + +U32 IREmitter::PackedAbsDiffSumU8(const U32& a, const U32& b) { + return Inst<U32>(Opcode::PackedAbsDiffSumU8, a, b); +} + +U32 IREmitter::PackedSelect(const U32& ge, const U32& a, const U32& b) { + return Inst<U32>(Opcode::PackedSelect, ge, a, b); +} + +U32 IREmitter::CRC32Castagnoli8(const U32& a, const U32& b) { + return Inst<U32>(Opcode::CRC32Castagnoli8, a, b); +} + +U32 IREmitter::CRC32Castagnoli16(const U32& a, const U32& b) { + return Inst<U32>(Opcode::CRC32Castagnoli16, a, b); +} + +U32 IREmitter::CRC32Castagnoli32(const U32& a, const U32& b) { + return Inst<U32>(Opcode::CRC32Castagnoli32, a, b); +} + +U32 IREmitter::CRC32Castagnoli64(const U32& a, const U64& b) { + return Inst<U32>(Opcode::CRC32Castagnoli64, a, b); +} + +U32 IREmitter::CRC32ISO8(const U32& a, const U32& b) { + return Inst<U32>(Opcode::CRC32ISO8, a, b); +} 
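+
+// Illustrative sketch (assumes an IREmitter instance `ir` and IR::U32 values
+// x, y, p, q already produced by the surrounding translator): the packed
+// helpers above return a {result, ge} pair, and the GE bits from an unsigned
+// packed add can later drive PackedSelect, mirroring the A32 UADD8 + SEL idiom.
+//
+//     const auto [sum, ge] = ir.PackedAddU8(x, y);        // sum plus GE flags
+//     const IR::U32 chosen = ir.PackedSelect(ge, p, q);   // per-byte pick by GE
+//
+// The structured binding relies only on ResultAndGE<U32> being a two-member
+// aggregate, exactly as returned by the emitters above.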
+ +U32 IREmitter::CRC32ISO16(const U32& a, const U32& b) { + return Inst<U32>(Opcode::CRC32ISO16, a, b); +} + +U32 IREmitter::CRC32ISO32(const U32& a, const U32& b) { + return Inst<U32>(Opcode::CRC32ISO32, a, b); +} + +U32 IREmitter::CRC32ISO64(const U32& a, const U64& b) { + return Inst<U32>(Opcode::CRC32ISO64, a, b); +} + +U128 IREmitter::AESDecryptSingleRound(const U128& a) { + return Inst<U128>(Opcode::AESDecryptSingleRound, a); +} + +U128 IREmitter::AESEncryptSingleRound(const U128& a) { + return Inst<U128>(Opcode::AESEncryptSingleRound, a); +} + +U128 IREmitter::AESInverseMixColumns(const U128& a) { + return Inst<U128>(Opcode::AESInverseMixColumns, a); +} + +U128 IREmitter::AESMixColumns(const U128& a) { + return Inst<U128>(Opcode::AESMixColumns, a); +} + +U8 IREmitter::SM4AccessSubstitutionBox(const U8& a) { + return Inst<U8>(Opcode::SM4AccessSubstitutionBox, a); +} + +U128 IREmitter::SHA256Hash(const U128& x, const U128& y, const U128& w, bool part1) { + return Inst<U128>(Opcode::SHA256Hash, x, y, w, Imm1(part1)); +} + +U128 IREmitter::SHA256MessageSchedule0(const U128& x, const U128& y) { + return Inst<U128>(Opcode::SHA256MessageSchedule0, x, y); +} + +U128 IREmitter::SHA256MessageSchedule1(const U128& x, const U128& y, const U128& z) { + return Inst<U128>(Opcode::SHA256MessageSchedule1, x, y, z); +} + +UAny IREmitter::VectorGetElement(size_t esize, const U128& a, size_t index) { + ASSERT_MSG(esize * index < 128, "Invalid index"); + switch (esize) { + case 8: + return Inst<U8>(Opcode::VectorGetElement8, a, Imm8(static_cast<u8>(index))); + case 16: + return Inst<U16>(Opcode::VectorGetElement16, a, Imm8(static_cast<u8>(index))); + case 32: + return Inst<U32>(Opcode::VectorGetElement32, a, Imm8(static_cast<u8>(index))); + case 64: + return Inst<U64>(Opcode::VectorGetElement64, a, Imm8(static_cast<u8>(index))); + default: + UNREACHABLE(); + } +} + +U128 IREmitter::VectorSetElement(size_t esize, const U128& a, size_t index, const IR::UAny& elem) { + ASSERT_MSG(esize * index < 128, "Invalid index"); + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorSetElement8, a, Imm8(static_cast<u8>(index)), elem); + case 16: + return Inst<U128>(Opcode::VectorSetElement16, a, Imm8(static_cast<u8>(index)), elem); + case 32: + return Inst<U128>(Opcode::VectorSetElement32, a, Imm8(static_cast<u8>(index)), elem); + case 64: + return Inst<U128>(Opcode::VectorSetElement64, a, Imm8(static_cast<u8>(index)), elem); + default: + UNREACHABLE(); + } +} + +U128 IREmitter::VectorAbs(size_t esize, const U128& a) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorAbs8, a); + case 16: + return Inst<U128>(Opcode::VectorAbs16, a); + case 32: + return Inst<U128>(Opcode::VectorAbs32, a); + case 64: + return Inst<U128>(Opcode::VectorAbs64, a); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorAdd(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorAdd8, a, b); + case 16: + return Inst<U128>(Opcode::VectorAdd16, a, b); + case 32: + return Inst<U128>(Opcode::VectorAdd32, a, b); + case 64: + return Inst<U128>(Opcode::VectorAdd64, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorAnd(const U128& a, const U128& b) { + return Inst<U128>(Opcode::VectorAnd, a, b); +} + +U128 IREmitter::VectorAndNot(const U128& a, const U128& b) { + return Inst<U128>(Opcode::VectorAndNot, a, b); +} + +U128 IREmitter::VectorArithmeticShiftRight(size_t esize, const U128& a, u8 shift_amount) { + switch (esize) { + case 8: + return 
Inst<U128>(Opcode::VectorArithmeticShiftRight8, a, Imm8(shift_amount)); + case 16: + return Inst<U128>(Opcode::VectorArithmeticShiftRight16, a, Imm8(shift_amount)); + case 32: + return Inst<U128>(Opcode::VectorArithmeticShiftRight32, a, Imm8(shift_amount)); + case 64: + return Inst<U128>(Opcode::VectorArithmeticShiftRight64, a, Imm8(shift_amount)); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorArithmeticVShift(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorArithmeticVShift8, a, b); + case 16: + return Inst<U128>(Opcode::VectorArithmeticVShift16, a, b); + case 32: + return Inst<U128>(Opcode::VectorArithmeticVShift32, a, b); + case 64: + return Inst<U128>(Opcode::VectorArithmeticVShift64, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorBroadcastLower(size_t esize, const UAny& a) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorBroadcastLower8, U8(a)); + case 16: + return Inst<U128>(Opcode::VectorBroadcastLower16, U16(a)); + case 32: + return Inst<U128>(Opcode::VectorBroadcastLower32, U32(a)); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorBroadcast(size_t esize, const UAny& a) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorBroadcast8, U8(a)); + case 16: + return Inst<U128>(Opcode::VectorBroadcast16, U16(a)); + case 32: + return Inst<U128>(Opcode::VectorBroadcast32, U32(a)); + case 64: + return Inst<U128>(Opcode::VectorBroadcast64, U64(a)); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorBroadcastElementLower(size_t esize, const U128& a, size_t index) { + ASSERT_MSG(esize * index < 128, "Invalid index"); + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorBroadcastElementLower8, a, u8(index)); + case 16: + return Inst<U128>(Opcode::VectorBroadcastElementLower16, a, u8(index)); + case 32: + return Inst<U128>(Opcode::VectorBroadcastElementLower32, a, u8(index)); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorBroadcastElement(size_t esize, const U128& a, size_t index) { + ASSERT_MSG(esize * index < 128, "Invalid index"); + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorBroadcastElement8, a, u8(index)); + case 16: + return Inst<U128>(Opcode::VectorBroadcastElement16, a, u8(index)); + case 32: + return Inst<U128>(Opcode::VectorBroadcastElement32, a, u8(index)); + case 64: + return Inst<U128>(Opcode::VectorBroadcastElement64, a, u8(index)); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorCountLeadingZeros(size_t esize, const U128& a) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorCountLeadingZeros8, a); + case 16: + return Inst<U128>(Opcode::VectorCountLeadingZeros16, a); + case 32: + return Inst<U128>(Opcode::VectorCountLeadingZeros32, a); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorDeinterleaveEven(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorDeinterleaveEven8, a, b); + case 16: + return Inst<U128>(Opcode::VectorDeinterleaveEven16, a, b); + case 32: + return Inst<U128>(Opcode::VectorDeinterleaveEven32, a, b); + case 64: + return Inst<U128>(Opcode::VectorDeinterleaveEven64, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorDeinterleaveOdd(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorDeinterleaveOdd8, a, b); + case 16: + return Inst<U128>(Opcode::VectorDeinterleaveOdd16, a, b); + case 32: + return Inst<U128>(Opcode::VectorDeinterleaveOdd32, a, b); + case 64: + return 
Inst<U128>(Opcode::VectorDeinterleaveOdd64, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorDeinterleaveEvenLower(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorDeinterleaveEvenLower8, a, b); + case 16: + return Inst<U128>(Opcode::VectorDeinterleaveEvenLower16, a, b); + case 32: + return Inst<U128>(Opcode::VectorDeinterleaveEvenLower32, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorDeinterleaveOddLower(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorDeinterleaveOddLower8, a, b); + case 16: + return Inst<U128>(Opcode::VectorDeinterleaveOddLower16, a, b); + case 32: + return Inst<U128>(Opcode::VectorDeinterleaveOddLower32, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorEor(const U128& a, const U128& b) { + return Inst<U128>(Opcode::VectorEor, a, b); +} + +U128 IREmitter::VectorEqual(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorEqual8, a, b); + case 16: + return Inst<U128>(Opcode::VectorEqual16, a, b); + case 32: + return Inst<U128>(Opcode::VectorEqual32, a, b); + case 64: + return Inst<U128>(Opcode::VectorEqual64, a, b); + case 128: + return Inst<U128>(Opcode::VectorEqual128, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorExtract(const U128& a, const U128& b, size_t position) { + ASSERT(position <= 128); + return Inst<U128>(Opcode::VectorExtract, a, b, Imm8(static_cast<u8>(position))); +} + +U128 IREmitter::VectorExtractLower(const U128& a, const U128& b, size_t position) { + ASSERT(position <= 64); + return Inst<U128>(Opcode::VectorExtractLower, a, b, Imm8(static_cast<u8>(position))); +} + +U128 IREmitter::VectorGreaterSigned(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorGreaterS8, a, b); + case 16: + return Inst<U128>(Opcode::VectorGreaterS16, a, b); + case 32: + return Inst<U128>(Opcode::VectorGreaterS32, a, b); + case 64: + return Inst<U128>(Opcode::VectorGreaterS64, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorGreaterEqualSigned(size_t esize, const U128& a, const U128& b) { + return VectorOr(VectorGreaterSigned(esize, a, b), VectorEqual(esize, a, b)); +} + +U128 IREmitter::VectorGreaterEqualUnsigned(size_t esize, const U128& a, const U128& b) { + return VectorEqual(esize, VectorMaxUnsigned(esize, a, b), a); +} + +U128 IREmitter::VectorGreaterUnsigned(size_t esize, const U128& a, const U128& b) { + return VectorNot(VectorEqual(esize, VectorMinUnsigned(esize, a, b), a)); +} + +U128 IREmitter::VectorHalvingAddSigned(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorHalvingAddS8, a, b); + case 16: + return Inst<U128>(Opcode::VectorHalvingAddS16, a, b); + case 32: + return Inst<U128>(Opcode::VectorHalvingAddS32, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorHalvingAddUnsigned(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorHalvingAddU8, a, b); + case 16: + return Inst<U128>(Opcode::VectorHalvingAddU16, a, b); + case 32: + return Inst<U128>(Opcode::VectorHalvingAddU32, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorHalvingSubSigned(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorHalvingSubS8, a, b); + case 16: + return Inst<U128>(Opcode::VectorHalvingSubS16, a, b); + case 32: + return 
Inst<U128>(Opcode::VectorHalvingSubS32, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorHalvingSubUnsigned(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorHalvingSubU8, a, b); + case 16: + return Inst<U128>(Opcode::VectorHalvingSubU16, a, b); + case 32: + return Inst<U128>(Opcode::VectorHalvingSubU32, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorInterleaveLower(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorInterleaveLower8, a, b); + case 16: + return Inst<U128>(Opcode::VectorInterleaveLower16, a, b); + case 32: + return Inst<U128>(Opcode::VectorInterleaveLower32, a, b); + case 64: + return Inst<U128>(Opcode::VectorInterleaveLower64, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorInterleaveUpper(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorInterleaveUpper8, a, b); + case 16: + return Inst<U128>(Opcode::VectorInterleaveUpper16, a, b); + case 32: + return Inst<U128>(Opcode::VectorInterleaveUpper32, a, b); + case 64: + return Inst<U128>(Opcode::VectorInterleaveUpper64, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorLessEqualSigned(size_t esize, const U128& a, const U128& b) { + return VectorNot(VectorGreaterSigned(esize, a, b)); +} + +U128 IREmitter::VectorLessEqualUnsigned(size_t esize, const U128& a, const U128& b) { + return VectorEqual(esize, VectorMinUnsigned(esize, a, b), a); +} + +U128 IREmitter::VectorLessSigned(size_t esize, const U128& a, const U128& b) { + return VectorNot(VectorOr(VectorGreaterSigned(esize, a, b), VectorEqual(esize, a, b))); +} + +U128 IREmitter::VectorLessUnsigned(size_t esize, const U128& a, const U128& b) { + return VectorNot(VectorEqual(esize, VectorMaxUnsigned(esize, a, b), a)); +} + +U128 IREmitter::VectorLogicalShiftLeft(size_t esize, const U128& a, u8 shift_amount) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorLogicalShiftLeft8, a, Imm8(shift_amount)); + case 16: + return Inst<U128>(Opcode::VectorLogicalShiftLeft16, a, Imm8(shift_amount)); + case 32: + return Inst<U128>(Opcode::VectorLogicalShiftLeft32, a, Imm8(shift_amount)); + case 64: + return Inst<U128>(Opcode::VectorLogicalShiftLeft64, a, Imm8(shift_amount)); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorLogicalShiftRight(size_t esize, const U128& a, u8 shift_amount) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorLogicalShiftRight8, a, Imm8(shift_amount)); + case 16: + return Inst<U128>(Opcode::VectorLogicalShiftRight16, a, Imm8(shift_amount)); + case 32: + return Inst<U128>(Opcode::VectorLogicalShiftRight32, a, Imm8(shift_amount)); + case 64: + return Inst<U128>(Opcode::VectorLogicalShiftRight64, a, Imm8(shift_amount)); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorLogicalVShift(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorLogicalVShift8, a, b); + case 16: + return Inst<U128>(Opcode::VectorLogicalVShift16, a, b); + case 32: + return Inst<U128>(Opcode::VectorLogicalVShift32, a, b); + case 64: + return Inst<U128>(Opcode::VectorLogicalVShift64, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorMaxSigned(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorMaxS8, a, b); + case 16: + return Inst<U128>(Opcode::VectorMaxS16, a, b); + case 32: + return Inst<U128>(Opcode::VectorMaxS32, a, b); + case 64: + return 
Inst<U128>(Opcode::VectorMaxS64, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorMaxUnsigned(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorMaxU8, a, b); + case 16: + return Inst<U128>(Opcode::VectorMaxU16, a, b); + case 32: + return Inst<U128>(Opcode::VectorMaxU32, a, b); + case 64: + return Inst<U128>(Opcode::VectorMaxU64, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorMinSigned(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorMinS8, a, b); + case 16: + return Inst<U128>(Opcode::VectorMinS16, a, b); + case 32: + return Inst<U128>(Opcode::VectorMinS32, a, b); + case 64: + return Inst<U128>(Opcode::VectorMinS64, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorMinUnsigned(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorMinU8, a, b); + case 16: + return Inst<U128>(Opcode::VectorMinU16, a, b); + case 32: + return Inst<U128>(Opcode::VectorMinU32, a, b); + case 64: + return Inst<U128>(Opcode::VectorMinU64, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorMultiply(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorMultiply8, a, b); + case 16: + return Inst<U128>(Opcode::VectorMultiply16, a, b); + case 32: + return Inst<U128>(Opcode::VectorMultiply32, a, b); + case 64: + return Inst<U128>(Opcode::VectorMultiply64, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorMultiplySignedWiden(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorMultiplySignedWiden8, a, b); + case 16: + return Inst<U128>(Opcode::VectorMultiplySignedWiden16, a, b); + case 32: + return Inst<U128>(Opcode::VectorMultiplySignedWiden32, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorMultiplyUnsignedWiden(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorMultiplyUnsignedWiden8, a, b); + case 16: + return Inst<U128>(Opcode::VectorMultiplyUnsignedWiden16, a, b); + case 32: + return Inst<U128>(Opcode::VectorMultiplyUnsignedWiden32, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorNarrow(size_t original_esize, const U128& a) { + switch (original_esize) { + case 16: + return Inst<U128>(Opcode::VectorNarrow16, a); + case 32: + return Inst<U128>(Opcode::VectorNarrow32, a); + case 64: + return Inst<U128>(Opcode::VectorNarrow64, a); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorNot(const U128& a) { + return Inst<U128>(Opcode::VectorNot, a); +} + +U128 IREmitter::VectorOr(const U128& a, const U128& b) { + return Inst<U128>(Opcode::VectorOr, a, b); +} + +U128 IREmitter::VectorPairedAdd(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorPairedAdd8, a, b); + case 16: + return Inst<U128>(Opcode::VectorPairedAdd16, a, b); + case 32: + return Inst<U128>(Opcode::VectorPairedAdd32, a, b); + case 64: + return Inst<U128>(Opcode::VectorPairedAdd64, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorPairedAddLower(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorPairedAddLower8, a, b); + case 16: + return Inst<U128>(Opcode::VectorPairedAddLower16, a, b); + case 32: + return Inst<U128>(Opcode::VectorPairedAddLower32, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorPairedAddSignedWiden(size_t original_esize, const U128& 
a) { + switch (original_esize) { + case 8: + return Inst<U128>(Opcode::VectorPairedAddSignedWiden8, a); + case 16: + return Inst<U128>(Opcode::VectorPairedAddSignedWiden16, a); + case 32: + return Inst<U128>(Opcode::VectorPairedAddSignedWiden32, a); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorPairedAddUnsignedWiden(size_t original_esize, const U128& a) { + switch (original_esize) { + case 8: + return Inst<U128>(Opcode::VectorPairedAddUnsignedWiden8, a); + case 16: + return Inst<U128>(Opcode::VectorPairedAddUnsignedWiden16, a); + case 32: + return Inst<U128>(Opcode::VectorPairedAddUnsignedWiden32, a); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorPairedMaxSigned(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorPairedMaxS8, a, b); + case 16: + return Inst<U128>(Opcode::VectorPairedMaxS16, a, b); + case 32: + return Inst<U128>(Opcode::VectorPairedMaxS32, a, b); + default: + UNREACHABLE(); + } +} + +U128 IREmitter::VectorPairedMaxUnsigned(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorPairedMaxU8, a, b); + case 16: + return Inst<U128>(Opcode::VectorPairedMaxU16, a, b); + case 32: + return Inst<U128>(Opcode::VectorPairedMaxU32, a, b); + default: + UNREACHABLE(); + } +} + +U128 IREmitter::VectorPairedMinSigned(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorPairedMinS8, a, b); + case 16: + return Inst<U128>(Opcode::VectorPairedMinS16, a, b); + case 32: + return Inst<U128>(Opcode::VectorPairedMinS32, a, b); + default: + UNREACHABLE(); + } +} + +U128 IREmitter::VectorPairedMinUnsigned(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorPairedMinU8, a, b); + case 16: + return Inst<U128>(Opcode::VectorPairedMinU16, a, b); + case 32: + return Inst<U128>(Opcode::VectorPairedMinU32, a, b); + default: + UNREACHABLE(); + } +} + +U128 IREmitter::VectorPairedMaxSignedLower(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorPairedMaxLowerS8, a, b); + case 16: + return Inst<U128>(Opcode::VectorPairedMaxLowerS16, a, b); + case 32: + return Inst<U128>(Opcode::VectorPairedMaxLowerS32, a, b); + default: + UNREACHABLE(); + } +} + +U128 IREmitter::VectorPairedMaxUnsignedLower(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorPairedMaxLowerU8, a, b); + case 16: + return Inst<U128>(Opcode::VectorPairedMaxLowerU16, a, b); + case 32: + return Inst<U128>(Opcode::VectorPairedMaxLowerU32, a, b); + default: + UNREACHABLE(); + } +} + +U128 IREmitter::VectorPairedMinSignedLower(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorPairedMinLowerS8, a, b); + case 16: + return Inst<U128>(Opcode::VectorPairedMinLowerS16, a, b); + case 32: + return Inst<U128>(Opcode::VectorPairedMinLowerS32, a, b); + default: + UNREACHABLE(); + } +} + +U128 IREmitter::VectorPairedMinUnsignedLower(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorPairedMinLowerU8, a, b); + case 16: + return Inst<U128>(Opcode::VectorPairedMinLowerU16, a, b); + case 32: + return Inst<U128>(Opcode::VectorPairedMinLowerU32, a, b); + default: + UNREACHABLE(); + } +} + +U128 IREmitter::VectorPolynomialMultiply(const U128& a, const U128& b) { + return Inst<U128>(Opcode::VectorPolynomialMultiply8, a, 
b); +} + +U128 IREmitter::VectorPolynomialMultiplyLong(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorPolynomialMultiplyLong8, a, b); + case 64: + return Inst<U128>(Opcode::VectorPolynomialMultiplyLong64, a, b); + default: + UNREACHABLE(); + } +} + +U128 IREmitter::VectorPopulationCount(const U128& a) { + return Inst<U128>(Opcode::VectorPopulationCount, a); +} + +U128 IREmitter::VectorReverseBits(const U128& a) { + return Inst<U128>(Opcode::VectorReverseBits, a); +} + +U128 IREmitter::VectorReverseElementsInHalfGroups(size_t esize, const U128& a) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorReverseElementsInHalfGroups8, a); + default: + UNREACHABLE(); + } +} + +U128 IREmitter::VectorReverseElementsInWordGroups(size_t esize, const U128& a) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorReverseElementsInWordGroups8, a); + case 16: + return Inst<U128>(Opcode::VectorReverseElementsInWordGroups16, a); + default: + UNREACHABLE(); + } +} + +U128 IREmitter::VectorReverseElementsInLongGroups(size_t esize, const U128& a) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorReverseElementsInLongGroups8, a); + case 16: + return Inst<U128>(Opcode::VectorReverseElementsInLongGroups16, a); + case 32: + return Inst<U128>(Opcode::VectorReverseElementsInLongGroups32, a); + default: + UNREACHABLE(); + } +} + +U128 IREmitter::VectorReduceAdd(size_t esize, const U128& a) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorReduceAdd8, a); + case 16: + return Inst<U128>(Opcode::VectorReduceAdd16, a); + case 32: + return Inst<U128>(Opcode::VectorReduceAdd32, a); + case 64: + return Inst<U128>(Opcode::VectorReduceAdd64, a); + } + + UNREACHABLE(); +} + +U128 IREmitter::VectorRotateLeft(size_t esize, const U128& a, u8 amount) { + ASSERT(amount < esize); + + if (amount == 0) { + return a; + } + + return VectorOr(VectorLogicalShiftLeft(esize, a, amount), + VectorLogicalShiftRight(esize, a, static_cast<u8>(esize - amount))); +} + +U128 IREmitter::VectorRotateRight(size_t esize, const U128& a, u8 amount) { + ASSERT(amount < esize); + + if (amount == 0) { + return a; + } + + return VectorOr(VectorLogicalShiftRight(esize, a, amount), + VectorLogicalShiftLeft(esize, a, static_cast<u8>(esize - amount))); +} + +U128 IREmitter::VectorRotateWholeVectorRight(const U128& a, u8 amount) { + ASSERT(amount % 32 == 0); + return Inst<U128>(Opcode::VectorRotateWholeVectorRight, a, Imm8(amount)); +} + +U128 IREmitter::VectorRoundingHalvingAddSigned(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorRoundingHalvingAddS8, a, b); + case 16: + return Inst<U128>(Opcode::VectorRoundingHalvingAddS16, a, b); + case 32: + return Inst<U128>(Opcode::VectorRoundingHalvingAddS32, a, b); + } + + UNREACHABLE(); +} + +U128 IREmitter::VectorRoundingHalvingAddUnsigned(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorRoundingHalvingAddU8, a, b); + case 16: + return Inst<U128>(Opcode::VectorRoundingHalvingAddU16, a, b); + case 32: + return Inst<U128>(Opcode::VectorRoundingHalvingAddU32, a, b); + } + + UNREACHABLE(); +} + +U128 IREmitter::VectorRoundingShiftLeftSigned(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorRoundingShiftLeftS8, a, b); + case 16: + return Inst<U128>(Opcode::VectorRoundingShiftLeftS16, a, b); + case 32: + return 
Inst<U128>(Opcode::VectorRoundingShiftLeftS32, a, b); + case 64: + return Inst<U128>(Opcode::VectorRoundingShiftLeftS64, a, b); + } + + UNREACHABLE(); +} + +U128 IREmitter::VectorRoundingShiftLeftUnsigned(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorRoundingShiftLeftU8, a, b); + case 16: + return Inst<U128>(Opcode::VectorRoundingShiftLeftU16, a, b); + case 32: + return Inst<U128>(Opcode::VectorRoundingShiftLeftU32, a, b); + case 64: + return Inst<U128>(Opcode::VectorRoundingShiftLeftU64, a, b); + } + + UNREACHABLE(); +} + +U128 IREmitter::VectorSignExtend(size_t original_esize, const U128& a) { + switch (original_esize) { + case 8: + return Inst<U128>(Opcode::VectorSignExtend8, a); + case 16: + return Inst<U128>(Opcode::VectorSignExtend16, a); + case 32: + return Inst<U128>(Opcode::VectorSignExtend32, a); + case 64: + return Inst<U128>(Opcode::VectorSignExtend64, a); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorSignedAbsoluteDifference(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorSignedAbsoluteDifference8, a, b); + case 16: + return Inst<U128>(Opcode::VectorSignedAbsoluteDifference16, a, b); + case 32: + return Inst<U128>(Opcode::VectorSignedAbsoluteDifference32, a, b); + } + UNREACHABLE(); +} + +UpperAndLower IREmitter::VectorSignedMultiply(size_t esize, const U128& a, const U128& b) { + const Value multiply = [&] { + switch (esize) { + case 16: + return Inst(Opcode::VectorSignedMultiply16, a, b); + case 32: + return Inst(Opcode::VectorSignedMultiply32, a, b); + } + UNREACHABLE(); + }(); + + return { + Inst<U128>(Opcode::GetUpperFromOp, multiply), + Inst<U128>(Opcode::GetLowerFromOp, multiply), + }; +} + +U128 IREmitter::VectorSignedSaturatedAbs(size_t esize, const U128& a) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorSignedSaturatedAbs8, a); + case 16: + return Inst<U128>(Opcode::VectorSignedSaturatedAbs16, a); + case 32: + return Inst<U128>(Opcode::VectorSignedSaturatedAbs32, a); + case 64: + return Inst<U128>(Opcode::VectorSignedSaturatedAbs64, a); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorSignedSaturatedAccumulateUnsigned(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorSignedSaturatedAccumulateUnsigned8, a, b); + case 16: + return Inst<U128>(Opcode::VectorSignedSaturatedAccumulateUnsigned16, a, b); + case 32: + return Inst<U128>(Opcode::VectorSignedSaturatedAccumulateUnsigned32, a, b); + case 64: + return Inst<U128>(Opcode::VectorSignedSaturatedAccumulateUnsigned64, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorSignedSaturatedDoublingMultiplyHigh(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 16: + return Inst<U128>(Opcode::VectorSignedSaturatedDoublingMultiplyHigh16, a, b); + case 32: + return Inst<U128>(Opcode::VectorSignedSaturatedDoublingMultiplyHigh32, a, b); + default: + UNREACHABLE(); + } +} + +U128 IREmitter::VectorSignedSaturatedDoublingMultiplyHighRounding(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 16: + return Inst<U128>(Opcode::VectorSignedSaturatedDoublingMultiplyHighRounding16, a, b); + case 32: + return Inst<U128>(Opcode::VectorSignedSaturatedDoublingMultiplyHighRounding32, a, b); + default: + UNREACHABLE(); + } +} + +U128 IREmitter::VectorSignedSaturatedDoublingMultiplyLong(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 16: + return 
Inst<U128>(Opcode::VectorSignedSaturatedDoublingMultiplyLong16, a, b); + case 32: + return Inst<U128>(Opcode::VectorSignedSaturatedDoublingMultiplyLong32, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a) { + switch (original_esize) { + case 16: + return Inst<U128>(Opcode::VectorSignedSaturatedNarrowToSigned16, a); + case 32: + return Inst<U128>(Opcode::VectorSignedSaturatedNarrowToSigned32, a); + case 64: + return Inst<U128>(Opcode::VectorSignedSaturatedNarrowToSigned64, a); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorSignedSaturatedNarrowToUnsigned(size_t original_esize, const U128& a) { + switch (original_esize) { + case 16: + return Inst<U128>(Opcode::VectorSignedSaturatedNarrowToUnsigned16, a); + case 32: + return Inst<U128>(Opcode::VectorSignedSaturatedNarrowToUnsigned32, a); + case 64: + return Inst<U128>(Opcode::VectorSignedSaturatedNarrowToUnsigned64, a); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorSignedSaturatedNeg(size_t esize, const U128& a) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorSignedSaturatedNeg8, a); + case 16: + return Inst<U128>(Opcode::VectorSignedSaturatedNeg16, a); + case 32: + return Inst<U128>(Opcode::VectorSignedSaturatedNeg32, a); + case 64: + return Inst<U128>(Opcode::VectorSignedSaturatedNeg64, a); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorSignedSaturatedShiftLeft(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorSignedSaturatedShiftLeft8, a, b); + case 16: + return Inst<U128>(Opcode::VectorSignedSaturatedShiftLeft16, a, b); + case 32: + return Inst<U128>(Opcode::VectorSignedSaturatedShiftLeft32, a, b); + case 64: + return Inst<U128>(Opcode::VectorSignedSaturatedShiftLeft64, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorSignedSaturatedShiftLeftUnsigned(size_t esize, const U128& a, u8 shift_amount) { + ASSERT(shift_amount < esize); + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorSignedSaturatedShiftLeftUnsigned8, a, Imm8(shift_amount)); + case 16: + return Inst<U128>(Opcode::VectorSignedSaturatedShiftLeftUnsigned16, a, Imm8(shift_amount)); + case 32: + return Inst<U128>(Opcode::VectorSignedSaturatedShiftLeftUnsigned32, a, Imm8(shift_amount)); + case 64: + return Inst<U128>(Opcode::VectorSignedSaturatedShiftLeftUnsigned64, a, Imm8(shift_amount)); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorSub(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorSub8, a, b); + case 16: + return Inst<U128>(Opcode::VectorSub16, a, b); + case 32: + return Inst<U128>(Opcode::VectorSub32, a, b); + case 64: + return Inst<U128>(Opcode::VectorSub64, a, b); + } + UNREACHABLE(); +} + +Table IREmitter::VectorTable(std::vector<U64> values) { + ASSERT(values.size() >= 1 && values.size() <= 4); + values.resize(4); + return Inst<Table>(Opcode::VectorTable, values[0], values[1], values[2], values[3]); +} + +Table IREmitter::VectorTable(std::vector<U128> values) { + ASSERT(values.size() >= 1 && values.size() <= 4); + values.resize(4); + return Inst<Table>(Opcode::VectorTable, values[0], values[1], values[2], values[3]); +} + +U64 IREmitter::VectorTableLookup(const U64& defaults, const Table& table, const U64& indices) { + ASSERT(table.GetInst()->GetArg(0).GetType() == Type::U64); + return Inst<U64>(Opcode::VectorTableLookup64, defaults, table, indices); +} + +U128 IREmitter::VectorTableLookup(const U128& defaults, const Table& table, const 
U128& indices) { + ASSERT(table.GetInst()->GetArg(0).GetType() == Type::U128); + return Inst<U128>(Opcode::VectorTableLookup128, defaults, table, indices); +} + +U128 IREmitter::VectorTranspose(size_t esize, const U128& a, const U128& b, bool part) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorTranspose8, a, b, Imm1(part)); + case 16: + return Inst<U128>(Opcode::VectorTranspose16, a, b, Imm1(part)); + case 32: + return Inst<U128>(Opcode::VectorTranspose32, a, b, Imm1(part)); + case 64: + return Inst<U128>(Opcode::VectorTranspose64, a, b, Imm1(part)); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorUnsignedAbsoluteDifference(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorUnsignedAbsoluteDifference8, a, b); + case 16: + return Inst<U128>(Opcode::VectorUnsignedAbsoluteDifference16, a, b); + case 32: + return Inst<U128>(Opcode::VectorUnsignedAbsoluteDifference32, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorUnsignedRecipEstimate(const U128& a) { + return Inst<U128>(Opcode::VectorUnsignedRecipEstimate, a); +} + +U128 IREmitter::VectorUnsignedRecipSqrtEstimate(const U128& a) { + return Inst<U128>(Opcode::VectorUnsignedRecipSqrtEstimate, a); +} + +U128 IREmitter::VectorUnsignedSaturatedAccumulateSigned(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorUnsignedSaturatedAccumulateSigned8, a, b); + case 16: + return Inst<U128>(Opcode::VectorUnsignedSaturatedAccumulateSigned16, a, b); + case 32: + return Inst<U128>(Opcode::VectorUnsignedSaturatedAccumulateSigned32, a, b); + case 64: + return Inst<U128>(Opcode::VectorUnsignedSaturatedAccumulateSigned64, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorUnsignedSaturatedNarrow(size_t esize, const U128& a) { + switch (esize) { + case 16: + return Inst<U128>(Opcode::VectorUnsignedSaturatedNarrow16, a); + case 32: + return Inst<U128>(Opcode::VectorUnsignedSaturatedNarrow32, a); + case 64: + return Inst<U128>(Opcode::VectorUnsignedSaturatedNarrow64, a); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorUnsignedSaturatedShiftLeft(size_t esize, const U128& a, const U128& b) { + switch (esize) { + case 8: + return Inst<U128>(Opcode::VectorUnsignedSaturatedShiftLeft8, a, b); + case 16: + return Inst<U128>(Opcode::VectorUnsignedSaturatedShiftLeft16, a, b); + case 32: + return Inst<U128>(Opcode::VectorUnsignedSaturatedShiftLeft32, a, b); + case 64: + return Inst<U128>(Opcode::VectorUnsignedSaturatedShiftLeft64, a, b); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorZeroExtend(size_t original_esize, const U128& a) { + switch (original_esize) { + case 8: + return Inst<U128>(Opcode::VectorZeroExtend8, a); + case 16: + return Inst<U128>(Opcode::VectorZeroExtend16, a); + case 32: + return Inst<U128>(Opcode::VectorZeroExtend32, a); + case 64: + return Inst<U128>(Opcode::VectorZeroExtend64, a); + } + UNREACHABLE(); +} + +U128 IREmitter::VectorZeroUpper(const U128& a) { + return Inst<U128>(Opcode::VectorZeroUpper, a); +} + +U128 IREmitter::ZeroVector() { + return Inst<U128>(Opcode::ZeroVector); +} + +U16U32U64 IREmitter::FPAbs(const U16U32U64& a) { + switch (a.GetType()) { + case Type::U16: + return Inst<U16>(Opcode::FPAbs16, a); + case Type::U32: + return Inst<U32>(Opcode::FPAbs32, a); + case Type::U64: + return Inst<U64>(Opcode::FPAbs64, a); + default: + UNREACHABLE(); + } +} + +U32U64 IREmitter::FPAdd(const U32U64& a, const U32U64& b) { + ASSERT(a.GetType() == b.GetType()); + + switch (a.GetType()) { + case 
Type::U32: + return Inst<U32>(Opcode::FPAdd32, a, b); + case Type::U64: + return Inst<U64>(Opcode::FPAdd64, a, b); + default: + UNREACHABLE(); + } +} + +NZCV IREmitter::FPCompare(const U32U64& a, const U32U64& b, bool exc_on_qnan) { + ASSERT(a.GetType() == b.GetType()); + + const IR::U1 exc_on_qnan_imm = Imm1(exc_on_qnan); + + switch (a.GetType()) { + case Type::U32: + return Inst<NZCV>(Opcode::FPCompare32, a, b, exc_on_qnan_imm); + case Type::U64: + return Inst<NZCV>(Opcode::FPCompare64, a, b, exc_on_qnan_imm); + default: + UNREACHABLE(); + } +} + +U32U64 IREmitter::FPDiv(const U32U64& a, const U32U64& b) { + ASSERT(a.GetType() == b.GetType()); + + switch (a.GetType()) { + case Type::U32: + return Inst<U32>(Opcode::FPDiv32, a, b); + case Type::U64: + return Inst<U64>(Opcode::FPDiv64, a, b); + default: + UNREACHABLE(); + } +} + +U32U64 IREmitter::FPMax(const U32U64& a, const U32U64& b) { + ASSERT(a.GetType() == b.GetType()); + + switch (a.GetType()) { + case Type::U32: + return Inst<U32>(Opcode::FPMax32, a, b); + case Type::U64: + return Inst<U64>(Opcode::FPMax64, a, b); + default: + UNREACHABLE(); + } +} + +U32U64 IREmitter::FPMaxNumeric(const U32U64& a, const U32U64& b) { + ASSERT(a.GetType() == b.GetType()); + + switch (a.GetType()) { + case Type::U32: + return Inst<U32>(Opcode::FPMaxNumeric32, a, b); + case Type::U64: + return Inst<U64>(Opcode::FPMaxNumeric64, a, b); + default: + UNREACHABLE(); + } +} + +U32U64 IREmitter::FPMin(const U32U64& a, const U32U64& b) { + ASSERT(a.GetType() == b.GetType()); + + switch (a.GetType()) { + case Type::U32: + return Inst<U32>(Opcode::FPMin32, a, b); + case Type::U64: + return Inst<U64>(Opcode::FPMin64, a, b); + default: + UNREACHABLE(); + } +} + +U32U64 IREmitter::FPMinNumeric(const U32U64& a, const U32U64& b) { + ASSERT(a.GetType() == b.GetType()); + + switch (a.GetType()) { + case Type::U32: + return Inst<U32>(Opcode::FPMinNumeric32, a, b); + case Type::U64: + return Inst<U64>(Opcode::FPMinNumeric64, a, b); + default: + UNREACHABLE(); + } +} + +U32U64 IREmitter::FPMul(const U32U64& a, const U32U64& b) { + ASSERT(a.GetType() == b.GetType()); + + switch (a.GetType()) { + case Type::U32: + return Inst<U32>(Opcode::FPMul32, a, b); + case Type::U64: + return Inst<U64>(Opcode::FPMul64, a, b); + default: + UNREACHABLE(); + } +} + +U16U32U64 IREmitter::FPMulAdd(const U16U32U64& a, const U16U32U64& b, const U16U32U64& c) { + ASSERT(a.GetType() == b.GetType()); + + switch (a.GetType()) { + case Type::U16: + return Inst<U16>(Opcode::FPMulAdd16, a, b, c); + case Type::U32: + return Inst<U32>(Opcode::FPMulAdd32, a, b, c); + case Type::U64: + return Inst<U64>(Opcode::FPMulAdd64, a, b, c); + default: + UNREACHABLE(); + } +} + +U16U32U64 IREmitter::FPMulSub(const U16U32U64& a, const U16U32U64& b, const U16U32U64& c) { + ASSERT(a.GetType() == b.GetType()); + + switch (a.GetType()) { + case Type::U16: + return Inst<U16>(Opcode::FPMulSub16, a, b, c); + case Type::U32: + return Inst<U32>(Opcode::FPMulSub32, a, b, c); + case Type::U64: + return Inst<U64>(Opcode::FPMulSub64, a, b, c); + default: + UNREACHABLE(); + } +} + +U32U64 IREmitter::FPMulX(const U32U64& a, const U32U64& b) { + ASSERT(a.GetType() == b.GetType()); + + switch (a.GetType()) { + case Type::U32: + return Inst<U32>(Opcode::FPMulX32, a, b); + case Type::U64: + return Inst<U64>(Opcode::FPMulX64, a, b); + default: + UNREACHABLE(); + } +} + +U16U32U64 IREmitter::FPNeg(const U16U32U64& a) { + switch (a.GetType()) { + case Type::U16: + return Inst<U16>(Opcode::FPNeg16, a); + case Type::U32: + return 
Inst<U32>(Opcode::FPNeg32, a); + case Type::U64: + return Inst<U64>(Opcode::FPNeg64, a); + default: + UNREACHABLE(); + } +} + +U16U32U64 IREmitter::FPRecipEstimate(const U16U32U64& a) { + switch (a.GetType()) { + case Type::U16: + return Inst<U16>(Opcode::FPRecipEstimate16, a); + case Type::U32: + return Inst<U32>(Opcode::FPRecipEstimate32, a); + case Type::U64: + return Inst<U64>(Opcode::FPRecipEstimate64, a); + default: + UNREACHABLE(); + } +} + +U16U32U64 IREmitter::FPRecipExponent(const U16U32U64& a) { + switch (a.GetType()) { + case Type::U16: + return Inst<U16>(Opcode::FPRecipExponent16, a); + case Type::U32: + return Inst<U32>(Opcode::FPRecipExponent32, a); + case Type::U64: + return Inst<U64>(Opcode::FPRecipExponent64, a); + default: + UNREACHABLE(); + } +} + +U16U32U64 IREmitter::FPRecipStepFused(const U16U32U64& a, const U16U32U64& b) { + ASSERT(a.GetType() == b.GetType()); + + switch (a.GetType()) { + case Type::U16: + return Inst<U16>(Opcode::FPRecipStepFused16, a, b); + case Type::U32: + return Inst<U32>(Opcode::FPRecipStepFused32, a, b); + case Type::U64: + return Inst<U64>(Opcode::FPRecipStepFused64, a, b); + default: + UNREACHABLE(); + } +} + +U16U32U64 IREmitter::FPRoundInt(const U16U32U64& a, FP::RoundingMode rounding, bool exact) { + const u8 rounding_value = static_cast<u8>(rounding); + const IR::U1 exact_imm = Imm1(exact); + + switch (a.GetType()) { + case Type::U16: + return Inst<U16>(Opcode::FPRoundInt16, a, rounding_value, exact_imm); + case Type::U32: + return Inst<U32>(Opcode::FPRoundInt32, a, rounding_value, exact_imm); + case Type::U64: + return Inst<U64>(Opcode::FPRoundInt64, a, rounding_value, exact_imm); + default: + UNREACHABLE(); + } +} + +U16U32U64 IREmitter::FPRSqrtEstimate(const U16U32U64& a) { + switch (a.GetType()) { + case Type::U16: + return Inst<U16>(Opcode::FPRSqrtEstimate16, a); + case Type::U32: + return Inst<U32>(Opcode::FPRSqrtEstimate32, a); + case Type::U64: + return Inst<U64>(Opcode::FPRSqrtEstimate64, a); + default: + UNREACHABLE(); + } +} + +U16U32U64 IREmitter::FPRSqrtStepFused(const U16U32U64& a, const U16U32U64& b) { + ASSERT(a.GetType() == b.GetType()); + + switch (a.GetType()) { + case Type::U16: + return Inst<U16>(Opcode::FPRSqrtStepFused16, a, b); + case Type::U32: + return Inst<U32>(Opcode::FPRSqrtStepFused32, a, b); + case Type::U64: + return Inst<U64>(Opcode::FPRSqrtStepFused64, a, b); + default: + UNREACHABLE(); + } +} + +U32U64 IREmitter::FPSqrt(const U32U64& a) { + switch (a.GetType()) { + case Type::U32: + return Inst<U32>(Opcode::FPSqrt32, a); + case Type::U64: + return Inst<U64>(Opcode::FPSqrt64, a); + default: + UNREACHABLE(); + } +} + +U32U64 IREmitter::FPSub(const U32U64& a, const U32U64& b) { + ASSERT(a.GetType() == b.GetType()); + + switch (a.GetType()) { + case Type::U32: + return Inst<U32>(Opcode::FPSub32, a, b); + case Type::U64: + return Inst<U64>(Opcode::FPSub64, a, b); + default: + UNREACHABLE(); + } +} + +U16 IREmitter::FPDoubleToHalf(const U64& a, FP::RoundingMode rounding) { + return Inst<U16>(Opcode::FPDoubleToHalf, a, Imm8(static_cast<u8>(rounding))); +} + +U32 IREmitter::FPDoubleToSingle(const U64& a, FP::RoundingMode rounding) { + return Inst<U32>(Opcode::FPDoubleToSingle, a, Imm8(static_cast<u8>(rounding))); +} + +U64 IREmitter::FPHalfToDouble(const U16& a, FP::RoundingMode rounding) { + return Inst<U64>(Opcode::FPHalfToDouble, a, Imm8(static_cast<u8>(rounding))); +} + +U32 IREmitter::FPHalfToSingle(const U16& a, FP::RoundingMode rounding) { + return Inst<U32>(Opcode::FPHalfToSingle, a, 
Imm8(static_cast<u8>(rounding))); +} + +U64 IREmitter::FPSingleToDouble(const U32& a, FP::RoundingMode rounding) { + return Inst<U64>(Opcode::FPSingleToDouble, a, Imm8(static_cast<u8>(rounding))); +} + +U16 IREmitter::FPSingleToHalf(const U32& a, FP::RoundingMode rounding) { + return Inst<U16>(Opcode::FPSingleToHalf, a, Imm8(static_cast<u8>(rounding))); +} + +U16 IREmitter::FPToFixedS16(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding) { + ASSERT(fbits <= 16); + + const U8 fbits_imm = Imm8(static_cast<u8>(fbits)); + const U8 rounding_imm = Imm8(static_cast<u8>(rounding)); + + switch (a.GetType()) { + case Type::U16: + return Inst<U16>(Opcode::FPHalfToFixedS16, a, fbits_imm, rounding_imm); + case Type::U32: + return Inst<U16>(Opcode::FPSingleToFixedS16, a, fbits_imm, rounding_imm); + case Type::U64: + return Inst<U16>(Opcode::FPDoubleToFixedS16, a, fbits_imm, rounding_imm); + default: + UNREACHABLE(); + } +} + +U32 IREmitter::FPToFixedS32(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding) { + ASSERT(fbits <= 32); + + const U8 fbits_imm = Imm8(static_cast<u8>(fbits)); + const U8 rounding_imm = Imm8(static_cast<u8>(rounding)); + + switch (a.GetType()) { + case Type::U16: + return Inst<U32>(Opcode::FPHalfToFixedS32, a, fbits_imm, rounding_imm); + case Type::U32: + return Inst<U32>(Opcode::FPSingleToFixedS32, a, fbits_imm, rounding_imm); + case Type::U64: + return Inst<U32>(Opcode::FPDoubleToFixedS32, a, fbits_imm, rounding_imm); + default: + UNREACHABLE(); + } +} + +U64 IREmitter::FPToFixedS64(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding) { + ASSERT(fbits <= 64); + + const U8 fbits_imm = Imm8(static_cast<u8>(fbits)); + const U8 rounding_imm = Imm8(static_cast<u8>(rounding)); + + switch (a.GetType()) { + case Type::U16: + return Inst<U64>(Opcode::FPHalfToFixedS64, a, fbits_imm, rounding_imm); + case Type::U32: + return Inst<U64>(Opcode::FPSingleToFixedS64, a, fbits_imm, rounding_imm); + case Type::U64: + return Inst<U64>(Opcode::FPDoubleToFixedS64, a, fbits_imm, rounding_imm); + default: + UNREACHABLE(); + } +} + +U16 IREmitter::FPToFixedU16(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding) { + ASSERT(fbits <= 16); + + const U8 fbits_imm = Imm8(static_cast<u8>(fbits)); + const U8 rounding_imm = Imm8(static_cast<u8>(rounding)); + + switch (a.GetType()) { + case Type::U16: + return Inst<U16>(Opcode::FPHalfToFixedU16, a, fbits_imm, rounding_imm); + case Type::U32: + return Inst<U16>(Opcode::FPSingleToFixedU16, a, fbits_imm, rounding_imm); + case Type::U64: + return Inst<U16>(Opcode::FPDoubleToFixedU16, a, fbits_imm, rounding_imm); + default: + UNREACHABLE(); + } +} + +U32 IREmitter::FPToFixedU32(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding) { + ASSERT(fbits <= 32); + + const U8 fbits_imm = Imm8(static_cast<u8>(fbits)); + const U8 rounding_imm = Imm8(static_cast<u8>(rounding)); + + switch (a.GetType()) { + case Type::U16: + return Inst<U32>(Opcode::FPHalfToFixedU32, a, fbits_imm, rounding_imm); + case Type::U32: + return Inst<U32>(Opcode::FPSingleToFixedU32, a, fbits_imm, rounding_imm); + case Type::U64: + return Inst<U32>(Opcode::FPDoubleToFixedU32, a, fbits_imm, rounding_imm); + default: + UNREACHABLE(); + } +} + +U64 IREmitter::FPToFixedU64(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding) { + ASSERT(fbits <= 64); + + const U8 fbits_imm = Imm8(static_cast<u8>(fbits)); + const U8 rounding_imm = Imm8(static_cast<u8>(rounding)); + + switch (a.GetType()) { + case Type::U16: + return Inst<U64>(Opcode::FPHalfToFixedU64, a, 
fbits_imm, rounding_imm); + case Type::U32: + return Inst<U64>(Opcode::FPSingleToFixedU64, a, fbits_imm, rounding_imm); + case Type::U64: + return Inst<U64>(Opcode::FPDoubleToFixedU64, a, fbits_imm, rounding_imm); + default: + UNREACHABLE(); + } +} + +U32 IREmitter::FPSignedFixedToSingle(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding) { + ASSERT(fbits <= (a.GetType() == Type::U16 ? 16 : (a.GetType() == Type::U32 ? 32 : 64))); + + const IR::U8 fbits_imm = Imm8(static_cast<u8>(fbits)); + const IR::U8 rounding_imm = Imm8(static_cast<u8>(rounding)); + + switch (a.GetType()) { + case Type::U16: + return Inst<U32>(Opcode::FPFixedS16ToSingle, a, fbits_imm, rounding_imm); + case Type::U32: + return Inst<U32>(Opcode::FPFixedS32ToSingle, a, fbits_imm, rounding_imm); + case Type::U64: + return Inst<U32>(Opcode::FPFixedS64ToSingle, a, fbits_imm, rounding_imm); + default: + UNREACHABLE(); + } +} + +U32 IREmitter::FPUnsignedFixedToSingle(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding) { + ASSERT(fbits <= (a.GetType() == Type::U16 ? 16 : (a.GetType() == Type::U32 ? 32 : 64))); + + const IR::U8 fbits_imm = Imm8(static_cast<u8>(fbits)); + const IR::U8 rounding_imm = Imm8(static_cast<u8>(rounding)); + + switch (a.GetType()) { + case Type::U16: + return Inst<U32>(Opcode::FPFixedU16ToSingle, a, fbits_imm, rounding_imm); + case Type::U32: + return Inst<U32>(Opcode::FPFixedU32ToSingle, a, fbits_imm, rounding_imm); + case Type::U64: + return Inst<U32>(Opcode::FPFixedU64ToSingle, a, fbits_imm, rounding_imm); + default: + UNREACHABLE(); + } +} + +U64 IREmitter::FPSignedFixedToDouble(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding) { + ASSERT(fbits <= (a.GetType() == Type::U16 ? 16 : (a.GetType() == Type::U32 ? 32 : 64))); + + const IR::U8 fbits_imm = Imm8(static_cast<u8>(fbits)); + const IR::U8 rounding_imm = Imm8(static_cast<u8>(rounding)); + + switch (a.GetType()) { + case Type::U16: + return Inst<U64>(Opcode::FPFixedS16ToDouble, a, fbits_imm, rounding_imm); + case Type::U32: + return Inst<U64>(Opcode::FPFixedS32ToDouble, a, fbits_imm, rounding_imm); + case Type::U64: + return Inst<U64>(Opcode::FPFixedS64ToDouble, a, fbits_imm, rounding_imm); + default: + UNREACHABLE(); + } +} + +U64 IREmitter::FPUnsignedFixedToDouble(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding) { + ASSERT(fbits <= (a.GetType() == Type::U16 ? 16 : (a.GetType() == Type::U32 ? 
32 : 64))); + + const IR::U8 fbits_imm = Imm8(static_cast<u8>(fbits)); + const IR::U8 rounding_imm = Imm8(static_cast<u8>(rounding)); + + switch (a.GetType()) { + case Type::U16: + return Inst<U64>(Opcode::FPFixedU16ToDouble, a, fbits_imm, rounding_imm); + case Type::U32: + return Inst<U64>(Opcode::FPFixedU32ToDouble, a, fbits_imm, rounding_imm); + case Type::U64: + return Inst<U64>(Opcode::FPFixedU64ToDouble, a, fbits_imm, rounding_imm); + default: + UNREACHABLE(); + } +} + +U128 IREmitter::FPVectorAbs(size_t esize, const U128& a) { + switch (esize) { + case 16: + return Inst<U128>(Opcode::FPVectorAbs16, a); + case 32: + return Inst<U128>(Opcode::FPVectorAbs32, a); + case 64: + return Inst<U128>(Opcode::FPVectorAbs64, a); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorAdd(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) { + switch (esize) { + case 32: + return Inst<U128>(Opcode::FPVectorAdd32, a, b, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorAdd64, a, b, Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorDiv(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) { + switch (esize) { + case 32: + return Inst<U128>(Opcode::FPVectorDiv32, a, b, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorDiv64, a, b, Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) { + switch (esize) { + case 16: + return Inst<U128>(Opcode::FPVectorEqual16, a, b, Imm1(fpcr_controlled)); + case 32: + return Inst<U128>(Opcode::FPVectorEqual32, a, b, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorEqual64, a, b, Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorFromHalf(size_t esize, const U128& a, FP::RoundingMode rounding, bool fpcr_controlled) { + ASSERT(esize == 32); + return Inst<U128>(Opcode::FPVectorFromHalf32, a, Imm8(static_cast<u8>(rounding)), Imm1(fpcr_controlled)); +} + +U128 IREmitter::FPVectorFromSignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled) { + ASSERT(fbits <= esize); + switch (esize) { + case 32: + return Inst<U128>(Opcode::FPVectorFromSignedFixed32, a, Imm8(static_cast<u8>(fbits)), Imm8(static_cast<u8>(rounding)), Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorFromSignedFixed64, a, Imm8(static_cast<u8>(fbits)), Imm8(static_cast<u8>(rounding)), Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorFromUnsignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled) { + ASSERT(fbits <= esize); + switch (esize) { + case 32: + return Inst<U128>(Opcode::FPVectorFromUnsignedFixed32, a, Imm8(static_cast<u8>(fbits)), Imm8(static_cast<u8>(rounding)), Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorFromUnsignedFixed64, a, Imm8(static_cast<u8>(fbits)), Imm8(static_cast<u8>(rounding)), Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorGreater(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) { + switch (esize) { + case 32: + return Inst<U128>(Opcode::FPVectorGreater32, a, b, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorGreater64, a, b, Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) { + switch (esize) { + case 32: + return 
Inst<U128>(Opcode::FPVectorGreaterEqual32, a, b, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorGreaterEqual64, a, b, Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorMax(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) { + switch (esize) { + case 32: + return Inst<U128>(Opcode::FPVectorMax32, a, b, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorMax64, a, b, Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorMaxNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) { + switch (esize) { + case 32: + return Inst<U128>(Opcode::FPVectorMaxNumeric32, a, b, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorMaxNumeric64, a, b, Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) { + switch (esize) { + case 32: + return Inst<U128>(Opcode::FPVectorMin32, a, b, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorMin64, a, b, Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorMinNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) { + switch (esize) { + case 32: + return Inst<U128>(Opcode::FPVectorMinNumeric32, a, b, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorMinNumeric64, a, b, Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorMul(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) { + switch (esize) { + case 32: + return Inst<U128>(Opcode::FPVectorMul32, a, b, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorMul64, a, b, Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorMulAdd(size_t esize, const U128& a, const U128& b, const U128& c, bool fpcr_controlled) { + switch (esize) { + case 16: + return Inst<U128>(Opcode::FPVectorMulAdd16, a, b, c, Imm1(fpcr_controlled)); + case 32: + return Inst<U128>(Opcode::FPVectorMulAdd32, a, b, c, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorMulAdd64, a, b, c, Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorMulX(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) { + switch (esize) { + case 32: + return Inst<U128>(Opcode::FPVectorMulX32, a, b, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorMulX64, a, b, Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorNeg(size_t esize, const U128& a) { + switch (esize) { + case 16: + return Inst<U128>(Opcode::FPVectorNeg16, a); + case 32: + return Inst<U128>(Opcode::FPVectorNeg32, a); + case 64: + return Inst<U128>(Opcode::FPVectorNeg64, a); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorPairedAdd(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) { + switch (esize) { + case 32: + return Inst<U128>(Opcode::FPVectorPairedAdd32, a, b, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorPairedAdd64, a, b, Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorPairedAddLower(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) { + switch (esize) { + case 32: + return Inst<U128>(Opcode::FPVectorPairedAddLower32, a, b, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorPairedAddLower64, a, b, Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorRecipEstimate(size_t esize, const U128& a, bool 
fpcr_controlled) { + switch (esize) { + case 16: + return Inst<U128>(Opcode::FPVectorRecipEstimate16, a, Imm1(fpcr_controlled)); + case 32: + return Inst<U128>(Opcode::FPVectorRecipEstimate32, a, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorRecipEstimate64, a, Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorRecipStepFused(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) { + switch (esize) { + case 16: + return Inst<U128>(Opcode::FPVectorRecipStepFused16, a, b, Imm1(fpcr_controlled)); + case 32: + return Inst<U128>(Opcode::FPVectorRecipStepFused32, a, b, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorRecipStepFused64, a, b, Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorRoundInt(size_t esize, const U128& operand, FP::RoundingMode rounding, bool exact, bool fpcr_controlled) { + const IR::U8 rounding_imm = Imm8(static_cast<u8>(rounding)); + const IR::U1 exact_imm = Imm1(exact); + + switch (esize) { + case 16: + return Inst<U128>(Opcode::FPVectorRoundInt16, operand, rounding_imm, exact_imm, Imm1(fpcr_controlled)); + case 32: + return Inst<U128>(Opcode::FPVectorRoundInt32, operand, rounding_imm, exact_imm, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorRoundInt64, operand, rounding_imm, exact_imm, Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorRSqrtEstimate(size_t esize, const U128& a, bool fpcr_controlled) { + switch (esize) { + case 16: + return Inst<U128>(Opcode::FPVectorRSqrtEstimate16, a, Imm1(fpcr_controlled)); + case 32: + return Inst<U128>(Opcode::FPVectorRSqrtEstimate32, a, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorRSqrtEstimate64, a, Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorRSqrtStepFused(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) { + switch (esize) { + case 16: + return Inst<U128>(Opcode::FPVectorRSqrtStepFused16, a, b, Imm1(fpcr_controlled)); + case 32: + return Inst<U128>(Opcode::FPVectorRSqrtStepFused32, a, b, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorRSqrtStepFused64, a, b, Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorSqrt(size_t esize, const U128& a, bool fpcr_controlled) { + switch (esize) { + case 32: + return Inst<U128>(Opcode::FPVectorSqrt32, a, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorSqrt64, a, Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorSub(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) { + switch (esize) { + case 32: + return Inst<U128>(Opcode::FPVectorSub32, a, b, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorSub64, a, b, Imm1(fpcr_controlled)); + } + UNREACHABLE(); +} + +U128 IREmitter::FPVectorToHalf(size_t esize, const U128& a, FP::RoundingMode rounding, bool fpcr_controlled) { + ASSERT(esize == 32); + return Inst<U128>(Opcode::FPVectorToHalf32, a, Imm8(static_cast<u8>(rounding)), Imm1(fpcr_controlled)); +} + +U128 IREmitter::FPVectorToSignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled) { + ASSERT(fbits <= esize); + + const U8 fbits_imm = Imm8(static_cast<u8>(fbits)); + const U8 rounding_imm = Imm8(static_cast<u8>(rounding)); + + switch (esize) { + case 16: + return Inst<U128>(Opcode::FPVectorToSignedFixed16, a, fbits_imm, rounding_imm, Imm1(fpcr_controlled)); + case 32: + return 
Inst<U128>(Opcode::FPVectorToSignedFixed32, a, fbits_imm, rounding_imm, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorToSignedFixed64, a, fbits_imm, rounding_imm, Imm1(fpcr_controlled)); + } + + UNREACHABLE(); +} + +U128 IREmitter::FPVectorToUnsignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled) { + ASSERT(fbits <= esize); + + const U8 fbits_imm = Imm8(static_cast<u8>(fbits)); + const U8 rounding_imm = Imm8(static_cast<u8>(rounding)); + + switch (esize) { + case 16: + return Inst<U128>(Opcode::FPVectorToUnsignedFixed16, a, fbits_imm, rounding_imm, Imm1(fpcr_controlled)); + case 32: + return Inst<U128>(Opcode::FPVectorToUnsignedFixed32, a, fbits_imm, rounding_imm, Imm1(fpcr_controlled)); + case 64: + return Inst<U128>(Opcode::FPVectorToUnsignedFixed64, a, fbits_imm, rounding_imm, Imm1(fpcr_controlled)); + } + + UNREACHABLE(); +} + +void IREmitter::Breakpoint() { + Inst(Opcode::Breakpoint); +} + +void IREmitter::CallHostFunction(void (*fn)(void)) { + Inst(Opcode::CallHostFunction, Imm64(mcl::bit_cast<u64>(fn)), Value{}, Value{}, Value{}); +} + +void IREmitter::CallHostFunction(void (*fn)(u64), const U64& arg1) { + Inst(Opcode::CallHostFunction, Imm64(mcl::bit_cast<u64>(fn)), arg1, Value{}, Value{}); +} + +void IREmitter::CallHostFunction(void (*fn)(u64, u64), const U64& arg1, const U64& arg2) { + Inst(Opcode::CallHostFunction, Imm64(mcl::bit_cast<u64>(fn)), arg1, arg2, Value{}); +} + +void IREmitter::CallHostFunction(void (*fn)(u64, u64, u64), const U64& arg1, const U64& arg2, const U64& arg3) { + Inst(Opcode::CallHostFunction, Imm64(mcl::bit_cast<u64>(fn)), arg1, arg2, arg3); +} + +void IREmitter::SetTerm(const Terminal& terminal) { + block.SetTerminal(terminal); +} + +} // namespace Dynarmic::IR diff --git a/externals/dynarmic/src/dynarmic/ir/ir_emitter.h b/externals/dynarmic/src/dynarmic/ir/ir_emitter.h new file mode 100644 index 0000000000..d37df24572 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/ir_emitter.h @@ -0,0 +1,432 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <mcl/stdint.hpp> + +#include "dynarmic/ir/acc_type.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/location_descriptor.h" +#include "dynarmic/ir/terminal.h" +#include "dynarmic/ir/value.h" + +namespace Dynarmic::FP { +enum class RoundingMode; +} // namespace Dynarmic::FP + +// ARM JIT Microinstruction Intermediate Representation +// +// This intermediate representation is an SSA IR. It is designed primarily for analysis, +// though it can be lowered into a reduced form for interpretation. Each IR node (Value) +// is a microinstruction of an idealised ARM CPU. The choice of microinstructions is made +// not based on any existing microarchitecture but on ease of implementation. + +namespace Dynarmic::IR { + +enum class Opcode; + +template<typename T> +struct ResultAndCarry { + T result; + U1 carry; +}; + +template<typename T> +struct ResultAndOverflow { + T result; + U1 overflow; +}; + +template<typename T> +struct ResultAndGE { + T result; + U32 ge; +}; + +struct UpperAndLower { + U128 upper; + U128 lower; +}; + +enum class MemOp { + LOAD, + STORE, + PREFETCH, +}; + +/** + * Convenience class to construct a basic block of the intermediate representation. + * `block` is the resulting block. + * The user of this class updates `current_location` as appropriate. 
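 *
 * Illustrative usage sketch (assuming `using namespace Dynarmic;`; the location value
 * and the choice of terminal are arbitrary, not taken from the dynarmic sources):
 *
 *     IR::Block block{IR::LocationDescriptor{0}};
 *     IR::IREmitter ir{block};
 *     const IR::U32 lhs = ir.Imm32(1);
 *     const IR::U32 rhs = ir.Imm32(2);
 *     const auto sum = ir.Add(lhs, rhs);         // appends an Add32 microinstruction to `block`
 *     ir.SetTerm(IR::Term::ReturnToDispatch{});  // set the block terminal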
+ */ +class IREmitter { +public: + explicit IREmitter(Block& block) + : block(block), insertion_point(block.end()) {} + + Block& block; + + U1 Imm1(bool value) const; + U8 Imm8(u8 value) const; + U16 Imm16(u16 value) const; + U32 Imm32(u32 value) const; + U64 Imm64(u64 value) const; + + void PushRSB(const LocationDescriptor& return_location); + + U64 Pack2x32To1x64(const U32& lo, const U32& hi); + U128 Pack2x64To1x128(const U64& lo, const U64& hi); + UAny LeastSignificant(size_t bitsize, const U32U64& value); + U32 LeastSignificantWord(const U64& value); + U16 LeastSignificantHalf(U32U64 value); + U8 LeastSignificantByte(U32U64 value); + ResultAndCarry<U32> MostSignificantWord(const U64& value); + U1 MostSignificantBit(const U32& value); + U1 IsZero(const U32& value); + U1 IsZero(const U64& value); + U1 IsZero(const U32U64& value); + U1 TestBit(const U32U64& value, const U8& bit); + U32 ConditionalSelect(Cond cond, const U32& a, const U32& b); + U64 ConditionalSelect(Cond cond, const U64& a, const U64& b); + NZCV ConditionalSelect(Cond cond, const NZCV& a, const NZCV& b); + U32U64 ConditionalSelect(Cond cond, const U32U64& a, const U32U64& b); + + U1 GetCFlagFromNZCV(const NZCV& nzcv); + NZCV NZCVFromPackedFlags(const U32& a); + // This pseudo-instruction may only be added to instructions that support it. + NZCV NZCVFrom(const Value& value); + + ResultAndCarry<U32> LogicalShiftLeft(const U32& value_in, const U8& shift_amount, const U1& carry_in); + ResultAndCarry<U32> LogicalShiftRight(const U32& value_in, const U8& shift_amount, const U1& carry_in); + ResultAndCarry<U32> ArithmeticShiftRight(const U32& value_in, const U8& shift_amount, const U1& carry_in); + ResultAndCarry<U32> RotateRight(const U32& value_in, const U8& shift_amount, const U1& carry_in); + U32U64 LogicalShiftLeft(const U32U64& value_in, const U8& shift_amount); + U32U64 LogicalShiftRight(const U32U64& value_in, const U8& shift_amount); + U32U64 ArithmeticShiftRight(const U32U64& value_in, const U8& shift_amount); + U32U64 RotateRight(const U32U64& value_in, const U8& shift_amount); + U32U64 LogicalShiftLeftMasked(const U32U64& value_in, const U32U64& shift_amount); + U32U64 LogicalShiftRightMasked(const U32U64& value_in, const U32U64& shift_amount); + U32U64 ArithmeticShiftRightMasked(const U32U64& value_in, const U32U64& shift_amount); + U32U64 RotateRightMasked(const U32U64& value_in, const U32U64& shift_amount); + ResultAndCarry<U32> RotateRightExtended(const U32& value_in, const U1& carry_in); + U32U64 AddWithCarry(const U32U64& a, const U32U64& b, const U1& carry_in); + U32U64 SubWithCarry(const U32U64& a, const U32U64& b, const U1& carry_in); + U32U64 Add(const U32U64& a, const U32U64& b); + U32U64 Sub(const U32U64& a, const U32U64& b); + U32U64 Mul(const U32U64& a, const U32U64& b); + U64 UnsignedMultiplyHigh(const U64& a, const U64& b); + U64 SignedMultiplyHigh(const U64& a, const U64& b); + U32U64 UnsignedDiv(const U32U64& a, const U32U64& b); + U32U64 SignedDiv(const U32U64& a, const U32U64& b); + U32U64 And(const U32U64& a, const U32U64& b); + U32U64 AndNot(const U32U64& a, const U32U64& b); + U32U64 Eor(const U32U64& a, const U32U64& b); + U32U64 Or(const U32U64& a, const U32U64& b); + U32U64 Not(const U32U64& a); + U32 SignExtendToWord(const UAny& a); + U64 SignExtendToLong(const UAny& a); + U32 SignExtendByteToWord(const U8& a); + U32 SignExtendHalfToWord(const U16& a); + U64 SignExtendWordToLong(const U32& a); + U32 ZeroExtendToWord(const UAny& a); + U64 ZeroExtendToLong(const UAny& a); + U128 
ZeroExtendToQuad(const UAny& a); + U32 ZeroExtendByteToWord(const U8& a); + U32 ZeroExtendHalfToWord(const U16& a); + U64 ZeroExtendWordToLong(const U32& a); + U32 IndeterminateExtendToWord(const UAny& a); + U64 IndeterminateExtendToLong(const UAny& a); + U32 ByteReverseWord(const U32& a); + U16 ByteReverseHalf(const U16& a); + U64 ByteReverseDual(const U64& a); + U32U64 CountLeadingZeros(const U32U64& a); + U32U64 ExtractRegister(const U32U64& a, const U32U64& b, const U8& lsb); + U32U64 ReplicateBit(const U32U64& a, u8 bit); + U32U64 MaxSigned(const U32U64& a, const U32U64& b); + U32U64 MaxUnsigned(const U32U64& a, const U32U64& b); + U32U64 MinSigned(const U32U64& a, const U32U64& b); + U32U64 MinUnsigned(const U32U64& a, const U32U64& b); + + ResultAndOverflow<U32> SignedSaturatedAddWithFlag(const U32& a, const U32& b); + ResultAndOverflow<U32> SignedSaturatedSubWithFlag(const U32& a, const U32& b); + ResultAndOverflow<U32> SignedSaturation(const U32& a, size_t bit_size_to_saturate_to); + ResultAndOverflow<U32> UnsignedSaturation(const U32& a, size_t bit_size_to_saturate_to); + + UAny SignedSaturatedAdd(const UAny& a, const UAny& b); + UAny SignedSaturatedDoublingMultiplyReturnHigh(const UAny& a, const UAny& b); + UAny SignedSaturatedSub(const UAny& a, const UAny& b); + UAny UnsignedSaturatedAdd(const UAny& a, const UAny& b); + UAny UnsignedSaturatedSub(const UAny& a, const UAny& b); + + U128 VectorSignedSaturatedAdd(size_t esize, const U128& a, const U128& b); + U128 VectorSignedSaturatedSub(size_t esize, const U128& a, const U128& b); + U128 VectorUnsignedSaturatedAdd(size_t esize, const U128& a, const U128& b); + U128 VectorUnsignedSaturatedSub(size_t esize, const U128& a, const U128& b); + + ResultAndGE<U32> PackedAddU8(const U32& a, const U32& b); + ResultAndGE<U32> PackedAddS8(const U32& a, const U32& b); + ResultAndGE<U32> PackedAddU16(const U32& a, const U32& b); + ResultAndGE<U32> PackedAddS16(const U32& a, const U32& b); + ResultAndGE<U32> PackedSubU8(const U32& a, const U32& b); + ResultAndGE<U32> PackedSubS8(const U32& a, const U32& b); + ResultAndGE<U32> PackedSubU16(const U32& a, const U32& b); + ResultAndGE<U32> PackedSubS16(const U32& a, const U32& b); + ResultAndGE<U32> PackedAddSubU16(const U32& a, const U32& b); + ResultAndGE<U32> PackedAddSubS16(const U32& a, const U32& b); + ResultAndGE<U32> PackedSubAddU16(const U32& a, const U32& b); + ResultAndGE<U32> PackedSubAddS16(const U32& a, const U32& b); + U32 PackedHalvingAddU8(const U32& a, const U32& b); + U32 PackedHalvingAddS8(const U32& a, const U32& b); + U32 PackedHalvingSubU8(const U32& a, const U32& b); + U32 PackedHalvingSubS8(const U32& a, const U32& b); + U32 PackedHalvingAddU16(const U32& a, const U32& b); + U32 PackedHalvingAddS16(const U32& a, const U32& b); + U32 PackedHalvingSubU16(const U32& a, const U32& b); + U32 PackedHalvingSubS16(const U32& a, const U32& b); + U32 PackedHalvingAddSubU16(const U32& a, const U32& b); + U32 PackedHalvingAddSubS16(const U32& a, const U32& b); + U32 PackedHalvingSubAddU16(const U32& a, const U32& b); + U32 PackedHalvingSubAddS16(const U32& a, const U32& b); + U32 PackedSaturatedAddU8(const U32& a, const U32& b); + U32 PackedSaturatedAddS8(const U32& a, const U32& b); + U32 PackedSaturatedSubU8(const U32& a, const U32& b); + U32 PackedSaturatedSubS8(const U32& a, const U32& b); + U32 PackedSaturatedAddU16(const U32& a, const U32& b); + U32 PackedSaturatedAddS16(const U32& a, const U32& b); + U32 PackedSaturatedSubU16(const U32& a, const U32& b); + U32 
PackedSaturatedSubS16(const U32& a, const U32& b); + U32 PackedAbsDiffSumU8(const U32& a, const U32& b); + U32 PackedSelect(const U32& ge, const U32& a, const U32& b); + + U32 CRC32Castagnoli8(const U32& a, const U32& b); + U32 CRC32Castagnoli16(const U32& a, const U32& b); + U32 CRC32Castagnoli32(const U32& a, const U32& b); + U32 CRC32Castagnoli64(const U32& a, const U64& b); + U32 CRC32ISO8(const U32& a, const U32& b); + U32 CRC32ISO16(const U32& a, const U32& b); + U32 CRC32ISO32(const U32& a, const U32& b); + U32 CRC32ISO64(const U32& a, const U64& b); + + U128 AESDecryptSingleRound(const U128& a); + U128 AESEncryptSingleRound(const U128& a); + U128 AESInverseMixColumns(const U128& a); + U128 AESMixColumns(const U128& a); + + U8 SM4AccessSubstitutionBox(const U8& a); + + U128 SHA256Hash(const U128& x, const U128& y, const U128& w, bool part1); + U128 SHA256MessageSchedule0(const U128& x, const U128& y); + U128 SHA256MessageSchedule1(const U128& x, const U128& y, const U128& z); + + UAny VectorGetElement(size_t esize, const U128& a, size_t index); + U128 VectorSetElement(size_t esize, const U128& a, size_t index, const UAny& elem); + U128 VectorAbs(size_t esize, const U128& a); + U128 VectorAdd(size_t esize, const U128& a, const U128& b); + U128 VectorAnd(const U128& a, const U128& b); + U128 VectorAndNot(const U128& a, const U128& b); + U128 VectorArithmeticShiftRight(size_t esize, const U128& a, u8 shift_amount); + U128 VectorArithmeticVShift(size_t esize, const U128& a, const U128& b); + U128 VectorBroadcast(size_t esize, const UAny& a); + U128 VectorBroadcastLower(size_t esize, const UAny& a); + U128 VectorBroadcastElement(size_t esize, const U128& a, size_t index); + U128 VectorBroadcastElementLower(size_t esize, const U128& a, size_t index); + U128 VectorCountLeadingZeros(size_t esize, const U128& a); + U128 VectorEor(const U128& a, const U128& b); + U128 VectorDeinterleaveEven(size_t esize, const U128& a, const U128& b); + U128 VectorDeinterleaveEvenLower(size_t esize, const U128& a, const U128& b); + U128 VectorDeinterleaveOdd(size_t esize, const U128& a, const U128& b); + U128 VectorDeinterleaveOddLower(size_t esize, const U128& a, const U128& b); + U128 VectorEqual(size_t esize, const U128& a, const U128& b); + U128 VectorExtract(const U128& a, const U128& b, size_t position); + U128 VectorExtractLower(const U128& a, const U128& b, size_t position); + U128 VectorGreaterEqualSigned(size_t esize, const U128& a, const U128& b); + U128 VectorGreaterEqualUnsigned(size_t esize, const U128& a, const U128& b); + U128 VectorGreaterSigned(size_t esize, const U128& a, const U128& b); + U128 VectorGreaterUnsigned(size_t esize, const U128& a, const U128& b); + U128 VectorHalvingAddSigned(size_t esize, const U128& a, const U128& b); + U128 VectorHalvingAddUnsigned(size_t esize, const U128& a, const U128& b); + U128 VectorHalvingSubSigned(size_t esize, const U128& a, const U128& b); + U128 VectorHalvingSubUnsigned(size_t esize, const U128& a, const U128& b); + U128 VectorInterleaveLower(size_t esize, const U128& a, const U128& b); + U128 VectorInterleaveUpper(size_t esize, const U128& a, const U128& b); + U128 VectorLessEqualSigned(size_t esize, const U128& a, const U128& b); + U128 VectorLessEqualUnsigned(size_t esize, const U128& a, const U128& b); + U128 VectorLessSigned(size_t esize, const U128& a, const U128& b); + U128 VectorLessUnsigned(size_t esize, const U128& a, const U128& b); + U128 VectorLogicalShiftLeft(size_t esize, const U128& a, u8 shift_amount); + U128 
VectorLogicalShiftRight(size_t esize, const U128& a, u8 shift_amount); + U128 VectorLogicalVShift(size_t esize, const U128& a, const U128& b); + U128 VectorMaxSigned(size_t esize, const U128& a, const U128& b); + U128 VectorMaxUnsigned(size_t esize, const U128& a, const U128& b); + U128 VectorMinSigned(size_t esize, const U128& a, const U128& b); + U128 VectorMinUnsigned(size_t esize, const U128& a, const U128& b); + U128 VectorMultiply(size_t esize, const U128& a, const U128& b); + U128 VectorMultiplySignedWiden(size_t esize, const U128& a, const U128& b); + U128 VectorMultiplyUnsignedWiden(size_t esize, const U128& a, const U128& b); + U128 VectorNarrow(size_t original_esize, const U128& a); + U128 VectorNot(const U128& a); + U128 VectorOr(const U128& a, const U128& b); + U128 VectorPairedAdd(size_t esize, const U128& a, const U128& b); + U128 VectorPairedAddLower(size_t esize, const U128& a, const U128& b); + U128 VectorPairedAddSignedWiden(size_t original_esize, const U128& a); + U128 VectorPairedAddUnsignedWiden(size_t original_esize, const U128& a); + U128 VectorPairedMaxSigned(size_t esize, const U128& a, const U128& b); + U128 VectorPairedMaxUnsigned(size_t esize, const U128& a, const U128& b); + U128 VectorPairedMinSigned(size_t esize, const U128& a, const U128& b); + U128 VectorPairedMinUnsigned(size_t esize, const U128& a, const U128& b); + U128 VectorPairedMaxSignedLower(size_t esize, const U128& a, const U128& b); + U128 VectorPairedMaxUnsignedLower(size_t esize, const U128& a, const U128& b); + U128 VectorPairedMinSignedLower(size_t esize, const U128& a, const U128& b); + U128 VectorPairedMinUnsignedLower(size_t esize, const U128& a, const U128& b); + U128 VectorPolynomialMultiply(const U128& a, const U128& b); + U128 VectorPolynomialMultiplyLong(size_t esize, const U128& a, const U128& b); + U128 VectorPopulationCount(const U128& a); + U128 VectorReverseBits(const U128& a); + U128 VectorReverseElementsInHalfGroups(size_t esize, const U128& a); + U128 VectorReverseElementsInWordGroups(size_t esize, const U128& a); + U128 VectorReverseElementsInLongGroups(size_t esize, const U128& a); + U128 VectorReduceAdd(size_t esize, const U128& a); + U128 VectorRotateLeft(size_t esize, const U128& a, u8 amount); + U128 VectorRotateRight(size_t esize, const U128& a, u8 amount); + U128 VectorRotateWholeVectorRight(const U128& a, u8 amount); + U128 VectorRoundingHalvingAddSigned(size_t esize, const U128& a, const U128& b); + U128 VectorRoundingHalvingAddUnsigned(size_t esize, const U128& a, const U128& b); + U128 VectorRoundingShiftLeftSigned(size_t esize, const U128& a, const U128& b); + U128 VectorRoundingShiftLeftUnsigned(size_t esize, const U128& a, const U128& b); + U128 VectorSignExtend(size_t original_esize, const U128& a); + U128 VectorSignedAbsoluteDifference(size_t esize, const U128& a, const U128& b); + UpperAndLower VectorSignedMultiply(size_t esize, const U128& a, const U128& b); + U128 VectorSignedSaturatedAbs(size_t esize, const U128& a); + U128 VectorSignedSaturatedAccumulateUnsigned(size_t esize, const U128& a, const U128& b); + U128 VectorSignedSaturatedDoublingMultiplyHigh(size_t esize, const U128& a, const U128& b); + U128 VectorSignedSaturatedDoublingMultiplyHighRounding(size_t esize, const U128& a, const U128& b); + U128 VectorSignedSaturatedDoublingMultiplyLong(size_t esize, const U128& a, const U128& b); + U128 VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a); + U128 VectorSignedSaturatedNarrowToUnsigned(size_t original_esize, const U128& a); 
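    // Illustrative note: the Vector* and FPVector* emitters take the element size
    // `esize` in bits and dispatch to a size-suffixed opcode in their definitions.
    // Sketch, assuming an emitter `ir` and two U128 values x and y:
    //
    //     const IR::U128 sum = ir.FPVectorAdd(32, x, y);   // emits FPVectorAdd32
    //
    // Element sizes without a matching opcode fall through to UNREACHABLE().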
+ U128 VectorSignedSaturatedNeg(size_t esize, const U128& a); + U128 VectorSignedSaturatedShiftLeft(size_t esize, const U128& a, const U128& b); + U128 VectorSignedSaturatedShiftLeftUnsigned(size_t esize, const U128& a, u8 shift_amount); + U128 VectorSub(size_t esize, const U128& a, const U128& b); + Table VectorTable(std::vector<U64> values); + Table VectorTable(std::vector<U128> values); + U64 VectorTableLookup(const U64& defaults, const Table& table, const U64& indices); + U128 VectorTableLookup(const U128& defaults, const Table& table, const U128& indices); + U128 VectorTranspose(size_t esize, const U128& a, const U128& b, bool part); + U128 VectorUnsignedAbsoluteDifference(size_t esize, const U128& a, const U128& b); + U128 VectorUnsignedRecipEstimate(const U128& a); + U128 VectorUnsignedRecipSqrtEstimate(const U128& a); + U128 VectorUnsignedSaturatedAccumulateSigned(size_t esize, const U128& a, const U128& b); + U128 VectorUnsignedSaturatedNarrow(size_t esize, const U128& a); + U128 VectorUnsignedSaturatedShiftLeft(size_t esize, const U128& a, const U128& b); + U128 VectorZeroExtend(size_t original_esize, const U128& a); + U128 VectorZeroUpper(const U128& a); + U128 ZeroVector(); + + U16U32U64 FPAbs(const U16U32U64& a); + U32U64 FPAdd(const U32U64& a, const U32U64& b); + NZCV FPCompare(const U32U64& a, const U32U64& b, bool exc_on_qnan); + U32U64 FPDiv(const U32U64& a, const U32U64& b); + U32U64 FPMax(const U32U64& a, const U32U64& b); + U32U64 FPMaxNumeric(const U32U64& a, const U32U64& b); + U32U64 FPMin(const U32U64& a, const U32U64& b); + U32U64 FPMinNumeric(const U32U64& a, const U32U64& b); + U32U64 FPMul(const U32U64& a, const U32U64& b); + U16U32U64 FPMulAdd(const U16U32U64& addend, const U16U32U64& op1, const U16U32U64& op2); + U16U32U64 FPMulSub(const U16U32U64& minuend, const U16U32U64& op1, const U16U32U64& op2); + U32U64 FPMulX(const U32U64& a, const U32U64& b); + U16U32U64 FPNeg(const U16U32U64& a); + U16U32U64 FPRecipEstimate(const U16U32U64& a); + U16U32U64 FPRecipExponent(const U16U32U64& a); + U16U32U64 FPRecipStepFused(const U16U32U64& a, const U16U32U64& b); + U16U32U64 FPRoundInt(const U16U32U64& a, FP::RoundingMode rounding, bool exact); + U16U32U64 FPRSqrtEstimate(const U16U32U64& a); + U16U32U64 FPRSqrtStepFused(const U16U32U64& a, const U16U32U64& b); + U32U64 FPSqrt(const U32U64& a); + U32U64 FPSub(const U32U64& a, const U32U64& b); + U16 FPDoubleToHalf(const U64& a, FP::RoundingMode rounding); + U32 FPDoubleToSingle(const U64& a, FP::RoundingMode rounding); + U64 FPHalfToDouble(const U16& a, FP::RoundingMode rounding); + U32 FPHalfToSingle(const U16& a, FP::RoundingMode rounding); + U16 FPSingleToHalf(const U32& a, FP::RoundingMode rounding); + U64 FPSingleToDouble(const U32& a, FP::RoundingMode rounding); + U16 FPToFixedS16(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding); + U32 FPToFixedS32(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding); + U64 FPToFixedS64(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding); + U16 FPToFixedU16(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding); + U32 FPToFixedU32(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding); + U64 FPToFixedU64(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding); + U32 FPSignedFixedToSingle(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding); + U32 FPUnsignedFixedToSingle(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding); + U64 FPSignedFixedToDouble(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding); + U64 
FPUnsignedFixedToDouble(const U16U32U64& a, size_t fbits, FP::RoundingMode rounding); + + U128 FPVectorAbs(size_t esize, const U128& a); + U128 FPVectorAdd(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); + U128 FPVectorDiv(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); + U128 FPVectorEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); + U128 FPVectorFromHalf(size_t esize, const U128& a, FP::RoundingMode rounding, bool fpcr_controlled = true); + U128 FPVectorFromSignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled = true); + U128 FPVectorFromUnsignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled = true); + U128 FPVectorGreater(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); + U128 FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); + U128 FPVectorMax(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); + U128 FPVectorMaxNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); + U128 FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); + U128 FPVectorMinNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); + U128 FPVectorMul(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); + U128 FPVectorMulAdd(size_t esize, const U128& addend, const U128& op1, const U128& op2, bool fpcr_controlled = true); + U128 FPVectorMulX(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); + U128 FPVectorNeg(size_t esize, const U128& a); + U128 FPVectorPairedAdd(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); + U128 FPVectorPairedAddLower(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); + U128 FPVectorRecipEstimate(size_t esize, const U128& a, bool fpcr_controlled = true); + U128 FPVectorRecipStepFused(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); + U128 FPVectorRoundInt(size_t esize, const U128& operand, FP::RoundingMode rounding, bool exact, bool fpcr_controlled = true); + U128 FPVectorRSqrtEstimate(size_t esize, const U128& a, bool fpcr_controlled = true); + U128 FPVectorRSqrtStepFused(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); + U128 FPVectorSqrt(size_t esize, const U128& a, bool fpcr_controlled = true); + U128 FPVectorSub(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true); + U128 FPVectorToHalf(size_t esize, const U128& a, FP::RoundingMode rounding, bool fpcr_controlled = true); + U128 FPVectorToSignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled = true); + U128 FPVectorToUnsignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled = true); + + void Breakpoint(); + void CallHostFunction(void (*fn)(void)); + void CallHostFunction(void (*fn)(u64), const U64& arg1); + void CallHostFunction(void (*fn)(u64, u64), const U64& arg1, const U64& arg2); + void CallHostFunction(void (*fn)(u64, u64, u64), const U64& arg1, const U64& arg2, const U64& arg3); + + void SetTerm(const Terminal& terminal); + + void SetInsertionPointBefore(IR::Inst* new_insertion_point) { + insertion_point = IR::Block::iterator{*new_insertion_point}; + } + + void SetInsertionPointBefore(IR::Block::iterator new_insertion_point) { + 
insertion_point = new_insertion_point; + } + + void SetInsertionPointAfter(IR::Inst* new_insertion_point) { + insertion_point = IR::Block::iterator{*new_insertion_point}; + ++insertion_point; + } + + void SetInsertionPointAfter(IR::Block::iterator new_insertion_point) { + insertion_point = new_insertion_point; + ++insertion_point; + } + +protected: + IR::Block::iterator insertion_point; + + template<typename T = Value, typename... Args> + T Inst(Opcode op, Args... args) { + auto iter = block.PrependNewInst(insertion_point, op, {Value(args)...}); + return T(Value(&*iter)); + } +}; + +} // namespace Dynarmic::IR diff --git a/externals/dynarmic/src/dynarmic/ir/location_descriptor.cpp b/externals/dynarmic/src/dynarmic/ir/location_descriptor.cpp new file mode 100644 index 0000000000..e7e640561d --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/location_descriptor.cpp @@ -0,0 +1,16 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/ir/location_descriptor.h" + +#include <fmt/format.h> + +namespace Dynarmic::IR { + +std::string ToString(const LocationDescriptor& descriptor) { + return fmt::format("{{{:016x}}}", descriptor.Value()); +} + +} // namespace Dynarmic::IR diff --git a/externals/dynarmic/src/dynarmic/ir/location_descriptor.h b/externals/dynarmic/src/dynarmic/ir/location_descriptor.h new file mode 100644 index 0000000000..48e5e32bb1 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/location_descriptor.h @@ -0,0 +1,64 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <functional> +#include <string> + +#include <fmt/format.h> +#include <mcl/stdint.hpp> + +namespace Dynarmic::IR { + +class LocationDescriptor { +public: + explicit LocationDescriptor(u64 value) + : value(value) {} + + bool operator==(const LocationDescriptor& o) const { + return value == o.Value(); + } + + bool operator!=(const LocationDescriptor& o) const { + return !operator==(o); + } + + u64 Value() const { return value; } + +private: + u64 value; +}; + +std::string ToString(const LocationDescriptor& descriptor); + +inline bool operator<(const LocationDescriptor& x, const LocationDescriptor& y) noexcept { + return x.Value() < y.Value(); +} + +} // namespace Dynarmic::IR + +namespace std { +template<> +struct less<Dynarmic::IR::LocationDescriptor> { + bool operator()(const Dynarmic::IR::LocationDescriptor& x, const Dynarmic::IR::LocationDescriptor& y) const noexcept { + return x < y; + } +}; +template<> +struct hash<Dynarmic::IR::LocationDescriptor> { + size_t operator()(const Dynarmic::IR::LocationDescriptor& x) const noexcept { + return std::hash<u64>()(x.Value()); + } +}; +} // namespace std + +template<> +struct fmt::formatter<Dynarmic::IR::LocationDescriptor> : fmt::formatter<std::string> { + template<typename FormatContext> + auto format(Dynarmic::IR::LocationDescriptor descriptor, FormatContext& ctx) const { + return formatter<std::string>::format(ToString(descriptor), ctx); + } +}; diff --git a/externals/dynarmic/src/dynarmic/ir/microinstruction.cpp b/externals/dynarmic/src/dynarmic/ir/microinstruction.cpp new file mode 100644 index 0000000000..50af036ddf --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/microinstruction.cpp @@ -0,0 +1,712 @@ +/* This file is part of the dynarmic project. 
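Because location_descriptor.h above specializes both std::hash and fmt::formatter for LocationDescriptor, a descriptor can be used directly as a hash-map key and passed straight to fmt. A minimal sketch; the function and map below are hypothetical, not dynarmic code:

    #include <unordered_map>

    #include <fmt/format.h>

    #include "dynarmic/ir/location_descriptor.h"

    // Hypothetical helper: tally how often a location is seen and report it.
    void CountVisit(std::unordered_map<Dynarmic::IR::LocationDescriptor, int>& visit_count,
                    Dynarmic::IR::LocationDescriptor loc) {
        ++visit_count[loc];  // keying works via the std::hash specialization above
        // The fmt::formatter specialization prints ToString(), e.g. "{0000000000001234}" for 0x1234.
        fmt::print("{} seen {} time(s)\n", loc, visit_count[loc]);
    }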
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/ir/microinstruction.h" + +#include <algorithm> + +#include <mcl/assert.hpp> + +#include "dynarmic/ir/opcodes.h" +#include "dynarmic/ir/type.h" + +namespace Dynarmic::IR { + +bool Inst::IsArithmeticShift() const { + return op == Opcode::ArithmeticShiftRight32 + || op == Opcode::ArithmeticShiftRight64; +} + +bool Inst::IsCircularShift() const { + return op == Opcode::RotateRight32 + || op == Opcode::RotateRight64 + || op == Opcode::RotateRightExtended; +} + +bool Inst::IsLogicalShift() const { + switch (op) { + case Opcode::LogicalShiftLeft32: + case Opcode::LogicalShiftLeft64: + case Opcode::LogicalShiftRight32: + case Opcode::LogicalShiftRight64: + return true; + + default: + return false; + } +} + +bool Inst::IsShift() const { + return IsArithmeticShift() + || IsCircularShift() + || IsLogicalShift(); +} + +bool Inst::IsBarrier() const { + switch (op) { + case Opcode::A32DataMemoryBarrier: + case Opcode::A32DataSynchronizationBarrier: + case Opcode::A32InstructionSynchronizationBarrier: + case Opcode::A64DataMemoryBarrier: + case Opcode::A64DataSynchronizationBarrier: + case Opcode::A64InstructionSynchronizationBarrier: + return true; + + default: + return false; + } +} + +bool Inst::IsSharedMemoryRead() const { + switch (op) { + case Opcode::A32ReadMemory8: + case Opcode::A32ReadMemory16: + case Opcode::A32ReadMemory32: + case Opcode::A32ReadMemory64: + case Opcode::A64ReadMemory8: + case Opcode::A64ReadMemory16: + case Opcode::A64ReadMemory32: + case Opcode::A64ReadMemory64: + case Opcode::A64ReadMemory128: + return true; + + default: + return false; + } +} + +bool Inst::IsSharedMemoryWrite() const { + switch (op) { + case Opcode::A32WriteMemory8: + case Opcode::A32WriteMemory16: + case Opcode::A32WriteMemory32: + case Opcode::A32WriteMemory64: + case Opcode::A64WriteMemory8: + case Opcode::A64WriteMemory16: + case Opcode::A64WriteMemory32: + case Opcode::A64WriteMemory64: + case Opcode::A64WriteMemory128: + return true; + + default: + return false; + } +} + +bool Inst::IsSharedMemoryReadOrWrite() const { + return IsSharedMemoryRead() + || IsSharedMemoryWrite(); +} + +bool Inst::IsExclusiveMemoryRead() const { + switch (op) { + case Opcode::A32ExclusiveReadMemory8: + case Opcode::A32ExclusiveReadMemory16: + case Opcode::A32ExclusiveReadMemory32: + case Opcode::A32ExclusiveReadMemory64: + case Opcode::A64ExclusiveReadMemory8: + case Opcode::A64ExclusiveReadMemory16: + case Opcode::A64ExclusiveReadMemory32: + case Opcode::A64ExclusiveReadMemory64: + case Opcode::A64ExclusiveReadMemory128: + return true; + + default: + return false; + } +} + +bool Inst::IsExclusiveMemoryWrite() const { + switch (op) { + case Opcode::A32ExclusiveWriteMemory8: + case Opcode::A32ExclusiveWriteMemory16: + case Opcode::A32ExclusiveWriteMemory32: + case Opcode::A32ExclusiveWriteMemory64: + case Opcode::A64ExclusiveWriteMemory8: + case Opcode::A64ExclusiveWriteMemory16: + case Opcode::A64ExclusiveWriteMemory32: + case Opcode::A64ExclusiveWriteMemory64: + case Opcode::A64ExclusiveWriteMemory128: + return true; + + default: + return false; + } +} + +bool Inst::IsMemoryRead() const { + return IsSharedMemoryRead() + || IsExclusiveMemoryRead(); +} + +bool Inst::IsMemoryWrite() const { + return IsSharedMemoryWrite() + || IsExclusiveMemoryWrite(); +} + +bool Inst::IsMemoryReadOrWrite() const { + return IsMemoryRead() + || IsMemoryWrite(); +} + +bool Inst::ReadsFromCPSR() const { + switch (op) { + case Opcode::A32GetCpsr: + case 
Opcode::A32GetCFlag: + case Opcode::A32GetGEFlags: + case Opcode::A32UpdateUpperLocationDescriptor: + case Opcode::A64GetCFlag: + case Opcode::A64GetNZCVRaw: + case Opcode::ConditionalSelect32: + case Opcode::ConditionalSelect64: + case Opcode::ConditionalSelectNZCV: + return true; + + default: + return false; + } +} + +bool Inst::WritesToCPSR() const { + switch (op) { + case Opcode::A32SetCpsr: + case Opcode::A32SetCpsrNZCVRaw: + case Opcode::A32SetCpsrNZCV: + case Opcode::A32SetCpsrNZCVQ: + case Opcode::A32SetCpsrNZ: + case Opcode::A32SetCpsrNZC: + case Opcode::A32OrQFlag: + case Opcode::A32SetGEFlags: + case Opcode::A32SetGEFlagsCompressed: + case Opcode::A32UpdateUpperLocationDescriptor: + case Opcode::A64SetNZCVRaw: + case Opcode::A64SetNZCV: + return true; + + default: + return false; + } +} + +bool Inst::WritesToSystemRegister() const { + switch (op) { + case Opcode::A64SetTPIDR: + return true; + default: + return false; + } +} + +bool Inst::ReadsFromCoreRegister() const { + switch (op) { + case Opcode::A32GetRegister: + case Opcode::A32GetExtendedRegister32: + case Opcode::A32GetExtendedRegister64: + case Opcode::A32GetVector: + case Opcode::A64GetW: + case Opcode::A64GetX: + case Opcode::A64GetS: + case Opcode::A64GetD: + case Opcode::A64GetQ: + case Opcode::A64GetSP: + return true; + + default: + return false; + } +} + +bool Inst::WritesToCoreRegister() const { + switch (op) { + case Opcode::A32SetRegister: + case Opcode::A32SetExtendedRegister32: + case Opcode::A32SetExtendedRegister64: + case Opcode::A32SetVector: + case Opcode::A32BXWritePC: + case Opcode::A64SetW: + case Opcode::A64SetX: + case Opcode::A64SetS: + case Opcode::A64SetD: + case Opcode::A64SetQ: + case Opcode::A64SetSP: + case Opcode::A64SetPC: + return true; + + default: + return false; + } +} + +bool Inst::ReadsFromFPCR() const { + switch (op) { + case Opcode::A32GetFpscr: + case Opcode::A32GetFpscrNZCV: + case Opcode::A64GetFPCR: + return true; + + default: + return false; + } +} + +bool Inst::WritesToFPCR() const { + switch (op) { + case Opcode::A32SetFpscr: + case Opcode::A32SetFpscrNZCV: + case Opcode::A64SetFPCR: + return true; + + default: + return false; + } +} + +bool Inst::ReadsFromFPSR() const { + return op == Opcode::A32GetFpscr + || op == Opcode::A32GetFpscrNZCV + || op == Opcode::A64GetFPSR + || ReadsFromFPSRCumulativeExceptionBits() + || ReadsFromFPSRCumulativeSaturationBit(); +} + +bool Inst::WritesToFPSR() const { + return op == Opcode::A32SetFpscr + || op == Opcode::A32SetFpscrNZCV + || op == Opcode::A64SetFPSR + || WritesToFPSRCumulativeExceptionBits() + || WritesToFPSRCumulativeSaturationBit(); +} + +bool Inst::ReadsFromFPSRCumulativeExceptionBits() const { + return ReadsFromAndWritesToFPSRCumulativeExceptionBits(); +} + +bool Inst::WritesToFPSRCumulativeExceptionBits() const { + return ReadsFromAndWritesToFPSRCumulativeExceptionBits(); +} + +bool Inst::ReadsFromAndWritesToFPSRCumulativeExceptionBits() const { + switch (op) { + case Opcode::FPAdd32: + case Opcode::FPAdd64: + case Opcode::FPCompare32: + case Opcode::FPCompare64: + case Opcode::FPDiv32: + case Opcode::FPDiv64: + case Opcode::FPMax32: + case Opcode::FPMax64: + case Opcode::FPMaxNumeric32: + case Opcode::FPMaxNumeric64: + case Opcode::FPMin32: + case Opcode::FPMin64: + case Opcode::FPMinNumeric32: + case Opcode::FPMinNumeric64: + case Opcode::FPMul32: + case Opcode::FPMul64: + case Opcode::FPMulAdd16: + case Opcode::FPMulAdd32: + case Opcode::FPMulAdd64: + case Opcode::FPMulSub16: + case Opcode::FPMulSub32: + case 
Opcode::FPMulSub64: + case Opcode::FPRecipEstimate16: + case Opcode::FPRecipEstimate32: + case Opcode::FPRecipEstimate64: + case Opcode::FPRecipExponent16: + case Opcode::FPRecipExponent32: + case Opcode::FPRecipExponent64: + case Opcode::FPRecipStepFused16: + case Opcode::FPRecipStepFused32: + case Opcode::FPRecipStepFused64: + case Opcode::FPRoundInt16: + case Opcode::FPRoundInt32: + case Opcode::FPRoundInt64: + case Opcode::FPRSqrtEstimate16: + case Opcode::FPRSqrtEstimate32: + case Opcode::FPRSqrtEstimate64: + case Opcode::FPRSqrtStepFused16: + case Opcode::FPRSqrtStepFused32: + case Opcode::FPRSqrtStepFused64: + case Opcode::FPSqrt32: + case Opcode::FPSqrt64: + case Opcode::FPSub32: + case Opcode::FPSub64: + case Opcode::FPHalfToDouble: + case Opcode::FPHalfToSingle: + case Opcode::FPSingleToDouble: + case Opcode::FPSingleToHalf: + case Opcode::FPDoubleToHalf: + case Opcode::FPDoubleToSingle: + case Opcode::FPDoubleToFixedS32: + case Opcode::FPDoubleToFixedS64: + case Opcode::FPDoubleToFixedU32: + case Opcode::FPDoubleToFixedU64: + case Opcode::FPHalfToFixedS32: + case Opcode::FPHalfToFixedS64: + case Opcode::FPHalfToFixedU32: + case Opcode::FPHalfToFixedU64: + case Opcode::FPSingleToFixedS32: + case Opcode::FPSingleToFixedS64: + case Opcode::FPSingleToFixedU32: + case Opcode::FPSingleToFixedU64: + case Opcode::FPFixedU32ToSingle: + case Opcode::FPFixedS32ToSingle: + case Opcode::FPFixedU32ToDouble: + case Opcode::FPFixedU64ToDouble: + case Opcode::FPFixedU64ToSingle: + case Opcode::FPFixedS32ToDouble: + case Opcode::FPFixedS64ToDouble: + case Opcode::FPFixedS64ToSingle: + case Opcode::FPVectorAdd32: + case Opcode::FPVectorAdd64: + case Opcode::FPVectorDiv32: + case Opcode::FPVectorDiv64: + case Opcode::FPVectorEqual16: + case Opcode::FPVectorEqual32: + case Opcode::FPVectorEqual64: + case Opcode::FPVectorFromSignedFixed32: + case Opcode::FPVectorFromSignedFixed64: + case Opcode::FPVectorFromUnsignedFixed32: + case Opcode::FPVectorFromUnsignedFixed64: + case Opcode::FPVectorGreater32: + case Opcode::FPVectorGreater64: + case Opcode::FPVectorGreaterEqual32: + case Opcode::FPVectorGreaterEqual64: + case Opcode::FPVectorMul32: + case Opcode::FPVectorMul64: + case Opcode::FPVectorMulAdd16: + case Opcode::FPVectorMulAdd32: + case Opcode::FPVectorMulAdd64: + case Opcode::FPVectorPairedAddLower32: + case Opcode::FPVectorPairedAddLower64: + case Opcode::FPVectorPairedAdd32: + case Opcode::FPVectorPairedAdd64: + case Opcode::FPVectorRecipEstimate16: + case Opcode::FPVectorRecipEstimate32: + case Opcode::FPVectorRecipEstimate64: + case Opcode::FPVectorRecipStepFused16: + case Opcode::FPVectorRecipStepFused32: + case Opcode::FPVectorRecipStepFused64: + case Opcode::FPVectorRoundInt16: + case Opcode::FPVectorRoundInt32: + case Opcode::FPVectorRoundInt64: + case Opcode::FPVectorRSqrtEstimate16: + case Opcode::FPVectorRSqrtEstimate32: + case Opcode::FPVectorRSqrtEstimate64: + case Opcode::FPVectorRSqrtStepFused16: + case Opcode::FPVectorRSqrtStepFused32: + case Opcode::FPVectorRSqrtStepFused64: + case Opcode::FPVectorSqrt32: + case Opcode::FPVectorSqrt64: + case Opcode::FPVectorSub32: + case Opcode::FPVectorSub64: + case Opcode::FPVectorToSignedFixed16: + case Opcode::FPVectorToSignedFixed32: + case Opcode::FPVectorToSignedFixed64: + case Opcode::FPVectorToUnsignedFixed16: + case Opcode::FPVectorToUnsignedFixed32: + case Opcode::FPVectorToUnsignedFixed64: + return true; + + default: + return false; + } +} + +bool Inst::ReadsFromFPSRCumulativeSaturationBit() const { + return false; +} + +bool 
Inst::WritesToFPSRCumulativeSaturationBit() const { + switch (op) { + case Opcode::SignedSaturatedAdd8: + case Opcode::SignedSaturatedAdd16: + case Opcode::SignedSaturatedAdd32: + case Opcode::SignedSaturatedAdd64: + case Opcode::SignedSaturatedDoublingMultiplyReturnHigh16: + case Opcode::SignedSaturatedDoublingMultiplyReturnHigh32: + case Opcode::SignedSaturatedSub8: + case Opcode::SignedSaturatedSub16: + case Opcode::SignedSaturatedSub32: + case Opcode::SignedSaturatedSub64: + case Opcode::UnsignedSaturatedAdd8: + case Opcode::UnsignedSaturatedAdd16: + case Opcode::UnsignedSaturatedAdd32: + case Opcode::UnsignedSaturatedAdd64: + case Opcode::UnsignedSaturatedSub8: + case Opcode::UnsignedSaturatedSub16: + case Opcode::UnsignedSaturatedSub32: + case Opcode::UnsignedSaturatedSub64: + case Opcode::VectorSignedSaturatedAbs8: + case Opcode::VectorSignedSaturatedAbs16: + case Opcode::VectorSignedSaturatedAbs32: + case Opcode::VectorSignedSaturatedAbs64: + case Opcode::VectorSignedSaturatedAccumulateUnsigned8: + case Opcode::VectorSignedSaturatedAccumulateUnsigned16: + case Opcode::VectorSignedSaturatedAccumulateUnsigned32: + case Opcode::VectorSignedSaturatedAccumulateUnsigned64: + case Opcode::VectorSignedSaturatedAdd8: + case Opcode::VectorSignedSaturatedAdd16: + case Opcode::VectorSignedSaturatedAdd32: + case Opcode::VectorSignedSaturatedAdd64: + case Opcode::VectorSignedSaturatedDoublingMultiplyHigh16: + case Opcode::VectorSignedSaturatedDoublingMultiplyHigh32: + case Opcode::VectorSignedSaturatedDoublingMultiplyHighRounding16: + case Opcode::VectorSignedSaturatedDoublingMultiplyHighRounding32: + case Opcode::VectorSignedSaturatedDoublingMultiplyLong16: + case Opcode::VectorSignedSaturatedDoublingMultiplyLong32: + case Opcode::VectorSignedSaturatedNarrowToSigned16: + case Opcode::VectorSignedSaturatedNarrowToSigned32: + case Opcode::VectorSignedSaturatedNarrowToSigned64: + case Opcode::VectorSignedSaturatedNarrowToUnsigned16: + case Opcode::VectorSignedSaturatedNarrowToUnsigned32: + case Opcode::VectorSignedSaturatedNarrowToUnsigned64: + case Opcode::VectorSignedSaturatedNeg8: + case Opcode::VectorSignedSaturatedNeg16: + case Opcode::VectorSignedSaturatedNeg32: + case Opcode::VectorSignedSaturatedNeg64: + case Opcode::VectorSignedSaturatedShiftLeft8: + case Opcode::VectorSignedSaturatedShiftLeft16: + case Opcode::VectorSignedSaturatedShiftLeft32: + case Opcode::VectorSignedSaturatedShiftLeft64: + case Opcode::VectorSignedSaturatedShiftLeftUnsigned8: + case Opcode::VectorSignedSaturatedShiftLeftUnsigned16: + case Opcode::VectorSignedSaturatedShiftLeftUnsigned32: + case Opcode::VectorSignedSaturatedShiftLeftUnsigned64: + case Opcode::VectorSignedSaturatedSub8: + case Opcode::VectorSignedSaturatedSub16: + case Opcode::VectorSignedSaturatedSub32: + case Opcode::VectorSignedSaturatedSub64: + case Opcode::VectorUnsignedSaturatedAccumulateSigned8: + case Opcode::VectorUnsignedSaturatedAccumulateSigned16: + case Opcode::VectorUnsignedSaturatedAccumulateSigned32: + case Opcode::VectorUnsignedSaturatedAccumulateSigned64: + case Opcode::VectorUnsignedSaturatedAdd8: + case Opcode::VectorUnsignedSaturatedAdd16: + case Opcode::VectorUnsignedSaturatedAdd32: + case Opcode::VectorUnsignedSaturatedAdd64: + case Opcode::VectorUnsignedSaturatedNarrow16: + case Opcode::VectorUnsignedSaturatedNarrow32: + case Opcode::VectorUnsignedSaturatedNarrow64: + case Opcode::VectorUnsignedSaturatedShiftLeft8: + case Opcode::VectorUnsignedSaturatedShiftLeft16: + case Opcode::VectorUnsignedSaturatedShiftLeft32: + case 
Opcode::VectorUnsignedSaturatedShiftLeft64: + case Opcode::VectorUnsignedSaturatedSub8: + case Opcode::VectorUnsignedSaturatedSub16: + case Opcode::VectorUnsignedSaturatedSub32: + case Opcode::VectorUnsignedSaturatedSub64: + return true; + + default: + return false; + } +} + +bool Inst::CausesCPUException() const { + return op == Opcode::Breakpoint + || op == Opcode::A32CallSupervisor + || op == Opcode::A32ExceptionRaised + || op == Opcode::A64CallSupervisor + || op == Opcode::A64ExceptionRaised; +} + +bool Inst::AltersExclusiveState() const { + return op == Opcode::A32ClearExclusive + || op == Opcode::A64ClearExclusive + || IsExclusiveMemoryRead() + || IsExclusiveMemoryWrite(); +} + +bool Inst::IsCoprocessorInstruction() const { + switch (op) { + case Opcode::A32CoprocInternalOperation: + case Opcode::A32CoprocSendOneWord: + case Opcode::A32CoprocSendTwoWords: + case Opcode::A32CoprocGetOneWord: + case Opcode::A32CoprocGetTwoWords: + case Opcode::A32CoprocLoadWords: + case Opcode::A32CoprocStoreWords: + return true; + + default: + return false; + } +} + +bool Inst::IsSetCheckBitOperation() const { + return op == Opcode::A32SetCheckBit + || op == Opcode::A64SetCheckBit; +} + +bool Inst::MayHaveSideEffects() const { + return op == Opcode::PushRSB + || op == Opcode::CallHostFunction + || op == Opcode::A64DataCacheOperationRaised + || op == Opcode::A64InstructionCacheOperationRaised + || IsSetCheckBitOperation() + || IsBarrier() + || CausesCPUException() + || WritesToCoreRegister() + || WritesToSystemRegister() + || WritesToCPSR() + || WritesToFPCR() + || WritesToFPSR() + || AltersExclusiveState() + || IsMemoryWrite() + || IsCoprocessorInstruction(); +} + +bool Inst::IsAPseudoOperation() const { + switch (op) { + case Opcode::GetCarryFromOp: + case Opcode::GetOverflowFromOp: + case Opcode::GetGEFromOp: + case Opcode::GetNZCVFromOp: + case Opcode::GetNZFromOp: + case Opcode::GetUpperFromOp: + case Opcode::GetLowerFromOp: + case Opcode::MostSignificantBit: + case Opcode::IsZero32: + case Opcode::IsZero64: + return true; + + default: + return false; + } +} + +bool Inst::MayGetNZCVFromOp() const { + switch (op) { + case Opcode::Add32: + case Opcode::Add64: + case Opcode::Sub32: + case Opcode::Sub64: + case Opcode::And32: + case Opcode::And64: + case Opcode::AndNot32: + case Opcode::AndNot64: + case Opcode::Eor32: + case Opcode::Eor64: + case Opcode::Or32: + case Opcode::Or64: + case Opcode::Not32: + case Opcode::Not64: + return true; + + default: + return false; + } +} + +bool Inst::AreAllArgsImmediates() const { + return std::all_of(args.begin(), args.begin() + NumArgs(), [](const auto& value) { return value.IsImmediate(); }); +} + +bool Inst::HasAssociatedPseudoOperation() const { + return next_pseudoop && !IsAPseudoOperation(); +} + +Inst* Inst::GetAssociatedPseudoOperation(Opcode opcode) { + Inst* pseudoop = next_pseudoop; + while (pseudoop) { + if (pseudoop->GetOpcode() == opcode) { + ASSERT(pseudoop->GetArg(0).GetInst() == this); + return pseudoop; + } + pseudoop = pseudoop->next_pseudoop; + } + return nullptr; +} + +Type Inst::GetType() const { + if (op == Opcode::Identity) + return args[0].GetType(); + return GetTypeOf(op); +} + +size_t Inst::NumArgs() const { + return GetNumArgsOf(op); +} + +Value Inst::GetArg(size_t index) const { + ASSERT_MSG(index < GetNumArgsOf(op), "Inst::GetArg: index {} >= number of arguments of {} ({})", index, op, GetNumArgsOf(op)); + ASSERT_MSG(!args[index].IsEmpty() || GetArgTypeOf(op, index) == IR::Type::Opaque, "Inst::GetArg: index {} is empty", index, 
args[index].GetType()); + + return args[index]; +} + +void Inst::SetArg(size_t index, Value value) { + ASSERT_MSG(index < GetNumArgsOf(op), "Inst::SetArg: index {} >= number of arguments of {} ({})", index, op, GetNumArgsOf(op)); + ASSERT_MSG(AreTypesCompatible(value.GetType(), GetArgTypeOf(op, index)), "Inst::SetArg: type {} of argument {} not compatible with operation {} ({})", value.GetType(), index, op, GetArgTypeOf(op, index)); + + if (!args[index].IsImmediate()) { + UndoUse(args[index]); + } + if (!value.IsImmediate()) { + Use(value); + } + + args[index] = value; +} + +void Inst::Invalidate() { + ClearArgs(); + op = Opcode::Void; +} + +void Inst::ClearArgs() { + for (auto& value : args) { + if (!value.IsImmediate()) { + UndoUse(value); + } + value = {}; + } +} + +void Inst::ReplaceUsesWith(Value replacement) { + Invalidate(); + + op = Opcode::Identity; + + if (!replacement.IsImmediate()) { + Use(replacement); + } + + args[0] = replacement; +} + +void Inst::Use(const Value& value) { + value.GetInst()->use_count++; + + if (IsAPseudoOperation()) { + if (op == Opcode::GetNZCVFromOp) { + ASSERT_MSG(value.GetInst()->MayGetNZCVFromOp(), "This value doesn't support the GetNZCVFromOp pseduo-op"); + } + + Inst* insert_point = value.GetInst(); + while (insert_point->next_pseudoop) { + insert_point = insert_point->next_pseudoop; + DEBUG_ASSERT(insert_point->GetArg(0).GetInst() == value.GetInst()); + } + insert_point->next_pseudoop = this; + } +} + +void Inst::UndoUse(const Value& value) { + value.GetInst()->use_count--; + + if (IsAPseudoOperation()) { + Inst* insert_point = value.GetInst(); + while (insert_point->next_pseudoop != this) { + insert_point = insert_point->next_pseudoop; + DEBUG_ASSERT(insert_point->GetArg(0).GetInst() == value.GetInst()); + } + insert_point->next_pseudoop = next_pseudoop; + next_pseudoop = nullptr; + } +} + +} // namespace Dynarmic::IR diff --git a/externals/dynarmic/src/dynarmic/ir/microinstruction.h b/externals/dynarmic/src/dynarmic/ir/microinstruction.h new file mode 100644 index 0000000000..26b6b899f3 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/microinstruction.h @@ -0,0 +1,162 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <array> + +#include <mcl/container/intrusive_list.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/ir/value.h" + +namespace Dynarmic::IR { + +enum class Opcode; +enum class Type; + +constexpr size_t max_arg_count = 4; + +/** + * A representation of a microinstruction. A single ARM/Thumb instruction may be + * converted into zero or more microinstructions. + */ +class Inst final : public mcl::intrusive_list_node<Inst> { +public: + explicit Inst(Opcode op) + : op(op) {} + + /// Determines whether or not this instruction performs an arithmetic shift. + bool IsArithmeticShift() const; + /// Determines whether or not this instruction performs a logical shift. + bool IsLogicalShift() const; + /// Determines whether or not this instruction performs a circular shift. + bool IsCircularShift() const; + /// Determines whether or not this instruction performs any kind of shift. + bool IsShift() const; + + /// Determines whether or not this instruction is a form of barrier. + bool IsBarrier() const; + + /// Determines whether or not this instruction performs a shared memory read. + bool IsSharedMemoryRead() const; + /// Determines whether or not this instruction performs a shared memory write. 
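The Use()/UndoUse() logic above threads pseudo-operations such as GetCarryFromOp onto the instruction that produces the value, so a pass or backend can ask whether a flag output is actually observed. A hedged sketch of such a query; the helper name is an assumption, not a dynarmic API:

    #include "dynarmic/ir/microinstruction.h"
    #include "dynarmic/ir/opcodes.h"

    // Hypothetical helper: does anything consume the carry-out of this instruction?
    bool CarryIsObserved(Dynarmic::IR::Inst* inst) {
        using Dynarmic::IR::Opcode;
        Dynarmic::IR::Inst* const carry = inst->GetAssociatedPseudoOperation(Opcode::GetCarryFromOp);
        return carry != nullptr && carry->HasUses();
    }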
+ bool IsSharedMemoryWrite() const; + /// Determines whether or not this instruction performs a shared memory read or write. + bool IsSharedMemoryReadOrWrite() const; + /// Determines whether or not this instruction performs an atomic memory read. + bool IsExclusiveMemoryRead() const; + /// Determines whether or not this instruction performs an atomic memory write. + bool IsExclusiveMemoryWrite() const; + + /// Determines whether or not this instruction performs any kind of memory read. + bool IsMemoryRead() const; + /// Determines whether or not this instruction performs any kind of memory write. + bool IsMemoryWrite() const; + /// Determines whether or not this instruction performs any kind of memory access. + bool IsMemoryReadOrWrite() const; + + /// Determines whether or not this instruction reads from the CPSR. + bool ReadsFromCPSR() const; + /// Determines whether or not this instruction writes to the CPSR. + bool WritesToCPSR() const; + + /// Determines whether or not this instruction writes to a system register. + bool WritesToSystemRegister() const; + + /// Determines whether or not this instruction reads from a core register. + bool ReadsFromCoreRegister() const; + /// Determines whether or not this instruction writes to a core register. + bool WritesToCoreRegister() const; + + /// Determines whether or not this instruction reads from the FPCR. + bool ReadsFromFPCR() const; + /// Determines whether or not this instruction writes to the FPCR. + bool WritesToFPCR() const; + + /// Determines whether or not this instruction reads from the FPSR. + bool ReadsFromFPSR() const; + /// Determines whether or not this instruction writes to the FPSR. + bool WritesToFPSR() const; + + /// Determines whether or not this instruction reads from the FPSR cumulative exception bits. + bool ReadsFromFPSRCumulativeExceptionBits() const; + /// Determines whether or not this instruction writes to the FPSR cumulative exception bits. + bool WritesToFPSRCumulativeExceptionBits() const; + /// Determines whether or not this instruction both reads from and writes to the FPSR cumulative exception bits. + bool ReadsFromAndWritesToFPSRCumulativeExceptionBits() const; + + /// Determines whether or not this instruction reads from the FPSR cumulative saturation bit. + bool ReadsFromFPSRCumulativeSaturationBit() const; + /// Determines whether or not this instruction writes to the FPSR cumulative saturation bit. + bool WritesToFPSRCumulativeSaturationBit() const; + + /// Determines whether or not this instruction alters memory-exclusivity. + bool AltersExclusiveState() const; + + /// Determines whether or not this instruction accesses a coprocessor. + bool IsCoprocessorInstruction() const; + + /// Determines whether or not this instruction causes a CPU exception. + bool CausesCPUException() const; + + /// Determines whether or not this instruction is a SetCheckBit operation. + bool IsSetCheckBitOperation() const; + + /// Determines whether or not this instruction may have side-effects. + bool MayHaveSideEffects() const; + + /// Determines whether or not this instruction is a pseduo-instruction. + /// Pseudo-instructions depend on their parent instructions for their semantics. + bool IsAPseudoOperation() const; + + /// Determines whether or not this instruction supports the GetNZCVFromOp pseudo-operation. + bool MayGetNZCVFromOp() const; + + /// Determines if all arguments of this instruction are immediates. 
+ bool AreAllArgsImmediates() const; + + size_t UseCount() const { return use_count; } + bool HasUses() const { return use_count > 0; } + + /// Determines if there is a pseudo-operation associated with this instruction. + bool HasAssociatedPseudoOperation() const; + /// Gets a pseudo-operation associated with this instruction. + Inst* GetAssociatedPseudoOperation(Opcode opcode); + + /// Get the microop this microinstruction represents. + Opcode GetOpcode() const { return op; } + /// Get the type this instruction returns. + Type GetType() const; + /// Get the number of arguments this instruction has. + size_t NumArgs() const; + + Value GetArg(size_t index) const; + void SetArg(size_t index, Value value); + + void Invalidate(); + void ClearArgs(); + + void ReplaceUsesWith(Value replacement); + + // IR name (i.e. instruction number in block). This is set in the naming pass. Treat 0 as an invalid name. + // This is used for debugging and fastmem instruction identification. + void SetName(unsigned value) { name = value; } + unsigned GetName() const { return name; } + +private: + void Use(const Value& value); + void UndoUse(const Value& value); + + Opcode op; + unsigned use_count = 0; + unsigned name = 0; + std::array<Value, max_arg_count> args; + + // Linked list of pseudooperations associated with this instruction. + Inst* next_pseudoop = nullptr; +}; + +} // namespace Dynarmic::IR diff --git a/externals/dynarmic/src/dynarmic/ir/opcodes.cpp b/externals/dynarmic/src/dynarmic/ir/opcodes.cpp new file mode 100644 index 0000000000..88a52d4f1d --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/opcodes.cpp @@ -0,0 +1,71 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/ir/opcodes.h" + +#include <array> +#include <vector> + +#include "dynarmic/ir/type.h" + +namespace Dynarmic::IR { + +// Opcode information + +namespace OpcodeInfo { + +struct Meta { + const char* name; + Type type; + std::vector<Type> arg_types; +}; + +constexpr Type Void = Type::Void; +constexpr Type A32Reg = Type::A32Reg; +constexpr Type A32ExtReg = Type::A32ExtReg; +constexpr Type A64Reg = Type::A64Reg; +constexpr Type A64Vec = Type::A64Vec; +constexpr Type Opaque = Type::Opaque; +constexpr Type U1 = Type::U1; +constexpr Type U8 = Type::U8; +constexpr Type U16 = Type::U16; +constexpr Type U32 = Type::U32; +constexpr Type U64 = Type::U64; +constexpr Type U128 = Type::U128; +constexpr Type CoprocInfo = Type::CoprocInfo; +constexpr Type NZCV = Type::NZCVFlags; +constexpr Type Cond = Type::Cond; +constexpr Type Table = Type::Table; +constexpr Type AccType = Type::AccType; + +static const std::array opcode_info{ +#define OPCODE(name, type, ...) Meta{#name, type, {__VA_ARGS__}}, +#define A32OPC(name, type, ...) Meta{#name, type, {__VA_ARGS__}}, +#define A64OPC(name, type, ...) 
Meta{#name, type, {__VA_ARGS__}}, +#include "./opcodes.inc" +#undef OPCODE +#undef A32OPC +#undef A64OPC +}; + +} // namespace OpcodeInfo + +Type GetTypeOf(Opcode op) { + return OpcodeInfo::opcode_info.at(static_cast<size_t>(op)).type; +} + +size_t GetNumArgsOf(Opcode op) { + return OpcodeInfo::opcode_info.at(static_cast<size_t>(op)).arg_types.size(); +} + +Type GetArgTypeOf(Opcode op, size_t arg_index) { + return OpcodeInfo::opcode_info.at(static_cast<size_t>(op)).arg_types.at(arg_index); +} + +std::string GetNameOf(Opcode op) { + return OpcodeInfo::opcode_info.at(static_cast<size_t>(op)).name; +} + +} // namespace Dynarmic::IR diff --git a/externals/dynarmic/src/dynarmic/ir/opcodes.h b/externals/dynarmic/src/dynarmic/ir/opcodes.h new file mode 100644 index 0000000000..404f4cdbf4 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/opcodes.h @@ -0,0 +1,54 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <string> + +#include <fmt/format.h> +#include <mcl/stdint.hpp> + +namespace Dynarmic::IR { + +enum class Type; + +/** + * The Opcodes of our intermediate representation. + * Type signatures for each opcode can be found in opcodes.inc + */ +enum class Opcode { +#define OPCODE(name, type, ...) name, +#define A32OPC(name, type, ...) A32##name, +#define A64OPC(name, type, ...) A64##name, +#include "./opcodes.inc" +#undef OPCODE +#undef A32OPC +#undef A64OPC + NUM_OPCODE +}; + +constexpr size_t OpcodeCount = static_cast<size_t>(Opcode::NUM_OPCODE); + +/// Get return type of an opcode +Type GetTypeOf(Opcode op); + +/// Get the number of arguments an opcode accepts +size_t GetNumArgsOf(Opcode op); + +/// Get the required type of an argument of an opcode +Type GetArgTypeOf(Opcode op, size_t arg_index); + +/// Get the name of an opcode. +std::string GetNameOf(Opcode op); + +} // namespace Dynarmic::IR + +template<> +struct fmt::formatter<Dynarmic::IR::Opcode> : fmt::formatter<std::string> { + template<typename FormatContext> + auto format(Dynarmic::IR::Opcode op, FormatContext& ctx) const { + return formatter<std::string>::format(Dynarmic::IR::GetNameOf(op), ctx); + } +}; diff --git a/externals/dynarmic/src/dynarmic/ir/opcodes.inc b/externals/dynarmic/src/dynarmic/ir/opcodes.inc new file mode 100644 index 0000000000..dd060e0a99 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/opcodes.inc @@ -0,0 +1,770 @@ +// clang-format off + +// opcode name, return type, arg1 type, arg2 type, arg3 type, arg4 type, ... 
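The three files above work together as one X-macro table: opcodes.inc is the single list of opcode signatures, and each consumer defines OPCODE/A32OPC/A64OPC to mean something different before including it — opcodes.h expands the list into the Opcode enum, while opcodes.cpp expands the same list into the opcode_info metadata array that GetTypeOf/GetNumArgsOf/GetArgTypeOf/GetNameOf index into. A minimal single-file sketch of that technique follows, using a list macro in place of the #include and invented DemoOp/DEMO_OPCODES/DemoMeta names; it is an illustration of the pattern only, not dynarmic code.

// Sketch of the X-macro pattern used by opcodes.h / opcodes.cpp / opcodes.inc.
// All names here are invented for illustration and are not part of dynarmic.
#include <array>
#include <cstdio>
#include <vector>

enum class Type { Void, U32, U64 };

// Stand-in for opcodes.inc: one entry per opcode -> name, return type, argument types.
#define DEMO_OPCODES(X)                       \
    X(Add32, Type::U32, Type::U32, Type::U32) \
    X(Not64, Type::U64, Type::U64)

// First expansion: the enum of opcodes.
enum class DemoOp {
#define X(name, ret, ...) name,
    DEMO_OPCODES(X)
#undef X
};

// Second expansion: a parallel metadata table, indexed by the enum value.
struct DemoMeta {
    const char* name;
    Type ret;
    std::vector<Type> arg_types;
};

static const std::array demo_info{
#define X(name, ret, ...) DemoMeta{#name, ret, {__VA_ARGS__}},
    DEMO_OPCODES(X)
#undef X
};

int main() {
    const auto& meta = demo_info[static_cast<size_t>(DemoOp::Add32)];
    std::printf("%s returns type %d and takes %zu arguments\n",
                meta.name, static_cast<int>(meta.ret), meta.arg_types.size());
}

Because both expansions are driven by the same list, adding an opcode in one place keeps the enum and the metadata table in sync automatically, which is why opcodes.cpp can implement GetNumArgsOf as a simple arg_types.size() lookup on the entry selected by the opcode's enum value.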
+ +OPCODE(Void, Void, ) +OPCODE(Identity, Opaque, Opaque ) +OPCODE(Breakpoint, Void, ) +OPCODE(CallHostFunction, Void, U64, Opaque, Opaque, Opaque ) + +// A32 Context getters/setters +A32OPC(SetCheckBit, Void, U1 ) +A32OPC(GetRegister, U32, A32Reg ) +A32OPC(GetExtendedRegister32, U32, A32ExtReg ) +A32OPC(GetExtendedRegister64, U64, A32ExtReg ) +A32OPC(GetVector, U128, A32ExtReg ) +A32OPC(SetRegister, Void, A32Reg, U32 ) +A32OPC(SetExtendedRegister32, Void, A32ExtReg, U32 ) +A32OPC(SetExtendedRegister64, Void, A32ExtReg, U64 ) +A32OPC(SetVector, Void, A32ExtReg, U128 ) +A32OPC(GetCpsr, U32, ) +A32OPC(SetCpsr, Void, U32 ) +A32OPC(SetCpsrNZCV, Void, NZCV ) +A32OPC(SetCpsrNZCVRaw, Void, U32 ) +A32OPC(SetCpsrNZCVQ, Void, U32 ) +A32OPC(SetCpsrNZ, Void, NZCV ) +A32OPC(SetCpsrNZC, Void, NZCV, U1 ) +A32OPC(GetCFlag, U1, ) +A32OPC(OrQFlag, Void, U1 ) +A32OPC(GetGEFlags, U32, ) +A32OPC(SetGEFlags, Void, U32 ) +A32OPC(SetGEFlagsCompressed, Void, U32 ) +A32OPC(BXWritePC, Void, U32 ) +A32OPC(UpdateUpperLocationDescriptor, Void, ) +A32OPC(CallSupervisor, Void, U32 ) +A32OPC(ExceptionRaised, Void, U32, U64 ) +A32OPC(DataSynchronizationBarrier, Void, ) +A32OPC(DataMemoryBarrier, Void, ) +A32OPC(InstructionSynchronizationBarrier, Void, ) +A32OPC(GetFpscr, U32, ) +A32OPC(SetFpscr, Void, U32, ) +A32OPC(GetFpscrNZCV, U32, ) +A32OPC(SetFpscrNZCV, Void, NZCV ) + +// A64 Context getters/setters +A64OPC(SetCheckBit, Void, U1 ) +A64OPC(GetCFlag, U1, ) +A64OPC(GetNZCVRaw, U32, ) +A64OPC(SetNZCVRaw, Void, U32 ) +A64OPC(SetNZCV, Void, NZCV ) +A64OPC(GetW, U32, A64Reg ) +A64OPC(GetX, U64, A64Reg ) +A64OPC(GetS, U128, A64Vec ) +A64OPC(GetD, U128, A64Vec ) +A64OPC(GetQ, U128, A64Vec ) +A64OPC(GetSP, U64, ) +A64OPC(GetFPCR, U32, ) +A64OPC(GetFPSR, U32, ) +A64OPC(SetW, Void, A64Reg, U32 ) +A64OPC(SetX, Void, A64Reg, U64 ) +A64OPC(SetS, Void, A64Vec, U128 ) +A64OPC(SetD, Void, A64Vec, U128 ) +A64OPC(SetQ, Void, A64Vec, U128 ) +A64OPC(SetSP, Void, U64 ) +A64OPC(SetFPCR, Void, U32 ) +A64OPC(SetFPSR, Void, U32 ) +A64OPC(SetPC, Void, U64 ) +A64OPC(CallSupervisor, Void, U32 ) +A64OPC(ExceptionRaised, Void, U64, U64 ) +A64OPC(DataCacheOperationRaised, Void, U64, U64, U64 ) +A64OPC(InstructionCacheOperationRaised, Void, U64, U64 ) +A64OPC(DataSynchronizationBarrier, Void, ) +A64OPC(DataMemoryBarrier, Void, ) +A64OPC(InstructionSynchronizationBarrier, Void, ) +A64OPC(GetCNTFRQ, U32, ) +A64OPC(GetCNTPCT, U64, ) +A64OPC(GetCTR, U32, ) +A64OPC(GetDCZID, U32, ) +A64OPC(GetTPIDR, U64, ) +A64OPC(GetTPIDRRO, U64, ) +A64OPC(SetTPIDR, Void, U64 ) + +// Hints +OPCODE(PushRSB, Void, U64 ) + +// Pseudo-operation, handled specially at final emit +OPCODE(GetCarryFromOp, U1, Opaque ) +OPCODE(GetOverflowFromOp, U1, Opaque ) +OPCODE(GetGEFromOp, U32, Opaque ) +OPCODE(GetNZCVFromOp, NZCV, Opaque ) +OPCODE(GetNZFromOp, NZCV, Opaque ) +OPCODE(GetUpperFromOp, U128, Opaque ) +OPCODE(GetLowerFromOp, U128, Opaque ) + +OPCODE(GetCFlagFromNZCV, U1, NZCV ) +OPCODE(NZCVFromPackedFlags, NZCV, U32 ) + +// Calculations +OPCODE(Pack2x32To1x64, U64, U32, U32 ) +OPCODE(Pack2x64To1x128, U128, U64, U64 ) +OPCODE(LeastSignificantWord, U32, U64 ) +OPCODE(LeastSignificantHalf, U16, U32 ) +OPCODE(LeastSignificantByte, U8, U32 ) +OPCODE(MostSignificantWord, U32, U64 ) +OPCODE(MostSignificantBit, U1, U32 ) +OPCODE(IsZero32, U1, U32 ) +OPCODE(IsZero64, U1, U64 ) +OPCODE(TestBit, U1, U64, U8 ) +OPCODE(ConditionalSelect32, U32, Cond, U32, U32 ) +OPCODE(ConditionalSelect64, U64, Cond, U64, U64 ) +OPCODE(ConditionalSelectNZCV, NZCV, Cond, NZCV, NZCV ) 
+OPCODE(LogicalShiftLeft32, U32, U32, U8, U1 ) +OPCODE(LogicalShiftLeft64, U64, U64, U8 ) +OPCODE(LogicalShiftRight32, U32, U32, U8, U1 ) +OPCODE(LogicalShiftRight64, U64, U64, U8 ) +OPCODE(ArithmeticShiftRight32, U32, U32, U8, U1 ) +OPCODE(ArithmeticShiftRight64, U64, U64, U8 ) +OPCODE(RotateRight32, U32, U32, U8, U1 ) +OPCODE(RotateRight64, U64, U64, U8 ) +OPCODE(RotateRightExtended, U32, U32, U1 ) +OPCODE(LogicalShiftLeftMasked32, U32, U32, U32 ) +OPCODE(LogicalShiftLeftMasked64, U64, U64, U64 ) +OPCODE(LogicalShiftRightMasked32, U32, U32, U32 ) +OPCODE(LogicalShiftRightMasked64, U64, U64, U64 ) +OPCODE(ArithmeticShiftRightMasked32, U32, U32, U32 ) +OPCODE(ArithmeticShiftRightMasked64, U64, U64, U64 ) +OPCODE(RotateRightMasked32, U32, U32, U32 ) +OPCODE(RotateRightMasked64, U64, U64, U64 ) +OPCODE(Add32, U32, U32, U32, U1 ) +OPCODE(Add64, U64, U64, U64, U1 ) +OPCODE(Sub32, U32, U32, U32, U1 ) +OPCODE(Sub64, U64, U64, U64, U1 ) +OPCODE(Mul32, U32, U32, U32 ) +OPCODE(Mul64, U64, U64, U64 ) +OPCODE(SignedMultiplyHigh64, U64, U64, U64 ) +OPCODE(UnsignedMultiplyHigh64, U64, U64, U64 ) +OPCODE(UnsignedDiv32, U32, U32, U32 ) +OPCODE(UnsignedDiv64, U64, U64, U64 ) +OPCODE(SignedDiv32, U32, U32, U32 ) +OPCODE(SignedDiv64, U64, U64, U64 ) +OPCODE(And32, U32, U32, U32 ) +OPCODE(And64, U64, U64, U64 ) +OPCODE(AndNot32, U32, U32, U32 ) +OPCODE(AndNot64, U64, U64, U64 ) +OPCODE(Eor32, U32, U32, U32 ) +OPCODE(Eor64, U64, U64, U64 ) +OPCODE(Or32, U32, U32, U32 ) +OPCODE(Or64, U64, U64, U64 ) +OPCODE(Not32, U32, U32 ) +OPCODE(Not64, U64, U64 ) +OPCODE(SignExtendByteToWord, U32, U8 ) +OPCODE(SignExtendHalfToWord, U32, U16 ) +OPCODE(SignExtendByteToLong, U64, U8 ) +OPCODE(SignExtendHalfToLong, U64, U16 ) +OPCODE(SignExtendWordToLong, U64, U32 ) +OPCODE(ZeroExtendByteToWord, U32, U8 ) +OPCODE(ZeroExtendHalfToWord, U32, U16 ) +OPCODE(ZeroExtendByteToLong, U64, U8 ) +OPCODE(ZeroExtendHalfToLong, U64, U16 ) +OPCODE(ZeroExtendWordToLong, U64, U32 ) +OPCODE(ZeroExtendLongToQuad, U128, U64 ) +OPCODE(ByteReverseWord, U32, U32 ) +OPCODE(ByteReverseHalf, U16, U16 ) +OPCODE(ByteReverseDual, U64, U64 ) +OPCODE(CountLeadingZeros32, U32, U32 ) +OPCODE(CountLeadingZeros64, U64, U64 ) +OPCODE(ExtractRegister32, U32, U32, U32, U8 ) +OPCODE(ExtractRegister64, U64, U64, U64, U8 ) +OPCODE(ReplicateBit32, U32, U32, U8 ) +OPCODE(ReplicateBit64, U64, U64, U8 ) +OPCODE(MaxSigned32, U32, U32, U32 ) +OPCODE(MaxSigned64, U64, U64, U64 ) +OPCODE(MaxUnsigned32, U32, U32, U32 ) +OPCODE(MaxUnsigned64, U64, U64, U64 ) +OPCODE(MinSigned32, U32, U32, U32 ) +OPCODE(MinSigned64, U64, U64, U64 ) +OPCODE(MinUnsigned32, U32, U32, U32 ) +OPCODE(MinUnsigned64, U64, U64, U64 ) + +// Saturated instructions +OPCODE(SignedSaturatedAddWithFlag32, U32, U32, U32 ) +OPCODE(SignedSaturatedSubWithFlag32, U32, U32, U32 ) +OPCODE(SignedSaturation, U32, U32, U8 ) +OPCODE(UnsignedSaturation, U32, U32, U8 ) +OPCODE(SignedSaturatedAdd8, U8, U8, U8 ) +OPCODE(SignedSaturatedAdd16, U16, U16, U16 ) +OPCODE(SignedSaturatedAdd32, U32, U32, U32 ) +OPCODE(SignedSaturatedAdd64, U64, U64, U64 ) +OPCODE(SignedSaturatedDoublingMultiplyReturnHigh16, U16, U16, U16 ) +OPCODE(SignedSaturatedDoublingMultiplyReturnHigh32, U32, U32, U32 ) +OPCODE(SignedSaturatedSub8, U8, U8, U8 ) +OPCODE(SignedSaturatedSub16, U16, U16, U16 ) +OPCODE(SignedSaturatedSub32, U32, U32, U32 ) +OPCODE(SignedSaturatedSub64, U64, U64, U64 ) +OPCODE(UnsignedSaturatedAdd8, U8, U8, U8 ) +OPCODE(UnsignedSaturatedAdd16, U16, U16, U16 ) +OPCODE(UnsignedSaturatedAdd32, U32, U32, U32 ) 
+OPCODE(UnsignedSaturatedAdd64, U64, U64, U64 ) +OPCODE(UnsignedSaturatedSub8, U8, U8, U8 ) +OPCODE(UnsignedSaturatedSub16, U16, U16, U16 ) +OPCODE(UnsignedSaturatedSub32, U32, U32, U32 ) +OPCODE(UnsignedSaturatedSub64, U64, U64, U64 ) + +// Vector saturated instructions +OPCODE(VectorSignedSaturatedAdd8, U128, U128, U128 ) +OPCODE(VectorSignedSaturatedAdd16, U128, U128, U128 ) +OPCODE(VectorSignedSaturatedAdd32, U128, U128, U128 ) +OPCODE(VectorSignedSaturatedAdd64, U128, U128, U128 ) +OPCODE(VectorSignedSaturatedSub8, U128, U128, U128 ) +OPCODE(VectorSignedSaturatedSub16, U128, U128, U128 ) +OPCODE(VectorSignedSaturatedSub32, U128, U128, U128 ) +OPCODE(VectorSignedSaturatedSub64, U128, U128, U128 ) +OPCODE(VectorUnsignedSaturatedAdd8, U128, U128, U128 ) +OPCODE(VectorUnsignedSaturatedAdd16, U128, U128, U128 ) +OPCODE(VectorUnsignedSaturatedAdd32, U128, U128, U128 ) +OPCODE(VectorUnsignedSaturatedAdd64, U128, U128, U128 ) +OPCODE(VectorUnsignedSaturatedSub8, U128, U128, U128 ) +OPCODE(VectorUnsignedSaturatedSub16, U128, U128, U128 ) +OPCODE(VectorUnsignedSaturatedSub32, U128, U128, U128 ) +OPCODE(VectorUnsignedSaturatedSub64, U128, U128, U128 ) + +// Packed instructions +OPCODE(PackedAddU8, U32, U32, U32 ) +OPCODE(PackedAddS8, U32, U32, U32 ) +OPCODE(PackedSubU8, U32, U32, U32 ) +OPCODE(PackedSubS8, U32, U32, U32 ) +OPCODE(PackedAddU16, U32, U32, U32 ) +OPCODE(PackedAddS16, U32, U32, U32 ) +OPCODE(PackedSubU16, U32, U32, U32 ) +OPCODE(PackedSubS16, U32, U32, U32 ) +OPCODE(PackedAddSubU16, U32, U32, U32 ) +OPCODE(PackedAddSubS16, U32, U32, U32 ) +OPCODE(PackedSubAddU16, U32, U32, U32 ) +OPCODE(PackedSubAddS16, U32, U32, U32 ) +OPCODE(PackedHalvingAddU8, U32, U32, U32 ) +OPCODE(PackedHalvingAddS8, U32, U32, U32 ) +OPCODE(PackedHalvingSubU8, U32, U32, U32 ) +OPCODE(PackedHalvingSubS8, U32, U32, U32 ) +OPCODE(PackedHalvingAddU16, U32, U32, U32 ) +OPCODE(PackedHalvingAddS16, U32, U32, U32 ) +OPCODE(PackedHalvingSubU16, U32, U32, U32 ) +OPCODE(PackedHalvingSubS16, U32, U32, U32 ) +OPCODE(PackedHalvingAddSubU16, U32, U32, U32 ) +OPCODE(PackedHalvingAddSubS16, U32, U32, U32 ) +OPCODE(PackedHalvingSubAddU16, U32, U32, U32 ) +OPCODE(PackedHalvingSubAddS16, U32, U32, U32 ) +OPCODE(PackedSaturatedAddU8, U32, U32, U32 ) +OPCODE(PackedSaturatedAddS8, U32, U32, U32 ) +OPCODE(PackedSaturatedSubU8, U32, U32, U32 ) +OPCODE(PackedSaturatedSubS8, U32, U32, U32 ) +OPCODE(PackedSaturatedAddU16, U32, U32, U32 ) +OPCODE(PackedSaturatedAddS16, U32, U32, U32 ) +OPCODE(PackedSaturatedSubU16, U32, U32, U32 ) +OPCODE(PackedSaturatedSubS16, U32, U32, U32 ) +OPCODE(PackedAbsDiffSumU8, U32, U32, U32 ) +OPCODE(PackedSelect, U32, U32, U32, U32 ) + +// CRC instructions +OPCODE(CRC32Castagnoli8, U32, U32, U32 ) +OPCODE(CRC32Castagnoli16, U32, U32, U32 ) +OPCODE(CRC32Castagnoli32, U32, U32, U32 ) +OPCODE(CRC32Castagnoli64, U32, U32, U64 ) +OPCODE(CRC32ISO8, U32, U32, U32 ) +OPCODE(CRC32ISO16, U32, U32, U32 ) +OPCODE(CRC32ISO32, U32, U32, U32 ) +OPCODE(CRC32ISO64, U32, U32, U64 ) + +// AES instructions +OPCODE(AESDecryptSingleRound, U128, U128 ) +OPCODE(AESEncryptSingleRound, U128, U128 ) +OPCODE(AESInverseMixColumns, U128, U128 ) +OPCODE(AESMixColumns, U128, U128 ) + +// SM4 instructions +OPCODE(SM4AccessSubstitutionBox, U8, U8 ) + +// SHA instructions +OPCODE(SHA256Hash, U128, U128, U128, U128, U1 ) +OPCODE(SHA256MessageSchedule0, U128, U128, U128 ) +OPCODE(SHA256MessageSchedule1, U128, U128, U128, U128 ) + +// Vector instructions +OPCODE(VectorGetElement8, U8, U128, U8 ) +OPCODE(VectorGetElement16, U16, U128, U8 ) 
+OPCODE(VectorGetElement32, U32, U128, U8 ) +OPCODE(VectorGetElement64, U64, U128, U8 ) +OPCODE(VectorSetElement8, U128, U128, U8, U8 ) +OPCODE(VectorSetElement16, U128, U128, U8, U16 ) +OPCODE(VectorSetElement32, U128, U128, U8, U32 ) +OPCODE(VectorSetElement64, U128, U128, U8, U64 ) +OPCODE(VectorAbs8, U128, U128 ) +OPCODE(VectorAbs16, U128, U128 ) +OPCODE(VectorAbs32, U128, U128 ) +OPCODE(VectorAbs64, U128, U128 ) +OPCODE(VectorAdd8, U128, U128, U128 ) +OPCODE(VectorAdd16, U128, U128, U128 ) +OPCODE(VectorAdd32, U128, U128, U128 ) +OPCODE(VectorAdd64, U128, U128, U128 ) +OPCODE(VectorAnd, U128, U128, U128 ) +OPCODE(VectorAndNot, U128, U128, U128 ) +OPCODE(VectorArithmeticShiftRight8, U128, U128, U8 ) +OPCODE(VectorArithmeticShiftRight16, U128, U128, U8 ) +OPCODE(VectorArithmeticShiftRight32, U128, U128, U8 ) +OPCODE(VectorArithmeticShiftRight64, U128, U128, U8 ) +OPCODE(VectorArithmeticVShift8, U128, U128, U128 ) +OPCODE(VectorArithmeticVShift16, U128, U128, U128 ) +OPCODE(VectorArithmeticVShift32, U128, U128, U128 ) +OPCODE(VectorArithmeticVShift64, U128, U128, U128 ) +OPCODE(VectorBroadcastLower8, U128, U8 ) +OPCODE(VectorBroadcastLower16, U128, U16 ) +OPCODE(VectorBroadcastLower32, U128, U32 ) +OPCODE(VectorBroadcast8, U128, U8 ) +OPCODE(VectorBroadcast16, U128, U16 ) +OPCODE(VectorBroadcast32, U128, U32 ) +OPCODE(VectorBroadcast64, U128, U64 ) +OPCODE(VectorBroadcastElementLower8, U128, U128, U8 ) +OPCODE(VectorBroadcastElementLower16, U128, U128, U8 ) +OPCODE(VectorBroadcastElementLower32, U128, U128, U8 ) +OPCODE(VectorBroadcastElement8, U128, U128, U8 ) +OPCODE(VectorBroadcastElement16, U128, U128, U8 ) +OPCODE(VectorBroadcastElement32, U128, U128, U8 ) +OPCODE(VectorBroadcastElement64, U128, U128, U8 ) +OPCODE(VectorCountLeadingZeros8, U128, U128 ) +OPCODE(VectorCountLeadingZeros16, U128, U128 ) +OPCODE(VectorCountLeadingZeros32, U128, U128 ) +OPCODE(VectorDeinterleaveEven8, U128, U128, U128 ) +OPCODE(VectorDeinterleaveEven16, U128, U128, U128 ) +OPCODE(VectorDeinterleaveEven32, U128, U128, U128 ) +OPCODE(VectorDeinterleaveEven64, U128, U128, U128 ) +OPCODE(VectorDeinterleaveEvenLower8, U128, U128, U128 ) +OPCODE(VectorDeinterleaveEvenLower16, U128, U128, U128 ) +OPCODE(VectorDeinterleaveEvenLower32, U128, U128, U128 ) +OPCODE(VectorDeinterleaveOdd8, U128, U128, U128 ) +OPCODE(VectorDeinterleaveOdd16, U128, U128, U128 ) +OPCODE(VectorDeinterleaveOdd32, U128, U128, U128 ) +OPCODE(VectorDeinterleaveOdd64, U128, U128, U128 ) +OPCODE(VectorDeinterleaveOddLower8, U128, U128, U128 ) +OPCODE(VectorDeinterleaveOddLower16, U128, U128, U128 ) +OPCODE(VectorDeinterleaveOddLower32, U128, U128, U128 ) +OPCODE(VectorEor, U128, U128, U128 ) +OPCODE(VectorEqual8, U128, U128, U128 ) +OPCODE(VectorEqual16, U128, U128, U128 ) +OPCODE(VectorEqual32, U128, U128, U128 ) +OPCODE(VectorEqual64, U128, U128, U128 ) +OPCODE(VectorEqual128, U128, U128, U128 ) +OPCODE(VectorExtract, U128, U128, U128, U8 ) +OPCODE(VectorExtractLower, U128, U128, U128, U8 ) +OPCODE(VectorGreaterS8, U128, U128, U128 ) +OPCODE(VectorGreaterS16, U128, U128, U128 ) +OPCODE(VectorGreaterS32, U128, U128, U128 ) +OPCODE(VectorGreaterS64, U128, U128, U128 ) +OPCODE(VectorHalvingAddS8, U128, U128, U128 ) +OPCODE(VectorHalvingAddS16, U128, U128, U128 ) +OPCODE(VectorHalvingAddS32, U128, U128, U128 ) +OPCODE(VectorHalvingAddU8, U128, U128, U128 ) +OPCODE(VectorHalvingAddU16, U128, U128, U128 ) +OPCODE(VectorHalvingAddU32, U128, U128, U128 ) +OPCODE(VectorHalvingSubS8, U128, U128, U128 ) +OPCODE(VectorHalvingSubS16, U128, U128, U128 ) 
+OPCODE(VectorHalvingSubS32, U128, U128, U128 ) +OPCODE(VectorHalvingSubU8, U128, U128, U128 ) +OPCODE(VectorHalvingSubU16, U128, U128, U128 ) +OPCODE(VectorHalvingSubU32, U128, U128, U128 ) +OPCODE(VectorInterleaveLower8, U128, U128, U128 ) +OPCODE(VectorInterleaveLower16, U128, U128, U128 ) +OPCODE(VectorInterleaveLower32, U128, U128, U128 ) +OPCODE(VectorInterleaveLower64, U128, U128, U128 ) +OPCODE(VectorInterleaveUpper8, U128, U128, U128 ) +OPCODE(VectorInterleaveUpper16, U128, U128, U128 ) +OPCODE(VectorInterleaveUpper32, U128, U128, U128 ) +OPCODE(VectorInterleaveUpper64, U128, U128, U128 ) +OPCODE(VectorLogicalShiftLeft8, U128, U128, U8 ) +OPCODE(VectorLogicalShiftLeft16, U128, U128, U8 ) +OPCODE(VectorLogicalShiftLeft32, U128, U128, U8 ) +OPCODE(VectorLogicalShiftLeft64, U128, U128, U8 ) +OPCODE(VectorLogicalShiftRight8, U128, U128, U8 ) +OPCODE(VectorLogicalShiftRight16, U128, U128, U8 ) +OPCODE(VectorLogicalShiftRight32, U128, U128, U8 ) +OPCODE(VectorLogicalShiftRight64, U128, U128, U8 ) +OPCODE(VectorLogicalVShift8, U128, U128, U128 ) +OPCODE(VectorLogicalVShift16, U128, U128, U128 ) +OPCODE(VectorLogicalVShift32, U128, U128, U128 ) +OPCODE(VectorLogicalVShift64, U128, U128, U128 ) +OPCODE(VectorMaxS8, U128, U128, U128 ) +OPCODE(VectorMaxS16, U128, U128, U128 ) +OPCODE(VectorMaxS32, U128, U128, U128 ) +OPCODE(VectorMaxS64, U128, U128, U128 ) +OPCODE(VectorMaxU8, U128, U128, U128 ) +OPCODE(VectorMaxU16, U128, U128, U128 ) +OPCODE(VectorMaxU32, U128, U128, U128 ) +OPCODE(VectorMaxU64, U128, U128, U128 ) +OPCODE(VectorMinS8, U128, U128, U128 ) +OPCODE(VectorMinS16, U128, U128, U128 ) +OPCODE(VectorMinS32, U128, U128, U128 ) +OPCODE(VectorMinS64, U128, U128, U128 ) +OPCODE(VectorMinU8, U128, U128, U128 ) +OPCODE(VectorMinU16, U128, U128, U128 ) +OPCODE(VectorMinU32, U128, U128, U128 ) +OPCODE(VectorMinU64, U128, U128, U128 ) +OPCODE(VectorMultiply8, U128, U128, U128 ) +OPCODE(VectorMultiply16, U128, U128, U128 ) +OPCODE(VectorMultiply32, U128, U128, U128 ) +OPCODE(VectorMultiply64, U128, U128, U128 ) +OPCODE(VectorMultiplySignedWiden8, U128, U128, U128 ) +OPCODE(VectorMultiplySignedWiden16, U128, U128, U128 ) +OPCODE(VectorMultiplySignedWiden32, U128, U128, U128 ) +OPCODE(VectorMultiplyUnsignedWiden8, U128, U128, U128 ) +OPCODE(VectorMultiplyUnsignedWiden16, U128, U128, U128 ) +OPCODE(VectorMultiplyUnsignedWiden32, U128, U128, U128 ) +OPCODE(VectorNarrow16, U128, U128 ) +OPCODE(VectorNarrow32, U128, U128 ) +OPCODE(VectorNarrow64, U128, U128 ) +OPCODE(VectorNot, U128, U128 ) +OPCODE(VectorOr, U128, U128, U128 ) +OPCODE(VectorPairedAddLower8, U128, U128, U128 ) +OPCODE(VectorPairedAddLower16, U128, U128, U128 ) +OPCODE(VectorPairedAddLower32, U128, U128, U128 ) +OPCODE(VectorPairedAddSignedWiden8, U128, U128 ) +OPCODE(VectorPairedAddSignedWiden16, U128, U128 ) +OPCODE(VectorPairedAddSignedWiden32, U128, U128 ) +OPCODE(VectorPairedAddUnsignedWiden8, U128, U128 ) +OPCODE(VectorPairedAddUnsignedWiden16, U128, U128 ) +OPCODE(VectorPairedAddUnsignedWiden32, U128, U128 ) +OPCODE(VectorPairedAdd8, U128, U128, U128 ) +OPCODE(VectorPairedAdd16, U128, U128, U128 ) +OPCODE(VectorPairedAdd32, U128, U128, U128 ) +OPCODE(VectorPairedAdd64, U128, U128, U128 ) +OPCODE(VectorPairedMaxS8, U128, U128, U128 ) +OPCODE(VectorPairedMaxS16, U128, U128, U128 ) +OPCODE(VectorPairedMaxS32, U128, U128, U128 ) +OPCODE(VectorPairedMaxU8, U128, U128, U128 ) +OPCODE(VectorPairedMaxU16, U128, U128, U128 ) +OPCODE(VectorPairedMaxU32, U128, U128, U128 ) +OPCODE(VectorPairedMinS8, U128, U128, U128 ) 
+OPCODE(VectorPairedMinS16, U128, U128, U128 ) +OPCODE(VectorPairedMinS32, U128, U128, U128 ) +OPCODE(VectorPairedMinU8, U128, U128, U128 ) +OPCODE(VectorPairedMinU16, U128, U128, U128 ) +OPCODE(VectorPairedMinU32, U128, U128, U128 ) +OPCODE(VectorPairedMaxLowerS8, U128, U128, U128 ) +OPCODE(VectorPairedMaxLowerS16, U128, U128, U128 ) +OPCODE(VectorPairedMaxLowerS32, U128, U128, U128 ) +OPCODE(VectorPairedMaxLowerU8, U128, U128, U128 ) +OPCODE(VectorPairedMaxLowerU16, U128, U128, U128 ) +OPCODE(VectorPairedMaxLowerU32, U128, U128, U128 ) +OPCODE(VectorPairedMinLowerS8, U128, U128, U128 ) +OPCODE(VectorPairedMinLowerS16, U128, U128, U128 ) +OPCODE(VectorPairedMinLowerS32, U128, U128, U128 ) +OPCODE(VectorPairedMinLowerU8, U128, U128, U128 ) +OPCODE(VectorPairedMinLowerU16, U128, U128, U128 ) +OPCODE(VectorPairedMinLowerU32, U128, U128, U128 ) +OPCODE(VectorPolynomialMultiply8, U128, U128, U128 ) +OPCODE(VectorPolynomialMultiplyLong8, U128, U128, U128 ) +OPCODE(VectorPolynomialMultiplyLong64, U128, U128, U128 ) +OPCODE(VectorPopulationCount, U128, U128 ) +OPCODE(VectorReverseBits, U128, U128 ) +OPCODE(VectorReverseElementsInHalfGroups8, U128, U128 ) +OPCODE(VectorReverseElementsInWordGroups8, U128, U128 ) +OPCODE(VectorReverseElementsInWordGroups16, U128, U128 ) +OPCODE(VectorReverseElementsInLongGroups8, U128, U128 ) +OPCODE(VectorReverseElementsInLongGroups16, U128, U128 ) +OPCODE(VectorReverseElementsInLongGroups32, U128, U128 ) +OPCODE(VectorReduceAdd8, U128, U128 ) +OPCODE(VectorReduceAdd16, U128, U128 ) +OPCODE(VectorReduceAdd32, U128, U128 ) +OPCODE(VectorReduceAdd64, U128, U128 ) +OPCODE(VectorRotateWholeVectorRight, U128, U128, U8 ) +OPCODE(VectorRoundingHalvingAddS8, U128, U128, U128 ) +OPCODE(VectorRoundingHalvingAddS16, U128, U128, U128 ) +OPCODE(VectorRoundingHalvingAddS32, U128, U128, U128 ) +OPCODE(VectorRoundingHalvingAddU8, U128, U128, U128 ) +OPCODE(VectorRoundingHalvingAddU16, U128, U128, U128 ) +OPCODE(VectorRoundingHalvingAddU32, U128, U128, U128 ) +OPCODE(VectorRoundingShiftLeftS8, U128, U128, U128 ) +OPCODE(VectorRoundingShiftLeftS16, U128, U128, U128 ) +OPCODE(VectorRoundingShiftLeftS32, U128, U128, U128 ) +OPCODE(VectorRoundingShiftLeftS64, U128, U128, U128 ) +OPCODE(VectorRoundingShiftLeftU8, U128, U128, U128 ) +OPCODE(VectorRoundingShiftLeftU16, U128, U128, U128 ) +OPCODE(VectorRoundingShiftLeftU32, U128, U128, U128 ) +OPCODE(VectorRoundingShiftLeftU64, U128, U128, U128 ) +OPCODE(VectorSignExtend8, U128, U128 ) +OPCODE(VectorSignExtend16, U128, U128 ) +OPCODE(VectorSignExtend32, U128, U128 ) +OPCODE(VectorSignExtend64, U128, U128 ) +OPCODE(VectorSignedAbsoluteDifference8, U128, U128, U128 ) +OPCODE(VectorSignedAbsoluteDifference16, U128, U128, U128 ) +OPCODE(VectorSignedAbsoluteDifference32, U128, U128, U128 ) +OPCODE(VectorSignedMultiply16, Void, U128, U128 ) +OPCODE(VectorSignedMultiply32, Void, U128, U128 ) +OPCODE(VectorSignedSaturatedAbs8, U128, U128 ) +OPCODE(VectorSignedSaturatedAbs16, U128, U128 ) +OPCODE(VectorSignedSaturatedAbs32, U128, U128 ) +OPCODE(VectorSignedSaturatedAbs64, U128, U128 ) +OPCODE(VectorSignedSaturatedAccumulateUnsigned8, U128, U128, U128 ) +OPCODE(VectorSignedSaturatedAccumulateUnsigned16, U128, U128, U128 ) +OPCODE(VectorSignedSaturatedAccumulateUnsigned32, U128, U128, U128 ) +OPCODE(VectorSignedSaturatedAccumulateUnsigned64, U128, U128, U128 ) +OPCODE(VectorSignedSaturatedDoublingMultiplyHigh16, U128, U128, U128 ) +OPCODE(VectorSignedSaturatedDoublingMultiplyHigh32, U128, U128, U128 ) 
+OPCODE(VectorSignedSaturatedDoublingMultiplyHighRounding16, U128, U128, U128 ) +OPCODE(VectorSignedSaturatedDoublingMultiplyHighRounding32, U128, U128, U128 ) +OPCODE(VectorSignedSaturatedDoublingMultiplyLong16, U128, U128, U128 ) +OPCODE(VectorSignedSaturatedDoublingMultiplyLong32, U128, U128, U128 ) +OPCODE(VectorSignedSaturatedNarrowToSigned16, U128, U128 ) +OPCODE(VectorSignedSaturatedNarrowToSigned32, U128, U128 ) +OPCODE(VectorSignedSaturatedNarrowToSigned64, U128, U128 ) +OPCODE(VectorSignedSaturatedNarrowToUnsigned16, U128, U128 ) +OPCODE(VectorSignedSaturatedNarrowToUnsigned32, U128, U128 ) +OPCODE(VectorSignedSaturatedNarrowToUnsigned64, U128, U128 ) +OPCODE(VectorSignedSaturatedNeg8, U128, U128 ) +OPCODE(VectorSignedSaturatedNeg16, U128, U128 ) +OPCODE(VectorSignedSaturatedNeg32, U128, U128 ) +OPCODE(VectorSignedSaturatedNeg64, U128, U128 ) +OPCODE(VectorSignedSaturatedShiftLeft8, U128, U128, U128 ) +OPCODE(VectorSignedSaturatedShiftLeft16, U128, U128, U128 ) +OPCODE(VectorSignedSaturatedShiftLeft32, U128, U128, U128 ) +OPCODE(VectorSignedSaturatedShiftLeft64, U128, U128, U128 ) +OPCODE(VectorSignedSaturatedShiftLeftUnsigned8, U128, U128, U8 ) +OPCODE(VectorSignedSaturatedShiftLeftUnsigned16, U128, U128, U8 ) +OPCODE(VectorSignedSaturatedShiftLeftUnsigned32, U128, U128, U8 ) +OPCODE(VectorSignedSaturatedShiftLeftUnsigned64, U128, U128, U8 ) +OPCODE(VectorSub8, U128, U128, U128 ) +OPCODE(VectorSub16, U128, U128, U128 ) +OPCODE(VectorSub32, U128, U128, U128 ) +OPCODE(VectorSub64, U128, U128, U128 ) +OPCODE(VectorTable, Table, Opaque, Opaque, Opaque, Opaque ) +OPCODE(VectorTableLookup64, U64, U64, Table, U64 ) +OPCODE(VectorTableLookup128, U128, U128, Table, U128 ) +OPCODE(VectorTranspose8, U128, U128, U128, U1 ) +OPCODE(VectorTranspose16, U128, U128, U128, U1 ) +OPCODE(VectorTranspose32, U128, U128, U128, U1 ) +OPCODE(VectorTranspose64, U128, U128, U128, U1 ) +OPCODE(VectorUnsignedAbsoluteDifference8, U128, U128, U128 ) +OPCODE(VectorUnsignedAbsoluteDifference16, U128, U128, U128 ) +OPCODE(VectorUnsignedAbsoluteDifference32, U128, U128, U128 ) +OPCODE(VectorUnsignedMultiply16, Void, U128, U128 ) +OPCODE(VectorUnsignedMultiply32, Void, U128, U128 ) +OPCODE(VectorUnsignedRecipEstimate, U128, U128 ) +OPCODE(VectorUnsignedRecipSqrtEstimate, U128, U128 ) +OPCODE(VectorUnsignedSaturatedAccumulateSigned8, U128, U128, U128 ) +OPCODE(VectorUnsignedSaturatedAccumulateSigned16, U128, U128, U128 ) +OPCODE(VectorUnsignedSaturatedAccumulateSigned32, U128, U128, U128 ) +OPCODE(VectorUnsignedSaturatedAccumulateSigned64, U128, U128, U128 ) +OPCODE(VectorUnsignedSaturatedNarrow16, U128, U128 ) +OPCODE(VectorUnsignedSaturatedNarrow32, U128, U128 ) +OPCODE(VectorUnsignedSaturatedNarrow64, U128, U128 ) +OPCODE(VectorUnsignedSaturatedShiftLeft8, U128, U128, U128 ) +OPCODE(VectorUnsignedSaturatedShiftLeft16, U128, U128, U128 ) +OPCODE(VectorUnsignedSaturatedShiftLeft32, U128, U128, U128 ) +OPCODE(VectorUnsignedSaturatedShiftLeft64, U128, U128, U128 ) +OPCODE(VectorZeroExtend8, U128, U128 ) +OPCODE(VectorZeroExtend16, U128, U128 ) +OPCODE(VectorZeroExtend32, U128, U128 ) +OPCODE(VectorZeroExtend64, U128, U128 ) +OPCODE(VectorZeroUpper, U128, U128 ) +OPCODE(ZeroVector, U128, ) + +// Floating-point operations +OPCODE(FPAbs16, U16, U16 ) +OPCODE(FPAbs32, U32, U32 ) +OPCODE(FPAbs64, U64, U64 ) +OPCODE(FPAdd32, U32, U32, U32 ) +OPCODE(FPAdd64, U64, U64, U64 ) +OPCODE(FPCompare32, NZCV, U32, U32, U1 ) +OPCODE(FPCompare64, NZCV, U64, U64, U1 ) +OPCODE(FPDiv32, U32, U32, U32 ) +OPCODE(FPDiv64, U64, U64, U64 ) 
+OPCODE(FPMax32, U32, U32, U32 ) +OPCODE(FPMax64, U64, U64, U64 ) +OPCODE(FPMaxNumeric32, U32, U32, U32 ) +OPCODE(FPMaxNumeric64, U64, U64, U64 ) +OPCODE(FPMin32, U32, U32, U32 ) +OPCODE(FPMin64, U64, U64, U64 ) +OPCODE(FPMinNumeric32, U32, U32, U32 ) +OPCODE(FPMinNumeric64, U64, U64, U64 ) +OPCODE(FPMul32, U32, U32, U32 ) +OPCODE(FPMul64, U64, U64, U64 ) +OPCODE(FPMulAdd16, U16, U16, U16, U16 ) +OPCODE(FPMulAdd32, U32, U32, U32, U32 ) +OPCODE(FPMulAdd64, U64, U64, U64, U64 ) +OPCODE(FPMulSub16, U16, U16, U16, U16 ) +OPCODE(FPMulSub32, U32, U32, U32, U32 ) +OPCODE(FPMulSub64, U64, U64, U64, U64 ) +OPCODE(FPMulX32, U32, U32, U32 ) +OPCODE(FPMulX64, U64, U64, U64 ) +OPCODE(FPNeg16, U16, U16 ) +OPCODE(FPNeg32, U32, U32 ) +OPCODE(FPNeg64, U64, U64 ) +OPCODE(FPRecipEstimate16, U16, U16 ) +OPCODE(FPRecipEstimate32, U32, U32 ) +OPCODE(FPRecipEstimate64, U64, U64 ) +OPCODE(FPRecipExponent16, U16, U16 ) +OPCODE(FPRecipExponent32, U32, U32 ) +OPCODE(FPRecipExponent64, U64, U64 ) +OPCODE(FPRecipStepFused16, U16, U16, U16 ) +OPCODE(FPRecipStepFused32, U32, U32, U32 ) +OPCODE(FPRecipStepFused64, U64, U64, U64 ) +OPCODE(FPRoundInt16, U16, U16, U8, U1 ) +OPCODE(FPRoundInt32, U32, U32, U8, U1 ) +OPCODE(FPRoundInt64, U64, U64, U8, U1 ) +OPCODE(FPRSqrtEstimate16, U16, U16 ) +OPCODE(FPRSqrtEstimate32, U32, U32 ) +OPCODE(FPRSqrtEstimate64, U64, U64 ) +OPCODE(FPRSqrtStepFused16, U16, U16, U16 ) +OPCODE(FPRSqrtStepFused32, U32, U32, U32 ) +OPCODE(FPRSqrtStepFused64, U64, U64, U64 ) +OPCODE(FPSqrt32, U32, U32 ) +OPCODE(FPSqrt64, U64, U64 ) +OPCODE(FPSub32, U32, U32, U32 ) +OPCODE(FPSub64, U64, U64, U64 ) + +// Floating-point conversions +OPCODE(FPHalfToDouble, U64, U16, U8 ) +OPCODE(FPHalfToSingle, U32, U16, U8 ) +OPCODE(FPSingleToDouble, U64, U32, U8 ) +OPCODE(FPSingleToHalf, U16, U32, U8 ) +OPCODE(FPDoubleToHalf, U16, U64, U8 ) +OPCODE(FPDoubleToSingle, U32, U64, U8 ) +OPCODE(FPDoubleToFixedS16, U16, U64, U8, U8 ) +OPCODE(FPDoubleToFixedS32, U32, U64, U8, U8 ) +OPCODE(FPDoubleToFixedS64, U64, U64, U8, U8 ) +OPCODE(FPDoubleToFixedU16, U16, U64, U8, U8 ) +OPCODE(FPDoubleToFixedU32, U32, U64, U8, U8 ) +OPCODE(FPDoubleToFixedU64, U64, U64, U8, U8 ) +OPCODE(FPHalfToFixedS16, U16, U16, U8, U8 ) +OPCODE(FPHalfToFixedS32, U32, U16, U8, U8 ) +OPCODE(FPHalfToFixedS64, U64, U16, U8, U8 ) +OPCODE(FPHalfToFixedU16, U16, U16, U8, U8 ) +OPCODE(FPHalfToFixedU32, U32, U16, U8, U8 ) +OPCODE(FPHalfToFixedU64, U64, U16, U8, U8 ) +OPCODE(FPSingleToFixedS16, U16, U32, U8, U8 ) +OPCODE(FPSingleToFixedS32, U32, U32, U8, U8 ) +OPCODE(FPSingleToFixedS64, U64, U32, U8, U8 ) +OPCODE(FPSingleToFixedU16, U16, U32, U8, U8 ) +OPCODE(FPSingleToFixedU32, U32, U32, U8, U8 ) +OPCODE(FPSingleToFixedU64, U64, U32, U8, U8 ) +OPCODE(FPFixedU16ToSingle, U32, U16, U8, U8 ) +OPCODE(FPFixedS16ToSingle, U32, U16, U8, U8 ) +OPCODE(FPFixedU16ToDouble, U64, U16, U8, U8 ) +OPCODE(FPFixedS16ToDouble, U64, U16, U8, U8 ) +OPCODE(FPFixedU32ToSingle, U32, U32, U8, U8 ) +OPCODE(FPFixedS32ToSingle, U32, U32, U8, U8 ) +OPCODE(FPFixedU32ToDouble, U64, U32, U8, U8 ) +OPCODE(FPFixedS32ToDouble, U64, U32, U8, U8 ) +OPCODE(FPFixedU64ToDouble, U64, U64, U8, U8 ) +OPCODE(FPFixedU64ToSingle, U32, U64, U8, U8 ) +OPCODE(FPFixedS64ToDouble, U64, U64, U8, U8 ) +OPCODE(FPFixedS64ToSingle, U32, U64, U8, U8 ) + +// Floating-point vector instructions +OPCODE(FPVectorAbs16, U128, U128 ) +OPCODE(FPVectorAbs32, U128, U128 ) +OPCODE(FPVectorAbs64, U128, U128 ) +OPCODE(FPVectorAdd32, U128, U128, U128, U1 ) +OPCODE(FPVectorAdd64, U128, U128, U128, U1 ) +OPCODE(FPVectorDiv32, U128, U128, 
U128, U1 ) +OPCODE(FPVectorDiv64, U128, U128, U128, U1 ) +OPCODE(FPVectorEqual16, U128, U128, U128, U1 ) +OPCODE(FPVectorEqual32, U128, U128, U128, U1 ) +OPCODE(FPVectorEqual64, U128, U128, U128, U1 ) +OPCODE(FPVectorFromHalf32, U128, U128, U8, U1 ) +OPCODE(FPVectorFromSignedFixed32, U128, U128, U8, U8, U1 ) +OPCODE(FPVectorFromSignedFixed64, U128, U128, U8, U8, U1 ) +OPCODE(FPVectorFromUnsignedFixed32, U128, U128, U8, U8, U1 ) +OPCODE(FPVectorFromUnsignedFixed64, U128, U128, U8, U8, U1 ) +OPCODE(FPVectorGreater32, U128, U128, U128, U1 ) +OPCODE(FPVectorGreater64, U128, U128, U128, U1 ) +OPCODE(FPVectorGreaterEqual32, U128, U128, U128, U1 ) +OPCODE(FPVectorGreaterEqual64, U128, U128, U128, U1 ) +OPCODE(FPVectorMax32, U128, U128, U128, U1 ) +OPCODE(FPVectorMax64, U128, U128, U128, U1 ) +OPCODE(FPVectorMaxNumeric32, U128, U128, U128, U1 ) +OPCODE(FPVectorMaxNumeric64, U128, U128, U128, U1 ) +OPCODE(FPVectorMin32, U128, U128, U128, U1 ) +OPCODE(FPVectorMin64, U128, U128, U128, U1 ) +OPCODE(FPVectorMinNumeric32, U128, U128, U128, U1 ) +OPCODE(FPVectorMinNumeric64, U128, U128, U128, U1 ) +OPCODE(FPVectorMul32, U128, U128, U128, U1 ) +OPCODE(FPVectorMul64, U128, U128, U128, U1 ) +OPCODE(FPVectorMulAdd16, U128, U128, U128, U128, U1 ) +OPCODE(FPVectorMulAdd32, U128, U128, U128, U128, U1 ) +OPCODE(FPVectorMulAdd64, U128, U128, U128, U128, U1 ) +OPCODE(FPVectorMulX32, U128, U128, U128, U1 ) +OPCODE(FPVectorMulX64, U128, U128, U128, U1 ) +OPCODE(FPVectorNeg16, U128, U128 ) +OPCODE(FPVectorNeg32, U128, U128 ) +OPCODE(FPVectorNeg64, U128, U128 ) +OPCODE(FPVectorPairedAdd32, U128, U128, U128, U1 ) +OPCODE(FPVectorPairedAdd64, U128, U128, U128, U1 ) +OPCODE(FPVectorPairedAddLower32, U128, U128, U128, U1 ) +OPCODE(FPVectorPairedAddLower64, U128, U128, U128, U1 ) +OPCODE(FPVectorRecipEstimate16, U128, U128, U1 ) +OPCODE(FPVectorRecipEstimate32, U128, U128, U1 ) +OPCODE(FPVectorRecipEstimate64, U128, U128, U1 ) +OPCODE(FPVectorRecipStepFused16, U128, U128, U128, U1 ) +OPCODE(FPVectorRecipStepFused32, U128, U128, U128, U1 ) +OPCODE(FPVectorRecipStepFused64, U128, U128, U128, U1 ) +OPCODE(FPVectorRoundInt16, U128, U128, U8, U1, U1 ) +OPCODE(FPVectorRoundInt32, U128, U128, U8, U1, U1 ) +OPCODE(FPVectorRoundInt64, U128, U128, U8, U1, U1 ) +OPCODE(FPVectorRSqrtEstimate16, U128, U128, U1 ) +OPCODE(FPVectorRSqrtEstimate32, U128, U128, U1 ) +OPCODE(FPVectorRSqrtEstimate64, U128, U128, U1 ) +OPCODE(FPVectorRSqrtStepFused16, U128, U128, U128, U1 ) +OPCODE(FPVectorRSqrtStepFused32, U128, U128, U128, U1 ) +OPCODE(FPVectorRSqrtStepFused64, U128, U128, U128, U1 ) +OPCODE(FPVectorSqrt32, U128, U128, U1 ) +OPCODE(FPVectorSqrt64, U128, U128, U1 ) +OPCODE(FPVectorSub32, U128, U128, U128, U1 ) +OPCODE(FPVectorSub64, U128, U128, U128, U1 ) +OPCODE(FPVectorToHalf32, U128, U128, U8, U1 ) +OPCODE(FPVectorToSignedFixed16, U128, U128, U8, U8, U1 ) +OPCODE(FPVectorToSignedFixed32, U128, U128, U8, U8, U1 ) +OPCODE(FPVectorToSignedFixed64, U128, U128, U8, U8, U1 ) +OPCODE(FPVectorToUnsignedFixed16, U128, U128, U8, U8, U1 ) +OPCODE(FPVectorToUnsignedFixed32, U128, U128, U8, U8, U1 ) +OPCODE(FPVectorToUnsignedFixed64, U128, U128, U8, U8, U1 ) + +// A32 Memory access +A32OPC(ClearExclusive, Void, ) +A32OPC(ReadMemory8, U8, U64, U32, AccType ) +A32OPC(ReadMemory16, U16, U64, U32, AccType ) +A32OPC(ReadMemory32, U32, U64, U32, AccType ) +A32OPC(ReadMemory64, U64, U64, U32, AccType ) +A32OPC(ExclusiveReadMemory8, U8, U64, U32, AccType ) +A32OPC(ExclusiveReadMemory16, U16, U64, U32, AccType ) +A32OPC(ExclusiveReadMemory32, U32, U64, U32, 
AccType ) +A32OPC(ExclusiveReadMemory64, U64, U64, U32, AccType ) +A32OPC(WriteMemory8, Void, U64, U32, U8, AccType ) +A32OPC(WriteMemory16, Void, U64, U32, U16, AccType ) +A32OPC(WriteMemory32, Void, U64, U32, U32, AccType ) +A32OPC(WriteMemory64, Void, U64, U32, U64, AccType ) +A32OPC(ExclusiveWriteMemory8, U32, U64, U32, U8, AccType ) +A32OPC(ExclusiveWriteMemory16, U32, U64, U32, U16, AccType ) +A32OPC(ExclusiveWriteMemory32, U32, U64, U32, U32, AccType ) +A32OPC(ExclusiveWriteMemory64, U32, U64, U32, U64, AccType ) + +// A64 Memory access +A64OPC(ClearExclusive, Void, ) +A64OPC(ReadMemory8, U8, U64, U64, AccType ) +A64OPC(ReadMemory16, U16, U64, U64, AccType ) +A64OPC(ReadMemory32, U32, U64, U64, AccType ) +A64OPC(ReadMemory64, U64, U64, U64, AccType ) +A64OPC(ReadMemory128, U128, U64, U64, AccType ) +A64OPC(ExclusiveReadMemory8, U8, U64, U64, AccType ) +A64OPC(ExclusiveReadMemory16, U16, U64, U64, AccType ) +A64OPC(ExclusiveReadMemory32, U32, U64, U64, AccType ) +A64OPC(ExclusiveReadMemory64, U64, U64, U64, AccType ) +A64OPC(ExclusiveReadMemory128, U128, U64, U64, AccType ) +A64OPC(WriteMemory8, Void, U64, U64, U8, AccType ) +A64OPC(WriteMemory16, Void, U64, U64, U16, AccType ) +A64OPC(WriteMemory32, Void, U64, U64, U32, AccType ) +A64OPC(WriteMemory64, Void, U64, U64, U64, AccType ) +A64OPC(WriteMemory128, Void, U64, U64, U128, AccType ) +A64OPC(ExclusiveWriteMemory8, U32, U64, U64, U8, AccType ) +A64OPC(ExclusiveWriteMemory16, U32, U64, U64, U16, AccType ) +A64OPC(ExclusiveWriteMemory32, U32, U64, U64, U32, AccType ) +A64OPC(ExclusiveWriteMemory64, U32, U64, U64, U64, AccType ) +A64OPC(ExclusiveWriteMemory128, U32, U64, U64, U128, AccType ) + +// Coprocessor +A32OPC(CoprocInternalOperation, Void, CoprocInfo ) +A32OPC(CoprocSendOneWord, Void, CoprocInfo, U32 ) +A32OPC(CoprocSendTwoWords, Void, CoprocInfo, U32, U32 ) +A32OPC(CoprocGetOneWord, U32, CoprocInfo ) +A32OPC(CoprocGetTwoWords, U64, CoprocInfo ) +A32OPC(CoprocLoadWords, Void, CoprocInfo, U32 ) +A32OPC(CoprocStoreWords, Void, CoprocInfo, U32 ) + +// clang-format on diff --git a/externals/dynarmic/src/dynarmic/ir/opt/a32_constant_memory_reads_pass.cpp b/externals/dynarmic/src/dynarmic/ir/opt/a32_constant_memory_reads_pass.cpp new file mode 100644 index 0000000000..9699f18345 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/opt/a32_constant_memory_reads_pass.cpp @@ -0,0 +1,70 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/interface/A32/config.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/opcodes.h" +#include "dynarmic/ir/opt/passes.h" + +namespace Dynarmic::Optimization { + +void A32ConstantMemoryReads(IR::Block& block, A32::UserCallbacks* cb) { + for (auto& inst : block) { + switch (inst.GetOpcode()) { + case IR::Opcode::A32ReadMemory8: { + if (!inst.AreAllArgsImmediates()) { + break; + } + + const u32 vaddr = inst.GetArg(1).GetU32(); + if (cb->IsReadOnlyMemory(vaddr)) { + const u8 value_from_memory = cb->MemoryRead8(vaddr); + inst.ReplaceUsesWith(IR::Value{value_from_memory}); + } + break; + } + case IR::Opcode::A32ReadMemory16: { + if (!inst.AreAllArgsImmediates()) { + break; + } + + const u32 vaddr = inst.GetArg(1).GetU32(); + if (cb->IsReadOnlyMemory(vaddr)) { + const u16 value_from_memory = cb->MemoryRead16(vaddr); + inst.ReplaceUsesWith(IR::Value{value_from_memory}); + } + break; + } + case IR::Opcode::A32ReadMemory32: { + if (!inst.AreAllArgsImmediates()) { + break; + } + + const u32 vaddr = inst.GetArg(1).GetU32(); + if (cb->IsReadOnlyMemory(vaddr)) { + const u32 value_from_memory = cb->MemoryRead32(vaddr); + inst.ReplaceUsesWith(IR::Value{value_from_memory}); + } + break; + } + case IR::Opcode::A32ReadMemory64: { + if (!inst.AreAllArgsImmediates()) { + break; + } + + const u32 vaddr = inst.GetArg(1).GetU32(); + if (cb->IsReadOnlyMemory(vaddr)) { + const u64 value_from_memory = cb->MemoryRead64(vaddr); + inst.ReplaceUsesWith(IR::Value{value_from_memory}); + } + break; + } + default: + break; + } + } +} + +} // namespace Dynarmic::Optimization diff --git a/externals/dynarmic/src/dynarmic/ir/opt/a32_get_set_elimination_pass.cpp b/externals/dynarmic/src/dynarmic/ir/opt/a32_get_set_elimination_pass.cpp new file mode 100644 index 0000000000..4b743bff0b --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/opt/a32_get_set_elimination_pass.cpp @@ -0,0 +1,377 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <algorithm> +#include <array> +#include <functional> + +#include <mcl/assert.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/frontend/A32/a32_ir_emitter.h" +#include "dynarmic/frontend/A32/a32_types.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/opcodes.h" +#include "dynarmic/ir/opt/passes.h" +#include "dynarmic/ir/value.h" + +namespace Dynarmic::Optimization { + +namespace { + +void FlagsPass(IR::Block& block) { + using Iterator = std::reverse_iterator<IR::Block::iterator>; + + struct FlagInfo { + bool set_not_required = false; + bool has_value_request = false; + Iterator value_request = {}; + }; + struct ValuelessFlagInfo { + bool set_not_required = false; + }; + ValuelessFlagInfo nzcvq; + ValuelessFlagInfo nzcv; + ValuelessFlagInfo nz; + FlagInfo c_flag; + FlagInfo ge; + + auto do_set = [&](FlagInfo& info, IR::Value value, Iterator inst) { + if (info.has_value_request) { + info.value_request->ReplaceUsesWith(value); + } + info.has_value_request = false; + + if (info.set_not_required) { + inst->Invalidate(); + } + info.set_not_required = true; + }; + + auto do_set_valueless = [&](ValuelessFlagInfo& info, Iterator inst) { + if (info.set_not_required) { + inst->Invalidate(); + } + info.set_not_required = true; + }; + + auto do_get = [](FlagInfo& info, Iterator inst) { + if (info.has_value_request) { + info.value_request->ReplaceUsesWith(IR::Value{&*inst}); + } + info.has_value_request = true; + info.value_request = inst; + }; + + A32::IREmitter ir{block, A32::LocationDescriptor{block.Location()}, {}}; + + for (auto inst = block.rbegin(); inst != block.rend(); ++inst) { + switch (inst->GetOpcode()) { + case IR::Opcode::A32GetCFlag: { + do_get(c_flag, inst); + break; + } + case IR::Opcode::A32SetCpsrNZCV: { + if (c_flag.has_value_request) { + ir.SetInsertionPointBefore(inst.base()); // base is one ahead + IR::U1 c = ir.GetCFlagFromNZCV(IR::NZCV{inst->GetArg(0)}); + c_flag.value_request->ReplaceUsesWith(c); + c_flag.has_value_request = false; + break; // This case will be executed again because of the above + } + + do_set_valueless(nzcv, inst); + + nz = {.set_not_required = true}; + c_flag = {.set_not_required = true}; + break; + } + case IR::Opcode::A32SetCpsrNZCVRaw: { + if (c_flag.has_value_request) { + nzcv.set_not_required = false; + } + + do_set_valueless(nzcv, inst); + + nzcvq = {}; + nz = {.set_not_required = true}; + c_flag = {.set_not_required = true}; + break; + } + case IR::Opcode::A32SetCpsrNZCVQ: { + if (c_flag.has_value_request) { + nzcvq.set_not_required = false; + } + + do_set_valueless(nzcvq, inst); + + nzcv = {.set_not_required = true}; + nz = {.set_not_required = true}; + c_flag = {.set_not_required = true}; + break; + } + case IR::Opcode::A32SetCpsrNZ: { + do_set_valueless(nz, inst); + + nzcvq = {}; + nzcv = {}; + break; + } + case IR::Opcode::A32SetCpsrNZC: { + if (c_flag.has_value_request) { + c_flag.value_request->ReplaceUsesWith(inst->GetArg(1)); + c_flag.has_value_request = false; + } + + if (!inst->GetArg(1).IsImmediate() && inst->GetArg(1).GetInstRecursive()->GetOpcode() == IR::Opcode::A32GetCFlag) { + const auto nz_value = inst->GetArg(0); + + inst->Invalidate(); + + ir.SetInsertionPointBefore(inst.base()); + ir.SetCpsrNZ(IR::NZCV{nz_value}); + + nzcvq = {}; + nzcv = {}; + nz = {.set_not_required = true}; + break; + } + + if (nz.set_not_required && c_flag.set_not_required) { + inst->Invalidate(); + } else if (nz.set_not_required) { + inst->SetArg(0, 
IR::Value::EmptyNZCVImmediateMarker()); + } + nz.set_not_required = true; + c_flag.set_not_required = true; + + nzcv = {}; + nzcvq = {}; + break; + } + case IR::Opcode::A32SetGEFlags: { + do_set(ge, inst->GetArg(0), inst); + break; + } + case IR::Opcode::A32GetGEFlags: { + do_get(ge, inst); + break; + } + case IR::Opcode::A32SetGEFlagsCompressed: { + ge = {.set_not_required = true}; + break; + } + case IR::Opcode::A32OrQFlag: { + break; + } + default: { + if (inst->ReadsFromCPSR() || inst->WritesToCPSR()) { + nzcvq = {}; + nzcv = {}; + nz = {}; + c_flag = {}; + ge = {}; + } + break; + } + } + } +} + +void RegisterPass(IR::Block& block) { + using Iterator = IR::Block::iterator; + + struct RegInfo { + IR::Value register_value; + std::optional<Iterator> last_set_instruction; + }; + std::array<RegInfo, 15> reg_info; + + const auto do_get = [](RegInfo& info, Iterator get_inst) { + if (info.register_value.IsEmpty()) { + info.register_value = IR::Value(&*get_inst); + return; + } + get_inst->ReplaceUsesWith(info.register_value); + }; + + const auto do_set = [](RegInfo& info, IR::Value value, Iterator set_inst) { + if (info.last_set_instruction) { + (*info.last_set_instruction)->Invalidate(); + } + info = { + .register_value = value, + .last_set_instruction = set_inst, + }; + }; + + enum class ExtValueType { + Empty, + Single, + Double, + VectorDouble, + VectorQuad, + }; + struct ExtRegInfo { + ExtValueType value_type = {}; + IR::Value register_value; + std::optional<Iterator> last_set_instruction; + }; + std::array<ExtRegInfo, 64> ext_reg_info; + + const auto do_ext_get = [](ExtValueType type, std::initializer_list<std::reference_wrapper<ExtRegInfo>> infos, Iterator get_inst) { + if (!std::all_of(infos.begin(), infos.end(), [type](const auto& info) { return info.get().value_type == type; })) { + for (auto& info : infos) { + info.get() = { + .value_type = type, + .register_value = IR::Value(&*get_inst), + .last_set_instruction = std::nullopt, + }; + } + return; + } + get_inst->ReplaceUsesWith(std::data(infos)[0].get().register_value); + }; + + const auto do_ext_set = [](ExtValueType type, std::initializer_list<std::reference_wrapper<ExtRegInfo>> infos, IR::Value value, Iterator set_inst) { + if (std::all_of(infos.begin(), infos.end(), [type](const auto& info) { return info.get().value_type == type; })) { + if (std::data(infos)[0].get().last_set_instruction) { + (*std::data(infos)[0].get().last_set_instruction)->Invalidate(); + } + } + for (auto& info : infos) { + info.get() = { + .value_type = type, + .register_value = value, + .last_set_instruction = set_inst, + }; + } + }; + + // Location and version don't matter here. 
+ A32::IREmitter ir{block, A32::LocationDescriptor{block.Location()}, {}}; + + for (auto inst = block.begin(); inst != block.end(); ++inst) { + switch (inst->GetOpcode()) { + case IR::Opcode::A32GetRegister: { + const A32::Reg reg = inst->GetArg(0).GetA32RegRef(); + ASSERT(reg != A32::Reg::PC); + const size_t reg_index = static_cast<size_t>(reg); + do_get(reg_info[reg_index], inst); + break; + } + case IR::Opcode::A32SetRegister: { + const A32::Reg reg = inst->GetArg(0).GetA32RegRef(); + if (reg == A32::Reg::PC) { + break; + } + const auto reg_index = static_cast<size_t>(reg); + do_set(reg_info[reg_index], inst->GetArg(1), inst); + break; + } + case IR::Opcode::A32GetExtendedRegister32: { + const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + const size_t reg_index = A32::RegNumber(reg); + do_ext_get(ExtValueType::Single, {ext_reg_info[reg_index]}, inst); + break; + } + case IR::Opcode::A32SetExtendedRegister32: { + const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + const size_t reg_index = A32::RegNumber(reg); + do_ext_set(ExtValueType::Single, {ext_reg_info[reg_index]}, inst->GetArg(1), inst); + break; + } + case IR::Opcode::A32GetExtendedRegister64: { + const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + const size_t reg_index = A32::RegNumber(reg); + do_ext_get(ExtValueType::Double, + { + ext_reg_info[reg_index * 2 + 0], + ext_reg_info[reg_index * 2 + 1], + }, + inst); + break; + } + case IR::Opcode::A32SetExtendedRegister64: { + const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + const size_t reg_index = A32::RegNumber(reg); + do_ext_set(ExtValueType::Double, + { + ext_reg_info[reg_index * 2 + 0], + ext_reg_info[reg_index * 2 + 1], + }, + inst->GetArg(1), + inst); + break; + } + case IR::Opcode::A32GetVector: { + const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + const size_t reg_index = A32::RegNumber(reg); + if (A32::IsDoubleExtReg(reg)) { + do_ext_get(ExtValueType::VectorDouble, + { + ext_reg_info[reg_index * 2 + 0], + ext_reg_info[reg_index * 2 + 1], + }, + inst); + } else { + DEBUG_ASSERT(A32::IsQuadExtReg(reg)); + do_ext_get(ExtValueType::VectorQuad, + { + ext_reg_info[reg_index * 4 + 0], + ext_reg_info[reg_index * 4 + 1], + ext_reg_info[reg_index * 4 + 2], + ext_reg_info[reg_index * 4 + 3], + }, + inst); + } + break; + } + case IR::Opcode::A32SetVector: { + const A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + const size_t reg_index = A32::RegNumber(reg); + if (A32::IsDoubleExtReg(reg)) { + ir.SetInsertionPointAfter(inst); + const IR::U128 stored_value = ir.VectorZeroUpper(IR::U128{inst->GetArg(1)}); + do_ext_set(ExtValueType::VectorDouble, + { + ext_reg_info[reg_index * 2 + 0], + ext_reg_info[reg_index * 2 + 1], + }, + stored_value, + inst); + } else { + DEBUG_ASSERT(A32::IsQuadExtReg(reg)); + do_ext_set(ExtValueType::VectorQuad, + { + ext_reg_info[reg_index * 4 + 0], + ext_reg_info[reg_index * 4 + 1], + ext_reg_info[reg_index * 4 + 2], + ext_reg_info[reg_index * 4 + 3], + }, + inst->GetArg(1), + inst); + } + break; + } + default: { + if (inst->ReadsFromCoreRegister() || inst->WritesToCoreRegister()) { + reg_info = {}; + ext_reg_info = {}; + } + break; + } + } + } +} + +} // namespace + +void A32GetSetElimination(IR::Block& block, A32GetSetEliminationOptions) { + FlagsPass(block); + RegisterPass(block); +} + +} // namespace Dynarmic::Optimization diff --git a/externals/dynarmic/src/dynarmic/ir/opt/a64_callback_config_pass.cpp b/externals/dynarmic/src/dynarmic/ir/opt/a64_callback_config_pass.cpp new file mode 100644 index 
0000000000..79d9769520 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/opt/a64_callback_config_pass.cpp @@ -0,0 +1,57 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/frontend/A64/a64_ir_emitter.h" +#include "dynarmic/interface/A64/config.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" +#include "dynarmic/ir/opt/passes.h" + +namespace Dynarmic::Optimization { + +void A64CallbackConfigPass(IR::Block& block, const A64::UserConfig& conf) { + if (conf.hook_data_cache_operations) { + return; + } + + for (auto& inst : block) { + if (inst.GetOpcode() != IR::Opcode::A64DataCacheOperationRaised) { + continue; + } + + const auto op = static_cast<A64::DataCacheOperation>(inst.GetArg(1).GetU64()); + if (op == A64::DataCacheOperation::ZeroByVA) { + A64::IREmitter ir{block}; + ir.current_location = A64::LocationDescriptor{IR::LocationDescriptor{inst.GetArg(0).GetU64()}}; + ir.SetInsertionPointBefore(&inst); + + size_t bytes = 4 << static_cast<size_t>(conf.dczid_el0 & 0b1111); + IR::U64 addr{inst.GetArg(2)}; + + const IR::U128 zero_u128 = ir.ZeroExtendToQuad(ir.Imm64(0)); + while (bytes >= 16) { + ir.WriteMemory128(addr, zero_u128, IR::AccType::DCZVA); + addr = ir.Add(addr, ir.Imm64(16)); + bytes -= 16; + } + + while (bytes >= 8) { + ir.WriteMemory64(addr, ir.Imm64(0), IR::AccType::DCZVA); + addr = ir.Add(addr, ir.Imm64(8)); + bytes -= 8; + } + + while (bytes >= 4) { + ir.WriteMemory32(addr, ir.Imm32(0), IR::AccType::DCZVA); + addr = ir.Add(addr, ir.Imm64(4)); + bytes -= 4; + } + } + inst.Invalidate(); + } +} + +} // namespace Dynarmic::Optimization diff --git a/externals/dynarmic/src/dynarmic/ir/opt/a64_get_set_elimination_pass.cpp b/externals/dynarmic/src/dynarmic/ir/opt/a64_get_set_elimination_pass.cpp new file mode 100644 index 0000000000..f49201562a --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/opt/a64_get_set_elimination_pass.cpp @@ -0,0 +1,161 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <array> + +#include <mcl/stdint.hpp> + +#include "dynarmic/frontend/A64/a64_types.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/opcodes.h" +#include "dynarmic/ir/opt/passes.h" +#include "dynarmic/ir/value.h" + +namespace Dynarmic::Optimization { + +void A64GetSetElimination(IR::Block& block) { + using Iterator = IR::Block::iterator; + + enum class TrackingType { + W, + X, + S, + D, + Q, + SP, + NZCV, + NZCVRaw, + }; + struct RegisterInfo { + IR::Value register_value; + TrackingType tracking_type; + bool set_instruction_present = false; + Iterator last_set_instruction; + }; + std::array<RegisterInfo, 31> reg_info; + std::array<RegisterInfo, 32> vec_info; + RegisterInfo sp_info; + RegisterInfo nzcv_info; + + const auto do_set = [&block](RegisterInfo& info, IR::Value value, Iterator set_inst, TrackingType tracking_type) { + if (info.set_instruction_present) { + info.last_set_instruction->Invalidate(); + block.Instructions().erase(info.last_set_instruction); + } + + info.register_value = value; + info.tracking_type = tracking_type; + info.set_instruction_present = true; + info.last_set_instruction = set_inst; + }; + + const auto do_get = [](RegisterInfo& info, Iterator get_inst, TrackingType tracking_type) { + const auto do_nothing = [&] { + info = {}; + info.register_value = IR::Value(&*get_inst); + info.tracking_type = tracking_type; + }; + + if (info.register_value.IsEmpty()) { + do_nothing(); + return; + } + + if (info.tracking_type == tracking_type) { + get_inst->ReplaceUsesWith(info.register_value); + return; + } + + do_nothing(); + }; + + for (auto inst = block.begin(); inst != block.end(); ++inst) { + switch (inst->GetOpcode()) { + case IR::Opcode::A64GetW: { + const size_t index = A64::RegNumber(inst->GetArg(0).GetA64RegRef()); + do_get(reg_info.at(index), inst, TrackingType::W); + break; + } + case IR::Opcode::A64GetX: { + const size_t index = A64::RegNumber(inst->GetArg(0).GetA64RegRef()); + do_get(reg_info.at(index), inst, TrackingType::X); + break; + } + case IR::Opcode::A64GetS: { + const size_t index = A64::VecNumber(inst->GetArg(0).GetA64VecRef()); + do_get(vec_info.at(index), inst, TrackingType::S); + break; + } + case IR::Opcode::A64GetD: { + const size_t index = A64::VecNumber(inst->GetArg(0).GetA64VecRef()); + do_get(vec_info.at(index), inst, TrackingType::D); + break; + } + case IR::Opcode::A64GetQ: { + const size_t index = A64::VecNumber(inst->GetArg(0).GetA64VecRef()); + do_get(vec_info.at(index), inst, TrackingType::Q); + break; + } + case IR::Opcode::A64GetSP: { + do_get(sp_info, inst, TrackingType::SP); + break; + } + case IR::Opcode::A64GetNZCVRaw: { + do_get(nzcv_info, inst, TrackingType::NZCVRaw); + break; + } + case IR::Opcode::A64SetW: { + const size_t index = A64::RegNumber(inst->GetArg(0).GetA64RegRef()); + do_set(reg_info.at(index), inst->GetArg(1), inst, TrackingType::W); + break; + } + case IR::Opcode::A64SetX: { + const size_t index = A64::RegNumber(inst->GetArg(0).GetA64RegRef()); + do_set(reg_info.at(index), inst->GetArg(1), inst, TrackingType::X); + break; + } + case IR::Opcode::A64SetS: { + const size_t index = A64::VecNumber(inst->GetArg(0).GetA64VecRef()); + do_set(vec_info.at(index), inst->GetArg(1), inst, TrackingType::S); + break; + } + case IR::Opcode::A64SetD: { + const size_t index = A64::VecNumber(inst->GetArg(0).GetA64VecRef()); + do_set(vec_info.at(index), inst->GetArg(1), inst, TrackingType::D); + break; + } + case IR::Opcode::A64SetQ: { + 
const size_t index = A64::VecNumber(inst->GetArg(0).GetA64VecRef()); + do_set(vec_info.at(index), inst->GetArg(1), inst, TrackingType::Q); + break; + } + case IR::Opcode::A64SetSP: { + do_set(sp_info, inst->GetArg(0), inst, TrackingType::SP); + break; + } + case IR::Opcode::A64SetNZCV: { + do_set(nzcv_info, inst->GetArg(0), inst, TrackingType::NZCV); + break; + } + case IR::Opcode::A64SetNZCVRaw: { + do_set(nzcv_info, inst->GetArg(0), inst, TrackingType::NZCVRaw); + break; + } + default: { + if (inst->ReadsFromCPSR() || inst->WritesToCPSR()) { + nzcv_info = {}; + } + if (inst->ReadsFromCoreRegister() || inst->WritesToCoreRegister()) { + reg_info = {}; + vec_info = {}; + sp_info = {}; + } + break; + } + } + } +} + +} // namespace Dynarmic::Optimization diff --git a/externals/dynarmic/src/dynarmic/ir/opt/a64_merge_interpret_blocks.cpp b/externals/dynarmic/src/dynarmic/ir/opt/a64_merge_interpret_blocks.cpp new file mode 100644 index 0000000000..00a0e1b672 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/opt/a64_merge_interpret_blocks.cpp @@ -0,0 +1,54 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <boost/variant/get.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/frontend/A64/a64_location_descriptor.h" +#include "dynarmic/frontend/A64/translate/a64_translate.h" +#include "dynarmic/interface/A64/config.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/opt/passes.h" + +namespace Dynarmic::Optimization { + +void A64MergeInterpretBlocksPass(IR::Block& block, A64::UserCallbacks* cb) { + const auto is_interpret_instruction = [cb](A64::LocationDescriptor location) { + const auto instruction = cb->MemoryReadCode(location.PC()); + if (!instruction) + return false; + + IR::Block new_block{location}; + A64::TranslateSingleInstruction(new_block, location, *instruction); + + if (!new_block.Instructions().empty()) + return false; + + const IR::Terminal terminal = new_block.GetTerminal(); + if (auto term = boost::get<IR::Term::Interpret>(&terminal)) { + return term->next == location; + } + + return false; + }; + + IR::Terminal terminal = block.GetTerminal(); + auto term = boost::get<IR::Term::Interpret>(&terminal); + if (!term) + return; + + A64::LocationDescriptor location{term->next}; + size_t num_instructions = 1; + + while (is_interpret_instruction(location.AdvancePC(static_cast<int>(num_instructions * 4)))) { + num_instructions++; + } + + term->num_instructions = num_instructions; + block.ReplaceTerminal(terminal); + block.CycleCount() += num_instructions - 1; +} + +} // namespace Dynarmic::Optimization diff --git a/externals/dynarmic/src/dynarmic/ir/opt/constant_propagation_pass.cpp b/externals/dynarmic/src/dynarmic/ir/opt/constant_propagation_pass.cpp new file mode 100644 index 0000000000..83ed84998d --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/opt/constant_propagation_pass.cpp @@ -0,0 +1,556 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <optional> + +#include <mcl/assert.hpp> +#include <mcl/bit/rotate.hpp> +#include <mcl/bit/swap.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/common/safe_ops.h" +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/ir_emitter.h" +#include "dynarmic/ir/opcodes.h" +#include "dynarmic/ir/opt/passes.h" + +namespace Dynarmic::Optimization { + +using Op = Dynarmic::IR::Opcode; + +namespace { + +// Tiny helper to avoid the need to store based off the opcode +// bit size all over the place within folding functions. +void ReplaceUsesWith(IR::Inst& inst, bool is_32_bit, u64 value) { + if (is_32_bit) { + inst.ReplaceUsesWith(IR::Value{static_cast<u32>(value)}); + } else { + inst.ReplaceUsesWith(IR::Value{value}); + } +} + +IR::Value Value(bool is_32_bit, u64 value) { + return is_32_bit ? IR::Value{static_cast<u32>(value)} : IR::Value{value}; +} + +template<typename ImmFn> +bool FoldCommutative(IR::Inst& inst, bool is_32_bit, ImmFn imm_fn) { + const auto lhs = inst.GetArg(0); + const auto rhs = inst.GetArg(1); + + const bool is_lhs_immediate = lhs.IsImmediate(); + const bool is_rhs_immediate = rhs.IsImmediate(); + + if (is_lhs_immediate && is_rhs_immediate) { + const u64 result = imm_fn(lhs.GetImmediateAsU64(), rhs.GetImmediateAsU64()); + ReplaceUsesWith(inst, is_32_bit, result); + return false; + } + + if (is_lhs_immediate && !is_rhs_immediate) { + const IR::Inst* rhs_inst = rhs.GetInstRecursive(); + if (rhs_inst->GetOpcode() == inst.GetOpcode() && rhs_inst->GetArg(1).IsImmediate()) { + const u64 combined = imm_fn(lhs.GetImmediateAsU64(), rhs_inst->GetArg(1).GetImmediateAsU64()); + inst.SetArg(0, rhs_inst->GetArg(0)); + inst.SetArg(1, Value(is_32_bit, combined)); + } else { + // Normalize + inst.SetArg(0, rhs); + inst.SetArg(1, lhs); + } + } + + if (!is_lhs_immediate && is_rhs_immediate) { + const IR::Inst* lhs_inst = lhs.GetInstRecursive(); + if (lhs_inst->GetOpcode() == inst.GetOpcode() && lhs_inst->GetArg(1).IsImmediate()) { + const u64 combined = imm_fn(rhs.GetImmediateAsU64(), lhs_inst->GetArg(1).GetImmediateAsU64()); + inst.SetArg(0, lhs_inst->GetArg(0)); + inst.SetArg(1, Value(is_32_bit, combined)); + } + } + + return true; +} + +void FoldAdd(IR::Inst& inst, bool is_32_bit) { + const auto lhs = inst.GetArg(0); + const auto rhs = inst.GetArg(1); + const auto carry = inst.GetArg(2); + + if (lhs.IsImmediate() && !rhs.IsImmediate()) { + // Normalize + inst.SetArg(0, rhs); + inst.SetArg(1, lhs); + FoldAdd(inst, is_32_bit); + return; + } + + if (inst.HasAssociatedPseudoOperation()) { + return; + } + + if (!lhs.IsImmediate() && rhs.IsImmediate()) { + const IR::Inst* lhs_inst = lhs.GetInstRecursive(); + if (lhs_inst->GetOpcode() == inst.GetOpcode() && lhs_inst->GetArg(1).IsImmediate() && lhs_inst->GetArg(2).IsImmediate()) { + const u64 combined = rhs.GetImmediateAsU64() + lhs_inst->GetArg(1).GetImmediateAsU64() + lhs_inst->GetArg(2).GetU1(); + if (combined == 0) { + inst.ReplaceUsesWith(lhs_inst->GetArg(0)); + return; + } + inst.SetArg(0, lhs_inst->GetArg(0)); + inst.SetArg(1, Value(is_32_bit, combined)); + return; + } + if (rhs.IsZero() && carry.IsZero()) { + inst.ReplaceUsesWith(lhs); + return; + } + } + + if (inst.AreAllArgsImmediates()) { + const u64 result = lhs.GetImmediateAsU64() + rhs.GetImmediateAsU64() + carry.GetU1(); + ReplaceUsesWith(inst, is_32_bit, result); + return; + } +} + +// Folds AND operations based on the following: +// +// 1. imm_x & imm_y -> result +// 2. x & 0 -> 0 +// 3. 
0 & y -> 0 +// 4. x & y -> y (where x has all bits set to 1) +// 5. x & y -> x (where y has all bits set to 1) +// +void FoldAND(IR::Inst& inst, bool is_32_bit) { + if (FoldCommutative(inst, is_32_bit, [](u64 a, u64 b) { return a & b; })) { + const auto rhs = inst.GetArg(1); + if (rhs.IsZero()) { + ReplaceUsesWith(inst, is_32_bit, 0); + } else if (rhs.HasAllBitsSet()) { + inst.ReplaceUsesWith(inst.GetArg(0)); + } + } +} + +// Folds byte reversal opcodes based on the following: +// +// 1. imm -> swap(imm) +// +void FoldByteReverse(IR::Inst& inst, Op op) { + const auto operand = inst.GetArg(0); + + if (!operand.IsImmediate()) { + return; + } + + if (op == Op::ByteReverseWord) { + const u32 result = mcl::bit::swap_bytes_32(static_cast<u32>(operand.GetImmediateAsU64())); + inst.ReplaceUsesWith(IR::Value{result}); + } else if (op == Op::ByteReverseHalf) { + const u16 result = mcl::bit::swap_bytes_16(static_cast<u16>(operand.GetImmediateAsU64())); + inst.ReplaceUsesWith(IR::Value{result}); + } else { + const u64 result = mcl::bit::swap_bytes_64(operand.GetImmediateAsU64()); + inst.ReplaceUsesWith(IR::Value{result}); + } +} + +// Folds division operations based on the following: +// +// 1. x / 0 -> 0 (NOTE: This is an ARM-specific behavior defined in the architecture reference manual) +// 2. imm_x / imm_y -> result +// 3. x / 1 -> x +// +void FoldDivide(IR::Inst& inst, bool is_32_bit, bool is_signed) { + const auto rhs = inst.GetArg(1); + + if (rhs.IsZero()) { + ReplaceUsesWith(inst, is_32_bit, 0); + return; + } + + const auto lhs = inst.GetArg(0); + if (lhs.IsImmediate() && rhs.IsImmediate()) { + if (is_signed) { + const s64 result = lhs.GetImmediateAsS64() / rhs.GetImmediateAsS64(); + ReplaceUsesWith(inst, is_32_bit, static_cast<u64>(result)); + } else { + const u64 result = lhs.GetImmediateAsU64() / rhs.GetImmediateAsU64(); + ReplaceUsesWith(inst, is_32_bit, result); + } + } else if (rhs.IsUnsignedImmediate(1)) { + inst.ReplaceUsesWith(IR::Value{lhs}); + } +} + +// Folds EOR operations based on the following: +// +// 1. imm_x ^ imm_y -> result +// 2. x ^ 0 -> x +// 3. 
0 ^ y -> y +// +void FoldEOR(IR::Inst& inst, bool is_32_bit) { + if (FoldCommutative(inst, is_32_bit, [](u64 a, u64 b) { return a ^ b; })) { + const auto rhs = inst.GetArg(1); + if (rhs.IsZero()) { + inst.ReplaceUsesWith(inst.GetArg(0)); + } + } +} + +void FoldLeastSignificantByte(IR::Inst& inst) { + if (!inst.AreAllArgsImmediates()) { + return; + } + + const auto operand = inst.GetArg(0); + inst.ReplaceUsesWith(IR::Value{static_cast<u8>(operand.GetImmediateAsU64())}); +} + +void FoldLeastSignificantHalf(IR::Inst& inst) { + if (!inst.AreAllArgsImmediates()) { + return; + } + + const auto operand = inst.GetArg(0); + inst.ReplaceUsesWith(IR::Value{static_cast<u16>(operand.GetImmediateAsU64())}); +} + +void FoldLeastSignificantWord(IR::Inst& inst) { + if (!inst.AreAllArgsImmediates()) { + return; + } + + const auto operand = inst.GetArg(0); + inst.ReplaceUsesWith(IR::Value{static_cast<u32>(operand.GetImmediateAsU64())}); +} + +void FoldMostSignificantBit(IR::Inst& inst) { + if (!inst.AreAllArgsImmediates()) { + return; + } + + const auto operand = inst.GetArg(0); + inst.ReplaceUsesWith(IR::Value{(operand.GetImmediateAsU64() >> 31) != 0}); +} + +void FoldMostSignificantWord(IR::Inst& inst) { + IR::Inst* carry_inst = inst.GetAssociatedPseudoOperation(Op::GetCarryFromOp); + + if (!inst.AreAllArgsImmediates()) { + return; + } + + const auto operand = inst.GetArg(0); + if (carry_inst) { + carry_inst->ReplaceUsesWith(IR::Value{mcl::bit::get_bit<31>(operand.GetImmediateAsU64())}); + } + inst.ReplaceUsesWith(IR::Value{static_cast<u32>(operand.GetImmediateAsU64() >> 32)}); +} + +// Folds multiplication operations based on the following: +// +// 1. imm_x * imm_y -> result +// 2. x * 0 -> 0 +// 3. 0 * y -> 0 +// 4. x * 1 -> x +// 5. 1 * y -> y +// +void FoldMultiply(IR::Inst& inst, bool is_32_bit) { + if (FoldCommutative(inst, is_32_bit, [](u64 a, u64 b) { return a * b; })) { + const auto rhs = inst.GetArg(1); + if (rhs.IsZero()) { + ReplaceUsesWith(inst, is_32_bit, 0); + } else if (rhs.IsUnsignedImmediate(1)) { + inst.ReplaceUsesWith(inst.GetArg(0)); + } + } +} + +// Folds NOT operations if the contained value is an immediate. +void FoldNOT(IR::Inst& inst, bool is_32_bit) { + const auto operand = inst.GetArg(0); + + if (!operand.IsImmediate()) { + return; + } + + const u64 result = ~operand.GetImmediateAsU64(); + ReplaceUsesWith(inst, is_32_bit, result); +} + +// Folds OR operations based on the following: +// +// 1. imm_x | imm_y -> result +// 2. x | 0 -> x +// 3. 0 | y -> y +// +void FoldOR(IR::Inst& inst, bool is_32_bit) { + if (FoldCommutative(inst, is_32_bit, [](u64 a, u64 b) { return a | b; })) { + const auto rhs = inst.GetArg(1); + if (rhs.IsZero()) { + inst.ReplaceUsesWith(inst.GetArg(0)); + } + } +} + +bool FoldShifts(IR::Inst& inst) { + IR::Inst* carry_inst = inst.GetAssociatedPseudoOperation(Op::GetCarryFromOp); + + // The 32-bit variants can contain 3 arguments, while the + // 64-bit variants only contain 2. 
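+    // When no pseudo-op consumes the carry-out, the carry-in argument is dead, so it is
+    // pinned to a constant false so that it does not block folding. A zero shift amount
+    // is an identity: the operand is forwarded as the result, and the carry-in is
+    // forwarded to any carry-out consumer. The return value tells the caller whether the
+    // shifted result itself can be computed now, i.e. all arguments are immediates and
+    // no carry-out is observed.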
+ if (inst.NumArgs() == 3 && !carry_inst) { + inst.SetArg(2, IR::Value(false)); + } + + const auto shift_amount = inst.GetArg(1); + + if (shift_amount.IsZero()) { + if (carry_inst) { + carry_inst->ReplaceUsesWith(inst.GetArg(2)); + } + inst.ReplaceUsesWith(inst.GetArg(0)); + return false; + } + + if (inst.NumArgs() == 3 && shift_amount.IsImmediate() && !shift_amount.IsZero()) { + inst.SetArg(2, IR::Value(false)); + } + + if (!inst.AreAllArgsImmediates() || carry_inst) { + return false; + } + + return true; +} + +void FoldSignExtendXToWord(IR::Inst& inst) { + if (!inst.AreAllArgsImmediates()) { + return; + } + + const s64 value = inst.GetArg(0).GetImmediateAsS64(); + inst.ReplaceUsesWith(IR::Value{static_cast<u32>(value)}); +} + +void FoldSignExtendXToLong(IR::Inst& inst) { + if (!inst.AreAllArgsImmediates()) { + return; + } + + const s64 value = inst.GetArg(0).GetImmediateAsS64(); + inst.ReplaceUsesWith(IR::Value{static_cast<u64>(value)}); +} + +void FoldSub(IR::Inst& inst, bool is_32_bit) { + if (!inst.AreAllArgsImmediates() || inst.HasAssociatedPseudoOperation()) { + return; + } + + const auto lhs = inst.GetArg(0); + const auto rhs = inst.GetArg(1); + const auto carry = inst.GetArg(2); + + const u64 result = lhs.GetImmediateAsU64() + (~rhs.GetImmediateAsU64()) + carry.GetU1(); + ReplaceUsesWith(inst, is_32_bit, result); +} + +void FoldZeroExtendXToWord(IR::Inst& inst) { + if (!inst.AreAllArgsImmediates()) { + return; + } + + const u64 value = inst.GetArg(0).GetImmediateAsU64(); + inst.ReplaceUsesWith(IR::Value{static_cast<u32>(value)}); +} + +void FoldZeroExtendXToLong(IR::Inst& inst) { + if (!inst.AreAllArgsImmediates()) { + return; + } + + const u64 value = inst.GetArg(0).GetImmediateAsU64(); + inst.ReplaceUsesWith(IR::Value{value}); +} +} // Anonymous namespace + +void ConstantPropagation(IR::Block& block) { + for (auto& inst : block) { + const auto opcode = inst.GetOpcode(); + + switch (opcode) { + case Op::LeastSignificantWord: + FoldLeastSignificantWord(inst); + break; + case Op::MostSignificantWord: + FoldMostSignificantWord(inst); + break; + case Op::LeastSignificantHalf: + FoldLeastSignificantHalf(inst); + break; + case Op::LeastSignificantByte: + FoldLeastSignificantByte(inst); + break; + case Op::MostSignificantBit: + FoldMostSignificantBit(inst); + break; + case Op::IsZero32: + if (inst.AreAllArgsImmediates()) { + inst.ReplaceUsesWith(IR::Value{inst.GetArg(0).GetU32() == 0}); + } + break; + case Op::IsZero64: + if (inst.AreAllArgsImmediates()) { + inst.ReplaceUsesWith(IR::Value{inst.GetArg(0).GetU64() == 0}); + } + break; + case Op::LogicalShiftLeft32: + if (FoldShifts(inst)) { + ReplaceUsesWith(inst, true, Safe::LogicalShiftLeft<u32>(inst.GetArg(0).GetU32(), inst.GetArg(1).GetU8())); + } + break; + case Op::LogicalShiftLeft64: + if (FoldShifts(inst)) { + ReplaceUsesWith(inst, false, Safe::LogicalShiftLeft<u64>(inst.GetArg(0).GetU64(), inst.GetArg(1).GetU8())); + } + break; + case Op::LogicalShiftRight32: + if (FoldShifts(inst)) { + ReplaceUsesWith(inst, true, Safe::LogicalShiftRight<u32>(inst.GetArg(0).GetU32(), inst.GetArg(1).GetU8())); + } + break; + case Op::LogicalShiftRight64: + if (FoldShifts(inst)) { + ReplaceUsesWith(inst, false, Safe::LogicalShiftRight<u64>(inst.GetArg(0).GetU64(), inst.GetArg(1).GetU8())); + } + break; + case Op::ArithmeticShiftRight32: + if (FoldShifts(inst)) { + ReplaceUsesWith(inst, true, Safe::ArithmeticShiftRight<u32>(inst.GetArg(0).GetU32(), inst.GetArg(1).GetU8())); + } + break; + case Op::ArithmeticShiftRight64: + if (FoldShifts(inst)) { + 
ReplaceUsesWith(inst, false, Safe::ArithmeticShiftRight<u64>(inst.GetArg(0).GetU64(), inst.GetArg(1).GetU8())); + } + break; + case Op::RotateRight32: + if (FoldShifts(inst)) { + ReplaceUsesWith(inst, true, mcl::bit::rotate_right<u32>(inst.GetArg(0).GetU32(), inst.GetArg(1).GetU8())); + } + break; + case Op::RotateRight64: + if (FoldShifts(inst)) { + ReplaceUsesWith(inst, false, mcl::bit::rotate_right<u64>(inst.GetArg(0).GetU64(), inst.GetArg(1).GetU8())); + } + break; + case Op::LogicalShiftLeftMasked32: + if (inst.AreAllArgsImmediates()) { + ReplaceUsesWith(inst, true, inst.GetArg(0).GetU32() << (inst.GetArg(1).GetU32() & 0x1f)); + } + break; + case Op::LogicalShiftLeftMasked64: + if (inst.AreAllArgsImmediates()) { + ReplaceUsesWith(inst, false, inst.GetArg(0).GetU64() << (inst.GetArg(1).GetU64() & 0x3f)); + } + break; + case Op::LogicalShiftRightMasked32: + if (inst.AreAllArgsImmediates()) { + ReplaceUsesWith(inst, true, inst.GetArg(0).GetU32() >> (inst.GetArg(1).GetU32() & 0x1f)); + } + break; + case Op::LogicalShiftRightMasked64: + if (inst.AreAllArgsImmediates()) { + ReplaceUsesWith(inst, false, inst.GetArg(0).GetU64() >> (inst.GetArg(1).GetU64() & 0x3f)); + } + break; + case Op::ArithmeticShiftRightMasked32: + if (inst.AreAllArgsImmediates()) { + ReplaceUsesWith(inst, true, static_cast<s32>(inst.GetArg(0).GetU32()) >> (inst.GetArg(1).GetU32() & 0x1f)); + } + break; + case Op::ArithmeticShiftRightMasked64: + if (inst.AreAllArgsImmediates()) { + ReplaceUsesWith(inst, false, static_cast<s64>(inst.GetArg(0).GetU64()) >> (inst.GetArg(1).GetU64() & 0x3f)); + } + break; + case Op::RotateRightMasked32: + if (inst.AreAllArgsImmediates()) { + ReplaceUsesWith(inst, true, mcl::bit::rotate_right<u32>(inst.GetArg(0).GetU32(), inst.GetArg(1).GetU32())); + } + break; + case Op::RotateRightMasked64: + if (inst.AreAllArgsImmediates()) { + ReplaceUsesWith(inst, false, mcl::bit::rotate_right<u64>(inst.GetArg(0).GetU64(), inst.GetArg(1).GetU64())); + } + break; + case Op::Add32: + case Op::Add64: + FoldAdd(inst, opcode == Op::Add32); + break; + case Op::Sub32: + case Op::Sub64: + FoldSub(inst, opcode == Op::Sub32); + break; + case Op::Mul32: + case Op::Mul64: + FoldMultiply(inst, opcode == Op::Mul32); + break; + case Op::SignedDiv32: + case Op::SignedDiv64: + FoldDivide(inst, opcode == Op::SignedDiv32, true); + break; + case Op::UnsignedDiv32: + case Op::UnsignedDiv64: + FoldDivide(inst, opcode == Op::UnsignedDiv32, false); + break; + case Op::And32: + case Op::And64: + FoldAND(inst, opcode == Op::And32); + break; + case Op::Eor32: + case Op::Eor64: + FoldEOR(inst, opcode == Op::Eor32); + break; + case Op::Or32: + case Op::Or64: + FoldOR(inst, opcode == Op::Or32); + break; + case Op::Not32: + case Op::Not64: + FoldNOT(inst, opcode == Op::Not32); + break; + case Op::SignExtendByteToWord: + case Op::SignExtendHalfToWord: + FoldSignExtendXToWord(inst); + break; + case Op::SignExtendByteToLong: + case Op::SignExtendHalfToLong: + case Op::SignExtendWordToLong: + FoldSignExtendXToLong(inst); + break; + case Op::ZeroExtendByteToWord: + case Op::ZeroExtendHalfToWord: + FoldZeroExtendXToWord(inst); + break; + case Op::ZeroExtendByteToLong: + case Op::ZeroExtendHalfToLong: + case Op::ZeroExtendWordToLong: + FoldZeroExtendXToLong(inst); + break; + case Op::ByteReverseWord: + case Op::ByteReverseHalf: + case Op::ByteReverseDual: + FoldByteReverse(inst, opcode); + break; + default: + break; + } + } +} + +} // namespace Dynarmic::Optimization diff --git 
a/externals/dynarmic/src/dynarmic/ir/opt/dead_code_elimination_pass.cpp b/externals/dynarmic/src/dynarmic/ir/opt/dead_code_elimination_pass.cpp new file mode 100644 index 0000000000..0d0de180a8 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/opt/dead_code_elimination_pass.cpp @@ -0,0 +1,23 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <mcl/iterator/reverse.hpp> + +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/opt/passes.h" + +namespace Dynarmic::Optimization { + +void DeadCodeElimination(IR::Block& block) { + // We iterate over the instructions in reverse order. + // This is because removing an instruction reduces the number of uses for earlier instructions. + for (auto& inst : mcl::iterator::reverse(block)) { + if (!inst.HasUses() && !inst.MayHaveSideEffects()) { + inst.Invalidate(); + } + } +} + +} // namespace Dynarmic::Optimization diff --git a/externals/dynarmic/src/dynarmic/ir/opt/identity_removal_pass.cpp b/externals/dynarmic/src/dynarmic/ir/opt/identity_removal_pass.cpp new file mode 100644 index 0000000000..e87fcc335b --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/opt/identity_removal_pass.cpp @@ -0,0 +1,44 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2020 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <vector> + +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/opcodes.h" +#include "dynarmic/ir/opt/passes.h" + +namespace Dynarmic::Optimization { + +void IdentityRemovalPass(IR::Block& block) { + std::vector<IR::Inst*> to_invalidate; + + auto iter = block.begin(); + while (iter != block.end()) { + IR::Inst& inst = *iter; + + const size_t num_args = inst.NumArgs(); + for (size_t i = 0; i < num_args; i++) { + while (true) { + IR::Value arg = inst.GetArg(i); + if (!arg.IsIdentity()) + break; + inst.SetArg(i, arg.GetInst()->GetArg(0)); + } + } + + if (inst.GetOpcode() == IR::Opcode::Identity || inst.GetOpcode() == IR::Opcode::Void) { + iter = block.Instructions().erase(inst); + to_invalidate.push_back(&inst); + } else { + ++iter; + } + } + + for (IR::Inst* inst : to_invalidate) { + inst->Invalidate(); + } +} + +} // namespace Dynarmic::Optimization diff --git a/externals/dynarmic/src/dynarmic/ir/opt/ir_matcher.h b/externals/dynarmic/src/dynarmic/ir/opt/ir_matcher.h new file mode 100644 index 0000000000..5eb1a55100 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/opt/ir_matcher.h @@ -0,0 +1,127 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2020 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <optional> +#include <tuple> + +#include <mp/metafunction/apply.h> +#include <mp/typelist/concat.h> +#include <mp/typelist/drop.h> +#include <mp/typelist/get.h> +#include <mp/typelist/head.h> +#include <mp/typelist/list.h> +#include <mp/typelist/prepend.h> + +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" +#include "dynarmic/ir/value.h" + +namespace Dynarmic::Optimization::IRMatcher { + +struct CaptureValue { + using ReturnType = std::tuple<IR::Value>; + + static std::optional<ReturnType> Match(IR::Value value) { + return std::tuple(value); + } +}; + +struct CaptureInst { + using ReturnType = std::tuple<IR::Inst*>; + + static std::optional<ReturnType> Match(IR::Value value) { + if (value.IsImmediate()) + return std::nullopt; + return std::tuple(value.GetInstRecursive()); + } +}; + +struct CaptureUImm { + using ReturnType = std::tuple<u64>; + + static std::optional<ReturnType> Match(IR::Value value) { + return std::tuple(value.GetImmediateAsU64()); + } +}; + +struct CaptureSImm { + using ReturnType = std::tuple<s64>; + + static std::optional<ReturnType> Match(IR::Value value) { + return std::tuple(value.GetImmediateAsS64()); + } +}; + +template<u64 Value> +struct UImm { + using ReturnType = std::tuple<>; + + static std::optional<std::tuple<>> Match(IR::Value value) { + if (value.GetImmediateAsU64() == Value) + return std::tuple(); + return std::nullopt; + } +}; + +template<s64 Value> +struct SImm { + using ReturnType = std::tuple<>; + + static std::optional<std::tuple<>> Match(IR::Value value) { + if (value.GetImmediateAsS64() == Value) + return std::tuple(); + return std::nullopt; + } +}; + +template<IR::Opcode Opcode, typename... Args> +struct Inst { +public: + using ReturnType = mp::concat<std::tuple<>, typename Args::ReturnType...>; + + static std::optional<ReturnType> Match(const IR::Inst& inst) { + if (inst.GetOpcode() != Opcode) + return std::nullopt; + if (inst.HasAssociatedPseudoOperation()) + return std::nullopt; + return MatchArgs<0>(inst); + } + + static std::optional<ReturnType> Match(IR::Value value) { + if (value.IsImmediate()) + return std::nullopt; + return Match(*value.GetInstRecursive()); + } + +private: + template<size_t I> + static auto MatchArgs(const IR::Inst& inst) -> std::optional<mp::apply<mp::concat, mp::prepend<mp::drop<I, mp::list<typename Args::ReturnType...>>, std::tuple<>>>> { + if constexpr (I >= sizeof...(Args)) { + return std::tuple(); + } else { + using Arg = mp::get<I, mp::list<Args...>>; + + if (const auto arg = Arg::Match(inst.GetArg(I))) { + if (const auto rest = MatchArgs<I + 1>(inst)) { + return std::tuple_cat(*arg, *rest); + } + } + + return std::nullopt; + } + } +}; + +inline bool IsSameInst(std::tuple<IR::Inst*, IR::Inst*> t) { + return std::get<0>(t) == std::get<1>(t); +} + +inline bool IsSameInst(std::tuple<IR::Inst*, IR::Inst*, IR::Inst*> t) { + return std::get<0>(t) == std::get<1>(t) && std::get<0>(t) == std::get<2>(t); +} + +} // namespace Dynarmic::Optimization::IRMatcher diff --git a/externals/dynarmic/src/dynarmic/ir/opt/naming_pass.cpp b/externals/dynarmic/src/dynarmic/ir/opt/naming_pass.cpp new file mode 100644 index 0000000000..a766bdc83f --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/opt/naming_pass.cpp @@ -0,0 +1,18 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2023 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" + +namespace Dynarmic::Optimization { + +void NamingPass(IR::Block& block) { + unsigned name = 1; + for (auto& inst : block) { + inst.SetName(name++); + } +} + +} // namespace Dynarmic::Optimization diff --git a/externals/dynarmic/src/dynarmic/ir/opt/passes.h b/externals/dynarmic/src/dynarmic/ir/opt/passes.h new file mode 100644 index 0000000000..703145b556 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/opt/passes.h @@ -0,0 +1,47 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +namespace Dynarmic::A32 { +struct UserCallbacks; +} + +namespace Dynarmic::A64 { +struct UserCallbacks; +struct UserConfig; +} // namespace Dynarmic::A64 + +namespace Dynarmic::IR { +class Block; +} + +namespace Dynarmic::Optimization { + +struct PolyfillOptions { + bool sha256 = false; + bool vector_multiply_widen = false; + + bool operator==(const PolyfillOptions&) const = default; +}; + +struct A32GetSetEliminationOptions { + bool convert_nzc_to_nz = false; + bool convert_nz_to_nzc = false; +}; + +void PolyfillPass(IR::Block& block, const PolyfillOptions& opt); +void A32ConstantMemoryReads(IR::Block& block, A32::UserCallbacks* cb); +void A32GetSetElimination(IR::Block& block, A32GetSetEliminationOptions opt); +void A64CallbackConfigPass(IR::Block& block, const A64::UserConfig& conf); +void A64GetSetElimination(IR::Block& block); +void A64MergeInterpretBlocksPass(IR::Block& block, A64::UserCallbacks* cb); +void ConstantPropagation(IR::Block& block); +void DeadCodeElimination(IR::Block& block); +void IdentityRemovalPass(IR::Block& block); +void VerificationPass(const IR::Block& block); +void NamingPass(IR::Block& block); + +} // namespace Dynarmic::Optimization diff --git a/externals/dynarmic/src/dynarmic/ir/opt/polyfill_pass.cpp b/externals/dynarmic/src/dynarmic/ir/opt/polyfill_pass.cpp new file mode 100644 index 0000000000..1aa3aea91e --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/opt/polyfill_pass.cpp @@ -0,0 +1,218 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/ir_emitter.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" +#include "dynarmic/ir/opt/passes.h" + +namespace Dynarmic::Optimization { + +namespace { + +void PolyfillSHA256MessageSchedule0(IR::IREmitter& ir, IR::Inst& inst) { + const IR::U128 x = (IR::U128)inst.GetArg(0); + const IR::U128 y = (IR::U128)inst.GetArg(1); + + const IR::U128 t = ir.VectorExtract(x, y, 32); + + IR::U128 result = ir.ZeroVector(); + for (size_t i = 0; i < 4; i++) { + const IR::U32 modified_element = [&] { + const IR::U32 element = ir.VectorGetElement(32, t, i); + const IR::U32 tmp1 = ir.RotateRight(element, ir.Imm8(7)); + const IR::U32 tmp2 = ir.RotateRight(element, ir.Imm8(18)); + const IR::U32 tmp3 = ir.LogicalShiftRight(element, ir.Imm8(3)); + + return ir.Eor(tmp1, ir.Eor(tmp2, tmp3)); + }(); + + result = ir.VectorSetElement(32, result, i, modified_element); + } + result = ir.VectorAdd(32, result, x); + + inst.ReplaceUsesWith(result); +} + +void PolyfillSHA256MessageSchedule1(IR::IREmitter& ir, IR::Inst& inst) { + const IR::U128 x = (IR::U128)inst.GetArg(0); + const IR::U128 y = (IR::U128)inst.GetArg(1); + const IR::U128 z = (IR::U128)inst.GetArg(2); + + const IR::U128 T0 = ir.VectorExtract(y, z, 32); + + const IR::U128 lower_half = [&] { + const IR::U128 T = ir.VectorRotateWholeVectorRight(z, 64); + const IR::U128 tmp1 = ir.VectorRotateRight(32, T, 17); + const IR::U128 tmp2 = ir.VectorRotateRight(32, T, 19); + const IR::U128 tmp3 = ir.VectorLogicalShiftRight(32, T, 10); + const IR::U128 tmp4 = ir.VectorEor(tmp1, ir.VectorEor(tmp2, tmp3)); + const IR::U128 tmp5 = ir.VectorAdd(32, tmp4, ir.VectorAdd(32, x, T0)); + return ir.VectorZeroUpper(tmp5); + }(); + + const IR::U64 upper_half = [&] { + const IR::U128 tmp1 = ir.VectorRotateRight(32, lower_half, 17); + const IR::U128 tmp2 = ir.VectorRotateRight(32, lower_half, 19); + const IR::U128 tmp3 = ir.VectorLogicalShiftRight(32, lower_half, 10); + const IR::U128 tmp4 = ir.VectorEor(tmp1, ir.VectorEor(tmp2, tmp3)); + + // Shuffle the top two 32-bit elements downwards [3, 2, 1, 0] -> [1, 0, 3, 2] + const IR::U128 shuffled_d = ir.VectorRotateWholeVectorRight(x, 64); + const IR::U128 shuffled_T0 = ir.VectorRotateWholeVectorRight(T0, 64); + + const IR::U128 tmp5 = ir.VectorAdd(32, tmp4, ir.VectorAdd(32, shuffled_d, shuffled_T0)); + return ir.VectorGetElement(64, tmp5, 0); + }(); + + const IR::U128 result = ir.VectorSetElement(64, lower_half, 1, upper_half); + + inst.ReplaceUsesWith(result); +} + +IR::U32 SHAchoose(IR::IREmitter& ir, IR::U32 x, IR::U32 y, IR::U32 z) { + return ir.Eor(ir.And(ir.Eor(y, z), x), z); +} + +IR::U32 SHAmajority(IR::IREmitter& ir, IR::U32 x, IR::U32 y, IR::U32 z) { + return ir.Or(ir.And(x, y), ir.And(ir.Or(x, y), z)); +} + +IR::U32 SHAhashSIGMA0(IR::IREmitter& ir, IR::U32 x) { + const IR::U32 tmp1 = ir.RotateRight(x, ir.Imm8(2)); + const IR::U32 tmp2 = ir.RotateRight(x, ir.Imm8(13)); + const IR::U32 tmp3 = ir.RotateRight(x, ir.Imm8(22)); + + return ir.Eor(tmp1, ir.Eor(tmp2, tmp3)); +} + +IR::U32 SHAhashSIGMA1(IR::IREmitter& ir, IR::U32 x) { + const IR::U32 tmp1 = ir.RotateRight(x, ir.Imm8(6)); + const IR::U32 tmp2 = ir.RotateRight(x, ir.Imm8(11)); + const IR::U32 tmp3 = ir.RotateRight(x, ir.Imm8(25)); + + return ir.Eor(tmp1, ir.Eor(tmp2, tmp3)); +} + +void PolyfillSHA256Hash(IR::IREmitter& ir, IR::Inst& inst) { + IR::U128 x = (IR::U128)inst.GetArg(0); + IR::U128 y = 
(IR::U128)inst.GetArg(1); + const IR::U128 w = (IR::U128)inst.GetArg(2); + const bool part1 = inst.GetArg(3).GetU1(); + + for (size_t i = 0; i < 4; i++) { + const IR::U32 low_x = ir.VectorGetElement(32, x, 0); + const IR::U32 after_low_x = ir.VectorGetElement(32, x, 1); + const IR::U32 before_high_x = ir.VectorGetElement(32, x, 2); + const IR::U32 high_x = ir.VectorGetElement(32, x, 3); + + const IR::U32 low_y = ir.VectorGetElement(32, y, 0); + const IR::U32 after_low_y = ir.VectorGetElement(32, y, 1); + const IR::U32 before_high_y = ir.VectorGetElement(32, y, 2); + const IR::U32 high_y = ir.VectorGetElement(32, y, 3); + + const IR::U32 choice = SHAchoose(ir, low_y, after_low_y, before_high_y); + const IR::U32 majority = SHAmajority(ir, low_x, after_low_x, before_high_x); + + const IR::U32 t = [&] { + const IR::U32 w_element = ir.VectorGetElement(32, w, i); + const IR::U32 sig = SHAhashSIGMA1(ir, low_y); + + return ir.Add(high_y, ir.Add(sig, ir.Add(choice, w_element))); + }(); + + const IR::U32 new_low_x = ir.Add(t, ir.Add(SHAhashSIGMA0(ir, low_x), majority)); + const IR::U32 new_low_y = ir.Add(t, high_x); + + // Shuffle all words left by 1 element: [3, 2, 1, 0] -> [2, 1, 0, 3] + const IR::U128 shuffled_x = ir.VectorRotateWholeVectorRight(x, 96); + const IR::U128 shuffled_y = ir.VectorRotateWholeVectorRight(y, 96); + + x = ir.VectorSetElement(32, shuffled_x, 0, new_low_x); + y = ir.VectorSetElement(32, shuffled_y, 0, new_low_y); + } + + inst.ReplaceUsesWith(part1 ? x : y); +} + +template<size_t esize, bool is_signed> +void PolyfillVectorMultiplyWiden(IR::IREmitter& ir, IR::Inst& inst) { + IR::U128 n = (IR::U128)inst.GetArg(0); + IR::U128 m = (IR::U128)inst.GetArg(1); + + const IR::U128 wide_n = is_signed ? ir.VectorSignExtend(esize, n) : ir.VectorZeroExtend(esize, n); + const IR::U128 wide_m = is_signed ? 
ir.VectorSignExtend(esize, m) : ir.VectorZeroExtend(esize, m); + + const IR::U128 result = ir.VectorMultiply(esize * 2, wide_n, wide_m); + + inst.ReplaceUsesWith(result); +} + +} // namespace + +void PolyfillPass(IR::Block& block, const PolyfillOptions& polyfill) { + if (polyfill == PolyfillOptions{}) { + return; + } + + IR::IREmitter ir{block}; + + for (auto& inst : block) { + ir.SetInsertionPointBefore(&inst); + + switch (inst.GetOpcode()) { + case IR::Opcode::SHA256MessageSchedule0: + if (polyfill.sha256) { + PolyfillSHA256MessageSchedule0(ir, inst); + } + break; + case IR::Opcode::SHA256MessageSchedule1: + if (polyfill.sha256) { + PolyfillSHA256MessageSchedule1(ir, inst); + } + break; + case IR::Opcode::SHA256Hash: + if (polyfill.sha256) { + PolyfillSHA256Hash(ir, inst); + } + break; + case IR::Opcode::VectorMultiplySignedWiden8: + if (polyfill.vector_multiply_widen) { + PolyfillVectorMultiplyWiden<8, true>(ir, inst); + } + break; + case IR::Opcode::VectorMultiplySignedWiden16: + if (polyfill.vector_multiply_widen) { + PolyfillVectorMultiplyWiden<16, true>(ir, inst); + } + break; + case IR::Opcode::VectorMultiplySignedWiden32: + if (polyfill.vector_multiply_widen) { + PolyfillVectorMultiplyWiden<32, true>(ir, inst); + } + break; + case IR::Opcode::VectorMultiplyUnsignedWiden8: + if (polyfill.vector_multiply_widen) { + PolyfillVectorMultiplyWiden<8, false>(ir, inst); + } + break; + case IR::Opcode::VectorMultiplyUnsignedWiden16: + if (polyfill.vector_multiply_widen) { + PolyfillVectorMultiplyWiden<16, false>(ir, inst); + } + break; + case IR::Opcode::VectorMultiplyUnsignedWiden32: + if (polyfill.vector_multiply_widen) { + PolyfillVectorMultiplyWiden<32, false>(ir, inst); + } + break; + default: + break; + } + } +} + +} // namespace Dynarmic::Optimization diff --git a/externals/dynarmic/src/dynarmic/ir/opt/verification_pass.cpp b/externals/dynarmic/src/dynarmic/ir/opt/verification_pass.cpp new file mode 100644 index 0000000000..8bc9e6f1f0 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/opt/verification_pass.cpp @@ -0,0 +1,47 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include <cstdio> +#include <map> + +#include <mcl/assert.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/ir/basic_block.h" +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" +#include "dynarmic/ir/opt/passes.h" +#include "dynarmic/ir/type.h" + +namespace Dynarmic::Optimization { + +void VerificationPass(const IR::Block& block) { + for (const auto& inst : block) { + for (size_t i = 0; i < inst.NumArgs(); i++) { + const IR::Type t1 = inst.GetArg(i).GetType(); + const IR::Type t2 = IR::GetArgTypeOf(inst.GetOpcode(), i); + if (!IR::AreTypesCompatible(t1, t2)) { + std::puts(IR::DumpBlock(block).c_str()); + ASSERT_FALSE("above block failed validation"); + } + } + } + + std::map<IR::Inst*, size_t> actual_uses; + for (const auto& inst : block) { + for (size_t i = 0; i < inst.NumArgs(); i++) { + const auto arg = inst.GetArg(i); + if (!arg.IsImmediate()) { + actual_uses[arg.GetInst()]++; + } + } + } + + for (const auto& pair : actual_uses) { + ASSERT(pair.first->UseCount() == pair.second); + } +} + +} // namespace Dynarmic::Optimization diff --git a/externals/dynarmic/src/dynarmic/ir/terminal.h b/externals/dynarmic/src/dynarmic/ir/terminal.h new file mode 100644 index 0000000000..d437ffd5b6 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/terminal.h @@ -0,0 +1,131 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <boost/variant.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/ir/cond.h" +#include "dynarmic/ir/location_descriptor.h" + +namespace Dynarmic::IR { +namespace Term { + +struct Invalid {}; + +/** + * This terminal instruction calls the interpreter, starting at `next`. + * The interpreter must interpret exactly `num_instructions` instructions. + */ +struct Interpret { + explicit Interpret(const LocationDescriptor& next_) + : next(next_) {} + LocationDescriptor next; ///< Location at which interpretation starts. + size_t num_instructions = 1; +}; + +/** + * This terminal instruction returns control to the dispatcher. + * The dispatcher will use the current cpu state to determine what comes next. + */ +struct ReturnToDispatch {}; + +/** + * This terminal instruction jumps to the basic block described by `next` if we have enough + * cycles remaining. If we do not have enough cycles remaining, we return to the + * dispatcher, which will return control to the host. + */ +struct LinkBlock { + explicit LinkBlock(const LocationDescriptor& next_) + : next(next_) {} + LocationDescriptor next; ///< Location descriptor for next block. +}; + +/** + * This terminal instruction jumps to the basic block described by `next` unconditionally. + * This is an optimization and MUST only be emitted when this is guaranteed not to result + * in hanging, even in the face of other optimizations. (In practice, this means that only + * forward jumps to short-ish blocks would use this instruction.) + * A backend that doesn't support this optimization may choose to implement this exactly + * as LinkBlock. + */ +struct LinkBlockFast { + explicit LinkBlockFast(const LocationDescriptor& next_) + : next(next_) {} + LocationDescriptor next; ///< Location descriptor for next block. +}; + +/** + * This terminal instruction checks the top of the Return Stack Buffer against the current + * location descriptor. If RSB lookup fails, control is returned to the dispatcher. 
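+ * If the lookup succeeds, execution continues directly at the predicted block without
+ * a round trip through the dispatcher.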
+ * This is an optimization for faster function calls. A backend that doesn't support + * this optimization or doesn't have a RSB may choose to implement this exactly as + * ReturnToDispatch. + */ +struct PopRSBHint {}; + +/** + * This terminal instruction performs a lookup of the current location descriptor in the + * fast dispatch lookup table. A backend that doesn't support this optimization may choose + * to implement this exactly as ReturnToDispatch. + */ +struct FastDispatchHint {}; + +struct If; +struct CheckBit; +struct CheckHalt; +/// A Terminal is the terminal instruction in a MicroBlock. +using Terminal = boost::variant< + Invalid, + Interpret, + ReturnToDispatch, + LinkBlock, + LinkBlockFast, + PopRSBHint, + FastDispatchHint, + boost::recursive_wrapper<If>, + boost::recursive_wrapper<CheckBit>, + boost::recursive_wrapper<CheckHalt>>; + +/** + * This terminal instruction conditionally executes one terminal or another depending + * on the run-time state of the ARM flags. + */ +struct If { + If(Cond if_, Terminal then_, Terminal else_) + : if_(if_), then_(std::move(then_)), else_(std::move(else_)) {} + Cond if_; + Terminal then_; + Terminal else_; +}; + +/** + * This terminal instruction conditionally executes one terminal or another depending + * on the run-time state of the check bit. + * then_ is executed if the check bit is non-zero, otherwise else_ is executed. + */ +struct CheckBit { + CheckBit(Terminal then_, Terminal else_) + : then_(std::move(then_)), else_(std::move(else_)) {} + Terminal then_; + Terminal else_; +}; + +/** + * This terminal instruction checks if a halt was requested. If it wasn't, else_ is + * executed. + */ +struct CheckHalt { + explicit CheckHalt(Terminal else_) + : else_(std::move(else_)) {} + Terminal else_; +}; + +} // namespace Term + +using Term::Terminal; + +} // namespace Dynarmic::IR diff --git a/externals/dynarmic/src/dynarmic/ir/type.cpp b/externals/dynarmic/src/dynarmic/ir/type.cpp new file mode 100644 index 0000000000..7c0f6bf622 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/type.cpp @@ -0,0 +1,46 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/ir/type.h" + +#include <array> +#include <ostream> +#include <string> + +namespace Dynarmic::IR { + +std::string GetNameOf(Type type) { + static constexpr std::array names{ + "A32Reg", "A32ExtReg", + "A64Reg", "A64Vec", + "Opaque", + "U1", "U8", "U16", "U32", "U64", "U128", + "CoprocInfo", + "NZCVFlags", + "Cond", + "Table"}; + + const size_t bits = static_cast<size_t>(type); + if (bits == 0) { + return "Void"; + } + + std::string result; + for (size_t i = 0; i < names.size(); i++) { + if ((bits & (size_t(1) << i)) != 0) { + if (!result.empty()) { + result += '|'; + } + result += names[i]; + } + } + return result; +} + +bool AreTypesCompatible(Type t1, Type t2) { + return t1 == t2 || t1 == Type::Opaque || t2 == Type::Opaque; +} + +} // namespace Dynarmic::IR diff --git a/externals/dynarmic/src/dynarmic/ir/type.h b/externals/dynarmic/src/dynarmic/ir/type.h new file mode 100644 index 0000000000..65fe76dd65 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/type.h @@ -0,0 +1,60 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <string> + +#include <fmt/format.h> +#include <mcl/stdint.hpp> + +namespace Dynarmic::IR { + +/** + * The intermediate representation is typed. These are the types used by our IR. 
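+ * Each type occupies a distinct bit so that a set of acceptable types can be expressed
+ * as a bitwise OR of types (see operator| below and the TypedValue aliases in value.h).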
+ */ +enum class Type { + Void = 0, + A32Reg = 1 << 0, + A32ExtReg = 1 << 1, + A64Reg = 1 << 2, + A64Vec = 1 << 3, + Opaque = 1 << 4, + U1 = 1 << 5, + U8 = 1 << 6, + U16 = 1 << 7, + U32 = 1 << 8, + U64 = 1 << 9, + U128 = 1 << 10, + CoprocInfo = 1 << 11, + NZCVFlags = 1 << 12, + Cond = 1 << 13, + Table = 1 << 14, + AccType = 1 << 15, +}; + +constexpr Type operator|(Type a, Type b) { + return static_cast<Type>(static_cast<size_t>(a) | static_cast<size_t>(b)); +} + +constexpr Type operator&(Type a, Type b) { + return static_cast<Type>(static_cast<size_t>(a) & static_cast<size_t>(b)); +} + +/// Get the name of a type. +std::string GetNameOf(Type type); + +/// @returns true if t1 and t2 are compatible types +bool AreTypesCompatible(Type t1, Type t2); + +} // namespace Dynarmic::IR + +template<> +struct fmt::formatter<Dynarmic::IR::Type> : fmt::formatter<std::string> { + template<typename FormatContext> + auto format(Dynarmic::IR::Type type, FormatContext& ctx) const { + return formatter<std::string>::format(Dynarmic::IR::GetNameOf(type), ctx); + } +}; diff --git a/externals/dynarmic/src/dynarmic/ir/value.cpp b/externals/dynarmic/src/dynarmic/ir/value.cpp new file mode 100644 index 0000000000..20f94cb5f9 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/value.cpp @@ -0,0 +1,254 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include "dynarmic/ir/value.h" + +#include <mcl/assert.hpp> +#include <mcl/bit/bit_field.hpp> + +#include "dynarmic/ir/microinstruction.h" +#include "dynarmic/ir/opcodes.h" +#include "dynarmic/ir/type.h" + +namespace Dynarmic::IR { + +Value::Value(Inst* value) + : type(Type::Opaque) { + inner.inst = value; +} + +Value::Value(A32::Reg value) + : type(Type::A32Reg) { + inner.imm_a32regref = value; +} + +Value::Value(A32::ExtReg value) + : type(Type::A32ExtReg) { + inner.imm_a32extregref = value; +} + +Value::Value(A64::Reg value) + : type(Type::A64Reg) { + inner.imm_a64regref = value; +} + +Value::Value(A64::Vec value) + : type(Type::A64Vec) { + inner.imm_a64vecref = value; +} + +Value::Value(bool value) + : type(Type::U1) { + inner.imm_u1 = value; +} + +Value::Value(u8 value) + : type(Type::U8) { + inner.imm_u8 = value; +} + +Value::Value(u16 value) + : type(Type::U16) { + inner.imm_u16 = value; +} + +Value::Value(u32 value) + : type(Type::U32) { + inner.imm_u32 = value; +} + +Value::Value(u64 value) + : type(Type::U64) { + inner.imm_u64 = value; +} + +Value::Value(CoprocessorInfo value) + : type(Type::CoprocInfo) { + inner.imm_coproc = value; +} + +Value::Value(Cond value) + : type(Type::Cond) { + inner.imm_cond = value; +} + +Value::Value(AccType value) + : type(Type::AccType) { + inner.imm_acctype = value; +} + +Value Value::EmptyNZCVImmediateMarker() { + Value result{}; + result.type = Type::NZCVFlags; + return result; +} + +bool Value::IsIdentity() const { + if (type == Type::Opaque) + return inner.inst->GetOpcode() == Opcode::Identity; + return false; +} + +bool Value::IsImmediate() const { + if (IsIdentity()) + return inner.inst->GetArg(0).IsImmediate(); + return type != Type::Opaque; +} + +bool Value::IsEmpty() const { + return type == Type::Void; +} + +Type Value::GetType() const { + if (IsIdentity()) + return inner.inst->GetArg(0).GetType(); + if (type == Type::Opaque) + return inner.inst->GetType(); + return type; +} + +A32::Reg Value::GetA32RegRef() const { + ASSERT(type == Type::A32Reg); + return inner.imm_a32regref; +} + +A32::ExtReg Value::GetA32ExtRegRef() const { + ASSERT(type 
== Type::A32ExtReg); + return inner.imm_a32extregref; +} + +A64::Reg Value::GetA64RegRef() const { + ASSERT(type == Type::A64Reg); + return inner.imm_a64regref; +} + +A64::Vec Value::GetA64VecRef() const { + ASSERT(type == Type::A64Vec); + return inner.imm_a64vecref; +} + +Inst* Value::GetInst() const { + ASSERT(type == Type::Opaque); + return inner.inst; +} + +Inst* Value::GetInstRecursive() const { + ASSERT(type == Type::Opaque); + if (IsIdentity()) + return inner.inst->GetArg(0).GetInstRecursive(); + return inner.inst; +} + +bool Value::GetU1() const { + if (IsIdentity()) + return inner.inst->GetArg(0).GetU1(); + ASSERT(type == Type::U1); + return inner.imm_u1; +} + +u8 Value::GetU8() const { + if (IsIdentity()) + return inner.inst->GetArg(0).GetU8(); + ASSERT(type == Type::U8); + return inner.imm_u8; +} + +u16 Value::GetU16() const { + if (IsIdentity()) + return inner.inst->GetArg(0).GetU16(); + ASSERT(type == Type::U16); + return inner.imm_u16; +} + +u32 Value::GetU32() const { + if (IsIdentity()) + return inner.inst->GetArg(0).GetU32(); + ASSERT(type == Type::U32); + return inner.imm_u32; +} + +u64 Value::GetU64() const { + if (IsIdentity()) + return inner.inst->GetArg(0).GetU64(); + ASSERT(type == Type::U64); + return inner.imm_u64; +} + +Value::CoprocessorInfo Value::GetCoprocInfo() const { + if (IsIdentity()) + return inner.inst->GetArg(0).GetCoprocInfo(); + ASSERT(type == Type::CoprocInfo); + return inner.imm_coproc; +} + +Cond Value::GetCond() const { + if (IsIdentity()) + return inner.inst->GetArg(0).GetCond(); + ASSERT(type == Type::Cond); + return inner.imm_cond; +} + +AccType Value::GetAccType() const { + if (IsIdentity()) + return inner.inst->GetArg(0).GetAccType(); + ASSERT(type == Type::AccType); + return inner.imm_acctype; +} + +s64 Value::GetImmediateAsS64() const { + ASSERT(IsImmediate()); + + switch (GetType()) { + case IR::Type::U1: + return s64(GetU1()); + case IR::Type::U8: + return s64(mcl::bit::sign_extend<8, u64>(GetU8())); + case IR::Type::U16: + return s64(mcl::bit::sign_extend<16, u64>(GetU16())); + case IR::Type::U32: + return s64(mcl::bit::sign_extend<32, u64>(GetU32())); + case IR::Type::U64: + return s64(GetU64()); + default: + ASSERT_FALSE("GetImmediateAsS64 called on an incompatible Value type."); + } +} + +u64 Value::GetImmediateAsU64() const { + ASSERT(IsImmediate()); + + switch (GetType()) { + case IR::Type::U1: + return u64(GetU1()); + case IR::Type::U8: + return u64(GetU8()); + case IR::Type::U16: + return u64(GetU16()); + case IR::Type::U32: + return u64(GetU32()); + case IR::Type::U64: + return u64(GetU64()); + default: + ASSERT_FALSE("GetImmediateAsU64 called on an incompatible Value type."); + } +} + +bool Value::IsSignedImmediate(s64 value) const { + return IsImmediate() && GetImmediateAsS64() == value; +} + +bool Value::IsUnsignedImmediate(u64 value) const { + return IsImmediate() && GetImmediateAsU64() == value; +} + +bool Value::HasAllBitsSet() const { + return IsSignedImmediate(-1); +} + +bool Value::IsZero() const { + return IsUnsignedImmediate(0); +} + +} // namespace Dynarmic::IR diff --git a/externals/dynarmic/src/dynarmic/ir/value.h b/externals/dynarmic/src/dynarmic/ir/value.h new file mode 100644 index 0000000000..8f70110a8d --- /dev/null +++ b/externals/dynarmic/src/dynarmic/ir/value.h @@ -0,0 +1,187 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include <array> +#include <type_traits> + +#include <mcl/assert.hpp> +#include <mcl/stdint.hpp> + +#include "dynarmic/ir/type.h" + +namespace Dynarmic::A32 { +enum class ExtReg; +enum class Reg; +} // namespace Dynarmic::A32 + +namespace Dynarmic::A64 { +enum class Reg; +enum class Vec; +} // namespace Dynarmic::A64 + +namespace Dynarmic::IR { + +class Inst; +enum class AccType; +enum class Cond; + +/** + * A representation of a value in the IR. + * A value may either be an immediate or the result of a microinstruction. + */ +class Value { +public: + using CoprocessorInfo = std::array<u8, 8>; + + Value() + : type(Type::Void) {} + explicit Value(Inst* value); + explicit Value(A32::Reg value); + explicit Value(A32::ExtReg value); + explicit Value(A64::Reg value); + explicit Value(A64::Vec value); + explicit Value(bool value); + explicit Value(u8 value); + explicit Value(u16 value); + explicit Value(u32 value); + explicit Value(u64 value); + explicit Value(CoprocessorInfo value); + explicit Value(Cond value); + explicit Value(AccType value); + + static Value EmptyNZCVImmediateMarker(); + + bool IsIdentity() const; + bool IsEmpty() const; + bool IsImmediate() const; + Type GetType() const; + + Inst* GetInst() const; + Inst* GetInstRecursive() const; + A32::Reg GetA32RegRef() const; + A32::ExtReg GetA32ExtRegRef() const; + A64::Reg GetA64RegRef() const; + A64::Vec GetA64VecRef() const; + bool GetU1() const; + u8 GetU8() const; + u16 GetU16() const; + u32 GetU32() const; + u64 GetU64() const; + CoprocessorInfo GetCoprocInfo() const; + Cond GetCond() const; + AccType GetAccType() const; + + /** + * Retrieves the immediate of a Value instance as a signed 64-bit value. + * + * @pre The value contains either a U1, U8, U16, U32, or U64 value. + * Breaking this precondition will cause an assertion to be invoked. + */ + s64 GetImmediateAsS64() const; + + /** + * Retrieves the immediate of a Value instance as an unsigned 64-bit value. + * + * @pre The value contains either a U1, U8, U16, U32, or U64 value. + * Breaking this precondition will cause an assertion to be invoked. + */ + u64 GetImmediateAsU64() const; + + /** + * Determines whether or not the contained value matches the provided signed one. + * + * Note that this function will always return false if the contained + * value is not a constant value. In other words, if IsImmediate() + * would return false on an instance, then so will this function. + * + * @param value The value to check against the contained value. + */ + bool IsSignedImmediate(s64 value) const; + + /** + * Determines whether or not the contained value matches the provided unsigned one. + * + * Note that this function will always return false if the contained + * value is not a constant value. In other words, if IsImmediate() + * would return false on an instance, then so will this function. + * + * @param value The value to check against the contained value. + */ + bool IsUnsignedImmediate(u64 value) const; + + /** + * Determines whether or not the contained constant value has all bits set. + * + * @pre The value contains either a U1, U8, U16, U32, or U64 value. + * Breaking this precondition will cause an assertion to be invoked. + */ + bool HasAllBitsSet() const; + + /** + * Whether or not the current value contains a representation of zero. + * + * Note that this function will always return false if the contained + * value is not a constant value. 
In other words, if IsImmediate() + * would return false on an instance, then so will this function. + */ + bool IsZero() const; + +private: + Type type; + + union { + Inst* inst; // type == Type::Opaque + A32::Reg imm_a32regref; + A32::ExtReg imm_a32extregref; + A64::Reg imm_a64regref; + A64::Vec imm_a64vecref; + bool imm_u1; + u8 imm_u8; + u16 imm_u16; + u32 imm_u32; + u64 imm_u64; + CoprocessorInfo imm_coproc; + Cond imm_cond; + AccType imm_acctype; + } inner; +}; +static_assert(sizeof(Value) <= 2 * sizeof(u64), "IR::Value should be kept small in size"); + +template<Type type_> +class TypedValue final : public Value { +public: + TypedValue() = default; + + template<Type other_type, typename = std::enable_if_t<(other_type & type_) != Type::Void>> + /* implicit */ TypedValue(const TypedValue<other_type>& value) + : Value(value) { + ASSERT((value.GetType() & type_) != Type::Void); + } + + explicit TypedValue(const Value& value) + : Value(value) { + ASSERT((value.GetType() & type_) != Type::Void); + } + + explicit TypedValue(Inst* inst) + : TypedValue(Value(inst)) {} +}; + +using U1 = TypedValue<Type::U1>; +using U8 = TypedValue<Type::U8>; +using U16 = TypedValue<Type::U16>; +using U32 = TypedValue<Type::U32>; +using U64 = TypedValue<Type::U64>; +using U128 = TypedValue<Type::U128>; +using U32U64 = TypedValue<Type::U32 | Type::U64>; +using U16U32U64 = TypedValue<Type::U16 | Type::U32 | Type::U64>; +using UAny = TypedValue<Type::U8 | Type::U16 | Type::U32 | Type::U64>; +using UAnyU128 = TypedValue<Type::U8 | Type::U16 | Type::U32 | Type::U64 | Type::U128>; +using NZCV = TypedValue<Type::NZCVFlags>; +using Table = TypedValue<Type::Table>; + +} // namespace Dynarmic::IR
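For orientation, the following is a minimal sketch of how the pass entry points declared in ir/opt/passes.h above could be chained over a translated block. The selection and ordering shown here are illustrative assumptions, not the project's actual pipeline: the real invocations live in the A32/A64 frontends, which are outside this listing, and RunExamplePipeline is a hypothetical helper.

#include "dynarmic/ir/basic_block.h"
#include "dynarmic/ir/opt/passes.h"

namespace Example {

// Hypothetical driver: applies a plausible subset of the passes above to one block.
void RunExamplePipeline(Dynarmic::IR::Block& block) {
    namespace Opt = Dynarmic::Optimization;

    Opt::PolyfillPass(block, Opt::PolyfillOptions{});  // returns immediately when every option is false
    Opt::A64GetSetElimination(block);                  // forward register/flag writes to later reads in the block
    Opt::ConstantPropagation(block);                   // fold immediates (FoldAdd, FoldAND, ...)
    Opt::DeadCodeElimination(block);                   // drop instructions with no uses and no side effects
    Opt::IdentityRemovalPass(block);                   // strip Identity/Void instructions and rewire their arguments
    Opt::VerificationPass(block);                      // assert argument types and use counts are consistent
}

}  // namespace Example

An A32 pipeline would presumably substitute A32GetSetElimination (with its A32GetSetEliminationOptions) and, where callbacks are available, A32ConstantMemoryReads for the A64-specific pass.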