diff options
Diffstat (limited to 'src')
22 files changed, 336 insertions, 164 deletions
diff --git a/src/citra_qt/util/util.h b/src/citra_qt/util/util.h index 0787f9eb7b..98a9440477 100644 --- a/src/citra_qt/util/util.h +++ b/src/citra_qt/util/util.h @@ -2,6 +2,8 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#pragma once + #include <QFont> /// Returns a QFont object appropriate to use as a monospace font for debugging widgets, etc. diff --git a/src/common/microprofile.h b/src/common/microprofile.h index 9eb6016a85..d3b6cb97cd 100644 --- a/src/common/microprofile.h +++ b/src/common/microprofile.h @@ -11,6 +11,11 @@ #define MICROPROFILE_CONTEXT_SWITCH_TRACE 0 #define MICROPROFILE_PER_THREAD_BUFFER_SIZE (2048<<12) // 8 MB +#ifdef _WIN32 +// This isn't defined by the standard library in MSVC2015 +typedef void* HANDLE; +#endif + #include <microprofile.h> #define MP_RGB(r, g, b) ((r) << 16 | (g) << 8 | (b) << 0) diff --git a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp index 01c712f249..0fddb07a0a 100644 --- a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp +++ b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp @@ -49,65 +49,47 @@ enum { typedef unsigned int (*shtop_fp_t)(ARMul_State* cpu, unsigned int sht_oper); -static int CondPassed(ARMul_State* cpu, unsigned int cond) { - const u32 NFLAG = cpu->NFlag; - const u32 ZFLAG = cpu->ZFlag; - const u32 CFLAG = cpu->CFlag; - const u32 VFLAG = cpu->VFlag; - - int temp = 0; +static bool CondPassed(ARMul_State* cpu, unsigned int cond) { + const bool n_flag = cpu->NFlag != 0; + const bool z_flag = cpu->ZFlag != 0; + const bool c_flag = cpu->CFlag != 0; + const bool v_flag = cpu->VFlag != 0; switch (cond) { - case 0x0: - temp = ZFLAG; - break; - case 0x1: // NE - temp = !ZFLAG; - break; - case 0x2: // CS - temp = CFLAG; - break; - case 0x3: // CC - temp = !CFLAG; - break; - case 0x4: // MI - temp = NFLAG; - break; - case 0x5: // PL - temp = !NFLAG; - break; - case 0x6: // VS - temp = VFLAG; - break; - case 0x7: // VC - temp = !VFLAG; - break; - case 0x8: // HI - temp = (CFLAG && !ZFLAG); - break; - case 0x9: // LS - temp = (!CFLAG || ZFLAG); - break; - case 0xa: // GE - temp = ((!NFLAG && !VFLAG) || (NFLAG && VFLAG)); - break; - case 0xb: // LT - temp = ((NFLAG && !VFLAG) || (!NFLAG && VFLAG)); - break; - case 0xc: // GT - temp = ((!NFLAG && !VFLAG && !ZFLAG) || (NFLAG && VFLAG && !ZFLAG)); - break; - case 0xd: // LE - temp = ((NFLAG && !VFLAG) || (!NFLAG && VFLAG)) || ZFLAG; - break; - case 0xe: // AL - temp = 1; - break; - case 0xf: - temp = 1; - break; - } - return temp; + case ConditionCode::EQ: + return z_flag; + case ConditionCode::NE: + return !z_flag; + case ConditionCode::CS: + return c_flag; + case ConditionCode::CC: + return !c_flag; + case ConditionCode::MI: + return n_flag; + case ConditionCode::PL: + return !n_flag; + case ConditionCode::VS: + return v_flag; + case ConditionCode::VC: + return !v_flag; + case ConditionCode::HI: + return (c_flag && !z_flag); + case ConditionCode::LS: + return (!c_flag || z_flag); + case ConditionCode::GE: + return (n_flag == v_flag); + case ConditionCode::LT: + return (n_flag != v_flag); + case ConditionCode::GT: + return (!z_flag && (n_flag == v_flag)); + case ConditionCode::LE: + return (z_flag || (n_flag != v_flag)); + case ConditionCode::AL: + case ConditionCode::NV: // Unconditional + return true; + } + + return false; } static unsigned int DPO(Immediate)(ARMul_State* cpu, unsigned int sht_oper) { diff --git a/src/core/file_sys/disk_archive.cpp b/src/core/file_sys/disk_archive.cpp index 1096fd34da..e9ecd2b1cc 100644 --- a/src/core/file_sys/disk_archive.cpp +++ b/src/core/file_sys/disk_archive.cpp @@ -102,7 +102,7 @@ bool DiskFile::Open() { mode_string += "b"; file = Common::make_unique<FileUtil::IOFile>(path, mode_string.c_str()); - return true; + return file->IsOpen(); } size_t DiskFile::Read(const u64 offset, const size_t length, u8* buffer) const { diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp index 124047a53f..c2b4963d4f 100644 --- a/src/core/hle/kernel/process.cpp +++ b/src/core/hle/kernel/process.cpp @@ -129,7 +129,7 @@ void Process::Run(s32 main_thread_priority, u32 stack_size) { } VAddr Process::GetLinearHeapBase() const { - return (kernel_version < 0x22C ? Memory::LINEAR_HEAP_VADDR : Memory::NEW_LINEAR_HEAP_SIZE) + return (kernel_version < 0x22C ? Memory::LINEAR_HEAP_VADDR : Memory::NEW_LINEAR_HEAP_VADDR) + memory_region->base; } @@ -174,6 +174,10 @@ ResultCode Process::HeapFree(VAddr target, u32 size) { return ERR_INVALID_ADDRESS; } + if (size == 0) { + return RESULT_SUCCESS; + } + ResultCode result = vm_manager.UnmapRange(target, size); if (result.IsError()) return result; @@ -226,6 +230,10 @@ ResultCode Process::LinearFree(VAddr target, u32 size) { return ERR_INVALID_ADDRESS; } + if (size == 0) { + return RESULT_SUCCESS; + } + VAddr heap_end = GetLinearHeapBase() + (u32)linheap_memory->size(); if (target + size > heap_end) { return ERR_INVALID_ADDRESS_STATE; diff --git a/src/core/hle/kernel/shared_memory.cpp b/src/core/hle/kernel/shared_memory.cpp index 4137683b5d..1f477664b7 100644 --- a/src/core/hle/kernel/shared_memory.cpp +++ b/src/core/hle/kernel/shared_memory.cpp @@ -20,6 +20,7 @@ SharedPtr<SharedMemory> SharedMemory::Create(u32 size, MemoryPermission permissi shared_memory->name = std::move(name); shared_memory->base_address = 0x0; + shared_memory->fixed_address = 0x0; shared_memory->size = size; shared_memory->permissions = permissions; shared_memory->other_permissions = other_permissions; @@ -30,9 +31,31 @@ SharedPtr<SharedMemory> SharedMemory::Create(u32 size, MemoryPermission permissi ResultCode SharedMemory::Map(VAddr address, MemoryPermission permissions, MemoryPermission other_permissions) { + if (base_address != 0) { + LOG_ERROR(Kernel, "cannot map id=%u, address=0x%08X name=%s: already mapped at 0x%08X!", + GetObjectId(), address, name.c_str(), base_address); + // TODO: Verify error code with hardware + return ResultCode(ErrorDescription::InvalidAddress, ErrorModule::Kernel, + ErrorSummary::InvalidArgument, ErrorLevel::Permanent); + } + + if (fixed_address != 0) { + if (address != 0 && address != fixed_address) { + LOG_ERROR(Kernel, "cannot map id=%u, address=0x%08X name=%s: fixed_addres is 0x%08X!", + GetObjectId(), address, name.c_str(), fixed_address); + // TODO: Verify error code with hardware + return ResultCode(ErrorDescription::InvalidAddress, ErrorModule::Kernel, + ErrorSummary::InvalidArgument, ErrorLevel::Permanent); + } + + // HACK(yuriks): This is only here to support the APT shared font mapping right now. + // Later, this should actually map the memory block onto the address space. + return RESULT_SUCCESS; + } + if (address < Memory::SHARED_MEMORY_VADDR || address + size >= Memory::SHARED_MEMORY_VADDR_END) { - LOG_ERROR(Kernel, "cannot map id=%u, address=0x%08X outside of shared mem bounds!", - GetObjectId(), address); + LOG_ERROR(Kernel, "cannot map id=%u, address=0x%08X name=%s outside of shared mem bounds!", + GetObjectId(), address, name.c_str()); // TODO: Verify error code with hardware return ResultCode(ErrorDescription::InvalidAddress, ErrorModule::Kernel, ErrorSummary::InvalidArgument, ErrorLevel::Permanent); diff --git a/src/core/hle/kernel/shared_memory.h b/src/core/hle/kernel/shared_memory.h index 7a29227762..35b550d12c 100644 --- a/src/core/hle/kernel/shared_memory.h +++ b/src/core/hle/kernel/shared_memory.h @@ -61,6 +61,8 @@ public: /// Address of shared memory block in the process. VAddr base_address; + /// Fixed address to allow mapping to. Used for blocks created from the linear heap. + VAddr fixed_address; /// Size of the memory block. Page-aligned. u32 size; /// Permission restrictions applied to the process which created the block. diff --git a/src/core/hle/service/apt/apt.cpp b/src/core/hle/service/apt/apt.cpp index 6a2fdea2bf..ba66569b41 100644 --- a/src/core/hle/service/apt/apt.cpp +++ b/src/core/hle/service/apt/apt.cpp @@ -78,8 +78,8 @@ void GetSharedFont(Service::Interface* self) { if (shared_font != nullptr) { // TODO(yuriks): This is a hack to keep this working right now even with our completely // broken shared memory system. - shared_font_mem->base_address = SHARED_FONT_VADDR; - Kernel::g_current_process->vm_manager.MapMemoryBlock(shared_font_mem->base_address, + shared_font_mem->fixed_address = SHARED_FONT_VADDR; + Kernel::g_current_process->vm_manager.MapMemoryBlock(shared_font_mem->fixed_address, shared_font, 0, shared_font_mem->size, Kernel::MemoryState::Shared); cmd_buff[0] = IPC::MakeHeader(0x44, 2, 2); diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index a78985510b..682be89ec5 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -235,7 +235,8 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { for (unsigned int index = 0; index < regs.num_vertices; ++index) { - unsigned int vertex = is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) : index; + // Indexed rendering doesn't use the start offset + unsigned int vertex = is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) : (index + regs.vertex_offset); // -1 is a common special value used for primitive restart. Since it's unknown if // the PICA supports it, and it would mess up the caching, guard against it here. diff --git a/src/video_core/pica.cpp b/src/video_core/pica.cpp index c73a8178ea..61983bc6cf 100644 --- a/src/video_core/pica.cpp +++ b/src/video_core/pica.cpp @@ -49,11 +49,13 @@ std::string Regs::GetCommandName(int index) { ADD_FIELD(vertex_attributes); ADD_FIELD(index_array); ADD_FIELD(num_vertices); + ADD_FIELD(vertex_offset); ADD_FIELD(trigger_draw); ADD_FIELD(trigger_draw_indexed); ADD_FIELD(vs_default_attributes_setup); ADD_FIELD(command_buffer); ADD_FIELD(triangle_topology); + ADD_FIELD(restart_primitive); ADD_FIELD(gs.bool_uniforms); ADD_FIELD(gs.int_uniforms); ADD_FIELD(gs.main_offset); diff --git a/src/video_core/pica.h b/src/video_core/pica.h index 58b924f9e1..855cb442e1 100644 --- a/src/video_core/pica.h +++ b/src/video_core/pica.h @@ -441,8 +441,14 @@ struct Regs { }; enum class StencilAction : u32 { - Keep = 0, - Xor = 5, + Keep = 0, + Zero = 1, + Replace = 2, + Increment = 3, + Decrement = 4, + Invert = 5, + IncrementWrap = 6, + DecrementWrap = 7 }; struct { @@ -481,23 +487,29 @@ struct Regs { struct { union { + // Raw value of this register + u32 raw_func; + // If true, enable stencil testing BitField< 0, 1, u32> enable; // Comparison operation for stencil testing BitField< 4, 3, CompareFunc> func; - // Value to calculate the new stencil value from - BitField< 8, 8, u32> replacement_value; + // Mask used to control writing to the stencil buffer + BitField< 8, 8, u32> write_mask; // Value to compare against for stencil testing BitField<16, 8, u32> reference_value; // Mask to apply on stencil test inputs - BitField<24, 8, u32> mask; + BitField<24, 8, u32> input_mask; }; union { + // Raw value of this register + u32 raw_op; + // Action to perform when the stencil test fails BitField< 0, 3, StencilAction> action_stencil_fail; @@ -757,7 +769,12 @@ struct Regs { // Number of vertices to render u32 num_vertices; - INSERT_PADDING_WORDS(0x5); + INSERT_PADDING_WORDS(0x1); + + // The index of the first vertex to render + u32 vertex_offset; + + INSERT_PADDING_WORDS(0x3); // These two trigger rendering of triangles u32 trigger_draw; @@ -811,7 +828,9 @@ struct Regs { BitField<8, 2, TriangleTopology> triangle_topology; - INSERT_PADDING_WORDS(0x21); + u32 restart_primitive; + + INSERT_PADDING_WORDS(0x20); struct ShaderConfig { BitField<0, 16, u32> bool_uniforms; @@ -980,11 +999,13 @@ ASSERT_REG_POSITION(framebuffer, 0x110); ASSERT_REG_POSITION(vertex_attributes, 0x200); ASSERT_REG_POSITION(index_array, 0x227); ASSERT_REG_POSITION(num_vertices, 0x228); +ASSERT_REG_POSITION(vertex_offset, 0x22a); ASSERT_REG_POSITION(trigger_draw, 0x22e); ASSERT_REG_POSITION(trigger_draw_indexed, 0x22f); ASSERT_REG_POSITION(vs_default_attributes_setup, 0x232); ASSERT_REG_POSITION(command_buffer, 0x238); ASSERT_REG_POSITION(triangle_topology, 0x25e); +ASSERT_REG_POSITION(restart_primitive, 0x25f); ASSERT_REG_POSITION(gs, 0x280); ASSERT_REG_POSITION(vs, 0x2b0); @@ -1021,12 +1042,20 @@ struct float24 { return ret; } + static float24 Zero() { + return FromFloat32(0.f); + } + // Not recommended for anything but logging float ToFloat32() const { return value; } float24 operator * (const float24& flt) const { + if ((this->value == 0.f && !std::isnan(flt.value)) || + (flt.value == 0.f && !std::isnan(this->value))) + // PICA gives 0 instead of NaN when multiplying by inf + return Zero(); return float24::FromFloat32(ToFloat32() * flt.ToFloat32()); } @@ -1043,7 +1072,11 @@ struct float24 { } float24& operator *= (const float24& flt) { - value *= flt.ToFloat32(); + if ((this->value == 0.f && !std::isnan(flt.value)) || + (flt.value == 0.f && !std::isnan(this->value))) + // PICA gives 0 instead of NaN when multiplying by inf + *this = Zero(); + else value *= flt.ToFloat32(); return *this; } diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp index 4a159da8e7..77eadda9e9 100644 --- a/src/video_core/rasterizer.cpp +++ b/src/video_core/rasterizer.cpp @@ -216,14 +216,33 @@ static void SetStencil(int x, int y, u8 value) { } } -// TODO: Should the stencil mask be applied to the "dest" or "ref" operands? Most likely not! -static u8 PerformStencilAction(Regs::StencilAction action, u8 dest, u8 ref) { +static u8 PerformStencilAction(Regs::StencilAction action, u8 old_stencil, u8 ref) { switch (action) { case Regs::StencilAction::Keep: - return dest; + return old_stencil; - case Regs::StencilAction::Xor: - return dest ^ ref; + case Regs::StencilAction::Zero: + return 0; + + case Regs::StencilAction::Replace: + return ref; + + case Regs::StencilAction::Increment: + // Saturated increment + return std::min<u8>(old_stencil, 254) + 1; + + case Regs::StencilAction::Decrement: + // Saturated decrement + return std::max<u8>(old_stencil, 1) - 1; + + case Regs::StencilAction::Invert: + return ~old_stencil; + + case Regs::StencilAction::IncrementWrap: + return old_stencil + 1; + + case Regs::StencilAction::DecrementWrap: + return old_stencil - 1; default: LOG_CRITICAL(HW_GPU, "Unknown stencil action %x", (int)action); @@ -783,10 +802,16 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0, } u8 old_stencil = 0; + + auto UpdateStencil = [stencil_test, x, y, &old_stencil](Pica::Regs::StencilAction action) { + u8 new_stencil = PerformStencilAction(action, old_stencil, stencil_test.reference_value); + SetStencil(x >> 4, y >> 4, (new_stencil & stencil_test.write_mask) | (old_stencil & ~stencil_test.write_mask)); + }; + if (stencil_action_enable) { old_stencil = GetStencil(x >> 4, y >> 4); - u8 dest = old_stencil & stencil_test.mask; - u8 ref = stencil_test.reference_value & stencil_test.mask; + u8 dest = old_stencil & stencil_test.input_mask; + u8 ref = stencil_test.reference_value & stencil_test.input_mask; bool pass = false; switch (stencil_test.func) { @@ -824,8 +849,7 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0, } if (!pass) { - u8 new_stencil = PerformStencilAction(stencil_test.action_stencil_fail, old_stencil, stencil_test.replacement_value); - SetStencil(x >> 4, y >> 4, new_stencil); + UpdateStencil(stencil_test.action_stencil_fail); continue; } } @@ -875,23 +899,19 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0, } if (!pass) { - if (stencil_action_enable) { - u8 new_stencil = PerformStencilAction(stencil_test.action_depth_fail, old_stencil, stencil_test.replacement_value); - SetStencil(x >> 4, y >> 4, new_stencil); - } + if (stencil_action_enable) + UpdateStencil(stencil_test.action_depth_fail); continue; } if (output_merger.depth_write_enable) SetDepth(x >> 4, y >> 4, z); - - if (stencil_action_enable) { - // TODO: What happens if stencil testing is enabled, but depth testing is not? Will stencil get updated anyway? - u8 new_stencil = PerformStencilAction(stencil_test.action_depth_pass, old_stencil, stencil_test.replacement_value); - SetStencil(x >> 4, y >> 4, new_stencil); - } } + // The stencil depth_pass action is executed even if depth testing is disabled + if (stencil_action_enable) + UpdateStencil(stencil_test.action_depth_pass); + auto dest = GetPixel(x >> 4, y >> 4); Math::Vec4<u8> blend_output = combiner_output; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index deb9971bb7..d29049508d 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -231,8 +231,8 @@ void RasterizerOpenGL::DrawTriangles() { u32 cur_fb_depth_size = Pica::Regs::BytesPerDepthPixel(regs.framebuffer.depth_format) * regs.framebuffer.GetWidth() * regs.framebuffer.GetHeight(); - res_cache.NotifyFlush(cur_fb_color_addr, cur_fb_color_size); - res_cache.NotifyFlush(cur_fb_depth_addr, cur_fb_depth_size); + res_cache.NotifyFlush(cur_fb_color_addr, cur_fb_color_size, true); + res_cache.NotifyFlush(cur_fb_depth_addr, cur_fb_depth_size, true); } void RasterizerOpenGL::CommitFramebuffer() { @@ -269,7 +269,8 @@ void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) { break; // Stencil test - case PICA_REG_INDEX(output_merger.stencil_test): + case PICA_REG_INDEX(output_merger.stencil_test.raw_func): + case PICA_REG_INDEX(output_merger.stencil_test.raw_op): SyncStencilTest(); break; @@ -676,7 +677,15 @@ void RasterizerOpenGL::SyncLogicOp() { } void RasterizerOpenGL::SyncStencilTest() { - // TODO: Implement stencil test, mask, and op + const auto& regs = Pica::g_state.regs; + state.stencil.test_enabled = regs.output_merger.stencil_test.enable && regs.framebuffer.depth_format == Pica::Regs::DepthFormat::D24S8; + state.stencil.test_func = PicaToGL::CompareFunc(regs.output_merger.stencil_test.func); + state.stencil.test_ref = regs.output_merger.stencil_test.reference_value; + state.stencil.test_mask = regs.output_merger.stencil_test.input_mask; + state.stencil.write_mask = regs.output_merger.stencil_test.write_mask; + state.stencil.action_stencil_fail = PicaToGL::StencilOp(regs.output_merger.stencil_test.action_stencil_fail); + state.stencil.action_depth_fail = PicaToGL::StencilOp(regs.output_merger.stencil_test.action_depth_fail); + state.stencil.action_depth_pass = PicaToGL::StencilOp(regs.output_merger.stencil_test.action_depth_pass); } void RasterizerOpenGL::SyncDepthTest() { @@ -867,8 +876,15 @@ void RasterizerOpenGL::ReloadDepthBuffer() { state.Apply(); glActiveTexture(GL_TEXTURE0); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, fb_depth_texture.width, fb_depth_texture.height, - fb_depth_texture.gl_format, fb_depth_texture.gl_type, temp_fb_depth_buffer.get()); + if (fb_depth_texture.format == Pica::Regs::DepthFormat::D24S8) { + // TODO(Subv): There is a bug with Intel Windows drivers that makes glTexSubImage2D not change the stencil buffer. + // The bug has been reported to Intel (https://communities.intel.com/message/324464) + glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH24_STENCIL8, fb_depth_texture.width, fb_depth_texture.height, 0, + GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, temp_fb_depth_buffer.get()); + } else { + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, fb_depth_texture.width, fb_depth_texture.height, + fb_depth_texture.gl_format, fb_depth_texture.gl_type, temp_fb_depth_buffer.get()); + } state.texture_units[0].texture_2d = 0; state.Apply(); diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp index e4247051c7..1e38c2e6d3 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp @@ -2,6 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include "common/hash.h" #include "common/make_unique.h" #include "common/math_util.h" #include "common/microprofile.h" @@ -21,7 +22,6 @@ MICROPROFILE_DEFINE(OpenGL_TextureUpload, "OpenGL", "Texture Upload", MP_RGB(128 void RasterizerCacheOpenGL::LoadAndBindTexture(OpenGLState &state, unsigned texture_unit, const Pica::Regs::FullTextureConfig& config) { PAddr texture_addr = config.config.GetPhysicalAddress(); - const auto cached_texture = texture_cache.find(texture_addr); if (cached_texture != texture_cache.end()) { @@ -51,12 +51,14 @@ void RasterizerCacheOpenGL::LoadAndBindTexture(OpenGLState &state, unsigned text } const auto info = Pica::DebugUtils::TextureInfo::FromPicaRegister(config.config, config.format); + u8* texture_src_data = Memory::GetPhysicalPointer(texture_addr); new_texture->width = info.width; new_texture->height = info.height; - new_texture->size = info.width * info.height * Pica::Regs::NibblesPerPixel(info.format); + new_texture->size = info.stride * info.height; + new_texture->addr = texture_addr; + new_texture->hash = Common::ComputeHash64(texture_src_data, new_texture->size); - u8* texture_src_data = Memory::GetPhysicalPointer(texture_addr); std::unique_ptr<Math::Vec4<u8>[]> temp_texture_buffer_rgba(new Math::Vec4<u8>[info.width * info.height]); for (int y = 0; y < info.height; ++y) { @@ -71,12 +73,18 @@ void RasterizerCacheOpenGL::LoadAndBindTexture(OpenGLState &state, unsigned text } } -void RasterizerCacheOpenGL::NotifyFlush(PAddr addr, u32 size) { +void RasterizerCacheOpenGL::NotifyFlush(PAddr addr, u32 size, bool ignore_hash) { // Flush any texture that falls in the flushed region // TODO: Optimize by also inserting upper bound (addr + size) of each texture into the same map and also narrow using lower_bound auto cache_upper_bound = texture_cache.upper_bound(addr + size); + for (auto it = texture_cache.begin(); it != cache_upper_bound;) { - if (MathUtil::IntervalsIntersect(addr, size, it->first, it->second->size)) { + const auto& info = *it->second; + + // Flush the texture only if the memory region intersects and a change is detected + if (MathUtil::IntervalsIntersect(addr, size, info.addr, info.size) && + (ignore_hash || info.hash != Common::ComputeHash64(Memory::GetPhysicalPointer(info.addr), info.size))) { + it = texture_cache.erase(it); } else { ++it; diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h index 96f3a925c5..d8f9edf59e 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h @@ -19,7 +19,7 @@ public: void LoadAndBindTexture(OpenGLState &state, unsigned texture_unit, const Pica::Regs::FullTextureConfig& config); /// Flush any cached resource that touches the flushed region - void NotifyFlush(PAddr addr, u32 size); + void NotifyFlush(PAddr addr, u32 size, bool ignore_hash = false); /// Flush all cached OpenGL resources tracked by this cache manager void FullFlush(); @@ -30,6 +30,8 @@ private: GLuint width; GLuint height; u32 size; + u64 hash; + PAddr addr; }; std::map<PAddr, std::unique_ptr<CachedTexture>> texture_cache; diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp index 871324014f..ba47ce8b83 100644 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ b/src/video_core/renderer_opengl/gl_state.cpp @@ -26,6 +26,9 @@ OpenGLState::OpenGLState() { stencil.test_ref = 0; stencil.test_mask = -1; stencil.write_mask = -1; + stencil.action_depth_fail = GL_KEEP; + stencil.action_depth_pass = GL_KEEP; + stencil.action_stencil_fail = GL_KEEP; blend.enabled = false; blend.src_rgb_func = GL_ONE; @@ -105,6 +108,12 @@ void OpenGLState::Apply() { glStencilFunc(stencil.test_func, stencil.test_ref, stencil.test_mask); } + if (stencil.action_depth_fail != cur_state.stencil.action_depth_fail || + stencil.action_depth_pass != cur_state.stencil.action_depth_pass || + stencil.action_stencil_fail != cur_state.stencil.action_stencil_fail) { + glStencilOp(stencil.action_stencil_fail, stencil.action_depth_fail, stencil.action_depth_pass); + } + // Stencil mask if (stencil.write_mask != cur_state.stencil.write_mask) { glStencilMask(stencil.write_mask); diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h index 3e23790219..81e7e0877b 100644 --- a/src/video_core/renderer_opengl/gl_state.h +++ b/src/video_core/renderer_opengl/gl_state.h @@ -32,6 +32,9 @@ public: GLint test_ref; // GL_STENCIL_REF GLuint test_mask; // GL_STENCIL_VALUE_MASK GLuint write_mask; // GL_STENCIL_WRITEMASK + GLenum action_stencil_fail; // GL_STENCIL_FAIL + GLenum action_depth_fail; // GL_STENCIL_PASS_DEPTH_FAIL + GLenum action_depth_pass; // GL_STENCIL_PASS_DEPTH_PASS } stencil; struct { diff --git a/src/video_core/renderer_opengl/pica_to_gl.h b/src/video_core/renderer_opengl/pica_to_gl.h index 3b562da868..12806fad5e 100644 --- a/src/video_core/renderer_opengl/pica_to_gl.h +++ b/src/video_core/renderer_opengl/pica_to_gl.h @@ -152,6 +152,29 @@ inline GLenum CompareFunc(Pica::Regs::CompareFunc func) { return compare_func_table[(unsigned)func]; } +inline GLenum StencilOp(Pica::Regs::StencilAction action) { + static const GLenum stencil_op_table[] = { + GL_KEEP, // StencilAction::Keep + GL_ZERO, // StencilAction::Zero + GL_REPLACE, // StencilAction::Replace + GL_INCR, // StencilAction::Increment + GL_DECR, // StencilAction::Decrement + GL_INVERT, // StencilAction::Invert + GL_INCR_WRAP, // StencilAction::IncrementWrap + GL_DECR_WRAP // StencilAction::DecrementWrap + }; + + // Range check table for input + if ((unsigned)action >= ARRAY_SIZE(stencil_op_table)) { + LOG_CRITICAL(Render_OpenGL, "Unknown stencil op %d", action); + UNREACHABLE(); + + return GL_KEEP; + } + + return stencil_op_table[(unsigned)action]; +} + inline std::array<GLfloat, 4> ColorRGBA8(const u8* bytes) { return { { bytes[0] / 255.0f, bytes[1] / 255.0f, diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index be5588c007..f89117521f 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp @@ -52,7 +52,9 @@ void Setup(UnitState<false>& state) { } void Shutdown() { +#ifdef ARCHITECTURE_x86_64 shader_map.clear(); +#endif // ARCHITECTURE_x86_64 } static Common::Profiling::TimingCategory shader_category("Vertex Shader"); diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index ae5a304419..69e4efa689 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -177,7 +177,10 @@ void RunInterpreter(UnitState<Debug>& state) { if (!swizzle.DestComponentEnabled(i)) continue; - dest[i] = std::max(src1[i], src2[i]); + // NOTE: Exact form required to match NaN semantics to hardware: + // max(0, NaN) -> NaN + // max(NaN, 0) -> 0 + dest[i] = (src1[i] > src2[i]) ? src1[i] : src2[i]; } Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; @@ -190,7 +193,10 @@ void RunInterpreter(UnitState<Debug>& state) { if (!swizzle.DestComponentEnabled(i)) continue; - dest[i] = std::min(src1[i], src2[i]); + // NOTE: Exact form required to match NaN semantics to hardware: + // min(0, NaN) -> NaN + // min(NaN, 0) -> 0 + dest[i] = (src1[i] < src2[i]) ? src1[i] : src2[i]; } Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp index cc66fc8d61..c7b63a9b76 100644 --- a/src/video_core/shader/shader_jit_x64.cpp +++ b/src/video_core/shader/shader_jit_x64.cpp @@ -115,6 +115,8 @@ static const X64Reg SRC1 = XMM1; static const X64Reg SRC2 = XMM2; /// Loaded with the third swizzled source register, otherwise can be used as a scratch register static const X64Reg SRC3 = XMM3; +/// Additional scratch register +static const X64Reg SCRATCH2 = XMM4; /// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one static const X64Reg ONE = XMM14; /// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR @@ -227,8 +229,8 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) { u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); BLENDPS(SCRATCH, R(src), mask); } else { - MOVAPS(XMM4, R(src)); - UNPCKHPS(XMM4, R(SCRATCH)); // Unpack X/Y components of source and destination + MOVAPS(SCRATCH2, R(src)); + UNPCKHPS(SCRATCH2, R(SCRATCH)); // Unpack X/Y components of source and destination UNPCKLPS(SCRATCH, R(src)); // Unpack Z/W components of source and destination // Compute selector to selectively copy source components to destination for SHUFPS instruction @@ -236,7 +238,7 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) { ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) | ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) | ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6); - SHUFPS(SCRATCH, R(XMM4), sel); + SHUFPS(SCRATCH, R(SCRATCH2), sel); } // Store dest back to memory @@ -244,6 +246,19 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) { } } +void JitCompiler::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch) { + MOVAPS(scratch, R(src1)); + CMPPS(scratch, R(src2), CMP_ORD); + + MULPS(src1, R(src2)); + + MOVAPS(src2, R(src1)); + CMPPS(src2, R(src2), CMP_UNORD); + + XORPS(scratch, R(src2)); + ANDPS(src1, R(scratch)); +} + void JitCompiler::Compile_EvaluateCondition(Instruction instr) { // Note: NXOR is used below to check for equality switch (instr.flow_control.op) { @@ -307,21 +322,17 @@ void JitCompiler::Compile_DP3(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - if (Common::GetCPUCaps().sse4_1) { - DPPS(SRC1, R(SRC2), 0x7f); - } else { - MULPS(SRC1, R(SRC2)); + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); - MOVAPS(SRC2, R(SRC1)); - SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1)); + MOVAPS(SRC2, R(SRC1)); + SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1)); - MOVAPS(SRC3, R(SRC1)); - SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2)); + MOVAPS(SRC3, R(SRC1)); + SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2)); - SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); - ADDPS(SRC1, R(SRC2)); - ADDPS(SRC1, R(SRC3)); - } + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); + ADDPS(SRC1, R(SRC2)); + ADDPS(SRC1, R(SRC3)); Compile_DestEnable(instr, SRC1); } @@ -330,19 +341,15 @@ void JitCompiler::Compile_DP4(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - if (Common::GetCPUCaps().sse4_1) { - DPPS(SRC1, R(SRC2), 0xff); - } else { - MULPS(SRC1, R(SRC2)); + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); - MOVAPS(SRC2, R(SRC1)); - SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY - ADDPS(SRC1, R(SRC2)); + MOVAPS(SRC2, R(SRC1)); + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY + ADDPS(SRC1, R(SRC2)); - MOVAPS(SRC2, R(SRC1)); - SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX - ADDPS(SRC1, R(SRC2)); - } + MOVAPS(SRC2, R(SRC1)); + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX + ADDPS(SRC1, R(SRC2)); Compile_DestEnable(instr, SRC1); } @@ -359,23 +366,22 @@ void JitCompiler::Compile_DPH(Instruction instr) { if (Common::GetCPUCaps().sse4_1) { // Set 4th component to 1.0 BLENDPS(SRC1, R(ONE), 0x8); // 0b1000 - DPPS(SRC1, R(SRC2), 0xff); } else { - // Reverse to set the 4th component to 1.0 - SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); - MOVSS(SRC1, R(ONE)); - SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); + // Set 4th component to 1.0 + MOVAPS(SCRATCH, R(SRC1)); + UNPCKHPS(SCRATCH, R(ONE)); // XYZW, 1111 -> Z1__ + UNPCKLPD(SRC1, R(SCRATCH)); // XYZW, Z1__ -> XYZ1 + } - MULPS(SRC1, R(SRC2)); + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); - MOVAPS(SRC2, R(SRC1)); - SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY - ADDPS(SRC1, R(SRC2)); + MOVAPS(SRC2, R(SRC1)); + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY + ADDPS(SRC1, R(SRC2)); - MOVAPS(SRC2, R(SRC1)); - SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX - ADDPS(SRC1, R(SRC2)); - } + MOVAPS(SRC2, R(SRC1)); + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX + ADDPS(SRC1, R(SRC2)); Compile_DestEnable(instr, SRC1); } @@ -415,7 +421,7 @@ void JitCompiler::Compile_LG2(Instruction instr) { void JitCompiler::Compile_MUL(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - MULPS(SRC1, R(SRC2)); + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); Compile_DestEnable(instr, SRC1); } @@ -428,10 +434,10 @@ void JitCompiler::Compile_SGE(Instruction instr) { Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); } - CMPPS(SRC1, R(SRC2), CMP_NLT); - ANDPS(SRC1, R(ONE)); + CMPPS(SRC2, R(SRC1), CMP_LE); + ANDPS(SRC2, R(ONE)); - Compile_DestEnable(instr, SRC1); + Compile_DestEnable(instr, SRC2); } void JitCompiler::Compile_SLT(Instruction instr) { @@ -465,6 +471,7 @@ void JitCompiler::Compile_FLR(Instruction instr) { void JitCompiler::Compile_MAX(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. MAXPS(SRC1, R(SRC2)); Compile_DestEnable(instr, SRC1); } @@ -472,6 +479,7 @@ void JitCompiler::Compile_MAX(Instruction instr) { void JitCompiler::Compile_MIN(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. MINPS(SRC1, R(SRC2)); Compile_DestEnable(instr, SRC1); } @@ -485,8 +493,8 @@ void JitCompiler::Compile_MOVA(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); - // Convert floats to integers (only care about X and Y components) - CVTPS2DQ(SRC1, R(SRC1)); + // Convert floats to integers using truncation (only care about X and Y components) + CVTTPS2DQ(SRC1, R(SRC1)); // Get result MOVQ_xmm(R(RAX), SRC1); @@ -578,27 +586,42 @@ void JitCompiler::Compile_CALLU(Instruction instr) { } void JitCompiler::Compile_CMP(Instruction instr) { + using Op = Instruction::Common::CompareOpType::Op; + Op op_x = instr.common.compare_op.x; + Op op_y = instr.common.compare_op.y; + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); - static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_NLE, CMP_NLT }; + // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to + // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here + // because they don't match when used with NaNs. + static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE }; - if (instr.common.compare_op.x == instr.common.compare_op.y) { + bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual); + Gen::X64Reg lhs_x = invert_op_x ? SRC2 : SRC1; + Gen::X64Reg rhs_x = invert_op_x ? SRC1 : SRC2; + + if (op_x == op_y) { // Compare X-component and Y-component together - CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.x]); + CMPPS(lhs_x, R(rhs_x), cmp[op_x]); + MOVQ_xmm(R(COND0), lhs_x); - MOVQ_xmm(R(COND0), SRC1); MOV(64, R(COND1), R(COND0)); } else { + bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual); + Gen::X64Reg lhs_y = invert_op_y ? SRC2 : SRC1; + Gen::X64Reg rhs_y = invert_op_y ? SRC1 : SRC2; + // Compare X-component - MOVAPS(SCRATCH, R(SRC1)); - CMPSS(SCRATCH, R(SRC2), cmp[instr.common.compare_op.x]); + MOVAPS(SCRATCH, R(lhs_x)); + CMPSS(SCRATCH, R(rhs_x), cmp[op_x]); // Compare Y-component - CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.y]); + CMPPS(lhs_y, R(rhs_y), cmp[op_y]); MOVQ_xmm(R(COND0), SCRATCH); - MOVQ_xmm(R(COND1), SRC1); + MOVQ_xmm(R(COND1), lhs_y); } SHR(32, R(COND0), Imm8(31)); @@ -616,12 +639,8 @@ void JitCompiler::Compile_MAD(Instruction instr) { Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3); } - if (Common::GetCPUCaps().fma) { - VFMADD213PS(SRC1, SRC2, R(SRC3)); - } else { - MULPS(SRC1, R(SRC2)); - ADDPS(SRC1, R(SRC3)); - } + Compile_SanitizedMul(SRC1, SRC2, SCRATCH); + ADDPS(SRC1, R(SRC3)); Compile_DestEnable(instr, SRC1); } diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h index fbe19fe933..58828ecc8c 100644 --- a/src/video_core/shader/shader_jit_x64.h +++ b/src/video_core/shader/shader_jit_x64.h @@ -68,6 +68,12 @@ private: void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest); void Compile_DestEnable(Instruction instr, Gen::X64Reg dest); + /** + * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying + * zero by inf. Clobbers `src2` and `scratch`. + */ + void Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch); + void Compile_EvaluateCondition(Instruction instr); void Compile_UniformCondition(Instruction instr); |