22 files changed, 336 insertions, 164 deletions
diff --git a/src/citra_qt/util/util.h b/src/citra_qt/util/util.h
index 0787f9eb7b..98a9440477 100644
--- a/src/citra_qt/util/util.h
+++ b/src/citra_qt/util/util.h
@@ -2,6 +2,8 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#pragma once
+
 #include <QFont>
 
 /// Returns a QFont object appropriate to use as a monospace font for debugging widgets, etc.
diff --git a/src/common/microprofile.h b/src/common/microprofile.h
index 9eb6016a85..d3b6cb97cd 100644
--- a/src/common/microprofile.h
+++ b/src/common/microprofile.h
@@ -11,6 +11,11 @@
 #define MICROPROFILE_CONTEXT_SWITCH_TRACE 0
 #define MICROPROFILE_PER_THREAD_BUFFER_SIZE (2048<<12) // 8 MB
 
+#ifdef _WIN32
+// This isn't defined by the standard library in MSVC2015
+typedef void* HANDLE;
+#endif
+
 #include <microprofile.h>
 
 #define MP_RGB(r, g, b) ((r) << 16 | (g) << 8 | (b) << 0)
diff --git a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
index 01c712f249..0fddb07a0a 100644
--- a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
+++ b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
@@ -49,65 +49,47 @@ enum {
 
 typedef unsigned int (*shtop_fp_t)(ARMul_State* cpu, unsigned int sht_oper);
 
-static int CondPassed(ARMul_State* cpu, unsigned int cond) {
-    const u32 NFLAG = cpu->NFlag;
-    const u32 ZFLAG = cpu->ZFlag;
-    const u32 CFLAG = cpu->CFlag;
-    const u32 VFLAG = cpu->VFlag;
-
-    int temp = 0;
+/// Evaluates condition code `cond` against the N/Z/C/V flags currently stored in `cpu`.
+static bool CondPassed(ARMul_State* cpu, unsigned int cond) {
+    const bool negative = cpu->NFlag != 0;
+    const bool zero = cpu->ZFlag != 0;
+    const bool carry = cpu->CFlag != 0;
+    const bool overflow = cpu->VFlag != 0;
 
     switch (cond) {
-    case 0x0:
-        temp = ZFLAG;
-        break;
-    case 0x1: // NE
-        temp = !ZFLAG;
-        break;
-    case 0x2: // CS
-        temp = CFLAG;
-        break;
-    case 0x3: // CC
-        temp = !CFLAG;
-        break;
-    case 0x4: // MI
-        temp = NFLAG;
-        break;
-    case 0x5: // PL
-        temp = !NFLAG;
-        break;
-    case 0x6: // VS
-        temp = VFLAG;
-        break;
-    case 0x7: // VC
-        temp = !VFLAG;
-        break;
-    case 0x8: // HI
-        temp = (CFLAG && !ZFLAG);
-        break;
-    case 0x9: // LS
-        temp = (!CFLAG || ZFLAG);
-        break;
-    case 0xa: // GE
-        temp = ((!NFLAG && !VFLAG) || (NFLAG && VFLAG));
-        break;
-    case 0xb: // LT
-        temp = ((NFLAG && !VFLAG) || (!NFLAG && VFLAG));
-        break;
-    case 0xc: // GT
-        temp = ((!NFLAG && !VFLAG && !ZFLAG) || (NFLAG && VFLAG && !ZFLAG));
-        break;
-    case 0xd: // LE
-        temp = ((NFLAG && !VFLAG) || (!NFLAG && VFLAG)) || ZFLAG;
-        break;
-    case 0xe: // AL
-        temp = 1;
-        break;
-    case 0xf:
-        temp = 1;
-        break;
-    }
-    return temp;
+    case ConditionCode::EQ:
+        return zero;
+    case ConditionCode::NE:
+        return !zero;
+    case ConditionCode::CS:
+        return carry;
+    case ConditionCode::CC:
+        return !carry;
+    case ConditionCode::MI:
+        return negative;
+    case ConditionCode::PL:
+        return !negative;
+    case ConditionCode::VS:
+        return overflow;
+    case ConditionCode::VC:
+        return !overflow;
+    case ConditionCode::HI:
+        return carry && !zero;
+    case ConditionCode::LS:
+        return zero || !carry;
+    case ConditionCode::GE:
+        return negative == overflow;
+    case ConditionCode::LT:
+        return negative != overflow;
+    case ConditionCode::GT:
+        return negative == overflow && !zero;
+    case ConditionCode::LE:
+        return negative != overflow || zero;
+    case ConditionCode::AL:
+    case ConditionCode::NV: // Unconditional
+        return true;
+    }
+    return false; // cond matched none of the cases above
 }
 
 static unsigned int DPO(Immediate)(ARMul_State* cpu, unsigned int sht_oper) {
diff --git a/src/core/file_sys/disk_archive.cpp b/src/core/file_sys/disk_archive.cpp
index 1096fd34da..e9ecd2b1cc 100644
--- a/src/core/file_sys/disk_archive.cpp
+++ b/src/core/file_sys/disk_archive.cpp
@@ -102,7 +102,7 @@ bool DiskFile::Open() {
     mode_string += "b";
 
     file = Common::make_unique<FileUtil::IOFile>(path, mode_string.c_str());
-    return true;
+    return file->IsOpen();
 }
 
 size_t DiskFile::Read(const u64 offset, const size_t length, u8* buffer) const {
diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp
index 124047a53f..c2b4963d4f 100644
--- a/src/core/hle/kernel/process.cpp
+++ b/src/core/hle/kernel/process.cpp
@@ -129,7 +129,7 @@ void Process::Run(s32 main_thread_priority, u32 stack_size) {
 }
 
 VAddr Process::GetLinearHeapBase() const {
-    return (kernel_version < 0x22C ? Memory::LINEAR_HEAP_VADDR : Memory::NEW_LINEAR_HEAP_SIZE)
+    return (kernel_version < 0x22C ? Memory::LINEAR_HEAP_VADDR : Memory::NEW_LINEAR_HEAP_VADDR)
             + memory_region->base;
 }
 
@@ -174,6 +174,10 @@ ResultCode Process::HeapFree(VAddr target, u32 size) {
         return ERR_INVALID_ADDRESS;
     }
 
+    if (size == 0) {
+        return RESULT_SUCCESS;
+    }
+
     ResultCode result = vm_manager.UnmapRange(target, size);
     if (result.IsError()) return result;
 
@@ -226,6 +230,10 @@ ResultCode Process::LinearFree(VAddr target, u32 size) {
         return ERR_INVALID_ADDRESS;
     }
 
+    if (size == 0) {
+        return RESULT_SUCCESS;
+    }
+
     VAddr heap_end = GetLinearHeapBase() + (u32)linheap_memory->size();
     if (target + size > heap_end) {
         return ERR_INVALID_ADDRESS_STATE;
diff --git a/src/core/hle/kernel/shared_memory.cpp b/src/core/hle/kernel/shared_memory.cpp
index 4137683b5d..1f477664b7 100644
--- a/src/core/hle/kernel/shared_memory.cpp
+++ b/src/core/hle/kernel/shared_memory.cpp
@@ -20,6 +20,7 @@ SharedPtr<SharedMemory> SharedMemory::Create(u32 size, MemoryPermission permissi
 
     shared_memory->name = std::move(name);
     shared_memory->base_address = 0x0;
+    shared_memory->fixed_address = 0x0;
     shared_memory->size = size;
     shared_memory->permissions = permissions;
     shared_memory->other_permissions = other_permissions;
@@ -30,9 +31,31 @@ SharedPtr<SharedMemory> SharedMemory::Create(u32 size, MemoryPermission permissi
 ResultCode SharedMemory::Map(VAddr address, MemoryPermission permissions,
         MemoryPermission other_permissions) {
 
+    if (base_address != 0) {
+        LOG_ERROR(Kernel, "cannot map id=%u, address=0x%08X name=%s: already mapped at 0x%08X!",
+            GetObjectId(), address, name.c_str(), base_address);
+        // TODO: Verify error code with hardware
+        return ResultCode(ErrorDescription::InvalidAddress, ErrorModule::Kernel,
+            ErrorSummary::InvalidArgument, ErrorLevel::Permanent);
+    }
+
+    if (fixed_address != 0) {
+        if (address != 0 && address != fixed_address) {
+            LOG_ERROR(Kernel, "cannot map id=%u, address=0x%08X name=%s: fixed_address is 0x%08X!",
+                    GetObjectId(), address, name.c_str(), fixed_address);
+            // TODO: Verify error code with hardware
+            return ResultCode(ErrorDescription::InvalidAddress, ErrorModule::Kernel,
+                ErrorSummary::InvalidArgument, ErrorLevel::Permanent);
+        }
+
+        // HACK(yuriks): This is only here to support the APT shared font mapping right now.
+        // Later, this should actually map the memory block onto the address space.
+        return RESULT_SUCCESS;
+    }
+
     if (address < Memory::SHARED_MEMORY_VADDR || address + size >= Memory::SHARED_MEMORY_VADDR_END) {
-        LOG_ERROR(Kernel, "cannot map id=%u, address=0x%08X outside of shared mem bounds!",
-                GetObjectId(), address);
+        LOG_ERROR(Kernel, "cannot map id=%u, address=0x%08X name=%s outside of shared mem bounds!",
+                GetObjectId(), address, name.c_str());
         // TODO: Verify error code with hardware
         return ResultCode(ErrorDescription::InvalidAddress, ErrorModule::Kernel,
                 ErrorSummary::InvalidArgument, ErrorLevel::Permanent);
diff --git a/src/core/hle/kernel/shared_memory.h b/src/core/hle/kernel/shared_memory.h
index 7a29227762..35b550d12c 100644
--- a/src/core/hle/kernel/shared_memory.h
+++ b/src/core/hle/kernel/shared_memory.h
@@ -61,6 +61,8 @@ public:
 
     /// Address of shared memory block in the process.
     VAddr base_address;
+    /// Fixed address to allow mapping to. Used for blocks created from the linear heap.
+    VAddr fixed_address;
     /// Size of the memory block. Page-aligned.
     u32 size;
     /// Permission restrictions applied to the process which created the block.
diff --git a/src/core/hle/service/apt/apt.cpp b/src/core/hle/service/apt/apt.cpp
index 6a2fdea2bf..ba66569b41 100644
--- a/src/core/hle/service/apt/apt.cpp
+++ b/src/core/hle/service/apt/apt.cpp
@@ -78,8 +78,8 @@ void GetSharedFont(Service::Interface* self) {
     if (shared_font != nullptr) {
         // TODO(yuriks): This is a hack to keep this working right now even with our completely
         // broken shared memory system.
-        shared_font_mem->base_address = SHARED_FONT_VADDR;
-        Kernel::g_current_process->vm_manager.MapMemoryBlock(shared_font_mem->base_address,
+        shared_font_mem->fixed_address = SHARED_FONT_VADDR;
+        Kernel::g_current_process->vm_manager.MapMemoryBlock(shared_font_mem->fixed_address,
                 shared_font, 0, shared_font_mem->size, Kernel::MemoryState::Shared);
 
         cmd_buff[0] = IPC::MakeHeader(0x44, 2, 2);
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index a78985510b..682be89ec5 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -235,7 +235,8 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
 
             for (unsigned int index = 0; index < regs.num_vertices; ++index)
             {
-                unsigned int vertex = is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) : index;
+                // Indexed rendering doesn't use the start offset
+                unsigned int vertex = is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) : (index + regs.vertex_offset);
 
                 // -1 is a common special value used for primitive restart. Since it's unknown if
                 // the PICA supports it, and it would mess up the caching, guard against it here.
diff --git a/src/video_core/pica.cpp b/src/video_core/pica.cpp
index c73a8178ea..61983bc6cf 100644
--- a/src/video_core/pica.cpp
+++ b/src/video_core/pica.cpp
@@ -49,11 +49,13 @@ std::string Regs::GetCommandName(int index) {
         ADD_FIELD(vertex_attributes);
         ADD_FIELD(index_array);
         ADD_FIELD(num_vertices);
+        ADD_FIELD(vertex_offset);
         ADD_FIELD(trigger_draw);
         ADD_FIELD(trigger_draw_indexed);
         ADD_FIELD(vs_default_attributes_setup);
         ADD_FIELD(command_buffer);
         ADD_FIELD(triangle_topology);
+        ADD_FIELD(restart_primitive);
         ADD_FIELD(gs.bool_uniforms);
         ADD_FIELD(gs.int_uniforms);
         ADD_FIELD(gs.main_offset);
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 58b924f9e1..855cb442e1 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -441,8 +441,14 @@ struct Regs {
     };
 
     enum class StencilAction : u32 {
-        Keep = 0,
-        Xor  = 5,
+        Keep           = 0, // Leave the stored stencil value unchanged
+        Zero           = 1, // Store 0
+        Replace        = 2, // Store the reference value
+        Increment      = 3, // Add 1, saturating
+        Decrement      = 4, // Subtract 1, saturating
+        Invert         = 5, // Bitwise NOT of the stored value
+        IncrementWrap  = 6, // Add 1, wrapping around
+        DecrementWrap  = 7  // Subtract 1, wrapping around
     };
 
     struct {
@@ -481,23 +487,29 @@ struct Regs {
 
         struct {
             union {
+                // Raw value of this register
+                u32 raw_func;
+
                 // If true, enable stencil testing
                 BitField< 0, 1, u32> enable;
 
                 // Comparison operation for stencil testing
                 BitField< 4, 3, CompareFunc> func;
 
-                // Value to calculate the new stencil value from
-                BitField< 8, 8, u32> replacement_value;
+                // Mask used to control writing to the stencil buffer
+                BitField< 8, 8, u32> write_mask;
 
                 // Value to compare against for stencil testing
                 BitField<16, 8, u32> reference_value;
 
                 // Mask to apply on stencil test inputs
-                BitField<24, 8, u32> mask;
+                BitField<24, 8, u32> input_mask;
             };
 
             union {
+                // Raw value of this register
+                u32 raw_op;
+
                 // Action to perform when the stencil test fails
                 BitField< 0, 3, StencilAction> action_stencil_fail;
 
@@ -757,7 +769,12 @@ struct Regs {
     // Number of vertices to render
     u32 num_vertices;
 
-    INSERT_PADDING_WORDS(0x5);
+    INSERT_PADDING_WORDS(0x1);
+
+    // The index of the first vertex to render
+    u32 vertex_offset;
+
+    INSERT_PADDING_WORDS(0x3);
 
     // These two trigger rendering of triangles
     u32 trigger_draw;
@@ -811,7 +828,9 @@ struct Regs {
 
     BitField<8, 2, TriangleTopology> triangle_topology;
 
-    INSERT_PADDING_WORDS(0x21);
+    u32 restart_primitive;
+
+    INSERT_PADDING_WORDS(0x20);
 
     struct ShaderConfig {
         BitField<0, 16, u32> bool_uniforms;
@@ -980,11 +999,13 @@ ASSERT_REG_POSITION(framebuffer, 0x110);
 ASSERT_REG_POSITION(vertex_attributes, 0x200);
 ASSERT_REG_POSITION(index_array, 0x227);
 ASSERT_REG_POSITION(num_vertices, 0x228);
+ASSERT_REG_POSITION(vertex_offset, 0x22a);
 ASSERT_REG_POSITION(trigger_draw, 0x22e);
 ASSERT_REG_POSITION(trigger_draw_indexed, 0x22f);
 ASSERT_REG_POSITION(vs_default_attributes_setup, 0x232);
 ASSERT_REG_POSITION(command_buffer, 0x238);
 ASSERT_REG_POSITION(triangle_topology, 0x25e);
+ASSERT_REG_POSITION(restart_primitive, 0x25f);
 ASSERT_REG_POSITION(gs, 0x280);
 ASSERT_REG_POSITION(vs, 0x2b0);
 
@@ -1021,12 +1042,20 @@ struct float24 {
         return ret;
     }
 
+    /// Returns the float24 built from 0.f. Used as the result of 0 * x products, where the
+    /// PICA yields 0 even when x is infinite.
+    static float24 Zero() { return float24::FromFloat32(0.f); }
+
     // Not recommended for anything but logging
     float ToFloat32() const {
         return value;
     }
 
     float24 operator * (const float24& flt) const {
+        // PICA gives 0 instead of NaN when multiplying by inf; a NaN operand still propagates
+        const bool zero_times_number = (value == 0.f) && !std::isnan(flt.value);
+        const bool number_times_zero = (flt.value == 0.f) && !std::isnan(value);
+        if (zero_times_number || number_times_zero) return Zero();
         return float24::FromFloat32(ToFloat32() * flt.ToFloat32());
     }
 
@@ -1043,7 +1072,11 @@ struct float24 {
     }
 
     float24& operator *= (const float24& flt) {
-        value *= flt.ToFloat32();
+        // PICA gives 0 instead of NaN when multiplying by inf; a NaN operand still propagates
+        const bool zero_times_number = (value == 0.f) && !std::isnan(flt.value);
+        const bool number_times_zero = (flt.value == 0.f) && !std::isnan(value);
+        if (zero_times_number || number_times_zero) *this = Zero();
+        else value *= flt.ToFloat32();
         return *this;
     }
 
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 4a159da8e7..77eadda9e9 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -216,14 +216,33 @@ static void SetStencil(int x, int y, u8 value) {
     }
 }
 
-// TODO: Should the stencil mask be applied to the "dest" or "ref" operands? Most likely not!
-static u8 PerformStencilAction(Regs::StencilAction action, u8 dest, u8 ref) {
+static u8 PerformStencilAction(Regs::StencilAction action, u8 old_stencil, u8 ref) {
     switch (action) {
     case Regs::StencilAction::Keep:
-        return dest;
+        return old_stencil;
 
-    case Regs::StencilAction::Xor:
-        return dest ^ ref;
+    case Regs::StencilAction::Zero:
+        return 0;
+
+    case Regs::StencilAction::Replace:
+        return ref;
+
+    case Regs::StencilAction::Increment:
+        // Saturated increment
+        return std::min<u8>(old_stencil, 254) + 1;
+
+    case Regs::StencilAction::Decrement:
+        // Saturated decrement
+        return std::max<u8>(old_stencil, 1) - 1;
+
+    case Regs::StencilAction::Invert:
+        return ~old_stencil;
+
+    case Regs::StencilAction::IncrementWrap:
+        return old_stencil + 1;
+
+    case Regs::StencilAction::DecrementWrap:
+        return old_stencil - 1;
 
     default:
         LOG_CRITICAL(HW_GPU, "Unknown stencil action %x", (int)action);
@@ -783,10 +802,16 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0,
             }
 
             u8 old_stencil = 0;
+
+            auto UpdateStencil = [stencil_test, x, y, &old_stencil](Pica::Regs::StencilAction action) {
+                u8 new_stencil = PerformStencilAction(action, old_stencil, stencil_test.reference_value);
+                SetStencil(x >> 4, y >> 4, (new_stencil & stencil_test.write_mask) | (old_stencil & ~stencil_test.write_mask));
+            };
+
             if (stencil_action_enable) {
                 old_stencil = GetStencil(x >> 4, y >> 4);
-                u8 dest = old_stencil & stencil_test.mask;
-                u8 ref = stencil_test.reference_value & stencil_test.mask;
+                u8 dest = old_stencil & stencil_test.input_mask;
+                u8 ref = stencil_test.reference_value & stencil_test.input_mask;
 
                 bool pass = false;
                 switch (stencil_test.func) {
@@ -824,8 +849,7 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0,
                 }
 
                 if (!pass) {
-                    u8 new_stencil = PerformStencilAction(stencil_test.action_stencil_fail, old_stencil, stencil_test.replacement_value);
-                    SetStencil(x >> 4, y >> 4, new_stencil);
+                    UpdateStencil(stencil_test.action_stencil_fail);
                     continue;
                 }
             }
@@ -875,23 +899,19 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0,
                 }
 
                 if (!pass) {
-                    if (stencil_action_enable) {
-                        u8 new_stencil = PerformStencilAction(stencil_test.action_depth_fail, old_stencil, stencil_test.replacement_value);
-                        SetStencil(x >> 4, y >> 4, new_stencil);
-                    }
+                    if (stencil_action_enable)
+                        UpdateStencil(stencil_test.action_depth_fail);
                     continue;
                 }
 
                 if (output_merger.depth_write_enable)
                     SetDepth(x >> 4, y >> 4, z);
-
-                if (stencil_action_enable) {
-                    // TODO: What happens if stencil testing is enabled, but depth testing is not? Will stencil get updated anyway?
-                    u8 new_stencil = PerformStencilAction(stencil_test.action_depth_pass, old_stencil, stencil_test.replacement_value);
-                    SetStencil(x >> 4, y >> 4, new_stencil);
-                }
             }
 
+            // The stencil depth_pass action is executed even if depth testing is disabled
+            if (stencil_action_enable)
+                UpdateStencil(stencil_test.action_depth_pass);
+
             auto dest = GetPixel(x >> 4, y >> 4);
             Math::Vec4<u8> blend_output = combiner_output;
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index deb9971bb7..d29049508d 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -231,8 +231,8 @@ void RasterizerOpenGL::DrawTriangles() {
     u32 cur_fb_depth_size = Pica::Regs::BytesPerDepthPixel(regs.framebuffer.depth_format)
                             * regs.framebuffer.GetWidth() * regs.framebuffer.GetHeight();
 
-    res_cache.NotifyFlush(cur_fb_color_addr, cur_fb_color_size);
-    res_cache.NotifyFlush(cur_fb_depth_addr, cur_fb_depth_size);
+    res_cache.NotifyFlush(cur_fb_color_addr, cur_fb_color_size, true);
+    res_cache.NotifyFlush(cur_fb_depth_addr, cur_fb_depth_size, true);
 }
 
 void RasterizerOpenGL::CommitFramebuffer() {
@@ -269,7 +269,8 @@ void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) {
         break;
 
     // Stencil test
-    case PICA_REG_INDEX(output_merger.stencil_test):
+    case PICA_REG_INDEX(output_merger.stencil_test.raw_func):
+    case PICA_REG_INDEX(output_merger.stencil_test.raw_op):
         SyncStencilTest();
         break;
 
@@ -676,7 +677,15 @@ void RasterizerOpenGL::SyncLogicOp() {
 }
 
 void RasterizerOpenGL::SyncStencilTest() {
-    // TODO: Implement stencil test, mask, and op
+    const auto& stencil_test = Pica::g_state.regs.output_merger.stencil_test; // PICA registers mirrored into GL state
+    state.stencil.test_enabled = stencil_test.enable && Pica::g_state.regs.framebuffer.depth_format == Pica::Regs::DepthFormat::D24S8;
+    state.stencil.test_func = PicaToGL::CompareFunc(stencil_test.func);
+    state.stencil.test_ref = stencil_test.reference_value;
+    state.stencil.test_mask = stencil_test.input_mask;
+    state.stencil.write_mask = stencil_test.write_mask;
+    state.stencil.action_stencil_fail = PicaToGL::StencilOp(stencil_test.action_stencil_fail);
+    state.stencil.action_depth_fail = PicaToGL::StencilOp(stencil_test.action_depth_fail);
+    state.stencil.action_depth_pass = PicaToGL::StencilOp(stencil_test.action_depth_pass);
 }
 
 void RasterizerOpenGL::SyncDepthTest() {
@@ -867,8 +876,15 @@ void RasterizerOpenGL::ReloadDepthBuffer() {
     state.Apply();
 
     glActiveTexture(GL_TEXTURE0);
-    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, fb_depth_texture.width, fb_depth_texture.height,
-                    fb_depth_texture.gl_format, fb_depth_texture.gl_type, temp_fb_depth_buffer.get());
+    if (fb_depth_texture.format == Pica::Regs::DepthFormat::D24S8) {
+        // TODO(Subv): There is a bug with Intel Windows drivers that makes glTexSubImage2D not change the stencil buffer.
+        // The bug has been reported to Intel (https://communities.intel.com/message/324464)
+        glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH24_STENCIL8, fb_depth_texture.width, fb_depth_texture.height, 0,
+            GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, temp_fb_depth_buffer.get());
+    } else {
+        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, fb_depth_texture.width, fb_depth_texture.height,
+            fb_depth_texture.gl_format, fb_depth_texture.gl_type, temp_fb_depth_buffer.get());
+    }
 
     state.texture_units[0].texture_2d = 0;
     state.Apply();
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index e4247051c7..1e38c2e6d3 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -2,6 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include "common/hash.h"
 #include "common/make_unique.h"
 #include "common/math_util.h"
 #include "common/microprofile.h"
@@ -21,7 +22,6 @@ MICROPROFILE_DEFINE(OpenGL_TextureUpload, "OpenGL", "Texture Upload", MP_RGB(128
 
 void RasterizerCacheOpenGL::LoadAndBindTexture(OpenGLState &state, unsigned texture_unit, const Pica::Regs::FullTextureConfig& config) {
     PAddr texture_addr = config.config.GetPhysicalAddress();
-
     const auto cached_texture = texture_cache.find(texture_addr);
 
     if (cached_texture != texture_cache.end()) {
@@ -51,12 +51,14 @@ void RasterizerCacheOpenGL::LoadAndBindTexture(OpenGLState &state, unsigned text
         }
 
         const auto info = Pica::DebugUtils::TextureInfo::FromPicaRegister(config.config, config.format);
+        u8* texture_src_data = Memory::GetPhysicalPointer(texture_addr);
 
         new_texture->width = info.width;
         new_texture->height = info.height;
-        new_texture->size = info.width * info.height * Pica::Regs::NibblesPerPixel(info.format);
+        new_texture->size = info.stride * info.height;
+        new_texture->addr = texture_addr;
+        new_texture->hash = Common::ComputeHash64(texture_src_data, new_texture->size);
 
-        u8* texture_src_data = Memory::GetPhysicalPointer(texture_addr);
         std::unique_ptr<Math::Vec4<u8>[]> temp_texture_buffer_rgba(new Math::Vec4<u8>[info.width * info.height]);
 
         for (int y = 0; y < info.height; ++y) {
@@ -71,12 +73,18 @@ void RasterizerCacheOpenGL::LoadAndBindTexture(OpenGLState &state, unsigned text
     }
 }
 
-void RasterizerCacheOpenGL::NotifyFlush(PAddr addr, u32 size) {
+void RasterizerCacheOpenGL::NotifyFlush(PAddr addr, u32 size, bool ignore_hash) {
     // Flush any texture that falls in the flushed region
     // TODO: Optimize by also inserting upper bound (addr + size) of each texture into the same map and also narrow using lower_bound
     auto cache_upper_bound = texture_cache.upper_bound(addr + size);
+
     for (auto it = texture_cache.begin(); it != cache_upper_bound;) {
-        if (MathUtil::IntervalsIntersect(addr, size, it->first, it->second->size)) {
+        const auto& info = *it->second;
+
+        // Flush the texture only if the memory region intersects and a change is detected
+        if (MathUtil::IntervalsIntersect(addr, size, info.addr, info.size) &&
+            (ignore_hash || info.hash != Common::ComputeHash64(Memory::GetPhysicalPointer(info.addr), info.size))) {
+
             it = texture_cache.erase(it);
         } else {
             ++it;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index 96f3a925c5..d8f9edf59e 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -19,7 +19,7 @@ public:
     void LoadAndBindTexture(OpenGLState &state, unsigned texture_unit, const Pica::Regs::FullTextureConfig& config);
 
     /// Flush any cached resource that touches the flushed region
-    void NotifyFlush(PAddr addr, u32 size);
+    void NotifyFlush(PAddr addr, u32 size, bool ignore_hash = false);
 
     /// Flush all cached OpenGL resources tracked by this cache manager
     void FullFlush();
@@ -30,6 +30,8 @@ private:
         GLuint width;
         GLuint height;
         u32 size;
+        u64 hash;
+        PAddr addr;
     };
 
     std::map<PAddr, std::unique_ptr<CachedTexture>> texture_cache;
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index 871324014f..ba47ce8b83 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -26,6 +26,9 @@ OpenGLState::OpenGLState() {
     stencil.test_ref = 0;
     stencil.test_mask = -1;
     stencil.write_mask = -1;
+    stencil.action_depth_fail = GL_KEEP;
+    stencil.action_depth_pass = GL_KEEP;
+    stencil.action_stencil_fail = GL_KEEP;
 
     blend.enabled = false;
     blend.src_rgb_func = GL_ONE;
@@ -105,6 +108,12 @@ void OpenGLState::Apply() {
         glStencilFunc(stencil.test_func, stencil.test_ref, stencil.test_mask);
     }
 
+    if (stencil.action_depth_fail != cur_state.stencil.action_depth_fail ||
+            stencil.action_depth_pass != cur_state.stencil.action_depth_pass ||
+            stencil.action_stencil_fail != cur_state.stencil.action_stencil_fail) {
+        glStencilOp(stencil.action_stencil_fail, stencil.action_depth_fail, stencil.action_depth_pass);
+    }
+
     // Stencil mask
     if (stencil.write_mask != cur_state.stencil.write_mask) {
         glStencilMask(stencil.write_mask);
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index 3e23790219..81e7e0877b 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -32,6 +32,9 @@ public:
         GLint test_ref; // GL_STENCIL_REF
         GLuint test_mask; // GL_STENCIL_VALUE_MASK
         GLuint write_mask; // GL_STENCIL_WRITEMASK
+        GLenum action_stencil_fail; // GL_STENCIL_FAIL
+        GLenum action_depth_fail; // GL_STENCIL_PASS_DEPTH_FAIL
+        GLenum action_depth_pass; // GL_STENCIL_PASS_DEPTH_PASS
     } stencil;
 
     struct {
diff --git a/src/video_core/renderer_opengl/pica_to_gl.h b/src/video_core/renderer_opengl/pica_to_gl.h
index 3b562da868..12806fad5e 100644
--- a/src/video_core/renderer_opengl/pica_to_gl.h
+++ b/src/video_core/renderer_opengl/pica_to_gl.h
@@ -152,6 +152,29 @@ inline GLenum CompareFunc(Pica::Regs::CompareFunc func) {
     return compare_func_table[(unsigned)func];
 }
 
+/// Maps a PICA stencil action to the matching OpenGL stencil op (table indexed by enum value).
+inline GLenum StencilOp(Pica::Regs::StencilAction action) {
+    static const GLenum stencil_op_table[] = {
+        GL_KEEP,        // StencilAction::Keep
+        GL_ZERO,        // StencilAction::Zero
+        GL_REPLACE,     // StencilAction::Replace
+        GL_INCR,        // StencilAction::Increment
+        GL_DECR,        // StencilAction::Decrement
+        GL_INVERT,      // StencilAction::Invert
+        GL_INCR_WRAP,   // StencilAction::IncrementWrap
+        GL_DECR_WRAP    // StencilAction::DecrementWrap
+    };
+
+    // Range check table for input
+    if ((unsigned)action >= ARRAY_SIZE(stencil_op_table)) {
+        LOG_CRITICAL(Render_OpenGL, "Unknown stencil op %d", (int)action); // scoped enum: cast for %d
+        UNREACHABLE();
+        return GL_KEEP;
+    }
+
+    return stencil_op_table[(unsigned)action];
+}
+
 inline std::array<GLfloat, 4> ColorRGBA8(const u8* bytes) {
     return { { bytes[0] / 255.0f,
                bytes[1] / 255.0f,
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index be5588c007..f89117521f 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -52,7 +52,9 @@ void Setup(UnitState<false>& state) {
 }
 
 void Shutdown() {
+#ifdef ARCHITECTURE_x86_64
     shader_map.clear();
+#endif // ARCHITECTURE_x86_64
 }
 
 static Common::Profiling::TimingCategory shader_category("Vertex Shader");
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index ae5a304419..69e4efa689 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -177,7 +177,10 @@ void RunInterpreter(UnitState<Debug>& state) {
                     if (!swizzle.DestComponentEnabled(i))
                         continue;
 
-                    dest[i] = std::max(src1[i], src2[i]);
+                    // NOTE: Exact form required to match NaN semantics to hardware:
+                    //   max(0, NaN) -> NaN
+                    //   max(NaN, 0) -> 0
+                    dest[i] = (src1[i] > src2[i]) ? src1[i] : src2[i];
                 }
                 Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
                 break;
@@ -190,7 +193,10 @@ void RunInterpreter(UnitState<Debug>& state) {
                     if (!swizzle.DestComponentEnabled(i))
                         continue;
 
-                    dest[i] = std::min(src1[i], src2[i]);
+                    // NOTE: Exact form required to match NaN semantics to hardware:
+                    //   min(0, NaN) -> NaN
+                    //   min(NaN, 0) -> 0
+                    dest[i] = (src1[i] < src2[i]) ? src1[i] : src2[i];
                 }
                 Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
                 break;
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index cc66fc8d61..c7b63a9b76 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -115,6 +115,8 @@ static const X64Reg SRC1 = XMM1;
 static const X64Reg SRC2 = XMM2;
 /// Loaded with the third swizzled source register, otherwise can be used as a scratch register
 static const X64Reg SRC3 = XMM3;
+/// Additional scratch register
+static const X64Reg SCRATCH2 = XMM4;
 /// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one
 static const X64Reg ONE = XMM14;
 /// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR
@@ -227,8 +229,8 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
             u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);
             BLENDPS(SCRATCH, R(src), mask);
         } else {
-            MOVAPS(XMM4, R(src));
-            UNPCKHPS(XMM4, R(SCRATCH)); // Unpack X/Y components of source and destination
+            MOVAPS(SCRATCH2, R(src));
+            UNPCKHPS(SCRATCH2, R(SCRATCH)); // Unpack X/Y components of source and destination
             UNPCKLPS(SCRATCH, R(src)); // Unpack Z/W components of source and destination
 
             // Compute selector to selectively copy source components to destination for SHUFPS instruction
@@ -236,7 +238,7 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
                      ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) |
                      ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) |
                      ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6);
-            SHUFPS(SCRATCH, R(XMM4), sel);
+            SHUFPS(SCRATCH, R(SCRATCH2), sel);
         }
 
         // Store dest back to memory
@@ -244,6 +246,19 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
     }
 }
 
+void JitCompiler::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch) {
+    MOVAPS(scratch, R(src1)); // Goal: src1 *= src2, with NaN from non-NaN inputs forced to zero
+    CMPPS(scratch, R(src2), CMP_ORD); // scratch: all-ones where neither input is NaN
+
+    MULPS(src1, R(src2)); // Plain product; zero * inf yields NaN at this point
+
+    MOVAPS(src2, R(src1)); // Original src2 is no longer needed and gets overwritten
+    CMPPS(src2, R(src2), CMP_UNORD); // src2: all-ones where the product is NaN
+
+    XORPS(scratch, R(src2)); // scratch: zero only where non-NaN inputs produced a NaN
+    ANDPS(src1, R(scratch)); // Zero exactly those lanes; NaN inputs still propagate
+}
+
 void JitCompiler::Compile_EvaluateCondition(Instruction instr) {
     // Note: NXOR is used below to check for equality
     switch (instr.flow_control.op) {
@@ -307,21 +322,17 @@ void JitCompiler::Compile_DP3(Instruction instr) {
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
 
-    if (Common::GetCPUCaps().sse4_1) {
-        DPPS(SRC1, R(SRC2), 0x7f);
-    } else {
-        MULPS(SRC1, R(SRC2));
+    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
 
-        MOVAPS(SRC2, R(SRC1));
-        SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1));
+    MOVAPS(SRC2, R(SRC1));
+    SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1));
 
-        MOVAPS(SRC3, R(SRC1));
-        SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2));
+    MOVAPS(SRC3, R(SRC1));
+    SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2));
 
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0));
-        ADDPS(SRC1, R(SRC2));
-        ADDPS(SRC1, R(SRC3));
-    }
+    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0));
+    ADDPS(SRC1, R(SRC2));
+    ADDPS(SRC1, R(SRC3));
 
     Compile_DestEnable(instr, SRC1);
 }
@@ -330,19 +341,15 @@ void JitCompiler::Compile_DP4(Instruction instr) {
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
 
-    if (Common::GetCPUCaps().sse4_1) {
-        DPPS(SRC1, R(SRC2), 0xff);
-    } else {
-        MULPS(SRC1, R(SRC2));
+    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
 
-        MOVAPS(SRC2, R(SRC1));
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
-        ADDPS(SRC1, R(SRC2));
+    MOVAPS(SRC2, R(SRC1));
+    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
+    ADDPS(SRC1, R(SRC2));
 
-        MOVAPS(SRC2, R(SRC1));
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
-        ADDPS(SRC1, R(SRC2));
-    }
+    MOVAPS(SRC2, R(SRC1));
+    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
+    ADDPS(SRC1, R(SRC2));
 
     Compile_DestEnable(instr, SRC1);
 }
@@ -359,23 +366,22 @@ void JitCompiler::Compile_DPH(Instruction instr) {
     if (Common::GetCPUCaps().sse4_1) {
         // Set 4th component to 1.0
         BLENDPS(SRC1, R(ONE), 0x8); // 0b1000
-        DPPS(SRC1, R(SRC2), 0xff);
     } else {
-        // Reverse to set the 4th component to 1.0
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3));
-        MOVSS(SRC1, R(ONE));
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3));
+        // Set 4th component to 1.0
+        MOVAPS(SCRATCH, R(SRC1));
+        UNPCKHPS(SCRATCH, R(ONE));  // XYZW, 1111 -> Z1__
+        UNPCKLPD(SRC1, R(SCRATCH)); // XYZW, Z1__ -> XYZ1
+    }
 
-        MULPS(SRC1, R(SRC2));
+    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
 
-        MOVAPS(SRC2, R(SRC1));
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
-        ADDPS(SRC1, R(SRC2));
+    MOVAPS(SRC2, R(SRC1));
+    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
+    ADDPS(SRC1, R(SRC2));
 
-        MOVAPS(SRC2, R(SRC1));
-        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
-        ADDPS(SRC1, R(SRC2));
-    }
+    MOVAPS(SRC2, R(SRC1));
+    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
+    ADDPS(SRC1, R(SRC2));
 
     Compile_DestEnable(instr, SRC1);
 }
@@ -415,7 +421,7 @@ void JitCompiler::Compile_LG2(Instruction instr) {
 void JitCompiler::Compile_MUL(Instruction instr) {
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
-    MULPS(SRC1, R(SRC2));
+    Compile_SanitizedMul(SRC1, SRC2, SCRATCH); // Product left in SRC1; clobbers SRC2 and SCRATCH
     Compile_DestEnable(instr, SRC1);
 }
 
@@ -428,10 +434,10 @@ void JitCompiler::Compile_SGE(Instruction instr) {
         Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
     }
 
-    CMPPS(SRC1, R(SRC2), CMP_NLT);
-    ANDPS(SRC1, R(ONE));
+    CMPPS(SRC2, R(SRC1), CMP_LE);
+    ANDPS(SRC2, R(ONE));
 
-    Compile_DestEnable(instr, SRC1);
+    Compile_DestEnable(instr, SRC2);
 }
 
 void JitCompiler::Compile_SLT(Instruction instr) {
@@ -465,6 +471,7 @@ void JitCompiler::Compile_FLR(Instruction instr) {
 void JitCompiler::Compile_MAX(Instruction instr) {
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+    // MAXPS matches PICA200 semantics: if either operand is NaN, SRC2 is returned.
     MAXPS(SRC1, R(SRC2));
     Compile_DestEnable(instr, SRC1);
 }
@@ -472,6 +479,7 @@ void JitCompiler::Compile_MAX(Instruction instr) {
 void JitCompiler::Compile_MIN(Instruction instr) {
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+    // MINPS matches PICA200 semantics: if either operand is NaN, SRC2 is returned.
     MINPS(SRC1, R(SRC2));
     Compile_DestEnable(instr, SRC1);
 }
@@ -485,8 +493,8 @@ void JitCompiler::Compile_MOVA(Instruction instr) {
 
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
 
-    // Convert floats to integers (only care about X and Y components)
-    CVTPS2DQ(SRC1, R(SRC1));
+    // Convert floats to integers using truncation (only care about X and Y components)
+    CVTTPS2DQ(SRC1, R(SRC1));
 
     // Get result
     MOVQ_xmm(R(RAX), SRC1);
@@ -578,27 +586,42 @@ void JitCompiler::Compile_CALLU(Instruction instr) {
 }
 
 void JitCompiler::Compile_CMP(Instruction instr) {
+    using Op = Instruction::Common::CompareOpType::Op;
+    Op op_x = instr.common.compare_op.x;
+    Op op_y = instr.common.compare_op.y;
+
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
 
-    static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_NLE, CMP_NLT };
+    // SSE has no greater-than (GT) or greater-equal (GE) comparison predicates, so they are
+    // emulated by swapping lhs and rhs and using LT and LE. NLT and NLE can't be used instead
+    // because they evaluate to true when an operand is NaN, whereas LT and LE evaluate to false.
+    static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE };
 
-    if (instr.common.compare_op.x == instr.common.compare_op.y) {
+    bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual);
+    Gen::X64Reg lhs_x = invert_op_x ? SRC2 : SRC1;
+    Gen::X64Reg rhs_x = invert_op_x ? SRC1 : SRC2;
+
+    if (op_x == op_y) {
         // Compare X-component and Y-component together
-        CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.x]);
+        CMPPS(lhs_x, R(rhs_x), cmp[op_x]);
+        MOVQ_xmm(R(COND0), lhs_x);
 
-        MOVQ_xmm(R(COND0), SRC1);
         MOV(64, R(COND1), R(COND0));
     } else {
+        bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual);
+        Gen::X64Reg lhs_y = invert_op_y ? SRC2 : SRC1;
+        Gen::X64Reg rhs_y = invert_op_y ? SRC1 : SRC2;
+
         // Compare X-component
-        MOVAPS(SCRATCH, R(SRC1));
-        CMPSS(SCRATCH, R(SRC2), cmp[instr.common.compare_op.x]);
+        MOVAPS(SCRATCH, R(lhs_x));
+        CMPSS(SCRATCH, R(rhs_x), cmp[op_x]);
 
         // Compare Y-component
-        CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.y]);
+        CMPPS(lhs_y, R(rhs_y), cmp[op_y]);
 
         MOVQ_xmm(R(COND0), SCRATCH);
-        MOVQ_xmm(R(COND1), SRC1);
+        MOVQ_xmm(R(COND1), lhs_y);
     }
 
     SHR(32, R(COND0), Imm8(31));
@@ -616,12 +639,8 @@ void JitCompiler::Compile_MAD(Instruction instr) {
         Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3);
     }
 
-    if (Common::GetCPUCaps().fma) {
-        VFMADD213PS(SRC1, SRC2, R(SRC3));
-    } else {
-        MULPS(SRC1, R(SRC2));
-        ADDPS(SRC1, R(SRC3));
-    }
+    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
+    ADDPS(SRC1, R(SRC3));
 
     Compile_DestEnable(instr, SRC1);
 }
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h
index fbe19fe933..58828ecc8c 100644
--- a/src/video_core/shader/shader_jit_x64.h
+++ b/src/video_core/shader/shader_jit_x64.h
@@ -68,6 +68,12 @@ private:
     void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest);
     void Compile_DestEnable(Instruction instr, Gen::X64Reg dest);
 
+    /**
+     * Compiles a `MUL src1, src2` operation that leaves the product in `src1`, following the PICA
+     * semantics where zero multiplied by inf gives zero, not NaN. Clobbers `src2` and `scratch`.
+     */
+    void Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch);
+
     void Compile_EvaluateCondition(Instruction instr);
     void Compile_UniformCondition(Instruction instr);