diff options
author | Logan Stromberg <loganstromberg@gmail.com> | 2022-07-14 11:45:56 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-07-14 15:45:56 -0300 |
commit | 6eb85e846f25ae36a39685d6ac91025deaea306c (patch) | |
tree | 5627dc5d03fc559885e41204e94afde39811b8c1 | |
parent | c5bddfeab8a905ce6c306ffafb079d8550f0b027 (diff) |
Reduce some unnecessary allocations in DMA handler (#2886)1.1.181
* experimental changes to try and reduce allocations in kernel threading and DMA handler
* Simplify the changes in this branch to just 1. Don't make unnecessary copies of data just for texture-texture transfers and 2. Add a fast path for 1bpp linear byte copies
* forgot to check src + dst linearity in 1bpp DMA fast path. Fixes the UE4 regression.
* removing dev log I left in
* Generalizing the DMA linear fast path to cases other than 1bpp copies
* revert kernel changes
* revert whitespace
* remove unneeded references
* PR feedback
Co-authored-by: Logan Stromberg <lostromb@microsoft.com>
Co-authored-by: gdk <gab.dark.100@gmail.com>
-rw-r--r-- | Ryujinx.Graphics.Gpu/Engine/Dma/DmaClass.cs | 47 |
1 files changed, 33 insertions, 14 deletions
diff --git a/Ryujinx.Graphics.Gpu/Engine/Dma/DmaClass.cs b/Ryujinx.Graphics.Gpu/Engine/Dma/DmaClass.cs index df7e55a1..b1f766b8 100644 --- a/Ryujinx.Graphics.Gpu/Engine/Dma/DmaClass.cs +++ b/Ryujinx.Graphics.Gpu/Engine/Dma/DmaClass.cs @@ -208,7 +208,6 @@ namespace Ryujinx.Graphics.Gpu.Engine.Dma } ReadOnlySpan<byte> srcSpan = memoryManager.GetSpan(srcGpuVa + (ulong)srcBaseOffset, srcSize, true); - Span<byte> dstSpan = memoryManager.GetSpan(dstGpuVa + (ulong)dstBaseOffset, dstSize).ToArray(); bool completeSource = IsTextureCopyComplete(src, srcLinear, srcBpp, srcStride, xCount, yCount); bool completeDest = IsTextureCopyComplete(dst, dstLinear, dstBpp, dstStride, xCount, yCount); @@ -262,43 +261,63 @@ namespace Ryujinx.Graphics.Gpu.Engine.Dma target.SynchronizeMemory(); target.SetData(data); target.SignalModified(); - return; } else if (srcCalculator.LayoutMatches(dstCalculator)) { - srcSpan.CopyTo(dstSpan); // No layout conversion has to be performed, just copy the data entirely. - - memoryManager.Write(dstGpuVa + (ulong)dstBaseOffset, dstSpan); - + // No layout conversion has to be performed, just copy the data entirely. + memoryManager.Write(dstGpuVa + (ulong)dstBaseOffset, srcSpan); return; } } unsafe bool Convert<T>(Span<byte> dstSpan, ReadOnlySpan<byte> srcSpan) where T : unmanaged { - fixed (byte* dstPtr = dstSpan, srcPtr = srcSpan) + if (srcLinear && dstLinear && srcBpp == dstBpp) { - byte* dstBase = dstPtr - dstBaseOffset; // Layout offset is relative to the base, so we need to subtract the span's offset. - byte* srcBase = srcPtr - srcBaseOffset; - + // Optimized path for purely linear copies - we don't need to calculate every single byte offset, + // and we can make use of Span.CopyTo which is very very fast (even compared to pointers) for (int y = 0; y < yCount; y++) { srcCalculator.SetY(srcRegionY + y); dstCalculator.SetY(dstRegionY + y); + int srcOffset = srcCalculator.GetOffset(srcRegionX); + int dstOffset = dstCalculator.GetOffset(dstRegionX); + srcSpan.Slice(srcOffset - srcBaseOffset, xCount * srcBpp) + .CopyTo(dstSpan.Slice(dstOffset - dstBaseOffset, xCount * dstBpp)); + } + } + else + { + fixed (byte* dstPtr = dstSpan, srcPtr = srcSpan) + { + byte* dstBase = dstPtr - dstBaseOffset; // Layout offset is relative to the base, so we need to subtract the span's offset. + byte* srcBase = srcPtr - srcBaseOffset; - for (int x = 0; x < xCount; x++) + for (int y = 0; y < yCount; y++) { - int srcOffset = srcCalculator.GetOffset(srcRegionX + x); - int dstOffset = dstCalculator.GetOffset(dstRegionX + x); + srcCalculator.SetY(srcRegionY + y); + dstCalculator.SetY(dstRegionY + y); + + for (int x = 0; x < xCount; x++) + { + int srcOffset = srcCalculator.GetOffset(srcRegionX + x); + int dstOffset = dstCalculator.GetOffset(dstRegionX + x); - *(T*)(dstBase + dstOffset) = *(T*)(srcBase + srcOffset); + *(T*)(dstBase + dstOffset) = *(T*)(srcBase + srcOffset); + } } } } + return true; } + // OPT: This allocates a (potentially) huge temporary array and then copies an existing + // region of memory into it, data that might get overwritten entirely anyways. Ideally this should + // all be rewritten to use pooled arrays, but that gets complicated with packed data and strides + Span<byte> dstSpan = memoryManager.GetSpan(dstGpuVa + (ulong)dstBaseOffset, dstSize).ToArray(); + bool _ = srcBpp switch { 1 => Convert<byte>(dstSpan, srcSpan), |