author    Logan Stromberg <loganstromberg@gmail.com>  2022-07-14 11:45:56 -0700
committer GitHub <noreply@github.com>                 2022-07-14 15:45:56 -0300
commit    6eb85e846f25ae36a39685d6ac91025deaea306c (patch)
tree      5627dc5d03fc559885e41204e94afde39811b8c1
parent    c5bddfeab8a905ce6c306ffafb079d8550f0b027 (diff)
Reduce some unnecessary allocations in DMA handler (#2886) (tag: 1.1.181)
* experimental changes to try and reduce allocations in kernel threading and DMA handler
* Simplify the changes in this branch to just 1. Don't make unnecessary copies of data just for texture-texture transfers and 2. Add a fast path for 1bpp linear byte copies
* forgot to check src + dst linearity in 1bpp DMA fast path. Fixes the UE4 regression.
* removing dev log I left in
* Generalizing the DMA linear fast path to cases other than 1bpp copies
* revert kernel changes
* revert whitespace
* remove unneeded references
* PR feedback

Co-authored-by: Logan Stromberg <lostromb@microsoft.com>
Co-authored-by: gdk <gab.dark.100@gmail.com>
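For context, the core of this change is the "linear fast path": when both the source and destination surfaces are pitch-linear and share the same bytes per pixel, each row is one contiguous run of bytes, so the per-pixel offset calculation can be replaced with a single Span.CopyTo per row. The following is a minimal standalone sketch of that idea, not the emulator's actual code path: the names srcStride, dstStride, bpp, xCount and yCount echo DmaClass.cs, but LinearCopySketch, CopyPerElement and CopyPerRow are hypothetical, and the GPU layout calculators and memory manager are replaced by plain byte arrays.

// A minimal, self-contained sketch of the idea behind the linear fast path.
// Names like srcStride/dstStride/bpp/xCount/yCount mirror DmaClass.cs, but the
// layout calculators and GPU memory manager are stand-ins (plain arrays), so this
// is an illustration of the technique, not the real DMA handler.
using System;

class LinearCopySketch
{
    // Slow-path analogue: compute an offset for every single element.
    static void CopyPerElement(ReadOnlySpan<byte> src, Span<byte> dst,
                               int srcStride, int dstStride, int bpp, int xCount, int yCount)
    {
        for (int y = 0; y < yCount; y++)
        {
            for (int x = 0; x < xCount; x++)
            {
                int srcOffset = y * srcStride + x * bpp;
                int dstOffset = y * dstStride + x * bpp;
                src.Slice(srcOffset, bpp).CopyTo(dst.Slice(dstOffset, bpp));
            }
        }
    }

    // Fast-path analogue: with both sides linear and matching bpp, each row is
    // contiguous, so one Span.CopyTo per row replaces xCount per-element copies.
    static void CopyPerRow(ReadOnlySpan<byte> src, Span<byte> dst,
                           int srcStride, int dstStride, int bpp, int xCount, int yCount)
    {
        int rowBytes = xCount * bpp;
        for (int y = 0; y < yCount; y++)
        {
            src.Slice(y * srcStride, rowBytes).CopyTo(dst.Slice(y * dstStride, rowBytes));
        }
    }

    static void Main()
    {
        const int bpp = 4, xCount = 8, yCount = 4, srcStride = 64, dstStride = 48;

        byte[] src = new byte[srcStride * yCount];
        new Random(1).NextBytes(src);

        byte[] dstA = new byte[dstStride * yCount];
        byte[] dstB = new byte[dstStride * yCount];

        CopyPerElement(src, dstA, srcStride, dstStride, bpp, xCount, yCount);
        CopyPerRow(src, dstB, srcStride, dstStride, bpp, xCount, yCount);

        // True: both paths produce identical bytes; the row-wise path just does far less offset math.
        Console.WriteLine(dstA.AsSpan().SequenceEqual(dstB));
    }
}

The real change in the diff below keeps the per-element pointer loop for non-linear (block-linear) layouts, and only takes the row-wise path when srcLinear, dstLinear and srcBpp == dstBpp all hold.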
-rw-r--r--  Ryujinx.Graphics.Gpu/Engine/Dma/DmaClass.cs | 47
1 file changed, 33 insertions(+), 14 deletions(-)
diff --git a/Ryujinx.Graphics.Gpu/Engine/Dma/DmaClass.cs b/Ryujinx.Graphics.Gpu/Engine/Dma/DmaClass.cs
index df7e55a1..b1f766b8 100644
--- a/Ryujinx.Graphics.Gpu/Engine/Dma/DmaClass.cs
+++ b/Ryujinx.Graphics.Gpu/Engine/Dma/DmaClass.cs
@@ -208,7 +208,6 @@ namespace Ryujinx.Graphics.Gpu.Engine.Dma
}
ReadOnlySpan<byte> srcSpan = memoryManager.GetSpan(srcGpuVa + (ulong)srcBaseOffset, srcSize, true);
- Span<byte> dstSpan = memoryManager.GetSpan(dstGpuVa + (ulong)dstBaseOffset, dstSize).ToArray();
bool completeSource = IsTextureCopyComplete(src, srcLinear, srcBpp, srcStride, xCount, yCount);
bool completeDest = IsTextureCopyComplete(dst, dstLinear, dstBpp, dstStride, xCount, yCount);
@@ -262,43 +261,63 @@ namespace Ryujinx.Graphics.Gpu.Engine.Dma
target.SynchronizeMemory();
target.SetData(data);
target.SignalModified();
-
return;
}
else if (srcCalculator.LayoutMatches(dstCalculator))
{
- srcSpan.CopyTo(dstSpan); // No layout conversion has to be performed, just copy the data entirely.
-
- memoryManager.Write(dstGpuVa + (ulong)dstBaseOffset, dstSpan);
-
+ // No layout conversion has to be performed, just copy the data entirely.
+ memoryManager.Write(dstGpuVa + (ulong)dstBaseOffset, srcSpan);
return;
}
}
unsafe bool Convert<T>(Span<byte> dstSpan, ReadOnlySpan<byte> srcSpan) where T : unmanaged
{
- fixed (byte* dstPtr = dstSpan, srcPtr = srcSpan)
+ if (srcLinear && dstLinear && srcBpp == dstBpp)
{
- byte* dstBase = dstPtr - dstBaseOffset; // Layout offset is relative to the base, so we need to subtract the span's offset.
- byte* srcBase = srcPtr - srcBaseOffset;
-
+ // Optimized path for purely linear copies - we don't need to calculate every single byte offset,
+ // and we can make use of Span.CopyTo which is very very fast (even compared to pointers)
for (int y = 0; y < yCount; y++)
{
srcCalculator.SetY(srcRegionY + y);
dstCalculator.SetY(dstRegionY + y);
+ int srcOffset = srcCalculator.GetOffset(srcRegionX);
+ int dstOffset = dstCalculator.GetOffset(dstRegionX);
+ srcSpan.Slice(srcOffset - srcBaseOffset, xCount * srcBpp)
+ .CopyTo(dstSpan.Slice(dstOffset - dstBaseOffset, xCount * dstBpp));
+ }
+ }
+ else
+ {
+ fixed (byte* dstPtr = dstSpan, srcPtr = srcSpan)
+ {
+ byte* dstBase = dstPtr - dstBaseOffset; // Layout offset is relative to the base, so we need to subtract the span's offset.
+ byte* srcBase = srcPtr - srcBaseOffset;
- for (int x = 0; x < xCount; x++)
+ for (int y = 0; y < yCount; y++)
{
- int srcOffset = srcCalculator.GetOffset(srcRegionX + x);
- int dstOffset = dstCalculator.GetOffset(dstRegionX + x);
+ srcCalculator.SetY(srcRegionY + y);
+ dstCalculator.SetY(dstRegionY + y);
+
+ for (int x = 0; x < xCount; x++)
+ {
+ int srcOffset = srcCalculator.GetOffset(srcRegionX + x);
+ int dstOffset = dstCalculator.GetOffset(dstRegionX + x);
- *(T*)(dstBase + dstOffset) = *(T*)(srcBase + srcOffset);
+ *(T*)(dstBase + dstOffset) = *(T*)(srcBase + srcOffset);
+ }
}
}
}
+
return true;
}
+ // OPT: This allocates a (potentially) huge temporary array and then copies an existing
+ // region of memory into it, data that might get overwritten entirely anyways. Ideally this should
+ // all be rewritten to use pooled arrays, but that gets complicated with packed data and strides
+ Span<byte> dstSpan = memoryManager.GetSpan(dstGpuVa + (ulong)dstBaseOffset, dstSize).ToArray();
+
bool _ = srcBpp switch
{
1 => Convert<byte>(dstSpan, srcSpan),