using Ryujinx.Common;
using Ryujinx.Common.Memory;
using Ryujinx.Graphics.Device;
using Ryujinx.Graphics.Gpu.Engine.Threed;
using Ryujinx.Graphics.Gpu.Memory;
using Ryujinx.Graphics.Texture;
using System;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;

namespace Ryujinx.Graphics.Gpu.Engine.Dma
{
    /// <summary>
    /// Represents a DMA copy engine class.
    /// </summary>
    class DmaClass : IDeviceState
    {
        private readonly GpuContext _context;
        private readonly GpuChannel _channel;
        private readonly ThreedClass _3dEngine;
        private readonly DeviceState<DmaClassState> _state;

        /// <summary>
        /// Copy flags passed on DMA launch.
        /// </summary>
        [Flags]
        private enum CopyFlags
        {
            SrcLinear = 1 << 7,
            DstLinear = 1 << 8,
            MultiLineEnable = 1 << 9,
            RemapEnable = 1 << 10,
        }

        /// <summary>
        /// Texture parameters for copy.
        /// </summary>
        private readonly struct TextureParams
        {
            /// <summary>
            /// Copy region X coordinate.
            /// </summary>
            public readonly int RegionX;

            /// <summary>
            /// Copy region Y coordinate.
            /// </summary>
            public readonly int RegionY;

            /// <summary>
            /// Offset from the base pointer of the data in memory.
            /// </summary>
            public readonly int BaseOffset;

            /// <summary>
            /// Bytes per pixel.
            /// </summary>
            public readonly int Bpp;

            /// <summary>
            /// Whether the texture is linear. If false, the texture is block linear.
            /// </summary>
            public readonly bool Linear;

            /// <summary>
            /// Pixel offset from XYZ coordinates calculator.
            /// </summary>
            public readonly OffsetCalculator Calculator;

            /// <summary>
            /// Creates texture parameters.
            /// </summary>
            /// <param name="regionX">Copy region X coordinate</param>
            /// <param name="regionY">Copy region Y coordinate</param>
            /// <param name="baseOffset">Offset from the base pointer of the data in memory</param>
            /// <param name="bpp">Bytes per pixel</param>
            /// <param name="linear">Whether the texture is linear. If false, the texture is block linear</param>
            /// <param name="calculator">Pixel offset from XYZ coordinates calculator</param>
            public TextureParams(int regionX, int regionY, int baseOffset, int bpp, bool linear, OffsetCalculator calculator)
            {
                RegionX = regionX;
                RegionY = regionY;
                BaseOffset = baseOffset;
                Bpp = bpp;
                Linear = linear;
                Calculator = calculator;
            }
        }

        /// <summary>
        /// Packed 3-byte value, used as the element type when shuffling 3-byte components.
        /// </summary>
        [StructLayout(LayoutKind.Sequential, Size = 3, Pack = 1)]
        private struct UInt24
        {
            public byte Byte0;
            public byte Byte1;
            public byte Byte2;
        }

        /// <summary>
        /// Creates a new instance of the DMA copy engine class.
        /// </summary>
        /// <param name="context">GPU context</param>
        /// <param name="channel">GPU channel</param>
        /// <param name="threedEngine">3D engine</param>
        public DmaClass(GpuContext context, GpuChannel channel, ThreedClass threedEngine)
        {
            _context = context;
            _channel = channel;
            _3dEngine = threedEngine;

            // Only the LaunchDma register has a write callback; a write to it triggers the copy.
            _state = new DeviceState<DmaClassState>(new Dictionary<string, RwCallback>
            {
                { nameof(DmaClassState.LaunchDma), new RwCallback(LaunchDma, null) },
            });
        }

        /// <summary>
        /// Reads data from the class registers.
        /// </summary>
        /// <param name="offset">Register byte offset</param>
        /// <returns>Data at the specified offset</returns>
        public int Read(int offset) => _state.Read(offset);

        /// <summary>
        /// Writes data to the class registers.
        /// </summary>
        /// <param name="offset">Register byte offset</param>
        /// <param name="data">Data to be written</param>
        public void Write(int offset, int data) => _state.Write(offset, data);

        /// <summary>
        /// Determine if a buffer-to-texture region covers the entirety of a texture.
        /// </summary>
        /// <param name="tex">Texture to compare</param>
        /// <param name="linear">True if the texture is linear, false if block linear</param>
        /// <param name="bpp">Texture bytes per pixel</param>
        /// <param name="stride">Texture stride</param>
        /// <param name="xCount">Number of pixels to be copied</param>
        /// <param name="yCount">Number of lines to be copied</param>
        /// <returns>True if the copied region is considered to cover the whole texture, false otherwise</returns>
        private static bool IsTextureCopyComplete(DmaTexture tex, bool linear, int bpp, int stride, int xCount, int yCount)
        {
            if (linear)
            {
                // A negative stride means the texture has to be flipped, so the fast copy
                // is not trivial. Zero is rejected by the same check. Use the slow path.
                if (stride <= 0)
                {
                    return false;
                }

                int alignWidth = Constants.StrideAlignment / bpp;

                return stride / bpp == BitUtils.AlignUp(xCount, alignWidth);
            }
            else
            {
                int alignWidth = Constants.GobAlignment / bpp;

                return tex.RegionX == 0 &&
                       tex.RegionY == 0 &&
                       tex.Width == BitUtils.AlignUp(xCount, alignWidth) &&
                       tex.Height == yCount;
            }
        }

        /// <summary>
        /// Releases a semaphore for a given LaunchDma method call.
        /// </summary>
        /// <param name="argument">The LaunchDma call argument</param>
        private void ReleaseSemaphore(int argument)
        {
            // The semaphore type is a 2-bit field starting at bit 3 of the argument.
            LaunchDmaSemaphoreType type = (LaunchDmaSemaphoreType)((argument >> 3) & 0x3);

            if (type != LaunchDmaSemaphoreType.None)
            {
                ulong address = ((ulong)_state.State.SetSemaphoreA << 32) | _state.State.SetSemaphoreB;

                if (type == LaunchDmaSemaphoreType.ReleaseOneWordSemaphore)
                {
                    _channel.MemoryManager.Write(address, _state.State.SetSemaphorePayload);
                }
                else /* if (type == LaunchDmaSemaphoreType.ReleaseFourWordSemaphore) */
                {
                    // The timestamp goes at offset 8, then the payload (widened to 64 bits) at offset 0.
                    _channel.MemoryManager.Write(address + 8, _context.GetTimestamp());
                    _channel.MemoryManager.Write(address, (ulong)_state.State.SetSemaphorePayload);
                }
            }
        }

        /// <summary>
        /// Performs a buffer to buffer, or buffer to texture copy.
        /// </summary>
        /// <param name="argument">The LaunchDma call argument</param>
        /// <exception cref="NotSupportedException">
        /// Thrown on the slow copy path when the pixel size or remap component size has no matching copy routine
        /// </exception>
        private void DmaCopy(int argument)
        {
            var memoryManager = _channel.MemoryManager;

            CopyFlags copyFlags = (CopyFlags)argument;

            bool srcLinear = copyFlags.HasFlag(CopyFlags.SrcLinear);
            bool dstLinear = copyFlags.HasFlag(CopyFlags.DstLinear);
            bool copy2D = copyFlags.HasFlag(CopyFlags.MultiLineEnable);
            bool remap = copyFlags.HasFlag(CopyFlags.RemapEnable);

            uint size = _state.State.LineLengthIn;

            if (size == 0)
            {
                return;
            }

            ulong srcGpuVa = ((ulong)_state.State.OffsetInUpperUpper << 32) | _state.State.OffsetInLower;
            ulong dstGpuVa = ((ulong)_state.State.OffsetOutUpperUpper << 32) | _state.State.OffsetOutLower;

            int xCount = (int)_state.State.LineLengthIn;
            int yCount = (int)_state.State.LineCount;

            _channel.TextureManager.RefreshModifiedTextures();
            _3dEngine.CreatePendingSyncs();
            _3dEngine.FlushUboDirty();

            if (copy2D)
            {
                // Buffer to texture copy.
                int componentSize = (int)_state.State.SetRemapComponentsComponentSize + 1;
                int srcComponents = (int)_state.State.SetRemapComponentsNumSrcComponents + 1;
                int dstComponents = (int)_state.State.SetRemapComponentsNumDstComponents + 1;
                int srcBpp = remap ? srcComponents * componentSize : 1;
                int dstBpp = remap ? dstComponents * componentSize : 1;

                // Reinterpret the register block starting at each *BlockSize register as a texture descriptor.
                var dst = Unsafe.As<uint, DmaTexture>(ref _state.State.SetDstBlockSize);
                var src = Unsafe.As<uint, DmaTexture>(ref _state.State.SetSrcBlockSize);

                int srcRegionX = 0, srcRegionY = 0, dstRegionX = 0, dstRegionY = 0;

                // The region origin is only taken from the descriptor for block linear textures.
                if (!srcLinear)
                {
                    srcRegionX = src.RegionX;
                    srcRegionY = src.RegionY;
                }

                if (!dstLinear)
                {
                    dstRegionX = dst.RegionX;
                    dstRegionY = dst.RegionY;
                }

                int srcStride = (int)_state.State.PitchIn;
                int dstStride = (int)_state.State.PitchOut;

                var srcCalculator = new OffsetCalculator(
                    src.Width,
                    src.Height,
                    srcStride,
                    srcLinear,
                    src.MemoryLayout.UnpackGobBlocksInY(),
                    src.MemoryLayout.UnpackGobBlocksInZ(),
                    srcBpp);

                var dstCalculator = new OffsetCalculator(
                    dst.Width,
                    dst.Height,
                    dstStride,
                    dstLinear,
                    dst.MemoryLayout.UnpackGobBlocksInY(),
                    dst.MemoryLayout.UnpackGobBlocksInZ(),
                    dstBpp);

                (int srcBaseOffset, int srcSize) = srcCalculator.GetRectangleRange(srcRegionX, srcRegionY, xCount, yCount);
                (int dstBaseOffset, int dstSize) = dstCalculator.GetRectangleRange(dstRegionX, dstRegionY, xCount, yCount);

                // With a negative stride, move the base offset back by (yCount - 1) strides.
                if (srcLinear && srcStride < 0)
                {
                    srcBaseOffset += srcStride * (yCount - 1);
                }

                if (dstLinear && dstStride < 0)
                {
                    dstBaseOffset += dstStride * (yCount - 1);
                }

                // If remapping is disabled, we always copy the components directly, in order.
                // If it's enabled, but the mapping is just XYZW, we also copy them in order.
                bool isIdentityRemap = !remap ||
                    (_state.State.SetRemapComponentsDstX == SetRemapComponentsDst.SrcX &&
                    (dstComponents < 2 || _state.State.SetRemapComponentsDstY == SetRemapComponentsDst.SrcY) &&
                    (dstComponents < 3 || _state.State.SetRemapComponentsDstZ == SetRemapComponentsDst.SrcZ) &&
                    (dstComponents < 4 || _state.State.SetRemapComponentsDstW == SetRemapComponentsDst.SrcW));

                bool completeSource = IsTextureCopyComplete(src, srcLinear, srcBpp, srcStride, xCount, yCount);
                bool completeDest = IsTextureCopyComplete(dst, dstLinear, dstBpp, dstStride, xCount, yCount);

                // Check if the source texture exists on the GPU, if it does, do a GPU side copy.
                // Otherwise, we would need to flush the source texture which is costly.
                // We don't expect the source to be linear in such cases, as linear source usually indicates buffer or CPU written data.
                if (completeSource && completeDest && !srcLinear && isIdentityRemap)
                {
                    var source = memoryManager.Physical.TextureCache.FindTexture(
                        memoryManager,
                        srcGpuVa,
                        srcBpp,
                        srcStride,
                        src.Height,
                        xCount,
                        yCount,
                        srcLinear,
                        src.MemoryLayout.UnpackGobBlocksInY(),
                        src.MemoryLayout.UnpackGobBlocksInZ());

                    if (source != null && source.Height == yCount)
                    {
                        source.SynchronizeMemory();

                        var target = memoryManager.Physical.TextureCache.FindOrCreateTexture(
                            memoryManager,
                            source.Info.FormatInfo,
                            dstGpuVa,
                            xCount,
                            yCount,
                            dstStride,
                            dstLinear,
                            dst.MemoryLayout.UnpackGobBlocksInY(),
                            dst.MemoryLayout.UnpackGobBlocksInZ());

                        if (source.ScaleFactor != target.ScaleFactor)
                        {
                            target.PropagateScale(source);
                        }

                        source.HostTexture.CopyTo(target.HostTexture, 0, 0);
                        target.SignalModified();

                        return;
                    }
                }

                ReadOnlySpan<byte> srcSpan = memoryManager.GetSpan(srcGpuVa + (ulong)srcBaseOffset, srcSize, true);

                // Try to set the texture data directly,
                // but only if we are doing a complete copy,
                // and not for block linear to linear copies, since those are typically accessed from the CPU.
                if (completeSource && completeDest && !(dstLinear && !srcLinear) && isIdentityRemap)
                {
                    var target = memoryManager.Physical.TextureCache.FindTexture(
                        memoryManager,
                        dstGpuVa,
                        dstBpp,
                        dstStride,
                        dst.Height,
                        xCount,
                        yCount,
                        dstLinear,
                        dst.MemoryLayout.UnpackGobBlocksInY(),
                        dst.MemoryLayout.UnpackGobBlocksInZ());

                    if (target != null)
                    {
                        MemoryOwner<byte> data;

                        if (srcLinear)
                        {
                            data = LayoutConverter.ConvertLinearStridedToLinear(
                                target.Info.Width,
                                target.Info.Height,
                                1,
                                1,
                                xCount * srcBpp,
                                srcStride,
                                target.Info.FormatInfo.BytesPerPixel,
                                srcSpan);
                        }
                        else
                        {
                            data = LayoutConverter.ConvertBlockLinearToLinear(
                                src.Width,
                                src.Height,
                                src.Depth,
                                1,
                                1,
                                1,
                                1,
                                1,
                                srcBpp,
                                src.MemoryLayout.UnpackGobBlocksInY(),
                                src.MemoryLayout.UnpackGobBlocksInZ(),
                                1,
                                new SizeInfo((int)target.Size),
                                srcSpan);
                        }

                        target.SynchronizeMemory();
                        target.SetData(data);
                        target.SignalModified();

                        return;
                    }
                    else if (srcCalculator.LayoutMatches(dstCalculator))
                    {
                        // No layout conversion has to be performed, just copy the data entirely.
                        memoryManager.Write(dstGpuVa + (ulong)dstBaseOffset, srcSpan);

                        return;
                    }
                }

                // OPT: This allocates a (potentially) huge temporary array and then copies an existing
                // region of memory into it, data that might get overwritten entirely anyways. Ideally this should
                // all be rewritten to use pooled arrays, but that gets complicated with packed data and strides
                Span<byte> dstSpan = memoryManager.GetSpan(dstGpuVa + (ulong)dstBaseOffset, dstSize).ToArray();

                TextureParams srcParams = new(srcRegionX, srcRegionY, srcBaseOffset, srcBpp, srcLinear, srcCalculator);
                TextureParams dstParams = new(dstRegionX, dstRegionY, dstBaseOffset, dstBpp, dstLinear, dstCalculator);

                if (isIdentityRemap)
                {
                    // The order of the components doesn't change, so we can just copy directly
                    // (with layout conversion if necessary).
                    switch (srcBpp)
                    {
                        case 1:
                            Copy<byte>(dstSpan, srcSpan, dstParams, srcParams);
                            break;
                        case 2:
                            Copy<ushort>(dstSpan, srcSpan, dstParams, srcParams);
                            break;
                        case 4:
                            Copy<uint>(dstSpan, srcSpan, dstParams, srcParams);
                            break;
                        case 8:
                            Copy<ulong>(dstSpan, srcSpan, dstParams, srcParams);
                            break;
                        case 12:
                            Copy<Bpp12Pixel>(dstSpan, srcSpan, dstParams, srcParams);
                            break;
                        case 16:
                            Copy<Vector128<byte>>(dstSpan, srcSpan, dstParams, srcParams);
                            break;
                        default:
                            throw new NotSupportedException($"Unable to copy {srcBpp} bpp pixel format.");
                    }
                }
                else
                {
                    // The order or value of the components might change.
                    switch (componentSize)
                    {
                        case 1:
                            CopyShuffle<byte>(dstSpan, srcSpan, dstParams, srcParams);
                            break;
                        case 2:
                            CopyShuffle<ushort>(dstSpan, srcSpan, dstParams, srcParams);
                            break;
                        case 3:
                            CopyShuffle<UInt24>(dstSpan, srcSpan, dstParams, srcParams);
                            break;
                        case 4:
                            CopyShuffle<uint>(dstSpan, srcSpan, dstParams, srcParams);
                            break;
                        default:
                            throw new NotSupportedException($"Unable to copy {componentSize} component size.");
                    }
                }

                memoryManager.Write(dstGpuVa + (ulong)dstBaseOffset, dstSpan);
            }
            else
            {
                if (remap &&
                    _state.State.SetRemapComponentsDstX == SetRemapComponentsDst.ConstA &&
                    _state.State.SetRemapComponentsDstY == SetRemapComponentsDst.ConstA &&
                    _state.State.SetRemapComponentsDstZ == SetRemapComponentsDst.ConstA &&
                    _state.State.SetRemapComponentsDstW == SetRemapComponentsDst.ConstA &&
                    _state.State.SetRemapComponentsNumSrcComponents == SetRemapComponentsNumComponents.One &&
                    _state.State.SetRemapComponentsNumDstComponents == SetRemapComponentsNumComponents.One &&
                    _state.State.SetRemapComponentsComponentSize == SetRemapComponentsComponentSize.Four)
                {
                    // Fast path for clears when remap is enabled.
                    memoryManager.Physical.BufferCache.ClearBuffer(memoryManager, dstGpuVa, size * 4, _state.State.SetRemapConstA);
                }
                else
                {
                    // TODO: Implement remap functionality.
                    // Buffer to buffer copy.

                    bool srcIsPitchKind = memoryManager.GetKind(srcGpuVa).IsPitch();
                    bool dstIsPitchKind = memoryManager.GetKind(dstGpuVa).IsPitch();

                    // When only one side is of a pitch kind, the GOB layout must be converted during the copy.
                    if (!srcIsPitchKind && dstIsPitchKind)
                    {
                        CopyGobBlockLinearToLinear(memoryManager, srcGpuVa, dstGpuVa, size);
                    }
                    else if (srcIsPitchKind && !dstIsPitchKind)
                    {
                        CopyGobLinearToBlockLinear(memoryManager, srcGpuVa, dstGpuVa, size);
                    }
                    else
                    {
                        memoryManager.Physical.BufferCache.CopyBuffer(memoryManager, srcGpuVa, dstGpuVa, size);
                    }
                }
            }
        }

        /// <summary>
        /// Copies data from one texture to another, while performing layout conversion if necessary.
        /// </summary>
        /// <typeparam name="T">Pixel type</typeparam>
        /// <param name="dstSpan">Destination texture memory region</param>
        /// <param name="srcSpan">Source texture memory region</param>
        /// <param name="dst">Destination texture parameters</param>
        /// <param name="src">Source texture parameters</param>
        private unsafe void Copy<T>(Span<byte> dstSpan, ReadOnlySpan<byte> srcSpan, TextureParams dst, TextureParams src) where T : unmanaged
        {
            int xCount = (int)_state.State.LineLengthIn;
            int yCount = (int)_state.State.LineCount;

            if (src.Linear && dst.Linear && src.Bpp == dst.Bpp)
            {
                // Optimized path for purely linear copies - we don't need to calculate every single byte offset,
                // and we can make use of Span.CopyTo which is very very fast (even compared to pointers)
                for (int y = 0; y < yCount; y++)
                {
                    src.Calculator.SetY(src.RegionY + y);
                    dst.Calculator.SetY(dst.RegionY + y);
                    int srcOffset = src.Calculator.GetOffset(src.RegionX);
                    int dstOffset = dst.Calculator.GetOffset(dst.RegionX);
                    srcSpan.Slice(srcOffset - src.BaseOffset, xCount * src.Bpp)
                        .CopyTo(dstSpan.Slice(dstOffset - dst.BaseOffset, xCount * dst.Bpp));
                }
            }
            else
            {
                fixed (byte* dstPtr = dstSpan, srcPtr = srcSpan)
                {
                    // Layout offset is relative to the base, so we need to subtract the span's offset.
                    byte* dstBase = dstPtr - dst.BaseOffset;
                    byte* srcBase = srcPtr - src.BaseOffset;

                    for (int y = 0; y < yCount; y++)
                    {
                        src.Calculator.SetY(src.RegionY + y);
                        dst.Calculator.SetY(dst.RegionY + y);

                        for (int x = 0; x < xCount; x++)
                        {
                            int srcOffset = src.Calculator.GetOffset(src.RegionX + x);
                            int dstOffset = dst.Calculator.GetOffset(dst.RegionX + x);

                            *(T*)(dstBase + dstOffset) = *(T*)(srcBase + srcOffset);
                        }
                    }
                }
            }
        }

        /// <summary>
        /// Sets texture pixel data to a constant value, while performing layout conversion if necessary.
        /// </summary>
        /// <typeparam name="T">Pixel type</typeparam>
        /// <param name="dstSpan">Destination texture memory region</param>
        /// <param name="dst">Destination texture parameters</param>
        /// <param name="fillValue">Constant pixel value to be set</param>
        private unsafe void Fill<T>(Span<byte> dstSpan, TextureParams dst, T fillValue) where T : unmanaged
        {
            int xCount = (int)_state.State.LineLengthIn;
            int yCount = (int)_state.State.LineCount;

            fixed (byte* dstPtr = dstSpan)
            {
                // Layout offset is relative to the base, so we need to subtract the span's offset.
                byte* dstBase = dstPtr - dst.BaseOffset;

                for (int y = 0; y < yCount; y++)
                {
                    dst.Calculator.SetY(dst.RegionY + y);

                    for (int x = 0; x < xCount; x++)
                    {
                        int dstOffset = dst.Calculator.GetOffset(dst.RegionX + x);

                        *(T*)(dstBase + dstOffset) = fillValue;
                    }
                }
            }
        }

        /// <summary>
        /// Copies data from one texture to another, while performing layout conversion and component shuffling if necessary.
        /// </summary>
        /// <typeparam name="T">Pixel type</typeparam>
        /// <param name="dstSpan">Destination texture memory region</param>
        /// <param name="srcSpan">Source texture memory region</param>
        /// <param name="dst">Destination texture parameters</param>
        /// <param name="src">Source texture parameters</param>
        private void CopyShuffle<T>(Span<byte> dstSpan, ReadOnlySpan<byte> srcSpan, TextureParams dst, TextureParams src) where T : unmanaged
        {
            int dstComponents = (int)_state.State.SetRemapComponentsNumDstComponents + 1;

            // Each destination component is handled in its own pass over the region. The spans are
            // advanced by whole components so the pass reads and writes the selected component.
            for (int i = 0; i < dstComponents; i++)
            {
                SetRemapComponentsDst componentsDst = i switch
                {
                    0 => _state.State.SetRemapComponentsDstX,
                    1 => _state.State.SetRemapComponentsDstY,
                    2 => _state.State.SetRemapComponentsDstZ,
                    _ => _state.State.SetRemapComponentsDstW,
                };

                // Any selector not listed below leaves the destination component untouched.
                switch (componentsDst)
                {
                    case SetRemapComponentsDst.SrcX:
                        Copy<T>(dstSpan[(Unsafe.SizeOf<T>() * i)..], srcSpan, dst, src);
                        break;
                    case SetRemapComponentsDst.SrcY:
                        Copy<T>(dstSpan[(Unsafe.SizeOf<T>() * i)..], srcSpan[Unsafe.SizeOf<T>()..], dst, src);
                        break;
                    case SetRemapComponentsDst.SrcZ:
                        Copy<T>(dstSpan[(Unsafe.SizeOf<T>() * i)..], srcSpan[(Unsafe.SizeOf<T>() * 2)..], dst, src);
                        break;
                    case SetRemapComponentsDst.SrcW:
                        Copy<T>(dstSpan[(Unsafe.SizeOf<T>() * i)..], srcSpan[(Unsafe.SizeOf<T>() * 3)..], dst, src);
                        break;
                    case SetRemapComponentsDst.ConstA:
                        Fill<T>(dstSpan[(Unsafe.SizeOf<T>() * i)..], dst, Unsafe.As<uint, T>(ref _state.State.SetRemapConstA));
                        break;
                    case SetRemapComponentsDst.ConstB:
                        Fill<T>(dstSpan[(Unsafe.SizeOf<T>() * i)..], dst, Unsafe.As<uint, T>(ref _state.State.SetRemapConstB));
                        break;
                }
            }
        }

        /// <summary>
        /// Copies block linear data with block linear GOBs to a block linear destination with linear GOBs.
        /// </summary>
        /// <remarks>
        /// The source is read at the GOB-converted address and the destination is written sequentially.
        /// 16-byte vectors are used when both addresses and the size are 16-byte aligned, single bytes otherwise.
        /// </remarks>
        /// <param name="memoryManager">GPU memory manager</param>
        /// <param name="srcGpuVa">Source GPU virtual address</param>
        /// <param name="dstGpuVa">Destination GPU virtual address</param>
        /// <param name="size">Size in bytes of the copy</param>
        private static void CopyGobBlockLinearToLinear(MemoryManager memoryManager, ulong srcGpuVa, ulong dstGpuVa, ulong size)
        {
            if (((srcGpuVa | dstGpuVa | size) & 0xf) == 0)
            {
                for (ulong offset = 0; offset < size; offset += 16)
                {
                    Vector128<byte> data = memoryManager.Read<Vector128<byte>>(ConvertGobLinearToBlockLinearAddress(srcGpuVa + offset), true);
                    memoryManager.Write(dstGpuVa + offset, data);
                }
            }
            else
            {
                for (ulong offset = 0; offset < size; offset++)
                {
                    byte data = memoryManager.Read<byte>(ConvertGobLinearToBlockLinearAddress(srcGpuVa + offset), true);
                    memoryManager.Write(dstGpuVa + offset, data);
                }
            }
        }

        /// <summary>
        /// Copies block linear data with linear GOBs to a block linear destination with block linear GOBs.
        /// </summary>
        /// <remarks>
        /// The source is read sequentially and the destination is written at the GOB-converted address.
        /// 16-byte vectors are used when both addresses and the size are 16-byte aligned, single bytes otherwise.
        /// </remarks>
        /// <param name="memoryManager">GPU memory manager</param>
        /// <param name="srcGpuVa">Source GPU virtual address</param>
        /// <param name="dstGpuVa">Destination GPU virtual address</param>
        /// <param name="size">Size in bytes of the copy</param>
        private static void CopyGobLinearToBlockLinear(MemoryManager memoryManager, ulong srcGpuVa, ulong dstGpuVa, ulong size)
        {
            if (((srcGpuVa | dstGpuVa | size) & 0xf) == 0)
            {
                for (ulong offset = 0; offset < size; offset += 16)
                {
                    Vector128<byte> data = memoryManager.Read<Vector128<byte>>(srcGpuVa + offset, true);
                    memoryManager.Write(ConvertGobLinearToBlockLinearAddress(dstGpuVa + offset), data);
                }
            }
            else
            {
                for (ulong offset = 0; offset < size; offset++)
                {
                    byte data = memoryManager.Read<byte>(srcGpuVa + offset, true);
                    memoryManager.Write(ConvertGobLinearToBlockLinearAddress(dstGpuVa + offset), data);
                }
            }
        }

        /// <summary>
        /// Calculates the GOB block linear address from a linear address.
        /// </summary>
        /// <remarks>
        /// Only address bits 4 to 8 are permuted; all other bits are passed through unchanged.
        /// </remarks>
        /// <param name="address">Linear address</param>
        /// <returns>Block linear address</returns>
        private static ulong ConvertGobLinearToBlockLinearAddress(ulong address)
        {
            // y2 y1 y0 x5 x4 x3 x2 x1 x0 -> x5 y2 y1 x4 y0 x3 x2 x1 x0
            return (address & ~0x1f0UL) |
                ((address & 0x40) >> 2) |
                ((address & 0x10) << 1) |
                ((address & 0x180) >> 1) |
                ((address & 0x20) << 3);
        }

        /// <summary>
        /// Performs a buffer to buffer, or buffer to texture copy, then optionally releases a semaphore.
        /// </summary>
        /// <param name="argument">Method call argument</param>
        private void LaunchDma(int argument)
        {
            DmaCopy(argument);
            ReleaseSemaphore(argument);
        }
    }
}