aboutsummaryrefslogtreecommitdiff
path: root/Ryujinx.Graphics.Texture
diff options
context:
space:
mode:
Diffstat (limited to 'Ryujinx.Graphics.Texture')
-rw-r--r--Ryujinx.Graphics.Texture/Astc/AstcDecoder.cs2
-rw-r--r--Ryujinx.Graphics.Texture/BC6Decoder.cs819
-rw-r--r--Ryujinx.Graphics.Texture/BC7Decoder.cs220
-rw-r--r--Ryujinx.Graphics.Texture/BCnDecoder.cs114
-rw-r--r--Ryujinx.Graphics.Texture/BCnEncoder.cs60
-rw-r--r--Ryujinx.Graphics.Texture/Encoders/BC7Encoder.cs1005
-rw-r--r--Ryujinx.Graphics.Texture/Encoders/EncodeMode.cs10
-rw-r--r--Ryujinx.Graphics.Texture/Utils/BC67Tables.cs297
-rw-r--r--Ryujinx.Graphics.Texture/Utils/BC67Utils.cs1327
-rw-r--r--Ryujinx.Graphics.Texture/Utils/BC7ModeInfo.cs37
-rw-r--r--Ryujinx.Graphics.Texture/Utils/Block.cs55
-rw-r--r--Ryujinx.Graphics.Texture/Utils/RgbaColor32.cs229
-rw-r--r--Ryujinx.Graphics.Texture/Utils/RgbaColor8.cs84
13 files changed, 4244 insertions, 15 deletions
diff --git a/Ryujinx.Graphics.Texture/Astc/AstcDecoder.cs b/Ryujinx.Graphics.Texture/Astc/AstcDecoder.cs
index 238f46a0..08738583 100644
--- a/Ryujinx.Graphics.Texture/Astc/AstcDecoder.cs
+++ b/Ryujinx.Graphics.Texture/Astc/AstcDecoder.cs
@@ -291,7 +291,7 @@ namespace Ryujinx.Graphics.Texture.Astc
int depth,
int levels,
int layers,
- out Span<byte> decoded)
+ out byte[] decoded)
{
byte[] output = new byte[QueryDecompressedSize(width, height, depth, levels, layers)];
diff --git a/Ryujinx.Graphics.Texture/BC6Decoder.cs b/Ryujinx.Graphics.Texture/BC6Decoder.cs
new file mode 100644
index 00000000..819bf022
--- /dev/null
+++ b/Ryujinx.Graphics.Texture/BC6Decoder.cs
@@ -0,0 +1,819 @@
+using Ryujinx.Graphics.Texture.Utils;
+using System;
+using System.Runtime.InteropServices;
+
+namespace Ryujinx.Graphics.Texture
+{
+ static class BC6Decoder
+ {
+ private const int HalfOne = 0x3C00;
+
+ /// <summary>
+ /// Decodes BC6H compressed data, writing one 64-bit texel (four 16-bit channels) per pixel.
+ /// </summary>
+ /// <param name="output">Destination buffer, reinterpreted as 64-bit texels laid out row by row</param>
+ /// <param name="data">Compressed input, reinterpreted as a sequence of blocks</param>
+ /// <param name="width">Width of the image in texels</param>
+ /// <param name="height">Height of the image in texels</param>
+ /// <param name="signed">Forwarded to the block decoder to select signed or unsigned decoding</param>
+ public static void Decode(Span<byte> output, ReadOnlySpan<byte> data, int width, int height, bool signed)
+ {
+     ReadOnlySpan<Block> compressed = MemoryMarshal.Cast<byte, Block>(data);
+     Span<ulong> texels = MemoryMarshal.Cast<byte, ulong>(output);
+
+     // Each block covers 4x4 texels, partial blocks on the edges still count as one.
+     int blocksPerRow = (width + 3) / 4;
+     int blockRows = (height + 3) / 4;
+
+     for (int blockY = 0; blockY < blockRows; blockY++)
+     {
+         int texelY = blockY * 4;
+         int rowsInBlock = Math.Min(4, height - texelY);
+
+         int blockRowBase = blockY * blocksPerRow;
+         int texelRowBase = texelY * width;
+
+         for (int blockX = 0; blockX < blocksPerRow; blockX++)
+         {
+             int texelX = blockX * 4;
+             int columnsInBlock = Math.Min(4, width - texelX);
+
+             DecodeBlock(
+                 compressed[blockRowBase + blockX],
+                 texels.Slice(texelRowBase + texelX),
+                 columnsInBlock,
+                 rowsInBlock,
+                 width,
+                 signed);
+         }
+     }
+ }
+
+ /// <summary>
+ /// Decodes a single BC6H block into the output texels.
+ /// </summary>
+ /// <param name="block">Compressed block to decode</param>
+ /// <param name="output">Destination texels, starting at the top-left texel of the block</param>
+ /// <param name="w">Number of texel columns of the block to write</param>
+ /// <param name="h">Number of texel rows of the block to write</param>
+ /// <param name="width">Distance between two consecutive output rows, in texels</param>
+ /// <param name="signed">Selects signed or unsigned end point decoding</param>
+ private static void DecodeBlock(Block block, Span<ulong> output, int w, int h, int width, bool signed)
+ {
+     // The alpha channel always holds HalfOne, on the upper 16 bits of the texel.
+     const ulong OpaqueAlpha = (ulong)HalfOne << 48;
+
+     // The mode field is 2 bits wide, unless bit 1 is set, in which case it is 5 bits wide.
+     int mode = (int)(block.Low & 3);
+     if ((mode & 2) != 0)
+     {
+         mode = (int)(block.Low & 0x1f);
+     }
+
+     Span<RgbaColor32> endPoints = stackalloc RgbaColor32[4];
+     int subsetCount = DecodeEndPoints(ref block, endPoints, mode, signed);
+
+     if (subsetCount == 0)
+     {
+         // Mode is invalid, the spec mandates that hardware fills the block with
+         // an opaque black color.
+         for (int row = 0; row < h; row++)
+         {
+             int rowStart = row * width;
+
+             for (int column = 0; column < w; column++)
+             {
+                 output[rowStart + column] = OpaqueAlpha;
+             }
+         }
+
+         return;
+     }
+
+     int partition;
+     int indexBitCount;
+     ulong indices;
+
+     if (subsetCount <= 1)
+     {
+         // Single subset: 4-bit indices, the first one is stored with 3 bits only.
+         partition = 0;
+         indexBitCount = 4;
+         indices = (block.High & ~0xFUL) | ((block.High >> 1) & 7);
+     }
+     else
+     {
+         // Two subsets: 3-bit indices. The first index and the index at the fix-up position
+         // of the second subset are stored with 2 bits only, so the stream is re-aligned here.
+         partition = (int)((block.High >> 13) & 0x1F);
+         indexBitCount = 3;
+
+         int fixUpIndex = BC67Tables.FixUpIndices[subsetCount - 1][partition][1] * 3;
+         ulong lowMask = (ulong.MaxValue >> (65 - fixUpIndex)) << 3;
+         ulong highMask = ulong.MaxValue << (fixUpIndex + 3);
+
+         ulong firstIndex = (block.High >> 18) & 3;
+         ulong beforeFixUp = (block.High >> 17) & lowMask;
+         ulong afterFixUp = (block.High >> 16) & highMask;
+
+         indices = afterFixUp | beforeFixUp | firstIndex;
+     }
+
+     ulong indexMask = (1UL << indexBitCount) - 1;
+     byte[] subsetOfTexel = BC67Tables.PartitionTable[subsetCount - 1][partition];
+
+     for (int row = 0; row < h; row++)
+     {
+         int rowStart = row * width;
+
+         for (int column = 0; column < w; column++)
+         {
+             int index = (int)(indices & indexMask);
+             indices >>= indexBitCount;
+
+             // Each subset owns a pair of consecutive end points.
+             int firstEndPoint = subsetOfTexel[row * 4 + column] << 1;
+
+             RgbaColor32 color = BC67Utils.Interpolate(
+                 endPoints[firstEndPoint],
+                 endPoints[firstEndPoint + 1],
+                 index,
+                 indexBitCount);
+
+             ulong red = FinishUnquantize(color.R, signed);
+             ulong green = FinishUnquantize(color.G, signed);
+             ulong blue = FinishUnquantize(color.B, signed);
+
+             output[rowStart + column] = red | (green << 16) | (blue << 32) | OpaqueAlpha;
+         }
+     }
+ }
+
+ /// <summary>
+ /// Extracts and unquantizes the end points of a BC6H block.
+ /// </summary>
+ /// <remarks>
+ /// For the modes that store end points as deltas from the first end point, the sum is wrapped to the
+ /// end point precision and, on signed formats, sign extended before being unquantized.
+ /// Without the sign extension, negative end points of signed blocks would be decoded as large positive values.
+ /// </remarks>
+ /// <param name="block">Compressed block</param>
+ /// <param name="endPoints">Receives 2 end points per subset (entries 0-1, and 2-3 when there are two subsets)</param>
+ /// <param name="mode">Block mode, as extracted from the low bits of the block</param>
+ /// <param name="signed">True to decode the end points as signed values</param>
+ /// <returns>Number of subsets on the block (1 or 2), or 0 if the mode is not one of the handled modes</returns>
+ private static int DecodeEndPoints(ref Block block, Span<RgbaColor32> endPoints, int mode, bool signed)
+ {
+     ulong low = block.Low;
+     ulong high = block.High;
+
+     int r0 = 0, g0 = 0, b0 = 0, r1 = 0, g1 = 0, b1 = 0, r2 = 0, g2 = 0, b2 = 0, r3 = 0, g3 = 0, b3 = 0;
+     int subsetCount;
+
+     switch (mode)
+     {
+         case 0:
+             r0 = (int)(low >> 5) & 0x3FF;
+             g0 = (int)(low >> 15) & 0x3FF;
+             b0 = (int)(low >> 25) & 0x3FF;
+
+             if (signed)
+             {
+                 r0 = SignExtend(r0, 10);
+                 g0 = SignExtend(g0, 10);
+                 b0 = SignExtend(b0, 10);
+             }
+
+             r1 = r0 + SignExtend((int)(low >> 35), 5);
+             g1 = g0 + SignExtend((int)(low >> 45), 5);
+             b1 = b0 + SignExtend((int)(low >> 55), 5);
+
+             r2 = r0 + SignExtend((int)(high >> 1), 5);
+             g2 = g0 + SignExtend((int)(((low << 2) & 0x10) | ((low >> 41) & 0xF)), 5);
+             b2 = b0 + SignExtend((int)(((low << 1) & 0x10) | ((high << 3) & 0x08) | (low >> 61)), 5);
+
+             r3 = r0 + SignExtend((int)(high >> 7), 5);
+             g3 = g0 + SignExtend((int)(((low >> 36) & 0x10) | ((low >> 51) & 0xF)), 5);
+             b3 = b0 + SignExtend((int)(
+                 ((low) & 0x10) |
+                 ((high >> 9) & 0x08) |
+                 ((high >> 4) & 0x04) |
+                 ((low >> 59) & 0x02) |
+                 ((low >> 50) & 0x01)), 5);
+
+             r0 = Unquantize(r0, 10, signed);
+             g0 = Unquantize(g0, 10, signed);
+             b0 = Unquantize(b0, 10, signed);
+
+             r1 = Unquantize(WrapEndPoint(r1, 10, signed), 10, signed);
+             g1 = Unquantize(WrapEndPoint(g1, 10, signed), 10, signed);
+             b1 = Unquantize(WrapEndPoint(b1, 10, signed), 10, signed);
+
+             r2 = Unquantize(WrapEndPoint(r2, 10, signed), 10, signed);
+             g2 = Unquantize(WrapEndPoint(g2, 10, signed), 10, signed);
+             b2 = Unquantize(WrapEndPoint(b2, 10, signed), 10, signed);
+
+             r3 = Unquantize(WrapEndPoint(r3, 10, signed), 10, signed);
+             g3 = Unquantize(WrapEndPoint(g3, 10, signed), 10, signed);
+             b3 = Unquantize(WrapEndPoint(b3, 10, signed), 10, signed);
+
+             subsetCount = 2;
+             break;
+         case 1:
+             r0 = (int)(low >> 5) & 0x7F;
+             g0 = (int)(low >> 15) & 0x7F;
+             b0 = (int)(low >> 25) & 0x7F;
+
+             if (signed)
+             {
+                 r0 = SignExtend(r0, 7);
+                 g0 = SignExtend(g0, 7);
+                 b0 = SignExtend(b0, 7);
+             }
+
+             r1 = r0 + SignExtend((int)(low >> 35), 6);
+             g1 = g0 + SignExtend((int)(low >> 45), 6);
+             b1 = b0 + SignExtend((int)(low >> 55), 6);
+
+             r2 = r0 + SignExtend((int)(high >> 1), 6);
+             g2 = g0 + SignExtend((int)(((low << 3) & 0x20) | ((low >> 20) & 0x10) | ((low >> 41) & 0x0F)), 6);
+             b2 = b0 + SignExtend((int)(
+                 ((low >> 17) & 0x20) |
+                 ((low >> 10) & 0x10) |
+                 ((high << 3) & 0x08) |
+                 (low >> 61)), 6);
+
+             r3 = r0 + SignExtend((int)(high >> 7), 6);
+             g3 = g0 + SignExtend((int)(((low << 1) & 0x30) | ((low >> 51) & 0xF)), 6);
+             b3 = b0 + SignExtend((int)(
+                 ((low >> 28) & 0x20) |
+                 ((low >> 30) & 0x10) |
+                 ((low >> 29) & 0x08) |
+                 ((low >> 21) & 0x04) |
+                 ((low >> 12) & 0x03)), 6);
+
+             r0 = Unquantize(r0, 7, signed);
+             g0 = Unquantize(g0, 7, signed);
+             b0 = Unquantize(b0, 7, signed);
+
+             r1 = Unquantize(WrapEndPoint(r1, 7, signed), 7, signed);
+             g1 = Unquantize(WrapEndPoint(g1, 7, signed), 7, signed);
+             b1 = Unquantize(WrapEndPoint(b1, 7, signed), 7, signed);
+
+             r2 = Unquantize(WrapEndPoint(r2, 7, signed), 7, signed);
+             g2 = Unquantize(WrapEndPoint(g2, 7, signed), 7, signed);
+             b2 = Unquantize(WrapEndPoint(b2, 7, signed), 7, signed);
+
+             r3 = Unquantize(WrapEndPoint(r3, 7, signed), 7, signed);
+             g3 = Unquantize(WrapEndPoint(g3, 7, signed), 7, signed);
+             b3 = Unquantize(WrapEndPoint(b3, 7, signed), 7, signed);
+
+             subsetCount = 2;
+             break;
+         case 2:
+             r0 = (int)(((low >> 30) & 0x400) | ((low >> 5) & 0x3FF));
+             g0 = (int)(((low >> 39) & 0x400) | ((low >> 15) & 0x3FF));
+             b0 = (int)(((low >> 49) & 0x400) | ((low >> 25) & 0x3FF));
+
+             if (signed)
+             {
+                 r0 = SignExtend(r0, 11);
+                 g0 = SignExtend(g0, 11);
+                 b0 = SignExtend(b0, 11);
+             }
+
+             r1 = r0 + SignExtend((int)(low >> 35), 5);
+             g1 = g0 + SignExtend((int)(low >> 45), 4);
+             b1 = b0 + SignExtend((int)(low >> 55), 4);
+
+             r2 = r0 + SignExtend((int)(high >> 1), 5);
+             g2 = g0 + SignExtend((int)(low >> 41), 4);
+             b2 = b0 + SignExtend((int)(((high << 3) & 8) | (low >> 61)), 4);
+
+             r3 = r0 + SignExtend((int)(high >> 7), 5);
+             g3 = g0 + SignExtend((int)(low >> 51), 4);
+             b3 = b0 + SignExtend((int)(
+                 ((high >> 9) & 8) |
+                 ((high >> 4) & 4) |
+                 ((low >> 59) & 2) |
+                 ((low >> 50) & 1)), 4);
+
+             r0 = Unquantize(r0, 11, signed);
+             g0 = Unquantize(g0, 11, signed);
+             b0 = Unquantize(b0, 11, signed);
+
+             r1 = Unquantize(WrapEndPoint(r1, 11, signed), 11, signed);
+             g1 = Unquantize(WrapEndPoint(g1, 11, signed), 11, signed);
+             b1 = Unquantize(WrapEndPoint(b1, 11, signed), 11, signed);
+
+             r2 = Unquantize(WrapEndPoint(r2, 11, signed), 11, signed);
+             g2 = Unquantize(WrapEndPoint(g2, 11, signed), 11, signed);
+             b2 = Unquantize(WrapEndPoint(b2, 11, signed), 11, signed);
+
+             r3 = Unquantize(WrapEndPoint(r3, 11, signed), 11, signed);
+             g3 = Unquantize(WrapEndPoint(g3, 11, signed), 11, signed);
+             b3 = Unquantize(WrapEndPoint(b3, 11, signed), 11, signed);
+
+             subsetCount = 2;
+             break;
+         case 3:
+             // End points of this mode are stored directly rather than as deltas, no wrapping is needed.
+             r0 = (int)(low >> 5) & 0x3FF;
+             g0 = (int)(low >> 15) & 0x3FF;
+             b0 = (int)(low >> 25) & 0x3FF;
+
+             r1 = (int)(low >> 35) & 0x3FF;
+             g1 = (int)(low >> 45) & 0x3FF;
+             b1 = (int)(((high << 9) & 0x200) | (low >> 55));
+
+             if (signed)
+             {
+                 r0 = SignExtend(r0, 10);
+                 g0 = SignExtend(g0, 10);
+                 b0 = SignExtend(b0, 10);
+
+                 r1 = SignExtend(r1, 10);
+                 g1 = SignExtend(g1, 10);
+                 b1 = SignExtend(b1, 10);
+             }
+
+             r0 = Unquantize(r0, 10, signed);
+             g0 = Unquantize(g0, 10, signed);
+             b0 = Unquantize(b0, 10, signed);
+
+             r1 = Unquantize(r1, 10, signed);
+             g1 = Unquantize(g1, 10, signed);
+             b1 = Unquantize(b1, 10, signed);
+
+             subsetCount = 1;
+             break;
+         case 6:
+             r0 = (int)(((low >> 29) & 0x400) | ((low >> 5) & 0x3FF));
+             g0 = (int)(((low >> 40) & 0x400) | ((low >> 15) & 0x3FF));
+             b0 = (int)(((low >> 49) & 0x400) | ((low >> 25) & 0x3FF));
+
+             if (signed)
+             {
+                 r0 = SignExtend(r0, 11);
+                 g0 = SignExtend(g0, 11);
+                 b0 = SignExtend(b0, 11);
+             }
+
+             r1 = r0 + SignExtend((int)(low >> 35), 4);
+             g1 = g0 + SignExtend((int)(low >> 45), 5);
+             b1 = b0 + SignExtend((int)(low >> 55), 4);
+
+             r2 = r0 + SignExtend((int)(high >> 1), 4);
+             g2 = g0 + SignExtend((int)(((high >> 7) & 0x10) | ((low >> 41) & 0x0F)), 5);
+             b2 = b0 + SignExtend((int)(((high << 3) & 0x08) | ((low >> 61))), 4);
+
+             r3 = r0 + SignExtend((int)(high >> 7), 4);
+             g3 = g0 + SignExtend((int)(((low >> 36) & 0x10) | ((low >> 51) & 0x0F)), 5);
+             b3 = b0 + SignExtend((int)(
+                 ((high >> 9) & 8) |
+                 ((high >> 4) & 4) |
+                 ((low >> 59) & 2) |
+                 ((high >> 5) & 1)), 4);
+
+             r0 = Unquantize(r0, 11, signed);
+             g0 = Unquantize(g0, 11, signed);
+             b0 = Unquantize(b0, 11, signed);
+
+             r1 = Unquantize(WrapEndPoint(r1, 11, signed), 11, signed);
+             g1 = Unquantize(WrapEndPoint(g1, 11, signed), 11, signed);
+             b1 = Unquantize(WrapEndPoint(b1, 11, signed), 11, signed);
+
+             r2 = Unquantize(WrapEndPoint(r2, 11, signed), 11, signed);
+             g2 = Unquantize(WrapEndPoint(g2, 11, signed), 11, signed);
+             b2 = Unquantize(WrapEndPoint(b2, 11, signed), 11, signed);
+
+             r3 = Unquantize(WrapEndPoint(r3, 11, signed), 11, signed);
+             g3 = Unquantize(WrapEndPoint(g3, 11, signed), 11, signed);
+             b3 = Unquantize(WrapEndPoint(b3, 11, signed), 11, signed);
+
+             subsetCount = 2;
+             break;
+         case 7:
+             r0 = (int)(((low >> 34) & 0x400) | ((low >> 5) & 0x3FF));
+             g0 = (int)(((low >> 44) & 0x400) | ((low >> 15) & 0x3FF));
+             b0 = (int)(((high << 10) & 0x400) | ((low >> 25) & 0x3FF));
+
+             if (signed)
+             {
+                 r0 = SignExtend(r0, 11);
+                 g0 = SignExtend(g0, 11);
+                 b0 = SignExtend(b0, 11);
+             }
+
+             r1 = WrapEndPoint(r0 + SignExtend((int)(low >> 35), 9), 11, signed);
+             g1 = WrapEndPoint(g0 + SignExtend((int)(low >> 45), 9), 11, signed);
+             b1 = WrapEndPoint(b0 + SignExtend((int)(low >> 55), 9), 11, signed);
+
+             r0 = Unquantize(r0, 11, signed);
+             g0 = Unquantize(g0, 11, signed);
+             b0 = Unquantize(b0, 11, signed);
+
+             r1 = Unquantize(r1, 11, signed);
+             g1 = Unquantize(g1, 11, signed);
+             b1 = Unquantize(b1, 11, signed);
+
+             subsetCount = 1;
+             break;
+         case 10:
+             r0 = (int)(((low >> 29) & 0x400) | ((low >> 5) & 0x3FF));
+             g0 = (int)(((low >> 39) & 0x400) | ((low >> 15) & 0x3FF));
+             b0 = (int)(((low >> 50) & 0x400) | ((low >> 25) & 0x3FF));
+
+             if (signed)
+             {
+                 r0 = SignExtend(r0, 11);
+                 g0 = SignExtend(g0, 11);
+                 b0 = SignExtend(b0, 11);
+             }
+
+             r1 = r0 + SignExtend((int)(low >> 35), 4);
+             g1 = g0 + SignExtend((int)(low >> 45), 4);
+             b1 = b0 + SignExtend((int)(low >> 55), 5);
+
+             r2 = r0 + SignExtend((int)(high >> 1), 4);
+             g2 = g0 + SignExtend((int)(low >> 41), 4);
+             b2 = b0 + SignExtend((int)(((low >> 36) & 0x10) | ((high << 3) & 8) | (low >> 61)), 5);
+
+             r3 = r0 + SignExtend((int)(high >> 7), 4);
+             g3 = g0 + SignExtend((int)(low >> 51), 4);
+             b3 = b0 + SignExtend((int)(
+                 ((high >> 7) & 0x10) |
+                 ((high >> 9) & 0x08) |
+                 ((high >> 4) & 0x06) |
+                 ((low >> 50) & 0x01)), 5);
+
+             r0 = Unquantize(r0, 11, signed);
+             g0 = Unquantize(g0, 11, signed);
+             b0 = Unquantize(b0, 11, signed);
+
+             r1 = Unquantize(WrapEndPoint(r1, 11, signed), 11, signed);
+             g1 = Unquantize(WrapEndPoint(g1, 11, signed), 11, signed);
+             b1 = Unquantize(WrapEndPoint(b1, 11, signed), 11, signed);
+
+             r2 = Unquantize(WrapEndPoint(r2, 11, signed), 11, signed);
+             g2 = Unquantize(WrapEndPoint(g2, 11, signed), 11, signed);
+             b2 = Unquantize(WrapEndPoint(b2, 11, signed), 11, signed);
+
+             r3 = Unquantize(WrapEndPoint(r3, 11, signed), 11, signed);
+             g3 = Unquantize(WrapEndPoint(g3, 11, signed), 11, signed);
+             b3 = Unquantize(WrapEndPoint(b3, 11, signed), 11, signed);
+
+             subsetCount = 2;
+             break;
+         case 11:
+             r0 = (int)(((low >> 32) & 0x800) | ((low >> 34) & 0x400) | ((low >> 5) & 0x3FF));
+             g0 = (int)(((low >> 42) & 0x800) | ((low >> 44) & 0x400) | ((low >> 15) & 0x3FF));
+             b0 = (int)(((low >> 52) & 0x800) | ((high << 10) & 0x400) | ((low >> 25) & 0x3FF));
+
+             if (signed)
+             {
+                 r0 = SignExtend(r0, 12);
+                 g0 = SignExtend(g0, 12);
+                 b0 = SignExtend(b0, 12);
+             }
+
+             r1 = WrapEndPoint(r0 + SignExtend((int)(low >> 35), 8), 12, signed);
+             g1 = WrapEndPoint(g0 + SignExtend((int)(low >> 45), 8), 12, signed);
+             b1 = WrapEndPoint(b0 + SignExtend((int)(low >> 55), 8), 12, signed);
+
+             r0 = Unquantize(r0, 12, signed);
+             g0 = Unquantize(g0, 12, signed);
+             b0 = Unquantize(b0, 12, signed);
+
+             r1 = Unquantize(r1, 12, signed);
+             g1 = Unquantize(g1, 12, signed);
+             b1 = Unquantize(b1, 12, signed);
+
+             subsetCount = 1;
+             break;
+         case 14:
+             r0 = (int)(low >> 5) & 0x1FF;
+             g0 = (int)(low >> 15) & 0x1FF;
+             b0 = (int)(low >> 25) & 0x1FF;
+
+             if (signed)
+             {
+                 r0 = SignExtend(r0, 9);
+                 g0 = SignExtend(g0, 9);
+                 b0 = SignExtend(b0, 9);
+             }
+
+             r1 = r0 + SignExtend((int)(low >> 35), 5);
+             g1 = g0 + SignExtend((int)(low >> 45), 5);
+             b1 = b0 + SignExtend((int)(low >> 55), 5);
+
+             r2 = r0 + SignExtend((int)(high >> 1), 5);
+             g2 = g0 + SignExtend((int)(((low >> 20) & 0x10) | ((low >> 41) & 0xF)), 5);
+             b2 = b0 + SignExtend((int)(((low >> 10) & 0x10) | ((high << 3) & 8) | (low >> 61)), 5);
+
+             r3 = r0 + SignExtend((int)(high >> 7), 5);
+             g3 = g0 + SignExtend((int)(((low >> 36) & 0x10) | ((low >> 51) & 0xF)), 5);
+             b3 = b0 + SignExtend((int)(
+                 ((low >> 30) & 0x10) |
+                 ((high >> 9) & 0x08) |
+                 ((high >> 4) & 0x04) |
+                 ((low >> 59) & 0x02) |
+                 ((low >> 50) & 0x01)), 5);
+
+             r0 = Unquantize(r0, 9, signed);
+             g0 = Unquantize(g0, 9, signed);
+             b0 = Unquantize(b0, 9, signed);
+
+             r1 = Unquantize(WrapEndPoint(r1, 9, signed), 9, signed);
+             g1 = Unquantize(WrapEndPoint(g1, 9, signed), 9, signed);
+             b1 = Unquantize(WrapEndPoint(b1, 9, signed), 9, signed);
+
+             r2 = Unquantize(WrapEndPoint(r2, 9, signed), 9, signed);
+             g2 = Unquantize(WrapEndPoint(g2, 9, signed), 9, signed);
+             b2 = Unquantize(WrapEndPoint(b2, 9, signed), 9, signed);
+
+             r3 = Unquantize(WrapEndPoint(r3, 9, signed), 9, signed);
+             g3 = Unquantize(WrapEndPoint(g3, 9, signed), 9, signed);
+             b3 = Unquantize(WrapEndPoint(b3, 9, signed), 9, signed);
+
+             subsetCount = 2;
+             break;
+         case 15:
+             // The upper 6 bits of each base end point are stored in reverse bit order.
+             r0 = (BitReverse6((int)(low >> 39) & 0x3F) << 10) | ((int)(low >> 5) & 0x3FF);
+             g0 = (BitReverse6((int)(low >> 49) & 0x3F) << 10) | ((int)(low >> 15) & 0x3FF);
+             b0 = ((BitReverse6((int)(low >> 59)) | (int)(high & 1)) << 10) | ((int)(low >> 25) & 0x3FF);
+
+             if (signed)
+             {
+                 r0 = SignExtend(r0, 16);
+                 g0 = SignExtend(g0, 16);
+                 b0 = SignExtend(b0, 16);
+             }
+
+             // 16-bit end points are used as is (Unquantize is the identity at this precision).
+             r1 = WrapEndPoint(r0 + SignExtend((int)(low >> 35), 4), 16, signed);
+             g1 = WrapEndPoint(g0 + SignExtend((int)(low >> 45), 4), 16, signed);
+             b1 = WrapEndPoint(b0 + SignExtend((int)(low >> 55), 4), 16, signed);
+
+             subsetCount = 1;
+             break;
+         case 18:
+             r0 = (int)(low >> 5) & 0xFF;
+             g0 = (int)(low >> 15) & 0xFF;
+             b0 = (int)(low >> 25) & 0xFF;
+
+             if (signed)
+             {
+                 r0 = SignExtend(r0, 8);
+                 g0 = SignExtend(g0, 8);
+                 b0 = SignExtend(b0, 8);
+             }
+
+             r1 = r0 + SignExtend((int)(low >> 35), 6);
+             g1 = g0 + SignExtend((int)(low >> 45), 5);
+             b1 = b0 + SignExtend((int)(low >> 55), 5);
+
+             r2 = r0 + SignExtend((int)(high >> 1), 6);
+             g2 = g0 + SignExtend((int)(((low >> 20) & 0x10) | ((low >> 41) & 0xF)), 5);
+             b2 = b0 + SignExtend((int)(((low >> 10) & 0x10) | ((high << 3) & 8) | (low >> 61)), 5);
+
+             r3 = r0 + SignExtend((int)(high >> 7), 6);
+             g3 = g0 + SignExtend((int)(((low >> 9) & 0x10) | ((low >> 51) & 0xF)), 5);
+             b3 = b0 + SignExtend((int)(
+                 ((low >> 30) & 0x18) |
+                 ((low >> 21) & 0x04) |
+                 ((low >> 59) & 0x02) |
+                 ((low >> 50) & 0x01)), 5);
+
+             r0 = Unquantize(r0, 8, signed);
+             g0 = Unquantize(g0, 8, signed);
+             b0 = Unquantize(b0, 8, signed);
+
+             r1 = Unquantize(WrapEndPoint(r1, 8, signed), 8, signed);
+             g1 = Unquantize(WrapEndPoint(g1, 8, signed), 8, signed);
+             b1 = Unquantize(WrapEndPoint(b1, 8, signed), 8, signed);
+
+             r2 = Unquantize(WrapEndPoint(r2, 8, signed), 8, signed);
+             g2 = Unquantize(WrapEndPoint(g2, 8, signed), 8, signed);
+             b2 = Unquantize(WrapEndPoint(b2, 8, signed), 8, signed);
+
+             r3 = Unquantize(WrapEndPoint(r3, 8, signed), 8, signed);
+             g3 = Unquantize(WrapEndPoint(g3, 8, signed), 8, signed);
+             b3 = Unquantize(WrapEndPoint(b3, 8, signed), 8, signed);
+
+             subsetCount = 2;
+             break;
+         case 22:
+             r0 = (int)(low >> 5) & 0xFF;
+             g0 = (int)(low >> 15) & 0xFF;
+             b0 = (int)(low >> 25) & 0xFF;
+
+             if (signed)
+             {
+                 r0 = SignExtend(r0, 8);
+                 g0 = SignExtend(g0, 8);
+                 b0 = SignExtend(b0, 8);
+             }
+
+             r1 = r0 + SignExtend((int)(low >> 35), 5);
+             g1 = g0 + SignExtend((int)(low >> 45), 6);
+             b1 = b0 + SignExtend((int)(low >> 55), 5);
+
+             r2 = r0 + SignExtend((int)(high >> 1), 5);
+             g2 = g0 + SignExtend((int)(((low >> 18) & 0x20) | ((low >> 20) & 0x10) | ((low >> 41) & 0xF)), 6);
+             b2 = b0 + SignExtend((int)(((low >> 10) & 0x10) | ((high << 3) & 0x08) | (low >> 61)), 5);
+
+             r3 = r0 + SignExtend((int)(high >> 7), 5);
+             g3 = g0 + SignExtend((int)(((low >> 28) & 0x20) | ((low >> 36) & 0x10) | ((low >> 51) & 0x0F)), 6);
+             b3 = b0 + SignExtend((int)(
+                 ((low >> 30) & 0x10) |
+                 ((high >> 9) & 0x08) |
+                 ((high >> 4) & 0x04) |
+                 ((low >> 59) & 0x02) |
+                 ((low >> 13) & 0x01)), 5);
+
+             r0 = Unquantize(r0, 8, signed);
+             g0 = Unquantize(g0, 8, signed);
+             b0 = Unquantize(b0, 8, signed);
+
+             r1 = Unquantize(WrapEndPoint(r1, 8, signed), 8, signed);
+             g1 = Unquantize(WrapEndPoint(g1, 8, signed), 8, signed);
+             b1 = Unquantize(WrapEndPoint(b1, 8, signed), 8, signed);
+
+             r2 = Unquantize(WrapEndPoint(r2, 8, signed), 8, signed);
+             g2 = Unquantize(WrapEndPoint(g2, 8, signed), 8, signed);
+             b2 = Unquantize(WrapEndPoint(b2, 8, signed), 8, signed);
+
+             r3 = Unquantize(WrapEndPoint(r3, 8, signed), 8, signed);
+             g3 = Unquantize(WrapEndPoint(g3, 8, signed), 8, signed);
+             b3 = Unquantize(WrapEndPoint(b3, 8, signed), 8, signed);
+
+             subsetCount = 2;
+             break;
+         case 26:
+             r0 = (int)(low >> 5) & 0xFF;
+             g0 = (int)(low >> 15) & 0xFF;
+             b0 = (int)(low >> 25) & 0xFF;
+
+             if (signed)
+             {
+                 r0 = SignExtend(r0, 8);
+                 g0 = SignExtend(g0, 8);
+                 b0 = SignExtend(b0, 8);
+             }
+
+             r1 = r0 + SignExtend((int)(low >> 35), 5);
+             g1 = g0 + SignExtend((int)(low >> 45), 5);
+             b1 = b0 + SignExtend((int)(low >> 55), 6);
+
+             r2 = r0 + SignExtend((int)(high >> 1), 5);
+             g2 = g0 + SignExtend((int)(((low >> 20) & 0x10) | ((low >> 41) & 0xF)), 5);
+             b2 = b0 + SignExtend((int)(
+                 ((low >> 18) & 0x20) |
+                 ((low >> 10) & 0x10) |
+                 ((high << 3) & 0x08) |
+                 (low >> 61)), 6);
+
+             r3 = r0 + SignExtend((int)(high >> 7), 5);
+             g3 = g0 + SignExtend((int)(((low >> 36) & 0x10) | ((low >> 51) & 0xF)), 5);
+             b3 = b0 + SignExtend((int)(
+                 ((low >> 28) & 0x20) |
+                 ((low >> 30) & 0x10) |
+                 ((high >> 9) & 0x08) |
+                 ((high >> 4) & 0x04) |
+                 ((low >> 12) & 0x02) |
+                 ((low >> 50) & 0x01)), 6);
+
+             r0 = Unquantize(r0, 8, signed);
+             g0 = Unquantize(g0, 8, signed);
+             b0 = Unquantize(b0, 8, signed);
+
+             r1 = Unquantize(WrapEndPoint(r1, 8, signed), 8, signed);
+             g1 = Unquantize(WrapEndPoint(g1, 8, signed), 8, signed);
+             b1 = Unquantize(WrapEndPoint(b1, 8, signed), 8, signed);
+
+             r2 = Unquantize(WrapEndPoint(r2, 8, signed), 8, signed);
+             g2 = Unquantize(WrapEndPoint(g2, 8, signed), 8, signed);
+             b2 = Unquantize(WrapEndPoint(b2, 8, signed), 8, signed);
+
+             r3 = Unquantize(WrapEndPoint(r3, 8, signed), 8, signed);
+             g3 = Unquantize(WrapEndPoint(g3, 8, signed), 8, signed);
+             b3 = Unquantize(WrapEndPoint(b3, 8, signed), 8, signed);
+
+             subsetCount = 2;
+             break;
+         case 30:
+             // End points of this mode are stored directly rather than as deltas, no wrapping is needed.
+             r0 = (int)(low >> 5) & 0x3F;
+             g0 = (int)(low >> 15) & 0x3F;
+             b0 = (int)(low >> 25) & 0x3F;
+
+             r1 = (int)(low >> 35) & 0x3F;
+             g1 = (int)(low >> 45) & 0x3F;
+             b1 = (int)(low >> 55) & 0x3F;
+
+             r2 = (int)(high >> 1) & 0x3F;
+             g2 = (int)(((low >> 16) & 0x20) | ((low >> 20) & 0x10) | ((low >> 41) & 0xF));
+             b2 = (int)(((low >> 17) & 0x20) | ((low >> 10) & 0x10) | ((high << 3) & 0x08) | (low >> 61));
+
+             r3 = (int)(high >> 7) & 0x3F;
+             g3 = (int)(((low >> 26) & 0x20) | ((low >> 7) & 0x10) | ((low >> 51) & 0xF));
+             b3 = (int)(
+                 ((low >> 28) & 0x20) |
+                 ((low >> 30) & 0x10) |
+                 ((low >> 29) & 0x08) |
+                 ((low >> 21) & 0x04) |
+                 ((low >> 12) & 0x03));
+
+             if (signed)
+             {
+                 r0 = SignExtend(r0, 6);
+                 g0 = SignExtend(g0, 6);
+                 b0 = SignExtend(b0, 6);
+
+                 r1 = SignExtend(r1, 6);
+                 g1 = SignExtend(g1, 6);
+                 b1 = SignExtend(b1, 6);
+
+                 r2 = SignExtend(r2, 6);
+                 g2 = SignExtend(g2, 6);
+                 b2 = SignExtend(b2, 6);
+
+                 r3 = SignExtend(r3, 6);
+                 g3 = SignExtend(g3, 6);
+                 b3 = SignExtend(b3, 6);
+             }
+
+             r0 = Unquantize(r0, 6, signed);
+             g0 = Unquantize(g0, 6, signed);
+             b0 = Unquantize(b0, 6, signed);
+
+             r1 = Unquantize(r1, 6, signed);
+             g1 = Unquantize(g1, 6, signed);
+             b1 = Unquantize(b1, 6, signed);
+
+             r2 = Unquantize(r2, 6, signed);
+             g2 = Unquantize(g2, 6, signed);
+             b2 = Unquantize(b2, 6, signed);
+
+             r3 = Unquantize(r3, 6, signed);
+             g3 = Unquantize(g3, 6, signed);
+             b3 = Unquantize(b3, 6, signed);
+
+             subsetCount = 2;
+             break;
+         default:
+             // Unhandled mode, the caller is told by the zero subset count.
+             subsetCount = 0;
+             break;
+     }
+
+     if (subsetCount > 0)
+     {
+         endPoints[0] = new RgbaColor32(r0, g0, b0, HalfOne);
+         endPoints[1] = new RgbaColor32(r1, g1, b1, HalfOne);
+
+         if (subsetCount > 1)
+         {
+             endPoints[2] = new RgbaColor32(r2, g2, b2, HalfOne);
+             endPoints[3] = new RgbaColor32(r3, g3, b3, HalfOne);
+         }
+     }
+
+     return subsetCount;
+ }
+
+ /// <summary>
+ /// Wraps a delta-coded end point to the end point precision.
+ /// On signed formats, the wrapped value is then sign extended from that precision.
+ /// </summary>
+ /// <param name="value">Sum of the base end point and the delta</param>
+ /// <param name="bits">End point precision in bits</param>
+ /// <param name="signed">True if the value must be sign extended after wrapping</param>
+ /// <returns>The wrapped end point value</returns>
+ private static int WrapEndPoint(int value, int bits, bool signed)
+ {
+     int wrapped = value & ((1 << bits) - 1);
+
+     return signed ? SignExtend(wrapped, bits) : wrapped;
+ }
+
+ /// <summary>
+ /// Sign extends the low <paramref name="bits"/> bits of a value to a full 32-bit integer.
+ /// Bits above the specified width are discarded.
+ /// </summary>
+ /// <param name="value">Value to sign extend</param>
+ /// <param name="bits">Number of meaningful low bits on the value</param>
+ /// <returns>The sign extended value</returns>
+ private static int SignExtend(int value, int bits)
+ {
+     // Move the sign bit of the field to bit 31, then let the arithmetic shift replicate it.
+     int unusedBits = 32 - bits;
+     int alignedToTop = value << unusedBits;
+
+     return alignedToTop >> unusedBits;
+ }
+
+ /// <summary>
+ /// Expands an end point component from its storage precision to the intermediate range
+ /// used for interpolation (up to 0xFFFF for unsigned, up to +/-0x7FFF for signed).
+ /// </summary>
+ /// <param name="value">Component value, already sign extended if negative</param>
+ /// <param name="bits">Storage precision of the component in bits</param>
+ /// <param name="signed">True to use the signed expansion, false for the unsigned one</param>
+ /// <returns>The expanded component</returns>
+ private static int Unquantize(int value, int bits, bool signed)
+ {
+     if (!signed)
+     {
+         // Zero, and values that already have 15 bits or more, pass through untouched.
+         if (bits >= 15 || value == 0)
+         {
+             return value;
+         }
+
+         // The largest representable value maps to the top of the range.
+         if (value == ((1 << bits) - 1))
+         {
+             return 0xFFFF;
+         }
+
+         return ((value << 16) + 0x8000) >> bits;
+     }
+
+     if (bits >= 16)
+     {
+         return value;
+     }
+
+     // The signed expansion works on the magnitude, the sign is restored at the end.
+     bool negative = value < 0;
+     int magnitude = negative ? -value : value;
+
+     if (magnitude == 0)
+     {
+         return magnitude;
+     }
+
+     int largest = (1 << (bits - 1)) - 1;
+     int expanded = magnitude >= largest
+         ? 0x7FFF
+         : ((magnitude << 15) + 0x4000) >> (bits - 1);
+
+     return negative ? -expanded : expanded;
+ }
+
+ /// <summary>
+ /// Performs the final scaling of an interpolated component, producing the 16-bit pattern
+ /// that is written to the output texel.
+ /// </summary>
+ /// <param name="value">Interpolated component value</param>
+ /// <param name="signed">True for the signed scaling, which yields a sign bit plus magnitude</param>
+ /// <returns>The 16-bit output value</returns>
+ private static ushort FinishUnquantize(int value, bool signed)
+ {
+     if (!signed)
+     {
+         return (ushort)((value * 31) >> 6);
+     }
+
+     // Scale the magnitude, keeping the sign of the input.
+     int scaled = value < 0 ? -((-value * 31) >> 5) : (value * 31) >> 5;
+
+     if (scaled >= 0)
+     {
+         return (ushort)scaled;
+     }
+
+     // Negative results are encoded as the sign bit (bit 15) plus the magnitude.
+     return (ushort)(0x8000 | -scaled);
+ }
+
+ /// <summary>
+ /// Reverses the order of the low 6 bits of a value.
+ /// </summary>
+ /// <param name="value">Value with the bits to reverse (expected to fit on 6 bits)</param>
+ /// <returns>The value with bits 0 to 5 in reverse order</returns>
+ private static int BitReverse6(int value)
+ {
+     // Reverse a full byte by swapping neighbouring bits, then bit pairs, then nibbles.
+     int swappedBits = ((value >> 1) & 0x55) | ((value << 1) & 0xaa);
+     int swappedPairs = ((swappedBits >> 2) & 0x33) | ((swappedBits << 2) & 0xcc);
+     int reversedByte = ((swappedPairs >> 4) & 0x0f) | ((swappedPairs << 4) & 0xf0);
+
+     // The 6 reversed bits sit at the top of the byte, bring them back down.
+     return reversedByte >> 2;
+ }
+ }
+}
diff --git a/Ryujinx.Graphics.Texture/BC7Decoder.cs b/Ryujinx.Graphics.Texture/BC7Decoder.cs
new file mode 100644
index 00000000..060d1ab8
--- /dev/null
+++ b/Ryujinx.Graphics.Texture/BC7Decoder.cs
@@ -0,0 +1,220 @@
+using Ryujinx.Graphics.Texture.Utils;
+using System.Diagnostics;
+using System;
+using System.Numerics;
+using System.Runtime.InteropServices;
+
+namespace Ryujinx.Graphics.Texture
+{
+ static class BC7Decoder
+ {
+ /// <summary>
+ /// Decodes BC7 compressed data, writing one 32-bit texel per pixel.
+ /// </summary>
+ /// <param name="output">Destination buffer, reinterpreted as 32-bit texels laid out row by row</param>
+ /// <param name="data">Compressed input, reinterpreted as a sequence of blocks</param>
+ /// <param name="width">Width of the image in texels</param>
+ /// <param name="height">Height of the image in texels</param>
+ public static void Decode(Span<byte> output, ReadOnlySpan<byte> data, int width, int height)
+ {
+     ReadOnlySpan<Block> compressed = MemoryMarshal.Cast<byte, Block>(data);
+     Span<uint> texels = MemoryMarshal.Cast<byte, uint>(output);
+
+     // Each block covers 4x4 texels, partial blocks on the edges still count as one.
+     int blocksPerRow = (width + 3) / 4;
+     int blockRows = (height + 3) / 4;
+
+     for (int blockY = 0; blockY < blockRows; blockY++)
+     {
+         int texelY = blockY * 4;
+         int rowsInBlock = Math.Min(4, height - texelY);
+
+         int blockRowBase = blockY * blocksPerRow;
+         int texelRowBase = texelY * width;
+
+         for (int blockX = 0; blockX < blocksPerRow; blockX++)
+         {
+             int texelX = blockX * 4;
+             int columnsInBlock = Math.Min(4, width - texelX);
+
+             DecodeBlock(
+                 compressed[blockRowBase + blockX],
+                 texels.Slice(texelRowBase + texelX),
+                 columnsInBlock,
+                 rowsInBlock,
+                 width);
+         }
+     }
+ }
+
+ // Decodes a single BC7 block, writing w x h texels to output.
+ // Consecutive output rows are "width" texels apart.
+ // The fields of the block are read sequentially through "offset", so the order of the reads below matters.
+ private static void DecodeBlock(Block block, Span<uint> output, int w, int h, int width)
+ {
+ // The mode is the position of the lowest set bit on the first byte.
+ // ORing with 0x100 makes the result 8 when none of the 8 low bits are set.
+ int mode = BitOperations.TrailingZeroCount((byte)block.Low | 0x100);
+ if (mode == 8)
+ {
+ // Mode is invalid, the spec mandates that hardware fills the block with
+ // a transparent black color.
+ for (int ty = 0; ty < h; ty++)
+ {
+ int baseOffs = ty * width;
+
+ for (int tx = 0; tx < w; tx++)
+ {
+ int offs = baseOffs + tx;
+
+ output[offs] = 0;
+ }
+ }
+
+ return;
+ }
+
+ BC7ModeInfo modeInfo = BC67Tables.BC7ModeInfos[mode];
+
+ // Skip the mode field itself ("mode" zero bits followed by a one bit).
+ int offset = mode + 1;
+ int partition = (int)block.Decode(ref offset, modeInfo.PartitionBitCount);
+ int rotation = (int)block.Decode(ref offset, modeInfo.RotationBitCount);
+ int indexMode = (int)block.Decode(ref offset, modeInfo.IndexModeBitCount);
+
+ Debug.Assert(partition < 64);
+ Debug.Assert(rotation < 4);
+ Debug.Assert(indexMode < 2);
+
+ // Every subset has a pair of end points.
+ int endPointCount = modeInfo.SubsetCount * 2;
+
+ Span<RgbaColor32> endPoints = stackalloc RgbaColor32[endPointCount];
+ Span<byte> pValues = stackalloc byte[modeInfo.PBits];
+
+ endPoints.Fill(new RgbaColor32(0, 0, 0, 255));
+
+ // End points are stored one channel at a time: all red values, then all green values, and so on.
+ for (int i = 0; i < endPointCount; i++)
+ {
+ endPoints[i].R = (int)block.Decode(ref offset, modeInfo.ColorDepth);
+ }
+
+ for (int i = 0; i < endPointCount; i++)
+ {
+ endPoints[i].G = (int)block.Decode(ref offset, modeInfo.ColorDepth);
+ }
+
+ for (int i = 0; i < endPointCount; i++)
+ {
+ endPoints[i].B = (int)block.Decode(ref offset, modeInfo.ColorDepth);
+ }
+
+ // Alpha is only stored for modes with a non-zero alpha depth, otherwise the fill value (255) is kept.
+ if (modeInfo.AlphaDepth != 0)
+ {
+ for (int i = 0; i < endPointCount; i++)
+ {
+ endPoints[i].A = (int)block.Decode(ref offset, modeInfo.AlphaDepth);
+ }
+ }
+
+ for (int i = 0; i < modeInfo.PBits; i++)
+ {
+ pValues[i] = (byte)block.Decode(ref offset, 1);
+ }
+
+ for (int i = 0; i < endPointCount; i++)
+ {
+ // A negative value tells Unquantize that the mode has no p-bits.
+ int pBit = -1;
+
+ if (modeInfo.PBits != 0)
+ {
+ // When there are fewer p-bits than end points, neighbouring end points share the same p-bit.
+ int pIndex = (i * modeInfo.PBits) / endPointCount;
+ pBit = pValues[pIndex];
+ }
+
+ Unquantize(ref endPoints[i], modeInfo.ColorDepth, modeInfo.AlphaDepth, pBit);
+ }
+
+ byte[] partitionTable = BC67Tables.PartitionTable[modeInfo.SubsetCount - 1][partition];
+ byte[] fixUpTable = BC67Tables.FixUpIndices[modeInfo.SubsetCount - 1][partition];
+
+ Span<byte> colorIndices = stackalloc byte[16];
+
+ for (int i = 0; i < 16; i++)
+ {
+ byte subset = partitionTable[i];
+ // The index at the fix-up position of each subset is stored with one bit less.
+ int bitCount = i == fixUpTable[subset] ? modeInfo.ColorIndexBitCount - 1 : modeInfo.ColorIndexBitCount;
+
+ colorIndices[i] = (byte)block.Decode(ref offset, bitCount);
+ Debug.Assert(colorIndices[i] < 16);
+ }
+
+ Span<byte> alphaIndices = stackalloc byte[16];
+
+ if (modeInfo.AlphaIndexBitCount != 0)
+ {
+ for (int i = 0; i < 16; i++)
+ {
+ // The first alpha index is stored with one bit less.
+ int bitCount = i != 0 ? modeInfo.AlphaIndexBitCount : modeInfo.AlphaIndexBitCount - 1;
+
+ alphaIndices[i] = (byte)block.Decode(ref offset, bitCount);
+ Debug.Assert(alphaIndices[i] < 16);
+ }
+ }
+
+ for (int ty = 0; ty < h; ty++)
+ {
+ int baseOffs = ty * width;
+
+ for (int tx = 0; tx < w; tx++)
+ {
+ // Index of the texel inside of the 4x4 block.
+ int i = ty * 4 + tx;
+
+ RgbaColor32 color;
+
+ byte subset = partitionTable[i];
+
+ RgbaColor32 color1 = endPoints[subset * 2];
+ RgbaColor32 color2 = endPoints[subset * 2 + 1];
+
+ if (modeInfo.AlphaIndexBitCount != 0)
+ {
+ // With separate alpha indices, a non-zero index mode swaps the roles of the two index sets.
+ if (indexMode == 0)
+ {
+ color = BC67Utils.Interpolate(color1, color2, colorIndices[i], alphaIndices[i], modeInfo.ColorIndexBitCount, modeInfo.AlphaIndexBitCount);
+ }
+ else
+ {
+ color = BC67Utils.Interpolate(color1, color2, alphaIndices[i], colorIndices[i], modeInfo.AlphaIndexBitCount, modeInfo.ColorIndexBitCount);
+ }
+ }
+ else
+ {
+ // No separate alpha indices, the color index is used for both.
+ color = BC67Utils.Interpolate(color1, color2, colorIndices[i], colorIndices[i], modeInfo.ColorIndexBitCount, modeInfo.ColorIndexBitCount);
+ }
+
+ if (rotation != 0)
+ {
+ int a = color.A;
+
+ // The rotation selects which color channel is exchanged with alpha.
+ switch (rotation)
+ {
+ case 1: color.A = color.R; color.R = a; break;
+ case 2: color.A = color.G; color.G = a; break;
+ case 3: color.A = color.B; color.B = a; break;
+ }
+ }
+
+ RgbaColor8 color8 = color.GetColor8();
+
+ output[baseOffs + tx] = color8.ToUInt32();
+ }
+ }
+ }
+
+ /// <summary>
+ /// Expands all components of an end point to 8 bits, in place.
+ /// </summary>
+ /// <param name="color">End point to expand</param>
+ /// <param name="colorDepth">Storage precision of the R, G and B components in bits</param>
+ /// <param name="alphaDepth">Storage precision of the alpha component in bits, or 0 if there is no alpha</param>
+ /// <param name="pBit">P-bit shared by all components, or a negative value if there is none</param>
+ private static void Unquantize(ref RgbaColor32 color, int colorDepth, int alphaDepth, int pBit)
+ {
+     color.R = UnquantizeComponent(color.R, colorDepth, pBit);
+     color.G = UnquantizeComponent(color.G, colorDepth, pBit);
+     color.B = UnquantizeComponent(color.B, colorDepth, pBit);
+
+     if (alphaDepth == 0)
+     {
+         // No stored alpha, the end point is fully opaque.
+         color.A = 255;
+     }
+     else
+     {
+         color.A = UnquantizeComponent(color.A, alphaDepth, pBit);
+     }
+ }
+
+ /// <summary>
+ /// Expands a single end point component to 8 bits.
+ /// The component is moved to the top of the byte, and the free low bits are filled
+ /// with the optional p-bit followed by a copy of the most significant bits.
+ /// </summary>
+ /// <param name="component">Component value as stored on the block</param>
+ /// <param name="bits">Storage precision of the component in bits</param>
+ /// <param name="pBit">P-bit value (0 or 1), or a negative value if there is none</param>
+ /// <returns>The expanded component</returns>
+ private static int UnquantizeComponent(int component, int bits, int pBit)
+ {
+     int freeBits = 8 - bits;
+     int expanded = component << freeBits;
+
+     if (pBit < 0)
+     {
+         // No p-bit, the high bits are replicated right below the component.
+         return expanded | (expanded >> bits);
+     }
+
+     Debug.Assert(pBit <= 1);
+
+     // The p-bit takes the position right below the component,
+     // so the replicated high bits start one position lower.
+     expanded |= expanded >> (bits + 1);
+
+     return expanded | (pBit << (freeBits - 1));
+ }
+ }
+}
diff --git a/Ryujinx.Graphics.Texture/BCnDecoder.cs b/Ryujinx.Graphics.Texture/BCnDecoder.cs
index b840cac8..b21fa4d1 100644
--- a/Ryujinx.Graphics.Texture/BCnDecoder.cs
+++ b/Ryujinx.Graphics.Texture/BCnDecoder.cs
@@ -298,9 +298,12 @@ namespace Ryujinx.Graphics.Texture
for (int l = 0; l < levels; l++)
{
- size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers;
+ size += BitUtils.AlignUp(Math.Max(1, width >> l), 4) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers;
}
+ // Backends currently expect a stride alignment of 4 bytes, so output width must be aligned.
+ int alignedWidth = BitUtils.AlignUp(width, 4);
+
byte[] output = new byte[size];
Span<byte> outputSpan = new Span<byte>(output);
@@ -331,14 +334,14 @@ namespace Ryujinx.Graphics.Texture
{
int baseY = y * BlockHeight;
int copyHeight = Math.Min(BlockHeight, height - baseY);
- int lineBaseOOffs = imageBaseOOffs + baseY * width;
+ int lineBaseOOffs = imageBaseOOffs + baseY * alignedWidth;
if (copyHeight == 4)
{
outputLine0 = MemoryMarshal.Cast<byte, uint>(outputSpan.Slice(lineBaseOOffs));
- outputLine1 = MemoryMarshal.Cast<byte, uint>(outputSpan.Slice(lineBaseOOffs + width));
- outputLine2 = MemoryMarshal.Cast<byte, uint>(outputSpan.Slice(lineBaseOOffs + width * 2));
- outputLine3 = MemoryMarshal.Cast<byte, uint>(outputSpan.Slice(lineBaseOOffs + width * 3));
+ outputLine1 = MemoryMarshal.Cast<byte, uint>(outputSpan.Slice(lineBaseOOffs + alignedWidth));
+ outputLine2 = MemoryMarshal.Cast<byte, uint>(outputSpan.Slice(lineBaseOOffs + alignedWidth * 2));
+ outputLine3 = MemoryMarshal.Cast<byte, uint>(outputSpan.Slice(lineBaseOOffs + alignedWidth * 3));
}
for (int x = 0; x < w; x++)
@@ -375,7 +378,7 @@ namespace Ryujinx.Graphics.Texture
for (int tY = 0; tY < copyHeight; tY++)
{
- tile.Slice(tY * 4, copyWidth).CopyTo(outputSpan.Slice(pixelBaseOOffs + width * tY, copyWidth));
+ tile.Slice(tY * 4, copyWidth).CopyTo(outputSpan.Slice(pixelBaseOOffs + alignedWidth * tY, copyWidth));
}
}
@@ -383,13 +386,15 @@ namespace Ryujinx.Graphics.Texture
}
}
- imageBaseOOffs += width * height;
+ imageBaseOOffs += alignedWidth * height;
}
}
width = Math.Max(1, width >> 1);
height = Math.Max(1, height >> 1);
depth = Math.Max(1, depth >> 1);
+
+ alignedWidth = BitUtils.AlignUp(width, 4);
}
return output;
@@ -401,9 +406,12 @@ namespace Ryujinx.Graphics.Texture
for (int l = 0; l < levels; l++)
{
- size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 2;
+ size += BitUtils.AlignUp(Math.Max(1, width >> l), 2) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 2;
}
+ // Backends currently expect a stride alignment of 4 bytes, so output width must be aligned.
+ int alignedWidth = BitUtils.AlignUp(width, 2);
+
byte[] output = new byte[size];
ReadOnlySpan<ulong> data64 = MemoryMarshal.Cast<byte, ulong>(data);
@@ -438,14 +446,14 @@ namespace Ryujinx.Graphics.Texture
{
int baseY = y * BlockHeight;
int copyHeight = Math.Min(BlockHeight, height - baseY);
- int lineBaseOOffs = imageBaseOOffs + baseY * width;
+ int lineBaseOOffs = imageBaseOOffs + baseY * alignedWidth;
if (copyHeight == 4)
{
outputLine0 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort.Slice(lineBaseOOffs));
- outputLine1 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort.Slice(lineBaseOOffs + width));
- outputLine2 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort.Slice(lineBaseOOffs + width * 2));
- outputLine3 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort.Slice(lineBaseOOffs + width * 3));
+ outputLine1 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort.Slice(lineBaseOOffs + alignedWidth));
+ outputLine2 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort.Slice(lineBaseOOffs + alignedWidth * 2));
+ outputLine3 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort.Slice(lineBaseOOffs + alignedWidth * 3));
}
for (int x = 0; x < w; x++)
@@ -488,7 +496,7 @@ namespace Ryujinx.Graphics.Texture
for (int tY = 0; tY < copyHeight; tY++)
{
- int line = pixelBaseOOffs + width * tY;
+ int line = pixelBaseOOffs + alignedWidth * tY;
for (int tX = 0; tX < copyWidth; tX++)
{
@@ -503,7 +511,85 @@ namespace Ryujinx.Graphics.Texture
}
}
- imageBaseOOffs += width * height;
+ imageBaseOOffs += alignedWidth * height;
+ }
+ }
+
+ width = Math.Max(1, width >> 1);
+ height = Math.Max(1, height >> 1);
+ depth = Math.Max(1, depth >> 1);
+
+ alignedWidth = BitUtils.AlignUp(width, 2);
+ }
+
+ return output;
+ }
+
+        /// <summary>
+        /// Decodes all mip levels, layers and depth slices of a BC6 texture.
+        /// </summary>
+        /// <param name="data">Compressed input; consumed 16 bytes per 4x4 block</param>
+        /// <param name="width">Width of the base level, in pixels</param>
+        /// <param name="height">Height of the base level, in pixels</param>
+        /// <param name="depth">Depth of the base level</param>
+        /// <param name="levels">Number of mip levels to decode</param>
+        /// <param name="layers">Number of array layers</param>
+        /// <param name="signed">Forwarded as-is to <see cref="BC6Decoder.Decode"/></param>
+        /// <returns>Tightly packed decoded data, 8 bytes per pixel</returns>
+        public static byte[] DecodeBC6(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers, bool signed)
+        {
+            const int BytesPerPixel = 8;
+            const int BytesPerBlock = 16;
+
+            // First pass: total size of every level, with each dimension clamped to at least 1.
+            int totalSize = 0;
+
+            for (int level = 0; level < levels; level++)
+            {
+                int levelWidth = Math.Max(1, width >> level);
+                int levelHeight = Math.Max(1, height >> level);
+                int levelDepth = Math.Max(1, depth >> level);
+
+                totalSize += levelWidth * levelHeight * levelDepth * layers * BytesPerPixel;
+            }
+
+            byte[] output = new byte[totalSize];
+
+            int srcOffset = 0;
+            int dstOffset = 0;
+
+            // Second pass: decode slice by slice, halving the dimensions after each level.
+            for (int level = 0; level < levels; level++)
+            {
+                int blocksX = BitUtils.DivRoundUp(width, BlockWidth);
+                int blocksY = BitUtils.DivRoundUp(height, BlockHeight);
+
+                int srcSliceSize = blocksX * blocksY * BytesPerBlock;
+                int dstSliceSize = width * height * BytesPerPixel;
+
+                for (int layer = 0; layer < layers; layer++)
+                {
+                    for (int slice = 0; slice < depth; slice++)
+                    {
+                        BC6Decoder.Decode(output.AsSpan().Slice(dstOffset), data.Slice(srcOffset), width, height, signed);
+
+                        srcOffset += srcSliceSize;
+                        dstOffset += dstSliceSize;
+                    }
+                }
+
+                width = Math.Max(1, width >> 1);
+                height = Math.Max(1, height >> 1);
+                depth = Math.Max(1, depth >> 1);
+            }
+
+            return output;
+        }
+
+ public static byte[] DecodeBC7(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
+ {
+ int size = 0;
+
+ for (int l = 0; l < levels; l++)
+ {
+ size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4;
+ }
+
+ byte[] output = new byte[size];
+
+ int inputOffset = 0;
+ int outputOffset = 0;
+
+ for (int l = 0; l < levels; l++)
+ {
+ int w = BitUtils.DivRoundUp(width, BlockWidth);
+ int h = BitUtils.DivRoundUp(height, BlockHeight);
+
+ for (int l2 = 0; l2 < layers; l2++)
+ {
+ for (int z = 0; z < depth; z++)
+ {
+ BC7Decoder.Decode(output.AsSpan().Slice(outputOffset), data.Slice(inputOffset), width, height);
+
+ inputOffset += w * h * 16;
+ outputOffset += width * height * 4;
}
}
diff --git a/Ryujinx.Graphics.Texture/BCnEncoder.cs b/Ryujinx.Graphics.Texture/BCnEncoder.cs
new file mode 100644
index 00000000..02b79c1b
--- /dev/null
+++ b/Ryujinx.Graphics.Texture/BCnEncoder.cs
@@ -0,0 +1,60 @@
+using Ryujinx.Common;
+using Ryujinx.Graphics.Texture.Encoders;
+using System;
+
+namespace Ryujinx.Graphics.Texture
+{
+    /// <summary>
+    /// Encoders for block-compressed (BCn) texture formats.
+    /// </summary>
+    public static class BCnEncoder
+    {
+        private const int BlockWidth = 4;
+        private const int BlockHeight = 4;
+
+        // Every BC7 block occupies 16 bytes and every source pixel is 4 bytes (RGBA8).
+        private const int BytesPerBlock = 16;
+        private const int BytesPerPixel = 4;
+
+        /// <summary>
+        /// Encodes all mip levels, layers and depth slices of an RGBA8 texture as BC7.
+        /// </summary>
+        /// <param name="data">Tightly packed RGBA8 input, levels stored one after another</param>
+        /// <param name="width">Width of the base level, in pixels</param>
+        /// <param name="height">Height of the base level, in pixels</param>
+        /// <param name="depth">Depth of the base level</param>
+        /// <param name="levels">Number of mip levels to encode</param>
+        /// <param name="layers">Number of array layers</param>
+        /// <returns>The BC7 data, 16 bytes per 4x4 block</returns>
+        public static byte[] EncodeBC7(byte[] data, int width, int height, int depth, int levels, int layers)
+        {
+            int size = 0;
+
+            for (int l = 0; l < levels; l++)
+            {
+                int w = BitUtils.DivRoundUp(Math.Max(1, width >> l), BlockWidth);
+                int h = BitUtils.DivRoundUp(Math.Max(1, height >> l), BlockHeight);
+
+                size += w * h * BytesPerBlock * Math.Max(1, depth >> l) * layers;
+            }
+
+            byte[] output = new byte[size];
+
+            int imageBaseIOffs = 0;
+            int imageBaseOOffs = 0;
+
+            for (int l = 0; l < levels; l++)
+            {
+                int w = BitUtils.DivRoundUp(width, BlockWidth);
+                int h = BitUtils.DivRoundUp(height, BlockHeight);
+
+                for (int l2 = 0; l2 < layers; l2++)
+                {
+                    for (int z = 0; z < depth; z++)
+                    {
+                        // Each 2D slice is encoded independently with the fast, multithreaded encoder.
+                        BC7Encoder.Encode(
+                            output.AsMemory().Slice(imageBaseOOffs),
+                            data.AsMemory().Slice(imageBaseIOffs),
+                            width,
+                            height,
+                            EncodeMode.Fast | EncodeMode.Multithreaded);
+
+                        imageBaseIOffs += width * height * BytesPerPixel;
+                        imageBaseOOffs += w * h * BytesPerBlock;
+                    }
+                }
+
+                width = Math.Max(1, width >> 1);
+                height = Math.Max(1, height >> 1);
+                depth = Math.Max(1, depth >> 1);
+            }
+
+            return output;
+        }
+    }
+} \ No newline at end of file
diff --git a/Ryujinx.Graphics.Texture/Encoders/BC7Encoder.cs b/Ryujinx.Graphics.Texture/Encoders/BC7Encoder.cs
new file mode 100644
index 00000000..35d36bce
--- /dev/null
+++ b/Ryujinx.Graphics.Texture/Encoders/BC7Encoder.cs
@@ -0,0 +1,1005 @@
+using Ryujinx.Graphics.Texture.Utils;
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+using System.Threading.Tasks;
+
+namespace Ryujinx.Graphics.Texture.Encoders
+{
+ static class BC7Encoder
+ {
+ private const int MinColorVarianceForModeChange = 160;
+
+        /// <summary>
+        /// Compresses a single RGBA8 image into BC7 blocks.
+        /// </summary>
+        /// <param name="outputStorage">Destination; receives two 64-bit words per 4x4 block, in row-major block order</param>
+        /// <param name="data">Source pixels, 4 bytes each, <paramref name="width"/> pixels per row</param>
+        /// <param name="width">Image width in pixels</param>
+        /// <param name="height">Image height in pixels</param>
+        /// <param name="mode">Quality mode, optionally combined with <see cref="EncodeMode.Multithreaded"/></param>
+        public static void Encode(Memory<byte> outputStorage, ReadOnlyMemory<byte> data, int width, int height, EncodeMode mode)
+        {
+            int blocksPerRow = (width + 3) / 4;
+            int blockRows = (height + 3) / 4;
+
+            bool fastMode = (mode & EncodeMode.ModeMask) == EncodeMode.Fast;
+
+            // Encodes one row of blocks. Spans are re-created here because they cannot be captured.
+            void EncodeBlockRow(int blockY)
+            {
+                Span<ulong> output = MemoryMarshal.Cast<byte, ulong>(outputStorage.Span);
+                int cursor = blockY * blocksPerRow * 2;
+
+                for (int blockX = 0; blockX < blocksPerRow; blockX++)
+                {
+                    Block block = CompressBlock(data.Span, blockX * 4, blockY * 4, width, height, fastMode);
+
+                    output[cursor++] = block.Low;
+                    output[cursor++] = block.High;
+                }
+            }
+
+            if (mode.HasFlag(EncodeMode.Multithreaded))
+            {
+                Parallel.For(0, blockRows, EncodeBlockRow);
+            }
+            else
+            {
+                for (int blockY = 0; blockY < blockRows; blockY++)
+                {
+                    EncodeBlockRow(blockY);
+                }
+            }
+        }
+
+        // Partition indices tried by the fast encoder when a two-subset mode (1 or 7) is selected.
+        private static readonly int[] _mostFrequentPartitions = { 0, 13, 2, 1, 15, 14, 10, 23 };
+
+        /// <summary>
+        /// Gathers the (up to) 4x4 tile whose top-left pixel is (x, y) and compresses it.
+        /// </summary>
+        /// <remarks>
+        /// Tiles on the right/bottom edge are clipped to the image, and the tile is packed
+        /// with a row stride equal to the clipped width.
+        /// </remarks>
+        private static Block CompressBlock(ReadOnlySpan<byte> data, int x, int y, int width, int height, bool fastMode)
+        {
+            int tileWidth = Math.Min(4, width - x);
+            int tileHeight = Math.Min(4, height - y);
+
+            ReadOnlySpan<uint> pixels = MemoryMarshal.Cast<byte, uint>(data);
+
+            Span<uint> tile = stackalloc uint[tileWidth * tileHeight];
+
+            int src = y * width + x;
+            int dst = 0;
+
+            for (int row = 0; row < tileHeight; row++)
+            {
+                for (int col = 0; col < tileWidth; col++)
+                {
+                    tile[dst++] = pixels[src + col];
+                }
+
+                src += width;
+            }
+
+            if (fastMode)
+            {
+                return EncodeFast(tile, tileWidth, tileHeight);
+            }
+
+            return EncodeExhaustive(tile, tileWidth, tileHeight);
+        }
+
+        /// <summary>
+        /// Encodes a tile using a single mode chosen by cheap heuristics (alpha usage and colour variance).
+        /// </summary>
+        private static Block EncodeFast(ReadOnlySpan<uint> tile, int w, int h)
+        {
+            (RgbaColor8 lowest, RgbaColor8 highest) = BC67Utils.GetMinMaxColors(tile, w, h);
+
+            bool fullyOpaque = lowest.A == 255 && highest.A == 255;
+            int variance = BC67Utils.SquaredDifference(lowest.GetColor32(), highest.GetColor32());
+            bool highVariance = variance > MinColorVarianceForModeChange;
+
+            int selectedMode;
+            int indexMode = 0;
+
+            if (fullyOpaque)
+            {
+                selectedMode = highVariance ? 1 : 6;
+            }
+            else if (lowest.A == highest.A)
+            {
+                // Alpha is constant across the tile.
+                selectedMode = highVariance ? 7 : 6;
+            }
+            else if (!highVariance)
+            {
+                selectedMode = 5;
+            }
+            else
+            {
+                selectedMode = 4;
+
+                // Count distinct RGB and alpha values to decide which of the two gets the wider indices.
+                Span<uint> distinctRgb = stackalloc uint[16];
+                Span<uint> distinctAlpha = stackalloc uint[16];
+
+                int distinctRgbCount = 0;
+                int distinctAlphaCount = 0;
+
+                uint rgbMask = new RgbaColor8(255, 255, 255, 0).ToUInt32();
+                uint alphaMask = new RgbaColor8(0, 0, 0, 255).ToUInt32();
+
+                foreach (uint texel in tile)
+                {
+                    uint rgb = texel & rgbMask;
+                    uint alpha = texel & alphaMask;
+
+                    if (!distinctRgb.Slice(0, distinctRgbCount).Contains(rgb))
+                    {
+                        distinctRgb[distinctRgbCount++] = rgb;
+                    }
+
+                    if (!distinctAlpha.Slice(0, distinctAlphaCount).Contains(alpha))
+                    {
+                        distinctAlpha[distinctAlphaCount++] = alpha;
+                    }
+                }
+
+                if (distinctRgbCount > distinctAlphaCount)
+                {
+                    indexMode = 1;
+                }
+            }
+
+            int selectedPartition = 0;
+
+            if (selectedMode == 1 || selectedMode == 7)
+            {
+                // Try only the partitions in the short list and keep the lowest estimated error.
+                int bestError = int.MaxValue;
+
+                foreach (int candidate in _mostFrequentPartitions)
+                {
+                    int error = GetEndPointSelectionErrorFast(tile, 2, candidate, w, h, bestError);
+
+                    if (error < bestError)
+                    {
+                        bestError = error;
+                        selectedPartition = candidate;
+                    }
+                }
+            }
+
+            return Encode(selectedMode, selectedPartition, 0, indexMode, fastMode: true, tile, w, h, out _);
+        }
+
+        /// <summary>
+        /// Tries every mode, rotation, index mode and partition and keeps the block with the lowest error.
+        /// Ties are broken in favour of the mode with fewer subsets.
+        /// </summary>
+        private static Block EncodeExhaustive(ReadOnlySpan<uint> tile, int w, int h)
+        {
+            Block winner = default;
+            int winnerError = int.MaxValue;
+            int winnerSubsets = int.MaxValue;
+
+            for (int mode = 0; mode < 8; mode++)
+            {
+                // Only modes 4 and 5 are tried with rotations; only mode 4 with both index modes.
+                int rotationCount = (mode == 4 || mode == 5) ? 4 : 1;
+                int indexModeCount = mode == 4 ? 2 : 1;
+
+                for (int rotation = 0; rotation < rotationCount; rotation++)
+                {
+                    for (int indexMode = 0; indexMode < indexModeCount; indexMode++)
+                    {
+                        for (int partition = 0; partition < 1 << BC67Tables.BC7ModeInfos[mode].PartitionBitCount; partition++)
+                        {
+                            Block candidate = Encode(mode, partition, rotation, indexMode, fastMode: false, tile, w, h, out int candidateError);
+
+                            int candidateSubsets = BC67Tables.BC7ModeInfos[mode].SubsetCount;
+
+                            bool better = candidateError < winnerError;
+                            bool tieWithFewerSubsets = candidateError == winnerError && candidateSubsets < winnerSubsets;
+
+                            if (better || tieWithFewerSubsets)
+                            {
+                                winnerError = candidateError;
+                                winnerSubsets = candidateSubsets;
+                                winner = candidate;
+                            }
+                        }
+                    }
+                }
+            }
+
+            return winner;
+        }
+
+ // Builds one BC7 block for the given tile using an explicit mode, partition, rotation and index mode.
+ // Steps: select end points, derive p-bits, select per-texel indices, then serialize all fields into the block.
+ // errorSum receives the sum of the values returned by BC67Utils.SelectIndices (colour pass plus, when the
+ // mode has separate alpha indices, the alpha pass).
+ private static Block Encode(
+ int mode,
+ int partition,
+ int rotation,
+ int indexMode,
+ bool fastMode,
+ ReadOnlySpan<uint> tile,
+ int w,
+ int h,
+ out int errorSum)
+ {
+ BC7ModeInfo modeInfo = BC67Tables.BC7ModeInfos[mode];
+ int subsetCount = modeInfo.SubsetCount;
+ int partitionBitCount = modeInfo.PartitionBitCount;
+ int rotationBitCount = modeInfo.RotationBitCount;
+ int indexModeBitCount = modeInfo.IndexModeBitCount;
+ int colorDepth = modeInfo.ColorDepth;
+ int alphaDepth = modeInfo.AlphaDepth;
+ int pBits = modeInfo.PBits;
+ int colorIndexBitCount = modeInfo.ColorIndexBitCount;
+ int alphaIndexBitCount = modeInfo.AlphaIndexBitCount;
+ bool separateAlphaIndices = alphaIndexBitCount != 0;
+
+ // Mask of the single channel handled by the second ("alpha") index set.
+ // The rotation value picks which channel that is; it stays zero for modes without separate alpha indices.
+ uint alphaMask;
+
+ if (separateAlphaIndices)
+ {
+ alphaMask = rotation switch
+ {
+ 1 => new RgbaColor8(255, 0, 0, 0).ToUInt32(),
+ 2 => new RgbaColor8(0, 255, 0, 0).ToUInt32(),
+ 3 => new RgbaColor8(0, 0, 255, 0).ToUInt32(),
+ _ => new RgbaColor8(0, 0, 0, 255).ToUInt32()
+ };
+ }
+ else
+ {
+ alphaMask = new RgbaColor8(0, 0, 0, 0).ToUInt32();
+ }
+
+ // A non-zero index mode inverts the mask, exchanging the channels covered by the two index sets.
+ if (indexMode != 0)
+ {
+ alphaMask = ~alphaMask;
+ }
+
+ //
+ // Select color palette.
+ //
+
+ Span<uint> endPoints0 = stackalloc uint[subsetCount];
+ Span<uint> endPoints1 = stackalloc uint[subsetCount];
+
+ // First pass writes the channels outside alphaMask.
+ SelectEndPoints(
+ tile,
+ w,
+ h,
+ endPoints0,
+ endPoints1,
+ subsetCount,
+ partition,
+ colorIndexBitCount,
+ colorDepth,
+ alphaDepth,
+ ~alphaMask,
+ fastMode);
+
+ // Second pass (only with separate alpha indices) writes the channels inside alphaMask.
+ if (separateAlphaIndices)
+ {
+ SelectEndPoints(
+ tile,
+ w,
+ h,
+ endPoints0,
+ endPoints1,
+ subsetCount,
+ partition,
+ alphaIndexBitCount,
+ colorDepth,
+ alphaDepth,
+ alphaMask,
+ fastMode);
+ }
+
+ Span<int> pBitValues = stackalloc int[pBits];
+
+ for (int i = 0; i < pBits; i++)
+ {
+ int pBit;
+
+ if (pBits == subsetCount)
+ {
+ // One p-bit per subset, shared by both end points.
+ pBit = GetPBit(endPoints0[i], endPoints1[i], colorDepth, alphaDepth);
+ }
+ else
+ {
+ // One p-bit per end point: even entries belong to end point 0, odd entries to end point 1.
+ int subset = i >> 1;
+ uint color = (i & 1) == 0 ? endPoints0[subset] : endPoints1[subset];
+ pBit = GetPBit(color, colorDepth, alphaDepth);
+ }
+
+ pBitValues[i] = pBit;
+ }
+
+ int colorIndexCount = 1 << colorIndexBitCount;
+ int alphaIndexCount = 1 << alphaIndexBitCount;
+
+ Span<byte> colorIndices = stackalloc byte[16];
+ Span<byte> alphaIndices = stackalloc byte[16];
+
+ errorSum = BC67Utils.SelectIndices(
+ tile,
+ w,
+ h,
+ endPoints0,
+ endPoints1,
+ pBitValues,
+ colorIndices,
+ subsetCount,
+ partition,
+ colorIndexBitCount,
+ colorIndexCount,
+ colorDepth,
+ alphaDepth,
+ pBits,
+ alphaMask);
+
+ if (separateAlphaIndices)
+ {
+ errorSum += BC67Utils.SelectIndices(
+ tile,
+ w,
+ h,
+ endPoints0,
+ endPoints1,
+ pBitValues,
+ alphaIndices,
+ subsetCount,
+ partition,
+ alphaIndexBitCount,
+ alphaIndexCount,
+ colorDepth,
+ alphaDepth,
+ pBits,
+ ~alphaMask);
+ }
+
+ // The index at each subset's fix-up position is stored with one bit less (see the asserts below).
+ // When that index is in the upper half of the range, the subset's end points are written in swapped
+ // order and its indices are inverted so that the stored value fits.
+ Span<bool> colorSwapSubset = stackalloc bool[3];
+
+ for (int i = 0; i < 3; i++)
+ {
+ colorSwapSubset[i] = colorIndices[BC67Tables.FixUpIndices[subsetCount - 1][partition][i]] >= (colorIndexCount >> 1);
+ }
+
+ bool alphaSwapSubset = alphaIndices[0] >= (alphaIndexCount >> 1);
+
+ Block block = new Block();
+
+ int offset = 0;
+
+ // Header: (mode + 1) bits with only bit number "mode" set, then partition, rotation and index mode.
+ block.Encode(1UL << mode, ref offset, mode + 1);
+ block.Encode((ulong)partition, ref offset, partitionBitCount);
+ block.Encode((ulong)rotation, ref offset, rotationBitCount);
+ block.Encode((ulong)indexMode, ref offset, indexModeBitCount);
+
+ // Colour end points, component by component; within a component, subset by subset.
+ for (int comp = 0; comp < 3; comp++)
+ {
+ int rotatedComp = comp;
+
+ // The component selected by the rotation is replaced by component 3 here.
+ if (((comp + 1) & 3) == rotation)
+ {
+ rotatedComp = 3;
+ }
+
+ for (int subset = 0; subset < subsetCount; subset++)
+ {
+ RgbaColor8 color0 = RgbaColor8.FromUInt32(endPoints0[subset]);
+ RgbaColor8 color1 = RgbaColor8.FromUInt32(endPoints1[subset]);
+
+ // -1 is passed to QuantizeComponent when the mode has no p-bits.
+ int pBit0 = -1, pBit1 = -1;
+
+ if (pBits == subsetCount)
+ {
+ pBit0 = pBit1 = pBitValues[subset];
+ }
+ else if (pBits != 0)
+ {
+ pBit0 = pBitValues[subset * 2];
+ pBit1 = pBitValues[subset * 2 + 1];
+ }
+
+ if (indexMode == 0 ? colorSwapSubset[subset] : alphaSwapSubset)
+ {
+ block.Encode(BC67Utils.QuantizeComponent(color1.GetComponent(rotatedComp), colorDepth, pBit1), ref offset, colorDepth);
+ block.Encode(BC67Utils.QuantizeComponent(color0.GetComponent(rotatedComp), colorDepth, pBit0), ref offset, colorDepth);
+ }
+ else
+ {
+ block.Encode(BC67Utils.QuantizeComponent(color0.GetComponent(rotatedComp), colorDepth, pBit0), ref offset, colorDepth);
+ block.Encode(BC67Utils.QuantizeComponent(color1.GetComponent(rotatedComp), colorDepth, pBit1), ref offset, colorDepth);
+ }
+ }
+ }
+
+ // Alpha end points, only for modes with a non-zero alpha depth.
+ if (alphaDepth != 0)
+ {
+ // Rotation 0 yields component 3; rotations 1..3 yield components 0..2.
+ int rotatedComp = (rotation - 1) & 3;
+
+ for (int subset = 0; subset < subsetCount; subset++)
+ {
+ RgbaColor8 color0 = RgbaColor8.FromUInt32(endPoints0[subset]);
+ RgbaColor8 color1 = RgbaColor8.FromUInt32(endPoints1[subset]);
+
+ int pBit0 = -1, pBit1 = -1;
+
+ if (pBits == subsetCount)
+ {
+ pBit0 = pBit1 = pBitValues[subset];
+ }
+ else if (pBits != 0)
+ {
+ pBit0 = pBitValues[subset * 2];
+ pBit1 = pBitValues[subset * 2 + 1];
+ }
+
+ // Parsed as (separateAlphaIndices && indexMode == 0) ? alphaSwapSubset : colorSwapSubset[subset].
+ if (separateAlphaIndices && indexMode == 0 ? alphaSwapSubset : colorSwapSubset[subset])
+ {
+ block.Encode(BC67Utils.QuantizeComponent(color1.GetComponent(rotatedComp), alphaDepth, pBit1), ref offset, alphaDepth);
+ block.Encode(BC67Utils.QuantizeComponent(color0.GetComponent(rotatedComp), alphaDepth, pBit0), ref offset, alphaDepth);
+ }
+ else
+ {
+ block.Encode(BC67Utils.QuantizeComponent(color0.GetComponent(rotatedComp), alphaDepth, pBit0), ref offset, alphaDepth);
+ block.Encode(BC67Utils.QuantizeComponent(color1.GetComponent(rotatedComp), alphaDepth, pBit1), ref offset, alphaDepth);
+ }
+ }
+ }
+
+ // P-bits, one bit each.
+ for (int i = 0; i < pBits; i++)
+ {
+ block.Encode((ulong)pBitValues[i], ref offset, 1);
+ }
+
+ byte[] fixUpTable = BC67Tables.FixUpIndices[subsetCount - 1][partition];
+
+ // Colour indices for all 16 texel positions.
+ for (int i = 0; i < 16; i++)
+ {
+ int subset = BC67Tables.PartitionTable[subsetCount - 1][partition][i];
+ byte index = colorIndices[i];
+
+ // Invert the index when the subset's end points were written swapped.
+ if (colorSwapSubset[subset])
+ {
+ index = (byte)(index ^ (colorIndexCount - 1));
+ }
+
+ // The fix-up position of each subset is stored with one bit less.
+ int finalIndexBitCount = i == fixUpTable[subset] ? colorIndexBitCount - 1 : colorIndexBitCount;
+
+ Debug.Assert(index < (1 << finalIndexBitCount));
+
+ block.Encode(index, ref offset, finalIndexBitCount);
+ }
+
+ // Alpha indices; here only position 0 is stored with one bit less.
+ if (separateAlphaIndices)
+ {
+ for (int i = 0; i < 16; i++)
+ {
+ byte index = alphaIndices[i];
+
+ if (alphaSwapSubset)
+ {
+ index = (byte)(index ^ (alphaIndexCount - 1));
+ }
+
+ int finalIndexBitCount = i == 0 ? alphaIndexBitCount - 1 : alphaIndexBitCount;
+
+ Debug.Assert(index < (1 << finalIndexBitCount));
+
+ block.Encode(index, ref offset, finalIndexBitCount);
+ }
+ }
+
+ return block;
+ }
+
+        /// <summary>
+        /// Estimates the error of encoding the tile with the given partition, using fast end point
+        /// selection, 6-bit end point quantization and an 8-entry interpolated palette per subset.
+        /// </summary>
+        /// <param name="tile">Tile pixels, packed with a row stride of <paramref name="w"/></param>
+        /// <param name="subsetCount">Number of subsets of the partition</param>
+        /// <param name="partition">Partition index into <see cref="BC67Tables.PartitionTable"/></param>
+        /// <param name="w">Tile width, at most 4</param>
+        /// <param name="h">Tile height, at most 4</param>
+        /// <param name="maxError">Error at or above which the candidate is rejected early</param>
+        /// <returns>
+        /// Sum of the squared distance from each texel to its closest palette entry, or
+        /// <see cref="int.MaxValue"/> as soon as the running sum reaches <paramref name="maxError"/>
+        /// </returns>
+        private static unsafe int GetEndPointSelectionErrorFast(ReadOnlySpan<uint> tile, int subsetCount, int partition, int w, int h, int maxError)
+        {
+            byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition];
+
+            Span<RgbaColor8> minColors = stackalloc RgbaColor8[subsetCount];
+            Span<RgbaColor8> maxColors = stackalloc RgbaColor8[subsetCount];
+
+            BC67Utils.GetMinMaxColors(partitionTable, tile, w, h, minColors, maxColors, subsetCount);
+
+            Span<uint> endPoints0 = stackalloc uint[subsetCount];
+            Span<uint> endPoints1 = stackalloc uint[subsetCount];
+
+            SelectEndPointsFast(partitionTable, tile, w, h, subsetCount, minColors, maxColors, endPoints0, endPoints1, uint.MaxValue);
+
+            Span<RgbaColor32> palette = stackalloc RgbaColor32[8];
+
+            int errorSum = 0;
+
+            for (int subset = 0; subset < subsetCount; subset++)
+            {
+                uint c0 = endPoints0[subset];
+                uint c1 = endPoints1[subset];
+
+                int pBit0 = GetPBit(c0, 6, 0);
+                int pBit1 = GetPBit(c1, 6, 0);
+
+                c0 = BC67Utils.Quantize(RgbaColor8.FromUInt32(c0), 6, 0, pBit0).ToUInt32();
+                c1 = BC67Utils.Quantize(RgbaColor8.FromUInt32(c1), 6, 0, pBit1).ToUInt32();
+
+                if (Sse41.IsSupported)
+                {
+                    Vector128<byte> c0Rep = Vector128.Create(c0).AsByte();
+                    Vector128<byte> c1Rep = Vector128.Create(c1).AsByte();
+
+                    // Interleave the end point bytes so each adjacent pair is (c0 component, c1 component).
+                    Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);
+
+                    Vector128<byte> rWeights;
+                    Vector128<byte> lWeights;
+
+                    fixed (byte* pWeights = BC67Tables.Weights[1], pInvWeights = BC67Tables.InverseWeights[1])
+                    {
+                        rWeights = Sse2.LoadScalarVector128((ulong*)pWeights).AsByte();
+                        lWeights = Sse2.LoadScalarVector128((ulong*)pInvWeights).AsByte();
+                    }
+
+                    // Replicate each (weight, inverse weight) pair so that every vector holds the
+                    // weights of two palette entries, repeated for each of the four components.
+                    Vector128<byte> iWeights = Sse2.UnpackLow(rWeights, lWeights);
+                    Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
+                    Vector128<byte> iWeights23 = Sse2.UnpackHigh(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
+                    Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
+                    Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
+                    Vector128<byte> iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
+                    Vector128<byte> iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
+
+                    static Vector128<short> ShiftRoundToNearest(Vector128<short> x)
+                    {
+                        return Sse2.ShiftRightLogical(Sse2.Add(x, Vector128.Create((short)32)), 6);
+                    }
+
+                    // Each vector holds two interpolated palette entries (4 components each).
+                    Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte()));
+                    Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte()));
+                    Vector128<short> pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte()));
+                    Vector128<short> pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte()));
+
+                    int i = 0;
+                    for (int ty = 0; ty < h; ty++)
+                    {
+                        for (int tx = 0; tx < w; tx++, i++)
+                        {
+                            // The tile is packed with a stride of w, but the partition table is a
+                            // 4x4 grid, so it must be indexed with a stride of 4.
+                            if (partitionTable[ty * 4 + tx] != subset)
+                            {
+                                continue;
+                            }
+
+                            uint c = tile[i];
+
+                            Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte());
+
+                            Vector128<short> delta0 = Sse2.Subtract(color, pal0);
+                            Vector128<short> delta1 = Sse2.Subtract(color, pal1);
+                            Vector128<short> delta2 = Sse2.Subtract(color, pal2);
+                            Vector128<short> delta3 = Sse2.Subtract(color, pal3);
+
+                            Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0);
+                            Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1);
+                            Vector128<int> deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2);
+                            Vector128<int> deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3);
+
+                            Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1);
+                            Vector128<int> deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3);
+
+                            // Eight squared distances, saturated to 16 bits; keep the smallest.
+                            Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23);
+
+                            Vector128<ushort> min = Sse41.MinHorizontal(delta);
+
+                            errorSum += min.GetElement(0);
+                        }
+                    }
+                }
+                else
+                {
+                    RgbaColor32 e032 = RgbaColor8.FromUInt32(c0).GetColor32();
+                    RgbaColor32 e132 = RgbaColor8.FromUInt32(c1).GetColor32();
+
+                    palette[0] = e032;
+                    palette[palette.Length - 1] = e132;
+
+                    for (int i = 1; i < palette.Length - 1; i++)
+                    {
+                        palette[i] = BC67Utils.Interpolate(e032, e132, i, 3);
+                    }
+
+                    int texel = 0;
+                    for (int ty = 0; ty < h; ty++)
+                    {
+                        for (int tx = 0; tx < w; tx++, texel++)
+                        {
+                            // Same 4x4 partition table addressing as in the vectorized path.
+                            if (partitionTable[ty * 4 + tx] != subset)
+                            {
+                                continue;
+                            }
+
+                            uint c = tile[texel];
+                            RgbaColor32 color = Unsafe.As<uint, RgbaColor8>(ref c).GetColor32();
+
+                            int bestMatchScore = int.MaxValue;
+
+                            for (int j = 0; j < palette.Length; j++)
+                            {
+                                int score = BC67Utils.SquaredDifference(color, palette[j]);
+
+                                if (score < bestMatchScore)
+                                {
+                                    bestMatchScore = score;
+                                }
+                            }
+
+                            errorSum += bestMatchScore;
+                        }
+                    }
+                }
+
+                // No point in continuing if we are already above maximum.
+                if (errorSum >= maxError)
+                {
+                    return int.MaxValue;
+                }
+            }
+
+            return errorSum;
+        }
+
+        /// <summary>
+        /// Selects the two end points of every subset of a tile. Only the channels in
+        /// <paramref name="writeMask"/> are written to <paramref name="endPoints0"/> and
+        /// <paramref name="endPoints1"/>; the remaining bits of the existing values are kept.
+        /// </summary>
+        private static void SelectEndPoints(
+            ReadOnlySpan<uint> tile,
+            int w,
+            int h,
+            Span<uint> endPoints0,
+            Span<uint> endPoints1,
+            int subsetCount,
+            int partition,
+            int indexBitCount,
+            int colorDepth,
+            int alphaDepth,
+            uint writeMask,
+            bool fastMode)
+        {
+            byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition];
+
+            Span<RgbaColor8> minColors = stackalloc RgbaColor8[subsetCount];
+            Span<RgbaColor8> maxColors = stackalloc RgbaColor8[subsetCount];
+
+            BC67Utils.GetMinMaxColors(partitionTable, tile, w, h, minColors, maxColors, subsetCount);
+
+            uint inverseMask = ~writeMask;
+
+            // Force the channels that are not being written to all ones in the min/max colours.
+            for (int s = 0; s < subsetCount; s++)
+            {
+                Unsafe.As<RgbaColor8, uint>(ref minColors[s]) |= inverseMask;
+                Unsafe.As<RgbaColor8, uint>(ref maxColors[s]) |= inverseMask;
+            }
+
+            if (fastMode)
+            {
+                SelectEndPointsFast(partitionTable, tile, w, h, subsetCount, minColors, maxColors, endPoints0, endPoints1, writeMask);
+                return;
+            }
+
+            // Collect the distinct (masked) colours of each subset, 16 slots per subset.
+            Span<RgbaColor8> colors = stackalloc RgbaColor8[subsetCount * 16];
+            Span<byte> counts = stackalloc byte[subsetCount];
+
+            int texel = 0;
+            for (int ty = 0; ty < h; ty++)
+            {
+                for (int tx = 0; tx < w; tx++)
+                {
+                    int subset = partitionTable[ty * 4 + tx];
+                    RgbaColor8 color = RgbaColor8.FromUInt32(tile[texel++] | inverseMask);
+
+                    int slotBase = subset * 16;
+                    ref byte used = ref counts[subset];
+
+                    bool alreadyKnown = false;
+                    for (int k = 0; k < used; k++)
+                    {
+                        if (colors[slotBase + k] == color)
+                        {
+                            alreadyKnown = true;
+                            break;
+                        }
+                    }
+
+                    if (!alreadyKnown)
+                    {
+                        colors[slotBase + used++] = color;
+                    }
+                }
+            }
+
+            for (int subset = 0; subset < subsetCount; subset++)
+            {
+                ReadOnlySpan<RgbaColor8> subsetColors = colors.Slice(subset * 16, counts[subset]);
+
+                (RgbaColor8 e0, RgbaColor8 e1) = SelectEndPoints(
+                    subsetColors,
+                    minColors[subset],
+                    maxColors[subset],
+                    indexBitCount,
+                    colorDepth,
+                    alphaDepth,
+                    inverseMask);
+
+                uint kept0 = endPoints0[subset] & inverseMask;
+                uint kept1 = endPoints1[subset] & inverseMask;
+
+                endPoints0[subset] = kept0 | (e0.ToUInt32() & writeMask);
+                endPoints1[subset] = kept1 | (e1.ToUInt32() & writeMask);
+            }
+        }
+
+ // For each subset, picks as end points the two tile colours with the smallest and largest projection
+ // onto the subset's min-to-max colour direction. Only the bits in writeMask are written to endPoints0 and
+ // endPoints1; the other bits of the existing values are preserved.
+ private static unsafe void SelectEndPointsFast(
+ ReadOnlySpan<byte> partitionTable,
+ ReadOnlySpan<uint> tile,
+ int w,
+ int h,
+ int subsetCount,
+ ReadOnlySpan<RgbaColor8> minColors,
+ ReadOnlySpan<RgbaColor8> maxColors,
+ Span<uint> endPoints0,
+ Span<uint> endPoints1,
+ uint writeMask)
+ {
+ uint inverseMask = ~writeMask;
+
+ // The vectorized path is only taken for full 4x4 tiles, where each tile row is exactly one 128-bit vector.
+ if (Sse41.IsSupported && w == 4 && h == 4)
+ {
+ Vector128<byte> row0, row1, row2, row3;
+ Vector128<short> ones = Vector128<short>.AllBitsSet;
+
+ fixed (uint* pTile = tile)
+ {
+ row0 = Sse2.LoadVector128(pTile).AsByte();
+ row1 = Sse2.LoadVector128(pTile + 4).AsByte();
+ row2 = Sse2.LoadVector128(pTile + 8).AsByte();
+ row3 = Sse2.LoadVector128(pTile + 12).AsByte();
+ }
+
+ Vector128<byte> partitionMask;
+
+ fixed (byte* pPartitionTable = partitionTable)
+ {
+ partitionMask = Sse2.LoadVector128(pPartitionTable);
+ }
+
+ for (int subset = 0; subset < subsetCount; subset++)
+ {
+ // Direction from the subset's minimum to maximum colour, scaled by 64 and divided by the component sum.
+ RgbaColor32 blockDir = maxColors[subset].GetColor32() - minColors[subset].GetColor32();
+ int sum = blockDir.R + blockDir.G + blockDir.B + blockDir.A;
+ if (sum != 0)
+ {
+ blockDir = (blockDir << 6) / new RgbaColor32(sum);
+ }
+
+ Vector128<byte> bd = Vector128.Create(blockDir.GetColor8().ToUInt32()).AsByte();
+
+ // Multiply-add of adjacent bytes followed by a horizontal add: one 16-bit dot product per pixel.
+ Vector128<short> delta0 = Ssse3.MultiplyAddAdjacent(row0, bd.AsSByte());
+ Vector128<short> delta1 = Ssse3.MultiplyAddAdjacent(row1, bd.AsSByte());
+ Vector128<short> delta2 = Ssse3.MultiplyAddAdjacent(row2, bd.AsSByte());
+ Vector128<short> delta3 = Ssse3.MultiplyAddAdjacent(row3, bd.AsSByte());
+
+ Vector128<short> delta01 = Ssse3.HorizontalAdd(delta0, delta1);
+ Vector128<short> delta23 = Ssse3.HorizontalAdd(delta2, delta3);
+
+ // All ones for pixels that do NOT belong to this subset, zero for those that do.
+ Vector128<byte> subsetMask = Sse2.Xor(Sse2.CompareEqual(partitionMask, Vector128.Create((byte)subset)), ones.AsByte());
+
+ Vector128<short> subsetMask01 = Sse2.UnpackLow(subsetMask, subsetMask).AsInt16();
+ Vector128<short> subsetMask23 = Sse2.UnpackHigh(subsetMask, subsetMask).AsInt16();
+
+ // Pixels outside the subset are forced to 0xFFFF so they are not picked as the minimum.
+ // For the maximum, they are zeroed and the values inverted, so the largest projection becomes the smallest value.
+ Vector128<ushort> min01 = Sse41.MinHorizontal(Sse2.Or(delta01, subsetMask01).AsUInt16());
+ Vector128<ushort> min23 = Sse41.MinHorizontal(Sse2.Or(delta23, subsetMask23).AsUInt16());
+ Vector128<ushort> max01 = Sse41.MinHorizontal(Sse2.Xor(Sse2.AndNot(subsetMask01, delta01), ones).AsUInt16());
+ Vector128<ushort> max23 = Sse41.MinHorizontal(Sse2.Xor(Sse2.AndNot(subsetMask23, delta23), ones).AsUInt16());
+
+ // Low 16 bits are used as the minimum value, high 16 bits as its position within the vector.
+ uint minPos01 = min01.AsUInt32().GetElement(0);
+ uint minPos23 = min23.AsUInt32().GetElement(0);
+ uint maxPos01 = max01.AsUInt32().GetElement(0);
+ uint maxPos23 = max23.AsUInt32().GetElement(0);
+
+ // Positions from the second vector refer to rows 2 and 3, hence the + 8.
+ uint minDistColor = (ushort)minPos23 < (ushort)minPos01
+ ? tile[(int)(minPos23 >> 16) + 8]
+ : tile[(int)(minPos01 >> 16)];
+
+ // Note that we calculate the maximum as the minimum of the inverse, so less here is actually greater.
+ uint maxDistColor = (ushort)maxPos23 < (ushort)maxPos01
+ ? tile[(int)(maxPos23 >> 16) + 8]
+ : tile[(int)(maxPos01 >> 16)];
+
+ endPoints0[subset] = (endPoints0[subset] & inverseMask) | (minDistColor & writeMask);
+ endPoints1[subset] = (endPoints1[subset] & inverseMask) | (maxDistColor & writeMask);
+ }
+ }
+ else
+ {
+ // Scalar path, also used for partial tiles.
+ for (int subset = 0; subset < subsetCount; subset++)
+ {
+ RgbaColor32 blockDir = maxColors[subset].GetColor32() - minColors[subset].GetColor32();
+ blockDir = RgbaColor32.DivideGuarded(blockDir << 6, new RgbaColor32(blockDir.R + blockDir.G + blockDir.B + blockDir.A), 0);
+
+ int minDist = int.MaxValue;
+ int maxDist = int.MinValue;
+
+ RgbaColor8 minDistColor = default;
+ RgbaColor8 maxDistColor = default;
+
+ // i walks the packed tile; the partition table is always addressed as a 4x4 grid.
+ int i = 0;
+ for (int ty = 0; ty < h; ty++)
+ {
+ for (int tx = 0; tx < w; tx++, i++)
+ {
+ if (partitionTable[ty * 4 + tx] != subset)
+ {
+ continue;
+ }
+
+ RgbaColor8 color = RgbaColor8.FromUInt32(tile[i]);
+ int dist = RgbaColor32.Dot(color.GetColor32(), blockDir);
+
+ if (minDist > dist)
+ {
+ minDist = dist;
+ minDistColor = color;
+ }
+
+ if (maxDist < dist)
+ {
+ maxDist = dist;
+ maxDistColor = color;
+ }
+ }
+ }
+
+ endPoints0[subset] = (endPoints0[subset] & inverseMask) | (minDistColor.ToUInt32() & writeMask);
+ endPoints1[subset] = (endPoints1[subset] & inverseMask) | (maxDistColor.ToUInt32() & writeMask);
+ }
+ }
+ }
+
+ // Searches for the end point pair that gives the lowest BC67Utils.SelectIndices error for the given colours.
+ // Candidates come from a linear fit of colour against palette index, repeated for every index range
+ // (maxIndex) and every starting index that fits inside the palette.
+ // Returns (default, default) when there are no colours.
+ private static (RgbaColor8, RgbaColor8) SelectEndPoints(
+ ReadOnlySpan<RgbaColor8> values,
+ RgbaColor8 minValue,
+ RgbaColor8 maxValue,
+ int indexBitCount,
+ int colorDepth,
+ int alphaDepth,
+ uint alphaMask)
+ {
+ int n = values.Length;
+ int numInterpolatedColors = 1 << indexBitCount;
+ int numInterpolatedColorsMinus1 = numInterpolatedColors - 1;
+
+ if (n == 0)
+ {
+ return (default, default);
+ }
+
+ minValue = BC67Utils.Quantize(minValue, colorDepth, alphaDepth);
+ maxValue = BC67Utils.Quantize(maxValue, colorDepth, alphaDepth);
+
+ // Direction from the quantized minimum to the quantized maximum, scaled by 64 and divided by the component sum.
+ RgbaColor32 blockDir = maxValue.GetColor32() - minValue.GetColor32();
+ blockDir = RgbaColor32.DivideGuarded(blockDir << 6, new RgbaColor32(blockDir.R + blockDir.G + blockDir.B + blockDir.A), 0);
+
+ int minDist = int.MaxValue;
+ int maxDist = 0;
+
+ // Range of the projections of the quantized colours onto the direction.
+ for (int i = 0; i < values.Length; i++)
+ {
+ RgbaColor8 color = values[i];
+ int dist = RgbaColor32.Dot(BC67Utils.Quantize(color, colorDepth, alphaDepth).GetColor32(), blockDir);
+
+ if (minDist >= dist)
+ {
+ minDist = dist;
+ }
+
+ if (maxDist <= dist)
+ {
+ maxDist = dist;
+ }
+ }
+
+ Span<RgbaColor8> palette = stackalloc RgbaColor8[numInterpolatedColors];
+
+ // Clamped to at least 1 so it can be used as a divisor.
+ int distRange = Math.Max(1, maxDist - minDist);
+
+ RgbaColor32 nV = new RgbaColor32(n);
+
+ int bestErrorSum = int.MaxValue;
+ RgbaColor8 bestE0 = default;
+ RgbaColor8 bestE1 = default;
+
+ Span<int> indices = stackalloc int[n];
+ Span<RgbaColor32> colors = stackalloc RgbaColor32[n];
+
+ for (int maxIndex = numInterpolatedColorsMinus1; maxIndex >= 1; maxIndex--)
+ {
+ int sumX = 0;
+ int sumXX = 0;
+ int sumXXIncrement = 0;
+
+ // Map every colour to an index in [0, maxIndex] from its projection.
+ // NOTE(review): the projection here uses the unquantized colour while minDist/distRange come from
+ // quantized colours, so the index may presumably fall slightly outside that range - confirm.
+ for (int i = 0; i < values.Length; i++)
+ {
+ RgbaColor32 color = values[i].GetColor32();
+
+ int dist = RgbaColor32.Dot(color, blockDir);
+
+ int normalizedValue = ((dist - minDist) << 6) / distRange;
+ int texelIndex = (normalizedValue * maxIndex + 32) >> 6;
+
+ indices[i] = texelIndex;
+ colors[i] = color;
+
+ sumX += texelIndex;
+ sumXX += texelIndex * texelIndex;
+ sumXXIncrement += 1 + texelIndex * 2;
+ }
+
+ // Slide the index window across the palette; "start" is the offset added to every index.
+ for (int start = 0; start < numInterpolatedColors - maxIndex; start++)
+ {
+ RgbaColor32 sumY = new RgbaColor32(0);
+ RgbaColor32 sumXY = new RgbaColor32(0);
+
+ for (int i = 0; i < indices.Length; i++)
+ {
+ RgbaColor32 y = colors[i];
+
+ sumY += y;
+ sumXY += new RgbaColor32(start + indices[i]) * y;
+ }
+
+ // Slope (m) and intercept (b) of the fit, both in 6-bit fixed point.
+ RgbaColor32 sumXV = new RgbaColor32(sumX);
+ RgbaColor32 sumXXV = new RgbaColor32(sumXX);
+ RgbaColor32 m = RgbaColor32.DivideGuarded((nV * sumXY - sumXV * sumY) << 6, nV * sumXXV - sumXV * sumXV, 0);
+ RgbaColor32 b = ((sumY << 6) - m * sumXV) / nV;
+
+ // Candidate end points: the fitted line evaluated at index 0 and at the last palette index.
+ RgbaColor8 candidateE0 = (b >> 6).GetColor8();
+ RgbaColor8 candidateE1 = ((b + m * new RgbaColor32(numInterpolatedColorsMinus1)) >> 6).GetColor8();
+
+ int pBit0 = GetPBit(candidateE0.ToUInt32(), colorDepth, alphaDepth);
+ int pBit1 = GetPBit(candidateE1.ToUInt32(), colorDepth, alphaDepth);
+
+ int errorSum = BC67Utils.SelectIndices(
+ MemoryMarshal.Cast<RgbaColor8, uint>(values),
+ candidateE0.ToUInt32(),
+ candidateE1.ToUInt32(),
+ pBit0,
+ pBit1,
+ indexBitCount,
+ numInterpolatedColors,
+ colorDepth,
+ alphaDepth,
+ alphaMask);
+
+ // On ties the later candidate wins.
+ if (errorSum <= bestErrorSum)
+ {
+ bestErrorSum = errorSum;
+ bestE0 = candidateE0;
+ bestE1 = candidateE1;
+ }
+
+ // Update the running sums for every index being one higher on the next iteration:
+ // sum(x + 1) = sum(x) + n and sum((x + 1)^2) = sum(x^2) + sum(2x + 1).
+ sumX += n;
+ sumXX += sumXXIncrement;
+ sumXXIncrement += 2 * n;
+ }
+ }
+
+ return (bestE0, bestE1);
+ }
+
+        /// <summary>
+        /// Chooses the p-bit for a single end point by majority vote over its components.
+        /// </summary>
+        /// <param name="color">Packed end point colour; its top byte is treated as alpha</param>
+        /// <param name="colorDepth">Bits stored for each of the three low bytes</param>
+        /// <param name="alphaDepth">Bits stored for the top byte, or 0 if it takes no part in the vote</param>
+        /// <returns>1 when at least two of the tested components vote for a set p-bit, otherwise 0</returns>
+        private static int GetPBit(uint color, int colorDepth, int alphaDepth)
+        {
+            bool hasAlpha = alphaDepth != 0;
+
+            // If alpha is 0, let's assume the color information is not too important and prefer
+            // to preserve alpha instead.
+            if (hasAlpha && (color >> 24) == 0)
+            {
+                return 0;
+            }
+
+            // One bit per component: the first bit below the stored precision.
+            uint voteBits = 0x808080u >> colorDepth;
+
+            if (hasAlpha)
+            {
+                voteBits |= 0x80000000u >> alphaDepth;
+            }
+
+            // Drop each byte's top bit, then add half of the vote bit as a rounding bias.
+            uint biased = (color & 0x7f7f7f7fu) + (voteBits >> 1);
+
+            int votes = BitOperations.PopCount(biased & voteBits);
+
+            if (votes >= 2)
+            {
+                return 1;
+            }
+
+            return 0;
+        }
+
+        /// <summary>
+        /// Chooses the p-bit shared by both end points of a subset.
+        /// </summary>
+        /// <remarks>
+        /// Giving preference to the first endpoint yields better results,
+        /// might be a side effect of the endpoint selection algorithm?
+        /// <paramref name="c1"/> is therefore ignored.
+        /// </remarks>
+        private static int GetPBit(uint c0, uint c1, int colorDepth, int alphaDepth)
+            => GetPBit(c0, colorDepth, alphaDepth);
+ }
+}
diff --git a/Ryujinx.Graphics.Texture/Encoders/EncodeMode.cs b/Ryujinx.Graphics.Texture/Encoders/EncodeMode.cs
new file mode 100644
index 00000000..5734d301
--- /dev/null
+++ b/Ryujinx.Graphics.Texture/Encoders/EncodeMode.cs
@@ -0,0 +1,10 @@
+namespace Ryujinx.Graphics.Texture.Encoders
+{
+    /// <summary>
+    /// Texture encoder mode, optionally combined with option flags.
+    /// </summary>
+    /// <remarks>
+    /// <see cref="Multithreaded"/> occupies bit 8, which lies outside <see cref="ModeMask"/>.
+    /// NOTE(review): presumably callers OR a flag onto a mode and recover the mode with
+    /// <c>value &amp; ModeMask</c> - confirm against the encoders.
+    /// </remarks>
+    enum EncodeMode
+    {
+        /// <summary>Mode value 0.</summary>
+        Fast = 0,
+
+        /// <summary>Mode value 1.</summary>
+        Exhaustive = 1,
+
+        /// <summary>Mask covering bits 0-7.</summary>
+        ModeMask = 0xff,
+
+        /// <summary>Flag stored in bit 8.</summary>
+        Multithreaded = 0x100
+    }
+}
diff --git a/Ryujinx.Graphics.Texture/Utils/BC67Tables.cs b/Ryujinx.Graphics.Texture/Utils/BC67Tables.cs
new file mode 100644
index 00000000..d890652c
--- /dev/null
+++ b/Ryujinx.Graphics.Texture/Utils/BC67Tables.cs
@@ -0,0 +1,297 @@
+namespace Ryujinx.Graphics.Texture.Utils
+{
+ static class BC67Tables // Constant lookup tables shared by the BC6/BC7 code.
+ {
+ public static readonly BC7ModeInfo[] BC7ModeInfos = new BC7ModeInfo[] // One entry per BC7 mode, indexed by mode number; argument meaning is defined by the BC7ModeInfo constructor (not shown here).
+ {
+ new BC7ModeInfo(3, 4, 6, 0, 0, 3, 0, 4, 0), // Mode 0
+ new BC7ModeInfo(2, 6, 2, 0, 0, 3, 0, 6, 0), // Mode 1
+ new BC7ModeInfo(3, 6, 0, 0, 0, 2, 0, 5, 0), // Mode 2
+ new BC7ModeInfo(2, 6, 4, 0, 0, 2, 0, 7, 0), // Mode 3
+ new BC7ModeInfo(1, 0, 0, 2, 1, 2, 3, 5, 6), // Mode 4
+ new BC7ModeInfo(1, 0, 0, 2, 0, 2, 2, 7, 8), // Mode 5
+ new BC7ModeInfo(1, 0, 2, 0, 0, 4, 0, 7, 7), // Mode 6
+ new BC7ModeInfo(2, 6, 4, 0, 0, 2, 0, 5, 5) // Mode 7
+ };
+
+ public static readonly byte[][] Weights = // Interpolation weights out of 64; row 0 has 4 entries, row 1 has 8, row 2 has 16.
+ {
+ new byte[] { 0, 21, 43, 64 }, // 2-bit indices
+ new byte[] { 0, 9, 18, 27, 37, 46, 55, 64 }, // 3-bit indices
+ new byte[] { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 } // 4-bit indices
+ };
+
+ public static readonly byte[][] InverseWeights = // Same layout as Weights; every entry equals 64 minus the matching weight.
+ {
+ new byte[] { 64, 43, 21, 0 },
+ new byte[] { 64, 55, 46, 37, 27, 18, 9, 0 },
+ new byte[] { 64, 60, 55, 51, 47, 43, 38, 34, 30, 26, 21, 17, 13, 9, 4, 0 }
+ };
+
+ public static readonly byte[][][] FixUpIndices = new byte[3][][] // 3 tables x 64 entries x 3 bytes; byte 0 is always 0. NOTE(review): presumably [subsetCount - 1][partition][subset] = anchor pixel index - confirm against users.
+ {
+ new byte[64][] // Table 0: every entry is { 0, 0, 0 }.
+ {
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }
+ },
+ new byte[64][] // Table 1: only byte 1 varies; byte 2 is always 0.
+ {
+ new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 },
+ new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 },
+ new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 },
+ new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 },
+ new byte[] { 0, 15, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 2, 0 },
+ new byte[] { 0, 2, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 15, 0 },
+ new byte[] { 0, 2, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 2, 0 },
+ new byte[] { 0, 8, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 2, 0 },
+ new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 6, 0 }, new byte[] { 0, 8, 0 },
+ new byte[] { 0, 2, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 },
+ new byte[] { 0, 2, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 2, 0 },
+ new byte[] { 0, 2, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 6, 0 },
+ new byte[] { 0, 6, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 6, 0 }, new byte[] { 0, 8, 0 },
+ new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 2, 0 },
+ new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 },
+ new byte[] { 0, 15, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 15, 0 }
+ },
+ new byte[64][] // Table 2: bytes 1 and 2 both vary.
+ {
+ new byte[] { 0, 3, 15 }, new byte[] { 0, 3, 8 }, new byte[] { 0, 15, 8 }, new byte[] { 0, 15, 3 },
+ new byte[] { 0, 8, 15 }, new byte[] { 0, 3, 15 }, new byte[] { 0, 15, 3 }, new byte[] { 0, 15, 8 },
+ new byte[] { 0, 8, 15 }, new byte[] { 0, 8, 15 }, new byte[] { 0, 6, 15 }, new byte[] { 0, 6, 15 },
+ new byte[] { 0, 6, 15 }, new byte[] { 0, 5, 15 }, new byte[] { 0, 3, 15 }, new byte[] { 0, 3, 8 },
+ new byte[] { 0, 3, 15 }, new byte[] { 0, 3, 8 }, new byte[] { 0, 8, 15 }, new byte[] { 0, 15, 3 },
+ new byte[] { 0, 3, 15 }, new byte[] { 0, 3, 8 }, new byte[] { 0, 6, 15 }, new byte[] { 0, 10, 8 },
+ new byte[] { 0, 5, 3 }, new byte[] { 0, 8, 15 }, new byte[] { 0, 8, 6 }, new byte[] { 0, 6, 10 },
+ new byte[] { 0, 8, 15 }, new byte[] { 0, 5, 15 }, new byte[] { 0, 15, 10 }, new byte[] { 0, 15, 8 },
+ new byte[] { 0, 8, 15 }, new byte[] { 0, 15, 3 }, new byte[] { 0, 3, 15 }, new byte[] { 0, 5, 10 },
+ new byte[] { 0, 6, 10 }, new byte[] { 0, 10, 8 }, new byte[] { 0, 8, 9 }, new byte[] { 0, 15, 10 },
+ new byte[] { 0, 15, 6 }, new byte[] { 0, 3, 15 }, new byte[] { 0, 15, 8 }, new byte[] { 0, 5, 15 },
+ new byte[] { 0, 15, 3 }, new byte[] { 0, 15, 6 }, new byte[] { 0, 15, 6 }, new byte[] { 0, 15, 8 },
+ new byte[] { 0, 3, 15 }, new byte[] { 0, 15, 3 }, new byte[] { 0, 5, 15 }, new byte[] { 0, 5, 15 },
+ new byte[] { 0, 5, 15 }, new byte[] { 0, 8, 15 }, new byte[] { 0, 5, 15 }, new byte[] { 0, 10, 15 },
+ new byte[] { 0, 5, 15 }, new byte[] { 0, 10, 15 }, new byte[] { 0, 8, 15 }, new byte[] { 0, 13, 15 },
+ new byte[] { 0, 15, 3 }, new byte[] { 0, 12, 15 }, new byte[] { 0, 3, 15 }, new byte[] { 0, 3, 8 }
+ }
+ };
+
+ public static readonly byte[][][] PartitionTable = new byte[3][][] // 3 tables x 64 entries x 16 bytes, one subset number per pixel. NOTE(review): presumably the outer index is subsetCount - 1 and the 16 bytes are a row-major 4x4 tile - confirm against users.
+ {
+ new byte[64][] // Table 0: every pixel maps to subset 0.
+ {
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 0
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 1
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 2
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 3
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 4
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 5
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 6
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 7
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 8
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 9
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 10
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 11
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 12
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 13
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 14
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 15
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 16
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 17
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 18
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 19
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 20
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 21
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 22
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 23
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 24
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 25
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 26
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 27
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 28
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 29
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 30
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 31
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 32
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 33
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 34
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 35
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 36
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 37
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 38
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 39
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 40
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 41
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 42
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 43
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 44
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 45
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 46
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 47
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 48
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 49
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 50
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 51
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 52
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 53
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 54
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 55
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 56
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 57
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 58
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 59
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 60
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 61
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 62
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } // 63
+ },
+ new byte[64][] // Table 1: pixels map to subset 0 or 1.
+ {
+ new byte[16] { 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1 }, // 0
+ new byte[16] { 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1 }, // 1
+ new byte[16] { 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1 }, // 2
+ new byte[16] { 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1 }, // 3
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1 }, // 4
+ new byte[16] { 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, // 5
+ new byte[16] { 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, // 6
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1 }, // 7
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1 }, // 8
+ new byte[16] { 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, // 9
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, // 10
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1 }, // 11
+ new byte[16] { 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, // 12
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }, // 13
+ new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, // 14
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1 }, // 15
+ new byte[16] { 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1 }, // 16
+ new byte[16] { 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 }, // 17
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0 }, // 18
+ new byte[16] { 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0 }, // 19
+ new byte[16] { 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 }, // 20
+ new byte[16] { 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0 }, // 21
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0 }, // 22
+ new byte[16] { 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1 }, // 23
+ new byte[16] { 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0 }, // 24
+ new byte[16] { 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0 }, // 25
+ new byte[16] { 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0 }, // 26
+ new byte[16] { 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0 }, // 27
+ new byte[16] { 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0 }, // 28
+ new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 }, // 29
+ new byte[16] { 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0 }, // 30
+ new byte[16] { 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0 }, // 31
+ new byte[16] { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 }, // 32
+ new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1 }, // 33
+ new byte[16] { 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0 }, // 34
+ new byte[16] { 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0 }, // 35
+ new byte[16] { 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0 }, // 36
+ new byte[16] { 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0 }, // 37
+ new byte[16] { 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1 }, // 38
+ new byte[16] { 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1 }, // 39
+ new byte[16] { 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0 }, // 40
+ new byte[16] { 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0 }, // 41
+ new byte[16] { 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0 }, // 42
+ new byte[16] { 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0 }, // 43
+ new byte[16] { 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0 }, // 44
+ new byte[16] { 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1 }, // 45
+ new byte[16] { 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1 }, // 46
+ new byte[16] { 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0 }, // 47
+ new byte[16] { 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0 }, // 48
+ new byte[16] { 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0 }, // 49
+ new byte[16] { 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0 }, // 50
+ new byte[16] { 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0 }, // 51
+ new byte[16] { 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1 }, // 52
+ new byte[16] { 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1 }, // 53
+ new byte[16] { 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0 }, // 54
+ new byte[16] { 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0 }, // 55
+ new byte[16] { 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1 }, // 56
+ new byte[16] { 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1 }, // 57
+ new byte[16] { 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1 }, // 58
+ new byte[16] { 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1 }, // 59
+ new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1 }, // 60
+ new byte[16] { 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 }, // 61
+ new byte[16] { 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0 }, // 62
+ new byte[16] { 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1 } // 63
+ },
+ new byte[64][] // Table 2: pixels map to subset 0, 1 or 2.
+ {
+ new byte[16] { 0, 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 1, 2, 2, 2, 2 }, // 0
+ new byte[16] { 0, 0, 0, 1, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1 }, // 1
+ new byte[16] { 0, 0, 0, 0, 2, 0, 0, 1, 2, 2, 1, 1, 2, 2, 1, 1 }, // 2
+ new byte[16] { 0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 1, 0, 1, 1, 1 }, // 3
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2 }, // 4
+ new byte[16] { 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 2, 2 }, // 5
+ new byte[16] { 0, 0, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1 }, // 6
+ new byte[16] { 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1 }, // 7
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2 }, // 8
+ new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2 }, // 9
+ new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2 }, // 10
+ new byte[16] { 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2 }, // 11
+ new byte[16] { 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2 }, // 12
+ new byte[16] { 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 2 }, // 13
+ new byte[16] { 0, 0, 1, 1, 0, 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 2 }, // 14
+ new byte[16] { 0, 0, 1, 1, 2, 0, 0, 1, 2, 2, 0, 0, 2, 2, 2, 0 }, // 15
+ new byte[16] { 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 2, 1, 1, 2, 2 }, // 16
+ new byte[16] { 0, 1, 1, 1, 0, 0, 1, 1, 2, 0, 0, 1, 2, 2, 0, 0 }, // 17
+ new byte[16] { 0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2 }, // 18
+ new byte[16] { 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1 }, // 19
+ new byte[16] { 0, 1, 1, 1, 0, 1, 1, 1, 0, 2, 2, 2, 0, 2, 2, 2 }, // 20
+ new byte[16] { 0, 0, 0, 1, 0, 0, 0, 1, 2, 2, 2, 1, 2, 2, 2, 1 }, // 21
+ new byte[16] { 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2 }, // 22
+ new byte[16] { 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 1, 0, 2, 2, 1, 0 }, // 23
+ new byte[16] { 0, 1, 2, 2, 0, 1, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0 }, // 24
+ new byte[16] { 0, 0, 1, 2, 0, 0, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2 }, // 25
+ new byte[16] { 0, 1, 1, 0, 1, 2, 2, 1, 1, 2, 2, 1, 0, 1, 1, 0 }, // 26
+ new byte[16] { 0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 1, 1, 2, 2, 1 }, // 27
+ new byte[16] { 0, 0, 2, 2, 1, 1, 0, 2, 1, 1, 0, 2, 0, 0, 2, 2 }, // 28
+ new byte[16] { 0, 1, 1, 0, 0, 1, 1, 0, 2, 0, 0, 2, 2, 2, 2, 2 }, // 29
+ new byte[16] { 0, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 0, 1, 1 }, // 30
+ new byte[16] { 0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 1, 1, 2, 2, 2, 1 }, // 31
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 2, 2, 2 }, // 32
+ new byte[16] { 0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 2, 0, 0, 1, 1 }, // 33
+ new byte[16] { 0, 0, 1, 1, 0, 0, 1, 2, 0, 0, 2, 2, 0, 2, 2, 2 }, // 34
+ new byte[16] { 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0 }, // 35
+ new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0 }, // 36
+ new byte[16] { 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0 }, // 37
+ new byte[16] { 0, 1, 2, 0, 2, 0, 1, 2, 1, 2, 0, 1, 0, 1, 2, 0 }, // 38
+ new byte[16] { 0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2, 0, 0, 1, 1 }, // 39
+ new byte[16] { 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0, 1, 1 }, // 40
+ new byte[16] { 0, 1, 0, 1, 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2 }, // 41
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 1, 2, 1, 2, 1 }, // 42
+ new byte[16] { 0, 0, 2, 2, 1, 1, 2, 2, 0, 0, 2, 2, 1, 1, 2, 2 }, // 43
+ new byte[16] { 0, 0, 2, 2, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 1, 1 }, // 44
+ new byte[16] { 0, 2, 2, 0, 1, 2, 2, 1, 0, 2, 2, 0, 1, 2, 2, 1 }, // 45
+ new byte[16] { 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 0, 1 }, // 46
+ new byte[16] { 0, 0, 0, 0, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1 }, // 47
+ new byte[16] { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 2, 2, 2 }, // 48
+ new byte[16] { 0, 2, 2, 2, 0, 1, 1, 1, 0, 2, 2, 2, 0, 1, 1, 1 }, // 49
+ new byte[16] { 0, 0, 0, 2, 1, 1, 1, 2, 0, 0, 0, 2, 1, 1, 1, 2 }, // 50
+ new byte[16] { 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2 }, // 51
+ new byte[16] { 0, 2, 2, 2, 0, 1, 1, 1, 0, 1, 1, 1, 0, 2, 2, 2 }, // 52
+ new byte[16] { 0, 0, 0, 2, 1, 1, 1, 2, 1, 1, 1, 2, 0, 0, 0, 2 }, // 53
+ new byte[16] { 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 2, 2 }, // 54
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 1, 2 }, // 55
+ new byte[16] { 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2 }, // 56
+ new byte[16] { 0, 0, 2, 2, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 2, 2 }, // 57
+ new byte[16] { 0, 0, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 0, 0, 2, 2 }, // 58
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2 }, // 59
+ new byte[16] { 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 1 }, // 60
+ new byte[16] { 0, 2, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 1, 2, 2, 2 }, // 61
+ new byte[16] { 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }, // 62
+ new byte[16] { 0, 1, 1, 1, 2, 0, 1, 1, 2, 2, 0, 1, 2, 2, 2, 0 } // 63
+ }
+ };
+ }
+}
diff --git a/Ryujinx.Graphics.Texture/Utils/BC67Utils.cs b/Ryujinx.Graphics.Texture/Utils/BC67Utils.cs
new file mode 100644
index 00000000..e6c3f6e7
--- /dev/null
+++ b/Ryujinx.Graphics.Texture/Utils/BC67Utils.cs
@@ -0,0 +1,1327 @@
+using System;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace Ryujinx.Graphics.Texture.Utils
+{
+ static class BC67Utils
+ {
+ // Quantization lookup tables, one per component depth from 4 to 8 bits (slot = depth - 4).
+ // _quantizationLut holds 512 entries per depth: the low 8 bits of the index are the component
+ // value and bit 8 is handed to QuantizeComponentForLut as its third argument.
+ // NOTE(review): presumably that third argument is the p-bit - confirm against QuantizeComponentForLut.
+ // _quantizationLutNoPBit holds 256 entries per depth, indexed by the component value alone.
+ private static byte[][] _quantizationLut;
+ private static byte[][] _quantizationLutNoPBit;
+
+ /// <summary>
+ /// Fills both quantization lookup tables for every supported depth.
+ /// </summary>
+ static BC67Utils()
+ {
+     const int FirstDepth = 4;
+     const int DepthCount = 5;
+
+     _quantizationLut = new byte[DepthCount][];
+     _quantizationLutNoPBit = new byte[DepthCount][];
+
+     for (int slot = 0; slot < DepthCount; slot++)
+     {
+         int depth = FirstDepth + slot;
+
+         byte[] table = new byte[512];
+         byte[] tableNoPBit = new byte[256];
+
+         for (int entry = 0; entry < table.Length; entry++)
+         {
+             // The cast keeps only the low 8 bits; bit 8 travels separately as entry >> 8.
+             byte component = (byte)entry;
+
+             table[entry] = QuantizeComponentForLut(component, depth, entry >> 8);
+
+             if (entry < tableNoPBit.Length)
+             {
+                 tableNoPBit[entry] = QuantizeComponentForLut(component, depth);
+             }
+         }
+
+         _quantizationLut[slot] = table;
+         _quantizationLutNoPBit[slot] = tableNoPBit;
+     }
+ }
+
+ /// <summary>
+ /// Computes the per-channel minimum and maximum over all pixels of a tile.
+ /// </summary>
+ /// <param name="tile">Packed pixels, each converted with <see cref="RgbaColor8.FromUInt32"/></param>
+ /// <param name="w">Tile width; only used to select the vectorized 4x4 path</param>
+ /// <param name="h">Tile height; only used to select the vectorized 4x4 path</param>
+ /// <returns>
+ /// The (minimum, maximum) colors. For an empty tile the minimum is (255, 255, 255, 255)
+ /// and the maximum is the default color.
+ /// </returns>
+ public static (RgbaColor8, RgbaColor8) GetMinMaxColors(ReadOnlySpan<uint> tile, int w, int h)
+ {
+     // The vectorized helper reads 16 pixels through a raw pointer, which bypasses the span
+     // bounds checks, so it is only taken when the tile really holds that many pixels.
+     if (Sse41.IsSupported && w == 4 && h == 4 && tile.Length >= 16)
+     {
+         GetMinMaxColorsOneSubset4x4Sse41(tile, out RgbaColor8 minColor, out RgbaColor8 maxColor);
+
+         return (minColor, maxColor);
+     }
+     else
+     {
+         // Start from the neutral values of min and max so the first pixel always wins.
+         RgbaColor8 minColor = new RgbaColor8(255, 255, 255, 255);
+         RgbaColor8 maxColor = default;
+
+         for (int i = 0; i < tile.Length; i++)
+         {
+             RgbaColor8 color = RgbaColor8.FromUInt32(tile[i]);
+
+             minColor.R = Math.Min(minColor.R, color.R);
+             minColor.G = Math.Min(minColor.G, color.G);
+             minColor.B = Math.Min(minColor.B, color.B);
+             minColor.A = Math.Min(minColor.A, color.A);
+
+             maxColor.R = Math.Max(maxColor.R, color.R);
+             maxColor.G = Math.Max(maxColor.G, color.G);
+             maxColor.B = Math.Max(maxColor.B, color.B);
+             maxColor.A = Math.Max(maxColor.A, color.A);
+         }
+
+         return (minColor, maxColor);
+     }
+ }
+
+ /// <summary>
+ /// Computes the per-channel minimum and maximum colors of each subset of a partitioned tile.
+ /// </summary>
+ /// <param name="partitionTable">Subset number of every pixel</param>
+ /// <param name="tile">Packed pixels, each converted with <see cref="RgbaColor8.FromUInt32"/></param>
+ /// <param name="w">Tile width</param>
+ /// <param name="h">Tile height</param>
+ /// <param name="minColors">Receives the minimum color of each subset</param>
+ /// <param name="maxColors">Receives the maximum color of each subset</param>
+ /// <param name="subsetCount">Number of subsets; 1 and 2 have vectorized 4x4 paths</param>
+ public static void GetMinMaxColors(
+     ReadOnlySpan<byte> partitionTable,
+     ReadOnlySpan<uint> tile,
+     int w,
+     int h,
+     Span<RgbaColor8> minColors,
+     Span<RgbaColor8> maxColors,
+     int subsetCount)
+ {
+     // The vectorized helpers read 16 pixels (and 16 partition bytes) through raw pointers,
+     // which bypasses the span bounds checks, so they are only taken when the inputs are
+     // large enough. Anything else goes through the bounds-checked loop below.
+     if (Sse41.IsSupported && w == 4 && h == 4 && tile.Length >= 16)
+     {
+         if (subsetCount == 1)
+         {
+             GetMinMaxColorsOneSubset4x4Sse41(tile, out minColors[0], out maxColors[0]);
+             return;
+         }
+         else if (subsetCount == 2 && partitionTable.Length >= 16)
+         {
+             GetMinMaxColorsTwoSubsets4x4Sse41(partitionTable, tile, minColors, maxColors);
+             return;
+         }
+     }
+
+     // Reset both outputs to the neutral values of min and max. The vectorized paths overwrite
+     // their outputs, so the result must not depend on what the caller left in maxColors either.
+     minColors.Fill(new RgbaColor8(255, 255, 255, 255));
+     maxColors.Fill(default);
+
+     int i = 0;
+     for (int ty = 0; ty < h; ty++)
+     {
+         for (int tx = 0; tx < w; tx++)
+         {
+             // NOTE(review): the partition table is indexed with a row stride of w, like the
+             // tile itself - confirm this is intended for tiles narrower than 4 pixels.
+             int subset = partitionTable[ty * w + tx];
+             RgbaColor8 color = RgbaColor8.FromUInt32(tile[i++]);
+
+             minColors[subset].R = Math.Min(minColors[subset].R, color.R);
+             minColors[subset].G = Math.Min(minColors[subset].G, color.G);
+             minColors[subset].B = Math.Min(minColors[subset].B, color.B);
+             minColors[subset].A = Math.Min(minColors[subset].A, color.A);
+
+             maxColors[subset].R = Math.Max(maxColors[subset].R, color.R);
+             maxColors[subset].G = Math.Max(maxColors[subset].G, color.G);
+             maxColors[subset].B = Math.Max(maxColors[subset].B, color.B);
+             maxColors[subset].A = Math.Max(maxColors[subset].A, color.A);
+         }
+     }
+ }
+
+ /// <summary>
+ /// Vectorized per-channel minimum and maximum over the 16 pixels of a 4x4 tile.
+ /// </summary>
+ /// <param name="tile">Tile pixels; 16 values are read through a pinned pointer without a length check</param>
+ /// <param name="minColor">Receives the per-channel minimum</param>
+ /// <param name="maxColor">Receives the per-channel maximum</param>
+ private static unsafe void GetMinMaxColorsOneSubset4x4Sse41(ReadOnlySpan<uint> tile, out RgbaColor8 minColor, out RgbaColor8 maxColor)
+ {
+     Vector128<byte> lowest;
+     Vector128<byte> highest;
+
+     fixed (uint* pixels = tile)
+     {
+         // Seeding both accumulators with the first row gives the same result as folding
+         // that row into an all-ones minimum and an all-zero maximum.
+         lowest = Sse2.LoadVector128(pixels).AsByte();
+         highest = lowest;
+
+         for (int row = 1; row < 4; row++)
+         {
+             Vector128<byte> current = Sse2.LoadVector128(pixels + row * 4).AsByte();
+
+             lowest = Sse2.Min(lowest, current);
+             highest = Sse2.Max(highest, current);
+         }
+     }
+
+     // Each accumulator still holds four column-wise results; reduce them to a single color.
+     minColor = HorizontalMin(lowest);
+     maxColor = HorizontalMax(highest);
+ }
+
+ private static unsafe void GetMinMaxColorsTwoSubsets4x4Sse41(
+ ReadOnlySpan<byte> partitionTable,
+ ReadOnlySpan<uint> tile,
+ Span<RgbaColor8> minColors,
+ Span<RgbaColor8> maxColors)
+ { // Per-channel min/max of a 4x4 tile for subset 0 and subset 1; reads 16 partition bytes and 16 pixels through raw pointers, without length checks.
+ Vector128<byte> partitionMask;
+
+ fixed (byte* pPartitionTable = partitionTable)
+ {
+ partitionMask = Sse2.LoadVector128(pPartitionTable); // One byte per pixel: the subset number of that pixel.
+ }
+
+ Vector128<byte> subset0Mask = Sse2.CompareEqual(partitionMask, Vector128<byte>.Zero); // 0xFF where the pixel is in subset 0, 0x00 otherwise (any non-zero subset counts as subset 1 below).
+
+ Vector128<byte> subset0MaskRep16Low = Sse2.UnpackLow(subset0Mask, subset0Mask); // Mask bytes of pixels 0-7, each repeated twice.
+ Vector128<byte> subset0MaskRep16High = Sse2.UnpackHigh(subset0Mask, subset0Mask); // Mask bytes of pixels 8-15, each repeated twice.
+
+ Vector128<byte> subset0Mask0 = Sse2.UnpackLow(subset0MaskRep16Low.AsInt16(), subset0MaskRep16Low.AsInt16()).AsByte(); // Pixels 0-3: mask byte repeated 4 times per pixel, lines up with row0.
+ Vector128<byte> subset0Mask1 = Sse2.UnpackHigh(subset0MaskRep16Low.AsInt16(), subset0MaskRep16Low.AsInt16()).AsByte(); // Pixels 4-7, lines up with row1.
+ Vector128<byte> subset0Mask2 = Sse2.UnpackLow(subset0MaskRep16High.AsInt16(), subset0MaskRep16High.AsInt16()).AsByte(); // Pixels 8-11, lines up with row2.
+ Vector128<byte> subset0Mask3 = Sse2.UnpackHigh(subset0MaskRep16High.AsInt16(), subset0MaskRep16High.AsInt16()).AsByte(); // Pixels 12-15, lines up with row3.
+
+ Vector128<byte> min0 = Vector128<byte>.AllBitsSet; // Minimum accumulators start at 0xFF, the neutral value for min.
+ Vector128<byte> min1 = Vector128<byte>.AllBitsSet;
+ Vector128<byte> max0 = Vector128<byte>.Zero; // Maximum accumulators start at 0, the neutral value for max.
+ Vector128<byte> max1 = Vector128<byte>.Zero;
+
+ Vector128<byte> row0, row1, row2, row3;
+
+ fixed (uint* pTile = tile)
+ {
+ row0 = Sse2.LoadVector128(pTile).AsByte();
+ row1 = Sse2.LoadVector128(pTile + 4).AsByte();
+ row2 = Sse2.LoadVector128(pTile + 8).AsByte();
+ row3 = Sse2.LoadVector128(pTile + 12).AsByte();
+ }
+
+ min0 = Sse2.Min(min0, Sse41.BlendVariable(min0, row0, subset0Mask0)); // Subset 0 min: pixels outside subset 0 are replaced by the running minimum, so they cannot lower it.
+ min0 = Sse2.Min(min0, Sse41.BlendVariable(min0, row1, subset0Mask1));
+ min0 = Sse2.Min(min0, Sse41.BlendVariable(min0, row2, subset0Mask2));
+ min0 = Sse2.Min(min0, Sse41.BlendVariable(min0, row3, subset0Mask3));
+
+ min1 = Sse2.Min(min1, Sse2.Or(row0, subset0Mask0)); // Subset 1 min: OR with the mask turns subset 0 pixels into 0xFF, which min ignores.
+ min1 = Sse2.Min(min1, Sse2.Or(row1, subset0Mask1));
+ min1 = Sse2.Min(min1, Sse2.Or(row2, subset0Mask2));
+ min1 = Sse2.Min(min1, Sse2.Or(row3, subset0Mask3));
+
+ max0 = Sse2.Max(max0, Sse2.And(row0, subset0Mask0)); // Subset 0 max: AND with the mask zeroes the other pixels, which max ignores.
+ max0 = Sse2.Max(max0, Sse2.And(row1, subset0Mask1));
+ max0 = Sse2.Max(max0, Sse2.And(row2, subset0Mask2));
+ max0 = Sse2.Max(max0, Sse2.And(row3, subset0Mask3));
+
+ max1 = Sse2.Max(max1, Sse2.AndNot(subset0Mask0, row0)); // Subset 1 max: AndNot computes ~mask & row, zeroing the subset 0 pixels.
+ max1 = Sse2.Max(max1, Sse2.AndNot(subset0Mask1, row1));
+ max1 = Sse2.Max(max1, Sse2.AndNot(subset0Mask2, row2));
+ max1 = Sse2.Max(max1, Sse2.AndNot(subset0Mask3, row3));
+
+ minColors[0] = HorizontalMin(min0); // Reduce the four column-wise results of each accumulator to one color.
+ minColors[1] = HorizontalMin(min1);
+ maxColors[0] = HorizontalMax(max0);
+ maxColors[1] = HorizontalMax(max1);
+ }
+
+ /// <summary>
+ /// Reduces four packed colors to one by taking the per-byte minimum across all four 32-bit lanes.
+ /// </summary>
+ /// <param name="x">Vector holding one packed color per 32-bit lane</param>
+ /// <returns>The color built from the per-byte minimum of the four lanes</returns>
+ private static RgbaColor8 HorizontalMin(Vector128<byte> x)
+ {
+     // Control 0x31 brings lane 1 into lane 0 and lane 3 into lane 2, so after the first
+     // minimum lane 0 covers lanes 0-1 and lane 2 covers lanes 2-3.
+     Vector128<byte> swapped = Sse2.Shuffle(x.AsInt32(), 0x31).AsByte();
+     Vector128<byte> pairwise = Sse2.Min(x, swapped);
+
+     // Control 2 brings lane 2 into lane 0, which completes the reduction in lane 0.
+     Vector128<byte> upperPair = Sse2.Shuffle(pairwise.AsInt32(), 2).AsByte();
+     Vector128<byte> reduced = Sse2.Min(pairwise, upperPair);
+
+     uint packed = reduced.AsUInt32().GetElement(0);
+
+     return RgbaColor8.FromUInt32(packed);
+ }
+
+ /// <summary>
+ /// Reduces four packed colors to one by taking the per-byte maximum across all four 32-bit lanes.
+ /// </summary>
+ /// <param name="x">Vector holding one packed color per 32-bit lane</param>
+ /// <returns>The color built from the per-byte maximum of the four lanes</returns>
+ private static RgbaColor8 HorizontalMax(Vector128<byte> x)
+ {
+     // Control 0x31 brings lane 1 into lane 0 and lane 3 into lane 2, so after the first
+     // maximum lane 0 covers lanes 0-1 and lane 2 covers lanes 2-3.
+     Vector128<byte> swapped = Sse2.Shuffle(x.AsInt32(), 0x31).AsByte();
+     Vector128<byte> pairwise = Sse2.Max(x, swapped);
+
+     // Control 2 brings lane 2 into lane 0, which completes the reduction in lane 0.
+     Vector128<byte> upperPair = Sse2.Shuffle(pairwise.AsInt32(), 2).AsByte();
+     Vector128<byte> reduced = Sse2.Max(pairwise, upperPair);
+
+     uint packed = reduced.AsUInt32().GetElement(0);
+
+     return RgbaColor8.FromUInt32(packed);
+ }
+
+ /// <summary>
+ /// Evaluates a pair of endpoints against a set of pixels and returns the resulting error sum,
+ /// using a vectorized implementation when SSE4.1 is available and the index width is 2, 3 or 4 bits.
+ /// </summary>
+ /// <param name="values">Packed pixel colors to evaluate</param>
+ /// <param name="endPoint0">First packed endpoint color</param>
+ /// <param name="endPoint1">Second packed endpoint color</param>
+ /// <param name="pBit0">P-bit of the first endpoint</param>
+ /// <param name="pBit1">P-bit of the second endpoint</param>
+ /// <param name="indexBitCount">Bits per index; selects the implementation</param>
+ /// <param name="indexCount">Forwarded unchanged to the implementation</param>
+ /// <param name="colorDepth">Forwarded unchanged to the implementation</param>
+ /// <param name="alphaDepth">Forwarded unchanged to the implementation</param>
+ /// <param name="alphaMask">Forwarded unchanged to the implementation</param>
+ /// <returns>The error sum reported by the selected implementation</returns>
+ public static int SelectIndices(
+     ReadOnlySpan<uint> values,
+     uint endPoint0,
+     uint endPoint1,
+     int pBit0,
+     int pBit1,
+     int indexBitCount,
+     int indexCount,
+     int colorDepth,
+     int alphaDepth,
+     uint alphaMask)
+ {
+     if (Sse41.IsSupported)
+     {
+         // Index widths without a vectorized implementation fall out of the switch
+         // and use the generic fallback below.
+         switch (indexBitCount)
+         {
+             case 2:
+                 return Select2BitIndicesSse41(
+                     values,
+                     endPoint0,
+                     endPoint1,
+                     pBit0,
+                     pBit1,
+                     indexBitCount,
+                     indexCount,
+                     colorDepth,
+                     alphaDepth,
+                     alphaMask);
+             case 3:
+                 return Select3BitIndicesSse41(
+                     values,
+                     endPoint0,
+                     endPoint1,
+                     pBit0,
+                     pBit1,
+                     indexBitCount,
+                     indexCount,
+                     colorDepth,
+                     alphaDepth,
+                     alphaMask);
+             case 4:
+                 return Select4BitIndicesOneSubsetSse41(
+                     values,
+                     endPoint0,
+                     endPoint1,
+                     pBit0,
+                     pBit1,
+                     indexBitCount,
+                     indexCount,
+                     colorDepth,
+                     alphaDepth,
+                     alphaMask);
+         }
+     }
+
+     return SelectIndicesFallback(
+         values,
+         endPoint0,
+         endPoint1,
+         pBit0,
+         pBit1,
+         indexBitCount,
+         indexCount,
+         colorDepth,
+         alphaDepth,
+         alphaMask);
+ }
+
+ private static unsafe int Select2BitIndicesSse41(
+ ReadOnlySpan<uint> values,
+ uint endPoint0,
+ uint endPoint1,
+ int pBit0,
+ int pBit1,
+ int indexBitCount,
+ int indexCount,
+ int colorDepth,
+ int alphaDepth,
+ uint alphaMask)
+ { // Sums, over all values, the smallest squared distance to a 4-entry palette interpolated between the two quantized endpoints. indexBitCount and indexCount are not read here.
+ uint alphaMaskForPalette = alphaMask;
+
+ if (alphaDepth == 0)
+ {
+ alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32(); // No alpha depth: OR a (0, 0, 0, 255) color into the mask applied to the palette endpoints.
+ }
+
+ int errorSum = 0;
+
+ RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0);
+ RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1);
+
+ Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte(); // Endpoint color repeated in all four 32-bit lanes.
+ Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte();
+
+ Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep); // Byte-interleaved endpoints: byte 0 of c0, byte 0 of c1, byte 1 of c0, byte 1 of c1, ... (the 4 channel pairs appear twice).
+
+ Vector128<byte> rWeights;
+ Vector128<byte> lWeights;
+
+ fixed (byte* pWeights = BC67Tables.Weights[0], pInvWeights = BC67Tables.InverseWeights[0])
+ {
+ rWeights = Sse2.LoadScalarVector128((uint*)pWeights).AsByte(); // The 4 weight bytes of the 2-bit table, loaded as one 32-bit scalar.
+ lWeights = Sse2.LoadScalarVector128((uint*)pInvWeights).AsByte();
+ }
+
+ Vector128<byte> iWeights = Sse2.UnpackLow(lWeights, rWeights); // (inverse weight, weight) byte pair for each of the 4 palette entries.
+ Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte(); // Every pair repeated twice.
+ Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); // Pairs of entries 0 and 1, each repeated 4 times (one per channel).
+ Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); // Pairs of entries 2 and 3, each repeated 4 times.
+
+ Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte())); // Palette entries 0 and 1 as 16-bit channels: c0 * inverseWeight + c1 * weight, then ShiftRoundToNearest (defined elsewhere).
+ Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte())); // Palette entries 2 and 3.
+
+ for (int i = 0; i < values.Length; i++)
+ {
+ uint c = values[i] | alphaMask;
+
+ Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte()); // Low 8 bytes (the pixel twice) zero-extended to 16-bit lanes, matching the two-entry layout of pal0/pal1.
+
+ Vector128<short> delta0 = Sse2.Subtract(color, pal0);
+ Vector128<short> delta1 = Sse2.Subtract(color, pal1);
+
+ Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0); // Squares each channel difference and adds adjacent pairs: two partial sums per palette entry.
+ Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1);
+
+ Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1); // Four 32-bit squared distances, one per palette entry.
+
+ Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum01); // Narrowed to 16 bits with saturation: distances above 65535 are clamped.
+
+ Vector128<ushort> min = Sse41.MinHorizontal(delta); // Smallest of the 16-bit distances ends up in element 0.
+
+ ushort error = min.GetElement(0);
+
+ errorSum += error;
+ }
+
+ return errorSum;
+ }
+
+ /// <summary>
+ /// Measures how well the 3-bit palette interpolated between two endpoints fits a list of colors (SSE4.1 path).
+ /// Only the accumulated error is produced; no indices are written.
+ /// </summary>
+ /// <param name="values">Packed colors to match against the palette.</param>
+ /// <param name="endPoint0">First endpoint, packed; it is quantized before use.</param>
+ /// <param name="endPoint1">Second endpoint, packed; it is quantized before use.</param>
+ /// <param name="pBit0">P-bit forwarded to <c>Quantize</c> for the first endpoint.</param>
+ /// <param name="pBit1">P-bit forwarded to <c>Quantize</c> for the second endpoint.</param>
+ /// <param name="indexBitCount">Not used by this implementation.</param>
+ /// <param name="indexCount">Not used by this implementation.</param>
+ /// <param name="colorDepth">Color bit depth forwarded to <c>Quantize</c>.</param>
+ /// <param name="alphaDepth">Alpha bit depth forwarded to <c>Quantize</c>; when 0, the bits of <c>RgbaColor8(0, 0, 0, 255)</c> are also set on both endpoints.</param>
+ /// <param name="alphaMask">Bits ORed into every input color and into both endpoints.</param>
+ /// <returns>Sum over all colors of the smallest squared distance to a palette entry, each term saturated to 16 bits.</returns>
+ private static unsafe int Select3BitIndicesSse41(
+     ReadOnlySpan<uint> values,
+     uint endPoint0,
+     uint endPoint1,
+     int pBit0,
+     int pBit1,
+     int indexBitCount,
+     int indexCount,
+     int colorDepth,
+     int alphaDepth,
+     uint alphaMask)
+ {
+     uint paletteAlphaMask = alphaDepth == 0
+         ? alphaMask | new RgbaColor8(0, 0, 0, 255).ToUInt32()
+         : alphaMask;
+
+     RgbaColor8 quantized0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0);
+     RgbaColor8 quantized1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1);
+
+     // Interleave the bytes of both endpoints so that each (endpoint 0, endpoint 1) byte pair can be
+     // blended with an (inverse weight, weight) pair by a single MultiplyAddAdjacent.
+     Vector128<byte> endPointPairs = Sse2.UnpackLow(
+         Vector128.Create(quantized0.ToUInt32() | paletteAlphaMask).AsByte(),
+         Vector128.Create(quantized1.ToUInt32() | paletteAlphaMask).AsByte());
+
+     Vector128<byte> weights;
+     Vector128<byte> inverseWeights;
+
+     fixed (byte* pWeights = BC67Tables.Weights[1], pInvWeights = BC67Tables.InverseWeights[1])
+     {
+         // 8 bytes each: one weight per palette entry.
+         weights = Sse2.LoadScalarVector128((ulong*)pWeights).AsByte();
+         inverseWeights = Sse2.LoadScalarVector128((ulong*)pInvWeights).AsByte();
+     }
+
+     // Replicate every (inverse weight, weight) pair four times, once per color channel.
+     Vector128<short> weightPairs = Sse2.UnpackLow(inverseWeights, weights).AsInt16();
+     Vector128<short> weightPairs01 = Sse2.UnpackLow(weightPairs, weightPairs);
+     Vector128<short> weightPairs23 = Sse2.UnpackHigh(weightPairs, weightPairs);
+
+     // Each vector below holds two consecutive palette entries, four 16-bit channels each.
+     Vector128<short> pal0 = ShiftRoundToNearest(
+         Ssse3.MultiplyAddAdjacent(endPointPairs, Sse2.UnpackLow(weightPairs01, weightPairs01).AsSByte()));
+     Vector128<short> pal1 = ShiftRoundToNearest(
+         Ssse3.MultiplyAddAdjacent(endPointPairs, Sse2.UnpackHigh(weightPairs01, weightPairs01).AsSByte()));
+     Vector128<short> pal2 = ShiftRoundToNearest(
+         Ssse3.MultiplyAddAdjacent(endPointPairs, Sse2.UnpackLow(weightPairs23, weightPairs23).AsSByte()));
+     Vector128<short> pal3 = ShiftRoundToNearest(
+         Ssse3.MultiplyAddAdjacent(endPointPairs, Sse2.UnpackHigh(weightPairs23, weightPairs23).AsSByte()));
+
+     int totalError = 0;
+
+     foreach (uint value in values)
+     {
+         // The pixel is duplicated so it lines up with both palette entries of a vector.
+         Vector128<short> pixel = Sse41.ConvertToVector128Int16(Vector128.Create(value | alphaMask).AsByte());
+
+         Vector128<short> diff0 = Sse2.Subtract(pixel, pal0);
+         Vector128<short> diff1 = Sse2.Subtract(pixel, pal1);
+         Vector128<short> diff2 = Sse2.Subtract(pixel, pal2);
+         Vector128<short> diff3 = Sse2.Subtract(pixel, pal3);
+
+         // Squared differences, summed per palette entry.
+         Vector128<int> errors0123 = Ssse3.HorizontalAdd(
+             Sse2.MultiplyAddAdjacent(diff0, diff0),
+             Sse2.MultiplyAddAdjacent(diff1, diff1));
+         Vector128<int> errors4567 = Ssse3.HorizontalAdd(
+             Sse2.MultiplyAddAdjacent(diff2, diff2),
+             Sse2.MultiplyAddAdjacent(diff3, diff3));
+
+         // Narrow to 16 bits (saturating) so MinHorizontal can pick the smallest of the eight errors.
+         Vector128<ushort> errors = Sse41.PackUnsignedSaturate(errors0123, errors4567);
+
+         totalError += Sse41.MinHorizontal(errors).GetElement(0);
+     }
+
+     return totalError;
+ }
+
+ /// <summary>
+ /// Measures how well the 4-bit palette interpolated between two endpoints fits a list of colors (SSE4.1 path).
+ /// Only the accumulated error is produced; no indices are written.
+ /// </summary>
+ /// <param name="values">Packed colors to match against the palette.</param>
+ /// <param name="endPoint0">First endpoint, packed; it is quantized before use.</param>
+ /// <param name="endPoint1">Second endpoint, packed; it is quantized before use.</param>
+ /// <param name="pBit0">P-bit forwarded to <c>Quantize</c> for the first endpoint.</param>
+ /// <param name="pBit1">P-bit forwarded to <c>Quantize</c> for the second endpoint.</param>
+ /// <param name="indexBitCount">Not used by this implementation.</param>
+ /// <param name="indexCount">Not used by this implementation.</param>
+ /// <param name="colorDepth">Color bit depth forwarded to <c>Quantize</c>.</param>
+ /// <param name="alphaDepth">Alpha bit depth forwarded to <c>Quantize</c>; when 0, the bits of <c>RgbaColor8(0, 0, 0, 255)</c> are also set on both endpoints.</param>
+ /// <param name="alphaMask">Bits ORed into every input color and into both endpoints.</param>
+ /// <returns>Sum over all colors of the smallest squared distance to a palette entry, each term saturated to 16 bits.</returns>
+ private static unsafe int Select4BitIndicesOneSubsetSse41(
+     ReadOnlySpan<uint> values,
+     uint endPoint0,
+     uint endPoint1,
+     int pBit0,
+     int pBit1,
+     int indexBitCount,
+     int indexCount,
+     int colorDepth,
+     int alphaDepth,
+     uint alphaMask)
+ {
+     uint paletteAlphaMask = alphaDepth == 0
+         ? alphaMask | new RgbaColor8(0, 0, 0, 255).ToUInt32()
+         : alphaMask;
+
+     RgbaColor8 quantized0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0);
+     RgbaColor8 quantized1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1);
+
+     // Interleave the bytes of both endpoints so that each (endpoint 0, endpoint 1) byte pair can be
+     // blended with an (inverse weight, weight) pair by a single MultiplyAddAdjacent.
+     Vector128<byte> endPointPairs = Sse2.UnpackLow(
+         Vector128.Create(quantized0.ToUInt32() | paletteAlphaMask).AsByte(),
+         Vector128.Create(quantized1.ToUInt32() | paletteAlphaMask).AsByte());
+
+     Vector128<byte> weights;
+     Vector128<byte> inverseWeights;
+
+     fixed (byte* pWeights = BC67Tables.Weights[2], pInvWeights = BC67Tables.InverseWeights[2])
+     {
+         // 16 bytes each: one weight per palette entry.
+         weights = Sse2.LoadVector128(pWeights);
+         inverseWeights = Sse2.LoadVector128(pInvWeights);
+     }
+
+     // Replicate every (inverse weight, weight) pair four times, once per color channel.
+     Vector128<short> pairsLow = Sse2.UnpackLow(inverseWeights, weights).AsInt16();
+     Vector128<short> pairsHigh = Sse2.UnpackHigh(inverseWeights, weights).AsInt16();
+     Vector128<short> weightPairs01 = Sse2.UnpackLow(pairsLow, pairsLow);
+     Vector128<short> weightPairs23 = Sse2.UnpackHigh(pairsLow, pairsLow);
+     Vector128<short> weightPairs45 = Sse2.UnpackLow(pairsHigh, pairsHigh);
+     Vector128<short> weightPairs67 = Sse2.UnpackHigh(pairsHigh, pairsHigh);
+
+     // Each vector below holds two consecutive palette entries, four 16-bit channels each.
+     Vector128<short> pal0 = ShiftRoundToNearest(
+         Ssse3.MultiplyAddAdjacent(endPointPairs, Sse2.UnpackLow(weightPairs01, weightPairs01).AsSByte()));
+     Vector128<short> pal1 = ShiftRoundToNearest(
+         Ssse3.MultiplyAddAdjacent(endPointPairs, Sse2.UnpackHigh(weightPairs01, weightPairs01).AsSByte()));
+     Vector128<short> pal2 = ShiftRoundToNearest(
+         Ssse3.MultiplyAddAdjacent(endPointPairs, Sse2.UnpackLow(weightPairs23, weightPairs23).AsSByte()));
+     Vector128<short> pal3 = ShiftRoundToNearest(
+         Ssse3.MultiplyAddAdjacent(endPointPairs, Sse2.UnpackHigh(weightPairs23, weightPairs23).AsSByte()));
+     Vector128<short> pal4 = ShiftRoundToNearest(
+         Ssse3.MultiplyAddAdjacent(endPointPairs, Sse2.UnpackLow(weightPairs45, weightPairs45).AsSByte()));
+     Vector128<short> pal5 = ShiftRoundToNearest(
+         Ssse3.MultiplyAddAdjacent(endPointPairs, Sse2.UnpackHigh(weightPairs45, weightPairs45).AsSByte()));
+     Vector128<short> pal6 = ShiftRoundToNearest(
+         Ssse3.MultiplyAddAdjacent(endPointPairs, Sse2.UnpackLow(weightPairs67, weightPairs67).AsSByte()));
+     Vector128<short> pal7 = ShiftRoundToNearest(
+         Ssse3.MultiplyAddAdjacent(endPointPairs, Sse2.UnpackHigh(weightPairs67, weightPairs67).AsSByte()));
+
+     int totalError = 0;
+
+     foreach (uint value in values)
+     {
+         // The pixel is duplicated so it lines up with both palette entries of a vector.
+         Vector128<short> pixel = Sse41.ConvertToVector128Int16(Vector128.Create(value | alphaMask).AsByte());
+
+         Vector128<short> diff0 = Sse2.Subtract(pixel, pal0);
+         Vector128<short> diff1 = Sse2.Subtract(pixel, pal1);
+         Vector128<short> diff2 = Sse2.Subtract(pixel, pal2);
+         Vector128<short> diff3 = Sse2.Subtract(pixel, pal3);
+         Vector128<short> diff4 = Sse2.Subtract(pixel, pal4);
+         Vector128<short> diff5 = Sse2.Subtract(pixel, pal5);
+         Vector128<short> diff6 = Sse2.Subtract(pixel, pal6);
+         Vector128<short> diff7 = Sse2.Subtract(pixel, pal7);
+
+         // Squared differences, summed per palette entry (four entries per vector).
+         Vector128<int> errorsA = Ssse3.HorizontalAdd(
+             Sse2.MultiplyAddAdjacent(diff0, diff0),
+             Sse2.MultiplyAddAdjacent(diff1, diff1));
+         Vector128<int> errorsB = Ssse3.HorizontalAdd(
+             Sse2.MultiplyAddAdjacent(diff2, diff2),
+             Sse2.MultiplyAddAdjacent(diff3, diff3));
+         Vector128<int> errorsC = Ssse3.HorizontalAdd(
+             Sse2.MultiplyAddAdjacent(diff4, diff4),
+             Sse2.MultiplyAddAdjacent(diff5, diff5));
+         Vector128<int> errorsD = Ssse3.HorizontalAdd(
+             Sse2.MultiplyAddAdjacent(diff6, diff6),
+             Sse2.MultiplyAddAdjacent(diff7, diff7));
+
+         // MinHorizontal works on eight 16-bit lanes, so the sixteen errors are reduced in two halves.
+         ushort bestLowHalf = Sse41.MinHorizontal(Sse41.PackUnsignedSaturate(errorsA, errorsB)).GetElement(0);
+         ushort bestHighHalf = Sse41.MinHorizontal(Sse41.PackUnsignedSaturate(errorsC, errorsD)).GetElement(0);
+
+         totalError += Math.Min(bestLowHalf, bestHighHalf);
+     }
+
+     return totalError;
+ }
+
+ /// <summary>
+ /// Scalar fallback that measures how well the palette interpolated between two endpoints fits a list of colors.
+ /// Only the accumulated error is produced; no indices are written.
+ /// </summary>
+ /// <param name="values">Packed colors to match against the palette.</param>
+ /// <param name="endPoint0">First endpoint, packed; it is quantized before use.</param>
+ /// <param name="endPoint1">Second endpoint, packed; it is quantized before use.</param>
+ /// <param name="pBit0">P-bit forwarded to <c>Quantize</c> for the first endpoint.</param>
+ /// <param name="pBit1">P-bit forwarded to <c>Quantize</c> for the second endpoint.</param>
+ /// <param name="indexBitCount">Index width forwarded to <c>Interpolate</c>.</param>
+ /// <param name="indexCount">Number of palette entries to build.</param>
+ /// <param name="colorDepth">Color bit depth forwarded to <c>Quantize</c>.</param>
+ /// <param name="alphaDepth">Alpha bit depth forwarded to <c>Quantize</c>; when 0, the bits of <c>RgbaColor8(0, 0, 0, 255)</c> are also set on both endpoints.</param>
+ /// <param name="alphaMask">Bits ORed into every input color and into both endpoints.</param>
+ /// <returns>Sum over all colors of the smallest squared distance to a palette entry.</returns>
+ private static int SelectIndicesFallback(
+     ReadOnlySpan<uint> values,
+     uint endPoint0,
+     uint endPoint1,
+     int pBit0,
+     int pBit1,
+     int indexBitCount,
+     int indexCount,
+     int colorDepth,
+     int alphaDepth,
+     uint alphaMask)
+ {
+     int errorSum = 0;
+
+     uint alphaMaskForPalette = alphaMask;
+
+     if (alphaDepth == 0)
+     {
+         alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
+     }
+
+     Span<uint> palette = stackalloc uint[indexCount];
+
+     RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0);
+     RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1);
+
+     // Apply the mask directly on the packed representation of the quantized endpoints.
+     Unsafe.As<RgbaColor8, uint>(ref c0) |= alphaMaskForPalette;
+     Unsafe.As<RgbaColor8, uint>(ref c1) |= alphaMaskForPalette;
+
+     // The first and last entries are the endpoints themselves; the rest are interpolated.
+     palette[0] = c0.ToUInt32();
+     palette[indexCount - 1] = c1.ToUInt32();
+
+     for (int j = 1; j < indexCount - 1; j++)
+     {
+         palette[j] = Interpolate(c0, c1, j, indexBitCount).ToUInt32();
+     }
+
+     for (int i = 0; i < values.Length; i++)
+     {
+         // Converted once per pixel instead of once per palette entry.
+         RgbaColor32 color = RgbaColor8.FromUInt32(values[i] | alphaMask).GetColor32();
+
+         int bestMatchScore = int.MaxValue;
+
+         for (int j = 0; j < indexCount; j++)
+         {
+             int score = SquaredDifference(color, RgbaColor8.FromUInt32(palette[j]).GetColor32());
+
+             if (score < bestMatchScore)
+             {
+                 bestMatchScore = score;
+             }
+         }
+
+         errorSum += bestMatchScore;
+     }
+
+     return errorSum;
+ }
+
+ /// <summary>
+ /// Selects, for every pixel of a tile, the palette index that best matches it and returns the accumulated error.
+ /// Dispatches to an SSE4.1 implementation for 2, 3 and 4 bit indices when available, otherwise to the scalar fallback.
+ /// </summary>
+ /// <param name="tile">Packed pixel colors, <paramref name="w"/> * <paramref name="h"/> entries read row by row.</param>
+ /// <param name="w">Tile width in pixels.</param>
+ /// <param name="h">Tile height in pixels.</param>
+ /// <param name="endPoints0">First endpoint of every subset, packed.</param>
+ /// <param name="endPoints1">Second endpoint of every subset, packed.</param>
+ /// <param name="pBitValues">P-bit values; how they are assigned to endpoints depends on <paramref name="pBits"/>.</param>
+ /// <param name="indices">Receives the selected index of each pixel at position <c>ty * 4 + tx</c>.</param>
+ /// <param name="subsetCount">Number of subsets. The 4-bit SSE4.1 path asserts that this is 1.</param>
+ /// <param name="partition">Partition number used to look up the subset of each pixel.</param>
+ /// <param name="indexBitCount">Bits per index; selects the implementation.</param>
+ /// <param name="indexCount">Number of palette entries per subset (only read by the fallback).</param>
+ /// <param name="colorDepth">Color bit depth used to quantize the endpoints.</param>
+ /// <param name="alphaDepth">Alpha bit depth used to quantize the endpoints.</param>
+ /// <param name="pBits">Number of p-bits of the mode.</param>
+ /// <param name="alphaMask">Bits ORed into every pixel and endpoint before comparing.</param>
+ /// <returns>The sum of the per-pixel errors of the selected indices.</returns>
+ public static int SelectIndices(
+     ReadOnlySpan<uint> tile,
+     int w,
+     int h,
+     ReadOnlySpan<uint> endPoints0,
+     ReadOnlySpan<uint> endPoints1,
+     ReadOnlySpan<int> pBitValues,
+     Span<byte> indices,
+     int subsetCount,
+     int partition,
+     int indexBitCount,
+     int indexCount,
+     int colorDepth,
+     int alphaDepth,
+     int pBits,
+     uint alphaMask)
+ {
+     if (Sse41.IsSupported)
+     {
+         switch (indexBitCount)
+         {
+             case 2:
+                 return Select2BitIndicesSse41(
+                     tile,
+                     w,
+                     h,
+                     endPoints0,
+                     endPoints1,
+                     pBitValues,
+                     indices,
+                     subsetCount,
+                     partition,
+                     colorDepth,
+                     alphaDepth,
+                     pBits,
+                     alphaMask);
+
+             case 3:
+                 return Select3BitIndicesSse41(
+                     tile,
+                     w,
+                     h,
+                     endPoints0,
+                     endPoints1,
+                     pBitValues,
+                     indices,
+                     subsetCount,
+                     partition,
+                     colorDepth,
+                     alphaDepth,
+                     pBits,
+                     alphaMask);
+
+             case 4:
+                 // The 4-bit vectorized path only handles a single subset.
+                 Debug.Assert(subsetCount == 1);
+
+                 return Select4BitIndicesOneSubsetSse41(
+                     tile,
+                     w,
+                     h,
+                     endPoints0[0],
+                     endPoints1[0],
+                     pBitValues,
+                     indices,
+                     partition,
+                     colorDepth,
+                     alphaDepth,
+                     pBits,
+                     alphaMask);
+         }
+     }
+
+     // No SSE4.1, or an index width without a vectorized implementation.
+     return SelectIndicesFallback(
+         tile,
+         w,
+         h,
+         endPoints0,
+         endPoints1,
+         pBitValues,
+         indices,
+         subsetCount,
+         partition,
+         indexBitCount,
+         indexCount,
+         colorDepth,
+         alphaDepth,
+         pBits,
+         alphaMask);
+ }
+
+ /// <summary>
+ /// SSE4.1 index selection for 2-bit indices: for every pixel of the tile, picks the entry of its subset's
+ /// four-entry palette with the smallest squared distance and stores its index.
+ /// </summary>
+ /// <param name="tile">Packed pixel colors, <paramref name="w"/> * <paramref name="h"/> entries read row by row.</param>
+ /// <param name="w">Tile width in pixels.</param>
+ /// <param name="h">Tile height in pixels.</param>
+ /// <param name="endPoints0">First endpoint of every subset, packed; quantized before use.</param>
+ /// <param name="endPoints1">Second endpoint of every subset, packed; quantized before use.</param>
+ /// <param name="pBitValues">
+ /// P-bit values. When <paramref name="pBits"/> equals <paramref name="subsetCount"/>, entry <c>subset</c> is used for both
+ /// endpoints; otherwise, when <paramref name="pBits"/> is not 0, entries <c>subset * 2</c> and <c>subset * 2 + 1</c> are used.
+ /// </param>
+ /// <param name="indices">Receives the selected index of each pixel at position <c>ty * 4 + tx</c>.</param>
+ /// <param name="subsetCount">Number of subsets.</param>
+ /// <param name="partition">Partition number used to look up the subset of each pixel.</param>
+ /// <param name="colorDepth">Color bit depth forwarded to <c>Quantize</c>.</param>
+ /// <param name="alphaDepth">Alpha bit depth forwarded to <c>Quantize</c>; when 0, the bits of <c>RgbaColor8(0, 0, 0, 255)</c> are also set on the endpoints.</param>
+ /// <param name="pBits">Number of p-bits of the mode; 0 means the endpoints are quantized without a p-bit.</param>
+ /// <param name="alphaMask">Bits ORed into every pixel and endpoint before comparing.</param>
+ /// <returns>Sum of the errors of the selected entries, each saturated to 16 bits.</returns>
+ private static unsafe int Select2BitIndicesSse41(
+     ReadOnlySpan<uint> tile,
+     int w,
+     int h,
+     ReadOnlySpan<uint> endPoints0,
+     ReadOnlySpan<uint> endPoints1,
+     ReadOnlySpan<int> pBitValues,
+     Span<byte> indices,
+     int subsetCount,
+     int partition,
+     int colorDepth,
+     int alphaDepth,
+     int pBits,
+     uint alphaMask)
+ {
+     byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition];
+
+     uint alphaMaskForPalette = alphaMask;
+
+     if (alphaDepth == 0)
+     {
+         alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
+     }
+
+     // The interpolation weights are the same for every subset, so they are loaded and expanded
+     // once here rather than on each iteration of the subset loop.
+     Vector128<byte> rWeights;
+     Vector128<byte> lWeights;
+
+     fixed (byte* pWeights = BC67Tables.Weights[0], pInvWeights = BC67Tables.InverseWeights[0])
+     {
+         // 4 bytes each: one weight per palette entry.
+         rWeights = Sse2.LoadScalarVector128((uint*)pWeights).AsByte();
+         lWeights = Sse2.LoadScalarVector128((uint*)pInvWeights).AsByte();
+     }
+
+     // Build (inverse weight, weight) byte pairs and replicate each pair four times, once per channel.
+     Vector128<byte> iWeights = Sse2.UnpackLow(lWeights, rWeights);
+     Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
+     Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
+     Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
+
+     int errorSum = 0;
+
+     for (int subset = 0; subset < subsetCount; subset++)
+     {
+         int pBit0 = -1, pBit1 = -1;
+
+         if (pBits == subsetCount)
+         {
+             pBit0 = pBit1 = pBitValues[subset];
+         }
+         else if (pBits != 0)
+         {
+             pBit0 = pBitValues[subset * 2];
+             pBit1 = pBitValues[subset * 2 + 1];
+         }
+
+         RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoints0[subset]), colorDepth, alphaDepth, pBit0);
+         RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoints1[subset]), colorDepth, alphaDepth, pBit1);
+
+         Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte();
+         Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte();
+
+         // Interleave the endpoint bytes so each pair can be blended by one MultiplyAddAdjacent.
+         Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);
+
+         // Each vector holds two consecutive palette entries, four 16-bit channels each.
+         Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte()));
+         Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte()));
+
+         int i = 0;
+         for (int ty = 0; ty < h; ty++)
+         {
+             for (int tx = 0; tx < w; tx++, i++)
+             {
+                 int tileOffset = ty * 4 + tx;
+                 if (partitionTable[tileOffset] != subset)
+                 {
+                     continue;
+                 }
+
+                 uint c = tile[i] | alphaMask;
+
+                 Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte());
+
+                 Vector128<short> delta0 = Sse2.Subtract(color, pal0);
+                 Vector128<short> delta1 = Sse2.Subtract(color, pal1);
+
+                 Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0);
+                 Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1);
+
+                 Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1);
+
+                 // Only four errors exist, so they are packed twice to fill all eight lanes.
+                 Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum01);
+
+                 Vector128<ushort> min = Sse41.MinHorizontal(delta);
+
+                 // Lane 0 holds the minimum error, lane 1 the position where it was found.
+                 uint minPos = min.AsUInt32().GetElement(0);
+                 ushort error = (ushort)minPos;
+                 uint index = minPos >> 16;
+
+                 indices[tileOffset] = (byte)index;
+                 errorSum += error;
+             }
+         }
+     }
+
+     return errorSum;
+ }
+
+ /// <summary>
+ /// SSE4.1 index selection for 3-bit indices: for every pixel of the tile, picks the entry of its subset's
+ /// eight-entry palette with the smallest squared distance and stores its index.
+ /// </summary>
+ /// <param name="tile">Packed pixel colors, <paramref name="w"/> * <paramref name="h"/> entries read row by row.</param>
+ /// <param name="w">Tile width in pixels.</param>
+ /// <param name="h">Tile height in pixels.</param>
+ /// <param name="endPoints0">First endpoint of every subset, packed; quantized before use.</param>
+ /// <param name="endPoints1">Second endpoint of every subset, packed; quantized before use.</param>
+ /// <param name="pBitValues">
+ /// P-bit values. When <paramref name="pBits"/> equals <paramref name="subsetCount"/>, entry <c>subset</c> is used for both
+ /// endpoints; otherwise, when <paramref name="pBits"/> is not 0, entries <c>subset * 2</c> and <c>subset * 2 + 1</c> are used.
+ /// </param>
+ /// <param name="indices">Receives the selected index of each pixel at position <c>ty * 4 + tx</c>.</param>
+ /// <param name="subsetCount">Number of subsets.</param>
+ /// <param name="partition">Partition number used to look up the subset of each pixel.</param>
+ /// <param name="colorDepth">Color bit depth forwarded to <c>Quantize</c>.</param>
+ /// <param name="alphaDepth">Alpha bit depth forwarded to <c>Quantize</c>; when 0, the bits of <c>RgbaColor8(0, 0, 0, 255)</c> are also set on the endpoints.</param>
+ /// <param name="pBits">Number of p-bits of the mode; 0 means the endpoints are quantized without a p-bit.</param>
+ /// <param name="alphaMask">Bits ORed into every pixel and endpoint before comparing.</param>
+ /// <returns>Sum of the errors of the selected entries, each saturated to 16 bits.</returns>
+ private static unsafe int Select3BitIndicesSse41(
+     ReadOnlySpan<uint> tile,
+     int w,
+     int h,
+     ReadOnlySpan<uint> endPoints0,
+     ReadOnlySpan<uint> endPoints1,
+     ReadOnlySpan<int> pBitValues,
+     Span<byte> indices,
+     int subsetCount,
+     int partition,
+     int colorDepth,
+     int alphaDepth,
+     int pBits,
+     uint alphaMask)
+ {
+     byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition];
+
+     uint alphaMaskForPalette = alphaMask;
+
+     if (alphaDepth == 0)
+     {
+         alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
+     }
+
+     // The interpolation weights are the same for every subset, so they are loaded and expanded
+     // once here rather than on each iteration of the subset loop.
+     Vector128<byte> rWeights;
+     Vector128<byte> lWeights;
+
+     fixed (byte* pWeights = BC67Tables.Weights[1], pInvWeights = BC67Tables.InverseWeights[1])
+     {
+         // 8 bytes each: one weight per palette entry.
+         rWeights = Sse2.LoadScalarVector128((ulong*)pWeights).AsByte();
+         lWeights = Sse2.LoadScalarVector128((ulong*)pInvWeights).AsByte();
+     }
+
+     // Build (inverse weight, weight) byte pairs and replicate each pair four times, once per channel.
+     Vector128<byte> iWeights = Sse2.UnpackLow(lWeights, rWeights);
+     Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
+     Vector128<byte> iWeights23 = Sse2.UnpackHigh(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
+     Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
+     Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
+     Vector128<byte> iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
+     Vector128<byte> iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
+
+     int errorSum = 0;
+
+     for (int subset = 0; subset < subsetCount; subset++)
+     {
+         int pBit0 = -1, pBit1 = -1;
+
+         if (pBits == subsetCount)
+         {
+             pBit0 = pBit1 = pBitValues[subset];
+         }
+         else if (pBits != 0)
+         {
+             pBit0 = pBitValues[subset * 2];
+             pBit1 = pBitValues[subset * 2 + 1];
+         }
+
+         RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoints0[subset]), colorDepth, alphaDepth, pBit0);
+         RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoints1[subset]), colorDepth, alphaDepth, pBit1);
+
+         Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte();
+         Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte();
+
+         // Interleave the endpoint bytes so each pair can be blended by one MultiplyAddAdjacent.
+         Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);
+
+         // Each vector holds two consecutive palette entries, four 16-bit channels each.
+         Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte()));
+         Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte()));
+         Vector128<short> pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte()));
+         Vector128<short> pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte()));
+
+         int i = 0;
+         for (int ty = 0; ty < h; ty++)
+         {
+             for (int tx = 0; tx < w; tx++, i++)
+             {
+                 int tileOffset = ty * 4 + tx;
+                 if (partitionTable[tileOffset] != subset)
+                 {
+                     continue;
+                 }
+
+                 uint c = tile[i] | alphaMask;
+
+                 Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte());
+
+                 Vector128<short> delta0 = Sse2.Subtract(color, pal0);
+                 Vector128<short> delta1 = Sse2.Subtract(color, pal1);
+                 Vector128<short> delta2 = Sse2.Subtract(color, pal2);
+                 Vector128<short> delta3 = Sse2.Subtract(color, pal3);
+
+                 Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0);
+                 Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1);
+                 Vector128<int> deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2);
+                 Vector128<int> deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3);
+
+                 Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1);
+                 Vector128<int> deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3);
+
+                 // Eight saturated 16-bit errors, one per palette entry.
+                 Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23);
+
+                 Vector128<ushort> min = Sse41.MinHorizontal(delta);
+
+                 // Lane 0 holds the minimum error, lane 1 the position where it was found.
+                 uint minPos = min.AsUInt32().GetElement(0);
+                 ushort error = (ushort)minPos;
+                 uint index = minPos >> 16;
+
+                 indices[tileOffset] = (byte)index;
+                 errorSum += error;
+             }
+         }
+     }
+
+     return errorSum;
+ }
+
+ /// <summary>
+ /// SSE4.1 index selection for 4-bit indices with a single subset: for every pixel of the tile, picks the
+ /// entry of the sixteen-entry palette with the smallest squared distance and stores its index.
+ /// </summary>
+ /// <param name="tile">Packed pixel colors, <paramref name="w"/> * <paramref name="h"/> entries read row by row.</param>
+ /// <param name="w">Tile width in pixels.</param>
+ /// <param name="h">Tile height in pixels.</param>
+ /// <param name="endPoint0">First endpoint, packed; quantized before use.</param>
+ /// <param name="endPoint1">Second endpoint, packed; quantized before use.</param>
+ /// <param name="pBitValues">P-bit values; entries 0 and 1 are used when <paramref name="pBits"/> is not 0.</param>
+ /// <param name="indices">Receives the selected index of each pixel at position <c>ty * 4 + tx</c>.</param>
+ /// <param name="partition">Not used by this implementation.</param>
+ /// <param name="colorDepth">Color bit depth forwarded to <c>Quantize</c>.</param>
+ /// <param name="alphaDepth">Alpha bit depth forwarded to <c>Quantize</c>; when 0, the bits of <c>RgbaColor8(0, 0, 0, 255)</c> are also set on the endpoints.</param>
+ /// <param name="pBits">Number of p-bits of the mode; 0 means the endpoints are quantized without a p-bit.</param>
+ /// <param name="alphaMask">Bits ORed into every pixel and endpoint before comparing.</param>
+ /// <returns>Sum of the errors of the selected entries, each saturated to 16 bits.</returns>
+ private static unsafe int Select4BitIndicesOneSubsetSse41(
+     ReadOnlySpan<uint> tile,
+     int w,
+     int h,
+     uint endPoint0,
+     uint endPoint1,
+     ReadOnlySpan<int> pBitValues,
+     Span<byte> indices,
+     int partition,
+     int colorDepth,
+     int alphaDepth,
+     int pBits,
+     uint alphaMask)
+ {
+     uint paletteAlphaMask = alphaDepth == 0
+         ? alphaMask | new RgbaColor8(0, 0, 0, 255).ToUInt32()
+         : alphaMask;
+
+     // -1 tells Quantize that there is no p-bit.
+     int firstPBit = -1;
+     int secondPBit = -1;
+
+     if (pBits != 0)
+     {
+         firstPBit = pBitValues[0];
+         secondPBit = pBitValues[1];
+     }
+
+     RgbaColor8 quantized0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, firstPBit);
+     RgbaColor8 quantized1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, secondPBit);
+
+     // Interleave the endpoint bytes so each pair can be blended by one MultiplyAddAdjacent.
+     Vector128<byte> endPointPairs = Sse2.UnpackLow(
+         Vector128.Create(quantized0.ToUInt32() | paletteAlphaMask).AsByte(),
+         Vector128.Create(quantized1.ToUInt32() | paletteAlphaMask).AsByte());
+
+     Vector128<byte> weights;
+     Vector128<byte> inverseWeights;
+
+     fixed (byte* pWeights = BC67Tables.Weights[2], pInvWeights = BC67Tables.InverseWeights[2])
+     {
+         // 16 bytes each: one weight per palette entry.
+         weights = Sse2.LoadVector128(pWeights);
+         inverseWeights = Sse2.LoadVector128(pInvWeights);
+     }
+
+     // Build (inverse weight, weight) byte pairs and replicate each pair four times, once per channel.
+     Vector128<short> pairsLow = Sse2.UnpackLow(inverseWeights, weights).AsInt16();
+     Vector128<short> pairsHigh = Sse2.UnpackHigh(inverseWeights, weights).AsInt16();
+     Vector128<short> weightPairs01 = Sse2.UnpackLow(pairsLow, pairsLow);
+     Vector128<short> weightPairs23 = Sse2.UnpackHigh(pairsLow, pairsLow);
+     Vector128<short> weightPairs45 = Sse2.UnpackLow(pairsHigh, pairsHigh);
+     Vector128<short> weightPairs67 = Sse2.UnpackHigh(pairsHigh, pairsHigh);
+
+     // Each vector below holds two consecutive palette entries, four 16-bit channels each.
+     Vector128<short> pal0 = ShiftRoundToNearest(
+         Ssse3.MultiplyAddAdjacent(endPointPairs, Sse2.UnpackLow(weightPairs01, weightPairs01).AsSByte()));
+     Vector128<short> pal1 = ShiftRoundToNearest(
+         Ssse3.MultiplyAddAdjacent(endPointPairs, Sse2.UnpackHigh(weightPairs01, weightPairs01).AsSByte()));
+     Vector128<short> pal2 = ShiftRoundToNearest(
+         Ssse3.MultiplyAddAdjacent(endPointPairs, Sse2.UnpackLow(weightPairs23, weightPairs23).AsSByte()));
+     Vector128<short> pal3 = ShiftRoundToNearest(
+         Ssse3.MultiplyAddAdjacent(endPointPairs, Sse2.UnpackHigh(weightPairs23, weightPairs23).AsSByte()));
+     Vector128<short> pal4 = ShiftRoundToNearest(
+         Ssse3.MultiplyAddAdjacent(endPointPairs, Sse2.UnpackLow(weightPairs45, weightPairs45).AsSByte()));
+     Vector128<short> pal5 = ShiftRoundToNearest(
+         Ssse3.MultiplyAddAdjacent(endPointPairs, Sse2.UnpackHigh(weightPairs45, weightPairs45).AsSByte()));
+     Vector128<short> pal6 = ShiftRoundToNearest(
+         Ssse3.MultiplyAddAdjacent(endPointPairs, Sse2.UnpackLow(weightPairs67, weightPairs67).AsSByte()));
+     Vector128<short> pal7 = ShiftRoundToNearest(
+         Ssse3.MultiplyAddAdjacent(endPointPairs, Sse2.UnpackHigh(weightPairs67, weightPairs67).AsSByte()));
+
+     int totalError = 0;
+     int pixelIndex = 0;
+
+     for (int ty = 0; ty < h; ty++)
+     {
+         for (int tx = 0; tx < w; tx++)
+         {
+             uint packedPixel = tile[pixelIndex++] | alphaMask;
+
+             // The pixel is duplicated so it lines up with both palette entries of a vector.
+             Vector128<short> pixel = Sse41.ConvertToVector128Int16(Vector128.Create(packedPixel).AsByte());
+
+             Vector128<short> diff0 = Sse2.Subtract(pixel, pal0);
+             Vector128<short> diff1 = Sse2.Subtract(pixel, pal1);
+             Vector128<short> diff2 = Sse2.Subtract(pixel, pal2);
+             Vector128<short> diff3 = Sse2.Subtract(pixel, pal3);
+             Vector128<short> diff4 = Sse2.Subtract(pixel, pal4);
+             Vector128<short> diff5 = Sse2.Subtract(pixel, pal5);
+             Vector128<short> diff6 = Sse2.Subtract(pixel, pal6);
+             Vector128<short> diff7 = Sse2.Subtract(pixel, pal7);
+
+             // Squared differences, summed per palette entry (four entries per vector).
+             Vector128<int> errorsA = Ssse3.HorizontalAdd(
+                 Sse2.MultiplyAddAdjacent(diff0, diff0),
+                 Sse2.MultiplyAddAdjacent(diff1, diff1));
+             Vector128<int> errorsB = Ssse3.HorizontalAdd(
+                 Sse2.MultiplyAddAdjacent(diff2, diff2),
+                 Sse2.MultiplyAddAdjacent(diff3, diff3));
+             Vector128<int> errorsC = Ssse3.HorizontalAdd(
+                 Sse2.MultiplyAddAdjacent(diff4, diff4),
+                 Sse2.MultiplyAddAdjacent(diff5, diff5));
+             Vector128<int> errorsD = Ssse3.HorizontalAdd(
+                 Sse2.MultiplyAddAdjacent(diff6, diff6),
+                 Sse2.MultiplyAddAdjacent(diff7, diff7));
+
+             // MinHorizontal works on eight 16-bit lanes, so the sixteen errors are reduced in two halves.
+             // For each half, bits 0-15 of the result are the minimum error and the bits above are its position.
+             uint bestLowHalf = Sse41.MinHorizontal(Sse41.PackUnsignedSaturate(errorsA, errorsB)).AsUInt32().GetElement(0);
+             uint bestHighHalf = Sse41.MinHorizontal(Sse41.PackUnsignedSaturate(errorsC, errorsD)).AsUInt32().GetElement(0);
+
+             // Ties go to the lower half, i.e. to the smaller index.
+             bool highHalfWins = (ushort)bestHighHalf < (ushort)bestLowHalf;
+             uint best = highHalfWins ? bestHighHalf : bestLowHalf;
+             uint indexBase = highHalfWins ? 8u : 0u;
+
+             totalError += (ushort)best;
+             indices[ty * 4 + tx] = (byte)(indexBase + (best >> 16));
+         }
+     }
+
+     return totalError;
+ }
+
+ /// <summary>
+ /// Adds 32 to every 16-bit lane and then shifts it right by 6 bits (logical shift), which divides by 64
+ /// rounding to nearest for non-negative lane values.
+ /// </summary>
+ /// <param name="x">Lanes to scale down.</param>
+ /// <returns>The scaled lanes.</returns>
+ private static Vector128<short> ShiftRoundToNearest(Vector128<short> x)
+ {
+     Vector128<short> half = Vector128.Create((short)32);
+     Vector128<short> biased = Sse2.Add(x, half);
+
+     return Sse2.ShiftRightLogical(biased, 6);
+ }
+
+ /// <summary>
+ /// Scalar index selection: builds the palette of every subset and, for every pixel of the tile, stores the
+ /// index of the entry of its subset's palette with the smallest squared distance.
+ /// </summary>
+ /// <param name="tile">Packed pixel colors, <paramref name="w"/> * <paramref name="h"/> entries read row by row.</param>
+ /// <param name="w">Tile width in pixels.</param>
+ /// <param name="h">Tile height in pixels.</param>
+ /// <param name="endPoints0">First endpoint of every subset, packed; quantized before use.</param>
+ /// <param name="endPoints1">Second endpoint of every subset, packed; quantized before use.</param>
+ /// <param name="pBitValues">
+ /// P-bit values. When <paramref name="pBits"/> equals <paramref name="subsetCount"/>, entry <c>subset</c> is used for both
+ /// endpoints; otherwise, when <paramref name="pBits"/> is not 0, entries <c>subset * 2</c> and <c>subset * 2 + 1</c> are used.
+ /// </param>
+ /// <param name="indices">Receives the selected index of each pixel at position <c>ty * 4 + tx</c>.</param>
+ /// <param name="subsetCount">Number of subsets.</param>
+ /// <param name="partition">Partition number used to look up the subset of each pixel.</param>
+ /// <param name="indexBitCount">Index width forwarded to <c>Interpolate</c>.</param>
+ /// <param name="indexCount">Number of palette entries per subset.</param>
+ /// <param name="colorDepth">Color bit depth forwarded to <c>Quantize</c>.</param>
+ /// <param name="alphaDepth">Alpha bit depth forwarded to <c>Quantize</c>; when 0, the bits of <c>RgbaColor8(0, 0, 0, 255)</c> are also set on the endpoints.</param>
+ /// <param name="pBits">Number of p-bits of the mode; 0 means the endpoints are quantized without a p-bit.</param>
+ /// <param name="alphaMask">Bits ORed into every pixel and endpoint before comparing.</param>
+ /// <returns>Sum of the squared distances of the selected entries.</returns>
+ private static int SelectIndicesFallback(
+     ReadOnlySpan<uint> tile,
+     int w,
+     int h,
+     ReadOnlySpan<uint> endPoints0,
+     ReadOnlySpan<uint> endPoints1,
+     ReadOnlySpan<int> pBitValues,
+     Span<byte> indices,
+     int subsetCount,
+     int partition,
+     int indexBitCount,
+     int indexCount,
+     int colorDepth,
+     int alphaDepth,
+     int pBits,
+     uint alphaMask)
+ {
+     int errorSum = 0;
+
+     uint alphaMaskForPalette = alphaMask;
+
+     if (alphaDepth == 0)
+     {
+         alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
+     }
+
+     // One palette of indexCount entries per subset, stored back to back.
+     Span<uint> palette = stackalloc uint[subsetCount * indexCount];
+
+     for (int subset = 0; subset < subsetCount; subset++)
+     {
+         int palBase = subset * indexCount;
+
+         int pBit0 = -1, pBit1 = -1;
+
+         if (pBits == subsetCount)
+         {
+             pBit0 = pBit1 = pBitValues[subset];
+         }
+         else if (pBits != 0)
+         {
+             pBit0 = pBitValues[subset * 2];
+             pBit1 = pBitValues[subset * 2 + 1];
+         }
+
+         RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoints0[subset]), colorDepth, alphaDepth, pBit0);
+         RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoints1[subset]), colorDepth, alphaDepth, pBit1);
+
+         // Apply the mask directly on the packed representation of the quantized endpoints.
+         Unsafe.As<RgbaColor8, uint>(ref c0) |= alphaMaskForPalette;
+         Unsafe.As<RgbaColor8, uint>(ref c1) |= alphaMaskForPalette;
+
+         // The first and last entries are the endpoints themselves; the rest are interpolated.
+         palette[palBase + 0] = c0.ToUInt32();
+         palette[palBase + indexCount - 1] = c1.ToUInt32();
+
+         for (int j = 1; j < indexCount - 1; j++)
+         {
+             palette[palBase + j] = Interpolate(c0, c1, j, indexBitCount).ToUInt32();
+         }
+     }
+
+     // Looked up once instead of once per pixel.
+     byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition];
+
+     int i = 0;
+     for (int ty = 0; ty < h; ty++)
+     {
+         for (int tx = 0; tx < w; tx++)
+         {
+             int tileOffset = ty * 4 + tx;
+             int palBase = partitionTable[tileOffset] * indexCount;
+
+             // Converted once per pixel instead of once per palette entry.
+             RgbaColor32 color = RgbaColor8.FromUInt32(tile[i++] | alphaMask).GetColor32();
+
+             int bestMatchScore = int.MaxValue;
+             int bestMatchIndex = 0;
+
+             for (int j = 0; j < indexCount; j++)
+             {
+                 int score = SquaredDifference(color, RgbaColor8.FromUInt32(palette[palBase + j]).GetColor32());
+
+                 // Strict comparison: on ties the lowest index is kept.
+                 if (score < bestMatchScore)
+                 {
+                     bestMatchScore = score;
+                     bestMatchIndex = j;
+                 }
+             }
+
+             indices[tileOffset] = (byte)bestMatchIndex;
+             errorSum += bestMatchScore;
+         }
+     }
+
+     return errorSum;
+ }
+
+ /// <summary>
+ /// Computes the dot product of the difference between two colors with itself.
+ /// </summary>
+ /// <param name="color1">First color.</param>
+ /// <param name="color2">Second color.</param>
+ /// <returns><c>RgbaColor32.Dot(d, d)</c> where <c>d = color1 - color2</c>.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static int SquaredDifference(RgbaColor32 color1, RgbaColor32 color2)
+ {
+     RgbaColor32 difference = color1 - color2;
+
+     return RgbaColor32.Dot(difference, difference);
+ }
+
+ /// <summary>
+ /// Interpolates between two 8-bit colors by widening them with <c>GetColor32</c>, interpolating, and
+ /// narrowing the result back with <c>GetColor8</c>.
+ /// </summary>
+ /// <param name="color1">Color used at weight index 0.</param>
+ /// <param name="color2">Color used at the highest weight index.</param>
+ /// <param name="weightIndex">Position between the two colors.</param>
+ /// <param name="indexBitCount">Number of bits of the index (2 to 4).</param>
+ /// <returns>The interpolated color.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static RgbaColor8 Interpolate(RgbaColor8 color1, RgbaColor8 color2, int weightIndex, int indexBitCount)
+ {
+     RgbaColor32 wide1 = color1.GetColor32();
+     RgbaColor32 wide2 = color2.GetColor32();
+     RgbaColor32 blended = Interpolate(wide1, wide2, weightIndex, indexBitCount);
+
+     return blended.GetColor8();
+ }
+
+ /// <summary>
+ /// Interpolates between two colors using a weight on a 0 to 64 scale computed from the index position.
+ /// </summary>
+ /// <param name="color1">Color used at weight index 0.</param>
+ /// <param name="color2">Color used at the highest weight index.</param>
+ /// <param name="weightIndex">Position between the two colors.</param>
+ /// <param name="indexBitCount">Number of bits of the index (asserted to be 2 to 4).</param>
+ /// <returns><c>(color1 * (64 - weight) + color2 * weight + 32) >> 6</c>, applied to every channel.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static RgbaColor32 Interpolate(RgbaColor32 color1, RgbaColor32 color2, int weightIndex, int indexBitCount)
+ {
+     Debug.Assert(indexBitCount >= 2 && indexBitCount <= 4);
+
+     // weightIndex * 64 / maxIndex, computed with one extra fractional bit so that it rounds to nearest.
+     int maxIndex = (1 << indexBitCount) - 1;
+     int weight = (((weightIndex << 7) / maxIndex) + 1) >> 1;
+
+     RgbaColor32 fromColor2 = color2 * new RgbaColor32(weight);
+     RgbaColor32 fromColor1 = color1 * new RgbaColor32(64 - weight);
+
+     // Adding 32 before the shift rounds the division by 64 to nearest.
+     return (fromColor1 + fromColor2 + new RgbaColor32(32)) >> 6;
+ }
+
+ /// <summary>
+ /// Interpolates between two colors using one table weight for the R, G and B channels and another one
+ /// for the A channel.
+ /// </summary>
+ /// <param name="color1">Color used at weight 0.</param>
+ /// <param name="color2">Color used at weight 64.</param>
+ /// <param name="colorWeightIndex">Index into the weight table selected by <paramref name="colorIndexBitCount"/>.</param>
+ /// <param name="alphaWeightIndex">Index into the weight table selected by <paramref name="alphaIndexBitCount"/>.</param>
+ /// <param name="colorIndexBitCount">Number of bits of the color index (asserted to be 2 to 4).</param>
+ /// <param name="alphaIndexBitCount">Number of bits of the alpha index (asserted to be 2 to 4).</param>
+ /// <returns><c>(color1 * (64 - weight) + color2 * weight + 32) >> 6</c>, with the per-channel weight described above.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static RgbaColor32 Interpolate(
+     RgbaColor32 color1,
+     RgbaColor32 color2,
+     int colorWeightIndex,
+     int alphaWeightIndex,
+     int colorIndexBitCount,
+     int alphaIndexBitCount)
+ {
+     Debug.Assert(colorIndexBitCount >= 2 && colorIndexBitCount <= 4);
+     Debug.Assert(alphaIndexBitCount >= 2 && alphaIndexBitCount <= 4);
+
+     // The weight tables are indexed by (index bit count - 2).
+     int rgbWeight = BC67Tables.Weights[colorIndexBitCount - 2][colorWeightIndex];
+     int aWeight = BC67Tables.Weights[alphaIndexBitCount - 2][alphaWeightIndex];
+
+     RgbaColor32 weights = new RgbaColor32(rgbWeight, rgbWeight, rgbWeight, aWeight);
+     RgbaColor32 inverseWeights = new RgbaColor32(64) - weights;
+
+     // Adding 32 before the shift rounds the division by 64 to nearest.
+     return (color1 * inverseWeights + color2 * weights + new RgbaColor32(32)) >> 6;
+ }
+
+ /// <summary>
+ /// Quantizes a color to the given bit depths and expands it back to 8 bits per channel.
+ /// </summary>
+ /// <remarks>
+ /// When <paramref name="alphaBits"/> is 0 the R, G and B channels are processed together on a packed
+ /// value and alpha is copied unchanged; otherwise the work is delegated to <c>QuantizeFallback</c>.
+ /// </remarks>
+ /// <param name="color">Color to quantize.</param>
+ /// <param name="colorBits">Bit depth of the R, G and B channels; the lookup tables are indexed by <c>colorBits - 4</c>.</param>
+ /// <param name="alphaBits">Bit depth of the alpha channel, or 0 to leave alpha untouched.</param>
+ /// <param name="pBit">P-bit value (0 or 1), or a negative value when there is no p-bit.</param>
+ /// <returns>The quantized color, expanded back to 8 bits per channel.</returns>
+ public static RgbaColor8 Quantize(RgbaColor8 color, int colorBits, int alphaBits, int pBit = -1)
+ {
+     if (alphaBits != 0)
+     {
+         return QuantizeFallback(color, colorBits, alphaBits, pBit);
+     }
+
+     int expandShift = 8 - colorBits;
+
+     uint packed;
+
+     if (pBit < 0)
+     {
+         byte[] lut = _quantizationLutNoPBit[colorBits - 4];
+
+         // Per channel, selects the low bits that receive a copy of the high bits.
+         uint replicationMask = (0xffu >> colorBits) * 0x10101;
+
+         packed = lut[color.R] | ((uint)lut[color.G] << 8) | ((uint)lut[color.B] << 16);
+
+         // Move the quantized values to the top of each byte, then fill the low bits with the high bits.
+         packed <<= expandShift;
+         packed |= (packed >> colorBits) & replicationMask;
+     }
+     else
+     {
+         byte[] lut = _quantizationLut[colorBits - 4];
+
+         Debug.Assert(pBit <= 1);
+
+         // The second half of the table (offset 256) is used when the p-bit is 1.
+         int lutBase = pBit << 8;
+
+         // Per channel, selects the low bits below the p-bit that receive a copy of the high bits.
+         uint replicationMask = (0xffu >> (colorBits + 1)) * 0x10101;
+
+         packed = lut[color.R | lutBase];
+         packed |= (uint)lut[color.G | lutBase] << 8;
+         packed |= (uint)lut[color.B | lutBase] << 16;
+
+         // Layout of each byte: quantized value, p-bit, replicated high bits.
+         packed <<= expandShift;
+         packed |= (packed >> (colorBits + 1)) & replicationMask;
+         packed |= ((uint)pBit * 0x10101) << (expandShift - 1);
+     }
+
+     packed |= (uint)color.A << 24;
+
+     return RgbaColor8.FromUInt32(packed);
+ }
+
+ /// <summary>
+ /// Quantizes and then unquantizes every channel of a color individually.
+ /// </summary>
+ /// <param name="color">Color to quantize.</param>
+ /// <param name="colorBits">Bit depth of the R, G and B channels.</param>
+ /// <param name="alphaBits">Bit depth of the alpha channel, or 0 to copy alpha unchanged.</param>
+ /// <param name="pBit">P-bit value (0 or 1), or a negative value when there is no p-bit.</param>
+ /// <returns>The quantized color, expanded back to 8 bits per channel.</returns>
+ private static RgbaColor8 QuantizeFallback(RgbaColor8 color, int colorBits, int alphaBits, int pBit = -1)
+ {
+     byte red = UnquantizeComponent(QuantizeComponent(color.R, colorBits, pBit), colorBits, pBit);
+     byte green = UnquantizeComponent(QuantizeComponent(color.G, colorBits, pBit), colorBits, pBit);
+     byte blue = UnquantizeComponent(QuantizeComponent(color.B, colorBits, pBit), colorBits, pBit);
+
+     byte alpha = color.A;
+
+     if (alphaBits != 0)
+     {
+         alpha = UnquantizeComponent(QuantizeComponent(color.A, alphaBits, pBit), alphaBits, pBit);
+     }
+
+     return new RgbaColor8(red, green, blue, alpha);
+ }
+
+ /// <summary>
+ /// Looks up the quantized value of an 8-bit component in the precomputed quantization tables.
+ /// </summary>
+ /// <param name="component">8-bit component value.</param>
+ /// <param name="bits">Target bit depth; the tables are indexed by <c>bits - 4</c>.</param>
+ /// <param name="pBit">P-bit value (0 or 1), or a negative value when there is no p-bit.</param>
+ /// <returns>The table entry for the component.</returns>
+ public static byte QuantizeComponent(byte component, int bits, int pBit = -1)
+ {
+     if (pBit < 0)
+     {
+         return _quantizationLutNoPBit[bits - 4][component];
+     }
+
+     // Entries for p-bit 1 start at offset 256.
+     return _quantizationLut[bits - 4][component | (pBit << 8)];
+ }
+
+ /// <summary>
+ /// Picks the quantized value for an 8-bit component among the truncated value and its two neighbours,
+ /// comparing how far each candidate is from the component once the low bits are filled back in.
+ /// </summary>
+ /// <param name="component">8-bit component value.</param>
+ /// <param name="bits">Target bit depth.</param>
+ /// <param name="pBit">P-bit value (0 or 1), or a negative value when there is no p-bit.</param>
+ /// <returns>The selected quantized value.</returns>
+ private static byte QuantizeComponentForLut(byte component, int bits, int pBit = -1)
+ {
+     int shift = 8 - bits;
+
+     // Low bits appended to each candidate; the same value is used for all three.
+     int lowBits = component >> bits;
+
+     if (pBit >= 0)
+     {
+         Debug.Assert(pBit <= 1);
+
+         // Make room for the p-bit right below the quantized value.
+         lowBits >>= 1;
+         lowBits |= pBit << (shift - 1);
+     }
+
+     int truncated = component >> shift;
+     int below = Math.Max(truncated - 1, 0);
+     int above = Math.Min(truncated + 1, (1 << bits) - 1);
+
+     int errorTruncated = FastAbs(((truncated << shift) | lowBits) - component);
+     int errorBelow = component - ((below << shift) | lowBits);
+     int errorAbove = ((above << shift) | lowBits) - component;
+
+     // The truncated value only wins when it is strictly better than both neighbours.
+     if (errorTruncated < errorBelow && errorTruncated < errorAbove)
+     {
+         return (byte)truncated;
+     }
+
+     if (errorBelow < errorAbove)
+     {
+         return (byte)below;
+     }
+
+     return (byte)above;
+ }
+
+ /// <summary>
+ /// Branch-free absolute value.
+ /// </summary>
+ /// <param name="x">Value to take the absolute value of.</param>
+ /// <returns>The absolute value of <paramref name="x"/>.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static int FastAbs(int x)
+ {
+     // All ones when x is negative, zero otherwise.
+     int signMask = x >> 31;
+
+     // For negative x this is (~x) + 1, i.e. -x; for non-negative x it is x unchanged.
+     return (x ^ signMask) - signMask;
+ }
+
+ /// <summary>
+ /// Expands a quantized component back to 8 bits: the value is moved to the top bits and the remaining low
+ /// bits are filled with the p-bit (when present) followed by a copy of the high bits.
+ /// </summary>
+ /// <param name="component">Quantized component value.</param>
+ /// <param name="bits">Bit depth of <paramref name="component"/>.</param>
+ /// <param name="pBit">P-bit value (0 or 1), or a negative value when there is no p-bit.</param>
+ /// <returns>The expanded 8-bit value.</returns>
+ private static byte UnquantizeComponent(byte component, int bits, int pBit)
+ {
+     int shift = 8 - bits;
+     int expanded = component << shift;
+
+     if (pBit < 0)
+     {
+         return (byte)(expanded | (expanded >> bits));
+     }
+
+     Debug.Assert(pBit <= 1);
+
+     // The replicated bits go one position lower to leave room for the p-bit.
+     expanded |= expanded >> (bits + 1);
+
+     return (byte)(expanded | (pBit << (shift - 1)));
+ }
+ }
+}
diff --git a/Ryujinx.Graphics.Texture/Utils/BC7ModeInfo.cs b/Ryujinx.Graphics.Texture/Utils/BC7ModeInfo.cs
new file mode 100644
index 00000000..749324bf
--- /dev/null
+++ b/Ryujinx.Graphics.Texture/Utils/BC7ModeInfo.cs
@@ -0,0 +1,37 @@
+namespace Ryujinx.Graphics.Texture.Utils
+{
+ /// <summary>
+ /// Immutable set of parameters describing a BC7 block mode.
+ /// </summary>
+ /// <remarks>
+ /// The struct only stores the values handed to its constructor; their interpretation is left to the
+ /// code that reads them. It is declared <c>readonly</c> so that the compiler enforces the immutability
+ /// the individual fields already have.
+ /// </remarks>
+ readonly struct BC7ModeInfo
+ {
+     /// <summary>Value of the <c>subsetCount</c> constructor argument.</summary>
+     public readonly int SubsetCount;
+
+     /// <summary>Value of the <c>partitionBitsCount</c> constructor argument.</summary>
+     public readonly int PartitionBitCount;
+
+     /// <summary>Value of the <c>pBits</c> constructor argument.</summary>
+     public readonly int PBits;
+
+     /// <summary>Value of the <c>rotationBitCount</c> constructor argument.</summary>
+     public readonly int RotationBitCount;
+
+     /// <summary>Value of the <c>indexModeBitCount</c> constructor argument.</summary>
+     public readonly int IndexModeBitCount;
+
+     /// <summary>Value of the <c>colorIndexBitCount</c> constructor argument.</summary>
+     public readonly int ColorIndexBitCount;
+
+     /// <summary>Value of the <c>alphaIndexBitCount</c> constructor argument.</summary>
+     public readonly int AlphaIndexBitCount;
+
+     /// <summary>Value of the <c>colorDepth</c> constructor argument.</summary>
+     public readonly int ColorDepth;
+
+     /// <summary>Value of the <c>alphaDepth</c> constructor argument.</summary>
+     public readonly int AlphaDepth;
+
+     /// <summary>
+     /// Creates a mode description; every argument is stored in the field of the matching name
+     /// (<paramref name="partitionBitsCount"/> goes to <see cref="PartitionBitCount"/>).
+     /// </summary>
+     public BC7ModeInfo(
+         int subsetCount,
+         int partitionBitsCount,
+         int pBits,
+         int rotationBitCount,
+         int indexModeBitCount,
+         int colorIndexBitCount,
+         int alphaIndexBitCount,
+         int colorDepth,
+         int alphaDepth)
+     {
+         SubsetCount = subsetCount;
+         PartitionBitCount = partitionBitsCount;
+         PBits = pBits;
+         RotationBitCount = rotationBitCount;
+         IndexModeBitCount = indexModeBitCount;
+         ColorIndexBitCount = colorIndexBitCount;
+         AlphaIndexBitCount = alphaIndexBitCount;
+         ColorDepth = colorDepth;
+         AlphaDepth = alphaDepth;
+     }
+ }
+} \ No newline at end of file
diff --git a/Ryujinx.Graphics.Texture/Utils/Block.cs b/Ryujinx.Graphics.Texture/Utils/Block.cs
new file mode 100644
index 00000000..a8bae077
--- /dev/null
+++ b/Ryujinx.Graphics.Texture/Utils/Block.cs
@@ -0,0 +1,55 @@
+namespace Ryujinx.Graphics.Texture.Utils
+{
+ /// <summary>
+ /// 128-bit value stored as two 64-bit halves, with helpers to write and read bit fields at a running bit offset.
+ /// </summary>
+ struct Block
+ {
+     // Bits 0 to 63 of the block.
+     public ulong Low;
+
+     // Bits 64 to 127 of the block.
+     public ulong High;
+
+     /// <summary>
+     /// ORs <paramref name="value"/> into the block starting at bit <paramref name="offset"/>, then advances
+     /// <paramref name="offset"/> by <paramref name="bits"/>.
+     /// </summary>
+     /// <remarks>
+     /// <paramref name="value"/> is not masked to <paramref name="bits"/> bits, and existing bits are never cleared.
+     /// </remarks>
+     /// <param name="value">Value to write.</param>
+     /// <param name="offset">Bit position to write at; updated to point right after the written field.</param>
+     /// <param name="bits">Width of the field in bits.</param>
+     public void Encode(ulong value, ref int offset, int bits)
+     {
+         if (offset < 64)
+         {
+             Low |= value << offset;
+
+             // A field that crosses bit 64 has its upper part stored in the high half.
+             bool crossesHalves = offset + bits > 64;
+
+             if (crossesHalves)
+             {
+                 High |= value >> (64 - offset);
+             }
+         }
+         else
+         {
+             High |= value << (offset - 64);
+         }
+
+         offset += bits;
+     }
+
+     /// <summary>
+     /// Reads a field of <paramref name="bits"/> bits starting at bit <paramref name="offset"/>, then advances
+     /// <paramref name="offset"/> by <paramref name="bits"/>.
+     /// </summary>
+     /// <param name="offset">Bit position to read from; updated to point right after the read field.</param>
+     /// <param name="bits">Width of the field in bits.</param>
+     /// <returns>The field value, masked to <paramref name="bits"/> bits.</returns>
+     public ulong Decode(ref int offset, int bits)
+     {
+         // 1UL << 64 would wrap around, so a full-width field gets its mask explicitly.
+         ulong fieldMask = ulong.MaxValue;
+
+         if (bits != 64)
+         {
+             fieldMask = (1UL << bits) - 1;
+         }
+
+         ulong raw;
+
+         if (offset < 64)
+         {
+             raw = Low >> offset;
+
+             // A field that crosses bit 64 takes its upper part from the high half.
+             if (offset + bits > 64)
+             {
+                 raw |= High << (64 - offset);
+             }
+         }
+         else
+         {
+             raw = High >> (offset - 64);
+         }
+
+         offset += bits;
+
+         return raw & fieldMask;
+     }
+ }
+} \ No newline at end of file
diff --git a/Ryujinx.Graphics.Texture/Utils/RgbaColor32.cs b/Ryujinx.Graphics.Texture/Utils/RgbaColor32.cs
new file mode 100644
index 00000000..582044d9
--- /dev/null
+++ b/Ryujinx.Graphics.Texture/Utils/RgbaColor32.cs
@@ -0,0 +1,229 @@
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace Ryujinx.Graphics.Texture.Utils
+{
+    /// <summary>
+    /// RGBA color with one signed 32-bit integer per channel, stored in a single
+    /// <see cref="Vector128{T}"/> (element 0 = R, 1 = G, 2 = B, 3 = A).
+    /// </summary>
+    /// <remarks>
+    /// Most operations have an x86 intrinsic path and a scalar fallback.
+    /// Both paths are expected to produce the same result for the same input.
+    /// </remarks>
+    struct RgbaColor32 : IEquatable<RgbaColor32>
+    {
+        private Vector128<int> _color;
+
+        /// <summary>Red channel (vector element 0).</summary>
+        public int R
+        {
+            get => _color.GetElement(0);
+            set => _color = _color.WithElement(0, value);
+        }
+
+        /// <summary>Green channel (vector element 1).</summary>
+        public int G
+        {
+            get => _color.GetElement(1);
+            set => _color = _color.WithElement(1, value);
+        }
+
+        /// <summary>Blue channel (vector element 2).</summary>
+        public int B
+        {
+            get => _color.GetElement(2);
+            set => _color = _color.WithElement(2, value);
+        }
+
+        /// <summary>Alpha channel (vector element 3).</summary>
+        public int A
+        {
+            get => _color.GetElement(3);
+            set => _color = _color.WithElement(3, value);
+        }
+
+        /// <summary>Wraps an existing vector whose elements are R, G, B and A in that order.</summary>
+        public RgbaColor32(Vector128<int> color)
+        {
+            _color = color;
+        }
+
+        /// <summary>Creates a color from its four channel values.</summary>
+        public RgbaColor32(int r, int g, int b, int a)
+        {
+            _color = Vector128.Create(r, g, b, a);
+        }
+
+        /// <summary>Creates a color with all four channels set to <paramref name="scalar"/>.</summary>
+        public RgbaColor32(int scalar)
+        {
+            _color = Vector128.Create(scalar);
+        }
+
+        /// <summary>Per-channel addition.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static RgbaColor32 operator +(RgbaColor32 x, RgbaColor32 y)
+        {
+            if (Sse2.IsSupported)
+            {
+                return new RgbaColor32(Sse2.Add(x._color, y._color));
+            }
+            else
+            {
+                return new RgbaColor32(x.R + y.R, x.G + y.G, x.B + y.B, x.A + y.A);
+            }
+        }
+
+        /// <summary>Per-channel subtraction.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static RgbaColor32 operator -(RgbaColor32 x, RgbaColor32 y)
+        {
+            if (Sse2.IsSupported)
+            {
+                return new RgbaColor32(Sse2.Subtract(x._color, y._color));
+            }
+            else
+            {
+                return new RgbaColor32(x.R - y.R, x.G - y.G, x.B - y.B, x.A - y.A);
+            }
+        }
+
+        /// <summary>Per-channel multiplication, keeping the low 32 bits of each product.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static RgbaColor32 operator *(RgbaColor32 x, RgbaColor32 y)
+        {
+            if (Sse41.IsSupported)
+            {
+                return new RgbaColor32(Sse41.MultiplyLow(x._color, y._color));
+            }
+            else
+            {
+                return new RgbaColor32(x.R * y.R, x.G * y.G, x.B * y.B, x.A * y.A);
+            }
+        }
+
+        /// <summary>
+        /// Per-channel integer division. No channel of <paramref name="y"/> is checked for zero;
+        /// use <see cref="DivideGuarded(RgbaColor32, RgbaColor32, int)"/> when a divisor may be zero.
+        /// </summary>
+        public static RgbaColor32 operator /(RgbaColor32 x, RgbaColor32 y)
+        {
+            return new RgbaColor32(x.R / y.R, x.G / y.G, x.B / y.B, x.A / y.A);
+        }
+
+        /// <summary>
+        /// Per-channel integer division that yields <paramref name="resultIfZero"/> for every channel
+        /// whose divisor is zero.
+        /// </summary>
+        public static RgbaColor32 DivideGuarded(RgbaColor32 x, RgbaColor32 y, int resultIfZero)
+        {
+            return new RgbaColor32(
+                DivideGuarded(x.R, y.R, resultIfZero),
+                DivideGuarded(x.G, y.G, resultIfZero),
+                DivideGuarded(x.B, y.B, resultIfZero),
+                DivideGuarded(x.A, y.A, resultIfZero));
+        }
+
+        /// <summary>
+        /// Shifts every channel left by <paramref name="shift"/> bits. The count is truncated to a byte,
+        /// and a truncated count above 31 yields zero in every channel.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static RgbaColor32 operator <<(RgbaColor32 x, int shift)
+        {
+            if (Sse2.IsSupported)
+            {
+                return new RgbaColor32(Sse2.ShiftLeftLogical(x._color, (byte)shift));
+            }
+            else
+            {
+                // Mirror the intrinsic path: the count is a byte, and counts past the lane width clear
+                // the lane instead of wrapping modulo 32 as the C# shift operator would.
+                int count = (byte)shift;
+
+                if (count > 31)
+                {
+                    return new RgbaColor32(0);
+                }
+
+                return new RgbaColor32(x.R << count, x.G << count, x.B << count, x.A << count);
+            }
+        }
+
+        /// <summary>
+        /// Shifts every channel right by <paramref name="shift"/> bits, filling with zeros (logical shift).
+        /// The count is truncated to a byte, and a truncated count above 31 yields zero in every channel.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static RgbaColor32 operator >>(RgbaColor32 x, int shift)
+        {
+            if (Sse2.IsSupported)
+            {
+                return new RgbaColor32(Sse2.ShiftRightLogical(x._color, (byte)shift));
+            }
+            else
+            {
+                // The intrinsic path is a logical shift. A plain ">>" on int is arithmetic and would give
+                // different results for negative channels, so the fallback shifts the unsigned bit pattern.
+                int count = (byte)shift;
+
+                if (count > 31)
+                {
+                    return new RgbaColor32(0);
+                }
+
+                return new RgbaColor32(
+                    ShiftRightLogical(x.R, count),
+                    ShiftRightLogical(x.G, count),
+                    ShiftRightLogical(x.B, count),
+                    ShiftRightLogical(x.A, count));
+            }
+        }
+
+        public static bool operator ==(RgbaColor32 x, RgbaColor32 y)
+        {
+            return x.Equals(y);
+        }
+
+        public static bool operator !=(RgbaColor32 x, RgbaColor32 y)
+        {
+            return !x.Equals(y);
+        }
+
+        /// <summary>Sum of the per-channel products of <paramref name="x"/> and <paramref name="y"/>.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static int Dot(RgbaColor32 x, RgbaColor32 y)
+        {
+            if (Sse41.IsSupported)
+            {
+                Vector128<int> product = Sse41.MultiplyLow(x._color, y._color);
+
+                // Two horizontal adds fold the four products into element 0.
+                Vector128<int> sum = Ssse3.HorizontalAdd(product, product);
+                sum = Ssse3.HorizontalAdd(sum, sum);
+                return sum.GetElement(0);
+            }
+            else
+            {
+                return x.R * y.R + x.G * y.G + x.B * y.B + x.A * y.A;
+            }
+        }
+
+        /// <summary>Per-channel signed maximum.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static RgbaColor32 Max(RgbaColor32 x, RgbaColor32 y)
+        {
+            if (Sse41.IsSupported)
+            {
+                return new RgbaColor32(Sse41.Max(x._color, y._color));
+            }
+            else
+            {
+                return new RgbaColor32(Math.Max(x.R, y.R), Math.Max(x.G, y.G), Math.Max(x.B, y.B), Math.Max(x.A, y.A));
+            }
+        }
+
+        /// <summary>Per-channel signed minimum.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static RgbaColor32 Min(RgbaColor32 x, RgbaColor32 y)
+        {
+            if (Sse41.IsSupported)
+            {
+                return new RgbaColor32(Sse41.Min(x._color, y._color));
+            }
+            else
+            {
+                return new RgbaColor32(Math.Min(x.R, y.R), Math.Min(x.G, y.G), Math.Min(x.B, y.B), Math.Min(x.A, y.A));
+            }
+        }
+
+        /// <summary>
+        /// Converts to an 8-bit-per-channel color, clamping every channel to the 0-255 range.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public RgbaColor8 GetColor8()
+        {
+            if (Sse2.IsSupported)
+            {
+                Vector128<int> temp = _color;
+
+                // The 16 -> 8 bit pack reads its input as SIGNED 16-bit values, so the 32 -> 16 bit step
+                // must saturate to the signed range. With unsigned saturation here, a channel of 32768 or
+                // more would be seen as negative by the next pack and clamp to 0 instead of 255.
+                Vector128<short> color16 = Sse2.PackSignedSaturate(temp, temp);
+                Vector128<byte> color8 = Sse2.PackUnsignedSaturate(color16, color16);
+                uint color = color8.AsUInt32().GetElement(0);
+                return Unsafe.As<uint, RgbaColor8>(ref color);
+            }
+            else
+            {
+                return new RgbaColor8(ClampByte(R), ClampByte(G), ClampByte(B), ClampByte(A));
+            }
+        }
+
+        private static int DivideGuarded(int dividend, int divisor, int resultIfZero)
+        {
+            if (divisor == 0)
+            {
+                return resultIfZero;
+            }
+
+            return dividend / divisor;
+        }
+
+        private static int ShiftRightLogical(int value, int count)
+        {
+            // Shift the raw bit pattern so the vacated high bits are filled with zeros.
+            return unchecked((int)((uint)value >> count));
+        }
+
+        private static byte ClampByte(int value)
+        {
+            return (byte)Math.Clamp(value, 0, 255);
+        }
+
+        public override int GetHashCode()
+        {
+            return HashCode.Combine(R, G, B, A);
+        }
+
+        public override bool Equals(object obj)
+        {
+            return obj is RgbaColor32 other && Equals(other);
+        }
+
+        public bool Equals(RgbaColor32 other)
+        {
+            return _color.Equals(other._color);
+        }
+    }
+}
diff --git a/Ryujinx.Graphics.Texture/Utils/RgbaColor8.cs b/Ryujinx.Graphics.Texture/Utils/RgbaColor8.cs
new file mode 100644
index 00000000..0edf1cce
--- /dev/null
+++ b/Ryujinx.Graphics.Texture/Utils/RgbaColor8.cs
@@ -0,0 +1,84 @@
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace Ryujinx.Graphics.Texture.Utils
+{
+    /// <summary>
+    /// RGBA color with one byte per channel, declared in R, G, B, A order so the whole value
+    /// can be reinterpreted as a single 32-bit integer.
+    /// </summary>
+    struct RgbaColor8 : IEquatable<RgbaColor8>
+    {
+        public byte R;
+        public byte G;
+        public byte B;
+        public byte A;
+
+        /// <summary>Creates a color from its four channel values.</summary>
+        public RgbaColor8(byte r, byte g, byte b, byte a)
+        {
+            (R, G, B, A) = (r, g, b, a);
+        }
+
+        /// <summary>Reinterprets the four bytes of <paramref name="color"/>, in memory order, as R, G, B, A.</summary>
+        public static RgbaColor8 FromUInt32(uint color) => Unsafe.As<uint, RgbaColor8>(ref color);
+
+        public static bool operator ==(RgbaColor8 x, RgbaColor8 y) => x.Equals(y);
+
+        public static bool operator !=(RgbaColor8 x, RgbaColor8 y) => !(x == y);
+
+        /// <summary>Widens every channel to 32 bits without changing its value.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public RgbaColor32 GetColor32()
+        {
+            if (!Sse41.IsSupported)
+            {
+                return new RgbaColor32(R, G, B, A);
+            }
+
+            // Place the four packed bytes in the low lane, then zero-extend each byte to a 32-bit element.
+            uint packed = Unsafe.As<RgbaColor8, uint>(ref this);
+            Vector128<byte> bytes = Vector128.CreateScalarUnsafe(packed).AsByte();
+
+            return new RgbaColor32(Sse41.ConvertToVector128Int32(bytes));
+        }
+
+        /// <summary>Reinterprets the four channel bytes, in memory order, as a 32-bit unsigned integer.</summary>
+        public uint ToUInt32() => Unsafe.As<RgbaColor8, uint>(ref this);
+
+        public override int GetHashCode() => HashCode.Combine(R, G, B, A);
+
+        public override bool Equals(object obj) => obj is RgbaColor8 other && Equals(other);
+
+        /// <summary>Two colors are equal when all four channels match.</summary>
+        public bool Equals(RgbaColor8 other)
+        {
+            // The struct is exactly its four channel bytes, so comparing the packed values compares every channel.
+            return ToUInt32() == other.ToUInt32();
+        }
+
+        /// <summary>Returns the channel selected by <paramref name="index"/>: 0 = R, 1 = G, 2 = B, 3 = A.</summary>
+        /// <exception cref="ArgumentOutOfRangeException"><paramref name="index"/> is not in the 0-3 range.</exception>
+        public byte GetComponent(int index)
+        {
+            switch (index)
+            {
+                case 0:
+                    return R;
+                case 1:
+                    return G;
+                case 2:
+                    return B;
+                case 3:
+                    return A;
+                default:
+                    throw new ArgumentOutOfRangeException(nameof(index));
+            }
+        }
+    }
+}