astc: Enable parallel CPU ASTC decoding

Given the issues with GPU-accelerated ASTC decoding on NVIDIA's latest drivers, parallelize ASTC decoding on the CPU. The decoder uses half of the hardware threads reported by the system (and always at least one) for ASTC decoding.
author: Morph <39850852+Morph1984@users.noreply.github.com> 2022-09-01 21:29:22 -0400
committer: Morph <39850852+Morph1984@users.noreply.github.com> 2022-09-16 10:16:42 -0400
commit: 809126c94a0ed8e7964d5a550abf7b3731d00512 (patch)
tree: c0b1554f05ea0863e826d8ca77b7380b78fc2a74
parent: 4b07596b83ad493eacbe75ecaef0867778e40af3 (diff)
1 files changed, 35 insertions, 21 deletions
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index e3f3d3c5d0..b159494c54 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -13,7 +13,9 @@
 
 #include <boost/container/static_vector.hpp>
 
+#include "common/alignment.h"
 #include "common/common_types.h"
+#include "common/thread_worker.h"
 #include "video_core/textures/astc.h"
 
 class InputBitStream {
@@ -1650,29 +1652,41 @@ static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth,
 
 void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,
                 uint32_t block_width, uint32_t block_height, std::span<uint8_t> output) {
-    u32 block_index = 0;
-    std::size_t depth_offset = 0;
-    for (u32 z = 0; z < depth; z++) {
-        for (u32 y = 0; y < height; y += block_height) {
-            for (u32 x = 0; x < width; x += block_width) {
-                const std::span<const u8, 16> blockPtr{data.subspan(block_index * 16, 16)};
-
-                // Blocks can be at most 12x12
-                std::array<u32, 12 * 12> uncompData;
-                DecompressBlock(blockPtr, block_width, block_height, uncompData);
-
-                u32 decompWidth = std::min(block_width, width - x);
-                u32 decompHeight = std::min(block_height, height - y);
-
-                const std::span<u8> outRow = output.subspan(depth_offset + (y * width + x) * 4);
-                for (u32 jj = 0; jj < decompHeight; jj++) {
-                    std::memcpy(outRow.data() + jj * width * 4,
-                                uncompData.data() + jj * block_width, decompWidth * 4);
+    const u32 rows = Common::DivideUp(height, block_height); // Block rows per depth slice
+    const u32 cols = Common::DivideUp(width, block_width);   // Blocks per block row
+    // Use half of the reported hardware threads, but never fewer than one worker.
+    Common::ThreadWorker workers{std::max(std::thread::hardware_concurrency(), 2U) / 2,
+                                 "yuzu:ASTCDecompress"};
+
+    for (u32 z = 0; z < depth; ++z) {
+        const std::size_t depth_offset = std::size_t{z} * height * width * 4; // Avoid u32 overflow
+        for (u32 y_index = 0; y_index < rows; ++y_index) {
+            auto decompress_stride = [data, width, height, block_width, block_height, output, rows,
+                                      cols, z, depth_offset, y_index] {
+                const u32 y = y_index * block_height;
+                for (u32 x_index = 0; x_index < cols; ++x_index) {
+                    const u32 block_index = (z * rows * cols) + (y_index * cols) + x_index;
+                    const u32 x = x_index * block_width;
+
+                    const std::span<const u8, 16> blockPtr{data.subspan(block_index * 16, 16)};
+
+                    // Blocks can be at most 12x12 texels, so 144 entries always suffice
+                    std::array<u32, 12 * 12> uncompData;
+                    DecompressBlock(blockPtr, block_width, block_height, uncompData);
+                    // Clamp the copied area for blocks overhanging the right/bottom image edge.
+                    u32 decompWidth = std::min(block_width, width - x);
+                    u32 decompHeight = std::min(block_height, height - y);
+                    // Copy the decoded block row by row into the output (4 bytes per texel).
+                    const std::span<u8> outRow = output.subspan(depth_offset + (y * width + x) * 4);
+                    for (u32 h = 0; h < decompHeight; ++h) {
+                        std::memcpy(outRow.data() + h * width * 4,
+                                    uncompData.data() + h * block_width, decompWidth * 4);
+                    }
                 }
-                ++block_index;
-            }
+            };
+            workers.QueueWork(std::move(decompress_stride));
         }
-        depth_offset += height * width * 4;
+        workers.WaitForRequests(); // Presumably blocks until this slice's rows finish; confirm
     }
 }
author	Morph <39850852+Morph1984@users.noreply.github.com>	2022-09-01 21:29:22 -0400
committer	Morph <39850852+Morph1984@users.noreply.github.com>	2022-09-16 10:16:42 -0400
commit	809126c94a0ed8e7964d5a550abf7b3731d00512 (patch)
tree	c0b1554f05ea0863e826d8ca77b7380b78fc2a74
parent	4b07596b83ad493eacbe75ecaef0867778e40af3 (diff)