using Ryujinx.Common.Logging;
using Ryujinx.Graphics.GAL;
using Ryujinx.Graphics.Shader;
using Ryujinx.Graphics.Shader.Translation;
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.IO;
using System.Threading;
using static Ryujinx.Graphics.Gpu.Shader.ShaderCache;

namespace Ryujinx.Graphics.Gpu.Shader.DiskCache
{
    /// <summary>
    /// Loads shaders from the disk cache, translating guest code on a pool of worker threads
    /// while host compilation and validation are driven from the thread that calls <see cref="LoadShaders"/>.
    /// </summary>
    class ParallelDiskCacheLoader
    {
        // Number of guest code translation worker threads, and capacity of the translation queue.
        private const int ThreadCount = 8;

        private readonly GpuContext _context;
        private readonly ShaderCacheHashTable _graphicsCache;
        private readonly ComputeShaderCacheHashTable _computeCache;
        private readonly DiskCacheHostStorage _hostStorage;
        private readonly CancellationToken _cancellationToken;
        private readonly Action<ShaderCacheState, int, int> _stateChangeCallback;

        /// <summary>
        /// Indicates if the cache should be loaded.
        /// </summary>
        public bool Active => !_cancellationToken.IsCancellationRequested;

        private bool _needsHostRegen;

        // Incremented from the loader thread and from the translation worker threads,
        // so it is only ever modified through Interlocked.
        private int _errorCount;

        /// <summary>
        /// Number of shaders that failed to compile from the cache.
        /// </summary>
        public int ErrorCount => _errorCount;

        /// <summary>
        /// Program validation entry.
        /// </summary>
        private readonly struct ProgramEntry
        {
            /// <summary>
            /// Cached shader program.
            /// </summary>
            public readonly CachedShaderProgram CachedProgram;

            /// <summary>
            /// Optional binary code. If not null, it is used instead of the backend host binary.
            /// </summary>
            public readonly byte[] BinaryCode;

            /// <summary>
            /// Program index.
            /// </summary>
            public readonly int ProgramIndex;

            /// <summary>
            /// Indicates if the program is a compute shader.
            /// </summary>
            public readonly bool IsCompute;

            /// <summary>
            /// Indicates if the program is a host binary shader.
            /// </summary>
            public readonly bool IsBinary;

            /// <summary>
            /// Creates a new program validation entry.
            /// </summary>
            /// <param name="cachedProgram">Cached shader program</param>
            /// <param name="binaryCode">Optional binary code. If not null, it is used instead of the backend host binary</param>
            /// <param name="programIndex">Program index</param>
            /// <param name="isCompute">Indicates if the program is a compute shader</param>
            /// <param name="isBinary">Indicates if the program is a host binary shader</param>
            public ProgramEntry(
                CachedShaderProgram cachedProgram,
                byte[] binaryCode,
                int programIndex,
                bool isCompute,
                bool isBinary)
            {
                CachedProgram = cachedProgram;
                BinaryCode = binaryCode;
                ProgramIndex = programIndex;
                IsCompute = isCompute;
                IsBinary = isBinary;
            }
        }

        /// <summary>
        /// Translated shader compilation entry.
        /// </summary>
        private readonly struct ProgramCompilation
        {
            /// <summary>
            /// Translated shader stages.
            /// </summary>
            public readonly ShaderProgram[] TranslatedStages;

            /// <summary>
            /// Cached shaders.
            /// </summary>
            public readonly CachedShaderStage[] Shaders;

            /// <summary>
            /// Specialization state.
            /// </summary>
            public readonly ShaderSpecializationState SpecializationState;

            /// <summary>
            /// Program index.
            /// </summary>
            public readonly int ProgramIndex;

            /// <summary>
            /// Indicates if the program is a compute shader.
            /// </summary>
            public readonly bool IsCompute;

            /// <summary>
            /// Creates a new translated shader compilation entry.
            /// </summary>
            /// <param name="translatedStages">Translated shader stages</param>
            /// <param name="shaders">Cached shaders</param>
            /// <param name="specState">Specialization state</param>
            /// <param name="programIndex">Program index</param>
            /// <param name="isCompute">Indicates if the program is a compute shader</param>
            public ProgramCompilation(
                ShaderProgram[] translatedStages,
                CachedShaderStage[] shaders,
                ShaderSpecializationState specState,
                int programIndex,
                bool isCompute)
            {
                TranslatedStages = translatedStages;
                Shaders = shaders;
                SpecializationState = specState;
                ProgramIndex = programIndex;
                IsCompute = isCompute;
            }
        }

        /// <summary>
        /// Program translation entry.
        /// </summary>
        private readonly struct AsyncProgramTranslation
        {
            /// <summary>
            /// Guest code for each active stage.
            /// </summary>
            public readonly GuestCodeAndCbData?[] GuestShaders;

            /// <summary>
            /// Specialization state.
            /// </summary>
            public readonly ShaderSpecializationState SpecializationState;

            /// <summary>
            /// Program index.
            /// </summary>
            public readonly int ProgramIndex;

            /// <summary>
            /// Indicates if the program is a compute shader.
            /// </summary>
            public readonly bool IsCompute;

            /// <summary>
            /// Creates a new program translation entry.
            /// </summary>
            /// <param name="guestShaders">Guest code for each active stage</param>
            /// <param name="specState">Specialization state</param>
            /// <param name="programIndex">Program index</param>
            /// <param name="isCompute">Indicates if the program is a compute shader</param>
            public AsyncProgramTranslation(
                GuestCodeAndCbData?[] guestShaders,
                ShaderSpecializationState specState,
                int programIndex,
                bool isCompute)
            {
                GuestShaders = guestShaders;
                SpecializationState = specState;
                ProgramIndex = programIndex;
                IsCompute = isCompute;
            }
        }

        // Programs submitted to the host backend whose link status still has to be checked.
        // Only accessed through the non-concurrent Queue API, so it must stay on a single thread.
        private readonly Queue<ProgramEntry> _validationQueue;

        // Translated programs produced by RecompileFromGuestCode and consumed by ProcessCompilationQueue.
        private readonly ConcurrentQueue<ProgramCompilation> _compilationQueue;

        // Guest programs waiting to be translated by the worker threads (bounded to ThreadCount items).
        private readonly BlockingCollection<AsyncProgramTranslation> _asyncTranslationQueue;

        // Successfully compiled programs and their binary code, keyed (and thus ordered) by program index.
        private readonly SortedList<int, (CachedShaderProgram, byte[])> _programList;

        private readonly int _backendParallelCompileThreads;

        private int _compiledCount;
        private int _totalCount;

        /// <summary>
        /// Creates a new parallel disk cache loader.
        /// </summary>
        /// <param name="context">GPU context</param>
        /// <param name="graphicsCache">Graphics shader cache</param>
        /// <param name="computeCache">Compute shader cache</param>
        /// <param name="hostStorage">Disk cache host storage</param>
        /// <param name="cancellationToken">Cancellation token</param>
        /// <param name="stateChangeCallback">Function to be called when there is a state change, reporting state, compiled and total shaders count</param>
        public ParallelDiskCacheLoader(
            GpuContext context,
            ShaderCacheHashTable graphicsCache,
            ComputeShaderCacheHashTable computeCache,
            DiskCacheHostStorage hostStorage,
            CancellationToken cancellationToken,
            Action<ShaderCacheState, int, int> stateChangeCallback)
        {
            _context = context;
            _graphicsCache = graphicsCache;
            _computeCache = computeCache;
            _hostStorage = hostStorage;
            _cancellationToken = cancellationToken;
            _stateChangeCallback = stateChangeCallback;
            _validationQueue = new Queue<ProgramEntry>();
            _compilationQueue = new ConcurrentQueue<ProgramCompilation>();
            _asyncTranslationQueue = new BlockingCollection<AsyncProgramTranslation>(ThreadCount);
            _programList = new SortedList<int, (CachedShaderProgram, byte[])>();
            _backendParallelCompileThreads = Math.Min(Environment.ProcessorCount, 8); // Must be kept in sync with the backend code.
        }

        /// <summary>
        /// Loads all shaders from the cache.
        /// </summary>
        public void LoadShaders()
        {
            Thread[] workThreads = new Thread[ThreadCount];

            for (int index = 0; index < ThreadCount; index++)
            {
                workThreads[index] = new Thread(ProcessAsyncQueue)
                {
                    Name = $"GPU.AsyncTranslationThread.{index}",
                };
            }

            int programCount = _hostStorage.GetProgramCount();

            _compiledCount = 0;
            _totalCount = programCount;

            _stateChangeCallback(ShaderCacheState.Start, 0, programCount);

            Logger.Info?.Print(LogClass.Gpu, $"Loading {programCount} shaders from the cache...");

            for (int index = 0; index < ThreadCount; index++)
            {
                workThreads[index].Start(_cancellationToken);
            }

            try
            {
                _hostStorage.LoadShaders(_context, this);
            }
            catch (DiskCacheLoadException diskCacheLoadException)
            {
                Logger.Warning?.Print(LogClass.Gpu, $"Error loading the shader cache. {diskCacheLoadException.Message}");

                // If we can't even access the file, then we also can't rebuild.
                if (diskCacheLoadException.Result != DiskCacheLoadResult.NoAccess)
                {
                    _needsHostRegen = true;
                }
            }
            catch (InvalidDataException invalidDataException)
            {
                Logger.Warning?.Print(LogClass.Gpu, $"Error decompressing the shader cache file. {invalidDataException.Message}");
                _needsHostRegen = true;
            }
            catch (IOException ioException)
            {
                Logger.Warning?.Print(LogClass.Gpu, $"Error reading the shader cache file. {ioException.Message}");
                _needsHostRegen = true;
            }
            finally
            {
                // Always let the workers leave their consuming loop, even when an exception that is
                // not handled above propagates; otherwise they would block on the queue indefinitely.
                _asyncTranslationQueue.CompleteAdding();
            }

            for (int index = 0; index < ThreadCount; index++)
            {
                workThreads[index].Join();
            }

            CheckCompilationBlocking();

            if (_needsHostRegen && Active)
            {
                // Rebuild both shared and host cache files.
                // Rebuilding shared is required because the shader information returned by the translator
                // might have changed, and so we have to reconstruct the file with the new information.
                try
                {
                    _hostStorage.ClearSharedCache();
                    _hostStorage.ClearHostCache(_context);

                    if (_programList.Count != 0)
                    {
                        _stateChangeCallback(ShaderCacheState.Packaging, 0, _programList.Count);

                        Logger.Info?.Print(LogClass.Gpu, $"Rebuilding {_programList.Count} shaders...");

                        using var streams = _hostStorage.GetOutputStreams(_context);

                        int packagedShaders = 0;

                        foreach (var kv in _programList)
                        {
                            if (!Active)
                            {
                                break;
                            }

                            (CachedShaderProgram program, byte[] binaryCode) = kv.Value;

                            _hostStorage.AddShader(_context, program, binaryCode, streams);

                            _stateChangeCallback(ShaderCacheState.Packaging, ++packagedShaders, _programList.Count);
                        }

                        // Report what was actually written; the loop above stops early on cancellation.
                        Logger.Info?.Print(LogClass.Gpu, $"Rebuilt {packagedShaders} shaders successfully.");
                    }
                    else
                    {
                        _hostStorage.ClearGuestCache();

                        Logger.Info?.Print(LogClass.Gpu, "Shader cache deleted due to corruption.");
                    }
                }
                catch (DiskCacheLoadException diskCacheLoadException)
                {
                    Logger.Warning?.Print(LogClass.Gpu, $"Error deleting the shader cache. {diskCacheLoadException.Message}");
                }
                catch (IOException ioException)
                {
                    Logger.Warning?.Print(LogClass.Gpu, $"Error deleting the shader cache file. {ioException.Message}");
                }
            }

            Logger.Info?.Print(LogClass.Gpu, "Shader cache loaded.");

            _stateChangeCallback(ShaderCacheState.Loaded, programCount, programCount);
        }

        /// <summary>
        /// Enqueues a host program for compilation.
        /// </summary>
        /// <param name="cachedProgram">Cached program</param>
        /// <param name="binaryCode">Host binary code</param>
        /// <param name="programIndex">Program index</param>
        /// <param name="isCompute">Indicates if the program is a compute shader</param>
        public void QueueHostProgram(CachedShaderProgram cachedProgram, byte[] binaryCode, int programIndex, bool isCompute)
        {
            EnqueueForValidation(new ProgramEntry(cachedProgram, binaryCode, programIndex, isCompute, isBinary: true));
        }

        /// <summary>
        /// Enqueues a guest program for compilation.
        /// </summary>
        /// <param name="guestShaders">Guest code for each active stage</param>
        /// <param name="specState">Specialization state</param>
        /// <param name="programIndex">Program index</param>
        /// <param name="isCompute">Indicates if the program is a compute shader</param>
        public void QueueGuestProgram(GuestCodeAndCbData?[] guestShaders, ShaderSpecializationState specState, int programIndex, bool isCompute)
        {
            try
            {
                AsyncProgramTranslation asyncTranslation = new AsyncProgramTranslation(guestShaders, specState, programIndex, isCompute);

                // The queue is bounded, so this call waits while all slots are in use.
                _asyncTranslationQueue.Add(asyncTranslation, _cancellationToken);
            }
            catch (OperationCanceledException)
            {
                // Loading was cancelled while waiting for a free slot; the program is intentionally dropped.
            }
        }

        /// <summary>
        /// Check the state of programs that have already been compiled,
        /// and add to the cache if the compilation was successful.
        /// </summary>
        public void CheckCompilation()
        {
            ProcessCompilationQueue();

            // Process programs that already finished compiling.
            // If not yet compiled, do nothing. This avoids blocking to wait for shader compilation.
            while (_validationQueue.TryPeek(out ProgramEntry entry))
            {
                ProgramLinkStatus result = entry.CachedProgram.HostProgram.CheckProgramLink(false);

                if (result != ProgramLinkStatus.Incomplete)
                {
                    ProcessCompiledProgram(ref entry, result);
                    _validationQueue.Dequeue();
                }
                else
                {
                    break;
                }
            }
        }

        /// <summary>
        /// Waits until all programs finish compiling, then adds the ones
        /// with successful compilation to the cache.
        /// </summary>
        private void CheckCompilationBlocking()
        {
            ProcessCompilationQueue();

            while (_validationQueue.TryDequeue(out ProgramEntry entry) && Active)
            {
                ProcessCompiledProgram(ref entry, entry.CachedProgram.HostProgram.CheckProgramLink(true), asyncCompile: false);
            }
        }

        /// <summary>
        /// Process a compiled program result.
        /// </summary>
        /// <param name="entry">Compiled program entry</param>
        /// <param name="result">Compilation result</param>
        /// <param name="asyncCompile">For failed host compilations, indicates if a guest compilation should be done asynchronously</param>
        private void ProcessCompiledProgram(ref ProgramEntry entry, ProgramLinkStatus result, bool asyncCompile = true)
        {
            if (result == ProgramLinkStatus.Success)
            {
                // Compilation successful, add to memory cache.
                if (entry.IsCompute)
                {
                    _computeCache.Add(entry.CachedProgram);
                }
                else
                {
                    _graphicsCache.Add(entry.CachedProgram);
                }

                // A program that did not come from a host binary means the host cache file must be rebuilt.
                if (!entry.IsBinary)
                {
                    _needsHostRegen = true;
                }

                // Fetch the binary code from the backend if it isn't already present.
                byte[] binaryCode = entry.BinaryCode ?? entry.CachedProgram.HostProgram.GetBinary();

                _programList.Add(entry.ProgramIndex, (entry.CachedProgram, binaryCode));
                SignalCompiled();
            }
            else if (entry.IsBinary)
            {
                // If this is a host binary and compilation failed,
                // we still have a chance to recompile from the guest binary.
                CachedShaderProgram program = entry.CachedProgram;

                GuestCodeAndCbData?[] guestShaders = new GuestCodeAndCbData?[program.Shaders.Length];

                for (int index = 0; index < program.Shaders.Length; index++)
                {
                    CachedShaderStage shader = program.Shaders[index];

                    if (shader != null)
                    {
                        guestShaders[index] = new GuestCodeAndCbData(shader.Code, shader.Cb1Data);
                    }
                }

                if (asyncCompile)
                {
                    QueueGuestProgram(guestShaders, program.SpecializationState, entry.ProgramIndex, entry.IsCompute);
                }
                else
                {
                    RecompileFromGuestCode(guestShaders, program.SpecializationState, entry.ProgramIndex, entry.IsCompute);
                    ProcessCompilationQueue();
                }
            }
            else
            {
                // Failed to compile from both host and guest binary.
                Interlocked.Increment(ref _errorCount);
                SignalCompiled();
            }
        }

        /// <summary>
        /// Processes the queue of translated guest programs that should be compiled on the host.
        /// </summary>
        private void ProcessCompilationQueue()
        {
            while (_compilationQueue.TryDequeue(out ProgramCompilation compilation) && Active)
            {
                ShaderSource[] shaderSources = new ShaderSource[compilation.TranslatedStages.Length];

                // Stays at -1 when the program has no fragment stage.
                int fragmentOutputMap = -1;

                for (int index = 0; index < compilation.TranslatedStages.Length; index++)
                {
                    ShaderProgram shader = compilation.TranslatedStages[index];
                    shaderSources[index] = CreateShaderSource(shader);

                    if (shader.Info.Stage == ShaderStage.Fragment)
                    {
                        fragmentOutputMap = shader.Info.FragmentOutputMap;
                    }
                }

                ShaderInfo shaderInfo = compilation.SpecializationState.PipelineState.HasValue
                    ? new ShaderInfo(fragmentOutputMap, compilation.SpecializationState.PipelineState.Value, fromCache: true)
                    : new ShaderInfo(fragmentOutputMap, fromCache: true);

                IProgram hostProgram = _context.Renderer.CreateProgram(shaderSources, shaderInfo);
                CachedShaderProgram program = new CachedShaderProgram(hostProgram, compilation.SpecializationState, compilation.Shaders);

                // Vulkan's binary code is the SPIR-V used for compilation, so it is ready immediately. Other APIs get this after compilation.
                byte[] binaryCode = _context.Capabilities.Api == TargetApi.Vulkan ? ShaderBinarySerializer.Pack(shaderSources) : null;

                EnqueueForValidation(new ProgramEntry(program, binaryCode, compilation.ProgramIndex, compilation.IsCompute, isBinary: false));
            }
        }

        /// <summary>
        /// Enqueues a program for validation, which will check if the program was compiled successfully.
        /// </summary>
        /// <param name="newEntry">Program entry to be validated</param>
        private void EnqueueForValidation(ProgramEntry newEntry)
        {
            _validationQueue.Enqueue(newEntry);

            // Do not allow more than N shader compilation in-flight, where N is the maximum number of threads
            // the driver will be using for parallel compilation.
            // Submitting more seems to cause NVIDIA OpenGL driver to crash.
            if (_validationQueue.Count >= _backendParallelCompileThreads && _validationQueue.TryDequeue(out ProgramEntry entry))
            {
                ProcessCompiledProgram(ref entry, entry.CachedProgram.HostProgram.CheckProgramLink(true), asyncCompile: false);
            }
        }

        /// <summary>
        /// Processes the queue of programs that should be translated from guest code.
        /// </summary>
        /// <param name="state">Cancellation token</param>
        private void ProcessAsyncQueue(object state)
        {
            CancellationToken ct = (CancellationToken)state;

            try
            {
                // Ends when adding is completed and the queue is drained, or when the token is cancelled.
                foreach (AsyncProgramTranslation asyncCompilation in _asyncTranslationQueue.GetConsumingEnumerable(ct))
                {
                    RecompileFromGuestCode(
                        asyncCompilation.GuestShaders,
                        asyncCompilation.SpecializationState,
                        asyncCompilation.ProgramIndex,
                        asyncCompilation.IsCompute);
                }
            }
            catch (OperationCanceledException)
            {
                // Cancellation requested; just let the worker thread exit.
            }
        }

        /// <summary>
        /// Recompiles a program from guest code.
        /// </summary>
        /// <param name="guestShaders">Guest code for each active stage</param>
        /// <param name="specState">Specialization state</param>
        /// <param name="programIndex">Program index</param>
        /// <param name="isCompute">Indicates if the program is a compute shader</param>
        private void RecompileFromGuestCode(GuestCodeAndCbData?[] guestShaders, ShaderSpecializationState specState, int programIndex, bool isCompute)
        {
            try
            {
                if (isCompute)
                {
                    RecompileComputeFromGuestCode(guestShaders, specState, programIndex);
                }
                else
                {
                    RecompileGraphicsFromGuestCode(guestShaders, specState, programIndex);
                }
            }
            catch (Exception exception)
            {
                Logger.Error?.Print(LogClass.Gpu, $"Error translating guest shader. {exception.Message}");

                // This path runs on the translation worker threads as well as on the loader thread.
                Interlocked.Increment(ref _errorCount);
                SignalCompiled();
            }
        }

        /// <summary>
        /// Recompiles a graphics program from guest code.
        /// </summary>
        /// <param name="guestShaders">Guest code for each active stage</param>
        /// <param name="specState">Specialization state</param>
        /// <param name="programIndex">Program index</param>
        private void RecompileGraphicsFromGuestCode(GuestCodeAndCbData?[] guestShaders, ShaderSpecializationState specState, int programIndex)
        {
            ShaderSpecializationState newSpecState = new ShaderSpecializationState(
                ref specState.GraphicsState,
                specState.PipelineState,
                specState.TransformFeedbackDescriptors);

            ResourceCounts counts = new ResourceCounts();

            // Slot 0 is reserved for the optional program decoded with TranslationFlags.VertexA;
            // the regular stage with index N lives at slot N + 1, mirroring the layout of guestShaders.
            TranslatorContext[] translatorContexts = new TranslatorContext[Constants.ShaderStages + 1];
            TranslatorContext nextStage = null;

            TargetApi api = _context.Capabilities.Api;

            // Decode from the last stage to the first, so each stage can be linked to the one that follows it.
            for (int stageIndex = Constants.ShaderStages - 1; stageIndex >= 0; stageIndex--)
            {
                if (guestShaders[stageIndex + 1].HasValue)
                {
                    GuestCodeAndCbData shader = guestShaders[stageIndex + 1].Value;

                    byte[] guestCode = shader.Code;
                    byte[] cb1Data = shader.Cb1Data;

                    DiskCacheGpuAccessor gpuAccessor = new DiskCacheGpuAccessor(_context, guestCode, cb1Data, specState, newSpecState, counts, stageIndex);
                    TranslatorContext currentStage = DecodeGraphicsShader(gpuAccessor, api, DefaultFlags, 0);

                    if (nextStage != null)
                    {
                        currentStage.SetNextStage(nextStage);
                    }

                    if (stageIndex == 0 && guestShaders[0].HasValue)
                    {
                        byte[] guestCodeA = guestShaders[0].Value.Code;
                        byte[] cb1DataA = guestShaders[0].Value.Cb1Data;

                        DiskCacheGpuAccessor gpuAccessorA = new DiskCacheGpuAccessor(_context, guestCodeA, cb1DataA, specState, newSpecState, counts, 0);
                        translatorContexts[0] = DecodeGraphicsShader(gpuAccessorA, api, DefaultFlags | TranslationFlags.VertexA, 0);
                    }

                    translatorContexts[stageIndex + 1] = currentStage;
                    nextStage = currentStage;
                }
            }

            if (!_context.Capabilities.SupportsGeometryShader)
            {
                ShaderCache.TryRemoveGeometryStage(translatorContexts);
            }

            CachedShaderStage[] shaders = new CachedShaderStage[guestShaders.Length];
            List<ShaderProgram> translatedStages = new List<ShaderProgram>();

            TranslatorContext previousStage = null;

            for (int stageIndex = 0; stageIndex < Constants.ShaderStages; stageIndex++)
            {
                TranslatorContext currentStage = translatorContexts[stageIndex + 1];

                if (currentStage != null)
                {
                    ShaderProgram program;

                    byte[] guestCode = guestShaders[stageIndex + 1].Value.Code;
                    byte[] cb1Data = guestShaders[stageIndex + 1].Value.Cb1Data;

                    if (stageIndex == 0 && guestShaders[0].HasValue)
                    {
                        // The slot 0 context is passed to the translation of stage 0; its cached entry
                        // keeps only the guest code, with no shader info.
                        program = currentStage.Translate(translatorContexts[0]);

                        byte[] guestCodeA = guestShaders[0].Value.Code;
                        byte[] cb1DataA = guestShaders[0].Value.Cb1Data;

                        shaders[0] = new CachedShaderStage(null, guestCodeA, cb1DataA);
                        shaders[1] = new CachedShaderStage(program.Info, guestCode, cb1Data);
                    }
                    else
                    {
                        program = currentStage.Translate();

                        shaders[stageIndex + 1] = new CachedShaderStage(program.Info, guestCode, cb1Data);
                    }

                    // NOTE(review): program.Info is already dereferenced above, so a null program would have
                    // thrown before reaching this check. Confirm whether Translate can return null at all.
                    if (program != null)
                    {
                        translatedStages.Add(program);
                    }

                    previousStage = currentStage;
                }
                else if (
                    previousStage != null &&
                    previousStage.LayerOutputWritten &&
                    stageIndex == 3 && // Presumably the geometry stage index - TODO confirm.
                    !_context.Capabilities.SupportsLayerVertexTessellation)
                {
                    translatedStages.Add(previousStage.GenerateGeometryPassthrough());
                }
            }

            _compilationQueue.Enqueue(new ProgramCompilation(translatedStages.ToArray(), shaders, newSpecState, programIndex, isCompute: false));
        }

        /// <summary>
        /// Recompiles a compute program from guest code.
        /// </summary>
        /// <param name="guestShaders">Guest code for each active stage</param>
        /// <param name="specState">Specialization state</param>
        /// <param name="programIndex">Program index</param>
        private void RecompileComputeFromGuestCode(GuestCodeAndCbData?[] guestShaders, ShaderSpecializationState specState, int programIndex)
        {
            // Only the first entry is read for compute programs.
            GuestCodeAndCbData shader = guestShaders[0].Value;
            ResourceCounts counts = new ResourceCounts();
            ShaderSpecializationState newSpecState = new ShaderSpecializationState(ref specState.ComputeState);
            DiskCacheGpuAccessor gpuAccessor = new DiskCacheGpuAccessor(_context, shader.Code, shader.Cb1Data, specState, newSpecState, counts, 0);

            TranslatorContext translatorContext = DecodeComputeShader(gpuAccessor, _context.Capabilities.Api, 0);

            ShaderProgram program = translatorContext.Translate();

            CachedShaderStage[] shaders = new[] { new CachedShaderStage(program.Info, shader.Code, shader.Cb1Data) };

            _compilationQueue.Enqueue(new ProgramCompilation(new[] { program }, shaders, newSpecState, programIndex, isCompute: true));
        }

        /// <summary>
        /// Signals that compilation of a program has been finished successfully,
        /// or that it failed and guest recompilation has also been attempted.
        /// </summary>
        private void SignalCompiled()
        {
            // Can be reached from the translation worker threads (through the RecompileFromGuestCode
            // failure path) concurrently with the loader thread, so the increment must be atomic.
            _stateChangeCallback(ShaderCacheState.Loading, Interlocked.Increment(ref _compiledCount), _totalCount);
        }
    }
}