1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
|
using Ryujinx.Common;
using Ryujinx.Common.Configuration;
using Ryujinx.Graphics.GAL.Multithreading.Commands;
using Ryujinx.Graphics.GAL.Multithreading.Commands.Buffer;
using Ryujinx.Graphics.GAL.Multithreading.Commands.Renderer;
using Ryujinx.Graphics.GAL.Multithreading.Model;
using Ryujinx.Graphics.GAL.Multithreading.Resources;
using Ryujinx.Graphics.GAL.Multithreading.Resources.Programs;
using System;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Threading;
namespace Ryujinx.Graphics.GAL.Multithreading
{
/// <summary>
/// The ThreadedRenderer is a layer that can be put in front of any Renderer backend to make
/// its processing happen on a separate thread, rather than intertwined with the GPU emulation.
/// A new thread is created to handle the GPU command processing, separate from the renderer thread.
/// Calls to the renderer, pipeline and resources are queued to happen on the renderer thread.
/// </summary>
public class ThreadedRenderer : IRenderer
{
private const int SpanPoolBytes = 4 * 1024 * 1024;
private const int MaxRefsPerCommand = 2;
private const int QueueCount = 10000;
private readonly int _elementSize;
private readonly IRenderer _baseRenderer;
private Thread _gpuThread;
private Thread _backendThread;
private bool _running;
private readonly AutoResetEvent _frameComplete = new(true);
private readonly ManualResetEventSlim _galWorkAvailable;
private readonly CircularSpanPool _spanPool;
private readonly ManualResetEventSlim _invokeRun;
private readonly AutoResetEvent _interruptRun;
private bool _lastSampleCounterClear = true;
private readonly byte[] _commandQueue;
private readonly object[] _refQueue;
private int _consumerPtr;
private int _commandCount;
private int _producerPtr;
private int _lastProducedPtr;
private int _invokePtr;
private int _refProducerPtr;
private int _refConsumerPtr;
private Action _interruptAction;
private readonly object _interruptLock = new();
public event EventHandler<ScreenCaptureImageInfo> ScreenCaptured;
internal BufferMap Buffers { get; }
internal SyncMap Sync { get; }
internal CircularSpanPool SpanPool { get; }
internal ProgramQueue Programs { get; }
public IPipeline Pipeline { get; }
public IWindow Window { get; }
public IRenderer BaseRenderer => _baseRenderer;
public bool PreferThreading => _baseRenderer.PreferThreading;
public ThreadedRenderer(IRenderer renderer)
{
_baseRenderer = renderer;
renderer.ScreenCaptured += (sender, info) => ScreenCaptured?.Invoke(this, info);
renderer.SetInterruptAction(Interrupt);
Pipeline = new ThreadedPipeline(this);
Window = new ThreadedWindow(this, renderer);
Buffers = new BufferMap();
Sync = new SyncMap();
Programs = new ProgramQueue(renderer);
_galWorkAvailable = new ManualResetEventSlim(false);
_invokeRun = new ManualResetEventSlim();
_interruptRun = new AutoResetEvent(false);
_spanPool = new CircularSpanPool(this, SpanPoolBytes);
SpanPool = _spanPool;
_elementSize = BitUtils.AlignUp(CommandHelper.GetMaxCommandSize(), 4);
_commandQueue = new byte[_elementSize * QueueCount];
_refQueue = new object[MaxRefsPerCommand * QueueCount];
}
public void RunLoop(ThreadStart gpuLoop)
{
_running = true;
_backendThread = Thread.CurrentThread;
_gpuThread = new Thread(gpuLoop)
{
Name = "GPU.MainThread",
};
_gpuThread.Start();
RenderLoop();
}
public void RenderLoop()
{
// Power through the render queue until the Gpu thread work is done.
while (_running)
{
_galWorkAvailable.Wait();
_galWorkAvailable.Reset();
if (Volatile.Read(ref _interruptAction) != null)
{
_interruptAction();
_interruptRun.Set();
Interlocked.Exchange(ref _interruptAction, null);
}
// The other thread can only increase the command count.
// We can assume that if it is above 0, it will stay there or get higher.
while (Volatile.Read(ref _commandCount) > 0 && Volatile.Read(ref _interruptAction) == null)
{
int commandPtr = _consumerPtr;
Span<byte> command = new(_commandQueue, commandPtr * _elementSize, _elementSize);
// Run the command.
CommandHelper.RunCommand(command, this, _baseRenderer);
if (Interlocked.CompareExchange(ref _invokePtr, -1, commandPtr) == commandPtr)
{
_invokeRun.Set();
}
_consumerPtr = (_consumerPtr + 1) % QueueCount;
Interlocked.Decrement(ref _commandCount);
}
}
}
internal SpanRef<T> CopySpan<T>(ReadOnlySpan<T> data) where T : unmanaged
{
return _spanPool.Insert(data);
}
private TableRef<T> Ref<T>(T reference)
{
return new TableRef<T>(this, reference);
}
internal ref T New<T>() where T : struct
{
while (_producerPtr == (Volatile.Read(ref _consumerPtr) + QueueCount - 1) % QueueCount)
{
// If incrementing the producer pointer would overflow, we need to wait.
// _consumerPtr can only move forward, so there's no race to worry about here.
Thread.Sleep(1);
}
int taken = _producerPtr;
_lastProducedPtr = taken;
_producerPtr = (_producerPtr + 1) % QueueCount;
Span<byte> memory = new(_commandQueue, taken * _elementSize, _elementSize);
ref T result = ref Unsafe.As<byte, T>(ref MemoryMarshal.GetReference(memory));
memory[^1] = (byte)((IGALCommand)result).CommandType;
return ref result;
}
internal int AddTableRef(object obj)
{
// The reference table is sized so that it will never overflow, so long as the references are taken after the command is allocated.
int index = _refProducerPtr;
_refQueue[index] = obj;
_refProducerPtr = (_refProducerPtr + 1) % _refQueue.Length;
return index;
}
internal object RemoveTableRef(int index)
{
Debug.Assert(index == _refConsumerPtr);
object result = _refQueue[_refConsumerPtr];
_refQueue[_refConsumerPtr] = null;
_refConsumerPtr = (_refConsumerPtr + 1) % _refQueue.Length;
return result;
}
internal void QueueCommand()
{
int result = Interlocked.Increment(ref _commandCount);
if (result == 1)
{
_galWorkAvailable.Set();
}
}
internal void InvokeCommand()
{
_invokeRun.Reset();
_invokePtr = _lastProducedPtr;
QueueCommand();
// Wait for the command to complete.
_invokeRun.Wait();
}
internal void WaitForFrame()
{
_frameComplete.WaitOne();
}
internal void SignalFrame()
{
_frameComplete.Set();
}
internal bool IsGpuThread()
{
return Thread.CurrentThread == _gpuThread;
}
public void BackgroundContextAction(Action action, bool alwaysBackground = false)
{
if (IsGpuThread() && !alwaysBackground)
{
// The action must be performed on the render thread.
New<ActionCommand>().Set(Ref(action));
InvokeCommand();
}
else
{
_baseRenderer.BackgroundContextAction(action, true);
}
}
public BufferHandle CreateBuffer(int size, BufferAccess access)
{
BufferHandle handle = Buffers.CreateBufferHandle();
New<CreateBufferAccessCommand>().Set(handle, size, access);
QueueCommand();
return handle;
}
public BufferHandle CreateBuffer(int size, BufferAccess access, BufferHandle storageHint)
{
BufferHandle handle = Buffers.CreateBufferHandle();
New<CreateBufferCommand>().Set(handle, size, access, storageHint);
QueueCommand();
return handle;
}
public BufferHandle CreateBuffer(nint pointer, int size)
{
BufferHandle handle = Buffers.CreateBufferHandle();
New<CreateHostBufferCommand>().Set(handle, pointer, size);
QueueCommand();
return handle;
}
public BufferHandle CreateBufferSparse(ReadOnlySpan<BufferRange> storageBuffers)
{
BufferHandle handle = Buffers.CreateBufferHandle();
New<CreateBufferSparseCommand>().Set(handle, CopySpan(storageBuffers));
QueueCommand();
return handle;
}
public IImageArray CreateImageArray(int size, bool isBuffer)
{
var imageArray = new ThreadedImageArray(this);
New<CreateImageArrayCommand>().Set(Ref(imageArray), size, isBuffer);
QueueCommand();
return imageArray;
}
public IProgram CreateProgram(ShaderSource[] shaders, ShaderInfo info)
{
var program = new ThreadedProgram(this);
SourceProgramRequest request = new(program, shaders, info);
Programs.Add(request);
New<CreateProgramCommand>().Set(Ref((IProgramRequest)request));
QueueCommand();
return program;
}
public ISampler CreateSampler(SamplerCreateInfo info)
{
var sampler = new ThreadedSampler(this);
New<CreateSamplerCommand>().Set(Ref(sampler), info);
QueueCommand();
return sampler;
}
public void CreateSync(ulong id, bool strict)
{
Sync.CreateSyncHandle(id);
New<CreateSyncCommand>().Set(id, strict);
QueueCommand();
}
public ITexture CreateTexture(TextureCreateInfo info)
{
if (IsGpuThread())
{
var texture = new ThreadedTexture(this, info);
New<CreateTextureCommand>().Set(Ref(texture), info);
QueueCommand();
return texture;
}
else
{
var texture = new ThreadedTexture(this, info)
{
Base = _baseRenderer.CreateTexture(info),
};
return texture;
}
}
public ITextureArray CreateTextureArray(int size, bool isBuffer)
{
var textureArray = new ThreadedTextureArray(this);
New<CreateTextureArrayCommand>().Set(Ref(textureArray), size, isBuffer);
QueueCommand();
return textureArray;
}
public void DeleteBuffer(BufferHandle buffer)
{
New<BufferDisposeCommand>().Set(buffer);
QueueCommand();
}
public PinnedSpan<byte> GetBufferData(BufferHandle buffer, int offset, int size)
{
if (IsGpuThread())
{
ResultBox<PinnedSpan<byte>> box = new();
New<BufferGetDataCommand>().Set(buffer, offset, size, Ref(box));
InvokeCommand();
return box.Result;
}
else
{
return _baseRenderer.GetBufferData(Buffers.MapBufferBlocking(buffer), offset, size);
}
}
public Capabilities GetCapabilities()
{
ResultBox<Capabilities> box = new();
New<GetCapabilitiesCommand>().Set(Ref(box));
InvokeCommand();
return box.Result;
}
public ulong GetCurrentSync()
{
return _baseRenderer.GetCurrentSync();
}
public HardwareInfo GetHardwareInfo()
{
return _baseRenderer.GetHardwareInfo();
}
/// <summary>
/// Initialize the base renderer. Must be called on the render thread.
/// </summary>
/// <param name="logLevel">Log level to use</param>
public void Initialize(GraphicsDebugLevel logLevel)
{
_baseRenderer.Initialize(logLevel);
}
public IProgram LoadProgramBinary(byte[] programBinary, bool hasFragmentShader, ShaderInfo info)
{
var program = new ThreadedProgram(this);
BinaryProgramRequest request = new(program, programBinary, hasFragmentShader, info);
Programs.Add(request);
New<CreateProgramCommand>().Set(Ref((IProgramRequest)request));
QueueCommand();
return program;
}
public void PreFrame()
{
New<PreFrameCommand>();
QueueCommand();
}
public ICounterEvent ReportCounter(CounterType type, EventHandler<ulong> resultHandler, float divisor, bool hostReserved)
{
ThreadedCounterEvent evt = new(this, type, _lastSampleCounterClear);
New<ReportCounterCommand>().Set(Ref(evt), type, Ref(resultHandler), divisor, hostReserved);
QueueCommand();
if (type == CounterType.SamplesPassed)
{
_lastSampleCounterClear = false;
}
return evt;
}
public void ResetCounter(CounterType type)
{
New<ResetCounterCommand>().Set(type);
QueueCommand();
_lastSampleCounterClear = true;
}
public void Screenshot()
{
_baseRenderer.Screenshot();
}
public void SetBufferData(BufferHandle buffer, int offset, ReadOnlySpan<byte> data)
{
New<BufferSetDataCommand>().Set(buffer, offset, CopySpan(data));
QueueCommand();
}
public void UpdateCounters()
{
New<UpdateCountersCommand>();
QueueCommand();
}
public void WaitSync(ulong id)
{
Sync.WaitSyncAvailability(id);
_baseRenderer.WaitSync(id);
}
private void Interrupt(Action action)
{
// Interrupt the backend thread from any external thread and invoke the given action.
if (Thread.CurrentThread == _backendThread)
{
// If this is called from the backend thread, the action can run immediately.
action();
}
else
{
lock (_interruptLock)
{
while (Interlocked.CompareExchange(ref _interruptAction, action, null) != null)
{
}
_galWorkAvailable.Set();
_interruptRun.WaitOne();
}
}
}
public void SetInterruptAction(Action<Action> interruptAction)
{
// Threaded renderer ignores given interrupt action, as it provides its own to the child renderer.
}
public bool PrepareHostMapping(nint address, ulong size)
{
return _baseRenderer.PrepareHostMapping(address, size);
}
public void FlushThreadedCommands()
{
SpinWait wait = new();
while (Volatile.Read(ref _commandCount) > 0)
{
wait.SpinOnce();
}
}
public void Dispose()
{
GC.SuppressFinalize(this);
// Dispose must happen from the render thread, after all commands have completed.
// Stop the GPU thread.
_running = false;
_galWorkAvailable.Set();
if (_gpuThread != null && _gpuThread.IsAlive)
{
_gpuThread.Join();
}
// Dispose the renderer.
_baseRenderer.Dispose();
// Dispose events.
_frameComplete.Dispose();
_galWorkAvailable.Dispose();
_invokeRun.Dispose();
_interruptRun.Dispose();
Sync.Dispose();
}
}
}
|