74 files changed, 1856 insertions, 586 deletions
diff --git a/.ci/scripts/.gitkeep b/.ci/scripts/.gitkeep deleted file mode 100644 index e69de29bb2..0000000000 --- a/.ci/scripts/.gitkeep +++ /dev/null diff --git a/.ci/scripts/common/post-upload.sh b/.ci/scripts/common/post-upload.sh new file mode 100644 index 0000000000..bb4e9d328c --- /dev/null +++ b/.ci/scripts/common/post-upload.sh @@ -0,0 +1,15 @@ +#!/bin/bash -ex + +# Copy documentation +cp license.txt "$REV_NAME" +cp README.md "$REV_NAME" + +tar $COMPRESSION_FLAGS "$ARCHIVE_NAME" "$REV_NAME" + +mv "$REV_NAME" $RELEASE_NAME + +7z a "$REV_NAME.7z" $RELEASE_NAME + +# move the compiled archive into the artifacts directory to be uploaded by travis releases +mv "$ARCHIVE_NAME" artifacts/ +mv "$REV_NAME.7z" artifacts/ diff --git a/.ci/scripts/common/pre-upload.sh b/.ci/scripts/common/pre-upload.sh new file mode 100644 index 0000000000..3c2fc79a2c --- /dev/null +++ b/.ci/scripts/common/pre-upload.sh @@ -0,0 +1,6 @@ +#!/bin/bash -ex + +GITDATE="`git show -s --date=short --format='%ad' | sed 's/-//g'`" +GITREV="`git show -s --format='%h'`" + +mkdir -p artifacts diff --git a/.ci/scripts/format/docker.sh b/.ci/scripts/format/docker.sh new file mode 100644 index 0000000000..778411e4a4 --- /dev/null +++ b/.ci/scripts/format/docker.sh @@ -0,0 +1,6 @@ +#!/bin/bash -ex + +# Run clang-format +cd /yuzu +chmod a+x ./.ci/scripts/format/script.sh +./.ci/scripts/format/script.sh diff --git a/.ci/scripts/format/exec.sh b/.ci/scripts/format/exec.sh new file mode 100644 index 0000000000..5d6393b384 --- /dev/null +++ b/.ci/scripts/format/exec.sh @@ -0,0 +1,4 @@ +#!/bin/bash -ex + +chmod a+x ./.ci/scripts/format/docker.sh +docker run -v $(pwd):/yuzu yuzuemu/build-environments:linux-clang-format /bin/bash -ex /yuzu/.ci/scripts/format/docker.sh diff --git a/.ci/scripts/format/script.sh b/.ci/scripts/format/script.sh new file mode 100644 index 0000000000..5ab828d5e4 --- /dev/null +++ b/.ci/scripts/format/script.sh @@ -0,0 +1,37 @@ +#!/bin/bash -ex + +if grep -nrI '\s$' src *.yml *.txt *.md Doxyfile .gitignore .gitmodules .ci* dist/*.desktop \ + dist/*.svg dist/*.xml; then + echo Trailing whitespace found, aborting + exit 1 +fi + +# Default clang-format points to default 3.5 version one +CLANG_FORMAT=clang-format-6.0 +$CLANG_FORMAT --version + +if [ "$TRAVIS_EVENT_TYPE" = "pull_request" ]; then + # Get list of every file modified in this pull request + files_to_lint="$(git diff --name-only --diff-filter=ACMRTUXB $TRAVIS_COMMIT_RANGE | grep '^src/[^.]*[.]\(cpp\|h\)$' || true)" +else + # Check everything for branch pushes + files_to_lint="$(find src/ -name '*.cpp' -or -name '*.h')" +fi + +# Turn off tracing for this because it's too verbose +set +x + +for f in $files_to_lint; do + d=$(diff -u "$f" <($CLANG_FORMAT "$f") || true) + if ! [ -z "$d" ]; then + echo "!!! $f not compliant to coding style, here is the fix:" + echo "$d" + fail=1 + fi +done + +set -x + +if [ "$fail" = 1 ]; then + exit 1 +fi diff --git a/.ci/scripts/linux/docker.sh b/.ci/scripts/linux/docker.sh new file mode 100644 index 0000000000..f538a40817 --- /dev/null +++ b/.ci/scripts/linux/docker.sh @@ -0,0 +1,14 @@ +#!/bin/bash -ex + +cd /yuzu + +ccache -s + +mkdir build || true && cd build +cmake .. 
-G Ninja -DYUZU_USE_BUNDLED_UNICORN=ON -DYUZU_USE_QT_WEB_ENGINE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=/usr/lib/ccache/gcc -DCMAKE_CXX_COMPILER=/usr/lib/ccache/g++ -DYUZU_ENABLE_COMPATIBILITY_REPORTING=${ENABLE_COMPATIBILITY_REPORTING:-"OFF"} -DENABLE_COMPATIBILITY_LIST_DOWNLOAD=ON -DUSE_DISCORD_PRESENCE=ON + +ninja + +ccache -s + +ctest -VV -C Release diff --git a/.ci/scripts/linux/exec.sh b/.ci/scripts/linux/exec.sh new file mode 100644 index 0000000000..a5a6c34b9a --- /dev/null +++ b/.ci/scripts/linux/exec.sh @@ -0,0 +1,5 @@ +#!/bin/bash -ex + +mkdir -p "ccache" || true +chmod a+x ./.ci/scripts/linux/docker.sh +docker run -e ENABLE_COMPATIBILITY_REPORTING -e CCACHE_DIR=/yuzu/ccache -v $(pwd):/yuzu yuzuemu/build-environments:linux-fresh /bin/bash /yuzu/.ci/scripts/linux/docker.sh diff --git a/.ci/scripts/linux/upload.sh b/.ci/scripts/linux/upload.sh new file mode 100644 index 0000000000..0d131d1dde --- /dev/null +++ b/.ci/scripts/linux/upload.sh @@ -0,0 +1,14 @@ +#!/bin/bash -ex + +. .ci/scripts/common/pre-upload.sh + +REV_NAME="yuzu-linux-${GITDATE}-${GITREV}" +ARCHIVE_NAME="${REV_NAME}.tar.xz" +COMPRESSION_FLAGS="-cJvf" + +mkdir "$REV_NAME" + +cp build/bin/yuzu-cmd "$REV_NAME" +cp build/bin/yuzu "$REV_NAME" + +. .ci/scripts/common/post-upload.sh diff --git a/.ci/scripts/merge/apply-patches-by-label.py b/.ci/scripts/merge/apply-patches-by-label.py new file mode 100644 index 0000000000..b346001a52 --- /dev/null +++ b/.ci/scripts/merge/apply-patches-by-label.py @@ -0,0 +1,28 @@ +# Download all pull requests as patches that match a specific label +# Usage: python download-patches-by-label.py <Label to Match> <Root Path Folder to DL to> + +import requests, sys, json, urllib3.request, shutil, subprocess + +http = urllib3.PoolManager() +dl_list = {} + +def check_individual(labels): + for label in labels: + if (label["name"] == sys.argv[1]): + return True + return False + +try: + url = 'https://api.github.com/repos/yuzu-emu/yuzu/pulls' + response = requests.get(url) + if (response.ok): + j = json.loads(response.content) + for pr in j: + if (check_individual(pr["labels"])): + pn = pr["number"] + print("Matched PR# %s" % pn) + print(subprocess.check_output(["git", "fetch", "https://github.com/yuzu-emu/yuzu.git", "pull/%s/head:pr-%s" % (pn, pn), "-f"])) + print(subprocess.check_output(["git", "merge", "--squash", "pr-%s" % pn])) + print(subprocess.check_output(["git", "commit", "-m\"Merge PR %s\"" % pn])) +except: + sys.exit(-1) diff --git a/.ci/scripts/merge/check-label-presence.py b/.ci/scripts/merge/check-label-presence.py new file mode 100644 index 0000000000..048466d7e5 --- /dev/null +++ b/.ci/scripts/merge/check-label-presence.py @@ -0,0 +1,18 @@ +# Checks to see if the specified pull request # has the specified tag +# Usage: python check-label-presence.py <Pull Request ID> <Name of Label> + +import requests, json, sys + +try: + url = 'https://api.github.com/repos/yuzu-emu/yuzu/issues/%s' % sys.argv[1] + response = requests.get(url) + if (response.ok): + j = json.loads(response.content) + for label in j["labels"]: + if label["name"] == sys.argv[2]: + print('##vso[task.setvariable variable=enabletesting;]true') + sys.exit() +except: + sys.exit(-1) + +print('##vso[task.setvariable variable=enabletesting;]false') diff --git a/.ci/scripts/merge/yuzubot-git-config.sh b/.ci/scripts/merge/yuzubot-git-config.sh new file mode 100644 index 0000000000..d9d595bbc8 --- /dev/null +++ b/.ci/scripts/merge/yuzubot-git-config.sh @@ -0,0 +1,2 @@ +git config --global user.email "yuzu@yuzu-emu.org" 
+git config --global user.name "yuzubot"
\ No newline at end of file diff --git a/.ci/scripts/windows/docker.sh b/.ci/scripts/windows/docker.sh new file mode 100644 index 0000000000..f7093363bc --- /dev/null +++ b/.ci/scripts/windows/docker.sh @@ -0,0 +1,50 @@ +#!/bin/bash -ex + +cd /yuzu + +ccache -s + +# Dirty hack to trick unicorn makefile into believing we are in a MINGW system +mv /bin/uname /bin/uname1 && echo -e '#!/bin/sh\necho MINGW64' >> /bin/uname +chmod +x /bin/uname + +# Dirty hack to trick unicorn makefile into believing we have cmd +echo '' >> /bin/cmd +chmod +x /bin/cmd + +mkdir build || true && cd build +cmake .. -G Ninja -DCMAKE_TOOLCHAIN_FILE="$(pwd)/../CMakeModules/MinGWCross.cmake" -DUSE_CCACHE=ON -DYUZU_USE_BUNDLED_UNICORN=ON -DENABLE_COMPATIBILITY_LIST_DOWNLOAD=ON -DCMAKE_BUILD_TYPE=Release +ninja + +# Clean up the dirty hacks +rm /bin/uname && mv /bin/uname1 /bin/uname +rm /bin/cmd + +ccache -s + +echo "Tests skipped" +#ctest -VV -C Release + +echo 'Prepare binaries...' +cd .. +mkdir package + +QT_PLATFORM_DLL_PATH='/usr/x86_64-w64-mingw32/lib/qt5/plugins/platforms/' +find build/ -name "yuzu*.exe" -exec cp {} 'package' \; + +# copy Qt plugins +mkdir package/platforms +cp "${QT_PLATFORM_DLL_PATH}/qwindows.dll" package/platforms/ +cp -rv "${QT_PLATFORM_DLL_PATH}/../mediaservice/" package/ +cp -rv "${QT_PLATFORM_DLL_PATH}/../imageformats/" package/ +rm -f package/mediaservice/*d.dll + +for i in package/*.exe; do + # we need to process pdb here, however, cv2pdb + # does not work here, so we just simply strip all the debug symbols + x86_64-w64-mingw32-strip "${i}" +done + +pip3 install pefile +python3 .ci/scripts/windows/scan_dll.py package/*.exe "package/" +python3 .ci/scripts/windows/scan_dll.py package/imageformats/*.dll "package/" diff --git a/.ci/scripts/windows/exec.sh b/.ci/scripts/windows/exec.sh new file mode 100644 index 0000000000..d6a994856c --- /dev/null +++ b/.ci/scripts/windows/exec.sh @@ -0,0 +1,5 @@ +#!/bin/bash -ex + +mkdir -p "ccache" || true +chmod a+x ./.ci/scripts/windows/docker.sh +docker run -e CCACHE_DIR=/yuzu/ccache -v $(pwd):/yuzu yuzuemu/build-environments:linux-mingw /bin/bash -ex /yuzu/.ci/scripts/windows/docker.sh diff --git a/.ci/scripts/windows/scan_dll.py b/.ci/scripts/windows/scan_dll.py new file mode 100644 index 0000000000..163183f2e3 --- /dev/null +++ b/.ci/scripts/windows/scan_dll.py @@ -0,0 +1,106 @@ +import pefile +import sys +import re +import os +import queue +import shutil + +# constant definitions +KNOWN_SYS_DLLS = ['WINMM.DLL', 'MSVCRT.DLL', 'VERSION.DLL', 'MPR.DLL', + 'DWMAPI.DLL', 'UXTHEME.DLL', 'DNSAPI.DLL', 'IPHLPAPI.DLL'] +# below is for Ubuntu 18.04 with specified PPA enabled, if you are using +# other distro or different repositories, change the following accordingly +DLL_PATH = [ + '/usr/x86_64-w64-mingw32/bin/', + '/usr/x86_64-w64-mingw32/lib/', + '/usr/lib/gcc/x86_64-w64-mingw32/7.3-posix/' +] + +missing = [] + + +def parse_imports(file_name): + results = [] + pe = pefile.PE(file_name, fast_load=True) + pe.parse_data_directories() + + for entry in pe.DIRECTORY_ENTRY_IMPORT: + current = entry.dll.decode() + current_u = current.upper() # b/c Windows is often case insensitive + # here we filter out system dlls + # dll w/ names like *32.dll are likely to be system dlls + if current_u.upper() not in KNOWN_SYS_DLLS and not re.match(string=current_u, pattern=r'.*32\.DLL'): + results.append(current) + + return results + + +def parse_imports_recursive(file_name, path_list=[]): + q = queue.Queue() # create a FIFO queue + # file_name can be a string or a list for the 
convience + if isinstance(file_name, str): + q.put(file_name) + elif isinstance(file_name, list): + for i in file_name: + q.put(i) + full_list = [] + while q.qsize(): + current = q.get_nowait() + print('> %s' % current) + deps = parse_imports(current) + # if this dll does not have any import, ignore it + if not deps: + continue + for dep in deps: + # the dependency already included in the list, skip + if dep in full_list: + continue + # find the requested dll in the provided paths + full_path = find_dll(dep) + if not full_path: + missing.append(dep) + continue + full_list.append(dep) + q.put(full_path) + path_list.append(full_path) + return full_list + + +def find_dll(name): + for path in DLL_PATH: + for root, _, files in os.walk(path): + for f in files: + if name.lower() == f.lower(): + return os.path.join(root, f) + + +def deploy(name, dst, dry_run=False): + dlls_path = [] + parse_imports_recursive(name, dlls_path) + for dll_entry in dlls_path: + if not dry_run: + shutil.copy(dll_entry, dst) + else: + print('[Dry-Run] Copy %s to %s' % (dll_entry, dst)) + print('Deploy completed.') + return dlls_path + + +def main(): + if len(sys.argv) < 3: + print('Usage: %s [files to examine ...] [target deploy directory]') + return 1 + to_deploy = sys.argv[1:-1] + tgt_dir = sys.argv[-1] + if not os.path.isdir(tgt_dir): + print('%s is not a directory.' % tgt_dir) + return 1 + print('Scanning dependencies...') + deploy(to_deploy, tgt_dir) + if missing: + print('Following DLLs are not found: %s' % ('\n'.join(missing))) + return 0 + + +if __name__ == '__main__': + main() diff --git a/.ci/scripts/windows/upload.sh b/.ci/scripts/windows/upload.sh new file mode 100644 index 0000000000..de73d3541a --- /dev/null +++ b/.ci/scripts/windows/upload.sh @@ -0,0 +1,13 @@ +#!/bin/bash -ex + +. .ci/scripts/common/pre-upload.sh + +REV_NAME="yuzu-windows-mingw-${GITDATE}-${GITREV}" +ARCHIVE_NAME="${REV_NAME}.tar.gz" +COMPRESSION_FLAGS="-czvf" + +mkdir "$REV_NAME" +# get around the permission issues +cp -r package/* "$REV_NAME" + +. 
.ci/scripts/common/post-upload.sh diff --git a/.ci/templates/build-single.yml b/.ci/templates/build-single.yml new file mode 100644 index 0000000000..77eeb96b5b --- /dev/null +++ b/.ci/templates/build-single.yml @@ -0,0 +1,21 @@ +parameters: + artifactSource: 'true' + +steps: +- task: DockerInstaller@0 + displayName: 'Prepare Environment' + inputs: + dockerVersion: '17.09.0-ce' +- task: CacheBeta@0 + displayName: 'Cache Build System' + inputs: + key: yuzu-v1-$(BuildName)-$(BuildSuffix)-$(CacheSuffix) + path: $(System.DefaultWorkingDirectory)/ccache + cacheHitVar: CACHE_RESTORED +- script: chmod a+x ./.ci/scripts/$(ScriptFolder)/exec.sh && ./.ci/scripts/$(ScriptFolder)/exec.sh + displayName: 'Build' +- script: chmod a+x ./.ci/scripts/$(ScriptFolder)/upload.sh && ./.ci/scripts/$(ScriptFolder)/upload.sh + displayName: 'Package Artifacts' +- publish: artifacts + artifact: 'yuzu-$(BuildName)-$(BuildSuffix)' + displayName: 'Upload Artifacts' diff --git a/.ci/templates/build-standard.yml b/.ci/templates/build-standard.yml new file mode 100644 index 0000000000..9975f5c49f --- /dev/null +++ b/.ci/templates/build-standard.yml @@ -0,0 +1,22 @@ +jobs: +- job: build + displayName: 'standard' + pool: + vmImage: ubuntu-latest + strategy: + maxParallel: 10 + matrix: + windows: + BuildSuffix: 'windows-mingw' + ScriptFolder: 'windows' + linux: + BuildSuffix: 'linux' + ScriptFolder: 'linux' + steps: + - template: ./sync-source.yml + parameters: + artifactSource: $(parameters.artifactSource) + needSubmodules: 'true' + - template: ./build-single.yml + parameters: + artifactSource: 'false'
\ No newline at end of file diff --git a/.ci/templates/build-testing.yml b/.ci/templates/build-testing.yml new file mode 100644 index 0000000000..101e529963 --- /dev/null +++ b/.ci/templates/build-testing.yml @@ -0,0 +1,30 @@ +jobs: +- job: build_test + displayName: 'testing' + pool: + vmImage: ubuntu-latest + strategy: + maxParallel: 10 + matrix: + windows: + BuildSuffix: 'windows-testing' + ScriptFolder: 'windows' + steps: + - task: PythonScript@0 + condition: eq(variables['Build.Reason'], 'PullRequest') + displayName: 'Determine Testing Status' + inputs: + scriptSource: 'filePath' + scriptPath: '../scripts/merge/check-label-presence.py' + arguments: '$(System.PullRequest.PullRequestNumber) create-testing-build' + - ${{ if eq(variables.enabletesting, 'true') }}: + - template: ./sync-source.yml + parameters: + artifactSource: $(parameters.artifactSource) + needSubmodules: 'true' + - template: ./mergebot.yml + parameters: + matchLabel: 'testing-merge' + - template: ./build-single.yml + parameters: + artifactSource: 'false'
\ No newline at end of file diff --git a/.ci/templates/format-check.yml b/.ci/templates/format-check.yml new file mode 100644 index 0000000000..5061f1cb8e --- /dev/null +++ b/.ci/templates/format-check.yml @@ -0,0 +1,14 @@ +parameters: + artifactSource: 'true' + +steps: +- template: ./sync-source.yml + parameters: + artifactSource: $(parameters.artifactSource) + needSubmodules: 'false' +- task: DockerInstaller@0 + displayName: 'Prepare Environment' + inputs: + dockerVersion: '17.09.0-ce' +- script: chmod a+x ./.ci/scripts/format/exec.sh && ./.ci/scripts/format/exec.sh + displayName: 'Verify Formatting' diff --git a/.ci/templates/merge.yml b/.ci/templates/merge.yml new file mode 100644 index 0000000000..efc82778ae --- /dev/null +++ b/.ci/templates/merge.yml @@ -0,0 +1,46 @@ +jobs: +- job: merge + displayName: 'pull requests' + steps: + - checkout: self + submodules: recursive + - template: ./mergebot.yml + parameters: + matchLabel: '$(BuildName)-merge' + - task: ArchiveFiles@2 + displayName: 'Package Source' + inputs: + rootFolderOrFile: '$(System.DefaultWorkingDirectory)' + includeRootFolder: false + archiveType: '7z' + archiveFile: '$(Build.ArtifactStagingDirectory)/yuzu-$(BuildName)-source.7z' + - task: PublishPipelineArtifact@1 + displayName: 'Upload Artifacts' + inputs: + targetPath: '$(Build.ArtifactStagingDirectory)/yuzu-$(BuildName)-source.7z' + artifact: 'yuzu-$(BuildName)-source' + replaceExistingArchive: true +- job: upload_source + displayName: 'upload' + dependsOn: merge + steps: + - template: ./sync-source.yml + parameters: + artifactSource: 'true' + needSubmodules: 'true' + - script: chmod a+x $(System.DefaultWorkingDirectory)/.ci/scripts/merge/yuzubot-git-config.sh && $(System.DefaultWorkingDirectory)/.ci/scripts/merge/yuzubot-git-config.sh + displayName: 'Apply Git Configuration' + - script: git tag -a $(BuildName)-$(Build.BuildId) -m "yuzu $(BuildName) $(Build.BuildNumber) $(Build.DefinitionName)" + displayName: 'Tag Source' + - script: git remote add other $(GitRepoPushChangesURL) + displayName: 'Register Repository' + - script: git push --follow-tags --force other HEAD:$(GitPushBranch) + displayName: 'Update Code' + - script: git rev-list -n 1 $(BuildName)-$(Build.BuildId) > $(Build.ArtifactStagingDirectory)/tag-commit.sha + displayName: 'Calculate Release Point' + - task: PublishPipelineArtifact@1 + displayName: 'Upload Release Point' + inputs: + targetPath: '$(Build.ArtifactStagingDirectory)/tag-commit.sha' + artifact: 'yuzu-$(BuildName)-release-point' + replaceExistingArchive: true
\ No newline at end of file diff --git a/.ci/templates/mergebot.yml b/.ci/templates/mergebot.yml new file mode 100644 index 0000000000..5211efcc6d --- /dev/null +++ b/.ci/templates/mergebot.yml @@ -0,0 +1,15 @@ +parameters: + matchLabel: 'dummy-merge' + +steps: + - script: mkdir $(System.DefaultWorkingDirectory)/patches && pip install requests urllib3 + displayName: 'Prepare Environment' + - script: chmod a+x $(System.DefaultWorkingDirectory)/.ci/scripts/merge/yuzubot-git-config.sh && $(System.DefaultWorkingDirectory)/.ci/scripts/merge/yuzubot-git-config.sh + displayName: 'Apply Git Configuration' + - task: PythonScript@0 + displayName: 'Discover, Download, and Apply Patches' + inputs: + scriptSource: 'filePath' + scriptPath: '.ci/scripts/merge/apply-patches-by-label.py' + arguments: '${{ parameters.matchLabel }} patches' + workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.ci/templates/release.yml b/.ci/templates/release.yml new file mode 100644 index 0000000000..60bebd2aad --- /dev/null +++ b/.ci/templates/release.yml @@ -0,0 +1,29 @@ +steps: + - task: DownloadPipelineArtifact@2 + displayName: 'Download Windows Release' + inputs: + artifactName: 'yuzu-$(BuildName)-windows-mingw' + buildType: 'current' + targetPath: '$(Build.ArtifactStagingDirectory)' + - task: DownloadPipelineArtifact@2 + displayName: 'Download Linux Release' + inputs: + artifactName: 'yuzu-$(BuildName)-linux' + buildType: 'current' + targetPath: '$(Build.ArtifactStagingDirectory)' + - task: DownloadPipelineArtifact@2 + displayName: 'Download Release Point' + inputs: + artifactName: 'yuzu-$(BuildName)-release-point' + buildType: 'current' + targetPath: '$(Build.ArtifactStagingDirectory)' + - script: echo '##vso[task.setvariable variable=tagcommit]' && cat $(Build.ArtifactStagingDirectory)/tag-commit.sha + displayName: 'Calculate Release Point' + - task: GitHubRelease@0 + inputs: + gitHubConnection: $(GitHubReleaseConnectionName) + repositoryName: '$(GitHubReleaseRepoName)' + action: 'create' + target: $(variables.tagcommit) + title: 'yuzu $(BuildName) #$(Build.BuildId)' + assets: '$(Build.ArtifactStagingDirectory)/*' diff --git a/.ci/templates/retrieve-artifact-source.yml b/.ci/templates/retrieve-artifact-source.yml new file mode 100644 index 0000000000..47d217e7bd --- /dev/null +++ b/.ci/templates/retrieve-artifact-source.yml @@ -0,0 +1,16 @@ +steps: +- checkout: none +- task: DownloadPipelineArtifact@2 + displayName: 'Download Source' + inputs: + artifactName: 'yuzu-$(BuildName)-source' + buildType: 'current' + targetPath: '$(Build.ArtifactStagingDirectory)' +- script: rm -rf $(System.DefaultWorkingDirectory) && mkdir $(System.DefaultWorkingDirectory) + displayName: 'Clean Working Directory' +- task: ExtractFiles@1 + displayName: 'Prepare Source' + inputs: + archiveFilePatterns: '$(Build.ArtifactStagingDirectory)/*.7z' + destinationFolder: '$(System.DefaultWorkingDirectory)' + cleanDestinationFolder: false
\ No newline at end of file diff --git a/.ci/templates/retrieve-master-source.yml b/.ci/templates/retrieve-master-source.yml new file mode 100644 index 0000000000..a08a3f926f --- /dev/null +++ b/.ci/templates/retrieve-master-source.yml @@ -0,0 +1,11 @@ +parameters: + needSubmodules: 'true' + +steps: +- checkout: self + displayName: 'Checkout Recursive' + submodules: recursive +# condition: eq(parameters.needSubmodules, 'true') +#- checkout: self +# displayName: 'Checkout Fast' +# condition: ne(parameters.needSubmodules, 'true') diff --git a/.ci/templates/sync-source.yml b/.ci/templates/sync-source.yml new file mode 100644 index 0000000000..409e1cd834 --- /dev/null +++ b/.ci/templates/sync-source.yml @@ -0,0 +1,7 @@ +steps: +- ${{ if eq(parameters.artifactSource, 'true') }}: + - template: ./retrieve-artifact-source.yml +- ${{ if ne(parameters.artifactSource, 'true') }}: + - template: ./retrieve-master-source.yml + parameters: + needSubmodules: $(parameters.needSubmodules)
\ No newline at end of file diff --git a/.ci/yuzu-mainline.yml b/.ci/yuzu-mainline.yml index aa912913de..164bcb165b 100644 --- a/.ci/yuzu-mainline.yml +++ b/.ci/yuzu-mainline.yml @@ -1,19 +1,23 @@ -# Starter pipeline -# Start with a minimal pipeline that you can customize to build and deploy your code. -# Add steps that build, run tests, deploy, and more: -# https://aka.ms/yaml - trigger: - master -pool: - vmImage: 'ubuntu-latest' - -steps: -- script: echo Hello, world! - displayName: 'Run a one-line script' - -- script: | - echo Add other tasks to build, test, and deploy your project. - echo See https://aka.ms/yaml - displayName: 'Run a multi-line script' +stages: +- stage: merge + displayName: 'merge' + jobs: + - template: ./templates/merge.yml +- stage: format + dependsOn: merge + displayName: 'format' + jobs: + - job: format + displayName: 'clang' + pool: + vmImage: ubuntu-latest + steps: + - template: ./templates/format-check.yml +- stage: build + displayName: 'build' + dependsOn: format + jobs: + - template: ./templates/build-standard.yml diff --git a/.ci/yuzu-verify.yml b/.ci/yuzu-verify.yml new file mode 100644 index 0000000000..d01c1feed0 --- /dev/null +++ b/.ci/yuzu-verify.yml @@ -0,0 +1,18 @@ +stages: +- stage: format + displayName: 'format' + jobs: + - job: format + displayName: 'clang' + pool: + vmImage: ubuntu-latest + steps: + - template: ./templates/format-check.yml + parameters: + artifactSource: 'false' +- stage: build + displayName: 'build' + dependsOn: format + jobs: + - template: ./templates/build-standard.yml + - template: ./templates/build-testing.yml
\ No newline at end of file diff --git a/.ci/yuzu.yml b/.ci/yuzu.yml deleted file mode 100644 index aa912913de..0000000000 --- a/.ci/yuzu.yml +++ /dev/null @@ -1,19 +0,0 @@ -# Starter pipeline -# Start with a minimal pipeline that you can customize to build and deploy your code. -# Add steps that build, run tests, deploy, and more: -# https://aka.ms/yaml - -trigger: -- master - -pool: - vmImage: 'ubuntu-latest' - -steps: -- script: echo Hello, world! - displayName: 'Run a one-line script' - -- script: | - echo Add other tasks to build, test, and deploy your project. - echo See https://aka.ms/yaml - displayName: 'Run a multi-line script' diff --git a/src/core/file_sys/program_metadata.cpp b/src/core/file_sys/program_metadata.cpp index eb76174c5b..7310b36026 100644 --- a/src/core/file_sys/program_metadata.cpp +++ b/src/core/file_sys/program_metadata.cpp @@ -94,6 +94,10 @@ u64 ProgramMetadata::GetFilesystemPermissions() const { return aci_file_access.permissions; } +u32 ProgramMetadata::GetSystemResourceSize() const { + return npdm_header.system_resource_size; +} + const ProgramMetadata::KernelCapabilityDescriptors& ProgramMetadata::GetKernelCapabilities() const { return aci_kernel_capabilities; } diff --git a/src/core/file_sys/program_metadata.h b/src/core/file_sys/program_metadata.h index 43bf2820a9..88ec97d85f 100644 --- a/src/core/file_sys/program_metadata.h +++ b/src/core/file_sys/program_metadata.h @@ -58,6 +58,7 @@ public: u32 GetMainThreadStackSize() const; u64 GetTitleID() const; u64 GetFilesystemPermissions() const; + u32 GetSystemResourceSize() const; const KernelCapabilityDescriptors& GetKernelCapabilities() const; void Print() const; @@ -76,7 +77,8 @@ private: u8 reserved_3; u8 main_thread_priority; u8 main_thread_cpu; - std::array<u8, 8> reserved_4; + std::array<u8, 4> reserved_4; + u32_le system_resource_size; u32_le process_category; u32_le main_stack_size; std::array<u8, 0x10> application_name; diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp index f45ef05f69..db3ab14cef 100644 --- a/src/core/hle/kernel/process.cpp +++ b/src/core/hle/kernel/process.cpp @@ -129,20 +129,17 @@ u64 Process::GetTotalPhysicalMemoryAvailable() const { return vm_manager.GetTotalPhysicalMemoryAvailable(); } -u64 Process::GetTotalPhysicalMemoryAvailableWithoutMmHeap() const { - // TODO: Subtract the personal heap size from this when the - // personal heap is implemented. - return GetTotalPhysicalMemoryAvailable(); +u64 Process::GetTotalPhysicalMemoryAvailableWithoutSystemResource() const { + return GetTotalPhysicalMemoryAvailable() - GetSystemResourceSize(); } u64 Process::GetTotalPhysicalMemoryUsed() const { - return vm_manager.GetCurrentHeapSize() + main_thread_stack_size + code_memory_size; + return vm_manager.GetCurrentHeapSize() + main_thread_stack_size + code_memory_size + + GetSystemResourceUsage(); } -u64 Process::GetTotalPhysicalMemoryUsedWithoutMmHeap() const { - // TODO: Subtract the personal heap size from this when the - // personal heap is implemented. 
- return GetTotalPhysicalMemoryUsed(); +u64 Process::GetTotalPhysicalMemoryUsedWithoutSystemResource() const { + return GetTotalPhysicalMemoryUsed() - GetSystemResourceUsage(); } void Process::RegisterThread(const Thread* thread) { @@ -172,6 +169,7 @@ ResultCode Process::LoadFromMetadata(const FileSys::ProgramMetadata& metadata) { program_id = metadata.GetTitleID(); ideal_core = metadata.GetMainThreadCore(); is_64bit_process = metadata.Is64BitProgram(); + system_resource_size = metadata.GetSystemResourceSize(); vm_manager.Reset(metadata.GetAddressSpaceType()); diff --git a/src/core/hle/kernel/process.h b/src/core/hle/kernel/process.h index 83ea02beec..3196014da3 100644 --- a/src/core/hle/kernel/process.h +++ b/src/core/hle/kernel/process.h @@ -168,8 +168,24 @@ public: return capabilities.GetPriorityMask(); } - u32 IsVirtualMemoryEnabled() const { - return is_virtual_address_memory_enabled; + /// Gets the amount of secure memory to allocate for memory management. + u32 GetSystemResourceSize() const { + return system_resource_size; + } + + /// Gets the amount of secure memory currently in use for memory management. + u32 GetSystemResourceUsage() const { + // On hardware, this returns the amount of system resource memory that has + // been used by the kernel. This is problematic for Yuzu to emulate, because + // system resource memory is used for page tables -- and yuzu doesn't really + // have a way to calculate how much memory is required for page tables for + // the current process at any given time. + // TODO: Is this even worth implementing? Games may retrieve this value via + // an SDK function that gets used + available system resource size for debug + // or diagnostic purposes. However, it seems unlikely that a game would make + // decisions based on how much system memory is dedicated to its page tables. + // Is returning a value other than zero wise? + return 0; } /// Whether this process is an AArch64 or AArch32 process. @@ -196,15 +212,15 @@ public: u64 GetTotalPhysicalMemoryAvailable() const; /// Retrieves the total physical memory available to this process in bytes, - /// without the size of the personal heap added to it. - u64 GetTotalPhysicalMemoryAvailableWithoutMmHeap() const; + /// without the size of the personal system resource heap added to it. + u64 GetTotalPhysicalMemoryAvailableWithoutSystemResource() const; /// Retrieves the total physical memory used by this process in bytes. u64 GetTotalPhysicalMemoryUsed() const; /// Retrieves the total physical memory used by this process in bytes, - /// without the size of the personal heap added to it. - u64 GetTotalPhysicalMemoryUsedWithoutMmHeap() const; + /// without the size of the personal system resource heap added to it. + u64 GetTotalPhysicalMemoryUsedWithoutSystemResource() const; /// Gets the list of all threads created with this process as their owner. const std::list<const Thread*>& GetThreadList() const { @@ -298,12 +314,16 @@ private: /// Title ID corresponding to the process u64 program_id = 0; + /// Specifies additional memory to be reserved for the process's memory management by the + /// system. When this is non-zero, secure memory is allocated and used for page table allocation + /// instead of using the normal global page tables/memory block management. + u32 system_resource_size = 0; + /// Resource limit descriptor for this process SharedPtr<ResourceLimit> resource_limit; /// The ideal CPU core for this process, threads are scheduled on this core by default. 
u8 ideal_core = 0; - u32 is_virtual_address_memory_enabled = 0; /// The Thread Local Storage area is allocated as processes create threads, /// each TLS area is 0x200 bytes, so one page (0x1000) is split up in 8 parts, and each part diff --git a/src/core/hle/kernel/svc.cpp b/src/core/hle/kernel/svc.cpp index 58374f8295..a46eed3daa 100644 --- a/src/core/hle/kernel/svc.cpp +++ b/src/core/hle/kernel/svc.cpp @@ -736,16 +736,16 @@ static ResultCode GetInfo(Core::System& system, u64* result, u64 info_id, u64 ha StackRegionBaseAddr = 14, StackRegionSize = 15, // 3.0.0+ - IsVirtualAddressMemoryEnabled = 16, - PersonalMmHeapUsage = 17, + SystemResourceSize = 16, + SystemResourceUsage = 17, TitleId = 18, // 4.0.0+ PrivilegedProcessId = 19, // 5.0.0+ UserExceptionContextAddr = 20, // 6.0.0+ - TotalPhysicalMemoryAvailableWithoutMmHeap = 21, - TotalPhysicalMemoryUsedWithoutMmHeap = 22, + TotalPhysicalMemoryAvailableWithoutSystemResource = 21, + TotalPhysicalMemoryUsedWithoutSystemResource = 22, }; const auto info_id_type = static_cast<GetInfoType>(info_id); @@ -763,12 +763,12 @@ static ResultCode GetInfo(Core::System& system, u64* result, u64 info_id, u64 ha case GetInfoType::StackRegionSize: case GetInfoType::TotalPhysicalMemoryAvailable: case GetInfoType::TotalPhysicalMemoryUsed: - case GetInfoType::IsVirtualAddressMemoryEnabled: - case GetInfoType::PersonalMmHeapUsage: + case GetInfoType::SystemResourceSize: + case GetInfoType::SystemResourceUsage: case GetInfoType::TitleId: case GetInfoType::UserExceptionContextAddr: - case GetInfoType::TotalPhysicalMemoryAvailableWithoutMmHeap: - case GetInfoType::TotalPhysicalMemoryUsedWithoutMmHeap: { + case GetInfoType::TotalPhysicalMemoryAvailableWithoutSystemResource: + case GetInfoType::TotalPhysicalMemoryUsedWithoutSystemResource: { if (info_sub_id != 0) { return ERR_INVALID_ENUM_VALUE; } @@ -829,8 +829,13 @@ static ResultCode GetInfo(Core::System& system, u64* result, u64 info_id, u64 ha *result = process->GetTotalPhysicalMemoryUsed(); return RESULT_SUCCESS; - case GetInfoType::IsVirtualAddressMemoryEnabled: - *result = process->IsVirtualMemoryEnabled(); + case GetInfoType::SystemResourceSize: + *result = process->GetSystemResourceSize(); + return RESULT_SUCCESS; + + case GetInfoType::SystemResourceUsage: + LOG_WARNING(Kernel_SVC, "(STUBBED) Attempted to query system resource usage"); + *result = process->GetSystemResourceUsage(); return RESULT_SUCCESS; case GetInfoType::TitleId: @@ -843,12 +848,12 @@ static ResultCode GetInfo(Core::System& system, u64* result, u64 info_id, u64 ha *result = 0; return RESULT_SUCCESS; - case GetInfoType::TotalPhysicalMemoryAvailableWithoutMmHeap: - *result = process->GetTotalPhysicalMemoryAvailable(); + case GetInfoType::TotalPhysicalMemoryAvailableWithoutSystemResource: + *result = process->GetTotalPhysicalMemoryAvailableWithoutSystemResource(); return RESULT_SUCCESS; - case GetInfoType::TotalPhysicalMemoryUsedWithoutMmHeap: - *result = process->GetTotalPhysicalMemoryUsedWithoutMmHeap(); + case GetInfoType::TotalPhysicalMemoryUsedWithoutSystemResource: + *result = process->GetTotalPhysicalMemoryUsedWithoutSystemResource(); return RESULT_SUCCESS; default: @@ -953,6 +958,86 @@ static ResultCode GetInfo(Core::System& system, u64* result, u64 info_id, u64 ha } } +/// Maps memory at a desired address +static ResultCode MapPhysicalMemory(Core::System& system, VAddr addr, u64 size) { + LOG_DEBUG(Kernel_SVC, "called, addr=0x{:016X}, size=0x{:X}", addr, size); + + if (!Common::Is4KBAligned(addr)) { + LOG_ERROR(Kernel_SVC, "Address 
is not aligned to 4KB, 0x{:016X}", addr); + return ERR_INVALID_ADDRESS; + } + + if (!Common::Is4KBAligned(size)) { + LOG_ERROR(Kernel_SVC, "Size is not aligned to 4KB, 0x{:X}", size); + return ERR_INVALID_SIZE; + } + + if (size == 0) { + LOG_ERROR(Kernel_SVC, "Size is zero"); + return ERR_INVALID_SIZE; + } + + if (!(addr < addr + size)) { + LOG_ERROR(Kernel_SVC, "Size causes 64-bit overflow of address"); + return ERR_INVALID_MEMORY_RANGE; + } + + Process* const current_process = system.Kernel().CurrentProcess(); + auto& vm_manager = current_process->VMManager(); + + if (current_process->GetSystemResourceSize() == 0) { + LOG_ERROR(Kernel_SVC, "System Resource Size is zero"); + return ERR_INVALID_STATE; + } + + if (!vm_manager.IsWithinMapRegion(addr, size)) { + LOG_ERROR(Kernel_SVC, "Range not within map region"); + return ERR_INVALID_MEMORY_RANGE; + } + + return vm_manager.MapPhysicalMemory(addr, size); +} + +/// Unmaps memory previously mapped via MapPhysicalMemory +static ResultCode UnmapPhysicalMemory(Core::System& system, VAddr addr, u64 size) { + LOG_DEBUG(Kernel_SVC, "called, addr=0x{:016X}, size=0x{:X}", addr, size); + + if (!Common::Is4KBAligned(addr)) { + LOG_ERROR(Kernel_SVC, "Address is not aligned to 4KB, 0x{:016X}", addr); + return ERR_INVALID_ADDRESS; + } + + if (!Common::Is4KBAligned(size)) { + LOG_ERROR(Kernel_SVC, "Size is not aligned to 4KB, 0x{:X}", size); + return ERR_INVALID_SIZE; + } + + if (size == 0) { + LOG_ERROR(Kernel_SVC, "Size is zero"); + return ERR_INVALID_SIZE; + } + + if (!(addr < addr + size)) { + LOG_ERROR(Kernel_SVC, "Size causes 64-bit overflow of address"); + return ERR_INVALID_MEMORY_RANGE; + } + + Process* const current_process = system.Kernel().CurrentProcess(); + auto& vm_manager = current_process->VMManager(); + + if (current_process->GetSystemResourceSize() == 0) { + LOG_ERROR(Kernel_SVC, "System Resource Size is zero"); + return ERR_INVALID_STATE; + } + + if (!vm_manager.IsWithinMapRegion(addr, size)) { + LOG_ERROR(Kernel_SVC, "Range not within map region"); + return ERR_INVALID_MEMORY_RANGE; + } + + return vm_manager.UnmapPhysicalMemory(addr, size); +} + /// Sets the thread activity static ResultCode SetThreadActivity(Core::System& system, Handle handle, u32 activity) { LOG_DEBUG(Kernel_SVC, "called, handle=0x{:08X}, activity=0x{:08X}", handle, activity); @@ -2310,8 +2395,8 @@ static const FunctionDef SVC_Table[] = { {0x29, SvcWrap<GetInfo>, "GetInfo"}, {0x2A, nullptr, "FlushEntireDataCache"}, {0x2B, nullptr, "FlushDataCache"}, - {0x2C, nullptr, "MapPhysicalMemory"}, - {0x2D, nullptr, "UnmapPhysicalMemory"}, + {0x2C, SvcWrap<MapPhysicalMemory>, "MapPhysicalMemory"}, + {0x2D, SvcWrap<UnmapPhysicalMemory>, "UnmapPhysicalMemory"}, {0x2E, nullptr, "GetFutureThreadInfo"}, {0x2F, nullptr, "GetLastThreadInfo"}, {0x30, SvcWrap<GetResourceLimitLimitValue>, "GetResourceLimitLimitValue"}, diff --git a/src/core/hle/kernel/svc_wrap.h b/src/core/hle/kernel/svc_wrap.h index 865473c6fa..c2d8d0dc30 100644 --- a/src/core/hle/kernel/svc_wrap.h +++ b/src/core/hle/kernel/svc_wrap.h @@ -32,6 +32,11 @@ void SvcWrap(Core::System& system) { FuncReturn(system, func(system, Param(system, 0)).raw); } +template <ResultCode func(Core::System&, u64, u64)> +void SvcWrap(Core::System& system) { + FuncReturn(system, func(system, Param(system, 0), Param(system, 1)).raw); +} + template <ResultCode func(Core::System&, u32)> void SvcWrap(Core::System& system) { FuncReturn(system, func(system, static_cast<u32>(Param(system, 0))).raw); diff --git a/src/core/hle/kernel/vm_manager.cpp 
b/src/core/hle/kernel/vm_manager.cpp index 7bc925a5f7..4f45fb03b3 100644 --- a/src/core/hle/kernel/vm_manager.cpp +++ b/src/core/hle/kernel/vm_manager.cpp @@ -11,6 +11,8 @@ #include "core/core.h" #include "core/file_sys/program_metadata.h" #include "core/hle/kernel/errors.h" +#include "core/hle/kernel/process.h" +#include "core/hle/kernel/resource_limit.h" #include "core/hle/kernel/vm_manager.h" #include "core/memory.h" #include "core/memory_setup.h" @@ -48,10 +50,14 @@ bool VirtualMemoryArea::CanBeMergedWith(const VirtualMemoryArea& next) const { type != next.type) { return false; } - if (type == VMAType::AllocatedMemoryBlock && - (backing_block != next.backing_block || offset + size != next.offset)) { + if ((attribute & MemoryAttribute::DeviceMapped) == MemoryAttribute::DeviceMapped) { + // TODO: Can device mapped memory be merged sanely? + // Not merging it may cause inaccuracies versus hardware when memory layout is queried. return false; } + if (type == VMAType::AllocatedMemoryBlock) { + return true; + } if (type == VMAType::BackingMemory && backing_memory + size != next.backing_memory) { return false; } @@ -99,7 +105,7 @@ bool VMManager::IsValidHandle(VMAHandle handle) const { ResultVal<VMManager::VMAHandle> VMManager::MapMemoryBlock(VAddr target, std::shared_ptr<std::vector<u8>> block, std::size_t offset, u64 size, - MemoryState state) { + MemoryState state, VMAPermission perm) { ASSERT(block != nullptr); ASSERT(offset + size <= block->size()); @@ -109,7 +115,7 @@ ResultVal<VMManager::VMAHandle> VMManager::MapMemoryBlock(VAddr target, ASSERT(final_vma.size == size); final_vma.type = VMAType::AllocatedMemoryBlock; - final_vma.permissions = VMAPermission::ReadWrite; + final_vma.permissions = perm; final_vma.state = state; final_vma.backing_block = std::move(block); final_vma.offset = offset; @@ -288,6 +294,166 @@ ResultVal<VAddr> VMManager::SetHeapSize(u64 size) { return MakeResult<VAddr>(heap_region_base); } +ResultCode VMManager::MapPhysicalMemory(VAddr target, u64 size) { + const auto end_addr = target + size; + const auto last_addr = end_addr - 1; + VAddr cur_addr = target; + + ResultCode result = RESULT_SUCCESS; + + // Check how much memory we've already mapped. + const auto mapped_size_result = SizeOfAllocatedVMAsInRange(target, size); + if (mapped_size_result.Failed()) { + return mapped_size_result.Code(); + } + + // If we've already mapped the desired amount, return early. + const std::size_t mapped_size = *mapped_size_result; + if (mapped_size == size) { + return RESULT_SUCCESS; + } + + // Check that we can map the memory we want. + const auto res_limit = system.CurrentProcess()->GetResourceLimit(); + const u64 physmem_remaining = res_limit->GetMaxResourceValue(ResourceType::PhysicalMemory) - + res_limit->GetCurrentResourceValue(ResourceType::PhysicalMemory); + if (physmem_remaining < (size - mapped_size)) { + return ERR_RESOURCE_LIMIT_EXCEEDED; + } + + // Keep track of the memory regions we unmap. + std::vector<std::pair<u64, u64>> mapped_regions; + + // Iterate, trying to map memory. 
+ { + cur_addr = target; + + auto iter = FindVMA(target); + ASSERT_MSG(iter != vma_map.end(), "MapPhysicalMemory iter != end"); + + while (true) { + const auto& vma = iter->second; + const auto vma_start = vma.base; + const auto vma_end = vma_start + vma.size; + const auto vma_last = vma_end - 1; + + // Map the memory block + const auto map_size = std::min(end_addr - cur_addr, vma_end - cur_addr); + if (vma.state == MemoryState::Unmapped) { + const auto map_res = + MapMemoryBlock(cur_addr, std::make_shared<std::vector<u8>>(map_size, 0), 0, + map_size, MemoryState::Heap, VMAPermission::ReadWrite); + result = map_res.Code(); + if (result.IsError()) { + break; + } + + mapped_regions.emplace_back(cur_addr, map_size); + } + + // Break once we hit the end of the range. + if (last_addr <= vma_last) { + break; + } + + // Advance to the next block. + cur_addr = vma_end; + iter = FindVMA(cur_addr); + ASSERT_MSG(iter != vma_map.end(), "MapPhysicalMemory iter != end"); + } + } + + // If we failed, unmap memory. + if (result.IsError()) { + for (const auto [unmap_address, unmap_size] : mapped_regions) { + ASSERT_MSG(UnmapRange(unmap_address, unmap_size).IsSuccess(), + "MapPhysicalMemory un-map on error"); + } + + return result; + } + + // Update amount of mapped physical memory. + physical_memory_mapped += size - mapped_size; + + return RESULT_SUCCESS; +} + +ResultCode VMManager::UnmapPhysicalMemory(VAddr target, u64 size) { + const auto end_addr = target + size; + const auto last_addr = end_addr - 1; + VAddr cur_addr = target; + + ResultCode result = RESULT_SUCCESS; + + // Check how much memory is currently mapped. + const auto mapped_size_result = SizeOfUnmappablePhysicalMemoryInRange(target, size); + if (mapped_size_result.Failed()) { + return mapped_size_result.Code(); + } + + // If we've already unmapped all the memory, return early. + const std::size_t mapped_size = *mapped_size_result; + if (mapped_size == 0) { + return RESULT_SUCCESS; + } + + // Keep track of the memory regions we unmap. + std::vector<std::pair<u64, u64>> unmapped_regions; + + // Try to unmap regions. + { + cur_addr = target; + + auto iter = FindVMA(target); + ASSERT_MSG(iter != vma_map.end(), "UnmapPhysicalMemory iter != end"); + + while (true) { + const auto& vma = iter->second; + const auto vma_start = vma.base; + const auto vma_end = vma_start + vma.size; + const auto vma_last = vma_end - 1; + + // Unmap the memory block + const auto unmap_size = std::min(end_addr - cur_addr, vma_end - cur_addr); + if (vma.state == MemoryState::Heap) { + result = UnmapRange(cur_addr, unmap_size); + if (result.IsError()) { + break; + } + + unmapped_regions.emplace_back(cur_addr, unmap_size); + } + + // Break once we hit the end of the range. + if (last_addr <= vma_last) { + break; + } + + // Advance to the next block. + cur_addr = vma_end; + iter = FindVMA(cur_addr); + ASSERT_MSG(iter != vma_map.end(), "UnmapPhysicalMemory iter != end"); + } + } + + // If we failed, re-map regions. + // TODO: Preserve memory contents? 
+ if (result.IsError()) { + for (const auto [map_address, map_size] : unmapped_regions) { + const auto remap_res = + MapMemoryBlock(map_address, std::make_shared<std::vector<u8>>(map_size, 0), 0, + map_size, MemoryState::Heap, VMAPermission::None); + ASSERT_MSG(remap_res.Succeeded(), "UnmapPhysicalMemory re-map on error"); + } + } + + // Update mapped amount + physical_memory_mapped -= mapped_size; + + return RESULT_SUCCESS; +} + ResultCode VMManager::MapCodeMemory(VAddr dst_address, VAddr src_address, u64 size) { constexpr auto ignore_attribute = MemoryAttribute::LockedForIPC | MemoryAttribute::DeviceMapped; const auto src_check_result = CheckRangeState( @@ -435,7 +601,7 @@ ResultCode VMManager::MirrorMemory(VAddr dst_addr, VAddr src_addr, u64 size, Mem // Protect mirror with permissions from old region Reprotect(new_vma, vma->second.permissions); // Remove permissions from old region - Reprotect(vma, VMAPermission::None); + ReprotectRange(src_addr, size, VMAPermission::None); return RESULT_SUCCESS; } @@ -568,14 +734,14 @@ VMManager::VMAIter VMManager::SplitVMA(VMAIter vma_handle, u64 offset_in_vma) { VMManager::VMAIter VMManager::MergeAdjacent(VMAIter iter) { const VMAIter next_vma = std::next(iter); if (next_vma != vma_map.end() && iter->second.CanBeMergedWith(next_vma->second)) { - iter->second.size += next_vma->second.size; + MergeAdjacentVMA(iter->second, next_vma->second); vma_map.erase(next_vma); } if (iter != vma_map.begin()) { VMAIter prev_vma = std::prev(iter); if (prev_vma->second.CanBeMergedWith(iter->second)) { - prev_vma->second.size += iter->second.size; + MergeAdjacentVMA(prev_vma->second, iter->second); vma_map.erase(iter); iter = prev_vma; } @@ -584,6 +750,38 @@ VMManager::VMAIter VMManager::MergeAdjacent(VMAIter iter) { return iter; } +void VMManager::MergeAdjacentVMA(VirtualMemoryArea& left, const VirtualMemoryArea& right) { + ASSERT(left.CanBeMergedWith(right)); + + // Always merge allocated memory blocks, even when they don't share the same backing block. + if (left.type == VMAType::AllocatedMemoryBlock && + (left.backing_block != right.backing_block || left.offset + left.size != right.offset)) { + // Check if we can save work. + if (left.offset == 0 && left.size == left.backing_block->size()) { + // Fast case: left is an entire backing block. + left.backing_block->insert(left.backing_block->end(), + right.backing_block->begin() + right.offset, + right.backing_block->begin() + right.offset + right.size); + } else { + // Slow case: make a new memory block for left and right. + auto new_memory = std::make_shared<std::vector<u8>>(); + new_memory->insert(new_memory->end(), left.backing_block->begin() + left.offset, + left.backing_block->begin() + left.offset + left.size); + new_memory->insert(new_memory->end(), right.backing_block->begin() + right.offset, + right.backing_block->begin() + right.offset + right.size); + left.backing_block = new_memory; + left.offset = 0; + } + + // Page table update is needed, because backing memory changed. + left.size += right.size; + UpdatePageTableForVMA(left); + } else { + // Just update the size. 
+ left.size += right.size; + } +} + void VMManager::UpdatePageTableForVMA(const VirtualMemoryArea& vma) { switch (vma.type) { case VMAType::Free: @@ -758,6 +956,84 @@ VMManager::CheckResults VMManager::CheckRangeState(VAddr address, u64 size, Memo std::make_tuple(initial_state, initial_permissions, initial_attributes & ~ignore_mask)); } +ResultVal<std::size_t> VMManager::SizeOfAllocatedVMAsInRange(VAddr address, + std::size_t size) const { + const VAddr end_addr = address + size; + const VAddr last_addr = end_addr - 1; + std::size_t mapped_size = 0; + + VAddr cur_addr = address; + auto iter = FindVMA(cur_addr); + ASSERT_MSG(iter != vma_map.end(), "SizeOfAllocatedVMAsInRange iter != end"); + + while (true) { + const auto& vma = iter->second; + const VAddr vma_start = vma.base; + const VAddr vma_end = vma_start + vma.size; + const VAddr vma_last = vma_end - 1; + + // Add size if relevant. + if (vma.state != MemoryState::Unmapped) { + mapped_size += std::min(end_addr - cur_addr, vma_end - cur_addr); + } + + // Break once we hit the end of the range. + if (last_addr <= vma_last) { + break; + } + + // Advance to the next block. + cur_addr = vma_end; + iter = std::next(iter); + ASSERT_MSG(iter != vma_map.end(), "SizeOfAllocatedVMAsInRange iter != end"); + } + + return MakeResult(mapped_size); +} + +ResultVal<std::size_t> VMManager::SizeOfUnmappablePhysicalMemoryInRange(VAddr address, + std::size_t size) const { + const VAddr end_addr = address + size; + const VAddr last_addr = end_addr - 1; + std::size_t mapped_size = 0; + + VAddr cur_addr = address; + auto iter = FindVMA(cur_addr); + ASSERT_MSG(iter != vma_map.end(), "SizeOfUnmappablePhysicalMemoryInRange iter != end"); + + while (true) { + const auto& vma = iter->second; + const auto vma_start = vma.base; + const auto vma_end = vma_start + vma.size; + const auto vma_last = vma_end - 1; + const auto state = vma.state; + const auto attr = vma.attribute; + + // Memory within region must be free or mapped heap. + if (!((state == MemoryState::Heap && attr == MemoryAttribute::None) || + (state == MemoryState::Unmapped))) { + return ERR_INVALID_ADDRESS_STATE; + } + + // Add size if relevant. + if (state != MemoryState::Unmapped) { + mapped_size += std::min(end_addr - cur_addr, vma_end - cur_addr); + } + + // Break once we hit the end of the range. + if (last_addr <= vma_last) { + break; + } + + // Advance to the next block. + cur_addr = vma_end; + iter = std::next(iter); + ASSERT_MSG(iter != vma_map.end(), "SizeOfUnmappablePhysicalMemoryInRange iter != end"); + } + + return MakeResult(mapped_size); +} + u64 VMManager::GetTotalPhysicalMemoryAvailable() const { LOG_WARNING(Kernel, "(STUBBED) called"); return 0xF8000000; diff --git a/src/core/hle/kernel/vm_manager.h b/src/core/hle/kernel/vm_manager.h index 9fe6ac3f46..0aecb74991 100644 --- a/src/core/hle/kernel/vm_manager.h +++ b/src/core/hle/kernel/vm_manager.h @@ -349,7 +349,8 @@ public: * @param state MemoryState tag to attach to the VMA. */ ResultVal<VMAHandle> MapMemoryBlock(VAddr target, std::shared_ptr<std::vector<u8>> block, - std::size_t offset, u64 size, MemoryState state); + std::size_t offset, u64 size, MemoryState state, + VMAPermission perm = VMAPermission::ReadWrite); /** * Maps an unmanaged host memory pointer at a given address. @@ -450,6 +451,34 @@ public: /// ResultVal<VAddr> SetHeapSize(u64 size); + /// Maps memory at a given address. + /// + /// @param addr The virtual address to map memory at. + /// @param size The amount of memory to map. 
+ /// + /// @note The destination address must lie within the Map region. + /// + /// @note This function requires that SystemResourceSize be non-zero, + /// however, this is just because if it were not then the + /// resulting page tables could be exploited on hardware by + /// a malicious program. SystemResource usage does not need + /// to be explicitly checked or updated here. + ResultCode MapPhysicalMemory(VAddr target, u64 size); + + /// Unmaps memory at a given address. + /// + /// @param addr The virtual address to unmap memory at. + /// @param size The amount of memory to unmap. + /// + /// @note The destination address must lie within the Map region. + /// + /// @note This function requires that SystemResourceSize be non-zero, + /// however, this is just because if it were not then the + /// resulting page tables could be exploited on hardware by + /// a malicious program. SystemResource usage does not need + /// to be explicitly checked or updated here. + ResultCode UnmapPhysicalMemory(VAddr target, u64 size); + /// Maps a region of memory as code memory. /// /// @param dst_address The base address of the region to create the aliasing memory region. @@ -657,6 +686,11 @@ private: */ VMAIter MergeAdjacent(VMAIter vma); + /** + * Merges two adjacent VMAs. + */ + void MergeAdjacentVMA(VirtualMemoryArea& left, const VirtualMemoryArea& right); + /// Updates the pages corresponding to this VMA so they match the VMA's attributes. void UpdatePageTableForVMA(const VirtualMemoryArea& vma); @@ -701,6 +735,13 @@ private: MemoryAttribute attribute_mask, MemoryAttribute attribute, MemoryAttribute ignore_mask) const; + /// Gets the amount of memory currently mapped (state != Unmapped) in a range. + ResultVal<std::size_t> SizeOfAllocatedVMAsInRange(VAddr address, std::size_t size) const; + + /// Gets the amount of memory unmappable by UnmapPhysicalMemory in a range. + ResultVal<std::size_t> SizeOfUnmappablePhysicalMemoryInRange(VAddr address, + std::size_t size) const; + /** * A map covering the entirety of the managed address space, keyed by the `base` field of each * VMA. It must always be modified by splitting or merging VMAs, so that the invariant @@ -742,6 +783,11 @@ private: // end of the range. This is essentially 'base_address + current_size'. VAddr heap_end = 0; + // The current amount of memory mapped via MapPhysicalMemory. + // This is used here (and in Nintendo's kernel) only for debugging, and does not impact + // any behavior. + u64 physical_memory_mapped = 0; + Core::System& system; }; } // namespace Kernel diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index cd32c65d3b..7c18c27b36 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -1,4 +1,5 @@ add_library(video_core STATIC + buffer_cache.h dma_pusher.cpp dma_pusher.h debug_utils/debug_utils.cpp @@ -43,8 +44,6 @@ add_library(video_core STATIC renderer_opengl/gl_device.h renderer_opengl/gl_framebuffer_cache.cpp renderer_opengl/gl_framebuffer_cache.h - renderer_opengl/gl_global_cache.cpp - renderer_opengl/gl_global_cache.h renderer_opengl/gl_rasterizer.cpp renderer_opengl/gl_rasterizer.h renderer_opengl/gl_resource_manager.cpp diff --git a/src/video_core/buffer_cache.h b/src/video_core/buffer_cache.h new file mode 100644 index 0000000000..6f868b8b4e --- /dev/null +++ b/src/video_core/buffer_cache.h @@ -0,0 +1,299 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <array> +#include <memory> +#include <mutex> +#include <unordered_map> +#include <unordered_set> +#include <utility> +#include <vector> + +#include "common/alignment.h" +#include "common/common_types.h" +#include "core/core.h" +#include "video_core/memory_manager.h" +#include "video_core/rasterizer_cache.h" + +namespace VideoCore { +class RasterizerInterface; +} + +namespace VideoCommon { + +template <typename BufferStorageType> +class CachedBuffer final : public RasterizerCacheObject { +public: + explicit CachedBuffer(VAddr cpu_addr, u8* host_ptr) + : RasterizerCacheObject{host_ptr}, host_ptr{host_ptr}, cpu_addr{cpu_addr} {} + ~CachedBuffer() override = default; + + VAddr GetCpuAddr() const override { + return cpu_addr; + } + + std::size_t GetSizeInBytes() const override { + return size; + } + + u8* GetWritableHostPtr() const { + return host_ptr; + } + + std::size_t GetSize() const { + return size; + } + + std::size_t GetCapacity() const { + return capacity; + } + + bool IsInternalized() const { + return is_internal; + } + + const BufferStorageType& GetBuffer() const { + return buffer; + } + + void SetSize(std::size_t new_size) { + size = new_size; + } + + void SetInternalState(bool is_internal_) { + is_internal = is_internal_; + } + + BufferStorageType ExchangeBuffer(BufferStorageType buffer_, std::size_t new_capacity) { + capacity = new_capacity; + std::swap(buffer, buffer_); + return buffer_; + } + +private: + u8* host_ptr{}; + VAddr cpu_addr{}; + std::size_t size{}; + std::size_t capacity{}; + bool is_internal{}; + BufferStorageType buffer; +}; + +template <typename BufferStorageType, typename BufferType, typename StreamBuffer> +class BufferCache : public RasterizerCache<std::shared_ptr<CachedBuffer<BufferStorageType>>> { +public: + using Buffer = std::shared_ptr<CachedBuffer<BufferStorageType>>; + using BufferInfo = std::pair<const BufferType*, u64>; + + explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, + std::unique_ptr<StreamBuffer> stream_buffer) + : RasterizerCache<Buffer>{rasterizer}, system{system}, + stream_buffer{std::move(stream_buffer)}, stream_buffer_handle{ + this->stream_buffer->GetHandle()} {} + ~BufferCache() = default; + + void Unregister(const Buffer& entry) override { + std::lock_guard lock{RasterizerCache<Buffer>::mutex}; + if (entry->IsInternalized()) { + internalized_entries.erase(entry->GetCacheAddr()); + } + ReserveBuffer(entry); + RasterizerCache<Buffer>::Unregister(entry); + } + + void TickFrame() { + marked_for_destruction_index = + (marked_for_destruction_index + 1) % marked_for_destruction_ring_buffer.size(); + MarkedForDestruction().clear(); + } + + BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, + bool internalize = false, bool is_written = false) { + std::lock_guard lock{RasterizerCache<Buffer>::mutex}; + + auto& memory_manager = system.GPU().MemoryManager(); + const auto host_ptr = memory_manager.GetPointer(gpu_addr); + if (!host_ptr) { + return {GetEmptyBuffer(size), 0}; + } + const auto cache_addr = ToCacheAddr(host_ptr); + + // Cache management is a big overhead, so only cache entries with a given size. + // TODO: Figure out which size is the best for given games. 
+ constexpr std::size_t max_stream_size = 0x800; + if (!internalize && size < max_stream_size && + internalized_entries.find(cache_addr) == internalized_entries.end()) { + return StreamBufferUpload(host_ptr, size, alignment); + } + + auto entry = RasterizerCache<Buffer>::TryGet(cache_addr); + if (!entry) { + return FixedBufferUpload(gpu_addr, host_ptr, size, internalize, is_written); + } + + if (entry->GetSize() < size) { + IncreaseBufferSize(entry, size); + } + if (is_written) { + entry->MarkAsModified(true, *this); + } + return {ToHandle(entry->GetBuffer()), 0}; + } + + /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset. + BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size, + std::size_t alignment = 4) { + std::lock_guard lock{RasterizerCache<Buffer>::mutex}; + return StreamBufferUpload(raw_pointer, size, alignment); + } + + void Map(std::size_t max_size) { + std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4); + buffer_offset = buffer_offset_base; + } + + /// Finishes the upload stream, returns true on bindings invalidation. + bool Unmap() { + stream_buffer->Unmap(buffer_offset - buffer_offset_base); + return std::exchange(invalidated, false); + } + + virtual const BufferType* GetEmptyBuffer(std::size_t size) = 0; + +protected: + void FlushObjectInner(const Buffer& entry) override { + DownloadBufferData(entry->GetBuffer(), 0, entry->GetSize(), entry->GetWritableHostPtr()); + } + + virtual BufferStorageType CreateBuffer(std::size_t size) = 0; + + virtual const BufferType* ToHandle(const BufferStorageType& storage) = 0; + + virtual void UploadBufferData(const BufferStorageType& buffer, std::size_t offset, + std::size_t size, const u8* data) = 0; + + virtual void DownloadBufferData(const BufferStorageType& buffer, std::size_t offset, + std::size_t size, u8* data) = 0; + + virtual void CopyBufferData(const BufferStorageType& src, const BufferStorageType& dst, + std::size_t src_offset, std::size_t dst_offset, + std::size_t size) = 0; + +private: + BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size, + std::size_t alignment) { + AlignBuffer(alignment); + const std::size_t uploaded_offset = buffer_offset; + std::memcpy(buffer_ptr, raw_pointer, size); + + buffer_ptr += size; + buffer_offset += size; + return {&stream_buffer_handle, uploaded_offset}; + } + + BufferInfo FixedBufferUpload(GPUVAddr gpu_addr, u8* host_ptr, std::size_t size, + bool internalize, bool is_written) { + auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager(); + const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr); + ASSERT(cpu_addr); + + auto entry = GetUncachedBuffer(*cpu_addr, host_ptr); + entry->SetSize(size); + entry->SetInternalState(internalize); + RasterizerCache<Buffer>::Register(entry); + + if (internalize) { + internalized_entries.emplace(ToCacheAddr(host_ptr)); + } + if (is_written) { + entry->MarkAsModified(true, *this); + } + + if (entry->GetCapacity() < size) { + MarkedForDestruction().push_back(entry->ExchangeBuffer(CreateBuffer(size), size)); + } + + UploadBufferData(entry->GetBuffer(), 0, size, host_ptr); + return {ToHandle(entry->GetBuffer()), 0}; + } + + void IncreaseBufferSize(Buffer& entry, std::size_t new_size) { + const std::size_t old_size = entry->GetSize(); + if (entry->GetCapacity() < new_size) { + const auto& old_buffer = entry->GetBuffer(); + auto new_buffer = CreateBuffer(new_size); + + // Copy bits from the old buffer to the new buffer. 
+ CopyBufferData(old_buffer, new_buffer, 0, 0, old_size); + MarkedForDestruction().push_back( + entry->ExchangeBuffer(std::move(new_buffer), new_size)); + + // This buffer could have been used + invalidated = true; + } + // Upload the new bits. + const std::size_t size_diff = new_size - old_size; + UploadBufferData(entry->GetBuffer(), old_size, size_diff, entry->GetHostPtr() + old_size); + + // Update entry's size in the object and in the cache. + Unregister(entry); + + entry->SetSize(new_size); + RasterizerCache<Buffer>::Register(entry); + } + + Buffer GetUncachedBuffer(VAddr cpu_addr, u8* host_ptr) { + if (auto entry = TryGetReservedBuffer(host_ptr)) { + return entry; + } + return std::make_shared<CachedBuffer<BufferStorageType>>(cpu_addr, host_ptr); + } + + Buffer TryGetReservedBuffer(u8* host_ptr) { + const auto it = buffer_reserve.find(ToCacheAddr(host_ptr)); + if (it == buffer_reserve.end()) { + return {}; + } + auto& reserve = it->second; + auto entry = reserve.back(); + reserve.pop_back(); + return entry; + } + + void ReserveBuffer(Buffer entry) { + buffer_reserve[entry->GetCacheAddr()].push_back(std::move(entry)); + } + + void AlignBuffer(std::size_t alignment) { + // Align the offset, not the mapped pointer + const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment); + buffer_ptr += offset_aligned - buffer_offset; + buffer_offset = offset_aligned; + } + + std::vector<BufferStorageType>& MarkedForDestruction() { + return marked_for_destruction_ring_buffer[marked_for_destruction_index]; + } + + Core::System& system; + + std::unique_ptr<StreamBuffer> stream_buffer; + BufferType stream_buffer_handle{}; + + bool invalidated = false; + + u8* buffer_ptr = nullptr; + u64 buffer_offset = 0; + u64 buffer_offset_base = 0; + + std::size_t marked_for_destruction_index = 0; + std::array<std::vector<BufferStorageType>, 4> marked_for_destruction_ring_buffer; + + std::unordered_set<CacheAddr> internalized_entries; + std::unordered_map<CacheAddr, std::vector<Buffer>> buffer_reserve; +}; + +} // namespace VideoCommon diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 13e3149444..8d15c8a482 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -67,6 +67,7 @@ public: static constexpr std::size_t MaxShaderStage = 5; // Maximum number of const buffers per shader stage. static constexpr std::size_t MaxConstBuffers = 18; + static constexpr std::size_t MaxConstBufferSize = 0x10000; enum class QueryMode : u32 { Write = 0, diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index c3055602b9..79d469b88b 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -78,7 +78,7 @@ union Attribute { constexpr explicit Attribute(u64 value) : value(value) {} enum class Index : u64 { - PointSize = 6, + LayerViewportPointSize = 6, Position = 7, Attribute_0 = 8, Attribute_31 = 39, @@ -1278,6 +1278,7 @@ union Instruction { union { BitField<49, 1, u64> nodep_flag; BitField<53, 4, u64> texture_info; + BitField<59, 1, u64> fp32_flag; TextureType GetTextureType() const { // The TLDS instruction has a weird encoding for the texture type. 
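(Aside on the INST patterns in this table; illustration only.) The hunk above adds the fp32_flag bit to TLDS, and the next hunk widens its opcode pattern from "1101101---------" to "1101-01---------", where '-' is a don't-care bit; if the patterns are top-aligned, the relaxed bit lines up with the new fp32_flag at bit 59. A minimal sketch of how such a pattern can be reduced to a mask/expected pair, assuming for the sake of the example that the 16-character string describes the topmost 16 bits of the 64-bit opcode (OpcodeMatcher is a hypothetical name, not the actual decoder in shader_bytecode.h):

#include <cstdint>
#include <string_view>

struct OpcodeMatcher {
    std::uint16_t mask{};
    std::uint16_t expected{};

    constexpr explicit OpcodeMatcher(std::string_view pattern) {
        for (const char bit : pattern) {
            mask <<= 1;
            expected <<= 1;
            if (bit == '0' || bit == '1') {
                mask |= 1;
                expected |= (bit == '1') ? 1 : 0;
            }
        }
    }

    // An opcode matches when every non-wildcard bit agrees.
    constexpr bool Matches(std::uint16_t high_bits) const {
        return (high_bits & mask) == expected;
    }
};

// Relaxing the fifth pattern character lets both TLDS encodings match one entry.
static_assert(OpcodeMatcher{"1101-01---------"}.Matches(0b1101'1010'0000'0000));
static_assert(OpcodeMatcher{"1101-01---------"}.Matches(0b1101'0010'0000'0000));
static_assert(!OpcodeMatcher{"1101101---------"}.Matches(0b1101'0010'0000'0000));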
@@ -1776,7 +1777,7 @@ private: INST("1101111101010---", Id::TXQ_B, Type::Texture, "TXQ_B"), INST("1101-00---------", Id::TEXS, Type::Texture, "TEXS"), INST("11011100--11----", Id::TLD, Type::Texture, "TLD"), - INST("1101101---------", Id::TLDS, Type::Texture, "TLDS"), + INST("1101-01---------", Id::TLDS, Type::Texture, "TLDS"), INST("110010----111---", Id::TLD4, Type::Texture, "TLD4"), INST("1101111100------", Id::TLD4S, Type::Texture, "TLD4S"), INST("110111110110----", Id::TMML_B, Type::Texture, "TMML_B"), diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 52706505b0..1b4975498c 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -31,7 +31,7 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) { GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{renderer} { auto& rasterizer{renderer.Rasterizer()}; - memory_manager = std::make_unique<Tegra::MemoryManager>(rasterizer); + memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer); dma_pusher = std::make_unique<Tegra::DmaPusher>(*this); maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager); fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer, *memory_manager); diff --git a/src/video_core/macro_interpreter.cpp b/src/video_core/macro_interpreter.cpp index c766ed692b..9f59a2dc1f 100644 --- a/src/video_core/macro_interpreter.cpp +++ b/src/video_core/macro_interpreter.cpp @@ -4,14 +4,18 @@ #include "common/assert.h" #include "common/logging/log.h" +#include "common/microprofile.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/macro_interpreter.h" +MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192)); + namespace Tegra { MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} void MacroInterpreter::Execute(u32 offset, std::vector<u32> parameters) { + MICROPROFILE_SCOPE(MacroInterp); Reset(); registers[1] = parameters[0]; this->parameters = std::move(parameters); diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 3224531162..bffae940c0 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -5,13 +5,17 @@ #include "common/alignment.h" #include "common/assert.h" #include "common/logging/log.h" +#include "core/core.h" +#include "core/hle/kernel/process.h" +#include "core/hle/kernel/vm_manager.h" #include "core/memory.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" namespace Tegra { -MemoryManager::MemoryManager(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} { +MemoryManager::MemoryManager(Core::System& system, VideoCore::RasterizerInterface& rasterizer) + : rasterizer{rasterizer}, system{system} { std::fill(page_table.pointers.begin(), page_table.pointers.end(), nullptr); std::fill(page_table.attributes.begin(), page_table.attributes.end(), Common::PageType::Unmapped); @@ -49,6 +53,11 @@ GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, u64 size) { const GPUVAddr gpu_addr{FindFreeRegion(address_space_base, aligned_size)}; MapBackingMemory(gpu_addr, Memory::GetPointer(cpu_addr), aligned_size, cpu_addr); + ASSERT(system.CurrentProcess() + ->VMManager() + .SetMemoryAttribute(cpu_addr, size, Kernel::MemoryAttribute::DeviceMapped, + Kernel::MemoryAttribute::DeviceMapped) + .IsSuccess()); return gpu_addr; } @@ -59,7 +68,11 @@ GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, GPUVAddr gpu_addr, u64 size) const u64 
aligned_size{Common::AlignUp(size, page_size)}; MapBackingMemory(gpu_addr, Memory::GetPointer(cpu_addr), aligned_size, cpu_addr); - + ASSERT(system.CurrentProcess() + ->VMManager() + .SetMemoryAttribute(cpu_addr, size, Kernel::MemoryAttribute::DeviceMapped, + Kernel::MemoryAttribute::DeviceMapped) + .IsSuccess()); return gpu_addr; } @@ -68,9 +81,16 @@ GPUVAddr MemoryManager::UnmapBuffer(GPUVAddr gpu_addr, u64 size) { const u64 aligned_size{Common::AlignUp(size, page_size)}; const CacheAddr cache_addr{ToCacheAddr(GetPointer(gpu_addr))}; + const auto cpu_addr = GpuToCpuAddress(gpu_addr); + ASSERT(cpu_addr); rasterizer.FlushAndInvalidateRegion(cache_addr, aligned_size); UnmapRange(gpu_addr, aligned_size); + ASSERT(system.CurrentProcess() + ->VMManager() + .SetMemoryAttribute(cpu_addr.value(), size, Kernel::MemoryAttribute::DeviceMapped, + Kernel::MemoryAttribute::None) + .IsSuccess()); return gpu_addr; } diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 43a84bd528..aea0100870 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -14,6 +14,10 @@ namespace VideoCore { class RasterizerInterface; } +namespace Core { +class System; +} + namespace Tegra { /** @@ -47,7 +51,7 @@ struct VirtualMemoryArea { class MemoryManager final { public: - explicit MemoryManager(VideoCore::RasterizerInterface& rasterizer); + explicit MemoryManager(Core::System& system, VideoCore::RasterizerInterface& rasterizer); ~MemoryManager(); GPUVAddr AllocateSpace(u64 size, u64 align); @@ -173,6 +177,8 @@ private: Common::PageTable page_table{page_bits}; VMAMap vma_map; VideoCore::RasterizerInterface& rasterizer; + + Core::System& system; }; } // namespace Tegra diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 5ee4f8e8ec..2b7367568e 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -47,6 +47,9 @@ public: /// and invalidated virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0; + /// Notify rasterizer that a frame is about to finish + virtual void TickFrame() = 0; + /// Attempt to use a faster method to perform a surface copy virtual bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, const Tegra::Engines::Fermi2D::Regs::Surface& dst, diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index 2b9bd142e3..2a9b523f5b 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -2,103 +2,57 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
-#include <cstring> #include <memory> -#include "common/alignment.h" -#include "core/core.h" -#include "video_core/memory_manager.h" +#include <glad/glad.h> + +#include "common/assert.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_rasterizer.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" namespace OpenGL { -CachedBufferEntry::CachedBufferEntry(VAddr cpu_addr, std::size_t size, GLintptr offset, - std::size_t alignment, u8* host_ptr) - : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, size{size}, offset{offset}, - alignment{alignment} {} - -OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, std::size_t size) - : RasterizerCache{rasterizer}, stream_buffer(size, true) {} - -GLintptr OGLBufferCache::UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment, - bool cache) { - std::lock_guard lock{mutex}; - auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager(); - - // Cache management is a big overhead, so only cache entries with a given size. - // TODO: Figure out which size is the best for given games. - cache &= size >= 2048; - - const auto& host_ptr{memory_manager.GetPointer(gpu_addr)}; - if (cache) { - auto entry = TryGet(host_ptr); - if (entry) { - if (entry->GetSize() >= size && entry->GetAlignment() == alignment) { - return entry->GetOffset(); - } - Unregister(entry); - } - } +OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, + std::size_t stream_size) + : VideoCommon::BufferCache<OGLBuffer, GLuint, OGLStreamBuffer>{ + rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {} - AlignBuffer(alignment); - const GLintptr uploaded_offset = buffer_offset; +OGLBufferCache::~OGLBufferCache() = default; - if (!host_ptr) { - return uploaded_offset; - } - - std::memcpy(buffer_ptr, host_ptr, size); - buffer_ptr += size; - buffer_offset += size; - - if (cache) { - auto entry = std::make_shared<CachedBufferEntry>( - *memory_manager.GpuToCpuAddress(gpu_addr), size, uploaded_offset, alignment, host_ptr); - Register(entry); - } - - return uploaded_offset; +OGLBuffer OGLBufferCache::CreateBuffer(std::size_t size) { + OGLBuffer buffer; + buffer.Create(); + glNamedBufferData(buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW); + return buffer; } -GLintptr OGLBufferCache::UploadHostMemory(const void* raw_pointer, std::size_t size, - std::size_t alignment) { - std::lock_guard lock{mutex}; - AlignBuffer(alignment); - std::memcpy(buffer_ptr, raw_pointer, size); - const GLintptr uploaded_offset = buffer_offset; - - buffer_ptr += size; - buffer_offset += size; - return uploaded_offset; +const GLuint* OGLBufferCache::ToHandle(const OGLBuffer& buffer) { + return &buffer.handle; } -bool OGLBufferCache::Map(std::size_t max_size) { - bool invalidate; - std::tie(buffer_ptr, buffer_offset_base, invalidate) = - stream_buffer.Map(static_cast<GLsizeiptr>(max_size), 4); - buffer_offset = buffer_offset_base; - - if (invalidate) { - InvalidateAll(); - } - return invalidate; +const GLuint* OGLBufferCache::GetEmptyBuffer(std::size_t) { + static const GLuint null_buffer = 0; + return &null_buffer; } -void OGLBufferCache::Unmap() { - stream_buffer.Unmap(buffer_offset - buffer_offset_base); +void OGLBufferCache::UploadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size, + const u8* data) { + glNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset), + static_cast<GLsizeiptr>(size), data); } -GLuint 
OGLBufferCache::GetHandle() const { - return stream_buffer.GetHandle(); +void OGLBufferCache::DownloadBufferData(const OGLBuffer& buffer, std::size_t offset, + std::size_t size, u8* data) { + glGetNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset), + static_cast<GLsizeiptr>(size), data); } -void OGLBufferCache::AlignBuffer(std::size_t alignment) { - // Align the offset, not the mapped pointer - const GLintptr offset_aligned = - static_cast<GLintptr>(Common::AlignUp(static_cast<std::size_t>(buffer_offset), alignment)); - buffer_ptr += offset_aligned - buffer_offset; - buffer_offset = offset_aligned; +void OGLBufferCache::CopyBufferData(const OGLBuffer& src, const OGLBuffer& dst, + std::size_t src_offset, std::size_t dst_offset, + std::size_t size) { + glCopyNamedBufferSubData(src.handle, dst.handle, static_cast<GLintptr>(src_offset), + static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size)); } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index f2347581b5..8c8ac4038e 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -4,80 +4,44 @@ #pragma once -#include <cstddef> #include <memory> -#include <tuple> #include "common/common_types.h" +#include "video_core/buffer_cache.h" #include "video_core/rasterizer_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" +namespace Core { +class System; +} + namespace OpenGL { +class OGLStreamBuffer; class RasterizerOpenGL; -class CachedBufferEntry final : public RasterizerCacheObject { -public: - explicit CachedBufferEntry(VAddr cpu_addr, std::size_t size, GLintptr offset, - std::size_t alignment, u8* host_ptr); - - VAddr GetCpuAddr() const override { - return cpu_addr; - } - - std::size_t GetSizeInBytes() const override { - return size; - } - - std::size_t GetSize() const { - return size; - } - - GLintptr GetOffset() const { - return offset; - } - - std::size_t GetAlignment() const { - return alignment; - } - -private: - VAddr cpu_addr{}; - std::size_t size{}; - GLintptr offset{}; - std::size_t alignment{}; -}; - -class OGLBufferCache final : public RasterizerCache<std::shared_ptr<CachedBufferEntry>> { +class OGLBufferCache final : public VideoCommon::BufferCache<OGLBuffer, GLuint, OGLStreamBuffer> { public: - explicit OGLBufferCache(RasterizerOpenGL& rasterizer, std::size_t size); - - /// Uploads data from a guest GPU address. Returns host's buffer offset where it's been - /// allocated. - GLintptr UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, - bool cache = true); + explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, + std::size_t stream_size); + ~OGLBufferCache(); - /// Uploads from a host memory. Returns host's buffer offset where it's been allocated. - GLintptr UploadHostMemory(const void* raw_pointer, std::size_t size, std::size_t alignment = 4); - - bool Map(std::size_t max_size); - void Unmap(); - - GLuint GetHandle() const; + const GLuint* GetEmptyBuffer(std::size_t) override; protected: - void AlignBuffer(std::size_t alignment); + OGLBuffer CreateBuffer(std::size_t size) override; + + const GLuint* ToHandle(const OGLBuffer& buffer) override; - // We do not have to flush this cache as things in it are never modified by us. 
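(Note on the const GLuint* signatures above; reduced sketch only.) ToHandle() and GetEmptyBuffer() return a pointer to the GL handle rather than the handle itself: bindings are recorded as const GLuint* and only dereferenced at the moment the multi-bind call is issued, so a buffer object that is recreated between recording and binding still resolves to its current name. The sketch below uses hypothetical names (PendingUniformBindings, Push, Bind) and assumes ARB_multi_bind / GL 4.4 for glBindBuffersRange; the real counterpart is BindBuffersRangePushBuffer in renderer_opengl/utils.cpp further down in this diff:

#include <vector>

#include <glad/glad.h>

class PendingUniformBindings {
public:
    void Push(const GLuint* buffer, GLintptr offset, GLsizeiptr size) {
        pointers.push_back(buffer);
        offsets.push_back(offset);
        sizes.push_back(size);
    }

    void Bind(GLuint first) {
        if (pointers.empty()) {
            return;
        }
        // Dereference as late as possible, after all uploads have finished.
        handles.clear();
        for (const GLuint* pointer : pointers) {
            handles.push_back(*pointer);
        }
        glBindBuffersRange(GL_UNIFORM_BUFFER, first, static_cast<GLsizei>(handles.size()),
                           handles.data(), offsets.data(), sizes.data());
    }

private:
    std::vector<const GLuint*> pointers;
    std::vector<GLuint> handles;
    std::vector<GLintptr> offsets;
    std::vector<GLsizeiptr> sizes;
};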
- void FlushObjectInner(const std::shared_ptr<CachedBufferEntry>& object) override {} + void UploadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size, + const u8* data) override; -private: - OGLStreamBuffer stream_buffer; + void DownloadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size, + u8* data) override; - u8* buffer_ptr = nullptr; - GLintptr buffer_offset = 0; - GLintptr buffer_offset_base = 0; + void CopyBufferData(const OGLBuffer& src, const OGLBuffer& dst, std::size_t src_offset, + std::size_t dst_offset, std::size_t size) override; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index a48e14d2ee..85424a4c9a 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -24,8 +24,10 @@ T GetInteger(GLenum pname) { Device::Device() { uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT); + shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS); max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS); + has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array; has_variable_aoffi = TestVariableAoffi(); has_component_indexing_bug = TestComponentIndexingBug(); } @@ -34,6 +36,7 @@ Device::Device(std::nullptr_t) { uniform_buffer_alignment = 0; max_vertex_attributes = 16; max_varyings = 15; + has_vertex_viewport_layer = true; has_variable_aoffi = true; has_component_indexing_bug = false; } diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index 8c8c937600..dc883722df 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -18,6 +18,10 @@ public: return uniform_buffer_alignment; } + std::size_t GetShaderStorageBufferAlignment() const { + return shader_storage_alignment; + } + u32 GetMaxVertexAttributes() const { return max_vertex_attributes; } @@ -26,6 +30,10 @@ public: return max_varyings; } + bool HasVertexViewportLayer() const { + return has_vertex_viewport_layer; + } + bool HasVariableAoffi() const { return has_variable_aoffi; } @@ -39,8 +47,10 @@ private: static bool TestComponentIndexingBug(); std::size_t uniform_buffer_alignment{}; + std::size_t shader_storage_alignment{}; u32 max_vertex_attributes{}; u32 max_varyings{}; + bool has_vertex_viewport_layer{}; bool has_variable_aoffi{}; bool has_component_indexing_bug{}; }; diff --git a/src/video_core/renderer_opengl/gl_global_cache.cpp b/src/video_core/renderer_opengl/gl_global_cache.cpp deleted file mode 100644 index d5e385151f..0000000000 --- a/src/video_core/renderer_opengl/gl_global_cache.cpp +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
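(Back-reference to the gl_device.cpp hunk above; illustration only.) The newly queried GetShaderStorageBufferAlignment() exists because offsets passed to glBindBufferRange or glBindBuffersRange on GL_SHADER_STORAGE_BUFFER must be multiples of GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT, and SetupGlobalRegions later feeds exactly this value into the buffer cache. A hypothetical helper (BindSsboRange is not a function in this patch) that only makes the requirement explicit:

#include <cassert>

#include <glad/glad.h>

// A misaligned offset would raise GL_INVALID_VALUE, so the upload path aligns
// the write position before the data is copied into the buffer.
void BindSsboRange(GLuint binding, GLuint buffer, GLintptr offset, GLsizeiptr size,
                   GLint ssbo_alignment) {
    assert(offset % ssbo_alignment == 0);
    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, buffer, offset, size);
}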
- -#include <glad/glad.h> - -#include "common/logging/log.h" -#include "core/core.h" -#include "video_core/memory_manager.h" -#include "video_core/renderer_opengl/gl_global_cache.h" -#include "video_core/renderer_opengl/gl_rasterizer.h" -#include "video_core/renderer_opengl/gl_shader_decompiler.h" -#include "video_core/renderer_opengl/utils.h" - -namespace OpenGL { - -CachedGlobalRegion::CachedGlobalRegion(VAddr cpu_addr, u8* host_ptr, u32 size, u32 max_size) - : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, host_ptr{host_ptr}, size{size}, - max_size{max_size} { - buffer.Create(); - LabelGLObject(GL_BUFFER, buffer.handle, cpu_addr, "GlobalMemory"); -} - -CachedGlobalRegion::~CachedGlobalRegion() = default; - -void CachedGlobalRegion::Reload(u32 size_) { - size = size_; - if (size > max_size) { - size = max_size; - LOG_CRITICAL(HW_GPU, "Global region size {} exceeded the supported size {}!", size_, - max_size); - } - glNamedBufferData(buffer.handle, size, host_ptr, GL_STREAM_DRAW); -} - -void CachedGlobalRegion::Flush() { - LOG_DEBUG(Render_OpenGL, "Flushing {} bytes to CPU memory address 0x{:16}", size, cpu_addr); - glGetNamedBufferSubData(buffer.handle, 0, static_cast<GLsizeiptr>(size), host_ptr); -} - -GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const { - const auto search{reserve.find(addr)}; - if (search == reserve.end()) { - return {}; - } - return search->second; -} - -GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr, - u32 size) { - GlobalRegion region{TryGetReservedGlobalRegion(ToCacheAddr(host_ptr), size)}; - if (!region) { - // No reserved surface available, create a new one and reserve it - auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()}; - const auto cpu_addr{memory_manager.GpuToCpuAddress(addr)}; - ASSERT(cpu_addr); - - region = std::make_shared<CachedGlobalRegion>(*cpu_addr, host_ptr, size, max_ssbo_size); - ReserveGlobalRegion(region); - } - region->Reload(size); - return region; -} - -void GlobalRegionCacheOpenGL::ReserveGlobalRegion(GlobalRegion region) { - reserve.insert_or_assign(region->GetCacheAddr(), std::move(region)); -} - -GlobalRegionCacheOpenGL::GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer) - : RasterizerCache{rasterizer} { - GLint max_ssbo_size_; - glGetIntegerv(GL_MAX_SHADER_STORAGE_BLOCK_SIZE, &max_ssbo_size_); - max_ssbo_size = static_cast<u32>(max_ssbo_size_); -} - -GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion( - const GLShader::GlobalMemoryEntry& global_region, - Tegra::Engines::Maxwell3D::Regs::ShaderStage stage) { - std::lock_guard lock{mutex}; - - auto& gpu{Core::System::GetInstance().GPU()}; - auto& memory_manager{gpu.MemoryManager()}; - const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<std::size_t>(stage)]}; - const auto addr{cbufs.const_buffers[global_region.GetCbufIndex()].address + - global_region.GetCbufOffset()}; - const auto actual_addr{memory_manager.Read<u64>(addr)}; - const auto size{memory_manager.Read<u32>(addr + 8)}; - - // Look up global region in the cache based on address - const auto& host_ptr{memory_manager.GetPointer(actual_addr)}; - GlobalRegion region{TryGet(host_ptr)}; - - if (!region) { - // No global region found - create a new one - region = GetUncachedGlobalRegion(actual_addr, host_ptr, size); - Register(region); - } - - return region; -} - -} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_global_cache.h b/src/video_core/renderer_opengl/gl_global_cache.h 
deleted file mode 100644 index 2d467a2401..0000000000 --- a/src/video_core/renderer_opengl/gl_global_cache.h +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <memory> -#include <unordered_map> - -#include <glad/glad.h> - -#include "common/assert.h" -#include "common/common_types.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/rasterizer_cache.h" -#include "video_core/renderer_opengl/gl_resource_manager.h" - -namespace OpenGL { - -namespace GLShader { -class GlobalMemoryEntry; -} - -class RasterizerOpenGL; -class CachedGlobalRegion; -using GlobalRegion = std::shared_ptr<CachedGlobalRegion>; - -class CachedGlobalRegion final : public RasterizerCacheObject { -public: - explicit CachedGlobalRegion(VAddr cpu_addr, u8* host_ptr, u32 size, u32 max_size); - ~CachedGlobalRegion(); - - VAddr GetCpuAddr() const override { - return cpu_addr; - } - - std::size_t GetSizeInBytes() const override { - return size; - } - - /// Gets the GL program handle for the buffer - GLuint GetBufferHandle() const { - return buffer.handle; - } - - /// Reloads the global region from guest memory - void Reload(u32 size_); - - void Flush(); - -private: - VAddr cpu_addr{}; - u8* host_ptr{}; - u32 size{}; - u32 max_size{}; - - OGLBuffer buffer; -}; - -class GlobalRegionCacheOpenGL final : public RasterizerCache<GlobalRegion> { -public: - explicit GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer); - - /// Gets the current specified shader stage program - GlobalRegion GetGlobalRegion(const GLShader::GlobalMemoryEntry& descriptor, - Tegra::Engines::Maxwell3D::Regs::ShaderStage stage); - -protected: - void FlushObjectInner(const GlobalRegion& object) override { - object->Flush(); - } - -private: - GlobalRegion TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const; - GlobalRegion GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr, u32 size); - void ReserveGlobalRegion(GlobalRegion region); - - std::unordered_map<CacheAddr, GlobalRegion> reserve; - u32 max_ssbo_size{}; -}; - -} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index f45a3c5efc..0bb5c068c3 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -20,6 +20,7 @@ #include "core/hle/kernel/process.h" #include "core/settings.h" #include "video_core/engines/maxwell_3d.h" +#include "video_core/memory_manager.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/gl_shader_gen.h" @@ -80,11 +81,25 @@ struct DrawParameters { } }; +static std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer, + const GLShader::ConstBufferEntry& entry) { + if (!entry.IsIndirect()) { + return entry.GetSize(); + } + + if (buffer.size > Maxwell::MaxConstBufferSize) { + LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", buffer.size, + Maxwell::MaxConstBufferSize); + return Maxwell::MaxConstBufferSize; + } + + return buffer.size; +} + RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, ScreenInfo& info) : texture_cache{system, *this, device}, shader_cache{*this, system, emu_window, device}, - global_cache{*this}, system{system}, screen_info{info}, - buffer_cache(*this, STREAM_BUFFER_SIZE) { + 
system{system}, screen_info{info}, buffer_cache{*this, system, STREAM_BUFFER_SIZE} { OpenGLState::ApplyDefaultState(); shader_program_manager = std::make_unique<GLShader::ProgramManager>(); @@ -129,8 +144,6 @@ GLuint RasterizerOpenGL::SetupVertexFormat() { state.draw.vertex_array = vao; state.ApplyVertexArrayState(); - glVertexArrayElementBuffer(vao, buffer_cache.GetHandle()); - // Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL. // Enables the first 16 vertex attributes always, as we don't know which ones are actually // used until shader time. Note, Tegra technically supports 32, but we're capping this to 16 @@ -197,11 +210,11 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) { ASSERT(end > start); const u64 size = end - start + 1; - const GLintptr vertex_buffer_offset = buffer_cache.UploadMemory(start, size); + const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size); // Bind the vertex array to the buffer at the current offset. - glVertexArrayVertexBuffer(vao, index, buffer_cache.GetHandle(), vertex_buffer_offset, - vertex_array.stride); + vertex_array_pushbuffer.SetVertexBuffer(index, vertex_buffer, vertex_buffer_offset, + vertex_array.stride); if (regs.instanced_arrays.IsInstancingEnabled(index) && vertex_array.divisor != 0) { // Enable vertex buffer instancing with the specified divisor. @@ -215,7 +228,19 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) { gpu.dirty_flags.vertex_array.reset(); } -DrawParameters RasterizerOpenGL::SetupDraw() { +GLintptr RasterizerOpenGL::SetupIndexBuffer() { + if (accelerate_draw != AccelDraw::Indexed) { + return 0; + } + MICROPROFILE_SCOPE(OpenGL_Index); + const auto& regs = system.GPU().Maxwell3D().regs; + const std::size_t size = CalculateIndexBufferSize(); + const auto [buffer, offset] = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size); + vertex_array_pushbuffer.SetIndexBuffer(buffer); + return offset; +} + +DrawParameters RasterizerOpenGL::SetupDraw(GLintptr index_buffer_offset) { const auto& gpu = system.GPU().Maxwell3D(); const auto& regs = gpu.regs; const bool is_indexed = accelerate_draw == AccelDraw::Indexed; @@ -227,11 +252,9 @@ DrawParameters RasterizerOpenGL::SetupDraw() { params.primitive_mode = MaxwellToGL::PrimitiveTopology(regs.draw.topology); if (is_indexed) { - MICROPROFILE_SCOPE(OpenGL_Index); params.index_format = MaxwellToGL::IndexFormat(regs.index_array.format); params.count = regs.index_array.count; - params.index_buffer_offset = - buffer_cache.UploadMemory(regs.index_array.IndexStart(), CalculateIndexBufferSize()); + params.index_buffer_offset = index_buffer_offset; params.base_vertex = static_cast<GLint>(regs.vb_element_base); } else { params.count = regs.vertex_buffer.count; @@ -247,10 +270,6 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { BaseBindings base_bindings; std::array<bool, Maxwell::NumClipDistances> clip_distances{}; - // Prepare packed bindings - bind_ubo_pushbuffer.Setup(base_bindings.cbuf); - bind_ssbo_pushbuffer.Setup(base_bindings.gmem); - for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { const auto& shader_config = gpu.regs.shader_config[index]; const Maxwell::ShaderProgram program{static_cast<Maxwell::ShaderProgram>(index)}; @@ -271,12 +290,11 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { GLShader::MaxwellUniformData ubo{}; ubo.SetFromRegs(gpu, stage); - const GLintptr offset = + const auto [buffer, offset] = buffer_cache.UploadHostMemory(&ubo, 
sizeof(ubo), device.GetUniformBufferAlignment()); // Bind the emulation info buffer - bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset, - static_cast<GLsizeiptr>(sizeof(ubo))); + bind_ubo_pushbuffer.Push(buffer, offset, static_cast<GLsizeiptr>(sizeof(ubo))); Shader shader{shader_cache.GetStageProgram(program)}; @@ -321,9 +339,6 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { base_bindings = next_bindings; } - bind_ubo_pushbuffer.Bind(); - bind_ssbo_pushbuffer.Bind(); - SyncClipEnabled(clip_distances); gpu.dirty_flags.shaders = false; @@ -634,26 +649,46 @@ void RasterizerOpenGL::DrawArrays() { Maxwell::MaxShaderStage; // Add space for at least 18 constant buffers - buffer_size += - Maxwell::MaxConstBuffers * (MaxConstbufferSize + device.GetUniformBufferAlignment()); + buffer_size += Maxwell::MaxConstBuffers * + (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); - const bool invalidate = buffer_cache.Map(buffer_size); - if (invalidate) { - // As all cached buffers are invalidated, we need to recheck their state. - gpu.dirty_flags.vertex_array.set(); - } + // Prepare the vertex array. + buffer_cache.Map(buffer_size); + // Prepare vertex array format. const GLuint vao = SetupVertexFormat(); + vertex_array_pushbuffer.Setup(vao); + + // Upload vertex and index data. SetupVertexBuffer(vao); + const GLintptr index_buffer_offset = SetupIndexBuffer(); - DrawParameters params = SetupDraw(); + // Setup draw parameters. It will automatically choose what glDraw* method to use. + const DrawParameters params = SetupDraw(index_buffer_offset); + + // Prepare packed bindings. + bind_ubo_pushbuffer.Setup(0); + bind_ssbo_pushbuffer.Setup(0); + + // Setup shaders and their used resources. texture_cache.GuardSamplers(true); SetupShaders(params.primitive_mode); texture_cache.GuardSamplers(false); ConfigureFramebuffers(state); - buffer_cache.Unmap(); + // Signal the buffer cache that we are not going to upload more things. + const bool invalidate = buffer_cache.Unmap(); + + // Now that we are no longer uploading data, we can safely bind the buffers to OpenGL. + vertex_array_pushbuffer.Bind(); + bind_ubo_pushbuffer.Bind(); + bind_ssbo_pushbuffer.Bind(); + + if (invalidate) { + // As all cached buffers are invalidated, we need to recheck their state. 
+ gpu.dirty_flags.vertex_array.set(); + } shader_program_manager->ApplyTo(state); state.Apply(); @@ -675,7 +710,7 @@ void RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) { return; } texture_cache.FlushRegion(addr, size); - global_cache.FlushRegion(addr, size); + buffer_cache.FlushRegion(addr, size); } void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) { @@ -685,7 +720,6 @@ void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) { } texture_cache.InvalidateRegion(addr, size); shader_cache.InvalidateRegion(addr, size); - global_cache.InvalidateRegion(addr, size); buffer_cache.InvalidateRegion(addr, size); } @@ -696,6 +730,10 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) { InvalidateRegion(addr, size); } +void RasterizerOpenGL::TickFrame() { + buffer_cache.TickFrame(); +} + bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, const Tegra::Engines::Fermi2D::Regs::Surface& dst, const Tegra::Engines::Fermi2D::Config& copy_config) { @@ -739,11 +777,9 @@ void RasterizerOpenGL::SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::Sh MICROPROFILE_SCOPE(OpenGL_UBO); const auto stage_index = static_cast<std::size_t>(stage); const auto& shader_stage = system.GPU().Maxwell3D().state.shader_stages[stage_index]; - const auto& entries = shader->GetShaderEntries().const_buffers; // Upload only the enabled buffers from the 16 constbuffers of each shader stage - for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) { - const auto& entry = entries[bindpoint]; + for (const auto& entry : shader->GetShaderEntries().const_buffers) { SetupConstBuffer(shader_stage.const_buffers[entry.GetIndex()], entry); } } @@ -752,46 +788,34 @@ void RasterizerOpenGL::SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& b const GLShader::ConstBufferEntry& entry) { if (!buffer.enabled) { // Set values to zero to unbind buffers - bind_ubo_pushbuffer.Push(0, 0, 0); + bind_ubo_pushbuffer.Push(buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float)); return; } - std::size_t size; - if (entry.IsIndirect()) { - // Buffer is accessed indirectly, so upload the entire thing - size = buffer.size; - - if (size > MaxConstbufferSize) { - LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", size, - MaxConstbufferSize); - size = MaxConstbufferSize; - } - } else { - // Buffer is accessed directly, upload just what we use - size = entry.GetSize(); - } - // Align the actual size so it ends up being a multiple of vec4 to meet the OpenGL std140 // UBO alignment requirements. 
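(A worked example of the alignment comment above; hypothetical numbers, not patch content.) Assuming GLvec4 is a 16-byte std::array<GLfloat, 4>, a constant buffer slice of 72 bytes (18 floats) is padded to the next multiple of 16 before it is uploaded and bound, which is what the AlignUp call just below computes:

const std::size_t raw_size = 72;                                     // 18 floats
const std::size_t size = Common::AlignUp(raw_size, sizeof(GLvec4));  // 80 bytes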
- size = Common::AlignUp(size, sizeof(GLvec4)); - ASSERT_MSG(size <= MaxConstbufferSize, "Constant buffer is too big"); + const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4)); - const std::size_t alignment = device.GetUniformBufferAlignment(); - const GLintptr offset = buffer_cache.UploadMemory(buffer.address, size, alignment); - bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset, size); + const auto alignment = device.GetUniformBufferAlignment(); + const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment); + bind_ubo_pushbuffer.Push(cbuf, offset, size); } void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, const Shader& shader) { - const auto& entries = shader->GetShaderEntries().global_memory_entries; - for (std::size_t bindpoint = 0; bindpoint < entries.size(); ++bindpoint) { - const auto& entry{entries[bindpoint]}; - const auto& region{global_cache.GetGlobalRegion(entry, stage)}; - if (entry.IsWritten()) { - region->MarkAsModified(true, global_cache); - } - bind_ssbo_pushbuffer.Push(region->GetBufferHandle(), 0, - static_cast<GLsizeiptr>(region->GetSizeInBytes())); + auto& gpu{system.GPU()}; + auto& memory_manager{gpu.MemoryManager()}; + const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<std::size_t>(stage)]}; + const auto alignment{device.GetShaderStorageBufferAlignment()}; + + for (const auto& entry : shader->GetShaderEntries().global_memory_entries) { + const auto addr{cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset()}; + const auto actual_addr{memory_manager.Read<u64>(addr)}; + const auto size{memory_manager.Read<u32>(addr + 8)}; + + const auto [ssbo, buffer_offset] = + buffer_cache.UploadMemory(actual_addr, size, alignment, true, entry.IsWritten()); + bind_ssbo_pushbuffer.Push(ssbo, buffer_offset, static_cast<GLsizeiptr>(size)); } } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index d238c12577..40b571d586 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -24,7 +24,6 @@ #include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_framebuffer_cache.h" -#include "video_core/renderer_opengl/gl_global_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_sampler_cache.h" #include "video_core/renderer_opengl/gl_shader_cache.h" @@ -63,6 +62,7 @@ public: void FlushRegion(CacheAddr addr, u64 size) override; void InvalidateRegion(CacheAddr addr, u64 size) override; void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override; + void TickFrame() override; bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, const Tegra::Engines::Fermi2D::Regs::Surface& dst, const Tegra::Engines::Fermi2D::Config& copy_config) override; @@ -73,11 +73,6 @@ public: void LoadDiskResources(const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) override; - /// Maximum supported size that a constbuffer can have in bytes. 
- static constexpr std::size_t MaxConstbufferSize = 0x10000; - static_assert(MaxConstbufferSize % sizeof(GLvec4) == 0, - "The maximum size of a constbuffer must be a multiple of the size of GLvec4"); - private: struct FramebufferConfigState { bool using_color_fb{}; @@ -191,7 +186,6 @@ private: TextureCacheOpenGL texture_cache; ShaderCacheOpenGL shader_cache; - GlobalRegionCacheOpenGL global_cache; SamplerCacheOpenGL sampler_cache; FramebufferCacheOpenGL framebuffer_cache; @@ -210,6 +204,7 @@ private: static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024; OGLBufferCache buffer_cache; + VertexArrayPushBuffer vertex_array_pushbuffer; BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER}; BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER}; @@ -222,7 +217,9 @@ private: void SetupVertexBuffer(GLuint vao); - DrawParameters SetupDraw(); + GLintptr SetupIndexBuffer(); + + DrawParameters SetupDraw(GLintptr index_buffer_offset); void SetupShaders(GLenum primitive_mode); diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 5d76ee12db..32dd9eae79 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -190,8 +190,11 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn const auto texture_buffer_usage{variant.texture_buffer_usage}; std::string source = "#version 430 core\n" - "#extension GL_ARB_separate_shader_objects : enable\n\n"; - source += fmt::format("#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++); + "#extension GL_ARB_separate_shader_objects : enable\n"; + if (entries.shader_viewport_layer_array) { + source += "#extension GL_ARB_shader_viewport_layer_array : enable\n"; + } + source += fmt::format("\n#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++); for (const auto& cbuf : entries.const_buffers) { source += diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index bfc975a04d..119073776a 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -14,6 +14,7 @@ #include "common/alignment.h" #include "common/assert.h" #include "common/common_types.h" +#include "common/logging/log.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_rasterizer.h" @@ -46,7 +47,7 @@ using TextureArgument = std::pair<Type, Node>; using TextureIR = std::variant<TextureAoffi, TextureArgument>; constexpr u32 MAX_CONSTBUFFER_ELEMENTS = - static_cast<u32>(RasterizerOpenGL::MaxConstbufferSize) / (4 * sizeof(float)); + static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float)); class ShaderWriter { public: @@ -246,6 +247,8 @@ public: usage.is_read, usage.is_written); } entries.clip_distances = ir.GetClipDistances(); + entries.shader_viewport_layer_array = + stage == ShaderStage::Vertex && (ir.UsesLayer() || ir.UsesViewportIndex()); entries.shader_length = ir.GetLength(); return entries; } @@ -282,22 +285,35 @@ private: } void DeclareVertexRedeclarations() { - bool clip_distances_declared = false; - code.AddLine("out gl_PerVertex {{"); ++code.scope; code.AddLine("vec4 gl_Position;"); - for (const auto o : ir.GetOutputAttributes()) { - if (o == Attribute::Index::PointSize) - code.AddLine("float gl_PointSize;"); - if (!clip_distances_declared && (o == 
Attribute::Index::ClipDistances0123 || - o == Attribute::Index::ClipDistances4567)) { + for (const auto attribute : ir.GetOutputAttributes()) { + if (attribute == Attribute::Index::ClipDistances0123 || + attribute == Attribute::Index::ClipDistances4567) { code.AddLine("float gl_ClipDistance[];"); - clip_distances_declared = true; + break; } } + if (stage != ShaderStage::Vertex || device.HasVertexViewportLayer()) { + if (ir.UsesLayer()) { + code.AddLine("int gl_Layer;"); + } + if (ir.UsesViewportIndex()) { + code.AddLine("int gl_ViewportIndex;"); + } + } else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && stage == ShaderStage::Vertex && + !device.HasVertexViewportLayer()) { + LOG_ERROR( + Render_OpenGL, + "GL_ARB_shader_viewport_layer_array is not available and its required by a shader"); + } + + if (ir.UsesPointSize()) { + code.AddLine("float gl_PointSize;"); + } --code.scope; code.AddLine("}};"); @@ -805,6 +821,45 @@ private: return CastOperand(VisitOperand(operation, operand_index), type); } + std::optional<std::pair<std::string, bool>> GetOutputAttribute(const AbufNode* abuf) { + switch (const auto attribute = abuf->GetIndex()) { + case Attribute::Index::Position: + return std::make_pair("gl_Position"s + GetSwizzle(abuf->GetElement()), false); + case Attribute::Index::LayerViewportPointSize: + switch (abuf->GetElement()) { + case 0: + UNIMPLEMENTED(); + return {}; + case 1: + if (stage == ShaderStage::Vertex && !device.HasVertexViewportLayer()) { + return {}; + } + return std::make_pair("gl_Layer", true); + case 2: + if (stage == ShaderStage::Vertex && !device.HasVertexViewportLayer()) { + return {}; + } + return std::make_pair("gl_ViewportIndex", true); + case 3: + UNIMPLEMENTED_MSG("Requires some state changes for gl_PointSize to work in shader"); + return std::make_pair("gl_PointSize", false); + } + return {}; + case Attribute::Index::ClipDistances0123: + return std::make_pair(fmt::format("gl_ClipDistance[{}]", abuf->GetElement()), false); + case Attribute::Index::ClipDistances4567: + return std::make_pair(fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4), + false); + default: + if (IsGenericAttribute(attribute)) { + return std::make_pair( + GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()), false); + } + UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute)); + return {}; + } + } + std::string CastOperand(const std::string& value, Type type) const { switch (type) { case Type::Bool: @@ -1001,6 +1056,8 @@ private: const Node& src = operation[1]; std::string target; + bool is_integer = false; + if (const auto gpr = std::get_if<GprNode>(&*dest)) { if (gpr->GetIndex() == Register::ZeroIndex) { // Writing to Register::ZeroIndex is a no op @@ -1009,26 +1066,12 @@ private: target = GetRegister(gpr->GetIndex()); } else if (const auto abuf = std::get_if<AbufNode>(&*dest)) { UNIMPLEMENTED_IF(abuf->IsPhysicalBuffer()); - - target = [&]() -> std::string { - switch (const auto attribute = abuf->GetIndex(); abuf->GetIndex()) { - case Attribute::Index::Position: - return "gl_Position"s + GetSwizzle(abuf->GetElement()); - case Attribute::Index::PointSize: - return "gl_PointSize"; - case Attribute::Index::ClipDistances0123: - return fmt::format("gl_ClipDistance[{}]", abuf->GetElement()); - case Attribute::Index::ClipDistances4567: - return fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4); - default: - if (IsGenericAttribute(attribute)) { - return GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()); - } - 
UNIMPLEMENTED_MSG("Unhandled output attribute: {}", - static_cast<u32>(attribute)); - return "0"; - } - }(); + const auto result = GetOutputAttribute(abuf); + if (!result) { + return {}; + } + target = result->first; + is_integer = result->second; } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) { target = fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress())); } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) { @@ -1040,7 +1083,11 @@ private: UNREACHABLE_MSG("Assign called without a proper target"); } - code.AddLine("{} = {};", target, Visit(src)); + if (is_integer) { + code.AddLine("{} = ftoi({});", target, Visit(src)); + } else { + code.AddLine("{} = {};", target, Visit(src)); + } return {}; } diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h index 14d11c7fc8..02586736dc 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.h +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h @@ -78,6 +78,7 @@ struct ShaderEntries { std::vector<ImageEntry> images; std::vector<GlobalMemoryEntry> global_memory_entries; std::array<bool, Maxwell::NumClipDistances> clip_distances{}; + bool shader_viewport_layer_array{}; std::size_t shader_length{}; }; diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index 10688397bc..7893d1e263 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp @@ -373,6 +373,12 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn } } + bool shader_viewport_layer_array{}; + if (!LoadObjectFromPrecompiled(shader_viewport_layer_array)) { + return {}; + } + entry.entries.shader_viewport_layer_array = shader_viewport_layer_array; + u64 shader_length{}; if (!LoadObjectFromPrecompiled(shader_length)) { return {}; @@ -445,6 +451,10 @@ bool ShaderDiskCacheOpenGL::SaveDecompiledFile(u64 unique_identifier, const std: } } + if (!SaveObjectToPrecompiled(entries.shader_viewport_layer_array)) { + return false; + } + if (!SaveObjectToPrecompiled(static_cast<u64>(entries.shader_length))) { return false; } diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp index d86e137ac1..0eae98afef 100644 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ b/src/video_core/renderer_opengl/gl_state.cpp @@ -6,8 +6,11 @@ #include <glad/glad.h> #include "common/assert.h" #include "common/logging/log.h" +#include "common/microprofile.h" #include "video_core/renderer_opengl/gl_state.h" +MICROPROFILE_DEFINE(OpenGL_State, "OpenGL", "State Change", MP_RGB(192, 128, 128)); + namespace OpenGL { using Maxwell = Tegra::Engines::Maxwell3D::Regs; @@ -524,6 +527,7 @@ void OpenGLState::ApplySamplers() const { } void OpenGLState::Apply() const { + MICROPROFILE_SCOPE(OpenGL_State); ApplyFramebufferState(); ApplyVertexArrayState(); ApplyShaderProgram(); diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 08ae1a429b..b1f6bc7c20 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -31,6 +31,8 @@ using VideoCore::Surface::SurfaceType; MICROPROFILE_DEFINE(OpenGL_Texture_Upload, "OpenGL", "Texture Upload", MP_RGB(128, 192, 128)); MICROPROFILE_DEFINE(OpenGL_Texture_Download, "OpenGL", "Texture Download", MP_RGB(128, 192, 128)); 
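(Note on the profiling timers this change set adds: MacroInterp, OpenGL_State, and the texture buffer copy timer defined just below.) The usage pattern is the same pair of macros everywhere; a minimal sketch with a hypothetical timer name:

#include "common/microprofile.h"

// Defined once at namespace scope...
MICROPROFILE_DEFINE(Example_HotPath, "OpenGL", "Example hot path", MP_RGB(128, 192, 128));

// ...then each run of the measured path is bracketed by an RAII scope object.
void ExampleHotPath() {
    MICROPROFILE_SCOPE(Example_HotPath);
    // expensive work goes here
}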
+MICROPROFILE_DEFINE(OpenGL_Texture_Buffer_Copy, "OpenGL", "Texture Buffer Copy", + MP_RGB(128, 192, 128)); namespace { @@ -535,6 +537,7 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view, } void TextureCacheOpenGL::BufferCopy(Surface& src_surface, Surface& dst_surface) { + MICROPROFILE_SCOPE(OpenGL_Texture_Buffer_Copy); const auto& src_params = src_surface->GetSurfaceParams(); const auto& dst_params = dst_surface->GetSurfaceParams(); UNIMPLEMENTED_IF(src_params.num_levels > 1 || dst_params.num_levels > 1); diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index b142521ecc..9ecdddb0d9 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -101,7 +101,6 @@ RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::Syst RendererOpenGL::~RendererOpenGL() = default; -/// Swap buffers (render frame) void RendererOpenGL::SwapBuffers( std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) { @@ -130,6 +129,8 @@ void RendererOpenGL::SwapBuffers( DrawScreen(render_window.GetFramebufferLayout()); + rasterizer->TickFrame(); + render_window.SwapBuffers(); } @@ -262,7 +263,6 @@ void RendererOpenGL::CreateRasterizer() { if (rasterizer) { return; } - // Initialize sRGB Usage OpenGLState::ClearsRGBUsed(); rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info); } diff --git a/src/video_core/renderer_opengl/utils.cpp b/src/video_core/renderer_opengl/utils.cpp index 68c36988dd..c504a2c1aa 100644 --- a/src/video_core/renderer_opengl/utils.cpp +++ b/src/video_core/renderer_opengl/utils.cpp @@ -13,29 +13,67 @@ namespace OpenGL { +VertexArrayPushBuffer::VertexArrayPushBuffer() = default; + +VertexArrayPushBuffer::~VertexArrayPushBuffer() = default; + +void VertexArrayPushBuffer::Setup(GLuint vao_) { + vao = vao_; + index_buffer = nullptr; + vertex_buffers.clear(); +} + +void VertexArrayPushBuffer::SetIndexBuffer(const GLuint* buffer) { + index_buffer = buffer; +} + +void VertexArrayPushBuffer::SetVertexBuffer(GLuint binding_index, const GLuint* buffer, + GLintptr offset, GLsizei stride) { + vertex_buffers.push_back(Entry{binding_index, buffer, offset, stride}); +} + +void VertexArrayPushBuffer::Bind() { + if (index_buffer) { + glVertexArrayElementBuffer(vao, *index_buffer); + } + + // TODO(Rodrigo): Find a way to ARB_multi_bind this + for (const auto& entry : vertex_buffers) { + glVertexArrayVertexBuffer(vao, entry.binding_index, *entry.buffer, entry.offset, + entry.stride); + } +} + BindBuffersRangePushBuffer::BindBuffersRangePushBuffer(GLenum target) : target{target} {} BindBuffersRangePushBuffer::~BindBuffersRangePushBuffer() = default; void BindBuffersRangePushBuffer::Setup(GLuint first_) { first = first_; - buffers.clear(); + buffer_pointers.clear(); offsets.clear(); sizes.clear(); } -void BindBuffersRangePushBuffer::Push(GLuint buffer, GLintptr offset, GLsizeiptr size) { - buffers.push_back(buffer); +void BindBuffersRangePushBuffer::Push(const GLuint* buffer, GLintptr offset, GLsizeiptr size) { + buffer_pointers.push_back(buffer); offsets.push_back(offset); sizes.push_back(size); } -void BindBuffersRangePushBuffer::Bind() const { - const std::size_t count{buffers.size()}; +void BindBuffersRangePushBuffer::Bind() { + // Ensure sizes are valid. 
+    const std::size_t count{buffer_pointers.size()};
     DEBUG_ASSERT(count == offsets.size() && count == sizes.size());
     if (count == 0) {
         return;
     }
+
+    // Dereference buffers.
+    buffers.resize(count);
+    std::transform(buffer_pointers.begin(), buffer_pointers.end(), buffers.begin(),
+                   [](const GLuint* pointer) { return *pointer; });
+
     glBindBuffersRange(target, first, static_cast<GLsizei>(count), buffers.data(), offsets.data(),
                        sizes.data());
 }
diff --git a/src/video_core/renderer_opengl/utils.h b/src/video_core/renderer_opengl/utils.h
index 4a752f3b49..6c2b455466 100644
--- a/src/video_core/renderer_opengl/utils.h
+++ b/src/video_core/renderer_opengl/utils.h
@@ -11,20 +11,49 @@ namespace OpenGL {
 
-class BindBuffersRangePushBuffer {
+class VertexArrayPushBuffer final {
 public:
-    BindBuffersRangePushBuffer(GLenum target);
+    explicit VertexArrayPushBuffer();
+    ~VertexArrayPushBuffer();
+
+    void Setup(GLuint vao_);
+
+    void SetIndexBuffer(const GLuint* buffer);
+
+    void SetVertexBuffer(GLuint binding_index, const GLuint* buffer, GLintptr offset,
+                         GLsizei stride);
+
+    void Bind();
+
+private:
+    struct Entry {
+        GLuint binding_index{};
+        const GLuint* buffer{};
+        GLintptr offset{};
+        GLsizei stride{};
+    };
+
+    GLuint vao{};
+    const GLuint* index_buffer{};
+    std::vector<Entry> vertex_buffers;
+};
+
+class BindBuffersRangePushBuffer final {
+public:
+    explicit BindBuffersRangePushBuffer(GLenum target);
     ~BindBuffersRangePushBuffer();
 
     void Setup(GLuint first_);
 
-    void Push(GLuint buffer, GLintptr offset, GLsizeiptr size);
+    void Push(const GLuint* buffer, GLintptr offset, GLsizeiptr size);
 
-    void Bind() const;
+    void Bind();
 
 private:
-    GLenum target;
-    GLuint first;
+    GLenum target{};
+    GLuint first{};
+    std::vector<const GLuint*> buffer_pointers;
+    std::vector<GLuint> buffers;
     std::vector<GLintptr> offsets;
     std::vector<GLsizeiptr> sizes;
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 1bb04607bc..9b2d8e987d 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -430,20 +430,17 @@ private:
         instance_index = DeclareBuiltIn(spv::BuiltIn::InstanceIndex, spv::StorageClass::Input,
                                         t_in_uint, "instance_index");
 
-        bool is_point_size_declared = false;
         bool is_clip_distances_declared = false;
         for (const auto index : ir.GetOutputAttributes()) {
-            if (index == Attribute::Index::PointSize) {
-                is_point_size_declared = true;
-            } else if (index == Attribute::Index::ClipDistances0123 ||
-                       index == Attribute::Index::ClipDistances4567) {
+            if (index == Attribute::Index::ClipDistances0123 ||
+                index == Attribute::Index::ClipDistances4567) {
                 is_clip_distances_declared = true;
             }
         }
 
         std::vector<Id> members;
         members.push_back(t_float4);
-        if (is_point_size_declared) {
+        if (ir.UsesPointSize()) {
             members.push_back(t_float);
         }
         if (is_clip_distances_declared) {
@@ -466,7 +463,7 @@ private:
         position_index = MemberDecorateBuiltIn(spv::BuiltIn::Position, "position", true);
         point_size_index =
-            MemberDecorateBuiltIn(spv::BuiltIn::PointSize, "point_size", is_point_size_declared);
+            MemberDecorateBuiltIn(spv::BuiltIn::PointSize, "point_size", ir.UsesPointSize());
         clip_distances_index = MemberDecorateBuiltIn(spv::BuiltIn::ClipDistance, "clip_distances",
                                                      is_clip_distances_declared);
@@ -712,7 +709,8 @@ private:
         case Attribute::Index::Position:
             return AccessElement(t_out_float, per_vertex, position_index, abuf->GetElement());
-        case Attribute::Index::PointSize:
+        case Attribute::Index::LayerViewportPointSize:
+            UNIMPLEMENTED_IF(abuf->GetElement() != 3);
             return AccessElement(t_out_float, per_vertex, point_size_index);
         case Attribute::Index::ClipDistances0123:
             return AccessElement(t_out_float, per_vertex, clip_distances_index,
diff --git a/src/video_core/shader/decode/image.cpp b/src/video_core/shader/decode/image.cpp
index 24f022cc04..77151a24be 100644
--- a/src/video_core/shader/decode/image.cpp
+++ b/src/video_core/shader/decode/image.cpp
@@ -95,12 +95,8 @@ const Image& ShaderIR::GetImage(Tegra::Shader::Image image, Tegra::Shader::Image
 const Image& ShaderIR::GetBindlessImage(Tegra::Shader::Register reg,
                                         Tegra::Shader::ImageType type) {
     const Node image_register{GetRegister(reg)};
-    const Node base_image{
+    const auto [base_image, cbuf_index, cbuf_offset]{
         TrackCbuf(image_register, global_code, static_cast<s64>(global_code.size()))};
-    const auto cbuf{std::get_if<CbufNode>(&*base_image)};
-    const auto cbuf_offset_imm{std::get_if<ImmediateNode>(&*cbuf->GetOffset())};
-    const auto cbuf_offset{cbuf_offset_imm->GetValue()};
-    const auto cbuf_index{cbuf->GetIndex()};
     const auto cbuf_key{(static_cast<u64>(cbuf_index) << 32) | static_cast<u64>(cbuf_offset)};
 
     // If this image has already been used, return the existing mapping.
diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp
index 80fc0ccfc1..ab207a33be 100644
--- a/src/video_core/shader/decode/memory.cpp
+++ b/src/video_core/shader/decode/memory.cpp
@@ -297,18 +297,13 @@ std::tuple<Node, Node, GlobalMemoryBase> ShaderIR::TrackAndGetGlobalMemory(NodeB
     const auto addr_register{GetRegister(instr.gmem.gpr)};
     const auto immediate_offset{static_cast<u32>(instr.gmem.offset)};
 
-    const Node base_address{
-        TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size()))};
-    const auto cbuf = std::get_if<CbufNode>(&*base_address);
-    ASSERT(cbuf != nullptr);
-    const auto cbuf_offset_imm = std::get_if<ImmediateNode>(&*cbuf->GetOffset());
-    ASSERT(cbuf_offset_imm != nullptr);
-    const auto cbuf_offset = cbuf_offset_imm->GetValue();
-
-    bb.push_back(
-        Comment(fmt::format("Base address is c[0x{:x}][0x{:x}]", cbuf->GetIndex(), cbuf_offset)));
-
-    const GlobalMemoryBase descriptor{cbuf->GetIndex(), cbuf_offset};
+    const auto [base_address, index, offset] =
+        TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size()));
+    ASSERT(base_address != nullptr);
+
+    bb.push_back(Comment(fmt::format("Base address is c[0x{:x}][0x{:x}]", index, offset)));
+
+    const GlobalMemoryBase descriptor{index, offset};
     const auto& [entry, is_new] = used_global_memory.try_emplace(descriptor);
     auto& usage = entry->second;
     if (is_write) {
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index cb480be9bd..e1ee5c190d 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -269,7 +269,13 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
             LOG_WARNING(HW_GPU, "TLDS.NODEP implementation is incomplete");
         }
 
-        WriteTexsInstructionFloat(bb, instr, GetTldsCode(instr, texture_type, is_array));
+        const Node4 components = GetTldsCode(instr, texture_type, is_array);
+
+        if (instr.tlds.fp32_flag) {
+            WriteTexsInstructionFloat(bb, instr, components);
+        } else {
+            WriteTexsInstructionHalfFloat(bb, instr, components);
+        }
         break;
     }
     default:
@@ -302,13 +308,9 @@ const Sampler& ShaderIR::GetSampler(const Tegra::Shader::Sampler& sampler, Textu
 const Sampler& ShaderIR::GetBindlessSampler(const Tegra::Shader::Register& reg, TextureType type,
                                             bool is_array, bool is_shadow) {
     const Node sampler_register = GetRegister(reg);
-    const Node base_sampler =
+    const auto [base_sampler, cbuf_index, cbuf_offset] =
         TrackCbuf(sampler_register, global_code, static_cast<s64>(global_code.size()));
-    const auto cbuf = std::get_if<CbufNode>(&*base_sampler);
-    const auto cbuf_offset_imm = std::get_if<ImmediateNode>(&*cbuf->GetOffset());
-    ASSERT(cbuf_offset_imm != nullptr);
-    const auto cbuf_offset = cbuf_offset_imm->GetValue();
-    const auto cbuf_index = cbuf->GetIndex();
+    ASSERT(base_sampler != nullptr);
     const auto cbuf_key = (static_cast<u64>(cbuf_index) << 32) | static_cast<u64>(cbuf_offset);
 
     // If this sampler has already been used, return the existing mapping.
diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp
index 5994bfc4e9..78bd1cf1e5 100644
--- a/src/video_core/shader/shader_ir.cpp
+++ b/src/video_core/shader/shader_ir.cpp
@@ -61,7 +61,16 @@ Node ShaderIR::GetConstBufferIndirect(u64 index_, u64 offset_, Node node) {
     const auto [entry, is_new] = used_cbufs.try_emplace(index);
     entry->second.MarkAsUsedIndirect();
 
-    const Node final_offset = Operation(OperationCode::UAdd, NO_PRECISE, node, Immediate(offset));
+    const Node final_offset = [&]() {
+        // Attempt to inline constant buffer without a variable offset. This is done to allow
+        // tracking LDC calls.
+        if (const auto gpr = std::get_if<GprNode>(&*node)) {
+            if (gpr->GetIndex() == Register::ZeroIndex) {
+                return Immediate(offset);
+            }
+        }
+        return Operation(OperationCode::UAdd, NO_PRECISE, node, Immediate(offset));
+    }();
 
     return MakeNode<CbufNode>(index, final_offset);
 }
@@ -89,6 +98,22 @@ Node ShaderIR::GetPhysicalInputAttribute(Tegra::Shader::Register physical_addres
 }
 
 Node ShaderIR::GetOutputAttribute(Attribute::Index index, u64 element, Node buffer) {
+    if (index == Attribute::Index::LayerViewportPointSize) {
+        switch (element) {
+        case 0:
+            UNIMPLEMENTED();
+            break;
+        case 1:
+            uses_layer = true;
+            break;
+        case 2:
+            uses_viewport_index = true;
+            break;
+        case 3:
+            uses_point_size = true;
+            break;
+        }
+    }
     if (index == Attribute::Index::ClipDistances0123 ||
         index == Attribute::Index::ClipDistances4567) {
         const auto clip_index =
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index 6145f0a707..126c781369 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -115,6 +115,18 @@ public:
         return static_cast<std::size_t>(coverage_end * sizeof(u64));
     }
 
+    bool UsesLayer() const {
+        return uses_layer;
+    }
+
+    bool UsesViewportIndex() const {
+        return uses_viewport_index;
+    }
+
+    bool UsesPointSize() const {
+        return uses_point_size;
+    }
+
     bool HasPhysicalAttributes() const {
         return uses_physical_attributes;
     }
@@ -316,7 +328,7 @@ private:
     void WriteLop3Instruction(NodeBlock& bb, Tegra::Shader::Register dest, Node op_a, Node op_b,
                               Node op_c, Node imm_lut, bool sets_cc);
 
-    Node TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const;
+    std::tuple<Node, u32, u32> TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const;
 
     std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const;
@@ -346,6 +358,9 @@ private:
     std::set<Image> used_images;
     std::array<bool, Tegra::Engines::Maxwell3D::Regs::NumClipDistances> used_clip_distances{};
     std::map<GlobalMemoryBase, GlobalMemoryUsage> used_global_memory;
+    bool uses_layer{};
+    bool uses_viewport_index{};
+    bool uses_point_size{};
     bool uses_physical_attributes{}; // Shader uses AL2P or physical attribute read/writes
 
     Tegra::Shader::Header header;
diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp
index fc957d980e..dc132a4a3d 100644
--- a/src/video_core/shader/track.cpp
+++ b/src/video_core/shader/track.cpp
@@ -32,39 +32,44 @@ std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor,
     }
     return {};
 }
-} // namespace
+} // Anonymous namespace
 
-Node ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const {
+std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code,
+                                               s64 cursor) const {
     if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) {
-        // Cbuf found, but it has to be immediate
-        return std::holds_alternative<ImmediateNode>(*cbuf->GetOffset()) ? tracked : nullptr;
+        // Constant buffer found, test if it's an immediate
+        const auto offset = cbuf->GetOffset();
+        if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
+            return {tracked, cbuf->GetIndex(), immediate->GetValue()};
+        }
+        return {};
     }
     if (const auto gpr = std::get_if<GprNode>(&*tracked)) {
         if (gpr->GetIndex() == Tegra::Shader::Register::ZeroIndex) {
-            return nullptr;
+            return {};
         }
         // Reduce the cursor in one to avoid infinite loops when the instruction sets the same
         // register that it uses as operand
         const auto [source, new_cursor] = TrackRegister(gpr, code, cursor - 1);
         if (!source) {
-            return nullptr;
+            return {};
        }
         return TrackCbuf(source, code, new_cursor);
     }
     if (const auto operation = std::get_if<OperationNode>(&*tracked)) {
         for (std::size_t i = 0; i < operation->GetOperandsCount(); ++i) {
-            if (const auto found = TrackCbuf((*operation)[i], code, cursor)) {
-                // Cbuf found in operand
+            if (auto found = TrackCbuf((*operation)[i], code, cursor); std::get<0>(found)) {
+                // Cbuf found in operand.
                 return found;
             }
         }
-        return nullptr;
+        return {};
     }
     if (const auto conditional = std::get_if<ConditionalNode>(&*tracked)) {
         const auto& conditional_code = conditional->GetCode();
         return TrackCbuf(tracked, conditional_code, static_cast<s64>(conditional_code.size()));
     }
-    return nullptr;
+    return {};
 }
 
 std::optional<u32> ShaderIR::TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const {
diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp
index 7a0fdb19bc..6af9044cad 100644
--- a/src/video_core/texture_cache/surface_base.cpp
+++ b/src/video_core/texture_cache/surface_base.cpp
@@ -75,9 +75,12 @@ MatchStructureResult SurfaceBaseImpl::MatchesStructure(const SurfaceParams& rhs)
 
     // Linear Surface check
     if (!params.is_tiled) {
-        if (std::tie(params.width, params.height, params.pitch) ==
-            std::tie(rhs.width, rhs.height, rhs.pitch)) {
-            return MatchStructureResult::FullMatch;
+        if (std::tie(params.height, params.pitch) == std::tie(rhs.height, rhs.pitch)) {
+            if (params.width == rhs.width) {
+                return MatchStructureResult::FullMatch;
+            } else {
+                return MatchStructureResult::SemiMatch;
+            }
         }
         return MatchStructureResult::None;
     }
diff --git a/src/video_core/texture_cache/surface_base.h b/src/video_core/texture_cache/surface_base.h
index 8ba386a8ac..bcce8d8634 100644
--- a/src/video_core/texture_cache/surface_base.h
+++ b/src/video_core/texture_cache/surface_base.h
@@ -200,8 +200,9 @@ public:
         modification_tick = tick;
     }
 
-    void MarkAsRenderTarget(const bool is_target) {
+    void MarkAsRenderTarget(const bool is_target, const u32 index) {
         this->is_target = is_target;
+        this->index = index;
     }
 
     void MarkAsPicked(const bool is_picked) {
@@ -221,6 +222,10 @@ public:
         return is_target;
     }
 
+    u32 GetRenderTarget() const {
+        return index;
+    }
+
     bool IsRegistered() const {
         return is_registered;
     }
@@ -307,10 +312,13 @@ private:
         return view;
     }
 
+    static constexpr u32 NO_RT = 0xFFFFFFFF;
+
     bool is_modified{};
     bool is_target{};
     bool is_registered{};
     bool is_picked{};
+    u32 index{NO_RT};
     u64 modification_tick{};
 };
diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp
index 9c56e2b4f1..fd54724513 100644
--- a/src/video_core/texture_cache/surface_params.cpp
+++ b/src/video_core/texture_cache/surface_params.cpp
@@ -290,12 +290,19 @@ std::size_t SurfaceParams::GetLayerSize(bool as_host_size, bool uncompressed) co
 
 std::size_t SurfaceParams::GetInnerMipmapMemorySize(u32 level, bool as_host_size,
                                                     bool uncompressed) const {
-    const bool tiled{as_host_size ? false : is_tiled};
     const u32 width{GetMipmapSize(uncompressed, GetMipWidth(level), GetDefaultBlockWidth())};
     const u32 height{GetMipmapSize(uncompressed, GetMipHeight(level), GetDefaultBlockHeight())};
     const u32 depth{is_layered ? 1U : GetMipDepth(level)};
-    return Tegra::Texture::CalculateSize(tiled, GetBytesPerPixel(), width, height, depth,
-                                         GetMipBlockHeight(level), GetMipBlockDepth(level));
+    if (is_tiled) {
+        return Tegra::Texture::CalculateSize(!as_host_size, GetBytesPerPixel(), width, height,
+                                             depth, GetMipBlockHeight(level),
+                                             GetMipBlockDepth(level));
+    } else if (as_host_size || IsBuffer()) {
+        return GetBytesPerPixel() * width * height * depth;
+    } else {
+        // Linear Texture Case
+        return pitch * height * depth;
+    }
 }
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index c9e72531a5..7f9623c623 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -133,11 +133,11 @@ public:
                                       regs.zeta.memory_layout.block_depth,
                                       regs.zeta.memory_layout.type)};
         auto surface_view = GetSurface(gpu_addr, depth_params, preserve_contents, true);
         if (depth_buffer.target)
-            depth_buffer.target->MarkAsRenderTarget(false);
+            depth_buffer.target->MarkAsRenderTarget(false, NO_RT);
         depth_buffer.target = surface_view.first;
         depth_buffer.view = surface_view.second;
         if (depth_buffer.target)
-            depth_buffer.target->MarkAsRenderTarget(true);
+            depth_buffer.target->MarkAsRenderTarget(true, DEPTH_RT);
         return surface_view.second;
     }
@@ -167,11 +167,11 @@ public:
         auto surface_view = GetSurface(gpu_addr, SurfaceParams::CreateForFramebuffer(system, index),
                                        preserve_contents, true);
         if (render_targets[index].target)
-            render_targets[index].target->MarkAsRenderTarget(false);
+            render_targets[index].target->MarkAsRenderTarget(false, NO_RT);
         render_targets[index].target = surface_view.first;
         render_targets[index].view = surface_view.second;
         if (render_targets[index].target)
-            render_targets[index].target->MarkAsRenderTarget(true);
+            render_targets[index].target->MarkAsRenderTarget(true, static_cast<u32>(index));
         return surface_view.second;
     }
@@ -191,7 +191,7 @@ public:
         if (depth_buffer.target == nullptr) {
            return;
         }
-        depth_buffer.target->MarkAsRenderTarget(false);
+        depth_buffer.target->MarkAsRenderTarget(false, NO_RT);
        depth_buffer.target = nullptr;
         depth_buffer.view = nullptr;
     }
@@ -200,7 +200,7 @@ public:
         if (render_targets[index].target == nullptr) {
             return;
         }
-        render_targets[index].target->MarkAsRenderTarget(false);
+        render_targets[index].target->MarkAsRenderTarget(false, NO_RT);
         render_targets[index].target = nullptr;
         render_targets[index].view = nullptr;
     }
@@ -270,6 +270,16 @@ protected:
     // and reading it from a sepparate buffer.
     virtual void BufferCopy(TSurface& src_surface, TSurface& dst_surface) = 0;
 
+    void ManageRenderTargetUnregister(TSurface& surface) {
+        auto& maxwell3d = system.GPU().Maxwell3D();
+        const u32 index = surface->GetRenderTarget();
+        if (index == DEPTH_RT) {
+            maxwell3d.dirty_flags.zeta_buffer = true;
+        } else {
+            maxwell3d.dirty_flags.color_buffer.set(index, true);
+        }
+    }
+
     void Register(TSurface surface) {
         const GPUVAddr gpu_addr = surface->GetGpuAddr();
         const CacheAddr cache_ptr = ToCacheAddr(system.GPU().MemoryManager().GetPointer(gpu_addr));
@@ -294,6 +304,9 @@ protected:
         if (guard_render_targets && surface->IsProtected()) {
             return;
         }
+        if (!guard_render_targets && surface->IsRenderTarget()) {
+            ManageRenderTargetUnregister(surface);
+        }
         const GPUVAddr gpu_addr = surface->GetGpuAddr();
         const CacheAddr cache_ptr = surface->GetCacheAddr();
         const std::size_t size = surface->GetSizeInBytes();
@@ -649,15 +662,6 @@ private:
             }
             return {current_surface, *view};
         }
-        // The next case is unsafe, so if we r in accurate GPU, just skip it
-        if (Settings::values.use_accurate_gpu_emulation) {
-            return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
-                                  MatchTopologyResult::FullMatch);
-        }
-        // This is the case the texture is a part of the parent.
-        if (current_surface->MatchesSubTexture(params, gpu_addr)) {
-            return RebuildSurface(current_surface, params, is_render);
-        }
     } else {
         // If there are many overlaps, odds are they are subtextures of the candidate
         // surface. We try to construct a new surface based on the candidate parameters,
@@ -793,6 +797,9 @@ private:
     static constexpr u64 registry_page_size{1 << registry_page_bits};
     std::unordered_map<CacheAddr, std::vector<TSurface>> registry;
 
+    static constexpr u32 DEPTH_RT = 8;
+    static constexpr u32 NO_RT = 0xFFFFFFFF;
+
     // The L1 Cache is used for fast texture lookup before checking the overlaps
     // This avoids calculating size and other stuffs.
     std::unordered_map<CacheAddr, TSurface> l1_cache;
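The recurring pattern in the utils.h changes above, staging `const GLuint*` pointers in `Push()` and dereferencing them only in `Bind()`, can be sketched without any OpenGL dependency. Below is a minimal, hypothetical illustration of that deferred-dereference idea using only the standard library; `Handle`, `HandlePushBuffer`, and `FakeBindRange` are names invented for this example and are not part of the yuzu codebase or the GL API.

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

using Handle = std::uint32_t;

// Stand-in for glBindBuffersRange: just prints what would be bound.
void FakeBindRange(std::size_t first, const std::vector<Handle>& handles,
                   const std::vector<std::ptrdiff_t>& offsets) {
    for (std::size_t i = 0; i < handles.size(); ++i) {
        std::cout << "slot " << first + i << " -> handle " << handles[i] << " offset "
                  << offsets[i] << '\n';
    }
}

class HandlePushBuffer {
public:
    void Setup(std::size_t first_) {
        first = first_;
        handle_pointers.clear();
        offsets.clear();
    }

    // Stage a *pointer* to a handle; the pointed-to value may still change
    // between Push() and Bind().
    void Push(const Handle* handle, std::ptrdiff_t offset) {
        handle_pointers.push_back(handle);
        offsets.push_back(offset);
    }

    void Bind() {
        const std::size_t count = handle_pointers.size();
        assert(count == offsets.size());
        if (count == 0) {
            return;
        }
        // Dereference only now, so handles assigned after Push() are picked up.
        handles.resize(count);
        std::transform(handle_pointers.begin(), handle_pointers.end(), handles.begin(),
                       [](const Handle* pointer) { return *pointer; });
        FakeBindRange(first, handles, offsets);
    }

private:
    std::size_t first = 0;
    std::vector<const Handle*> handle_pointers;
    std::vector<Handle> handles;
    std::vector<std::ptrdiff_t> offsets;
};

int main() {
    Handle uniform_buffer = 0; // not yet created when it is pushed
    HandlePushBuffer push_buffer;
    push_buffer.Setup(3);
    push_buffer.Push(&uniform_buffer, 0);
    uniform_buffer = 42; // handle assigned after Push(), before Bind()
    push_buffer.Bind();  // prints: slot 3 -> handle 42 offset 0
}
```

Deferring the dereference means a buffer that only receives its final handle between `Push()` and `Bind()` is still bound correctly, which appears to be the property the push-buffer changes above rely on.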