about | summary | refs | log | tree | commit | diff
path: root/ARMeilleure
diff options
context:
space:
mode:
author: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com> 2020-11-18 19:35:54 +0100
committer: GitHub <noreply@github.com> 2020-11-18 19:35:54 +0100
commit: 0679084f115b6838dec4d8c5e85044c33d4122d0 (patch)
tree: 0d25ace42740e37d6bb2a8cd30fa92c5313d265a /ARMeilleure
parent: eafee34feebd432151809df402f3f696e4d93d08 (diff)
CPU (A64): Add FP16/FP32 fast paths (F16C Intrinsics) for Fcvt_S, Fcvtl_V & Fcvtn_V Instructions. Now HardwareCapabilities uses CpuId. (#1650)
* net5.0
* CPU (A64): Add FP16/FP32 fast paths (F16C Intrinsics) for Fcvt_S, Fcvtl_V & Fcvtn_V Instructions. Switch to .NET 5.0. Nits. Tests performed successfully in both debug and release mode (for all instructions involved).
* Address comment.
* Update appveyor.yml
* Revert "Update appveyor.yml" This reverts commit 27cdd59e8b90e227e6924d9c162af26c00a89013.
* Remove Assembler CpuId.
* Update appveyor.yml
* Address comment.
Diffstat (limited to 'ARMeilleure')
-rw-r--r-- ARMeilleure/CodeGen/X86/Assembler.cs | 8
-rw-r--r-- ARMeilleure/CodeGen/X86/HardwareCapabilities.cs | 62
-rw-r--r-- ARMeilleure/CodeGen/X86/IntrinsicTable.cs | 2
-rw-r--r-- ARMeilleure/CodeGen/X86/X86Instruction.cs | 3
-rw-r--r-- ARMeilleure/Instructions/InstEmitSimdCvt.cs | 88
-rw-r--r-- ARMeilleure/IntermediateRepresentation/Intrinsic.cs | 2
-rw-r--r-- ARMeilleure/Optimizations.cs | 2
-rw-r--r-- ARMeilleure/Translation/PTC/Ptc.cs | 20
8 files changed, 129 insertions, 58 deletions
diff --git a/ARMeilleure/CodeGen/X86/Assembler.cs b/ARMeilleure/CodeGen/X86/Assembler.cs
index b855f1b1..48053efc 100644
--- a/ARMeilleure/CodeGen/X86/Assembler.cs
+++ b/ARMeilleure/CodeGen/X86/Assembler.cs
@@ -104,7 +104,6 @@ namespace ARMeilleure.CodeGen.X86
Add(X86Instruction.Cmpxchg8, new InstructionInfo(0x00000fb0, BadOp, BadOp, BadOp, BadOp, InstructionFlags.Reg8Src));
Add(X86Instruction.Comisd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f2f, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Comiss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f2f, InstructionFlags.Vex));
- Add(X86Instruction.Cpuid, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fa2, InstructionFlags.RegOnly));
Add(X86Instruction.Crc32, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38f1, InstructionFlags.PrefixF2));
Add(X86Instruction.Crc32_16, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38f1, InstructionFlags.PrefixF2 | InstructionFlags.Prefix66));
Add(X86Instruction.Crc32_8, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38f0, InstructionFlags.PrefixF2 | InstructionFlags.Reg8Src));
@@ -270,6 +269,8 @@ namespace ARMeilleure.CodeGen.X86
Add(X86Instruction.Unpcklps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f14, InstructionFlags.Vex));
Add(X86Instruction.Vblendvpd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4b, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Vblendvps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4a, InstructionFlags.Vex | InstructionFlags.Prefix66));
+ Add(X86Instruction.Vcvtph2ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3813, InstructionFlags.Vex | InstructionFlags.Prefix66));
+ Add(X86Instruction.Vcvtps2ph, new InstructionInfo(0x000f3a1d, BadOp, BadOp, BadOp, BadOp, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Vpblendvb, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4c, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Xor, new InstructionInfo(0x00000031, 0x06000083, 0x06000081, BadOp, 0x00000033, InstructionFlags.None));
Add(X86Instruction.Xorpd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f57, InstructionFlags.Vex | InstructionFlags.Prefix66));
@@ -386,11 +387,6 @@ namespace ARMeilleure.CodeGen.X86
WriteInstruction(src1, null, src2, X86Instruction.Comiss);
}
- public void Cpuid()
- {
- WriteInstruction(null, null, OperandType.None, X86Instruction.Cpuid);
- }
-
public void Cvtsd2ss(Operand dest, Operand src1, Operand src2)
{
WriteInstruction(dest, src1, src2, X86Instruction.Cvtsd2ss);
diff --git a/ARMeilleure/CodeGen/X86/HardwareCapabilities.cs b/ARMeilleure/CodeGen/X86/HardwareCapabilities.cs
index b622c65c..aa103e30 100644
--- a/ARMeilleure/CodeGen/X86/HardwareCapabilities.cs
+++ b/ARMeilleure/CodeGen/X86/HardwareCapabilities.cs
@@ -1,20 +1,60 @@
+using System;
using System.Runtime.Intrinsics.X86;
namespace ARMeilleure.CodeGen.X86
{
static class HardwareCapabilities
{
- public static bool SupportsSse => Sse.IsSupported;
- public static bool SupportsSse2 => Sse2.IsSupported;
- public static bool SupportsSse3 => Sse3.IsSupported;
- public static bool SupportsSsse3 => Ssse3.IsSupported;
- public static bool SupportsSse41 => Sse41.IsSupported;
- public static bool SupportsSse42 => Sse42.IsSupported;
- public static bool SupportsPclmulqdq => Pclmulqdq.IsSupported;
- public static bool SupportsFma => Fma.IsSupported;
- public static bool SupportsPopcnt => Popcnt.IsSupported;
- public static bool SupportsAesni => Aes.IsSupported;
- public static bool SupportsAvx => Avx.IsSupported;
+ static HardwareCapabilities()
+ {
+ if (!X86Base.IsSupported)
+ {
+ return;
+ }
+
+ (_, _, int ecx, int edx) = X86Base.CpuId(0x00000001, 0x00000000);
+
+ FeatureInfoEdx = (FeatureFlagsEdx)edx;
+ FeatureInfoEcx = (FeatureFlagsEcx)ecx;
+ }
+
+ [Flags]
+ public enum FeatureFlagsEdx
+ {
+ Sse = 1 << 25,
+ Sse2 = 1 << 26
+ }
+
+ [Flags]
+ public enum FeatureFlagsEcx
+ {
+ Sse3 = 1 << 0,
+ Pclmulqdq = 1 << 1,
+ Ssse3 = 1 << 9,
+ Fma = 1 << 12,
+ Sse41 = 1 << 19,
+ Sse42 = 1 << 20,
+ Popcnt = 1 << 23,
+ Aes = 1 << 25,
+ Avx = 1 << 28,
+ F16c = 1 << 29
+ }
+
+ public static FeatureFlagsEdx FeatureInfoEdx { get; }
+ public static FeatureFlagsEcx FeatureInfoEcx { get; }
+
+ public static bool SupportsSse => FeatureInfoEdx.HasFlag(FeatureFlagsEdx.Sse);
+ public static bool SupportsSse2 => FeatureInfoEdx.HasFlag(FeatureFlagsEdx.Sse2);
+ public static bool SupportsSse3 => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Sse3);
+ public static bool SupportsPclmulqdq => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Pclmulqdq);
+ public static bool SupportsSsse3 => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Ssse3);
+ public static bool SupportsFma => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Fma);
+ public static bool SupportsSse41 => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Sse41);
+ public static bool SupportsSse42 => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Sse42);
+ public static bool SupportsPopcnt => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Popcnt);
+ public static bool SupportsAesni => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Aes);
+ public static bool SupportsAvx => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Avx);
+ public static bool SupportsF16c => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.F16c);
public static bool ForceLegacySse { get; set; }
diff --git a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
index f7469bad..864b0a10 100644
--- a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
+++ b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
@@ -162,6 +162,8 @@ namespace ARMeilleure.CodeGen.X86
Add(Intrinsic.X86Unpckhps, new IntrinsicInfo(X86Instruction.Unpckhps, IntrinsicType.Binary));
Add(Intrinsic.X86Unpcklpd, new IntrinsicInfo(X86Instruction.Unpcklpd, IntrinsicType.Binary));
Add(Intrinsic.X86Unpcklps, new IntrinsicInfo(X86Instruction.Unpcklps, IntrinsicType.Binary));
+ Add(Intrinsic.X86Vcvtph2ps, new IntrinsicInfo(X86Instruction.Vcvtph2ps, IntrinsicType.Unary));
+ Add(Intrinsic.X86Vcvtps2ph, new IntrinsicInfo(X86Instruction.Vcvtps2ph, IntrinsicType.BinaryImm));
Add(Intrinsic.X86Xorpd, new IntrinsicInfo(X86Instruction.Xorpd, IntrinsicType.Binary));
Add(Intrinsic.X86Xorps, new IntrinsicInfo(X86Instruction.Xorps, IntrinsicType.Binary));
}
diff --git a/ARMeilleure/CodeGen/X86/X86Instruction.cs b/ARMeilleure/CodeGen/X86/X86Instruction.cs
index f9b35d37..fae17b86 100644
--- a/ARMeilleure/CodeGen/X86/X86Instruction.cs
+++ b/ARMeilleure/CodeGen/X86/X86Instruction.cs
@@ -33,7 +33,6 @@ namespace ARMeilleure.CodeGen.X86
Cmpxchg8,
Comisd,
Comiss,
- Cpuid,
Crc32,
Crc32_16,
Crc32_8,
@@ -199,6 +198,8 @@ namespace ARMeilleure.CodeGen.X86
Unpcklps,
Vblendvpd,
Vblendvps,
+ Vcvtph2ps,
+ Vcvtps2ph,
Vpblendvb,
Xor,
Xorpd,
diff --git a/ARMeilleure/Instructions/InstEmitSimdCvt.cs b/ARMeilleure/Instructions/InstEmitSimdCvt.cs
index edcf35d5..0350427c 100644
--- a/ARMeilleure/Instructions/InstEmitSimdCvt.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdCvt.cs
@@ -60,21 +60,48 @@ namespace ARMeilleure.Instructions
}
else if (op.Size == 0 && op.Opc == 3) // Single -> Half.
{
- Operand ne = context.VectorExtract(OperandType.FP32, GetVec(op.Rn), 0);
+ if (Optimizations.UseF16c)
+ {
+ Debug.Assert(!Optimizations.ForceLegacySse);
- Operand res = context.Call(typeof(SoftFloat32_16).GetMethod(nameof(SoftFloat32_16.FPConvert)), ne);
+ Operand n = GetVec(op.Rn);
- res = context.ZeroExtend16(OperandType.I64, res);
+ Operand res = context.AddIntrinsic(Intrinsic.X86Vcvtps2ph, n, Const(X86GetRoundControl(FPRoundingMode.ToNearest)));
+ res = context.AddIntrinsic(Intrinsic.X86Pslldq, res, Const(14)); // VectorZeroUpper112()
+ res = context.AddIntrinsic(Intrinsic.X86Psrldq, res, Const(14));
- context.Copy(GetVec(op.Rd), EmitVectorInsert(context, context.VectorZero(), res, 0, 1));
+ context.Copy(GetVec(op.Rd), res);
+ }
+ else
+ {
+ Operand ne = context.VectorExtract(OperandType.FP32, GetVec(op.Rn), 0);
+
+ Operand res = context.Call(typeof(SoftFloat32_16).GetMethod(nameof(SoftFloat32_16.FPConvert)), ne);
+
+ res = context.ZeroExtend16(OperandType.I64, res);
+
+ context.Copy(GetVec(op.Rd), EmitVectorInsert(context, context.VectorZero(), res, 0, 1));
+ }
}
else if (op.Size == 3 && op.Opc == 0) // Half -> Single.
{
- Operand ne = EmitVectorExtractZx(context, op.Rn, 0, 1);
+ if (Optimizations.UseF16c)
+ {
+ Debug.Assert(!Optimizations.ForceLegacySse);
+
+ Operand res = context.AddIntrinsic(Intrinsic.X86Vcvtph2ps, GetVec(op.Rn));
+ res = context.VectorZeroUpper96(res);
+
+ context.Copy(GetVec(op.Rd), res);
+ }
+ else
+ {
+ Operand ne = EmitVectorExtractZx(context, op.Rn, 0, 1);
- Operand res = context.Call(typeof(SoftFloat16_32).GetMethod(nameof(SoftFloat16_32.FPConvert)), ne);
+ Operand res = context.Call(typeof(SoftFloat16_32).GetMethod(nameof(SoftFloat16_32.FPConvert)), ne);
- context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0));
+ context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0));
+ }
}
else if (op.Size == 1 && op.Opc == 3) // Double -> Half.
{
@@ -129,18 +156,20 @@ namespace ARMeilleure.Instructions
if (Optimizations.UseSse2 && sizeF == 1)
{
Operand n = GetVec(op.Rn);
- Operand res;
- if (op.RegisterSize == RegisterSize.Simd128)
- {
- res = context.AddIntrinsic(Intrinsic.X86Movhlps, n, n);
- }
- else
- {
- res = n;
- }
+ Operand res = op.RegisterSize == RegisterSize.Simd128 ? context.AddIntrinsic(Intrinsic.X86Movhlps, n, n) : n;
+ res = context.AddIntrinsic(Intrinsic.X86Cvtps2pd, res);
- res = context.AddIntrinsic(Intrinsic.X86Cvtps2pd, res);
+ context.Copy(GetVec(op.Rd), res);
+ }
+ else if (Optimizations.UseF16c && sizeF == 0)
+ {
+ Debug.Assert(!Optimizations.ForceLegacySse);
+
+ Operand n = GetVec(op.Rn);
+
+ Operand res = op.RegisterSize == RegisterSize.Simd128 ? context.AddIntrinsic(Intrinsic.X86Movhlps, n, n) : n;
+ res = context.AddIntrinsic(Intrinsic.X86Vcvtph2ps, res);
context.Copy(GetVec(op.Rd), res);
}
@@ -210,17 +239,30 @@ namespace ARMeilleure.Instructions
{
Operand d = GetVec(op.Rd);
- Operand res = context.VectorZeroUpper64(d);
+ Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128 ? Intrinsic.X86Movlhps : Intrinsic.X86Movhlps;
Operand nInt = context.AddIntrinsic(Intrinsic.X86Cvtpd2ps, GetVec(op.Rn));
+ nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt);
- nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt);
+ Operand res = context.VectorZeroUpper64(d);
+ res = context.AddIntrinsic(movInst, res, nInt);
+
+ context.Copy(d, res);
+ }
+ else if (Optimizations.UseF16c && sizeF == 0)
+ {
+ Debug.Assert(!Optimizations.ForceLegacySse);
- Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128
- ? Intrinsic.X86Movlhps
- : Intrinsic.X86Movhlps;
+ Operand d = GetVec(op.Rd);
+ Operand n = GetVec(op.Rn);
+
+ Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128 ? Intrinsic.X86Movlhps : Intrinsic.X86Movhlps;
- res = context.AddIntrinsic(movInst, res, nInt);
+ Operand nInt = context.AddIntrinsic(Intrinsic.X86Vcvtps2ph, n, Const(X86GetRoundControl(FPRoundingMode.ToNearest)));
+ nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt);
+
+ Operand res = context.VectorZeroUpper64(d);
+ res = context.AddIntrinsic(movInst, res, nInt);
context.Copy(d, res);
}
diff --git a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
index 7f891170..cbfa8c71 100644
--- a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
+++ b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
@@ -151,6 +151,8 @@ namespace ARMeilleure.IntermediateRepresentation
X86Unpckhps,
X86Unpcklpd,
X86Unpcklps,
+ X86Vcvtph2ps,
+ X86Vcvtps2ph,
X86Xorpd,
X86Xorps
}
diff --git a/ARMeilleure/Optimizations.cs b/ARMeilleure/Optimizations.cs
index fbbbfdb2..f568eb05 100644
--- a/ARMeilleure/Optimizations.cs
+++ b/ARMeilleure/Optimizations.cs
@@ -14,6 +14,7 @@ namespace ARMeilleure
public static bool UseSse42IfAvailable { get; set; } = true;
public static bool UsePopCntIfAvailable { get; set; } = true;
public static bool UseAvxIfAvailable { get; set; } = true;
+ public static bool UseF16cIfAvailable { get; set; } = true;
public static bool UseAesniIfAvailable { get; set; } = true;
public static bool UsePclmulqdqIfAvailable { get; set; } = true;
@@ -31,6 +32,7 @@ namespace ARMeilleure
internal static bool UseSse42 => UseSse42IfAvailable && HardwareCapabilities.SupportsSse42;
internal static bool UsePopCnt => UsePopCntIfAvailable && HardwareCapabilities.SupportsPopcnt;
internal static bool UseAvx => UseAvxIfAvailable && HardwareCapabilities.SupportsAvx && !ForceLegacySse;
+ internal static bool UseF16c => UseF16cIfAvailable && HardwareCapabilities.SupportsF16c;
internal static bool UseAesni => UseAesniIfAvailable && HardwareCapabilities.SupportsAesni;
internal static bool UsePclmulqdq => UsePclmulqdqIfAvailable && HardwareCapabilities.SupportsPclmulqdq;
}
diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs
index dd1c44b2..3baef401 100644
--- a/ARMeilleure/Translation/PTC/Ptc.cs
+++ b/ARMeilleure/Translation/PTC/Ptc.cs
@@ -1,5 +1,6 @@
using ARMeilleure.CodeGen;
using ARMeilleure.CodeGen.Unwinding;
+using ARMeilleure.CodeGen.X86;
using ARMeilleure.Memory;
using Ryujinx.Common.Configuration;
using Ryujinx.Common.Logging;
@@ -10,7 +11,6 @@ using System.Diagnostics;
using System.IO;
using System.IO.Compression;
using System.Runtime.InteropServices;
-using System.Runtime.Intrinsics.X86;
using System.Runtime.Serialization.Formatters.Binary;
using System.Threading;
using System.Threading.Tasks;
@@ -21,7 +21,7 @@ namespace ARMeilleure.Translation.PTC
{
private const string HeaderMagic = "PTChd";
- private const int InternalVersion = 1273; //! To be incremented manually for each change to the ARMeilleure project.
+ private const int InternalVersion = 1650; //! To be incremented manually for each change to the ARMeilleure project.
private const string ActualDir = "0";
private const string BackupDir = "1";
@@ -646,21 +646,7 @@ namespace ARMeilleure.Translation.PTC
private static ulong GetFeatureInfo()
{
- ulong featureInfo = 0ul;
-
- featureInfo |= (Sse3.IsSupported ? 1ul : 0ul) << 0;
- featureInfo |= (Pclmulqdq.IsSupported ? 1ul : 0ul) << 1;
- featureInfo |= (Ssse3.IsSupported ? 1ul : 0ul) << 9;
- featureInfo |= (Fma.IsSupported ? 1ul : 0ul) << 12;
- featureInfo |= (Sse41.IsSupported ? 1ul : 0ul) << 19;
- featureInfo |= (Sse42.IsSupported ? 1ul : 0ul) << 20;
- featureInfo |= (Popcnt.IsSupported ? 1ul : 0ul) << 23;
- featureInfo |= (Aes.IsSupported ? 1ul : 0ul) << 25;
- featureInfo |= (Avx.IsSupported ? 1ul : 0ul) << 28;
- featureInfo |= (Sse.IsSupported ? 1ul : 0ul) << 57;
- featureInfo |= (Sse2.IsSupported ? 1ul : 0ul) << 58;
-
- return featureInfo;
+ return (ulong)HardwareCapabilities.FeatureInfoEdx << 32 | (uint)HardwareCapabilities.FeatureInfoEcx;
}
private struct Header