aboutsummaryrefslogtreecommitdiff
path: root/ARMeilleure
diff options
context:
space:
mode:
author: FICTURE7 <FICTURE7@gmail.com> 2021-01-25 03:01:25 +0400
committer: GitHub <noreply@github.com> 2021-01-25 10:01:25 +1100
commit: ddf1105bcb6c9884e1188d5f63f0890ef1806176 (patch)
tree: 0aaa0cebddb53d40772cdbc10438d451d68c2446 /ARMeilleure
parent: f94acdb4efcf48555481f38417f8befa4ca560ad (diff)
Add VCLZ.* fast path (#1917)
* Add VCLZ fast path
* Add VCLZ.8B/16B SSSE3 fast path
* Add VCLZ.4H/8H SSSE3 fast path
* Add VCLZ.2S/4S SSE2 fast path
* Improve CLZ.4H/8H fast path
* Improve CLZ.2S/4S fast path
* Set PPTC version
Diffstat (limited to 'ARMeilleure')
-rw-r--r-- ARMeilleure/Instructions/InstEmitSimdArithmetic.cs | 147
-rw-r--r-- ARMeilleure/Instructions/InstEmitSimdHelper.cs | 5
-rw-r--r-- ARMeilleure/Translation/PTC/Ptc.cs | 2
3 files changed, 145 insertions, 9 deletions
diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
index bd6a98be..f18b91cf 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
@@ -120,24 +120,155 @@ namespace ARMeilleure.Instructions
{
OpCodeSimd op = (OpCodeSimd)context.CurrOp;
- Operand res = context.VectorZero();
-
- int elems = op.GetBytesCount() >> op.Size;
-
int eSize = 8 << op.Size;
- for (int index = 0; index < elems; index++)
+ Operand res = eSize switch {
+ 8 => Clz_V_I8 (context, GetVec(op.Rn)),
+ 16 => Clz_V_I16(context, GetVec(op.Rn)),
+ 32 => Clz_V_I32(context, GetVec(op.Rn)),
+ _ => null
+ };
+
+ if (res != null)
{
- Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ res = context.VectorZeroUpper64(res);
+ }
+ }
+ else
+ {
+ int elems = op.GetBytesCount() >> op.Size;
- Operand de = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountLeadingZeros)), ne, Const(eSize));
+ res = context.VectorZero();
- res = EmitVectorInsert(context, res, de, index, op.Size);
+ for (int index = 0; index < elems; index++)
+ {
+ Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
+
+ Operand de = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountLeadingZeros)), ne, Const(eSize));
+
+ res = EmitVectorInsert(context, res, de, index, op.Size);
+ }
}
context.Copy(GetVec(op.Rd), res);
}
+ private static Operand Clz_V_I8(ArmEmitterContext context, Operand arg)
+ {
+ if (!Optimizations.UseSsse3)
+ {
+ return null;
+ }
+
+ // CLZ nibble table.
+ Operand clzTable = X86GetScalar(context, 0x01_01_01_01_02_02_03_04);
+
+ Operand maskLow = X86GetAllElements(context, 0x0f_0f_0f_0f);
+ Operand c04 = X86GetAllElements(context, 0x04_04_04_04);
+
+ // CLZ of low 4 bits of elements in arg.
+ Operand loClz = context.AddIntrinsic(Intrinsic.X86Pshufb, clzTable, arg);
+
+ // Get the high 4 bits of elements in arg.
+ Operand hiArg = context.AddIntrinsic(Intrinsic.X86Psrlw, arg, Const(4));
+ hiArg = context.AddIntrinsic(Intrinsic.X86Pand, hiArg, maskLow);
+
+ // CLZ of high 4 bits of elements in arg.
+ Operand hiClz = context.AddIntrinsic(Intrinsic.X86Pshufb, clzTable, hiArg);
+
+ // If high 4 bits are not all zero, we discard the CLZ of the low 4 bits.
+ Operand mask = context.AddIntrinsic(Intrinsic.X86Pcmpeqb, hiClz, c04);
+ loClz = context.AddIntrinsic(Intrinsic.X86Pand, loClz, mask);
+
+ return context.AddIntrinsic(Intrinsic.X86Paddb, loClz, hiClz);
+ }
+
+ private static Operand Clz_V_I16(ArmEmitterContext context, Operand arg)
+ {
+ if (!Optimizations.UseSsse3)
+ {
+ return null;
+ }
+
+ Operand maskSwap = X86GetElements(context, 0x80_0f_80_0d_80_0b_80_09, 0x80_07_80_05_80_03_80_01);
+ Operand maskLow = X86GetAllElements(context, 0x00ff_00ff);
+ Operand c0008 = X86GetAllElements(context, 0x0008_0008);
+
+ // CLZ pair of high 8 and low 8 bits of elements in arg.
+ Operand hiloClz = Clz_V_I8(context, arg);
+ // Get CLZ of low 8 bits in each pair.
+ Operand loClz = context.AddIntrinsic(Intrinsic.X86Pand, hiloClz, maskLow);
+ // Get CLZ of high 8 bits in each pair.
+ Operand hiClz = context.AddIntrinsic(Intrinsic.X86Pshufb, hiloClz, maskSwap);
+
+ // If high 8 bits are not all zero, we discard the CLZ of the low 8 bits.
+ Operand mask = context.AddIntrinsic(Intrinsic.X86Pcmpeqw, hiClz, c0008);
+ loClz = context.AddIntrinsic(Intrinsic.X86Pand, loClz, mask);
+
+ return context.AddIntrinsic(Intrinsic.X86Paddw, loClz, hiClz);
+ }
+
+ private static Operand Clz_V_I32(ArmEmitterContext context, Operand arg)
+ {
+ // TODO: Use vplzcntd when AVX-512 is supported.
+ if (!Optimizations.UseSse2)
+ {
+ return null;
+ }
+
+ Operand AddVectorI32(Operand op0, Operand op1) => context.AddIntrinsic(Intrinsic.X86Paddd, op0, op1);
+ Operand SubVectorI32(Operand op0, Operand op1) => context.AddIntrinsic(Intrinsic.X86Psubd, op0, op1);
+ Operand ShiftRightVectorUI32(Operand op0, int imm8) => context.AddIntrinsic(Intrinsic.X86Psrld, op0, Const(imm8));
+ Operand OrVector(Operand op0, Operand op1) => context.AddIntrinsic(Intrinsic.X86Por, op0, op1);
+ Operand AndVector(Operand op0, Operand op1) => context.AddIntrinsic(Intrinsic.X86Pand, op0, op1);
+ Operand NotVector(Operand op0) => context.AddIntrinsic(Intrinsic.X86Pandn, op0, context.VectorOne());
+
+ Operand c55555555 = X86GetAllElements(context, 0x55555555);
+ Operand c33333333 = X86GetAllElements(context, 0x33333333);
+ Operand c0f0f0f0f = X86GetAllElements(context, 0x0f0f0f0f);
+ Operand c0000003f = X86GetAllElements(context, 0x0000003f);
+
+ Operand tmp0;
+ Operand tmp1;
+ Operand res;
+
+ // Set all bits after highest set bit to 1.
+ res = OrVector(ShiftRightVectorUI32(arg, 1), arg);
+ res = OrVector(ShiftRightVectorUI32(res, 2), res);
+ res = OrVector(ShiftRightVectorUI32(res, 4), res);
+ res = OrVector(ShiftRightVectorUI32(res, 8), res);
+ res = OrVector(ShiftRightVectorUI32(res, 16), res);
+
+ // Make leading 0s into leading 1s.
+ res = NotVector(res);
+
+ // Count leading 1s, which is the population count.
+ tmp0 = ShiftRightVectorUI32(res, 1);
+ tmp0 = AndVector(tmp0, c55555555);
+ res = SubVectorI32(res, tmp0);
+
+ tmp0 = ShiftRightVectorUI32(res, 2);
+ tmp0 = AndVector(tmp0, c33333333);
+ tmp1 = AndVector(res, c33333333);
+ res = AddVectorI32(tmp0, tmp1);
+
+ tmp0 = ShiftRightVectorUI32(res, 4);
+ tmp0 = AddVectorI32(tmp0, res);
+ res = AndVector(tmp0, c0f0f0f0f);
+
+ tmp0 = ShiftRightVectorUI32(res, 8);
+ res = AddVectorI32(tmp0, res);
+
+ tmp0 = ShiftRightVectorUI32(res, 16);
+ res = AddVectorI32(tmp0, res);
+
+ res = AndVector(res, c0000003f);
+
+ return res;
+ }
+
public static void Cnt_V(ArmEmitterContext context)
{
OpCodeSimd op = (OpCodeSimd)context.CurrOp;
diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper.cs b/ARMeilleure/Instructions/InstEmitSimdHelper.cs
index e9d5303c..da8ccae7 100644
--- a/ARMeilleure/Instructions/InstEmitSimdHelper.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdHelper.cs
@@ -210,6 +210,11 @@ namespace ARMeilleure.Instructions
public static Operand X86GetElements(ArmEmitterContext context, long e1, long e0)
{
+ return X86GetElements(context, (ulong)e1, (ulong)e0);
+ }
+
+ public static Operand X86GetElements(ArmEmitterContext context, ulong e1, ulong e0)
+ {
Operand vector0 = context.VectorCreateScalar(Const(e0));
Operand vector1 = context.VectorCreateScalar(Const(e1));
diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs
index 8f250a55..92094e62 100644
--- a/ARMeilleure/Translation/PTC/Ptc.cs
+++ b/ARMeilleure/Translation/PTC/Ptc.cs
@@ -22,7 +22,7 @@ namespace ARMeilleure.Translation.PTC
{
private const string HeaderMagic = "PTChd";
- private const int InternalVersion = 1817; //! To be incremented manually for each change to the ARMeilleure project.
+ private const int InternalVersion = 1917; //! To be incremented manually for each change to the ARMeilleure project.
private const string ActualDir = "0";
private const string BackupDir = "1";