author | LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com> | 2021-01-04 23:45:54 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-01-04 23:45:54 +0100 |
commit | 430ba6da65a781196db7d723cc88710bb7f5caf8 (patch) | |
tree | a7ed55f638dde795f4270a324fa5338ffb80ee12 /ARMeilleure/Instructions/InstEmitSimdArithmetic.cs | |
parent | a03ab0c4a0bef3c168874dc2105c43c9051e0807 (diff) |
CPU (A64): Add the Pmull_V instruction, with a Clmul fast path for the "1/2D -> 1Q" variant, plus an Sse fast path and a slow path for both the "8/16B -> 8H" and "1/2D -> 1Q" variants; with tests. (#1817)
* Add the Pmull_V Sse fast path only, covering both the "8/16B -> 8H" and "1/2D -> 1Q" variants; with a test.
* Add a Clmul fast path for the 128-bit variant (see the scalar sketch after this list).
* Small optimisation (saves 60 instructions) in the Sse fast path for the 128-bit variant.
* Add the slow path for both variants. Fix V128 Shl/Shr when shift = 0.
* A32: Add the Vmull_I P64 variant (slow path); not tested.
* A32: Add a Vmull_I_P8_P64 test and fix the P64 variant.
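
PMULL and VMULL.P8/P64 perform a carryless (polynomial) multiplication: partial products are combined with XOR rather than addition, so no carries propagate between bit positions, which is exactly what x86 PCLMULQDQ computes. The block below is a minimal scalar sketch of the 64x64 -> 128-bit case (the operation the Clmul fast path and the SoftFallback.PolynomialMult64_128 fallback cover); the class and method names are illustrative and not part of this commit.

```csharp
using System;

// Minimal sketch of a carryless (polynomial) 64x64 -> 128-bit multiply,
// i.e. the "1/2D -> 1Q" operation. Illustrative only; names are not from this commit.
static class PolynomialMultSketch
{
    // Returns the high and low 64-bit halves of the 128-bit carryless product.
    public static (ulong Hi, ulong Lo) Clmul64(ulong a, ulong b)
    {
        ulong hi = 0, lo = 0;

        for (int i = 0; i < 64; i++)
        {
            if (((b >> i) & 1) != 0)
            {
                // XOR in (a << i) as a 128-bit value: partial products are
                // combined with XOR, so no carries propagate between bits.
                lo ^= a << i;
                hi ^= i == 0 ? 0UL : a >> (64 - i);
            }
        }

        return (hi, lo);
    }

    static void Main()
    {
        var (hi, lo) = Clmul64(0x8081828384858687UL, 0xF0F1F2F3F4F5F6F7UL);
        Console.WriteLine($"0x{hi:X16}{lo:X16}");
    }
}
```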
Diffstat (limited to 'ARMeilleure/Instructions/InstEmitSimdArithmetic.cs')
-rw-r--r-- | ARMeilleure/Instructions/InstEmitSimdArithmetic.cs | 125 |
1 file changed, 125 insertions, 0 deletions
diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
index 3a97bc52..88be07bd 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
@@ -10,6 +10,7 @@ using System.Diagnostics;
 
 using static ARMeilleure.Instructions.InstEmitHelper;
 using static ARMeilleure.Instructions.InstEmitSimdHelper;
+using static ARMeilleure.Instructions.InstEmitSimdHelper32;
 using static ARMeilleure.IntermediateRepresentation.OperandHelper;
 
 namespace ARMeilleure.Instructions
@@ -1928,6 +1929,112 @@ namespace ARMeilleure.Instructions
             }
         }
 
+        public static void Pmull_V(ArmEmitterContext context)
+        {
+            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
+
+            if (Optimizations.UsePclmulqdq && op.Size == 3)
+            {
+                Operand n = GetVec(op.Rn);
+                Operand m = GetVec(op.Rm);
+
+                int imm8 = op.RegisterSize == RegisterSize.Simd64 ? 0b0000_0000 : 0b0001_0001;
+
+                Operand res = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, n, m, Const(imm8));
+
+                context.Copy(GetVec(op.Rd), res);
+            }
+            else if (Optimizations.UseSse41)
+            {
+                Operand n = GetVec(op.Rn);
+                Operand m = GetVec(op.Rm);
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    n = context.VectorZeroUpper64(n);
+                    m = context.VectorZeroUpper64(m);
+                }
+                else /* if (op.RegisterSize == RegisterSize.Simd128) */
+                {
+                    n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8));
+                    m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8));
+                }
+
+                Operand res = context.VectorZero();
+
+                if (op.Size == 0)
+                {
+                    n = context.AddIntrinsic(Intrinsic.X86Pmovzxbw, n);
+                    m = context.AddIntrinsic(Intrinsic.X86Pmovzxbw, m);
+
+                    for (int i = 0; i < 8; i++)
+                    {
+                        Operand mask = context.AddIntrinsic(Intrinsic.X86Psllw, n, Const(15 - i));
+                                mask = context.AddIntrinsic(Intrinsic.X86Psraw, mask, Const(15));
+
+                        Operand tmp = context.AddIntrinsic(Intrinsic.X86Psllw, m, Const(i));
+                                tmp = context.AddIntrinsic(Intrinsic.X86Pand, tmp, mask);
+
+                        res = context.AddIntrinsic(Intrinsic.X86Pxor, res, tmp);
+                    }
+                }
+                else /* if (op.Size == 3) */
+                {
+                    Operand zero = context.VectorZero();
+
+                    for (int i = 0; i < 64; i++)
+                    {
+                        Operand mask = context.AddIntrinsic(Intrinsic.X86Movlhps, n, n);
+                                mask = context.AddIntrinsic(Intrinsic.X86Psllq, mask, Const(63 - i));
+                                mask = context.AddIntrinsic(Intrinsic.X86Psrlq, mask, Const(63));
+                                mask = context.AddIntrinsic(Intrinsic.X86Psubq, zero, mask);
+
+                        Operand tmp = EmitSse2Sll_128(context, m, i);
+                                tmp = context.AddIntrinsic(Intrinsic.X86Pand, tmp, mask);
+
+                        res = context.AddIntrinsic(Intrinsic.X86Pxor, res, tmp);
+                    }
+                }
+
+                context.Copy(GetVec(op.Rd), res);
+            }
+            else
+            {
+                Operand n = GetVec(op.Rn);
+                Operand m = GetVec(op.Rm);
+
+                Operand res;
+
+                if (op.Size == 0)
+                {
+                    res = context.VectorZero();
+
+                    int part = op.RegisterSize == RegisterSize.Simd64 ? 0 : 8;
+
+                    for (int index = 0; index < 8; index++)
+                    {
+                        Operand ne = context.VectorExtract8(n, part + index);
+                        Operand me = context.VectorExtract8(m, part + index);
+
+                        Operand de = EmitPolynomialMultiply(context, ne, me, 8);
+
+                        res = EmitVectorInsert(context, res, de, index, 1);
+                    }
+                }
+                else /* if (op.Size == 3) */
+                {
+                    int part = op.RegisterSize == RegisterSize.Simd64 ? 0 : 1;
+
+                    Operand ne = context.VectorExtract(OperandType.I64, n, part);
+                    Operand me = context.VectorExtract(OperandType.I64, m, part);
+
+                    res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.PolynomialMult64_128)), ne, me);
+                }
+
+                context.Copy(GetVec(op.Rd), res);
+            }
+        }
+
         public static void Raddhn_V(ArmEmitterContext context)
         {
             EmitHighNarrow(context, (op1, op2) => context.Add(op1, op2), round: true);
@@ -3690,5 +3797,23 @@ namespace ARMeilleure.Instructions
 
             context.Copy(GetVec(op.Rd), res);
         }
+
+        private static Operand EmitSse2Sll_128(ArmEmitterContext context, Operand op, int shift)
+        {
+            // The upper part of op is assumed to be zero.
+            Debug.Assert(shift >= 0 && shift < 64);
+
+            if (shift == 0)
+            {
+                return op;
+            }
+
+            Operand high = context.AddIntrinsic(Intrinsic.X86Pslldq, op, Const(8));
+                    high = context.AddIntrinsic(Intrinsic.X86Psrlq, high, Const(64 - shift));
+
+            Operand low = context.AddIntrinsic(Intrinsic.X86Psllq, op, Const(shift));
+
+            return context.AddIntrinsic(Intrinsic.X86Por, high, low);
+        }
     }
 }
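
On the Clmul fast path, the choice of imm8 mirrors the PCLMULQDQ encoding: bit 0 of the control byte selects which 64-bit half of the first source is used and bit 4 selects the half of the second source, so 0b0000_0000 multiplies the low halves (the PMULL case) and 0b0001_0001 the high halves (the PMULL2 case). The stand-alone snippet below illustrates the same encoding with the .NET hardware intrinsics rather than ARMeilleure's IR; it is a hedged sketch with illustrative names and values, and it assumes a CPU with PCLMULQDQ support.

```csharp
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

// Illustration of the imm8 values used by the Clmul fast path, via the .NET
// intrinsics rather than ARMeilleure's IR. Names and values are illustrative.
static class ClmulImm8Sketch
{
    static void Main()
    {
        if (!Pclmulqdq.IsSupported)
        {
            Console.WriteLine("PCLMULQDQ is not supported on this CPU.");
            return;
        }

        Vector128<ulong> n = Vector128.Create(0x1234_5678_9ABC_DEF0UL, 0x0FED_CBA9_8765_4321UL);
        Vector128<ulong> m = Vector128.Create(0x0000_0000_0000_0087UL, 0x8000_0000_0000_0001UL);

        // 0b0000_0000: low half of n times low half of m (PMULL, "1D -> 1Q").
        Vector128<ulong> low = Pclmulqdq.CarrylessMultiply(n, m, 0b0000_0000);

        // 0b0001_0001: high half of n times high half of m (PMULL2, "2D -> 1Q").
        Vector128<ulong> high = Pclmulqdq.CarrylessMultiply(n, m, 0b0001_0001);

        Console.WriteLine(low);
        Console.WriteLine(high);
    }
}
```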
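
In the Sse41 path for the "8/16B -> 8H" variant, the per-lane mask comes from shifting bit i of each widened 16-bit lane into the sign position (Psllw by 15 - i) and then arithmetic-shifting right by 15 (Psraw), which replicates that bit across the whole lane as 0x0000 or 0xFFFF; the shifted multiplicand is then ANDed with the mask and XORed into the accumulator. A scalar model of the mask step, with illustrative names only:

```csharp
using System;

// Scalar model of the PSLLW(15 - i) + PSRAW(15) mask trick used per 16-bit lane.
// Illustrative only; names are not from this commit.
static class PolyMaskSketch
{
    // Broadcasts bit i of a 16-bit lane to all 16 bits: 0x0000 or 0xFFFF.
    public static ushort BitMask16(ushort lane, int i)
    {
        short shifted = (short)(lane << (15 - i)); // Bit i becomes the sign bit.
        return (ushort)(shifted >> 15);            // Arithmetic shift replicates it.
    }

    static void Main()
    {
        Console.WriteLine(BitMask16(0b0000_0100, 2).ToString("X4")); // FFFF
        Console.WriteLine(BitMask16(0b0000_0100, 3).ToString("X4")); // 0000
    }
}
```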
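
The EmitSse2Sll_128 helper at the bottom of the diff shifts a 128-bit value left by 0..63 bits under the assumption that its upper 64 bits are zero: Pslldq moves the low qword into the upper position, Psrlq keeps only the bits that would overflow past bit 63, Psllq forms the low half, and Por recombines them. A scalar sketch of the same decomposition, with illustrative names only:

```csharp
using System;

// Scalar model of EmitSse2Sll_128: a 128-bit left shift by 0..63 bits where the
// upper 64 bits of the input are known to be zero. Names are illustrative.
static class Sll128Sketch
{
    public static (ulong Hi, ulong Lo) ShiftLeft128(ulong low, int shift)
    {
        if (shift == 0)
        {
            return (0UL, low);          // Mirrors the early return in the helper.
        }

        ulong hi = low >> (64 - shift); // Bits that overflow into the upper half (Pslldq + Psrlq).
        ulong lo = low << shift;        // Bits that stay in the lower half (Psllq).

        return (hi, lo);                // Combined with Por in the vector version.
    }

    static void Main()
    {
        var (hi, lo) = ShiftLeft128(0x8000_0000_0000_0001UL, 4);
        Console.WriteLine($"0x{hi:X16}_{lo:X16}"); // 0x0000000000000008_0000000000000010
    }
}
```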