author | LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com> | 2021-01-04 23:45:54 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-01-04 23:45:54 +0100 |
commit | 430ba6da65a781196db7d723cc88710bb7f5caf8 (patch) | |
tree | a7ed55f638dde795f4270a324fa5338ffb80ee12 /ARMeilleure/Instructions/InstEmitSimdArithmetic.cs | |
parent | a03ab0c4a0bef3c168874dc2105c43c9051e0807 (diff) |
CPU (A64): Add the Pmull_V instruction, with a Clmul fast path for the "1/2D -> 1Q" variant, plus an Sse fast path and a slow path for both the "8/16B -> 8H" and "1/2D -> 1Q" variants; with tests. (#1817)
* Add the Pmull_V Sse fast path only, covering both the "8/16B -> 8H" and "1/2D -> 1Q" variants; with a test.
* Add a Clmul fast path for the 128-bit variant (see the scalar sketch after this list).
* Small optimisation (saves 60 instructions) in the Sse fast path for the 128-bit variant.
* Add the slow path for both variants. Fix V128 Shl/Shr when shift = 0.
* A32: Add the Vmull_I P64 variant (slow path); not tested.
* A32: Add a Vmull_I_P8_P64 test and fix the P64 variant.
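
PMULL and VMULL.P8/P64 perform a carryless (polynomial) multiplication: partial products are combined with XOR rather than addition, so no carries propagate between bit positions, which is exactly what x86 PCLMULQDQ computes. The block below is a minimal scalar sketch of the 64x64 -> 128-bit case (the operation the Clmul fast path and the SoftFallback.PolynomialMult64_128 fallback cover); the class and method names are illustrative and not part of this commit.

```csharp
using System;

// Minimal sketch of a carryless (polynomial) 64x64 -> 128-bit multiply,
// i.e. the "1/2D -> 1Q" operation. Illustrative only; names are not from this commit.
static class PolynomialMultSketch
{
    // Returns the high and low 64-bit halves of the 128-bit carryless product.
    public static (ulong Hi, ulong Lo) Clmul64(ulong a, ulong b)
    {
        ulong hi = 0, lo = 0;

        for (int i = 0; i < 64; i++)
        {
            if (((b >> i) & 1) != 0)
            {
                // XOR in (a << i) as a 128-bit value: partial products are
                // combined with XOR, so no carries propagate between bits.
                lo ^= a << i;
                hi ^= i == 0 ? 0UL : a >> (64 - i);
            }
        }

        return (hi, lo);
    }

    static void Main()
    {
        var (hi, lo) = Clmul64(0x8081828384858687UL, 0xF0F1F2F3F4F5F6F7UL);
        Console.WriteLine($"0x{hi:X16}{lo:X16}");
    }
}
```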
Diffstat (limited to 'ARMeilleure/Instructions/InstEmitSimdArithmetic.cs')
-rw-r--r-- | ARMeilleure/Instructions/InstEmitSimdArithmetic.cs | 125 |
1 file changed, 125 insertions, 0 deletions
diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
index 3a97bc52..88be07bd 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
@@ -10,6 +10,7 @@ using System.Diagnostics;
 
 using static ARMeilleure.Instructions.InstEmitHelper;
 using static ARMeilleure.Instructions.InstEmitSimdHelper;
+using static ARMeilleure.Instructions.InstEmitSimdHelper32;
 using static ARMeilleure.IntermediateRepresentation.OperandHelper;
 
 namespace ARMeilleure.Instructions
@@ -1928,6 +1929,112 @@ namespace ARMeilleure.Instructions
             }
         }
 
+        public static void Pmull_V(ArmEmitterContext context)
+        {
+            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
+
+            if (Optimizations.UsePclmulqdq && op.Size == 3)
+            {
+                Operand n = GetVec(op.Rn);
+                Operand m = GetVec(op.Rm);
+
+                int imm8 = op.RegisterSize == RegisterSize.Simd64 ? 0b0000_0000 : 0b0001_0001;
+
+                Operand res = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, n, m, Const(imm8));
+
+                context.Copy(GetVec(op.Rd), res);
+            }
+            else if (Optimizations.UseSse41)
+            {
+                Operand n = GetVec(op.Rn);
+                Operand m = GetVec(op.Rm);
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    n = context.VectorZeroUpper64(n);
+                    m = context.VectorZeroUpper64(m);
+                }
+                else /* if (op.RegisterSize == RegisterSize.Simd128) */
+                {
+                    n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8));
+                    m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8));
+                }
+
+                Operand res = context.VectorZero();
+
+                if (op.Size == 0)
+                {
+                    n = context.AddIntrinsic(Intrinsic.X86Pmovzxbw, n);
+                    m = context.AddIntrinsic(Intrinsic.X86Pmovzxbw, m);
+
+                    for (int i = 0; i < 8; i++)
+                    {
+                        Operand mask = context.AddIntrinsic(Intrinsic.X86Psllw, n, Const(15 - i));
+                                mask = context.AddIntrinsic(Intrinsic.X86Psraw, mask, Const(15));
+
+                        Operand tmp = context.AddIntrinsic(Intrinsic.X86Psllw, m, Const(i));
+                                tmp = context.AddIntrinsic(Intrinsic.X86Pand, tmp, mask);
+
+                        res = context.AddIntrinsic(Intrinsic.X86Pxor, res, tmp);
+                    }
+                }
+                else /* if (op.Size == 3) */
+                {
+                    Operand zero = context.VectorZero();
+
+                    for (int i = 0; i < 64; i++)
+                    {
+                        Operand mask = context.AddIntrinsic(Intrinsic.X86Movlhps, n, n);
+                                mask = context.AddIntrinsic(Intrinsic.X86Psllq, mask, Const(63 - i));
+                                mask = context.AddIntrinsic(Intrinsic.X86Psrlq, mask, Const(63));
+                                mask = context.AddIntrinsic(Intrinsic.X86Psubq, zero, mask);
+
+                        Operand tmp = EmitSse2Sll_128(context, m, i);
+                                tmp = context.AddIntrinsic(Intrinsic.X86Pand, tmp, mask);
+
+                        res = context.AddIntrinsic(Intrinsic.X86Pxor, res, tmp);
+                    }
+                }
+
+                context.Copy(GetVec(op.Rd), res);
+            }
+            else
+            {
+                Operand n = GetVec(op.Rn);
+                Operand m = GetVec(op.Rm);
+
+                Operand res;
+
+                if (op.Size == 0)
+                {
+                    res = context.VectorZero();
+
+                    int part = op.RegisterSize == RegisterSize.Simd64 ? 0 : 8;
+
+                    for (int index = 0; index < 8; index++)
+                    {
+                        Operand ne = context.VectorExtract8(n, part + index);
+                        Operand me = context.VectorExtract8(m, part + index);
+
+                        Operand de = EmitPolynomialMultiply(context, ne, me, 8);
+
+                        res = EmitVectorInsert(context, res, de, index, 1);
+                    }
+                }
+                else /* if (op.Size == 3) */
+                {
+                    int part = op.RegisterSize == RegisterSize.Simd64 ? 0 : 1;
+
+                    Operand ne = context.VectorExtract(OperandType.I64, n, part);
+                    Operand me = context.VectorExtract(OperandType.I64, m, part);
+
+                    res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.PolynomialMult64_128)), ne, me);
+                }
+
+                context.Copy(GetVec(op.Rd), res);
+            }
+        }
+
         public static void Raddhn_V(ArmEmitterContext context)
         {
             EmitHighNarrow(context, (op1, op2) => context.Add(op1, op2), round: true);
@@ -3690,5 +3797,23 @@ namespace ARMeilleure.Instructions
 
             context.Copy(GetVec(op.Rd), res);
         }
+
+        private static Operand EmitSse2Sll_128(ArmEmitterContext context, Operand op, int shift)
+        {
+            // The upper part of op is assumed to be zero.
+            Debug.Assert(shift >= 0 && shift < 64);
+
+            if (shift == 0)
+            {
+                return op;
+            }
+
+            Operand high = context.AddIntrinsic(Intrinsic.X86Pslldq, op, Const(8));
+                    high = context.AddIntrinsic(Intrinsic.X86Psrlq, high, Const(64 - shift));
+
+            Operand low = context.AddIntrinsic(Intrinsic.X86Psllq, op, Const(shift));
+
+            return context.AddIntrinsic(Intrinsic.X86Por, high, low);
+        }
     }
 }
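
On the Clmul fast path, the choice of imm8 mirrors the PCLMULQDQ encoding: bit 0 of the control byte selects which 64-bit half of the first source is used and bit 4 selects the half of the second source, so 0b0000_0000 multiplies the low halves (the PMULL case) and 0b0001_0001 the high halves (the PMULL2 case). The stand-alone snippet below illustrates the same encoding with the .NET hardware intrinsics rather than ARMeilleure's IR; it is a hedged sketch with illustrative names and values, and it assumes a CPU with PCLMULQDQ support.

```csharp
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

// Illustration of the imm8 values used by the Clmul fast path, via the .NET
// intrinsics rather than ARMeilleure's IR. Names and values are illustrative.
static class ClmulImm8Sketch
{
    static void Main()
    {
        if (!Pclmulqdq.IsSupported)
        {
            Console.WriteLine("PCLMULQDQ is not supported on this CPU.");
            return;
        }

        Vector128<ulong> n = Vector128.Create(0x1234_5678_9ABC_DEF0UL, 0x0FED_CBA9_8765_4321UL);
        Vector128<ulong> m = Vector128.Create(0x0000_0000_0000_0087UL, 0x8000_0000_0000_0001UL);

        // 0b0000_0000: low half of n times low half of m (PMULL, "1D -> 1Q").
        Vector128<ulong> low = Pclmulqdq.CarrylessMultiply(n, m, 0b0000_0000);

        // 0b0001_0001: high half of n times high half of m (PMULL2, "2D -> 1Q").
        Vector128<ulong> high = Pclmulqdq.CarrylessMultiply(n, m, 0b0001_0001);

        Console.WriteLine(low);
        Console.WriteLine(high);
    }
}
```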
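
In the Sse41 path for the "8/16B -> 8H" variant, the per-lane mask comes from shifting bit i of each widened 16-bit lane into the sign position (Psllw by 15 - i) and then arithmetic-shifting right by 15 (Psraw), which replicates that bit across the whole lane as 0x0000 or 0xFFFF; the shifted multiplicand is then ANDed with the mask and XORed into the accumulator. A scalar model of the mask step, with illustrative names only:

```csharp
using System;

// Scalar model of the PSLLW(15 - i) + PSRAW(15) mask trick used per 16-bit lane.
// Illustrative only; names are not from this commit.
static class PolyMaskSketch
{
    // Broadcasts bit i of a 16-bit lane to all 16 bits: 0x0000 or 0xFFFF.
    public static ushort BitMask16(ushort lane, int i)
    {
        short shifted = (short)(lane << (15 - i)); // Bit i becomes the sign bit.
        return (ushort)(shifted >> 15);            // Arithmetic shift replicates it.
    }

    static void Main()
    {
        Console.WriteLine(BitMask16(0b0000_0100, 2).ToString("X4")); // FFFF
        Console.WriteLine(BitMask16(0b0000_0100, 3).ToString("X4")); // 0000
    }
}
```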
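
The EmitSse2Sll_128 helper at the bottom of the diff shifts a 128-bit value left by 0..63 bits under the assumption that its upper 64 bits are zero: Pslldq moves the low qword into the upper position, Psrlq keeps only the bits that would overflow past bit 63, Psllq forms the low half, and Por recombines them. A scalar sketch of the same decomposition, with illustrative names only:

```csharp
using System;

// Scalar model of EmitSse2Sll_128: a 128-bit left shift by 0..63 bits where the
// upper 64 bits of the input are known to be zero. Names are illustrative.
static class Sll128Sketch
{
    public static (ulong Hi, ulong Lo) ShiftLeft128(ulong low, int shift)
    {
        if (shift == 0)
        {
            return (0UL, low);          // Mirrors the early return in the helper.
        }

        ulong hi = low >> (64 - shift); // Bits that overflow into the upper half (Pslldq + Psrlq).
        ulong lo = low << shift;        // Bits that stay in the lower half (Psllq).

        return (hi, lo);                // Combined with Por in the vector version.
    }

    static void Main()
    {
        var (hi, lo) = ShiftLeft128(0x8000_0000_0000_0001UL, 4);
        Console.WriteLine($"0x{hi:X16}_{lo:X16}"); // 0x0000000000000008_0000000000000010
    }
}
```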