Diffstat (limited to 'ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs')
-rw-r--r-- | ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs | 203
1 file changed, 124 insertions, 79 deletions
diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
index 73f25b98..19a10f68 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
@@ -2,6 +2,7 @@
 using ARMeilleure.IntermediateRepresentation;
 using ARMeilleure.Translation;
 using System;
+using System.Diagnostics;
 
 using static ARMeilleure.Instructions.InstEmitFlowHelper;
 using static ARMeilleure.Instructions.InstEmitHelper;
@@ -113,20 +114,13 @@ namespace ARMeilleure.Instructions
             Operand insert = GetIntA32(context, op.Rt);
 
             // Zero extend into an I64, then replicate. Saves the most time over elementwise inserts.
-            switch (op.Size)
+            insert = op.Size switch
             {
-                case 2:
-                    insert = context.Multiply(context.ZeroExtend32(OperandType.I64, insert), Const(0x0000000100000001u));
-                    break;
-                case 1:
-                    insert = context.Multiply(context.ZeroExtend16(OperandType.I64, insert), Const(0x0001000100010001u));
-                    break;
-                case 0:
-                    insert = context.Multiply(context.ZeroExtend8(OperandType.I64, insert), Const(0x0101010101010101u));
-                    break;
-                default:
-                    throw new InvalidOperationException("Unknown Vdup Size.");
-            }
+                2 => context.Multiply(context.ZeroExtend32(OperandType.I64, insert), Const(0x0000000100000001u)),
+                1 => context.Multiply(context.ZeroExtend16(OperandType.I64, insert), Const(0x0001000100010001u)),
+                0 => context.Multiply(context.ZeroExtend8(OperandType.I64, insert), Const(0x0101010101010101u)),
+                _ => throw new InvalidOperationException($"Invalid Vdup size \"{op.Size}\".")
+            };
 
             InsertScalar(context, op.Vd, insert);
             if (op.Q)
@@ -142,20 +136,13 @@ namespace ARMeilleure.Instructions
             Operand insert = EmitVectorExtractZx32(context, op.Vm >> 1, ((op.Vm & 1) << (3 - op.Size)) + op.Index, op.Size);
 
             // Zero extend into an I64, then replicate. Saves the most time over elementwise inserts.
-            switch (op.Size)
+            insert = op.Size switch
             {
-                case 2:
-                    insert = context.Multiply(context.ZeroExtend32(OperandType.I64, insert), Const(0x0000000100000001u));
-                    break;
-                case 1:
-                    insert = context.Multiply(context.ZeroExtend16(OperandType.I64, insert), Const(0x0001000100010001u));
-                    break;
-                case 0:
-                    insert = context.Multiply(context.ZeroExtend8(OperandType.I64, insert), Const(0x0101010101010101u));
-                    break;
-                default:
-                    throw new InvalidOperationException("Unknown Vdup Size.");
-            }
+                2 => context.Multiply(context.ZeroExtend32(OperandType.I64, insert), Const(0x0000000100000001u)),
+                1 => context.Multiply(context.ZeroExtend16(OperandType.I64, insert), Const(0x0001000100010001u)),
+                0 => context.Multiply(context.ZeroExtend8(OperandType.I64, insert), Const(0x0101010101010101u)),
+                _ => throw new InvalidOperationException($"Invalid Vdup size \"{op.Size}\".")
+            };
 
             InsertScalar(context, op.Vd, insert);
             if (op.Q)
@@ -575,51 +562,53 @@ namespace ARMeilleure.Instructions
             }
         }
 
-        public static void Vmul_S(ArmEmitterContext context)
+        public static void Vmla_S(ArmEmitterContext context)
         {
             if (Optimizations.FastFP && Optimizations.UseSse2)
             {
-                EmitScalarBinaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd);
+                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd);
             }
             else if (Optimizations.FastFP)
             {
-                EmitScalarBinaryOpF32(context, (op1, op2) => context.Multiply(op1, op2));
+                EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
+                {
+                    return context.Add(op1, context.Multiply(op2, op3));
+                });
             }
             else
             {
-                EmitScalarBinaryOpF32(context, (op1, op2) =>
+                EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                 {
-                    return EmitSoftFloatCall(context, SoftFloat32.FPMul, SoftFloat64.FPMul, op1, op2);
+                    return EmitSoftFloatCall(context, SoftFloat32.FPMulAdd, SoftFloat64.FPMulAdd, op1, op2, op3);
                 });
             }
         }
 
-        public static void Vmul_V(ArmEmitterContext context)
+        public static void Vmla_V(ArmEmitterContext context)
         {
             if (Optimizations.FastFP && Optimizations.UseSse2)
             {
-                EmitVectorBinaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd);
+                EmitVectorTernaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Addps, Intrinsic.X86Addpd);
             }
             else if (Optimizations.FastFP)
             {
-                EmitVectorBinaryOpF32(context, (op1, op2) => context.Multiply(op1, op2));
+                EmitVectorTernaryOpF32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
             }
             else
             {
-                EmitVectorBinaryOpF32(context, (op1, op2) =>
+                EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
                 {
-                    return EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulFpscr, SoftFloat64.FPMulFpscr, op1, op2);
+                    return EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulAddFpscr, SoftFloat64.FPMulAddFpscr, op1, op2, op3);
                 });
             }
         }
 
-        public static void Vmul_I(ArmEmitterContext context)
+        public static void Vmla_I(ArmEmitterContext context)
         {
-            if ((context.CurrOp as OpCode32SimdReg).U) throw new NotImplementedException("Polynomial mode not implemented");
-            EmitVectorBinaryOpSx32(context, (op1, op2) => context.Multiply(op1, op2));
+            EmitVectorTernaryOpZx32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
         }
 
-        public static void Vmul_1(ArmEmitterContext context)
+        public static void Vmla_1(ArmEmitterContext context)
         {
             OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
 
@@ -627,70 +616,70 @@ namespace ARMeilleure.Instructions
             {
                 if (Optimizations.FastFP && Optimizations.UseSse2)
                 {
-                    EmitVectorByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd);
+                    EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Addps, Intrinsic.X86Addpd);
                 }
                 else if (Optimizations.FastFP)
                 {
-                    EmitVectorByScalarOpF32(context, (op1, op2) => context.Multiply(op1, op2));
+                    EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
                 }
                 else
                 {
-                    EmitVectorByScalarOpF32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulFpscr, SoftFloat64.FPMulFpscr, op1, op2));
+                    EmitVectorsByScalarOpF32(context, (op1, op2, op3) => EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulAddFpscr, SoftFloat64.FPMulAddFpscr, op1, op2, op3));
                 }
             }
             else
             {
-                EmitVectorByScalarOpI32(context, (op1, op2) => context.Multiply(op1, op2), false);
+                EmitVectorsByScalarOpI32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)), false);
             }
         }
 
-        public static void Vmla_S(ArmEmitterContext context)
+        public static void Vmls_S(ArmEmitterContext context)
         {
             if (Optimizations.FastFP && Optimizations.UseSse2)
             {
-                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd);
+                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd);
             }
             else if (Optimizations.FastFP)
            {
                 EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                 {
-                    return context.Add(op1, context.Multiply(op2, op3));
+                    return context.Subtract(op1, context.Multiply(op2, op3));
                 });
             }
             else
             {
                 EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                 {
-                    return EmitSoftFloatCall(context, SoftFloat32.FPMulAdd, SoftFloat64.FPMulAdd, op1, op2, op3);
+                    return EmitSoftFloatCall(context, SoftFloat32.FPMulSub, SoftFloat64.FPMulSub, op1, op2, op3);
                 });
             }
         }
 
-        public static void Vmla_V(ArmEmitterContext context)
+        public static void Vmls_V(ArmEmitterContext context)
         {
             if (Optimizations.FastFP && Optimizations.UseSse2)
             {
-                EmitVectorTernaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Addps, Intrinsic.X86Addpd);
+                EmitVectorTernaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Subps, Intrinsic.X86Subpd);
             }
             else if (Optimizations.FastFP)
             {
-                EmitVectorTernaryOpF32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
+                EmitVectorTernaryOpF32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
             }
             else
             {
                 EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
                 {
-                    return EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulAddFpscr, SoftFloat64.FPMulAddFpscr, op1, op2, op3);
+                    return EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulSubFpscr, SoftFloat64.FPMulSubFpscr, op1, op2, op3);
                 });
             }
         }
 
-        public static void Vmla_I(ArmEmitterContext context)
+        public static void Vmls_I(ArmEmitterContext context)
         {
-            EmitVectorTernaryOpZx32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
+            EmitVectorTernaryOpZx32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
         }
 
-        public static void Vmla_1(ArmEmitterContext context)
+        public static void Vmls_1(ArmEmitterContext context)
         {
             OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
 
@@ -698,70 +687,83 @@ namespace ARMeilleure.Instructions
             {
                 if (Optimizations.FastFP && Optimizations.UseSse2)
                 {
-                    EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Addps, Intrinsic.X86Addpd);
+                    EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Subps, Intrinsic.X86Subpd);
                 }
                 else if (Optimizations.FastFP)
                 {
-                    EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
+                    EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
                 }
                 else
                 {
-                    EmitVectorsByScalarOpF32(context, (op1, op2, op3) => EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulAddFpscr, SoftFloat64.FPMulAddFpscr, op1, op2, op3));
+                    EmitVectorsByScalarOpF32(context, (op1, op2, op3) => EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulSubFpscr, SoftFloat64.FPMulSubFpscr, op1, op2, op3));
                 }
             }
             else
             {
-                EmitVectorsByScalarOpI32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)), false);
+                EmitVectorsByScalarOpI32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)), false);
             }
         }
 
-        public static void Vmls_S(ArmEmitterContext context)
+        public static void Vmlsl_I(ArmEmitterContext context)
+        {
+            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+
+            EmitVectorTernaryLongOpI32(context, (opD, op1, op2) => context.Subtract(opD, context.Multiply(op1, op2)), !op.U);
+        }
+
+        public static void Vmul_S(ArmEmitterContext context)
         {
             if (Optimizations.FastFP && Optimizations.UseSse2)
             {
-                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd);
+                EmitScalarBinaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd);
             }
             else if (Optimizations.FastFP)
             {
-                EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
-                {
-                    return context.Subtract(op1, context.Multiply(op2, op3));
-                });
+                EmitScalarBinaryOpF32(context, (op1, op2) => context.Multiply(op1, op2));
             }
             else
             {
-                EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
+                EmitScalarBinaryOpF32(context, (op1, op2) =>
                 {
-                    return EmitSoftFloatCall(context, SoftFloat32.FPMulSub, SoftFloat64.FPMulSub, op1, op2, op3);
+                    return EmitSoftFloatCall(context, SoftFloat32.FPMul, SoftFloat64.FPMul, op1, op2);
                 });
             }
         }
 
-        public static void Vmls_V(ArmEmitterContext context)
+        public static void Vmul_V(ArmEmitterContext context)
         {
             if (Optimizations.FastFP && Optimizations.UseSse2)
             {
-                EmitVectorTernaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Subps, Intrinsic.X86Subpd);
+                EmitVectorBinaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd);
             }
             else if (Optimizations.FastFP)
             {
-                EmitVectorTernaryOpF32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
+                EmitVectorBinaryOpF32(context, (op1, op2) => context.Multiply(op1, op2));
             }
             else
             {
-                EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
+                EmitVectorBinaryOpF32(context, (op1, op2) =>
                 {
-                    return EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulSubFpscr, SoftFloat64.FPMulSubFpscr, op1, op2, op3);
+                    return EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulFpscr, SoftFloat64.FPMulFpscr, op1, op2);
                 });
             }
         }
 
-        public static void Vmls_I(ArmEmitterContext context)
+        public static void Vmul_I(ArmEmitterContext context)
         {
-            EmitVectorTernaryOpZx32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
+            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+
+            if (op.U) // This instruction is always signed, U indicates polynomial mode.
+            {
+                EmitVectorBinaryOpZx32(context, (op1, op2) => EmitPolynomialMultiply(context, op1, op2, 8 << op.Size));
+            }
+            else
+            {
+                EmitVectorBinaryOpSx32(context, (op1, op2) => context.Multiply(op1, op2));
+            }
         }
 
-        public static void Vmls_1(ArmEmitterContext context)
+        public static void Vmul_1(ArmEmitterContext context)
         {
             OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
 
@@ -769,20 +771,41 @@ namespace ARMeilleure.Instructions
             {
                 if (Optimizations.FastFP && Optimizations.UseSse2)
                 {
-                    EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Subps, Intrinsic.X86Subpd);
+                    EmitVectorByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd);
                 }
                 else if (Optimizations.FastFP)
                 {
-                    EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
+                    EmitVectorByScalarOpF32(context, (op1, op2) => context.Multiply(op1, op2));
                 }
                 else
                 {
-                    EmitVectorsByScalarOpF32(context, (op1, op2, op3) => EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulSubFpscr, SoftFloat64.FPMulSubFpscr, op1, op2, op3));
+                    EmitVectorByScalarOpF32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulFpscr, SoftFloat64.FPMulFpscr, op1, op2));
                 }
             }
             else
             {
-                EmitVectorsByScalarOpI32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)), false);
+                EmitVectorByScalarOpI32(context, (op1, op2) => context.Multiply(op1, op2), false);
+            }
+        }
+
+        public static void Vmull_1(ArmEmitterContext context)
+        {
+            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
+
+            EmitVectorByScalarLongOpI32(context, (op1, op2) => context.Multiply(op1, op2), !op.U);
+        }
+
+        public static void Vmull_I(ArmEmitterContext context)
+        {
+            OpCode32SimdRegLong op = (OpCode32SimdRegLong)context.CurrOp;
+
+            if (op.Polynomial)
+            {
+                EmitVectorBinaryLongOpI32(context, (op1, op2) => EmitPolynomialMultiply(context, op1, op2, 8 << op.Size), false);
+            }
+            else
+            {
+                EmitVectorBinaryLongOpI32(context, (op1, op2) => context.Multiply(op1, op2), !op.U);
             }
         }
 
@@ -1157,5 +1180,27 @@ namespace ARMeilleure.Instructions
                 EmitVectorBinaryOpSimd32(context, genericEmit);
             }
         }
+
+        private static Operand EmitPolynomialMultiply(ArmEmitterContext context, Operand op1, Operand op2, int eSize)
+        {
+            Debug.Assert(eSize <= 32);
+
+            Operand result = eSize == 32 ? Const(0L) : Const(0);
+
+            if (eSize == 32)
+            {
+                op1 = context.ZeroExtend32(OperandType.I64, op1);
+                op2 = context.ZeroExtend32(OperandType.I64, op2);
+            }
+
+            for (int i = 0; i < eSize; i++)
+            {
+                Operand mask = context.BitwiseAnd(op1, Const(op1.Type, 1L << i));
+
+                result = context.BitwiseExclusiveOr(result, context.Multiply(op2, mask));
+            }
+
+            return result;
+        }
     }
 }
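
Notes on the techniques used in this change. The Vdup emitters broadcast a single element across a 64-bit lane by zero-extending it and multiplying by a repeating-ones constant, rather than inserting element by element. The same bit trick on plain integers, as a minimal standalone C# sketch (DupDemo and its method names are illustrative, not part of ARMeilleure):

    using System;

    static class DupDemo
    {
        // Broadcast one byte into all 8 byte lanes of a ulong:
        // 0xAB * 0x0101010101010101 == 0xABABABABABABABAB.
        static ulong Broadcast8(byte value) => value * 0x0101010101010101UL;

        // Broadcast a 16-bit element into all 4 halfword lanes.
        static ulong Broadcast16(ushort value) => value * 0x0001000100010001UL;

        // Broadcast a 32-bit element into both word lanes.
        static ulong Broadcast32(uint value) => value * 0x0000000100000001UL;

        static void Main()
        {
            Console.WriteLine(Broadcast8(0xAB).ToString("x16"));        // abababababababab
            Console.WriteLine(Broadcast16(0x1234).ToString("x16"));     // 1234123412341234
            Console.WriteLine(Broadcast32(0xDEADBEEF).ToString("x16")); // deadbeefdeadbeef
        }
    }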
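The Vmla_* and Vmls_* emitters compute d = d + n*m and d = d - n*m as a separate multiply followed by an add or subtract (X86Mulps then X86Addps/X86Subps on the SSE path, context.Add/context.Subtract over context.Multiply on the IR path). That matches AArch32 VMLA/VMLS, which round after each operation; a fused multiply-add rounds only once and can differ in the last bits. A scalar C# sketch of that semantic difference (illustrative only):

    // VMLA/VMLS accumulate in two rounded steps; FusedMultiplyAdd
    // computes (n * m) + d with a single rounding.
    static float Vmla(float d, float n, float m) => d + n * m; // two roundings
    static float Vmls(float d, float n, float m) => d - n * m; // two roundings
    static float Fused(float d, float n, float m) => MathF.FusedMultiplyAdd(n, m, d); // one rounding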
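The new EmitPolynomialMultiply helper emits a shift-and-XOR loop: multiplying op2 by the single-bit mask (op1 & (1 << i)) is equivalent to op2 << i when that bit of op1 is set, and the partial products are combined with XOR instead of addition, so no carries propagate. That is carry-less multiplication over GF(2), the semantics VMUL.P8 and VMULL.P8 require. The same computation on plain integers, as a reference sketch (PolyMulDemo is illustrative):

    using System;

    static class PolyMulDemo
    {
        // Carry-less (polynomial) multiply over GF(2): partial products
        // are XORed together rather than added.
        static ulong PolynomialMultiply(uint a, uint b, int eSize)
        {
            ulong result = 0;

            for (int i = 0; i < eSize; i++)
            {
                if ((a & (1u << i)) != 0)
                {
                    result ^= (ulong)b << i;
                }
            }

            return result;
        }

        static void Main()
        {
            // 0b11 * 0b11 = 0b101 in GF(2): (x + 1)^2 = x^2 + 1, since the
            // middle terms cancel under XOR.
            Console.WriteLine(PolynomialMultiply(3, 3, 8)); // 5
        }
    }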