Diffstat (limited to 'ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs')
-rw-r--r--  ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs  203
1 file changed, 124 insertions, 79 deletions
diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
index 73f25b98..19a10f68 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
@@ -2,6 +2,7 @@
using ARMeilleure.IntermediateRepresentation;
using ARMeilleure.Translation;
using System;
+using System.Diagnostics;
using static ARMeilleure.Instructions.InstEmitFlowHelper;
using static ARMeilleure.Instructions.InstEmitHelper;
@@ -113,20 +114,13 @@ namespace ARMeilleure.Instructions
Operand insert = GetIntA32(context, op.Rt);
// Zero extend into an I64, then replicate. Saves the most time over elementwise inserts.
- switch (op.Size)
+ insert = op.Size switch
{
- case 2:
- insert = context.Multiply(context.ZeroExtend32(OperandType.I64, insert), Const(0x0000000100000001u));
- break;
- case 1:
- insert = context.Multiply(context.ZeroExtend16(OperandType.I64, insert), Const(0x0001000100010001u));
- break;
- case 0:
- insert = context.Multiply(context.ZeroExtend8(OperandType.I64, insert), Const(0x0101010101010101u));
- break;
- default:
- throw new InvalidOperationException("Unknown Vdup Size.");
- }
+ 2 => context.Multiply(context.ZeroExtend32(OperandType.I64, insert), Const(0x0000000100000001u)),
+ 1 => context.Multiply(context.ZeroExtend16(OperandType.I64, insert), Const(0x0001000100010001u)),
+ 0 => context.Multiply(context.ZeroExtend8(OperandType.I64, insert), Const(0x0101010101010101u)),
+ _ => throw new InvalidOperationException($"Invalid Vdup size \"{op.Size}\".")
+ };
InsertScalar(context, op.Vd, insert);
if (op.Q)
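
The multiply-by-constant trick above replicates a single element across a 64-bit lane with one multiply instead of one insert per element. A minimal standalone sketch of the same idea in plain C#; the input value 0xAB is purely illustrative and not taken from the patch:

    // Replicate an 8-bit element into every byte of a 64-bit lane.
    ulong element = 0xAB;                                // zero-extended 8-bit value
    ulong replicated = element * 0x0101010101010101UL;   // == 0xABABABABABABABAB
    // The 16- and 32-bit cases use 0x0001000100010001 and 0x0000000100000001 instead.
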
@@ -142,20 +136,13 @@ namespace ARMeilleure.Instructions
Operand insert = EmitVectorExtractZx32(context, op.Vm >> 1, ((op.Vm & 1) << (3 - op.Size)) + op.Index, op.Size);
// Zero extend into an I64, then replicate. Saves the most time over elementwise inserts.
- switch (op.Size)
+ insert = op.Size switch
{
- case 2:
- insert = context.Multiply(context.ZeroExtend32(OperandType.I64, insert), Const(0x0000000100000001u));
- break;
- case 1:
- insert = context.Multiply(context.ZeroExtend16(OperandType.I64, insert), Const(0x0001000100010001u));
- break;
- case 0:
- insert = context.Multiply(context.ZeroExtend8(OperandType.I64, insert), Const(0x0101010101010101u));
- break;
- default:
- throw new InvalidOperationException("Unknown Vdup Size.");
- }
+ 2 => context.Multiply(context.ZeroExtend32(OperandType.I64, insert), Const(0x0000000100000001u)),
+ 1 => context.Multiply(context.ZeroExtend16(OperandType.I64, insert), Const(0x0001000100010001u)),
+ 0 => context.Multiply(context.ZeroExtend8(OperandType.I64, insert), Const(0x0101010101010101u)),
+ _ => throw new InvalidOperationException($"Invalid Vdup size \"{op.Size}\".")
+ };
InsertScalar(context, op.Vd, insert);
if (op.Q)
@@ -575,51 +562,53 @@ namespace ARMeilleure.Instructions
}
}
- public static void Vmul_S(ArmEmitterContext context)
+ public static void Vmla_S(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
- EmitScalarBinaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd);
+ EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd);
}
else if (Optimizations.FastFP)
{
- EmitScalarBinaryOpF32(context, (op1, op2) => context.Multiply(op1, op2));
+ EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
+ {
+ return context.Add(op1, context.Multiply(op2, op3));
+ });
}
else
{
- EmitScalarBinaryOpF32(context, (op1, op2) =>
+ EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
{
- return EmitSoftFloatCall(context, SoftFloat32.FPMul, SoftFloat64.FPMul, op1, op2);
+ return EmitSoftFloatCall(context, SoftFloat32.FPMulAdd, SoftFloat64.FPMulAdd, op1, op2, op3);
});
}
}
- public static void Vmul_V(ArmEmitterContext context)
+ public static void Vmla_V(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
- EmitVectorBinaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd);
+ EmitVectorTernaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Addps, Intrinsic.X86Addpd);
}
else if (Optimizations.FastFP)
{
- EmitVectorBinaryOpF32(context, (op1, op2) => context.Multiply(op1, op2));
+ EmitVectorTernaryOpF32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
}
else
{
- EmitVectorBinaryOpF32(context, (op1, op2) =>
+ EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
{
- return EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulFpscr, SoftFloat64.FPMulFpscr, op1, op2);
+ return EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulAddFpscr, SoftFloat64.FPMulAddFpscr, op1, op2, op3);
});
}
}
- public static void Vmul_I(ArmEmitterContext context)
+ public static void Vmla_I(ArmEmitterContext context)
{
- if ((context.CurrOp as OpCode32SimdReg).U) throw new NotImplementedException("Polynomial mode not implemented");
- EmitVectorBinaryOpSx32(context, (op1, op2) => context.Multiply(op1, op2));
+ EmitVectorTernaryOpZx32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
}
- public static void Vmul_1(ArmEmitterContext context)
+ public static void Vmla_1(ArmEmitterContext context)
{
OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
@@ -627,70 +616,70 @@ namespace ARMeilleure.Instructions
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
- EmitVectorByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd);
+ EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Addps, Intrinsic.X86Addpd);
}
else if (Optimizations.FastFP)
{
- EmitVectorByScalarOpF32(context, (op1, op2) => context.Multiply(op1, op2));
+ EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
}
else
{
- EmitVectorByScalarOpF32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulFpscr, SoftFloat64.FPMulFpscr, op1, op2));
+ EmitVectorsByScalarOpF32(context, (op1, op2, op3) => EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulAddFpscr, SoftFloat64.FPMulAddFpscr, op1, op2, op3));
}
}
else
{
- EmitVectorByScalarOpI32(context, (op1, op2) => context.Multiply(op1, op2), false);
+ EmitVectorsByScalarOpI32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)), false);
}
}
- public static void Vmla_S(ArmEmitterContext context)
+ public static void Vmls_S(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
- EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd);
+ EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd);
}
else if (Optimizations.FastFP)
{
EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
{
- return context.Add(op1, context.Multiply(op2, op3));
+ return context.Subtract(op1, context.Multiply(op2, op3));
});
}
else
{
EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
{
- return EmitSoftFloatCall(context, SoftFloat32.FPMulAdd, SoftFloat64.FPMulAdd, op1, op2, op3);
+ return EmitSoftFloatCall(context, SoftFloat32.FPMulSub, SoftFloat64.FPMulSub, op1, op2, op3);
});
}
}
- public static void Vmla_V(ArmEmitterContext context)
+ public static void Vmls_V(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
- EmitVectorTernaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Addps, Intrinsic.X86Addpd);
+ EmitVectorTernaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Subps, Intrinsic.X86Subpd);
}
else if (Optimizations.FastFP)
{
- EmitVectorTernaryOpF32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
+ EmitVectorTernaryOpF32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
}
else
{
EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
{
- return EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulAddFpscr, SoftFloat64.FPMulAddFpscr, op1, op2, op3);
+ return EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulSubFpscr, SoftFloat64.FPMulSubFpscr, op1, op2, op3);
});
}
}
- public static void Vmla_I(ArmEmitterContext context)
+ public static void Vmls_I(ArmEmitterContext context)
{
- EmitVectorTernaryOpZx32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
+ EmitVectorTernaryOpZx32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
}
- public static void Vmla_1(ArmEmitterContext context)
+ public static void Vmls_1(ArmEmitterContext context)
{
OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
@@ -698,70 +687,83 @@ namespace ARMeilleure.Instructions
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
- EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Addps, Intrinsic.X86Addpd);
+ EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Subps, Intrinsic.X86Subpd);
}
else if (Optimizations.FastFP)
{
- EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
+ EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
}
else
{
- EmitVectorsByScalarOpF32(context, (op1, op2, op3) => EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulAddFpscr, SoftFloat64.FPMulAddFpscr, op1, op2, op3));
+ EmitVectorsByScalarOpF32(context, (op1, op2, op3) => EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulSubFpscr, SoftFloat64.FPMulSubFpscr, op1, op2, op3));
}
}
else
{
- EmitVectorsByScalarOpI32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)), false);
+ EmitVectorsByScalarOpI32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)), false);
}
}
- public static void Vmls_S(ArmEmitterContext context)
+ public static void Vmlsl_I(ArmEmitterContext context)
+ {
+ OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+
+ EmitVectorTernaryLongOpI32(context, (opD, op1, op2) => context.Subtract(opD, context.Multiply(op1, op2)), !op.U);
+ }
+
+ public static void Vmul_S(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
- EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd);
+ EmitScalarBinaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd);
}
else if (Optimizations.FastFP)
{
- EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
- {
- return context.Subtract(op1, context.Multiply(op2, op3));
- });
+ EmitScalarBinaryOpF32(context, (op1, op2) => context.Multiply(op1, op2));
}
else
{
- EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
+ EmitScalarBinaryOpF32(context, (op1, op2) =>
{
- return EmitSoftFloatCall(context, SoftFloat32.FPMulSub, SoftFloat64.FPMulSub, op1, op2, op3);
+ return EmitSoftFloatCall(context, SoftFloat32.FPMul, SoftFloat64.FPMul, op1, op2);
});
}
}
- public static void Vmls_V(ArmEmitterContext context)
+ public static void Vmul_V(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
- EmitVectorTernaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Subps, Intrinsic.X86Subpd);
+ EmitVectorBinaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd);
}
else if (Optimizations.FastFP)
{
- EmitVectorTernaryOpF32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
+ EmitVectorBinaryOpF32(context, (op1, op2) => context.Multiply(op1, op2));
}
else
{
- EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
+ EmitVectorBinaryOpF32(context, (op1, op2) =>
{
- return EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulSubFpscr, SoftFloat64.FPMulSubFpscr, op1, op2, op3);
+ return EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulFpscr, SoftFloat64.FPMulFpscr, op1, op2);
});
}
}
- public static void Vmls_I(ArmEmitterContext context)
+ public static void Vmul_I(ArmEmitterContext context)
{
- EmitVectorTernaryOpZx32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
+ OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+
+ if (op.U) // This instruction is always signed, U indicates polynomial mode.
+ {
+ EmitVectorBinaryOpZx32(context, (op1, op2) => EmitPolynomialMultiply(context, op1, op2, 8 << op.Size));
+ }
+ else
+ {
+ EmitVectorBinaryOpSx32(context, (op1, op2) => context.Multiply(op1, op2));
+ }
}
- public static void Vmls_1(ArmEmitterContext context)
+ public static void Vmul_1(ArmEmitterContext context)
{
OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
@@ -769,20 +771,41 @@ namespace ARMeilleure.Instructions
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
- EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Subps, Intrinsic.X86Subpd);
+ EmitVectorByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd);
}
else if (Optimizations.FastFP)
{
- EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
+ EmitVectorByScalarOpF32(context, (op1, op2) => context.Multiply(op1, op2));
}
else
{
- EmitVectorsByScalarOpF32(context, (op1, op2, op3) => EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulSubFpscr, SoftFloat64.FPMulSubFpscr, op1, op2, op3));
+ EmitVectorByScalarOpF32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMulFpscr, SoftFloat64.FPMulFpscr, op1, op2));
}
}
else
{
- EmitVectorsByScalarOpI32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)), false);
+ EmitVectorByScalarOpI32(context, (op1, op2) => context.Multiply(op1, op2), false);
+ }
+ }
+
+ public static void Vmull_1(ArmEmitterContext context)
+ {
+ OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
+
+ EmitVectorByScalarLongOpI32(context, (op1, op2) => context.Multiply(op1, op2), !op.U);
+ }
+
+ public static void Vmull_I(ArmEmitterContext context)
+ {
+ OpCode32SimdRegLong op = (OpCode32SimdRegLong)context.CurrOp;
+
+ if (op.Polynomial)
+ {
+ EmitVectorBinaryLongOpI32(context, (op1, op2) => EmitPolynomialMultiply(context, op1, op2, 8 << op.Size), false);
+ }
+ else
+ {
+ EmitVectorBinaryLongOpI32(context, (op1, op2) => context.Multiply(op1, op2), !op.U);
}
}
@@ -1157,5 +1180,27 @@ namespace ARMeilleure.Instructions
EmitVectorBinaryOpSimd32(context, genericEmit);
}
}
+
+ private static Operand EmitPolynomialMultiply(ArmEmitterContext context, Operand op1, Operand op2, int eSize)
+ {
+ Debug.Assert(eSize <= 32);
+
+ Operand result = eSize == 32 ? Const(0L) : Const(0);
+
+ if (eSize == 32)
+ {
+ op1 = context.ZeroExtend32(OperandType.I64, op1);
+ op2 = context.ZeroExtend32(OperandType.I64, op2);
+ }
+
+ for (int i = 0; i < eSize; i++)
+ {
+ Operand mask = context.BitwiseAnd(op1, Const(op1.Type, 1L << i));
+
+ result = context.BitwiseExclusiveOr(result, context.Multiply(op2, mask));
+ }
+
+ return result;
+ }
}
}
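
EmitPolynomialMultiply above computes the carryless (GF(2)) product used by the polynomial forms of VMUL and VMULL: for every set bit i of the first operand it XORs in the second operand shifted left by i, which the emitted IR expresses as result ^= op2 * (op1 & (1 << i)) (multiplying by a single-bit mask is the same as shifting). A scalar reference of the same computation, as a sketch for illustration only and not part of the patch:

    // Carryless multiply of two 8-bit polynomials over GF(2).
    static ulong PolyMul8(byte a, byte b)
    {
        ulong result = 0;

        for (int i = 0; i < 8; i++)
        {
            if ((a & (1 << i)) != 0)
            {
                result ^= (ulong)b << i; // XOR in b * x^i; no carries between bit positions.
            }
        }

        return result; // At most 15 bits wide, e.g. PolyMul8(0xFF, 0xFF) == 0x5555.
    }

In the long form (Vmull_I with op.Polynomial set) each such product widens the element, so 8-bit inputs produce 16-bit result elements.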