aboutsummaryrefslogtreecommitdiff
path: root/ARMeilleure
diff options
context:
space:
mode:
authorLDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>2020-12-17 20:43:41 +0100
committerGitHub <noreply@github.com>2020-12-17 20:43:41 +0100
commit8a33e884f8f482e93e2b90380b158c1417cc50f8 (patch)
tree65eabad1c3a78d2a3bd7bf7992413fa78056178f /ARMeilleure
parentb5c215111de665ef8d18b38405ac55e17996e30e (diff)
Fix Vnmls_S fast path (F64: losing input d value). Fix Vnmla_S & Vnmls_S slow paths (using fused inst.s). Fix Vfma_V slow path not using StandardFPSCRValue(). (#1775)
* Fix Vnmls_S fast path (F64: losing input d value). Fix Vnmla_S & Vnmls_S slow paths (using fused inst.s). Add Vfma_S & Vfms_S Fma fast paths. Add Vfnma_S inst. with Fma/Sse fast paths and slow path. Add Vfnms_S Sse fast path. Add Tests for affected inst.s. Nits. * InternalVersion = 1775 * Nits. * Fix Vfma_V slow path not using StandardFPSCRValue(). * Nit: Fix Vfma_V order. * Add Vfms_V Sse fast path and slow path. * Add Vfma_V and Vfms_V Test.
Diffstat (limited to 'ARMeilleure')
-rw-r--r--ARMeilleure/CodeGen/X86/Assembler.cs14
-rw-r--r--ARMeilleure/CodeGen/X86/CodeGenerator.cs14
-rw-r--r--ARMeilleure/CodeGen/X86/IntrinsicTable.cs8
-rw-r--r--ARMeilleure/CodeGen/X86/X86Instruction.cs12
-rw-r--r--ARMeilleure/Decoders/OpCodeTable.cs1
-rw-r--r--ARMeilleure/Instructions/InstEmitAlu32.cs2
-rw-r--r--ARMeilleure/Instructions/InstEmitMul32.cs2
-rw-r--r--ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs102
-rw-r--r--ARMeilleure/Instructions/InstEmitSimdHelper32.cs26
-rw-r--r--ARMeilleure/IntermediateRepresentation/Intrinsic.cs8
-rw-r--r--ARMeilleure/Translation/PTC/Ptc.cs2
11 files changed, 103 insertions, 88 deletions
diff --git a/ARMeilleure/CodeGen/X86/Assembler.cs b/ARMeilleure/CodeGen/X86/Assembler.cs
index 7f19c3c4..2484e251 100644
--- a/ARMeilleure/CodeGen/X86/Assembler.cs
+++ b/ARMeilleure/CodeGen/X86/Assembler.cs
@@ -274,17 +274,15 @@ namespace ARMeilleure.CodeGen.X86
Add(X86Instruction.Vcvtph2ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3813, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Vcvtps2ph, new InstructionInfo(0x000f3a1d, BadOp, BadOp, BadOp, BadOp, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Vfmadd231ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38b8, InstructionFlags.Vex | InstructionFlags.Prefix66));
- Add(X86Instruction.Vfmadd231pd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38b8, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
- Add(X86Instruction.Vfmadd231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38b9, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Vfmadd231sd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38b9, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
- Add(X86Instruction.Vfmsub231ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38ba, InstructionFlags.Vex | InstructionFlags.Prefix66));
- Add(X86Instruction.Vfmsub231pd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38ba, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
- Add(X86Instruction.Vfmsub231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bb, InstructionFlags.Vex | InstructionFlags.Prefix66));
+ Add(X86Instruction.Vfmadd231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38b9, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Vfmsub231sd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bb, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
- Add(X86Instruction.Vfnmsub231ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38be, InstructionFlags.Vex | InstructionFlags.Prefix66));
- Add(X86Instruction.Vfnmsub231pd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38be, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
- Add(X86Instruction.Vfnmsub231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bf, InstructionFlags.Vex | InstructionFlags.Prefix66));
+ Add(X86Instruction.Vfmsub231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bb, InstructionFlags.Vex | InstructionFlags.Prefix66));
+ Add(X86Instruction.Vfnmadd231ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bc, InstructionFlags.Vex | InstructionFlags.Prefix66));
+ Add(X86Instruction.Vfnmadd231sd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bd, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
+ Add(X86Instruction.Vfnmadd231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bd, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Vfnmsub231sd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bf, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
+ Add(X86Instruction.Vfnmsub231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bf, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Vpblendvb, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4c, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Xor, new InstructionInfo(0x00000031, 0x06000083, 0x06000081, BadOp, 0x00000033, InstructionFlags.None));
Add(X86Instruction.Xorpd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f57, InstructionFlags.Vex | InstructionFlags.Prefix66));
diff --git a/ARMeilleure/CodeGen/X86/CodeGenerator.cs b/ARMeilleure/CodeGen/X86/CodeGenerator.cs
index 29a4cd78..5f41ff79 100644
--- a/ARMeilleure/CodeGen/X86/CodeGenerator.cs
+++ b/ARMeilleure/CodeGen/X86/CodeGenerator.cs
@@ -440,9 +440,12 @@ namespace ARMeilleure.CodeGen.X86
else
{
EnsureSameReg(dest, src1);
+
Debug.Assert(src3.GetRegister().Index == 0);
+
context.Assembler.WriteInstruction(info.Inst, dest, src1, src2);
}
+
break;
}
@@ -474,11 +477,16 @@ namespace ARMeilleure.CodeGen.X86
Operand src2 = operation.GetSource(1);
Operand src3 = operation.GetSource(2);
- EnsureSameType(dest, src1, src2, src3);
- EnsureSameReg(dest, src1);
- Debug.Assert(!dest.Type.IsInteger());
Debug.Assert(HardwareCapabilities.SupportsVexEncoding);
+ Debug.Assert(dest.Kind == OperandKind.Register && src1.Kind == OperandKind.Register && src2.Kind == OperandKind.Register);
+ Debug.Assert(src3.Kind == OperandKind.Register || src3.Kind == OperandKind.Memory);
+
+ EnsureSameType(dest, src1, src2, src3);
+ Debug.Assert(dest.Type == OperandType.V128);
+
+ Debug.Assert(dest.Value == src1.Value);
+
context.Assembler.WriteInstruction(info.Inst, dest, src2, src3);
break;
diff --git a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
index 195ce91d..9030be3c 100644
--- a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
+++ b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
@@ -166,16 +166,14 @@ namespace ARMeilleure.CodeGen.X86
Add(Intrinsic.X86Unpcklps, new IntrinsicInfo(X86Instruction.Unpcklps, IntrinsicType.Binary));
Add(Intrinsic.X86Vcvtph2ps, new IntrinsicInfo(X86Instruction.Vcvtph2ps, IntrinsicType.Unary));
Add(Intrinsic.X86Vcvtps2ph, new IntrinsicInfo(X86Instruction.Vcvtps2ph, IntrinsicType.BinaryImm));
- Add(Intrinsic.X86Vfmadd231pd, new IntrinsicInfo(X86Instruction.Vfmadd231pd, IntrinsicType.Fma));
Add(Intrinsic.X86Vfmadd231ps, new IntrinsicInfo(X86Instruction.Vfmadd231ps, IntrinsicType.Fma));
Add(Intrinsic.X86Vfmadd231sd, new IntrinsicInfo(X86Instruction.Vfmadd231sd, IntrinsicType.Fma));
Add(Intrinsic.X86Vfmadd231ss, new IntrinsicInfo(X86Instruction.Vfmadd231ss, IntrinsicType.Fma));
- Add(Intrinsic.X86Vfmsub231pd, new IntrinsicInfo(X86Instruction.Vfmsub231pd, IntrinsicType.Fma));
- Add(Intrinsic.X86Vfmsub231ps, new IntrinsicInfo(X86Instruction.Vfmsub231ps, IntrinsicType.Fma));
Add(Intrinsic.X86Vfmsub231sd, new IntrinsicInfo(X86Instruction.Vfmsub231sd, IntrinsicType.Fma));
Add(Intrinsic.X86Vfmsub231ss, new IntrinsicInfo(X86Instruction.Vfmsub231ss, IntrinsicType.Fma));
- Add(Intrinsic.X86Vfnmsub231pd, new IntrinsicInfo(X86Instruction.Vfnmsub231pd, IntrinsicType.Fma));
- Add(Intrinsic.X86Vfnmsub231ps, new IntrinsicInfo(X86Instruction.Vfnmsub231ps, IntrinsicType.Fma));
+ Add(Intrinsic.X86Vfnmadd231ps, new IntrinsicInfo(X86Instruction.Vfnmadd231ps, IntrinsicType.Fma));
+ Add(Intrinsic.X86Vfnmadd231sd, new IntrinsicInfo(X86Instruction.Vfnmadd231sd, IntrinsicType.Fma));
+ Add(Intrinsic.X86Vfnmadd231ss, new IntrinsicInfo(X86Instruction.Vfnmadd231ss, IntrinsicType.Fma));
Add(Intrinsic.X86Vfnmsub231sd, new IntrinsicInfo(X86Instruction.Vfnmsub231sd, IntrinsicType.Fma));
Add(Intrinsic.X86Vfnmsub231ss, new IntrinsicInfo(X86Instruction.Vfnmsub231ss, IntrinsicType.Fma));
Add(Intrinsic.X86Xorpd, new IntrinsicInfo(X86Instruction.Xorpd, IntrinsicType.Binary));
diff --git a/ARMeilleure/CodeGen/X86/X86Instruction.cs b/ARMeilleure/CodeGen/X86/X86Instruction.cs
index 7ed4841c..ed5b50c5 100644
--- a/ARMeilleure/CodeGen/X86/X86Instruction.cs
+++ b/ARMeilleure/CodeGen/X86/X86Instruction.cs
@@ -203,18 +203,16 @@ namespace ARMeilleure.CodeGen.X86
Vblendvps,
Vcvtph2ps,
Vcvtps2ph,
- Vfmadd231pd,
Vfmadd231ps,
Vfmadd231sd,
Vfmadd231ss,
- Vfmsub231ps,
- Vfmsub231pd,
- Vfmsub231ss,
Vfmsub231sd,
- Vfnmsub231ps,
- Vfnmsub231pd,
- Vfnmsub231ss,
+ Vfmsub231ss,
+ Vfnmadd231ps,
+ Vfnmadd231sd,
+ Vfnmadd231ss,
Vfnmsub231sd,
+ Vfnmsub231ss,
Vpblendvb,
Xor,
Xorpd,
diff --git a/ARMeilleure/Decoders/OpCodeTable.cs b/ARMeilleure/Decoders/OpCodeTable.cs
index 88c68644..665e7129 100644
--- a/ARMeilleure/Decoders/OpCodeTable.cs
+++ b/ARMeilleure/Decoders/OpCodeTable.cs
@@ -822,6 +822,7 @@ namespace ARMeilleure.Decoders
SetA32("<<<<11101x10xxxxxxxx101xx0x0xxxx", InstName.Vfma, InstEmit32.Vfma_S, OpCode32SimdRegS.Create);
SetA32("111100100x00xxxxxxxx1100xxx1xxxx", InstName.Vfma, InstEmit32.Vfma_V, OpCode32SimdReg.Create);
SetA32("<<<<11101x10xxxxxxxx101xx1x0xxxx", InstName.Vfms, InstEmit32.Vfms_S, OpCode32SimdRegS.Create);
+ SetA32("111100100x10xxxxxxxx1100xxx1xxxx", InstName.Vfms, InstEmit32.Vfms_V, OpCode32SimdReg.Create);
SetA32("<<<<11101x01xxxxxxxx101xx1x0xxxx", InstName.Vfnma, InstEmit32.Vfnma_S, OpCode32SimdRegS.Create);
SetA32("<<<<11101x01xxxxxxxx101xx0x0xxxx", InstName.Vfnms, InstEmit32.Vfnms_S, OpCode32SimdRegS.Create);
SetA32("1111001x0x<<xxxxxxxx0000xxx0xxxx", InstName.Vhadd, InstEmit32.Vhadd, OpCode32SimdReg.Create);
diff --git a/ARMeilleure/Instructions/InstEmitAlu32.cs b/ARMeilleure/Instructions/InstEmitAlu32.cs
index d57ff0b6..f3da121c 100644
--- a/ARMeilleure/Instructions/InstEmitAlu32.cs
+++ b/ARMeilleure/Instructions/InstEmitAlu32.cs
@@ -591,7 +591,7 @@ namespace ARMeilleure.Instructions
EmitAluStore(context, res);
}
- public static void EmitDiv(ArmEmitterContext context, bool unsigned)
+ private static void EmitDiv(ArmEmitterContext context, bool unsigned)
{
Operand n = GetAluN(context);
Operand m = GetAluM(context);
diff --git a/ARMeilleure/Instructions/InstEmitMul32.cs b/ARMeilleure/Instructions/InstEmitMul32.cs
index 454d44a4..fa744d25 100644
--- a/ARMeilleure/Instructions/InstEmitMul32.cs
+++ b/ARMeilleure/Instructions/InstEmitMul32.cs
@@ -329,7 +329,7 @@ namespace ARMeilleure.Instructions
EmitGenericAluStoreA32(context, op.RdLo, op.SetFlags, lo);
}
- public static void EmitMlal(ArmEmitterContext context, bool signed)
+ private static void EmitMlal(ArmEmitterContext context, bool signed)
{
OpCode32AluUmull op = (OpCode32AluUmull)context.CurrOp;
diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
index 40289520..d35af209 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
@@ -252,44 +252,48 @@ namespace ARMeilleure.Instructions
}
}
- public static void Vfma_V(ArmEmitterContext context) // Fused.
+ public static void Vfma_S(ArmEmitterContext context) // Fused.
{
if (Optimizations.FastFP && Optimizations.UseFma)
{
- // Vectors contain elements that are 32-bits in length always. The only thing that will change is the number of elements in a vector.
- // The 64-bit variant will never be used.
- EmitVectorTernaryOpF32(context, Intrinsic.X86Vfmadd231ps, Intrinsic.X86Vfmadd231pd);
+ EmitScalarTernaryOpF32(context, Intrinsic.X86Vfmadd231ss, Intrinsic.X86Vfmadd231sd);
+ }
+ else if (Optimizations.FastFP && Optimizations.UseSse2)
+ {
+ EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd);
}
else
{
- EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
+ EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
{
return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), op1, op2, op3);
});
}
}
- public static void Vfma_S(ArmEmitterContext context) // Fused.
+ public static void Vfma_V(ArmEmitterContext context) // Fused.
{
- if (Optimizations.FastFP && Optimizations.UseSse2)
+ if (Optimizations.FastFP && Optimizations.UseFma)
{
- // TODO: Use FMA instruction set.
- EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd);
+ EmitVectorTernaryOpF32(context, Intrinsic.X86Vfmadd231ps);
}
else
{
- EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
+ EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
{
- return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), op1, op2, op3);
+ return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulAddFpscr), op1, op2, op3);
});
}
}
public static void Vfms_S(ArmEmitterContext context) // Fused.
{
- if (Optimizations.FastFP && Optimizations.UseSse2)
+ if (Optimizations.FastFP && Optimizations.UseFma)
+ {
+ EmitScalarTernaryOpF32(context, Intrinsic.X86Vfnmadd231ss, Intrinsic.X86Vfnmadd231sd);
+ }
+ else if (Optimizations.FastFP && Optimizations.UseSse2)
{
- // TODO: Use FMA instruction set.
EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd);
}
else
@@ -301,17 +305,36 @@ namespace ARMeilleure.Instructions
}
}
+ public static void Vfms_V(ArmEmitterContext context) // Fused.
+ {
+ if (Optimizations.FastFP && Optimizations.UseFma)
+ {
+ EmitVectorTernaryOpF32(context, Intrinsic.X86Vfnmadd231ps);
+ }
+ else
+ {
+ EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
+ {
+ return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulSubFpscr), op1, op2, op3);
+ });
+ }
+ }
+
public static void Vfnma_S(ArmEmitterContext context) // Fused.
{
if (Optimizations.FastFP && Optimizations.UseFma)
{
EmitScalarTernaryOpF32(context, Intrinsic.X86Vfnmsub231ss, Intrinsic.X86Vfnmsub231sd);
}
+ else if (Optimizations.FastFP && Optimizations.UseSse2)
+ {
+ EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd, isNegD: true);
+ }
else
{
EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
{
- return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), context.Negate(op1), context.Negate(op2), op3);
+ return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulAdd), op1, op2, op3);
});
}
}
@@ -322,11 +345,15 @@ namespace ARMeilleure.Instructions
{
EmitScalarTernaryOpF32(context, Intrinsic.X86Vfmsub231ss, Intrinsic.X86Vfmsub231sd);
}
+ else if (Optimizations.FastFP && Optimizations.UseSse2)
+ {
+ EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd, isNegD: true);
+ }
else
{
EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
{
- return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), context.Negate(op1), op2, op3);
+ return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulSub), op1, op2, op3);
});
}
}
@@ -422,36 +449,21 @@ namespace ARMeilleure.Instructions
if (Optimizations.FastFP && Optimizations.UseSse2)
{
- EmitScalarTernaryOpSimd32(context, (d, n, m) =>
- {
- if ((op.Size & 1) == 0)
- {
- Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
- res = context.AddIntrinsic(Intrinsic.X86Addss, d, res);
- Operand mask = X86GetScalar(context, -0f);
- return context.AddIntrinsic(Intrinsic.X86Xorps, mask, res);
- }
- else
- {
- Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
- res = context.AddIntrinsic(Intrinsic.X86Addsd, d, res);
- Operand mask = X86GetScalar(context, -0d);
- return context.AddIntrinsic(Intrinsic.X86Xorpd, mask, res);
- }
- });
+ EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd, isNegD: true);
}
else if (Optimizations.FastFP)
{
EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
{
- return context.Negate(context.Add(op1, context.Multiply(op2, op3)));
+ return context.Subtract(context.Negate(op1), context.Multiply(op2, op3));
});
}
else
{
EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
{
- return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulAdd), op1, op2, op3);
+ Operand res = EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op2, op3);
+ return EmitSoftFloatCall(context, nameof(SoftFloat32.FPSub), context.Negate(op1), res);
});
}
}
@@ -462,24 +474,7 @@ namespace ARMeilleure.Instructions
if (Optimizations.FastFP && Optimizations.UseSse2)
{
- EmitScalarTernaryOpSimd32(context, (d, n, m) =>
- {
- if ((op.Size & 1) == 0)
- {
- Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
- Operand mask = X86GetScalar(context, -0f);
- d = context.AddIntrinsic(Intrinsic.X86Xorps, mask, d);
- return context.AddIntrinsic(Intrinsic.X86Addss, d, res);
-
- }
- else
- {
- Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
- Operand mask = X86GetScalar(context, -0d);
- d = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, res);
- return context.AddIntrinsic(Intrinsic.X86Addsd, d, res);
- }
- });
+ EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd, isNegD: true);
}
else if (Optimizations.FastFP)
{
@@ -492,7 +487,8 @@ namespace ARMeilleure.Instructions
{
EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
{
- return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulSub), op1, op2, op3);
+ Operand res = EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op2, op3);
+ return EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), context.Negate(op1), res);
});
}
}
diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
index 2d5d4ba9..39195057 100644
--- a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
@@ -820,15 +820,15 @@ namespace ARMeilleure.Instructions
});
}
- public static void EmitVectorTernaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
+ public static void EmitVectorTernaryOpF32(ArmEmitterContext context, Intrinsic inst32)
{
OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
- Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32;
+ Debug.Assert((op.Size & 1) == 0);
EmitVectorTernaryOpSimd32(context, (d, n, m) =>
{
- return context.AddIntrinsic(inst, d, n, m);
+ return context.AddIntrinsic(inst32, d, n, m);
});
}
@@ -927,7 +927,13 @@ namespace ARMeilleure.Instructions
});
}
- public static void EmitScalarTernaryOpF32(ArmEmitterContext context, Intrinsic inst32pt1, Intrinsic inst64pt1, Intrinsic inst32pt2, Intrinsic inst64pt2)
+ public static void EmitScalarTernaryOpF32(
+ ArmEmitterContext context,
+ Intrinsic inst32pt1,
+ Intrinsic inst64pt1,
+ Intrinsic inst32pt2,
+ Intrinsic inst64pt2,
+ bool isNegD = false)
{
OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;
@@ -939,6 +945,18 @@ namespace ARMeilleure.Instructions
EmitScalarTernaryOpSimd32(context, (d, n, m) =>
{
Operand res = context.AddIntrinsic(inst1, n, m);
+
+ if (isNegD)
+ {
+ Operand mask = doubleSize
+ ? X86GetScalar(context, -0d)
+ : X86GetScalar(context, -0f);
+
+ d = doubleSize
+ ? context.AddIntrinsic(Intrinsic.X86Xorpd, mask, d)
+ : context.AddIntrinsic(Intrinsic.X86Xorps, mask, d);
+ }
+
return context.AddIntrinsic(inst2, d, res);
});
}
diff --git a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
index 515f1143..e2989863 100644
--- a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
+++ b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
@@ -155,16 +155,14 @@ namespace ARMeilleure.IntermediateRepresentation
X86Unpcklps,
X86Vcvtph2ps,
X86Vcvtps2ph,
- X86Vfmadd231pd,
X86Vfmadd231ps,
X86Vfmadd231sd,
X86Vfmadd231ss,
- X86Vfmsub231pd,
- X86Vfmsub231ps,
X86Vfmsub231sd,
X86Vfmsub231ss,
- X86Vfnmsub231pd,
- X86Vfnmsub231ps,
+ X86Vfnmadd231ps,
+ X86Vfnmadd231sd,
+ X86Vfnmadd231ss,
X86Vfnmsub231sd,
X86Vfnmsub231ss,
X86Xorpd,
diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs
index 3150c97c..b5a92b97 100644
--- a/ARMeilleure/Translation/PTC/Ptc.cs
+++ b/ARMeilleure/Translation/PTC/Ptc.cs
@@ -22,7 +22,7 @@ namespace ARMeilleure.Translation.PTC
{
private const string HeaderMagic = "PTChd";
- private const uint InternalVersion = 1713; //! To be incremented manually for each change to the ARMeilleure project.
+ private const int InternalVersion = 1775; //! To be incremented manually for each change to the ARMeilleure project.
private const string ActualDir = "0";
private const string BackupDir = "1";