diff options
author | riperiperi <rhy3756547@hotmail.com> | 2020-07-13 11:48:14 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-07-13 20:48:14 +1000 |
commit | d7044b10a253dae31b9a0041a432e3a7adce59f6 (patch) | |
tree | 20947496224af14c43803de8169a6493aa77b45b /ARMeilleure/Instructions/InstEmitHashHelper.cs | |
parent | 30d4f752f47217bcdc4dd05428010acf270189d0 (diff) |
Add SSE4.2 Path for CRC32, add A32 variant, add tests for non-castagnoli variants. (#1328)
* Add CRC32 A32 instructions.
* Fix CRC32 instructions.
* Add CRC intrinsic and fast path.
Loop is currently unrolled, will look into adding temp vars after tests are added.
* Begin work on Crc tests
* Fix SSE4.2 path for CRC32C, finialize tests.
* Remove unused IR path.
* Fix spacing between prefix checks.
* This should be Src.
* PTC Version
* OpCodeTable Order
* Integer check improvement. Value and Crc can be either 32 or 64 size.
* This wasn't necessary...
* If size is 3, value type must be I64.
* Fix same src+dest handling for non crc intrinsics.
* Pre-fix (ha) issue with vex encodings
Diffstat (limited to 'ARMeilleure/Instructions/InstEmitHashHelper.cs')
-rw-r--r-- | ARMeilleure/Instructions/InstEmitHashHelper.cs | 119 |
1 files changed, 119 insertions, 0 deletions
diff --git a/ARMeilleure/Instructions/InstEmitHashHelper.cs b/ARMeilleure/Instructions/InstEmitHashHelper.cs new file mode 100644 index 00000000..9206e6d5 --- /dev/null +++ b/ARMeilleure/Instructions/InstEmitHashHelper.cs @@ -0,0 +1,119 @@ +// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf + +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; +using System; +using System.Diagnostics; + +using static ARMeilleure.IntermediateRepresentation.OperandHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper; + +namespace ARMeilleure.Instructions +{ + static class InstEmitHashHelper + { + public const uint Crc32RevPoly = 0xedb88320; + public const uint Crc32cRevPoly = 0x82f63b78; + + public static Operand EmitCrc32(ArmEmitterContext context, Operand crc, Operand value, int size, bool castagnoli) + { + Debug.Assert(crc.Type.IsInteger() && value.Type.IsInteger()); + Debug.Assert(size >= 0 && size < 4); + Debug.Assert((size < 3) || (value.Type == OperandType.I64)); + + if (castagnoli && Optimizations.UseSse42) + { + // The CRC32 instruction does not have an immediate variant, so ensure both inputs are in registers. + value = (value.Kind == OperandKind.Constant) ? context.Copy(value) : value; + crc = (crc.Kind == OperandKind.Constant) ? context.Copy(crc) : crc; + + Intrinsic op = size switch + { + 0 => Intrinsic.X86Crc32_8, + 1 => Intrinsic.X86Crc32_16, + _ => Intrinsic.X86Crc32, + }; + + return (size == 3) ? context.ConvertI64ToI32(context.AddIntrinsicLong(op, crc, value)) : context.AddIntrinsicInt(op, crc, value); + } + else if (Optimizations.UsePclmulqdq) + { + return size switch + { + 3 => EmitCrc32Optimized64(context, crc, value, castagnoli), + _ => EmitCrc32Optimized(context, crc, value, castagnoli, size), + }; + } + else + { + string name = (size, castagnoli) switch + { + (0, false) => nameof(SoftFallback.Crc32b), + (1, false) => nameof(SoftFallback.Crc32h), + (2, false) => nameof(SoftFallback.Crc32w), + (3, false) => nameof(SoftFallback.Crc32x), + (0, true) => nameof(SoftFallback.Crc32cb), + (1, true) => nameof(SoftFallback.Crc32ch), + (2, true) => nameof(SoftFallback.Crc32cw), + (3, true) => nameof(SoftFallback.Crc32cx), + _ => throw new ArgumentOutOfRangeException(nameof(size)) + }; + + return context.Call(typeof(SoftFallback).GetMethod(name), crc, value); + } + } + + private static Operand EmitCrc32Optimized(ArmEmitterContext context, Operand crc, Operand data, bool castagnoli, int size) + { + long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))' + long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1 + + crc = context.VectorInsert(context.VectorZero(), crc, 0); + + switch (size) + { + case 0: data = context.VectorInsert8(context.VectorZero(), data, 0); break; + case 1: data = context.VectorInsert16(context.VectorZero(), data, 0); break; + case 2: data = context.VectorInsert(context.VectorZero(), data, 0); break; + } + + int bitsize = 8 << size; + + Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data); + tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(64 - bitsize)); + tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(0)); + tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0)); + + if (bitsize < 32) + { + crc = context.AddIntrinsic(Intrinsic.X86Pslldq, crc, Const((64 - bitsize) / 8)); + tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, crc); + } + + return context.VectorExtract(OperandType.I32, tmp, 2); + } + + private static Operand EmitCrc32Optimized64(ArmEmitterContext context, Operand crc, Operand data, bool castagnoli) + { + long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))' + long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1 + + crc = context.VectorInsert(context.VectorZero(), crc, 0); + data = context.VectorInsert(context.VectorZero(), data, 0); + + Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data); + Operand res = context.AddIntrinsic(Intrinsic.X86Pslldq, tmp, Const(4)); + + tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, res, X86GetScalar(context, mu), Const(0)); + tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0)); + + tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, res); + tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(32)); + + tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(1)); + tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0)); + + return context.VectorExtract(OperandType.I32, tmp, 2); + } + } +} |