1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
|
// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
using ARMeilleure.IntermediateRepresentation;
using ARMeilleure.Translation;
using System;
using System.Diagnostics;
using static ARMeilleure.Instructions.InstEmitSimdHelper;
using static ARMeilleure.IntermediateRepresentation.Operand.Factory;
namespace ARMeilleure.Instructions
{
static class InstEmitHashHelper
{
public const uint Crc32RevPoly = 0xedb88320;
public const uint Crc32cRevPoly = 0x82f63b78;
public static Operand EmitCrc32(ArmEmitterContext context, Operand crc, Operand value, int size, bool castagnoli)
{
Debug.Assert(crc.Type.IsInteger() && value.Type.IsInteger());
Debug.Assert(size >= 0 && size < 4);
Debug.Assert((size < 3) || (value.Type == OperandType.I64));
if (castagnoli && Optimizations.UseSse42)
{
// The CRC32 instruction does not have an immediate variant, so ensure both inputs are in registers.
value = (value.Kind == OperandKind.Constant) ? context.Copy(value) : value;
crc = (crc.Kind == OperandKind.Constant) ? context.Copy(crc) : crc;
Intrinsic op = size switch
{
0 => Intrinsic.X86Crc32_8,
1 => Intrinsic.X86Crc32_16,
_ => Intrinsic.X86Crc32,
};
return (size == 3) ? context.ConvertI64ToI32(context.AddIntrinsicLong(op, crc, value)) : context.AddIntrinsicInt(op, crc, value);
}
else if (Optimizations.UsePclmulqdq)
{
return size switch
{
3 => EmitCrc32Optimized64(context, crc, value, castagnoli),
_ => EmitCrc32Optimized(context, crc, value, castagnoli, size),
};
}
else
{
string name = (size, castagnoli) switch
{
(0, false) => nameof(SoftFallback.Crc32b),
(1, false) => nameof(SoftFallback.Crc32h),
(2, false) => nameof(SoftFallback.Crc32w),
(3, false) => nameof(SoftFallback.Crc32x),
(0, true) => nameof(SoftFallback.Crc32cb),
(1, true) => nameof(SoftFallback.Crc32ch),
(2, true) => nameof(SoftFallback.Crc32cw),
(3, true) => nameof(SoftFallback.Crc32cx),
_ => throw new ArgumentOutOfRangeException(nameof(size)),
};
return context.Call(typeof(SoftFallback).GetMethod(name), crc, value);
}
}
private static Operand EmitCrc32Optimized(ArmEmitterContext context, Operand crc, Operand data, bool castagnoli, int size)
{
long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))'
long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1
crc = context.VectorInsert(context.VectorZero(), crc, 0);
switch (size)
{
case 0:
data = context.VectorInsert8(context.VectorZero(), data, 0);
break;
case 1:
data = context.VectorInsert16(context.VectorZero(), data, 0);
break;
case 2:
data = context.VectorInsert(context.VectorZero(), data, 0);
break;
}
int bitsize = 8 << size;
Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data);
tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(64 - bitsize));
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(0));
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
if (bitsize < 32)
{
crc = context.AddIntrinsic(Intrinsic.X86Pslldq, crc, Const((64 - bitsize) / 8));
tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, crc);
}
return context.VectorExtract(OperandType.I32, tmp, 2);
}
private static Operand EmitCrc32Optimized64(ArmEmitterContext context, Operand crc, Operand data, bool castagnoli)
{
long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))'
long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1
crc = context.VectorInsert(context.VectorZero(), crc, 0);
data = context.VectorInsert(context.VectorZero(), data, 0);
Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data);
Operand res = context.AddIntrinsic(Intrinsic.X86Pslldq, tmp, Const(4));
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, res, X86GetScalar(context, mu), Const(0));
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, res);
tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(32));
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(1));
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
return context.VectorExtract(OperandType.I32, tmp, 2);
}
}
}
|