// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf

using ARMeilleure.IntermediateRepresentation;
using ARMeilleure.Translation;
using System;
using System.Diagnostics;
using static ARMeilleure.Instructions.InstEmitSimdHelper;
using static ARMeilleure.IntermediateRepresentation.Operand.Factory;

namespace ARMeilleure.Instructions
{
    /// <summary>
    /// Helpers that emit the intermediate representation for CRC32 / CRC32C accumulate operations.
    /// </summary>
    static class InstEmitHashHelper
    {
        public const uint Crc32RevPoly = 0xedb88320;
        public const uint Crc32cRevPoly = 0x82f63b78;

        /// <summary>
        /// Emits a CRC32 update of <paramref name="crc"/> with <paramref name="value"/>, choosing between the
        /// SSE4.2 CRC32 intrinsic, a PCLMULQDQ based sequence, or a call into <c>SoftFallback</c>.
        /// </summary>
        /// <param name="context">Emitter context that receives the generated operations.</param>
        /// <param name="crc">Current CRC accumulator (integer operand).</param>
        /// <param name="value">Data to fold into the CRC (integer operand; asserted to be I64 when <paramref name="size"/> is 3).</param>
        /// <param name="size">Log2 of the data width in bytes; asserted to be in the range 0 to 3.</param>
        /// <param name="castagnoli">True to use the CRC32C constants/helpers, false for the plain CRC32 ones.</param>
        /// <returns>The operand holding the updated CRC.</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// Thrown on the software fallback path when <paramref name="size"/> is not 0, 1, 2 or 3.
        /// </exception>
        public static Operand EmitCrc32(ArmEmitterContext context, Operand crc, Operand value, int size, bool castagnoli)
        {
            Debug.Assert(crc.Type.IsInteger() && value.Type.IsInteger());
            Debug.Assert(size >= 0 && size < 4);
            Debug.Assert((size < 3) || (value.Type == OperandType.I64));

            // The hardware CRC32 intrinsic is only selected for the Castagnoli variant.
            if (castagnoli && Optimizations.UseSse42)
            {
                return EmitCrc32Sse42(context, crc, value, size);
            }

            if (Optimizations.UsePclmulqdq)
            {
                return (size == 3)
                    ? EmitCrc32Optimized64(context, crc, value, castagnoli)
                    : EmitCrc32Optimized(context, crc, value, castagnoli, size);
            }

            return EmitCrc32Fallback(context, crc, value, size, castagnoli);
        }

        /// <summary>
        /// Emits the CRC update using the X86 CRC32 intrinsics.
        /// </summary>
        private static Operand EmitCrc32Sse42(ArmEmitterContext context, Operand crc, Operand value, int size)
        {
            // The CRC32 instruction does not have an immediate variant, so ensure both inputs are in registers.
            if (value.Kind == OperandKind.Constant)
            {
                value = context.Copy(value);
            }

            if (crc.Kind == OperandKind.Constant)
            {
                crc = context.Copy(crc);
            }

            Intrinsic crcInst;

            switch (size)
            {
                case 0:
                    crcInst = Intrinsic.X86Crc32_8;
                    break;
                case 1:
                    crcInst = Intrinsic.X86Crc32_16;
                    break;
                default:
                    crcInst = Intrinsic.X86Crc32;
                    break;
            }

            if (size != 3)
            {
                return context.AddIntrinsicInt(crcInst, crc, value);
            }

            // For the widest input the intrinsic is added through the "long" variant and its result is
            // converted back to I32 afterwards.
            Operand wideResult = context.AddIntrinsicLong(crcInst, crc, value);

            return context.ConvertI64ToI32(wideResult);
        }

        /// <summary>
        /// Emits a call to the matching <c>SoftFallback</c> CRC routine, selected by data size and variant.
        /// </summary>
        private static Operand EmitCrc32Fallback(ArmEmitterContext context, Operand crc, Operand value, int size, bool castagnoli)
        {
            string helperName = size switch
            {
                0 => castagnoli ? nameof(SoftFallback.Crc32cb) : nameof(SoftFallback.Crc32b),
                1 => castagnoli ? nameof(SoftFallback.Crc32ch) : nameof(SoftFallback.Crc32h),
                2 => castagnoli ? nameof(SoftFallback.Crc32cw) : nameof(SoftFallback.Crc32w),
                3 => castagnoli ? nameof(SoftFallback.Crc32cx) : nameof(SoftFallback.Crc32x),
                _ => throw new ArgumentOutOfRangeException(nameof(size)),
            };

            return context.Call(typeof(SoftFallback).GetMethod(helperName), crc, value);
        }

        /// <summary>
        /// Returns the two constants used by the PCLMULQDQ sequences for the selected CRC variant.
        /// </summary>
        private static (long Mu, long Polynomial) GetReductionConstants(bool castagnoli)
        {
            if (castagnoli)
            {
                return (
                    0x0DEA713F1L,  // mu' = floor(x^64/P(x))'
                    0x105EC76F0L); // P'(x) << 1
            }

            return (
                0x1F7011641L,  // mu' = floor(x^64/P(x))'
                0x1DB710641L); // P'(x) << 1
        }

        /// <summary>
        /// Emits two chained PCLMULQDQ operations: <paramref name="source"/> times mu, then that product
        /// times the polynomial constant.
        /// </summary>
        /// <param name="muSelector">
        /// Immediate passed to the first PCLMULQDQ (the second always uses 0).
        /// NOTE(review): per the instruction definition this immediate picks which 64-bit halves are multiplied.
        /// </param>
        private static Operand EmitReduction(ArmEmitterContext context, Operand source, long mu, long polynomial, int muSelector)
        {
            // Each scalar constant is materialized right before the multiply that consumes it.
            Operand muVec = X86GetScalar(context, mu);
            Operand timesMu = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, source, muVec, Const(muSelector));

            Operand polyVec = X86GetScalar(context, polynomial);

            return context.AddIntrinsic(Intrinsic.X86Pclmulqdq, timesMu, polyVec, Const(0));
        }

        /// <summary>
        /// PCLMULQDQ based CRC update for 8, 16 and 32-bit data (<paramref name="size"/> 0, 1 or 2).
        /// </summary>
        private static Operand EmitCrc32Optimized(ArmEmitterContext context, Operand crc, Operand data, bool castagnoli, int size)
        {
            (long mu, long polynomial) = GetReductionConstants(castagnoli);

            Operand crcVec = context.VectorInsert(context.VectorZero(), crc, 0);
            Operand dataVec = data;

            // Place the data in element 0 of an otherwise zero vector, using the insert that matches its width.
            if (size == 0)
            {
                dataVec = context.VectorInsert8(context.VectorZero(), data, 0);
            }
            else if (size == 1)
            {
                dataVec = context.VectorInsert16(context.VectorZero(), data, 0);
            }
            else if (size == 2)
            {
                dataVec = context.VectorInsert(context.VectorZero(), data, 0);
            }

            int bitsize = 8 << size;
            int shift = 64 - bitsize;

            Operand acc = context.AddIntrinsic(Intrinsic.X86Pxor, crcVec, dataVec);
            acc = context.AddIntrinsic(Intrinsic.X86Psllq, acc, Const(shift));
            acc = EmitReduction(context, acc, mu, polynomial, 0);

            if (bitsize < 32)
            {
                // For inputs narrower than 32 bits, the byte-shifted CRC vector is XORed into the product.
                Operand shiftedCrc = context.AddIntrinsic(Intrinsic.X86Pslldq, crcVec, Const(shift / 8));
                acc = context.AddIntrinsic(Intrinsic.X86Pxor, acc, shiftedCrc);
            }

            // The result is read from 32-bit element 2 of the vector.
            return context.VectorExtract(OperandType.I32, acc, 2);
        }

        /// <summary>
        /// PCLMULQDQ based CRC update for 64-bit data, performed as two reduction rounds.
        /// </summary>
        private static Operand EmitCrc32Optimized64(ArmEmitterContext context, Operand crc, Operand data, bool castagnoli)
        {
            (long mu, long polynomial) = GetReductionConstants(castagnoli);

            Operand crcVec = context.VectorInsert(context.VectorZero(), crc, 0);
            Operand dataVec = context.VectorInsert(context.VectorZero(), data, 0);

            Operand mixed = context.AddIntrinsic(Intrinsic.X86Pxor, crcVec, dataVec);
            Operand shifted = context.AddIntrinsic(Intrinsic.X86Pslldq, mixed, Const(4));

            // First round, applied to the byte-shifted input.
            Operand acc = EmitReduction(context, shifted, mu, polynomial, 0);

            acc = context.AddIntrinsic(Intrinsic.X86Pxor, acc, shifted);
            acc = context.AddIntrinsic(Intrinsic.X86Psllq, acc, Const(32));

            // Second round; the mu multiply is emitted with immediate 1 instead of 0.
            acc = EmitReduction(context, acc, mu, polynomial, 1);

            // The result is read from 32-bit element 2 of the vector.
            return context.VectorExtract(OperandType.I32, acc, 2);
        }
    }
}