/*
 * Decompiled with CFR 0.152.
 */
package jdk.graal.compiler.lir.aarch64;

import java.util.Arrays;
import jdk.graal.compiler.asm.Label;
import jdk.graal.compiler.asm.aarch64.AArch64ASIMDAssembler;
import jdk.graal.compiler.asm.aarch64.AArch64Address;
import jdk.graal.compiler.asm.aarch64.AArch64Assembler;
import jdk.graal.compiler.asm.aarch64.AArch64MacroAssembler;
import jdk.graal.compiler.core.common.Stride;
import jdk.graal.compiler.debug.GraalError;
import jdk.graal.compiler.lir.LIRInstruction;
import jdk.graal.compiler.lir.LIRInstructionClass;
import jdk.graal.compiler.lir.Opcode;
import jdk.graal.compiler.lir.aarch64.AArch64ComplexVectorOp;
import jdk.graal.compiler.lir.aarch64.AArch64LIRInstruction;
import jdk.graal.compiler.lir.asm.ArrayDataPointerConstant;
import jdk.graal.compiler.lir.asm.CompilationResultBuilder;
import jdk.graal.compiler.lir.gen.LIRGeneratorTool;
import jdk.vm.ci.aarch64.AArch64;
import jdk.vm.ci.code.CodeUtil;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.code.ValueUtil;
import jdk.vm.ci.meta.AllocatableValue;
import jdk.vm.ci.meta.JavaKind;
import jdk.vm.ci.meta.Value;

/**
 * LIR instruction that emits AArch64 code computing the hash of a primitive array: starting from
 * {@code initialValue}, every element is folded in as {@code h = 31 * h + element} using 32-bit
 * wrapping arithmetic.
 *
 * <p>The emitted code has three parts:
 * <ol>
 * <li>a vector loop that consumes {@code 4 * nRegs} elements per iteration (skipped when the array
 * is shorter than that), followed by a reduction of the vector accumulators into the scalar
 * result;</li>
 * <li>a scalar loop that consumes the remaining elements two at a time;</li>
 * <li>a final step for a single trailing element, if any.</li>
 * </ol>
 *
 * <p>Supported element kinds are {@code Boolean}, {@code Byte}, {@code Short}, {@code Char} and
 * {@code Int}; any other kind raises a {@link GraalError} while emitting code.
 */
@Opcode("VECTORIZED_HASHCODE")
public final class AArch64VectorizedHashCodeOp extends AArch64ComplexVectorOp {
    public static final LIRInstructionClass<AArch64VectorizedHashCodeOp> TYPE = LIRInstructionClass.create(AArch64VectorizedHashCodeOp.class);

    /** Register receiving the computed 32-bit hash. */
    @LIRInstruction.Def(value={LIRInstruction.OperandFlag.REG})
    private Value resultValue;
    /** Address of the first array element; copied to a temp before being advanced. */
    @LIRInstruction.Alive(value={LIRInstruction.OperandFlag.REG})
    private Value arrayStart;
    /** Number of elements to hash (read as a 32-bit value). */
    @LIRInstruction.Alive(value={LIRInstruction.OperandFlag.REG})
    private Value length;
    /** Hash value to start from (read as a 32-bit value). */
    @LIRInstruction.Alive(value={LIRInstruction.OperandFlag.REG})
    private Value initialValue;

    /** Element kind of the hashed array; selects load width and sign/zero extension. */
    private final JavaKind arrayKind;
    /** Number of vector accumulators used by the vector loop (2, 4 or 8). */
    private final int nRegs;

    /** Five general-purpose temps: array cursor, remaining count, two scratch values, scalar index. */
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG})
    Value[] temp;
    /** {@code 1 + 2 * nRegs} vector temps: the multiplier, the coefficients and the accumulators. */
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG})
    Value[] vectorTemp;

    /** Number of vector accumulators used by this implementation. */
    private static final int VECTOR_COUNT = 4;

    /**
     * Powers of 31 truncated to 32 bits, highest exponent first and ending with
     * {@code 31^0 == 1}. The trailing {@code 4 * nRegs} entries are the per-lane coefficients of
     * the final reduction; the entry just before them is the per-iteration multiplier.
     */
    private static final int[] POWERS_OF_31_BACKWARDS = new int[]{2111290369, -2010103841, 350799937, 11316127, 693101697, -254736545, 961614017, 31019807, -2077209343, -67006753, 1244764481, -2038056289, 211350913, -408824225, -844471871, -997072353, 1353309697, -510534177, 1507551809, -505558625, -293403007, 129082719, -1796951359, -196513505, -1807454463, 1742810335, 887503681, 28629151, 923521, 29791, 961, 31, 1};

    /**
     * Creates the instruction and allocates its temporaries.
     *
     * @param tool generator used to allocate the general-purpose and vector temporaries
     * @param result destination of the computed hash
     * @param arrayStart address of the first element to hash
     * @param length number of elements to hash
     * @param initialValue hash value to start from
     * @param arrayKind element kind of the array
     */
    public AArch64VectorizedHashCodeOp(LIRGeneratorTool tool, AllocatableValue result, AllocatableValue arrayStart, AllocatableValue length, AllocatableValue initialValue, JavaKind arrayKind) {
        super(TYPE);
        this.resultValue = result;
        this.arrayStart = arrayStart;
        this.length = length;
        this.initialValue = initialValue;
        this.arrayKind = arrayKind;
        this.nRegs = VECTOR_COUNT;
        this.temp = allocateTempRegisters(tool, 5);
        this.vectorTemp = allocateVectorRegisters(tool, 1 + 2 * this.nRegs);
    }

    /**
     * Converts {@code registerValues[start, end)} to registers.
     *
     * @return a new array of length {@code end - start}
     */
    private static Register[] asRegisterSlice(Value[] registerValues, int start, int end) {
        Register[] regs = new Register[end - start];
        for (int i = start; i < end; i++) {
            regs[i - start] = ValueUtil.asRegister(registerValues[i]);
        }
        return regs;
    }

    /**
     * Emits a scalar load of one array element of kind {@code eltype} from {@code src} into
     * {@code dst}. {@code Byte} and {@code Short} use the {@code ldrs} (signed) load to 32 bits,
     * the other kinds use a plain {@code ldr} of the element width.
     *
     * @throws GraalError if {@code eltype} is not one of the supported kinds
     */
    private static void arraysHashcodeElload(AArch64MacroAssembler masm, Register dst, AArch64Address src, JavaKind eltype) {
        switch (eltype) {
            case Boolean -> masm.ldr(8, dst, src);
            case Byte -> masm.ldrs(32, 8, dst, src);
            case Short -> masm.ldrs(32, 16, dst, src);
            case Char -> masm.ldr(16, dst, src);
            case Int -> masm.ldr(32, dst, src);
            default -> throw GraalError.shouldNotReachHere("Unsupported JavaKind " + eltype);
        }
    }

    /**
     * Emits the hash code computation described in the class comment.
     *
     * @param crb used to record the reference to the coefficient table placed in the data section
     * @param masm assembler receiving the instructions
     */
    @Override
    public void emitCode(CompilationResultBuilder crb, AArch64MacroAssembler masm) {
        Label labelShortUnrolledBegin = new Label();
        Label labelEnd = new Label();

        Register lengthParam = ValueUtil.asRegister(length);
        Register arrayStartParam = ValueUtil.asRegister(arrayStart);
        Register initialValueParam = ValueUtil.asRegister(initialValue);
        Register result = ValueUtil.asRegister(resultValue);
        Register ary1 = ValueUtil.asRegister(temp[0]);
        Register cnt1 = ValueUtil.asRegister(temp[1]);
        Register tmp2 = ValueUtil.asRegister(temp[2]);
        Register tmp3 = ValueUtil.asRegister(temp[3]);
        Register index = ValueUtil.asRegister(temp[4]);

        // Work on copies: the inputs are @Alive and must not be modified.
        masm.mov(64, ary1, arrayStartParam);
        masm.mov(32, cnt1, lengthParam);
        masm.mov(32, result, initialValueParam);

        Register vnext = ValueUtil.asRegister(vectorTemp[0]);
        Register[] vcoef = asRegisterSlice(vectorTemp, 1, 1 + nRegs);
        Register[] vresult = asRegisterSlice(vectorTemp, 1 + nRegs, 1 + 2 * nRegs);
        // vtmp aliases vcoef: the coefficients are only loaded after the vector loop, so the same
        // registers serve as load/extend scratch space inside the loop.
        Register[] vtmp = vcoef;

        boolean unsigned = arrayKind == JavaKind.Boolean || arrayKind == JavaKind.Char;
        AArch64ASIMDAssembler.ElementSize elSize = switch (arrayKind) {
            case Boolean, Byte -> AArch64ASIMDAssembler.ElementSize.Byte;
            case Short, Char -> AArch64ASIMDAssembler.ElementSize.HalfWord;
            case Int -> AArch64ASIMDAssembler.ElementSize.Word;
            default -> throw GraalError.shouldNotReachHereUnexpectedValue(arrayKind);
        };
        Stride stride = Stride.fromJavaKind(arrayKind);

        GraalError.guarantee(nRegs == 2 || nRegs == 4 || nRegs == 8, "number of vectors must be either 2, 4, or 8");

        // Word-sized lanes per full vector register, and elements consumed per loop iteration.
        int elementsPerVector = AArch64ASIMDAssembler.ASIMDSize.FullReg.bytes() / AArch64ASIMDAssembler.ElementSize.Word.bytes();
        int elementsPerIteration = elementsPerVector * nRegs;
        int maxConsecutiveRegs = Math.min(nRegs, 4);

        // Pick the width of each raw load: narrow elements need fewer bits to fill the same
        // number of word lanes once they have been widened.
        int minBits = elSize.bits() * elementsPerVector;
        int maxBits = Math.min(elSize.bits() * elementsPerIteration, AArch64ASIMDAssembler.ASIMDSize.FullReg.bits());
        AArch64ASIMDAssembler.ASIMDSize loadVecSize = nRegs * minBits <= AArch64ASIMDAssembler.ASIMDSize.HalfReg.bits() ? AArch64ASIMDAssembler.ASIMDSize.HalfReg : AArch64ASIMDAssembler.ASIMDSize.FullReg;
        int loadVecBits = loadVecSize.bits();
        GraalError.guarantee(CodeUtil.isPowerOf2(loadVecBits) && loadVecBits % minBits == 0 && loadVecBits <= maxBits, "loaded bit width must be (2^n)*%d <= %d for %s", minBits, maxBits, elSize);

        // regsFilledPerLoad is the number of vtmp registers each load + extend step below fills.
        int extensionFactor = Math.max(AArch64ASIMDAssembler.ElementSize.Word.bits() / elSize.bits() / (maxBits / loadVecBits), 1);
        int consecutiveRegs = Math.max(maxConsecutiveRegs / extensionFactor, 1);
        int regsFilledPerLoad = consecutiveRegs * extensionFactor;

        // bound = cnt1 rounded down to a multiple of elementsPerIteration; if that is zero the
        // array is too short for the vector loop and only the scalar part runs.
        Register bound = tmp2;
        masm.ands(32, bound, cnt1, ~(elementsPerIteration - 1));
        masm.branchConditionally(AArch64Assembler.ConditionFlag.EQ, labelShortUnrolledBegin);

        // Clear the vector accumulators.
        for (int idx = 0; idx < nRegs; idx++) {
            masm.neon.moviVI(AArch64ASIMDAssembler.ASIMDSize.FullReg, vresult[idx], 0L);
        }

        // Per-iteration multiplier: the power of 31 just above the reduction coefficients,
        // broadcast to every lane of vnext and also kept in a scalar register.
        int powersOf31Start = POWERS_OF_31_BACKWARDS.length - elementsPerIteration;
        int nextPow31 = POWERS_OF_31_BACKWARDS[powersOf31Start - 1];
        Register next = tmp3;
        masm.mov(next, nextPow31);
        masm.neon.dupVG(AArch64ASIMDAssembler.ASIMDSize.FullReg, AArch64ASIMDAssembler.ElementSize.Word, vnext, next);

        // cnt1 = number of elements left for the scalar part;
        // bound = address at which the vector loop stops.
        masm.sub(32, cnt1, cnt1, bound);
        masm.add(64, bound, ary1, bound, AArch64Assembler.ShiftType.LSL, stride.log2);

        Label labelUnrolledVectorLoopBegin = new Label();
        masm.align(16);
        masm.bind(labelUnrolledVectorLoopBegin);
        // The scalar result is scaled alongside the vector accumulators.
        masm.mul(32, result, result, next);
        // Load the next elementsPerIteration elements (advancing ary1) and widen them to words.
        for (int ldVi = 0; ldVi < nRegs; ldVi += regsFilledPerLoad) {
            boolean postIndex = true;
            loadConsecutiveVectors(masm, loadVecSize, loadVecBits, elSize, ary1, vtmp, ldVi, consecutiveRegs, postIndex, false);
            extendVectorsToWord(masm, unsigned, loadVecBits, elSize, vtmp, ldVi, consecutiveRegs);
        }
        // vresult[i] = vresult[i] * vnext + vtmp[i], computed in vtmp[i] and then moved back.
        for (int idx = 0; idx < nRegs; idx++) {
            masm.neon.mlaVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, AArch64ASIMDAssembler.ElementSize.Word, vtmp[idx], vresult[idx], vnext);
        }
        for (int idx = 0; idx < nRegs; idx++) {
            masm.neon.moveVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, vresult[idx], vtmp[idx]);
        }
        masm.cmp(64, ary1, bound);
        masm.branchConditionally(AArch64Assembler.ConditionFlag.LO, labelUnrolledVectorLoopBegin);

        // Load the per-lane coefficients from the data section. tmp2 (bound) is free again here.
        ArrayDataPointerConstant powersOf31 = new ArrayDataPointerConstant(Arrays.copyOfRange(POWERS_OF_31_BACKWARDS, powersOf31Start, POWERS_OF_31_BACKWARDS.length), 16);
        crb.recordDataReferenceInCode(powersOf31);
        Register coefAddrReg = tmp2;
        masm.adrpAdd(coefAddrReg);
        for (int i = 0; i < nRegs; i += maxConsecutiveRegs) {
            // Post-indexing is only needed when more than one batch of registers is loaded.
            boolean postIndex = maxConsecutiveRegs < nRegs;
            loadConsecutiveVectors(masm, AArch64ASIMDAssembler.ASIMDSize.FullReg, AArch64ASIMDAssembler.ASIMDSize.FullReg.bits(), AArch64ASIMDAssembler.ElementSize.Word, coefAddrReg, vcoef, i, maxConsecutiveRegs, postIndex, false);
        }

        // vresult[0] = sum over i of vresult[i] * vcoef[i]; nRegs >= 2 is guaranteed above.
        masm.neon.mulVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, AArch64ASIMDAssembler.ElementSize.Word, vresult[0], vresult[0], vcoef[0]);
        for (int i = 1; i < nRegs; i++) {
            masm.neon.mlaVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, AArch64ASIMDAssembler.ElementSize.Word, vresult[0], vresult[i], vcoef[i]);
        }
        // Add up the lanes of vresult[0] and fold the sum into the scalar result.
        masm.neon.addvSV(AArch64ASIMDAssembler.ASIMDSize.FullReg, AArch64ASIMDAssembler.ElementSize.Word, vresult[0], vresult[0]);
        masm.fmov(32, tmp2, vresult[0]);
        masm.add(32, result, result, tmp2);

        // Scalar part: cnt1 elements remain, starting at ary1.
        masm.align(16);
        masm.bind(labelShortUnrolledBegin);
        Label labelShortUnrolledLoopExit = new Label();
        // Every load through this address advances ary1 by one element.
        AArch64Address postIndexAddr = AArch64Address.createImmediateAddress(elSize.bits(), AArch64Address.AddressingMode.IMMEDIATE_POST_INDEXED, ary1, elSize.bytes());
        // index starts at 1 so that "index < cnt1" means at least two elements remain.
        masm.mov(index, 1);
        masm.cmp(32, index, cnt1);
        masm.branchConditionally(AArch64Assembler.ConditionFlag.HS, labelShortUnrolledLoopExit);
        try (AArch64MacroAssembler.ScratchRegister scratch1 = masm.getScratchRegister();
             AArch64MacroAssembler.ScratchRegister scratch2 = masm.getScratchRegister()) {
            Register tmp31 = scratch1.getRegister();
            Register tmp961 = scratch2.getRegister();
            masm.mov(tmp31, 31);
            masm.mov(tmp961, 961);

            Label labelShortUnrolledLoopBegin = new Label();
            masm.align(16);
            masm.bind(labelShortUnrolledLoopBegin);
            // Two elements a, b per iteration: result = result * 961 + (a * 31 + b).
            arraysHashcodeElload(masm, tmp2, postIndexAddr, arrayKind);
            arraysHashcodeElload(masm, tmp3, postIndexAddr, arrayKind);
            masm.madd(32, tmp3, tmp2, tmp31, tmp3);
            masm.madd(32, result, result, tmp961, tmp3);
            masm.add(32, index, index, 2);
            masm.cmp(32, index, cnt1);
            masm.branchConditionally(AArch64Assembler.ConditionFlag.LO, labelShortUnrolledLoopBegin);

            // Both paths reaching this label have just compared index with cnt1:
            // HI (index > cnt1) means nothing is left, otherwise exactly one element remains.
            masm.bind(labelShortUnrolledLoopExit);
            masm.branchConditionally(AArch64Assembler.ConditionFlag.HI, labelEnd);
            // The path that skipped the loop above never initialized tmp31.
            masm.mov(tmp31, 31);
            // Last element c: result = result * 31 + c.
            arraysHashcodeElload(masm, tmp3, postIndexAddr, arrayKind);
            masm.madd(32, result, result, tmp31, tmp3);
        }
        masm.bind(labelEnd);
    }

    /**
     * Emits loads filling {@code vreg[startReg .. startReg + consecutiveRegs)} from consecutive
     * memory at {@code indexedAddrReg}.
     *
     * <p>Two and four registers are loaded with one or two pair loads ({@code fldp}); any other
     * count is loaded register by register. When {@code preferLd1} is set and its additional
     * conditions hold, {@code ld1} multiple-structure loads are used instead.
     *
     * @param vecSize register size used to build the addresses
     * @param vecBits number of bits loaded into each register
     * @param elSize element size passed to the {@code ld1} forms
     * @param indexedAddrReg base address register; advanced past the loaded data when
     *            {@code postIndex} is set
     * @param consecutiveRegs number of registers to load, at most 4
     * @param postIndex whether the loads use post-indexed addressing
     * @param preferLd1 whether {@code ld1} should be used where possible
     */
    private static void loadConsecutiveVectors(AArch64MacroAssembler masm, AArch64ASIMDAssembler.ASIMDSize vecSize, int vecBits, AArch64ASIMDAssembler.ElementSize elSize, Register indexedAddrReg, Register[] vreg, int startReg, int consecutiveRegs, boolean postIndex, boolean preferLd1) {
        assert consecutiveRegs <= 4 : consecutiveRegs;
        switch (consecutiveRegs) {
            case 2 -> {
                if (preferLd1 && allConsecutiveSIMDRegisters(vreg) && vecBits == vecSize.bits()) {
                    masm.neon.ld1MultipleVV(vecSize, elSize, vreg[startReg], vreg[startReg + 1], addressForLd1M(2, vecSize, elSize, indexedAddrReg, postIndex));
                } else {
                    masm.fldp(vecBits, vreg[startReg], vreg[startReg + 1], addressForLdp(vecSize, indexedAddrReg, postIndex, 0));
                }
            }
            case 4 -> {
                if (preferLd1 && allConsecutiveSIMDRegisters(vreg) && vecBits == vecSize.bits()) {
                    masm.neon.ld1MultipleVVVV(vecSize, elSize, vreg[startReg], vreg[startReg + 1], vreg[startReg + 2], vreg[startReg + 3], addressForLd1M(4, vecSize, elSize, indexedAddrReg, postIndex));
                } else {
                    // The second pair sits two registers further on; the offset is ignored when
                    // post-indexing because the first load already advanced the base.
                    masm.fldp(vecBits, vreg[startReg], vreg[startReg + 1], addressForLdp(vecSize, indexedAddrReg, postIndex, 0));
                    masm.fldp(vecBits, vreg[startReg + 2], vreg[startReg + 3], addressForLdp(vecSize, indexedAddrReg, postIndex, 2));
                }
            }
            default -> {
                // Without an offset form, ld1 can only load several registers in a row when the
                // base register is advanced after every load.
                boolean useLd1 = preferLd1 && (consecutiveRegs == 1 || postIndex) && vecBits == vecSize.bits();
                for (int i = 0; i < consecutiveRegs; i++) {
                    if (useLd1) {
                        masm.neon.ld1MultipleV(vecSize, elSize, vreg[startReg + i], addressForLd1M(1, vecSize, elSize, indexedAddrReg, postIndex));
                    } else {
                        masm.fldr(vecBits, vreg[startReg + i], addressForLdr(vecSize, indexedAddrReg, postIndex, i));
                    }
                }
            }
        }
    }

    /**
     * Builds the address for a single-register load: post-indexed by one register size, or the
     * base plus {@code offsetScaled} register sizes.
     */
    private static AArch64Address addressForLdr(AArch64ASIMDAssembler.ASIMDSize vecSize, Register indexedAddrReg, boolean postIndex, int offsetScaled) {
        int scale = vecSize.bytes();
        if (postIndex) {
            return AArch64Address.createImmediateAddress(vecSize.bits(), AArch64Address.AddressingMode.IMMEDIATE_POST_INDEXED, indexedAddrReg, scale);
        }
        return AArch64Address.createImmediateAddress(vecSize.bits(), AArch64Address.AddressingMode.IMMEDIATE_UNSIGNED_SCALED, indexedAddrReg, offsetScaled * scale);
    }

    /**
     * Builds the address for a register-pair load: post-indexed by two register sizes, or the
     * base plus {@code offsetScaled} register sizes.
     */
    private static AArch64Address addressForLdp(AArch64ASIMDAssembler.ASIMDSize vecSize, Register indexedAddrReg, boolean postIndex, int offsetScaled) {
        int scale = vecSize.bytes();
        if (postIndex) {
            return AArch64Address.createImmediateAddress(vecSize.bits(), AArch64Address.AddressingMode.IMMEDIATE_PAIR_POST_INDEXED, indexedAddrReg, 2 * scale);
        }
        return AArch64Address.createImmediateAddress(vecSize.bits(), AArch64Address.AddressingMode.IMMEDIATE_PAIR_SIGNED_SCALED, indexedAddrReg, offsetScaled * scale);
    }

    /**
     * Builds the address for an {@code ld1} multiple-structure load of {@code loadedVectors}
     * registers: post-indexed by the total number of bytes loaded, or the plain base register.
     *
     * @throws GraalError if post-indexing is requested for a count outside 1 to 4
     */
    private static AArch64Address addressForLd1M(int loadedVectors, AArch64ASIMDAssembler.ASIMDSize vecSize, AArch64ASIMDAssembler.ElementSize elSize, Register indexedAddrReg, boolean postIndex) {
        if (!postIndex) {
            return AArch64Address.createStructureNoOffsetAddress(indexedAddrReg);
        }
        AArch64ASIMDAssembler.ASIMDInstruction ld1Instruction = switch (loadedVectors) {
            case 1 -> AArch64ASIMDAssembler.ASIMDInstruction.LD1_MULTIPLE_1R;
            case 2 -> AArch64ASIMDAssembler.ASIMDInstruction.LD1_MULTIPLE_2R;
            case 3 -> AArch64ASIMDAssembler.ASIMDInstruction.LD1_MULTIPLE_3R;
            case 4 -> AArch64ASIMDAssembler.ASIMDInstruction.LD1_MULTIPLE_4R;
            default -> throw GraalError.shouldNotReachHereUnexpectedValue(loadedVectors);
        };
        int incrementUnscaled = vecSize.bytes() * loadedVectors;
        return AArch64Address.createStructureImmediatePostIndexAddress(ld1Instruction, vecSize, elSize, indexedAddrReg, incrementUnscaled);
    }

    /**
     * Emits the widening steps that turn the freshly loaded elements in
     * {@code vtmp[start .. start + srcRegsInitial)} into word-sized lanes.
     *
     * <p>Each step doubles the element size. While the loaded data occupies less than a full
     * register the widening happens in place; after that every source register is split into two
     * destination registers, doubling the number of registers in use.
     *
     * @param unsigned whether to zero-extend instead of sign-extend
     * @param startVecBits number of bits that were loaded into each register
     * @param startElSize element size of the loaded data
     */
    private static void extendVectorsToWord(AArch64MacroAssembler masm, boolean unsigned, int startVecBits, AArch64ASIMDAssembler.ElementSize startElSize, Register[] vtmp, int start, int srcRegsInitial) {
        AArch64ASIMDAssembler.ElementSize endElSize = AArch64ASIMDAssembler.ElementSize.Word;
        int vecBits = startVecBits;
        int srcRegs = srcRegsInitial;
        for (AArch64ASIMDAssembler.ElementSize srcElSize = startElSize; srcElSize.bits() < endElSize.bits(); srcElSize = srcElSize.expand()) {
            if (vecBits < AArch64ASIMDAssembler.ASIMDSize.FullReg.bits()) {
                extendSameRegs(masm, unsigned, srcElSize, vtmp, start, srcRegs);
                vecBits *= 2;
            } else {
                extendPairwise(masm, unsigned, srcElSize, vtmp, start, srcRegs);
                srcRegs *= 2;
            }
        }
    }

    /** Widens each of {@code vtmp[start .. start + srcRegs)} in place. */
    private static void extendSameRegs(AArch64MacroAssembler masm, boolean unsigned, AArch64ASIMDAssembler.ElementSize srcElSize, Register[] vtmp, int start, int srcRegs) {
        for (int i = start; i < start + srcRegs; i++) {
            xtlVV(masm, unsigned, srcElSize, vtmp[i], vtmp[i]);
        }
    }

    /**
     * Widens {@code vtmp[start .. start + srcRegs)} into {@code vtmp[start .. start + 2 * srcRegs)},
     * register {@code start + k} producing registers {@code start + 2k} and {@code start + 2k + 1}.
     *
     * <p>Sources are processed from the highest index down so that no source is overwritten
     * before it has been read. Within a step the {@code xtl2} form is emitted first because the
     * following {@code xtl} may write over its own source (this happens for the first register).
     */
    private static void extendPairwise(AArch64MacroAssembler masm, boolean unsigned, AArch64ASIMDAssembler.ElementSize srcElSize, Register[] vtmp, int start, int srcRegs) {
        int dsti = start + 2 * srcRegs - 1;
        for (int srci = start + srcRegs - 1; srci >= start; srci--) {
            xtl2VV(masm, unsigned, srcElSize, vtmp[dsti], vtmp[srci]);
            xtlVV(masm, unsigned, srcElSize, vtmp[dsti - 1], vtmp[srci]);
            dsti -= 2;
        }
    }

    /** Emits {@code uxtl} when {@code unsigned} is set and {@code sxtl} otherwise. */
    private static void xtlVV(AArch64MacroAssembler masm, boolean unsigned, AArch64ASIMDAssembler.ElementSize elementSize, Register dst, Register src) {
        if (unsigned) {
            masm.neon.uxtlVV(elementSize, dst, src);
        } else {
            masm.neon.sxtlVV(elementSize, dst, src);
        }
    }

    /** Emits {@code uxtl2} when {@code unsigned} is set and {@code sxtl2} otherwise. */
    private static void xtl2VV(AArch64MacroAssembler masm, boolean unsigned, AArch64ASIMDAssembler.ElementSize elementSize, Register dst, Register src) {
        if (unsigned) {
            masm.neon.uxtl2VV(elementSize, dst, src);
        } else {
            masm.neon.sxtl2VV(elementSize, dst, src);
        }
    }

    /**
     * Returns whether every register in {@code regs} has the encoding following that of its
     * predecessor, wrapping around at the number of SIMD registers.
     */
    private static boolean allConsecutiveSIMDRegisters(Register[] regs) {
        int numRegs = AArch64.simdRegisters.size();
        for (int i = 1; i < regs.length; i++) {
            assert regs[i].getRegisterCategory().equals(AArch64.SIMD) : regs[i];
            if ((regs[i - 1].encoding + 1) % numRegs != regs[i].encoding) {
                return false;
            }
        }
        return true;
    }
}

