/*
 * Decompiled with CFR 0.152.
 */
package org.graalvm.compiler.lir.aarch64;

import jdk.vm.ci.aarch64.AArch64;
import jdk.vm.ci.aarch64.AArch64Kind;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.code.ValueUtil;
import jdk.vm.ci.meta.AllocatableValue;
import jdk.vm.ci.meta.Value;
import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.aarch64.AArch64ASIMDAssembler;
import org.graalvm.compiler.asm.aarch64.AArch64Address;
import org.graalvm.compiler.asm.aarch64.AArch64Assembler;
import org.graalvm.compiler.asm.aarch64.AArch64MacroAssembler;
import org.graalvm.compiler.debug.GraalError;
import org.graalvm.compiler.lir.LIRInstruction;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.Opcode;
import org.graalvm.compiler.lir.aarch64.AArch64ComplexVectorOp;
import org.graalvm.compiler.lir.aarch64.AArch64LIRInstruction;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
import org.graalvm.compiler.lir.gen.LIRGeneratorTool;

/**
 * LIR instruction that emits AArch64 code comparing two memory regions and producing the index
 * of the first element at which they differ.
 *
 * <p>As emitted by this class, the element count in {@code lengthValue} is shifted left by
 * {@code strideValue} to obtain a byte length, the comparison itself works on bytes, and the
 * resulting byte index is shifted right by {@code strideValue} again at the common exit. When no
 * difference is found the result register holds {@code -1} after that final shift.
 *
 * <p>The emitted code is organised by byte length:
 * <ul>
 * <li>32 bytes or more: a quick 8-byte check followed by a loop over 32-byte chunks, finished by
 * one (possibly overlapping) 32-byte chunk ending exactly at the end of the arrays;</li>
 * <li>16 to 31 bytes: two (possibly overlapping) 16-byte vector loads;</li>
 * <li>8/4/2 bytes and up: two (possibly overlapping) scalar loads of that width;</li>
 * <li>0 or 1 byte: handled by the final scalar tail.</li>
 * </ul>
 *
 * <p>NOTE(review): {@code allocateTempRegisters}, {@code allocateVectorRegisters},
 * {@code initCalcIndexOfFirstMatchMask} and {@code calcIndexOfFirstMatch} are not defined in this
 * file (presumably inherited from {@link AArch64ComplexVectorOp}); their exact contracts should
 * be confirmed there.
 */
@Opcode(value="VECTORIZED_MISMATCH")
public final class AArch64VectorizedMismatchOp
extends AArch64ComplexVectorOp {
    public static final LIRInstructionClass<AArch64VectorizedMismatchOp> TYPE = LIRInstructionClass.create(AArch64VectorizedMismatchOp.class);
    // Output: index of the first mismatch, or -1 if none was found (see emitCode).
    @LIRInstruction.Def(value={LIRInstruction.OperandFlag.REG})
    protected Value resultValue;
    // Base address of the first region; never written by the emitted code (a scratch copy is advanced instead).
    @LIRInstruction.Alive(value={LIRInstruction.OperandFlag.REG})
    protected Value arrayAValue;
    // Base address of the second region; likewise only read.
    @LIRInstruction.Alive(value={LIRInstruction.OperandFlag.REG})
    protected Value arrayBValue;
    // Number of elements to compare; converted to a byte length by shifting left by strideValue.
    @LIRInstruction.Alive(value={LIRInstruction.OperandFlag.REG})
    protected Value lengthValue;
    // Shift amount used to convert between element counts and byte counts.
    @LIRInstruction.Alive(value={LIRInstruction.OperandFlag.REG})
    protected Value strideValue;
    // Three general-purpose temporaries: [0] byte length, [1] scratch, [2] address of the last 32-byte chunk.
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG})
    protected AllocatableValue[] temp;
    // Five vector temporaries: [0..1] data of array A, [2..3] data of array B, [4] mask for calcIndexOfFirstMatch.
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG})
    protected AllocatableValue[] vectorTemp;

    /**
     * Creates the operation and reserves its temporary registers.
     *
     * @param tool generator used to allocate the temporary registers
     * @param result destination of the mismatch index; must have platform kind {@code DWORD}
     * @param arrayA base address of the first region; must have platform kind {@code QWORD}
     * @param arrayB base address of the second region; must have the same platform kind as {@code arrayA}
     * @param length element count; must have platform kind {@code DWORD}
     * @param stride shift amount between element and byte counts; must have platform kind {@code DWORD}
     */
    public AArch64VectorizedMismatchOp(LIRGeneratorTool tool, Value result, Value arrayA, Value arrayB, Value length, Value stride) {
        super((LIRInstructionClass<? extends AArch64LIRInstruction>)TYPE);
        GraalError.guarantee(result.getPlatformKind() == AArch64Kind.DWORD, "int value expected");
        GraalError.guarantee(arrayA.getPlatformKind() == AArch64Kind.QWORD && arrayA.getPlatformKind() == arrayB.getPlatformKind(), "pointer value expected");
        GraalError.guarantee(length.getPlatformKind() == AArch64Kind.DWORD, "int value expected");
        GraalError.guarantee(stride.getPlatformKind() == AArch64Kind.DWORD, "int value expected");
        this.resultValue = result;
        this.arrayAValue = arrayA;
        this.arrayBValue = arrayB;
        this.lengthValue = length;
        this.strideValue = stride;
        this.temp = AArch64VectorizedMismatchOp.allocateTempRegisters(tool, 3);
        this.vectorTemp = AArch64VectorizedMismatchOp.allocateVectorRegisters(tool, 5);
    }

    /**
     * Emits the complete mismatch sequence.
     *
     * <p>The array base addresses are copied into two scratch registers so that the comparison
     * code can advance them while the original base of array A stays available for computing
     * offsets. All paths of {@link #emitVectorizedMismatch} end at the {@code end} label with a
     * byte-based value in the result register, which is then converted to an element index.
     */
    @Override
    public void emitCode(CompilationResultBuilder crb, AArch64MacroAssembler masm) {
        try (AArch64MacroAssembler.ScratchRegister sc1 = masm.getScratchRegister();
             AArch64MacroAssembler.ScratchRegister sc2 = masm.getScratchRegister();){
            Register arrayA = sc1.getRegister();
            Register arrayB = sc2.getRegister();
            Register length = ValueUtil.asRegister((Value)this.temp[0]);
            Register tmp = ValueUtil.asRegister((Value)this.temp[1]);
            Register ret = ValueUtil.asRegister((Value)this.resultValue);
            Label end = new Label();
            // Work on copies of the base addresses; the inputs themselves are left untouched.
            masm.mov(64, arrayA, ValueUtil.asRegister((Value)this.arrayAValue));
            masm.mov(64, arrayB, ValueUtil.asRegister((Value)this.arrayBValue));
            this.emitVectorizedMismatch(masm, arrayA, arrayB, length, tmp, ret, end);
            masm.align(16);
            masm.bind(end);
            // Convert the byte index to an element index. The arithmetic shift keeps negative
            // "no mismatch" values negative (-1 << stride becomes -1 again).
            masm.asr(64, ret, ret, ValueUtil.asRegister((Value)this.strideValue));
        }
    }

    /**
     * Emits the comparison code proper. Every path either jumps to {@code end} or falls through
     * the end of the emitted sequence (the "equal" case), in both cases leaving a value in
     * {@code ret} that the caller still has to shift right by the stride.
     *
     * @param asm assembler to emit into
     * @param arrayA scratch register holding the current read position in array A (modified)
     * @param arrayB scratch register holding the current read position in array B (modified)
     * @param len register receiving the byte length and subsequent remaining-length bookkeeping
     * @param tmp general-purpose scratch register
     * @param ret result register
     * @param end label bound by the caller directly after this sequence
     */
    private void emitVectorizedMismatch(AArch64MacroAssembler asm, Register arrayA, Register arrayB, Register len, Register tmp, Register ret, Label end) {
        Label tailLessThan32 = new Label();
        Label tailLessThan16 = new Label();
        Label tailLessThan8 = new Label();
        Label tailLessThan4 = new Label();
        Label tailLessThan2 = new Label();
        Label vectorLoop = new Label();
        Label diffFound = new Label();
        Label retEqual = new Label();
        Register vecArrayA1 = ValueUtil.asRegister((Value)this.vectorTemp[0]);
        Register vecArrayA2 = ValueUtil.asRegister((Value)this.vectorTemp[1]);
        Register vecArrayB1 = ValueUtil.asRegister((Value)this.vectorTemp[2]);
        Register vecArrayB2 = ValueUtil.asRegister((Value)this.vectorTemp[3]);
        Register vecMask = ValueUtil.asRegister((Value)this.vectorTemp[4]);
        Register refAddress = ValueUtil.asRegister((Value)this.temp[2]);
        // len = length << stride, i.e. the length in bytes.
        // NOTE(review): this shift is 32-bit while all later arithmetic on len is 64-bit;
        // confirm the byte length can never exceed 32 bits.
        asm.lsl(32, len, ValueUtil.asRegister((Value)this.lengthValue), ValueUtil.asRegister((Value)this.strideValue));
        // Prepare the mask register consumed by calcIndexOfFirstMatch below (tmp is passed as scratch).
        AArch64VectorizedMismatchOp.initCalcIndexOfFirstMatchMask(asm, vecMask, tmp);
        // len -= 32; a negative result means fewer than 32 bytes, handled by the tail code.
        asm.subs(64, len, len, 32);
        asm.branchConditionally(AArch64Assembler.ConditionFlag.MI, tailLessThan32);
        Label fastPathContinue = new Label();
        // Quick check of the first 8 bytes using scalar loads.
        asm.ldr(64, ret, AArch64Address.createBaseRegisterOnlyAddress(64, arrayA));
        asm.ldr(64, tmp, AArch64Address.createBaseRegisterOnlyAddress(64, arrayB));
        // refAddress = arrayA + (byteLength - 32): start of the last full 32-byte chunk of array A.
        asm.add(64, refAddress, arrayA, len);
        // Load the first 32 bytes of both arrays; the post-indexed mode advances both pointers by 32.
        asm.fldp(128, vecArrayA1, vecArrayA2, AArch64Address.createImmediateAddress(128, AArch64Address.AddressingMode.IMMEDIATE_PAIR_POST_INDEXED, arrayA, 32));
        asm.fldp(128, vecArrayB1, vecArrayB2, AArch64Address.createImmediateAddress(128, AArch64Address.AddressingMode.IMMEDIATE_PAIR_POST_INDEXED, arrayB, 32));
        asm.eor(64, ret, ret, tmp);
        // First 8 bytes identical: continue with the vector comparison of the chunk already loaded.
        asm.cbz(64, ret, fastPathContinue);
        // Difference within the first 8 bytes: bit-reverse + count-leading-zeros yields the position
        // of the lowest set bit of the XOR; dividing by 8 turns that into a byte index.
        asm.rbit(64, ret, ret);
        asm.clz(64, ret, ret);
        asm.lsr(64, ret, ret, 3L);
        asm.jmp(end);
        asm.align(16);
        // Main loop: compare 32 bytes per iteration.
        asm.bind(vectorLoop);
        asm.fldp(128, vecArrayA1, vecArrayA2, AArch64Address.createImmediateAddress(128, AArch64Address.AddressingMode.IMMEDIATE_PAIR_POST_INDEXED, arrayA, 32));
        asm.fldp(128, vecArrayB1, vecArrayB2, AArch64Address.createImmediateAddress(128, AArch64Address.AddressingMode.IMMEDIATE_PAIR_POST_INDEXED, arrayB, 32));
        asm.bind(fastPathContinue);
        // XOR the chunks (results stay in vecArrayA1/2 for diffFound), OR both halves together and
        // reduce across the vector; the reduced value is non-zero iff the chunks differ.
        asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, vecArrayA1, vecArrayA1, vecArrayB1);
        asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, vecArrayA2, vecArrayA2, vecArrayB2);
        asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, vecArrayB1, vecArrayA1, vecArrayA2);
        asm.neon.umaxvSV(AArch64ASIMDAssembler.ASIMDSize.FullReg, AArch64ASIMDAssembler.ElementSize.Word, vecArrayB1, vecArrayB1);
        asm.fcmpZero(64, vecArrayB1);
        asm.branchConditionally(AArch64Assembler.ConditionFlag.NE, diffFound);
        // Keep looping while the read position is still below the start of the last chunk.
        asm.cmp(64, arrayA, refAddress);
        asm.branchConditionally(AArch64Assembler.ConditionFlag.LO, vectorLoop);
        // Final chunk: rewind both pointers by the same overshoot so that the last 32 bytes of the
        // arrays are compared (this chunk may overlap bytes that were already checked).
        asm.sub(64, tmp, arrayA, refAddress);
        asm.mov(64, arrayA, refAddress);
        asm.sub(64, arrayB, arrayB, tmp);
        asm.fldp(128, vecArrayA1, vecArrayA2, AArch64Address.createImmediateAddress(128, AArch64Address.AddressingMode.IMMEDIATE_PAIR_POST_INDEXED, arrayA, 32));
        asm.fldp(128, vecArrayB1, vecArrayB2, AArch64Address.createImmediateAddress(128, AArch64Address.AddressingMode.IMMEDIATE_PAIR_POST_INDEXED, arrayB, 32));
        asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, vecArrayA1, vecArrayA1, vecArrayB1);
        asm.neon.eorVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, vecArrayA2, vecArrayA2, vecArrayB2);
        asm.neon.orrVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, vecArrayB1, vecArrayA1, vecArrayA2);
        asm.neon.umaxvSV(AArch64ASIMDAssembler.ASIMDSize.FullReg, AArch64ASIMDAssembler.ElementSize.Word, vecArrayB1, vecArrayB1);
        asm.fcmpZero(64, vecArrayB1);
        // No difference in the last chunk either: the regions are equal. Otherwise fall through.
        asm.branchConditionally(AArch64Assembler.ConditionFlag.EQ, retEqual);
        asm.align(16);
        asm.bind(diffFound);
        // vecArrayA1/2 hold the XOR of the current chunk. Testing each byte against itself marks
        // the bytes that differ (presumably as all-ones lanes, per the CMTST instruction).
        asm.neon.cmtstVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, AArch64ASIMDAssembler.ElementSize.Byte, vecArrayA1, vecArrayA1, vecArrayA1);
        asm.neon.cmtstVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, AArch64ASIMDAssembler.ElementSize.Byte, vecArrayA2, vecArrayA2, vecArrayA2);
        // The callback emits tmp = (arrayA - original base of A) - 32, the byte offset of the
        // chunk just compared (arrayA was already advanced past it by the post-indexed load).
        // NOTE(review): where inside the helper's sequence the callback is emitted is defined by
        // calcIndexOfFirstMatch, which is not visible here.
        AArch64VectorizedMismatchOp.calcIndexOfFirstMatch(asm, ret, vecArrayA1, vecArrayA2, vecMask, false, () -> {
            asm.sub(64, tmp, arrayA, ValueUtil.asRegister((Value)this.arrayAValue));
            asm.sub(64, tmp, tmp, 32);
        });
        // ret = chunk offset + (ret >> 1); the helper's raw result is halved here, so it is
        // presumably twice the byte position within the chunk - confirm against the helper.
        asm.add(64, ret, tmp, ret, AArch64Assembler.ShiftType.LSR, 1);
        asm.jmp(end);
        // Tail handling for byte lengths below 32; each stage falls back to the next smaller one.
        AArch64VectorizedMismatchOp.tailLessThan32(asm, arrayA, arrayB, len, tmp, ret, vecArrayA1, vecArrayA2, vecArrayB1, vecArrayB2, vecMask, tailLessThan32, tailLessThan16, retEqual, end);
        AArch64VectorizedMismatchOp.tail(asm, arrayA, arrayB, len, tmp, ret, end, tailLessThan16, tailLessThan8, retEqual, 8);
        AArch64VectorizedMismatchOp.tail(asm, arrayA, arrayB, len, tmp, ret, end, tailLessThan8, tailLessThan4, retEqual, 4);
        AArch64VectorizedMismatchOp.tail(asm, arrayA, arrayB, len, tmp, ret, end, tailLessThan4, tailLessThan2, retEqual, 2);
        asm.bind(tailLessThan2);
        // At this point len == byteLength - 2, so ret = byteLength - 1. A negative value means the
        // length is zero; ret is then -1 and goes straight to the exit as the "no mismatch" result.
        asm.adds(64, ret, len, 1);
        asm.branchConditionally(AArch64Assembler.ConditionFlag.MI, end);
        // Exactly one byte left: compare it directly.
        asm.ldr(8, ret, AArch64Address.createBaseRegisterOnlyAddress(8, arrayA));
        asm.ldr(8, tmp, AArch64Address.createBaseRegisterOnlyAddress(8, arrayB));
        asm.eor(64, ret, ret, tmp);
        asm.cbz(64, ret, retEqual);
        // The single byte differs: mismatch at byte index 0.
        asm.mov(64, ret, AArch64.zr);
        asm.jmp(end);
        asm.align(16);
        asm.bind(retEqual);
        // No mismatch: produce -1 << stride so that the caller's final shift right yields -1.
        // Execution falls through to the 'end' label bound by the caller.
        asm.mov(ret, -1);
        asm.lsl(64, ret, ret, ValueUtil.asRegister((Value)this.strideValue));
    }

    /**
     * Emits a scalar tail stage that handles a remaining byte length of at least {@code nBytes}
     * (and less than twice that, given the order in which the stages are chained) using two
     * loads of {@code nBytes} each: one at the start and one ending at the end of the region.
     *
     * @param asm assembler to emit into
     * @param arrayA register holding the start address of array A
     * @param arrayB register holding the start address of array B
     * @param len remaining-length counter; on entry it is the byte length minus the previous
     *            stage's width, and this stage adds {@code nBytes} to it
     * @param tmp general-purpose scratch register
     * @param ret result register, receives the byte index of the first difference
     * @param end label to jump to once {@code ret} is set
     * @param entry label bound at the start of this stage
     * @param nextTail label of the next smaller stage, taken when fewer than {@code nBytes} remain
     * @param retEqual label to jump to when no difference was found
     * @param nBytes width of the scalar loads in bytes
     */
    private static void tail(AArch64MacroAssembler asm, Register arrayA, Register arrayB, Register len, Register tmp, Register ret, Label end, Label entry, Label nextTail, Label retEqual, int nBytes) {
        Label endOfArray = new Label();
        int bits = nBytes << 3;
        asm.bind(entry);
        // After the add, len == byteLength - nBytes; negative means too short for this stage.
        asm.adds(64, len, len, nBytes);
        asm.branchConditionally(AArch64Assembler.ConditionFlag.MI, nextTail);
        // Compare the first nBytes of both arrays.
        asm.ldr(bits, ret, AArch64Address.createBaseRegisterOnlyAddress(bits, arrayA));
        asm.ldr(bits, tmp, AArch64Address.createBaseRegisterOnlyAddress(bits, arrayB));
        asm.eor(64, ret, ret, tmp);
        asm.cbz(64, ret, endOfArray);
        // Position of the lowest set bit of the XOR, divided by 8, is the byte index. The
        // operation size is at least 32 bits; narrower operand sizes are not used here.
        asm.rbit(Math.max(bits, 32), ret, ret);
        asm.clz(Math.max(bits, 32), ret, ret);
        asm.lsr(64, ret, ret, 3L);
        asm.jmp(end);
        asm.bind(endOfArray);
        // Compare the last nBytes (offset len from the start); this may overlap the first load.
        asm.ldr(bits, ret, AArch64Address.createRegisterOffsetAddress(bits, arrayA, len, false));
        asm.ldr(bits, tmp, AArch64Address.createRegisterOffsetAddress(bits, arrayB, len, false));
        asm.eor(64, ret, ret, tmp);
        asm.cbz(64, ret, retEqual);
        asm.rbit(Math.max(bits, 32), ret, ret);
        asm.clz(Math.max(bits, 32), ret, ret);
        // Byte index = offset of the second load + byte position within it.
        asm.add(64, ret, len, ret, AArch64Assembler.ShiftType.LSR, 3);
        asm.jmp(end);
    }

    /**
     * Emits the tail stage for a byte length of at least 16 (and, as reached from
     * {@link #emitVectorizedMismatch}, less than 32) using two 16-byte vector loads per array:
     * one at the start and one ending at the end of the region.
     *
     * @param asm assembler to emit into
     * @param arrayA register holding the start address of array A
     * @param arrayB register holding the start address of array B
     * @param len remaining-length counter; on entry byte length minus 32, 16 is added here
     * @param tmp general-purpose scratch register
     * @param ret result register, receives the byte index of the first difference
     * @param vecArrayA1 vector register for the first 16 bytes of A (overwritten with the comparison result)
     * @param vecArrayA2 vector register for the last 16 bytes of A (overwritten with the comparison result)
     * @param vecArrayB1 vector register for the first 16 bytes of B
     * @param vecArrayB2 vector register for the last 16 bytes of B
     * @param vecMask mask register prepared by {@code initCalcIndexOfFirstMatchMask}
     * @param entry label bound at the start of this stage
     * @param nextTail label of the next smaller stage, taken when fewer than 16 bytes remain
     * @param retEqual label to jump to when no difference was found
     * @param end label to jump to once {@code ret} is set
     */
    private static void tailLessThan32(AArch64MacroAssembler asm, Register arrayA, Register arrayB, Register len, Register tmp, Register ret, Register vecArrayA1, Register vecArrayA2, Register vecArrayB1, Register vecArrayB2, Register vecMask, Label entry, Label nextTail, Label retEqual, Label end) {
        asm.bind(entry);
        // After the add, len == byteLength - 16; negative means too short for this stage.
        asm.adds(64, len, len, 16);
        asm.branchConditionally(AArch64Assembler.ConditionFlag.MI, nextTail);
        // First 16 bytes and last 16 bytes (offset len) of each array; the two may overlap.
        asm.fldr(128, vecArrayA1, AArch64Address.createBaseRegisterOnlyAddress(128, arrayA));
        asm.fldr(128, vecArrayB1, AArch64Address.createBaseRegisterOnlyAddress(128, arrayB));
        asm.fldr(128, vecArrayA2, AArch64Address.createRegisterOffsetAddress(128, arrayA, len, false));
        asm.fldr(128, vecArrayB2, AArch64Address.createRegisterOffsetAddress(128, arrayB, len, false));
        // Byte-wise equality comparison of the corresponding vectors.
        asm.neon.cmeqVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, AArch64ASIMDAssembler.ElementSize.Byte, vecArrayA1, vecArrayA1, vecArrayB1);
        asm.neon.cmeqVVV(AArch64ASIMDAssembler.ASIMDSize.FullReg, AArch64ASIMDAssembler.ElementSize.Byte, vecArrayA2, vecArrayA2, vecArrayB2);
        // The callback branches to retEqual when ret is zero at the point where the helper emits it.
        // NOTE(review): the meaning of the boolean flag (true here, false in diffFound) is defined
        // by calcIndexOfFirstMatch; presumably it tells the helper that the inputs are equality
        // masks rather than difference masks - confirm.
        AArch64VectorizedMismatchOp.calcIndexOfFirstMatch(asm, ret, vecArrayA1, vecArrayA2, vecMask, true, () -> asm.cbz(64, ret, retEqual));
        // ret >>= 1 gives the position within the 32 compared bytes (same halving as in diffFound).
        // Positions below 16 belong to the first vector and are used as-is; positions in the second
        // vector are rebased by (len - 16) because that vector starts at offset len, not 16.
        asm.sub(64, tmp, len, 16);
        asm.lsr(64, ret, ret, 1L);
        asm.add(64, tmp, ret, tmp);
        asm.compare(64, ret, 16);
        asm.csel(64, ret, ret, tmp, AArch64Assembler.ConditionFlag.LO);
        asm.jmp(end);
    }
}

