/*
 * Decompiled with CFR 0.152.
 */
package org.graalvm.compiler.lir.amd64;

import java.util.Arrays;
import java.util.Objects;
import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.code.RegisterValue;
import jdk.vm.ci.code.ValueUtil;
import jdk.vm.ci.meta.AllocatableValue;
import jdk.vm.ci.meta.JavaKind;
import jdk.vm.ci.meta.PlatformKind;
import jdk.vm.ci.meta.Value;
import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Assembler;
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
import org.graalvm.compiler.asm.amd64.AVXKind;
import org.graalvm.compiler.core.common.LIRKind;
import org.graalvm.compiler.core.common.StrideUtil;
import org.graalvm.compiler.lir.LIRInstruction;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.Opcode;
import org.graalvm.compiler.lir.amd64.AMD64ComplexVectorOp;
import org.graalvm.compiler.lir.amd64.AMD64ControlFlow;
import org.graalvm.compiler.lir.amd64.AMD64StrideUtil;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
import org.graalvm.compiler.lir.gen.LIRGeneratorTool;

/**
 * LIR instruction that emits AMD64 code copying {@code length} elements from a source array
 * region to a destination array region, where source and destination elements may have
 * different widths (1, 2 or 4 bytes).
 *
 * <p>Depending on the two element widths the emitted code is one of:
 * <ul>
 * <li>a plain copy (equal widths), see {@link #emitCopy},</li>
 * <li>an inflation (source narrower than destination), see {@link #emitInflate}; the kind of
 * extension is selected by {@code extendMode},</li>
 * <li>a compression (source wider than destination), see {@link #emitCompress}.</li>
 * </ul>
 *
 * <p>The widths are either fixed when the instruction is created ({@code scaleSrcConst} /
 * {@code scaleDstConst}) or chosen at run time: when {@code dynamicStrides} is a legal value,
 * {@link #emitCode} emits all nine width combinations and a jump table that selects one of them.
 *
 * <p>All operands are moved into fixed registers by {@code movParamsAndCreate} before the
 * instruction is created; instances can only be obtained through those factory methods.
 *
 * <p>NOTE(review): the vectorized compress paths call {@code packuswb}/{@code packusdw} while the
 * scalar loop stores only the low byte/word of each element. Presumably callers guarantee that
 * every source value fits into the destination width, so both paths agree - confirm.
 */
@Opcode(value="ARRAY_COPY_WITH_CONVERSIONS")
public final class AMD64ArrayCopyWithConversionsOp
extends AMD64ComplexVectorOp {
    public static final LIRInstructionClass<AMD64ArrayCopyWithConversionsOp> TYPE = LIRInstructionClass.create(AMD64ArrayCopyWithConversionsOp.class);
    // Fixed registers the operands are moved into by movParamsAndCreate.
    private static final Register REG_ARRAY_SRC = AMD64.rsi;
    private static final Register REG_OFFSET_SRC = AMD64.rax;
    private static final Register REG_ARRAY_DST = AMD64.rdi;
    private static final Register REG_OFFSET_DST = AMD64.rcx;
    private static final Register REG_LENGTH = AMD64.rdx;
    private static final Register REG_STRIDE = AMD64.r8;
    // Constant element scales; both are null when the strides are selected at run time.
    private final AMD64Address.Scale scaleSrcConst;
    private final AMD64Address.Scale scaleDstConst;
    // Extension mode handed to pmovSZx by the vectorized inflate paths.
    private final AMD64MacroAssembler.ExtendMode extendMode;
    // Input operands. emitCode adds each offset to its array register with scale 1.
    @LIRInstruction.Use(value={LIRInstruction.OperandFlag.REG})
    private Value arraySrc;
    @LIRInstruction.Use(value={LIRInstruction.OperandFlag.REG})
    private Value offsetSrc;
    @LIRInstruction.Use(value={LIRInstruction.OperandFlag.REG})
    private Value arrayDst;
    @LIRInstruction.Use(value={LIRInstruction.OperandFlag.REG})
    private Value offsetDst;
    @LIRInstruction.Use(value={LIRInstruction.OperandFlag.REG})
    private Value length;
    // Run-time stride selector; Value.ILLEGAL when the constant scales are used instead.
    @LIRInstruction.Use(value={LIRInstruction.OperandFlag.REG, LIRInstruction.OperandFlag.ILLEGAL})
    private Value dynamicStrides;
    // The constructor assigns the very same values to these temp fields: the emitted code
    // overwrites the input registers (e.g. leaq into src/dst, negq on len, offsets reused as scratch).
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG})
    private Value arraySrcTmp;
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG})
    private Value offsetSrcTmp;
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG})
    private Value arrayDstTmp;
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG})
    private Value offsetDstTmp;
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG})
    private Value lengthTmp;
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG, LIRInstruction.OperandFlag.ILLEGAL})
    private Value dynamicStridesTmp;
    // Vector scratch registers; the count depends on the operation (see getNumberOfRequiredVectorRegisters).
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG})
    private Value[] vectorTemp;

    /**
     * Creates the instruction. The operand values are expected to already be the fixed registers
     * set up by {@code movParamsAndCreate}.
     *
     * @param tool generator used to query the target and to allocate the vector temporaries
     * @param strideSrc source element kind; only read when constant strides are used
     * @param strideDst destination element kind; only read when constant strides are used
     * @param dynamicStrides run-time stride selector, or {@code Value.ILLEGAL} for constant strides
     * @param extendMode extension mode used by the inflate variants
     */
    private AMD64ArrayCopyWithConversionsOp(LIRGeneratorTool tool, JavaKind strideSrc, JavaKind strideDst, Value arraySrc, Value offsetSrc, Value arrayDst, Value offsetDst, Value length, Value dynamicStrides, AMD64MacroAssembler.ExtendMode extendMode) {
        super(TYPE, tool, AVXKind.AVXSize.YMM);
        this.extendMode = extendMode;
        assert (((AMD64)tool.target().arch).getFeatures().contains(AMD64.CPUFeature.SSE2));
        // Every operand is recorded both as input and as temp (same value in both fields).
        this.arraySrcTmp = this.arraySrc = arraySrc;
        this.offsetSrcTmp = this.offsetSrc = offsetSrc;
        this.arrayDstTmp = this.arrayDst = arrayDst;
        this.offsetDstTmp = this.offsetDst = offsetDst;
        this.lengthTmp = this.length = length;
        this.dynamicStridesTmp = this.dynamicStrides = dynamicStrides;
        if (StrideUtil.useConstantStrides(dynamicStrides)) {
            assert (strideSrc.isNumericInteger() && strideDst.isNumericInteger());
            // Translate the element kinds into address scales; requireNonNull rejects index
            // scales for which Scale.fromInt has no matching constant.
            this.scaleSrcConst = Objects.requireNonNull(AMD64Address.Scale.fromInt(tool.getProviders().getMetaAccess().getArrayIndexScale(strideSrc)));
            this.scaleDstConst = Objects.requireNonNull(AMD64Address.Scale.fromInt(tool.getProviders().getMetaAccess().getArrayIndexScale(strideDst)));
            // Only as many vector temps as the single emitted variant needs.
            this.vectorTemp = new Value[AMD64ArrayCopyWithConversionsOp.getNumberOfRequiredVectorRegisters(AMD64ArrayCopyWithConversionsOp.getOp(this.scaleDstConst, this.scaleSrcConst))];
        } else {
            this.scaleSrcConst = null;
            this.scaleDstConst = null;
            // All variants are emitted, so reserve the maximum any of them needs
            // (5, the value getNumberOfRequiredVectorRegisters returns for compressIntToByte).
            this.vectorTemp = new Value[5];
        }
        for (int i = 0; i < this.vectorTemp.length; ++i) {
            this.vectorTemp[i] = tool.newVariable(LIRKind.value((PlatformKind)this.getVectorKind(JavaKind.Byte)));
        }
    }

    /**
     * Creates the instruction with element widths fixed by {@code strideSrc} and
     * {@code strideDst}. Delegates to the private factory with {@code Value.ILLEGAL} as the
     * dynamic stride selector.
     */
    public static AMD64ArrayCopyWithConversionsOp movParamsAndCreate(LIRGeneratorTool tool, JavaKind strideSrc, JavaKind strideDst, Value arraySrc, Value offsetSrc, Value arrayDst, Value offsetDst, Value length, AMD64MacroAssembler.ExtendMode extendMode) {
        return AMD64ArrayCopyWithConversionsOp.movParamsAndCreate(tool, strideSrc, strideDst, arraySrc, offsetSrc, arrayDst, offsetDst, length, (Value)Value.ILLEGAL, extendMode);
    }

    /**
     * Creates the instruction with element widths selected at run time by {@code stride}.
     * Delegates to the private factory with {@code null} element kinds.
     */
    public static AMD64ArrayCopyWithConversionsOp movParamsAndCreate(LIRGeneratorTool tool, Value arraySrc, Value offsetSrc, Value arrayDst, Value offsetDst, Value length, Value stride, AMD64MacroAssembler.ExtendMode extendMode) {
        return AMD64ArrayCopyWithConversionsOp.movParamsAndCreate(tool, null, null, arraySrc, offsetSrc, arrayDst, offsetDst, length, stride, extendMode);
    }

    /**
     * Emits moves of all operands into the fixed {@code REG_*} registers and then creates the
     * instruction on those registers.
     *
     * @param dynamicStrides run-time stride selector; when illegal, no stride register is set up
     *            and {@code Value.ILLEGAL} is passed on to the constructor
     * @return the new instruction operating on the fixed registers
     */
    private static AMD64ArrayCopyWithConversionsOp movParamsAndCreate(LIRGeneratorTool tool, JavaKind strideSrc, JavaKind strideDst, Value arraySrc, Value offsetSrc, Value arrayDst, Value offsetDst, Value length, Value dynamicStrides, AMD64MacroAssembler.ExtendMode extendMode) {
        AllocatableValue regDynamicStrides;
        // Each fixed register is typed with the value kind of the operand it receives.
        RegisterValue regArraySrc = REG_ARRAY_SRC.asValue(arraySrc.getValueKind());
        RegisterValue regOffsetSrc = REG_OFFSET_SRC.asValue(offsetSrc.getValueKind());
        RegisterValue regArrayDst = REG_ARRAY_DST.asValue(arrayDst.getValueKind());
        RegisterValue regOffsetDst = REG_OFFSET_DST.asValue(offsetDst.getValueKind());
        RegisterValue regLength = REG_LENGTH.asValue(length.getValueKind());
        // The two array operands go through emitConvertNullToZero, the others through a plain move.
        // NOTE(review): presumably this maps a null array reference to address 0 - see LIRGeneratorTool.
        tool.emitConvertNullToZero((AllocatableValue)regArraySrc, arraySrc);
        tool.emitMove((AllocatableValue)regOffsetSrc, offsetSrc);
        tool.emitConvertNullToZero((AllocatableValue)regArrayDst, arrayDst);
        tool.emitMove((AllocatableValue)regOffsetDst, offsetDst);
        tool.emitMove((AllocatableValue)regLength, length);
        if (ValueUtil.isIllegal((Value)dynamicStrides)) {
            regDynamicStrides = Value.ILLEGAL;
        } else {
            regDynamicStrides = REG_STRIDE.asValue(dynamicStrides.getValueKind());
            tool.emitMove((AllocatableValue)((RegisterValue)regDynamicStrides), dynamicStrides);
        }
        return new AMD64ArrayCopyWithConversionsOp(tool, strideSrc, strideDst, (Value)regArraySrc, (Value)regOffsetSrc, (Value)regArrayDst, (Value)regOffsetDst, (Value)regLength, (Value)regDynamicStrides, extendMode);
    }

    /**
     * Maps a pair of element scales to the operation that converts between them.
     *
     * <p>Note the parameter order: destination first, source second.
     *
     * @param scaleDst destination element scale
     * @param scaleSrc source element scale
     * @return {@code copy} for equal scales, a {@code compress*} op when the destination is
     *         narrower, an {@code inflate*} op when it is wider
     * @throws UnsupportedOperationException for scale combinations other than 1, 2 and 4 bytes
     */
    private static Op getOp(AMD64Address.Scale scaleDst, AMD64Address.Scale scaleSrc) {
        if (scaleDst.value == scaleSrc.value) {
            return Op.copy;
        }
        if (scaleDst.value < scaleSrc.value) {
            // Destination narrower than source: compression.
            switch (scaleSrc) {
                case Times2: {
                    assert (scaleDst == AMD64Address.Scale.Times1);
                    return Op.compressCharToByte;
                }
                case Times4: {
                    switch (scaleDst) {
                        case Times1: {
                            return Op.compressIntToByte;
                        }
                        case Times2: {
                            return Op.compressIntToChar;
                        }
                    }
                    throw new UnsupportedOperationException();
                }
            }
            throw new UnsupportedOperationException();
        }
        // Destination wider than source: inflation.
        switch (scaleSrc) {
            case Times1: {
                switch (scaleDst) {
                    case Times2: {
                        return Op.inflateByteToChar;
                    }
                    case Times4: {
                        return Op.inflateByteToInt;
                    }
                }
                throw new UnsupportedOperationException();
            }
            case Times2: {
                assert (scaleDst == AMD64Address.Scale.Times4);
                return Op.inflateCharToInt;
            }
        }
        throw new UnsupportedOperationException();
    }

    /**
     * Returns how many entries of {@code vectorTemp} the given operation uses: 2 for the
     * single-step compressions, 5 for int-to-byte compression (four data vectors plus the
     * permutation mask held in {@code vectorTemp[4]}), and 1 for everything else.
     */
    private static int getNumberOfRequiredVectorRegisters(Op op) {
        switch (op) {
            case compressCharToByte: 
            case compressIntToChar: {
                return 2;
            }
            case compressIntToByte: {
                return 5;
            }
        }
        return 1;
    }

    /**
     * Emits the machine code for this instruction.
     *
     * <p>First folds the byte offsets into the array registers. With constant strides exactly one
     * variant is emitted. With dynamic strides a jump table dispatches on the
     * {@code dynamicStrides} register to one of nine variants (three source widths times three
     * destination widths); every variant ends at the common {@code end} label.
     */
    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
        Register src = ValueUtil.asRegister((Value)this.arraySrc);
        Register sro = ValueUtil.asRegister((Value)this.offsetSrc);
        Register dst = ValueUtil.asRegister((Value)this.arrayDst);
        Register dso = ValueUtil.asRegister((Value)this.offsetDst);
        Register len = ValueUtil.asRegister((Value)this.length);
        // src += sro; dst += dso (scale 1). After this the two offset registers are free:
        // sro is handed to emitOp as scratch register, dso to emitJumpTable.
        asm.leaq(src, new AMD64Address(src, sro, AMD64Address.Scale.Times1));
        asm.leaq(dst, new AMD64Address(dst, dso, AMD64Address.Scale.Times1));
        if (ValueUtil.isIllegal((Value)this.dynamicStrides)) {
            this.emitOp(crb, asm, this.scaleSrcConst, this.scaleDstConst, src, sro, dst, len);
        } else {
            // One label per (source scale, destination scale) combination.
            Label[] variants = new Label[9];
            Label end = new Label();
            for (int i = 0; i < variants.length; ++i) {
                variants[i] = new Label();
            }
            // NOTE(review): dso is presumably the scratch register of the jump table and 0..8 the
            // key range matching the nine labels - confirm against RangeTableSwitchOp.emitJumpTable.
            AMD64ControlFlow.RangeTableSwitchOp.emitJumpTable(crb, asm, dso, ValueUtil.asRegister((Value)this.dynamicStrides), 0, 8, Arrays.stream(variants));
            // The three equal-width variants share the byte copy: the 4-4 entry falls through
            // both shifts (len *= 4), the 2-2 entry through one (len *= 2), so len becomes a byte count.
            asm.align(crb.target.wordSize * 2);
            asm.bind(variants[AMD64StrideUtil.getDirectStubCallIndex(AMD64Address.Scale.Times4, AMD64Address.Scale.Times4)]);
            asm.shll(len, 1);
            asm.align(crb.target.wordSize * 2);
            asm.bind(variants[AMD64StrideUtil.getDirectStubCallIndex(AMD64Address.Scale.Times2, AMD64Address.Scale.Times2)]);
            asm.shll(len, 1);
            asm.align(crb.target.wordSize * 2);
            asm.bind(variants[AMD64StrideUtil.getDirectStubCallIndex(AMD64Address.Scale.Times1, AMD64Address.Scale.Times1)]);
            this.emitOp(crb, asm, AMD64Address.Scale.Times1, AMD64Address.Scale.Times1, src, sro, dst, len);
            asm.jmp(end);
            // The remaining six variants (source width != destination width) each get their own code.
            for (AMD64Address.Scale scaleSrc : new AMD64Address.Scale[]{AMD64Address.Scale.Times1, AMD64Address.Scale.Times2, AMD64Address.Scale.Times4}) {
                for (AMD64Address.Scale scaleDst : new AMD64Address.Scale[]{AMD64Address.Scale.Times1, AMD64Address.Scale.Times2, AMD64Address.Scale.Times4}) {
                    if (scaleSrc == scaleDst) continue;
                    asm.align(crb.target.wordSize * 2);
                    asm.bind(variants[AMD64StrideUtil.getDirectStubCallIndex(scaleSrc, scaleDst)]);
                    this.emitOp(crb, asm, scaleSrc, scaleDst, src, sro, dst, len);
                    asm.jmp(end);
                }
            }
            asm.bind(end);
        }
    }

    /**
     * Emits one variant for the given scales: inflate when the source is narrower, copy when the
     * widths are equal, compress otherwise. {@code sro} is passed on as the scratch register
     * ({@code tmp}) of the selected emitter.
     */
    private void emitOp(CompilationResultBuilder crb, AMD64MacroAssembler asm, AMD64Address.Scale scaleSrc, AMD64Address.Scale scaleDst, Register src, Register sro, Register dst, Register len) {
        if (scaleSrc.value < scaleDst.value) {
            this.emitInflate(crb, asm, scaleSrc, scaleDst, src, dst, len, sro);
        } else if (scaleSrc.value == scaleDst.value) {
            this.emitCopy(crb, asm, scaleSrc, scaleDst, src, dst, len, sro);
        } else {
            this.emitCompress(crb, asm, scaleSrc, scaleDst, src, dst, len, sro);
        }
    }

    /**
     * Emits a narrowing copy (source elements wider than destination elements).
     *
     * <p>When SSE4.2 is available a vectorized path is emitted: a main loop over full vectors, a
     * final vector that ends exactly at the last element (it may overlap elements that were
     * already written), and smaller head/tail pairs for inputs shorter than one vector. Whatever
     * is too short for those falls through to the scalar loop, which is also the only code
     * emitted without SSE4.2.
     *
     * @param src register holding the source address; modified by the emitted code
     * @param dst register holding the destination address; modified by the emitted code
     * @param len register holding the element count; modified by the emitted code
     * @param tmp scratch register
     */
    private void emitCompress(CompilationResultBuilder crb, AMD64MacroAssembler asm, AMD64Address.Scale scaleSrc, AMD64Address.Scale scaleDst, Register src, Register dst, Register len, Register tmp) {
        Op op = AMD64ArrayCopyWithConversionsOp.getOp(scaleDst, scaleSrc);
        Label labelScalarLoop = new Label();
        Label labelDone = new Label();
        if (asm.supports(AMD64.CPUFeature.SSE4_2)) {
            Label labelPackVectorLoop = new Label();
            Label labelPack16Bytes = new Label();
            Label labelPack8Bytes = new Label();
            Label labelCopyTail = new Label();
            // Number of destination elements produced per vector store.
            int vectorLength = this.vectorSize.getBytes() / scaleDst.value;
            asm.movl(tmp, len);
            // tmp = tail element count (len modulo vectorLength)
            asm.andl(tmp, vectorLength - 1);
            // len = element count covered by full vectors; if none, go to the small-input paths
            asm.andlAndJcc(len, -vectorLength, AMD64Assembler.ConditionFlag.Zero, this.supportsAVX2AndYMM() ? labelPack16Bytes : labelPack8Bytes, false);
            if (this.supportsAVX2AndYMM() && op == Op.compressIntToByte) {
                // Little-endian 32-bit indices 0,4,1,5,2,6,3,7: the dword permutation that
                // packVector applies with VPERMD after the two-stage pack.
                this.loadMask(crb, asm, ValueUtil.asRegister((Value)this.vectorTemp[4]), new byte[]{0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0});
            }
            // Point src/dst at the end of the full-vector region and let len run from -count up to 0.
            asm.leaq(src, new AMD64Address(src, len, scaleSrc));
            asm.leaq(dst, new AMD64Address(dst, len, scaleDst));
            asm.negq(len);
            // vectorized main loop
            asm.align(crb.target.wordSize * 2);
            asm.bind(labelPackVectorLoop);
            this.packVector(asm, this.vectorSize, op, scaleSrc, scaleDst, src, dst, len, 0, false);
            asm.addqAndJcc(len, vectorLength, AMD64Assembler.ConditionFlag.NotZero, labelPackVectorLoop, true);
            // vectorized tail: one more vector whose store ends at the last (tail) element
            this.packVector(asm, this.vectorSize, op, scaleSrc, scaleDst, src, dst, tmp, -this.vectorSize.getBytes(), false);
            asm.jmp(labelDone);
            if (this.supportsAVX2AndYMM()) {
                // Fewer elements than one YMM vector: try an XMM-sized head/tail pair.
                asm.bind(labelPack16Bytes);
                int vectorSizeXMM = AVXKind.AVXSize.XMM.getBytes() / scaleDst.value;
                asm.cmplAndJcc(tmp, vectorSizeXMM, AMD64Assembler.ConditionFlag.Less, labelPack8Bytes, true);
                // head (direct addressing from src/dst) ...
                this.packVector(asm, AVXKind.AVXSize.XMM, op, scaleSrc, scaleDst, src, dst, len, 0, true);
                // ... and tail ending at the last element
                this.packVector(asm, AVXKind.AVXSize.XMM, op, scaleSrc, scaleDst, src, dst, tmp, -AVXKind.AVXSize.XMM.getBytes(), false);
                asm.jmpb(labelDone);
            }
            asm.bind(labelPack8Bytes);
            int vectorSizeQ = 8 / scaleDst.value;
            asm.cmplAndJcc(tmp, vectorSizeQ, AMD64Assembler.ConditionFlag.Less, labelCopyTail, true);
            // Too small for full vectors: 8-byte head and 8-byte tail stores.
            this.pack8Bytes(asm, op, scaleSrc, scaleDst, src, dst, len, 0, true);
            this.pack8Bytes(asm, op, scaleSrc, scaleDst, src, dst, tmp, -8, false);
            asm.jmpb(labelDone);
            // Fewer than 8 destination bytes: hand the remaining count to the scalar loop.
            asm.bind(labelCopyTail);
            asm.movl(len, tmp);
        }
        // scalar loop: element by element, len counting from -count up to 0
        asm.testlAndJcc(len, len, AMD64Assembler.ConditionFlag.Zero, labelDone, true);
        asm.leaq(src, new AMD64Address(src, len, scaleSrc));
        asm.leaq(dst, new AMD64Address(dst, len, scaleDst));
        asm.negq(len);
        asm.bind(labelScalarLoop);
        switch (op) {
            case compressCharToByte: {
                asm.movzwl(tmp, new AMD64Address(src, len, scaleSrc));
                asm.movb(new AMD64Address(dst, len, scaleDst), tmp);
                break;
            }
            case compressIntToChar: {
                asm.movl(tmp, new AMD64Address(src, len, scaleSrc));
                asm.movw(new AMD64Address(dst, len, scaleDst), tmp);
                break;
            }
            case compressIntToByte: {
                asm.movl(tmp, new AMD64Address(src, len, scaleSrc));
                asm.movb(new AMD64Address(dst, len, scaleDst), tmp);
            }
        }
        asm.incqAndJcc(len, AMD64Assembler.ConditionFlag.NotZero, labelScalarLoop, true);
        asm.bind(labelDone);
    }

    /**
     * Records {@code mask} as a data constant referenced from the code and emits a
     * {@code vectorSize}-wide unaligned load of it into {@code vecMask}.
     */
    private void loadMask(CompilationResultBuilder crb, AMD64MacroAssembler asm, Register vecMask, byte[] mask) {
        int align = crb.dataBuilder.ensureValidDataAlignment(mask.length);
        AMD64MacroAssembler.movdqu(asm, this.vectorSize, vecMask, (AMD64Address)crb.recordDataReferenceInCode(mask, align));
    }

    /**
     * Emits code that loads two (or, for int-to-byte, four) source vectors, packs them into one
     * destination vector and stores it.
     *
     * @param vecSize vector width to use for the loads, packs and the store
     * @param index index register used for the store and, unless {@code direct}, for the loads
     * @param displacement byte displacement on the destination side; the source displacement is
     *            derived from it by scaling with the width ratio
     * @param direct if true, the loads address {@code src} directly and ignore {@code index} and
     *            the displacement (see {@link #indexAddressOrDirect})
     */
    private void packVector(AMD64MacroAssembler asm, AVXKind.AVXSize vecSize, Op op, AMD64Address.Scale scaleSrc, AMD64Address.Scale scaleDst, Register src, Register dst, Register index, int displacement, boolean direct) {
        // '-' binds tighter than '<<': displacement << (scaleSrc.log2 - scaleDst.log2)
        int displacementSrc = displacement << scaleSrc.log2 - scaleDst.log2;
        Register vec1 = ValueUtil.asRegister((Value)this.vectorTemp[0]);
        Register vec2 = ValueUtil.asRegister((Value)this.vectorTemp[1]);
        AMD64MacroAssembler.movdqu(asm, vecSize, vec1, AMD64ArrayCopyWithConversionsOp.indexAddressOrDirect(scaleSrc, src, index, displacementSrc, 0, direct));
        AMD64MacroAssembler.movdqu(asm, vecSize, vec2, AMD64ArrayCopyWithConversionsOp.indexAddressOrDirect(scaleSrc, src, index, displacementSrc, vecSize.getBytes(), direct));
        switch (op) {
            case compressCharToByte: {
                AMD64MacroAssembler.packuswb(asm, vecSize, vec1, vec2);
                break;
            }
            case compressIntToChar: {
                AMD64MacroAssembler.packusdw(asm, vecSize, vec1, vec2);
                break;
            }
            case compressIntToByte: {
                // Two pack stages: four int vectors -> two 16-bit vectors -> one byte vector.
                Register vec3 = ValueUtil.asRegister((Value)this.vectorTemp[2]);
                Register vec4 = ValueUtil.asRegister((Value)this.vectorTemp[3]);
                AMD64MacroAssembler.movdqu(asm, vecSize, vec3, AMD64ArrayCopyWithConversionsOp.indexAddressOrDirect(scaleSrc, src, index, displacementSrc, vecSize.getBytes() * 2, direct));
                AMD64MacroAssembler.movdqu(asm, vecSize, vec4, AMD64ArrayCopyWithConversionsOp.indexAddressOrDirect(scaleSrc, src, index, displacementSrc, vecSize.getBytes() * 3, direct));
                AMD64MacroAssembler.packusdw(asm, vecSize, vec1, vec2);
                AMD64MacroAssembler.packusdw(asm, vecSize, vec3, vec4);
                AMD64MacroAssembler.packuswb(asm, vecSize, vec1, vec3);
            }
        }
        if (vecSize == AVXKind.AVXSize.YMM) {
            // The 256-bit pack instructions work on each 128-bit lane separately, so the result
            // has to be permuted back into element order.
            if (op == Op.compressIntToByte) {
                // dword permutation using the mask loaded into vectorTemp[4] by emitCompress
                AMD64Assembler.VexRVMOp.VPERMD.emit((AMD64Assembler)asm, vecSize, vec1, ValueUtil.asRegister((Value)this.vectorTemp[4]), vec1);
            } else {
                // 216 == 0xD8 == 0b11_01_10_00: qword order 0, 2, 1, 3
                AMD64Assembler.VexRMIOp.VPERMQ.emit((AMD64Assembler)asm, vecSize, vec1, vec1, 216);
            }
        }
        AMD64MacroAssembler.movdqu(asm, vecSize, new AMD64Address(dst, index, scaleDst, displacement), vec1);
    }

    /**
     * Emits code that packs source elements into 8 destination bytes and stores them with a
     * single {@code movq}. Parameters have the same meaning as in {@link #packVector}.
     */
    private void pack8Bytes(AMD64MacroAssembler masm, Op op, AMD64Address.Scale scaleSrc, AMD64Address.Scale scaleDst, Register src, Register dst, Register index, int displacement, boolean direct) {
        Register vec1 = ValueUtil.asRegister((Value)this.vectorTemp[0]);
        Register vec2 = ValueUtil.asRegister((Value)this.vectorTemp[1]);
        // '-' binds tighter than '<<': displacement << (scaleSrc.log2 - scaleDst.log2)
        int displacementSrc = displacement << scaleSrc.log2 - scaleDst.log2;
        masm.movdqu(vec1, AMD64ArrayCopyWithConversionsOp.indexAddressOrDirect(scaleSrc, src, index, displacementSrc, 0, direct));
        switch (op) {
            case compressCharToByte: {
                // Second pack operand is zeroed; only the low 8 bytes of the result are stored.
                masm.pxor(vec2, vec2);
                masm.packuswb(vec1, vec2);
                break;
            }
            case compressIntToChar: {
                masm.pxor(vec2, vec2);
                masm.packusdw(vec1, vec2);
                break;
            }
            case compressIntToByte: {
                // 8 ints (two 16-byte loads) -> 8 words -> 8 bytes in the low half of vec1.
                masm.movdqu(vec2, AMD64ArrayCopyWithConversionsOp.indexAddressOrDirect(scaleSrc, src, index, displacementSrc, 16, direct));
                masm.packusdw(vec1, vec2);
                masm.packuswb(vec1, vec2);
            }
        }
        masm.movq(new AMD64Address(dst, index, scaleDst, displacement), vec1);
    }

    /**
     * Builds a source address. With {@code direct} the result is {@code array + displacement}
     * and {@code index}, {@code scaleSrc} and {@code baseOffset} are ignored; otherwise it is
     * {@code array + index * scaleSrc + baseOffset + displacement}.
     */
    private static AMD64Address indexAddressOrDirect(AMD64Address.Scale scaleSrc, Register array, Register index, int baseOffset, int displacement, boolean direct) {
        return direct ? new AMD64Address(array, displacement) : new AMD64Address(array, index, scaleSrc, baseOffset + displacement);
    }

    /**
     * Emits a widening copy (source elements narrower than destination elements), extending each
     * element according to {@code extendMode} on the vectorized paths.
     *
     * <p>The structure mirrors {@link #emitCompress}: with SSE4.2 a vector main loop plus an
     * overlapping final vector, smaller head/tail pairs for short inputs, and a scalar loop for
     * the rest. Without SSE4.2 only the scalar loop is emitted. The scalar loop always
     * zero-extends ({@code movzbl}/{@code movzwl}).
     *
     * @param src register holding the source address; modified by the emitted code
     * @param dst register holding the destination address; modified by the emitted code
     * @param len register holding the element count; modified by the emitted code
     * @param tmp scratch register
     */
    private void emitInflate(CompilationResultBuilder crb, AMD64MacroAssembler asm, AMD64Address.Scale scaleSrc, AMD64Address.Scale scaleDst, Register src, Register dst, Register len, Register tmp) {
        Op op = AMD64ArrayCopyWithConversionsOp.getOp(scaleDst, scaleSrc);
        Register vec = ValueUtil.asRegister((Value)this.vectorTemp[0]);
        Label labelScalarLoop = new Label();
        Label labelDone = new Label();
        asm.movl(tmp, len);
        if (asm.supports(AMD64.CPUFeature.SSE4_2)) {
            Label labelMainLoop = new Label();
            Label labelSkipXMMHalf = new Label();
            Label labelXMMTail = new Label();
            Label labelTail = new Label();
            // Destination elements per full vector / per XMM vector.
            int vectorLength = this.vectorSize.getBytes() / scaleDst.value;
            int vectorLengthXMM = AVXKind.AVXSize.XMM.getBytes() / scaleDst.value;
            // log2 of the width ratio; converts destination byte displacements to source ones.
            int scaleDelta = scaleDst.log2 - scaleSrc.log2;
            // tmp = tail element count, len = element count covered by full vectors
            asm.andl(tmp, vectorLength - 1);
            asm.andlAndJcc(len, -vectorLength, AMD64Assembler.ConditionFlag.Zero, this.supportsAVX2AndYMM() ? labelXMMTail : labelTail, true);
            // Point src/dst at the end of the full-vector region and let len run from -count up to 0.
            asm.leaq(src, new AMD64Address(src, len, scaleSrc));
            asm.leaq(dst, new AMD64Address(dst, len, scaleDst));
            asm.negq(len);
            // vectorized main loop
            asm.align(crb.target.wordSize * 2);
            asm.bind(labelMainLoop);
            AMD64MacroAssembler.pmovSZx(asm, this.vectorSize, this.extendMode, vec, scaleDst, new AMD64Address(src, len, scaleSrc), scaleSrc);
            AMD64MacroAssembler.movdqu(asm, this.vectorSize, new AMD64Address(dst, len, scaleDst), vec);
            asm.addqAndJcc(len, vectorLength, AMD64Assembler.ConditionFlag.NotZero, labelMainLoop, true);
            // vectorized tail: one more vector whose store ends at the last element.
            // Unary '-' binds tighter than '>>': (-vectorSize bytes) >> scaleDelta.
            AMD64MacroAssembler.pmovSZx(asm, this.vectorSize, this.extendMode, vec, scaleDst, new AMD64Address(src, tmp, scaleSrc, -this.vectorSize.getBytes() >> scaleDelta), scaleSrc);
            AMD64MacroAssembler.movdqu(asm, this.vectorSize, new AMD64Address(dst, tmp, scaleDst, -this.vectorSize.getBytes()), vec);
            asm.jmpb(labelDone);
            if (this.supportsAVX2AndYMM()) {
                // Fewer elements than one YMM vector: XMM-sized head and tail stores.
                asm.bind(labelXMMTail);
                asm.cmplAndJcc(tmp, vectorLengthXMM, AMD64Assembler.ConditionFlag.Less, labelTail, true);
                AMD64MacroAssembler.pmovSZx(asm, AVXKind.AVXSize.XMM, this.extendMode, vec, scaleDst, new AMD64Address(src), scaleSrc);
                asm.movdqu(new AMD64Address(dst), vec);
                AMD64MacroAssembler.pmovSZx(asm, AVXKind.AVXSize.XMM, this.extendMode, vec, scaleDst, new AMD64Address(src, tmp, scaleSrc, -16 >> scaleDelta), scaleSrc);
                asm.movdqu(new AMD64Address(dst, tmp, scaleDst, -16), vec);
                asm.jmpb(labelDone);
            }
            asm.bind(labelTail);
            // From here on len holds the remaining (tail) element count.
            asm.movl(len, tmp);
            if (op != Op.inflateByteToInt) {
                assert (scaleDelta == 1);
                // Half-XMM step for the width-doubling ops: needs at least 4 source bytes
                // (4 >> scaleSrc.log2 elements); loads 4 bytes and stores 8, for head and tail.
                asm.cmplAndJcc(len, 4 >> scaleSrc.log2, AMD64Assembler.ConditionFlag.Less, labelSkipXMMHalf, true);
                asm.movdl(vec, new AMD64Address(src));
                AMD64MacroAssembler.pmovSZx(asm, AVXKind.AVXSize.XMM, this.extendMode, vec, scaleDst, vec, scaleSrc);
                asm.movq(new AMD64Address(dst), vec);
                asm.movdl(vec, new AMD64Address(src, len, scaleSrc, -4));
                AMD64MacroAssembler.pmovSZx(asm, AVXKind.AVXSize.XMM, this.extendMode, vec, scaleDst, vec, scaleSrc);
                asm.movq(new AMD64Address(dst, len, scaleDst, -8), vec);
                asm.jmpb(labelDone);
            }
            asm.bind(labelSkipXMMHalf);
        }
        // scalar loop: element by element, len counting from -count up to 0
        asm.testlAndJcc(len, len, AMD64Assembler.ConditionFlag.Zero, labelDone, true);
        asm.leaq(src, new AMD64Address(src, len, scaleSrc));
        asm.leaq(dst, new AMD64Address(dst, len, scaleDst));
        asm.negq(len);
        asm.bind(labelScalarLoop);
        switch (op) {
            case inflateByteToChar: {
                asm.movzbl(tmp, new AMD64Address(src, len, scaleSrc));
                asm.movw(new AMD64Address(dst, len, scaleDst), tmp);
                break;
            }
            case inflateByteToInt: {
                asm.movzbl(tmp, new AMD64Address(src, len, scaleSrc));
                asm.movl(new AMD64Address(dst, len, scaleDst), tmp);
                break;
            }
            case inflateCharToInt: {
                asm.movzwl(tmp, new AMD64Address(src, len, scaleSrc));
                asm.movl(new AMD64Address(dst, len, scaleDst), tmp);
            }
        }
        asm.incqAndJcc(len, AMD64Assembler.ConditionFlag.NotZero, labelScalarLoop, true);
        asm.bind(labelDone);
    }

    /**
     * Emits a plain copy for equal source and destination widths.
     *
     * <p>Full vectors are copied in a loop (YMM or XMM wide, depending on
     * {@code supportsAVX2AndYMM()}), followed by one vector that ends at the last element. Inputs
     * shorter than a vector are handled by a chain of head/tail move pairs of decreasing width
     * (16, 8, 4, 2 bytes, then a single byte); widths smaller than one element are not emitted.
     * In the tail paths {@code len} is reused as the general-purpose scratch register.
     *
     * @param src register holding the source address; modified by the emitted code
     * @param dst register holding the destination address; modified by the emitted code
     * @param len register holding the element count; modified by the emitted code
     * @param tmp scratch register; receives the tail element count
     */
    private void emitCopy(CompilationResultBuilder crb, AMD64MacroAssembler masm, AMD64Address.Scale scaleSrc, AMD64Address.Scale scaleDst, Register src, Register dst, Register len, Register tmp) {
        Register vec = ValueUtil.asRegister((Value)this.vectorTemp[0]);
        Label labelTailXMM = new Label();
        Label labelTailQWORD = new Label();
        Label labelTailDWORD = new Label();
        Label labelTailWORD = new Label();
        Label labelTailBYTE = new Label();
        Label labelDone = new Label();
        int vectorLength = this.vectorSize.getBytes() / scaleDst.value;
        // tmp = tail element count, len = element count covered by full vectors
        masm.movl(tmp, len);
        masm.andl(tmp, vectorLength - 1);
        masm.andlAndJcc(len, -vectorLength, AMD64Assembler.ConditionFlag.Zero, this.supportsAVX2AndYMM() ? labelTailXMM : labelTailQWORD, true);
        // Point src/dst at the end of the full-vector region and let len run from -count up to 0.
        masm.leaq(src, new AMD64Address(src, len, scaleSrc));
        masm.leaq(dst, new AMD64Address(dst, len, scaleDst));
        masm.negq(len);
        Label labelYMMLoop = new Label();
        Label labelXMMLoop = new Label();
        if (this.supportsAVX2AndYMM()) {
            // 32-byte main loop
            masm.align(crb.target.wordSize * 2);
            masm.bind(labelYMMLoop);
            masm.vmovdqu(vec, new AMD64Address(src, len, scaleSrc));
            masm.vmovdqu(new AMD64Address(dst, len, scaleDst), vec);
            masm.addqAndJcc(len, vectorLength, AMD64Assembler.ConditionFlag.NotZero, labelYMMLoop, true);
            // final 32 bytes ending at the last element
            masm.vmovdqu(vec, new AMD64Address(src, tmp, scaleSrc, -32));
            masm.vmovdqu(new AMD64Address(dst, tmp, scaleDst, -32), vec);
            masm.jmpb(labelDone);
            // fewer than 32 bytes: 16-byte head and tail moves if at least 16 bytes remain
            masm.bind(labelTailXMM);
            masm.cmplAndJcc(tmp, 16 / scaleDst.value, AMD64Assembler.ConditionFlag.Less, labelTailQWORD, true);
            masm.movdqu(vec, new AMD64Address(src));
            masm.movdqu(new AMD64Address(dst), vec);
            masm.movdqu(vec, new AMD64Address(src, tmp, scaleSrc, -16));
            masm.movdqu(new AMD64Address(dst, tmp, scaleDst, -16), vec);
            masm.jmpb(labelDone);
        } else {
            // 16-byte main loop
            masm.align(crb.target.wordSize * 2);
            masm.bind(labelXMMLoop);
            masm.movdqu(vec, new AMD64Address(src, len, scaleSrc));
            masm.movdqu(new AMD64Address(dst, len, scaleDst), vec);
            masm.addqAndJcc(len, vectorLength, AMD64Assembler.ConditionFlag.NotZero, labelXMMLoop, true);
            // final 16 bytes ending at the last element
            masm.movdqu(vec, new AMD64Address(src, tmp, scaleSrc, -16));
            masm.movdqu(new AMD64Address(dst, tmp, scaleDst, -16), vec);
            masm.jmpb(labelDone);
        }
        // 8-byte head and tail moves (len serves as scratch register from here on)
        masm.bind(labelTailQWORD);
        masm.cmplAndJcc(tmp, 8 / scaleDst.value, AMD64Assembler.ConditionFlag.Less, labelTailDWORD, true);
        masm.movq(len, new AMD64Address(src));
        masm.movq(new AMD64Address(dst), len);
        masm.movq(len, new AMD64Address(src, tmp, scaleSrc, -8));
        masm.movq(new AMD64Address(dst, tmp, scaleDst, -8), len);
        masm.jmpb(labelDone);
        // 4-byte head and tail moves
        masm.bind(labelTailDWORD);
        if (scaleDst.value < 4) {
            masm.cmplAndJcc(tmp, 4 / scaleDst.value, AMD64Assembler.ConditionFlag.Less, labelTailWORD, true);
        } else {
            // 4-byte elements: this is the smallest step, so only an empty tail is skipped.
            masm.testlAndJcc(tmp, tmp, AMD64Assembler.ConditionFlag.Zero, labelDone, true);
        }
        masm.movl(len, new AMD64Address(src));
        masm.movl(new AMD64Address(dst), len);
        masm.movl(len, new AMD64Address(src, tmp, scaleSrc, -4));
        masm.movl(new AMD64Address(dst, tmp, scaleDst, -4), len);
        if (scaleDst.value < 4) {
            masm.jmpb(labelDone);
            // 2-byte head and tail moves (only for 1- and 2-byte elements)
            masm.bind(labelTailWORD);
            if (scaleDst.value < 2) {
                masm.cmplAndJcc(tmp, 2 / scaleDst.value, AMD64Assembler.ConditionFlag.Less, labelTailBYTE, true);
            } else {
                masm.testlAndJcc(tmp, tmp, AMD64Assembler.ConditionFlag.Zero, labelDone, true);
            }
            masm.movw(len, new AMD64Address(src));
            masm.movw(new AMD64Address(dst), len);
            masm.movw(len, new AMD64Address(src, tmp, scaleSrc, -2));
            masm.movw(new AMD64Address(dst, tmp, scaleDst, -2), len);
        }
        if (scaleDst.value < 2) {
            masm.jmpb(labelDone);
            // single remaining byte (only for 1-byte elements)
            masm.bind(labelTailBYTE);
            masm.testlAndJcc(tmp, tmp, AMD64Assembler.ConditionFlag.Zero, labelDone, true);
            masm.movb(len, new AMD64Address(src));
            masm.movb(new AMD64Address(dst), len);
        }
        masm.bind(labelDone);
    }

    /**
     * The conversion performed for a given pair of element widths, as computed by
     * {@code getOp}. "Byte", "Char" and "Int" stand for 1-, 2- and 4-byte elements.
     */
    private static enum Op {
        compressCharToByte,
        compressIntToChar,
        compressIntToByte,
        inflateByteToChar,
        inflateByteToInt,
        inflateCharToInt,
        copy;

    }
}

