package org.apache.sysds.runtime.transform.tokenize.builder;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.frame.data.FrameBlock;
import org.apache.sysds.runtime.transform.tokenize.DocumentRepresentation;
import org.apache.sysds.runtime.transform.tokenize.Token;
import org.apache.sysds.runtime.util.UtilFunctions;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;

/* loaded from: input_file:org/apache/sysds/runtime/transform/tokenize/builder/TokenizerBuilderNgram.class */
/**
 * Tokenizer builder that turns the representation produced by
 * {@link TokenizerBuilderWhitespaceSplit} into n-grams.
 *
 * <p>Two modes are supported, selected through the {@code "ngram_type"} parameter:
 * <ul>
 *   <li>{@code document} (default): the work is delegated to
 *       {@code DocumentRepresentation.splitIntoNgrams(minGram, maxGram)}.</li>
 *   <li>{@code token}: every token of a document is replaced by the character
 *       n-grams of its text, see {@link #splitIntoNgrams(Token, int, int)}.</li>
 * </ul>
 */
public class TokenizerBuilderNgram extends TokenizerBuilderWhitespaceSplit {
    private static final long serialVersionUID = -6297904316677723802L;

    /** Smallest n-gram size; defaults to 1, overridden by the {@code "min_gram"} parameter. */
    public int minGram = 1;

    /** Largest n-gram size (inclusive); defaults to 2, overridden by the {@code "max_gram"} parameter. */
    public int maxGram = 2;

    /** Selected n-gram mode; defaults to {@code DOCUMENT}, overridden by the {@code "ngram_type"} parameter. */
    public NgramType ngramType = NgramType.DOCUMENT;

    /** Level at which n-grams are built. */
    private enum NgramType {
        /** Delegate n-gram creation to the document representation. */
        DOCUMENT,
        /** Build character n-grams inside each individual token. */
        TOKEN
    }

    /**
     * Creates the builder and reads the optional n-gram settings from the parameter object.
     *
     * @param iArr       forwarded unchanged to the superclass constructor
     * @param i          forwarded unchanged to the superclass constructor
     * @param jSONObject optional parameters; may be {@code null}, in which case all defaults are kept.
     *                   Recognised keys: {@code "min_gram"}, {@code "max_gram"}, {@code "ngram_type"}
     * @throws JSONException       if a present key cannot be read with the expected type
     * @throws DMLRuntimeException if {@code "ngram_type"} is neither {@code "document"} nor {@code "token"}
     *                             (compared case-insensitively)
     */
    public TokenizerBuilderNgram(int[] iArr, int i, JSONObject jSONObject) throws JSONException {
        super(iArr, i, jSONObject);
        if (jSONObject == null) {
            // No parameters supplied: keep the defaults declared on the fields.
            return;
        }
        if (jSONObject.has("min_gram")) {
            this.minGram = jSONObject.getInt("min_gram");
        }
        if (jSONObject.has("max_gram")) {
            this.maxGram = jSONObject.getInt("max_gram");
        }
        if (jSONObject.has("ngram_type")) {
            this.ngramType = parseNgramType(jSONObject.getString("ngram_type"));
        }
    }

    /**
     * Maps the textual {@code "ngram_type"} value onto the enum.
     *
     * @param name value read from the parameters; may be {@code null}
     * @return the matching mode
     * @throws DMLRuntimeException if the value names no known mode
     */
    private static NgramType parseNgramType(String name) {
        // equalsIgnoreCase on the literal is independent of the default locale and
        // null-safe, so any unusable value ends in the intended error below.
        if ("document".equalsIgnoreCase(name)) {
            return NgramType.DOCUMENT;
        }
        if ("token".equalsIgnoreCase(name)) {
            return NgramType.TOKEN;
        }
        throw new DMLRuntimeException("Invalid ngram type, choose between 'token' and 'document'");
    }

    /**
     * Splits the text of a single-sub-token token into character n-grams.
     *
     * <p>For every size {@code n} from {@code i} to {@code i2} (inclusive), ascending, each
     * substring of length {@code n} is emitted left to right as a new token whose start
     * index is the start index of the token's only sub-token plus the substring offset.
     * Sizes below 1 are skipped because they cannot form an n-gram, and sizes larger than
     * the text yield nothing.
     *
     * @param token token to split; must contain exactly one sub-token
     * @param i     smallest n-gram size
     * @param i2    largest n-gram size (inclusive)
     * @return the generated n-gram tokens; empty if no size in range fits the text
     * @throws DMLRuntimeException if the token has zero or more than one sub-token
     */
    public List<Token> splitIntoNgrams(Token token, int i, int i2) {
        int numSubTokens = token.getNumSubTokens();
        if (numSubTokens == 0) {
            throw new DMLRuntimeException("Cannot create ngram of token where there are no subTokens");
        }
        if (numSubTokens != 1) {
            throw new DMLRuntimeException("Cannot create ngram of token where there are more than 1 subTokens");
        }
        String text = token.toString();
        int length = text.length();
        // Clamp the size range: n < 1 would produce empty tokens (n == 0) or make
        // substring() throw (n < 0); n > length can never match, so stopping there
        // gives the same result without idle iterations or overflow of n.
        int smallest = Math.max(i, 1);
        int largest = Math.min(i2, length);
        List<Token> ngrams = new ArrayList<>();
        for (int n = smallest; n <= largest; n++) {
            for (int start = 0; start + n <= length; start++) {
                ngrams.add(new Token(text.substring(start, start + n), token.getStartIndex(0) + start));
            }
        }
        return ngrams;
    }

    /**
     * Builds the superclass representation for the given rows and then converts each
     * document of that range into n-grams according to {@link #ngramType}.
     *
     * @param frameBlock                input frame; its row count bounds the processed range
     * @param documentRepresentationArr per-row document representations, updated in place
     * @param i                         index of the first row to process
     * @param i2                        passed to {@code UtilFunctions.getEndIndex} together with the row
     *                                  count and {@code i} to obtain the exclusive end row
     *                                  (presumably a block size; confirm against the caller)
     */
    @Override
    public void createInternalRepresentation(FrameBlock frameBlock, DocumentRepresentation[] documentRepresentationArr, int i, int i2) {
        super.createInternalRepresentation(frameBlock, documentRepresentationArr, i, i2);
        int endIndex = UtilFunctions.getEndIndex(frameBlock.getNumRows(), i, i2);
        for (int row = i; row < endIndex; row++) {
            DocumentRepresentation doc = documentRepresentationArr[row];
            if (this.ngramType == NgramType.DOCUMENT) {
                doc.splitIntoNgrams(this.minGram, this.maxGram);
            } else if (this.ngramType == NgramType.TOKEN) {
                // Collect into a fresh list so the document's current tokens are not
                // modified while they are being iterated.
                ArrayList<Token> ngrams = new ArrayList<>();
                for (Token token : doc.getTokens()) {
                    ngrams.addAll(splitIntoNgrams(token, this.minGram, this.maxGram));
                }
                doc.tokens = ngrams;
            }
        }
    }
}
