package org.apache.sysds.runtime.transform.tokenize;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.sysds.runtime.matrix.data.FrameBlock;
import org.apache.sysds.runtime.transform.tokenize.Tokenizer;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;

/* loaded from: input_file:org/apache/sysds/runtime/transform/tokenize/TokenizerPreNgram.class */
/**
 * Pre-tokenizer that expands every token produced by a wrapped
 * {@link TokenizerPreWhitespaceSplit} into character n-grams.
 *
 * <p>A window of width {@code maxGram} is slid over the token text and clipped
 * at both ends of the token, so the n-grams at the token borders are shorter
 * than {@code maxGram}. With the defaults ({@code minGram = 1},
 * {@code maxGram = 2}) the text {@code "abc"} yields {@code "a"}, {@code "ab"},
 * {@code "bc"}, {@code "c"}.
 */
public class TokenizerPreNgram implements TokenizerPre {
    private static final long serialVersionUID = -6297904316677723802L;

    /** Tokenizer whose output tokens are split further into n-grams. */
    private final TokenizerPreWhitespaceSplit tokenizerPreWhitespaceSplit;

    /** N-gram size configuration read from the JSON specification. */
    private final Params params;

    /**
     * N-gram size settings. Defaults are {@code minGram = 1} and
     * {@code maxGram = 2}; each is overridden when the JSON specification
     * contains the key {@code "min_gram"} / {@code "max_gram"}.
     */
    public static class Params implements Serializable {
        private static final long serialVersionUID = -6516419749810062677L;
        public int minGram;
        public int maxGram;

        /**
         * Reads the n-gram sizes from the given specification.
         *
         * @param jSONObject specification object; may be {@code null}, in which
         *                   case the defaults are kept
         * @throws JSONException if a present value cannot be read as an int
         */
        public Params(JSONObject jSONObject) throws JSONException {
            this.minGram = 1;
            this.maxGram = 2;
            if (jSONObject == null) {
                return;
            }
            if (jSONObject.has("min_gram")) {
                this.minGram = jSONObject.getInt("min_gram");
            }
            if (jSONObject.has("max_gram")) {
                this.maxGram = jSONObject.getInt("max_gram");
            }
            // NOTE(review): the values are not validated. A negative min_gram, or
            // min_gram > max_gram combined with a token shorter than the difference,
            // makes wordTokenToNgrams fail with StringIndexOutOfBoundsException.
        }
    }

    /**
     * Creates the n-gram pre-tokenizer.
     *
     * @param list       forwarded unchanged to {@link TokenizerPreWhitespaceSplit}
     * @param i          forwarded unchanged to {@link TokenizerPreWhitespaceSplit}
     * @param jSONObject specification forwarded to {@link TokenizerPreWhitespaceSplit}
     *                   and also used to read {@code "min_gram"} / {@code "max_gram"}
     * @throws JSONException if the specification cannot be parsed
     */
    public TokenizerPreNgram(List<Integer> list, int i, JSONObject jSONObject) throws JSONException {
        this.tokenizerPreWhitespaceSplit = new TokenizerPreWhitespaceSplit(list, i, jSONObject);
        this.params = new Params(jSONObject);
    }

    /**
     * Splits a single token into its n-grams.
     *
     * <p>Window positions run from {@code minGram - maxGram} up to
     * {@code max(length - minGram, minGram - maxGram)}; the loop therefore always
     * executes at least once, so a token shorter than {@code minGram} (including
     * an empty one) still produces exactly one, possibly empty, n-gram.
     *
     * @param token token to split
     * @return the n-grams in left-to-right order; each carries the start index of
     *         the source token plus the offset of the n-gram inside the token text
     */
    public List<Tokenizer.Token> wordTokenToNgrams(Tokenizer.Token token) {
        final List<Tokenizer.Token> ngrams = new ArrayList<>();
        final int textLength = token.textToken.length();
        final int minGram = this.params.minGram;
        final int maxGram = this.params.maxGram;

        // A non-positive first window position yields the shortened leading n-grams.
        final int firstWindow = minGram - maxGram;
        final int lastWindow = Math.max(textLength - minGram, firstWindow);

        for (int window = firstWindow; window <= lastWindow; window++) {
            // Clip the window [window, window + maxGram) to the bounds of the text.
            final int sliceStart = Math.max(window, 0);
            final int sliceEnd = Math.min(window + maxGram, textLength);
            ngrams.add(new Tokenizer.Token(
                token.textToken.substring(sliceStart, sliceEnd),
                token.startIndex + sliceStart));
        }
        return ngrams;
    }

    /**
     * Splits every token of the list into n-grams and concatenates the results
     * in input order.
     *
     * @param list tokens to split
     * @return a new list holding the n-grams of all tokens
     */
    public List<Tokenizer.Token> wordTokenListToNgrams(List<Tokenizer.Token> list) {
        final List<Tokenizer.Token> ngrams = new ArrayList<>();
        for (Tokenizer.Token wordToken : list) {
            ngrams.addAll(wordTokenToNgrams(wordToken));
        }
        return ngrams;
    }

    /**
     * Runs the wrapped tokenizer on the frame and replaces the tokens of every
     * resulting document by their n-grams; the document keys are carried over.
     *
     * @param frameBlock input frame handed to the wrapped tokenizer
     * @return one entry per document returned by the wrapped tokenizer
     */
    @Override
    public List<Tokenizer.DocumentToTokens> tokenizePre(FrameBlock frameBlock) {
        final List<Tokenizer.DocumentToTokens> wordDocuments =
            this.tokenizerPreWhitespaceSplit.tokenizePre(frameBlock);
        final List<Tokenizer.DocumentToTokens> ngramDocuments = new ArrayList<>(wordDocuments.size());
        for (Tokenizer.DocumentToTokens document : wordDocuments) {
            ngramDocuments.add(
                new Tokenizer.DocumentToTokens(document.keys, wordTokenListToNgrams(document.tokens)));
        }
        return ngramDocuments;
    }
}
