package org.apache.sysds.runtime.transform.tokenize;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.sysds.runtime.matrix.data.FrameBlock;
import org.apache.sysds.runtime.transform.tokenize.Tokenizer;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;

/* loaded from: input_file:org/apache/sysds/runtime/transform/tokenize/TokenizerPreWhitespaceSplit.class */
/**
 * Pre-tokenizer that splits the text of one frame column into tokens using a
 * regular expression (whitespace runs by default) and records, for every
 * token, the character offset at which it was located in the original text.
 */
public class TokenizerPreWhitespaceSplit implements TokenizerPre {
    private static final long serialVersionUID = 539127244034913364L;
    private final Params params;
    private final List<Integer> idCols;
    private final int tokenizeCol;

    /**
     * Configuration of the splitter, read from the optional JSON spec.
     */
    public static class Params implements Serializable {
        private static final long serialVersionUID = -4368552847660442628L;

        /** Delimiter regex handed to {@link String#split(String)}; defaults to {@code "\\s+"}. */
        public String regex;

        /**
         * @param spec JSON object that may carry a {@code "regex"} entry; may be {@code null}
         * @throws JSONException if the {@code "regex"} entry cannot be read as a string
         */
        public Params(JSONObject spec) throws JSONException {
            this.regex = (spec != null && spec.has("regex")) ? spec.getString("regex") : "\\s+";
        }
    }

    /**
     * @param idCols      positions of the columns whose values identify a document
     * @param tokenizeCol position of the column holding the text to tokenize
     * @param spec        optional JSON parameters (see {@link Params}); may be {@code null}
     * @throws JSONException if the parameters cannot be parsed
     */
    public TokenizerPreWhitespaceSplit(List<Integer> idCols, int tokenizeCol, JSONObject spec) throws JSONException {
        this.idCols = idCols;
        this.tokenizeCol = tokenizeCol;
        this.params = new Params(spec);
    }

    /**
     * Splits {@code str} on the configured regex and pairs each piece with the
     * offset at which it was found in {@code str}.
     *
     * <p>Offsets are located with {@link String#indexOf(String, int)}, searching
     * from the end of the previously located token. For delimiters that cannot
     * contain token text (such as the default whitespace regex) this is the
     * exact start offset of each token.
     *
     * @param str text to split; {@code null} yields an empty list
     * @return the tokens in order of appearance, each with its start offset
     */
    public List<Tokenizer.Token> splitToTokens(String str) {
        List<Tokenizer.Token> tokens = new ArrayList<>();
        if (str == null) {
            // A missing cell has no tokens; avoid the NullPointerException from split().
            return tokens;
        }
        int searchFrom = 0;
        for (String piece : str.split(this.params.regex)) {
            int start = str.indexOf(piece, searchFrom);
            tokens.add(new Tokenizer.Token(piece, start));
            // Continue searching after this token so that a repeated token (or one
            // that is a prefix of this one) is not matched at the same offset again.
            searchFrom = start + piece.length();
        }
        return tokens;
    }

    /**
     * Converts every row of the frame into its identifying key values plus the
     * tokens of the text column.
     *
     * @param frameBlock input frame, iterated row by row as strings
     * @return one {@code DocumentToTokens} entry per row, in row order
     */
    @Override // org.apache.sysds.runtime.transform.tokenize.TokenizerPre
    public List<Tokenizer.DocumentToTokens> tokenizePre(FrameBlock frameBlock) {
        List<Tokenizer.DocumentToTokens> documents = new ArrayList<>();
        frameBlock.getStringRowIterator().forEachRemaining(row -> {
            // Configured column positions are 1-based, the row array is 0-based.
            String text = row[this.tokenizeCol - 1];
            List<String> keys = new ArrayList<>();
            for (Integer idCol : this.idCols) {
                keys.add(row[idCol - 1]);
            }
            documents.add(new Tokenizer.DocumentToTokens(keys, splitToTokens(text)));
        });
        return documents;
    }
}
