package org.apache.spark.ml.feature;

import org.apache.spark.annotation.Experimental;
import org.apache.spark.ml.Estimator;
import org.apache.spark.ml.feature.CountVectorizerParams;
import org.apache.spark.ml.param.DoubleParam;
import org.apache.spark.ml.param.IntParam;
import org.apache.spark.ml.param.Param;
import org.apache.spark.ml.param.ParamMap;
import org.apache.spark.ml.param.ParamPair;
import org.apache.spark.ml.param.shared.HasInputCol;
import org.apache.spark.ml.param.shared.HasOutputCol;
import org.apache.spark.ml.util.Identifiable$;
import org.apache.spark.rdd.RDD;
import org.apache.spark.rdd.RDD$;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.types.StructType;
import scala.Array$;
import scala.Predef$;
import scala.Tuple2;
import scala.collection.Seq;
import scala.math.Ordering$Long$;
import scala.math.Ordering$String$;
import scala.reflect.ClassTag$;
import scala.reflect.ScalaSignature;
import scala.runtime.BoxesRunTime;

/* compiled from: CountVectorizer.scala */
@ScalaSignature(bytes = "\u0006\u0001M4A!\u0001\u0002\u0001\u001b\ty1i\\;oiZ+7\r^8sSj,'O\u0003\u0002\u0004\t\u00059a-Z1ukJ,'BA\u0003\u0007\u0003\tiGN\u0003\u0002\b\u0011\u0005)1\u000f]1sW*\u0011\u0011BC\u0001\u0007CB\f7\r[3\u000b\u0003-\t1a\u001c:h\u0007\u0001\u00192\u0001\u0001\b\u0017!\ry\u0001CE\u0007\u0002\t%\u0011\u0011\u0003\u0002\u0002\n\u000bN$\u0018.\\1u_J\u0004\"a\u0005\u000b\u000e\u0003\tI!!\u0006\u0002\u0003)\r{WO\u001c;WK\u000e$xN]5{KJlu\u000eZ3m!\t\u0019r#\u0003\u0002\u0019\u0005\t)2i\\;oiZ+7\r^8sSj,'\u000fU1sC6\u001c\b\u0002\u0003\u000e\u0001\u0005\u000b\u0007I\u0011I\u000e\u0002\u0007ULG-F\u0001\u001d!\ti2E\u0004\u0002\u001fC5\tqDC\u0001!\u0003\u0015\u00198-\u00197b\u0013\t\u0011s$\u0001\u0004Qe\u0016$WMZ\u0005\u0003I\u0015\u0012aa\u0015;sS:<'B\u0001\u0012 \u0011!9\u0003A!A!\u0002\u0013a\u0012\u0001B;jI\u0002BQ!\u000b\u0001\u0005\u0002)\na\u0001P5oSRtDCA\u0016-!\t\u0019\u0002\u0001C\u0003\u001bQ\u0001\u0007A\u0004C\u0003*\u0001\u0011\u0005a\u0006F\u0001,\u0011\u0015\u0001\u0004\u0001\"\u00012\u0003-\u0019X\r^%oaV$8i\u001c7\u0015\u0005I\u001aT\"\u0001\u0001\t\u000bQz\u0003\u0019\u0001\u000f\u0002\u000bY\fG.^3\t\u000bY\u0002A\u0011A\u001c\u0002\u0019M,GoT;uaV$8i\u001c7\u0015\u0005IB\u0004\"\u0002\u001b6\u0001\u0004a\u0002\"\u0002\u001e\u0001\t\u0003Y\u0014\u0001D:fiZ{7-\u00192TSj,GC\u0001\u001a=\u0011\u0015!\u0014\b1\u0001>!\tqb(\u0003\u0002@?\t\u0019\u0011J\u001c;\t\u000b\u0005\u0003A\u0011\u0001\"\u0002\u0011M,G/T5o\t\u001a#\"AM\"\t\u000bQ\u0002\u0005\u0019\u0001#\u0011\u0005y)\u0015B\u0001$ 
\u0005\u0019!u.\u001e2mK\")\u0001\n\u0001C\u0001\u0013\u0006A1/\u001a;NS:$f\t\u0006\u00023\u0015\")Ag\u0012a\u0001\t\")A\n\u0001C!\u001b\u0006\u0019a-\u001b;\u0015\u0005Iq\u0005\"B(L\u0001\u0004\u0001\u0016a\u00023bi\u0006\u001cX\r\u001e\t\u0003#Rk\u0011A\u0015\u0006\u0003'\u001a\t1a]9m\u0013\t)&KA\u0005ECR\fgI]1nK\")q\u000b\u0001C!1\u0006yAO]1og\u001a|'/\\*dQ\u0016l\u0017\r\u0006\u0002Z?B\u0011!,X\u0007\u00027*\u0011ALU\u0001\u0006if\u0004Xm]\u0005\u0003=n\u0013!b\u0015;sk\u000e$H+\u001f9f\u0011\u0015\u0001g\u000b1\u0001Z\u0003\u0019\u00198\r[3nC\")!\r\u0001C!G\u0006!1m\u001c9z)\tYC\rC\u0003fC\u0002\u0007a-A\u0003fqR\u0014\u0018\r\u0005\u0002hU6\t\u0001N\u0003\u0002j\t\u0005)\u0001/\u0019:b[&\u00111\u000e\u001b\u0002\t!\u0006\u0014\u0018-\\'ba\"\u0012\u0001!\u001c\t\u0003]Fl\u0011a\u001c\u0006\u0003a\u001a\t!\"\u00198o_R\fG/[8o\u0013\t\u0011xN\u0001\u0007FqB,'/[7f]R\fG\u000e")
@Experimental
/* loaded from: input_file:org/apache/spark/ml/feature/CountVectorizer.class */
/**
 * Estimator that fits a {@link CountVectorizerModel} from the column named by {@code inputCol}.
 *
 * <p>This is the Java view of a Scala class: the parameter fields are owned by the mixed-in
 * traits ({@code HasInputCol}, {@code HasOutputCol}, {@code CountVectorizerParams}), whose
 * {@code $init$} hooks populate them through the {@code _setter_$..._$eq} methods while the
 * constructor runs.
 *
 * <p>Defaults registered by the constructor: {@code vocabSize = 262144}, {@code minDF = 1.0}.
 */
public class CountVectorizer extends Estimator<CountVectorizerModel> implements CountVectorizerParams {
    /** Unique identifier of this estimator instance. */
    private final String uid;

    // The five param fields below are deliberately NOT final: they are assigned by the trait
    // setter methods (invoked from the $init$ calls in the constructor), and Java rejects
    // assignment to a final field from a regular method.
    private IntParam vocabSize;
    private DoubleParam minDF;
    private DoubleParam minTF;
    private Param<String> outputCol;
    private Param<String> inputCol;

    /** @return the {@code vocabSize} param descriptor. */
    @Override // org.apache.spark.ml.feature.CountVectorizerParams
    public IntParam vocabSize() {
        return this.vocabSize;
    }

    /** @return the {@code minDF} param descriptor. */
    @Override // org.apache.spark.ml.feature.CountVectorizerParams
    public DoubleParam minDF() {
        return this.minDF;
    }

    /** @return the {@code minTF} param descriptor. */
    @Override // org.apache.spark.ml.feature.CountVectorizerParams
    public DoubleParam minTF() {
        return this.minTF;
    }

    /** Trait initializer hook: stores the {@code vocabSize} param descriptor. */
    @Override // org.apache.spark.ml.feature.CountVectorizerParams
    public void org$apache$spark$ml$feature$CountVectorizerParams$_setter_$vocabSize_$eq(IntParam intParam) {
        this.vocabSize = intParam;
    }

    /** Trait initializer hook: stores the {@code minDF} param descriptor. */
    @Override // org.apache.spark.ml.feature.CountVectorizerParams
    public void org$apache$spark$ml$feature$CountVectorizerParams$_setter_$minDF_$eq(DoubleParam doubleParam) {
        this.minDF = doubleParam;
    }

    /** Trait initializer hook: stores the {@code minTF} param descriptor. */
    @Override // org.apache.spark.ml.feature.CountVectorizerParams
    public void org$apache$spark$ml$feature$CountVectorizerParams$_setter_$minTF_$eq(DoubleParam doubleParam) {
        this.minTF = doubleParam;
    }

    /** Delegates to the {@code CountVectorizerParams} trait implementation. */
    @Override // org.apache.spark.ml.feature.CountVectorizerParams
    public int getVocabSize() {
        return CountVectorizerParams.Cclass.getVocabSize(this);
    }

    /** Delegates to the {@code CountVectorizerParams} trait implementation. */
    @Override // org.apache.spark.ml.feature.CountVectorizerParams
    public double getMinDF() {
        return CountVectorizerParams.Cclass.getMinDF(this);
    }

    /** Delegates schema validation/derivation to the {@code CountVectorizerParams} trait implementation. */
    @Override // org.apache.spark.ml.feature.CountVectorizerParams
    public StructType validateAndTransformSchema(StructType structType) {
        return CountVectorizerParams.Cclass.validateAndTransformSchema(this, structType);
    }

    /** Delegates to the {@code CountVectorizerParams} trait implementation. */
    @Override // org.apache.spark.ml.feature.CountVectorizerParams
    public double getMinTF() {
        return CountVectorizerParams.Cclass.getMinTF(this);
    }

    /** @return the {@code outputCol} param descriptor. */
    @Override // org.apache.spark.ml.param.shared.HasOutputCol
    public final Param<String> outputCol() {
        return this.outputCol;
    }

    /** Trait initializer hook: stores the {@code outputCol} param descriptor. */
    @Override // org.apache.spark.ml.param.shared.HasOutputCol
    public final void org$apache$spark$ml$param$shared$HasOutputCol$_setter_$outputCol_$eq(Param param) {
        this.outputCol = param;
    }

    /** Delegates to the {@code HasOutputCol} trait implementation. */
    @Override // org.apache.spark.ml.param.shared.HasOutputCol
    public final String getOutputCol() {
        return HasOutputCol.Cclass.getOutputCol(this);
    }

    /** @return the {@code inputCol} param descriptor. */
    @Override // org.apache.spark.ml.param.shared.HasInputCol
    public final Param<String> inputCol() {
        return this.inputCol;
    }

    /** Trait initializer hook: stores the {@code inputCol} param descriptor. */
    @Override // org.apache.spark.ml.param.shared.HasInputCol
    public final void org$apache$spark$ml$param$shared$HasInputCol$_setter_$inputCol_$eq(Param param) {
        this.inputCol = param;
    }

    /** Delegates to the {@code HasInputCol} trait implementation. */
    @Override // org.apache.spark.ml.param.shared.HasInputCol
    public final String getInputCol() {
        return HasInputCol.Cclass.getInputCol(this);
    }

    /** @return the identifier supplied to (or generated by) the constructor. */
    @Override // org.apache.spark.ml.util.Identifiable
    public String uid() {
        return this.uid;
    }

    /**
     * Sets the {@code inputCol} param.
     *
     * @param str name of the input column
     * @return the result of {@code set(...)}, cast to {@code CountVectorizer} for chaining
     */
    public CountVectorizer setInputCol(String str) {
        return (CountVectorizer) set(inputCol(), str);
    }

    /**
     * Sets the {@code outputCol} param.
     *
     * @param str name of the output column
     * @return the result of {@code set(...)}, cast to {@code CountVectorizer} for chaining
     */
    public CountVectorizer setOutputCol(String str) {
        return (CountVectorizer) set(outputCol(), str);
    }

    /**
     * Sets the {@code vocabSize} param.
     *
     * @param i new value; boxed because the Scala {@code Param[Int]} is seen from Java as a param of {@code Object}
     * @return the result of {@code set(...)}, cast to {@code CountVectorizer} for chaining
     */
    public CountVectorizer setVocabSize(int i) {
        return (CountVectorizer) set(vocabSize(), BoxesRunTime.boxToInteger(i));
    }

    /**
     * Sets the {@code minDF} param.
     *
     * @param d new value; {@link #fit(DataFrame)} uses values {@code >= 1.0} as-is and multiplies
     *          smaller values by the number of input rows
     * @return the result of {@code set(...)}, cast to {@code CountVectorizer} for chaining
     */
    public CountVectorizer setMinDF(double d) {
        return (CountVectorizer) set(minDF(), BoxesRunTime.boxToDouble(d));
    }

    /**
     * Sets the {@code minTF} param.
     *
     * @param d new value
     * @return the result of {@code set(...)}, cast to {@code CountVectorizer} for chaining
     */
    public CountVectorizer setMinTF(double d) {
        return (CountVectorizer) set(minTF(), BoxesRunTime.boxToDouble(d));
    }

    /**
     * Builds the vocabulary from {@code dataFrame} and wraps it in a {@link CountVectorizerModel}.
     *
     * <p>Steps, as far as they are visible here (the per-record logic lives in compiler-generated
     * closure classes {@code CountVectorizer$$anonfun$N} that are not part of this file):
     * <ol>
     *   <li>validate the schema via {@code transformSchema(schema, true)};</li>
     *   <li>select the {@code inputCol} column and map each row to a {@code Seq};</li>
     *   <li>resolve the filter threshold from {@code minDF}: used as-is when {@code >= 1.0},
     *       otherwise multiplied by the row count of the mapped input;</li>
     *   <li>flat-map to key/value pairs keyed by {@code String}, reduce by key, filter with the
     *       threshold, map, and cache the result;</li>
     *   <li>keep at most {@code vocabSize} entries, ordered by a {@code Long} sort key, and
     *       project them to a {@code String[]};</li>
     *   <li>require a non-empty vocabulary.</li>
     * </ol>
     *
     * @param dataFrame input data containing the column named by {@code inputCol}
     * @return a model created with this estimator's {@code uid} and the selected terms, with this
     *         instance set as its parent and param values copied via {@code copyValues}
     */
    @Override // org.apache.spark.ml.Estimator
    public CountVectorizerModel fit(DataFrame dataFrame) {
        transformSchema(dataFrame.schema(), true);
        int maxVocabSize = BoxesRunTime.unboxToInt($(vocabSize()));

        // Raw RDD types are kept on purpose: the Scala element types are not recoverable from the
        // decompiled closures, and the original code was raw-typed as well.
        RDD input = dataFrame
                .select((String) $(inputCol()), Predef$.MODULE$.wrapRefArray(new String[0]))
                .map(new CountVectorizer$$anonfun$1(this), ClassTag$.MODULE$.apply(Seq.class));

        // The decompiler emitted an undeclared temporary for the receiver of cache().count();
        // the mapped input is the only RDD that exists at this point, so it is named explicitly.
        // The input is cached/counted only when minDF is fractional, exactly as before.
        double minDfParam = BoxesRunTime.unboxToDouble($(minDF()));
        double minDfThreshold;
        if (minDfParam >= 1.0d) {
            minDfThreshold = minDfParam;
        } else {
            minDfThreshold = minDfParam * input.cache().count();
        }

        RDD pairs = input.flatMap(new CountVectorizer$$anonfun$2(this), ClassTag$.MODULE$.apply(Tuple2.class));
        RDD termCounts = RDD$.MODULE$
                .rddToPairRDDFunctions(
                        pairs,
                        ClassTag$.MODULE$.apply(String.class),
                        ClassTag$.MODULE$.apply(Tuple2.class),
                        Ordering$String$.MODULE$)
                .reduceByKey(new CountVectorizer$$anonfun$3(this))
                .filter(new CountVectorizer$$anonfun$4(this, minDfThreshold))
                .map(new CountVectorizer$$anonfun$5(this), ClassTag$.MODULE$.apply(Tuple2.class))
                .cache();

        Tuple2[] selected;
        if (termCounts.count() <= ((long) maxVocabSize)) {
            // Everything fits: collect to the driver and sort there.
            selected = (Tuple2[]) Predef$.MODULE$
                    .refArrayOps((Object[]) termCounts.collect())
                    .sortBy(new CountVectorizer$$anonfun$6(this), Ordering$Long$.MODULE$);
        } else {
            // Too many entries: sort on the cluster (ascending = false) and take the first maxVocabSize.
            selected = (Tuple2[]) termCounts
                    .sortBy(
                            new CountVectorizer$$anonfun$7(this),
                            false,
                            termCounts.sortBy$default$3(),
                            Ordering$Long$.MODULE$,
                            ClassTag$.MODULE$.Long())
                    .take(maxVocabSize);
        }

        String[] vocabulary = (String[]) Predef$.MODULE$
                .refArrayOps(selected)
                .map(new CountVectorizer$$anonfun$8(this),
                        Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(String.class)));

        // The failure message is supplied lazily by the closure and is not visible in this file.
        Predef$.MODULE$.require(vocabulary.length > 0, new CountVectorizer$$anonfun$fit$1(this));
        return (CountVectorizerModel) copyValues(
                new CountVectorizerModel(uid(), vocabulary).setParent(this), copyValues$default$2());
    }

    /**
     * Derives the output schema by delegating to {@link #validateAndTransformSchema(StructType)}.
     *
     * @param structType input schema
     * @return the schema returned by the trait's validation routine
     */
    @Override // org.apache.spark.ml.PipelineStage
    public StructType transformSchema(StructType structType) {
        return validateAndTransformSchema(structType);
    }

    /**
     * Copies this estimator via {@code defaultCopy}.
     *
     * @param paramMap extra params passed through to {@code defaultCopy}
     * @return the copy, cast to {@code CountVectorizer}
     */
    @Override // org.apache.spark.ml.Estimator, org.apache.spark.ml.PipelineStage, org.apache.spark.ml.param.Params
    public CountVectorizer copy(ParamMap paramMap) {
        return (CountVectorizer) defaultCopy(paramMap);
    }

    /**
     * Creates an estimator with the given identifier, runs the trait initializers (which populate
     * the param fields) and registers the defaults {@code vocabSize = 262144} and {@code minDF = 1.0}.
     *
     * @param str unique identifier for this instance
     */
    public CountVectorizer(String str) {
        this.uid = str;
        // Order matters: each $init$ calls back into the matching _setter_ method on this instance,
        // and setDefault below needs vocabSize()/minDF() to be populated already.
        HasInputCol.Cclass.$init$(this);
        HasOutputCol.Cclass.$init$(this);
        CountVectorizerParams.Cclass.$init$(this);
        setDefault(Predef$.MODULE$.wrapRefArray(new ParamPair[]{
                vocabSize().$minus$greater(BoxesRunTime.boxToInteger(262144)),
                minDF().$minus$greater(BoxesRunTime.boxToDouble(1.0d))}));
    }

    /** Creates an estimator with a random identifier prefixed by {@code "cntVec"}. */
    public CountVectorizer() {
        this(Identifiable$.MODULE$.randomUID("cntVec"));
    }
}
