package opennlp.uima.tokenize;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import opennlp.tools.ml.maxent.GIS;
import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.tokenize.TokenSampleStream;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.ObjectStreamUtils;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
import opennlp.uima.util.CasConsumerUtil;
import opennlp.uima.util.ContainingConstraint;
import opennlp.uima.util.OpennlpUtil;
import opennlp.uima.util.SampleTraceStream;
import opennlp.uima.util.UimaUtil;
import org.apache.uima.UimaContext;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.collection.CasConsumer_ImplBase;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceProcessException;
import org.apache.uima.util.Level;
import org.apache.uima.util.Logger;
import org.apache.uima.util.ProcessTrace;

/* loaded from: input_file:opennlp/uima/tokenize/TokenizerTrainer.class */
/**
 * UIMA CAS consumer that trains an OpenNLP tokenizer model.
 *
 * <p>For every sentence annotation of each processed CAS one {@link TokenSample} is
 * collected in memory. When the collection is complete the samples (optionally
 * combined with samples read from an additional training data file) are passed to
 * {@link TokenizerME#train} and the resulting model is serialized to a file located
 * under the resource manager's data path.
 *
 * <p>The consumer keeps the collected samples between calls and therefore reports
 * itself as not stateless.
 */
public final class TokenizerTrainer extends CasConsumer_ImplBase {
    /** Name of the optional boolean parameter forwarded as the third argument of {@link TokenizerME#train}. */
    public static final String IS_ALPHA_NUMERIC_OPTIMIZATION = "opennlp.uima.tokenizer.IsAlphaNumericOptimization";

    /** Samples gathered from the processed CASes; one entry per sentence annotation. */
    private List<TokenSample> tokenSamples = new ArrayList<TokenSample>();
    private UimaContext mContext;
    private Type mSentenceType;
    private Type mTokenType;
    /** File name (relative to the data path) the trained model is written to. */
    private String mModelName;
    /** Optional file with further training samples; {@code null} when not configured. */
    private String additionalTrainingDataFile;
    /** Encoding of {@link #additionalTrainingDataFile}; only read when that file is configured. */
    private String additionalTrainingDataEncoding;
    private String language;
    /** Value of {@link #IS_ALPHA_NUMERIC_OPTIMIZATION}; defaults to {@code false} when not configured. */
    private Boolean isSkipAlphaNumerics;
    private Logger mLogger;
    /** Encoding of {@link #sampleTraceFile}; only read when that file is configured. */
    private String sampleTraceFileEncoding;
    /** Optional file the training samples are traced to; {@code null} when not configured. */
    private File sampleTraceFile;

    /**
     * Reads the consumer configuration from the UIMA context.
     *
     * <p>The model name and the language are required. The alpha numeric optimization flag,
     * the additional training data file and the sample trace file are optional, but once
     * one of the two files is configured its encoding parameter becomes required.
     *
     * @throws ResourceInitializationException propagated from the super class or from the
     *         parameter lookups
     */
    public void initialize() throws ResourceInitializationException {
        super.initialize();
        this.mContext = getUimaContext();
        this.mLogger = this.mContext.getLogger();
        if (this.mLogger.isLoggable(Level.INFO)) {
            this.mLogger.log(Level.INFO, "Initializing the OpenNLP Tokenizer trainer.");
        }

        this.mModelName = CasConsumerUtil.getRequiredStringParameter(this.mContext, UimaUtil.MODEL_PARAMETER);
        this.language = CasConsumerUtil.getRequiredStringParameter(this.mContext, UimaUtil.LANGUAGE_PARAMETER);

        this.isSkipAlphaNumerics = CasConsumerUtil.getOptionalBooleanParameter(this.mContext, IS_ALPHA_NUMERIC_OPTIMIZATION);
        if (this.isSkipAlphaNumerics == null) {
            this.isSkipAlphaNumerics = Boolean.FALSE;
        }

        this.additionalTrainingDataFile = CasConsumerUtil.getOptionalStringParameter(this.mContext, UimaUtil.ADDITIONAL_TRAINING_DATA_FILE);
        if (this.additionalTrainingDataFile != null) {
            this.additionalTrainingDataEncoding = CasConsumerUtil.getRequiredStringParameter(this.mContext, UimaUtil.ADDITIONAL_TRAINING_DATA_ENCODING);
        }

        String sampleTraceFileName = CasConsumerUtil.getOptionalStringParameter(this.mContext, "opennlp.uima.SampleTraceFile");
        if (sampleTraceFileName != null) {
            // The trace file name is resolved against the resource manager's data path.
            this.sampleTraceFile = new File(getUimaContextAdmin().getResourceManager().getDataPath() + File.separatorChar + sampleTraceFileName);
            this.sampleTraceFileEncoding = CasConsumerUtil.getRequiredStringParameter(this.mContext, "opennlp.uima.SampleTraceFileEncoding");
        }
    }

    /**
     * Resolves the configured sentence and token type names against the type system.
     *
     * @param typeSystem the type system to look the types up in
     * @throws ResourceInitializationException propagated from the parameter or type lookups
     */
    public void typeSystemInit(TypeSystem typeSystem) throws ResourceInitializationException {
        String sentenceTypeName = CasConsumerUtil.getRequiredStringParameter(this.mContext, UimaUtil.SENTENCE_TYPE_PARAMETER);
        this.mSentenceType = CasConsumerUtil.getType(typeSystem, sentenceTypeName);

        String tokenTypeName = CasConsumerUtil.getRequiredStringParameter(this.mContext, UimaUtil.TOKEN_TYPE_PARAMETER);
        this.mTokenType = CasConsumerUtil.getType(typeSystem, tokenTypeName);
    }

    /**
     * Collects one training sample for every sentence annotation in the CAS.
     *
     * @param cas the CAS to read sentence and token annotations from
     */
    public void processCas(CAS cas) {
        AnnotationIndex<AnnotationFS> sentences = cas.getAnnotationIndex(this.mSentenceType);
        for (AnnotationFS sentence : sentences) {
            process(cas, sentence);
        }
    }

    /**
     * Builds a {@link TokenSample} from the token annotations selected by a
     * {@link ContainingConstraint} on the given sentence and stores it.
     *
     * @param cas the CAS holding the annotations
     * @param annotationFS the sentence annotation to create the sample for
     */
    private void process(CAS cas, AnnotationFS annotationFS) {
        AnnotationIndex<AnnotationFS> allTokens = cas.getAnnotationIndex(this.mTokenType);
        Iterator<AnnotationFS> containedTokens = cas.createFilteredIterator(allTokens.iterator(), new ContainingConstraint(annotationFS));

        // Token offsets are made relative to the sentence start because the sample
        // text is the covered text of the sentence, not the whole document.
        int sentenceBegin = annotationFS.getBegin();
        List<Span> tokenSpans = new ArrayList<Span>();
        while (containedTokens.hasNext()) {
            AnnotationFS token = containedTokens.next();
            tokenSpans.add(new Span(token.getBegin() - sentenceBegin, token.getEnd() - sentenceBegin));
        }

        Span[] sortedSpans = tokenSpans.toArray(new Span[tokenSpans.size()]);
        Arrays.sort(sortedSpans);
        this.tokenSamples.add(new TokenSample(annotationFS.getCoveredText(), sortedSpans));
    }

    /**
     * Trains the tokenizer model from the collected samples and writes it to disk.
     *
     * <p>The additional training data file and the sample trace file, when configured,
     * are opened for the duration of the training and are always closed afterwards,
     * whether or not the training succeeds. After a successful training the collected
     * samples are discarded before the model is serialized.
     *
     * @param processTrace not used
     * @throws ResourceProcessException declared by the overridden method
     * @throws IOException if reading the training data, writing the trace file, closing
     *         either of them, or serializing the model fails
     */
    public void collectionProcessComplete(ProcessTrace processTrace) throws ResourceProcessException, IOException {
        if (this.mLogger.isLoggable(Level.INFO)) {
            this.mLogger.log(Level.INFO, "Collected " + this.tokenSamples.size() + " token samples.");
        }

        // NOTE: this switches a static flag on GIS and so affects every user of that class.
        GIS.PRINT_MESSAGES = false;

        ObjectStream<TokenSample> samples = ObjectStreamUtils.createObjectStream(this.tokenSamples);
        FileInputStream additionalDataIn = null;
        FileOutputStream traceFileOut = null;
        OutputStreamWriter traceWriter = null;
        TokenizerModel model;
        try {
            if (this.additionalTrainingDataFile != null) {
                if (this.mLogger.isLoggable(Level.INFO)) {
                    this.mLogger.log(Level.INFO, "Using additional training data file: " + this.additionalTrainingDataFile);
                }
                additionalDataIn = new FileInputStream(this.additionalTrainingDataFile);
                ObjectStream<TokenSample> additionalSamples = new TokenSampleStream(
                        new PlainTextByLineStream(new InputStreamReader(additionalDataIn, this.additionalTrainingDataEncoding)));
                samples = ObjectStreamUtils.createObjectStream(samples, additionalSamples);
            }

            if (this.sampleTraceFile != null) {
                // The file stream is tracked separately so it can still be closed when
                // the writer cannot be created, e.g. because of an unsupported encoding.
                traceFileOut = new FileOutputStream(this.sampleTraceFile);
                traceWriter = new OutputStreamWriter(traceFileOut, this.sampleTraceFileEncoding);
                samples = new SampleTraceStream<TokenSample>(samples, traceWriter);
            }

            model = TokenizerME.train(this.language, samples, this.isSkipAlphaNumerics.booleanValue());
        } finally {
            closeTrainingResources(additionalDataIn, traceFileOut, traceWriter);
        }

        // Release the samples before serializing, but keep a usable (empty) list so a
        // later processCas call does not fail with a NullPointerException.
        this.tokenSamples = new ArrayList<TokenSample>();

        OpennlpUtil.serialize(model, new File(getUimaContextAdmin().getResourceManager().getDataPath() + File.separatorChar + this.mModelName));
    }

    /**
     * Closes the streams opened for a training run; every argument may be {@code null}.
     *
     * <p>Closing the trace writer flushes it and closes the underlying file stream, so the
     * file stream is only closed directly when no writer was created for it. The input
     * stream is closed even if closing the trace output fails.
     */
    private static void closeTrainingResources(FileInputStream additionalDataIn, FileOutputStream traceFileOut,
            OutputStreamWriter traceWriter) throws IOException {
        try {
            if (traceWriter != null) {
                traceWriter.close();
            } else if (traceFileOut != null) {
                traceFileOut.close();
            }
        } finally {
            if (additionalDataIn != null) {
                additionalDataIn.close();
            }
        }
    }

    /**
     * @return always {@code false}; the collected samples are state carried across CASes
     */
    public boolean isStateless() {
        return false;
    }

    /** Drops the reference to the collected samples. */
    public void destroy() {
        this.tokenSamples = null;
    }
}
