package de.tudarmstadt.ukp.dkpro.core.io.text;

import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathFactory;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathInfo;
import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.file.Files;
import java.util.Collections;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.CloseShieldOutputStream;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.Type;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ResourceMetaData;
import org.apache.uima.fit.internal.EnhancedClassFile;
import org.apache.uima.fit.util.CasUtil;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

@ResourceMetaData(name = "de.tudarmstadt.ukp.dkpro.core.io.text.TokenizedTextWriter", description = "This class writes a set of pre-processed documents into a large text file containing one sentence\nper line and tokens split by whitespaces. Optionally, annotations other than tokens (e.g. lemmas)\nare written as specified by #PARAM_FEATURE_PATH.", version = "1.8.0", vendor = "DKPro Core Project", copyright = "Copyright 2010-2015\n                            Ubiquitous Knowledge Processing (UKP) Lab\n                            Technische Universität Darmstadt")
@EnhancedClassFile
/* loaded from: input_file:de/tudarmstadt/ukp/dkpro/core/io/text/TokenizedTextWriter.class */
public class TokenizedTextWriter extends JCasFileWriter_ImplBase {
    private static final String TOKEN_SEPARATOR = " ";
    private static final String NUMBER_REPLACEMENT = "NUM";
    private static final String STOPWORD_REPLACEMENT = "STOP";
    public static final String PARAM_TARGET_ENCODING = "targetEncoding";

    @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, defaultValue = {"UTF-8"}, description = "Encoding for the target file. Default is UTF-8.")
    private String targetEncoding;
    public static final String PARAM_FEATURE_PATH = "featurePath";

    @ConfigurationParameter(name = PARAM_FEATURE_PATH, mandatory = true, defaultValue = {"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"}, description = "The feature path, e.g.\nde.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma/value for lemmas. Default:\nde.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token (i.e. token texts).\n<p>\nIn order to specify a different annotation use the annotation class' type name (e.g.\nToken.class.getTypeName()) and optionally append a field, e.g. /value to\nspecify the feature path. If you do not specify a field, the covered text is used.")
    private String featurePath;
    public static final String PARAM_NUMBER_REGEX = "numberRegex";

    @ConfigurationParameter(name = PARAM_NUMBER_REGEX, mandatory = false, description = "All tokens that match this regex are replaced by NUM. Examples:\n<ul>\n<li>^[0-9]+$\n<li>^[0-9,\\.]+$\n<li>^[0-9]+(\\.[0-9]*)?$\n</ul>\n<p>\nMake sure that these regular expressions are fit to the segmentation, e.g. if your work on\ntokens, your tokenizer might split prefixes such as + and - from the rest of the number.")
    private String numberRegex;
    public static final String PARAM_STOPWORDS_FILE = "stopwordsFile";

    @ConfigurationParameter(name = PARAM_STOPWORDS_FILE, mandatory = false, description = "All the tokens listed in this file (one token per line) are replaced by STOP. Empty\nlines and lines starting with # are ignored. Casing is ignored.")
    private File stopwordsFile;
    private Set<String> stopwords;
    private BufferedWriter targetWriter;

    public void initialize(UimaContext uimaContext) throws ResourceInitializationException {
        super.initialize(uimaContext);
        try {
            if (getTargetLocation() == null) {
                getLogger().info("Writing to file <stdout>");
                this.targetWriter = new BufferedWriter(new OutputStreamWriter((OutputStream) new CloseShieldOutputStream(System.out), this.targetEncoding));
            } else {
                getLogger().info("Writing to file " + getTargetLocation());
                this.targetWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(getTargetLocation()), this.targetEncoding));
            }
            try {
                this.stopwords = this.stopwordsFile == null ? Collections.emptySet() : readStopwordsFile(this.stopwordsFile);
            } catch (IOException e) {
                throw new ResourceInitializationException(e);
            }
        } catch (IOException e2) {
            throw new ResourceInitializationException(e2);
        }
    }

    private static Set<String> readStopwordsFile(File file) throws IOException {
        return (Set) Files.readAllLines(file.toPath()).stream().map((v0) -> {
            return v0.trim();
        }).filter(str -> {
            return !str.isEmpty();
        }).filter(str2 -> {
            return !str2.startsWith("#");
        }).map(str3 -> {
            return str3.toLowerCase();
        }).collect(Collectors.toSet());
    }

    public void process(JCas jCas) throws AnalysisEngineProcessException {
        String[] split = this.featurePath.split("/", 2);
        String str = split[0];
        Type type = jCas.getTypeSystem().getType(str);
        if (type == null) {
            throw new IllegalStateException("Type [" + str + "] not found in type system");
        }
        try {
            writeTokens(jCas, type, initFeaturePathInfo(split));
        } catch (FeaturePathException | IOException e) {
            throw new AnalysisEngineProcessException(e);
        }
    }

    private void writeTokens(JCas jCas, Type type, FeaturePathInfo featurePathInfo) throws IOException {
        for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
            getLogger().trace("Sentence: '" + sentence.getCoveredText() + "'.");
            FeaturePathFactory.FeaturePathIterator featurePathIterator = new FeaturePathFactory.FeaturePathIterator(CasUtil.selectCovered(jCas.getCas(), type, sentence).iterator(), featurePathInfo);
            boolean z = true;
            while (true) {
                boolean z2 = z;
                if (featurePathIterator.hasNext()) {
                    String str = (String) featurePathIterator.next().getValue();
                    String str2 = this.stopwords.contains(str.toLowerCase()) ? STOPWORD_REPLACEMENT : str;
                    String str3 = (this.numberRegex == null || !str2.matches(this.numberRegex)) ? str2 : NUMBER_REPLACEMENT;
                    this.targetWriter.write(z2 ? str3 : TOKEN_SEPARATOR + str3);
                    z = false;
                }
            }
            getLogger().trace("End of sentence.");
            this.targetWriter.newLine();
        }
        this.targetWriter.flush();
    }

    private FeaturePathInfo initFeaturePathInfo(String[] strArr) throws FeaturePathException {
        FeaturePathInfo featurePathInfo = new FeaturePathInfo();
        featurePathInfo.initialize(strArr.length > 1 ? strArr[1] : "");
        return featurePathInfo;
    }

    public void collectionProcessComplete() throws AnalysisEngineProcessException {
        IOUtils.closeQuietly(this.targetWriter);
        if (getTargetLocation() == null) {
            getLogger().info("Output written to file <stdout>");
        } else {
            getLogger().info("Output written to file " + getTargetLocation());
        }
        super.collectionProcessComplete();
    }
}
