package com.googlecode.clearnlp.run;

import com.googlecode.clearnlp.engine.EngineGetter;
import com.googlecode.clearnlp.reader.AbstractColumnReader;
import com.googlecode.clearnlp.reader.AbstractReader;
import com.googlecode.clearnlp.segmentation.AbstractSegmenter;
import com.googlecode.clearnlp.tokenization.AbstractTokenizer;
import com.googlecode.clearnlp.util.UTArray;
import com.googlecode.clearnlp.util.UTInput;
import com.googlecode.clearnlp.util.UTOutput;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Iterator;
import java.util.List;
import org.kohsuke.args4j.Option;

/* loaded from: input_file:com/googlecode/clearnlp/run/Tokenizer.class */
/**
 * Command-line runner that tokenizes every input file selected by the
 * {@code -i}/{@code -ie} options and writes the result next to it using the
 * {@code -oe} output extension.
 *
 * <p>When the input format equals {@link AbstractReader#TYPE_RAW} a segmenter is
 * created and each file is split into sentences; otherwise every input line is
 * tokenized on its own.</p>
 */
public class Tokenizer extends AbstractRun {

    /** Input path ({@code -i}); handed to {@code getFilenames} to select the files to process. */
    @Option(name = "-i", usage = "input path (required)", required = true, metaVar = "<filepath>")
    private String s_inputPath;

    /** Dictionary file ({@code -d}) passed to {@link EngineGetter#getTokenizer}. */
    @Option(name = "-d", usage = "name of a dictionary file (required)", required = true, metaVar = "<filename>")
    private String s_dictFile;

    /** Twit flag ({@code -twit}); forwarded to the tokenizer through {@code setTwit}. */
    @Option(name = "-twit", usage = "if set, tokenize for twits", required = false, metaVar = "<boolean>")
    protected boolean b_twit;

    /** Input file extension pattern ({@code -ie}); handed to {@code getFilenames}. */
    @Option(name = "-ie", usage = "input file extension (default: .*)", required = false, metaVar = "<regex>")
    private String s_inputExt = ".*";

    /** Output file extension ({@code -oe}); handed to {@code getFilenames}. */
    @Option(name = "-oe", usage = "output file extension (default: tok)", required = false, metaVar = "<string>")
    private String s_outputExt = "tok";

    /** Language ({@code -l}) used to select the tokenizer and the segmenter. */
    @Option(name = "-l", usage = "language (default: en)", required = false, metaVar = "<language>")
    private String s_language = AbstractReader.LANG_EN;

    /** Input format ({@code -if}); a segmenter is used only when it equals {@link AbstractReader#TYPE_RAW}. */
    @Option(name = "-if", usage = "input format (default: raw)", required = false, metaVar = "<string>")
    private String i_format = AbstractReader.TYPE_RAW;

    /** Output format ({@code -of}); {@link AbstractReader#TYPE_LINE} selects one space-joined sentence per line. */
    @Option(name = "-of", usage = "output format (default: line)", required = false, metaVar = "<string>")
    private String o_format = AbstractReader.TYPE_LINE;

    /** Creates an instance without parsing arguments or processing any file. */
    public Tokenizer() {
    }

    /**
     * Parses the command-line arguments and tokenizes every selected file.
     *
     * <p>The first {@link IOException} raised while processing a file is printed
     * to standard error and stops the run; files after it are not processed.</p>
     *
     * @param args command-line arguments, parsed by {@code initArgs}
     */
    public Tokenizer(String[] args) {
        initArgs(args);
        AbstractTokenizer tokenizer = EngineGetter.getTokenizer(this.s_language, this.s_dictFile);
        // Sentence segmentation is applied to raw input only; for any other
        // input format each line is tokenized independently.
        AbstractSegmenter segmenter = this.i_format.equals(AbstractReader.TYPE_RAW) ? EngineGetter.getSegmenter(this.s_language, tokenizer) : null;
        List<String[]> filenames = getFilenames(this.s_inputPath, this.s_inputExt, this.s_outputExt);
        boolean lineFormat = this.o_format.equals(AbstractReader.TYPE_LINE);
        tokenizer.setTwit(this.b_twit);
        try {
            for (String[] io : filenames) {
                // io[0] is used as the input file, io[1] as the output file.
                System.out.println(io[0]);
                tokenize(tokenizer, segmenter, io[0], io[1], lineFormat);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Tokenizes one file and writes the tokens to another.
     *
     * <p>Both the reader and the output stream are closed before this method
     * returns, including when an exception is thrown part-way through.</p>
     *
     * @param tokenizer  tokenizer applied to each line when {@code segmenter} is {@code null}
     * @param segmenter  sentence segmenter applied to the whole file, or {@code null}
     *                   to tokenize the input line by line
     * @param inputFile  path of the file to read
     * @param outputFile path of the file to write
     * @param lineFormat {@code true} to write each token list as one space-joined line;
     *                   {@code false} to join the tokens with
     *                   {@link AbstractColumnReader#DELIM_SENTENCE} and append one more
     *                   such delimiter
     * @throws IOException if reading the input or closing the reader fails
     */
    public void tokenize(AbstractTokenizer tokenizer, AbstractSegmenter segmenter, String inputFile, String outputFile, boolean lineFormat) throws IOException {
        BufferedReader reader = UTInput.createBufferedFileReader(inputFile);
        try {
            PrintStream out = UTOutput.createPrintBufferedFileStream(outputFile);
            try {
                if (segmenter == null) {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        print(out, tokenizer.getTokens(line), lineFormat);
                    }
                } else {
                    for (List<String> sentence : segmenter.getSentences(reader)) {
                        print(out, sentence, lineFormat);
                    }
                }
            } finally {
                // Closing also flushes whatever was buffered before a failure.
                out.close();
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Writes one token list to the output stream.
     *
     * @param printStream destination stream
     * @param list        tokens to write
     * @param z           {@code true} for a single space-joined line; {@code false} to join
     *                    with {@link AbstractColumnReader#DELIM_SENTENCE} and append one more
     *                    such delimiter before the line terminator
     */
    private void print(PrintStream printStream, List<String> list, boolean z) {
        if (z) {
            printStream.println(UTArray.join(list, " "));
        } else {
            printStream.println(UTArray.join(list, AbstractColumnReader.DELIM_SENTENCE) + AbstractColumnReader.DELIM_SENTENCE);
        }
    }

    /**
     * Command-line entry point; all work happens in the constructor.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args) {
        new Tokenizer(args);
    }
}
