package edu.stanford.nlp.process;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.objectbank.TokenizerFactory;
import edu.stanford.nlp.objectbank.XMLBeginEndIterator;
import edu.stanford.nlp.pipeline.CleanXmlAnnotator;
import edu.stanford.nlp.util.Function;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/stanford/nlp/process/DocumentPreprocessor.class */
public class DocumentPreprocessor implements Iterable<List<HasWord>> {
    // Character source for the document. Set by the Reader constructors, or opened from
    // inputPath by iterator() whenever it is null. PlainTextIterator closes it and resets it to
    // null once exhausted; XMLIterator swaps in a per-element reader while it runs.
    private Reader inputReader;
    // Location given to the String constructors. getReaderFromPath tries it as a URL first and
    // falls back to a local file.
    private String inputPath;
    // Selects which iterator implementation iterator() returns.
    private DocType docType;
    // Builds the tokenizer over inputReader. When null, PlainTextIterator uses a
    // WhitespaceTokenizer instead.
    private TokenizerFactory<? extends HasWord> tokenizerFactory;
    // Charset name used by getReaderFromPath; null leaves the choice to the reader's default.
    private String encoding;
    // Tokens that end a sentence. Only consulted when sentenceDelimiter is null.
    private String[] sentenceFinalPuncWords;
    // Optional transformation applied to each sentence before it is handed out; null disables it.
    private Function<List<HasWord>, List<HasWord>> escaper;
    // When non-null this is the sole sentence-ending token and follower tokens are disabled.
    // A whitespace-only value additionally makes PTBLexer.NEWLINE_TOKEN a delimiter.
    private String sentenceDelimiter;
    // When non-null, each token's text is split at the last occurrence of this delimiter into
    // a word part and a tag part.
    private String tagDelimiter;
    // Passed to XMLBeginEndIterator together with the reader in XML mode.
    // NOTE(review): presumably the element name/pattern whose content is extracted - confirm.
    private String elementDelimiter;
    // Tokens that may directly follow a sentence delimiter and still belong to the same
    // sentence (closing quotes and brackets in raw and PTB-escaped form).
    private final String[] sentenceFinalFollowers;

    /* loaded from: input_file:edu/stanford/nlp/process/DocumentPreprocessor$DocType.class */
    /**
     * Kind of input document. The enclosing class's {@code iterator()} method chooses its
     * iterator implementation from this value.
     */
    public enum DocType {
        /** Input is read as plain text and split into sentences directly. */
        Plain,

        /** Input is XML; sentences are read from the text of selected elements. */
        XML;
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:edu/stanford/nlp/process/DocumentPreprocessor$PlainTextIterator.class */
    /**
     * Splits the token stream of the enclosing preprocessor's current reader into sentences.
     *
     * <p>Delimiter handling is fixed when the iterator is constructed:
     * <ul>
     *   <li>if a sentence delimiter is configured, it is the only sentence-ending token and no
     *       follower tokens are recognised; a whitespace-only delimiter also makes
     *       {@code PTBLexer.NEWLINE_TOKEN} a delimiter and asks the tokenizer to emit newlines;</li>
     *   <li>otherwise the configured sentence-final punctuation words end a sentence, and any
     *       run of sentence-final follower tokens after them stays in the same sentence.</li>
     * </ul>
     *
     * <p>The iterator looks ahead by at most one token; that token is carried over into the next
     * sentence. When the input is exhausted the enclosing preprocessor's reader is closed and
     * reset to {@code null}.
     */
    public class PlainTextIterator implements Iterator<List<HasWord>> {
        /** Matches tokens consisting solely of whitespace; compiled once instead of per token. */
        private final Pattern whitespaceOnly = Pattern.compile("\\s+");

        private Tokenizer<? extends HasWord> tokenizer;
        private Set<String> sentDelims;
        private Set<String> delimFollowers;
        /** Splits "word<delim>tag" text into its two parts; null when no tag delimiter is set. */
        private Function<String, String[]> splitTag;
        /** The sentence that the next call to {@link #next()} returns, or null if not primed. */
        private List<HasWord> nextSent = null;
        /** Look-ahead token(s) read past the end of the previous sentence. */
        private List<HasWord> nextSentCarryover = new ArrayList<HasWord>();

        public PlainTextIterator() {
            this.delimFollowers = new HashSet<String>(Arrays.asList(DocumentPreprocessor.this.sentenceFinalFollowers));
            this.sentDelims = new HashSet<String>();
            boolean tokenizeNewlines = false;
            if (DocumentPreprocessor.this.sentenceDelimiter != null) {
                this.sentDelims.add(DocumentPreprocessor.this.sentenceDelimiter);
                // An explicit delimiter disables the "closing quote/bracket" follower logic.
                this.delimFollowers = new HashSet<String>();
                tokenizeNewlines = this.whitespaceOnly.matcher(DocumentPreprocessor.this.sentenceDelimiter).matches();
                if (tokenizeNewlines) {
                    this.sentDelims.add(PTBLexer.NEWLINE_TOKEN);
                }
            } else if (DocumentPreprocessor.this.sentenceFinalPuncWords != null) {
                this.sentDelims = new HashSet<String>(Arrays.asList(DocumentPreprocessor.this.sentenceFinalPuncWords));
            }

            if (DocumentPreprocessor.this.tokenizerFactory == null) {
                this.tokenizer = WhitespaceTokenizer.newWordWhitespaceTokenizer(DocumentPreprocessor.this.inputReader, tokenizeNewlines);
            } else if (tokenizeNewlines) {
                this.tokenizer = DocumentPreprocessor.this.tokenizerFactory.getTokenizer(DocumentPreprocessor.this.inputReader, "tokenizeNLs");
            } else {
                this.tokenizer = DocumentPreprocessor.this.tokenizerFactory.getTokenizer(DocumentPreprocessor.this.inputReader);
            }

            if (DocumentPreprocessor.this.tagDelimiter != null) {
                this.splitTag = new Function<String, String[]>() {
                    // Matches the delimiter only where no further delimiter follows, i.e. its
                    // last occurrence.
                    // NOTE(review): the delimiter is inserted into the regex unquoted, so a
                    // delimiter containing regex metacharacters (e.g. "|" or ".") is interpreted
                    // as a pattern. Left unchanged because callers may already pass escaped
                    // delimiters; confirm before switching to Pattern.quote.
                    private final String splitRegex;

                    {
                        this.splitRegex = String.format("%s(?!.*%s)", DocumentPreprocessor.this.tagDelimiter, DocumentPreprocessor.this.tagDelimiter);
                    }

                    @Override
                    public String[] apply(String str) {
                        String[] split = str.trim().split(this.splitRegex);
                        // Anything other than exactly "word" + "tag" is returned unsplit.
                        return split.length == 2 ? split : new String[]{str};
                    }
                };
            }
        }

        /**
         * Reads tokens until one sentence is complete and stores it in {@link #nextSent};
         * leaves {@code nextSent} null (and closes the reader) when no tokens remain.
         */
        private void primeNext() {
            this.nextSent = new ArrayList<HasWord>(this.nextSentCarryover);
            this.nextSentCarryover.clear();
            boolean seenBoundary = false;

            while (this.tokenizer.hasNext()) {
                HasWord token = this.tokenizer.next();
                if (this.splitTag != null) {
                    String[] parts = this.splitTag.apply(token.word());
                    token.setWord(parts[0]);
                    if (parts.length == 2 && (token instanceof HasTag)) {
                        ((HasTag) token).setTag(parts[1]);
                    }
                }
                String word = token.word();

                if (this.sentDelims.contains(word)) {
                    seenBoundary = true;
                } else if (seenBoundary && !this.delimFollowers.contains(word)) {
                    // First token of the following sentence: keep it for the next call.
                    this.nextSentCarryover.add(token);
                    break;
                }

                // Whitespace and newline tokens only act as delimiters; they are never emitted.
                if (!this.whitespaceOnly.matcher(word).matches() && !word.equals(PTBLexer.NEWLINE_TOKEN)) {
                    this.nextSent.add(token);
                }

                // Without follower tokens there is no reason to look ahead: either the sentence
                // is complete, or the delimiter was a dropped whitespace token in front of an
                // empty sentence, in which case reading continues as if no boundary was seen.
                if (seenBoundary && this.delimFollowers.isEmpty()) {
                    if (!this.nextSent.isEmpty()) {
                        break;
                    }
                    seenBoundary = false;
                }
            }

            if (this.nextSent.isEmpty() && this.nextSentCarryover.isEmpty()) {
                IOUtils.closeIgnoringExceptions(DocumentPreprocessor.this.inputReader);
                DocumentPreprocessor.this.inputReader = null;
                this.nextSent = null;
            } else if (DocumentPreprocessor.this.escaper != null) {
                this.nextSent = DocumentPreprocessor.this.escaper.apply(this.nextSent);
            }
        }

        @Override
        public boolean hasNext() {
            if (this.nextSent == null) {
                primeNext();
            }
            return this.nextSent != null;
        }

        /**
         * @return the next sentence as a list of tokens
         * @throws NoSuchElementException if the input is exhausted
         */
        @Override
        public List<HasWord> next() {
            if (this.nextSent == null) {
                primeNext();
            }
            if (this.nextSent == null) {
                throw new NoSuchElementException();
            }
            List<HasWord> sentence = this.nextSent;
            this.nextSent = null;
            return sentence;
        }

        /** Removal is not supported. */
        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:edu/stanford/nlp/process/DocumentPreprocessor$XMLIterator.class */
    /**
     * Iterates over the sentences found inside the XML chunks that an
     * {@link XMLBeginEndIterator} extracts from the enclosing preprocessor's reader.
     *
     * <p>Each extracted chunk is wrapped in its own reader, installed as the preprocessor's
     * current input, and split into sentences by a fresh {@link PlainTextIterator}. The next
     * sentence is always computed eagerly, starting in the constructor.
     */
    public class XMLIterator implements Iterator<List<HasWord>> {
        private final XMLBeginEndIterator<String> xmlItr;
        /** The reader the document was opened with; closed once the chunks run out. */
        private final Reader originalDocReader;
        /** Sentence iterator over the chunk currently being processed. */
        private PlainTextIterator plainItr;
        /** Sentence returned by the next call to {@link #next()}; null when exhausted. */
        private List<HasWord> nextSent;

        public XMLIterator() {
            Reader documentReader = DocumentPreprocessor.this.inputReader;
            this.xmlItr = new XMLBeginEndIterator<String>(documentReader, DocumentPreprocessor.this.elementDelimiter);
            this.originalDocReader = documentReader;
            primeNext();
        }

        /**
         * Advances to the next available sentence, moving on to further chunks as long as the
         * current one yields nothing. Sets {@link #nextSent} to null and closes the original
         * reader when no chunks are left.
         */
        private void primeNext() {
            while (true) {
                if (this.plainItr == null || !this.plainItr.hasNext()) {
                    if (!this.xmlItr.hasNext()) {
                        IOUtils.closeIgnoringExceptions(this.originalDocReader);
                        this.nextSent = null;
                        return;
                    }
                    // Start on the next chunk: PlainTextIterator reads from the enclosing
                    // preprocessor's reader, so the chunk has to be installed there first.
                    String chunk = this.xmlItr.next();
                    DocumentPreprocessor.this.inputReader = new BufferedReader(new StringReader(chunk));
                    this.plainItr = new PlainTextIterator();
                    this.nextSent = this.plainItr.hasNext() ? this.plainItr.next() : null;
                } else {
                    this.nextSent = this.plainItr.next();
                }
                if (this.nextSent != null) {
                    return;
                }
            }
        }

        @Override
        public boolean hasNext() {
            return this.nextSent != null;
        }

        /**
         * @return the next sentence as a list of tokens
         * @throws NoSuchElementException if no sentence is left
         */
        @Override
        public List<HasWord> next() {
            List<HasWord> current = this.nextSent;
            if (current == null) {
                throw new NoSuchElementException();
            }
            primeNext();
            return current;
        }

        /** Removal is not supported. */
        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * Creates a preprocessor over an already-open reader, treating the input as
     * {@link DocType#Plain}.
     *
     * @param reader the character source; must not be null
     */
    public DocumentPreprocessor(Reader reader) {
        this(reader, DocType.Plain);
    }

    /**
     * Creates a preprocessor over an already-open reader.
     *
     * <p>Defaults: PTB tokenization, sentences ending at ".", "?" or "!", no escaper, no explicit
     * sentence or tag delimiter, and {@code CleanXmlAnnotator.DEFAULT_XML_TAGS} as the element
     * delimiter.
     *
     * @param reader the character source; must not be null
     * @param docType how the input is to be interpreted
     * @throws RuntimeException if {@code reader} is null
     */
    public DocumentPreprocessor(Reader reader, DocType docType) {
        if (reader == null) {
            throw new RuntimeException("Cannot read from null object!");
        }

        // Input source.
        this.inputReader = reader;
        this.inputPath = null;
        this.encoding = null;
        this.docType = docType;

        // Tokenization and sentence-splitting defaults.
        this.tokenizerFactory = PTBTokenizer.factory();
        this.escaper = null;
        this.sentenceDelimiter = null;
        this.tagDelimiter = null;
        this.elementDelimiter = CleanXmlAnnotator.DEFAULT_XML_TAGS;
        this.sentenceFinalPuncWords = new String[]{".", "?", "!"};
        this.sentenceFinalFollowers = new String[]{")", "]", "\"", "'", PTBLexer.closedblquote, PTBLexer.closeparen, "-RSB-", PTBLexer.closebrace};
    }

    /**
     * Creates a preprocessor for the document at the given location, treating the input as
     * {@link DocType#Plain}. The location is not opened until {@link #iterator()} is called.
     *
     * @param str URL or file path of the document; must not be null
     */
    public DocumentPreprocessor(String str) {
        this(str, DocType.Plain);
    }

    /**
     * Creates a preprocessor for the document at the given location. Nothing is opened here;
     * the reader is created when {@link #iterator()} is called.
     *
     * <p>Defaults: PTB tokenization, sentences ending at ".", "?" or "!", no escaper, no explicit
     * sentence or tag delimiter, and {@code CleanXmlAnnotator.DEFAULT_XML_TAGS} as the element
     * delimiter.
     *
     * @param str URL or file path of the document; must not be null
     * @param docType how the input is to be interpreted
     * @throws RuntimeException if {@code str} is null
     */
    public DocumentPreprocessor(String str, DocType docType) {
        if (str == null) {
            throw new RuntimeException("Cannot open null document path!");
        }

        // Input source: the reader stays null until iterator() opens the path.
        this.inputReader = null;
        this.inputPath = str;
        this.encoding = null;
        this.docType = docType;

        // Tokenization and sentence-splitting defaults.
        this.tokenizerFactory = PTBTokenizer.factory();
        this.escaper = null;
        this.sentenceDelimiter = null;
        this.tagDelimiter = null;
        this.elementDelimiter = CleanXmlAnnotator.DEFAULT_XML_TAGS;
        this.sentenceFinalPuncWords = new String[]{".", "?", "!"};
        this.sentenceFinalFollowers = new String[]{")", "]", "\"", "'", PTBLexer.closedblquote, PTBLexer.closeparen, "-RSB-", PTBLexer.closebrace};
    }

    /**
     * Sets the character encoding used when the document is opened from a path.
     *
     * <p>A name that this JVM does not support is ignored and the previous setting is kept.
     *
     * @param str the charset name
     * @throws IllegalCharsetNameException if {@code str} is not a legal charset name
     */
    public void setEncoding(String str) throws IllegalCharsetNameException {
        if (!Charset.isSupported(str)) {
            return;
        }
        this.encoding = str;
    }

    /**
     * Replaces the tokens that are treated as sentence-final punctuation. They are only
     * consulted when no sentence delimiter is set, and are read when an iterator is created.
     * The array is stored as given, without copying.
     *
     * @param strArr the sentence-final tokens; null leaves the iterator without delimiters
     */
    public void setSentenceFinalPuncWords(String[] strArr) {
        this.sentenceFinalPuncWords = strArr;
    }

    /**
     * Sets the factory used to build the tokenizer when an iterator is created.
     *
     * @param tokenizerFactory the factory, or null to fall back to whitespace tokenization
     */
    public void setTokenizerFactory(TokenizerFactory<? extends HasWord> tokenizerFactory) {
        this.tokenizerFactory = tokenizerFactory;
    }

    /**
     * Sets a function that is applied to every sentence before it is returned by the iterator.
     *
     * @param function the sentence transformation, or null to return sentences unchanged
     */
    public void setEscaper(Function<List<HasWord>, List<HasWord>> function) {
        this.escaper = function;
    }

    /**
     * Sets an explicit sentence delimiter token. When non-null it replaces the sentence-final
     * punctuation words and disables follower tokens; a whitespace-only delimiter additionally
     * makes the iterator split on newline tokens. Read when an iterator is created.
     *
     * @param str the delimiter token, or null to split on sentence-final punctuation
     */
    public void setSentenceDelimiter(String str) {
        this.sentenceDelimiter = str;
    }

    /**
     * Sets the delimiter that separates a word from its tag inside a token. When non-null, each
     * token is split at the last occurrence of the delimiter; the tag is stored only on tokens
     * that implement {@link HasTag}. Read when an iterator is created.
     *
     * <p>NOTE(review): the value is inserted into a regular expression as-is, so regex
     * metacharacters in it are not treated literally.
     *
     * @param str the delimiter, or null to leave tokens unsplit
     */
    public void setTagDelimiter(String str) {
        this.tagDelimiter = str;
    }

    /**
     * Sets the value handed to {@link XMLBeginEndIterator} when an XML iterator is created.
     * Only relevant for {@link DocType#XML}.
     *
     * @param str the element delimiter
     */
    public void setElementDelimiter(String str) {
        this.elementDelimiter = str;
    }

    /**
     * Returns an iterator over the sentences of the document, opening the configured path first
     * if no reader is currently set.
     *
     * @return a {@link PlainTextIterator} or {@link XMLIterator} depending on the document type;
     *     an empty iterator for any other (including null) document type
     * @throws RuntimeException if the path cannot be opened; the {@link IOException} is attached
     *     as the cause
     */
    @Override
    public Iterator<List<HasWord>> iterator() {
        if (this.inputReader == null) {
            try {
                this.inputReader = getReaderFromPath(this.inputPath);
            } catch (IOException e) {
                System.err.printf("%s: Could not open path %s\n", getClass().getName(), this.inputPath);
                // Fail here with the real cause instead of handing a null reader to the
                // tokenizer and failing later with an unrelated error.
                throw new RuntimeException("Could not open path " + this.inputPath, e);
            }
        }
        if (this.docType == DocType.Plain) {
            return new PlainTextIterator();
        }
        if (this.docType == DocType.XML) {
            return new XMLIterator();
        }
        return emptySentenceIterator();
    }

    /** Returns an iterator with no elements, used when the document type is not recognised. */
    private static Iterator<List<HasWord>> emptySentenceIterator() {
        return new Iterator<List<HasWord>>() {
            @Override
            public boolean hasNext() {
                return false;
            }

            @Override
            public List<HasWord> next() {
                throw new NoSuchElementException();
            }

            @Override
            public void remove() {
                // Consistent with the other iterators of this class: removal is unsupported.
                throw new UnsupportedOperationException();
            }
        };
    }

    /**
     * Opens a reader for the given location, which is tried as a URL first and as a local file
     * path if it is not a well-formed URL.
     *
     * <p>The configured encoding, when set, is applied to both URL and file input; when it is
     * null the readers use their default charset.
     *
     * @param str URL or file path of the document
     * @return a reader over the document's characters
     * @throws IOException if the URL cannot be read, or the file does not exist or cannot be
     *     opened
     */
    private Reader getReaderFromPath(String str) throws IOException {
        try {
            URL url = new URL(str);
            InputStreamReader decoded = this.encoding == null
                    ? new InputStreamReader(url.openConnection().getInputStream())
                    : new InputStreamReader(url.openConnection().getInputStream(), this.encoding);
            return new BufferedReader(decoded);
        } catch (MalformedURLException notAUrl) {
            // Not a URL, so treat the string as a path on the local file system.
            File file = new File(str);
            if (!file.exists()) {
                throw new IOException("Unable to open " + str);
            }
            if (this.encoding == null) {
                return new FileReader(str);
            }
            return new BufferedReader(new InputStreamReader(new FileInputStream(file), this.encoding));
        }
    }

    /**
     * Command-line entry point: splits a document into sentences and prints one sentence per
     * line (tokens separated by single spaces) to standard output.
     *
     * <p>Usage: {@code DocumentPreprocessor filename [OPTS]} with the options
     * <ul>
     *   <li>{@code -xml DELIM} - read the input as XML using the given element delimiter</li>
     *   <li>{@code -encoding NAME} - charset for input and output (default {@code utf-8})</li>
     *   <li>{@code -printSentenceLengths} - print each sentence length to standard error</li>
     *   <li>{@code -suppressEscaping} - PTB tokenization with {@code ptb3Escaping=false}</li>
     *   <li>{@code -tokenizerOptions OPTS} - PTB tokenization with the given options</li>
     *   <li>{@code -noTokenization} - whitespace tokenization, one sentence per line</li>
     *   <li>{@code -tag DELIM} - split tokens into word and tag at the given delimiter</li>
     * </ul>
     * Options are order-independent. An option whose required value is missing is reported as
     * an unknown option and skipped.
     *
     * @param strArr the command-line arguments
     * @throws IOException if the output encoding is not supported
     */
    public static void main(String[] strArr) throws IOException {
        if (strArr.length < 1) {
            System.err.println("usage: DocumentPreprocessor filename [OPTS]");
            return;
        }
        DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(strArr[0]);
        String encodingName = "utf-8";
        boolean printSentenceLengths = false;

        for (int i = 1; i < strArr.length; i++) {
            String option = strArr[i];
            boolean hasValue = i + 1 < strArr.length;
            if (option.equals("-xml") && hasValue) {
                // Switch the existing instance to XML mode rather than building a new one, so
                // that options given before -xml are not lost.
                documentPreprocessor.docType = DocType.XML;
                documentPreprocessor.setElementDelimiter(strArr[++i]);
            } else if (option.equals("-encoding") && hasValue) {
                encodingName = strArr[++i];
            } else if (option.equals("-printSentenceLengths")) {
                printSentenceLengths = true;
            } else if (option.equals("-suppressEscaping")) {
                documentPreprocessor.setTokenizerFactory(PTBTokenizer.factory(new WordTokenFactory(), "ptb3Escaping=false"));
            } else if (option.equals("-tokenizerOptions") && hasValue) {
                documentPreprocessor.setTokenizerFactory(PTBTokenizer.factory(new WordTokenFactory(), strArr[++i]));
            } else if (option.equals("-noTokenization")) {
                documentPreprocessor.setTokenizerFactory(null);
                documentPreprocessor.setSentenceDelimiter(System.getProperty("line.separator"));
            } else if (option.equals("-tag") && hasValue) {
                documentPreprocessor.setTagDelimiter(strArr[++i]);
            } else {
                System.err.println("Unknown option: " + option);
            }
        }

        documentPreprocessor.setEncoding(encodingName);
        int sentenceCount = 0;
        PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(System.out, encodingName), true);
        try {
            for (List<HasWord> sentence : documentPreprocessor) {
                sentenceCount++;
                if (printSentenceLengths) {
                    System.err.println("Length:\t" + sentence.size());
                }
                boolean first = true;
                for (HasWord token : sentence) {
                    if (!first) {
                        printWriter.print(" ");
                    }
                    first = false;
                    printWriter.print(token.word());
                }
                printWriter.println();
            }
        } finally {
            // Flush whatever was produced even if reading fails part-way through.
            printWriter.close();
        }
        System.err.println("Read in " + sentenceCount + " sentences.");
    }
}
