/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.process;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.Document;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.MultiTokenTag;
import edu.stanford.nlp.ling.tokensregex.SequenceMatcher;
import edu.stanford.nlp.ling.tokensregex.SequencePattern;
import edu.stanford.nlp.process.ListProcessor;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.logging.Redwood;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Groups a flat list of tokens into a list of sentences.
 *
 * <p>Tokens may be {@link HasWord} instances, plain {@link String}s or {@link CoreMap}s; the text
 * of each token is compared against a set of configurable patterns to decide where a sentence
 * ends. The main signals, as implemented in {@link #wordsToSentences(List)}, are:
 * <ul>
 *   <li>tokens whose text fully matches the boundary token regex (kept, end the sentence);</li>
 *   <li>tokens that are the last token of a multi-token boundary pattern match (kept, end the
 *       sentence);</li>
 *   <li>tokens whose text is in the "boundary to discard" set, e.g. newline tokens (never kept;
 *       whether they end the sentence depends on {@link NewlineIsSentenceBreak});</li>
 *   <li>XML break elements and region end tags (never kept, end the sentence);</li>
 *   <li>tokens carrying a true {@code ForcedSentenceEndAnnotation} (end the sentence).</li>
 * </ul>
 *
 * @param <IN> the token type
 */
public class WordToSentenceProcessor<IN>
implements ListProcessor<IN, List<IN>> {
    /** Logging channel for this class (not used by the code visible in this class). */
    private static final Redwood.RedwoodChannels log = Redwood.channels(WordToSentenceProcessor.class);

    /** Default regex for sentence-final tokens: a single period, or a run of '!' / '?'. */
    public static final String DEFAULT_BOUNDARY_REGEX = "\\.|[!?]+";

    /**
     * Default regex for tokens that, when they directly follow a sentence end, are attached to the
     * sentence that just ended (closing punctuation, quotes, and {@code -RRB-}-style bracket tokens).
     */
    public static final String DEFAULT_BOUNDARY_FOLLOWERS_REGEX = "[\\p{Pe}\\p{Pf}\"'>\uff02\uff07\uff1e]|''|-R[CRS]B-";

    /** Default set of token texts that act as (discarded) sentence boundaries: newline tokens. */
    public static final Set<String> DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD = Collections.unmodifiableSet(Generics.newHashSet(Arrays.asList("\n", "*NL*")));

    /** Debug switch; currently not read anywhere in this class. */
    private static final boolean DEBUG = false;

    /** Pattern a token's text must fully match to count as a sentence-final token. */
    private final Pattern sentenceBoundaryTokenPattern;
    /** Optional multi-token pattern; the last token of each match ends a sentence. May be null. */
    private final SequencePattern<? super IN> sentenceBoundaryMultiTokenPattern;
    /** Pattern for tokens that are appended to the previous sentence when they directly follow it. */
    private final Pattern sentenceBoundaryFollowersPattern;
    /** Token texts that are never emitted and may end a sentence depending on the newline policy. */
    private final Set<String> sentenceBoundaryToDiscard;
    /** Patterns for XML tags that end a sentence and are dropped; null when none are configured. */
    private final List<Pattern> xmlBreakElementsToDiscard;
    /** Patterns for tokens that are dropped from the output; null when none are configured. */
    private final List<Pattern> tokenPatternsToDiscard;
    /** Opening tag of a region to process; null when the whole input is processed. */
    private final Pattern sentenceRegionBeginPattern;
    /** Closing tag of a region to process; null when the whole input is processed. */
    private final Pattern sentenceRegionEndPattern;
    /** Policy deciding whether discarded boundary (newline) tokens end a sentence. */
    private final NewlineIsSentenceBreak newlineIsSentenceBreak;
    /** When true, {@link #process(List)} returns the whole input as a single sentence. */
    private final boolean isOneSentence;
    /** When true, a sentence break is recorded even if the current sentence has no tokens. */
    private final boolean allowEmptySentences;

    /**
     * Parses a textual newline policy.
     *
     * @param name {@code "always"}, {@code "never"}, or any string containing {@code "two"}
     * @return the corresponding policy
     * @throws IllegalArgumentException if {@code name} is null or not one of the accepted forms
     */
    public static NewlineIsSentenceBreak stringToNewlineIsSentenceBreak(String name) {
        if ("always".equals(name)) {
            return NewlineIsSentenceBreak.ALWAYS;
        }
        if ("never".equals(name)) {
            return NewlineIsSentenceBreak.NEVER;
        }
        if (name != null && name.contains("two")) {
            return NewlineIsSentenceBreak.TWO_CONSECUTIVE;
        }
        throw new IllegalArgumentException("Not a valid NewlineIsSentenceBreak name: '" + name + "' (should be one of 'always', 'never', 'two')");
    }

    /**
     * Returns true if {@code o} is a {@link CoreMap} whose {@code ForcedSentenceEndAnnotation} is
     * present and true; false for every other object.
     */
    private static boolean isForcedEndToken(Object o) {
        if (!(o instanceof CoreMap)) {
            return false;
        }
        Boolean forcedEndValue = (Boolean) ((CoreMap) o).get(CoreAnnotations.ForcedSentenceEndAnnotation.class);
        return Boolean.TRUE.equals(forcedEndValue);
    }

    /**
     * Extracts the text of a token.
     *
     * @param o a {@link HasWord}, a {@link String}, or a {@link CoreMap} (checked in that order)
     * @return the word, the string itself, or the map's {@code TextAnnotation} value
     * @throws RuntimeException if {@code o} is none of the supported types
     */
    private static String getString(Object o) {
        if (o instanceof HasWord) {
            return ((HasWord) o).word();
        }
        if (o instanceof String) {
            return (String) o;
        }
        if (o instanceof CoreMap) {
            return (String) ((CoreMap) o).get(CoreAnnotations.TextAnnotation.class);
        }
        throw new RuntimeException("Expected token to be either Word or String.");
    }

    /** Returns true if {@code word} is fully matched by at least one of {@code patterns}. */
    private static boolean matches(List<Pattern> patterns, String word) {
        for (Pattern p : patterns) {
            if (p.matcher(word).matches()) {
                return true;
            }
        }
        return false;
    }

    /** Full-match test against the XML break element patterns (which must be non-null). */
    private boolean matchesXmlBreakElementToDiscard(String word) {
        return WordToSentenceProcessor.matches(this.xmlBreakElementsToDiscard, word);
    }

    /** Full-match test against the discard token patterns (which must be non-null). */
    private boolean matchesTokenPatternsToDiscard(String word) {
        return WordToSentenceProcessor.matches(this.tokenPatternsToDiscard, word);
    }

    /**
     * Splits {@code words} into sentences.
     *
     * @param words the tokens to split
     * @return a one-element list holding a copy of {@code words} when this processor was built with
     *     {@code isOneSentence == true}; otherwise the result of {@link #wordsToSentences(List)}
     */
    @Override
    public List<List<IN>> process(List<? extends IN> words) {
        if (this.isOneSentence) {
            List<List<IN>> sentences = new ArrayList<List<IN>>();
            sentences.add(new ArrayList<IN>(words));
            return sentences;
        }
        return this.wordsToSentences(words);
    }

    /**
     * Splits {@code words} into sentences using the configured boundary rules, ignoring the
     * {@code isOneSentence} setting.
     *
     * @param words the tokens to split; each must be a {@link HasWord}, {@link String} or
     *     {@link CoreMap}
     * @return the sentences, in input order; trailing tokens without a terminator form a final
     *     sentence
     * @throws RuntimeException if a token is of an unsupported type
     */
    public List<List<IN>> wordsToSentences(List<? extends IN> words) {
        // Identity-keyed so that only the exact token objects returned by the matcher are marked,
        // not other tokens that merely compare equal. Stays null when no multi-token pattern is set.
        IdentityHashMap<Object, Boolean> isSentenceBoundary = null;
        if (this.sentenceBoundaryMultiTokenPattern != null) {
            isSentenceBoundary = new IdentityHashMap<Object, Boolean>();
            SequenceMatcher<?> matcher = this.sentenceBoundaryMultiTokenPattern.getMatcher(words);
            while (matcher.find()) {
                List<?> nodes = matcher.groupNodes();
                if (nodes == null || nodes.isEmpty()) {
                    continue;
                }
                // The last token of each match is the one that closes the sentence.
                isSentenceBoundary.put(nodes.get(nodes.size() - 1), Boolean.TRUE);
            }
        }

        List<List<IN>> sentences = new ArrayList<List<IN>>();
        List<IN> currentSentence = new ArrayList<IN>();
        List<IN> lastSentence = null;
        boolean insideRegion = false;
        boolean inWaitForForcedEnd = false;
        boolean lastTokenWasNewline = false;

        for (IN o : words) {
            String word = WordToSentenceProcessor.getString(o);
            boolean forcedEnd = WordToSentenceProcessor.isForcedEndToken(o);
            boolean inMultiTokenExpr = false;

            if (forcedEnd) {
                // A forced end always terminates a pending "forced until end" stretch, no matter
                // which branch below ends up handling the token. Previously the flag was only
                // cleared when the token matched no other boundary rule, so a forced-end "." or
                // newline left the flag set and glued all following tokens into one sentence.
                inWaitForForcedEnd = false;
            } else if (o instanceof CoreMap) {
                CoreMap cm = (CoreMap) o;
                Boolean forcedUntilEndValue = (Boolean) cm.get(CoreAnnotations.ForcedSentenceUntilEndAnnotation.class);
                if (forcedUntilEndValue != null && forcedUntilEndValue.booleanValue()) {
                    inWaitForForcedEnd = true;
                } else {
                    MultiTokenTag mt = (MultiTokenTag) cm.get(CoreAnnotations.MentionTokenAnnotation.class);
                    // Inside (but not at the end of) a multi-token mention: do not break here.
                    if (mt != null && !mt.isEnd()) {
                        inMultiTokenExpr = true;
                    }
                }
            }

            boolean discardToken = this.tokenPatternsToDiscard != null && this.matchesTokenPatternsToDiscard(word);

            // Region mode: everything outside a region is dropped, including the opening tag.
            if (this.sentenceRegionBeginPattern != null && !insideRegion) {
                if (this.sentenceRegionBeginPattern.matcher(word).matches()) {
                    insideRegion = true;
                }
                lastTokenWasNewline = false;
                continue;
            }

            // A follower token (e.g. a closing quote) directly after a finished sentence belongs
            // to that sentence rather than starting a new one.
            if (lastSentence != null && currentSentence.isEmpty() && this.sentenceBoundaryFollowersPattern.matcher(word).matches()) {
                if (!discardToken) {
                    lastSentence.add(o);
                }
                lastTokenWasNewline = false;
                continue;
            }

            boolean newSent = false;
            if (inWaitForForcedEnd && !forcedEnd) {
                // Waiting for a forced end: keep accumulating, never break.
                if (!discardToken) {
                    currentSentence.add(o);
                }
            } else if (inMultiTokenExpr && !forcedEnd) {
                if (!discardToken) {
                    currentSentence.add(o);
                }
            } else if (this.sentenceBoundaryToDiscard.contains(word)) {
                // Discarded boundary (newline) tokens are never added to a sentence.
                if (forcedEnd) {
                    // An explicit forced end wins over the newline policy.
                    newSent = true;
                } else if (this.newlineIsSentenceBreak == NewlineIsSentenceBreak.ALWAYS) {
                    newSent = true;
                } else if (this.newlineIsSentenceBreak == NewlineIsSentenceBreak.TWO_CONSECUTIVE && lastTokenWasNewline) {
                    newSent = true;
                }
                lastTokenWasNewline = true;
            } else {
                lastTokenWasNewline = false;
                if (this.xmlBreakElementsToDiscard != null && this.matchesXmlBreakElementToDiscard(word)) {
                    // XML break tag: ends the sentence, tag itself is dropped.
                    newSent = true;
                } else if (this.sentenceRegionEndPattern != null && this.sentenceRegionEndPattern.matcher(word).matches()) {
                    // Region closing tag: leaves the region and ends the sentence, tag is dropped.
                    insideRegion = false;
                    newSent = true;
                } else {
                    boolean markedBoundary = isSentenceBoundary != null && Boolean.TRUE.equals(isSentenceBoundary.get(o));
                    if (markedBoundary || this.sentenceBoundaryTokenPattern.matcher(word).matches() || forcedEnd) {
                        newSent = true;
                    }
                    // Boundary tokens and ordinary tokens alike are kept unless explicitly discarded.
                    if (!discardToken) {
                        currentSentence.add(o);
                    }
                }
            }

            if (!newSent || (currentSentence.isEmpty() && !this.allowEmptySentences)) {
                continue;
            }
            sentences.add(currentSentence);
            lastSentence = currentSentence;
            currentSentence = new ArrayList<IN>();
        }

        // Keep trailing tokens even if the input does not end with a sentence terminator.
        if (!currentSentence.isEmpty()) {
            sentences.add(currentSentence);
        }
        return sentences;
    }

    /**
     * Splits the tokens of {@code in} into sentences and stores them in a blank document obtained
     * from {@code in.blankDocument()}.
     *
     * @param in the document whose tokens are split
     * @return the blank document filled with the sentences produced by {@link #process(List)}
     */
    // NOTE(review): the raw Document type is kept because the generic signature of
    // blankDocument() is not visible from this file; confirm before parameterizing.
    @SuppressWarnings({"rawtypes", "unchecked"})
    public <L, F> Document<L, F, List<IN>> processDocument(Document<L, F, IN> in) {
        Document doc = in.blankDocument();
        doc.addAll(this.process(in));
        return doc;
    }

    /**
     * Creates a processor with the default boundary regex, the
     * {@link NewlineIsSentenceBreak#TWO_CONSECUTIVE} newline policy, and sentence splitting enabled.
     */
    public WordToSentenceProcessor() {
        this(false);
    }

    /**
     * Creates a processor with the default boundary regex and the given newline policy.
     *
     * @param newlineIsSentenceBreak how discarded boundary (newline) tokens affect sentence breaks
     */
    public WordToSentenceProcessor(NewlineIsSentenceBreak newlineIsSentenceBreak) {
        this(DEFAULT_BOUNDARY_REGEX, newlineIsSentenceBreak, false);
    }

    /**
     * Creates a processor with the default boundary regex and the
     * {@link NewlineIsSentenceBreak#TWO_CONSECUTIVE} newline policy.
     *
     * @param isOneSentence if true, {@link #process(List)} returns the whole input as one sentence
     */
    public WordToSentenceProcessor(boolean isOneSentence) {
        this(DEFAULT_BOUNDARY_REGEX, NewlineIsSentenceBreak.TWO_CONSECUTIVE, isOneSentence);
    }

    /**
     * Creates a processor that breaks on the given discarded boundary tokens: the boundary and
     * follower regexes are the empty pattern, the newline policy is
     * {@link NewlineIsSentenceBreak#ALWAYS}, and empty sentences are allowed.
     *
     * @param boundaryToDiscard token texts that end a sentence and are dropped; must not be null
     */
    public WordToSentenceProcessor(Set<String> boundaryToDiscard) {
        this("", "", boundaryToDiscard, null, null, NewlineIsSentenceBreak.ALWAYS, null, null, false, true);
    }

    /**
     * Creates a processor with a custom boundary token regex and the default follower regex and
     * discarded boundary set.
     *
     * @param boundaryTokenRegex regex a token must fully match to end a sentence; must not be null
     * @param newlineIsSentenceBreak how discarded boundary (newline) tokens affect sentence breaks
     * @param isOneSentence if true, {@link #process(List)} returns the whole input as one sentence
     */
    public WordToSentenceProcessor(String boundaryTokenRegex, NewlineIsSentenceBreak newlineIsSentenceBreak, boolean isOneSentence) {
        this(boundaryTokenRegex, DEFAULT_BOUNDARY_FOLLOWERS_REGEX, DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD, null, null, newlineIsSentenceBreak, null, null, isOneSentence, false);
    }

    /**
     * Creates a processor where null arguments fall back to defaults. No region regex is used,
     * sentence splitting is enabled, and empty sentences are not allowed.
     *
     * @param boundaryTokenRegex boundary token regex, or null for {@link #DEFAULT_BOUNDARY_REGEX}
     * @param boundaryFollowersRegex follower regex, or null for
     *     {@link #DEFAULT_BOUNDARY_FOLLOWERS_REGEX}
     * @param boundaryToDiscard discarded boundary tokens; null or empty selects
     *     {@link #DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD}
     * @param xmlBreakElementsToDiscard XML element name regexes that break sentences, or null for none
     * @param newlineIsSentenceBreak how discarded boundary (newline) tokens affect sentence breaks
     * @param sentenceBoundaryMultiTokenPattern multi-token boundary pattern, or null for none
     * @param tokenRegexesToDiscard regexes of tokens to drop from the output, or null for none
     */
    public WordToSentenceProcessor(String boundaryTokenRegex, String boundaryFollowersRegex, Set<String> boundaryToDiscard, Set<String> xmlBreakElementsToDiscard, NewlineIsSentenceBreak newlineIsSentenceBreak, SequencePattern<? super IN> sentenceBoundaryMultiTokenPattern, Set<String> tokenRegexesToDiscard) {
        this(boundaryTokenRegex == null ? DEFAULT_BOUNDARY_REGEX : boundaryTokenRegex,
                boundaryFollowersRegex == null ? DEFAULT_BOUNDARY_FOLLOWERS_REGEX : boundaryFollowersRegex,
                boundaryToDiscard == null || boundaryToDiscard.isEmpty() ? DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD : boundaryToDiscard,
                xmlBreakElementsToDiscard == null ? Collections.<String>emptySet() : xmlBreakElementsToDiscard,
                null,
                newlineIsSentenceBreak,
                sentenceBoundaryMultiTokenPattern,
                tokenRegexesToDiscard,
                false,
                false);
    }

    /**
     * Fully configurable constructor; no defaults are substituted for null arguments here.
     *
     * @param boundaryTokenRegex regex a token must fully match to end a sentence; must not be null
     * @param boundaryFollowersRegex regex for tokens attached to the preceding sentence when they
     *     directly follow it; must not be null
     * @param boundariesToDiscard token texts that are dropped and may end a sentence; must not be null
     * @param xmlBreakElementsToDiscard XML element name regexes; matching opening, closing or
     *     self-closing tags (case-insensitively) end a sentence and are dropped. Null or empty
     *     disables this rule
     * @param regionElementRegex XML element name regex; when non-null only tokens between the
     *     matching opening and closing tags are processed
     * @param newlineIsSentenceBreak how discarded boundary (newline) tokens affect sentence breaks
     * @param sentenceBoundaryMultiTokenPattern multi-token boundary pattern, or null for none
     * @param tokenRegexesToDiscard regexes of tokens to drop from the output, or null for none
     * @param isOneSentence if true, {@link #process(List)} returns the whole input as one sentence
     * @param allowEmptySentences if true, a break is recorded even when the current sentence is empty
     * @throws NullPointerException if a required argument is null
     * @throws java.util.regex.PatternSyntaxException if any regex is invalid
     */
    public WordToSentenceProcessor(String boundaryTokenRegex, String boundaryFollowersRegex, Set<String> boundariesToDiscard, Set<String> xmlBreakElementsToDiscard, String regionElementRegex, NewlineIsSentenceBreak newlineIsSentenceBreak, SequencePattern<? super IN> sentenceBoundaryMultiTokenPattern, Set<String> tokenRegexesToDiscard, boolean isOneSentence, boolean allowEmptySentences) {
        this.sentenceBoundaryTokenPattern = Pattern.compile(boundaryTokenRegex);
        this.sentenceBoundaryFollowersPattern = Pattern.compile(boundaryFollowersRegex);
        this.sentenceBoundaryToDiscard = Collections.unmodifiableSet(boundariesToDiscard);

        if (xmlBreakElementsToDiscard == null || xmlBreakElementsToDiscard.isEmpty()) {
            this.xmlBreakElementsToDiscard = null;
        } else {
            List<Pattern> xmlPatterns = new ArrayList<Pattern>(xmlBreakElementsToDiscard.size());
            for (String s : xmlBreakElementsToDiscard) {
                // Matches <name>, </name>, <name/> and <name attr="...">, with optional whitespace.
                String regex = "<\\s*(?:/\\s*)?(?:" + s + ")(?:\\s+[^>]+?|\\s*(?:/\\s*)?)>";
                xmlPatterns.add(Pattern.compile(regex, Pattern.CASE_INSENSITIVE));
            }
            this.xmlBreakElementsToDiscard = xmlPatterns;
        }

        if (regionElementRegex != null) {
            this.sentenceRegionBeginPattern = Pattern.compile("<\\s*(?:" + regionElementRegex + ")(?:\\s+[^>]+?)?>");
            this.sentenceRegionEndPattern = Pattern.compile("<\\s*/\\s*(?:" + regionElementRegex + ")\\s*>");
        } else {
            this.sentenceRegionBeginPattern = null;
            this.sentenceRegionEndPattern = null;
        }

        this.newlineIsSentenceBreak = newlineIsSentenceBreak;
        this.sentenceBoundaryMultiTokenPattern = sentenceBoundaryMultiTokenPattern;

        if (tokenRegexesToDiscard != null) {
            List<Pattern> discardPatterns = new ArrayList<Pattern>(tokenRegexesToDiscard.size());
            for (String s : tokenRegexesToDiscard) {
                discardPatterns.add(Pattern.compile(s));
            }
            this.tokenPatternsToDiscard = discardPatterns;
        } else {
            this.tokenPatternsToDiscard = null;
        }

        this.isOneSentence = isOneSentence;
        this.allowEmptySentences = allowEmptySentences;
    }

    /** Policy for whether a discarded boundary (newline) token ends the current sentence. */
    public static enum NewlineIsSentenceBreak {
        /** Newline tokens never end a sentence by themselves. */
        NEVER,
        /** Every newline token ends the sentence. */
        ALWAYS,
        /** A newline token ends the sentence only when the previous token was also a newline. */
        TWO_CONSECUTIVE;

    }
}

