package edu.stanford.nlp.process;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.Document;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.MultiTokenTag;
import edu.stanford.nlp.ling.tokensregex.SequenceMatcher;
import edu.stanford.nlp.ling.tokensregex.SequencePattern;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.logging.Redwood;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/stanford/nlp/process/WordToSentenceProcessor.class */
/**
 * Splits a flat list of tokens into a list of sentences (each sentence being a list of tokens).
 *
 * <p>Tokens may be {@link HasWord} instances, plain {@link String}s or {@link CoreMap}s; their
 * text is obtained through {@link #getString(Object)}. A sentence is closed when a token
 * matches the boundary token regex, is the last token of a multi-token boundary match, carries
 * a true {@code ForcedSentenceEndAnnotation}, matches an XML break element, matches the region
 * end tag, or is a discarded boundary token that the configured {@link NewlineIsSentenceBreak}
 * policy treats as a break. Tokens matching the boundary-followers regex that appear directly
 * after a closed sentence are appended to that sentence instead of starting a new one.
 *
 * @param <IN> the token type
 */
public class WordToSentenceProcessor<IN> implements ListProcessor<IN, List<IN>> {
    /** Default regex for sentence-final tokens: a single period, or a run of '!' and '?'. */
    public static final String DEFAULT_BOUNDARY_REGEX = "\\.|[!?]+";
    /**
     * Default regex for tokens that are attached to the preceding sentence when they directly
     * follow a boundary: closing/final punctuation (Unicode categories Pe and Pf), straight and
     * fullwidth quotes and '>', two apostrophes, or the bracket tokens -RCB-, -RRB-, -RSB-.
     */
    public static final String DEFAULT_BOUNDARY_FOLLOWERS_REGEX = "[\\p{Pe}\\p{Pf}\"'>＂＇＞]|''|-R[CRS]B-";
    private static final boolean DEBUG = false;
    /** Matches single tokens that end a sentence and are kept as its last token. */
    private final Pattern sentenceBoundaryTokenPattern;
    /** Optional token-sequence pattern; the last token of each match ends a sentence. May be null. */
    private final SequencePattern<? super IN> sentenceBoundaryMultiTokenPattern;
    /** Matches tokens that get appended to the previous sentence when they directly follow it. */
    private final Pattern sentenceBoundaryFollowersPattern;
    /** Token strings subject to the {@link NewlineIsSentenceBreak} policy; not kept on the normal path. */
    private final Set<String> sentenceBoundaryToDiscard;
    /** Compiled XML element patterns that force a break and are dropped; null when none were given. */
    private final List<Pattern> xmlBreakElementsToDiscard;
    /** Compiled patterns for tokens that are never added to any sentence; null when none were given. */
    private final List<Pattern> tokenPatternsToDiscard;
    /** Opening-tag pattern of the region to process; null when the whole input is processed. */
    private final Pattern sentenceRegionBeginPattern;
    /** Closing-tag pattern of the region to process; null exactly when the begin pattern is null. */
    private final Pattern sentenceRegionEndPattern;
    private final NewlineIsSentenceBreak newlineIsSentenceBreak;
    /** When true, {@link #process(List)} returns the whole input as one sentence. */
    private final boolean isOneSentence;
    /** When true, a boundary reached with no pending tokens still emits an (empty) sentence. */
    private final boolean allowEmptySentences;
    private static final Redwood.RedwoodChannels log = Redwood.channels(WordToSentenceProcessor.class);
    /** Default discarded boundary tokens: the literal newline and the tokenizer's newline token. */
    public static final Set<String> DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD = Collections.unmodifiableSet(Generics.newHashSet(Arrays.asList("\n", PTBTokenizer.getNewlineToken())));

    /* loaded from: input_file:edu/stanford/nlp/process/WordToSentenceProcessor$NewlineIsSentenceBreak.class */
    /**
     * Policy for tokens contained in the discarded-boundary set: {@code NEVER} never breaks on
     * them, {@code ALWAYS} breaks on each one, and {@code TWO_CONSECUTIVE} breaks only when the
     * previously handled token was also such a boundary token.
     */
    public enum NewlineIsSentenceBreak {
        NEVER,
        ALWAYS,
        TWO_CONSECUTIVE
    }

    /**
     * Parses a policy name.
     *
     * @param name {@code "always"}, {@code "never"}, or any string containing {@code "two"}
     * @return the corresponding policy
     * @throws IllegalArgumentException if {@code name} is null or none of the above
     */
    public static NewlineIsSentenceBreak stringToNewlineIsSentenceBreak(String name) {
        if ("always".equals(name)) {
            return NewlineIsSentenceBreak.ALWAYS;
        }
        if ("never".equals(name)) {
            return NewlineIsSentenceBreak.NEVER;
        }
        if (name != null && name.contains("two")) {
            return NewlineIsSentenceBreak.TWO_CONSECUTIVE;
        }
        throw new IllegalArgumentException("Not a valid NewlineIsSentenceBreak name: '" + name + "' (should be one of 'always', 'never', 'two')");
    }

    /** Returns true iff the token is a CoreMap whose ForcedSentenceEndAnnotation is set to true. */
    private static boolean isForcedEndToken(Object token) {
        if (!(token instanceof CoreMap)) {
            return false;
        }
        Boolean forcedEnd = (Boolean) ((CoreMap) token).get(CoreAnnotations.ForcedSentenceEndAnnotation.class);
        return forcedEnd != null && forcedEnd.booleanValue();
    }

    /**
     * Extracts the text of a token. The checks are ordered: HasWord first, then String, then
     * CoreMap (via its TextAnnotation, which may be absent and yield null).
     *
     * @throws RuntimeException if the token is none of the supported types
     */
    private static String getString(Object token) {
        if (token instanceof HasWord) {
            return ((HasWord) token).word();
        }
        if (token instanceof String) {
            return (String) token;
        }
        if (token instanceof CoreMap) {
            return (String) ((CoreMap) token).get(CoreAnnotations.TextAnnotation.class);
        }
        throw new RuntimeException("Expected token to be either Word or String.");
    }

    /** Returns true iff at least one pattern matches the entire string. */
    private static boolean matches(List<Pattern> patterns, String text) {
        for (Pattern pattern : patterns) {
            if (pattern.matcher(text).matches()) {
                return true;
            }
        }
        return false;
    }

    // Callers must check that xmlBreakElementsToDiscard is non-null first.
    private boolean matchesXmlBreakElementToDiscard(String word) {
        return matches(this.xmlBreakElementsToDiscard, word);
    }

    // Callers must check that tokenPatternsToDiscard is non-null first.
    private boolean matchesTokenPatternsToDiscard(String word) {
        return matches(this.tokenPatternsToDiscard, word);
    }

    /**
     * Splits the tokens into sentences, or wraps a copy of all of them in a single sentence
     * when this processor was built in one-sentence mode.
     *
     * @param words the tokens to split
     * @return the list of sentences
     */
    @Override // edu.stanford.nlp.process.ListProcessor
    public List<List<IN>> process(List<? extends IN> words) {
        if (!this.isOneSentence) {
            return wordsToSentences(words);
        }
        List<List<IN>> sentences = Generics.newArrayList();
        sentences.add(new ArrayList<IN>(words));
        return sentences;
    }

    /**
     * Splits the tokens into sentences according to the configured patterns, regardless of the
     * one-sentence setting.
     *
     * <p>Tokens matching a discard pattern are never added to a sentence but still take part
     * in boundary detection. Tokens left over after the last boundary form a final sentence.
     *
     * @param words the tokens to split
     * @return the list of sentences, in input order
     */
    public List<List<IN>> wordsToSentences(List<? extends IN> words) {
        IdentityHashMap<Object, Boolean> multiTokenBoundaryEnds = findMultiTokenBoundaryEnds(words);

        List<List<IN>> sentences = Generics.newArrayList();
        List<IN> currentSentence = new ArrayList<IN>();
        List<IN> lastSentence = null;
        boolean insideRegion = false;
        boolean inWaitForForcedEnd = false;
        boolean lastTokenWasNewline = false;

        for (IN token : words) {
            String word = getString(token);
            boolean forcedEnd = isForcedEndToken(token);

            // Annotations on CoreMap tokens can suppress sentence breaks at this token.
            boolean inMultiTokenExpr = false;
            if (token instanceof CoreMap) {
                CoreMap coreMap = (CoreMap) token;
                Boolean forcedUntilEnd = (Boolean) coreMap.get(CoreAnnotations.ForcedSentenceUntilEndAnnotation.class);
                if (!forcedEnd) {
                    if (forcedUntilEnd != null && forcedUntilEnd.booleanValue()) {
                        // From here on, no break until a forced-end token shows up.
                        inWaitForForcedEnd = true;
                    } else {
                        MultiTokenTag mentionTag = (MultiTokenTag) coreMap.get(CoreAnnotations.MentionTokenAnnotation.class);
                        if (mentionTag != null && !mentionTag.isEnd()) {
                            // Not the end of a multi-token mention: do not break here.
                            inMultiTokenExpr = true;
                        }
                    }
                }
            }

            boolean discardToken = this.tokenPatternsToDiscard != null && matchesTokenPatternsToDiscard(word);

            // Outside the region of interest: drop the token, only watching for the region start tag.
            if (this.sentenceRegionBeginPattern != null && !insideRegion) {
                if (this.sentenceRegionBeginPattern.matcher(word).matches()) {
                    insideRegion = true;
                }
                lastTokenWasNewline = false;
                continue;
            }

            // A follower directly after a closed sentence belongs to that sentence.
            if (lastSentence != null && currentSentence.isEmpty() && this.sentenceBoundaryFollowersPattern.matcher(word).matches()) {
                if (!discardToken) {
                    lastSentence.add(token);
                }
                lastTokenWasNewline = false;
                continue;
            }

            boolean newSentence = false;
            if ((inWaitForForcedEnd || inMultiTokenExpr) && !forcedEnd) {
                // Breaks are suppressed; lastTokenWasNewline is deliberately left untouched here.
                if (!discardToken) {
                    currentSentence.add(token);
                }
            } else if (this.sentenceBoundaryToDiscard.contains(word)) {
                // Discarded boundary token: never added, may or may not end the sentence.
                newSentence = this.newlineIsSentenceBreak == NewlineIsSentenceBreak.ALWAYS
                        || (this.newlineIsSentenceBreak == NewlineIsSentenceBreak.TWO_CONSECUTIVE && lastTokenWasNewline);
                lastTokenWasNewline = true;
            } else {
                lastTokenWasNewline = false;
                if (this.xmlBreakElementsToDiscard != null && matchesXmlBreakElementToDiscard(word)) {
                    // XML break element: ends the sentence and is dropped.
                    newSentence = true;
                } else if (this.sentenceRegionEndPattern != null && this.sentenceRegionEndPattern.matcher(word).matches()) {
                    // Region end tag: ends the sentence, is dropped, and leaves the region.
                    insideRegion = false;
                    newSentence = true;
                } else {
                    boolean regularBoundary =
                            (multiTokenBoundaryEnds != null && Boolean.TRUE.equals(multiTokenBoundaryEnds.get(token)))
                            || this.sentenceBoundaryTokenPattern.matcher(word).matches();
                    if (!discardToken) {
                        currentSentence.add(token);
                    }
                    if (regularBoundary) {
                        newSentence = true;
                    } else if (forcedEnd) {
                        // The wait flag is cleared only when the forced-end token is not already
                        // a regular boundary (same as the original branch order).
                        inWaitForForcedEnd = false;
                        newSentence = true;
                    }
                }
            }

            if (newSentence && (!currentSentence.isEmpty() || this.allowEmptySentences)) {
                sentences.add(currentSentence);
                lastSentence = currentSentence;
                currentSentence = new ArrayList<IN>();
            }
        }

        // Keep trailing tokens even though no boundary closed them.
        if (!currentSentence.isEmpty()) {
            sentences.add(currentSentence);
        }
        return sentences;
    }

    /**
     * Runs the multi-token boundary pattern over the tokens and records, by object identity,
     * the last token of every non-empty match.
     *
     * @return the identity map of boundary-ending tokens, or null if no multi-token pattern is set
     */
    private IdentityHashMap<Object, Boolean> findMultiTokenBoundaryEnds(List<? extends IN> words) {
        if (this.sentenceBoundaryMultiTokenPattern == null) {
            return null;
        }
        IdentityHashMap<Object, Boolean> boundaryEnds = new IdentityHashMap<Object, Boolean>();
        SequenceMatcher<?> matcher = this.sentenceBoundaryMultiTokenPattern.getMatcher(words);
        while (matcher.find()) {
            List<?> groupNodes = matcher.groupNodes();
            if (groupNodes != null && !groupNodes.isEmpty()) {
                boundaryEnds.put(groupNodes.get(groupNodes.size() - 1), Boolean.TRUE);
            }
        }
        return boundaryEnds;
    }

    /**
     * Splits a document into sentences: obtains a blank document from the input via
     * {@code blankDocument()} and fills it with the result of {@link #process(List)}.
     *
     * @param document the document whose tokens are to be split
     * @return the blank document populated with the sentences
     */
    public <L, F> Document<L, F, List<IN>> processDocument(Document<L, F, IN> document) {
        Document<L, F, List<IN>> sentenceDocument = (Document<L, F, List<IN>>) document.blankDocument();
        sentenceDocument.addAll(process(document));
        return sentenceDocument;
    }

    /** Creates a processor with the default boundary regex, TWO_CONSECUTIVE policy, and splitting enabled. */
    public WordToSentenceProcessor() {
        this(false);
    }

    /**
     * Creates a processor with the default boundary regex and the given policy for discarded
     * boundary tokens.
     */
    public WordToSentenceProcessor(NewlineIsSentenceBreak newlineIsSentenceBreak) {
        this(DEFAULT_BOUNDARY_REGEX, newlineIsSentenceBreak, false);
    }

    /**
     * Creates a processor with the default boundary regex and TWO_CONSECUTIVE policy.
     *
     * @param isOneSentence if true, {@link #process(List)} puts all tokens in one sentence
     */
    public WordToSentenceProcessor(boolean isOneSentence) {
        this(DEFAULT_BOUNDARY_REGEX, NewlineIsSentenceBreak.TWO_CONSECUTIVE, isOneSentence);
    }

    /**
     * Creates a processor that breaks on every token in the given set (ALWAYS policy), uses
     * empty boundary and follower regexes, and allows empty sentences.
     *
     * @param boundaryToDiscard token strings that end a sentence and are dropped; must not be null
     */
    public WordToSentenceProcessor(Set<String> boundaryToDiscard) {
        this("", "", boundaryToDiscard, null, null, NewlineIsSentenceBreak.ALWAYS, null, null, false, true);
    }

    /**
     * Creates a processor with the given boundary regex and policy, and the default follower
     * regex and discarded boundary tokens.
     *
     * @param boundaryTokenRegex regex for sentence-final tokens
     * @param newlineIsSentenceBreak policy for discarded boundary tokens
     * @param isOneSentence if true, {@link #process(List)} puts all tokens in one sentence
     */
    public WordToSentenceProcessor(String boundaryTokenRegex, NewlineIsSentenceBreak newlineIsSentenceBreak, boolean isOneSentence) {
        this(boundaryTokenRegex, DEFAULT_BOUNDARY_FOLLOWERS_REGEX, DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD, null, null, newlineIsSentenceBreak, null, null, isOneSentence, false);
    }

    /**
     * Creates a processor, substituting defaults for missing arguments: a null regex falls back
     * to the corresponding default, a null or empty discarded-boundary set falls back to
     * {@link #DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD}, and a null XML element set is treated as
     * empty. No region pattern is used; one-sentence mode and empty sentences are disabled.
     *
     * @param boundaryTokenRegex regex for sentence-final tokens, or null for the default
     * @param boundaryFollowersRegex regex for boundary followers, or null for the default
     * @param boundaryToDiscard discarded boundary token strings, or null/empty for the default
     * @param xmlBreakElements regexes for XML element names that force a break, or null
     * @param newlineIsSentenceBreak policy for discarded boundary tokens
     * @param sentenceBoundaryMultiTokenPattern multi-token boundary pattern, or null
     * @param tokenRegexesToDiscard regexes of tokens to drop from the output, or null
     */
    public WordToSentenceProcessor(String boundaryTokenRegex, String boundaryFollowersRegex, Set<String> boundaryToDiscard, Set<String> xmlBreakElements, NewlineIsSentenceBreak newlineIsSentenceBreak, SequencePattern<? super IN> sentenceBoundaryMultiTokenPattern, Set<String> tokenRegexesToDiscard) {
        this(boundaryTokenRegex == null ? DEFAULT_BOUNDARY_REGEX : boundaryTokenRegex,
                boundaryFollowersRegex == null ? DEFAULT_BOUNDARY_FOLLOWERS_REGEX : boundaryFollowersRegex,
                (boundaryToDiscard == null || boundaryToDiscard.isEmpty()) ? DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD : boundaryToDiscard,
                xmlBreakElements == null ? Collections.<String>emptySet() : xmlBreakElements,
                null,
                newlineIsSentenceBreak,
                sentenceBoundaryMultiTokenPattern,
                tokenRegexesToDiscard,
                false,
                false);
    }

    /**
     * Fully configurable constructor. No defaults are substituted here: the two regexes and the
     * discarded-boundary set must be non-null.
     *
     * @param boundaryTokenRegex regex for sentence-final tokens
     * @param boundaryFollowersRegex regex for tokens attached to the preceding sentence
     * @param boundaryToDiscard token strings handled by the {@code newlineIsSentenceBreak} policy
     * @param xmlBreakElements regexes for XML element names; each is wrapped in a
     *        case-insensitive pattern matching opening, closing and self-closing tags; null or
     *        empty disables XML breaks
     * @param regionElementRegex regex for the XML element name delimiting the region to process,
     *        or null to process all tokens
     * @param newlineIsSentenceBreak policy for discarded boundary tokens
     * @param sentenceBoundaryMultiTokenPattern multi-token boundary pattern, or null
     * @param tokenRegexesToDiscard regexes of tokens to drop from the output, or null
     * @param isOneSentence if true, {@link #process(List)} puts all tokens in one sentence
     * @param allowEmptySentences if true, boundaries may emit empty sentences
     */
    public WordToSentenceProcessor(String boundaryTokenRegex, String boundaryFollowersRegex, Set<String> boundaryToDiscard, Set<String> xmlBreakElements, String regionElementRegex, NewlineIsSentenceBreak newlineIsSentenceBreak, SequencePattern<? super IN> sentenceBoundaryMultiTokenPattern, Set<String> tokenRegexesToDiscard, boolean isOneSentence, boolean allowEmptySentences) {
        this.sentenceBoundaryTokenPattern = Pattern.compile(boundaryTokenRegex);
        this.sentenceBoundaryFollowersPattern = Pattern.compile(boundaryFollowersRegex);
        this.sentenceBoundaryToDiscard = Collections.unmodifiableSet(boundaryToDiscard);

        if (xmlBreakElements == null || xmlBreakElements.isEmpty()) {
            this.xmlBreakElementsToDiscard = null;
        } else {
            List<Pattern> xmlPatterns = new ArrayList<Pattern>(xmlBreakElements.size());
            for (String element : xmlBreakElements) {
                xmlPatterns.add(Pattern.compile("<\\s*(?:/\\s*)?(?:" + element + ")(?:\\s+[^>]+?|\\s*(?:/\\s*)?)>", Pattern.CASE_INSENSITIVE));
            }
            this.xmlBreakElementsToDiscard = xmlPatterns;
        }

        if (regionElementRegex != null) {
            this.sentenceRegionBeginPattern = Pattern.compile("<\\s*(?:" + regionElementRegex + ")(?:\\s+[^>]+?)?>");
            this.sentenceRegionEndPattern = Pattern.compile("<\\s*/\\s*(?:" + regionElementRegex + ")\\s*>");
        } else {
            this.sentenceRegionBeginPattern = null;
            this.sentenceRegionEndPattern = null;
        }

        this.newlineIsSentenceBreak = newlineIsSentenceBreak;
        this.sentenceBoundaryMultiTokenPattern = sentenceBoundaryMultiTokenPattern;

        // An empty (non-null) set yields an empty pattern list rather than null.
        if (tokenRegexesToDiscard != null) {
            List<Pattern> discardPatterns = new ArrayList<Pattern>(tokenRegexesToDiscard.size());
            for (String regex : tokenRegexesToDiscard) {
                discardPatterns.add(Pattern.compile(regex));
            }
            this.tokenPatternsToDiscard = discardPatterns;
        } else {
            this.tokenPatternsToDiscard = null;
        }

        this.isOneSentence = isOneSentence;
        this.allowEmptySentences = allowEmptySentences;
    }
}
