package edu.stanford.nlp.process;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.quoteattribution.Sieves.MSSieves.BaselineTopSpeakerSieve;
import edu.stanford.nlp.semgraph.semgrex.ssurgeon.AddNode;
import edu.stanford.nlp.trees.international.pennchinese.ChineseUtils;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.Serializable;
import java.io.StringReader;
import java.io.Writer;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

/* loaded from: input_file:edu/stanford/nlp/process/ChineseDocumentToSentenceProcessor.class */
public class ChineseDocumentToSentenceProcessor implements Serializable {
    private static final long serialVersionUID = 4054964767812217460L;

    /** Character set name; the visible code passes the literal "UTF-8" directly instead of this constant. */
    private static final String encoding = "UTF-8";

    /**
     * (target, replacement) pairs loaded by the constructor from a normalization file,
     * or {@code null} when no file was given.
     */
    private final List<Pair<String, String>> normalizationTable;

    private static Redwood.RedwoodChannels log = Redwood.channels(ChineseDocumentToSentenceProcessor.class);

    /**
     * Characters that end a sentence: U+3002 (ideographic full stop), U+FF01 (fullwidth '!'),
     * U+FF1F (fullwidth '?') and their ASCII counterparts '!' and '?'.
     */
    private static final Set<Character> fullStopsSet = Generics.newHashSet(Arrays.asList(
            '\u3002', '\uFF01', '\uFF1F', '!', '?'));

    /**
     * Closing quotes and brackets that stay attached to the preceding sentence when they
     * follow a full stop: U+201D, U+2019, U+300B, U+300F, U+3009, U+300D, U+FF1E, U+FF07,
     * U+FF09, plus ASCII quote marks, ')', ']' and '&gt;'.
     */
    private static final Set<Character> rightMarkSet = Generics.newHashSet(Arrays.asList(
            '\u201D', '\u2019', '\u300B', '\u300F', '\u3009', '\u300D', '\uFF1E', '\uFF07', '\uFF09',
            '\'', '\"', ')', ']', '>'));

    /** Two whitespace-separated, non-whitespace fields: one line of the normalization file. */
    private static final Pattern PAIR_PATTERN = Pattern.compile("([^\\s]+)\\s+([^\\s]+)");

    /** Whitespace run as defined by {@code ChineseUtils.WHITEPLUS}. */
    private static final Pattern WHITEPLUS_PATTERN = Pattern.compile(ChineseUtils.WHITEPLUS);

    /** Leading run of regex whitespace or Unicode space separators. */
    private static final Pattern START_WHITEPLUS_PATTERN = Pattern.compile("^[\\s\\p{Zs}]+");

    /** Trailing run of regex whitespace or Unicode space separators. */
    private static final Pattern END_WHITEPLUS_PATTERN = Pattern.compile("[\\s\\p{Zs}]+$");

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:edu/stanford/nlp/process/ChineseDocumentToSentenceProcessor$MyHTMLParser.class */
    /**
     * Swing HTML parser callback that collects the text chunks of a document.
     *
     * <p>Every non-empty text chunk reported by the parser is stripped of spaces, trimmed
     * and, if anything remains, stored as one entry of the result list. While a
     * {@code <title>} element is open, the chunk is additionally remembered as the title.
     */
    public static class MyHTMLParser extends HTMLEditorKit.ParserCallback {
        /** Scratch buffer for the text currently being collected; created by {@link #parse(String)}. */
        protected StringBuilder textBuffer;
        /** Text chunks collected so far; created by {@link #parse(String)}. */
        protected List<String> sentences;
        /** Text of the most recent chunk seen inside a title element, or "" if none. */
        protected String title = "";
        /** True between the start and end tags of a title element. */
        protected boolean isTitle = false;
        /** True between the start and end tags of a body element. */
        protected boolean isBody = false;
        /** True between the start and end tags of a script element. */
        protected boolean isScript = false;
        /** Whether the most recent start tag breaks flow; not read by the code in this class. */
        protected boolean isBreak = false;

        /**
         * Receives one chunk of document text.
         *
         * @param cArr the characters of the chunk
         * @param i    position reported by the parser (unused)
         */
        @Override
        public void handleText(char[] cArr, int i) {
            if (cArr.length == 0) {
                return;
            }
            if (this.isTitle) {
                this.title = new String(cArr);
            }
            // NOTE(review): the body/script flags do not gate collection here, so title and
            // script text are collected like any other chunk.
            this.textBuffer.append(cArr);
            String cleaned = this.textBuffer.toString().replaceAll(" ", "").trim();
            if (!cleaned.isEmpty()) {
                this.sentences.add(cleaned);
                this.textBuffer = new StringBuilder(BaselineTopSpeakerSieve.FORWARD_WINDOW);
            }
        }

        /** Raises the flag for title/body/script and records whether the tag breaks flow. */
        @Override
        public void handleStartTag(HTML.Tag tag, MutableAttributeSet mutableAttributeSet, int i) {
            markElement(tag, true);
            this.isBreak = tag.breaksFlow();
        }

        /** Lowers the flag for title/body/script. */
        @Override
        public void handleEndTag(HTML.Tag tag, int i) {
            markElement(tag, false);
        }

        /** Sets the state flag that corresponds to {@code tag}; other tags are ignored. */
        private void markElement(HTML.Tag tag, boolean open) {
            if (tag == HTML.Tag.TITLE) {
                this.isTitle = open;
            } else if (tag == HTML.Tag.BODY) {
                this.isBody = open;
            } else if (tag == HTML.Tag.SCRIPT) {
                this.isScript = open;
            }
        }

        /** Downloads {@code url} via {@code IOUtils.slurpURL} and parses the result. */
        public List<String> parse(URL url) throws IOException {
            String page = IOUtils.slurpURL(url);
            return parse(page);
        }

        /** Reads {@code reader} via {@code IOUtils.slurpReader} and parses the result. */
        public List<String> parse(Reader reader) throws IOException {
            String page = IOUtils.slurpReader(reader);
            return parse(page);
        }

        /**
         * Parses an HTML string and returns the collected text chunks.
         *
         * @param str HTML markup
         * @return the list of cleaned text chunks, in document order
         * @throws IOException if the underlying parser fails
         */
        public List<String> parse(String str) throws IOException {
            // Rewrite "/>" to ">" and "<?" to "<" before handing the markup to the Swing parser.
            String sanitized = str.replaceAll("/>", ">").replaceAll("<\\?", "<");
            this.textBuffer = new StringBuilder(200);
            this.sentences = new ArrayList<>();
            new ParserDelegator().parse(new StringReader(sanitized), this, true);
            return this.sentences;
        }

        /** @return the title text captured during the last parse, or "" if none was seen */
        public String title() {
            return this.title;
        }
    }

    /** Creates a processor without a normalization table. */
    public ChineseDocumentToSentenceProcessor() {
        this((String) null);
    }

    /**
     * Creates a processor, optionally loading a normalization table.
     *
     * <p>Each line of the file is searched for two whitespace-separated fields; the first
     * becomes the text to replace and the second its replacement. Lines without such a
     * pair are logged and skipped.
     *
     * @param str path of the normalization file (read as UTF-8), or {@code null} for no table
     */
    public ChineseDocumentToSentenceProcessor(String str) {
        List<Pair<String, String>> table = null;
        if (str != null) {
            table = new ArrayList<>();
            for (Iterator<String> lines = ObjectBank.getLineIterator(new File(str), "UTF-8").iterator(); lines.hasNext(); ) {
                String line = lines.next();
                Matcher fields = PAIR_PATTERN.matcher(line);
                if (!fields.find()) {
                    log.info("Didn't match: " + line);
                    continue;
                }
                table.add(new Pair<>(fields.group(1), fields.group(2)));
            }
        }
        this.normalizationTable = table;
    }

    /**
     * Normalizes {@code str} with {@code ChineseUtils.normalize} and then applies this
     * processor's normalization table, if one was loaded.
     *
     * @param str text to normalize
     * @return the normalized text
     */
    public String normalization(String str) {
        String baseNormalized = ChineseUtils.normalize(str);
        return normalize(baseNormalized);
    }

    /**
     * Applies the normalization table to {@code str}.
     *
     * <p>When no table was loaded the input is returned unchanged. Otherwise every
     * whitespace run is first replaced by {@code AddNode.ATOM_DELIMITER}, then each table
     * entry is applied in order, replacing every literal occurrence of the entry's first
     * element with its second element.
     *
     * <p>Both sides of an entry are treated literally. The previous implementation compiled
     * a literal pattern per entry on every call and only escaped a replacement that was
     * exactly "$", so any other replacement containing '$' or a backslash was interpreted
     * as a group reference or escape by {@code Matcher.replaceAll}.
     *
     * @param str text to normalize
     * @return the text with all table replacements applied
     */
    private String normalize(String str) {
        if (this.normalizationTable == null) {
            return str;
        }
        String result = WHITEPLUS_PATTERN.matcher(str).replaceAll(AddNode.ATOM_DELIMITER);
        for (Pair<String, String> entry : this.normalizationTable) {
            // String.replace is literal for both the target and the replacement.
            result = result.replace(entry.first(), entry.second());
        }
        return result;
    }

    /* JADX WARN: Multi-variable type inference failed */
    /**
     * Command-line entry point.
     *
     * <p>Requires {@code -file}. Without {@code -segmentIBM} the file is treated as HTML and
     * each extracted sentence is printed on its own line to standard error. With
     * {@code -segmentIBM} the file is tokenized on whitespace (newlines are tokens), tags
     * are copied through unchanged, and the text between tags is sentence-split and written
     * to standard output. {@code -parseInside} is a regex of tag names; when given, only
     * text following a matching tag is split. {@code -alwaysAddS} wraps even
     * single-sentence segments in {@code <s>} elements.
     *
     * <p>Text (or a partially read tag) still buffered when the input ends is now written
     * out instead of being dropped, and the output writer is closed even when processing
     * fails.
     *
     * @param strArr command-line arguments
     * @throws Exception if reading, parsing or writing fails
     */
    public static void main(String[] strArr) throws Exception {
        Properties props = StringUtils.argsToProperties(strArr);
        boolean alwaysAddS = props.containsKey("alwaysAddS");
        if (!props.containsKey("file")) {
            log.info("usage: java ChineseDocumentToSentenceProcessor [-segmentIBM] -file filename [-encoding encoding]");
            return;
        }
        if (props.containsKey("encoding")) {
            log.info("WARNING: for now the default encoding is UTF-8. It's not changeable for now");
        }
        String content = IOUtils.slurpFileNoExceptions(props.getProperty("file"), "UTF-8");
        if (!props.containsKey("segmentIBM")) {
            // Auto-flushing writer over System.err; intentionally not closed.
            PrintWriter err = new PrintWriter((Writer) new OutputStreamWriter(System.err, "UTF-8"), true);
            for (String sentence : fromHTML(content)) {
                err.println(sentence);
            }
            return;
        }

        WhitespaceTokenizer<Word> tokenizer = WhitespaceTokenizer.newWordWhitespaceTokenizer(new StringReader(content), true);
        String parseInside = props.getProperty("parseInside");
        if (parseInside == null) {
            parseInside = "";
        }
        Pattern completeTag = Pattern.compile("<.*>");
        Pattern tagStart = Pattern.compile("\ufeff?<[\\p{Alpha}]+");
        Pattern tagEnd = Pattern.compile("[A-Za-z0-9=\"]+>");
        Pattern parseInsideTag = Pattern.compile("<(?:" + parseInside + ")[ >]");

        StringBuilder pendingText = new StringBuilder();
        StringBuilder pendingTag = new StringBuilder();
        String lastTag = "";
        boolean insideTag = false;
        int segmentsSplit = 0;
        int sentencesAdded = 0;

        try (PrintWriter out = new PrintWriter((Writer) new OutputStreamWriter(System.out, "UTF-8"), true)) {
            while (tokenizer.hasNext()) {
                String token = tokenizer.next().word();
                if (tagStart.matcher(token).matches()) {
                    // Opening of a tag whose attributes continue in later tokens.
                    insideTag = true;
                    pendingTag.append(token).append(AddNode.ATOM_DELIMITER);
                } else if (completeTag.matcher(token).matches()
                        || (insideTag && tagEnd.matcher(token).matches())
                        || "\n".equals(token)) {
                    // A tag has been completed, or a line has ended: emit the buffered text first.
                    insideTag = false;
                    if (pendingText.toString().trim().length() > 0) {
                        boolean split = parseInside.isEmpty() || parseInsideTag.matcher(lastTag).find();
                        int extra = emitSegment(out, pendingText.toString(), split, alwaysAddS);
                        if (extra > 0) {
                            segmentsSplit++;
                            sentencesAdded += extra;
                        }
                        pendingText = new StringBuilder();
                    }
                    pendingTag.append(token);
                    out.print(pendingTag);
                    lastTag = pendingTag.toString();
                    pendingTag = new StringBuilder();
                } else if (insideTag) {
                    pendingTag.append(token).append(AddNode.ATOM_DELIMITER);
                } else {
                    pendingText.append(token).append(AddNode.ATOM_DELIMITER);
                }
            }

            // Input that does not end with a tag or newline leaves data in the buffers.
            if (pendingText.toString().trim().length() > 0) {
                boolean split = parseInside.isEmpty() || parseInsideTag.matcher(lastTag).find();
                int extra = emitSegment(out, pendingText.toString(), split, alwaysAddS);
                if (extra > 0) {
                    segmentsSplit++;
                    sentencesAdded += extra;
                }
            }
            if (pendingTag.length() > 0) {
                out.print(pendingTag);
            }
        }
        log.info("Split " + segmentsSplit + " segments, adding " + sentencesAdded + " sentences.");
    }

    /**
     * Writes one text segment, sentence-splitting it when {@code split} is set.
     *
     * @param out        destination writer
     * @param text       the buffered segment text
     * @param split      whether the segment should be sentence-split
     * @param alwaysAddS whether a single-sentence segment is still wrapped in {@code <s>}
     * @return the number of sentences beyond the first that the split produced (0 if none)
     */
    private static int emitSegment(PrintWriter out, String text, boolean split, boolean alwaysAddS) throws IOException {
        if (!split) {
            out.print(text);
            return 0;
        }
        List<String> sentences = fromPlainText(text, true);
        if (alwaysAddS || sentences.size() > 1) {
            int id = 1;
            for (String sentence : sentences) {
                out.print("<s id=\"" + id + "\">");
                out.print(sentence);
                out.println("</s>");
                id++;
            }
        } else if (sentences.size() == 1) {
            out.print(sentences.get(0));
        }
        return sentences.size() > 1 ? sentences.size() - 1 : 0;
    }

    /**
     * Extracts the text chunks of an HTML document and splits each into sentences.
     *
     * @param str HTML markup
     * @return all sentences, in document order
     * @throws IOException if HTML parsing fails
     */
    public static List<String> fromHTML(String str) throws IOException {
        List<String> sentences = new ArrayList<>();
        for (String chunk : new MyHTMLParser().parse(str)) {
            sentences.addAll(fromPlainText(chunk));
        }
        return sentences;
    }

    /**
     * Splits plain text into sentences; equivalent to {@code fromPlainText(str, false)}.
     *
     * @param str text to split
     * @return the sentences found in {@code str}
     * @throws IOException declared for compatibility with the two-argument overload
     */
    public static List<String> fromPlainText(String str) throws IOException {
        final boolean segmented = false;
        return fromPlainText(str, segmented);
    }

    /**
     * Splits text into sentences at full-stop characters.
     *
     * <p>The input is first passed through {@code ChineseUtils.normalize} (with arguments
     * {@code 0, 1} when {@code z} is set, {@code 2, 1} otherwise). A sentence ends at a
     * character from {@code fullStopsSet}; any directly following closing marks,
     * whitespace and further full stops stay attached to it. When {@code z} is set, a full
     * stop only ends a sentence if it is the first character or is preceded by a space
     * character. Each sentence is cleaned with {@code removeWhitespace} and dropped if empty.
     *
     * <p>Sentences are accumulated in a {@code StringBuilder} and whitespace is tested
     * directly, instead of concatenating strings and compiling a regex for every character.
     *
     * @param str text to split
     * @param z   whether the text is already segmented (inner whitespace is kept and the
     *            stricter full-stop rule applies)
     * @return the non-empty sentences, in order
     * @throws IOException declared by the original signature
     */
    public static List<String> fromPlainText(String str, boolean z) throws IOException {
        String normalized = z ? ChineseUtils.normalize(str, 0, 1) : ChineseUtils.normalize(str, 2, 1);
        List<String> sentences = new ArrayList<>();
        StringBuilder current = new StringBuilder();
        boolean afterFullStop = false;
        char previous = 65535; // sentinel: no previous character yet
        for (char c : normalized.toCharArray()) {
            boolean isFullStop = fullStopsSet.contains(c);
            if (afterFullStop) {
                if (rightMarkSet.contains(c) || isRegexWhitespace(c) || isFullStop) {
                    // Trailing marks, whitespace and repeated full stops belong to the sentence.
                    current.append(c);
                } else {
                    // First character of the next sentence: close the current one.
                    afterFullStop = false;
                    String finished = removeWhitespace(current.toString(), z);
                    if (finished.length() > 0) {
                        sentences.add(finished);
                    }
                    current.setLength(0);
                    current.append(c);
                }
            } else {
                current.append(c);
                if (isFullStop && (!z || previous == 65535 || Character.isSpaceChar(previous))) {
                    afterFullStop = true;
                }
            }
            previous = c;
        }
        String last = removeWhitespace(current.toString(), z);
        if (last.length() > 0) {
            sentences.add(last);
        }
        return sentences;
    }

    /** Same character set as the default regex class {@code \s}: space, tab, LF, VT, FF, CR. */
    private static boolean isRegexWhitespace(char c) {
        return c == ' ' || c == '\t' || c == '\n' || c == 0x0B || c == '\f' || c == '\r';
    }

    /**
     * Strips leading and trailing whitespace from {@code str}; unless {@code z} is set,
     * also deletes every whitespace run inside it.
     *
     * @param str text to clean
     * @param z   when true, interior whitespace is preserved
     * @return the cleaned text (the input itself when it is empty)
     */
    private static String removeWhitespace(String str, boolean z) {
        if (str.isEmpty()) {
            return str;
        }
        String withoutLeading = START_WHITEPLUS_PATTERN.matcher(str).replaceAll("");
        String trimmed = END_WHITEPLUS_PATTERN.matcher(withoutLeading).replaceAll("");
        return z ? trimmed : WHITEPLUS_PATTERN.matcher(trimmed).replaceAll("");
    }
}
