package edu.stanford.nlp.wordseg;

import edu.stanford.nlp.classify.LinearClassifier;
import edu.stanford.nlp.fsm.DFSA;
import edu.stanford.nlp.fsm.DFSAState;
import edu.stanford.nlp.fsm.DFSATransition;
import edu.stanford.nlp.ie.pascal.ISODateInstance;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.objectbank.IteratorFromReaderFactory;
import edu.stanford.nlp.objectbank.LineIterator;
import edu.stanford.nlp.process.ChineseDocumentToSentenceProcessor;
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;
import edu.stanford.nlp.sequences.LatticeWriter;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.trees.international.arabic.ATBTreeUtils;
import edu.stanford.nlp.trees.international.pennchinese.ChineseUtils;
import edu.stanford.nlp.util.Characters;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.MutableInteger;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/stanford/nlp/wordseg/Sighan2005DocumentReaderAndWriter.class */
public class Sighan2005DocumentReaderAndWriter implements DocumentReaderAndWriter<CoreLabel>, LatticeWriter<CoreLabel, String, Integer> {
    private static final long serialVersionUID = 3260295150250263237L;
    // Logging channels for this class; assigned in the static initializer.
    private static final Redwood.RedwoodChannels logger;
    // Debug switches; both are compile-time false.
    private static final boolean DEBUG = false;
    private static final boolean DEBUG_MORE = false;
    // Character-class patterns consulted by shapeOf(); compiled in the static initializer.
    private static final Pattern dateChars;
    private static final Pattern dateCharsPlus;
    private static final Pattern numberChars;
    private static final Pattern letterChars;
    private static final Pattern periodChars;
    // Punctuation that shapeOf() labels "S".
    private final Pattern separatingPuncChars = Pattern.compile("[]!\"(),;:<=>?\\[\\\\`{|}~^、-〃〈-】〔-〟〰］！＂（），；：＜＝＞？［＼｀｛｜｝～＾]");
    // Punctuation that shapeOf() labels "A".
    private final Pattern ambiguousPuncChars = Pattern.compile("[-#$%&'*+/@_－＃＄％＆＇＊＋／＠＿]");
    // Mid-dot pattern; shapeOf() labels matches "M" when flags.useMidDotShape is set.
    private final Pattern midDotPattern = Pattern.compile(ChineseUtils.MID_DOT_REGEX_STR);
    // Created in init(); its normalization() is applied to every input line and it is handed to the dictionaries.
    private ChineseDocumentToSentenceProcessor cdtos;
    // Dictionary behind the LBegin/LMiddle/LEnd annotations; built in init() from flags.dictionary or flags.serializedDictionary.
    private ChineseDictionary cdict;
    // Dictionary behind the D2_LBegin/D2_LMiddle/D2_LEnd annotations; built in init() from flags.dictionary2.
    private ChineseDictionary cdict2;
    // Flags passed to init().
    private SeqClassifierFlags flags;
    // Line-based iterator factory wrapping a CTBDocumentParser; created in init().
    private IteratorFromReaderFactory<List<CoreLabel>> factory;
    // Output format selected in init(); PLAINTEXT when flags.outputFormat is null.
    private OutputFormat outputFormat;
    // True when assertions are disabled for this class; set in the static initializer and
    // checked by the hand-expanded assertions in tagLatticeToAnswerLattice().
    static final /* synthetic */ boolean $assertionsDisabled;

    /* loaded from: input_file:edu/stanford/nlp/wordseg/Sighan2005DocumentReaderAndWriter$CTBDocumentParser.class */
    /**
     * Converts one line of text into a list of {@link CoreLabel}s, one per
     * non-whitespace, non-control code point of the normalized line.
     *
     * <p>Each label carries the normalized character, the matching character of the
     * original (un-normalized) line, its position, and boundary annotations telling
     * whether the character starts the line or follows whitespace. Optional shape,
     * Unicode type/block and dictionary features are added depending on the flags
     * of the enclosing instance.
     */
    class CTBDocumentParser implements Function<String, List<CoreLabel>>, Serializable {
        private static final long serialVersionUID = 3260297180259462337L;
        private String defaultMap = "char=0,answer=1";
        public String[] map = StringUtils.mapStringToArray(this.defaultMap);

        CTBDocumentParser() {
        }

        /**
         * Parses a single line.
         *
         * @param str the raw line; may be {@code null}
         * @return the per-character labels, or {@code null} when {@code str} is {@code null}
         */
        @Override // java.util.function.Function
        public List<CoreLabel> apply(String str) {
            if (str == null) {
                return null;
            }
            final String original = str.trim();
            final List<CoreLabel> labels = new ArrayList<>();
            final String normalized = cdtos.normalization(original);
            final int normalizedLength = normalized.length();

            int originalIndex = 0;
            int position = 0;
            for (int normalizedIndex = 0; normalizedIndex < normalizedLength; normalizedIndex++) {
                final int codePoint = Character.codePointAt(normalized, normalizedIndex);
                if (Character.isWhitespace(codePoint) || Character.isISOControl(codePoint)) {
                    continue;
                }

                // A supplementary code point occupies two chars (a surrogate pair).
                final boolean supplementary = Character.isSupplementaryCodePoint(codePoint);
                final int width = supplementary ? 2 : 1;

                final CoreLabel label = new CoreLabel();
                final String normalizedChar = normalized.substring(normalizedIndex, normalizedIndex + width);
                label.set(CoreAnnotations.CharAnnotation.class, intern(normalizedChar));

                // Advance in the original line past whitespace, control chars and non-breaking spaces.
                char candidate = original.charAt(originalIndex);
                while (Character.isWhitespace(candidate) || Character.isISOControl(candidate) || candidate == '\u00a0') {
                    originalIndex++;
                    candidate = original.charAt(originalIndex);
                }
                final String originalChar = original.substring(originalIndex, originalIndex + width);
                label.set(CoreAnnotations.OriginalCharAnnotation.class, intern(originalChar));

                if (flags.useShapeStrings) {
                    label.set(CoreAnnotations.ShapeAnnotation.class, shapeOf(originalChar));
                }
                if (flags.useUnicodeType || flags.useUnicodeType4gram || flags.useUnicodeType5gram) {
                    label.set(CoreAnnotations.UTypeAnnotation.class, Integer.valueOf(Character.getType(codePoint)));
                }
                if (flags.useUnicodeBlock) {
                    label.set(CoreAnnotations.UBlockAnnotation.class, Characters.unicodeBlockStringOf(codePoint));
                }

                // "1" when the character opens the line or follows whitespace/control, else "0".
                final boolean afterGap = normalizedIndex == 0
                        || Character.isWhitespace(normalized.charAt(normalizedIndex - 1))
                        || Character.isISOControl(normalized.charAt(normalizedIndex - 1));
                final String boundary = afterGap ? "1" : "0";
                label.set(CoreAnnotations.AnswerAnnotation.class, boundary);
                label.set(CoreAnnotations.SpaceBeforeAnnotation.class, boundary);
                label.set(CoreAnnotations.GoldAnswerAnnotation.class, boundary);

                label.set(CoreAnnotations.PositionAnnotation.class, intern(String.valueOf(position)));
                position++;
                labels.add(label);

                // Consume the character in both strings; the loop header adds the final +1 for the normalized index.
                originalIndex += width;
                normalizedIndex += width - 1;
            }

            if (flags.dictionary != null || flags.serializedDictionary != null) {
                addDictionaryFeatures(cdict, CoreAnnotations.LBeginAnnotation.class, CoreAnnotations.LMiddleAnnotation.class, CoreAnnotations.LEndAnnotation.class, labels);
            }
            if (flags.dictionary2 != null) {
                addDictionaryFeatures(cdict2, CoreAnnotations.D2_LBeginAnnotation.class, CoreAnnotations.D2_LMiddleAnnotation.class, CoreAnnotations.D2_LEndAnnotation.class, labels);
            }
            return labels;
        }
    }

    /* loaded from: input_file:edu/stanford/nlp/wordseg/Sighan2005DocumentReaderAndWriter$OutputFormat.class */
    /** Output formats understood by printAnswers(); init() picks one from flags.outputFormat. */
    private enum OutputFormat {
        // One segmented sentence per line (see printPlainTextAnswer).
        PLAINTEXT,
        // A "# text = ..." header followed by one tab-separated row per word (see printConlluAnswer).
        CONLLU
    }

    /**
     * Returns an iterator over the documents read from {@code reader}, delegating
     * to the factory that {@link #init} installed.
     *
     * @param reader source of the text
     * @return an iterator yielding one list of labels per document
     */
    @Override // edu.stanford.nlp.objectbank.IteratorFromReaderFactory
    public Iterator<List<CoreLabel>> getIterator(Reader reader) {
        final IteratorFromReaderFactory<List<CoreLabel>> lineFactory = factory;
        return lineFactory.getIterator(reader);
    }

    /**
     * Configures this reader/writer from the given flags: stores the flags, builds
     * the line iterator factory and the normalizer, loads whichever dictionaries
     * are configured, and selects the output format.
     *
     * <p>When both {@code dictionary} and {@code serializedDictionary} are set, the
     * serialized dictionary is loaded last and is the one kept. A {@code null}
     * {@code outputFormat} selects {@code PLAINTEXT}; otherwise the upper-cased
     * value must name an {@code OutputFormat} constant.
     *
     * @param seqClassifierFlags the configuration to apply
     */
    @Override // edu.stanford.nlp.sequences.DocumentReaderAndWriter
    public void init(SeqClassifierFlags seqClassifierFlags) {
        flags = seqClassifierFlags;
        factory = LineIterator.getFactory(new CTBDocumentParser());
        cdtos = new ChineseDocumentToSentenceProcessor(seqClassifierFlags.normalizationTable);

        final String dictionaryPaths = seqClassifierFlags.dictionary;
        if (dictionaryPaths != null) {
            cdict = new ChineseDictionary(dictionaryPaths.split(","), cdtos, seqClassifierFlags.expandMidDot);
        }
        final String serializedPath = seqClassifierFlags.serializedDictionary;
        if (serializedPath != null) {
            cdict = new ChineseDictionary(serializedPath, cdtos, seqClassifierFlags.expandMidDot);
        }
        final String secondDictionaryPaths = seqClassifierFlags.dictionary2;
        if (secondDictionaryPaths != null) {
            cdict2 = new ChineseDictionary(secondDictionaryPaths.split(","), cdtos, seqClassifierFlags.expandMidDot);
        }

        final String requestedFormat = seqClassifierFlags.outputFormat;
        if (requestedFormat == null) {
            outputFormat = OutputFormat.PLAINTEXT;
            return;
        }
        outputFormat = OutputFormat.valueOf(requestedFormat.toUpperCase(Locale.ROOT));
        logger.info("Output format: " + outputFormat);
    }

    /* JADX INFO: Access modifiers changed from: private */
    /**
     * Classifies a character string into a shape label by testing it against the
     * character-class patterns in a fixed order; the first match wins.
     *
     * @param str the string to classify (must match a pattern in full)
     * @return "D" (date), "N" (number), "L" (letter), "P" (period), "S" (separating
     *         punctuation), "A" (ambiguous punctuation), "M" (mid dot, only when
     *         {@code flags.useMidDotShape}), or {@code ISODateInstance.BOUNDED_RANGE}
     *         when nothing matches
     */
    public String shapeOf(String str) {
        if (flags.augmentedDateChars && dateCharsPlus.matcher(str).matches()) {
            return "D";
        }
        if (dateChars.matcher(str).matches()) {
            return "D";
        }
        if (numberChars.matcher(str).matches()) {
            return "N";
        }
        if (letterChars.matcher(str).matches()) {
            return "L";
        }
        if (periodChars.matcher(str).matches()) {
            return "P";
        }
        if (separatingPuncChars.matcher(str).matches()) {
            return "S";
        }
        if (ambiguousPuncChars.matcher(str).matches()) {
            return "A";
        }
        if (flags.useMidDotShape && midDotPattern.matcher(str).matches()) {
            return "M";
        }
        return ISODateInstance.BOUNDED_RANGE;
    }

    /* JADX INFO: Access modifiers changed from: private */
    /**
     * Annotates every label with three dictionary features: the length of the
     * longest dictionary entry that begins at, passes through, and ends at that
     * label.
     *
     * <p>Candidate entries are built by concatenating the {@code CharAnnotation}
     * of up to six consecutive labels and are dropped once the concatenation is
     * longer than six chars. An entry spanning six labels also counts its last
     * label as "middle" and never counts as "end". A feature value of 6 gets
     * {@code ATBTreeUtils.morphBoundary} appended.
     *
     * @param chineseDictionary dictionary queried with {@code contains}
     * @param cls  annotation key receiving the "begin" feature
     * @param cls2 annotation key receiving the "middle" feature
     * @param cls3 annotation key receiving the "end" feature
     * @param list labels to annotate in place
     */
    public static void addDictionaryFeatures(ChineseDictionary chineseDictionary, Class<? extends CoreAnnotation<String>> cls, Class<? extends CoreAnnotation<String>> cls2, Class<? extends CoreAnnotation<String>> cls3, List<CoreLabel> list) {
        final int maxSpan = 6;
        final int count = list.size();
        // Java zero-fills new int arrays, so no explicit initialisation is needed.
        final int[] longestBegin = new int[count];
        final int[] longestMiddle = new int[count];
        final int[] longestEnd = new int[count];

        final StringBuilder candidate = new StringBuilder();
        for (int start = 0; start < count; start++) {
            candidate.setLength(0);
            final int spanLimit = Math.min(maxSpan, count - start);
            for (int span = 1; span <= spanLimit; span++) {
                final int last = start + span - 1;
                final String piece = list.get(last).get(CoreAnnotations.CharAnnotation.class);
                candidate.append(piece);
                final String entry = candidate.toString();
                if (entry.length() > maxSpan) {
                    break;
                }
                if (!chineseDictionary.contains(entry)) {
                    continue;
                }
                longestBegin[start] = Math.max(longestBegin[start], span);
                // A maximum-length match also marks its final label as "middle".
                final int middleStop = (span == maxSpan) ? last + 1 : last;
                for (int mid = start + 1; mid < middleStop; mid++) {
                    longestMiddle[mid] = Math.max(longestMiddle[mid], span);
                }
                if (span < maxSpan) {
                    longestEnd[last] = Math.max(longestEnd[last], span);
                }
            }
        }

        for (int idx = 0; idx < count; idx++) {
            final CoreLabel label = list.get(idx);
            label.set(cls, dictionaryLengthFeature(longestBegin[idx]));
            label.set(cls2, dictionaryLengthFeature(longestMiddle[idx]));
            label.set(cls3, dictionaryLengthFeature(longestEnd[idx]));
        }
    }

    /** Renders a match length as a feature string, appending the boundary marker when the length is 6. */
    private static String dictionaryLengthFeature(int length) {
        final StringBuilder feature = new StringBuilder();
        feature.append(length);
        if (length == 6) {
            feature.append(ATBTreeUtils.morphBoundary);
        }
        return feature.toString();
    }

    /**
     * Writes the segmented sentence produced by
     * {@code ChineseStringUtils.combineSegmentedSentence} followed by a line break.
     *
     * @param list        the labelled characters of one sentence
     * @param printWriter destination
     */
    private void printPlainTextAnswer(List<CoreLabel> list, PrintWriter printWriter) {
        final String segmented = ChineseStringUtils.combineSegmentedSentence(list, flags);
        printWriter.println(segmented);
    }

    /**
     * Writes one sentence in a CoNLL-U-like layout: a {@code # text = ...} header,
     * then one row per word, then an empty line.
     *
     * <p>Each row holds the 1-based word index, the word, four {@code _}
     * placeholders, the preceding word's index as head (0 for the first word),
     * the relation ({@code root} for the first word, {@code dep} otherwise) and
     * two more {@code _} placeholders.
     *
     * @param list        the labelled characters of one sentence
     * @param printWriter destination
     */
    private void printConlluAnswer(List<CoreLabel> list, PrintWriter printWriter) {
        final String segmented = ChineseStringUtils.combineSegmentedSentence(list, flags);
        printWriter.println("# text = " + segmented);

        int wordIndex = 0;
        for (String word : StringUtils.split(segmented)) {
            wordIndex++;
            final int head = wordIndex - 1;
            printWriter.print(wordIndex + LinearClassifier.TEXT_SERIALIZATION_DELIMITER + word);
            printWriter.print("\t_\t_\t_\t_\t");
            printWriter.print(head);
            printWriter.print(LinearClassifier.TEXT_SERIALIZATION_DELIMITER);
            printWriter.print(wordIndex == 1 ? "root" : "dep");
            printWriter.println("\t_\t_");
        }
        printWriter.println();
    }

    /**
     * Writes one segmented sentence in the output format chosen by {@link #init}.
     *
     * @param list        the labelled characters of one sentence
     * @param printWriter destination
     * @throws IllegalArgumentException if the configured format has no writer
     */
    @Override // edu.stanford.nlp.sequences.DocumentReaderAndWriter
    public void printAnswers(List<CoreLabel> list, PrintWriter printWriter) {
        switch (outputFormat) {
            case CONLLU:
                printConlluAnswer(list, printWriter);
                break;
            case PLAINTEXT:
                printPlainTextAnswer(list, printWriter);
                break;
            default:
                throw new IllegalArgumentException("Unknown outputFormat: " + outputFormat);
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /**
     * Trims the string and returns its canonical (interned) instance.
     *
     * @param str the string to canonicalise; must not be {@code null}
     * @return the interned, trimmed string
     */
    public static String intern(String str) {
        final String trimmed = str.trim();
        return trimmed.intern();
    }

    /**
     * Converts a lattice over per-character tags into a lattice over words and
     * prints it with {@code DFSA.printAttFsmFormat}.
     *
     * @param dfsa        the tag lattice
     * @param list        the labelled characters the lattice was built for
     * @param printWriter destination
     * @throws RuntimeException wrapping any {@link IOException} raised while printing
     */
    @Override // edu.stanford.nlp.sequences.LatticeWriter
    public void printLattice(DFSA<String, Integer> dfsa, List<CoreLabel> list, PrintWriter printWriter) {
        final CoreLabel[] document = list.toArray(new CoreLabel[0]);
        final MutableInteger nodeId = new MutableInteger(0);

        // The answer lattice starts with a single root state numbered 0.
        final DFSA<String, Integer> answerLattice = new DFSA<>((DFSAState<String, Integer>) null);
        final DFSAState<String, Integer> answerRoot = new DFSAState<>(Integer.valueOf(nodeId.intValue()), answerLattice);
        answerLattice.setInitialState(answerRoot);

        final Map<DFSAState<String, Integer>, DFSAState<String, Integer>> stateLinks = Generics.newHashMap();
        tagLatticeToAnswerLattice(dfsa.initialState(), answerRoot, new StringBuilder(), nodeId, 0, 0.0d, stateLinks, answerLattice, document);

        try {
            answerLattice.printAttFsmFormat(printWriter);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Recursively expands the tag lattice (one "0"/"1" transition per character)
     * into the answer lattice, whose transitions carry whole words.
     *
     * <p>Characters are accumulated in {@code sb} while following transitions; a
     * "1" transition with a non-empty accumulated word emits that word as an edge
     * in the answer lattice. A path is abandoned (the loop moves on to the next
     * transition) when:
     * <ul>
     *   <li>it is not the cheapest transition out of position 0;</li>
     *   <li>{@code flags.keepAllWhitespaces} is set and it predicts "0" where the
     *       input had a space;</li>
     *   <li>it predicts a boundary, absent from the input, between two ASCII
     *       letters or between two numeral characters;</li>
     *   <li>the emitted word leads to an answer state that was already created
     *       (only the edge is added);</li>
     *   <li>the emitted word ends in an accepting tag state.</li>
     * </ul>
     *
     * @param dFSAState       current state in the tag lattice
     * @param dFSAState2      current source state in the answer lattice
     * @param sb              characters of the word accumulated so far (not modified)
     * @param mutableInteger  counter used to number new answer states
     * @param i               index of the current character in {@code coreLabelArr}
     * @param d               cost accumulated for the current word
     * @param map             tag state to answer state links created so far
     * @param dfsa            the answer lattice being built
     * @param coreLabelArr    the labelled characters of the document
     */
    private void tagLatticeToAnswerLattice(DFSAState<String, Integer> dFSAState, DFSAState<String, Integer> dFSAState2, StringBuilder sb, MutableInteger mutableInteger, int i, double d, Map<DFSAState<String, Integer>, DFSAState<String, Integer>> map, DFSA<String, Integer> dfsa, CoreLabel[] coreLabelArr) {
        // An accepting state without successors gets an artificial "1" transition
        // so that the word still being accumulated is emitted.
        if (dFSAState.isAccepting() && dFSAState.continuingInputs().isEmpty()) {
            dFSAState.addTransition(new DFSATransition<>("", dFSAState, new DFSAState<String, Integer>(-1, null), "1", "", 0.0d));
        }

        // Current character and whether the input had a space before it.
        final CoreLabel curLabel = i < coreLabelArr.length ? coreLabelArr[i] : null;
        String curChr = null;
        String origSpace = null;
        if (curLabel != null) {
            curChr = (String) curLabel.get(CoreAnnotations.OriginalCharAnnotation.class);
            // NOTE(review): CTBDocumentParser stores two chars here for supplementary
            // code points, which this assertion rejects when assertions are enabled.
            if (!$assertionsDisabled && curChr.length() != 1) {
                throw new AssertionError();
            }
            origSpace = (String) curLabel.get(CoreAnnotations.SpaceBeforeAnnotation.class);
        }

        final Set<String> inputs = dFSAState.continuingInputs();

        // At position 0 only the cheapest outgoing transition is followed.
        String answerConstraint = null;
        if (i == 0) {
            double minCost = Double.POSITIVE_INFINITY;
            for (String input : inputs) {
                final double transitionCost = dFSAState.transition(input).score();
                if (transitionCost < minCost && input != null) {
                    logger.info(String.format("mincost (%s): %e -> %e%n", input, Double.valueOf(minCost), Double.valueOf(transitionCost)));
                    minCost = transitionCost;
                    answerConstraint = input;
                }
            }
        }

        for (String predictSpace : inputs) {
            final DFSATransition<String, Integer> transition = dFSAState.transition(predictSpace);
            final DFSAState<String, Integer> tagTarget = transition.target();
            DFSAState<String, Integer> answerSource = dFSAState2;
            StringBuilder answer = new StringBuilder(sb.toString());
            final int answerLength = answer.length();
            final String prevChr = answerLength > 0 ? answer.substring(answerLength - 1) : null;
            double cost = d;

            if (answerConstraint != null && !answerConstraint.equals(predictSpace)) {
                logger.info(String.format("Skipping transition %s at pos 0.%n", predictSpace));
                continue;
            }
            if (this.flags.keepAllWhitespaces && "0".equals(predictSpace) && "1".equals(origSpace)) {
                logger.info(String.format("Skipping non-boundary at pos %d, since space in the input.%n", Integer.valueOf(i)));
                continue;
            }

            // Never split two ASCII letters or two numerals unless the input already did.
            if ("1".equals(predictSpace) && "0".equals(origSpace) && prevChr != null && curChr != null) {
                final char previous = prevChr.charAt(0);
                final char current = curChr.charAt(0);
                if (ChineseStringUtils.isLetterASCII(previous) && ChineseStringUtils.isLetterASCII(current)) {
                    logger.info(String.format("Not hypothesizing a boundary at pos %d, since between two ASCII letters (%s and %s).%n", Integer.valueOf(i), prevChr, curChr));
                    continue;
                }
                if (ChineseUtils.isNumber(previous) && ChineseUtils.isNumber(current)) {
                    logger.info(String.format("Not hypothesizing a boundary at pos %d, since between two numeral characters (%s and %s).%n", Integer.valueOf(i), prevChr, curChr));
                    continue;
                }
            }

            // A boundary after a non-empty word emits that word into the answer lattice.
            if ("1".equals(predictSpace) && answer.toString().length() > 0) {
                if (map.containsKey(dFSAState)) {
                    // The destination was expanded before: add the edge and stop here,
                    // everything beyond it is already in the answer lattice.
                    answerSource.addTransition(new DFSATransition<>("", answerSource, map.get(dFSAState), answer.toString(), "", cost));
                    continue;
                }
                mutableInteger.incValue(1);
                final DFSAState<String, Integer> answerTarget = new DFSAState<>(Integer.valueOf(mutableInteger.intValue()), dfsa, 0.0d);
                map.put(dFSAState, answerTarget);
                answerSource.addTransition(new DFSATransition<>("", answerSource, answerTarget, answer.toString(), "", cost));
                if (dFSAState.isAccepting()) {
                    // End of the sentence: there is no further character to append.
                    answerTarget.setAccepting(true);
                    continue;
                }
                // Start accumulating the next word from the new answer state.
                answerSource = answerTarget;
                answer = new StringBuilder();
                cost = 0.0d;
            }

            if (!$assertionsDisabled && curChr == null) {
                throw new AssertionError();
            }
            answer.append(curChr);
            final double newCost = cost + transition.score();
            // Prune expensive paths, but never cut in the middle of an ASCII word.
            if (newCost < this.flags.searchGraphPrune || ChineseStringUtils.isLetterASCII(curChr.charAt(0))) {
                tagLatticeToAnswerLattice(tagTarget, answerSource, answer, mutableInteger, i + 1, newCost, map, dfsa, coreLabelArr);
            }
        }
    }

    // Initialises the static finals declared at the top of the class.
    static {
        // Mirrors what the compiler generates for assert statements: true when assertions are off.
        $assertionsDisabled = !Sighan2005DocumentReaderAndWriter.class.desiredAssertionStatus();
        logger = Redwood.channels(Sighan2005DocumentReaderAndWriter.class);
        // Date characters; shapeOf() labels a match "D".
        dateChars = Pattern.compile("[年月日]");
        // Date characters plus one extra character; only consulted when flags.augmentedDateChars is set.
        dateCharsPlus = Pattern.compile("[年月日号]");
        // ASCII, full-width and Chinese numeral characters; shapeOf() labels a match "N".
        numberChars = Pattern.compile("[0-9０-９一二三四五六七八九十零〇百千万亿兩○◯〡-〩〸-〺]");
        // ASCII and full-width Latin letters; shapeOf() labels a match "L".
        letterChars = Pattern.compile("[A-Za-zＡ-Ｚａ-ｚ]");
        // Period-like characters; shapeOf() labels a match "P".
        periodChars = Pattern.compile("[﹒‧．.点]");
    }
}
