package edu.stanford.nlp.wordseg;

import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.tagger.maxent.TaggerConfig;
import edu.stanford.nlp.trees.international.pennchinese.ChineseUtils;
import java.io.File;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/stanford/nlp/wordseg/ChineseStringUtils.class */
public class ChineseStringUtils {
    private static final boolean DEBUG = false;
    static Pattern[] puncsPat = null;
    static Character[] puncs = null;
    static Character[] colons = {(char) 65109, ':', (char) 65306};
    static Pattern[] colonsPat = null;
    static Pattern[] colonsWhitePat = null;
    private static final Pattern percentsPat = Pattern.compile("[\\s\\p{Zs}]*([％%])[\\s\\p{Zs}]*");
    private static final String percentStr = "[\\s\\p{Zs}]+([％%])";
    private static Pattern percentsWhitePat;

    private ChineseStringUtils() {
    }

    public static boolean isLetterASCII(char c) {
        return c <= 127 && Character.isLetter(c);
    }

    public static String combineSegmentedSentence(List<CoreLabel> list, SeqClassifierFlags seqClassifierFlags) {
        int i = 0;
        StringBuilder sb = new StringBuilder();
        StringBuilder sb2 = new StringBuilder();
        StringBuilder sb3 = new StringBuilder();
        CoreLabel coreLabel = null;
        Iterator<CoreLabel> it = list.iterator();
        while (it.hasNext()) {
            CoreLabel coreLabel2 = coreLabel;
            coreLabel = it.next();
            boolean equals = TaggerConfig.NTHREADS.equals(coreLabel.get(CoreAnnotations.SpaceBeforeAnnotation.class));
            if (!((String) coreLabel.get(CoreAnnotations.AnswerAnnotation.class)).equals(TaggerConfig.NTHREADS) || "0".equals(String.valueOf(coreLabel.get(CoreAnnotations.PositionAnnotation.class)))) {
                boolean z = false;
                if (i > 0) {
                    char charAt = ((String) coreLabel2.get(CoreAnnotations.OriginalCharAnnotation.class)).charAt(0);
                    char charAt2 = ((String) coreLabel.get(CoreAnnotations.OriginalCharAnnotation.class)).charAt(0);
                    if ((charAt < 128) != (charAt2 < 128) && ((!ChineseUtils.isNumber(charAt) || !ChineseUtils.isNumber(charAt2)) && seqClassifierFlags.separateASCIIandRange)) {
                        z = true;
                    }
                }
                if (seqClassifierFlags.keepEnglishWhitespaces && i > 0) {
                    char charAt3 = ((String) coreLabel2.get(CoreAnnotations.OriginalCharAnnotation.class)).charAt(0);
                    char charAt4 = ((String) coreLabel.get(CoreAnnotations.OriginalCharAnnotation.class)).charAt(0);
                    if (((isLetterASCII(charAt3) && isLetterASCII(charAt4)) || ((isLetterASCII(charAt3) && ChineseUtils.isNumber(charAt4)) || (ChineseUtils.isNumber(charAt3) && isLetterASCII(charAt4)))) && TaggerConfig.NTHREADS.equals(coreLabel.get(CoreAnnotations.SpaceBeforeAnnotation.class))) {
                        z = true;
                    }
                }
                if (seqClassifierFlags.keepAllWhitespaces && !"0".equals(String.valueOf(coreLabel.get(CoreAnnotations.PositionAnnotation.class))) && TaggerConfig.NTHREADS.equals(coreLabel.get(CoreAnnotations.SpaceBeforeAnnotation.class))) {
                    z = true;
                }
                if (z) {
                    if (equals) {
                        sb.append((char) 6436);
                    } else {
                        sb.append(' ');
                    }
                }
            } else {
                boolean z2 = true;
                if (seqClassifierFlags.keepEnglishWhitespaces && i > 0) {
                    char charAt5 = ((String) coreLabel2.get(CoreAnnotations.OriginalCharAnnotation.class)).charAt(0);
                    char charAt6 = ((String) coreLabel.get(CoreAnnotations.OriginalCharAnnotation.class)).charAt(0);
                    if (isLetterASCII(charAt5) && isLetterASCII(charAt6) && !equals) {
                        z2 = false;
                    }
                }
                if (seqClassifierFlags.keepAllWhitespaces && equals) {
                    z2 = true;
                }
                if (z2) {
                    if (equals) {
                        sb.append((char) 6436);
                    } else {
                        sb.append(' ');
                    }
                }
                sb2.append(' ');
                sb3.append(' ');
            }
            sb.append((String) coreLabel.get(CoreAnnotations.OriginalCharAnnotation.class));
            sb2.append((String) coreLabel.get(CoreAnnotations.OriginalCharAnnotation.class));
            sb3.append((String) coreLabel.get(CoreAnnotations.CharAnnotation.class));
            i++;
        }
        String sb4 = sb.toString();
        if (seqClassifierFlags.sighanPostProcessing) {
            if (!seqClassifierFlags.keepAllWhitespaces) {
                sb4 = sb4.replaceAll("ᤤ", " ");
            }
            sb4 = postProcessingAnswer(sb4, seqClassifierFlags);
        }
        return sb4.replaceAll("ᤤ", " ");
    }

    private static String postProcessingAnswer(String str, SeqClassifierFlags seqClassifierFlags) {
        return seqClassifierFlags.useHk ? postProcessingAnswerHK(str) : seqClassifierFlags.useAs ? postProcessingAnswerAS(str) : seqClassifierFlags.usePk ? postProcessingAnswerPK(str, seqClassifierFlags.keepAllWhitespaces) : seqClassifierFlags.useMsr ? postProcessingAnswerMSR(str) : postProcessingAnswerCTB(str, seqClassifierFlags.keepAllWhitespaces, seqClassifierFlags.suppressMidDotPostprocessing);
    }

    private static String separatePuncs(String str) {
        if (puncs == null) {
            puncs = new Character[]{(char) 12289, (char) 12290, (char) 12291, (char) 12296, (char) 12297, (char) 12298, (char) 12299, (char) 12300, (char) 12301, (char) 12302, (char) 12303, (char) 12304, (char) 12305, (char) 12308, (char) 12309};
        }
        if (puncsPat == null) {
            puncsPat = new Pattern[puncs.length];
            for (int i = 0; i < puncs.length; i++) {
                puncsPat[i] = Pattern.compile(ChineseUtils.WHITE + puncs[i] + ChineseUtils.WHITE);
            }
        }
        for (int i2 = 0; i2 < puncsPat.length; i2++) {
            str = puncsPat[i2].matcher(str).replaceAll(" " + puncs[i2] + " ");
        }
        return str.trim();
    }

    private static String separatePuncs(Character[] chArr, String str) {
        if (puncs == null) {
            puncs = chArr;
        }
        if (puncsPat == null) {
            puncsPat = new Pattern[puncs.length];
            for (int i = 0; i < puncs.length; i++) {
                Character ch = puncs[i];
                if (ch.charValue() == '(' || ch.charValue() == ')') {
                    puncsPat[i] = Pattern.compile("[\\s\\p{Zs}]*\\" + ch + ChineseUtils.WHITE);
                } else {
                    puncsPat[i] = Pattern.compile(ChineseUtils.WHITE + ch + ChineseUtils.WHITE);
                }
            }
        }
        for (int i2 = 0; i2 < puncsPat.length; i2++) {
            str = puncsPat[i2].matcher(str).replaceAll(" " + puncs[i2] + " ");
        }
        return str.trim();
    }

    private static String gluePunc(Character ch, String str) {
        return Pattern.compile(ch + ChineseUtils.WHITE).matcher(Pattern.compile(ChineseUtils.WHITE + ch).matcher(str).replaceAll(String.valueOf(ch))).replaceAll(String.valueOf(ch)).trim();
    }

    private static String processColons(String str, String str2) {
        if (colonsPat == null) {
            colonsPat = new Pattern[colons.length];
            for (int i = 0; i < colons.length; i++) {
                colonsPat[i] = Pattern.compile(ChineseUtils.WHITE + colons[i] + ChineseUtils.WHITE);
            }
        }
        for (int i2 = 0; i2 < colons.length; i2++) {
            str = colonsPat[i2].matcher(str).replaceAll(" " + colons[i2] + " ");
        }
        if (colonsWhitePat == null) {
            colonsWhitePat = new Pattern[colons.length];
            for (int i3 = 0; i3 < colons.length; i3++) {
                colonsWhitePat[i3] = Pattern.compile("(" + str2 + ")" + ChineseUtils.WHITEPLUS + colons[i3] + ChineseUtils.WHITEPLUS + "(" + str2 + ")");
            }
        }
        for (int i4 = 0; i4 < colons.length; i4++) {
            Character ch = colons[i4];
            Pattern pattern = colonsWhitePat[i4];
            Matcher matcher = pattern.matcher(str);
            while (true) {
                Matcher matcher2 = matcher;
                if (matcher2.find()) {
                    str = matcher2.replaceAll("$1" + ch + "$2");
                    matcher = pattern.matcher(str);
                }
            }
        }
        return str.trim();
    }

    private static String processPercents(String str, String str2) {
        String replaceAll = percentsPat.matcher(str).replaceAll(" $1 ");
        if (percentsWhitePat == null) {
            percentsWhitePat = Pattern.compile("(" + str2 + ")" + percentStr);
        }
        return percentsWhitePat.matcher(replaceAll).replaceAll("$1$2").trim();
    }

    private static String processDots(String str, String str2) {
        Pattern compile = Pattern.compile("(" + str2 + ")" + ChineseUtils.WHITEPLUS + "([﹒‧．.])" + ChineseUtils.WHITEPLUS + "(" + str2 + ")");
        Matcher matcher = compile.matcher(str);
        while (true) {
            Matcher matcher2 = matcher;
            if (!matcher2.find()) {
                break;
            }
            str = matcher2.replaceAll("$1$2$3");
            matcher = compile.matcher(str);
        }
        Pattern compile2 = Pattern.compile("(" + str2 + ")([﹒‧．.])" + ChineseUtils.WHITEPLUS + "(" + str2 + ")");
        Matcher matcher3 = compile2.matcher(str);
        while (true) {
            Matcher matcher4 = matcher3;
            if (!matcher4.find()) {
                break;
            }
            str = matcher4.replaceAll("$1$2$3");
            matcher3 = compile2.matcher(str);
        }
        Pattern compile3 = Pattern.compile("(" + str2 + ")" + ChineseUtils.WHITEPLUS + "([﹒‧．.])(" + str2 + ")");
        Matcher matcher5 = compile3.matcher(str);
        while (true) {
            Matcher matcher6 = matcher5;
            if (!matcher6.find()) {
                return str.trim();
            }
            str = matcher6.replaceAll("$1$2$3");
            matcher5 = compile3.matcher(str);
        }
    }

    private static String processCommas(String str) {
        String replaceAll = str.replaceAll(",", " , ").replaceAll("  ", " ");
        Matcher matcher = Pattern.compile("([0-9０-９])" + ChineseUtils.WHITE + "(,)" + ChineseUtils.WHITE + "([0-9０-９]{3}[^0-9０-９])").matcher(replaceAll);
        if (matcher.find()) {
            replaceAll = matcher.replaceAll("$1$2$3");
        }
        return replaceAll.trim();
    }

    static String postProcessingAnswerCTB(String str, boolean z, boolean z2) {
        String separatePuncs = separatePuncs(new Character[]{(char) 12289, (char) 12290, (char) 12291, (char) 12296, (char) 12297, (char) 12298, (char) 12299, (char) 12300, (char) 12301, (char) 12302, (char) 12303, (char) 12304, (char) 12305, (char) 12308, (char) 12309, '(', ')', '\"', '<', '>'}, str);
        if (!z2) {
            separatePuncs = gluePunc((char) 12539, separatePuncs);
        }
        return processCommas(processDots(processPercents(processColons(separatePuncs, "[0-9０-９]+"), "[0-9０-９]+"), "[0-9０-９]+")).trim();
    }

    private static String postProcessingAnswerPK(String str, boolean z) {
        String separatePuncs = separatePuncs(new Character[]{(char) 12289, (char) 12290, (char) 12291, (char) 12296, (char) 12297, (char) 12298, (char) 12299, (char) 12300, (char) 12301, (char) 12302, (char) 12303, (char) 12304, (char) 12305, (char) 12308, (char) 12309, (char) 8451}, str);
        if (!z) {
            separatePuncs = processCommas(processDots(processPercents(processColons(separatePuncs, "[0-9０-９．·一十百]+"), "[0-9０-９．·一十百]+"), "[0-9０-９．·一十百]+"));
            String[] strArr = {"—[\\s\\p{Zs}]*—[\\s\\p{Zs}]*—", "…[\\s\\p{Zs}]*…"};
            String[] strArr2 = {"———", "……"};
            for (int i = 0; i < strArr.length; i++) {
                separatePuncs = Pattern.compile(ChineseUtils.WHITE + strArr[i] + ChineseUtils.WHITE).matcher(separatePuncs).replaceAll(" " + strArr2[i] + " ");
            }
        }
        return separatePuncs.trim();
    }

    private static String postProcessingAnswerMSR(String str) {
        return separatePuncs(str);
    }

    private static String postProcessingAnswerAS(String str) {
        return processCommas(processDots(processPercents(processColons(separatePuncs(str), "[０-９一二三四五六七八九十百千]+"), "[０-９一二三四五六七八九十百千]+"), "[０-９一二三四五六七八九十百千]+"));
    }

    private static String postProcessingAnswerHK(String str) {
        String processColons = processColons(separatePuncs(new Character[]{(char) 12289, (char) 12290, (char) 12291, (char) 12296, (char) 12297, (char) 12298, (char) 12299, (char) 12300, (char) 12301, (char) 12302, (char) 12303, (char) 12304, (char) 12305, (char) 12308, (char) 12309, (char) 8451}, str), "[0-9]+");
        String[] strArr = {"—[\\s\\p{Zs}]*—[\\s\\p{Zs}]*—", "…[\\s\\p{Zs}]*…"};
        String[] strArr2 = {"———", "……"};
        for (int i = 0; i < strArr.length; i++) {
            processColons = Pattern.compile(ChineseUtils.WHITE + strArr[i] + ChineseUtils.WHITE).matcher(processColons).replaceAll(" " + strArr2[i] + " ");
        }
        return processColons.trim();
    }

    public static void main(String[] strArr) {
        String str = strArr[0];
        Iterator<String> it = ObjectBank.getLineIterator(new File(str), strArr[1]).iterator();
        while (it.hasNext()) {
            EncodingPrintWriter.out.println(processPercents(it.next(), "[0-9０-９]+"), "UTF-8");
        }
    }
}
