package com.googlecode.clearnlp.tokenization;

import com.carrotsearch.hppc.ObjectIntOpenHashMap;
import com.googlecode.clearnlp.constituent.CTLibEn;
import com.googlecode.clearnlp.morphology.MPLib;
import com.googlecode.clearnlp.propbank.PBLib;
import com.googlecode.clearnlp.util.UTArray;
import com.googlecode.clearnlp.util.pair.IntIntPair;
import com.googlecode.clearnlp.util.pair.StringBooleanPair;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import jregex.MatchResult;
import jregex.Replacer;
import jregex.Substitution;
import jregex.TextBuffer;

/* loaded from: input_file:com/googlecode/clearnlp/tokenization/EnglishTokenizer.class */
/**
 * Tokenizer for English text that works by applying an ordered sequence of
 * regular-expression rewrites and dictionary look-ups to whitespace-separated
 * chunks.
 *
 * <p>Every intermediate token is a {@link StringBooleanPair}: {@code s} is the
 * token text and {@code b} marks the token as "protected", meaning later
 * splitting stages leave it untouched. Characters that must survive the
 * punctuation-splitting stages (for example a period between digits) are
 * temporarily replaced by marker strings such as {@code _DPPD_<i>_} and
 * restored at the very end of {@link #getTokenList(String)}.
 *
 * <p>The dictionaries are read from a zip stream by the constructor. Any
 * dictionary that is missing or fails to load is left empty, so the tokenizer
 * stays usable (with reduced coverage) instead of failing with a
 * {@code NullPointerException} on first use.
 */
public class EnglishTokenizer extends AbstractTokenizer {
    /** Regular expression that can never match; used when the hyphen dictionary is empty. */
    private static final String S_NEVER_MATCH = "(?!)";

    /** Zip-entry names of the dictionaries. */
    protected final String F_DIR = "tokenize/";
    protected final String F_EMOTICONS = F_DIR + "emoticons.txt";
    protected final String F_ABBREVIATIONS = F_DIR + "abbreviations.txt";
    protected final String F_HYPHENS = F_DIR + "hyphens.txt";
    protected final String F_COMPOUNDS = F_DIR + "compounds.txt";
    protected final String F_UNITS = F_DIR + "units.txt";
    protected final String F_MICROSOFT = F_DIR + "microsoft.txt";

    /** Token delimiter inserted by the substitutions and split on afterwards. */
    protected final String S_DELIM = " ";
    /** Prefix that flags a freshly split piece as protected. */
    protected final String S_PROTECTED = "PR0T_";
    /** Temporary markers standing in for characters that must not be split on. */
    protected final String S_D0D = "_DPPD_";
    protected final String S_HYPHEN = "_HYYN_";
    protected final String S_AMPERSAND = "_APSD_";
    protected final String S_APOSTROPHY = "_AOOR_";
    protected final int N_PROTECTED = S_PROTECTED.length();

    protected final Pattern P_DELIM = Pattern.compile(S_DELIM);
    protected final Pattern P_HYPHEN = Pattern.compile(PBLib.DELIM_LABEL);
    protected final Pattern P_ABBREVIATION = Pattern.compile("^(\\p{Alpha}\\.)+\\p{Alpha}?$");
    /** Characters hidden behind {@code _DPPD_<index>_}; the array index is the marker number. */
    protected final String[] A_D0D = {CTLibEn.POS_PERIOD, ",", ":", PBLib.DELIM_LABEL, "/", "'"};

    protected Replacer R_URL;
    protected Replacer R_ABBREVIATION;
    protected Replacer R_PERIOD_LIKE;
    protected Replacer R_MARKER;
    protected Replacer R_APOSTROPHY;
    protected Replacer R_USDOLLAR;
    protected Replacer R_AMPERSAND;
    protected Replacer R_WAW;
    protected Replacer R_PUNCTUATION_PRE;
    protected Replacer R_PUNCTUATION_POST;
    protected Replacer[] R_D0D;
    protected Replacer[] R_UNIT;
    protected Set<String> T_EMOTICONS;
    protected Set<String> T_ABBREVIATIONS;
    protected Pattern P_HYPHEN_LIST;
    /** Maps each entry of {@link #A_D0D} to its index. */
    protected ObjectIntOpenHashMap<String> M_D0D;
    /** Maps a compound (pieces joined without spaces) to its 1-based index in {@link #L_COMPOUNDS}. */
    protected ObjectIntOpenHashMap<String> M_COMPOUNDS;
    /** For every compound, the [begin, end) character offsets of its pieces. */
    protected List<IntIntPair[]> L_COMPOUNDS;
    protected Pattern[] P_RECOVER_D0D;
    protected Pattern P_RECOVER_DOT;
    protected Pattern P_RECOVER_HYPHEN;
    protected Pattern P_RECOVER_APOSTROPHY;
    protected Pattern P_RECOVER_AMPERSAND;

    /**
     * Rewrites {@code group(1) group(2) group(3)} so that group 2 is replaced by
     * the marker {@code _DPPD_<index>_}, where the index is looked up in
     * {@link EnglishTokenizer#M_D0D}.
     */
    public class SubstitutionD0D implements Substitution {
        private SubstitutionD0D() {
        }

        public void appendSubstitution(MatchResult matchResult, TextBuffer textBuffer) {
            textBuffer.append(matchResult.group(1));
            textBuffer.append(S_D0D + EnglishTokenizer.this.M_D0D.get(matchResult.group(2)) + "_");
            textBuffer.append(matchResult.group(3));
        }
    }

    /** Isolates the whole match as its own protected piece: {@code " PR0T_<match> "}. */
    public class SubstitutionOne implements Substitution {
        private SubstitutionOne() {
        }

        public void appendSubstitution(MatchResult matchResult, TextBuffer textBuffer) {
            textBuffer.append(S_DELIM);
            textBuffer.append(S_PROTECTED);
            textBuffer.append(matchResult.group(0));
            textBuffer.append(S_DELIM);
        }
    }

    /** Emits group 1 as a protected piece followed by group 3 as an ordinary piece. */
    public class SubstitutionOnePlus implements Substitution {
        private SubstitutionOnePlus() {
        }

        public void appendSubstitution(MatchResult matchResult, TextBuffer textBuffer) {
            textBuffer.append(S_DELIM);
            textBuffer.append(S_PROTECTED);
            textBuffer.append(matchResult.group(1));
            textBuffer.append(S_DELIM);
            textBuffer.append(matchResult.group(3));
        }
    }

    /** Inserts a delimiter between group 1 and group 2 without protecting either. */
    public class SubstitutionTwo implements Substitution {
        private SubstitutionTwo() {
        }

        public void appendSubstitution(MatchResult matchResult, TextBuffer textBuffer) {
            textBuffer.append(matchResult.group(1));
            textBuffer.append(S_DELIM);
            textBuffer.append(matchResult.group(2));
        }
    }

    /**
     * Builds the tokenizer and loads its dictionaries from the given zip stream.
     *
     * <p>The stream is consumed and closed. A loading failure is reported on
     * standard error and is not propagated; whatever could not be loaded stays
     * at its empty default.
     *
     * @param zipInputStream zip stream containing the {@code tokenize/*.txt} entries
     */
    public EnglishTokenizer(ZipInputStream zipInputStream) {
        initReplacers();
        initMapsD0D();
        initPatterns();
        initDictionaryDefaults();
        try {
            initDictionaries(zipInputStream);
        } catch (Exception e) {
            // Best effort by design: the constructor has never thrown, so keep
            // reporting the problem and continue with the empty defaults.
            e.printStackTrace();
        }
    }

    /**
     * Tokenizes a string.
     *
     * <p>The order of the stages below matters: protection has to happen before
     * the stage that would otherwise split the token, and markers are restored
     * only after the last splitting stage.
     *
     * @param str the raw text
     * @return the tokens in order of appearance
     */
    @Override // com.googlecode.clearnlp.tokenization.AbstractTokenizer
    public List<StringBooleanPair> getTokenList(String str) {
        List<StringBooleanPair> tokens = tokenizeWhiteSpaces(str);
        protectEmoticons(tokens);

        tokens = tokenizePatterns(tokens, this.R_URL);
        tokens = tokenizePatterns(tokens, this.R_ABBREVIATION);
        tokens = tokenizePatterns(tokens, this.R_PERIOD_LIKE);
        tokens = tokenizePatterns(tokens, this.R_MARKER);
        tokens = tokenizePatterns(tokens, this.R_USDOLLAR);

        for (Replacer replacer : this.R_D0D) {
            replaceProtects(tokens, replacer);
        }
        replaceHyphens(tokens);

        tokens = tokenizePatterns(tokens, this.R_PUNCTUATION_PRE);
        protectAbbreviations(tokens);
        protectFilenames(tokens);

        tokens = tokenizeCompounds(tokens);
        tokens = tokenizePatterns(tokens, this.R_APOSTROPHY);
        replaceProtects(tokens, this.R_AMPERSAND);
        replaceProtects(tokens, this.R_WAW);

        for (Replacer replacer : this.R_UNIT) {
            tokens = tokenizePatterns(tokens, replacer);
        }
        if (this.b_twit) {
            protectTwits(tokens);
        }
        tokens = tokenizePatterns(tokens, this.R_PUNCTUATION_POST);

        // Restore every character that was hidden behind a marker.
        for (int i = 0; i < this.P_RECOVER_D0D.length; i++) {
            recoverPatterns(tokens, this.P_RECOVER_D0D[i], this.A_D0D[i]);
        }
        recoverPatterns(tokens, this.P_RECOVER_HYPHEN, PBLib.DELIM_LABEL);
        recoverPatterns(tokens, this.P_RECOVER_APOSTROPHY, "'");
        recoverPatterns(tokens, this.P_RECOVER_AMPERSAND, "&");
        return tokens;
    }

    /** Compiles all replacers that do not depend on a dictionary. */
    private void initReplacers() {
        this.R_URL = MPLib.URL_SPAN.replacer(new SubstitutionOne());
        this.R_ABBREVIATION = new jregex.Pattern("(^(\\p{Alpha}\\.)+)(\\p{Punct}*$)").replacer(new SubstitutionOnePlus());
        this.R_PERIOD_LIKE = new jregex.Pattern("(\\.|\\?|\\!){2,}").replacer(new SubstitutionOne());
        this.R_MARKER = new jregex.Pattern("\\-{2,}|\\*{2,}|\\={2,}|\\~{2,}|\\,{2,}|\\`{2,}|\\'{2,}").replacer(new SubstitutionOne());
        this.R_APOSTROPHY = new jregex.Pattern("(?i)((\\')(s|d|m|z|ll|re|ve|nt)|n(\\')t)$").replacer(new SubstitutionOne());
        this.R_USDOLLAR = new jregex.Pattern("^US\\$").replacer(new SubstitutionOne());
        this.R_AMPERSAND = getReplacerAmpersand();
        this.R_WAW = getReplacerWAWs();
        this.R_PUNCTUATION_PRE = new jregex.Pattern("\\(|\\)|\\[|\\]|\\{|\\}|<|>|\\,|\\:|\\;|\\\"").replacer(new SubstitutionOne());
        this.R_PUNCTUATION_POST = new jregex.Pattern("\\.|\\?|\\!|\\`|\\'|\\-|\\/|\\@|\\#|\\$|\\%|\\&|\\|").replacer(new SubstitutionOne());
        initReplacersD0Ds();
    }

    /** Returns a substitution that keeps groups 1 and 3 and puts {@code marker} in place of group 2. */
    private Substitution middleGroupMarker(final String marker) {
        return new Substitution() {
            public void appendSubstitution(MatchResult matchResult, TextBuffer textBuffer) {
                textBuffer.append(matchResult.group(1));
                textBuffer.append(marker);
                textBuffer.append(matchResult.group(3));
            }
        };
    }

    /** Hides an ampersand between two upper-case letters behind {@code _APSD_}. */
    private Replacer getReplacerAmpersand() {
        return new jregex.Pattern("(\\p{Upper})(\\&)(\\p{Upper})").replacer(middleGroupMarker(S_AMPERSAND));
    }

    /** Hides an apostrophe between two word characters behind {@code _AOOR_}. */
    private Replacer getReplacerWAWs() {
        return new jregex.Pattern("(\\w)(\\')(\\w)").replacer(middleGroupMarker(S_APOSTROPHY));
    }

    /** Compiles the replacers that hide separator characters adjacent to digits. */
    private void initReplacersD0Ds() {
        String[] regexes = {"(^|\\p{Alnum})(\\.)(\\d)", "(\\d)(,|:|-|\\/)(\\d)", "(^)(\\')(\\d)", "(\\d)(\\')(s)"};
        this.R_D0D = new Replacer[regexes.length];
        for (int i = 0; i < regexes.length; i++) {
            this.R_D0D[i] = new jregex.Pattern(regexes[i]).replacer(new SubstitutionD0D());
        }
    }

    /** Fills {@link #M_D0D} with the index of every {@link #A_D0D} entry. */
    private void initMapsD0D() {
        this.M_D0D = new ObjectIntOpenHashMap<>();
        for (int i = 0; i < this.A_D0D.length; i++) {
            this.M_D0D.put(this.A_D0D[i], i);
        }
    }

    /** Compiles the patterns that locate the temporary markers for recovery. */
    private void initPatterns() {
        this.P_RECOVER_D0D = new Pattern[this.A_D0D.length];
        for (int i = 0; i < this.P_RECOVER_D0D.length; i++) {
            this.P_RECOVER_D0D[i] = Pattern.compile(S_D0D + i + "_");
        }
        this.P_RECOVER_HYPHEN = Pattern.compile(S_HYPHEN);
        this.P_RECOVER_APOSTROPHY = Pattern.compile(S_APOSTROPHY);
        this.P_RECOVER_AMPERSAND = Pattern.compile(S_AMPERSAND);
    }

    /**
     * Gives every dictionary-backed field an empty value so that a missing zip
     * entry or a loading failure cannot leave a {@code null} behind.
     */
    private void initDictionaryDefaults() {
        this.T_EMOTICONS = new HashSet<>();
        this.T_ABBREVIATIONS = new HashSet<>();
        this.P_HYPHEN_LIST = Pattern.compile(S_NEVER_MATCH);
        this.M_COMPOUNDS = new ObjectIntOpenHashMap<>();
        this.L_COMPOUNDS = new ArrayList<>();
        this.R_UNIT = new Replacer[0];
    }

    /**
     * Walks the zip entries and loads each recognised dictionary.
     *
     * @param zipInputStream the dictionary archive; always closed on return
     * @throws Exception if reading or parsing an entry fails
     */
    private void initDictionaries(ZipInputStream zipInputStream) throws Exception {
        try {
            ZipEntry entry;
            while ((entry = zipInputStream.getNextEntry()) != null) {
                String name = entry.getName();
                if (name.equals(F_EMOTICONS)) {
                    this.T_EMOTICONS = getSet(zipInputStream);
                } else if (name.equals(F_ABBREVIATIONS)) {
                    this.T_ABBREVIATIONS = getSet(zipInputStream);
                } else if (name.equals(F_HYPHENS)) {
                    this.P_HYPHEN_LIST = getHyphenPatterns(zipInputStream);
                } else if (name.equals(F_COMPOUNDS)) {
                    initDictionariesCompounds(zipInputStream);
                } else if (name.equals(F_UNITS)) {
                    initDictionariesUnits(zipInputStream);
                }
            }
        } finally {
            // Close on failure as well; previously the stream leaked when an entry could not be parsed.
            zipInputStream.close();
        }
    }

    /**
     * Wraps the current zip entry in a line reader.
     *
     * <p>The reader must not be closed by the caller: closing it would close the
     * underlying zip stream and make the remaining entries unreadable.
     */
    private BufferedReader openEntryReader(ZipInputStream zipInputStream) {
        // NOTE(review): decodes with the platform default charset, exactly as before.
        // Confirm the encoding of the dictionary files and pin it explicitly if known.
        return new BufferedReader(new InputStreamReader(zipInputStream));
    }

    /** Reads the current entry as a set of trimmed, non-empty lines. */
    private Set<String> getSet(ZipInputStream zipInputStream) throws Exception {
        BufferedReader reader = openEntryReader(zipInputStream);
        Set<String> entries = new HashSet<>();
        String line;
        while ((line = reader.readLine()) != null) {
            String entry = line.trim();
            // A blank line must not register the empty string as a dictionary entry.
            if (!entry.isEmpty()) {
                entries.add(entry);
            }
        }
        return entries;
    }

    /**
     * Reads the current entry, one regular expression per line, and joins the
     * lines into a single alternation.
     *
     * @return the compiled alternation, or a pattern that never matches when the entry has no usable line
     */
    private Pattern getHyphenPatterns(ZipInputStream zipInputStream) throws Exception {
        BufferedReader reader = openEntryReader(zipInputStream);
        StringBuilder alternation = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            String regex = line.trim();
            // An empty alternative would match every token, so blank lines are skipped.
            if (regex.isEmpty()) {
                continue;
            }
            if (alternation.length() > 0) {
                alternation.append('|');
            }
            alternation.append(regex);
        }
        return Pattern.compile(alternation.length() > 0 ? alternation.toString() : S_NEVER_MATCH);
    }

    /**
     * Reads the compound dictionary. Each line lists the pieces of one compound
     * separated by single spaces; the pieces joined together form the look-up key
     * and their character offsets are recorded for splitting.
     */
    private void initDictionariesCompounds(ZipInputStream zipInputStream) throws Exception {
        BufferedReader reader = openEntryReader(zipInputStream);
        this.M_COMPOUNDS = new ObjectIntOpenHashMap<>();
        this.L_COMPOUNDS = new ArrayList<>();
        int id = 1; // 1-based so that a result of 0 can stand for "not a compound"
        String line;
        while ((line = reader.readLine()) != null) {
            String[] pieces = this.P_DELIM.split(line.trim());
            IntIntPair[] spans = new IntIntPair[pieces.length];
            this.M_COMPOUNDS.put(UTArray.join(pieces, ""), id);
            this.L_COMPOUNDS.add(spans);
            int begin = 0;
            for (int i = 0; i < pieces.length; i++) {
                int end = begin + pieces[i].length();
                spans[i] = new IntIntPair(begin, end);
                begin = end;
            }
            id++;
        }
    }

    /**
     * Reads the units dictionary, which must consist of three lines, each a
     * regular-expression fragment: the first is matched only before a leading
     * number, the second both before and after a number, the third only after a
     * trailing number.
     *
     * @throws IllegalStateException if the entry has fewer than three lines
     */
    private void initDictionariesUnits(ZipInputStream zipInputStream) throws Exception {
        BufferedReader reader = openEntryReader(zipInputStream);
        String prefixOnly = readRequiredLine(reader, 1);
        String prefixOrSuffix = readRequiredLine(reader, 2);
        String suffixOnly = readRequiredLine(reader, 3);
        Replacer[] replacers = new Replacer[4];
        replacers[0] = new jregex.Pattern("^(?i)(\\p{Punct}*" + prefixOnly + ")(\\d)").replacer(new SubstitutionTwo());
        replacers[1] = new jregex.Pattern("^(?i)(\\p{Punct}*" + prefixOrSuffix + ")(\\d)").replacer(new SubstitutionTwo());
        replacers[2] = new jregex.Pattern("(?i)(\\d)(" + prefixOrSuffix + "\\p{Punct}*)$").replacer(new SubstitutionTwo());
        replacers[3] = new jregex.Pattern("(?i)(\\d)(" + suffixOnly + "\\p{Punct}*)$").replacer(new SubstitutionTwo());
        // Publish only once every pattern has compiled.
        this.R_UNIT = replacers;
    }

    /** Reads and trims the next line of the units entry, failing clearly when it is absent. */
    private String readRequiredLine(BufferedReader reader, int lineNumber) throws Exception {
        String line = reader.readLine();
        if (line == null) {
            throw new IllegalStateException(F_UNITS + " is missing line " + lineNumber + " (3 lines expected)");
        }
        return line.trim();
    }

    /**
     * Splits the text on white space.
     *
     * @return one unprotected token per chunk returned by {@code MPLib.splitWhiteSpaces}
     */
    protected List<StringBooleanPair> tokenizeWhiteSpaces(String str) {
        List<StringBooleanPair> tokens = new ArrayList<>();
        for (String chunk : MPLib.splitWhiteSpaces(str)) {
            tokens.add(new StringBooleanPair(chunk, false));
        }
        return tokens;
    }

    /** Protects tokens that start with {@code @} or {@code #} and are otherwise alphanumeric. */
    protected void protectTwits(List<StringBooleanPair> list) {
        for (StringBooleanPair token : list) {
            String form = token.s;
            // An empty token has no first character to inspect.
            if (form.isEmpty()) {
                continue;
            }
            char first = form.charAt(0);
            if ((first == '@' || first == '#') && MPLib.isAlnum(form.substring(1))) {
                token.b = true;
            }
        }
    }

    /** Protects tokens that are listed verbatim in the emoticon dictionary. */
    protected void protectEmoticons(List<StringBooleanPair> list) {
        for (StringBooleanPair token : list) {
            if (this.T_EMOTICONS.contains(token.s)) {
                token.b = true;
            }
        }
    }

    /**
     * Protects tokens whose lower-cased form is in the abbreviation dictionary
     * or has the shape of dotted letters (see {@link #P_ABBREVIATION}).
     */
    protected void protectAbbreviations(List<StringBooleanPair> list) {
        for (StringBooleanPair token : list) {
            // Locale.ROOT keeps the look-up independent of the JVM's default locale.
            String lower = token.s.toLowerCase(Locale.ROOT);
            if (this.T_ABBREVIATIONS.contains(lower) || this.P_ABBREVIATION.matcher(lower).find()) {
                token.b = true;
            }
        }
    }

    /** Protects tokens whose lower-cased form matches {@code MPLib.FILE_EXTS}. */
    protected void protectFilenames(List<StringBooleanPair> list) {
        for (StringBooleanPair token : list) {
            if (MPLib.FILE_EXTS.matcher(token.s.toLowerCase(Locale.ROOT)).find()) {
                token.b = true;
            }
        }
    }

    /** Applies the replacer in place to the text of every unprotected token. */
    protected void replaceProtects(List<StringBooleanPair> list, Replacer replacer) {
        for (StringBooleanPair token : list) {
            if (!token.b) {
                token.s = replacer.replace(token.s);
            }
        }
    }

    /**
     * Hides the hyphens of every unprotected token that matches the hyphen
     * dictionary behind {@code _HYYN_}.
     */
    protected void replaceHyphens(List<StringBooleanPair> list) {
        for (StringBooleanPair token : list) {
            if (token.b) {
                continue;
            }
            if (this.P_HYPHEN_LIST.matcher(token.s.toLowerCase(Locale.ROOT)).find()) {
                token.s = this.P_HYPHEN.matcher(token.s).replaceAll(S_HYPHEN);
            }
        }
    }

    /** Replaces every match of {@code pattern} in every token, protected or not, by {@code str}. */
    protected void recoverPatterns(List<StringBooleanPair> list, Pattern pattern, String str) {
        for (StringBooleanPair token : list) {
            token.s = pattern.matcher(token.s).replaceAll(str);
        }
    }

    /**
     * Splits every unprotected token that is a known compound into its pieces;
     * the pieces are protected. All other tokens are passed through unchanged.
     *
     * @return a new list; the input list is not modified
     */
    protected List<StringBooleanPair> tokenizeCompounds(List<StringBooleanPair> list) {
        List<StringBooleanPair> result = new ArrayList<>();
        for (StringBooleanPair token : list) {
            // Ids are stored 1-based; a form that is not a compound is expected to
            // look up as 0 and therefore ends up negative here.
            int index = token.b ? -1 : this.M_COMPOUNDS.get(token.s.toLowerCase(Locale.ROOT)) - 1;
            if (index < 0) {
                result.add(token);
                continue;
            }
            for (IntIntPair span : this.L_COMPOUNDS.get(index)) {
                result.add(new StringBooleanPair(token.s.substring(span.i1, span.i2), true));
            }
        }
        return result;
    }

    /**
     * Splits every unprotected token with the given replacer; protected tokens
     * are passed through unchanged.
     *
     * @return a new list; the input list is not modified
     */
    protected List<StringBooleanPair> tokenizePatterns(List<StringBooleanPair> list, Replacer replacer) {
        List<StringBooleanPair> result = new ArrayList<>();
        for (StringBooleanPair token : list) {
            if (token.b) {
                result.add(token);
            } else {
                tokenizePatternsAux(result, replacer, token.s);
            }
        }
        return result;
    }

    /**
     * Runs the replacer over one token text, splits the outcome on the
     * delimiter and appends the pieces. A piece carrying the {@code PR0T_}
     * prefix is added protected with the prefix removed; empty pieces are dropped.
     */
    private void tokenizePatternsAux(List<StringBooleanPair> list, Replacer replacer, String str) {
        String rewritten = replacer.replace(str).trim();
        for (String piece : this.P_DELIM.split(rewritten)) {
            if (piece.startsWith(S_PROTECTED)) {
                list.add(new StringBooleanPair(piece.substring(this.N_PROTECTED), true));
            } else if (!piece.isEmpty()) {
                list.add(new StringBooleanPair(piece, false));
            }
        }
    }
}
