package edu.stanford.nlp.international.french.process;

import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.AbstractTokenizer;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.LexedTokenFactory;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.WordTokenFactory;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;

/* loaded from: input_file:edu/stanford/nlp/international/french/process/FrenchTokenizer.class */
public class FrenchTokenizer<T extends HasWord> extends AbstractTokenizer<T> {
    /** Redwood logging channel for this class; used by {@code main} for help output and errors. */
    private static final Redwood.RedwoodChannels log = Redwood.channels(FrenchTokenizer.class);
    /** Underlying lexer from which {@code getNext} pulls raw tokens. */
    private final FrenchLexer lexer;
    /** Option key recognised by the factory's {@code setOptions} to toggle compound splitting. */
    private static final String SPLIT_COMPOUNDS_OPTION = "splitCompounds";
    /** Option key recognised by the factory's {@code setOptions} to toggle contraction splitting. */
    private static final String SPLIT_CONTRACTIONS_OPTION = "splitContractions";
    /** When true, CoreLabel tokens whose ParentAnnotation is "comp" are split by {@code processCompound}. */
    private final boolean splitCompounds;
    /** When true, CoreLabel tokens whose ParentAnnotation is "contraction" are split by {@code processContraction}. */
    private final boolean splitContractions;
    /**
     * Pieces produced by splitting that still have to be returned by {@code getNext}.
     * Only allocated by the constructor when at least one of the split flags is set; otherwise stays null.
     */
    private List<CoreLabel> compoundBuffer;
    /** Option string applied on top of the defaults by {@code ftbFactory()} (the "-ftb" command-line mode). */
    public static final String FTB_OPTIONS = "ellipses=ptb3,normalizeParentheses=true,ptb3Dashes=false,splitContractions=true,splitCompounds=true";
    /** Option string used by {@code FrenchTokenizerFactory.newTokenizerFactory()}. */
    public static final String DEFAULT_OPTIONS = "invertible,splitCompounds=false,splitContractions=false,quotes=ORIGINAL";

    /* loaded from: input_file:edu/stanford/nlp/international/french/process/FrenchTokenizer$FrenchTokenizerFactory.class */
    /**
     * Factory for {@link FrenchTokenizer} instances.
     *
     * <p>The factory keeps three pieces of configuration: the two split flags, which are
     * interpreted by the tokenizer itself, and a {@link Properties} object holding every other
     * option, which is handed unchanged to the lexer. A freshly constructed factory has compound
     * splitting off and contraction splitting on; an option string may override both.
     *
     * @param <T> type of token produced by the tokenizers this factory creates
     */
    public static class FrenchTokenizerFactory<T extends HasWord> implements TokenizerFactory<T>, Serializable {
        private static final long serialVersionUID = 946818805507187330L;

        /** Token factory passed to every tokenizer created by this factory. */
        protected final LexedTokenFactory<T> factory;
        /** Options not consumed by this factory; passed to each new tokenizer for its lexer. */
        protected Properties lexerProperties;
        /** Current value of the "splitCompounds" option. */
        protected boolean splitCompoundOption;
        /** Current value of the "splitContractions" option. */
        protected boolean splitContractionOption;

        /**
         * Creates a factory producing {@link CoreLabel} tokens, configured with
         * {@link FrenchTokenizer#DEFAULT_OPTIONS}.
         *
         * @return a new CoreLabel tokenizer factory
         */
        public static TokenizerFactory<CoreLabel> newTokenizerFactory() {
            return new FrenchTokenizerFactory<>(new CoreLabelTokenFactory(), FrenchTokenizer.DEFAULT_OPTIONS);
        }

        /**
         * Creates a factory producing {@link Word} tokens.
         *
         * @param str comma-separated option string, interpreted as in {@link #setOptions(String)}
         * @return a new Word tokenizer factory
         */
        public static TokenizerFactory<Word> newWordTokenizerFactory(String str) {
            return new FrenchTokenizerFactory<>(new WordTokenFactory(), str);
        }

        private FrenchTokenizerFactory(LexedTokenFactory<T> lexedTokenFactory) {
            this.lexerProperties = new Properties();
            this.splitCompoundOption = false;
            this.splitContractionOption = true;
            this.factory = lexedTokenFactory;
        }

        private FrenchTokenizerFactory(LexedTokenFactory<T> lexedTokenFactory, String str) {
            this(lexedTokenFactory);
            setOptions(str);
        }

        /**
         * Returns a tokenizer over {@code reader}; equivalent to {@link #getTokenizer(Reader)}.
         */
        @Override // edu.stanford.nlp.objectbank.IteratorFromReaderFactory
        public Iterator<T> getIterator(Reader reader) {
            return getTokenizer(reader);
        }

        /**
         * Builds a tokenizer over {@code reader} using the factory's current options.
         */
        @Override // edu.stanford.nlp.process.TokenizerFactory
        public Tokenizer<T> getTokenizer(Reader reader) {
            return new FrenchTokenizer<>(reader, this.factory, this.lexerProperties, this.splitCompoundOption, this.splitContractionOption);
        }

        /**
         * Applies a comma-separated list of options to this factory.
         *
         * <p>Each item is either a bare name (treated as {@code name=true}) or {@code name=value}.
         * "splitCompounds" and "splitContractions" update the corresponding flags; every other
         * option is stored in {@link #lexerProperties}. Items containing more than one usable
         * "=" separator are reported on standard error and skipped. A null string and empty
         * items (for example from a doubled or trailing comma) are ignored.
         *
         * @param str the option string; may be null or empty, in which case nothing changes
         */
        @Override // edu.stanford.nlp.process.TokenizerFactory
        public void setOptions(String str) {
            if (str == null) {
                return;
            }
            for (String option : str.split(",")) {
                if (option.isEmpty()) {
                    // Nothing to set; avoids registering a lexer property with an empty name.
                    continue;
                }
                String[] keyValue = option.split("=");
                if (keyValue.length == 1) {
                    // Bare flag: switch the option on.
                    if (keyValue[0].equals(FrenchTokenizer.SPLIT_COMPOUNDS_OPTION)) {
                        this.splitCompoundOption = true;
                    } else if (keyValue[0].equals(FrenchTokenizer.SPLIT_CONTRACTIONS_OPTION)) {
                        this.splitContractionOption = true;
                    } else {
                        this.lexerProperties.setProperty(option, "true");
                    }
                } else if (keyValue.length != 2) {
                    System.err.printf("%s: Bad option %s%n", getClass().getName(), option);
                } else if (keyValue[0].equals(FrenchTokenizer.SPLIT_COMPOUNDS_OPTION)) {
                    this.splitCompoundOption = Boolean.parseBoolean(keyValue[1]);
                } else if (keyValue[0].equals(FrenchTokenizer.SPLIT_CONTRACTIONS_OPTION)) {
                    this.splitContractionOption = Boolean.parseBoolean(keyValue[1]);
                } else {
                    this.lexerProperties.setProperty(keyValue[0], keyValue[1]);
                }
            }
        }

        /**
         * Applies {@code str} via {@link #setOptions(String)} and then builds a tokenizer.
         *
         * <p>Note that the options are applied to this factory itself, so they remain in effect
         * for tokenizers created afterwards.
         */
        @Override // edu.stanford.nlp.process.TokenizerFactory
        public Tokenizer<T> getTokenizer(Reader reader, String str) {
            setOptions(str);
            return getTokenizer(reader);
        }
    }

    /**
     * Creates a tokenizer reading from {@code reader}.
     *
     * @param reader            source of the text to tokenize
     * @param lexedTokenFactory factory passed to the lexer for building tokens
     * @param properties        options passed to the lexer
     * @param z                 whether compound tokens are to be split
     * @param z2                whether contraction tokens are to be split
     */
    public FrenchTokenizer(Reader reader, LexedTokenFactory<T> lexedTokenFactory, Properties properties, boolean z, boolean z2) {
        this.lexer = new FrenchLexer(reader, lexedTokenFactory, properties);
        this.splitCompounds = z;
        this.splitContractions = z2;
        if (!z && !z2) {
            // No splitting requested, so no buffer for split pieces is needed.
            return;
        }
        this.compoundBuffer = Generics.newLinkedList();
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Returns the next token, or null when the lexer is exhausted.
     *
     * <p>Pieces left over from an earlier split are served from the buffer before the lexer is
     * consulted, and tokens with an empty word are skipped. A CoreLabel token whose
     * ParentAnnotation is "comp" or "contraction" is split when the matching flag is enabled;
     * the first piece is returned and the rest is buffered.
     *
     * @throws RuntimeIOException if the lexer fails with an IOException
     */
    @Override // edu.stanford.nlp.process.AbstractTokenizer
    public T getNext() {
        HasWord token;
        try {
            while (true) {
                boolean servedFromBuffer = (this.splitContractions || this.splitCompounds) && !this.compoundBuffer.isEmpty();
                token = servedFromBuffer ? this.compoundBuffer.remove(0) : (HasWord) this.lexer.next();
                if (token == null || token.word().length() != 0) {
                    break;
                }
            }
        } catch (IOException ioe) {
            throw new RuntimeIOException(ioe);
        }

        if (this.splitCompounds && token instanceof CoreLabel) {
            CoreLabel label = (CoreLabel) token;
            boolean isCompound = label.containsKey(CoreAnnotations.ParentAnnotation.class)
                    && label.get(CoreAnnotations.ParentAnnotation.class).equals("comp");
            if (isCompound) {
                token = processCompound(label);
            }
        }

        if (this.splitContractions && token instanceof CoreLabel) {
            CoreLabel label = (CoreLabel) token;
            boolean isContraction = label.containsKey(CoreAnnotations.ParentAnnotation.class)
                    && label.get(CoreAnnotations.ParentAnnotation.class).equals("contraction");
            if (isContraction) {
                token = processContraction(label);
            }
        }

        return (T) token;
    }

    /**
     * Copies {@code coreLabel} and overwrites the copy's text and character span.
     *
     * @param coreLabel label to copy; it is not modified
     * @param str       text stored as word, value and original text of the copy
     * @param i         begin position of the copy
     * @param i2        end position of the copy
     * @return the adjusted copy
     */
    private static CoreLabel copyCoreLabel(CoreLabel coreLabel, String str, int i, int i2) {
        final CoreLabel copy = new CoreLabel(coreLabel);

        // Text of the new token.
        copy.setWord(str);
        copy.setValue(str);

        // Character span of the new token.
        copy.setBeginPosition(i);
        copy.setEndPosition(i2);

        copy.set(CoreAnnotations.OriginalTextAnnotation.class, str);
        return copy;
    }

    /**
     * Copies {@code coreLabel} with new text starting at {@code i}; the end position is the
     * begin position plus the length of {@code str}.
     */
    private static CoreLabel copyCoreLabel(CoreLabel coreLabel, String str, int i) {
        final int end = i + str.length();
        return copyCoreLabel(coreLabel, str, i, end);
    }

    /**
     * Splits a compound token into its parts, keeping each hyphen as a token of its own.
     *
     * <p>The ParentAnnotation is removed from {@code coreLabel} first. Every part becomes a
     * copy of {@code coreLabel} with its word, value and original text replaced, and is appended
     * to the buffer; the first buffered label is then removed and returned.
     *
     * @param coreLabel the compound token; its ParentAnnotation is removed as a side effect
     * @return the first label in the buffer after the parts have been appended
     */
    private CoreLabel processCompound(CoreLabel coreLabel) {
        coreLabel.remove(CoreAnnotations.ParentAnnotation.class);

        // Surround hyphens with spaces so they come out of the whitespace split as separate parts.
        final String spaced = coreLabel.word().replace("-", " - ");
        final String[] parts = spaced.split("\\s+");

        for (int idx = 0; idx < parts.length; idx++) {
            final String part = parts[idx];
            final CoreLabel piece = new CoreLabel(coreLabel);
            piece.setWord(part);
            piece.setValue(part);
            piece.set(CoreAnnotations.OriginalTextAnnotation.class, part);
            this.compoundBuffer.add(piece);
        }
        return this.compoundBuffer.remove(0);
    }

    /**
     * Splits one of the contractions "au", "aux" or "du" (matched case-insensitively) into its
     * two underlying words.
     *
     * <p>The ParentAnnotation is removed from {@code coreLabel}. The first word is returned and
     * the second is appended to the buffer so that it is emitted next. Both are copies of
     * {@code coreLabel}: the first word is assigned the first character of the original span,
     * the second word the remaining characters.
     *
     * @param coreLabel the contraction token; its ParentAnnotation is removed as a side effect
     * @return the label for the first word of the expansion
     * @throws IllegalArgumentException if the token's word is not one of the known contractions
     */
    private CoreLabel processContraction(CoreLabel coreLabel) {
        coreLabel.remove(CoreAnnotations.ParentAnnotation.class);

        final String first;
        final String second;
        final int secondOffset;
        final int secondLength;

        // Locale.ROOT keeps the match independent of the JVM's default locale.
        switch (coreLabel.word().toLowerCase(Locale.ROOT)) {
            case "au":
                first = "à";
                second = "le";
                secondOffset = 1;
                secondLength = 1;
                break;
            case "aux":
                first = "à";
                second = "les";
                secondOffset = 1;
                secondLength = 2;
                break;
            case "du":
                first = "de";
                second = "le";
                secondOffset = 1;
                secondLength = 1;
                break;
            default:
                throw new IllegalArgumentException("Invalid contraction provided to processContraction");
        }

        final int secondStart = coreLabel.beginPosition() + secondOffset;
        final int secondEnd = secondStart + secondLength;
        this.compoundBuffer.add(copyCoreLabel(coreLabel, second, secondStart, secondEnd));
        return copyCoreLabel(coreLabel, first, coreLabel.beginPosition(), secondStart);
    }

    /**
     * Returns a CoreLabel tokenizer factory configured with {@link #DEFAULT_OPTIONS}.
     *
     * @return the factory created by {@code FrenchTokenizerFactory.newTokenizerFactory()}
     */
    public static TokenizerFactory<CoreLabel> factory() {
        final TokenizerFactory<CoreLabel> defaultFactory = FrenchTokenizerFactory.newTokenizerFactory();
        return defaultFactory;
    }

    /**
     * Returns a tokenizer factory that builds tokens with the given token factory.
     *
     * @param lexedTokenFactory factory used to create the tokens
     * @param str               comma-separated option string applied to the new factory
     * @param <T>               type of token produced
     * @return a new tokenizer factory
     */
    public static <T extends HasWord> TokenizerFactory<T> factory(LexedTokenFactory<T> lexedTokenFactory, String str) {
        // Diamond instead of a raw type so the result is checked as TokenizerFactory<T>.
        return new FrenchTokenizerFactory<>(lexedTokenFactory, str);
    }

    /**
     * Returns a CoreLabel tokenizer factory that starts from the default options and then has
     * {@link #FTB_OPTIONS} applied on top.
     *
     * @return the configured factory
     */
    public static TokenizerFactory<CoreLabel> ftbFactory() {
        final TokenizerFactory<CoreLabel> ftb = FrenchTokenizerFactory.newTokenizerFactory();
        ftb.setOptions(FTB_OPTIONS);
        return ftb;
    }

    /**
     * Builds the command-line help text.
     *
     * @return the usage message, one option per line, using the platform line separator
     */
    private static String usage() {
        final String nl = System.getProperty("line.separator");
        final String[] optionLines = {
            "Options:",
            "   -help          : Print this message.",
            "   -ftb           : Tokenization for experiments in Green et al. (2011).",
            "   -lowerCase     : Apply lowercasing.",
            "   -encoding type : Encoding format.",
            "   -options str   : Orthographic options (see FrenchLexer.java)",
        };

        final StringBuilder text = new StringBuilder();
        text.append(String.format("Usage: java %s [OPTIONS] < file%n%n", FrenchTokenizer.class.getName()));
        for (String line : optionLines) {
            text.append(line).append(nl);
        }
        return text.toString();
    }

    /**
     * Describes the command-line options accepted by {@code main}.
     *
     * @return a map from option name to the number of arguments registered for that option
     */
    private static Map<String, Integer> argOptionDefs() {
        final Map<String, Integer> argCounts = Generics.newHashMap();

        // Switches registered with no argument.
        for (String flag : new String[] {"help", "ftb", "lowerCase"}) {
            argCounts.put(flag, 0);
        }
        // Options registered with one argument.
        for (String valued : new String[] {"encoding", "options"}) {
            argCounts.put(valued, 1);
        }
        return argCounts;
    }

    /**
     * Command-line entry point: tokenizes standard input and writes the tokens to standard
     * output, separated by single spaces, with one output line per newline token.
     *
     * <p>Newline tokenization ("tokenizeNLs") is always added to the options. A summary with line
     * and token counts is printed to standard error at the end.
     *
     * @param strArr command-line arguments; see {@code usage()} for the accepted options
     */
    public static void main(String[] strArr) {
        final Properties cli = StringUtils.argsToProperties(strArr, argOptionDefs());
        if (cli.containsKey("help")) {
            log.info(usage());
            return;
        }

        final TokenizerFactory<CoreLabel> tokenizerFactory;
        if (cli.containsKey("ftb")) {
            tokenizerFactory = ftbFactory();
        } else {
            tokenizerFactory = factory();
        }

        // Newline tokens are needed below to reproduce the input's line structure.
        final String userOptions = cli.getProperty("options", "");
        if (userOptions.isEmpty()) {
            tokenizerFactory.setOptions("tokenizeNLs");
        } else {
            tokenizerFactory.setOptions(userOptions + ",tokenizeNLs");
        }

        final String encoding = cli.getProperty("encoding", "UTF-8");
        final boolean lowerCase = PropertiesUtils.getBool(cli, "lowerCase", false);

        int lineCount = 0;
        int tokenCount = 0;
        final long startNanos = System.nanoTime();
        try {
            final Tokenizer<CoreLabel> tokenizer = tokenizerFactory.getTokenizer(new InputStreamReader(System.in, encoding));
            boolean needSeparator = false;
            while (tokenizer.hasNext()) {
                tokenCount++;
                final String word = tokenizer.next().word();
                if (word.equals("*NL*")) {
                    lineCount++;
                    needSeparator = false;
                    System.out.println();
                    continue;
                }
                if (needSeparator) {
                    System.out.print(" ");
                }
                if (lowerCase) {
                    System.out.print(word.toLowerCase(Locale.FRENCH));
                } else {
                    System.out.print(word);
                }
                needSeparator = true;
            }
        } catch (UnsupportedEncodingException e) {
            log.error(e);
        }

        final double elapsedSeconds = (System.nanoTime() - startNanos) / 1.0E9d;
        final double linesPerSecond = lineCount / elapsedSeconds;
        System.err.printf("Done! Tokenized %d lines (%d tokens) at %.2f lines/sec%n", lineCount, tokenCount, linesPerSecond);
    }
}
