package edu.stanford.nlp.international.french.pipeline;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.stats.TwoDimensionalCounter;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeReader;
import edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory;
import edu.stanford.nlp.trees.international.french.FrenchXMLTreeReader;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.util.Generics;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.Set;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/stanford/nlp/international/french/pipeline/MWEPreprocessor.class */
public final class MWEPreprocessor {
    /** Flag that is not referenced anywhere in the visible code; {@code main} always runs the dummy-tag resolution step. */
    private static final boolean RESOLVE_DUMMY_TAGS = true;
    /** Running count of preterminals labeled {@code FrenchXMLTreeReader.MISSING_POS} seen by {@code traverseAndFix}. */
    private static int nMissingPOS = 0;
    /** Running count of phrasal nodes labeled {@code FrenchXMLTreeReader.MISSING_PHRASAL} seen by {@code traverseAndFix}. */
    private static int nMissingPhrasal = 0;
    /** Tregex pattern used by {@code countMWEStatistics} to find nodes whose label matches the regex {@code ^MW}. */
    static final TregexPattern pMWE = TregexPattern.compile("/^MW/");

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:edu/stanford/nlp/international/french/pipeline/MWEPreprocessor$ManualUWModel.class */
    /**
     * Hand-written fallback tagger backed by three fixed word lists (nouns, adjectives,
     * prepositions). {@link #getTag(String)} maps a word to one of the tags "N", "A" or "P".
     */
    public static class ManualUWModel {
        private static final String nStr = "A. Alezais alfa Annick Appliances Ardenne Artois baptiste Bargue Bellanger Bregenz clefs Coeurs ...conomie consumer contrôleur Coopérative Coppée cuisson dédoublement demandeuse défraie Domestic dépistage Elektra Elettrodomestici Essonnes Fair Finparcom Gelisim gorge Happy Indesit Italia jockey Lawrence leone Levi machinisme Mc.Donnel MD Merloni Meydan ménagers Muenchener Parcel Prost R. sam Sara Siège silos SPA Stateman Valley Vanity VF Vidal Vives Yorker Young Zemment";
        private static final String aStr = "astral bis bovin gracieux intégrante italiano sanguin sèche";
        private static final String pStr = "c o t";

        private static final Set<String> nouns = Generics.newHashSet();
        private static final Set<String> adjectives = Generics.newHashSet();
        private static final Set<String> preps = Generics.newHashSet();

        /** Matches any run of digits; used with {@code find()}, so one digit anywhere in the word suffices. */
        private static final Pattern digit = Pattern.compile("\\d+");

        /** Total number of distinct entries across the three word lists. */
        private static int nUnknownWordTypes;

        static {
            addWords(nouns, nStr);
            addWords(adjectives, aStr);
            addWords(preps, pStr);
            nUnknownWordTypes = nouns.size() + adjectives.size() + preps.size();
        }

        private ManualUWModel() {
        }

        /** Splits {@code words} on whitespace and adds every token to {@code target}. */
        private static void addWords(Set<String> target, String words) {
            for (String word : words.split("\\s+")) {
                target.add(word);
            }
        }

        /**
         * Returns the manual tag for a word.
         *
         * @param str the word to tag
         * @return "N" if the word contains a digit or is in the noun list, "A" if it is in the
         *         adjective list, "P" if it is in the preposition list; otherwise a message is
         *         written to {@code System.err} and "N" is returned
         */
        public static String getTag(String str) {
            if (digit.matcher(str).find()) {
                return "N";
            }
            if (nouns.contains(str)) {
                return "N";
            }
            if (adjectives.contains(str)) {
                return "A";
            }
            if (preps.contains(str)) {
                return "P";
            }
            // Unknown to every list: report it and fall back to the noun tag.
            System.err.println("No POS tag for " + str);
            return "N";
        }
    }

    /** Private constructor: every member of this class is static, so it is never instantiated. */
    private MWEPreprocessor() {
    }

    /**
     * Writes every (first key, second key, count) triple of a two-dimensional counter to a
     * UTF-8 encoded, tab-separated file, one triple per line. Counts are truncated to {@code int}.
     *
     * <p>Failures to open the file are reported with a stack trace and otherwise ignored, so
     * this method never throws a checked exception.
     *
     * @param twoDimensionalCounter the counter to dump
     * @param str path of the output file (created or overwritten)
     */
    public static void printCounter(TwoDimensionalCounter<String, String> twoDimensionalCounter, String str) {
        try {
            // PrintWriter(File, charsetName) encodes the text itself. Wrapping a UTF-8 PrintStream in
            // new PrintWriter(OutputStream) would encode with the platform default charset instead.
            PrintWriter out = new PrintWriter(new File(str), "UTF-8");
            try {
                for (String firstKey : twoDimensionalCounter.firstKeySet()) {
                    for (String secondKey : twoDimensionalCounter.getCounter(firstKey).keySet()) {
                        int count = (int) twoDimensionalCounter.getCount(firstKey, secondKey);
                        out.printf("%s\t%s\t%d%n", firstKey, secondKey, Integer.valueOf(count));
                    }
                }
                // PrintWriter swallows IOExceptions; surface them instead of silently truncating the file.
                if (out.checkError()) {
                    System.err.println("Error while writing " + str);
                }
            } finally {
                out.close();
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (UnsupportedEncodingException e2) {
            e2.printStackTrace();
        }
    }

    /**
     * Records one (word, tag) observation in {@code twoDimensionalCounter} for each token of
     * {@code tree.taggedLabeledYield()}, skipping tokens whose tag equals
     * {@code FrenchXMLTreeReader.MISSING_POS}.
     *
     * @param twoDimensionalCounter word-to-tag counts to update
     * @param tree the tree whose tagged yield is counted
     */
    public static void updateTagger(TwoDimensionalCounter<String, String> twoDimensionalCounter, Tree tree) {
        for (CoreLabel token : tree.taggedLabeledYield()) {
            String tag = token.tag();
            if (tag.equals(FrenchXMLTreeReader.MISSING_POS)) {
                // Placeholder tags carry no information for the tagger.
                continue;
            }
            twoDimensionalCounter.incrementCount(token.word(), tag);
        }
    }

    /**
     * Recursively replaces placeholder labels in {@code tree}, in place.
     *
     * <ul>
     *   <li>A preterminal labeled {@code FrenchXMLTreeReader.MISSING_POS} gets the most frequent
     *       tag recorded for its word in {@code twoDimensionalCounter2}, or the
     *       {@code ManualUWModel} tag when the word was never counted.</li>
     *   <li>An internal node labeled {@code FrenchXMLTreeReader.MISSING_PHRASAL} gets the most
     *       frequent label recorded in {@code twoDimensionalCounter} for the space-joined
     *       sequence of its children's labels. If that sequence is unknown, the node is left
     *       unchanged and a message is printed to {@code System.out}.</li>
     * </ul>
     *
     * <p>Children are fixed before their parent, so the child-label sequence is built from
     * already-resolved labels. Each placeholder found increments the matching static counter.
     *
     * @param tree the (sub)tree to repair
     * @param twoDimensionalCounter counts keyed by child-label sequence, then by parent label
     * @param twoDimensionalCounter2 counts keyed by word, then by tag
     */
    public static void traverseAndFix(Tree tree, TwoDimensionalCounter<String, String> twoDimensionalCounter, TwoDimensionalCounter<String, String> twoDimensionalCounter2) {
        if (tree.isPreTerminal()) {
            // Constant-first equals: a node with a null label is simply not a placeholder.
            if (FrenchXMLTreeReader.MISSING_POS.equals(tree.value())) {
                nMissingPOS++;
                String word = tree.firstChild().value();
                String tag = twoDimensionalCounter2.firstKeySet().contains(word)
                        ? Counters.argmax(twoDimensionalCounter2.getCounter(word))
                        : ManualUWModel.getTag(word);
                tree.setValue(tag);
            }
            return;
        }

        for (Tree child : tree.children()) {
            traverseAndFix(child, twoDimensionalCounter, twoDimensionalCounter2);
        }

        if (FrenchXMLTreeReader.MISSING_PHRASAL.equals(tree.value())) {
            nMissingPhrasal++;
            StringBuilder sb = new StringBuilder();
            for (Tree child : tree.children()) {
                sb.append(child.value()).append(" ");
            }
            String childLabels = sb.toString().trim();
            if (twoDimensionalCounter.firstKeySet().contains(childLabels)) {
                tree.setValue(Counters.argmax(twoDimensionalCounter.getCounter(childLabels)));
            } else {
                System.out.println("No phrasal cat for: " + childLabels);
            }
        }
    }

    /**
     * Reads every tree from {@code file} (as UTF-8), repairs placeholder labels with
     * {@code traverseAndFix}, and writes the repaired trees, one per line, to a UTF-8 file
     * named after the input with the suffix {@code .fixed}. Prints the number of trees
     * processed to {@code System.out} on success.
     *
     * <p>I/O failures are reported with a stack trace and otherwise ignored. The reader and
     * the writer are closed even when processing fails part-way.
     *
     * @param file the treebank file to read
     * @param twoDimensionalCounter counts passed through to {@code traverseAndFix} for phrasal labels
     * @param twoDimensionalCounter2 counts passed through to {@code traverseAndFix} for word tags
     */
    private static void resolveDummyTags(File file, TwoDimensionalCounter<String, String> twoDimensionalCounter, TwoDimensionalCounter<String, String> twoDimensionalCounter2) {
        try {
            int nTrees = 0;
            TreeReader treeReader = new FrenchTreeReaderFactory().newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8")));
            try {
                // PrintWriter(File, charsetName) encodes the text itself. Wrapping a UTF-8 PrintStream in
                // new PrintWriter(OutputStream) would encode with the platform default charset instead.
                PrintWriter fixedOut = new PrintWriter(new File(file + ".fixed"), "UTF-8");
                try {
                    for (Tree tree = treeReader.readTree(); tree != null; tree = treeReader.readTree()) {
                        traverseAndFix(tree, twoDimensionalCounter, twoDimensionalCounter2);
                        fixedOut.println(tree.toString());
                        nTrees++;
                    }
                    // PrintWriter swallows IOExceptions; surface them instead of silently truncating the file.
                    if (fixedOut.checkError()) {
                        System.err.println("Error while writing " + file + ".fixed");
                    }
                } finally {
                    fixedOut.close();
                }
            } finally {
                treeReader.close();
            }
            System.out.println("Processed " + nTrees + " trees");
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (UnsupportedEncodingException e2) {
            e2.printStackTrace();
        } catch (IOException e3) {
            e3.printStackTrace();
        }
    }

    /**
     * Updates all statistics for one tree: the word/tag counts (via {@code updateTagger}) and,
     * for every node matched by {@code pMWE} whose label is not
     * {@code FrenchXMLTreeReader.MISSING_PHRASAL}, four co-occurrence counts.
     *
     * @param tree the tree to count
     * @param twoDimensionalCounter word-to-tag counts, updated by {@code updateTagger}
     * @param twoDimensionalCounter2 node label to preterminal-yield string
     * @param twoDimensionalCounter3 preterminal-yield string to node label
     * @param twoDimensionalCounter4 node label to terminal-yield string
     * @param twoDimensionalCounter5 terminal-yield string to node label
     */
    public static void countMWEStatistics(Tree tree, TwoDimensionalCounter<String, String> twoDimensionalCounter, TwoDimensionalCounter<String, String> twoDimensionalCounter2, TwoDimensionalCounter<String, String> twoDimensionalCounter3, TwoDimensionalCounter<String, String> twoDimensionalCounter4, TwoDimensionalCounter<String, String> twoDimensionalCounter5) {
        updateTagger(twoDimensionalCounter, tree);

        for (TregexMatcher mweMatcher = pMWE.matcher(tree); mweMatcher.findNextMatchingNode(); ) {
            Tree mweNode = mweMatcher.getMatch();
            String label = mweNode.value();
            if (label.equals(FrenchXMLTreeReader.MISSING_PHRASAL)) {
                // Nodes still carrying the placeholder label are not counted.
                continue;
            }

            String preterminals = Sentence.listToString(mweNode.preTerminalYield());
            String terminals = Sentence.listToString(mweNode.yield());

            twoDimensionalCounter2.incrementCount(label, preterminals);
            twoDimensionalCounter3.incrementCount(preterminals, label);
            twoDimensionalCounter4.incrementCount(label, terminals);
            twoDimensionalCounter5.incrementCount(terminals, label);
        }
    }

    /**
     * Command-line entry point. Takes exactly one argument, the treebank file, and:
     * <ol>
     *   <li>reads every tree (as UTF-8) and accumulates statistics with {@code countMWEStatistics};</li>
     *   <li>writes the four count tables to {@code label_term.csv}, {@code term_label.csv},
     *       {@code label_pos.csv} and {@code pos_label.csv} in the working directory;</li>
     *   <li>rewrites the treebank with placeholder labels resolved (via {@code resolveDummyTags});</li>
     *   <li>prints summary counts to {@code System.out}.</li>
     * </ol>
     * With the wrong number of arguments it prints a usage line and exits with status -1.
     * I/O failures are reported with a stack trace.
     *
     * @param strArr command-line arguments; {@code strArr[0]} is the treebank file path
     */
    public static void main(String[] strArr) {
        if (strArr.length != 1) {
            System.err.printf("Usage: java %s file%n", MWEPreprocessor.class.getName());
            System.exit(-1);
        }
        File treeFile = new File(strArr[0]);
        // Parameterized instead of raw types so every call below is checked by the compiler.
        TwoDimensionalCounter<String, String> labelTerm = new TwoDimensionalCounter<String, String>();
        TwoDimensionalCounter<String, String> termLabel = new TwoDimensionalCounter<String, String>();
        TwoDimensionalCounter<String, String> labelPreterm = new TwoDimensionalCounter<String, String>();
        TwoDimensionalCounter<String, String> pretermLabel = new TwoDimensionalCounter<String, String>();
        TwoDimensionalCounter<String, String> unigramTagger = new TwoDimensionalCounter<String, String>();
        try {
            TreeReader treeReader = new FrenchTreeReaderFactory().newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
            try {
                for (Tree tree = treeReader.readTree(); tree != null; tree = treeReader.readTree()) {
                    countMWEStatistics(tree, unigramTagger, labelPreterm, pretermLabel, labelTerm, termLabel);
                }
            } finally {
                // Close before resolveDummyTags re-opens the same file, and also when reading fails.
                treeReader.close();
            }

            System.out.println("Generating {MWE Type -> Terminal}");
            printCounter(labelTerm, "label_term.csv");
            System.out.println("Generating {Terminal -> MWE Type}");
            printCounter(termLabel, "term_label.csv");
            System.out.println("Generating {MWE Type -> POS sequence}");
            printCounter(labelPreterm, "label_pos.csv");
            System.out.println("Generating {POS sequence -> MWE Type}");
            printCounter(pretermLabel, "pos_label.csv");

            System.out.println("Resolving DUMMY tags");
            resolveDummyTags(treeFile, pretermLabel, unigramTagger);

            System.out.println("#Unknown Word Types: " + ManualUWModel.nUnknownWordTypes);
            System.out.println("#Missing POS: " + nMissingPOS);
            System.out.println("#Missing Phrasal: " + nMissingPhrasal);
            System.out.println("Done!");
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (UnsupportedEncodingException e2) {
            e2.printStackTrace();
        } catch (IOException e3) {
            e3.printStackTrace();
        }
    }
}
