package edu.stanford.nlp.international.arabic.pipeline;

import edu.stanford.nlp.trees.international.arabic.ATBTreeUtils;
import edu.stanford.nlp.trees.international.french.FrenchXMLTreeReader;
import edu.stanford.nlp.trees.treebank.Mapper;
import edu.stanford.nlp.util.Generics;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/stanford/nlp/international/arabic/pipeline/LDCPosMapper.class */
/**
 * A {@link Mapper} that translates long part-of-speech tags into short tags using a
 * mapping table loaded from a file by {@link #setup(File, String...)}.
 *
 * <p>The mapping file is scanned for a line matching {@link #startOfTagMap}; from there on,
 * every non-comment line whose tail matches {@link #mapping} contributes one
 * {@code (longTag shortTag)} pair until a line matching {@link #endOfTagMap} is seen.
 * Calling {@code setup} several times accumulates the union of the files; on conflicts the
 * first mapping wins and a warning is printed to {@code System.err}.
 *
 * <p>Instances are mutable and perform no synchronization.
 */
public class LDCPosMapper implements Mapper {
    /** Matches the line that opens the tag-map section of a mapping file. */
    protected Pattern startOfTagMap;
    /** Matches the line (a lone closing parenthesis) that closes the tag-map section. */
    protected Pattern endOfTagMap;
    /** Matches one {@code (longTag shortTag)} entry; group 1 is the long tag, group 2 the short tag. */
    protected Pattern mapping;
    /** Number of capture groups a {@link #mapping} match must expose to be accepted. */
    protected int numExpectedTokens;
    /** When true, "DT" is prepended to noun/adjective short tags whose long tag contains "DET". */
    private final boolean addDT;
    private final Pattern determiner;
    private final Pattern nounBaseTag;
    private final Pattern adjBaseTag;
    private final Pattern LDCdeterminer;
    /** Long tag to short tag, filled by {@link #setup(File, String...)}. */
    protected final Map<String, String> tagMap;
    /** Tags that {@link #map(String, String)} returns unchanged without printing a warning. */
    protected final Set<String> tagsToEscape;

    /** Creates a mapper that does not add determiner markers to short tags. */
    public LDCPosMapper() {
        this(false);
    }

    /**
     * Creates a mapper.
     *
     * @param z if true, short tags containing "NN" or "JJ" get a "DT" prefix whenever the
     *          corresponding long tag contains "DET"
     */
    public LDCPosMapper(boolean z) {
        this.startOfTagMap = Pattern.compile("\\(tag-map");
        this.endOfTagMap = Pattern.compile("^\\s*\\)\\s*$");
        this.mapping = Pattern.compile("\\((\\S+)\\s+(\\S+)\\)\\s*$");
        this.numExpectedTokens = 2;
        this.determiner = Pattern.compile("DET");
        this.nounBaseTag = Pattern.compile("NN");
        this.adjBaseTag = Pattern.compile("JJ");
        this.LDCdeterminer = Pattern.compile("DT\\+");
        this.addDT = z;
        this.tagMap = Generics.newHashMap();
        this.tagsToEscape = Generics.newHashSet();
        // NOTE(review): the empty-leaf constant is taken from FrenchXMLTreeReader; this looks
        // like a decompiler artifact of an inlined constant -- confirm against the original source.
        this.tagsToEscape.add(FrenchXMLTreeReader.EMPTY_LEAF);
        this.tagsToEscape.add(ATBTreeUtils.puncTag);
    }

    /**
     * Maps a long tag to its short tag.
     *
     * @param str  the long tag; it is trimmed before lookup and must not be null
     * @param str2 unused by this implementation
     * @return the mapped short tag, or the trimmed input when no mapping exists; a missing
     *         mapping is reported on {@code System.err} unless the tag is in
     *         {@link #tagsToEscape}
     */
    @Override // edu.stanford.nlp.trees.treebank.Mapper
    public String map(String str, String str2) {
        String longTag = str.trim();
        if (this.tagMap.containsKey(longTag)) {
            return this.tagMap.get(longTag);
        }
        if (!this.tagsToEscape.contains(longTag)) {
            System.err.printf("%s: No mapping for %s%n", getClass().getName(), longTag);
        }
        return longTag;
    }

    /**
     * Normalizes the short tag read from a mapping file before it is stored.
     *
     * @param str  the long tag from the mapping entry (may be null)
     * @param str2 the short tag from the mapping entry (may be null)
     * @return {@code null} if {@code str2} is null; the short tag already stored for
     *         {@code str} if there is one; otherwise the normalized short tag
     */
    private String processShortTag(String str, String str2) {
        if (str2 == null) {
            return null;
        }
        String shortTag = str2;
        if (shortTag.startsWith("DT+")) {
            // The file's own determiner marker is dropped; it is re-derived below when addDT is set.
            shortTag = this.LDCdeterminer.matcher(shortTag).replaceAll("");
        }
        // Constant-first equals: the long tag is only null-checked further down, so calling
        // equals on it directly would throw for a null long tag.
        if ("NUMERIC_COMMA".equals(str)) {
            shortTag = ATBTreeUtils.puncTag;
        }
        if (this.addDT && str != null && this.determiner.matcher(str).find()) {
            boolean nounOrAdjective = this.nounBaseTag.matcher(shortTag).find()
                    || this.adjBaseTag.matcher(shortTag).find();
            if (nounOrAdjective) {
                shortTag = "DT" + shortTag.trim();
            }
        }
        if (!this.tagMap.containsKey(str)) {
            return shortTag;
        }
        // An earlier file already mapped this long tag: keep that mapping, warn on disagreement.
        String existing = this.tagMap.get(str);
        if (!existing.equals(shortTag)) {
            System.err.printf("%s: Union of mapping files will cause overlap for %s (current: %s new: %s)%n", getClass().getName(), str, existing, shortTag);
        }
        return existing;
    }

    /**
     * Loads mappings from a file and merges them into {@link #tagMap}.
     *
     * <p>Does nothing when {@code file} is null or does not exist. I/O problems are reported on
     * {@code System.err} and never propagated; mappings read before the failure are kept.
     *
     * @param file   the mapping file
     * @param strArr unused by this implementation
     */
    @Override // edu.stanford.nlp.trees.treebank.Mapper
    public void setup(File file, String... strArr) {
        if (file == null || !file.exists()) {
            return;
        }
        // Number of the last line read successfully; stays -1 if the reader was never opened.
        int lastLineRead = -1;
        // try-with-resources closes the reader on every path, including a failed readLine().
        try (LineNumberReader reader = new LineNumberReader(new FileReader(file))) {
            lastLineRead = reader.getLineNumber();
            boolean insideTagMap = false;
            String rawLine;
            while ((rawLine = reader.readLine()) != null) {
                lastLineRead = reader.getLineNumber();
                String line = rawLine.trim();
                insideTagMap = this.startOfTagMap.matcher(line).matches() || insideTagMap;
                if (!insideTagMap || line.startsWith(";")) {
                    // Still before the tag map, or a comment line inside it.
                    continue;
                }
                Matcher entry = this.mapping.matcher(line);
                if (entry.find()) {
                    if (entry.groupCount() == this.numExpectedTokens) {
                        this.tagMap.put(entry.group(1), processShortTag(entry.group(1), entry.group(2)));
                    } else {
                        System.err.printf("%s: Skipping bad mapping in %s (line %d)%n", getClass().getName(), file.getPath(), Integer.valueOf(reader.getLineNumber()));
                    }
                }
                if (this.endOfTagMap.matcher(line).matches()) {
                    break;
                }
            }
        } catch (FileNotFoundException e) {
            System.err.printf("%s: Could not open mapping file %s%n", getClass().getName(), file.getPath());
        } catch (IOException e2) {
            System.err.printf("%s: Error reading %s (line %d)%n", getClass().getName(), file.getPath(), Integer.valueOf(lastLineRead));
        }
    }

    /**
     * Reports whether this mapper may change the encoding of the given element.
     *
     * @param str  unused by this implementation
     * @param str2 unused by this implementation
     * @return always {@code true}
     */
    @Override // edu.stanford.nlp.trees.treebank.Mapper
    public boolean canChangeEncoding(String str, String str2) {
        return true;
    }

    /**
     * Renders the loaded mappings, one {@code longTag<TAB>shortTag} pair per line.
     *
     * @return the textual dump of {@link #tagMap}; empty when nothing has been loaded
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        for (Map.Entry<String, String> entry : this.tagMap.entrySet()) {
            sb.append(entry.getKey()).append('\t').append(entry.getValue()).append('\n');
        }
        return sb.toString();
    }

    /**
     * Ad-hoc demo: loads a hard-coded mapping file and prints the mapping of a few sample tags.
     *
     * @param strArr ignored
     */
    public static void main(String[] strArr) {
        LDCPosMapper lDCPosMapper = new LDCPosMapper(true);
        lDCPosMapper.setup(new File("/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp"), new String[0]);
        System.out.printf("%s --> %s\n", "DET+NOUN+NSUFF_FEM_SG+CASE_DEF_ACC", lDCPosMapper.map("DET+NOUN+NSUFF_FEM_SG+CASE_DEF_ACC", null));
        System.out.printf("%s --> %s\n", "ADJXXXXX", lDCPosMapper.map("ADJXXXXX", null));
        System.out.printf("%s --> %s\n", "REL_ADV", lDCPosMapper.map("REL_ADV", null));
        System.out.printf("%s --> %s\n", "NUMERIC_COMMA", lDCPosMapper.map("NUMERIC_COMMA", null));
    }
}
