/*
 * Decompiled with CFR 0.152.
 */
package org.apache.ctakes.core.ae;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Scanner;
import org.apache.ctakes.core.pipeline.PipeBitInfo;
import org.apache.ctakes.core.resource.FileLocator;
import org.apache.ctakes.typesystem.type.textspan.Segment;
import org.apache.ctakes.typesystem.type.textspan.Sentence;
import org.apache.ctakes.utils.struct.CounterMap;
import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CASRuntimeException;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.cleartk.ml.CleartkAnnotator;
import org.cleartk.ml.DataWriter;
import org.cleartk.ml.Feature;
import org.cleartk.ml.Instance;
import org.cleartk.ml.feature.function.CharacterCategoryPatternFunction;
import org.cleartk.util.ViewUriUtil;

/**
 * Character-level sentence detector built on a ClearTK annotator with
 * {@code String} outcomes.
 *
 * <p>For every character of every {@link Segment} in the CAS a feature vector
 * is built and associated with one of three outcomes: {@code "B"} (first
 * character of a sentence), {@code "I"} (inside a sentence) or {@code "O"}
 * (outside any sentence). In training mode the outcome is derived from the
 * {@link Sentence} annotations already in the CAS and written to the data
 * writer; otherwise the outcome comes from the classifier and is used to
 * create new {@link Sentence} annotations.
 */
@PipeBitInfo(name="Prose Sentence Detector", description="Sentence detector that uses B I O for determination.  Useful for documents in which newlines may not indicate sentence boundaries.", role=PipeBitInfo.Role.ANNOTATOR, dependencies={PipeBitInfo.TypeProduct.SECTION}, products={PipeBitInfo.TypeProduct.SENTENCE})
public class SentenceDetectorAnnotatorBIO
extends CleartkAnnotator<String> {
    /** Logger shared by all methods of this annotator instance. */
    private final Logger logger = Logger.getLogger(SentenceDetectorAnnotatorBIO.class);
    /** Half-width, in characters, of the context window examined around each character. */
    private static final int WINDOW_SIZE = 3;
    /** Name of the configuration parameter that selects the feature set. */
    public static final String PARAM_FEAT_CONFIG = "FeatureConfiguration";
    /** Selected feature set; optional parameter, {@link FEAT_CONFIG#CHAR} when not configured. */
    @ConfigurationParameter(name="FeatureConfiguration", mandatory=false)
    private FEAT_CONFIG featConfig = FEAT_CONFIG.CHAR;
    /** Name of the configuration parameter that points at the token count resource. */
    public static final String PARAM_TOKEN_FILE = "TokenFilename";
    /** Location of the token count resource; {@code initialize} reads it as lines of {@code token : count}. */
    @ConfigurationParameter(name="TokenFilename", mandatory=false)
    private String tokenCountFile = "org/apache/ctakes/core/models/sentdetect/tokenCounts.txt";
    /** Token frequencies loaded by {@code initialize} and consulted by {@code getTokenFeatures}. */
    CounterMap<String> tokenCounts = new CounterMap<String>();
    /** Smoothed histogram of line lengths for the current document; rebuilt by {@code buildDocEndlineModel}. */
    private HashMap<Integer, Double> endCounts = null;
    /** Largest value stored in {@link #endCounts}, or -1 before a document model has been built. */
    private double maxLineStrength = -1.0;
    /** Longest line (trailing whitespace removed) seen in the current document, or -1 if none. */
    private int maxLineLength = -1;
    /** Feature function applied to the token identity features when a shape configuration is selected. */
    static CharacterCategoryPatternFunction<Annotation> shapeFun = new CharacterCategoryPatternFunction<Annotation>(CharacterCategoryPatternFunction.PatternType.REPEATS_AS_KLEENE_PLUS);

    /**
     * Initializes the underlying ClearTK annotator and loads the token
     * frequency table from {@code tokenCountFile}.
     *
     * <p>Each line of the resource is trimmed and split on {@code " : "}; lines
     * that do not split into exactly two parts are skipped, the rest are stored
     * in {@code tokenCounts} as token / integer count.
     *
     * @param context the UIMA context passed on to the superclass
     * @throws ResourceInitializationException if the superclass fails to
     *         initialize, the token count resource cannot be found, or a count
     *         is not a valid integer
     */
    public void initialize(UimaContext context) throws ResourceInitializationException {
        this.logger.info("Initializing ...");
        super.initialize(context);
        // try-with-resources closes the scanner (and the stream beneath it) even
        // when a line fails to parse half-way through the file.
        try (Scanner scanner = new Scanner(FileLocator.getAsStream(this.tokenCountFile))) {
            while (scanner.hasNextLine()) {
                String[] pair = scanner.nextLine().trim().split(" : ");
                if (pair.length != 2) {
                    continue;
                }
                this.tokenCounts.put(pair[0], Integer.parseInt(pair[1]));
            }
        }
        catch (FileNotFoundException e) {
            throw new ResourceInitializationException(e);
        }
        catch (NumberFormatException e) {
            // A corrupt count is a broken resource: report it through the declared
            // initialization exception instead of an unchecked parse error.
            throw new ResourceInitializationException(e);
        }
    }

    /**
     * Labels every character of every {@link Segment} as {@code "B"},
     * {@code "I"} or {@code "O"} and either writes training instances or creates
     * {@link Sentence} annotations.
     *
     * <p>Training mode: the label comes from the {@link Sentence} annotations
     * covered by the segment. A character before the current sentence's begin
     * offset is {@code "O"}; otherwise it is {@code "B"} if the previous label was
     * {@code "O"} and {@code "I"} if not. The sentence pointer only advances once
     * the character offset is strictly greater than the sentence end, and stops
     * at the last sentence, so characters following the final sentence of a
     * segment keep the label {@code "I"}.
     *
     * <p>Prediction mode: a letter or digit that follows a non-{@code "O"} label
     * is forced to {@code "I"} without consulting the classifier. Otherwise the
     * classifier decides, with {@code "I"} after {@code "O"} promoted to
     * {@code "B"}. A sentence is created when {@code "O"} follows {@code "B"} or
     * {@code "I"}, and once more at the end of the segment if a sentence is still
     * open.
     *
     * @param jcas the CAS to read segments from and add sentences to
     * @throws AnalysisEngineProcessException if writing a training instance or
     *         classifying fails
     */
    public void process(JCas jcas) throws AnalysisEngineProcessException {
        this.logger.info("Processing ...");
        try {
            String uri = ViewUriUtil.getURI(jcas).toString();
            this.logger.info(String.format("Processing file with uri %s", uri));
        }
        catch (CASRuntimeException e) {
            this.logger.debug("No uri found, probably not a big deal unless this is an evaluation.");
        }

        // The feature configuration does not change while a document is processed,
        // so decide once which feature groups are active.
        final boolean usePositionFeats = this.featConfig == FEAT_CONFIG.LINE_POS
                || this.featConfig == FEAT_CONFIG.CHAR_POS
                || this.featConfig == FEAT_CONFIG.CHAR_SHAPE_POS;
        final boolean useCharWindowFeats = this.featConfig == FEAT_CONFIG.CHAR
                || this.featConfig == FEAT_CONFIG.CHAR_POS
                || this.featConfig == FEAT_CONFIG.CHAR_SHAPE
                || this.featConfig == FEAT_CONFIG.CHAR_SHAPE_POS;

        if (usePositionFeats) {
            this.buildDocEndlineModel(jcas);
        }

        for (Segment seg : JCasUtil.select(jcas, Segment.class)) {
            List<Sentence> sents = JCasUtil.selectCovered(jcas, Sentence.class, seg);
            int sentInd = 0;
            Sentence nextSent = sents.size() > 0 ? sents.get(sentInd++) : null;
            // CAS offset at which the currently open predicted sentence began.
            int startInd = 0;
            String prevOutcome = "O";
            String segText = seg.getCoveredText();
            int segBegin = seg.getBegin();

            for (int ind = 0; ind < segText.length(); ++ind) {
                char curChar = segText.charAt(ind);

                List<Feature> feats = new ArrayList<Feature>();
                feats.add(new Feature("PrevOutcome", prevOutcome));
                feats.addAll(SentenceDetectorAnnotatorBIO.getCharFeatures(curChar, "Character"));
                if (useCharWindowFeats) {
                    for (int window = -WINDOW_SIZE; window <= WINDOW_SIZE; ++window) {
                        int contextInd = ind + window;
                        if (contextInd < 0 || contextInd >= segText.length()) {
                            continue;
                        }
                        feats.addAll(SentenceDetectorAnnotatorBIO.getCharFeatures(segText.charAt(contextInd), "CharOffset_" + window));
                    }
                }
                String nextToken = SentenceDetectorAnnotatorBIO.getNextToken(segText, ind);
                String prevToken = SentenceDetectorAnnotatorBIO.getPrevToken(segText, ind);
                feats.addAll(this.getTokenFeatures(prevToken, nextToken, "Token"));
                if (usePositionFeats) {
                    feats.addAll(this.getPositionFeatures(curChar, ind, segText, nextToken));
                }

                int casInd = segBegin + ind;
                String outcome;
                if (this.isTraining()) {
                    // Move on to the next gold sentence once the offset has passed the current one.
                    while (nextSent != null && nextSent.getEnd() < casInd && sentInd < sents.size()) {
                        nextSent = sents.get(sentInd++);
                    }
                    if (nextSent == null || casInd < nextSent.getBegin()) {
                        outcome = "O";
                    } else if (prevOutcome.equals("O")) {
                        outcome = "B";
                    } else {
                        outcome = "I";
                    }
                    this.dataWriter.write(new Instance<String>(outcome, feats));
                } else if (!prevOutcome.equals("O") && Character.isLetterOrDigit(curChar)) {
                    // A sentence cannot end in the middle of an alphanumeric run.
                    outcome = "I";
                } else {
                    outcome = this.classifier.classify(feats);
                    if (outcome.equals("I") && prevOutcome.equals("O")) {
                        // "Inside" directly after "outside" is really a sentence start.
                        outcome = "B";
                    }
                    if (outcome.equals("B")) {
                        // NOTE(review): if "B" is predicted while a sentence is already open,
                        // startInd is overwritten and no Sentence is created for the span
                        // accumulated so far - confirm this is intended.
                        startInd = casInd;
                    } else if (outcome.equals("O") && (prevOutcome.equals("I") || prevOutcome.equals("B"))) {
                        // A sentence just ended: back up over trailing whitespace before creating it.
                        int endInd = casInd;
                        while (endInd > startInd && Character.isWhitespace(segText.charAt(endInd - segBegin - 1))) {
                            --endInd;
                        }
                        if (endInd > startInd) {
                            SentenceDetectorAnnotatorBIO.makeSentence(jcas, startInd, endInd);
                        }
                    }
                }
                prevOutcome = outcome;
            }

            // Flush a sentence that is still open when the segment ends.
            if (!this.isTraining() && !prevOutcome.equals("O")) {
                SentenceDetectorAnnotatorBIO.makeSentence(jcas, startInd, seg.getEnd());
            }
        }
    }

    /**
     * Rebuilds the per-document line-length model used by the position features.
     *
     * <p>The text of every {@link Segment} is split on runs of newlines. Lines
     * containing {@code "[**"} or {@code "**]"} are ignored. For the remaining
     * lines the raw length is counted, and the longest length after removing
     * trailing whitespace is kept in {@code maxLineLength}. Each counted length
     * then spreads its count over neighbouring lengths with a triangular weight
     * five positions wide on each side (never below zero); the result is stored
     * in {@code endCounts} and its largest value in {@code maxLineStrength}.
     *
     * @param jcas the CAS whose segments are scanned
     */
    private void buildDocEndlineModel(JCas jcas) {
        final int smoothingWidth = 5;
        HashMap<Integer, Double> lengthHistogram = new HashMap<Integer, Double>();
        this.endCounts = new HashMap<Integer, Double>();
        this.maxLineStrength = -1.0;
        this.maxLineLength = -1;

        // Pass 1: histogram of raw line lengths plus the longest trimmed line.
        for (Segment segment : JCasUtil.select(jcas, Segment.class)) {
            for (String line : segment.getCoveredText().split("\n+")) {
                if (line.contains("[**") || line.contains("**]")) {
                    continue;
                }
                Integer rawLength = line.length();
                Double seenSoFar = lengthHistogram.get(rawLength);
                lengthHistogram.put(rawLength, (seenSoFar == null ? 0.0 : seenSoFar) + 1.0);
                int trimmedLength = line.replaceAll("\\s+$", "").length();
                this.maxLineLength = Math.max(this.maxLineLength, trimmedLength);
            }
        }

        // Pass 2: smear every count over the surrounding lengths.
        for (Integer lengthKey : lengthHistogram.keySet()) {
            int len = lengthKey;
            double count = lengthHistogram.get(lengthKey);
            int firstPos = Math.max(0, len - smoothingWidth + 1);
            int limitPos = len + smoothingWidth;
            for (int pos = firstPos; pos < limitPos; ++pos) {
                double partialMass = (double)(smoothingWidth - Math.abs(pos - len)) * count / (double)smoothingWidth;
                Double accumulated = this.endCounts.get(pos);
                this.endCounts.put(pos, (accumulated == null ? 0.0 : accumulated) + partialMass);
            }
        }

        // Pass 3: remember the strongest smoothed value.
        for (double strength : this.endCounts.values()) {
            if (strength > this.maxLineStrength) {
                this.maxLineStrength = strength;
            }
        }
    }

    /**
     * Adds a {@link Sentence} covering {@code [begin, end)} to the CAS indexes
     * after shrinking the span so it neither starts nor ends with whitespace.
     * Nothing is added when the shrunken span is empty.
     *
     * @param jcas  the CAS supplying the document text and receiving the sentence
     * @param begin proposed start offset in the document text
     * @param end   proposed end offset (exclusive) in the document text
     */
    public static void makeSentence(JCas jcas, int begin, int end) {
        final String text = jcas.getDocumentText();

        int first = begin;
        for (; first < text.length() && Character.isWhitespace(text.charAt(first)); first++) {
            // skip leading whitespace
        }

        int limit = end;
        for (; limit > 0 && Character.isWhitespace(text.charAt(limit - 1)); limit--) {
            // skip trailing whitespace
        }

        if (first >= limit) {
            return;
        }
        new Sentence(jcas, first, limit).addToIndexes();
    }

    /**
     * Returns the whitespace-delimited token at or after {@code ind}.
     *
     * <p>If {@code ind} is on whitespace the token after that whitespace is
     * returned; if {@code ind} is inside a token, that whole token is returned.
     * The result is empty when only whitespace remains.
     *
     * @param segText text to search
     * @param ind     index into {@code segText} to start from
     * @return the token, possibly empty
     */
    private static String getNextToken(String segText, int ind) {
        final int length = segText.length();

        // Step forward over whitespace to reach a token (or the end of the text).
        int tokenStart = ind;
        while (tokenStart < length && Character.isWhitespace(segText.charAt(tokenStart))) {
            tokenStart++;
        }
        // Step back to the first character of that token.
        while (tokenStart > 0 && !Character.isWhitespace(segText.charAt(tokenStart - 1))) {
            tokenStart--;
        }
        // Step forward to one past its last character.
        int tokenEnd = tokenStart;
        while (tokenEnd < length && !Character.isWhitespace(segText.charAt(tokenEnd))) {
            tokenEnd++;
        }
        return segText.substring(tokenStart, tokenEnd);
    }

    /**
     * Returns the whitespace-delimited token that precedes the token (or
     * whitespace) at {@code ind}.
     *
     * <p>When no earlier token exists the scan stops at index 0 and the single
     * character at index 0 is returned, whatever it is.
     *
     * @param segText text to search; must contain index {@code ind}
     * @param ind     index into {@code segText} to start from
     * @return the preceding token, or the first character of {@code segText}
     *         when there is none
     */
    private static String getPrevToken(String segText, int ind) {
        // Walk left out of the current token until whitespace (or index 0).
        int last = ind;
        while (last > 0 && !Character.isWhitespace(segText.charAt(last))) {
            last--;
        }
        // Walk left over that whitespace to the last character of the previous token.
        while (last > 0 && Character.isWhitespace(segText.charAt(last))) {
            last--;
        }
        // Walk left to the first character of the previous token.
        int first = last;
        while (first > 0
                && !(Character.isWhitespace(segText.charAt(first))
                        || Character.isWhitespace(segText.charAt(first - 1)))) {
            first--;
        }
        return segText.substring(first, last + 1);
    }

    /**
     * Builds the features describing the tokens on either side of the current
     * character: their identities and lengths, whether the next token starts with
     * an upper-case letter, optional shape features, log-scaled frequency buckets
     * taken from {@code tokenCounts}, and two conjunction features.
     *
     * <p>The next token is lower-cased with {@link Locale#ROOT} before the
     * frequency lookup so the key does not depend on the JVM's default locale.
     *
     * @param prevToken token to the left of the current position
     * @param nextToken token at or to the right of the current position
     * @param prefix    prefix for most feature names
     * @return the token features in a fixed order
     */
    private Collection<? extends Feature> getTokenFeatures(String prevToken, String nextToken, String prefix) {
        List<Feature> feats = new ArrayList<Feature>();
        boolean nextCapitalized = nextToken.length() > 0 && Character.isUpperCase(nextToken.charAt(0));

        Feature prevTokenFeat = new Feature(prefix + "PrevIdentity", prevToken);
        feats.add(prevTokenFeat);
        Feature nextTokenFeat = new Feature(prefix + "NextIdentity", nextToken);
        feats.add(nextTokenFeat);

        // GILLICK is the only configuration that leaves out the next-token length.
        if (this.featConfig != FEAT_CONFIG.GILLICK) {
            feats.add(new Feature(prefix + "NextLength=" + nextToken.length(), true));
        }
        feats.add(new Feature(prefix + "PrevLength=" + prevToken.length(), true));
        feats.add(new Feature(prefix + "cap", nextCapitalized ? 1 : 0));

        if (this.featConfig == FEAT_CONFIG.CHAR_SHAPE_POS || this.featConfig == FEAT_CONFIG.CHAR_SHAPE || this.featConfig == FEAT_CONFIG.SHAPE) {
            feats.addAll(shapeFun.apply(prevTokenFeat));
            feats.addAll(shapeFun.apply(nextTokenFeat));
        }

        // Frequency of the lower-cased next token.
        int rightLower = logCountBucket(this.tokenCounts.get(nextToken.toLowerCase(Locale.ROOT)).intValue());
        feats.add(new Feature(prefix + "_RightLower_" + rightLower, true));

        // Frequency of the previous token with one trailing period removed.
        String prevDotless = prevToken.endsWith(".") ? prevToken.substring(0, prevToken.length() - 1) : prevToken;
        int leftDotless = logCountBucket(this.tokenCounts.get(prevDotless).intValue());
        feats.add(new Feature(prefix + "_LeftDotless_" + leftDotless, true));

        // Single-argument constructor: the concatenated string is the feature value.
        feats.add(new Feature((Object)("TokenContextCat_" + prevToken + "_" + nextToken)));
        feats.add(new Feature("LeftWordRightCap", prevToken + "_" + nextCapitalized));
        return feats;
    }

    /**
     * Maps a token count to its rounded natural logarithm. Counts of zero or less
     * map to bucket 0, which is what {@code (int) Math.round(Math.log(count))}
     * evaluates to for those inputs.
     */
    private static int logCountBucket(int count) {
        if (count <= 0) {
            return 0;
        }
        return (int)Math.round(Math.log(count));
    }

    /**
     * Describes a single character: its identity (a line feed is reported as
     * {@code "<LF>"}), whether it is upper case, lower case, a digit or
     * whitespace, and its {@link Character#getType(char)} category.
     *
     * @param ch     the character to describe
     * @param prefix prefix for every feature name
     * @return six features in a fixed order
     */
    public static List<Feature> getCharFeatures(char ch, String prefix) {
        Object identity;
        if (ch == '\n') {
            identity = "<LF>";
        } else {
            identity = Character.valueOf(ch);
        }

        List<Feature> charFeats = new ArrayList<Feature>();
        charFeats.add(new Feature(prefix + "_Id", identity));
        charFeats.add(new Feature(prefix + "_Upper", Boolean.valueOf(Character.isUpperCase(ch))));
        charFeats.add(new Feature(prefix + "_Lower", Boolean.valueOf(Character.isLowerCase(ch))));
        charFeats.add(new Feature(prefix + "_Digit", Boolean.valueOf(Character.isDigit(ch))));
        charFeats.add(new Feature(prefix + "_Space", Boolean.valueOf(Character.isWhitespace(ch))));
        // The category number is part of the feature name; the value is always true.
        charFeats.add(new Feature(prefix + "_Type" + Character.getType(ch), Boolean.TRUE));
        return charFeats;
    }

    /**
     * Builds line-position features for a newline character, using the line
     * length model in {@code endCounts}, {@code maxLineLength} and
     * {@code maxLineStrength}.
     *
     * <p>Features are produced only when {@code curChar} is a line feed and
     * {@code ind > 0}; otherwise the returned list is empty. {@code endCounts}
     * must already be populated - within this class {@code process} builds it
     * before calling this method.
     *
     * @param curChar   character at {@code ind}
     * @param ind       index of {@code curChar} in {@code segText}
     * @param segText   text of the segment being processed
     * @param nextToken token that follows the newline
     * @return the position features, possibly empty
     */
    public List<Feature> getPositionFeatures(char curChar, int ind, String segText, String nextToken) {
        List<Feature> feats = new ArrayList<Feature>();
        if (curChar != '\n' || ind <= 0) {
            return feats;
        }

        // Length of the line that this newline terminates.
        int prevNewlineInd = segText.lastIndexOf('\n', ind - 1);
        int thisLineLength = ind - prevNewlineInd - 1;
        // Length the line would have with a separating space and the next token appended.
        int wrappedLength = thisLineLength + 1 + nextToken.length();

        if (thisLineLength <= this.maxLineLength && wrappedLength > this.maxLineLength) {
            feats.add(new Feature("NextWordWrapsLine", true));
        }

        Double endValue = this.endCounts.get(thisLineLength + nextToken.length());
        double endStrength = endValue != null ? endValue : 0.0;
        // Fires if any length between here and the hypothetical wrapped end is a
        // stronger line-end position than the end itself.
        for (int len = thisLineLength; len < wrappedLength; ++len) {
            Double strength = this.endCounts.get(len);
            if (strength != null && strength > endStrength) {
                feats.add(new Feature("LinePosNextWrapsLocalMax", true));
                break;
            }
        }

        Double lineStrength = this.endCounts.get(thisLineLength);
        if (lineStrength != null) {
            feats.add(new Feature("LinePosStrength", lineStrength / this.maxLineStrength));
        }
        return feats;
    }

    /**
     * Creates a description of this annotator in training mode.
     *
     * <p>The feature configuration is fixed to {@link FEAT_CONFIG#CHAR}.
     *
     * @param outputDirectory value for the {@code outputDirectory} parameter
     * @param class1          value for the {@code dataWriterClassName} parameter
     * @return the engine description
     * @throws ResourceInitializationException if the description cannot be created
     */
    public static AnalysisEngineDescription getDataWriter(File outputDirectory, Class<? extends DataWriter<?>> class1) throws ResourceInitializationException {
        Object[] configuration = {
            "isTraining", true,
            "outputDirectory", outputDirectory,
            "dataWriterClassName", class1,
            PARAM_FEAT_CONFIG, FEAT_CONFIG.CHAR
        };
        return AnalysisEngineFactory.createEngineDescription(SentenceDetectorAnnotatorBIO.class, configuration);
    }

    /**
     * Creates a description of this annotator in prediction mode.
     *
     * <p>The feature configuration is fixed to {@link FEAT_CONFIG#CHAR}.
     *
     * @param modelPath value for the {@code classifierJarPath} parameter
     * @return the engine description
     * @throws ResourceInitializationException if the description cannot be created
     */
    public static AnalysisEngineDescription getDescription(String modelPath) throws ResourceInitializationException {
        Object[] configuration = {
            "isTraining", false,
            "classifierJarPath", modelPath,
            PARAM_FEAT_CONFIG, FEAT_CONFIG.CHAR
        };
        return AnalysisEngineFactory.createEngineDescription(SentenceDetectorAnnotatorBIO.class, configuration);
    }

    /**
     * Creates a prediction-mode description that uses the bundled model jar.
     *
     * @return the engine description
     * @throws ResourceInitializationException if the description cannot be created
     */
    public static AnalysisEngineDescription getDescription() throws ResourceInitializationException {
        final String bundledModel = "/org/apache/ctakes/core/models/sentdetect/model.jar";
        return getDescription(bundledModel);
    }

    /**
     * Selects which optional feature groups the annotator extracts. Three groups
     * are switched by this value: the character context window, token shape
     * features, and line-position features. In addition, {@link #GILLICK} is the
     * only value for which the next-token length feature is left out.
     */
    public static enum FEAT_CONFIG {
        // No optional group; also omits the next-token length feature.
        GILLICK,
        // Character context window only.
        CHAR,
        // Token shape features only.
        SHAPE,
        // Line-position features only.
        LINE_POS,
        // Character context window and token shape features.
        CHAR_SHAPE,
        // Character context window and line-position features.
        CHAR_POS,
        // All three optional groups.
        CHAR_SHAPE_POS;

    }
}

