package org.cleartk.corpus.genia.pos;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashSet;
import java.util.Set;
import org.apache.uima.UimaContext;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.SofaCapability;
import org.apache.uima.fit.factory.CollectionReaderFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.pear.util.FileUtil;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import org.cleartk.corpus.genia.pos.util.GeniaParse;
import org.cleartk.corpus.genia.pos.util.GeniaPosParser;
import org.cleartk.corpus.genia.pos.util.GeniaSentence;
import org.cleartk.corpus.genia.pos.util.GeniaTag;
import org.cleartk.corpus.genia.pos.util.Span;
import org.cleartk.token.type.Sentence;
import org.cleartk.token.type.Token;
import org.cleartk.util.ViewUriUtil;
import org.jdom2.JDOMException;

/**
 * Collection reader that walks the parses produced by a {@link GeniaPosParser} over a single
 * GENIA corpus file and fills one CAS per parse.
 *
 * <p>For every parse the reader sets the document text of the initial view, optionally adds
 * {@link Token} annotations (with or without part-of-speech tags) and {@link Sentence}
 * annotations, records a document URI through {@link ViewUriUtil}, and stores the parse's XML in
 * the {@link #GENIA_POS_VIEW} view.
 *
 * <p>If {@link #PARAM_ARTICLE_IDS_LIST_FILE} is configured, only parses whose Medline id appears
 * in that file are emitted.
 */
@SofaCapability(outputSofas = {"UriView", "GeniaPOSView"})
public class GeniaPosGoldReader extends JCasCollectionReader_ImplBase {
    /** Name of the view that receives the XML of each parse. */
    public static final String GENIA_POS_VIEW = "GeniaPOSView";

    public static final String PARAM_GENIA_CORPUS_FILE = "geniaCorpusFile";
    public static final String PARAM_LOAD_SENTENCES = "loadSentences";
    public static final String PARAM_LOAD_TOKENS = "loadTokens";
    public static final String PARAM_LOAD_POS_TAGS = "loadPosTags";
    public static final String PARAM_ARTICLE_IDS_LIST_FILE = "articleIdsListFile";

    /**
     * Relative paths of the article-id list files for the ten test folds (presumably intended as
     * values for {@link #PARAM_ARTICLE_IDS_LIST_FILE} -- confirm against callers).
     */
    public static String[] TEST_FOLDS = {"resources/genia/article_ids/fold-1-test.txt", "resources/genia/article_ids/fold-2-test.txt", "resources/genia/article_ids/fold-3-test.txt", "resources/genia/article_ids/fold-4-test.txt", "resources/genia/article_ids/fold-5-test.txt", "resources/genia/article_ids/fold-6-test.txt", "resources/genia/article_ids/fold-7-test.txt", "resources/genia/article_ids/fold-8-test.txt", "resources/genia/article_ids/fold-9-test.txt", "resources/genia/article_ids/fold-10-test.txt"};

    /**
     * Relative paths of the article-id list files for the ten training folds (presumably intended
     * as values for {@link #PARAM_ARTICLE_IDS_LIST_FILE} -- confirm against callers).
     */
    public static String[] TRAIN_FOLDS = {"resources/genia/article_ids/fold-1-train.txt", "resources/genia/article_ids/fold-2-train.txt", "resources/genia/article_ids/fold-3-train.txt", "resources/genia/article_ids/fold-4-train.txt", "resources/genia/article_ids/fold-5-train.txt", "resources/genia/article_ids/fold-6-train.txt", "resources/genia/article_ids/fold-7-train.txt", "resources/genia/article_ids/fold-8-train.txt", "resources/genia/article_ids/fold-9-train.txt", "resources/genia/article_ids/fold-10-train.txt"};

    /**
     * Total reported by {@link #getProgress()} when no article filter is configured.
     * NOTE(review): presumably the number of articles in the GENIA corpus -- confirm.
     */
    private static final int UNFILTERED_PROGRESS_TOTAL = 2000;

    @ConfigurationParameter(name = PARAM_GENIA_CORPUS_FILE, description = "names the file that is the Genia corpus to be loaded. A good value is probably '.../GENIAcorpus3.02.pos.xml'.  Please see README in this directory for edits that you may need to make to this file manually.", mandatory = true)
    private File geniaCorpusFile;

    @ConfigurationParameter(name = PARAM_ARTICLE_IDS_LIST_FILE, mandatory = false, description = "names the file used to specify the article ids that should be read in")
    File articleIdsListFile;

    @ConfigurationParameter(name = PARAM_LOAD_SENTENCES, mandatory = false, description = "determines whether sentence annotations will be added from the Genia corpus.", defaultValue = {"true"})
    private boolean loadSentences = true;

    @ConfigurationParameter(name = PARAM_LOAD_TOKENS, mandatory = false, description = "determines whether tokens annotations will be added from the Genia corpus. ", defaultValue = {"true"})
    private boolean loadTokens = true;

    @ConfigurationParameter(name = PARAM_LOAD_POS_TAGS, mandatory = false, description = "determines whether the part of speech tags assigned to each token in the genia corpus will be loaded. The default value of 'true' is used if this parameter is unspecified. If 'loadTokens' is 'false', then 'loadPOSTags' will be treated as 'false' regardless of what is given in the descriptor file.", defaultValue = {"true"})
    private boolean loadPosTags = true;

    /** True when {@link #articleIdsListFile} was supplied and parses must be filtered by id. */
    private boolean filterArticles;

    /** Article ids loaded from {@link #articleIdsListFile}; empty when no filter is configured. */
    private Set<String> articleIds;

    private GeniaPosParser parser;

    /** The accepted parse waiting to be consumed by {@link #getNext(JCas)}, or null if none. */
    private GeniaParse parse;

    /** Number of parses accepted so far by {@link #hasNext()}. */
    private int progress = 0;

    /**
     * Loads the optional article-id filter and opens the parser on the configured corpus file.
     *
     * @param uimaContext the UIMA context (not used directly here)
     * @throws ResourceInitializationException if the id list or the corpus file cannot be read
     *         or parsed
     */
    public void initialize(UimaContext uimaContext) throws ResourceInitializationException {
        this.articleIds = new HashSet<String>();
        try {
            this.filterArticles = this.articleIdsListFile != null;
            if (this.filterArticles) {
                for (String articleId : FileUtil.loadListOfStrings(this.articleIdsListFile)) {
                    this.articleIds.add(articleId);
                }
            }
            this.parser = new GeniaPosParser(this.geniaCorpusFile);
            // POS tags are attached to tokens, so they can only be loaded when tokens are.
            this.loadPosTags = this.loadTokens && this.loadPosTags;
        } catch (IOException e) {
            throw new ResourceInitializationException(e);
        } catch (JDOMException e) {
            throw new ResourceInitializationException(e);
        }
    }

    /**
     * Populates the given CAS from the next accepted parse and then clears it so that the
     * following {@link #hasNext()} call advances the parser.
     *
     * @param jCas the CAS to fill
     * @throws IOException declared by the reader contract
     * @throws CollectionException if there is no next parse, or if a view cannot be obtained or
     *         created
     */
    public void getNext(JCas jCas) throws IOException, CollectionException {
        if (!hasNext()) {
            throw new CollectionException("Should not be calling getNext() because hasNext returns false", (Object[]) null);
        }
        try {
            JCas initialView = jCas.getView("_InitialView");
            initialView.setDocumentText(this.parse.getText());

            for (GeniaSentence geniaSentence : this.parse.getSentences()) {
                if (this.loadTokens) {
                    for (GeniaTag posTag : geniaSentence.getPosTags()) {
                        // Only the first span of a tag is used for the token offsets.
                        Span tokenSpan = posTag.getSpans().get(0);
                        Token token = new Token(initialView, tokenSpan.getBegin(), tokenSpan.getEnd());
                        if (this.loadPosTags) {
                            token.setPos(posTag.getLabel());
                        }
                        token.addToIndexes();
                    }
                }
                if (this.loadSentences) {
                    Span sentenceSpan = geniaSentence.getSpan();
                    new Sentence(initialView, sentenceSpan.getBegin(), sentenceSpan.getEnd()).addToIndexes();
                }
            }

            // The document URI is the corpus file URI with the Medline id as its fragment.
            URI corpusUri = this.geniaCorpusFile.toURI();
            try {
                URI documentUri = new URI(corpusUri.getScheme(), corpusUri.getHost(), corpusUri.getPath(), this.parse.getMedline());
                ViewUriUtil.setURI(jCas, documentUri);
            } catch (URISyntaxException e) {
                throw new RuntimeException(e);
            }

            jCas.createView(GENIA_POS_VIEW).setDocumentText(this.parse.getXml());
            this.parse = null;
        } catch (CASException e) {
            throw new CollectionException(e);
        }
    }

    /** Does nothing; this reader holds no resources that it closes itself. */
    public void close() throws IOException {
    }

    /**
     * Reports how many parses have been accepted so far. The total is the size of the article-id
     * filter when one is configured, and a fixed constant otherwise.
     *
     * @return a single-element progress array measured in "entities"
     */
    public Progress[] getProgress() {
        int total = this.filterArticles ? this.articleIds.size() : UNFILTERED_PROGRESS_TOTAL;
        return new Progress[]{new ProgressImpl(this.progress, total, "entities")};
    }

    /**
     * Advances the parser until a parse passes the article-id filter (or takes the very next
     * parse when no filter is configured) and buffers it for {@link #getNext(JCas)}.
     *
     * <p>Rejected parses are never stored in the buffer, so once the parser is exhausted this
     * method keeps returning {@code false} on repeated calls.
     *
     * @return true if a parse is buffered and ready to be read
     */
    public boolean hasNext() throws IOException, CollectionException {
        if (this.parse != null) {
            return true;
        }
        while (this.parser.hasNext()) {
            // Keep the candidate local: storing a rejected parse in the field would make a
            // later call report it as available.
            GeniaParse candidate = this.parser.next();
            if (!this.filterArticles || this.articleIds.contains(candidate.getMedline())) {
                this.parse = candidate;
                this.progress++;
                return true;
            }
        }
        return false;
    }

    /**
     * Creates a reader for the given corpus file with all other parameters left at their
     * defaults. Despite its name, this returns an instantiated reader, not a description.
     *
     * @param str value for {@link #PARAM_GENIA_CORPUS_FILE}
     * @return the created collection reader
     * @throws ResourceInitializationException if the reader cannot be created
     */
    public static CollectionReader getDescription(String str) throws ResourceInitializationException {
        return CollectionReaderFactory.createReader(GeniaPosGoldReader.class, new Object[]{PARAM_GENIA_CORPUS_FILE, str});
    }
}
