/*
 * Decompiled with CFR 0.152.
 */
package org.apache.mahout.utils.vectors.lucene;

import com.google.common.io.Closeables;
import com.google.common.io.Files;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.commons.cli2.option.DefaultOption;
import org.apache.commons.io.Charsets;
import org.apache.hadoop.fs.Path;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.OpenBitSet;
import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.stats.LogLikelihood;
import org.apache.mahout.utils.clustering.ClusterDumper;
import org.apache.mahout.utils.vectors.TermEntry;
import org.apache.mahout.utils.vectors.lucene.TermInfoClusterInOut;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Command-line tool that prints, for every sufficiently large cluster, the terms of a Lucene
 * index field that best distinguish the cluster's documents from the rest of the index.
 *
 * <p>Terms are scored with a log-likelihood ratio computed from the number of live documents
 * containing the term inside and outside the cluster, and the best {@code maxLabels} terms are
 * written either to a file ({@link #setOutput(String)}) or to standard output.
 */
public class ClusterLabels {
    private static final Logger log = LoggerFactory.getLogger(ClusterLabels.class);

    /** Default minimum number of points a cluster needs before labels are produced for it. */
    public static final int DEFAULT_MIN_IDS = 50;
    /** Default maximum number of labels reported per cluster. */
    public static final int DEFAULT_MAX_LABELS = 25;

    private final String indexDir;
    private final String contentField;
    private String idField;
    private final Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints;
    private String output;
    private final int minNumIds;
    private final int maxLabels;

    /**
     * Creates a labeller and loads the cluster-to-points mapping through a {@link ClusterDumper}.
     *
     * @param seqFileDir   directory handed to the {@code ClusterDumper} as the cluster sequence files
     * @param pointsDir    directory handed to the {@code ClusterDumper} as the clustered points
     * @param indexDir     file-system path of the Lucene index
     * @param contentField index field whose terms are candidate labels
     * @param minNumIds    clusters with fewer points than this are skipped
     * @param maxLabels    maximum number of labels returned per cluster
     */
    public ClusterLabels(Path seqFileDir, Path pointsDir, String indexDir, String contentField, int minNumIds, int maxLabels) {
        this.indexDir = indexDir;
        this.contentField = contentField;
        this.minNumIds = minNumIds;
        this.maxLabels = maxLabels;
        ClusterDumper clusterDumper = new ClusterDumper(seqFileDir, pointsDir);
        this.clusterIdToPoints = clusterDumper.getClusterIdToPoints();
    }

    /**
     * Computes the labels of every cluster and writes them as a tab separated report.
     *
     * <p>The report goes to the configured output file, or to {@code System.out} when no output
     * file has been set. Standard output is flushed but deliberately not closed, so the caller
     * can keep using it afterwards.
     *
     * @throws IOException if the index cannot be read or the report cannot be written
     */
    public void getLabels() throws IOException {
        if (this.output == null) {
            Writer console = new OutputStreamWriter(System.out, Charsets.UTF_8);
            try {
                writeLabels(console);
            } finally {
                // Push out whatever was buffered, but leave System.out open.
                console.flush();
            }
        } else {
            try (Writer fileWriter = Files.newWriter(new File(this.output), Charsets.UTF_8)) {
                writeLabels(fileWriter);
            }
        }
    }

    /** Writes the report section of every cluster that is large enough to be labelled. */
    private void writeLabels(Writer writer) throws IOException {
        for (Map.Entry<Integer, List<WeightedPropertyVectorWritable>> entry : this.clusterIdToPoints.entrySet()) {
            Integer clusterId = entry.getKey();
            List<WeightedPropertyVectorWritable> points = entry.getValue();
            List<TermInfoClusterInOut> termInfos = getClusterLabels(clusterId, points);
            if (termInfos == null) {
                // Cluster was below the minimum size; nothing to report.
                continue;
            }
            writer.write('\n');
            writer.write("Top labels for Cluster ");
            writer.write(String.valueOf(clusterId));
            writer.write(" containing ");
            writer.write(String.valueOf(points.size()));
            writer.write(" vectors");
            writer.write('\n');
            writer.write("Term \t\t LLR \t\t In-ClusterDF \t\t Out-ClusterDF ");
            writer.write('\n');
            for (TermInfoClusterInOut termInfo : termInfos) {
                writer.write(termInfo.getTerm());
                writer.write("\t\t");
                writer.write(String.valueOf(termInfo.getLogLikelihoodRatio()));
                writer.write("\t\t");
                writer.write(String.valueOf(termInfo.getInClusterDF()));
                writer.write("\t\t");
                writer.write(String.valueOf(termInfo.getOutClusterDF()));
                writer.write('\n');
            }
        }
    }

    /**
     * Returns the best labels for one cluster.
     *
     * @param integer the cluster id (used for logging only)
     * @param wpvws   the points assigned to the cluster; only {@link NamedVector}s contribute
     *                document ids
     * @return at most {@code maxLabels} term infos in the order produced by sorting them, or
     *         {@code null} when the cluster has fewer than {@code minNumIds} points
     * @throws IOException if the index cannot be opened or read, or if the content field has
     *                     no indexed terms
     */
    protected List<TermInfoClusterInOut> getClusterLabels(Integer integer, Collection<WeightedPropertyVectorWritable> wpvws) throws IOException {
        if (wpvws.size() < this.minNumIds) {
            log.info("Skipping small cluster {} with size: {}", integer, wpvws.size());
            return null;
        }
        log.info("Processing Cluster {} with {} documents", integer, wpvws.size());

        // Both the directory and the reader are released even when label computation fails.
        Directory dir = FSDirectory.open(new File(this.indexDir));
        try {
            DirectoryReader reader = DirectoryReader.open(dir);
            try {
                return computeLabels(reader, wpvws);
            } finally {
                Closeables.close(reader, true);
            }
        } finally {
            Closeables.close(dir, true);
        }
    }

    /** Scores every term of the content field against the cluster and keeps the top entries. */
    private List<TermInfoClusterInOut> computeLabels(IndexReader reader, Collection<WeightedPropertyVectorWritable> wpvws) throws IOException {
        int numDocs = reader.numDocs();
        log.info("# of documents in the index {}", numDocs);

        Collection<String> idSet = new HashSet<String>();
        for (WeightedPropertyVectorWritable wpvw : wpvws) {
            Vector vector = wpvw.getVector();
            if (vector instanceof NamedVector) {
                idSet.add(((NamedVector) vector).getName());
            }
        }
        OpenBitSet clusterDocBitset = getClusterDocBitset(reader, idSet, this.idField);

        log.info("Populating term infos from the index");
        Terms terms = MultiFields.getTerms(reader, this.contentField);
        if (terms == null) {
            throw new IOException("Field '" + this.contentField + "' has no indexed terms in index " + this.indexDir);
        }
        TermsEnum termsEnum = terms.iterator(null);
        // null means the index has no deletions, i.e. every document is live.
        Bits liveDocs = MultiFields.getLiveDocs(reader);
        int clusterSize = wpvws.size();

        List<TermInfoClusterInOut> clusteredTermInfo = new LinkedList<TermInfoClusterInOut>();
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
            DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, null, this.contentField, term);
            int inDF = 0;
            int outDF = 0;
            int docID;
            while ((docID = docsEnum.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
                if (liveDocs != null && !liveDocs.get(docID)) {
                    // Deleted documents are not part of the corpus counted by numDocs().
                    continue;
                }
                if (clusterDocBitset.get(docID)) {
                    inDF++;
                } else {
                    outDF++;
                }
            }
            double logLikelihoodRatio = scoreDocumentFrequencies(inDF, outDF, clusterSize, numDocs);
            clusteredTermInfo.add(new TermInfoClusterInOut(term.utf8ToString(), inDF, outDF, logLikelihoodRatio));
        }

        Collections.sort(clusteredTermInfo);
        return clusteredTermInfo.subList(0, Math.min(clusteredTermInfo.size(), this.maxLabels));
    }

    /**
     * Builds a bit set, indexed by Lucene document number, marking the live documents whose id
     * is contained in {@code idSet}.
     *
     * @param reader  the index to scan
     * @param idSet   ids of the documents belonging to the cluster
     * @param idField stored field holding the document id, or {@code null} to use the Lucene
     *                document number (as a decimal string) as the id
     */
    private static OpenBitSet getClusterDocBitset(IndexReader reader, Collection<String> idSet, String idField) throws IOException {
        // Document numbers range over [0, maxDoc), which exceeds numDocs() when documents are deleted.
        int maxDoc = reader.maxDoc();
        OpenBitSet bitset = new OpenBitSet(maxDoc);
        Bits liveDocs = MultiFields.getLiveDocs(reader);

        TreeSet<String> idFieldSelector = null;
        if (idField != null) {
            idFieldSelector = new TreeSet<String>();
            idFieldSelector.add(idField);
        }

        for (int i = 0; i < maxDoc; i++) {
            if (liveDocs != null && !liveDocs.get(i)) {
                continue;
            }
            String id = idField == null ? Integer.toString(i) : reader.document(i, idFieldSelector).get(idField);
            if (idSet.contains(id)) {
                bitset.set(i);
            }
        }
        log.info("Created bitset for in-cluster documents : {}", bitset.cardinality());
        return bitset;
    }

    /**
     * Computes the log-likelihood ratio of the 2x2 contingency table
     * (in-cluster with term, in-cluster without term, out-of-cluster with term,
     * out-of-cluster without term).
     */
    private static double scoreDocumentFrequencies(long inDF, long outDF, long clusterSize, long corpusSize) {
        long k12 = clusterSize - inDF;
        long k22 = corpusSize - clusterSize - outDF;
        return LogLikelihood.logLikelihoodRatio(inDF, k12, outDF, k22);
    }

    /** Returns the name of the stored field used as document id, or {@code null} if unset. */
    public String getIdField() {
        return this.idField;
    }

    /** Sets the name of the stored field used as document id. */
    public void setIdField(String idField) {
        this.idField = idField;
    }

    /** Returns the output file path, or {@code null} when the report goes to standard output. */
    public String getOutput() {
        return this.output;
    }

    /** Sets the output file path; {@code null} sends the report to standard output. */
    public void setOutput(String output) {
        this.output = output;
    }

    /**
     * Command-line entry point. Parses the options, builds a {@link ClusterLabels} instance and
     * prints the labels. Option and I/O errors are logged; option errors also print the help.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args) {
        DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
        ArgumentBuilder abuilder = new ArgumentBuilder();
        GroupBuilder gbuilder = new GroupBuilder();

        Option indexOpt = obuilder.withLongName("dir").withRequired(true)
            .withArgument(abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
            .withDescription("The Lucene index directory").withShortName("d").create();
        Option outputOpt = obuilder.withLongName("output").withRequired(false)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription("The output file. If not specified, the result is printed on console.").withShortName("o").create();
        Option fieldOpt = obuilder.withLongName("field").withRequired(true)
            .withArgument(abuilder.withName("field").withMinimum(1).withMaximum(1).create())
            .withDescription("The content field in the index").withShortName("f").create();
        Option idFieldOpt = obuilder.withLongName("idField").withRequired(false)
            .withArgument(abuilder.withName("idField").withMinimum(1).withMaximum(1).create())
            .withDescription("The field for the document ID in the index.  If null, then the Lucene internal doc id is used which is prone to error if the underlying index changes").withShortName("i").create();
        Option seqOpt = obuilder.withLongName("seqFileDir").withRequired(true)
            .withArgument(abuilder.withName("seqFileDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The directory containing Sequence Files for the Clusters").withShortName("s").create();
        Option pointsOpt = obuilder.withLongName("pointsDir").withRequired(true)
            .withArgument(abuilder.withName("pointsDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The directory containing points sequence files mapping input vectors to their cluster.  ").withShortName("p").create();
        Option minClusterSizeOpt = obuilder.withLongName("minClusterSize").withRequired(false)
            .withArgument(abuilder.withName("minClusterSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The minimum number of points required in a cluster to print the labels for").withShortName("m").create();
        Option maxLabelsOpt = obuilder.withLongName("maxLabels").withRequired(false)
            .withArgument(abuilder.withName("maxLabels").withMinimum(1).withMaximum(1).create())
            .withDescription("The maximum number of labels to print per cluster").withShortName("x").create();
        Option helpOpt = DefaultOptionCreator.helpOption();

        Group group = gbuilder.withName("Options")
            .withOption(indexOpt)
            .withOption(idFieldOpt)
            .withOption(outputOpt)
            .withOption(fieldOpt)
            .withOption(seqOpt)
            .withOption(pointsOpt)
            .withOption(helpOpt)
            .withOption(maxLabelsOpt)
            .withOption(minClusterSizeOpt)
            .create();

        try {
            Parser parser = new Parser();
            parser.setGroup(group);
            CommandLine cmdLine = parser.parse(args);
            if (cmdLine.hasOption(helpOpt)) {
                CommandLineUtil.printHelp(group);
                return;
            }

            Path seqFileDir = new Path(cmdLine.getValue(seqOpt).toString());
            Path pointsDir = new Path(cmdLine.getValue(pointsOpt).toString());
            String indexDir = cmdLine.getValue(indexOpt).toString();
            String contentField = cmdLine.getValue(fieldOpt).toString();

            String idField = null;
            if (cmdLine.hasOption(idFieldOpt)) {
                idField = cmdLine.getValue(idFieldOpt).toString();
            }
            String output = null;
            if (cmdLine.hasOption(outputOpt)) {
                output = cmdLine.getValue(outputOpt).toString();
            }
            int maxLabels = DEFAULT_MAX_LABELS;
            if (cmdLine.hasOption(maxLabelsOpt)) {
                maxLabels = Integer.parseInt(cmdLine.getValue(maxLabelsOpt).toString());
            }
            int minSize = DEFAULT_MIN_IDS;
            if (cmdLine.hasOption(minClusterSizeOpt)) {
                minSize = Integer.parseInt(cmdLine.getValue(minClusterSizeOpt).toString());
            }

            ClusterLabels clusterLabel = new ClusterLabels(seqFileDir, pointsDir, indexDir, contentField, minSize, maxLabels);
            if (idField != null) {
                clusterLabel.setIdField(idField);
            }
            if (output != null) {
                clusterLabel.setOutput(output);
            }
            clusterLabel.getLabels();
        } catch (OptionException e) {
            log.error("Exception", e);
            CommandLineUtil.printHelp(group);
        } catch (IOException e) {
            log.error("Exception", e);
        }
    }
}

