/*
 * Decompiled with CFR 0.152.
 */
package pitt.search.semanticvectors;

import ch.akuhn.edu.mit.tedlab.DMat;
import ch.akuhn.edu.mit.tedlab.SMat;
import ch.akuhn.edu.mit.tedlab.SVDRec;
import ch.akuhn.edu.mit.tedlab.Svdlib;
import java.io.IOException;
import java.nio.file.FileSystems;
import java.util.logging.Logger;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import pitt.search.semanticvectors.FlagConfig;
import pitt.search.semanticvectors.LuceneUtils;
import pitt.search.semanticvectors.VectorStoreUtils;
import pitt.search.semanticvectors.VectorStoreWriter;
import pitt.search.semanticvectors.utils.VerbatimLogger;
import pitt.search.semanticvectors.vectors.RealVector;
import pitt.search.semanticvectors.vectors.VectorType;

/**
 * Command-line tool that builds term and document vectors from a Lucene index
 * using a singular value decomposition (LAS2, via {@link Svdlib}).
 *
 * <p>The pipeline, as driven by {@link #main(String[])}, is: parse flags, build a
 * sparse term-by-document matrix from the index ({@link #smatFromIndex()}),
 * decompose it, and write the resulting term and document vectors to the
 * vector store files named by the flag configuration.
 */
public class LSA {
    private static final Logger logger = Logger.getLogger(LSA.class.getCanonicalName());

    /** Usage text printed by {@link #main(String[])} when the command-line flags cannot be parsed. */
    public static String usageMessage = "\nLSA class in package pitt.search.semanticvectors\nUsage: java pitt.search.semanticvectors.LSA [other flags] -luceneindexpath PATH_TO_LUCENE_INDEX\nUse flags to configure dimension, min term frequency, etc. See online documentation for other available flags";

    private final FlagConfig flagConfig;
    /** Text of every term that passed the term filter, in matrix column order; filled by {@link #smatFromIndex()}. */
    private String[] termList;
    private final String contentsField;
    private final LuceneUtils luceneUtils;

    /**
     * Sets up the indexer: picks the single contents field to index and clamps the
     * requested dimension to the number of documents.
     *
     * @param luceneIndexDir path of the Lucene index; not read here, since
     *     {@link LuceneUtils} is constructed from {@code flagConfig} alone
     * @param flagConfig configuration; its dimension may be lowered by this constructor
     * @throws IOException if the Lucene index cannot be accessed
     */
    private LSA(String luceneIndexDir, FlagConfig flagConfig) throws IOException {
        this.flagConfig = flagConfig;
        this.luceneUtils = new LuceneUtils(flagConfig);

        String[] contentsFields = flagConfig.contentsfields();
        if (contentsFields.length > 1) {
            logger.warning("LSA implementation only supports a single -contentsfield. Only '" + contentsFields[0] + "' will be indexed.");
        }
        this.contentsField = contentsFields[0];

        int numDocs = this.luceneUtils.getNumDocs();
        if (flagConfig.dimension() > numDocs) {
            logger.warning("Dimension for SVD cannot be greater than number of documents ... Setting dimension to " + numDocs);
            flagConfig.setDimension(numDocs);
        }
        if (flagConfig.termweight().equals(LuceneUtils.TermWeight.LOGENTROPY)) {
            VerbatimLogger.info("Term weighting: log-entropy.\n");
        }
        VerbatimLogger.info("Set up LSA indexer.\nDimension: " + flagConfig.dimension() + " Lucene index contents field: '" + this.contentsField + "' Minimum frequency = " + flagConfig.minfrequency() + " Maximum frequency = " + flagConfig.maxfrequency() + " Number non-alphabet characters = " + flagConfig.maxnonalphabetchars() + "\n");
    }

    /**
     * Builds the sparse matrix handed to the SVD, with one row per document and one
     * column per term accepted by {@code luceneUtils.termFilter}.
     *
     * <p>The index is scanned twice: the first pass counts accepted terms and their
     * postings so the matrix can be allocated, the second pass fills it. As a side
     * effect {@link #termList} is populated in column order.
     *
     * @return the populated matrix; {@code pointr[c]} holds the index of the first
     *     entry of column {@code c}, {@code rowind} the document id of each entry and
     *     {@code value} the global term weight multiplied by the local term weight
     * @throws IOException if reading terms or postings from the index fails
     */
    private SMat smatFromIndex() throws IOException {
        Terms terms = this.luceneUtils.getTermsForField(this.contentsField);
        int numDocs = this.luceneUtils.getNumDocs();

        // Pass 1: size the matrix. Postings are counted only for terms that pass the
        // filter, because only those terms receive entries in pass 2.
        BytesRef bytes;
        TermsEnum termsEnumForCount = terms.iterator();
        int numTerms = 0;
        int nonZeroVals = 0;
        while ((bytes = termsEnumForCount.next()) != null) {
            Term term = new Term(this.contentsField, bytes);
            if (!this.luceneUtils.termFilter(term)) {
                continue;
            }
            ++numTerms;
            PostingsEnum docsEnum = this.luceneUtils.getDocsForTerm(term);
            while (docsEnum.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                ++nonZeroVals;
            }
        }
        VerbatimLogger.info(String.format("There are %d terms (and %d docs).\n", numTerms, numDocs));

        this.termList = new String[numTerms];
        SMat S = new SMat(numDocs, numTerms, nonZeroVals);

        // Pass 2: fill the matrix column by column.
        TermsEnum termsEnum = terms.iterator();
        int termCounter = 0;
        int firstNonZero = 0;
        while ((bytes = termsEnum.next()) != null) {
            Term term = new Term(this.contentsField, bytes);
            if (!this.luceneUtils.termFilter(term)) {
                continue;
            }
            S.pointr[termCounter] = firstNonZero;
            this.termList[termCounter] = term.text();

            // The global weight is requested once per term rather than once per posting;
            // this assumes it depends only on the term.
            float globalWeight = this.luceneUtils.getGlobalTermWeight(term);
            PostingsEnum docsEnum = this.luceneUtils.getDocsForTerm(term);
            while (docsEnum.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                S.rowind[firstNonZero] = docsEnum.docID();
                S.value[firstNonZero] = (float) (globalWeight * this.luceneUtils.getLocalTermWeight(docsEnum.freq()));
                ++firstNonZero;
            }
            ++termCounter;
        }
        // Closing sentinel: one past the last entry that was actually written.
        S.pointr[S.cols] = firstNonZero;
        return S;
    }

    /**
     * Writes the term vectors and then the document vectors to their store files in
     * the current working directory.
     *
     * @param vT matrix read as {@code vT.value[dimensionIndex][termIndex]}
     * @param uT matrix read as {@code uT.value[dimensionIndex][docIndex]}
     * @throws IOException if a store file cannot be created or written
     */
    private void writeOutput(DMat vT, DMat uT) throws IOException {
        int dimension = this.flagConfig.dimension();
        int cnt;

        // try-with-resources guarantees the directory and both outputs are closed even
        // when a write fails part-way through.
        try (FSDirectory fsDirectory = FSDirectory.open(FileSystems.getDefault().getPath("."))) {
            String termStoreName = VectorStoreUtils.getStoreFileName(this.flagConfig.termvectorsfile(), this.flagConfig);
            try (IndexOutput outputStream = fsDirectory.createOutput(termStoreName, IOContext.DEFAULT)) {
                outputStream.writeString(VectorStoreWriter.generateHeaderString(this.flagConfig));
                for (cnt = 0; cnt < this.termList.length; ++cnt) {
                    writeVector(outputStream, this.termList[cnt], vT, cnt, dimension);
                }
            }
            VerbatimLogger.info("Wrote " + cnt + " term vectors incrementally to file " + this.flagConfig.termvectorsfile() + ".\n");

            String docStoreName = VectorStoreUtils.getStoreFileName(this.flagConfig.docvectorsfile(), this.flagConfig);
            int numDocs = this.luceneUtils.getNumDocs();
            try (IndexOutput outputStream = fsDirectory.createOutput(docStoreName, IOContext.DEFAULT)) {
                outputStream.writeString(VectorStoreWriter.generateHeaderString(this.flagConfig));
                for (cnt = 0; cnt < numDocs; ++cnt) {
                    String thePath = this.luceneUtils.getDoc(cnt).get(this.flagConfig.docidfield());
                    writeVector(outputStream, thePath, uT, cnt, dimension);
                }
            }
            VerbatimLogger.info("Wrote " + cnt + " document vectors incrementally to file " + this.flagConfig.docvectorsfile() + ". Done.\n");
        }
    }

    /**
     * Writes one named vector: the key, followed by the normalized column
     * {@code column} of {@code matrix} truncated to {@code dimension} entries.
     */
    private static void writeVector(IndexOutput outputStream, String key, DMat matrix, int column, int dimension) throws IOException {
        outputStream.writeString(key);
        float[] coordinates = new float[dimension];
        for (int i = 0; i < dimension; ++i) {
            coordinates[i] = (float) matrix.value[i][column];
        }
        RealVector vector = new RealVector(coordinates);
        vector.normalize();
        vector.writeToLuceneStream(outputStream);
    }

    /**
     * Entry point: parses flags, builds the term-document matrix from the Lucene
     * index, runs the SVD and writes the term and document vector files.
     *
     * @param args command-line flags; see {@link #usageMessage}
     * @throws IllegalArgumentException if the flags cannot be parsed, if
     *     {@code -luceneindexpath} is empty, or if the number of contents fields is not one
     * @throws IOException if the index cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws IllegalArgumentException, IOException {
        FlagConfig flagConfig;
        try {
            flagConfig = FlagConfig.getFlagConfig(args);
        }
        catch (IllegalArgumentException e) {
            System.out.println(usageMessage);
            throw e;
        }
        if (flagConfig.vectortype() != VectorType.REAL) {
            // NOTE(review): the message says the vector type is being set, but nothing here
            // changes flagConfig; confirm whether the configuration should be updated too.
            logger.warning("LSA is only supported for real vectors ... setting vectortype to 'real'.");
        }
        if (flagConfig.luceneindexpath().isEmpty()) {
            throw new IllegalArgumentException("-luceneindexpath must be set.");
        }
        if (flagConfig.contentsfields().length != 1) {
            throw new IllegalArgumentException("LSA only supports one -contentsfield, more than this may cause a corrupt matrix.");
        }
        LSA lsaIndexer = new LSA(flagConfig.luceneindexpath(), flagConfig);
        SMat A = lsaIndexer.smatFromIndex();
        Svdlib svd = new Svdlib();
        VerbatimLogger.info("Starting SVD using algorithm LAS2 ...\n");
        SVDRec svdR = svd.svdLAS2A(A, flagConfig.dimension());
        DMat vT = svdR.Vt;
        DMat uT = svdR.Ut;
        lsaIndexer.writeOutput(vT, uT);
    }
}

