/*
 * Decompiled with CFR 0.152.
 */
package net.doo.datamining.preprocessing;

import com.beust.jcommander.Parameter;
import com.google.common.base.Objects;
import java.io.IOException;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import net.doo.datamining.io.BinaryChunk;
import net.doo.datamining.preprocessing.BagOfWords;
import net.doo.datamining.preprocessing.Dictionary;
import net.doo.datamining.preprocessing.DictionaryFactory;
import net.doo.datamining.preprocessing.WordVector;

public class DictionaryFactory {
    @Parameter(names={"-dictionarySize"}, description="The number of words to remember for classification. Value must be > 0. If 'perClassDictionary' is used, this limit is used per class and the dictionary of each class is combined to one dictionary which is generally larger than this limit. If `perClassDictionary` is not set, the total dictionary is limited by this value.")
    private int dictionarySize = 1500;
    @Parameter(names={"-perClassDictionary"}, description="If set, the dictionary is built by taking the most common words in each class respectively, limiting the amount of these words by 'dictionarySize' and then combining these dictionaries into one.If not set, the dictionary is built from the most common words over all classes and the dictionary size is used on this big dictionary. It is recommended to set this flag.", arity=1)
    private boolean perClassDictionary = true;
    @Parameter(names={"-minWordCount"}, description="The minimum number of times a word has to occur to be considered a candidate for the dictionary. If `perClassDictionary` is set, the word has to occur at least this often in one of the classes to be added to the dictionary. If `perClassDictionary` is not set, the word has to occur this often in the complete corpus.")
    private int minWordCount = 15;
    @Parameter(names={"-frequencyType"}, description="Determines the way word frequencies (the number of occurences of a word in a document) are handled in the word vector. 'bernoulli' ignores the frequency and simply sets the entry for the word to '1' if the word is present and `0` if it isn't. `multinomial` sets the entry to the number of occurences of the word. `tfidf` sets the vector entry to the number of occurences multiplied with the inverse document frequency of the word. `idf` sets it simply to the document frequency of the word if it is present and to `0` if it is not.", converter=FrequencyConverter.class)
    private FrequencyType frequencyType = FrequencyType.bernoulli;
    @Parameter(names={"-normalizeWordVector"}, description="Whether to normalize the word vectors create from the documents or not.", arity=1)
    private boolean normalizeWordVector = false;

    public boolean isNormalizeWordVector() {
        return this.normalizeWordVector;
    }

    public DictionaryFactory fromChunk(BinaryChunk b) throws IOException {
        BinaryChunk d = b.readChunk("dict");
        Set<Long> flags = d.readFlags64();
        this.dictionarySize = d.readI32();
        this.minWordCount = d.readI32();
        this.frequencyType = d.readEnum(FrequencyType.values());
        this.perClassDictionary = flags.contains(1L);
        this.normalizeWordVector = flags.contains(2L);
        return this;
    }

    public WordVector buildWordVector(BagOfWords document, Dictionary d) {
        SortedMap<Integer, Double> vector;
        switch (this.frequencyType) {
            case bernoulli: {
                vector = DictionaryFactory.buildBernoulliVector(document, d);
                break;
            }
            case multinomial: {
                vector = DictionaryFactory.buildMultinomialVector(document, d);
                break;
            }
            case tfidf: {
                vector = DictionaryFactory.buildTfIdfVector(document, d);
                break;
            }
            case idf: {
                vector = DictionaryFactory.buildIdfVector(document, d);
                break;
            }
            default: {
                throw new IllegalStateException("Unknown frequency type: " + (Object)((Object)this.frequencyType));
            }
        }
        return new WordVector(document.categoryKey, document.originalFilename, document.originalContentLength, vector);
    }

    private static SortedMap<Integer, Double> buildBernoulliVector(BagOfWords document, Dictionary d) {
        TreeMap<Integer, Double> v = new TreeMap<Integer, Double>();
        for (String word : document.getWords()) {
            Integer idx = d.getWordToIndex().get(word);
            if (idx == null) continue;
            v.put(idx, 1.0);
        }
        return v;
    }

    private static SortedMap<Integer, Double> buildMultinomialVector(BagOfWords document, Dictionary d) {
        TreeMap<Integer, Double> v = new TreeMap<Integer, Double>();
        for (String word : document.getWords()) {
            Integer idx = d.getWordToIndex().get(word);
            if (idx == null) continue;
            Double value = (Double)v.get(idx);
            if (value == null) {
                v.put(idx, 1.0);
                continue;
            }
            v.put(idx, value + 1.0);
        }
        return v;
    }

    private static SortedMap<Integer, Double> buildTfIdfVector(BagOfWords document, Dictionary d) {
        TreeMap<Integer, Double> v = new TreeMap<Integer, Double>();
        for (Map.Entry<Integer, Double> entry : DictionaryFactory.buildMultinomialVector(document, d).entrySet()) {
            v.put(entry.getKey(), entry.getValue() * (Double)d.getIndexedIdf().get(entry.getKey()));
        }
        return v;
    }

    private static SortedMap<Integer, Double> buildIdfVector(BagOfWords document, Dictionary d) {
        TreeMap<Integer, Double> v = new TreeMap<Integer, Double>();
        for (String word : document.getWords()) {
            Integer idx = d.getWordToIndex().get(word);
            if (idx == null) continue;
            v.put(idx, (Double)d.getIndexedIdf().get(idx));
        }
        return v;
    }

    public String toString() {
        return Objects.toStringHelper(this).add("dictionarySize", this.dictionarySize).add("perClassDictionary?", this.perClassDictionary).add("minWordCount", this.minWordCount).add("frequencyType", (Object)this.frequencyType).add("normalizeWordVector", this.normalizeWordVector).toString();
    }

    public static final class FrequencyType
    extends Enum<FrequencyType> {
        public static final /* enum */ FrequencyType bernoulli = new FrequencyType();
        public static final /* enum */ FrequencyType multinomial = new FrequencyType();
        public static final /* enum */ FrequencyType tfidf = new FrequencyType();
        public static final /* enum */ FrequencyType idf = new FrequencyType();
        private static final /* synthetic */ FrequencyType[] $VALUES;

        public static FrequencyType[] values() {
            return (FrequencyType[])$VALUES.clone();
        }

        static {
            $VALUES = new FrequencyType[]{bernoulli, multinomial, tfidf, idf};
        }
    }
}

