package net.doo.datamining.preprocessing;

import com.beust.jcommander.Parameter;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import net.doo.datamining.io.BinaryChunk;
import net.doo.datamining.io.EnumConverter;

/* loaded from: input_file:net/doo/datamining/preprocessing/DictionaryFactory.class */
/**
 * Holds the command-line configurable dictionary settings and turns a
 * {@link BagOfWords} into a sparse {@link WordVector} according to the
 * configured {@link FrequencyType}.
 *
 * <p>The settings can either be populated by JCommander through the
 * {@code @Parameter} annotated fields or restored from a serialized chunk via
 * {@link #fromChunk(BinaryChunk)}.
 */
public class DictionaryFactory {

    /** Name of the sub-chunk the settings are read from in {@link #fromChunk(BinaryChunk)}. */
    private static final String CHUNK_NAME = "dict";

    /** Value in the serialized flag set that marks {@code perClassDictionary} as enabled. */
    private static final long FLAG_PER_CLASS_DICTIONARY = 1L;

    /** Value in the serialized flag set that marks {@code normalizeWordVector} as enabled. */
    private static final long FLAG_NORMALIZE_WORD_VECTOR = 2L;

    @Parameter(names = {"-dictionarySize"}, description = "The number of words to remember for classification. Value must be > 0. If 'perClassDictionary' is used, this limit is used per class and the dictionary of each class is combined to one dictionary which is generally larger than this limit. If `perClassDictionary` is not set, the total dictionary is limited by this value.")
    private int dictionarySize = 1500;

    @Parameter(names = {"-perClassDictionary"}, description = "If set, the dictionary is built by taking the most common words in each class respectively, limiting the amount of these words by 'dictionarySize' and then combining these dictionaries into one.If not set, the dictionary is built from the most common words over all classes and the dictionary size is used on this big dictionary. It is recommended to set this flag.", arity = 1)
    private boolean perClassDictionary = true;

    @Parameter(names = {"-minWordCount"}, description = "The minimum number of times a word has to occur to be considered a candidate for the dictionary. If `perClassDictionary` is set, the word has to occur at least this often in one of the classes to be added to the dictionary. If `perClassDictionary` is not set, the word has to occur this often in the complete corpus.")
    private int minWordCount = 15;

    // NOTE(review): the help text describes `idf` as the "document frequency", while
    // buildIdfVector reads the value from Dictionary.getIndexedIdf() - confirm which is intended.
    @Parameter(names = {"-frequencyType"}, description = "Determines the way word frequencies (the number of occurences of a word in a document) are handled in the word vector. 'bernoulli' ignores the frequency and simply sets the entry for the word to '1' if the word is present and `0` if it isn't. `multinomial` sets the entry to the number of occurences of the word. `tfidf` sets the vector entry to the number of occurences multiplied with the inverse document frequency of the word. `idf` sets it simply to the document frequency of the word if it is present and to `0` if it is not.", converter = FrequencyConverter.class)
    private FrequencyType frequencyType = FrequencyType.bernoulli;

    @Parameter(names = {"-normalizeWordVector"}, description = "Whether to normalize the word vectors create from the documents or not.", arity = 1)
    private boolean normalizeWordVector = false;

    /** JCommander converter used for the {@code -frequencyType} option. */
    public static class FrequencyConverter extends EnumConverter<FrequencyType> {
    }

    /** The supported ways of weighting a dictionary word inside a word vector. */
    public enum FrequencyType {
        bernoulli,
        multinomial,
        tfidf,
        idf
    }

    /**
     * @return whether word vectors are configured to be normalized
     */
    public boolean isNormalizeWordVector() {
        return this.normalizeWordVector;
    }

    /**
     * Overwrites the settings of this instance with the values stored in the
     * {@code "dict"} sub-chunk of the given chunk.
     *
     * @param binaryChunk the chunk that contains the {@code "dict"} sub-chunk
     * @return this instance, to allow call chaining
     * @throws IOException if reading from the chunk fails
     */
    public DictionaryFactory fromChunk(BinaryChunk binaryChunk) throws IOException {
        BinaryChunk dictChunk = binaryChunk.readChunk(CHUNK_NAME);
        // The reads below are sequential; their order must not be changed.
        Set<Long> flags = dictChunk.readFlags64();
        this.dictionarySize = dictChunk.readI32();
        this.minWordCount = dictChunk.readI32();
        this.frequencyType = (FrequencyType) dictChunk.readEnum(FrequencyType.values());
        this.perClassDictionary = flags.contains(FLAG_PER_CLASS_DICTIONARY);
        this.normalizeWordVector = flags.contains(FLAG_NORMALIZE_WORD_VECTOR);
        return this;
    }

    /**
     * Builds the sparse word vector for a document using the configured frequency type.
     * Words that are not contained in the dictionary are ignored.
     *
     * @param bagOfWords the words of the document plus its meta data
     * @param dictionary the dictionary that maps words to vector indices
     * @return a word vector carrying the category key, original file name and original
     *         content length of {@code bagOfWords} together with the computed entries
     * @throws IllegalStateException if the configured frequency type is not handled
     */
    public WordVector buildWordVector(BagOfWords bagOfWords, Dictionary dictionary) {
        SortedMap<Integer, Double> entries;
        switch (this.frequencyType) {
            case bernoulli:
                entries = buildBernoulliVector(bagOfWords, dictionary);
                break;
            case multinomial:
                entries = buildMultinomialVector(bagOfWords, dictionary);
                break;
            case tfidf:
                entries = buildTfIdfVector(bagOfWords, dictionary);
                break;
            case idf:
                entries = buildIdfVector(bagOfWords, dictionary);
                break;
            default:
                throw new IllegalStateException("Unknown frequency type: " + this.frequencyType);
        }
        return new WordVector(bagOfWords.categoryKey, bagOfWords.originalFilename, bagOfWords.originalContentLength, entries);
    }

    /** Looks up the vector index of a word; returns {@code null} if the dictionary has no entry for it. */
    private static Integer indexOf(Dictionary dictionary, String word) {
        return (Integer) dictionary.getWordToIndex().get(word);
    }

    /** Returns the value stored at the given index of the dictionary's indexed idf collection. */
    private static Double idfAt(Dictionary dictionary, int index) {
        return (Double) dictionary.getIndexedIdf().get(index);
    }

    /** Maps the index of every dictionary word that occurs in the document to {@code 1.0}. */
    private static SortedMap<Integer, Double> buildBernoulliVector(BagOfWords bagOfWords, Dictionary dictionary) {
        SortedMap<Integer, Double> vector = new TreeMap<Integer, Double>();
        for (String word : bagOfWords.getWords()) {
            Integer index = indexOf(dictionary, word);
            if (index != null) {
                vector.put(index, Double.valueOf(1.0d));
            }
        }
        return vector;
    }

    /** Maps the index of every dictionary word to the number of times it is yielded by the document's words. */
    private static SortedMap<Integer, Double> buildMultinomialVector(BagOfWords bagOfWords, Dictionary dictionary) {
        SortedMap<Integer, Double> vector = new TreeMap<Integer, Double>();
        for (String word : bagOfWords.getWords()) {
            Integer index = indexOf(dictionary, word);
            if (index == null) {
                continue;
            }
            Double count = vector.get(index);
            double updated = (count == null) ? 1.0d : count.doubleValue() + 1.0d;
            vector.put(index, Double.valueOf(updated));
        }
        return vector;
    }

    /** Scales every multinomial count by the dictionary's idf value for the same index. */
    private static SortedMap<Integer, Double> buildTfIdfVector(BagOfWords bagOfWords, Dictionary dictionary) {
        SortedMap<Integer, Double> vector = new TreeMap<Integer, Double>();
        for (Map.Entry<Integer, Double> entry : buildMultinomialVector(bagOfWords, dictionary).entrySet()) {
            Integer index = entry.getKey();
            double termFrequency = entry.getValue().doubleValue();
            double idf = idfAt(dictionary, index.intValue()).doubleValue();
            vector.put(index, Double.valueOf(termFrequency * idf));
        }
        return vector;
    }

    /** Maps the index of every dictionary word that occurs in the document to the dictionary's idf value. */
    private static SortedMap<Integer, Double> buildIdfVector(BagOfWords bagOfWords, Dictionary dictionary) {
        SortedMap<Integer, Double> vector = new TreeMap<Integer, Double>();
        for (String word : bagOfWords.getWords()) {
            Integer index = indexOf(dictionary, word);
            if (index != null) {
                vector.put(index, idfAt(dictionary, index.intValue()));
            }
        }
        return vector;
    }
}
