/*
 * Decompiled with CFR 0.152.
 */
package net.doo.datamining.preprocessing;

import com.beust.jcommander.Parameter;
import com.google.common.base.Charsets;
import com.google.common.base.MoreObjects;
import com.google.common.collect.ImmutableSet;
import com.google.common.io.Files;
import java.io.File;
import java.io.IOException;
import java.util.Set;
import net.doo.datamining.io.BinaryChunk;
import net.doo.datamining.preprocessing.BagOfWords;
import net.doo.datamining.preprocessing.BagOfWordsFactory;
import net.doo.datamining.preprocessing.StringFilter;
import net.doo.datamining.preprocessing.WordReader;
import net.doo.datamining.util.Pair;

/**
 * Holds the word-level preprocessing settings (minimum word length, stopword
 * set and dropoff factor) and builds {@link BagOfWords} instances from files
 * or in-memory strings using those settings.
 *
 * <p>The settings can be supplied on the command line through the
 * {@link Parameter} annotations below, or restored from a {@link BinaryChunk}
 * via {@link #fromChunk(BinaryChunk)}.
 */
public class BagOfWordsFactory {
    @Parameter(names={"-minWordLength"}, description="The minimum length for a word so it does not get filtered out.")
    private int minWordLength = 2;
    @Parameter(names={"-stopwords"}, description="Language code for the stopword set to use. If not set, no stopword filtering is performed. You can supply a comma separated list of stopwords without whitespace, e.g. deu,eng.", converter=StopwordsConverter.class)
    private Set<String> stopwords = ImmutableSet.of();
    @Parameter(names={"-dropoffFactor"}, description="This factor influences how much of the beginning of a document is analyzed. -1 uses the whole document. If something > -1 is used, the first hundred words are always used, if the amount of words is larger than this, it is reduced logarithmically and the result scaled by this constant. Larger numbers = more words.")
    private int dropoffFactor = 50;

    /**
     * @return the configured minimum word length (defaults to 2)
     */
    public int getMinWordLength() {
        return this.minWordLength;
    }

    /**
     * @return the configured dropoff factor (defaults to 50)
     */
    public int getDropoffFactor() {
        return this.dropoffFactor;
    }

    /**
     * @return the configured stopword set; empty unless set via the command
     *         line or {@link #fromChunk(BinaryChunk)}
     */
    public Set<String> getStopwords() {
        return this.stopwords;
    }

    /**
     * Overwrites this factory's settings with the values stored in the
     * {@code "word"} sub-chunk of the given chunk.
     *
     * <p>The sub-chunk is read in this order: minimum word length (i32),
     * dropoff factor (i32), number of stopwords (i32), followed by that many
     * stopword strings.
     *
     * @param b the chunk containing a {@code "word"} sub-chunk
     * @return this factory, to allow call chaining
     * @throws IOException if reading from the chunk fails
     */
    public BagOfWordsFactory fromChunk(BinaryChunk b) throws IOException {
        BinaryChunk wordChunk = b.readChunk("word");
        this.minWordLength = wordChunk.readI32();
        this.dropoffFactor = wordChunk.readI32();
        int stopwordCount = wordChunk.readI32();
        // Parameterized builder (instead of the raw type) keeps the
        // assignment to Set<String> below free of unchecked conversions.
        ImmutableSet.Builder<String> stopwordsBuilder = ImmutableSet.builder();
        for (int i = 0; i < stopwordCount; ++i) {
            stopwordsBuilder.add(wordChunk.readString());
        }
        this.stopwords = stopwordsBuilder.build();
        return this;
    }

    /**
     * Builds a {@link BagOfWords} for the given file.
     *
     * <p>The file is always read fully as UTF-8 because the content length is
     * passed to the resulting {@link BagOfWords}. When
     * {@code fastPreprocessing} is set, the already loaded content is handed
     * to {@link #preprocess(Pair, String, String, StringFilter)} with
     * {@code relativePath} as the filename; otherwise a {@link WordReader}
     * over the file itself is passed to the {@link BagOfWords}.
     *
     * @param key               key handed through to the {@link BagOfWords}
     * @param f                 the file to read
     * @param relativePath      path handed through to the {@link BagOfWords}
     * @param filter            filter handed to the {@link WordReader}
     * @param fastPreprocessing whether to tokenize the in-memory content
     *                          instead of passing a reader over the file
     * @return the bag of words for the file
     * @throws IOException if the file cannot be read
     */
    public BagOfWords preprocess(Pair<String, String> key, File f, String relativePath, StringFilter filter, boolean fastPreprocessing) throws IOException {
        String content = Files.toString(f, Charsets.UTF_8);
        if (fastPreprocessing) {
            return this.preprocess(key, relativePath, content, filter);
        }
        WordReader wordReader = new WordReader(this, filter, f);
        return new BagOfWords(key, relativePath, content.length(), wordReader);
    }

    /**
     * Builds a {@link BagOfWords} from in-memory content.
     *
     * <p>The content is passed through {@code filter} and then tokenized by a
     * {@link WordReader} created for {@code originalFilename}. The length
     * given to the {@link BagOfWords} is that of the unfiltered content.
     *
     * @param key              key handed through to the {@link BagOfWords}
     * @param originalFilename filename handed to the {@link WordReader} and
     *                         the {@link BagOfWords}
     * @param content          the raw text to process
     * @param filter           filter applied to the content before tokenizing
     * @return the bag of words for the content
     */
    public BagOfWords preprocess(Pair<String, String> key, String originalFilename, String content, StringFilter filter) {
        // The reader is created before filtering, matching the original
        // statement order in case either call has side effects.
        WordReader wordReader = new WordReader(this, filter, new File(originalFilename));
        String filtered = filter.filterString(content);
        return new BagOfWords(key, originalFilename, content.length(), wordReader.tokenize(filtered));
    }

    /**
     * Returns a string containing the minimum word length and the dropoff
     * factor. The stopword set is not included.
     */
    @Override
    public String toString() {
        return MoreObjects.toStringHelper(this)
                .add("minWordLength", this.minWordLength)
                .add("dropoffFactor", this.dropoffFactor)
                .toString();
    }
}

