/*
 * Decompiled with CFR 0.152.
 */
package net.doo.datamining.preprocessing;

import com.beust.jcommander.Parameter;
import java.io.IOException;
import java.util.Set;
import net.doo.datamining.io.BinaryChunk;
import net.doo.datamining.preprocessing.CNormalizer;
import net.doo.datamining.preprocessing.StringFilter;
import net.doo.datamining.util.HashSetInt;

/**
 * Character-level filter that prepares document text for further processing.
 *
 * <p>Each input char is mapped individually (see {@link #map(char)}): ASCII digits may be
 * replaced by a placeholder, whitespace and disallowed chars become a single space, and the
 * remaining chars are optionally lower-cased. {@link #filterString(String)} additionally
 * collapses runs of spaces produced by that mapping.
 *
 * <p>The configuration can be supplied on the command line (JCommander {@code @Parameter}
 * fields) or loaded from a {@link BinaryChunk} via {@link #fromChunk(BinaryChunk)}.
 */
public class StringFilter {
    /** Replacement for ASCII digits; {@code '\u0000'} means "do not replace digits". */
    @Parameter(names={"-digitChar"}, description="The char to replace digits with. If not specified, digits are not replaced and will be filtered out completely, if the `filter` flag is set.", converter=CharConverter.class)
    private char digitChar = '\u0000';
    /** When {@code true}, only letters, diacritic marks and explicitly allowed chars pass. */
    @Parameter(names={"-filter"}, description="Whether to filter the input document contents or not. If filtering is on, only letters and chars that are explicitly allowed are let through", arity=1)
    protected boolean filter = true;
    /** Chars (stored as int values) that pass the filter in addition to letters. */
    @Parameter(names={"-allowed"}, description="The chars that are allowed apart from letters. Only used when filter=true. Filtering increases the results in almost all cases.", converter=AllowedConverter.class)
    protected HashSetInt allowed = new HashSetInt();
    /** When {@code true}, chars that survive filtering are lower-cased. */
    @Parameter(names={"-toLowerCase"}, description="Whether to lower case all chars before processing or not. Should be turned on.", arity=1)
    protected boolean toLowerCase = true;

    /**
     * Loads this filter's configuration from the sub-chunk named {@code "filt"} of {@code b}.
     *
     * <p>Read order: 64-bit flag set, digit char (as i32), number of allowed chars (i32),
     * then that many allowed chars (i32 each). The {@code filter} option is enabled when the
     * flag set contains {@code 1L}, {@code toLowerCase} when it contains {@code 2L}.
     *
     * <p>All values are read into locals first and only then assigned, so a read failure
     * part-way through leaves the previous configuration of this instance untouched.
     *
     * @param b the chunk containing the {@code "filt"} sub-chunk
     * @return this instance, for chaining
     * @throws IOException if reading from the chunk fails
     */
    public StringFilter fromChunk(BinaryChunk b) throws IOException {
        BinaryChunk f = b.readChunk("filt");
        Set<Long> flags = f.readFlags64();
        char loadedDigitChar = (char)f.readI32();
        int allowedSize = f.readI32();
        HashSetInt loadedAllowed = new HashSetInt(allowedSize);
        for (int n = 0; n < allowedSize; ++n) {
            loadedAllowed.add(f.readI32());
        }
        boolean loadedFilter = flags.contains(1L);
        boolean loadedToLowerCase = flags.contains(2L);

        // Commit only after every read succeeded, so the filter is never half-configured.
        this.digitChar = loadedDigitChar;
        this.allowed = loadedAllowed;
        this.filter = loadedFilter;
        this.toLowerCase = loadedToLowerCase;
        return this;
    }

    /**
     * Tells whether {@code c} lies in one of the Unicode combining-mark ranges
     * U+0300..U+036F, U+1DC0..U+1DFF, U+20D0..U+20FF or U+FE20..U+FE2F.
     *
     * @param c the char to test
     * @return {@code true} if {@code c} is inside one of those ranges
     */
    public static boolean isDiacretic(char c) {
        return c >= '\u0300' && c < '\u0370'
                || c >= '\u1dc0' && c < '\u1e00'
                || c >= '\u20d0' && c < '\u2100'
                || c >= '\ufe20' && c < '\ufe30';
    }

    /**
     * Tells whether {@code c} passes the filter.
     *
     * @param c the char to test
     * @return {@code true} if filtering is off, or if {@code c} is a letter, a diacritic mark
     *         (see {@link #isDiacretic(char)}) or a member of the allowed set
     */
    public boolean isAllowed(char c) {
        if (!this.filter) {
            return true;
        }
        return Character.isLetter(c) || StringFilter.isDiacretic(c) || this.allowed.contains(c);
    }

    /**
     * Maps a single char to its filtered form.
     *
     * <p>Rules are applied in this order:
     * <ol>
     *   <li>ASCII digits {@code '0'..'9'} become the digit char, if one is configured. This
     *       happens before the filter check, so replaced digits bypass filtering.</li>
     *   <li>Whitespace, space chars and chars rejected by {@link #isAllowed(char)} become
     *       {@code ' '}.</li>
     *   <li>Everything else is returned lower-cased if {@code toLowerCase} is set, otherwise
     *       unchanged.</li>
     * </ol>
     *
     * @param c the input char
     * @return the mapped char
     */
    public char map(char c) {
        boolean replaceDigits = this.digitChar != '\u0000';
        if (replaceDigits && c >= '0' && c <= '9') {
            return this.digitChar;
        }
        if (Character.isWhitespace(c) || Character.isSpaceChar(c) || !this.isAllowed(c)) {
            return ' ';
        }
        return this.toLowerCase ? Character.toLowerCase(c) : c;
    }

    /**
     * Normalizes {@code in} with {@link CNormalizer#normalizeAndPreprocess}, maps every char
     * through {@link #map(char)} and collapses consecutive spaces into one.
     *
     * <p>Leading and trailing spaces are not trimmed; a run of filtered chars at either end
     * yields a single space there.
     *
     * @param in the text to filter
     * @return the filtered text
     */
    public String filterString(String in) {
        String inNormalized = CNormalizer.normalizeAndPreprocess(in);
        int length = inNormalized.length();
        StringBuilder result = new StringBuilder(length);
        boolean lastWs = false;
        for (int n = 0; n < length; ++n) {
            char c = this.map(inNormalized.charAt(n));
            boolean isWs = c == ' ';
            // Emit a space only when the previous emitted char was not already a space.
            if (!isWs || !lastWs) {
                result.append(c);
            }
            lastWs = isWs;
        }
        return result.toString();
    }
}

