package net.doo.datamining.preprocessing;

import com.beust.jcommander.Parameter;
import java.io.IOException;
import java.util.Set;
import net.doo.datamining.io.BinaryChunk;
import net.doo.datamining.util.HashSetInt;
import net.doo.datamining.util.Strings;

/* loaded from: input_file:net/doo/datamining/preprocessing/StringFilter.class */
/**
 * Character-level text filter configured through JCommander {@code @Parameter} fields or
 * restored from a {@link BinaryChunk}.
 *
 * <p>Each input character is either replaced by the configured digit placeholder, turned into a
 * single space (whitespace and disallowed characters), or passed through, optionally lower-cased.
 * {@link #filterString(String)} additionally collapses runs of spaces into one.
 */
public class StringFilter {

    /** Value in the serialized flag set whose presence turns {@link #filter} on. */
    private static final long FLAG_FILTER = 1L;

    /** Value in the serialized flag set whose presence turns {@link #toLowerCase} on. */
    private static final long FLAG_TO_LOWER_CASE = 2L;

    /** Replacement for ASCII digits; the NUL char (0) means "no digit replacement". */
    @Parameter(names = {"-digitChar"}, description = "The char to replace digits with. If not specified, digits are not replaced and will be filtered out completely, if the `filter` flag is set.", converter = CharConverter.class)
    private char digitChar = '\0';

    /** When {@code false}, {@link #isAllowed(char)} accepts every character. */
    @Parameter(names = {"-filter"}, description = "Whether to filter the input document contents or not. If filtering is on, only letters and chars that are explicitly allowed are let through", arity = 1)
    protected boolean filter = true;

    /** Extra characters accepted by {@link #isAllowed(char)} besides letters and combining marks. */
    @Parameter(names = {"-allowed"}, description = "The chars that are allowed apart from letters. Only used when filter=true. Filtering increases the results in almost all cases.", converter = AllowedConverter.class)
    protected HashSetInt allowed = new HashSetInt();

    /** When {@code true}, {@link #map(char)} lower-cases every character it lets through. */
    @Parameter(names = {"-toLowerCase"}, description = "Whether to lower case all chars before processing or not. Should be turned on.", arity = 1)
    protected boolean toLowerCase = true;

    /**
     * Converter referenced by the {@code -allowed} parameter; builds a {@link HashSetInt} from the
     * raw option string.
     *
     * <p>NOTE(review): the decompiler reported that the method below was renamed from
     * {@code convert} (merged with a bridge method), so this class presumably implemented a
     * JCommander string-converter interface originally. Confirm against the original binary.
     */
    public static class AllowedConverter {
        /**
         * Wraps the option value in a new {@link HashSetInt}.
         *
         * @param str the raw option value
         * @return a set constructed directly from {@code str}
         */
        public HashSetInt m13convert(String str) {
            HashSetInt parsed = new HashSetInt(str);
            return parsed;
        }
    }

    /**
     * Converter referenced by the {@code -digitChar} parameter; picks the first character of the
     * raw option string.
     *
     * <p>NOTE(review): the decompiler reported that the method below was renamed from
     * {@code convert} (merged with a bridge method), so this class presumably implemented a
     * JCommander string-converter interface originally. Confirm against the original binary.
     */
    public static class CharConverter {
        /**
         * Returns the first character of {@code str}, or the NUL char when
         * {@code Strings.isNullOrEmpty(str)} reports the value as null or empty.
         *
         * @param str the raw option value
         * @return the boxed first character, or boxed {@code (char) 0}
         */
        public Character m14convert(String str) {
            char first = Strings.isNullOrEmpty(str) ? (char) 0 : str.charAt(0);
            return first;
        }
    }

    /**
     * Restores this filter's settings from the {@code "filt"} sub-chunk of {@code binaryChunk}.
     *
     * <p>Read order: 64-bit flag set, digit char (as i32), number of allowed chars (i32), then
     * that many i32 values which are added to a freshly created {@link #allowed} set.
     *
     * @param binaryChunk the chunk containing a {@code "filt"} sub-chunk
     * @return this instance, for chaining
     * @throws IOException propagated from the underlying chunk reads
     */
    public StringFilter fromChunk(BinaryChunk binaryChunk) throws IOException {
        BinaryChunk filterChunk = binaryChunk.readChunk("filt");
        Set<Long> flags = filterChunk.readFlags64();

        this.digitChar = (char) filterChunk.readI32();

        int allowedCount = filterChunk.readI32();
        HashSetInt restored = new HashSetInt(allowedCount);
        // Publish the new set before filling it, so the field is replaced even if a read fails.
        this.allowed = restored;
        for (int remaining = allowedCount; remaining > 0; remaining--) {
            restored.add(filterChunk.readI32());
        }

        this.filter = flags.contains(FLAG_FILTER);
        this.toLowerCase = flags.contains(FLAG_TO_LOWER_CASE);
        return this;
    }

    /**
     * Tells whether {@code c} lies in one of four combining-mark ranges: U+0300..U+036F,
     * U+1DC0..U+1DFF, U+20D0..U+20FF or U+FE20..U+FE2F.
     *
     * @param c the UTF-16 code unit to test
     * @return {@code true} if {@code c} falls into any of those ranges
     */
    public static boolean isDiacretic(char c) {
        if (c >= 0x0300 && c < 0x0370) {
            return true;
        }
        if (c >= 0x1DC0 && c < 0x1E00) {
            return true;
        }
        if (c >= 0x20D0 && c < 0x2100) {
            return true;
        }
        return c >= 0xFE20 && c < 0xFE30;
    }

    /**
     * Decides whether {@code c} may pass the filter.
     *
     * @param c the character to test
     * @return {@code true} if filtering is off, or {@code c} is a letter, a combining mark per
     *     {@link #isDiacretic(char)}, or contained in {@link #allowed}
     */
    public boolean isAllowed(char c) {
        if (!this.filter) {
            return true;
        }
        if (Character.isLetter(c)) {
            return true;
        }
        if (isDiacretic(c)) {
            return true;
        }
        return this.allowed.contains(c);
    }

    /**
     * Maps a single character to its filtered form.
     *
     * <ul>
     *   <li>ASCII digits become {@code digitChar} when a digit char is configured (non-zero);</li>
     *   <li>whitespace, space characters and anything rejected by {@link #isAllowed(char)}
     *       become {@code ' '};</li>
     *   <li>everything else is returned as is, lower-cased when {@link #toLowerCase} is set.</li>
     * </ul>
     *
     * @param c the input character
     * @return the mapped character
     */
    public char map(char c) {
        boolean asciiDigit = '0' <= c && c <= '9';
        if (asciiDigit && this.digitChar != 0) {
            return this.digitChar;
        }
        if (Character.isWhitespace(c) || Character.isSpaceChar(c)) {
            return ' ';
        }
        // Only consulted for non-whitespace characters, matching the short-circuit order.
        if (!isAllowed(c)) {
            return ' ';
        }
        if (this.toLowerCase) {
            return Character.toLowerCase(c);
        }
        return c;
    }

    /**
     * Normalizes {@code str} with {@code CNormalizer.normalizeAndPreprocess}, runs every UTF-16
     * code unit through {@link #map(char)} and collapses consecutive spaces into one.
     * Leading and trailing spaces are not trimmed.
     *
     * @param str the text to filter
     * @return the filtered text
     */
    public String filterString(String str) {
        String normalized = CNormalizer.normalizeAndPreprocess(str);
        StringBuilder result = new StringBuilder(normalized.length());
        boolean previousWasSpace = false;
        for (char raw : normalized.toCharArray()) {
            char mapped = map(raw);
            boolean isSpace = mapped == ' ';
            if (isSpace && previousWasSpace) {
                // Still inside a run of spaces: emit nothing, state stays "previous was space".
                continue;
            }
            result.append(mapped);
            previousWasSpace = isSpace;
        }
        return result.toString();
    }
}
