/*
 * Decompiled with CFR 0.152.
 */
package org.tribuo.util.tokens.impl.wordpiece;

import com.oracle.labs.mlrg.olcut.config.Config;
import com.oracle.labs.mlrg.olcut.config.Configurable;
import com.oracle.labs.mlrg.olcut.util.IOUtil;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Splits a single token into sub-word pieces using a fixed vocabulary.
 *
 * <p>Pieces are found greedily: at each position the longest substring present in
 * the vocabulary is taken. Every piece that does not start at the beginning of the
 * token is looked up (and returned) with a {@code "##"} prefix. If any position has
 * no matching piece, the whole token is replaced by the unknown token.
 *
 * <p>Instances are built either directly from a vocabulary set, or from a path to a
 * vocabulary file that is read via {@code IOUtil.getLines}.
 */
public class Wordpiece implements Configurable {
    /** The unknown token used when none is supplied. */
    public static final String DEFAULT_UNKNOWN_TOKEN = "[UNK]";

    /** The per-word character limit used when none is supplied. */
    private static final int DEFAULT_MAX_INPUT_CHARACTERS_PER_WORD = 100;

    /** Prefix attached to every piece that does not start at offset 0 of the token. */
    private static final String CONTINUATION_PREFIX = "##";

    @Config(mandatory=true, description="path to a vocabulary data file.")
    private String vocabPath;
    @Config(mandatory=false, description="the value to use for 'UNKNOWN' tokens. Defaults to '[UNK]' which is a common default in BERT-based solutions.")
    private String unknownToken = DEFAULT_UNKNOWN_TOKEN;
    @Config(mandatory=false, description="the maximum number of characters per word to consider. This helps eliminate doing extra work on pathological cases.")
    private int maxInputCharactersPerWord = DEFAULT_MAX_INPUT_CHARACTERS_PER_WORD;

    /** Read-only view of the vocabulary; unset until a constructor or {@link #postConfig()} assigns it. */
    private Set<String> vocab;

    /**
     * No-argument constructor that leaves the vocabulary unset.
     * NOTE(review): presumably reserved for the configuration system, which would
     * populate the {@code @Config} fields and then call {@link #postConfig()} - confirm.
     */
    private Wordpiece() {
    }

    /**
     * Creates an instance over the given vocabulary with the default unknown token
     * and the default per-word character limit.
     *
     * @param vocab the set of known pieces; must not be null
     */
    public Wordpiece(Set<String> vocab) {
        this(vocab, DEFAULT_UNKNOWN_TOKEN);
    }

    /**
     * Creates an instance over the given vocabulary with the default per-word
     * character limit.
     *
     * @param vocab the set of known pieces; must not be null
     * @param unknownToken the value returned for tokens that cannot be split
     */
    public Wordpiece(Set<String> vocab, String unknownToken) {
        this(vocab, unknownToken, DEFAULT_MAX_INPUT_CHARACTERS_PER_WORD);
    }

    /**
     * Creates an instance over the given vocabulary.
     *
     * <p>The set is wrapped in an unmodifiable view, not copied, so later changes
     * the caller makes to {@code vocab} remain visible to this instance.
     *
     * @param vocab the set of known pieces; must not be null
     * @param unknownToken the value returned for tokens that cannot be split
     * @param maxInputCharactersPerWord tokens longer than this are mapped straight
     *        to the unknown token
     */
    public Wordpiece(Set<String> vocab, String unknownToken, int maxInputCharactersPerWord) {
        this.vocab = Collections.unmodifiableSet(vocab);
        this.unknownToken = unknownToken;
        this.maxInputCharactersPerWord = maxInputCharactersPerWord;
    }

    /**
     * Creates an instance whose vocabulary is read from a file, using the default
     * unknown token and the default per-word character limit.
     *
     * @param vocabPath path to the vocabulary data file
     * @throws UncheckedIOException if reading the vocabulary fails
     */
    public Wordpiece(String vocabPath) {
        this(vocabPath, DEFAULT_UNKNOWN_TOKEN, DEFAULT_MAX_INPUT_CHARACTERS_PER_WORD);
    }

    /**
     * Creates an instance whose vocabulary is read from a file.
     *
     * @param vocabPath path to the vocabulary data file
     * @param unknownToken the value returned for tokens that cannot be split
     * @param maxInputCharactersPerWord tokens longer than this are mapped straight
     *        to the unknown token
     * @throws UncheckedIOException if reading the vocabulary fails
     */
    public Wordpiece(String vocabPath, String unknownToken, int maxInputCharactersPerWord) {
        this.vocabPath = vocabPath;
        this.unknownToken = unknownToken;
        this.maxInputCharactersPerWord = maxInputCharactersPerWord;
        try {
            this.postConfig();
        } catch (IOException e) {
            // Constructors expose no checked exceptions; keep the cause attached.
            throw new UncheckedIOException(e);
        }
    }

    /**
     * Reads the lines of the file at {@code vocabPath} and stores them as the
     * (unmodifiable) vocabulary set; duplicate lines collapse into one entry.
     * NOTE(review): presumably also invoked by the configuration system once the
     * {@code @Config} fields are populated - confirm against {@code Configurable}.
     *
     * @throws IOException if the vocabulary file cannot be read
     */
    public void postConfig() throws IOException {
        this.vocab = Collections.unmodifiableSet(new HashSet<>(IOUtil.getLines(this.vocabPath)));
    }

    /**
     * Splits {@code token} into vocabulary pieces.
     *
     * <p>Starting at offset 0, the longest substring found in the vocabulary is
     * taken, then the search resumes right after it. Substrings that start past
     * offset 0 are looked up with a {@code "##"} prefix and returned with that
     * prefix attached.
     *
     * @param token the token to split; must not be null
     * @return the pieces in order; a single-element list holding the unknown token
     *         if the token is longer than the configured limit or some position
     *         has no matching piece; an empty list for an empty token
     */
    public List<String> wordpiece(String token) {
        final int length = token.length();
        if (length > this.maxInputCharactersPerWord) {
            return Collections.singletonList(this.unknownToken);
        }

        List<String> subTokens = new ArrayList<>();
        int start = 0;
        while (start < length) {
            // Shrink the candidate from the right until it is in the vocabulary.
            int end = length;
            String match = null;
            while (start < end) {
                String candidate = token.substring(start, end);
                if (start > 0) {
                    candidate = CONTINUATION_PREFIX + candidate;
                }
                if (this.vocab.contains(candidate)) {
                    match = candidate;
                    break;
                }
                end--;
            }
            if (match == null) {
                // Not even a single character matched here: the whole token is unknown.
                return Collections.singletonList(this.unknownToken);
            }
            subTokens.add(match);
            start = end;
        }
        return subTokens;
    }

    /**
     * Returns the value used for tokens that cannot be split.
     *
     * @return the unknown token
     */
    public String getUnknownToken() {
        return this.unknownToken;
    }

    /**
     * Returns the per-word character limit above which a token is mapped straight
     * to the unknown token.
     *
     * @return the maximum number of characters per word
     */
    public int getMaxInputCharactersPerWord() {
        return this.maxInputCharactersPerWord;
    }
}

