TokenSequenceBuilder.java

/*
 * (c) Copyright 2021 Hasan Selman Kara. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package li.selman.jpbe.dsl.token;

import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.function.BiFunction;

/**
 * Builds a {@link TokenSequence} describing the token structure of a substring.
 *
 * <p>Each character of the substring is mapped to a {@link Token}. A user supplied hook is asked first;
 * when the hook yields no token, the first matching token of the configured {@link Tokens} is used and,
 * failing that, its else-token. Runs of consecutive characters that map to equal tokens are collapsed
 * into a single entry of the resulting sequence.
 *
 * @author Hasan Selman Kara
 */
public class TokenSequenceBuilder {

    private final int maxLength;
    private final BiFunction<Character, Token, Optional<Token>> computeTokenForCharHook;
    private final Tokens tokens;

    /**
     * Hook used when the caller does not supply one: it never yields a token, so the configured
     * {@link Tokens} always decide.
     */
    private static Optional<Token> defaultHook(Character character, Token token) {
        return Optional.empty();
    }

    /**
     * Creates a builder that uses the default hook, i.e. only {@code tokens} decide how a character is mapped.
     *
     * @param maxLength number of collected tokens after which the computation is cancelled; must be at least 1
     * @param tokens    tokens used to classify characters; must not be {@code null}
     * @throws IllegalArgumentException if {@code maxLength} is smaller than 1 or {@code tokens} is {@code null}
     */
    public TokenSequenceBuilder(int maxLength, Tokens tokens) {
        this(maxLength, TokenSequenceBuilder::defaultHook, tokens);
    }

    /**
     * Creates a builder with a custom hook.
     *
     * @param maxLength               number of collected tokens after which the computation is cancelled;
     *                                must be at least 1
     * @param computeTokenForCharHook called with the current character and the previously collected token
     *                                (possibly {@code null}); a non-empty result takes precedence over
     *                                {@code tokens}. Must not be {@code null}
     * @param tokens                  tokens used to classify characters; must not be {@code null}
     * @throws IllegalArgumentException if {@code maxLength} is smaller than 1 or any reference argument
     *                                  is {@code null}
     */
    public TokenSequenceBuilder(int maxLength, BiFunction<Character, Token, Optional<Token>> computeTokenForCharHook,
                                Tokens tokens) {
        if (maxLength <= 0) {
            throw new IllegalArgumentException("MaxLength cannot be smaller than 1");
        }
        if (computeTokenForCharHook == null) {
            throw new IllegalArgumentException("Hook cannot be null. Use default hook!");
        }
        if (tokens == null) {
            throw new IllegalArgumentException("Tokens cannot be null");
        }

        this.maxLength = maxLength;
        this.computeTokenForCharHook = computeTokenForCharHook;
        this.tokens = tokens;
    }

    /**
     * Computes the token sequence of {@code input.substring(from, to)}.
     *
     * <p>{@link Token#START} is prepended when the substring begins at index 0 and {@link Token#END} is
     * appended when it reaches the end of {@code input}. As soon as more than {@code maxLength} tokens
     * (including a prepended {@code START}) have been collected, the tokens collected so far are returned
     * immediately; in that case {@code END} is not appended.
     *
     * @param input the whole input string provided by the data set; must not be {@code null}
     * @param from  start index (inclusive) for sub-string on {@code input}
     * @param to    end index (exclusive) for sub-string on {@code input}
     * @return sequence of tokens representing the token structure of a substring on {@code input}
     * @throws IllegalArgumentException if {@code to} is not in {@code [1, input.length()]}, {@code from} is not
     *                                  in {@code [0, input.length())}, or {@code to <= from}
     */
    @SuppressWarnings("checkstyle:CyclomaticComplexity")
    public TokenSequence computeTokenSequence(String input, int from, int to) {
        if (to < 1 || to > input.length()) {
            throw new IllegalArgumentException("'to' index is invalid.");
        }
        if (from < 0 || from >= input.length()) {
            throw new IllegalArgumentException("'from' index is invalid.");
        }
        if (to <= from) {
            throw new IllegalArgumentException("'from' index must be smaller than 'to' index.");
        }

        List<Token> tmpTokens = new ArrayList<>();

        if (from == 0) {
            tmpTokens.add(Token.START);
        }

        // Token emitted for the most recent character; null until the first character has been classified.
        Token last = null;
        for (int i = from; i < to; i++) {
            // Classify every character exactly once. The previous token handed to the classifier is
            // START (or null) for the first character and the last emitted token afterwards.
            Token current = computeTokenForChar(input.charAt(i), getLastOrNull(tmpTokens));

            // The first character always opens a token; later characters only do so on a token change.
            if (last == null || !last.equals(current)) {
                last = current;
                tmpTokens.add(current);
            }

            if (tmpTokens.size() > maxLength) {
                // Already too long, preemptive cancellation with current tokens
                return TokenSequence.of(tmpTokens);
            }
        }

        if (to == input.length()) {
            tmpTokens.add(Token.END);
        }

        return TokenSequence.of(tmpTokens);
    }

    /**
     * Returns the last element of {@code list}, or {@code null} when the list is empty.
     */
    private static <T> T getLastOrNull(List<T> list) {
        return list.isEmpty() ? null : list.get(list.size() - 1);
    }

    /**
     * Determines the token for a single character.
     *
     * <p>The hook is consulted first. Only when it returns an empty {@link Optional} are the configured
     * tokens checked in the order given by {@code tokens.getTokens()}; the first one whose
     * {@code matches(c, lastToken)} is {@code true} wins. When none matches, {@code tokens.getElseToken()}
     * is returned.
     *
     * @param c         character to classify
     * @param lastToken token collected immediately before this character, or {@code null} if there is none
     * @return the token chosen for {@code c}
     */
    Token computeTokenForChar(char c, Token lastToken) {
        return computeTokenForCharHook.apply(c, lastToken)
                .orElseGet(() -> tokens.getTokens().stream()
                        .filter(token -> token.matches(c, lastToken))
                        .findFirst()
                        .orElse(tokens.getElseToken()));
    }

}