StringTokenizer.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.text;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.matcher.StringMatcher;
import org.apache.commons.text.matcher.StringMatcherFactory;

/**
 * Tokenizes a string based on delimiters (separators) and supporting quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims to do a similar job to
 * {@link java.util.StringTokenizer StringTokenizer}, however it offers much more control and flexibility including
 * implementing the {@code ListIterator} interface. By default, it is set up like {@code StringTokenizer}.
 * <p>
 * The input String is split into a number of <em>tokens</em>. Each token is separated from the next token by a
 * <em>delimiter</em>. One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes. The <em>quote</em> matcher specifies the quote character(s). A quote may be
 * escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming. The <em>trimmer</em> matcher
 * specifies these characters. One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters. The <em>ignored</em> matcher specifies
 * these characters to be removed. One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
 *
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 *
 * <table>
 * <caption>StringTokenizer properties and options</caption>
 * <tr>
 * <th>Property</th>
 * <th>Type</th>
 * <th>Default</th>
 * </tr>
 * <tr>
 * <td>delim</td>
 * <td>CharSetMatcher</td>
 * <td>{ \t\n\r\f}</td>
 * </tr>
 * <tr>
 * <td>quote</td>
 * <td>NoneMatcher</td>
 * <td>{}</td>
 * </tr>
 * <tr>
 * <td>ignore</td>
 * <td>NoneMatcher</td>
 * <td>{}</td>
 * </tr>
 * <tr>
 * <td>emptyTokenAsNull</td>
 * <td>boolean</td>
 * <td>false</td>
 * </tr>
 * <tr>
 * <td>ignoreEmptyTokens</td>
 * <td>boolean</td>
 * <td>true</td>
 * </tr>
 * </table>
 *
 * @since 1.3
 */
public class StringTokenizer implements ListIterator<String>, Cloneable {

    /** Prototype holding the CSV configuration; the CSV factory methods hand out clones of it. */
    private static final StringTokenizer CSV_TOKENIZER_PROTOTYPE;

    /** Prototype holding the TSV configuration; the TSV factory methods hand out clones of it. */
    private static final StringTokenizer TSV_TOKENIZER_PROTOTYPE;

    static {
        // Each prototype is configured through a local and then published to its constant.
        final StringTokenizer csv = new StringTokenizer();
        csv.setDelimiterMatcher(StringMatcherFactory.INSTANCE.commaMatcher());
        csv.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
        csv.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
        csv.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
        csv.setEmptyTokenAsNull(false);
        csv.setIgnoreEmptyTokens(false);
        CSV_TOKENIZER_PROTOTYPE = csv;

        // The TSV prototype differs from the CSV one only in its delimiter matcher.
        final StringTokenizer tsv = new StringTokenizer();
        tsv.setDelimiterMatcher(StringMatcherFactory.INSTANCE.tabMatcher());
        tsv.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
        tsv.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
        tsv.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
        tsv.setEmptyTokenAsNull(false);
        tsv.setIgnoreEmptyTokens(false);
        TSV_TOKENIZER_PROTOTYPE = tsv;
    }

    /**
     * Clones the shared CSV prototype.
     *
     * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}
     */
    private static StringTokenizer getCSVClone() {
        final Object copy = CSV_TOKENIZER_PROTOTYPE.clone();
        return (StringTokenizer) copy;
    }

    /**
     * Gets a new tokenizer instance configured to parse Comma Separated Value strings, with no input text set.
     * The default for CSV processing is to trim whitespace from both ends of each token (the trimmer matcher can
     * be replaced on the returned instance).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * </p>
     *
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StringTokenizer getCSVInstance() {
        final StringTokenizer tokenizer = getCSVClone();
        return tokenizer;
    }

    /**
     * Gets a new tokenizer instance configured to parse Comma Separated Value strings and primed with the given
     * characters. The default for CSV processing is to trim whitespace from both ends of each token (the trimmer
     * matcher can be replaced on the returned instance).
     *
     * @param input
     *            the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StringTokenizer getCSVInstance(final char[] input) {
        final StringTokenizer tokenizer = getCSVClone();
        return tokenizer.reset(input);
    }

    /**
     * Gets a new tokenizer instance configured to parse Comma Separated Value strings and primed with the given
     * string. The default for CSV processing is to trim whitespace from both ends of each token (the trimmer
     * matcher can be replaced on the returned instance).
     *
     * @param input
     *            the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StringTokenizer getCSVInstance(final String input) {
        final StringTokenizer tokenizer = getCSVClone();
        return tokenizer.reset(input);
    }

    /**
     * Clones the shared TSV prototype.
     *
     * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}
     */
    private static StringTokenizer getTSVClone() {
        final Object copy = TSV_TOKENIZER_PROTOTYPE.clone();
        return (StringTokenizer) copy;
    }

    /**
     * Gets a new tokenizer instance configured to parse Tab Separated Value strings, with no input text set. The
     * default for TSV processing is to trim whitespace from both ends of each token (the trimmer matcher can be
     * replaced on the returned instance).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * </p>
     *
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance() {
        final StringTokenizer tokenizer = getTSVClone();
        return tokenizer;
    }

    /**
     * Gets a new tokenizer instance configured to parse Tab Separated Value strings and primed with the given
     * characters. The default for TSV processing is to trim whitespace from both ends of each token (the trimmer
     * matcher can be replaced on the returned instance).
     *
     * @param input
     *            the text to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance(final char[] input) {
        final StringTokenizer tokenizer = getTSVClone();
        return tokenizer.reset(input);
    }

    /**
     * Gets a new tokenizer instance configured to parse Tab Separated Value strings and primed with the given
     * string. The default for TSV processing is to trim whitespace from both ends of each token (the trimmer
     * matcher can be replaced on the returned instance).
     *
     * @param input
     *            the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance(final String input) {
        final StringTokenizer tokenizer = getTSVClone();
        return tokenizer.reset(input);
    }

    /** The text to work on; {@code null} when no input has been supplied. */
    private char[] chars;

    /** The parsed tokens; {@code null} until tokenization has been performed, and again after a reset. */
    private String[] tokens;

    /** The current iteration position: the index into {@code tokens} of the next token to return. */
    private int tokenPos;

    /** The delimiter matcher; initialised to the factory's split matcher. */
    private StringMatcher delimMatcher = StringMatcherFactory.INSTANCE.splitMatcher();

    /** The quote matcher; initialised to the factory's none matcher. */
    private StringMatcher quoteMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** The ignored matcher; initialised to the factory's none matcher. */
    private StringMatcher ignoredMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** The trimmer matcher; initialised to the factory's none matcher. */
    private StringMatcher trimmerMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** Whether to return empty tokens as null; {@code false} unless changed. */
    private boolean emptyAsNull;

    /** Whether to ignore empty tokens; {@code true} unless changed. */
    private boolean ignoreEmptyTokens = true;

    /**
     * Constructs a tokenizer that uses the default delimiter matcher (splitting on whitespace characters as per
     * StringTokenizer) and has no text to tokenize yet.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     * </p>
     */
    public StringTokenizer() {
        chars = null;
    }

    /**
     * Constructs a tokenizer that uses the default delimiter matcher (splitting on whitespace characters as per
     * StringTokenizer).
     *
     * @param input
     *            the characters to be parsed; a copy of the array is stored, {@code null} means no text to parse
     */
    public StringTokenizer(final char[] input) {
        if (input == null) {
            this.chars = null;
        } else {
            // keep a private copy so later changes to the caller's array do not affect tokenization
            this.chars = input.clone();
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input
     *            the characters to be parsed; a copy of the array is stored, {@code null} means no text to parse
     * @param delim
     *            the field delimiter character
     */
    public StringTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
     * quote character.
     *
     * @param input
     *            the characters to be parsed; a copy of the array is stored, {@code null} means no text to parse
     * @param delim
     *            the field delimiter character
     * @param quote
     *            the field quoted string character
     */
    public StringTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input
     *            the characters to be parsed; a copy of the array is stored, {@code null} means no text to parse
     * @param delim
     *            the field delimiter string
     */
    public StringTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input
     *            the characters to be parsed; a copy of the array is stored, {@code null} means no text to parse
     * @param delim
     *            the field delimiter matcher
     */
    public StringTokenizer(final char[] input, final StringMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
     * quote matcher.
     *
     * @param input
     *            the characters to be parsed; a copy of the array is stored, {@code null} means no text to parse
     * @param delim
     *            the field delimiter matcher
     * @param quote
     *            the field quoted string matcher
     */
    public StringTokenizer(final char[] input, final StringMatcher delim, final StringMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer that uses the default delimiter matcher (splitting on whitespace characters as per
     * StringTokenizer).
     *
     * @param input
     *            the string which is to be parsed, {@code null} means no text to parse
     */
    public StringTokenizer(final String input) {
        if (input == null) {
            this.chars = null;
        } else {
            this.chars = input.toCharArray();
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input
     *            the string which is to be parsed, {@code null} means no text to parse
     * @param delim
     *            the field delimiter character
     */
    public StringTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
     * quote character.
     *
     * @param input
     *            the string which is to be parsed, {@code null} means no text to parse
     * @param delim
     *            the field delimiter character
     * @param quote
     *            the field quoted string character
     */
    public StringTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input
     *            the string which is to be parsed, {@code null} means no text to parse
     * @param delim
     *            the field delimiter string
     */
    public StringTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input
     *            the string which is to be parsed, {@code null} means no text to parse
     * @param delim
     *            the field delimiter matcher
     */
    public StringTokenizer(final String input, final StringMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
     * quote matcher.
     *
     * @param input
     *            the string which is to be parsed, {@code null} means no text to parse
     * @param delim
     *            the field delimiter matcher
     * @param quote
     *            the field quoted string matcher
     */
    public StringTokenizer(final String input, final StringMatcher delim, final StringMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation; tokens cannot be
     * inserted through the iterator.
     *
     * @param obj
     *            this parameter ignored.
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    /**
     * Appends a token to the list, honouring the empty-token settings.
     * <p>
     * A {@code null} or empty token is dropped when empty tokens are ignored; otherwise it is stored as
     * {@code null} when empty-as-null is enabled, or unchanged when it is not.
     * </p>
     *
     * @param list
     *            the list to add to
     * @param tok
     *            the token to add
     */
    private void addToken(final List<String> list, String tok) {
        final boolean empty = tok == null || tok.isEmpty();
        if (empty && isIgnoreEmptyTokens()) {
            return;
        }
        if (empty && isEmptyTokenAsNull()) {
            tok = null;
        }
        list.add(tok);
    }

    /**
     * Performs tokenization on first use; does nothing when the tokens have already been computed.
     */
    private void checkTokenized() {
        if (tokens != null) {
            return;
        }
        // tokenize is still called without input, as a subclass may do some work
        final List<String> split = chars == null
                ? tokenize(null, 0, 0)
                : tokenize(chars, 0, chars.length);
        tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
     * list. If a {@link CloneNotSupportedException} is caught, {@code null} is returned instead.
     *
     * @return a new instance of this Tokenizer which has been reset, or {@code null} if cloning failed.
     */
    @Override
    public Object clone() {
        Object copy;
        try {
            copy = cloneReset();
        } catch (final CloneNotSupportedException ex) {
            copy = null;
        }
        return copy;
    }

    /**
     * Creates a new instance of this Tokenizer with its own copy of the input characters. The new instance is reset
     * so that it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException
     *             if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StringTokenizer copy = (StringTokenizer) super.clone();
        final char[] source = copy.chars;
        if (source != null) {
            // the shallow clone shares the array; give the copy its own
            copy.chars = source.clone();
        }
        copy.reset();
        return copy;
    }

    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return The string content being parsed, or {@code null} if no text has been set
     */
    public String getContent() {
        return chars == null ? null : new String(chars);
    }

    /**
     * Gets the field delimiter matcher, which is used to separate one token from another.
     *
     * @return The delimiter matcher in use
     */
    public StringMatcher getDelimiterMatcher() {
        return delimMatcher;
    }

    /**
     * Gets the ignored character matcher.
     * <p>
     * Characters it matches are skipped when parsing the String, unless they are within a quoted region. The
     * default value is not to ignore anything.
     * </p>
     *
     * @return The ignored matcher in use
     */
    public StringMatcher getIgnoredMatcher() {
        return this.ignoredMatcher;
    }

    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
     * Unless a quote is configured, the matcher in use is the none matcher that the field is initialised with.
     * </p>
     *
     * @return The quote matcher in use
     */
    public StringMatcher getQuoteMatcher() {
        return this.quoteMatcher;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array, tokenizing first if necessary.
     *
     * @return The tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return Arrays.copyOf(tokens, tokens.length);
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list, tokenizing first if necessary.
     *
     * @return The tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> copy = new ArrayList<>(tokens.length);
        Collections.addAll(copy, tokens);
        return copy;
    }

    /**
     * Gets the trimmer character matcher.
     * <p>
     * Characters it matches are trimmed off on each side of the delimiter until the token or quote is found. The
     * default value is not to trim anything.
     * </p>
     *
     * @return The trimmer matcher in use
     */
    public StringMatcher getTrimmerMatcher() {
        return this.trimmerMatcher;
    }

    /**
     * Tests whether there are any more tokens, tokenizing the input first if that has not happened yet.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokens.length > tokenPos;
    }

    /**
     * Tests whether there are any previous tokens that can be iterated to, tokenizing the input first if that has
     * not happened yet.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return 0 < tokenPos;
    }

    /**
     * Tests whether the tokenizer currently returns empty tokens as null. The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return emptyAsNull;
    }

    /**
     * Tests whether the tokenizer currently ignores empty tokens. The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return this.ignoreEmptyTokens;
    }

    /**
     * Tests if the characters at the index specified match the quote already matched in readNextToken().
     * <p>
     * The comparison is made character by character against the quote found at {@code quoteStart}; running past
     * {@code len} counts as a mismatch. A {@code quoteLen} of 0 always matches.
     * </p>
     *
     * @param srcChars
     *            the character array being tokenized
     * @param pos
     *            the position to check for a quote
     * @param len
     *            the length of the character array being tokenized
     * @param quoteStart
     *            the start position of the matched quote, 0 if no quoting
     * @param quoteLen
     *            the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart,
            final int quoteLen) {
        int offset = 0;
        while (offset < quoteLen) {
            final int candidate = pos + offset;
            if (candidate >= len) {
                return false;
            }
            if (srcChars[candidate] != srcChars[quoteStart + offset]) {
                return false;
            }
            offset++;
        }
        return true;
    }

    /**
     * Gets the next token and advances the iteration position.
     *
     * @return The next String token
     * @throws NoSuchElementException
     *             if there are no more elements
     */
    @Override
    public String next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        return tokens[tokenPos++];
    }

    /**
     * Gets the index of the next token to return, which is the current iteration position.
     *
     * @return The next token index
     */
    @Override
    public int nextIndex() {
        return this.tokenPos;
    }

    /**
     * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing
     * {@link NoSuchElementException} when no tokens remain.
     *
     * @return The next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        return hasNext() ? tokens[tokenPos++] : null;
    }

    /**
     * Gets the previous token, moving the iteration position back by one.
     *
     * @return The previous token
     * @throws NoSuchElementException
     *             if there is no previous element
     */
    @Override
    public String previous() {
        if (!hasPrevious()) {
            throw new NoSuchElementException();
        }
        return tokens[--tokenPos];
    }

    /**
     * Gets the index of the previous token, which is one less than the current iteration position.
     *
     * @return The previous token index
     */
    @Override
    public int previousIndex() {
        final int current = tokenPos;
        return current - 1;
    }

    /**
     * Gets the previous token from the String, moving the iteration position back by one. Unlike
     * {@link #previous()}, it returns null rather than throwing when there is no previous token.
     *
     * @return The previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        return hasPrevious() ? tokens[--tokenPos] : null;
    }

    /**
     * Reads the next token starting at the given position and adds it to the token list.
     * <p>
     * Leading characters matched by the ignored or trimmer matcher are skipped, stopping as soon as the position
     * also matches the delimiter or quote matcher. Reaching the end of the input, or finding a delimiter straight
     * away, produces an empty token; otherwise the field is read by {@code readWithQuotes}.
     * </p>
     *
     * @param srcChars
     *            the character array being tokenized
     * @param start
     *            the first character of field
     * @param len
     *            the length of the character array being tokenized
     * @param workArea
     *            a temporary work area
     * @param tokenList
     *            the list of parsed tokens
     * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end of
     *         string found
     */
    private int readNextToken(final char[] srcChars, final int start, final int len,
            final TextStringBuilder workArea, final List<String> tokenList) {
        // skip leading ignored/trimmed characters, but never step over a delimiter or an opening quote
        int pos = start;
        while (pos < len) {
            final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, pos, len);
            final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, pos, len);
            final int skipLen = Math.max(ignoredLen, trimmedLen);
            if (skipLen == 0) {
                break;
            }
            if (getDelimiterMatcher().isMatch(srcChars, pos, pos, len) > 0) {
                break;
            }
            if (getQuoteMatcher().isMatch(srcChars, pos, pos, len) > 0) {
                break;
            }
            pos += skipLen;
        }

        // input exhausted: record an empty token and signal the end
        if (pos >= len) {
            addToken(tokenList, StringUtils.EMPTY);
            return -1;
        }

        // delimiter found straight away: the field is empty
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, pos, len);
        if (delimLen > 0) {
            addToken(tokenList, StringUtils.EMPTY);
            return pos + delimLen;
        }

        // a real field: start in quoting mode when it opens with a quote, otherwise read it unquoted
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, pos, pos, len);
        final boolean quoted = quoteLen > 0;
        return quoted
                ? readWithQuotes(srcChars, pos + quoteLen, len, workArea, tokenList, pos, quoteLen)
                : readWithQuotes(srcChars, pos, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token and adds it to the token list.
     * <p>
     * The token text is accumulated in {@code workArea}. {@code trimStart} tracks the length of the work area up to
     * the last character that must be kept, so that trailing characters matched by the trimmer are cut off when the
     * token is added.
     * </p>
     *
     * @param srcChars
     *            the character array being tokenized
     * @param start
     *            the first character of field
     * @param len
     *            the length of the character array being tokenized
     * @param workArea
     *            a temporary work area
     * @param tokenList
     *            the list of parsed tokens
     * @param quoteStart
     *            the start position of the matched quote, 0 if no quoting
     * @param quoteLen
     *            the length of the matched quote, 0 if no quoting
     * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end
     *         of string found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final TextStringBuilder workArea,
            final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        // start in quoting mode only when the caller already consumed an opening quote
        boolean quoting = quoteLen > 0;
        // length of the work area that survives trailing trimming
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        // the escaped quote is real content, so it must not be trimmed
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if its at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }
            }
            // copy a regular character: any non-quote character while quoting, or an
            // unmatched character outside quotes; it is content, so extend trimStart over it
            workArea.append(srcChars[pos++]);
            trimStart = workArea.size();
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation; tokens cannot be
     * removed through the iterator.
     *
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed. The input text is kept, so
     * the tokens are computed again on next use.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     * </p>
     *
     * @return this, to enable chaining
     */
    public StringTokenizer reset() {
        // discard the cached tokens and rewind the iteration position
        this.tokens = null;
        this.tokenPos = 0;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
     * same settings on multiple input lines.
     *
     * @param input
     *            the new character array to tokenize; a copy of the array is stored, null sets no text to parse
     * @return this, to enable chaining
     */
    public StringTokenizer reset(final char[] input) {
        reset();
        if (input == null) {
            this.chars = null;
        } else {
            // keep a private copy so later changes to the caller's array do not affect tokenization
            this.chars = input.clone();
        }
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
     * same settings on multiple input lines.
     *
     * @param input
     *            the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StringTokenizer reset(final String input) {
        reset();
        if (input == null) {
            this.chars = null;
        } else {
            this.chars = input.toCharArray();
        }
        return this;
    }

    /**
     * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation; tokens cannot be
     * replaced through the iterator.
     *
     * @param obj
     *            this parameter ignored.
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Sets the field delimiter character by installing a character matcher for it as the delimiter matcher.
     *
     * @param delim
     *            the delimiter character to use
     * @return this, to enable chaining
     */
    public StringTokenizer setDelimiterChar(final char delim) {
        final StringMatcher matcher = StringMatcherFactory.INSTANCE.charMatcher(delim);
        return setDelimiterMatcher(matcher);
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     * </p>
     *
     * @param delim
     *            the delimiter matcher to use, null installs the none matcher
     * @return this, to enable chaining
     */
    public StringTokenizer setDelimiterMatcher(final StringMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter to a string.
     * <p>
     * The string is wrapped in a string matcher and passed to {@link #setDelimiterMatcher(StringMatcher)}.
     * </p>
     *
     * @param delim
     *            the delimiter string to use
     * @return this, to enable chaining
     */
    public StringTokenizer setDelimiterString(final String delim) {
        final StringMatcher stringDelimiter = StringMatcherFactory.INSTANCE.stringMatcher(delim);
        return setDelimiterMatcher(stringDelimiter);
    }

    /**
     * Configures whether empty tokens are returned as {@code null}. The default for this property is false.
     *
     * @param emptyAsNull
     *            {@code true} to return empty tokens as {@code null}
     * @return this, to enable chaining
     */
    public StringTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        // store the flag on this instance
        this.emptyAsNull = emptyAsNull;

        return this;
    }

    /**
     * Sets a single character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is within a quoted region. It is wrapped in a
     * character matcher and passed to {@link #setIgnoredMatcher(StringMatcher)}.
     * </p>
     *
     * @param ignored
     *            the ignored character to use
     * @return this, to enable chaining
     */
    public StringTokenizer setIgnoredChar(final char ignored) {
        final StringMatcher charToIgnore = StringMatcherFactory.INSTANCE.charMatcher(ignored);
        return setIgnoredMatcher(charToIgnore);
    }

    /**
     * Sets the matcher that identifies characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are within a quoted region.
     * </p>
     *
     * @param ignored
     *            the ignored matcher to use, {@code null} leaves the current matcher unchanged
     * @return this, to enable chaining
     */
    public StringTokenizer setIgnoredMatcher(final StringMatcher ignored) {
        if (ignored == null) {
            // nothing to install; keep the existing matcher
            return this;
        }
        this.ignoredMatcher = ignored;
        return this;
    }

    /**
     * Configures whether empty tokens are ignored and not returned. The default for this property is true.
     *
     * @param ignoreEmptyTokens
     *            {@code true} if empty tokens should not be returned
     * @return this, to enable chaining
     */
    public StringTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        // store the flag on this instance
        this.ignoreEmptyTokens = ignoreEmptyTokens;

        return this;
    }

    /**
     * Sets the quote to a single character.
     * <p>
     * The quote character is used to wrap data between the tokens, which enables delimiters to be entered as data.
     * It is wrapped in a character matcher and passed to {@link #setQuoteMatcher(StringMatcher)}.
     * </p>
     *
     * @param quote
     *            the quote character to use
     * @return this, to enable chaining
     */
    public StringTokenizer setQuoteChar(final char quote) {
        final StringMatcher charQuote = StringMatcherFactory.INSTANCE.charMatcher(quote);
        return setQuoteMatcher(charQuote);
    }

    /**
     * Sets the matcher that recognises quotes.
     * <p>
     * The quote character is used to wrap data between the tokens, which enables delimiters to be entered as data.
     * </p>
     *
     * @param quote
     *            the quote matcher to use, {@code null} leaves the current matcher unchanged
     * @return this, to enable chaining
     */
    public StringTokenizer setQuoteMatcher(final StringMatcher quote) {
        if (quote == null) {
            // nothing to install; keep the existing matcher
            return this;
        }
        this.quoteMatcher = quote;
        return this;
    }

    /**
     * Sets the matcher that identifies characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter until the token or quote is found.
     * </p>
     *
     * @param trimmer
     *            the trimmer matcher to use, {@code null} leaves the current matcher unchanged
     * @return this, to enable chaining
     */
    public StringTokenizer setTrimmerMatcher(final StringMatcher trimmer) {
        if (trimmer == null) {
            // nothing to install; keep the existing matcher
            return this;
        }
        this.trimmerMatcher = trimmer;
        return this;
    }

    /**
     * Gets the number of tokens found in the String.
     * <p>
     * {@code checkTokenized()} is called first, before the token array is read.
     * </p>
     *
     * @return The number of matched tokens
     */
    public int size() {
        checkTokenized();
        final int tokenCount = tokens.length;
        return tokenCount;
    }

    /**
     * Internal method that performs the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method will be called automatically by other
     * (public) methods when required.
     * </p>
     * <p>
     * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass
     * could alter the character array, offset or count to be parsed, or call the tokenizer multiple times on multiple
     * strings. It is also possible to filter the results.
     * </p>
     * <p>
     * {@code StringTokenizer} will always pass a zero offset and a count equal to the length of the array to this
     * method, however a subclass may pass other values, or even an entirely different array.
     * </p>
     * <p>
     * The region parsed is {@code [offset, offset + count)}, truncated at the end of {@code srcChars}.
     * </p>
     *
     * @param srcChars
     *            the character array being tokenized, may be null
     * @param offset
     *            the start position within the character array, must be valid
     * @param count
     *            the number of characters to tokenize, starting at {@code offset}
     * @return The modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        // Exclusive end index of the region to parse. Previously 'count' itself was used as the end index, which is
        // only correct for a zero offset. The sum is computed as a long to avoid int overflow and clamped to the
        // array length so that a caller passing the array length as count with a non-zero offset stays in bounds.
        final int end = (int) Math.min((long) offset + count, srcChars.length);
        final TextStringBuilder buf = new TextStringBuilder();
        final List<String> tokenList = new ArrayList<>();
        int pos = offset;

        // loop around the entire region; a negative position also ends the loop
        while (pos >= 0 && pos < end) {
            // find next token
            // NOTE(review): assumes readNextToken treats its third argument as an exclusive end index, which is how
            // the previous loop bound used it - confirm against readNextToken
            pos = readNextToken(srcChars, pos, end, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= end) {
                addToken(tokenList, StringUtils.EMPTY);
            }
        }
        return tokenList;
    }

    /**
     * Gets a description of this tokenizer.
     * <p>
     * While the token array is still {@code null} a fixed placeholder is returned; otherwise the result is
     * {@code "StringTokenizer"} followed by the string form of {@link #getTokenList()}.
     * </p>
     *
     * @return The placeholder text, or the class name followed by the token list
     */
    @Override
    public String toString() {
        return tokens == null
                ? "StringTokenizer[not tokenized yet]"
                : "StringTokenizer" + getTokenList();
    }

}