001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text;
018
019import java.util.ArrayList;
020import java.util.Arrays;
021import java.util.Collections;
022import java.util.List;
023import java.util.ListIterator;
024import java.util.NoSuchElementException;
025
026import org.apache.commons.lang3.ArrayUtils;
027import org.apache.commons.lang3.StringUtils;
028import org.apache.commons.text.matcher.StringMatcher;
029import org.apache.commons.text.matcher.StringMatcherFactory;
030
031/**
032 * Tokenizes a string based on delimiters (separators) and supporting quoting and ignored character concepts.
033 * <p>
034 * This class can split a String into many smaller strings. It aims to do a similar job to
035 * {@link java.util.StringTokenizer StringTokenizer}, however it offers much more control and flexibility including
036 * implementing the {@code ListIterator} interface. By default, it is set up like {@code StringTokenizer}.
037 * <p>
038 * The input String is split into a number of <em>tokens</em>. Each token is separated from the next String by a
039 * <em>delimiter</em>. One or more delimiter characters must be specified.
040 * <p>
041 * Each token may be surrounded by quotes. The <em>quote</em> matcher specifies the quote character(s). A quote may be
042 * escaped within a quoted section by duplicating itself.
043 * <p>
044 * Between each token and the delimiter are potentially characters that need trimming. The <em>trimmer</em> matcher
045 * specifies these characters. One usage might be to trim whitespace characters.
046 * <p>
047 * At any point outside the quotes there might potentially be invalid characters. The <em>ignored</em> matcher specifies
048 * these characters to be removed. One usage might be to remove new line characters.
049 * <p>
050 * Empty tokens may be removed or returned as null.
051 *
052 * <pre>
053 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
054 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , "b", ", c" (quoted text untouched, unquoted text trimmed)
056 * </pre>
057 *
058 * <table>
059 * <caption>StringTokenizer properties and options</caption>
060 * <tr>
061 * <th>Property</th>
062 * <th>Type</th>
063 * <th>Default</th>
064 * </tr>
065 * <tr>
066 * <td>delim</td>
067 * <td>CharSetMatcher</td>
068 * <td>{ \t\n\r\f}</td>
069 * </tr>
070 * <tr>
071 * <td>quote</td>
072 * <td>NoneMatcher</td>
073 * <td>{}</td>
074 * </tr>
075 * <tr>
076 * <td>ignore</td>
077 * <td>NoneMatcher</td>
078 * <td>{}</td>
079 * </tr>
080 * <tr>
081 * <td>emptyTokenAsNull</td>
082 * <td>boolean</td>
083 * <td>false</td>
084 * </tr>
085 * <tr>
086 * <td>ignoreEmptyTokens</td>
087 * <td>boolean</td>
088 * <td>true</td>
089 * </tr>
090 * </table>
091 *
092 * @since 1.3
093 */
094public class StringTokenizer implements ListIterator<String>, Cloneable {
095
096    /** Comma separated values tokenizer internal variable. */
097    private static final StringTokenizer CSV_TOKENIZER_PROTOTYPE;
098
099    /** Tab separated values tokenizer internal variable. */
100    private static final StringTokenizer TSV_TOKENIZER_PROTOTYPE;
101
102    static {
103        CSV_TOKENIZER_PROTOTYPE = new StringTokenizer();
104        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StringMatcherFactory.INSTANCE.commaMatcher());
105        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
106        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
107        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
108        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
109        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
110
111        TSV_TOKENIZER_PROTOTYPE = new StringTokenizer();
112        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StringMatcherFactory.INSTANCE.tabMatcher());
113        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
114        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
115        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
116        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
117        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
118    }
119
120    /**
121     * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
122     *
123     * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
124     */
125    private static StringTokenizer getCSVClone() {
126        return (StringTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
127    }
128
129    /**
130     * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
131     * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
132     * setTrimmer method).
133     * <p>
134     * You must call a "reset" method to set the string which you want to parse.
135     * </p>
136     *
137     * @return a new tokenizer instance which parses Comma Separated Value strings
138     */
139    public static StringTokenizer getCSVInstance() {
140        return getCSVClone();
141    }
142
143    /**
144     * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
145     * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
146     * setTrimmer method).
147     *
148     * @param input
149     *            the text to parse
150     * @return a new tokenizer instance which parses Comma Separated Value strings
151     */
152    public static StringTokenizer getCSVInstance(final char[] input) {
153        return getCSVClone().reset(input);
154    }
155
156    /**
157     * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
158     * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
159     * setTrimmer method).
160     *
161     * @param input
162     *            the text to parse
163     * @return a new tokenizer instance which parses Comma Separated Value strings
164     */
165    public static StringTokenizer getCSVInstance(final String input) {
166        return getCSVClone().reset(input);
167    }
168
169    /**
170     * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
171     *
172     * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
173     */
174    private static StringTokenizer getTSVClone() {
175        return (StringTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
176    }
177
178    /**
179     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be
180     * trim whitespace from both ends (which can be overridden with the setTrimmer method).
181     * <p>
182     * You must call a "reset" method to set the string which you want to parse.
183     * </p>
184     *
185     * @return a new tokenizer instance which parses Tab Separated Value strings.
186     */
187    public static StringTokenizer getTSVInstance() {
188        return getTSVClone();
189    }
190
191    /**
192     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be
193     * trim whitespace from both ends (which can be overridden with the setTrimmer method).
194     *
195     * @param input
196     *            the string to parse
197     * @return a new tokenizer instance which parses Tab Separated Value strings.
198     */
199    public static StringTokenizer getTSVInstance(final char[] input) {
200        return getTSVClone().reset(input);
201    }
202
203    /**
204     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be
205     * trim whitespace from both ends (which can be overridden with the setTrimmer method).
206     *
207     * @param input
208     *            the string to parse
209     * @return a new tokenizer instance which parses Tab Separated Value strings.
210     */
211    public static StringTokenizer getTSVInstance(final String input) {
212        return getTSVClone().reset(input);
213    }
214
    /** The text to work on; {@code null} when no input has been set. */
    private char[] chars;

    /** The parsed tokens; {@code null} until tokenization has run, and again after {@link #reset()}. */
    private String[] tokens;

    /** The current iteration position: index into {@code tokens} of the token {@link #next()} returns. */
    private int tokenPos;

    /** The delimiter matcher; defaults to the factory's split matcher. */
    private StringMatcher delimMatcher = StringMatcherFactory.INSTANCE.splitMatcher();

    /** The quote matcher; defaults to the none matcher, so quoting is off unless configured. */
    private StringMatcher quoteMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** The ignored matcher; defaults to the none matcher, so nothing is ignored unless configured. */
    private StringMatcher ignoredMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** The trimmer matcher; defaults to the none matcher, so nothing is trimmed unless configured. */
    private StringMatcher trimmerMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** Whether to return empty tokens as null; {@code false} unless changed. */
    private boolean emptyAsNull;

    /** Whether to ignore (drop) empty tokens; {@code true} unless changed. */
    private boolean ignoreEmptyTokens = true;
241
242    /**
243     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer, but with no text to
244     * tokenize.
245     * <p>
246     * This constructor is normally used with {@link #reset(String)}.
247     * </p>
248     */
249    public StringTokenizer() {
250        this.chars = null;
251    }
252
253    /**
254     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
255     *
256     * @param input
257     *            the string which is to be parsed, not cloned
258     */
259    public StringTokenizer(final char[] input) {
260        this.chars = input != null ? input.clone() : null;
261    }
262
263    /**
264     * Constructs a tokenizer splitting on the specified character.
265     *
266     * @param input
267     *            the string which is to be parsed, not cloned
268     * @param delim
269     *            the field delimiter character
270     */
271    public StringTokenizer(final char[] input, final char delim) {
272        this(input);
273        setDelimiterChar(delim);
274    }
275
276    /**
277     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
278     * quote character.
279     *
280     * @param input
281     *            the string which is to be parsed, not cloned
282     * @param delim
283     *            the field delimiter character
284     * @param quote
285     *            the field quoted string character
286     */
287    public StringTokenizer(final char[] input, final char delim, final char quote) {
288        this(input, delim);
289        setQuoteChar(quote);
290    }
291
292    /**
293     * Constructs a tokenizer splitting on the specified string.
294     *
295     * @param input
296     *            the string which is to be parsed, not cloned
297     * @param delim
298     *            the field delimiter string
299     */
300    public StringTokenizer(final char[] input, final String delim) {
301        this(input);
302        setDelimiterString(delim);
303    }
304
305    /**
306     * Constructs a tokenizer splitting using the specified delimiter matcher.
307     *
308     * @param input
309     *            the string which is to be parsed, not cloned
310     * @param delim
311     *            the field delimiter matcher
312     */
313    public StringTokenizer(final char[] input, final StringMatcher delim) {
314        this(input);
315        setDelimiterMatcher(delim);
316    }
317
318    /**
319     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
320     * quote matcher.
321     *
322     * @param input
323     *            the string which is to be parsed, not cloned
324     * @param delim
325     *            the field delimiter character
326     * @param quote
327     *            the field quoted string character
328     */
329    public StringTokenizer(final char[] input, final StringMatcher delim, final StringMatcher quote) {
330        this(input, delim);
331        setQuoteMatcher(quote);
332    }
333
334    /**
335     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
336     *
337     * @param input
338     *            the string which is to be parsed
339     */
340    public StringTokenizer(final String input) {
341        this.chars = input != null ? input.toCharArray() : null;
342    }
343
344    /**
345     * Constructs a tokenizer splitting on the specified delimiter character.
346     *
347     * @param input
348     *            the string which is to be parsed
349     * @param delim
350     *            the field delimiter character
351     */
352    public StringTokenizer(final String input, final char delim) {
353        this(input);
354        setDelimiterChar(delim);
355    }
356
357    /**
358     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
359     * quote character.
360     *
361     * @param input
362     *            the string which is to be parsed
363     * @param delim
364     *            the field delimiter character
365     * @param quote
366     *            the field quoted string character
367     */
368    public StringTokenizer(final String input, final char delim, final char quote) {
369        this(input, delim);
370        setQuoteChar(quote);
371    }
372
373    /**
374     * Constructs a tokenizer splitting on the specified delimiter string.
375     *
376     * @param input
377     *            the string which is to be parsed
378     * @param delim
379     *            the field delimiter string
380     */
381    public StringTokenizer(final String input, final String delim) {
382        this(input);
383        setDelimiterString(delim);
384    }
385
386    /**
387     * Constructs a tokenizer splitting using the specified delimiter matcher.
388     *
389     * @param input
390     *            the string which is to be parsed
391     * @param delim
392     *            the field delimiter matcher
393     */
394    public StringTokenizer(final String input, final StringMatcher delim) {
395        this(input);
396        setDelimiterMatcher(delim);
397    }
398
399    /**
400     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
401     * quote matcher.
402     *
403     * @param input
404     *            the string which is to be parsed
405     * @param delim
406     *            the field delimiter matcher
407     * @param quote
408     *            the field quoted string matcher
409     */
410    public StringTokenizer(final String input, final StringMatcher delim, final StringMatcher quote) {
411        this(input, delim);
412        setQuoteMatcher(quote);
413    }
414
415    /**
416     * Unsupported ListIterator operation.
417     *
418     * @param obj
419     *            this parameter ignored.
420     * @throws UnsupportedOperationException
421     *             always
422     */
423    @Override
424    public void add(final String obj) {
425        throw new UnsupportedOperationException("add() is unsupported");
426    }
427
428    /**
429     * Adds a token to a list, paying attention to the parameters we've set.
430     *
431     * @param list
432     *            the list to add to
433     * @param tok
434     *            the token to add
435     */
436    private void addToken(final List<String> list, String tok) {
437        if (tok == null || tok.isEmpty()) {
438            if (isIgnoreEmptyTokens()) {
439                return;
440            }
441            if (isEmptyTokenAsNull()) {
442                tok = null;
443            }
444        }
445        list.add(tok);
446    }
447
448    /**
449     * Checks if tokenization has been done, and if not then do it.
450     */
451    private void checkTokenized() {
452        if (tokens == null) {
453            final List<String> split;
454            if (chars == null) {
455                // still call tokenize as subclass may do some work
456                split = tokenize(null, 0, 0);
457            } else {
458                split = tokenize(chars, 0, chars.length);
459            }
460            tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
461        }
462    }
463
464    /**
465     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
466     * list. If a {@link CloneNotSupportedException} is caught, return {@code null}.
467     *
468     * @return a new instance of this Tokenizer which has been reset.
469     */
470    @Override
471    public Object clone() {
472        try {
473            return cloneReset();
474        } catch (final CloneNotSupportedException ex) {
475            return null;
476        }
477    }
478
479    /**
480     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
481     * list.
482     *
483     * @return a new instance of this Tokenizer which has been reset.
484     * @throws CloneNotSupportedException
485     *             if there is a problem cloning
486     */
487    Object cloneReset() throws CloneNotSupportedException {
488        // this method exists to enable 100% test coverage
489        final StringTokenizer cloned = (StringTokenizer) super.clone();
490        if (cloned.chars != null) {
491            cloned.chars = cloned.chars.clone();
492        }
493        cloned.reset();
494        return cloned;
495    }
496
497    /**
498     * Gets the String content that the tokenizer is parsing.
499     *
500     * @return The string content being parsed
501     */
502    public String getContent() {
503        if (chars == null) {
504            return null;
505        }
506        return new String(chars);
507    }
508
509    /**
510     * Gets the field delimiter matcher.
511     *
512     * @return The delimiter matcher in use
513     */
514    public StringMatcher getDelimiterMatcher() {
515        return this.delimMatcher;
516    }
517
518    /**
519     * Gets the ignored character matcher.
520     * <p>
521     * These characters are ignored when parsing the String, unless they are within a quoted region. The default value
522     * is not to ignore anything.
523     * </p>
524     *
525     * @return The ignored matcher in use
526     */
527    public StringMatcher getIgnoredMatcher() {
528        return ignoredMatcher;
529    }
530
531    /**
532     * Gets the quote matcher currently in use.
533     * <p>
534     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. The
535     * default value is '"' (double quote).
536     * </p>
537     *
538     * @return The quote matcher in use
539     */
540    public StringMatcher getQuoteMatcher() {
541        return quoteMatcher;
542    }
543
544    /**
545     * Gets a copy of the full token list as an independent modifiable array.
546     *
547     * @return The tokens as a String array
548     */
549    public String[] getTokenArray() {
550        checkTokenized();
551        return tokens.clone();
552    }
553
554    /**
555     * Gets a copy of the full token list as an independent modifiable list.
556     *
557     * @return The tokens as a String list
558     */
559    public List<String> getTokenList() {
560        checkTokenized();
561        return new ArrayList<>(Arrays.asList(tokens));
562    }
563
564    /**
565     * Gets the trimmer character matcher.
566     * <p>
567     * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default
568     * value is not to trim anything.
569     * </p>
570     *
571     * @return The trimmer matcher in use
572     */
573    public StringMatcher getTrimmerMatcher() {
574        return trimmerMatcher;
575    }
576
577    /**
578     * Tests whether there are any more tokens.
579     *
580     * @return true if there are more tokens
581     */
582    @Override
583    public boolean hasNext() {
584        checkTokenized();
585        return tokenPos < tokens.length;
586    }
587
588    /**
589     * Tests whether there are any previous tokens that can be iterated to.
590     *
591     * @return true if there are previous tokens
592     */
593    @Override
594    public boolean hasPrevious() {
595        checkTokenized();
596        return tokenPos > 0;
597    }
598
599    /**
600     * Tests whether the tokenizer currently returns empty tokens as null. The default for this property is false.
601     *
602     * @return true if empty tokens are returned as null
603     */
604    public boolean isEmptyTokenAsNull() {
605        return this.emptyAsNull;
606    }
607
608    /**
609     * Tests whether the tokenizer currently ignores empty tokens. The default for this property is true.
610     *
611     * @return true if empty tokens are not returned
612     */
613    public boolean isIgnoreEmptyTokens() {
614        return ignoreEmptyTokens;
615    }
616
617    /**
618     * Tests if the characters at the index specified match the quote already matched in readNextToken().
619     *
620     * @param srcChars
621     *            the character array being tokenized
622     * @param pos
623     *            the position to check for a quote
624     * @param len
625     *            the length of the character array being tokenized
626     * @param quoteStart
627     *            the start position of the matched quote, 0 if no quoting
628     * @param quoteLen
629     *            the length of the matched quote, 0 if no quoting
630     * @return true if a quote is matched
631     */
632    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart,
633            final int quoteLen) {
634        for (int i = 0; i < quoteLen; i++) {
635            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
636                return false;
637            }
638        }
639        return true;
640    }
641
642    /**
643     * Gets the next token.
644     *
645     * @return The next String token
646     * @throws NoSuchElementException
647     *             if there are no more elements
648     */
649    @Override
650    public String next() {
651        if (hasNext()) {
652            return tokens[tokenPos++];
653        }
654        throw new NoSuchElementException();
655    }
656
657    /**
658     * Gets the index of the next token to return.
659     *
660     * @return The next token index
661     */
662    @Override
663    public int nextIndex() {
664        return tokenPos;
665    }
666
667    /**
668     * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing
669     * {@link NoSuchElementException} when no tokens remain.
670     *
671     * @return The next sequential token, or null when no more tokens are found
672     */
673    public String nextToken() {
674        if (hasNext()) {
675            return tokens[tokenPos++];
676        }
677        return null;
678    }
679
680    /**
681     * Gets the token previous to the last returned token.
682     *
683     * @return The previous token
684     */
685    @Override
686    public String previous() {
687        if (hasPrevious()) {
688            return tokens[--tokenPos];
689        }
690        throw new NoSuchElementException();
691    }
692
693    /**
694     * Gets the index of the previous token.
695     *
696     * @return The previous token index
697     */
698    @Override
699    public int previousIndex() {
700        return tokenPos - 1;
701    }
702
703    /**
704     * Gets the previous token from the String.
705     *
706     * @return The previous sequential token, or null when no more tokens are found
707     */
708    public String previousToken() {
709        if (hasPrevious()) {
710            return tokens[--tokenPos];
711        }
712        return null;
713    }
714
715    /**
716     * Reads character by character through the String to get the next token.
717     *
718     * @param srcChars
719     *            the character array being tokenized
720     * @param start
721     *            the first character of field
722     * @param len
723     *            the length of the character array being tokenized
724     * @param workArea
725     *            a temporary work area
726     * @param tokenList
727     *            the list of parsed tokens
728     * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end of
729     *         string found
730     */
731    private int readNextToken(final char[] srcChars, int start, final int len, final TextStringBuilder workArea,
732            final List<String> tokenList) {
733        // skip all leading whitespace, unless it is the
734        // field delimiter or the quote character
735        while (start < len) {
736            final int removeLen = Math.max(getIgnoredMatcher().isMatch(srcChars, start, start, len),
737                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
738            if (removeLen == 0 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
739                    || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
740                break;
741            }
742            start += removeLen;
743        }
744
745        // handle reaching end
746        if (start >= len) {
747            addToken(tokenList, StringUtils.EMPTY);
748            return -1;
749        }
750
751        // handle empty token
752        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
753        if (delimLen > 0) {
754            addToken(tokenList, StringUtils.EMPTY);
755            return start + delimLen;
756        }
757
758        // handle found token
759        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
760        if (quoteLen > 0) {
761            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
762        }
763        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
764    }
765
766    /**
767     * Reads a possibly quoted string token.
768     *
769     * @param srcChars
770     *            the character array being tokenized
771     * @param start
772     *            the first character of field
773     * @param len
774     *            the length of the character array being tokenized
775     * @param workArea
776     *            a temporary work area
777     * @param tokenList
778     *            the list of parsed tokens
779     * @param quoteStart
780     *            the start position of the matched quote, 0 if no quoting
781     * @param quoteLen
782     *            the length of the matched quote, 0 if no quoting
783     * @return The starting position of the next field (the character immediately after the delimiter, or if end of
784     *         string found, then the length of string
785     */
786    private int readWithQuotes(final char[] srcChars, final int start, final int len, final TextStringBuilder workArea,
787            final List<String> tokenList, final int quoteStart, final int quoteLen) {
788        // Loop until we've found the end of the quoted
789        // string or the end of the input
790        workArea.clear();
791        int pos = start;
792        boolean quoting = quoteLen > 0;
793        int trimStart = 0;
794
795        while (pos < len) {
796            // quoting mode can occur several times throughout a string
797            // we must switch between quoting and non-quoting until we
798            // encounter a non-quoted delimiter, or end of string
799            if (quoting) {
800                // In quoting mode
801
802                // If we've found a quote character, see if it's
803                // followed by a second quote. If so, then we need
804                // to actually put the quote character into the token
805                // rather than end the token.
806                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
807                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
808                        // matched pair of quotes, thus an escaped quote
809                        workArea.append(srcChars, pos, quoteLen);
810                        pos += quoteLen * 2;
811                        trimStart = workArea.size();
812                        continue;
813                    }
814
815                    // end of quoting
816                    quoting = false;
817                    pos += quoteLen;
818                    continue;
819                }
820
821            } else {
822                // Not in quoting mode
823
824                // check for delimiter, and thus end of token
825                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
826                if (delimLen > 0) {
827                    // return condition when end of token found
828                    addToken(tokenList, workArea.substring(0, trimStart));
829                    return pos + delimLen;
830                }
831
832                // check for quote, and thus back into quoting mode
833                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
834                    quoting = true;
835                    pos += quoteLen;
836                    continue;
837                }
838
839                // check for ignored (outside quotes), and ignore
840                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
841                if (ignoredLen > 0) {
842                    pos += ignoredLen;
843                    continue;
844                }
845
846                // check for trimmed character
847                // don't yet know if its at the end, so copy to workArea
848                // use trimStart to keep track of trim at the end
849                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
850                if (trimmedLen > 0) {
851                    workArea.append(srcChars, pos, trimmedLen);
852                    pos += trimmedLen;
853                    continue;
854                }
855            }
856            // copy regular character from inside quotes
857            workArea.append(srcChars[pos++]);
858            trimStart = workArea.size();
859        }
860
861        // return condition when end of string found
862        addToken(tokenList, workArea.substring(0, trimStart));
863        return -1;
864    }
865
866    /**
867     * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
868     *
869     * @throws UnsupportedOperationException
870     *             always
871     */
872    @Override
873    public void remove() {
874        throw new UnsupportedOperationException("remove() is unsupported");
875    }
876
877    /**
878     * Resets this tokenizer, forgetting all parsing and iteration already completed.
879     * <p>
880     * This method allows the same tokenizer to be reused for the same String.
881     * </p>
882     *
883     * @return this, to enable chaining
884     */
885    public StringTokenizer reset() {
886        tokenPos = 0;
887        tokens = null;
888        return this;
889    }
890
891    /**
892     * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
893     * same settings on multiple input lines.
894     *
895     * @param input
896     *            the new character array to tokenize, not cloned, null sets no text to parse
897     * @return this, to enable chaining
898     */
899    public StringTokenizer reset(final char[] input) {
900        reset();
901        this.chars = input != null ? input.clone() : null;
902        return this;
903    }
904
905    /**
906     * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
907     * same settings on multiple input lines.
908     *
909     * @param input
910     *            the new string to tokenize, null sets no text to parse
911     * @return this, to enable chaining
912     */
913    public StringTokenizer reset(final String input) {
914        reset();
915        this.chars = input != null ? input.toCharArray() : null;
916        return this;
917    }
918
919    /**
920     * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
921     *
922     * @param obj
923     *            this parameter ignored.
924     * @throws UnsupportedOperationException
925     *             always
926     */
927    @Override
928    public void set(final String obj) {
929        throw new UnsupportedOperationException("set() is unsupported");
930    }
931
932    /**
933     * Sets the field delimiter character.
934     *
935     * @param delim
936     *            the delimiter character to use
937     * @return this, to enable chaining
938     */
939    public StringTokenizer setDelimiterChar(final char delim) {
940        return setDelimiterMatcher(StringMatcherFactory.INSTANCE.charMatcher(delim));
941    }
942
943    /**
944     * Sets the field delimiter matcher.
945     * <p>
946     * The delimiter is used to separate one token from another.
947     * </p>
948     *
949     * @param delim
950     *            the delimiter matcher to use
951     * @return this, to enable chaining
952     */
953    public StringTokenizer setDelimiterMatcher(final StringMatcher delim) {
954        this.delimMatcher = delim == null ? StringMatcherFactory.INSTANCE.noneMatcher() : delim;
955        return this;
956    }
957
958    /**
959     * Sets the field delimiter string.
960     *
961     * @param delim
962     *            the delimiter string to use
963     * @return this, to enable chaining
964     */
965    public StringTokenizer setDelimiterString(final String delim) {
966        return setDelimiterMatcher(StringMatcherFactory.INSTANCE.stringMatcher(delim));
967    }
968
969    /**
970     * Sets whether the tokenizer should return empty tokens as null. The default for this property is false.
971     *
972     * @param emptyAsNull
973     *            whether empty tokens are returned as null
974     * @return this, to enable chaining
975     */
976    public StringTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
977        this.emptyAsNull = emptyAsNull;
978        return this;
979    }
980
981    /**
982     * Sets the character to ignore.
983     * <p>
984     * This character is ignored when parsing the String, unless it is within a quoted region.
985     * </p>
986     *
987     * @param ignored
988     *            the ignored character to use
989     * @return this, to enable chaining
990     */
991    public StringTokenizer setIgnoredChar(final char ignored) {
992        return setIgnoredMatcher(StringMatcherFactory.INSTANCE.charMatcher(ignored));
993    }
994
995    /**
996     * Sets the matcher for characters to ignore.
997     * <p>
998     * These characters are ignored when parsing the String, unless they are within a quoted region.
999     * </p>
1000     *
1001     * @param ignored
1002     *            the ignored matcher to use, null ignored
1003     * @return this, to enable chaining
1004     */
1005    public StringTokenizer setIgnoredMatcher(final StringMatcher ignored) {
1006        if (ignored != null) {
1007            this.ignoredMatcher = ignored;
1008        }
1009        return this;
1010    }
1011
1012    /**
1013     * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true.
1014     *
1015     * @param ignoreEmptyTokens
1016     *            whether empty tokens are not returned
1017     * @return this, to enable chaining
1018     */
1019    public StringTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
1020        this.ignoreEmptyTokens = ignoreEmptyTokens;
1021        return this;
1022    }
1023
1024    /**
1025     * Sets the quote character to use.
1026     * <p>
1027     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
1028     * </p>
1029     *
1030     * @param quote
1031     *            the quote character to use
1032     * @return this, to enable chaining
1033     */
1034    public StringTokenizer setQuoteChar(final char quote) {
1035        return setQuoteMatcher(StringMatcherFactory.INSTANCE.charMatcher(quote));
1036    }
1037
1038    /**
1039     * Sets the quote matcher to use.
1040     * <p>
1041     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
1042     * </p>
1043     *
1044     * @param quote
1045     *            the quote matcher to use, null ignored
1046     * @return this, to enable chaining
1047     */
1048    public StringTokenizer setQuoteMatcher(final StringMatcher quote) {
1049        if (quote != null) {
1050            this.quoteMatcher = quote;
1051        }
1052        return this;
1053    }
1054
1055    /**
1056     * Sets the matcher for characters to trim.
1057     * <p>
1058     * These characters are trimmed off on each side of the delimiter until the token or quote is found.
1059     *
1060     * @param trimmer
1061     *            the trimmer matcher to use, null ignored
1062     * @return this, to enable chaining
1063     */
1064    public StringTokenizer setTrimmerMatcher(final StringMatcher trimmer) {
1065        if (trimmer != null) {
1066            this.trimmerMatcher = trimmer;
1067        }
1068        return this;
1069    }
1070
1071    /**
1072     * Gets the number of tokens found in the String.
1073     *
1074     * @return The number of matched tokens
1075     */
1076    public int size() {
1077        checkTokenized();
1078        return tokens.length;
1079    }
1080
1081    /**
1082     * Internal method to performs the tokenization.
1083     * <p>
1084     * Most users of this class do not need to call this method. This method will be called automatically by other
1085     * (public) methods when required.
1086     * </p>
1087     * <p>
1088     * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass
1089     * could alter the character array, offset or count to be parsed, or call the tokenizer multiple times on multiple
1090     * strings. It is also be possible to filter the results.
1091     * </p>
1092     * <p>
1093     * {@code StrTokenizer} will always pass a zero offset and a count equal to the length of the array to this
1094     * method, however a subclass may pass other values, or even an entirely different array.
1095     * </p>
1096     *
1097     * @param srcChars
1098     *            the character array being tokenized, may be null
1099     * @param offset
1100     *            the start position within the character array, must be valid
1101     * @param count
1102     *            the number of characters to tokenize, must be valid
1103     * @return The modifiable list of String tokens, unmodifiable if null array or zero count
1104     */
1105    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1106        if (srcChars == null || count == 0) {
1107            return Collections.emptyList();
1108        }
1109        final TextStringBuilder buf = new TextStringBuilder();
1110        final List<String> tokenList = new ArrayList<>();
1111        int pos = offset;
1112
1113        // loop around the entire buffer
1114        while (pos >= 0 && pos < count) {
1115            // find next token
1116            pos = readNextToken(srcChars, pos, count, buf, tokenList);
1117
1118            // handle case where end of string is a delimiter
1119            if (pos >= count) {
1120                addToken(tokenList, StringUtils.EMPTY);
1121            }
1122        }
1123        return tokenList;
1124    }
1125
1126    /**
1127     * Gets the String content that the tokenizer is parsing.
1128     *
1129     * @return The string content being parsed
1130     */
1131    @Override
1132    public String toString() {
1133        if (tokens == null) {
1134            return "StringTokenizer[not tokenized yet]";
1135        }
1136        return "StringTokenizer" + getTokenList();
1137    }
1138
1139}