Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text;
018
019import java.util.ArrayList;
020import java.util.Collections;
021import java.util.List;
022import java.util.ListIterator;
023import java.util.NoSuchElementException;
024
025import org.apache.commons.lang3.ArrayUtils;
026import org.apache.commons.lang3.StringUtils;
027
028/**
029 * Tokenizes a string based on delimiters (separators)
030 * and supporting quoting and ignored character concepts.
031 * <p>
032 * This class can split a String into many smaller strings. It aims
033 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
034 * however it offers much more control and flexibility including implementing
035 * the {@code ListIterator} interface. By default, it is set up
036 * like {@code StringTokenizer}.
037 * <p>
038 * The input String is split into a number of <em>tokens</em>.
039 * Each token is separated from the next String by a <em>delimiter</em>.
040 * One or more delimiter characters must be specified.
041 * <p>
042 * Each token may be surrounded by quotes.
043 * The <em>quote</em> matcher specifies the quote character(s).
044 * A quote may be escaped within a quoted section by duplicating itself.
045 * <p>
046 * Between each token and the delimiter are potentially characters that need trimming.
047 * The <em>trimmer</em> matcher specifies these characters.
048 * One usage might be to trim whitespace characters.
049 * <p>
050 * At any point outside the quotes there might potentially be invalid characters.
051 * The <em>ignored</em> matcher specifies these characters to be removed.
052 * One usage might be to remove new line characters.
053 * <p>
054 * Empty tokens may be removed or returned as null.
055 * <pre>
056 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
057 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
058 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
059 * </pre>
060 *
061 * <table>
062 *  <caption>StrTokenizer properties and options</caption>
063 *  <tr>
064 *   <th>Property</th><th>Type</th><th>Default</th>
065 *  </tr>
066 *  <tr>
067 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
068 *  </tr>
069 *  <tr>
070 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
071 *  </tr>
072 *  <tr>
073 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
074 *  </tr>
075 *  <tr>
076 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
077 *  </tr>
078 *  <tr>
079 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
080 *  </tr>
081 * </table>
082 *
083 * @since 1.0
084 * @deprecated Deprecated as of 1.3, use {@link StringTokenizer} instead. This class will be removed in 2.0.
085 */
086@Deprecated
087public class StrTokenizer implements ListIterator<String>, Cloneable {
088
089    /** Comma separated values tokenizer internal variable. */
090    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
091
092    /** Tab separated values tokenizer internal variable. */
093    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
094
095    static {
096        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
097        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
098        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
099        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
100        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
101        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
102        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
103
104        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
105        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
106        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
107        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
108        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
109        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
110        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
111    }
112
113    /**
114     * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
115     *
116     * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
117     */
118    private static StrTokenizer getCSVClone() {
119        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
120    }
121
122    /**
123     * Gets a new tokenizer instance which parses Comma Separated Value strings
124     * initializing it with the given input.  The default for CSV processing
125     * will be trim whitespace from both ends (which can be overridden with
126     * the setTrimmer method).
127     * <p>
128     * You must call a "reset" method to set the string which you want to parse.
129     * </p>
130     * @return a new tokenizer instance which parses Comma Separated Value strings
131     */
132    public static StrTokenizer getCSVInstance() {
133        return getCSVClone();
134    }
135
136    /**
137     * Gets a new tokenizer instance which parses Comma Separated Value strings
138     * initializing it with the given input.  The default for CSV processing
139     * will be trim whitespace from both ends (which can be overridden with
140     * the setTrimmer method).
141     *
142     * @param input  the text to parse
143     * @return a new tokenizer instance which parses Comma Separated Value strings
144     */
145    public static StrTokenizer getCSVInstance(final char[] input) {
146        final StrTokenizer tok = getCSVClone();
147        tok.reset(input);
148        return tok;
149    }
150
151    /**
152     * Gets a new tokenizer instance which parses Comma Separated Value strings
153     * initializing it with the given input.  The default for CSV processing
154     * will be trim whitespace from both ends (which can be overridden with
155     * the setTrimmer method).
156     *
157     * @param input  the text to parse
158     * @return a new tokenizer instance which parses Comma Separated Value strings
159     */
160    public static StrTokenizer getCSVInstance(final String input) {
161        final StrTokenizer tok = getCSVClone();
162        tok.reset(input);
163        return tok;
164    }
165    /**
166     * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
167     *
168     * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
169     */
170    private static StrTokenizer getTSVClone() {
171        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
172    }
173
174    /**
175     * Gets a new tokenizer instance which parses Tab Separated Value strings.
176     * The default for CSV processing will be trim whitespace from both ends
177     * (which can be overridden with the setTrimmer method).
178     * <p>
179     * You must call a "reset" method to set the string which you want to parse.
180     * </p>
181     * @return a new tokenizer instance which parses Tab Separated Value strings.
182     */
183    public static StrTokenizer getTSVInstance() {
184        return getTSVClone();
185    }
186
187    /**
188     * Gets a new tokenizer instance which parses Tab Separated Value strings.
189     * The default for CSV processing will be trim whitespace from both ends
190     * (which can be overridden with the setTrimmer method).
191     * @param input  the string to parse
192     * @return a new tokenizer instance which parses Tab Separated Value strings.
193     */
194    public static StrTokenizer getTSVInstance(final char[] input) {
195        final StrTokenizer tok = getTSVClone();
196        tok.reset(input);
197        return tok;
198    }
199
200    /**
201     * Gets a new tokenizer instance which parses Tab Separated Value strings.
202     * The default for CSV processing will be trim whitespace from both ends
203     * (which can be overridden with the setTrimmer method).
204     * @param input  the string to parse
205     * @return a new tokenizer instance which parses Tab Separated Value strings.
206     */
207    public static StrTokenizer getTSVInstance(final String input) {
208        final StrTokenizer tok = getTSVClone();
209        tok.reset(input);
210        return tok;
211    }
212
    /** The text to work on; {@code null} when no input has been supplied. */
    private char[] chars;

    /** The parsed tokens; {@code null} until tokenization has run (and again after a reset). */
    private String[] tokens;

    /** The current iteration position: index of the token that {@code next()} would return. */
    private int tokenPos;

    /** The delimiter matcher; defaults to {@code StrMatcher.splitMatcher()}. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();

    /** The quote matcher; defaults to {@code StrMatcher.noneMatcher()}. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();

    /** The ignored matcher; defaults to {@code StrMatcher.noneMatcher()}. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();

    /** The trimmer matcher; defaults to {@code StrMatcher.noneMatcher()}. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null; defaults to {@code false}. */
    private boolean emptyAsNull;

    /** Whether to ignore empty tokens; defaults to {@code true}. */
    private boolean ignoreEmptyTokens = true;
239
240    /**
241     * Constructs a tokenizer splitting on space, tab, newline and form feed
242     * as per StringTokenizer, but with no text to tokenize.
243     * <p>
244     * This constructor is normally used with {@link #reset(String)}.
245     * </p>
246     */
247    public StrTokenizer() {
248        this.chars = null;
249    }
250
251    /**
252     * Constructs a tokenizer splitting on space, tab, newline and form feed
253     * as per StringTokenizer.
254     *
255     * @param input  the string which is to be parsed, not cloned
256     */
257    public StrTokenizer(final char[] input) {
258        if (input == null) {
259            this.chars = null;
260        } else {
261            this.chars = input.clone();
262        }
263    }
264
265    /**
266     * Constructs a tokenizer splitting on the specified character.
267     *
268     * @param input  the string which is to be parsed, not cloned
269     * @param delim the field delimiter character
270     */
271    public StrTokenizer(final char[] input, final char delim) {
272        this(input);
273        setDelimiterChar(delim);
274    }
275
276    /**
277     * Constructs a tokenizer splitting on the specified delimiter character
278     * and handling quotes using the specified quote character.
279     *
280     * @param input  the string which is to be parsed, not cloned
281     * @param delim  the field delimiter character
282     * @param quote  the field quoted string character
283     */
284    public StrTokenizer(final char[] input, final char delim, final char quote) {
285        this(input, delim);
286        setQuoteChar(quote);
287    }
288
289    /**
290     * Constructs a tokenizer splitting on the specified string.
291     *
292     * @param input  the string which is to be parsed, not cloned
293     * @param delim the field delimiter string
294     */
295    public StrTokenizer(final char[] input, final String delim) {
296        this(input);
297        setDelimiterString(delim);
298    }
299
300    /**
301     * Constructs a tokenizer splitting using the specified delimiter matcher.
302     *
303     * @param input  the string which is to be parsed, not cloned
304     * @param delim  the field delimiter matcher
305     */
306    public StrTokenizer(final char[] input, final StrMatcher delim) {
307        this(input);
308        setDelimiterMatcher(delim);
309    }
310
311    /**
312     * Constructs a tokenizer splitting using the specified delimiter matcher
313     * and handling quotes using the specified quote matcher.
314     *
315     * @param input  the string which is to be parsed, not cloned
316     * @param delim  the field delimiter character
317     * @param quote  the field quoted string character
318     */
319    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
320        this(input, delim);
321        setQuoteMatcher(quote);
322    }
323
324    /**
325     * Constructs a tokenizer splitting on space, tab, newline and form feed
326     * as per StringTokenizer.
327     *
328     * @param input  the string which is to be parsed
329     */
330    public StrTokenizer(final String input) {
331        if (input != null) {
332            chars = input.toCharArray();
333        } else {
334            chars = null;
335        }
336    }
337
338    /**
339     * Constructs a tokenizer splitting on the specified delimiter character.
340     *
341     * @param input  the string which is to be parsed
342     * @param delim  the field delimiter character
343     */
344    public StrTokenizer(final String input, final char delim) {
345        this(input);
346        setDelimiterChar(delim);
347    }
348
349    /**
350     * Constructs a tokenizer splitting on the specified delimiter character
351     * and handling quotes using the specified quote character.
352     *
353     * @param input  the string which is to be parsed
354     * @param delim  the field delimiter character
355     * @param quote  the field quoted string character
356     */
357    public StrTokenizer(final String input, final char delim, final char quote) {
358        this(input, delim);
359        setQuoteChar(quote);
360    }
361
362    /**
363     * Constructs a tokenizer splitting on the specified delimiter string.
364     *
365     * @param input  the string which is to be parsed
366     * @param delim  the field delimiter string
367     */
368    public StrTokenizer(final String input, final String delim) {
369        this(input);
370        setDelimiterString(delim);
371    }
372
373    /**
374     * Constructs a tokenizer splitting using the specified delimiter matcher.
375     *
376     * @param input  the string which is to be parsed
377     * @param delim  the field delimiter matcher
378     */
379    public StrTokenizer(final String input, final StrMatcher delim) {
380        this(input);
381        setDelimiterMatcher(delim);
382    }
383
384    /**
385     * Constructs a tokenizer splitting using the specified delimiter matcher
386     * and handling quotes using the specified quote matcher.
387     *
388     * @param input  the string which is to be parsed
389     * @param delim  the field delimiter matcher
390     * @param quote  the field quoted string matcher
391     */
392    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
393        this(input, delim);
394        setQuoteMatcher(quote);
395    }
396
397    /**
398     * Unsupported ListIterator operation.
399     * @param obj this parameter ignored.
400     * @throws UnsupportedOperationException always
401     */
402    @Override
403    public void add(final String obj) {
404        throw new UnsupportedOperationException("add() is unsupported");
405    }
406
407    /**
408     * Adds a token to a list, paying attention to the parameters we've set.
409     *
410     * @param list  the list to add to
411     * @param tok  the token to add
412     */
413    private void addToken(final List<String> list, String tok) {
414        if (tok == null || tok.isEmpty()) {
415            if (isIgnoreEmptyTokens()) {
416                return;
417            }
418            if (isEmptyTokenAsNull()) {
419                tok = null;
420            }
421        }
422        list.add(tok);
423    }
424
425    /**
426     * Checks if tokenization has been done, and if not then do it.
427     */
428    private void checkTokenized() {
429        if (tokens == null) {
430            if (chars == null) {
431                // still call tokenize as subclass may do some work
432                final List<String> split = tokenize(null, 0, 0);
433                tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
434            } else {
435                final List<String> split = tokenize(chars, 0, chars.length);
436                tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
437            }
438        }
439    }
440
441    /**
442     * Creates a new instance of this Tokenizer. The new instance is reset so
443     * that it will be at the start of the token list.
444     * If a {@link CloneNotSupportedException} is caught, return {@code null}.
445     *
446     * @return a new instance of this Tokenizer which has been reset.
447     */
448    @Override
449    public Object clone() {
450        try {
451            return cloneReset();
452        } catch (final CloneNotSupportedException ex) {
453            return null;
454        }
455    }
456
457    /**
458     * Creates a new instance of this Tokenizer. The new instance is reset so that
459     * it will be at the start of the token list.
460     *
461     * @return a new instance of this Tokenizer which has been reset.
462     * @throws CloneNotSupportedException if there is a problem cloning
463     */
464    Object cloneReset() throws CloneNotSupportedException {
465        // this method exists to enable 100% test coverage
466        final StrTokenizer cloned = (StrTokenizer) super.clone();
467        if (cloned.chars != null) {
468            cloned.chars = cloned.chars.clone();
469        }
470        cloned.reset();
471        return cloned;
472    }
473
474    /**
475     * Gets the String content that the tokenizer is parsing.
476     *
477     * @return The string content being parsed
478     */
479    public String getContent() {
480        if (chars == null) {
481            return null;
482        }
483        return new String(chars);
484    }
485
486    /**
487     * Gets the field delimiter matcher.
488     *
489     * @return The delimiter matcher in use
490     */
491    public StrMatcher getDelimiterMatcher() {
492        return this.delimMatcher;
493    }
494
495    /**
496     * Gets the ignored character matcher.
497     * <p>
498     * These characters are ignored when parsing the String, unless they are
499     * within a quoted region.
500     * The default value is not to ignore anything.
501     * </p>
502     *
503     * @return The ignored matcher in use
504     */
505    public StrMatcher getIgnoredMatcher() {
506        return ignoredMatcher;
507    }
508
509    /**
510     * Gets the quote matcher currently in use.
511     * <p>
512     * The quote character is used to wrap data between the tokens.
513     * This enables delimiters to be entered as data.
514     * The default value is '"' (double quote).
515     * </p>
516     *
517     * @return The quote matcher in use
518     */
519    public StrMatcher getQuoteMatcher() {
520        return quoteMatcher;
521    }
522
523    /**
524     * Gets a copy of the full token list as an independent modifiable array.
525     *
526     * @return The tokens as a String array
527     */
528    public String[] getTokenArray() {
529        checkTokenized();
530        return tokens.clone();
531    }
532
533    /**
534     * Gets a copy of the full token list as an independent modifiable list.
535     *
536     * @return The tokens as a String array
537     */
538    public List<String> getTokenList() {
539        checkTokenized();
540        final List<String> list = new ArrayList<>(tokens.length);
541        Collections.addAll(list, tokens);
542
543        return list;
544    }
545
546    /**
547     * Gets the trimmer character matcher.
548     * <p>
549     * These characters are trimmed off on each side of the delimiter
550     * until the token or quote is found.
551     * The default value is not to trim anything.
552     * </p>
553     *
554     * @return The trimmer matcher in use
555     */
556    public StrMatcher getTrimmerMatcher() {
557        return trimmerMatcher;
558    }
559
560    /**
561     * Checks whether there are any more tokens.
562     *
563     * @return true if there are more tokens
564     */
565    @Override
566    public boolean hasNext() {
567        checkTokenized();
568        return tokenPos < tokens.length;
569    }
570
571    /**
572     * Checks whether there are any previous tokens that can be iterated to.
573     *
574     * @return true if there are previous tokens
575     */
576    @Override
577    public boolean hasPrevious() {
578        checkTokenized();
579        return tokenPos > 0;
580    }
581
582    /**
583     * Gets whether the tokenizer currently returns empty tokens as null.
584     * The default for this property is false.
585     *
586     * @return true if empty tokens are returned as null
587     */
588    public boolean isEmptyTokenAsNull() {
589        return this.emptyAsNull;
590    }
591
592    /**
593     * Gets whether the tokenizer currently ignores empty tokens.
594     * The default for this property is true.
595     *
596     * @return true if empty tokens are not returned
597     */
598    public boolean isIgnoreEmptyTokens() {
599        return ignoreEmptyTokens;
600    }
601
602    /**
603     * Checks if the characters at the index specified match the quote
604     * already matched in readNextToken().
605     *
606     * @param srcChars  the character array being tokenized
607     * @param pos  the position to check for a quote
608     * @param len  the length of the character array being tokenized
609     * @param quoteStart  the start position of the matched quote, 0 if no quoting
610     * @param quoteLen  the length of the matched quote, 0 if no quoting
611     * @return true if a quote is matched
612     */
613    private boolean isQuote(final char[] srcChars,
614                            final int pos,
615                            final int len,
616                            final int quoteStart,
617                            final int quoteLen) {
618        for (int i = 0; i < quoteLen; i++) {
619            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
620                return false;
621            }
622        }
623        return true;
624    }
625
626    /**
627     * Gets the next token.
628     *
629     * @return The next String token
630     * @throws NoSuchElementException if there are no more elements
631     */
632    @Override
633    public String next() {
634        if (hasNext()) {
635            return tokens[tokenPos++];
636        }
637        throw new NoSuchElementException();
638    }
639
640    /**
641     * Gets the index of the next token to return.
642     *
643     * @return The next token index
644     */
645    @Override
646    public int nextIndex() {
647        return tokenPos;
648    }
649
650    /**
651     * Gets the next token from the String.
652     * Equivalent to {@link #next()} except it returns null rather than
653     * throwing {@link NoSuchElementException} when no tokens remain.
654     *
655     * @return The next sequential token, or null when no more tokens are found
656     */
657    public String nextToken() {
658        if (hasNext()) {
659            return tokens[tokenPos++];
660        }
661        return null;
662    }
663
664    /**
665     * Gets the token previous to the last returned token.
666     *
667     * @return The previous token
668     */
669    @Override
670    public String previous() {
671        if (hasPrevious()) {
672            return tokens[--tokenPos];
673        }
674        throw new NoSuchElementException();
675    }
676
677    /**
678     * Gets the index of the previous token.
679     *
680     * @return The previous token index
681     */
682    @Override
683    public int previousIndex() {
684        return tokenPos - 1;
685    }
686
687    /**
688     * Gets the previous token from the String.
689     *
690     * @return The previous sequential token, or null when no more tokens are found
691     */
692    public String previousToken() {
693        if (hasPrevious()) {
694            return tokens[--tokenPos];
695        }
696        return null;
697    }
698
699    /**
700     * Reads character by character through the String to get the next token.
701     *
702     * @param srcChars  the character array being tokenized
703     * @param start  the first character of field
704     * @param len  the length of the character array being tokenized
705     * @param workArea  a temporary work area
706     * @param tokenList  the list of parsed tokens
707     * @return The starting position of the next field (the character
708     *  immediately after the delimiter), or -1 if end of string found
709     */
710    private int readNextToken(final char[] srcChars,
711                              int start,
712                              final int len,
713                              final StrBuilder workArea,
714                              final List<String> tokenList) {
715        // skip all leading whitespace, unless it is the
716        // field delimiter or the quote character
717        while (start < len) {
718            final int removeLen = Math.max(
719                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
720                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
721            if (removeLen == 0
722                    || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
723                    || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
724                break;
725            }
726            start += removeLen;
727        }
728
729        // handle reaching end
730        if (start >= len) {
731            addToken(tokenList, StringUtils.EMPTY);
732            return -1;
733        }
734
735        // handle empty token
736        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
737        if (delimLen > 0) {
738            addToken(tokenList, StringUtils.EMPTY);
739            return start + delimLen;
740        }
741
742        // handle found token
743        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
744        if (quoteLen > 0) {
745            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
746        }
747        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
748    }
749
750    /**
751     * Reads a possibly quoted string token.
752     *
753     * @param srcChars  the character array being tokenized
754     * @param start  the first character of field
755     * @param len  the length of the character array being tokenized
756     * @param workArea  a temporary work area
757     * @param tokenList  the list of parsed tokens
758     * @param quoteStart  the start position of the matched quote, 0 if no quoting
759     * @param quoteLen  the length of the matched quote, 0 if no quoting
760     * @return The starting position of the next field (the character
761     *  immediately after the delimiter, or if end of string found,
762     *  then the length of string
763     */
764    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
765                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
766        // Loop until we've found the end of the quoted
767        // string or the end of the input
768        workArea.clear();
769        int pos = start;
770        boolean quoting = quoteLen > 0;
771        int trimStart = 0;
772
773        while (pos < len) {
774            // quoting mode can occur several times throughout a string
775            // we must switch between quoting and non-quoting until we
776            // encounter a non-quoted delimiter, or end of string
777            if (quoting) {
778                // In quoting mode
779
780                // If we've found a quote character, see if it's
781                // followed by a second quote.  If so, then we need
782                // to actually put the quote character into the token
783                // rather than end the token.
784                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
785                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
786                        // matched pair of quotes, thus an escaped quote
787                        workArea.append(srcChars, pos, quoteLen);
788                        pos += quoteLen * 2;
789                        trimStart = workArea.size();
790                        continue;
791                    }
792
793                    // end of quoting
794                    quoting = false;
795                    pos += quoteLen;
796                    continue;
797                }
798
799            } else {
800                // Not in quoting mode
801
802                // check for delimiter, and thus end of token
803                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
804                if (delimLen > 0) {
805                    // return condition when end of token found
806                    addToken(tokenList, workArea.substring(0, trimStart));
807                    return pos + delimLen;
808                }
809
810                // check for quote, and thus back into quoting mode
811                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
812                    quoting = true;
813                    pos += quoteLen;
814                    continue;
815                }
816
817                // check for ignored (outside quotes), and ignore
818                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
819                if (ignoredLen > 0) {
820                    pos += ignoredLen;
821                    continue;
822                }
823
824                // check for trimmed character
825                // don't yet know if its at the end, so copy to workArea
826                // use trimStart to keep track of trim at the end
827                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
828                if (trimmedLen > 0) {
829                    workArea.append(srcChars, pos, trimmedLen);
830                    pos += trimmedLen;
831                    continue;
832                }
833
834            }
835            // copy regular character from inside quotes
836            workArea.append(srcChars[pos++]);
837            trimStart = workArea.size();
838        }
839
840        // return condition when end of string found
841        addToken(tokenList, workArea.substring(0, trimStart));
842        return -1;
843    }
844
845    /**
846     * Unsupported ListIterator operation.
847     *
848     * @throws UnsupportedOperationException always
849     */
850    @Override
851    public void remove() {
852        throw new UnsupportedOperationException("remove() is unsupported");
853    }
854
855    /**
856     * Resets this tokenizer, forgetting all parsing and iteration already completed.
857     * <p>
858     * This method allows the same tokenizer to be reused for the same String.
859     *
860     * @return this, to enable chaining
861     */
862    public StrTokenizer reset() {
863        tokenPos = 0;
864        tokens = null;
865        return this;
866    }
867
868    /**
869     * Reset this tokenizer, giving it a new input string to parse.
870     * In this manner you can re-use a tokenizer with the same settings
871     * on multiple input lines.
872     *
873     * @param input  the new character array to tokenize, not cloned, null sets no text to parse
874     * @return this, to enable chaining
875     */
876    public StrTokenizer reset(final char[] input) {
877        reset();
878        if (input != null) {
879            this.chars = input.clone();
880        } else {
881            this.chars = null;
882        }
883        return this;
884    }
885
886    /**
887     * Reset this tokenizer, giving it a new input string to parse.
888     * In this manner you can re-use a tokenizer with the same settings
889     * on multiple input lines.
890     *
891     * @param input  the new string to tokenize, null sets no text to parse
892     * @return this, to enable chaining
893     */
894    public StrTokenizer reset(final String input) {
895        reset();
896        if (input != null) {
897            this.chars = input.toCharArray();
898        } else {
899            this.chars = null;
900        }
901        return this;
902    }
903
904    /**
905     * Unsupported ListIterator operation.
906     * @param obj this parameter ignored.
907     * @throws UnsupportedOperationException always
908     */
909    @Override
910    public void set(final String obj) {
911        throw new UnsupportedOperationException("set() is unsupported");
912    }
913
914    /**
915     * Sets the field delimiter character.
916     *
917     * @param delim  the delimiter character to use
918     * @return this, to enable chaining
919     */
920    public StrTokenizer setDelimiterChar(final char delim) {
921        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
922    }
923
924    /**
925     * Sets the field delimiter matcher.
926     * <p>
927     * The delimiter is used to separate one token from another.
928     * </p>
929     *
930     * @param delim  the delimiter matcher to use
931     * @return this, to enable chaining
932     */
933    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
934        if (delim == null) {
935            this.delimMatcher = StrMatcher.noneMatcher();
936        } else {
937            this.delimMatcher = delim;
938        }
939        return this;
940    }
941
942    /**
943     * Sets the field delimiter string.
944     *
945     * @param delim  the delimiter string to use
946     * @return this, to enable chaining
947     */
948    public StrTokenizer setDelimiterString(final String delim) {
949        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
950    }
951
952    /**
953     * Sets whether the tokenizer should return empty tokens as null.
954     * The default for this property is false.
955     *
956     * @param emptyAsNull  whether empty tokens are returned as null
957     * @return this, to enable chaining
958     */
959    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
960        this.emptyAsNull = emptyAsNull;
961        return this;
962    }
963
964    /**
965     * Sets the character to ignore.
966     * <p>
967     * This character is ignored when parsing the String, unless it is
968     * within a quoted region.
969     * </p>
970     *
971     * @param ignored  the ignored character to use
972     * @return this, to enable chaining
973     */
974    public StrTokenizer setIgnoredChar(final char ignored) {
975        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
976    }
977
978    /**
979     * Sets the matcher for characters to ignore.
980     * <p>
981     * These characters are ignored when parsing the String, unless they are
982     * within a quoted region.
983     * </p>
984     *
985     * @param ignored  the ignored matcher to use, null ignored
986     * @return this, to enable chaining
987     */
988    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
989        if (ignored != null) {
990            this.ignoredMatcher = ignored;
991        }
992        return this;
993    }
994
995    /**
996     * Sets whether the tokenizer should ignore and not return empty tokens.
997     * The default for this property is true.
998     *
999     * @param ignoreEmptyTokens  whether empty tokens are not returned
1000     * @return this, to enable chaining
1001     */
1002    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
1003        this.ignoreEmptyTokens = ignoreEmptyTokens;
1004        return this;
1005    }
1006
1007    /**
1008     * Sets the quote character to use.
1009     * <p>
1010     * The quote character is used to wrap data between the tokens.
1011     * This enables delimiters to be entered as data.
1012     * </p>
1013     *
1014     * @param quote  the quote character to use
1015     * @return this, to enable chaining
1016     */
1017    public StrTokenizer setQuoteChar(final char quote) {
1018        return setQuoteMatcher(StrMatcher.charMatcher(quote));
1019    }
1020
1021    /**
1022     * Sets the quote matcher to use.
1023     * <p>
1024     * The quote character is used to wrap data between the tokens.
1025     * This enables delimiters to be entered as data.
1026     * </p>
1027     *
1028     * @param quote  the quote matcher to use, null ignored
1029     * @return this, to enable chaining
1030     */
1031    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
1032        if (quote != null) {
1033            this.quoteMatcher = quote;
1034        }
1035        return this;
1036    }
1037
1038    /**
1039     * Sets the matcher for characters to trim.
1040     * <p>
1041     * These characters are trimmed off on each side of the delimiter
1042     * until the token or quote is found.
1043     * </p>
1044     *
1045     * @param trimmer  the trimmer matcher to use, null ignored
1046     * @return this, to enable chaining
1047     */
1048    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1049        if (trimmer != null) {
1050            this.trimmerMatcher = trimmer;
1051        }
1052        return this;
1053    }
1054
1055    /**
1056     * Gets the number of tokens found in the String.
1057     *
1058     * @return The number of matched tokens
1059     */
1060    public int size() {
1061        checkTokenized();
1062        return tokens.length;
1063    }
1064
1065    /**
1066     * Internal method to performs the tokenization.
1067     * <p>
1068     * Most users of this class do not need to call this method. This method
1069     * will be called automatically by other (public) methods when required.
1070     * </p>
1071     * <p>
1072     * This method exists to allow subclasses to add code before or after the
1073     * tokenization. For example, a subclass could alter the character array,
1074     * offset or count to be parsed, or call the tokenizer multiple times on
1075     * multiple strings. It is also be possible to filter the results.
1076     * </p>
1077     * <p>
1078     * {@code StrTokenizer} will always pass a zero offset and a count
1079     * equal to the length of the array to this method, however a subclass
1080     * may pass other values, or even an entirely different array.
1081     * </p>
1082     *
1083     * @param srcChars  the character array being tokenized, may be null
1084     * @param offset  the start position within the character array, must be valid
1085     * @param count  the number of characters to tokenize, must be valid
1086     * @return The modifiable list of String tokens, unmodifiable if null array or zero count
1087     */
1088    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1089        if (srcChars == null || count == 0) {
1090            return Collections.emptyList();
1091        }
1092        final StrBuilder buf = new StrBuilder();
1093        final List<String> tokenList = new ArrayList<>();
1094        int pos = offset;
1095
1096        // loop around the entire buffer
1097        while (pos >= 0 && pos < count) {
1098            // find next token
1099            pos = readNextToken(srcChars, pos, count, buf, tokenList);
1100
1101            // handle case where end of string is a delimiter
1102            if (pos >= count) {
1103                addToken(tokenList, StringUtils.EMPTY);
1104            }
1105        }
1106        return tokenList;
1107    }
1108
1109    /**
1110     * Gets the String content that the tokenizer is parsing.
1111     *
1112     * @return The string content being parsed
1113     */
1114    @Override
1115    public String toString() {
1116        if (tokens == null) {
1117            return "StrTokenizer[not tokenized yet]";
1118        }
1119        return "StrTokenizer" + getTokenList();
1120    }
1121
1122}