View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.lang3.text;
18  
19  import java.util.ArrayList;
20  import java.util.Arrays;
21  import java.util.Collections;
22  import java.util.List;
23  import java.util.ListIterator;
24  import java.util.NoSuchElementException;
25  import java.util.StringTokenizer;
26  
27  import org.apache.commons.lang3.ArrayUtils;
28  import org.apache.commons.lang3.StringUtils;
29  
30  /**
31   * Tokenizes a string based on delimiters (separators)
32   * and supporting quoting and ignored character concepts.
33   * <p>
34   * This class can split a String into many smaller strings. It aims
35   * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
36   * however it offers much more control and flexibility including implementing
37   * the {@link ListIterator} interface. By default, it is set up
38   * like {@link StringTokenizer}.
39   * </p>
40   * <p>
41   * The input String is split into a number of <em>tokens</em>.
42   * Each token is separated from the next String by a <em>delimiter</em>.
43   * One or more delimiter characters must be specified.
44   * </p>
45   * <p>
46   * Each token may be surrounded by quotes.
47   * The <em>quote</em> matcher specifies the quote character(s).
48   * A quote may be escaped within a quoted section by duplicating itself.
49   * </p>
50   * <p>
51   * Between each token and the delimiter are potentially characters that need trimming.
52   * The <em>trimmer</em> matcher specifies these characters.
53   * One usage might be to trim whitespace characters.
54   * </p>
55   * <p>
56   * At any point outside the quotes there might potentially be invalid characters.
57   * The <em>ignored</em> matcher specifies these characters to be removed.
58   * One usage might be to remove new line characters.
59   * </p>
60   * <p>
61   * Empty tokens may be removed or returned as null.
62   * </p>
63   * <pre>
64   * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
65   * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
66   * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
67   * </pre>
68   *
69   * <table>
70   *  <caption>StrTokenizer properties and options</caption>
71   *  <tr>
72   *   <th>Property</th><th>Type</th><th>Default</th>
73   *  </tr>
74   *  <tr>
75   *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
76   *  </tr>
77   *  <tr>
78   *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
79   *  </tr>
80   *  <tr>
81   *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
82   *  </tr>
83   *  <tr>
84   *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
85   *  </tr>
86   *  <tr>
87   *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
88   *  </tr>
89   * </table>
90   *
91   * @since 2.2
92   * @deprecated As of 3.6, use Apache Commons Text
93   * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StringTokenizer.html">
94   * StringTokenizer</a> instead
95   */
96  @Deprecated
97  public class StrTokenizer implements ListIterator<String>, Cloneable {
98  
99      private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
100     private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
101     static {
102         CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
103         CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
104         CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
105         CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
106         CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
107         CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
108         CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
109 
110         TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
111         TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
112         TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
113         TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
114         TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
115         TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
116         TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
117     }
118 
119     /**
120      * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
121      *
122      * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
123      */
124     private static StrTokenizer getCSVClone() {
125         return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
126     }
127     /**
128      * Gets a new tokenizer instance which parses Comma Separated Value strings
129      * initializing it with the given input.  The default for CSV processing
130      * will be trim whitespace from both ends (which can be overridden with
131      * the setTrimmer method).
132      * <p>
133      * You must call a "reset" method to set the string which you want to parse.
134      * </p>
135      * @return a new tokenizer instance which parses Comma Separated Value strings
136      */
137     public static StrTokenizer getCSVInstance() {
138         return getCSVClone();
139     }
140     /**
141      * Gets a new tokenizer instance which parses Comma Separated Value strings
142      * initializing it with the given input.  The default for CSV processing
143      * will be trim whitespace from both ends (which can be overridden with
144      * the setTrimmer method).
145      *
146      * @param input  the text to parse
147      * @return a new tokenizer instance which parses Comma Separated Value strings
148      */
149     public static StrTokenizer getCSVInstance(final char[] input) {
150         final StrTokenizer tok = getCSVClone();
151         tok.reset(input);
152         return tok;
153     }
154 
155     /**
156      * Gets a new tokenizer instance which parses Comma Separated Value strings
157      * initializing it with the given input.  The default for CSV processing
158      * will be trim whitespace from both ends (which can be overridden with
159      * the setTrimmer method).
160      *
161      * @param input  the text to parse
162      * @return a new tokenizer instance which parses Comma Separated Value strings
163      */
164     public static StrTokenizer getCSVInstance(final String input) {
165         final StrTokenizer tok = getCSVClone();
166         tok.reset(input);
167         return tok;
168     }
169     /**
170      * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
171      *
172      * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
173      */
174     private static StrTokenizer getTSVClone() {
175         return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
176     }
177     /**
178      * Gets a new tokenizer instance which parses Tab Separated Value strings.
179      * The default for CSV processing will be trim whitespace from both ends
180      * (which can be overridden with the setTrimmer method).
181      * <p>
182      * You must call a "reset" method to set the string which you want to parse.
183      * </p>
184      * @return a new tokenizer instance which parses Tab Separated Value strings.
185      */
186     public static StrTokenizer getTSVInstance() {
187         return getTSVClone();
188     }
189     /**
190      * Gets a new tokenizer instance which parses Tab Separated Value strings.
191      * The default for CSV processing will be trim whitespace from both ends
192      * (which can be overridden with the setTrimmer method).
193      * @param input  the string to parse
194      * @return a new tokenizer instance which parses Tab Separated Value strings.
195      */
196     public static StrTokenizer getTSVInstance(final char[] input) {
197         final StrTokenizer tok = getTSVClone();
198         tok.reset(input);
199         return tok;
200     }
201 
202     /**
203      * Gets a new tokenizer instance which parses Tab Separated Value strings.
204      * The default for CSV processing will be trim whitespace from both ends
205      * (which can be overridden with the setTrimmer method).
206      * @param input  the string to parse
207      * @return a new tokenizer instance which parses Tab Separated Value strings.
208      */
209     public static StrTokenizer getTSVInstance(final String input) {
210         final StrTokenizer tok = getTSVClone();
211         tok.reset(input);
212         return tok;
213     }
    /** The text to work on; {@code null} when no input has been supplied. */
    private char[] chars;

    /** The parsed tokens; {@code null} until tokenization has run, and set back to {@code null} by {@code reset()}. */
    private String[] tokens;

    /** The current iteration position, i.e. the index in {@code tokens} that {@code next()} reads from. */
    private int tokenPos;

    /** The delimiter matcher; defaults to {@code StrMatcher.splitMatcher()}. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();

    /** The quote matcher; defaults to {@code StrMatcher.noneMatcher()}, i.e. no quoting. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();

    /** The ignored matcher; defaults to {@code StrMatcher.noneMatcher()}, i.e. nothing is ignored. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();

    /** The trimmer matcher; defaults to {@code StrMatcher.noneMatcher()}, i.e. nothing is trimmed. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null; defaults to {@code false}. */
    private boolean emptyAsNull;

    /** Whether to ignore (drop) empty tokens; defaults to {@code true}. */
    private boolean ignoreEmptyTokens = true;
240 
241     /**
242      * Constructs a tokenizer splitting on space, tab, newline and formfeed
243      * as per StringTokenizer, but with no text to tokenize.
244      * <p>
245      * This constructor is normally used with {@link #reset(String)}.
246      * </p>
247      */
248     public StrTokenizer() {
249         this.chars = null;
250     }
251 
252     /**
253      * Constructs a tokenizer splitting on space, tab, newline and formfeed
254      * as per StringTokenizer.
255      *
256      * @param input  the string which is to be parsed, not cloned
257      */
258     public StrTokenizer(final char[] input) {
259         this.chars = ArrayUtils.clone(input);
260     }
261 
262     /**
263      * Constructs a tokenizer splitting on the specified character.
264      *
265      * @param input  the string which is to be parsed, not cloned
266      * @param delim the field delimiter character
267      */
268     public StrTokenizer(final char[] input, final char delim) {
269         this(input);
270         setDelimiterChar(delim);
271     }
272 
273     /**
274      * Constructs a tokenizer splitting on the specified delimiter character
275      * and handling quotes using the specified quote character.
276      *
277      * @param input  the string which is to be parsed, not cloned
278      * @param delim  the field delimiter character
279      * @param quote  the field quoted string character
280      */
281     public StrTokenizer(final char[] input, final char delim, final char quote) {
282         this(input, delim);
283         setQuoteChar(quote);
284     }
285 
286     /**
287      * Constructs a tokenizer splitting on the specified string.
288      *
289      * @param input  the string which is to be parsed, not cloned
290      * @param delim the field delimiter string
291      */
292     public StrTokenizer(final char[] input, final String delim) {
293         this(input);
294         setDelimiterString(delim);
295     }
296 
297     /**
298      * Constructs a tokenizer splitting using the specified delimiter matcher.
299      *
300      * @param input  the string which is to be parsed, not cloned
301      * @param delim  the field delimiter matcher
302      */
303     public StrTokenizer(final char[] input, final StrMatcher delim) {
304         this(input);
305         setDelimiterMatcher(delim);
306     }
307 
308     /**
309      * Constructs a tokenizer splitting using the specified delimiter matcher
310      * and handling quotes using the specified quote matcher.
311      *
312      * @param input  the string which is to be parsed, not cloned
313      * @param delim  the field delimiter character
314      * @param quote  the field quoted string character
315      */
316     public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
317         this(input, delim);
318         setQuoteMatcher(quote);
319     }
320 
321     /**
322      * Constructs a tokenizer splitting on space, tab, newline and formfeed
323      * as per StringTokenizer.
324      *
325      * @param input  the string which is to be parsed
326      */
327     public StrTokenizer(final String input) {
328         if (input != null) {
329             chars = input.toCharArray();
330         } else {
331             chars = null;
332         }
333     }
334 
335     /**
336      * Constructs a tokenizer splitting on the specified delimiter character.
337      *
338      * @param input  the string which is to be parsed
339      * @param delim  the field delimiter character
340      */
341     public StrTokenizer(final String input, final char delim) {
342         this(input);
343         setDelimiterChar(delim);
344     }
345 
346     /**
347      * Constructs a tokenizer splitting on the specified delimiter character
348      * and handling quotes using the specified quote character.
349      *
350      * @param input  the string which is to be parsed
351      * @param delim  the field delimiter character
352      * @param quote  the field quoted string character
353      */
354     public StrTokenizer(final String input, final char delim, final char quote) {
355         this(input, delim);
356         setQuoteChar(quote);
357     }
358 
359     /**
360      * Constructs a tokenizer splitting on the specified delimiter string.
361      *
362      * @param input  the string which is to be parsed
363      * @param delim  the field delimiter string
364      */
365     public StrTokenizer(final String input, final String delim) {
366         this(input);
367         setDelimiterString(delim);
368     }
369 
370     /**
371      * Constructs a tokenizer splitting using the specified delimiter matcher.
372      *
373      * @param input  the string which is to be parsed
374      * @param delim  the field delimiter matcher
375      */
376     public StrTokenizer(final String input, final StrMatcher delim) {
377         this(input);
378         setDelimiterMatcher(delim);
379     }
380 
381     /**
382      * Constructs a tokenizer splitting using the specified delimiter matcher
383      * and handling quotes using the specified quote matcher.
384      *
385      * @param input  the string which is to be parsed
386      * @param delim  the field delimiter matcher
387      * @param quote  the field quoted string matcher
388      */
389     public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
390         this(input, delim);
391         setQuoteMatcher(quote);
392     }
393 
394     /**
395      * Unsupported ListIterator operation.
396      * @param obj this parameter ignored.
397      * @throws UnsupportedOperationException always
398      */
399     @Override
400     public void add(final String obj) {
401         throw new UnsupportedOperationException("add() is unsupported");
402     }
403 
404     /**
405      * Adds a token to a list, paying attention to the parameters we've set.
406      *
407      * @param list  the list to add to
408      * @param tok  the token to add
409      */
410     private void addToken(final List<String> list, String tok) {
411         if (StringUtils.isEmpty(tok)) {
412             if (isIgnoreEmptyTokens()) {
413                 return;
414             }
415             if (isEmptyTokenAsNull()) {
416                 tok = null;
417             }
418         }
419         list.add(tok);
420     }
421 
422     /**
423      * Checks if tokenization has been done, and if not then do it.
424      */
425     private void checkTokenized() {
426         if (tokens == null) {
427             if (chars == null) {
428                 // still call tokenize as subclass may do some work
429                 final List<String> split = tokenize(null, 0, 0);
430                 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
431             } else {
432                 final List<String> split = tokenize(chars, 0, chars.length);
433                 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
434             }
435         }
436     }
437 
438     /**
439      * Creates a new instance of this Tokenizer. The new instance is reset so
440      * that it will be at the start of the token list.
441      * If a {@link CloneNotSupportedException} is caught, return {@code null}.
442      *
443      * @return a new instance of this Tokenizer which has been reset.
444      */
445     @Override
446     public Object clone() {
447         try {
448             return cloneReset();
449         } catch (final CloneNotSupportedException ex) {
450             return null;
451         }
452     }
453 
454     /**
455      * Creates a new instance of this Tokenizer. The new instance is reset so that
456      * it will be at the start of the token list.
457      *
458      * @return a new instance of this Tokenizer which has been reset.
459      * @throws CloneNotSupportedException if there is a problem cloning
460      */
461     Object cloneReset() throws CloneNotSupportedException {
462         // this method exists to enable 100% test coverage
463         final StrTokenizer cloned = (StrTokenizer) super.clone();
464         if (cloned.chars != null) {
465             cloned.chars = cloned.chars.clone();
466         }
467         cloned.reset();
468         return cloned;
469     }
470 
471     /**
472      * Gets the String content that the tokenizer is parsing.
473      *
474      * @return the string content being parsed
475      */
476     public String getContent() {
477         if (chars == null) {
478             return null;
479         }
480         return new String(chars);
481     }
482 
483     /**
484      * Gets the field delimiter matcher.
485      *
486      * @return the delimiter matcher in use
487      */
488     public StrMatcher getDelimiterMatcher() {
489         return this.delimMatcher;
490     }
491 
492     // Ignored
493     /**
494      * Gets the ignored character matcher.
495      * <p>
496      * These characters are ignored when parsing the String, unless they are
497      * within a quoted region.
498      * The default value is not to ignore anything.
499      * </p>
500      *
501      * @return the ignored matcher in use
502      */
503     public StrMatcher getIgnoredMatcher() {
504         return ignoredMatcher;
505     }
506 
507     /**
508      * Gets the quote matcher currently in use.
509      * <p>
510      * The quote character is used to wrap data between the tokens.
511      * This enables delimiters to be entered as data.
512      * The default value is '"' (double quote).
513      * </p>
514      *
515      * @return the quote matcher in use
516      */
517     public StrMatcher getQuoteMatcher() {
518         return quoteMatcher;
519     }
520 
521     /**
522      * Gets a copy of the full token list as an independent modifiable array.
523      *
524      * @return the tokens as a String array
525      */
526     public String[] getTokenArray() {
527         checkTokenized();
528         return tokens.clone();
529     }
530 
531     /**
532      * Gets a copy of the full token list as an independent modifiable list.
533      *
534      * @return the tokens as a String array
535      */
536     public List<String> getTokenList() {
537         checkTokenized();
538         final List<String> list = new ArrayList<>(tokens.length);
539         list.addAll(Arrays.asList(tokens));
540         return list;
541     }
542 
543     /**
544      * Gets the trimmer character matcher.
545      * <p>
546      * These characters are trimmed off on each side of the delimiter
547      * until the token or quote is found.
548      * The default value is not to trim anything.
549      * </p>
550      *
551      * @return the trimmer matcher in use
552      */
553     public StrMatcher getTrimmerMatcher() {
554         return trimmerMatcher;
555     }
556 
557     /**
558      * Checks whether there are any more tokens.
559      *
560      * @return true if there are more tokens
561      */
562     @Override
563     public boolean hasNext() {
564         checkTokenized();
565         return tokenPos < tokens.length;
566     }
567 
568     /**
569      * Checks whether there are any previous tokens that can be iterated to.
570      *
571      * @return true if there are previous tokens
572      */
573     @Override
574     public boolean hasPrevious() {
575         checkTokenized();
576         return tokenPos > 0;
577     }
578 
579     /**
580      * Gets whether the tokenizer currently returns empty tokens as null.
581      * The default for this property is false.
582      *
583      * @return true if empty tokens are returned as null
584      */
585     public boolean isEmptyTokenAsNull() {
586         return this.emptyAsNull;
587     }
588 
589     /**
590      * Gets whether the tokenizer currently ignores empty tokens.
591      * The default for this property is true.
592      *
593      * @return true if empty tokens are not returned
594      */
595     public boolean isIgnoreEmptyTokens() {
596         return ignoreEmptyTokens;
597     }
598 
599     /**
600      * Checks if the characters at the index specified match the quote
601      * already matched in readNextToken().
602      *
603      * @param srcChars  the character array being tokenized
604      * @param pos  the position to check for a quote
605      * @param len  the length of the character array being tokenized
606      * @param quoteStart  the start position of the matched quote, 0 if no quoting
607      * @param quoteLen  the length of the matched quote, 0 if no quoting
608      * @return true if a quote is matched
609      */
610     private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
611         for (int i = 0; i < quoteLen; i++) {
612             if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
613                 return false;
614             }
615         }
616         return true;
617     }
618 
619     /**
620      * Gets the next token.
621      *
622      * @return the next String token
623      * @throws NoSuchElementException if there are no more elements
624      */
625     @Override
626     public String next() {
627         if (hasNext()) {
628             return tokens[tokenPos++];
629         }
630         throw new NoSuchElementException();
631     }
632 
633     /**
634      * Gets the index of the next token to return.
635      *
636      * @return the next token index
637      */
638     @Override
639     public int nextIndex() {
640         return tokenPos;
641     }
642 
643     /**
644      * Gets the next token from the String.
645      * Equivalent to {@link #next()} except it returns null rather than
646      * throwing {@link NoSuchElementException} when no tokens remain.
647      *
648      * @return the next sequential token, or null when no more tokens are found
649      */
650     public String nextToken() {
651         if (hasNext()) {
652             return tokens[tokenPos++];
653         }
654         return null;
655     }
656 
657     /**
658      * Gets the token previous to the last returned token.
659      *
660      * @return the previous token
661      */
662     @Override
663     public String previous() {
664         if (hasPrevious()) {
665             return tokens[--tokenPos];
666         }
667         throw new NoSuchElementException();
668     }
669 
670     /**
671      * Gets the index of the previous token.
672      *
673      * @return the previous token index
674      */
675     @Override
676     public int previousIndex() {
677         return tokenPos - 1;
678     }
679 
680     /**
681      * Gets the previous token from the String.
682      *
683      * @return the previous sequential token, or null when no more tokens are found
684      */
685     public String previousToken() {
686         if (hasPrevious()) {
687             return tokens[--tokenPos];
688         }
689         return null;
690     }
691 
692     /**
693      * Reads character by character through the String to get the next token.
694      *
695      * @param srcChars  the character array being tokenized
696      * @param start  the first character of field
697      * @param len  the length of the character array being tokenized
698      * @param workArea  a temporary work area
699      * @param tokenList  the list of parsed tokens
700      * @return the starting position of the next field (the character
701      *  immediately after the delimiter), or -1 if end of string found
702      */
703     private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
704         // skip all leading whitespace, unless it is the
705         // field delimiter or the quote character
706         while (start < len) {
707             final int removeLen = Math.max(
708                     getIgnoredMatcher().isMatch(srcChars, start, start, len),
709                     getTrimmerMatcher().isMatch(srcChars, start, start, len));
710             if (removeLen == 0 ||
711                 getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
712                 getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
713                 break;
714             }
715             start += removeLen;
716         }
717 
718         // handle reaching end
719         if (start >= len) {
720             addToken(tokenList, StringUtils.EMPTY);
721             return -1;
722         }
723 
724         // handle empty token
725         final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
726         if (delimLen > 0) {
727             addToken(tokenList, StringUtils.EMPTY);
728             return start + delimLen;
729         }
730 
731         // handle found token
732         final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
733         if (quoteLen > 0) {
734             return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
735         }
736         return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
737     }
738 
739     /**
740      * Reads a possibly quoted string token.
741      *
742      * @param srcChars  the character array being tokenized
743      * @param start  the first character of field
744      * @param len  the length of the character array being tokenized
745      * @param workArea  a temporary work area
746      * @param tokenList  the list of parsed tokens
747      * @param quoteStart  the start position of the matched quote, 0 if no quoting
748      * @param quoteLen  the length of the matched quote, 0 if no quoting
749      * @return the starting position of the next field (the character
750      *  immediately after the delimiter, or if end of string found,
751      *  then the length of string
752      */
753     private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
754                                final List<String> tokenList, final int quoteStart, final int quoteLen) {
755         // Loop until we've found the end of the quoted
756         // string or the end of the input
757         workArea.clear();
758         int pos = start;
759         boolean quoting = quoteLen > 0;
760         int trimStart = 0;
761 
762         while (pos < len) {
763             // quoting mode can occur several times throughout a string
764             // we must switch between quoting and non-quoting until we
765             // encounter a non-quoted delimiter, or end of string
766             if (quoting) {
767                 // In quoting mode
768 
769                 // If we've found a quote character, see if it's
770                 // followed by a second quote.  If so, then we need
771                 // to actually put the quote character into the token
772                 // rather than end the token.
773                 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
774                     if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
775                         // matched pair of quotes, thus an escaped quote
776                         workArea.append(srcChars, pos, quoteLen);
777                         pos += quoteLen * 2;
778                         trimStart = workArea.size();
779                         continue;
780                     }
781 
782                     // end of quoting
783                     quoting = false;
784                     pos += quoteLen;
785                     continue;
786                 }
787 
788             } else {
789                 // Not in quoting mode
790 
791                 // check for delimiter, and thus end of token
792                 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
793                 if (delimLen > 0) {
794                     // return condition when end of token found
795                     addToken(tokenList, workArea.substring(0, trimStart));
796                     return pos + delimLen;
797                 }
798 
799                 // check for quote, and thus back into quoting mode
800                 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
801                     quoting = true;
802                     pos += quoteLen;
803                     continue;
804                 }
805 
806                 // check for ignored (outside quotes), and ignore
807                 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
808                 if (ignoredLen > 0) {
809                     pos += ignoredLen;
810                     continue;
811                 }
812 
813                 // check for trimmed character
814                 // don't yet know if it's at the end, so copy to workArea
815                 // use trimStart to keep track of trim at the end
816                 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
817                 if (trimmedLen > 0) {
818                     workArea.append(srcChars, pos, trimmedLen);
819                     pos += trimmedLen;
820                     continue;
821                 }
822             }
823             // copy regular character from inside quotes
824             workArea.append(srcChars[pos++]);
825             trimStart = workArea.size();
826         }
827 
828         // return condition when end of string found
829         addToken(tokenList, workArea.substring(0, trimStart));
830         return -1;
831     }
832 
833     /**
834      * Unsupported ListIterator operation.
835      *
836      * @throws UnsupportedOperationException always
837      */
838     @Override
839     public void remove() {
840         throw new UnsupportedOperationException("remove() is unsupported");
841     }
842 
843     /**
844      * Resets this tokenizer, forgetting all parsing and iteration already completed.
845      * <p>
846      * This method allows the same tokenizer to be reused for the same String.
847      * </p>
848      *
849      * @return this, to enable chaining
850      */
851     public StrTokenizer reset() {
852         tokenPos = 0;
853         tokens = null;
854         return this;
855     }
856 
857     /**
858      * Reset this tokenizer, giving it a new input string to parse.
859      * In this manner you can re-use a tokenizer with the same settings
860      * on multiple input lines.
861      *
862      * @param input  the new character array to tokenize, not cloned, null sets no text to parse
863      * @return this, to enable chaining
864      */
865     public StrTokenizer reset(final char[] input) {
866         reset();
867         this.chars = ArrayUtils.clone(input);
868         return this;
869     }
870 
871     /**
872      * Reset this tokenizer, giving it a new input string to parse.
873      * In this manner you can re-use a tokenizer with the same settings
874      * on multiple input lines.
875      *
876      * @param input  the new string to tokenize, null sets no text to parse
877      * @return this, to enable chaining
878      */
879     public StrTokenizer reset(final String input) {
880         reset();
881         if (input != null) {
882             this.chars = input.toCharArray();
883         } else {
884             this.chars = null;
885         }
886         return this;
887     }
888 
889     /**
890      * Unsupported ListIterator operation.
891      * @param obj this parameter ignored.
892      * @throws UnsupportedOperationException always
893      */
894     @Override
895     public void set(final String obj) {
896         throw new UnsupportedOperationException("set() is unsupported");
897     }
898 
899     /**
900      * Sets the field delimiter character.
901      *
902      * @param delim  the delimiter character to use
903      * @return this, to enable chaining
904      */
905     public StrTokenizer setDelimiterChar(final char delim) {
906         return setDelimiterMatcher(StrMatcher.charMatcher(delim));
907     }
908 
909     /**
910      * Sets the field delimiter matcher.
911      * <p>
912      * The delimiter is used to separate one token from another.
913      * </p>
914      *
915      * @param delim  the delimiter matcher to use
916      * @return this, to enable chaining
917      */
918     public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
919         if (delim == null) {
920             this.delimMatcher = StrMatcher.noneMatcher();
921         } else {
922             this.delimMatcher = delim;
923         }
924         return this;
925     }
926 
927     /**
928      * Sets the field delimiter string.
929      *
930      * @param delim  the delimiter string to use
931      * @return this, to enable chaining
932      */
933     public StrTokenizer setDelimiterString(final String delim) {
934         return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
935     }
936 
937     /**
938      * Sets whether the tokenizer should return empty tokens as null.
939      * The default for this property is false.
940      *
941      * @param emptyAsNull  whether empty tokens are returned as null
942      * @return this, to enable chaining
943      */
944     public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
945         this.emptyAsNull = emptyAsNull;
946         return this;
947     }
948 
949     /**
950      * Sets the character to ignore.
951      * <p>
952      * This character is ignored when parsing the String, unless it is
953      * within a quoted region.
954      *
955      * @param ignored  the ignored character to use
956      * @return this, to enable chaining
957      */
958     public StrTokenizer setIgnoredChar(final char ignored) {
959         return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
960     }
961 
962     /**
963      * Sets the matcher for characters to ignore.
964      * <p>
965      * These characters are ignored when parsing the String, unless they are
966      * within a quoted region.
967      * </p>
968      *
969      * @param ignored  the ignored matcher to use, null ignored
970      * @return this, to enable chaining
971      */
972     public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
973         if (ignored != null) {
974             this.ignoredMatcher = ignored;
975         }
976         return this;
977     }
978 
979     /**
980      * Sets whether the tokenizer should ignore and not return empty tokens.
981      * The default for this property is true.
982      *
983      * @param ignoreEmptyTokens  whether empty tokens are not returned
984      * @return this, to enable chaining
985      */
986     public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
987         this.ignoreEmptyTokens = ignoreEmptyTokens;
988         return this;
989     }
990 
991     /**
992      * Sets the quote character to use.
993      * <p>
994      * The quote character is used to wrap data between the tokens.
995      * This enables delimiters to be entered as data.
996      * </p>
997      *
998      * @param quote  the quote character to use
999      * @return this, to enable chaining
1000      */
1001     public StrTokenizer setQuoteChar(final char quote) {
1002         return setQuoteMatcher(StrMatcher.charMatcher(quote));
1003     }
1004 
1005     /**
1006      * Sets the quote matcher to use.
1007      * <p>
1008      * The quote character is used to wrap data between the tokens.
1009      * This enables delimiters to be entered as data.
1010      * </p>
1011      *
1012      * @param quote  the quote matcher to use, null ignored
1013      * @return this, to enable chaining
1014      */
1015     public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
1016         if (quote != null) {
1017             this.quoteMatcher = quote;
1018         }
1019         return this;
1020     }
1021 
1022     /**
1023      * Sets the matcher for characters to trim.
1024      * <p>
1025      * These characters are trimmed off on each side of the delimiter
1026      * until the token or quote is found.
1027      * </p>
1028      *
1029      * @param trimmer  the trimmer matcher to use, null ignored
1030      * @return this, to enable chaining
1031      */
1032     public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1033         if (trimmer != null) {
1034             this.trimmerMatcher = trimmer;
1035         }
1036         return this;
1037     }
1038 
1039     // API
1040     /**
1041      * Gets the number of tokens found in the String.
1042      *
1043      * @return the number of matched tokens
1044      */
1045     public int size() {
1046         checkTokenized();
1047         return tokens.length;
1048     }
1049 
1050     /**
1051      * Internal method to performs the tokenization.
1052      * <p>
1053      * Most users of this class do not need to call this method. This method
1054      * will be called automatically by other (public) methods when required.
1055      * </p>
1056      * <p>
1057      * This method exists to allow subclasses to add code before or after the
1058      * tokenization. For example, a subclass could alter the character array,
1059      * offset or count to be parsed, or call the tokenizer multiple times on
1060      * multiple strings. It is also be possible to filter the results.
1061      * </p>
1062      * <p>
1063      * {@link StrTokenizer} will always pass a zero offset and a count
1064      * equal to the length of the array to this method, however a subclass
1065      * may pass other values, or even an entirely different array.
1066      * </p>
1067      *
1068      * @param srcChars  the character array being tokenized, may be null
1069      * @param offset  the start position within the character array, must be valid
1070      * @param count  the number of characters to tokenize, must be valid
1071      * @return the modifiable list of String tokens, unmodifiable if null array or zero count
1072      */
1073     protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1074         if (ArrayUtils.isEmpty(srcChars)) {
1075             return Collections.emptyList();
1076         }
1077         final StrBuilder buf = new StrBuilder();
1078         final List<String> tokenList = new ArrayList<>();
1079         int pos = offset;
1080 
1081         // loop around the entire buffer
1082         while (pos >= 0 && pos < count) {
1083             // find next token
1084             pos = readNextToken(srcChars, pos, count, buf, tokenList);
1085 
1086             // handle case where end of string is a delimiter
1087             if (pos >= count) {
1088                 addToken(tokenList, StringUtils.EMPTY);
1089             }
1090         }
1091         return tokenList;
1092     }
1093 
1094     /**
1095      * Gets the String content that the tokenizer is parsing.
1096      *
1097      * @return the string content being parsed
1098      */
1099     @Override
1100     public String toString() {
1101         if (tokens == null) {
1102             return "StrTokenizer[not tokenized yet]";
1103         }
1104         return "StrTokenizer" + getTokenList();
1105     }
1106 
1107 }