001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.lang3.text; 018 019import java.util.ArrayList; 020import java.util.Arrays; 021import java.util.Collections; 022import java.util.List; 023import java.util.ListIterator; 024import java.util.NoSuchElementException; 025import java.util.StringTokenizer; 026 027import org.apache.commons.lang3.ArrayUtils; 028import org.apache.commons.lang3.StringUtils; 029 030/** 031 * Tokenizes a string based on delimiters (separators) 032 * and supporting quoting and ignored character concepts. 033 * <p> 034 * This class can split a String into many smaller strings. It aims 035 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer}, 036 * however it offers much more control and flexibility including implementing 037 * the {@link ListIterator} interface. By default, it is set up 038 * like {@link StringTokenizer}. 039 * </p> 040 * <p> 041 * The input String is split into a number of <em>tokens</em>. 042 * Each token is separated from the next String by a <em>delimiter</em>. 043 * One or more delimiter characters must be specified. 044 * </p> 045 * <p> 046 * Each token may be surrounded by quotes. 047 * The <em>quote</em> matcher specifies the quote character(s). 048 * A quote may be escaped within a quoted section by duplicating itself. 049 * </p> 050 * <p> 051 * Between each token and the delimiter are potentially characters that need trimming. 052 * The <em>trimmer</em> matcher specifies these characters. 053 * One usage might be to trim whitespace characters. 054 * </p> 055 * <p> 056 * At any point outside the quotes there might potentially be invalid characters. 057 * The <em>ignored</em> matcher specifies these characters to be removed. 058 * One usage might be to remove new line characters. 059 * </p> 060 * <p> 061 * Empty tokens may be removed or returned as null. 062 * </p> 063 * <pre> 064 * "a,b,c" - Three tokens "a","b","c" (comma delimiter) 065 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace) 066 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched) 067 * </pre> 068 * 069 * <table> 070 * <caption>StrTokenizer properties and options</caption> 071 * <tr> 072 * <th>Property</th><th>Type</th><th>Default</th> 073 * </tr> 074 * <tr> 075 * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td> 076 * </tr> 077 * <tr> 078 * <td>quote</td><td>NoneMatcher</td><td>{}</td> 079 * </tr> 080 * <tr> 081 * <td>ignore</td><td>NoneMatcher</td><td>{}</td> 082 * </tr> 083 * <tr> 084 * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td> 085 * </tr> 086 * <tr> 087 * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td> 088 * </tr> 089 * </table> 090 * 091 * @since 2.2 092 * @deprecated As of 3.6, use Apache Commons Text 093 * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StringTokenizer.html"> 094 * StringTokenizer</a> instead 095 */ 096@Deprecated 097public class StrTokenizer implements ListIterator<String>, Cloneable { 098 099 private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE; 100 private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE; 101 static { 102 CSV_TOKENIZER_PROTOTYPE = new StrTokenizer(); 103 CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher()); 104 CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher()); 105 CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher()); 106 CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher()); 107 CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false); 108 CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false); 109 110 TSV_TOKENIZER_PROTOTYPE = new StrTokenizer(); 111 TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher()); 112 TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher()); 113 TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher()); 114 TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher()); 115 TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false); 116 TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false); 117 } 118 119 /** 120 * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}. 121 * 122 * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}. 123 */ 124 private static StrTokenizer getCSVClone() { 125 return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone(); 126 } 127 /** 128 * Gets a new tokenizer instance which parses Comma Separated Value strings 129 * initializing it with the given input. The default for CSV processing 130 * will be trim whitespace from both ends (which can be overridden with 131 * the setTrimmer method). 132 * <p> 133 * You must call a "reset" method to set the string which you want to parse. 134 * </p> 135 * @return a new tokenizer instance which parses Comma Separated Value strings 136 */ 137 public static StrTokenizer getCSVInstance() { 138 return getCSVClone(); 139 } 140 /** 141 * Gets a new tokenizer instance which parses Comma Separated Value strings 142 * initializing it with the given input. The default for CSV processing 143 * will be trim whitespace from both ends (which can be overridden with 144 * the setTrimmer method). 145 * 146 * @param input the text to parse 147 * @return a new tokenizer instance which parses Comma Separated Value strings 148 */ 149 public static StrTokenizer getCSVInstance(final char[] input) { 150 final StrTokenizer tok = getCSVClone(); 151 tok.reset(input); 152 return tok; 153 } 154 155 /** 156 * Gets a new tokenizer instance which parses Comma Separated Value strings 157 * initializing it with the given input. The default for CSV processing 158 * will be trim whitespace from both ends (which can be overridden with 159 * the setTrimmer method). 160 * 161 * @param input the text to parse 162 * @return a new tokenizer instance which parses Comma Separated Value strings 163 */ 164 public static StrTokenizer getCSVInstance(final String input) { 165 final StrTokenizer tok = getCSVClone(); 166 tok.reset(input); 167 return tok; 168 } 169 /** 170 * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}. 171 * 172 * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}. 173 */ 174 private static StrTokenizer getTSVClone() { 175 return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone(); 176 } 177 /** 178 * Gets a new tokenizer instance which parses Tab Separated Value strings. 179 * The default for CSV processing will be trim whitespace from both ends 180 * (which can be overridden with the setTrimmer method). 181 * <p> 182 * You must call a "reset" method to set the string which you want to parse. 183 * </p> 184 * @return a new tokenizer instance which parses Tab Separated Value strings. 185 */ 186 public static StrTokenizer getTSVInstance() { 187 return getTSVClone(); 188 } 189 /** 190 * Gets a new tokenizer instance which parses Tab Separated Value strings. 191 * The default for CSV processing will be trim whitespace from both ends 192 * (which can be overridden with the setTrimmer method). 193 * @param input the string to parse 194 * @return a new tokenizer instance which parses Tab Separated Value strings. 195 */ 196 public static StrTokenizer getTSVInstance(final char[] input) { 197 final StrTokenizer tok = getTSVClone(); 198 tok.reset(input); 199 return tok; 200 } 201 202 /** 203 * Gets a new tokenizer instance which parses Tab Separated Value strings. 204 * The default for CSV processing will be trim whitespace from both ends 205 * (which can be overridden with the setTrimmer method). 206 * @param input the string to parse 207 * @return a new tokenizer instance which parses Tab Separated Value strings. 208 */ 209 public static StrTokenizer getTSVInstance(final String input) { 210 final StrTokenizer tok = getTSVClone(); 211 tok.reset(input); 212 return tok; 213 } 214 /** The text to work on. */ 215 private char[] chars; 216 217 /** The parsed tokens */ 218 private String[] tokens; 219 220 /** The current iteration position */ 221 private int tokenPos; 222 223 /** The delimiter matcher */ 224 private StrMatcher delimMatcher = StrMatcher.splitMatcher(); 225 226 /** The quote matcher */ 227 private StrMatcher quoteMatcher = StrMatcher.noneMatcher(); 228 229 /** The ignored matcher */ 230 private StrMatcher ignoredMatcher = StrMatcher.noneMatcher(); 231 232 /** The trimmer matcher */ 233 private StrMatcher trimmerMatcher = StrMatcher.noneMatcher(); 234 235 /** Whether to return empty tokens as null */ 236 private boolean emptyAsNull; 237 238 /** Whether to ignore empty tokens */ 239 private boolean ignoreEmptyTokens = true; 240 241 /** 242 * Constructs a tokenizer splitting on space, tab, newline and formfeed 243 * as per StringTokenizer, but with no text to tokenize. 244 * <p> 245 * This constructor is normally used with {@link #reset(String)}. 246 * </p> 247 */ 248 public StrTokenizer() { 249 this.chars = null; 250 } 251 252 /** 253 * Constructs a tokenizer splitting on space, tab, newline and formfeed 254 * as per StringTokenizer. 255 * 256 * @param input the string which is to be parsed, not cloned 257 */ 258 public StrTokenizer(final char[] input) { 259 this.chars = ArrayUtils.clone(input); 260 } 261 262 /** 263 * Constructs a tokenizer splitting on the specified character. 264 * 265 * @param input the string which is to be parsed, not cloned 266 * @param delim the field delimiter character 267 */ 268 public StrTokenizer(final char[] input, final char delim) { 269 this(input); 270 setDelimiterChar(delim); 271 } 272 273 /** 274 * Constructs a tokenizer splitting on the specified delimiter character 275 * and handling quotes using the specified quote character. 276 * 277 * @param input the string which is to be parsed, not cloned 278 * @param delim the field delimiter character 279 * @param quote the field quoted string character 280 */ 281 public StrTokenizer(final char[] input, final char delim, final char quote) { 282 this(input, delim); 283 setQuoteChar(quote); 284 } 285 286 /** 287 * Constructs a tokenizer splitting on the specified string. 288 * 289 * @param input the string which is to be parsed, not cloned 290 * @param delim the field delimiter string 291 */ 292 public StrTokenizer(final char[] input, final String delim) { 293 this(input); 294 setDelimiterString(delim); 295 } 296 297 /** 298 * Constructs a tokenizer splitting using the specified delimiter matcher. 299 * 300 * @param input the string which is to be parsed, not cloned 301 * @param delim the field delimiter matcher 302 */ 303 public StrTokenizer(final char[] input, final StrMatcher delim) { 304 this(input); 305 setDelimiterMatcher(delim); 306 } 307 308 /** 309 * Constructs a tokenizer splitting using the specified delimiter matcher 310 * and handling quotes using the specified quote matcher. 311 * 312 * @param input the string which is to be parsed, not cloned 313 * @param delim the field delimiter character 314 * @param quote the field quoted string character 315 */ 316 public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) { 317 this(input, delim); 318 setQuoteMatcher(quote); 319 } 320 321 /** 322 * Constructs a tokenizer splitting on space, tab, newline and formfeed 323 * as per StringTokenizer. 324 * 325 * @param input the string which is to be parsed 326 */ 327 public StrTokenizer(final String input) { 328 if (input != null) { 329 chars = input.toCharArray(); 330 } else { 331 chars = null; 332 } 333 } 334 335 /** 336 * Constructs a tokenizer splitting on the specified delimiter character. 337 * 338 * @param input the string which is to be parsed 339 * @param delim the field delimiter character 340 */ 341 public StrTokenizer(final String input, final char delim) { 342 this(input); 343 setDelimiterChar(delim); 344 } 345 346 /** 347 * Constructs a tokenizer splitting on the specified delimiter character 348 * and handling quotes using the specified quote character. 349 * 350 * @param input the string which is to be parsed 351 * @param delim the field delimiter character 352 * @param quote the field quoted string character 353 */ 354 public StrTokenizer(final String input, final char delim, final char quote) { 355 this(input, delim); 356 setQuoteChar(quote); 357 } 358 359 /** 360 * Constructs a tokenizer splitting on the specified delimiter string. 361 * 362 * @param input the string which is to be parsed 363 * @param delim the field delimiter string 364 */ 365 public StrTokenizer(final String input, final String delim) { 366 this(input); 367 setDelimiterString(delim); 368 } 369 370 /** 371 * Constructs a tokenizer splitting using the specified delimiter matcher. 372 * 373 * @param input the string which is to be parsed 374 * @param delim the field delimiter matcher 375 */ 376 public StrTokenizer(final String input, final StrMatcher delim) { 377 this(input); 378 setDelimiterMatcher(delim); 379 } 380 381 /** 382 * Constructs a tokenizer splitting using the specified delimiter matcher 383 * and handling quotes using the specified quote matcher. 384 * 385 * @param input the string which is to be parsed 386 * @param delim the field delimiter matcher 387 * @param quote the field quoted string matcher 388 */ 389 public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) { 390 this(input, delim); 391 setQuoteMatcher(quote); 392 } 393 394 /** 395 * Unsupported ListIterator operation. 396 * @param obj this parameter ignored. 397 * @throws UnsupportedOperationException always 398 */ 399 @Override 400 public void add(final String obj) { 401 throw new UnsupportedOperationException("add() is unsupported"); 402 } 403 404 /** 405 * Adds a token to a list, paying attention to the parameters we've set. 406 * 407 * @param list the list to add to 408 * @param tok the token to add 409 */ 410 private void addToken(final List<String> list, String tok) { 411 if (StringUtils.isEmpty(tok)) { 412 if (isIgnoreEmptyTokens()) { 413 return; 414 } 415 if (isEmptyTokenAsNull()) { 416 tok = null; 417 } 418 } 419 list.add(tok); 420 } 421 422 /** 423 * Checks if tokenization has been done, and if not then do it. 424 */ 425 private void checkTokenized() { 426 if (tokens == null) { 427 if (chars == null) { 428 // still call tokenize as subclass may do some work 429 final List<String> split = tokenize(null, 0, 0); 430 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY); 431 } else { 432 final List<String> split = tokenize(chars, 0, chars.length); 433 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY); 434 } 435 } 436 } 437 438 /** 439 * Creates a new instance of this Tokenizer. The new instance is reset so 440 * that it will be at the start of the token list. 441 * If a {@link CloneNotSupportedException} is caught, return {@code null}. 442 * 443 * @return a new instance of this Tokenizer which has been reset. 444 */ 445 @Override 446 public Object clone() { 447 try { 448 return cloneReset(); 449 } catch (final CloneNotSupportedException ex) { 450 return null; 451 } 452 } 453 454 /** 455 * Creates a new instance of this Tokenizer. The new instance is reset so that 456 * it will be at the start of the token list. 457 * 458 * @return a new instance of this Tokenizer which has been reset. 459 * @throws CloneNotSupportedException if there is a problem cloning 460 */ 461 Object cloneReset() throws CloneNotSupportedException { 462 // this method exists to enable 100% test coverage 463 final StrTokenizer cloned = (StrTokenizer) super.clone(); 464 if (cloned.chars != null) { 465 cloned.chars = cloned.chars.clone(); 466 } 467 cloned.reset(); 468 return cloned; 469 } 470 471 /** 472 * Gets the String content that the tokenizer is parsing. 473 * 474 * @return the string content being parsed 475 */ 476 public String getContent() { 477 if (chars == null) { 478 return null; 479 } 480 return new String(chars); 481 } 482 483 /** 484 * Gets the field delimiter matcher. 485 * 486 * @return the delimiter matcher in use 487 */ 488 public StrMatcher getDelimiterMatcher() { 489 return this.delimMatcher; 490 } 491 492 // Ignored 493 /** 494 * Gets the ignored character matcher. 495 * <p> 496 * These characters are ignored when parsing the String, unless they are 497 * within a quoted region. 498 * The default value is not to ignore anything. 499 * </p> 500 * 501 * @return the ignored matcher in use 502 */ 503 public StrMatcher getIgnoredMatcher() { 504 return ignoredMatcher; 505 } 506 507 /** 508 * Gets the quote matcher currently in use. 509 * <p> 510 * The quote character is used to wrap data between the tokens. 511 * This enables delimiters to be entered as data. 512 * The default value is '"' (double quote). 513 * </p> 514 * 515 * @return the quote matcher in use 516 */ 517 public StrMatcher getQuoteMatcher() { 518 return quoteMatcher; 519 } 520 521 /** 522 * Gets a copy of the full token list as an independent modifiable array. 523 * 524 * @return the tokens as a String array 525 */ 526 public String[] getTokenArray() { 527 checkTokenized(); 528 return tokens.clone(); 529 } 530 531 /** 532 * Gets a copy of the full token list as an independent modifiable list. 533 * 534 * @return the tokens as a String array 535 */ 536 public List<String> getTokenList() { 537 checkTokenized(); 538 final List<String> list = new ArrayList<>(tokens.length); 539 list.addAll(Arrays.asList(tokens)); 540 return list; 541 } 542 543 /** 544 * Gets the trimmer character matcher. 545 * <p> 546 * These characters are trimmed off on each side of the delimiter 547 * until the token or quote is found. 548 * The default value is not to trim anything. 549 * </p> 550 * 551 * @return the trimmer matcher in use 552 */ 553 public StrMatcher getTrimmerMatcher() { 554 return trimmerMatcher; 555 } 556 557 /** 558 * Checks whether there are any more tokens. 559 * 560 * @return true if there are more tokens 561 */ 562 @Override 563 public boolean hasNext() { 564 checkTokenized(); 565 return tokenPos < tokens.length; 566 } 567 568 /** 569 * Checks whether there are any previous tokens that can be iterated to. 570 * 571 * @return true if there are previous tokens 572 */ 573 @Override 574 public boolean hasPrevious() { 575 checkTokenized(); 576 return tokenPos > 0; 577 } 578 579 /** 580 * Gets whether the tokenizer currently returns empty tokens as null. 581 * The default for this property is false. 582 * 583 * @return true if empty tokens are returned as null 584 */ 585 public boolean isEmptyTokenAsNull() { 586 return this.emptyAsNull; 587 } 588 589 /** 590 * Gets whether the tokenizer currently ignores empty tokens. 591 * The default for this property is true. 592 * 593 * @return true if empty tokens are not returned 594 */ 595 public boolean isIgnoreEmptyTokens() { 596 return ignoreEmptyTokens; 597 } 598 599 /** 600 * Checks if the characters at the index specified match the quote 601 * already matched in readNextToken(). 602 * 603 * @param srcChars the character array being tokenized 604 * @param pos the position to check for a quote 605 * @param len the length of the character array being tokenized 606 * @param quoteStart the start position of the matched quote, 0 if no quoting 607 * @param quoteLen the length of the matched quote, 0 if no quoting 608 * @return true if a quote is matched 609 */ 610 private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) { 611 for (int i = 0; i < quoteLen; i++) { 612 if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) { 613 return false; 614 } 615 } 616 return true; 617 } 618 619 /** 620 * Gets the next token. 621 * 622 * @return the next String token 623 * @throws NoSuchElementException if there are no more elements 624 */ 625 @Override 626 public String next() { 627 if (hasNext()) { 628 return tokens[tokenPos++]; 629 } 630 throw new NoSuchElementException(); 631 } 632 633 /** 634 * Gets the index of the next token to return. 635 * 636 * @return the next token index 637 */ 638 @Override 639 public int nextIndex() { 640 return tokenPos; 641 } 642 643 /** 644 * Gets the next token from the String. 645 * Equivalent to {@link #next()} except it returns null rather than 646 * throwing {@link NoSuchElementException} when no tokens remain. 647 * 648 * @return the next sequential token, or null when no more tokens are found 649 */ 650 public String nextToken() { 651 if (hasNext()) { 652 return tokens[tokenPos++]; 653 } 654 return null; 655 } 656 657 /** 658 * Gets the token previous to the last returned token. 659 * 660 * @return the previous token 661 */ 662 @Override 663 public String previous() { 664 if (hasPrevious()) { 665 return tokens[--tokenPos]; 666 } 667 throw new NoSuchElementException(); 668 } 669 670 /** 671 * Gets the index of the previous token. 672 * 673 * @return the previous token index 674 */ 675 @Override 676 public int previousIndex() { 677 return tokenPos - 1; 678 } 679 680 /** 681 * Gets the previous token from the String. 682 * 683 * @return the previous sequential token, or null when no more tokens are found 684 */ 685 public String previousToken() { 686 if (hasPrevious()) { 687 return tokens[--tokenPos]; 688 } 689 return null; 690 } 691 692 /** 693 * Reads character by character through the String to get the next token. 694 * 695 * @param srcChars the character array being tokenized 696 * @param start the first character of field 697 * @param len the length of the character array being tokenized 698 * @param workArea a temporary work area 699 * @param tokenList the list of parsed tokens 700 * @return the starting position of the next field (the character 701 * immediately after the delimiter), or -1 if end of string found 702 */ 703 private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) { 704 // skip all leading whitespace, unless it is the 705 // field delimiter or the quote character 706 while (start < len) { 707 final int removeLen = Math.max( 708 getIgnoredMatcher().isMatch(srcChars, start, start, len), 709 getTrimmerMatcher().isMatch(srcChars, start, start, len)); 710 if (removeLen == 0 || 711 getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 || 712 getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) { 713 break; 714 } 715 start += removeLen; 716 } 717 718 // handle reaching end 719 if (start >= len) { 720 addToken(tokenList, StringUtils.EMPTY); 721 return -1; 722 } 723 724 // handle empty token 725 final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len); 726 if (delimLen > 0) { 727 addToken(tokenList, StringUtils.EMPTY); 728 return start + delimLen; 729 } 730 731 // handle found token 732 final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len); 733 if (quoteLen > 0) { 734 return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen); 735 } 736 return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0); 737 } 738 739 /** 740 * Reads a possibly quoted string token. 741 * 742 * @param srcChars the character array being tokenized 743 * @param start the first character of field 744 * @param len the length of the character array being tokenized 745 * @param workArea a temporary work area 746 * @param tokenList the list of parsed tokens 747 * @param quoteStart the start position of the matched quote, 0 if no quoting 748 * @param quoteLen the length of the matched quote, 0 if no quoting 749 * @return the starting position of the next field (the character 750 * immediately after the delimiter, or if end of string found, 751 * then the length of string 752 */ 753 private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea, 754 final List<String> tokenList, final int quoteStart, final int quoteLen) { 755 // Loop until we've found the end of the quoted 756 // string or the end of the input 757 workArea.clear(); 758 int pos = start; 759 boolean quoting = quoteLen > 0; 760 int trimStart = 0; 761 762 while (pos < len) { 763 // quoting mode can occur several times throughout a string 764 // we must switch between quoting and non-quoting until we 765 // encounter a non-quoted delimiter, or end of string 766 if (quoting) { 767 // In quoting mode 768 769 // If we've found a quote character, see if it's 770 // followed by a second quote. If so, then we need 771 // to actually put the quote character into the token 772 // rather than end the token. 773 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) { 774 if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) { 775 // matched pair of quotes, thus an escaped quote 776 workArea.append(srcChars, pos, quoteLen); 777 pos += quoteLen * 2; 778 trimStart = workArea.size(); 779 continue; 780 } 781 782 // end of quoting 783 quoting = false; 784 pos += quoteLen; 785 continue; 786 } 787 788 } else { 789 // Not in quoting mode 790 791 // check for delimiter, and thus end of token 792 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len); 793 if (delimLen > 0) { 794 // return condition when end of token found 795 addToken(tokenList, workArea.substring(0, trimStart)); 796 return pos + delimLen; 797 } 798 799 // check for quote, and thus back into quoting mode 800 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) { 801 quoting = true; 802 pos += quoteLen; 803 continue; 804 } 805 806 // check for ignored (outside quotes), and ignore 807 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len); 808 if (ignoredLen > 0) { 809 pos += ignoredLen; 810 continue; 811 } 812 813 // check for trimmed character 814 // don't yet know if it's at the end, so copy to workArea 815 // use trimStart to keep track of trim at the end 816 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len); 817 if (trimmedLen > 0) { 818 workArea.append(srcChars, pos, trimmedLen); 819 pos += trimmedLen; 820 continue; 821 } 822 } 823 // copy regular character from inside quotes 824 workArea.append(srcChars[pos++]); 825 trimStart = workArea.size(); 826 } 827 828 // return condition when end of string found 829 addToken(tokenList, workArea.substring(0, trimStart)); 830 return -1; 831 } 832 833 /** 834 * Unsupported ListIterator operation. 835 * 836 * @throws UnsupportedOperationException always 837 */ 838 @Override 839 public void remove() { 840 throw new UnsupportedOperationException("remove() is unsupported"); 841 } 842 843 /** 844 * Resets this tokenizer, forgetting all parsing and iteration already completed. 845 * <p> 846 * This method allows the same tokenizer to be reused for the same String. 847 * </p> 848 * 849 * @return this, to enable chaining 850 */ 851 public StrTokenizer reset() { 852 tokenPos = 0; 853 tokens = null; 854 return this; 855 } 856 857 /** 858 * Reset this tokenizer, giving it a new input string to parse. 859 * In this manner you can re-use a tokenizer with the same settings 860 * on multiple input lines. 861 * 862 * @param input the new character array to tokenize, not cloned, null sets no text to parse 863 * @return this, to enable chaining 864 */ 865 public StrTokenizer reset(final char[] input) { 866 reset(); 867 this.chars = ArrayUtils.clone(input); 868 return this; 869 } 870 871 /** 872 * Reset this tokenizer, giving it a new input string to parse. 873 * In this manner you can re-use a tokenizer with the same settings 874 * on multiple input lines. 875 * 876 * @param input the new string to tokenize, null sets no text to parse 877 * @return this, to enable chaining 878 */ 879 public StrTokenizer reset(final String input) { 880 reset(); 881 if (input != null) { 882 this.chars = input.toCharArray(); 883 } else { 884 this.chars = null; 885 } 886 return this; 887 } 888 889 /** 890 * Unsupported ListIterator operation. 891 * @param obj this parameter ignored. 892 * @throws UnsupportedOperationException always 893 */ 894 @Override 895 public void set(final String obj) { 896 throw new UnsupportedOperationException("set() is unsupported"); 897 } 898 899 /** 900 * Sets the field delimiter character. 901 * 902 * @param delim the delimiter character to use 903 * @return this, to enable chaining 904 */ 905 public StrTokenizer setDelimiterChar(final char delim) { 906 return setDelimiterMatcher(StrMatcher.charMatcher(delim)); 907 } 908 909 /** 910 * Sets the field delimiter matcher. 911 * <p> 912 * The delimiter is used to separate one token from another. 913 * </p> 914 * 915 * @param delim the delimiter matcher to use 916 * @return this, to enable chaining 917 */ 918 public StrTokenizer setDelimiterMatcher(final StrMatcher delim) { 919 if (delim == null) { 920 this.delimMatcher = StrMatcher.noneMatcher(); 921 } else { 922 this.delimMatcher = delim; 923 } 924 return this; 925 } 926 927 /** 928 * Sets the field delimiter string. 929 * 930 * @param delim the delimiter string to use 931 * @return this, to enable chaining 932 */ 933 public StrTokenizer setDelimiterString(final String delim) { 934 return setDelimiterMatcher(StrMatcher.stringMatcher(delim)); 935 } 936 937 /** 938 * Sets whether the tokenizer should return empty tokens as null. 939 * The default for this property is false. 940 * 941 * @param emptyAsNull whether empty tokens are returned as null 942 * @return this, to enable chaining 943 */ 944 public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) { 945 this.emptyAsNull = emptyAsNull; 946 return this; 947 } 948 949 /** 950 * Sets the character to ignore. 951 * <p> 952 * This character is ignored when parsing the String, unless it is 953 * within a quoted region. 954 * 955 * @param ignored the ignored character to use 956 * @return this, to enable chaining 957 */ 958 public StrTokenizer setIgnoredChar(final char ignored) { 959 return setIgnoredMatcher(StrMatcher.charMatcher(ignored)); 960 } 961 962 /** 963 * Sets the matcher for characters to ignore. 964 * <p> 965 * These characters are ignored when parsing the String, unless they are 966 * within a quoted region. 967 * </p> 968 * 969 * @param ignored the ignored matcher to use, null ignored 970 * @return this, to enable chaining 971 */ 972 public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) { 973 if (ignored != null) { 974 this.ignoredMatcher = ignored; 975 } 976 return this; 977 } 978 979 /** 980 * Sets whether the tokenizer should ignore and not return empty tokens. 981 * The default for this property is true. 982 * 983 * @param ignoreEmptyTokens whether empty tokens are not returned 984 * @return this, to enable chaining 985 */ 986 public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) { 987 this.ignoreEmptyTokens = ignoreEmptyTokens; 988 return this; 989 } 990 991 /** 992 * Sets the quote character to use. 993 * <p> 994 * The quote character is used to wrap data between the tokens. 995 * This enables delimiters to be entered as data. 996 * </p> 997 * 998 * @param quote the quote character to use 999 * @return this, to enable chaining 1000 */ 1001 public StrTokenizer setQuoteChar(final char quote) { 1002 return setQuoteMatcher(StrMatcher.charMatcher(quote)); 1003 } 1004 1005 /** 1006 * Sets the quote matcher to use. 1007 * <p> 1008 * The quote character is used to wrap data between the tokens. 1009 * This enables delimiters to be entered as data. 1010 * </p> 1011 * 1012 * @param quote the quote matcher to use, null ignored 1013 * @return this, to enable chaining 1014 */ 1015 public StrTokenizer setQuoteMatcher(final StrMatcher quote) { 1016 if (quote != null) { 1017 this.quoteMatcher = quote; 1018 } 1019 return this; 1020 } 1021 1022 /** 1023 * Sets the matcher for characters to trim. 1024 * <p> 1025 * These characters are trimmed off on each side of the delimiter 1026 * until the token or quote is found. 1027 * </p> 1028 * 1029 * @param trimmer the trimmer matcher to use, null ignored 1030 * @return this, to enable chaining 1031 */ 1032 public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) { 1033 if (trimmer != null) { 1034 this.trimmerMatcher = trimmer; 1035 } 1036 return this; 1037 } 1038 1039 // API 1040 /** 1041 * Gets the number of tokens found in the String. 1042 * 1043 * @return the number of matched tokens 1044 */ 1045 public int size() { 1046 checkTokenized(); 1047 return tokens.length; 1048 } 1049 1050 /** 1051 * Internal method to performs the tokenization. 1052 * <p> 1053 * Most users of this class do not need to call this method. This method 1054 * will be called automatically by other (public) methods when required. 1055 * </p> 1056 * <p> 1057 * This method exists to allow subclasses to add code before or after the 1058 * tokenization. For example, a subclass could alter the character array, 1059 * offset or count to be parsed, or call the tokenizer multiple times on 1060 * multiple strings. It is also be possible to filter the results. 1061 * </p> 1062 * <p> 1063 * {@link StrTokenizer} will always pass a zero offset and a count 1064 * equal to the length of the array to this method, however a subclass 1065 * may pass other values, or even an entirely different array. 1066 * </p> 1067 * 1068 * @param srcChars the character array being tokenized, may be null 1069 * @param offset the start position within the character array, must be valid 1070 * @param count the number of characters to tokenize, must be valid 1071 * @return the modifiable list of String tokens, unmodifiable if null array or zero count 1072 */ 1073 protected List<String> tokenize(final char[] srcChars, final int offset, final int count) { 1074 if (ArrayUtils.isEmpty(srcChars)) { 1075 return Collections.emptyList(); 1076 } 1077 final StrBuilder buf = new StrBuilder(); 1078 final List<String> tokenList = new ArrayList<>(); 1079 int pos = offset; 1080 1081 // loop around the entire buffer 1082 while (pos >= 0 && pos < count) { 1083 // find next token 1084 pos = readNextToken(srcChars, pos, count, buf, tokenList); 1085 1086 // handle case where end of string is a delimiter 1087 if (pos >= count) { 1088 addToken(tokenList, StringUtils.EMPTY); 1089 } 1090 } 1091 return tokenList; 1092 } 1093 1094 /** 1095 * Gets the String content that the tokenizer is parsing. 1096 * 1097 * @return the string content being parsed 1098 */ 1099 @Override 1100 public String toString() { 1101 if (tokens == null) { 1102 return "StrTokenizer[not tokenized yet]"; 1103 } 1104 return "StrTokenizer" + getTokenList(); 1105 } 1106 1107}