/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

/**
 * Tokenizes a string based on delimiters (separators),
 * supporting quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * however it offers much more control and flexibility including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
 * like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next String by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 * <p>
 *
 * This tokenizer has the following properties and options:
 *
 * <table summary="Tokenizer Properties">
 *  <tr>
 *   <th>Property</th><th>Type</th><th>Default</th>
 *  </tr>
 *  <tr>
 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 *  </tr>
 *  <tr>
 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 *  </tr>
 *  <tr>
 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 *  </tr>
 * </table>
 *
 * @since 1.0
 */
public class StrTokenizer implements ListIterator<String>, Cloneable {
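
    // Illustrative usage only (a sketch, not part of the class API): by default the
    // tokenizer behaves like StringTokenizer and splits on whitespace.
    //
    //   StrTokenizer tok = new StrTokenizer("one two\tthree");
    //   while (tok.hasNext()) {
    //       System.out.println(tok.next());   // expected output: one, two, three
    //   }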

    /** Comma separated values tokenizer internal variable. */
    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    /** Tab separated values tokenizer internal variable. */
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char[] chars;
    /** The parsed tokens. */
    private String[] tokens;
    /** The current iteration position. */
    private int tokenPos;

    /** The delimiter matcher. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null. */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens. */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final String input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }
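
    // Illustrative use of the CSV factories above (a sketch; the output shown is what
    // the documented CSV defaults imply, not a verified transcript):
    //
    //   StrTokenizer csv = StrTokenizer.getCSVInstance();
    //   csv.reset("a, \"b,c\", d");
    //   System.out.println(csv.getTokenList());   // expected: [a, b,c, d]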

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final char[] input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * @param input the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final String input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * @param input the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final char[] input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input the string which is to be parsed
     */
    public StrTokenizer(final String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter character
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter string
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input the character array which is to be parsed, cloned internally
     */
    public StrTokenizer(final char[] input) {
        super();
        if (input == null) {
            this.chars = null;
        } else {
            this.chars = input.clone();
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input the character array which is to be parsed, cloned internally
     * @param delim the field delimiter character
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input the character array which is to be parsed, cloned internally
     * @param delim the field delimiter string
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input the character array which is to be parsed, cloned internally
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input the character array which is to be parsed, cloned internally
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }
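
    // Illustrative use of the delimiter/quote constructors above (a sketch):
    //
    //   // split on ';' and treat single quotes as the quote character
    //   StrTokenizer tok = new StrTokenizer("a;'b;c';d", ';', '\'');
    //   // expected tokens, given the behaviour documented in this class: a, b;c, d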

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input the character array which is to be parsed, cloned internally
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<>(tokens.length);
        for (final String element : tokens) {
            list.add(element);
        }
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input the new character array to tokenize, cloned internally, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final char[] input) {
        reset();
        if (input != null) {
            this.chars = input.clone();
        } else {
            this.chars = null;
        }
        return this;
    }
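
    // Illustrative reuse of one tokenizer across several input lines (a sketch):
    //
    //   StrTokenizer tok = new StrTokenizer().setDelimiterChar(',');
    //   for (String line : new String[] {"a,b", "c,d"}) {
    //       System.out.println(tok.reset(line).getTokenList());   // expected: [a, b] then [c, d]
    //   }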

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     * @throws NoSuchElementException if there is no previous element
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter is ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter is ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not then does it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                final List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                final List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }
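
    // Illustrative subclass sketch (an assumed example, not part of this class) of the
    // extension point described on tokenize(char[], int, int) below: post-process tokens.
    //
    //   class UpperCaseTokenizer extends StrTokenizer {
    //       UpperCaseTokenizer(String input) { super(input, ','); }
    //       @Override
    //       protected List<String> tokenize(char[] srcChars, int offset, int count) {
    //           List<String> raw = super.tokenize(srcChars, offset, count);
    //           List<String> out = new ArrayList<>(raw.size());
    //           for (String s : raw) { out.add(s == null ? null : s.toUpperCase()); }
    //           return out;
    //       }
    //   }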

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method, however a subclass
     * may pass other values, or even an entirely different array.
     *
     * @param srcChars the character array being tokenized, may be null
     * @param offset the start position within the character array, must be valid
     * @param count the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final StrBuilder buf = new StrBuilder();
        final List<String> tokenList = new ArrayList<>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, "");
            }
        }
        return tokenList;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list the list to add to
     * @param tok the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (tok == null || tok.length() == 0) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param srcChars the character array being tokenized
     * @param start the first character of the field
     * @param len the length of the character array being tokenized
     * @param workArea a temporary work area
     * @param tokenList the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(final char[] srcChars,
                              int start,
                              final int len,
                              final StrBuilder workArea,
                              final List<String> tokenList) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0
                    || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
                    || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, "");
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, "");
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }
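
    // Illustrative quote handling (a sketch of the behaviour readWithQuotes below implements):
    //
    //   StrTokenizer tok = new StrTokenizer("a,\"b\"\"c\",d", ',', '"');
    //   // the doubled quote inside the quoted section is an escaped quote, so the
    //   // expected tokens are: a, b"c, d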

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars the character array being tokenized
     * @param start the first character of the field
     * @param len the length of the character array being tokenized
     * @param workArea a temporary work area
     * @param tokenList the list of parsed tokens
     * @param quoteStart the start position of the matched quote, 0 if no quoting
     * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if the end of the string was found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param srcChars the character array being tokenized
     * @param pos the position to check for a quote
     * @param len the length of the character array being tokenized
     * @param quoteStart the start position of the matched quote, 0 if no quoting
     * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars,
                            final int pos,
                            final int len,
                            final int quoteStart,
                            final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
     *
     * @param delim the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is the none matcher (no quoting).
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }
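
    // Illustrative fluent configuration (a sketch) using the setters above:
    //
    //   StrTokenizer tok = new StrTokenizer()
    //           .setDelimiterString("::")
    //           .setQuoteChar('"');
    //   tok.reset("a::\"b::c\"");
    //   // expected tokens: a, b::c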

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     *
     * @param trimmer the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }
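
    // Illustrative effect of the two empty-token properties (emptyTokenAsNull above,
    // ignoreEmptyTokens below); a sketch, with expected results implied by their documentation:
    //
    //   StrTokenizer tok = new StrTokenizer("a,,c", ',');
    //   tok.getTokenList();                              // expected: [a, c]   (empties ignored by default)
    //   tok.reset().setIgnoreEmptyTokens(false);
    //   tok.getTokenList();                              // expected: [a, , c] (empty token kept as "")
    //   tok.reset().setIgnoreEmptyTokens(false).setEmptyTokenAsNull(true);
    //   tok.getTokenList();                              // expected: [a, null, c]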

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, <code>null</code> is returned.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Returns a string representation of this tokenizer, listing the parsed
     * tokens if tokenization has already occurred.
     *
     * @return a string representation of the tokenizer state
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}