001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language; 019 020import org.apache.commons.codec.EncoderException; 021import org.apache.commons.codec.StringEncoder; 022import org.apache.commons.codec.binary.StringUtils; 023 024/** 025 * Encodes a string into a double metaphone value. This Implementation is based on the algorithm by <CITE>Lawrence 026 * Philips</CITE>. 027 * <p> 028 * This class is conditionally thread-safe. The instance field for the maximum code length is mutable 029 * {@link #setMaxCodeLen(int)} but is not volatile, and accesses are not synchronized. If an instance of the class is 030 * shared between threads, the caller needs to ensure that suitable synchronization is used to ensure safe publication 031 * of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} after initial setup. 032 * </p> 033 * 034 * @see <a href="https://drdobbs.com/the-double-metaphone-search-algorithm/184401251?pgno=2">Dr. Dobbs Original Article</a> 035 * @see <a href="https://en.wikipedia.org/wiki/Metaphone">Wikipedia Metaphone</a> 036 */ 037public class DoubleMetaphone implements StringEncoder { 038 039 /** 040 * Stores results, since there is the optional alternate encoding. 041 */ 042 public class DoubleMetaphoneResult { 043 044 private final StringBuilder primary = new StringBuilder(getMaxCodeLen()); 045 private final StringBuilder alternate = new StringBuilder(getMaxCodeLen()); 046 private final int maxLength; 047 048 /** 049 * Constructs a new instance. 050 * 051 * @param maxLength The maximum length. 052 */ 053 public DoubleMetaphoneResult(final int maxLength) { 054 this.maxLength = maxLength; 055 } 056 057 /** 058 * Appends the given value as primary and alternative. 059 * 060 * @param value The value to append. 061 */ 062 public void append(final char value) { 063 appendPrimary(value); 064 appendAlternate(value); 065 } 066 067 /** 068 * Appends the given primary and alternative values. 069 * 070 * @param primary The primary value. 071 * @param alternate The alternate value. 072 */ 073 public void append(final char primary, final char alternate) { 074 appendPrimary(primary); 075 appendAlternate(alternate); 076 } 077 078 /** 079 * Appends the given value as primary and alternative. 080 * 081 * @param value The value to append. 082 */ 083 public void append(final String value) { 084 appendPrimary(value); 085 appendAlternate(value); 086 } 087 088 /** 089 * Appends the given primary and alternative values. 090 * 091 * @param primary The primary value. 092 * @param alternate The alternate value. 093 */ 094 public void append(final String primary, final String alternate) { 095 appendPrimary(primary); 096 appendAlternate(alternate); 097 } 098 099 /** 100 * Appends the given value as alternative. 101 * 102 * @param value The value to append. 103 */ 104 public void appendAlternate(final char value) { 105 if (this.alternate.length() < this.maxLength) { 106 this.alternate.append(value); 107 } 108 } 109 110 /** 111 * Appends the given value as alternative. 112 * 113 * @param value The value to append. 114 */ 115 public void appendAlternate(final String value) { 116 final int addChars = this.maxLength - this.alternate.length(); 117 if (value.length() <= addChars) { 118 this.alternate.append(value); 119 } else { 120 this.alternate.append(value, 0, addChars); 121 } 122 } 123 124 /** 125 * Appends the given value as primary. 126 * 127 * @param value The value to append. 128 */ 129 public void appendPrimary(final char value) { 130 if (this.primary.length() < this.maxLength) { 131 this.primary.append(value); 132 } 133 } 134 135 /** 136 * Appends the given value as primary. 137 * 138 * @param value The value to append. 139 */ 140 public void appendPrimary(final String value) { 141 final int addChars = this.maxLength - this.primary.length(); 142 if (value.length() <= addChars) { 143 this.primary.append(value); 144 } else { 145 this.primary.append(value, 0, addChars); 146 } 147 } 148 149 /** 150 * Gets the alternate string. 151 * 152 * @return the alternate string. 153 */ 154 public String getAlternate() { 155 return this.alternate.toString(); 156 } 157 158 /** 159 * Gets the primary string. 160 * 161 * @return the primary string. 162 */ 163 public String getPrimary() { 164 return this.primary.toString(); 165 } 166 167 /** 168 * Tests whether this result is complete. 169 * 170 * @return whether this result is complete. 171 */ 172 public boolean isComplete() { 173 return this.primary.length() >= this.maxLength && this.alternate.length() >= this.maxLength; 174 } 175 } 176 177 /** 178 * "Vowels" to test. 179 */ 180 private static final String VOWELS = "AEIOUY"; 181 182 /** 183 * Prefixes when present which are not pronounced. 184 */ 185 private static final String[] SILENT_START = { "GN", "KN", "PN", "WR", "PS" }; 186 187 private static final String[] L_R_N_M_B_H_F_V_W_SPACE = { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " }; 188 private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER = { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" }; 189 private static final String[] L_T_K_S_N_M_B_Z = { "L", "T", "K", "S", "N", "M", "B", "Z" }; 190 191 /** 192 * Tests whether {@code value} contains any of the {@code criteria} starting at index {@code start} and matching up to length {@code length}. 193 * 194 * @param value The value to test. 195 * @param start Where in {@code value} to start testing. 196 * @param length How many to test. 197 * @param criteria The search criteria. 198 * @return Whether there was a match. 199 */ 200 protected static boolean contains(final String value, final int start, final int length, final String... criteria) { 201 boolean result = false; 202 if (start >= 0 && start + length <= value.length()) { 203 final String target = value.substring(start, start + length); 204 for (final String element : criteria) { 205 if (target.equals(element)) { 206 result = true; 207 break; 208 } 209 } 210 } 211 return result; 212 } 213 214 /** 215 * Maximum length of an encoding, default is 4 216 */ 217 private int maxCodeLen = 4; 218 219 /** 220 * Constructs a new instance. 221 */ 222 public DoubleMetaphone() { 223 // empty 224 } 225 226 /** 227 * Gets the character at index {@code index} if available, or {@link Character#MIN_VALUE} if out of bounds. 228 * 229 * @param value The String to query. 230 * @param index A string index. 231 * @return The character at the index or {@link Character#MIN_VALUE} if out of bounds. 232 */ 233 protected char charAt(final String value, final int index) { 234 if (index < 0 || index >= value.length()) { 235 return Character.MIN_VALUE; 236 } 237 return value.charAt(index); 238 } 239 240 /** 241 * Cleans the input. 242 */ 243 private String cleanInput(String input) { 244 if (input == null) { 245 return null; 246 } 247 input = input.trim(); 248 if (input.isEmpty()) { 249 return null; 250 } 251 return input.toUpperCase(java.util.Locale.ENGLISH); 252 } 253 254 /** 255 * Complex condition 0 for 'C'. 256 */ 257 private boolean conditionC0(final String value, final int index) { 258 if (contains(value, index, 4, "CHIA")) { 259 return true; 260 } 261 if (index <= 1) { 262 return false; 263 } 264 if (isVowel(charAt(value, index - 2))) { 265 return false; 266 } 267 if (!contains(value, index - 1, 3, "ACH")) { 268 return false; 269 } 270 final char c = charAt(value, index + 2); 271 return c != 'I' && c != 'E' || 272 contains(value, index - 2, 6, "BACHER", "MACHER"); 273 } 274 275 /** 276 * Complex condition 0 for 'CH'. 277 */ 278 private boolean conditionCH0(final String value, final int index) { 279 if (index != 0) { 280 return false; 281 } 282 if (!contains(value, index + 1, 5, "HARAC", "HARIS") && 283 !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) { 284 return false; 285 } 286 return !contains(value, 0, 5, "CHORE"); 287 } 288 289 /** 290 * Complex condition 1 for 'CH'. 291 */ 292 private boolean conditionCH1(final String value, final int index) { 293 return contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0, 3, "SCH") || 294 contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") || 295 contains(value, index + 2, 1, "T", "S") || 296 (contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) && 297 (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1); 298 } 299 300 /** 301 * Complex condition 0 for 'L'. 302 */ 303 private boolean conditionL0(final String value, final int index) { 304 if (index == value.length() - 3 && 305 contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) { 306 return true; 307 } 308 return (contains(value, value.length() - 2, 2, "AS", "OS") || 309 contains(value, value.length() - 1, 1, "A", "O")) && 310 contains(value, index - 1, 4, "ALLE"); 311 } 312 313 //-- BEGIN HANDLERS --// 314 315 /** 316 * Complex condition 0 for 'M'. 317 */ 318 private boolean conditionM0(final String value, final int index) { 319 if (charAt(value, index + 1) == 'M') { 320 return true; 321 } 322 return contains(value, index - 1, 3, "UMB") && 323 (index + 1 == value.length() - 1 || contains(value, index + 2, 2, "ER")); 324 } 325 326 /** 327 * Encode a value with Double Metaphone. 328 * 329 * @param value String to encode 330 * @return an encoded string 331 */ 332 public String doubleMetaphone(final String value) { 333 return doubleMetaphone(value, false); 334 } 335 336 /** 337 * Encode a value with Double Metaphone, optionally using the alternate encoding. 338 * 339 * @param value String to encode 340 * @param alternate use alternate encode 341 * @return an encoded string 342 */ 343 public String doubleMetaphone(String value, final boolean alternate) { 344 value = cleanInput(value); 345 if (value == null) { 346 return null; 347 } 348 349 final boolean slavoGermanic = isSlavoGermanic(value); 350 int index = isSilentStart(value) ? 1 : 0; 351 352 final DoubleMetaphoneResult result = new DoubleMetaphoneResult(getMaxCodeLen()); 353 354 while (!result.isComplete() && index <= value.length() - 1) { 355 switch (value.charAt(index)) { 356 case 'A': 357 case 'E': 358 case 'I': 359 case 'O': 360 case 'U': 361 case 'Y': 362 index = handleAEIOUY(result, index); 363 break; 364 case 'B': 365 result.append('P'); 366 index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1; 367 break; 368 case '\u00C7': 369 // A C with a Cedilla 370 result.append('S'); 371 index++; 372 break; 373 case 'C': 374 index = handleC(value, result, index); 375 break; 376 case 'D': 377 index = handleD(value, result, index); 378 break; 379 case 'F': 380 result.append('F'); 381 index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1; 382 break; 383 case 'G': 384 index = handleG(value, result, index, slavoGermanic); 385 break; 386 case 'H': 387 index = handleH(value, result, index); 388 break; 389 case 'J': 390 index = handleJ(value, result, index, slavoGermanic); 391 break; 392 case 'K': 393 result.append('K'); 394 index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1; 395 break; 396 case 'L': 397 index = handleL(value, result, index); 398 break; 399 case 'M': 400 result.append('M'); 401 index = conditionM0(value, index) ? index + 2 : index + 1; 402 break; 403 case 'N': 404 result.append('N'); 405 index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1; 406 break; 407 case '\u00D1': 408 // N with a tilde (spanish ene) 409 result.append('N'); 410 index++; 411 break; 412 case 'P': 413 index = handleP(value, result, index); 414 break; 415 case 'Q': 416 result.append('K'); 417 index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1; 418 break; 419 case 'R': 420 index = handleR(value, result, index, slavoGermanic); 421 break; 422 case 'S': 423 index = handleS(value, result, index, slavoGermanic); 424 break; 425 case 'T': 426 index = handleT(value, result, index); 427 break; 428 case 'V': 429 result.append('F'); 430 index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1; 431 break; 432 case 'W': 433 index = handleW(value, result, index); 434 break; 435 case 'X': 436 index = handleX(value, result, index); 437 break; 438 case 'Z': 439 index = handleZ(value, result, index, slavoGermanic); 440 break; 441 default: 442 index++; 443 break; 444 } 445 } 446 447 return alternate ? result.getAlternate() : result.getPrimary(); 448 } 449 450 /** 451 * Encode the value using DoubleMetaphone. It will only work if 452 * {@code obj} is a {@code String} (like {@code Metaphone}). 453 * 454 * @param obj Object to encode (should be of type String) 455 * @return An encoded Object (will be of type String) 456 * @throws EncoderException encode parameter is not of type String 457 */ 458 @Override 459 public Object encode(final Object obj) throws EncoderException { 460 if (!(obj instanceof String)) { 461 throw new EncoderException("DoubleMetaphone encode parameter is not of type String"); 462 } 463 return doubleMetaphone((String) obj); 464 } 465 466 /** 467 * Encode the value using DoubleMetaphone. 468 * 469 * @param value String to encode 470 * @return An encoded String 471 */ 472 @Override 473 public String encode(final String value) { 474 return doubleMetaphone(value); 475 } 476 477 /** 478 * Returns the maxCodeLen. 479 * @return int 480 */ 481 public int getMaxCodeLen() { 482 return this.maxCodeLen; 483 } 484 485 /** 486 * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases. 487 */ 488 private int handleAEIOUY(final DoubleMetaphoneResult result, final int index) { 489 if (index == 0) { 490 result.append('A'); 491 } 492 return index + 1; 493 } 494 495 /** 496 * Handles 'C' cases. 497 */ 498 private int handleC(final String value, final DoubleMetaphoneResult result, int index) { 499 if (conditionC0(value, index)) { // very confusing, moved out 500 result.append('K'); 501 index += 2; 502 } else if (index == 0 && contains(value, index, 6, "CAESAR")) { 503 result.append('S'); 504 index += 2; 505 } else if (contains(value, index, 2, "CH")) { 506 index = handleCH(value, result, index); 507 } else if (contains(value, index, 2, "CZ") && 508 !contains(value, index - 2, 4, "WICZ")) { 509 //-- "Czerny" --// 510 result.append('S', 'X'); 511 index += 2; 512 } else if (contains(value, index + 1, 3, "CIA")) { 513 //-- "focaccia" --// 514 result.append('X'); 515 index += 3; 516 } else if (contains(value, index, 2, "CC") && 517 !(index == 1 && charAt(value, 0) == 'M')) { 518 //-- double "cc" but not "McClelland" --// 519 return handleCC(value, result, index); 520 } else if (contains(value, index, 2, "CK", "CG", "CQ")) { 521 result.append('K'); 522 index += 2; 523 } else if (contains(value, index, 2, "CI", "CE", "CY")) { 524 //-- Italian vs. English --// 525 if (contains(value, index, 3, "CIO", "CIE", "CIA")) { 526 result.append('S', 'X'); 527 } else { 528 result.append('S'); 529 } 530 index += 2; 531 } else { 532 result.append('K'); 533 if (contains(value, index + 1, 2, " C", " Q", " G")) { 534 //-- Mac Caffrey, Mac Gregor --// 535 index += 3; 536 } else if (contains(value, index + 1, 1, "C", "K", "Q") && 537 !contains(value, index + 1, 2, "CE", "CI")) { 538 index += 2; 539 } else { 540 index++; 541 } 542 } 543 544 return index; 545 } 546 547 /** 548 * Handles 'CC' cases. 549 */ 550 private int handleCC(final String value, final DoubleMetaphoneResult result, int index) { 551 if (contains(value, index + 2, 1, "I", "E", "H") && 552 !contains(value, index + 2, 2, "HU")) { 553 //-- "bellocchio" but not "bacchus" --// 554 if (index == 1 && charAt(value, index - 1) == 'A' || 555 contains(value, index - 1, 5, "UCCEE", "UCCES")) { 556 //-- "accident", "accede", "succeed" --// 557 result.append("KS"); 558 } else { 559 //-- "bacci", "bertucci", other Italian --// 560 result.append('X'); 561 } 562 index += 3; 563 } else { // Pierce's rule 564 result.append('K'); 565 index += 2; 566 } 567 568 return index; 569 } 570 571 /** 572 * Handles 'CH' cases. 573 */ 574 private int handleCH(final String value, final DoubleMetaphoneResult result, final int index) { 575 if (index > 0 && contains(value, index, 4, "CHAE")) { // Michael 576 result.append('K', 'X'); 577 return index + 2; 578 } 579 if (conditionCH0(value, index)) { 580 //-- Greek roots ("chemistry", "chorus", etc.) --// 581 result.append('K'); 582 return index + 2; 583 } 584 if (conditionCH1(value, index)) { 585 //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --// 586 result.append('K'); 587 return index + 2; 588 } 589 if (index > 0) { 590 if (contains(value, 0, 2, "MC")) { 591 result.append('K'); 592 } else { 593 result.append('X', 'K'); 594 } 595 } else { 596 result.append('X'); 597 } 598 return index + 2; 599 } 600 601 /** 602 * Handles 'D' cases. 603 */ 604 private int handleD(final String value, final DoubleMetaphoneResult result, int index) { 605 if (contains(value, index, 2, "DG")) { 606 //-- "Edge" --// 607 if (contains(value, index + 2, 1, "I", "E", "Y")) { 608 result.append('J'); 609 index += 3; 610 //-- "Edgar" --// 611 } else { 612 result.append("TK"); 613 index += 2; 614 } 615 } else if (contains(value, index, 2, "DT", "DD")) { 616 result.append('T'); 617 index += 2; 618 } else { 619 result.append('T'); 620 index++; 621 } 622 return index; 623 } 624 625 /** 626 * Handles 'G' cases. 627 */ 628 private int handleG(final String value, final DoubleMetaphoneResult result, int index, 629 final boolean slavoGermanic) { 630 if (charAt(value, index + 1) == 'H') { 631 index = handleGH(value, result, index); 632 } else if (charAt(value, index + 1) == 'N') { 633 if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) { 634 result.append("KN", "N"); 635 } else if (!contains(value, index + 2, 2, "EY") && 636 charAt(value, index + 1) != 'Y' && !slavoGermanic) { 637 result.append("N", "KN"); 638 } else { 639 result.append("KN"); 640 } 641 index += 2; 642 } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) { 643 result.append("KL", "L"); 644 index += 2; 645 } else if (index == 0 && 646 (charAt(value, index + 1) == 'Y' || 647 contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) { 648 //-- -ges-, -gep-, -gel-, -gie- at beginning --// 649 result.append('K', 'J'); 650 index += 2; 651 } else if ((contains(value, index + 1, 2, "ER") || 652 charAt(value, index + 1) == 'Y') && 653 !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") && 654 !contains(value, index - 1, 1, "E", "I") && 655 !contains(value, index - 1, 3, "RGY", "OGY")) { 656 //-- -ger-, -gy- --// 657 result.append('K', 'J'); 658 index += 2; 659 } else if (contains(value, index + 1, 1, "E", "I", "Y") || 660 contains(value, index - 1, 4, "AGGI", "OGGI")) { 661 //-- Italian "biaggi" --// 662 if (contains(value, 0, 4, "VAN ", "VON ") || 663 contains(value, 0, 3, "SCH") || 664 contains(value, index + 1, 2, "ET")) { 665 //-- obvious germanic --// 666 result.append('K'); 667 } else if (contains(value, index + 1, 3, "IER")) { 668 result.append('J'); 669 } else { 670 result.append('J', 'K'); 671 } 672 index += 2; 673 } else { 674 if (charAt(value, index + 1) == 'G') { 675 index += 2; 676 } else { 677 index++; 678 } 679 result.append('K'); 680 } 681 return index; 682 } 683 684 /** 685 * Handles 'GH' cases. 686 */ 687 private int handleGH(final String value, final DoubleMetaphoneResult result, int index) { 688 if (index > 0 && !isVowel(charAt(value, index - 1))) { 689 result.append('K'); 690 index += 2; 691 } else if (index == 0) { 692 if (charAt(value, index + 2) == 'I') { 693 result.append('J'); 694 } else { 695 result.append('K'); 696 } 697 index += 2; 698 } else if (index > 1 && contains(value, index - 2, 1, "B", "H", "D") || 699 index > 2 && contains(value, index - 3, 1, "B", "H", "D") || 700 index > 3 && contains(value, index - 4, 1, "B", "H")) { 701 //-- Parker's rule (with some further refinements) - "hugh" 702 index += 2; 703 } else { 704 if (index > 2 && charAt(value, index - 1) == 'U' && 705 contains(value, index - 3, 1, "C", "G", "L", "R", "T")) { 706 //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough" 707 result.append('F'); 708 } else if (index > 0 && charAt(value, index - 1) != 'I') { 709 result.append('K'); 710 } 711 index += 2; 712 } 713 return index; 714 } 715 716 /** 717 * Handles 'H' cases. 718 */ 719 private int handleH(final String value, final DoubleMetaphoneResult result, int index) { 720 //-- only keep if first & before vowel or between 2 vowels --// 721 if ((index == 0 || isVowel(charAt(value, index - 1))) && 722 isVowel(charAt(value, index + 1))) { 723 result.append('H'); 724 index += 2; 725 //-- also takes car of "HH" --// 726 } else { 727 index++; 728 } 729 return index; 730 } 731 732 /** 733 * Handles 'J' cases. 734 */ 735 private int handleJ(final String value, final DoubleMetaphoneResult result, int index, 736 final boolean slavoGermanic) { 737 if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) { 738 //-- obvious Spanish, "Jose", "San Jacinto" --// 739 if (index == 0 && charAt(value, index + 4) == ' ' || 740 value.length() == 4 || contains(value, 0, 4, "SAN ")) { 741 result.append('H'); 742 } else { 743 result.append('J', 'H'); 744 } 745 index++; 746 } else { 747 if (index == 0 && !contains(value, index, 4, "JOSE")) { 748 result.append('J', 'A'); 749 } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic && 750 (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) { 751 result.append('J', 'H'); 752 } else if (index == value.length() - 1) { 753 result.append('J', ' '); 754 } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) && 755 !contains(value, index - 1, 1, "S", "K", "L")) { 756 result.append('J'); 757 } 758 759 if (charAt(value, index + 1) == 'J') { 760 index += 2; 761 } else { 762 index++; 763 } 764 } 765 return index; 766 } 767 768 /** 769 * Handles 'L' cases. 770 */ 771 private int handleL(final String value, final DoubleMetaphoneResult result, int index) { 772 if (charAt(value, index + 1) == 'L') { 773 if (conditionL0(value, index)) { 774 result.appendPrimary('L'); 775 } else { 776 result.append('L'); 777 } 778 index += 2; 779 } else { 780 index++; 781 result.append('L'); 782 } 783 return index; 784 } 785 786 /** 787 * Handles 'P' cases. 788 */ 789 private int handleP(final String value, final DoubleMetaphoneResult result, int index) { 790 if (charAt(value, index + 1) == 'H') { 791 result.append('F'); 792 index += 2; 793 } else { 794 result.append('P'); 795 index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1; 796 } 797 return index; 798 } 799 800 /** 801 * Handles 'R' cases. 802 */ 803 private int handleR(final String value, final DoubleMetaphoneResult result, final int index, 804 final boolean slavoGermanic) { 805 if (index == value.length() - 1 && !slavoGermanic && 806 contains(value, index - 2, 2, "IE") && 807 !contains(value, index - 4, 2, "ME", "MA")) { 808 result.appendAlternate('R'); 809 } else { 810 result.append('R'); 811 } 812 return charAt(value, index + 1) == 'R' ? index + 2 : index + 1; 813 } 814 815 //-- BEGIN CONDITIONS --// 816 817 /** 818 * Handles 'S' cases. 819 */ 820 private int handleS(final String value, final DoubleMetaphoneResult result, int index, 821 final boolean slavoGermanic) { 822 if (contains(value, index - 1, 3, "ISL", "YSL")) { 823 //-- special cases "island", "isle", "carlisle", "carlysle" --// 824 index++; 825 } else if (index == 0 && contains(value, index, 5, "SUGAR")) { 826 //-- special case "sugar-" --// 827 result.append('X', 'S'); 828 index++; 829 } else if (contains(value, index, 2, "SH")) { 830 if (contains(value, index + 1, 4, "HEIM", "HOEK", "HOLM", "HOLZ")) { 831 //-- germanic --// 832 result.append('S'); 833 } else { 834 result.append('X'); 835 } 836 index += 2; 837 } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) { 838 //-- Italian and Armenian --// 839 if (slavoGermanic) { 840 result.append('S'); 841 } else { 842 result.append('S', 'X'); 843 } 844 index += 3; 845 } else if (index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W") || 846 contains(value, index + 1, 1, "Z")) { 847 //-- german & anglicisations, e.g. "smith" match "schmidt" // 848 // "snider" match "schneider" --// 849 //-- also, -sz- in slavic language although in hungarian it // 850 // is pronounced "s" --// 851 result.append('S', 'X'); 852 index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1; 853 } else if (contains(value, index, 2, "SC")) { 854 index = handleSC(value, result, index); 855 } else { 856 if (index == value.length() - 1 && contains(value, index - 2, 2, "AI", "OI")) { 857 //-- french e.g. "resnais", "artois" --// 858 result.appendAlternate('S'); 859 } else { 860 result.append('S'); 861 } 862 index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1; 863 } 864 return index; 865 } 866 867 /** 868 * Handles 'SC' cases. 869 */ 870 private int handleSC(final String value, final DoubleMetaphoneResult result, final int index) { 871 if (charAt(value, index + 2) == 'H') { 872 //-- Schlesinger's rule --// 873 if (contains(value, index + 3, 2, "OO", "ER", "EN", "UY", "ED", "EM")) { 874 //-- Dutch origin, e.g. "school", "schooner" --// 875 if (contains(value, index + 3, 2, "ER", "EN")) { 876 //-- "schermerhorn", "schenker" --// 877 result.append("X", "SK"); 878 } else { 879 result.append("SK"); 880 } 881 } else if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') { 882 result.append('X', 'S'); 883 } else { 884 result.append('X'); 885 } 886 } else if (contains(value, index + 2, 1, "I", "E", "Y")) { 887 result.append('S'); 888 } else { 889 result.append("SK"); 890 } 891 return index + 3; 892 } 893 894 /** 895 * Handles 'T' cases. 896 */ 897 private int handleT(final String value, final DoubleMetaphoneResult result, int index) { 898 if (contains(value, index, 4, "TION") || contains(value, index, 3, "TIA", "TCH")) { 899 result.append('X'); 900 index += 3; 901 } else if (contains(value, index, 2, "TH") || contains(value, index, 3, "TTH")) { 902 if (contains(value, index + 2, 2, "OM", "AM") || 903 //-- special case "thomas", "thames" or germanic --// 904 contains(value, 0, 4, "VAN ", "VON ") || 905 contains(value, 0, 3, "SCH")) { 906 result.append('T'); 907 } else { 908 result.append('0', 'T'); 909 } 910 index += 2; 911 } else { 912 result.append('T'); 913 index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1; 914 } 915 return index; 916 } 917 918 /** 919 * Handles 'W' cases. 920 */ 921 private int handleW(final String value, final DoubleMetaphoneResult result, int index) { 922 if (contains(value, index, 2, "WR")) { 923 //-- can also be in middle of word --// 924 result.append('R'); 925 index += 2; 926 } else if (index == 0 && (isVowel(charAt(value, index + 1)) || 927 contains(value, index, 2, "WH"))) { 928 if (isVowel(charAt(value, index + 1))) { 929 //-- Wasserman should match Vasserman --// 930 result.append('A', 'F'); 931 } else { 932 //-- need Uomo to match Womo --// 933 result.append('A'); 934 } 935 index++; 936 } else if (index == value.length() - 1 && isVowel(charAt(value, index - 1)) || 937 contains(value, index - 1, 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") || 938 contains(value, 0, 3, "SCH")) { 939 //-- Arnow should match Arnoff --// 940 result.appendAlternate('F'); 941 index++; 942 } else if (contains(value, index, 4, "WICZ", "WITZ")) { 943 //-- Polish e.g. "filipowicz" --// 944 result.append("TS", "FX"); 945 index += 4; 946 } else { 947 index++; 948 } 949 return index; 950 } 951 952 /** 953 * Handles 'X' cases. 954 */ 955 private int handleX(final String value, final DoubleMetaphoneResult result, int index) { 956 if (index == 0) { 957 result.append('S'); 958 index++; 959 } else { 960 if (!(index == value.length() - 1 && 961 (contains(value, index - 3, 3, "IAU", "EAU") || 962 contains(value, index - 2, 2, "AU", "OU")))) { 963 //-- French e.g. breaux --// 964 result.append("KS"); 965 } 966 index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1; 967 } 968 return index; 969 } 970 971 //-- BEGIN HELPER FUNCTIONS --// 972 973 /** 974 * Handles 'Z' cases. 975 */ 976 private int handleZ(final String value, final DoubleMetaphoneResult result, int index, 977 final boolean slavoGermanic) { 978 if (charAt(value, index + 1) == 'H') { 979 //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --// 980 result.append('J'); 981 index += 2; 982 } else { 983 if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") || 984 slavoGermanic && index > 0 && charAt(value, index - 1) != 'T') { 985 result.append("S", "TS"); 986 } else { 987 result.append('S'); 988 } 989 index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1; 990 } 991 return index; 992 } 993 994 /** 995 * Check if the Double Metaphone values of two {@code String} values 996 * are equal. 997 * 998 * @param value1 The left-hand side of the encoded {@link String#equals(Object)}. 999 * @param value2 The right-hand side of the encoded {@link String#equals(Object)}. 1000 * @return {@code true} if the encoded {@code String}s are equal; 1001 * {@code false} otherwise. 1002 * @see #isDoubleMetaphoneEqual(String,String,boolean) 1003 */ 1004 public boolean isDoubleMetaphoneEqual(final String value1, final String value2) { 1005 return isDoubleMetaphoneEqual(value1, value2, false); 1006 } 1007 1008 /** 1009 * Check if the Double Metaphone values of two {@code String} values 1010 * are equal, optionally using the alternate value. 1011 * 1012 * @param value1 The left-hand side of the encoded {@link String#equals(Object)}. 1013 * @param value2 The right-hand side of the encoded {@link String#equals(Object)}. 1014 * @param alternate use the alternate value if {@code true}. 1015 * @return {@code true} if the encoded {@code String}s are equal; 1016 * {@code false} otherwise. 1017 */ 1018 public boolean isDoubleMetaphoneEqual(final String value1, final String value2, final boolean alternate) { 1019 return StringUtils.equals(doubleMetaphone(value1, alternate), doubleMetaphone(value2, alternate)); 1020 } 1021 1022 /** 1023 * Determines whether or not the value starts with a silent letter. It will 1024 * return {@code true} if the value starts with any of 'GN', 'KN', 1025 * 'PN', 'WR' or 'PS'. 1026 */ 1027 private boolean isSilentStart(final String value) { 1028 boolean result = false; 1029 for (final String element : SILENT_START) { 1030 if (value.startsWith(element)) { 1031 result = true; 1032 break; 1033 } 1034 } 1035 return result; 1036 } 1037 1038 /** 1039 * Determines whether or not a value is of slavo-germanic origin. A value is 1040 * of slavo-germanic origin if it contains any of 'W', 'K', 'CZ', or 'WITZ'. 1041 */ 1042 private boolean isSlavoGermanic(final String value) { 1043 return value.indexOf('W') > -1 || value.indexOf('K') > -1 || 1044 value.contains("CZ") || value.contains("WITZ"); 1045 } 1046 1047 /** 1048 * Determines whether or not a character is a vowel or not 1049 */ 1050 private boolean isVowel(final char ch) { 1051 return VOWELS.indexOf(ch) != -1; 1052 } 1053 1054 //-- BEGIN INNER CLASSES --// 1055 1056 /** 1057 * Sets the maxCodeLen. 1058 * @param maxCodeLen The maxCodeLen to set 1059 */ 1060 public void setMaxCodeLen(final int maxCodeLen) { 1061 this.maxCodeLen = maxCodeLen; 1062 } 1063}