001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.codec.language; 018 019import java.util.ArrayList; 020import java.util.Arrays; 021import java.util.Collections; 022import java.util.HashMap; 023import java.util.LinkedHashSet; 024import java.util.List; 025import java.util.Map; 026import java.util.Scanner; 027import java.util.Set; 028 029import org.apache.commons.codec.CharEncoding; 030import org.apache.commons.codec.EncoderException; 031import org.apache.commons.codec.Resources; 032import org.apache.commons.codec.StringEncoder; 033 034/** 035 * Encodes a string into a Daitch-Mokotoff Soundex value. 036 * <p> 037 * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater 038 * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling. 039 * </p> 040 * <p> 041 * The main differences compared to the other soundex variants are: 042 * </p> 043 * <ul> 044 * <li>coded names are 6 digits long 045 * <li>the initial character of the name is coded 046 * <li>rules to encoded multi-character n-grams 047 * <li>multiple possible encodings for the same name (branching) 048 * </ul> 049 * <p> 050 * This implementation supports branching, depending on the used method: 051 * <ul> 052 * <li>{@link #encode(String)} - branching disabled, only the first code will be returned 053 * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|' 054 * </ul> 055 * <p> 056 * Note: this implementation has additional branching rules compared to the original description of the algorithm. The 057 * rules can be customized by overriding the default rules contained in the resource file 058 * {@code org/apache/commons/codec/language/dmrules.txt}. 059 * </p> 060 * <p> 061 * This class is thread-safe. 062 * </p> 063 * 064 * @see Soundex 065 * @see <a href="https://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a> 066 * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a> 067 * 068 * @since 1.10 069 */ 070public class DaitchMokotoffSoundex implements StringEncoder { 071 072 /** 073 * Inner class representing a branch during DM soundex encoding. 074 */ 075 private static final class Branch { 076 private final StringBuilder builder; 077 private String cachedString; 078 private String lastReplacement; 079 080 private Branch() { 081 builder = new StringBuilder(); 082 lastReplacement = null; 083 cachedString = null; 084 } 085 086 /** 087 * Creates a new branch, identical to this branch. 088 * 089 * @return a new, identical branch 090 */ 091 public Branch createBranch() { 092 final Branch branch = new Branch(); 093 branch.builder.append(toString()); 094 branch.lastReplacement = this.lastReplacement; 095 return branch; 096 } 097 098 @Override 099 public boolean equals(final Object other) { 100 if (this == other) { 101 return true; 102 } 103 if (!(other instanceof Branch)) { 104 return false; 105 } 106 107 return toString().equals(((Branch) other).toString()); 108 } 109 110 /** 111 * Finish this branch by appending '0's until the maximum code length has been reached. 112 */ 113 public void finish() { 114 while (builder.length() < MAX_LENGTH) { 115 builder.append('0'); 116 cachedString = null; 117 } 118 } 119 120 @Override 121 public int hashCode() { 122 return toString().hashCode(); 123 } 124 125 /** 126 * Process the next replacement to be added to this branch. 127 * 128 * @param replacement 129 * the next replacement to append 130 * @param forceAppend 131 * indicates if the default processing shall be overridden 132 */ 133 public void processNextReplacement(final String replacement, final boolean forceAppend) { 134 final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend; 135 136 if (append && builder.length() < MAX_LENGTH) { 137 builder.append(replacement); 138 // remove all characters after the maximum length 139 if (builder.length() > MAX_LENGTH) { 140 builder.delete(MAX_LENGTH, builder.length()); 141 } 142 cachedString = null; 143 } 144 145 lastReplacement = replacement; 146 } 147 148 @Override 149 public String toString() { 150 if (cachedString == null) { 151 cachedString = builder.toString(); 152 } 153 return cachedString; 154 } 155 } 156 157 /** 158 * Inner class for storing rules. 159 */ 160 private static final class Rule { 161 private final String pattern; 162 private final String[] replacementAtStart; 163 private final String[] replacementBeforeVowel; 164 private final String[] replacementDefault; 165 166 protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel, 167 final String replacementDefault) { 168 this.pattern = pattern; 169 this.replacementAtStart = replacementAtStart.split("\\|"); 170 this.replacementBeforeVowel = replacementBeforeVowel.split("\\|"); 171 this.replacementDefault = replacementDefault.split("\\|"); 172 } 173 174 public int getPatternLength() { 175 return pattern.length(); 176 } 177 178 public String[] getReplacements(final String context, final boolean atStart) { 179 if (atStart) { 180 return replacementAtStart; 181 } 182 183 final int nextIndex = getPatternLength(); 184 final boolean nextCharIsVowel = nextIndex < context.length() && isVowel(context.charAt(nextIndex)); 185 if (nextCharIsVowel) { 186 return replacementBeforeVowel; 187 } 188 189 return replacementDefault; 190 } 191 192 private boolean isVowel(final char ch) { 193 return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u'; 194 } 195 196 public boolean matches(final String context) { 197 return context.startsWith(pattern); 198 } 199 200 @Override 201 public String toString() { 202 return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart), 203 Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault)); 204 } 205 } 206 207 private static final String COMMENT = "//"; 208 209 private static final String DOUBLE_QUOTE = "\""; 210 211 private static final String MULTILINE_COMMENT_END = "*/"; 212 213 private static final String MULTILINE_COMMENT_START = "/*"; 214 215 /** The resource file containing the replacement and folding rules */ 216 private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt"; 217 218 /** The code length of a DM soundex value. */ 219 private static final int MAX_LENGTH = 6; 220 221 /** Transformation rules indexed by the first character of their pattern. */ 222 private static final Map<Character, List<Rule>> RULES = new HashMap<>(); 223 224 /** Folding rules. */ 225 private static final Map<Character, Character> FOLDINGS = new HashMap<>(); 226 227 static { 228 try (final Scanner scanner = new Scanner(Resources.getInputStream(RESOURCE_FILE), CharEncoding.UTF_8)) { 229 parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS); 230 } 231 232 // sort RULES by pattern length in descending order 233 RULES.forEach((k, v) -> v.sort((rule1, rule2) -> rule2.getPatternLength() - rule1.getPatternLength())); 234 } 235 236 private static void parseRules(final Scanner scanner, final String location, 237 final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) { 238 int currentLine = 0; 239 boolean inMultilineComment = false; 240 241 while (scanner.hasNextLine()) { 242 currentLine++; 243 final String rawLine = scanner.nextLine(); 244 String line = rawLine; 245 246 if (inMultilineComment) { 247 if (line.endsWith(MULTILINE_COMMENT_END)) { 248 inMultilineComment = false; 249 } 250 continue; 251 } 252 253 if (line.startsWith(MULTILINE_COMMENT_START)) { 254 inMultilineComment = true; 255 } else { 256 // discard comments 257 final int cmtI = line.indexOf(COMMENT); 258 if (cmtI >= 0) { 259 line = line.substring(0, cmtI); 260 } 261 262 // trim leading-trailing whitespace 263 line = line.trim(); 264 265 if (line.isEmpty()) { 266 continue; // empty lines can be safely skipped 267 } 268 269 if (line.contains("=")) { 270 // folding 271 final String[] parts = line.split("="); 272 if (parts.length != 2) { 273 throw new IllegalArgumentException("Malformed folding statement split into " + parts.length + 274 " parts: " + rawLine + " in " + location); 275 } 276 final String leftCharacter = parts[0]; 277 final String rightCharacter = parts[1]; 278 279 if (leftCharacter.length() != 1 || rightCharacter.length() != 1) { 280 throw new IllegalArgumentException("Malformed folding statement - " + 281 "patterns are not single characters: " + rawLine + " in " + location); 282 } 283 284 asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0)); 285 } else { 286 // rule 287 final String[] parts = line.split("\\s+"); 288 if (parts.length != 4) { 289 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + 290 " parts: " + rawLine + " in " + location); 291 } 292 try { 293 final String pattern = stripQuotes(parts[0]); 294 final String replacement1 = stripQuotes(parts[1]); 295 final String replacement2 = stripQuotes(parts[2]); 296 final String replacement3 = stripQuotes(parts[3]); 297 298 final Rule r = new Rule(pattern, replacement1, replacement2, replacement3); 299 final char patternKey = r.pattern.charAt(0); 300 final List<Rule> rules = ruleMapping.computeIfAbsent(patternKey, k -> new ArrayList<>()); 301 rules.add(r); 302 } catch (final IllegalArgumentException e) { 303 throw new IllegalStateException( 304 "Problem parsing line '" + currentLine + "' in " + location, e); 305 } 306 } 307 } 308 } 309 } 310 311 private static String stripQuotes(String str) { 312 if (str.startsWith(DOUBLE_QUOTE)) { 313 str = str.substring(1); 314 } 315 316 if (str.endsWith(DOUBLE_QUOTE)) { 317 str = str.substring(0, str.length() - 1); 318 } 319 320 return str; 321 } 322 323 /** Whether to use ASCII folding prior to encoding. */ 324 private final boolean folding; 325 326 /** 327 * Creates a new instance with ASCII-folding enabled. 328 */ 329 public DaitchMokotoffSoundex() { 330 this(true); 331 } 332 333 /** 334 * Creates a new instance. 335 * <p> 336 * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g. 337 * รจ -> e. 338 * </p> 339 * 340 * @param folding 341 * if ASCII-folding shall be performed before encoding 342 */ 343 public DaitchMokotoffSoundex(final boolean folding) { 344 this.folding = folding; 345 } 346 347 /** 348 * Performs a cleanup of the input string before the actual soundex transformation. 349 * <p> 350 * Removes all whitespace characters and performs ASCII folding if enabled. 351 * </p> 352 * 353 * @param input 354 * the input string to clean up 355 * @return a cleaned up string 356 */ 357 private String cleanup(final String input) { 358 final StringBuilder sb = new StringBuilder(); 359 for (char ch : input.toCharArray()) { 360 if (Character.isWhitespace(ch)) { 361 continue; 362 } 363 364 ch = Character.toLowerCase(ch); 365 final Character character = FOLDINGS.get(ch); 366 if (folding && character != null) { 367 ch = character; 368 } 369 sb.append(ch); 370 } 371 return sb.toString(); 372 } 373 374 /** 375 * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching. 376 * <p> 377 * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an 378 * EncoderException if the supplied object is not of type {@link String}. 379 * </p> 380 * 381 * @see #soundex(String) 382 * 383 * @param obj 384 * Object to encode 385 * @return An object (of type {@link String}) containing the DM soundex code, which corresponds to the String 386 * supplied. 387 * @throws EncoderException 388 * if the parameter supplied is not of type {@link String} 389 * @throws IllegalArgumentException 390 * if a character is not mapped 391 */ 392 @Override 393 public Object encode(final Object obj) throws EncoderException { 394 if (!(obj instanceof String)) { 395 throw new EncoderException( 396 "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String"); 397 } 398 return encode((String) obj); 399 } 400 401 /** 402 * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching. 403 * 404 * @see #soundex(String) 405 * 406 * @param source 407 * A String object to encode 408 * @return A DM Soundex code corresponding to the String supplied 409 * @throws IllegalArgumentException 410 * if a character is not mapped 411 */ 412 @Override 413 public String encode(final String source) { 414 if (source == null) { 415 return null; 416 } 417 return soundex(source, false)[0]; 418 } 419 420 /** 421 * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching. 422 * <p> 423 * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes, 424 * separated by '|'. 425 * </p> 426 * <p> 427 * Example: the name "AUERBACH" is encoded as both 428 * </p> 429 * <ul> 430 * <li>097400</li> 431 * <li>097500</li> 432 * </ul> 433 * <p> 434 * Thus the result will be "097400|097500". 435 * </p> 436 * 437 * @param source 438 * A String object to encode 439 * @return A string containing a set of DM Soundex codes corresponding to the String supplied 440 * @throws IllegalArgumentException 441 * if a character is not mapped 442 */ 443 public String soundex(final String source) { 444 final String[] branches = soundex(source, true); 445 final StringBuilder sb = new StringBuilder(); 446 int index = 0; 447 for (final String branch : branches) { 448 sb.append(branch); 449 if (++index < branches.length) { 450 sb.append('|'); 451 } 452 } 453 return sb.toString(); 454 } 455 456 /** 457 * Perform the actual DM Soundex algorithm on the input string. 458 * 459 * @param source 460 * A String object to encode 461 * @param branching 462 * If branching shall be performed 463 * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the 464 * selected branching mode 465 */ 466 private String[] soundex(final String source, final boolean branching) { 467 if (source == null) { 468 return null; 469 } 470 471 final String input = cleanup(source); 472 473 final Set<Branch> currentBranches = new LinkedHashSet<>(); 474 currentBranches.add(new Branch()); 475 476 char lastChar = '\0'; 477 for (int index = 0; index < input.length(); index++) { 478 final char ch = input.charAt(index); 479 480 // ignore whitespace inside a name 481 if (Character.isWhitespace(ch)) { 482 continue; 483 } 484 485 final String inputContext = input.substring(index); 486 final List<Rule> rules = RULES.get(ch); 487 if (rules == null) { 488 continue; 489 } 490 491 // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access 492 final List<Branch> nextBranches = branching ? new ArrayList<>() : Collections.emptyList(); 493 494 for (final Rule rule : rules) { 495 if (rule.matches(inputContext)) { 496 if (branching) { 497 nextBranches.clear(); 498 } 499 final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0'); 500 final boolean branchingRequired = replacements.length > 1 && branching; 501 502 for (final Branch branch : currentBranches) { 503 for (final String nextReplacement : replacements) { 504 // if we have multiple replacements, always create a new branch 505 final Branch nextBranch = branchingRequired ? branch.createBranch() : branch; 506 507 // special rule: occurrences of mn or nm are treated differently 508 final boolean force = lastChar == 'm' && ch == 'n' || lastChar == 'n' && ch == 'm'; 509 510 nextBranch.processNextReplacement(nextReplacement, force); 511 512 if (!branching) { 513 break; 514 } 515 nextBranches.add(nextBranch); 516 } 517 } 518 519 if (branching) { 520 currentBranches.clear(); 521 currentBranches.addAll(nextBranches); 522 } 523 index += rule.getPatternLength() - 1; 524 break; 525 } 526 } 527 528 lastChar = ch; 529 } 530 531 final String[] result = new String[currentBranches.size()]; 532 int index = 0; 533 for (final Branch branch : currentBranches) { 534 branch.finish(); 535 result[index++] = branch.toString(); 536 } 537 538 return result; 539 } 540}