001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.codec.language; 018 019import java.util.ArrayList; 020import java.util.Arrays; 021import java.util.Collections; 022import java.util.HashMap; 023import java.util.LinkedHashSet; 024import java.util.List; 025import java.util.Map; 026import java.util.Scanner; 027import java.util.Set; 028 029import org.apache.commons.codec.CharEncoding; 030import org.apache.commons.codec.EncoderException; 031import org.apache.commons.codec.Resources; 032import org.apache.commons.codec.StringEncoder; 033 034/** 035 * Encodes a string into a Daitch-Mokotoff Soundex value. 036 * <p> 037 * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater 038 * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling. 039 * </p> 040 * <p> 041 * The main differences compared to the other soundex variants are: 042 * </p> 043 * <ul> 044 * <li>coded names are 6 digits long 045 * <li>the initial character of the name is coded 046 * <li>rules to encoded multi-character n-grams 047 * <li>multiple possible encodings for the same name (branching) 048 * </ul> 049 * <p> 050 * This implementation supports branching, depending on the used method: 051 * <ul> 052 * <li>{@link #encode(String)} - branching disabled, only the first code will be returned 053 * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|' 054 * </ul> 055 * <p> 056 * Note: This implementation has additional branching rules compared to the original description of the algorithm. The 057 * rules can be customized by overriding the default rules contained in the resource file 058 * {@code org/apache/commons/codec/language/dmrules.txt}. 059 * </p> 060 * <p> 061 * This class is thread-safe. 062 * </p> 063 * 064 * @see Soundex 065 * @see <a href="https://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a> 066 * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a> 067 * @since 1.10 068 */ 069public class DaitchMokotoffSoundex implements StringEncoder { 070 071 /** 072 * Inner class representing a branch during DM soundex encoding. 073 */ 074 private static final class Branch { 075 private final StringBuilder builder; 076 private String cachedString; 077 private String lastReplacement; 078 079 private Branch() { 080 builder = new StringBuilder(); 081 lastReplacement = null; 082 cachedString = null; 083 } 084 085 /** 086 * Creates a new branch, identical to this branch. 087 * 088 * @return a new, identical branch 089 */ 090 public Branch createBranch() { 091 final Branch branch = new Branch(); 092 branch.builder.append(toString()); 093 branch.lastReplacement = this.lastReplacement; 094 return branch; 095 } 096 097 @Override 098 public boolean equals(final Object other) { 099 if (this == other) { 100 return true; 101 } 102 if (!(other instanceof Branch)) { 103 return false; 104 } 105 106 return toString().equals(((Branch) other).toString()); 107 } 108 109 /** 110 * Finish this branch by appending '0's until the maximum code length has been reached. 111 */ 112 public void finish() { 113 while (builder.length() < MAX_LENGTH) { 114 builder.append('0'); 115 cachedString = null; 116 } 117 } 118 119 @Override 120 public int hashCode() { 121 return toString().hashCode(); 122 } 123 124 /** 125 * Process the next replacement to be added to this branch. 126 * 127 * @param replacement 128 * the next replacement to append 129 * @param forceAppend 130 * indicates if the default processing shall be overridden 131 */ 132 public void processNextReplacement(final String replacement, final boolean forceAppend) { 133 final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend; 134 135 if (append && builder.length() < MAX_LENGTH) { 136 builder.append(replacement); 137 // remove all characters after the maximum length 138 if (builder.length() > MAX_LENGTH) { 139 builder.delete(MAX_LENGTH, builder.length()); 140 } 141 cachedString = null; 142 } 143 144 lastReplacement = replacement; 145 } 146 147 @Override 148 public String toString() { 149 if (cachedString == null) { 150 cachedString = builder.toString(); 151 } 152 return cachedString; 153 } 154 } 155 156 /** 157 * Inner class for storing rules. 158 */ 159 private static final class Rule { 160 private final String pattern; 161 private final String[] replacementAtStart; 162 private final String[] replacementBeforeVowel; 163 private final String[] replacementDefault; 164 165 protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel, 166 final String replacementDefault) { 167 this.pattern = pattern; 168 this.replacementAtStart = replacementAtStart.split("\\|"); 169 this.replacementBeforeVowel = replacementBeforeVowel.split("\\|"); 170 this.replacementDefault = replacementDefault.split("\\|"); 171 } 172 173 public int getPatternLength() { 174 return pattern.length(); 175 } 176 177 public String[] getReplacements(final String context, final boolean atStart) { 178 if (atStart) { 179 return replacementAtStart; 180 } 181 182 final int nextIndex = getPatternLength(); 183 final boolean nextCharIsVowel = nextIndex < context.length() && isVowel(context.charAt(nextIndex)); 184 if (nextCharIsVowel) { 185 return replacementBeforeVowel; 186 } 187 188 return replacementDefault; 189 } 190 191 private boolean isVowel(final char ch) { 192 return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u'; 193 } 194 195 public boolean matches(final String context) { 196 return context.startsWith(pattern); 197 } 198 199 @Override 200 public String toString() { 201 return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart), 202 Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault)); 203 } 204 } 205 206 private static final String COMMENT = "//"; 207 208 private static final String DOUBLE_QUOTE = "\""; 209 210 private static final String MULTILINE_COMMENT_END = "*/"; 211 212 private static final String MULTILINE_COMMENT_START = "/*"; 213 214 /** The resource file containing the replacement and folding rules */ 215 private static final String RESOURCE_FILE = "/org/apache/commons/codec/language/dmrules.txt"; 216 217 /** The code length of a DM soundex value. */ 218 private static final int MAX_LENGTH = 6; 219 220 /** Transformation rules indexed by the first character of their pattern. */ 221 private static final Map<Character, List<Rule>> RULES = new HashMap<>(); 222 223 /** Folding rules. */ 224 private static final Map<Character, Character> FOLDINGS = new HashMap<>(); 225 226 static { 227 try (Scanner scanner = new Scanner(Resources.getInputStream(RESOURCE_FILE), CharEncoding.UTF_8)) { 228 parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS); 229 } 230 231 // sort RULES by pattern length in descending order 232 RULES.forEach((k, v) -> v.sort((rule1, rule2) -> rule2.getPatternLength() - rule1.getPatternLength())); 233 } 234 235 private static void parseRules(final Scanner scanner, final String location, 236 final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) { 237 int currentLine = 0; 238 boolean inMultilineComment = false; 239 240 while (scanner.hasNextLine()) { 241 currentLine++; 242 final String rawLine = scanner.nextLine(); 243 String line = rawLine; 244 245 if (inMultilineComment) { 246 if (line.endsWith(MULTILINE_COMMENT_END)) { 247 inMultilineComment = false; 248 } 249 continue; 250 } 251 252 if (line.startsWith(MULTILINE_COMMENT_START)) { 253 inMultilineComment = true; 254 } else { 255 // discard comments 256 final int cmtI = line.indexOf(COMMENT); 257 if (cmtI >= 0) { 258 line = line.substring(0, cmtI); 259 } 260 261 // trim leading-trailing whitespace 262 line = line.trim(); 263 264 if (line.isEmpty()) { 265 continue; // empty lines can be safely skipped 266 } 267 268 if (line.contains("=")) { 269 // folding 270 final String[] parts = line.split("="); 271 if (parts.length != 2) { 272 throw new IllegalArgumentException("Malformed folding statement split into " + parts.length + 273 " parts: " + rawLine + " in " + location); 274 } 275 final String leftCharacter = parts[0]; 276 final String rightCharacter = parts[1]; 277 278 if (leftCharacter.length() != 1 || rightCharacter.length() != 1) { 279 throw new IllegalArgumentException("Malformed folding statement - " + 280 "patterns are not single characters: " + rawLine + " in " + location); 281 } 282 283 asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0)); 284 } else { 285 // rule 286 final String[] parts = line.split("\\s+"); 287 if (parts.length != 4) { 288 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + 289 " parts: " + rawLine + " in " + location); 290 } 291 try { 292 final String pattern = stripQuotes(parts[0]); 293 final String replacement1 = stripQuotes(parts[1]); 294 final String replacement2 = stripQuotes(parts[2]); 295 final String replacement3 = stripQuotes(parts[3]); 296 297 final Rule r = new Rule(pattern, replacement1, replacement2, replacement3); 298 final char patternKey = r.pattern.charAt(0); 299 final List<Rule> rules = ruleMapping.computeIfAbsent(patternKey, k -> new ArrayList<>()); 300 rules.add(r); 301 } catch (final IllegalArgumentException e) { 302 throw new IllegalStateException( 303 "Problem parsing line '" + currentLine + "' in " + location, e); 304 } 305 } 306 } 307 } 308 } 309 310 private static String stripQuotes(String str) { 311 if (str.startsWith(DOUBLE_QUOTE)) { 312 str = str.substring(1); 313 } 314 315 if (str.endsWith(DOUBLE_QUOTE)) { 316 str = str.substring(0, str.length() - 1); 317 } 318 319 return str; 320 } 321 322 /** Whether to use ASCII folding prior to encoding. */ 323 private final boolean folding; 324 325 /** 326 * Creates a new instance with ASCII-folding enabled. 327 */ 328 public DaitchMokotoffSoundex() { 329 this(true); 330 } 331 332 /** 333 * Creates a new instance. 334 * <p> 335 * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, for example 336 * รจ -> e. 337 * </p> 338 * 339 * @param folding 340 * if ASCII-folding shall be performed before encoding 341 */ 342 public DaitchMokotoffSoundex(final boolean folding) { 343 this.folding = folding; 344 } 345 346 /** 347 * Performs a cleanup of the input string before the actual soundex transformation. 348 * <p> 349 * Removes all whitespace characters and performs ASCII folding if enabled. 350 * </p> 351 * 352 * @param input 353 * the input string to clean up 354 * @return a cleaned up string 355 */ 356 private String cleanup(final String input) { 357 final StringBuilder sb = new StringBuilder(); 358 for (char ch : input.toCharArray()) { 359 if (Character.isWhitespace(ch)) { 360 continue; 361 } 362 363 ch = Character.toLowerCase(ch); 364 final Character character = FOLDINGS.get(ch); 365 if (folding && character != null) { 366 ch = character; 367 } 368 sb.append(ch); 369 } 370 return sb.toString(); 371 } 372 373 /** 374 * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching. 375 * <p> 376 * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an 377 * EncoderException if the supplied object is not of type {@link String}. 378 * </p> 379 * 380 * @see #soundex(String) 381 * @param obj 382 * Object to encode 383 * @return An object (of type {@link String}) containing the DM soundex code, which corresponds to the String 384 * supplied. 385 * @throws EncoderException 386 * if the parameter supplied is not of type {@link String} 387 * @throws IllegalArgumentException 388 * if a character is not mapped 389 */ 390 @Override 391 public Object encode(final Object obj) throws EncoderException { 392 if (!(obj instanceof String)) { 393 throw new EncoderException( 394 "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String"); 395 } 396 return encode((String) obj); 397 } 398 399 /** 400 * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching. 401 * 402 * @see #soundex(String) 403 * @param source 404 * A String object to encode 405 * @return A DM Soundex code corresponding to the String supplied 406 * @throws IllegalArgumentException 407 * if a character is not mapped 408 */ 409 @Override 410 public String encode(final String source) { 411 if (source == null) { 412 return null; 413 } 414 return soundex(source, false)[0]; 415 } 416 417 /** 418 * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching. 419 * <p> 420 * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes, 421 * separated by '|'. 422 * </p> 423 * <p> 424 * Example: the name "AUERBACH" is encoded as both 425 * </p> 426 * <ul> 427 * <li>097400</li> 428 * <li>097500</li> 429 * </ul> 430 * <p> 431 * Thus the result will be "097400|097500". 432 * </p> 433 * 434 * @param source 435 * A String object to encode 436 * @return A string containing a set of DM Soundex codes corresponding to the String supplied 437 * @throws IllegalArgumentException 438 * if a character is not mapped 439 */ 440 public String soundex(final String source) { 441 return String.join("|", soundex(source, true)); 442 } 443 444 /** 445 * Perform the actual DM Soundex algorithm on the input string. 446 * 447 * @param source 448 * A String object to encode 449 * @param branching 450 * If branching shall be performed 451 * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the 452 * selected branching mode 453 */ 454 private String[] soundex(final String source, final boolean branching) { 455 if (source == null) { 456 return null; 457 } 458 459 final String input = cleanup(source); 460 461 final Set<Branch> currentBranches = new LinkedHashSet<>(); 462 currentBranches.add(new Branch()); 463 464 char lastChar = '\0'; 465 for (int index = 0; index < input.length(); index++) { 466 final char ch = input.charAt(index); 467 468 // ignore whitespace inside a name 469 if (Character.isWhitespace(ch)) { 470 continue; 471 } 472 473 final String inputContext = input.substring(index); 474 final List<Rule> rules = RULES.get(ch); 475 if (rules == null) { 476 continue; 477 } 478 479 // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access 480 final List<Branch> nextBranches = branching ? new ArrayList<>() : Collections.emptyList(); 481 482 for (final Rule rule : rules) { 483 if (rule.matches(inputContext)) { 484 if (branching) { 485 nextBranches.clear(); 486 } 487 final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0'); 488 final boolean branchingRequired = replacements.length > 1 && branching; 489 490 for (final Branch branch : currentBranches) { 491 for (final String nextReplacement : replacements) { 492 // if we have multiple replacements, always create a new branch 493 final Branch nextBranch = branchingRequired ? branch.createBranch() : branch; 494 495 // special rule: occurrences of mn or nm are treated differently 496 final boolean force = lastChar == 'm' && ch == 'n' || lastChar == 'n' && ch == 'm'; 497 498 nextBranch.processNextReplacement(nextReplacement, force); 499 500 if (!branching) { 501 break; 502 } 503 nextBranches.add(nextBranch); 504 } 505 } 506 507 if (branching) { 508 currentBranches.clear(); 509 currentBranches.addAll(nextBranches); 510 } 511 index += rule.getPatternLength() - 1; 512 break; 513 } 514 } 515 516 lastChar = ch; 517 } 518 519 final String[] result = new String[currentBranches.size()]; 520 int index = 0; 521 for (final Branch branch : currentBranches) { 522 branch.finish(); 523 result[index++] = branch.toString(); 524 } 525 526 return result; 527 } 528}