View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.codec.language;
18  
19  import java.util.ArrayList;
20  import java.util.Arrays;
21  import java.util.Collections;
22  import java.util.HashMap;
23  import java.util.LinkedHashSet;
24  import java.util.List;
25  import java.util.Map;
26  import java.util.Scanner;
27  import java.util.Set;
28  
29  import org.apache.commons.codec.CharEncoding;
30  import org.apache.commons.codec.EncoderException;
31  import org.apache.commons.codec.Resources;
32  import org.apache.commons.codec.StringEncoder;
33  
34  /**
35   * Encodes a string into a Daitch-Mokotoff Soundex value.
36   * <p>
37   * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater
38   * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling.
39   * </p>
40   * <p>
41   * The main differences compared to the other soundex variants are:
42   * </p>
43   * <ul>
44   * <li>coded names are 6 digits long
45   * <li>the initial character of the name is coded
46   * <li>rules to encoded multi-character n-grams
47   * <li>multiple possible encodings for the same name (branching)
48   * </ul>
49   * <p>
50   * This implementation supports branching, depending on the used method:
51   * <ul>
52   * <li>{@link #encode(String)} - branching disabled, only the first code will be returned
53   * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|'
54   * </ul>
55   * <p>
56   * Note: This implementation has additional branching rules compared to the original description of the algorithm. The
57   * rules can be customized by overriding the default rules contained in the resource file
58   * {@code org/apache/commons/codec/language/dmrules.txt}.
59   * </p>
60   * <p>
61   * This class is thread-safe.
62   * </p>
63   *
64   * @see Soundex
65   * @see <a href="https://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
66   * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
67   * @since 1.10
68   */
69  public class DaitchMokotoffSoundex implements StringEncoder {
70  
71      /**
72       * Inner class representing a branch during DM soundex encoding.
73       */
74      private static final class Branch {
75          private final StringBuilder builder;
76          private String cachedString;
77          private String lastReplacement;
78  
79          private Branch() {
80              builder = new StringBuilder();
81              lastReplacement = null;
82              cachedString = null;
83          }
84  
85          /**
86           * Creates a new branch, identical to this branch.
87           *
88           * @return a new, identical branch
89           */
90          public Branch createBranch() {
91              final Branch branch = new Branch();
92              branch.builder.append(toString());
93              branch.lastReplacement = this.lastReplacement;
94              return branch;
95          }
96  
97          @Override
98          public boolean equals(final Object other) {
99              if (this == other) {
100                 return true;
101             }
102             if (!(other instanceof Branch)) {
103                 return false;
104             }
105 
106             return toString().equals(((Branch) other).toString());
107         }
108 
109         /**
110          * Finish this branch by appending '0's until the maximum code length has been reached.
111          */
112         public void finish() {
113             while (builder.length() < MAX_LENGTH) {
114                 builder.append('0');
115                 cachedString = null;
116             }
117         }
118 
119         @Override
120         public int hashCode() {
121             return toString().hashCode();
122         }
123 
124         /**
125          * Process the next replacement to be added to this branch.
126          *
127          * @param replacement
128          *            the next replacement to append
129          * @param forceAppend
130          *            indicates if the default processing shall be overridden
131          */
132         public void processNextReplacement(final String replacement, final boolean forceAppend) {
133             final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend;
134 
135             if (append && builder.length() < MAX_LENGTH) {
136                 builder.append(replacement);
137                 // remove all characters after the maximum length
138                 if (builder.length() > MAX_LENGTH) {
139                     builder.delete(MAX_LENGTH, builder.length());
140                 }
141                 cachedString = null;
142             }
143 
144             lastReplacement = replacement;
145         }
146 
147         @Override
148         public String toString() {
149             if (cachedString == null) {
150                 cachedString = builder.toString();
151             }
152             return cachedString;
153         }
154     }
155 
156     /**
157      * Inner class for storing rules.
158      */
159     private static final class Rule {
160         private final String pattern;
161         private final String[] replacementAtStart;
162         private final String[] replacementBeforeVowel;
163         private final String[] replacementDefault;
164 
165         protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
166                 final String replacementDefault) {
167             this.pattern = pattern;
168             this.replacementAtStart = replacementAtStart.split("\\|");
169             this.replacementBeforeVowel = replacementBeforeVowel.split("\\|");
170             this.replacementDefault = replacementDefault.split("\\|");
171         }
172 
173         public int getPatternLength() {
174             return pattern.length();
175         }
176 
177         public String[] getReplacements(final String context, final boolean atStart) {
178             if (atStart) {
179                 return replacementAtStart;
180             }
181 
182             final int nextIndex = getPatternLength();
183             final boolean nextCharIsVowel = nextIndex < context.length() && isVowel(context.charAt(nextIndex));
184             if (nextCharIsVowel) {
185                 return replacementBeforeVowel;
186             }
187 
188             return replacementDefault;
189         }
190 
191         private boolean isVowel(final char ch) {
192             return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
193         }
194 
195         public boolean matches(final String context) {
196             return context.startsWith(pattern);
197         }
198 
199         @Override
200         public String toString() {
201             return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
202                     Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
203         }
204     }
205 
206     private static final String COMMENT = "//";
207 
208     private static final String DOUBLE_QUOTE = "\"";
209 
210     private static final String MULTILINE_COMMENT_END = "*/";
211 
212     private static final String MULTILINE_COMMENT_START = "/*";
213 
214     /** The resource file containing the replacement and folding rules */
215     private static final String RESOURCE_FILE = "/org/apache/commons/codec/language/dmrules.txt";
216 
217     /** The code length of a DM soundex value. */
218     private static final int MAX_LENGTH = 6;
219 
220     /** Transformation rules indexed by the first character of their pattern. */
221     private static final Map<Character, List<Rule>> RULES = new HashMap<>();
222 
223     /** Folding rules. */
224     private static final Map<Character, Character> FOLDINGS = new HashMap<>();
225 
226     static {
227         try (Scanner scanner = new Scanner(Resources.getInputStream(RESOURCE_FILE), CharEncoding.UTF_8)) {
228             parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
229         }
230 
231         // sort RULES by pattern length in descending order
232         RULES.forEach((k, v) -> v.sort((rule1, rule2) -> rule2.getPatternLength() - rule1.getPatternLength()));
233     }
234 
235     private static void parseRules(final Scanner scanner, final String location,
236             final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) {
237         int currentLine = 0;
238         boolean inMultilineComment = false;
239 
240         while (scanner.hasNextLine()) {
241             currentLine++;
242             final String rawLine = scanner.nextLine();
243             String line = rawLine;
244 
245             if (inMultilineComment) {
246                 if (line.endsWith(MULTILINE_COMMENT_END)) {
247                     inMultilineComment = false;
248                 }
249                 continue;
250             }
251 
252             if (line.startsWith(MULTILINE_COMMENT_START)) {
253                 inMultilineComment = true;
254             } else {
255                 // discard comments
256                 final int cmtI = line.indexOf(COMMENT);
257                 if (cmtI >= 0) {
258                     line = line.substring(0, cmtI);
259                 }
260 
261                 // trim leading-trailing whitespace
262                 line = line.trim();
263 
264                 if (line.isEmpty()) {
265                     continue; // empty lines can be safely skipped
266                 }
267 
268                 if (line.contains("=")) {
269                     // folding
270                     final String[] parts = line.split("=");
271                     if (parts.length != 2) {
272                         throw new IllegalArgumentException("Malformed folding statement split into " + parts.length +
273                                 " parts: " + rawLine + " in " + location);
274                     }
275                     final String leftCharacter = parts[0];
276                     final String rightCharacter = parts[1];
277 
278                     if (leftCharacter.length() != 1 || rightCharacter.length() != 1) {
279                         throw new IllegalArgumentException("Malformed folding statement - " +
280                                 "patterns are not single characters: " + rawLine + " in " + location);
281                     }
282 
283                     asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
284                 } else {
285                     // rule
286                     final String[] parts = line.split("\\s+");
287                     if (parts.length != 4) {
288                         throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
289                                 " parts: " + rawLine + " in " + location);
290                     }
291                     try {
292                         final String pattern = stripQuotes(parts[0]);
293                         final String replacement1 = stripQuotes(parts[1]);
294                         final String replacement2 = stripQuotes(parts[2]);
295                         final String replacement3 = stripQuotes(parts[3]);
296 
297                         final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
298                         final char patternKey = r.pattern.charAt(0);
299                         final List<Rule> rules = ruleMapping.computeIfAbsent(patternKey, k -> new ArrayList<>());
300                         rules.add(r);
301                     } catch (final IllegalArgumentException e) {
302                         throw new IllegalStateException(
303                                 "Problem parsing line '" + currentLine + "' in " + location, e);
304                     }
305                 }
306             }
307         }
308     }
309 
310     private static String stripQuotes(String str) {
311         if (str.startsWith(DOUBLE_QUOTE)) {
312             str = str.substring(1);
313         }
314 
315         if (str.endsWith(DOUBLE_QUOTE)) {
316             str = str.substring(0, str.length() - 1);
317         }
318 
319         return str;
320     }
321 
322     /** Whether to use ASCII folding prior to encoding. */
323     private final boolean folding;
324 
325     /**
326      * Creates a new instance with ASCII-folding enabled.
327      */
328     public DaitchMokotoffSoundex() {
329         this(true);
330     }
331 
332     /**
333      * Creates a new instance.
334      * <p>
335      * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g.
336      * è -&gt; e.
337      * </p>
338      *
339      * @param folding
340      *            if ASCII-folding shall be performed before encoding
341      */
342     public DaitchMokotoffSoundex(final boolean folding) {
343         this.folding = folding;
344     }
345 
346     /**
347      * Performs a cleanup of the input string before the actual soundex transformation.
348      * <p>
349      * Removes all whitespace characters and performs ASCII folding if enabled.
350      * </p>
351      *
352      * @param input
353      *            the input string to clean up
354      * @return a cleaned up string
355      */
356     private String cleanup(final String input) {
357         final StringBuilder sb = new StringBuilder();
358         for (char ch : input.toCharArray()) {
359             if (Character.isWhitespace(ch)) {
360                 continue;
361             }
362 
363             ch = Character.toLowerCase(ch);
364             final Character character = FOLDINGS.get(ch);
365             if (folding && character != null) {
366                 ch = character;
367             }
368             sb.append(ch);
369         }
370         return sb.toString();
371     }
372 
373     /**
374      * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
375      * <p>
376      * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
377      * EncoderException if the supplied object is not of type {@link String}.
378      * </p>
379      *
380      * @see #soundex(String)
381      * @param obj
382      *            Object to encode
383      * @return An object (of type {@link String}) containing the DM soundex code, which corresponds to the String
384      *         supplied.
385      * @throws EncoderException
386      *             if the parameter supplied is not of type {@link String}
387      * @throws IllegalArgumentException
388      *             if a character is not mapped
389      */
390     @Override
391     public Object encode(final Object obj) throws EncoderException {
392         if (!(obj instanceof String)) {
393             throw new EncoderException(
394                     "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
395         }
396         return encode((String) obj);
397     }
398 
399     /**
400      * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching.
401      *
402      * @see #soundex(String)
403      * @param source
404      *            A String object to encode
405      * @return A DM Soundex code corresponding to the String supplied
406      * @throws IllegalArgumentException
407      *             if a character is not mapped
408      */
409     @Override
410     public String encode(final String source) {
411         if (source == null) {
412             return null;
413         }
414         return soundex(source, false)[0];
415     }
416 
417     /**
418      * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching.
419      * <p>
420      * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
421      * separated by '|'.
422      * </p>
423      * <p>
424      * Example: the name "AUERBACH" is encoded as both
425      * </p>
426      * <ul>
427      * <li>097400</li>
428      * <li>097500</li>
429      * </ul>
430      * <p>
431      * Thus the result will be "097400|097500".
432      * </p>
433      *
434      * @param source
435      *            A String object to encode
436      * @return A string containing a set of DM Soundex codes corresponding to the String supplied
437      * @throws IllegalArgumentException
438      *             if a character is not mapped
439      */
440     public String soundex(final String source) {
441         return String.join("|", soundex(source, true));
442     }
443 
444     /**
445      * Perform the actual DM Soundex algorithm on the input string.
446      *
447      * @param source
448      *            A String object to encode
449      * @param branching
450      *            If branching shall be performed
451      * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the
452      *         selected branching mode
453      */
454     private String[] soundex(final String source, final boolean branching) {
455         if (source == null) {
456             return null;
457         }
458 
459         final String input = cleanup(source);
460 
461         final Set<Branch> currentBranches = new LinkedHashSet<>();
462         currentBranches.add(new Branch());
463 
464         char lastChar = '\0';
465         for (int index = 0; index < input.length(); index++) {
466             final char ch = input.charAt(index);
467 
468             // ignore whitespace inside a name
469             if (Character.isWhitespace(ch)) {
470                 continue;
471             }
472 
473             final String inputContext = input.substring(index);
474             final List<Rule> rules = RULES.get(ch);
475             if (rules == null) {
476                 continue;
477             }
478 
479             // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
480             final List<Branch> nextBranches = branching ? new ArrayList<>() : Collections.emptyList();
481 
482             for (final Rule rule : rules) {
483                 if (rule.matches(inputContext)) {
484                     if (branching) {
485                         nextBranches.clear();
486                     }
487                     final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
488                     final boolean branchingRequired = replacements.length > 1 && branching;
489 
490                     for (final Branch branch : currentBranches) {
491                         for (final String nextReplacement : replacements) {
492                             // if we have multiple replacements, always create a new branch
493                             final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;
494 
495                             // special rule: occurrences of mn or nm are treated differently
496                             final boolean force = lastChar == 'm' && ch == 'n' || lastChar == 'n' && ch == 'm';
497 
498                             nextBranch.processNextReplacement(nextReplacement, force);
499 
500                             if (!branching) {
501                                 break;
502                             }
503                             nextBranches.add(nextBranch);
504                         }
505                     }
506 
507                     if (branching) {
508                         currentBranches.clear();
509                         currentBranches.addAll(nextBranches);
510                     }
511                     index += rule.getPatternLength() - 1;
512                     break;
513                 }
514             }
515 
516             lastChar = ch;
517         }
518 
519         final String[] result = new String[currentBranches.size()];
520         int index = 0;
521         for (final Branch branch : currentBranches) {
522             branch.finish();
523             result[index++] = branch.toString();
524         }
525 
526         return result;
527     }
528 }