Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.codec.language;
018
019import java.util.ArrayList;
020import java.util.Arrays;
021import java.util.Collections;
022import java.util.HashMap;
023import java.util.LinkedHashSet;
024import java.util.List;
025import java.util.Map;
026import java.util.Scanner;
027import java.util.Set;
028
029import org.apache.commons.codec.CharEncoding;
030import org.apache.commons.codec.EncoderException;
031import org.apache.commons.codec.Resources;
032import org.apache.commons.codec.StringEncoder;
033
034/**
035 * Encodes a string into a Daitch-Mokotoff Soundex value.
036 * <p>
037 * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater
038 * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling.
039 * </p>
040 * <p>
041 * The main differences compared to the other soundex variants are:
042 * </p>
043 * <ul>
044 * <li>coded names are 6 digits long
045 * <li>the initial character of the name is coded
046 * <li>rules to encoded multi-character n-grams
047 * <li>multiple possible encodings for the same name (branching)
048 * </ul>
049 * <p>
050 * This implementation supports branching, depending on the used method:
051 * <ul>
052 * <li>{@link #encode(String)} - branching disabled, only the first code will be returned
053 * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|'
054 * </ul>
055 * <p>
056 * Note: This implementation has additional branching rules compared to the original description of the algorithm. The
057 * rules can be customized by overriding the default rules contained in the resource file
058 * {@code org/apache/commons/codec/language/dmrules.txt}.
059 * </p>
060 * <p>
061 * This class is thread-safe.
062 * </p>
063 *
064 * @see Soundex
065 * @see <a href="https://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
066 * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
067 * @since 1.10
068 */
069public class DaitchMokotoffSoundex implements StringEncoder {
070
071    /**
072     * Inner class representing a branch during DM soundex encoding.
073     */
074    private static final class Branch {
075        private final StringBuilder builder;
076        private String cachedString;
077        private String lastReplacement;
078
079        private Branch() {
080            builder = new StringBuilder();
081            lastReplacement = null;
082            cachedString = null;
083        }
084
085        /**
086         * Creates a new branch, identical to this branch.
087         *
088         * @return a new, identical branch
089         */
090        public Branch createBranch() {
091            final Branch branch = new Branch();
092            branch.builder.append(toString());
093            branch.lastReplacement = this.lastReplacement;
094            return branch;
095        }
096
097        @Override
098        public boolean equals(final Object other) {
099            if (this == other) {
100                return true;
101            }
102            if (!(other instanceof Branch)) {
103                return false;
104            }
105
106            return toString().equals(((Branch) other).toString());
107        }
108
109        /**
110         * Finish this branch by appending '0's until the maximum code length has been reached.
111         */
112        public void finish() {
113            while (builder.length() < MAX_LENGTH) {
114                builder.append('0');
115                cachedString = null;
116            }
117        }
118
119        @Override
120        public int hashCode() {
121            return toString().hashCode();
122        }
123
124        /**
125         * Process the next replacement to be added to this branch.
126         *
127         * @param replacement
128         *            the next replacement to append
129         * @param forceAppend
130         *            indicates if the default processing shall be overridden
131         */
132        public void processNextReplacement(final String replacement, final boolean forceAppend) {
133            final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend;
134
135            if (append && builder.length() < MAX_LENGTH) {
136                builder.append(replacement);
137                // remove all characters after the maximum length
138                if (builder.length() > MAX_LENGTH) {
139                    builder.delete(MAX_LENGTH, builder.length());
140                }
141                cachedString = null;
142            }
143
144            lastReplacement = replacement;
145        }
146
147        @Override
148        public String toString() {
149            if (cachedString == null) {
150                cachedString = builder.toString();
151            }
152            return cachedString;
153        }
154    }
155
156    /**
157     * Inner class for storing rules.
158     */
159    private static final class Rule {
160        private final String pattern;
161        private final String[] replacementAtStart;
162        private final String[] replacementBeforeVowel;
163        private final String[] replacementDefault;
164
165        protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
166                final String replacementDefault) {
167            this.pattern = pattern;
168            this.replacementAtStart = replacementAtStart.split("\\|");
169            this.replacementBeforeVowel = replacementBeforeVowel.split("\\|");
170            this.replacementDefault = replacementDefault.split("\\|");
171        }
172
173        public int getPatternLength() {
174            return pattern.length();
175        }
176
177        public String[] getReplacements(final String context, final boolean atStart) {
178            if (atStart) {
179                return replacementAtStart;
180            }
181
182            final int nextIndex = getPatternLength();
183            final boolean nextCharIsVowel = nextIndex < context.length() && isVowel(context.charAt(nextIndex));
184            if (nextCharIsVowel) {
185                return replacementBeforeVowel;
186            }
187
188            return replacementDefault;
189        }
190
191        private boolean isVowel(final char ch) {
192            return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
193        }
194
195        public boolean matches(final String context) {
196            return context.startsWith(pattern);
197        }
198
199        @Override
200        public String toString() {
201            return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
202                    Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
203        }
204    }
205
206    private static final String COMMENT = "//";
207
208    private static final String DOUBLE_QUOTE = "\"";
209
210    private static final String MULTILINE_COMMENT_END = "*/";
211
212    private static final String MULTILINE_COMMENT_START = "/*";
213
214    /** The resource file containing the replacement and folding rules */
215    private static final String RESOURCE_FILE = "/org/apache/commons/codec/language/dmrules.txt";
216
217    /** The code length of a DM soundex value. */
218    private static final int MAX_LENGTH = 6;
219
220    /** Transformation rules indexed by the first character of their pattern. */
221    private static final Map<Character, List<Rule>> RULES = new HashMap<>();
222
223    /** Folding rules. */
224    private static final Map<Character, Character> FOLDINGS = new HashMap<>();
225
226    static {
227        try (Scanner scanner = new Scanner(Resources.getInputStream(RESOURCE_FILE), CharEncoding.UTF_8)) {
228            parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
229        }
230
231        // sort RULES by pattern length in descending order
232        RULES.forEach((k, v) -> v.sort((rule1, rule2) -> rule2.getPatternLength() - rule1.getPatternLength()));
233    }
234
235    private static void parseRules(final Scanner scanner, final String location,
236            final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) {
237        int currentLine = 0;
238        boolean inMultilineComment = false;
239
240        while (scanner.hasNextLine()) {
241            currentLine++;
242            final String rawLine = scanner.nextLine();
243            String line = rawLine;
244
245            if (inMultilineComment) {
246                if (line.endsWith(MULTILINE_COMMENT_END)) {
247                    inMultilineComment = false;
248                }
249                continue;
250            }
251
252            if (line.startsWith(MULTILINE_COMMENT_START)) {
253                inMultilineComment = true;
254            } else {
255                // discard comments
256                final int cmtI = line.indexOf(COMMENT);
257                if (cmtI >= 0) {
258                    line = line.substring(0, cmtI);
259                }
260
261                // trim leading-trailing whitespace
262                line = line.trim();
263
264                if (line.isEmpty()) {
265                    continue; // empty lines can be safely skipped
266                }
267
268                if (line.contains("=")) {
269                    // folding
270                    final String[] parts = line.split("=");
271                    if (parts.length != 2) {
272                        throw new IllegalArgumentException("Malformed folding statement split into " + parts.length +
273                                " parts: " + rawLine + " in " + location);
274                    }
275                    final String leftCharacter = parts[0];
276                    final String rightCharacter = parts[1];
277
278                    if (leftCharacter.length() != 1 || rightCharacter.length() != 1) {
279                        throw new IllegalArgumentException("Malformed folding statement - " +
280                                "patterns are not single characters: " + rawLine + " in " + location);
281                    }
282
283                    asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
284                } else {
285                    // rule
286                    final String[] parts = line.split("\\s+");
287                    if (parts.length != 4) {
288                        throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
289                                " parts: " + rawLine + " in " + location);
290                    }
291                    try {
292                        final String pattern = stripQuotes(parts[0]);
293                        final String replacement1 = stripQuotes(parts[1]);
294                        final String replacement2 = stripQuotes(parts[2]);
295                        final String replacement3 = stripQuotes(parts[3]);
296
297                        final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
298                        final char patternKey = r.pattern.charAt(0);
299                        final List<Rule> rules = ruleMapping.computeIfAbsent(patternKey, k -> new ArrayList<>());
300                        rules.add(r);
301                    } catch (final IllegalArgumentException e) {
302                        throw new IllegalStateException(
303                                "Problem parsing line '" + currentLine + "' in " + location, e);
304                    }
305                }
306            }
307        }
308    }
309
310    private static String stripQuotes(String str) {
311        if (str.startsWith(DOUBLE_QUOTE)) {
312            str = str.substring(1);
313        }
314
315        if (str.endsWith(DOUBLE_QUOTE)) {
316            str = str.substring(0, str.length() - 1);
317        }
318
319        return str;
320    }
321
322    /** Whether to use ASCII folding prior to encoding. */
323    private final boolean folding;
324
325    /**
326     * Creates a new instance with ASCII-folding enabled.
327     */
328    public DaitchMokotoffSoundex() {
329        this(true);
330    }
331
332    /**
333     * Creates a new instance.
334     * <p>
335     * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, for example
336     * è -&gt; e.
337     * </p>
338     *
339     * @param folding
340     *            if ASCII-folding shall be performed before encoding
341     */
342    public DaitchMokotoffSoundex(final boolean folding) {
343        this.folding = folding;
344    }
345
346    /**
347     * Performs a cleanup of the input string before the actual soundex transformation.
348     * <p>
349     * Removes all whitespace characters and performs ASCII folding if enabled.
350     * </p>
351     *
352     * @param input
353     *            the input string to clean up
354     * @return a cleaned up string
355     */
356    private String cleanup(final String input) {
357        final StringBuilder sb = new StringBuilder();
358        for (char ch : input.toCharArray()) {
359            if (Character.isWhitespace(ch)) {
360                continue;
361            }
362
363            ch = Character.toLowerCase(ch);
364            final Character character = FOLDINGS.get(ch);
365            if (folding && character != null) {
366                ch = character;
367            }
368            sb.append(ch);
369        }
370        return sb.toString();
371    }
372
373    /**
374     * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
375     * <p>
376     * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
377     * EncoderException if the supplied object is not of type {@link String}.
378     * </p>
379     *
380     * @see #soundex(String)
381     * @param obj
382     *            Object to encode
383     * @return An object (of type {@link String}) containing the DM soundex code, which corresponds to the String
384     *         supplied.
385     * @throws EncoderException
386     *             if the parameter supplied is not of type {@link String}
387     * @throws IllegalArgumentException
388     *             if a character is not mapped
389     */
390    @Override
391    public Object encode(final Object obj) throws EncoderException {
392        if (!(obj instanceof String)) {
393            throw new EncoderException(
394                    "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
395        }
396        return encode((String) obj);
397    }
398
399    /**
400     * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching.
401     *
402     * @see #soundex(String)
403     * @param source
404     *            A String object to encode
405     * @return A DM Soundex code corresponding to the String supplied
406     * @throws IllegalArgumentException
407     *             if a character is not mapped
408     */
409    @Override
410    public String encode(final String source) {
411        if (source == null) {
412            return null;
413        }
414        return soundex(source, false)[0];
415    }
416
417    /**
418     * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching.
419     * <p>
420     * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
421     * separated by '|'.
422     * </p>
423     * <p>
424     * Example: the name "AUERBACH" is encoded as both
425     * </p>
426     * <ul>
427     * <li>097400</li>
428     * <li>097500</li>
429     * </ul>
430     * <p>
431     * Thus the result will be "097400|097500".
432     * </p>
433     *
434     * @param source
435     *            A String object to encode
436     * @return A string containing a set of DM Soundex codes corresponding to the String supplied
437     * @throws IllegalArgumentException
438     *             if a character is not mapped
439     */
440    public String soundex(final String source) {
441        return String.join("|", soundex(source, true));
442    }
443
444    /**
445     * Perform the actual DM Soundex algorithm on the input string.
446     *
447     * @param source
448     *            A String object to encode
449     * @param branching
450     *            If branching shall be performed
451     * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the
452     *         selected branching mode
453     */
454    private String[] soundex(final String source, final boolean branching) {
455        if (source == null) {
456            return null;
457        }
458
459        final String input = cleanup(source);
460
461        final Set<Branch> currentBranches = new LinkedHashSet<>();
462        currentBranches.add(new Branch());
463
464        char lastChar = '\0';
465        for (int index = 0; index < input.length(); index++) {
466            final char ch = input.charAt(index);
467
468            // ignore whitespace inside a name
469            if (Character.isWhitespace(ch)) {
470                continue;
471            }
472
473            final String inputContext = input.substring(index);
474            final List<Rule> rules = RULES.get(ch);
475            if (rules == null) {
476                continue;
477            }
478
479            // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
480            final List<Branch> nextBranches = branching ? new ArrayList<>() : Collections.emptyList();
481
482            for (final Rule rule : rules) {
483                if (rule.matches(inputContext)) {
484                    if (branching) {
485                        nextBranches.clear();
486                    }
487                    final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
488                    final boolean branchingRequired = replacements.length > 1 && branching;
489
490                    for (final Branch branch : currentBranches) {
491                        for (final String nextReplacement : replacements) {
492                            // if we have multiple replacements, always create a new branch
493                            final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;
494
495                            // special rule: occurrences of mn or nm are treated differently
496                            final boolean force = lastChar == 'm' && ch == 'n' || lastChar == 'n' && ch == 'm';
497
498                            nextBranch.processNextReplacement(nextReplacement, force);
499
500                            if (!branching) {
501                                break;
502                            }
503                            nextBranches.add(nextBranch);
504                        }
505                    }
506
507                    if (branching) {
508                        currentBranches.clear();
509                        currentBranches.addAll(nextBranches);
510                    }
511                    index += rule.getPatternLength() - 1;
512                    break;
513                }
514            }
515
516            lastChar = ch;
517        }
518
519        final String[] result = new String[currentBranches.size()];
520        int index = 0;
521        for (final Branch branch : currentBranches) {
522            branch.finish();
523            result[index++] = branch.toString();
524        }
525
526        return result;
527    }
528}