DaitchMokotoffSoundex.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.codec.language;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import org.apache.commons.codec.CharEncoding;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.Resources;
import org.apache.commons.codec.StringEncoder;
/**
* Encodes a string into a Daitch-Mokotoff Soundex value.
* <p>
* The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater
* accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling.
* </p>
* <p>
* The main differences compared to the other soundex variants are:
* </p>
* <ul>
* <li>coded names are 6 digits long
* <li>the initial character of the name is coded
* <li>rules to encoded multi-character n-grams
* <li>multiple possible encodings for the same name (branching)
* </ul>
* <p>
* This implementation supports branching, depending on the used method:
* <ul>
* <li>{@link #encode(String)} - branching disabled, only the first code will be returned
* <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|'
* </ul>
* <p>
* Note: this implementation has additional branching rules compared to the original description of the algorithm. The
* rules can be customized by overriding the default rules contained in the resource file
* {@code org/apache/commons/codec/language/dmrules.txt}.
* </p>
* <p>
* This class is thread-safe.
* </p>
*
* @see Soundex
* @see <a href="https://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
* @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
*
* @since 1.10
*/
public class DaitchMokotoffSoundex implements StringEncoder {
/**
* Inner class representing a branch during DM soundex encoding.
*/
private static final class Branch {
private final StringBuilder builder;
private String cachedString;
private String lastReplacement;
private Branch() {
builder = new StringBuilder();
lastReplacement = null;
cachedString = null;
}
/**
* Creates a new branch, identical to this branch.
*
* @return a new, identical branch
*/
public Branch createBranch() {
final Branch branch = new Branch();
branch.builder.append(toString());
branch.lastReplacement = this.lastReplacement;
return branch;
}
@Override
public boolean equals(final Object other) {
if (this == other) {
return true;
}
if (!(other instanceof Branch)) {
return false;
}
return toString().equals(((Branch) other).toString());
}
/**
* Finish this branch by appending '0's until the maximum code length has been reached.
*/
public void finish() {
while (builder.length() < MAX_LENGTH) {
builder.append('0');
cachedString = null;
}
}
@Override
public int hashCode() {
return toString().hashCode();
}
/**
* Process the next replacement to be added to this branch.
*
* @param replacement
* the next replacement to append
* @param forceAppend
* indicates if the default processing shall be overridden
*/
public void processNextReplacement(final String replacement, final boolean forceAppend) {
final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend;
if (append && builder.length() < MAX_LENGTH) {
builder.append(replacement);
// remove all characters after the maximum length
if (builder.length() > MAX_LENGTH) {
builder.delete(MAX_LENGTH, builder.length());
}
cachedString = null;
}
lastReplacement = replacement;
}
@Override
public String toString() {
if (cachedString == null) {
cachedString = builder.toString();
}
return cachedString;
}
}
/**
* Inner class for storing rules.
*/
private static final class Rule {
private final String pattern;
private final String[] replacementAtStart;
private final String[] replacementBeforeVowel;
private final String[] replacementDefault;
protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
final String replacementDefault) {
this.pattern = pattern;
this.replacementAtStart = replacementAtStart.split("\\|");
this.replacementBeforeVowel = replacementBeforeVowel.split("\\|");
this.replacementDefault = replacementDefault.split("\\|");
}
public int getPatternLength() {
return pattern.length();
}
public String[] getReplacements(final String context, final boolean atStart) {
if (atStart) {
return replacementAtStart;
}
final int nextIndex = getPatternLength();
final boolean nextCharIsVowel = nextIndex < context.length() && isVowel(context.charAt(nextIndex));
if (nextCharIsVowel) {
return replacementBeforeVowel;
}
return replacementDefault;
}
private boolean isVowel(final char ch) {
return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
}
public boolean matches(final String context) {
return context.startsWith(pattern);
}
@Override
public String toString() {
return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
}
}
private static final String COMMENT = "//";
private static final String DOUBLE_QUOTE = "\"";
private static final String MULTILINE_COMMENT_END = "*/";
private static final String MULTILINE_COMMENT_START = "/*";
/** The resource file containing the replacement and folding rules */
private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt";
/** The code length of a DM soundex value. */
private static final int MAX_LENGTH = 6;
/** Transformation rules indexed by the first character of their pattern. */
private static final Map<Character, List<Rule>> RULES = new HashMap<>();
/** Folding rules. */
private static final Map<Character, Character> FOLDINGS = new HashMap<>();
static {
try (final Scanner scanner = new Scanner(Resources.getInputStream(RESOURCE_FILE), CharEncoding.UTF_8)) {
parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
}
// sort RULES by pattern length in descending order
RULES.forEach((k, v) -> v.sort((rule1, rule2) -> rule2.getPatternLength() - rule1.getPatternLength()));
}
private static void parseRules(final Scanner scanner, final String location,
final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) {
int currentLine = 0;
boolean inMultilineComment = false;
while (scanner.hasNextLine()) {
currentLine++;
final String rawLine = scanner.nextLine();
String line = rawLine;
if (inMultilineComment) {
if (line.endsWith(MULTILINE_COMMENT_END)) {
inMultilineComment = false;
}
continue;
}
if (line.startsWith(MULTILINE_COMMENT_START)) {
inMultilineComment = true;
} else {
// discard comments
final int cmtI = line.indexOf(COMMENT);
if (cmtI >= 0) {
line = line.substring(0, cmtI);
}
// trim leading-trailing whitespace
line = line.trim();
if (line.isEmpty()) {
continue; // empty lines can be safely skipped
}
if (line.contains("=")) {
// folding
final String[] parts = line.split("=");
if (parts.length != 2) {
throw new IllegalArgumentException("Malformed folding statement split into " + parts.length +
" parts: " + rawLine + " in " + location);
}
final String leftCharacter = parts[0];
final String rightCharacter = parts[1];
if (leftCharacter.length() != 1 || rightCharacter.length() != 1) {
throw new IllegalArgumentException("Malformed folding statement - " +
"patterns are not single characters: " + rawLine + " in " + location);
}
asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
} else {
// rule
final String[] parts = line.split("\\s+");
if (parts.length != 4) {
throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
" parts: " + rawLine + " in " + location);
}
try {
final String pattern = stripQuotes(parts[0]);
final String replacement1 = stripQuotes(parts[1]);
final String replacement2 = stripQuotes(parts[2]);
final String replacement3 = stripQuotes(parts[3]);
final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
final char patternKey = r.pattern.charAt(0);
final List<Rule> rules = ruleMapping.computeIfAbsent(patternKey, k -> new ArrayList<>());
rules.add(r);
} catch (final IllegalArgumentException e) {
throw new IllegalStateException(
"Problem parsing line '" + currentLine + "' in " + location, e);
}
}
}
}
}
private static String stripQuotes(String str) {
if (str.startsWith(DOUBLE_QUOTE)) {
str = str.substring(1);
}
if (str.endsWith(DOUBLE_QUOTE)) {
str = str.substring(0, str.length() - 1);
}
return str;
}
/** Whether to use ASCII folding prior to encoding. */
private final boolean folding;
/**
* Creates a new instance with ASCII-folding enabled.
*/
public DaitchMokotoffSoundex() {
this(true);
}
/**
* Creates a new instance.
* <p>
* With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g.
* รจ -> e.
* </p>
*
* @param folding
* if ASCII-folding shall be performed before encoding
*/
public DaitchMokotoffSoundex(final boolean folding) {
this.folding = folding;
}
/**
* Performs a cleanup of the input string before the actual soundex transformation.
* <p>
* Removes all whitespace characters and performs ASCII folding if enabled.
* </p>
*
* @param input
* the input string to clean up
* @return a cleaned up string
*/
private String cleanup(final String input) {
final StringBuilder sb = new StringBuilder();
for (char ch : input.toCharArray()) {
if (Character.isWhitespace(ch)) {
continue;
}
ch = Character.toLowerCase(ch);
final Character character = FOLDINGS.get(ch);
if (folding && character != null) {
ch = character;
}
sb.append(ch);
}
return sb.toString();
}
/**
* Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
* <p>
* This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
* EncoderException if the supplied object is not of type {@link String}.
* </p>
*
* @see #soundex(String)
*
* @param obj
* Object to encode
* @return An object (of type {@link String}) containing the DM soundex code, which corresponds to the String
* supplied.
* @throws EncoderException
* if the parameter supplied is not of type {@link String}
* @throws IllegalArgumentException
* if a character is not mapped
*/
@Override
public Object encode(final Object obj) throws EncoderException {
if (!(obj instanceof String)) {
throw new EncoderException(
"Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
}
return encode((String) obj);
}
/**
* Encodes a String using the Daitch-Mokotoff soundex algorithm without branching.
*
* @see #soundex(String)
*
* @param source
* A String object to encode
* @return A DM Soundex code corresponding to the String supplied
* @throws IllegalArgumentException
* if a character is not mapped
*/
@Override
public String encode(final String source) {
if (source == null) {
return null;
}
return soundex(source, false)[0];
}
/**
* Encodes a String using the Daitch-Mokotoff soundex algorithm with branching.
* <p>
* In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
* separated by '|'.
* </p>
* <p>
* Example: the name "AUERBACH" is encoded as both
* </p>
* <ul>
* <li>097400</li>
* <li>097500</li>
* </ul>
* <p>
* Thus the result will be "097400|097500".
* </p>
*
* @param source
* A String object to encode
* @return A string containing a set of DM Soundex codes corresponding to the String supplied
* @throws IllegalArgumentException
* if a character is not mapped
*/
public String soundex(final String source) {
final String[] branches = soundex(source, true);
final StringBuilder sb = new StringBuilder();
int index = 0;
for (final String branch : branches) {
sb.append(branch);
if (++index < branches.length) {
sb.append('|');
}
}
return sb.toString();
}
/**
* Perform the actual DM Soundex algorithm on the input string.
*
* @param source
* A String object to encode
* @param branching
* If branching shall be performed
* @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the
* selected branching mode
*/
private String[] soundex(final String source, final boolean branching) {
if (source == null) {
return null;
}
final String input = cleanup(source);
final Set<Branch> currentBranches = new LinkedHashSet<>();
currentBranches.add(new Branch());
char lastChar = '\0';
for (int index = 0; index < input.length(); index++) {
final char ch = input.charAt(index);
// ignore whitespace inside a name
if (Character.isWhitespace(ch)) {
continue;
}
final String inputContext = input.substring(index);
final List<Rule> rules = RULES.get(ch);
if (rules == null) {
continue;
}
// use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
final List<Branch> nextBranches = branching ? new ArrayList<>() : Collections.emptyList();
for (final Rule rule : rules) {
if (rule.matches(inputContext)) {
if (branching) {
nextBranches.clear();
}
final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
final boolean branchingRequired = replacements.length > 1 && branching;
for (final Branch branch : currentBranches) {
for (final String nextReplacement : replacements) {
// if we have multiple replacements, always create a new branch
final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;
// special rule: occurrences of mn or nm are treated differently
final boolean force = lastChar == 'm' && ch == 'n' || lastChar == 'n' && ch == 'm';
nextBranch.processNextReplacement(nextReplacement, force);
if (!branching) {
break;
}
nextBranches.add(nextBranch);
}
}
if (branching) {
currentBranches.clear();
currentBranches.addAll(nextBranches);
}
index += rule.getPatternLength() - 1;
break;
}
}
lastChar = ch;
}
final String[] result = new String[currentBranches.size()];
int index = 0;
for (final Branch branch : currentBranches) {
branch.finish();
result[index++] = branch.toString();
}
return result;
}
}