1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 package org.apache.commons.codec.language; 19 20 import org.apache.commons.codec.EncoderException; 21 import org.apache.commons.codec.StringEncoder; 22 23 /** 24 * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a 25 * general purpose scheme to find word with similar phonemes. 26 * 27 * <p>This class is thread-safe. 28 * Although not strictly immutable, the mutable fields are not actually used.</p> 29 */ 30 public class Soundex implements StringEncoder { 31 32 /** 33 * The marker character used to indicate a silent (ignored) character. 34 * These are ignored except when they appear as the first character. 35 * <p> 36 * Note: the {@link #US_ENGLISH_MAPPING_STRING} does not use this mechanism 37 * because changing it might break existing code. Mappings that don't contain 38 * a silent marker code are treated as though H and W are silent. 39 * </p> 40 * <p> 41 * To override this, use the {@link #Soundex(String, boolean)} constructor. 42 * </p> 43 * 44 * @since 1.11 45 */ 46 public static final char SILENT_MARKER = '-'; 47 48 /** 49 * This is a default mapping of the 26 letters used in US English. A value of {@code 0} for a letter position 50 * means do not encode, but treat as a separator when it occurs between consonants with the same code. 51 * <p> 52 * (This constant is provided as both an implementation convenience and to allow Javadoc to pick 53 * up the value for the constant values page.) 54 * </p> 55 * <p> 56 * <b>Note that letters H and W are treated specially.</b> 57 * They are ignored (after the first letter) and don't act as separators 58 * between consonants with the same code. 59 * </p> 60 */ 61 public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202"; 62 63 /** 64 * This is a default mapping of the 26 letters used in US English. A value of {@code 0} for a letter position 65 * means do not encode. 66 * 67 * @see Soundex#Soundex(char[]) 68 */ 69 private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray(); 70 71 /** 72 * An instance of Soundex using the US_ENGLISH_MAPPING mapping. 73 * This treats H and W as silent letters. 74 * Apart from when they appear as the first letter, they are ignored. 75 * They don't act as separators between duplicate codes. 76 * 77 * @see #US_ENGLISH_MAPPING_STRING 78 */ 79 public static final Soundex US_ENGLISH = new Soundex(); 80 81 /** 82 * An instance of Soundex using the Simplified Soundex mapping, as described here: 83 * http://west-penwith.org.uk/misc/soundex.htm 84 * <p> 85 * This treats H and W the same as vowels (AEIOUY). 86 * Such letters aren't encoded (after the first), but they do 87 * act as separators when dropping duplicate codes. 88 * The mapping is otherwise the same as for {@link #US_ENGLISH} 89 * </p> 90 * 91 * @since 1.11 92 */ 93 public static final Soundex US_ENGLISH_SIMPLIFIED = new Soundex(US_ENGLISH_MAPPING_STRING, false); 94 95 /** 96 * An instance of Soundex using the mapping as per the Genealogy site: 97 * http://www.genealogy.com/articles/research/00000060.html 98 * <p> 99 * This treats vowels (AEIOUY), H and W as silent letters. 100 * Such letters are ignored (after the first) and do not 101 * act as separators when dropping duplicate codes. 102 * </p> 103 * <p> 104 * The codes for consonants are otherwise the same as for 105 * {@link #US_ENGLISH_MAPPING_STRING} and {@link #US_ENGLISH_SIMPLIFIED} 106 * </p> 107 * 108 * @since 1.11 109 */ 110 public static final Soundex US_ENGLISH_GENEALOGY = new Soundex("-123-12--22455-12623-1-2-2"); 111 // ABCDEFGHIJKLMNOPQRSTUVWXYZ 112 113 /** 114 * The maximum length of a Soundex code - Soundex codes are only four characters by definition. 115 * 116 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. 117 */ 118 @Deprecated 119 private int maxLength = 4; 120 121 /** 122 * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each 123 * letter is mapped. This implementation contains a default map for US_ENGLISH 124 */ 125 private final char[] soundexMapping; 126 127 /** 128 * Should H and W be treated specially? 129 * <p> 130 * In versions of the code prior to 1.11, 131 * the code always treated H and W as silent (ignored) letters. 132 * If this field is false, H and W are no longer special-cased. 133 * </p> 134 */ 135 private final boolean specialCaseHW; 136 137 /** 138 * Creates an instance using US_ENGLISH_MAPPING 139 * 140 * @see Soundex#Soundex(char[]) 141 * @see Soundex#US_ENGLISH_MAPPING_STRING 142 */ 143 public Soundex() { 144 this.soundexMapping = US_ENGLISH_MAPPING; 145 this.specialCaseHW = true; 146 } 147 148 /** 149 * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized 150 * mapping for a non-Western character set. 151 * <p> 152 * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each 153 * letter is mapped. This implementation contains a default map for US_ENGLISH 154 * </p> 155 * <p> 156 * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment 157 * </p> 158 * 159 * @param mapping 160 * Mapping array to use when finding the corresponding code for a given character 161 */ 162 public Soundex(final char[] mapping) { 163 this.soundexMapping = mapping.clone(); 164 this.specialCaseHW = !hasMarker(this.soundexMapping); 165 } 166 167 /** 168 * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping, 169 * and/or possibly provide an internationalized mapping for a non-Western character set. 170 * <p> 171 * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment 172 * </p> 173 * 174 * @param mapping 175 * Mapping string to use when finding the corresponding code for a given character 176 * @since 1.4 177 */ 178 public Soundex(final String mapping) { 179 this.soundexMapping = mapping.toCharArray(); 180 this.specialCaseHW = !hasMarker(this.soundexMapping); 181 } 182 183 /** 184 * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping, 185 * and/or possibly provide an internationalized mapping for a non-Western character set. 186 * 187 * @param mapping 188 * Mapping string to use when finding the corresponding code for a given character 189 * @param specialCaseHW if true, then 190 * @since 1.11 191 */ 192 public Soundex(final String mapping, final boolean specialCaseHW) { 193 this.soundexMapping = mapping.toCharArray(); 194 this.specialCaseHW = specialCaseHW; 195 } 196 197 /** 198 * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This 199 * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or 200 * identical values. 201 * 202 * @param s1 203 * A String that will be encoded and compared. 204 * @param s2 205 * A String that will be encoded and compared. 206 * @return The number of characters in the two encoded Strings that are the same from 0 to 4. 207 * 208 * @see SoundexUtils#difference(StringEncoder,String,String) 209 * @see <a href="https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS 210 * T-SQL DIFFERENCE</a> 211 * 212 * @throws EncoderException 213 * if an error occurs encoding one of the strings 214 * @since 1.3 215 */ 216 public int difference(final String s1, final String s2) throws EncoderException { 217 return SoundexUtils.difference(this, s1, s2); 218 } 219 220 /** 221 * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of 222 * the Encoder interface, and will throw an EncoderException if the supplied object is not of type {@link String}. 223 * 224 * @param obj 225 * Object to encode 226 * @return An object (or type {@link String}) containing the soundex code which corresponds to the String 227 * supplied. 228 * @throws EncoderException 229 * if the parameter supplied is not of type {@link String} 230 * @throws IllegalArgumentException 231 * if a character is not mapped 232 */ 233 @Override 234 public Object encode(final Object obj) throws EncoderException { 235 if (!(obj instanceof String)) { 236 throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String"); 237 } 238 return soundex((String) obj); 239 } 240 241 /** 242 * Encodes a String using the soundex algorithm. 243 * 244 * @param str 245 * A String object to encode 246 * @return A Soundex code corresponding to the String supplied 247 * @throws IllegalArgumentException 248 * if a character is not mapped 249 */ 250 @Override 251 public String encode(final String str) { 252 return soundex(str); 253 } 254 255 /** 256 * Returns the maxLength. Standard Soundex 257 * 258 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. 259 * @return int 260 */ 261 @Deprecated 262 public int getMaxLength() { 263 return this.maxLength; 264 } 265 266 private boolean hasMarker(final char[] mapping) { 267 for (final char ch : mapping) { 268 if (ch == SILENT_MARKER) { 269 return true; 270 } 271 } 272 return false; 273 } 274 275 /** 276 * Maps the given upper-case character to its Soundex code. 277 * 278 * @param ch 279 * An upper-case character. 280 * @return A Soundex code. 281 * @throws IllegalArgumentException 282 * Thrown if {@code ch} is not mapped. 283 */ 284 private char map(final char ch) { 285 final int index = ch - 'A'; 286 if (index < 0 || index >= this.soundexMapping.length) { 287 throw new IllegalArgumentException("The character is not mapped: " + ch + " (index=" + index + ")"); 288 } 289 return this.soundexMapping[index]; 290 } 291 292 /** 293 * Sets the maxLength. 294 * 295 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. 296 * @param maxLength 297 * The maxLength to set 298 */ 299 @Deprecated 300 public void setMaxLength(final int maxLength) { 301 this.maxLength = maxLength; 302 } 303 304 /** 305 * Retrieves the Soundex code for a given String object. 306 * 307 * @param str 308 * String to encode using the Soundex algorithm 309 * @return A soundex code for the String supplied 310 * @throws IllegalArgumentException 311 * if a character is not mapped 312 */ 313 public String soundex(String str) { 314 if (str == null) { 315 return null; 316 } 317 str = SoundexUtils.clean(str); 318 if (str.isEmpty()) { 319 return str; 320 } 321 final char[] out = { '0', '0', '0', '0' }; 322 int count = 0; 323 final char first = str.charAt(0); 324 out[count++] = first; 325 char lastDigit = map(first); // previous digit 326 for (int i = 1; i < str.length() && count < out.length; i++) { 327 final char ch = str.charAt(i); 328 if (this.specialCaseHW && (ch == 'H' || ch == 'W')) { // these are ignored completely 329 continue; 330 } 331 final char digit = map(ch); 332 if (digit == SILENT_MARKER) { 333 continue; 334 } 335 if (digit != '0' && digit != lastDigit) { // don't store vowels or repeats 336 out[count++] = digit; 337 } 338 lastDigit = digit; 339 } 340 return new String(out); 341 } 342 343 }