1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * https://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 package org.apache.commons.codec.language; 19 20 import java.util.regex.Pattern; 21 22 import org.apache.commons.codec.EncoderException; 23 import org.apache.commons.codec.StringEncoder; 24 25 /** 26 * Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate similar names, but can also be used as a 27 * general purpose scheme to find word with similar phonemes. 28 * <p> 29 * NYSIIS features an accuracy increase of 2.7% over the traditional Soundex algorithm. 30 * </p> 31 * <p> 32 * Algorithm description: 33 * </p> 34 * <pre> 35 * 1. Transcode first characters of name 36 * 1a. MAC -> MCC 37 * 1b. KN -> NN 38 * 1c. K -> C 39 * 1d. PH -> FF 40 * 1e. PF -> FF 41 * 1f. SCH -> SSS 42 * 2. Transcode last characters of name 43 * 2a. EE, IE -> Y 44 * 2b. DT,RT,RD,NT,ND -> D 45 * 3. First character of key = first character of name 46 * 4. Transcode remaining characters by following these rules, incrementing by one character each time 47 * 4a. EV -> AF else A,E,I,O,U -> A 48 * 4b. Q -> G 49 * 4c. Z -> S 50 * 4d. M -> N 51 * 4e. KN -> N else K -> C 52 * 4f. SCH -> SSS 53 * 4g. PH -> FF 54 * 4h. H -> If previous or next is non-vowel, previous 55 * 4i. W -> If previous is vowel, previous 56 * 4j. Add current to key if current != last key character 57 * 5. If last character is S, remove it 58 * 6. If last characters are AY, replace with Y 59 * 7. If last character is A, remove it 60 * 8. Collapse all strings of repeated characters 61 * 9. Add original first character of name as first character of key 62 * </pre> 63 * <p> 64 * This class is immutable and thread-safe. 65 * </p> 66 * 67 * @see <a href="https://en.wikipedia.org/wiki/NYSIIS">NYSIIS on Wikipedia</a> 68 * @see <a href="http://www.dropby.com/NYSIIS.html">NYSIIS on dropby.com</a> 69 * @see Soundex 70 * @since 1.7 71 */ 72 public class Nysiis implements StringEncoder { 73 74 private static final char[] CHARS_A = { 'A' }; 75 private static final char[] CHARS_AF = { 'A', 'F' }; 76 private static final char[] CHARS_C = { 'C' }; 77 private static final char[] CHARS_FF = { 'F', 'F' }; 78 private static final char[] CHARS_G = { 'G' }; 79 private static final char[] CHARS_N = { 'N' }; 80 private static final char[] CHARS_NN = { 'N', 'N' }; 81 private static final char[] CHARS_S = { 'S' }; 82 private static final char[] CHARS_SSS = { 'S', 'S', 'S' }; 83 84 private static final Pattern PAT_MAC = Pattern.compile("^MAC"); 85 private static final Pattern PAT_KN = Pattern.compile("^KN"); 86 private static final Pattern PAT_K = Pattern.compile("^K"); 87 private static final Pattern PAT_PH_PF = Pattern.compile("^(PH|PF)"); 88 private static final Pattern PAT_SCH = Pattern.compile("^SCH"); 89 private static final Pattern PAT_EE_IE = Pattern.compile("(EE|IE)$"); 90 private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$"); 91 92 private static final char SPACE = ' '; 93 private static final int TRUE_LENGTH = 6; 94 95 /** 96 * Tests if the given character is a vowel. 97 * 98 * @param c 99 * the character to test 100 * @return {@code true} if the character is a vowel, {@code false} otherwise 101 */ 102 private static boolean isVowel(final char c) { 103 return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U'; 104 } 105 106 /** 107 * Transcodes the remaining parts of the String. The method operates on a sliding window, looking at 4 characters at 108 * a time: [i-1, i, i+1, i+2]. 109 * 110 * @param prev 111 * the previous character 112 * @param curr 113 * the current character 114 * @param next 115 * the next character 116 * @param aNext 117 * the after next character 118 * @return a transcoded array of characters, starting from the current position 119 */ 120 private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) { 121 // 1. EV -> AF 122 if (curr == 'E' && next == 'V') { 123 return CHARS_AF; 124 } 125 126 // A, E, I, O, U -> A 127 if (isVowel(curr)) { 128 return CHARS_A; 129 } 130 131 // 2. Q -> G, Z -> S, M -> N 132 133 // 3. KN -> NN else K -> C 134 switch (curr) { 135 case 'Q': 136 return CHARS_G; 137 case 'Z': 138 return CHARS_S; 139 case 'M': 140 return CHARS_N; 141 case 'K': 142 if (next == 'N') { 143 return CHARS_NN; 144 } 145 return CHARS_C; 146 default: 147 break; 148 } 149 150 // 4. SCH -> SSS 151 if (curr == 'S' && next == 'C' && aNext == 'H') { 152 return CHARS_SSS; 153 } 154 155 // PH -> FF 156 if (curr == 'P' && next == 'H') { 157 return CHARS_FF; 158 } 159 160 // 5. H -> If previous or next is a non vowel, previous. 161 if (curr == 'H' && (!isVowel(prev) || !isVowel(next))) { 162 return new char[] { prev }; 163 } 164 165 // 6. W -> If previous is vowel, previous. 166 if (curr == 'W' && isVowel(prev)) { 167 return new char[] { prev }; 168 } 169 170 return new char[] { curr }; 171 } 172 173 /** Indicates the strict mode. */ 174 private final boolean strict; 175 176 /** 177 * Creates an instance of the {@link Nysiis} encoder with strict mode (original form), 178 * i.e. encoded strings have a maximum length of 6. 179 */ 180 public Nysiis() { 181 this(true); 182 } 183 184 /** 185 * Create an instance of the {@link Nysiis} encoder with the specified strict mode: 186 * 187 * <ul> 188 * <li>{@code true}: encoded strings have a maximum length of 6</li> 189 * <li>{@code false}: encoded strings may have arbitrary length</li> 190 * </ul> 191 * 192 * @param strict 193 * the strict mode 194 */ 195 public Nysiis(final boolean strict) { 196 this.strict = strict; 197 } 198 199 /** 200 * Encodes an Object using the NYSIIS algorithm. This method is provided in order to satisfy the requirements of the 201 * Encoder interface, and will throw an {@link EncoderException} if the supplied object is not of type 202 * {@link String}. 203 * 204 * @param obj 205 * Object to encode 206 * @return An object (or a {@link String}) containing the NYSIIS code which corresponds to the given String. 207 * @throws EncoderException 208 * if the parameter supplied is not of a {@link String} 209 * @throws IllegalArgumentException 210 * if a character is not mapped 211 */ 212 @Override 213 public Object encode(final Object obj) throws EncoderException { 214 if (!(obj instanceof String)) { 215 throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String"); 216 } 217 return nysiis((String) obj); 218 } 219 220 /** 221 * Encodes a String using the NYSIIS algorithm. 222 * 223 * @param str 224 * A String object to encode 225 * @return A Nysiis code corresponding to the String supplied 226 * @throws IllegalArgumentException 227 * if a character is not mapped 228 */ 229 @Override 230 public String encode(final String str) { 231 return nysiis(str); 232 } 233 234 /** 235 * Indicates the strict mode for this {@link Nysiis} encoder. 236 * 237 * @return {@code true} if the encoder is configured for strict mode, {@code false} otherwise 238 */ 239 public boolean isStrict() { 240 return this.strict; 241 } 242 243 /** 244 * Retrieves the NYSIIS code for a given String object. 245 * 246 * @param str 247 * String to encode using the NYSIIS algorithm 248 * @return A NYSIIS code for the String supplied 249 */ 250 public String nysiis(String str) { 251 if (str == null) { 252 return null; 253 } 254 255 // Use the same clean rules as Soundex 256 str = SoundexUtils.clean(str); 257 258 if (str.isEmpty()) { 259 return str; 260 } 261 262 // Translate first characters of name: 263 // MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS 264 str = PAT_MAC.matcher(str).replaceFirst("MCC"); 265 str = PAT_KN.matcher(str).replaceFirst("NN"); 266 str = PAT_K.matcher(str).replaceFirst("C"); 267 str = PAT_PH_PF.matcher(str).replaceFirst("FF"); 268 str = PAT_SCH.matcher(str).replaceFirst("SSS"); 269 270 // Translate last characters of name: 271 // EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D 272 str = PAT_EE_IE.matcher(str).replaceFirst("Y"); 273 str = PAT_DT_ETC.matcher(str).replaceFirst("D"); 274 275 // First character of key = first character of name. 276 final StringBuilder key = new StringBuilder(str.length()); 277 key.append(str.charAt(0)); 278 279 // Transcode remaining characters, incrementing by one character each time 280 final char[] chars = str.toCharArray(); 281 final int len = chars.length; 282 283 for (int i = 1; i < len; i++) { 284 final char next = i < len - 1 ? chars[i + 1] : SPACE; 285 final char aNext = i < len - 2 ? chars[i + 2] : SPACE; 286 final char[] transcoded = transcodeRemaining(chars[i - 1], chars[i], next, aNext); 287 System.arraycopy(transcoded, 0, chars, i, transcoded.length); 288 289 // only append the current char to the key if it is different from the last one 290 if (chars[i] != chars[i - 1]) { 291 key.append(chars[i]); 292 } 293 } 294 295 if (key.length() > 1) { 296 char lastChar = key.charAt(key.length() - 1); 297 298 // If last character is S, remove it. 299 if (lastChar == 'S') { 300 key.deleteCharAt(key.length() - 1); 301 lastChar = key.charAt(key.length() - 1); 302 } 303 304 if (key.length() > 2) { 305 final char last2Char = key.charAt(key.length() - 2); 306 // If last characters are AY, replace with Y. 307 if (last2Char == 'A' && lastChar == 'Y') { 308 key.deleteCharAt(key.length() - 2); 309 } 310 } 311 312 // If last character is A, remove it. 313 if (lastChar == 'A') { 314 key.deleteCharAt(key.length() - 1); 315 } 316 } 317 318 final String string = key.toString(); 319 return isStrict() ? string.substring(0, Math.min(TRUE_LENGTH, string.length())) : string; 320 } 321 322 }