View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
20  import java.util.regex.Pattern;
21  
22  import org.apache.commons.codec.EncoderException;
23  import org.apache.commons.codec.StringEncoder;
24  
25  /**
26   * Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate similar names, but can also be used as a
27   * general purpose scheme to find word with similar phonemes.
28   * <p>
29   * NYSIIS features an accuracy increase of 2.7% over the traditional Soundex algorithm.
30   * </p>
31   * <p>
32   * Algorithm description:
33   * </p>
34   * <pre>
35   * 1. Transcode first characters of name
36   *   1a. MAC -&gt;   MCC
37   *   1b. KN  -&gt;   NN
38   *   1c. K   -&gt;   C
39   *   1d. PH  -&gt;   FF
40   *   1e. PF  -&gt;   FF
41   *   1f. SCH -&gt;   SSS
42   * 2. Transcode last characters of name
43   *   2a. EE, IE          -&gt;   Y
44   *   2b. DT,RT,RD,NT,ND  -&gt;   D
45   * 3. First character of key = first character of name
46   * 4. Transcode remaining characters by following these rules, incrementing by one character each time
47   *   4a. EV  -&gt;   AF  else A,E,I,O,U -&gt; A
48   *   4b. Q   -&gt;   G
49   *   4c. Z   -&gt;   S
50   *   4d. M   -&gt;   N
51   *   4e. KN  -&gt;   N   else K -&gt; C
52   *   4f. SCH -&gt;   SSS
53   *   4g. PH  -&gt;   FF
54   *   4h. H   -&gt;   If previous or next is non-vowel, previous
55   *   4i. W   -&gt;   If previous is vowel, previous
56   *   4j. Add current to key if current != last key character
57   * 5. If last character is S, remove it
58   * 6. If last characters are AY, replace with Y
59   * 7. If last character is A, remove it
60   * 8. Collapse all strings of repeated characters
61   * 9. Add original first character of name as first character of key
62   * </pre>
63   * <p>
64   * This class is immutable and thread-safe.
65   * </p>
66   *
67   * @see <a href="https://en.wikipedia.org/wiki/NYSIIS">NYSIIS on Wikipedia</a>
68   * @see <a href="http://www.dropby.com/NYSIIS.html">NYSIIS on dropby.com</a>
69   * @see Soundex
70   * @since 1.7
71   */
72  public class Nysiis implements StringEncoder {
73  
74      private static final char[] CHARS_A   = { 'A' };
75      private static final char[] CHARS_AF  = { 'A', 'F' };
76      private static final char[] CHARS_C   = { 'C' };
77      private static final char[] CHARS_FF  = { 'F', 'F' };
78      private static final char[] CHARS_G   = { 'G' };
79      private static final char[] CHARS_N   = { 'N' };
80      private static final char[] CHARS_NN  = { 'N', 'N' };
81      private static final char[] CHARS_S   = { 'S' };
82      private static final char[] CHARS_SSS = { 'S', 'S', 'S' };
83  
84      private static final Pattern PAT_MAC    = Pattern.compile("^MAC");
85      private static final Pattern PAT_KN     = Pattern.compile("^KN");
86      private static final Pattern PAT_K      = Pattern.compile("^K");
87      private static final Pattern PAT_PH_PF  = Pattern.compile("^(PH|PF)");
88      private static final Pattern PAT_SCH    = Pattern.compile("^SCH");
89      private static final Pattern PAT_EE_IE  = Pattern.compile("(EE|IE)$");
90      private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$");
91  
92      private static final char SPACE = ' ';
93      private static final int TRUE_LENGTH = 6;
94  
95      /**
96       * Tests if the given character is a vowel.
97       *
98       * @param c
99       *            the character to test
100      * @return {@code true} if the character is a vowel, {@code false} otherwise
101      */
102     private static boolean isVowel(final char c) {
103         return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U';
104     }
105 
106     /**
107      * Transcodes the remaining parts of the String. The method operates on a sliding window, looking at 4 characters at
108      * a time: [i-1, i, i+1, i+2].
109      *
110      * @param prev
111      *            the previous character
112      * @param curr
113      *            the current character
114      * @param next
115      *            the next character
116      * @param aNext
117      *            the after next character
118      * @return a transcoded array of characters, starting from the current position
119      */
120     private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) {
121         // 1. EV -> AF
122         if (curr == 'E' && next == 'V') {
123             return CHARS_AF;
124         }
125 
126         // A, E, I, O, U -> A
127         if (isVowel(curr)) {
128             return CHARS_A;
129         }
130 
131         // 2. Q -> G, Z -> S, M -> N
132 
133         // 3. KN -> NN else K -> C
134         switch (curr) {
135         case 'Q':
136             return CHARS_G;
137         case 'Z':
138             return CHARS_S;
139         case 'M':
140             return CHARS_N;
141         case 'K':
142             if (next == 'N') {
143                 return CHARS_NN;
144             }
145             return CHARS_C;
146         default:
147             break;
148         }
149 
150         // 4. SCH -> SSS
151         if (curr == 'S' && next == 'C' && aNext == 'H') {
152             return CHARS_SSS;
153         }
154 
155         // PH -> FF
156         if (curr == 'P' && next == 'H') {
157             return CHARS_FF;
158         }
159 
160         // 5. H -> If previous or next is a non vowel, previous.
161         if (curr == 'H' && (!isVowel(prev) || !isVowel(next))) {
162             return new char[] { prev };
163         }
164 
165         // 6. W -> If previous is vowel, previous.
166         if (curr == 'W' && isVowel(prev)) {
167             return new char[] { prev };
168         }
169 
170         return new char[] { curr };
171     }
172 
173     /** Indicates the strict mode. */
174     private final boolean strict;
175 
176     /**
177      * Creates an instance of the {@link Nysiis} encoder with strict mode (original form),
178      * i.e. encoded strings have a maximum length of 6.
179      */
180     public Nysiis() {
181         this(true);
182     }
183 
184     /**
185      * Create an instance of the {@link Nysiis} encoder with the specified strict mode:
186      *
187      * <ul>
188      *  <li>{@code true}: encoded strings have a maximum length of 6</li>
189      *  <li>{@code false}: encoded strings may have arbitrary length</li>
190      * </ul>
191      *
192      * @param strict
193      *            the strict mode
194      */
195     public Nysiis(final boolean strict) {
196         this.strict = strict;
197     }
198 
199     /**
200      * Encodes an Object using the NYSIIS algorithm. This method is provided in order to satisfy the requirements of the
201      * Encoder interface, and will throw an {@link EncoderException} if the supplied object is not of type
202      * {@link String}.
203      *
204      * @param obj
205      *            Object to encode
206      * @return An object (or a {@link String}) containing the NYSIIS code which corresponds to the given String.
207      * @throws EncoderException
208      *            if the parameter supplied is not of a {@link String}
209      * @throws IllegalArgumentException
210      *            if a character is not mapped
211      */
212     @Override
213     public Object encode(final Object obj) throws EncoderException {
214         if (!(obj instanceof String)) {
215             throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String");
216         }
217         return nysiis((String) obj);
218     }
219 
220     /**
221      * Encodes a String using the NYSIIS algorithm.
222      *
223      * @param str
224      *            A String object to encode
225      * @return A Nysiis code corresponding to the String supplied
226      * @throws IllegalArgumentException
227      *            if a character is not mapped
228      */
229     @Override
230     public String encode(final String str) {
231         return nysiis(str);
232     }
233 
234     /**
235      * Indicates the strict mode for this {@link Nysiis} encoder.
236      *
237      * @return {@code true} if the encoder is configured for strict mode, {@code false} otherwise
238      */
239     public boolean isStrict() {
240         return this.strict;
241     }
242 
243     /**
244      * Retrieves the NYSIIS code for a given String object.
245      *
246      * @param str
247      *            String to encode using the NYSIIS algorithm
248      * @return A NYSIIS code for the String supplied
249      */
250     public String nysiis(String str) {
251         if (str == null) {
252             return null;
253         }
254 
255         // Use the same clean rules as Soundex
256         str = SoundexUtils.clean(str);
257 
258         if (str.isEmpty()) {
259             return str;
260         }
261 
262         // Translate first characters of name:
263         // MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS
264         str = PAT_MAC.matcher(str).replaceFirst("MCC");
265         str = PAT_KN.matcher(str).replaceFirst("NN");
266         str = PAT_K.matcher(str).replaceFirst("C");
267         str = PAT_PH_PF.matcher(str).replaceFirst("FF");
268         str = PAT_SCH.matcher(str).replaceFirst("SSS");
269 
270         // Translate last characters of name:
271         // EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D
272         str = PAT_EE_IE.matcher(str).replaceFirst("Y");
273         str = PAT_DT_ETC.matcher(str).replaceFirst("D");
274 
275         // First character of key = first character of name.
276         final StringBuilder key = new StringBuilder(str.length());
277         key.append(str.charAt(0));
278 
279         // Transcode remaining characters, incrementing by one character each time
280         final char[] chars = str.toCharArray();
281         final int len = chars.length;
282 
283         for (int i = 1; i < len; i++) {
284             final char next = i < len - 1 ? chars[i + 1] : SPACE;
285             final char aNext = i < len - 2 ? chars[i + 2] : SPACE;
286             final char[] transcoded = transcodeRemaining(chars[i - 1], chars[i], next, aNext);
287             System.arraycopy(transcoded, 0, chars, i, transcoded.length);
288 
289             // only append the current char to the key if it is different from the last one
290             if (chars[i] != chars[i - 1]) {
291                 key.append(chars[i]);
292             }
293         }
294 
295         if (key.length() > 1) {
296             char lastChar = key.charAt(key.length() - 1);
297 
298             // If last character is S, remove it.
299             if (lastChar == 'S') {
300                 key.deleteCharAt(key.length() - 1);
301                 lastChar = key.charAt(key.length() - 1);
302             }
303 
304             if (key.length() > 2) {
305                 final char last2Char = key.charAt(key.length() - 2);
306                 // If last characters are AY, replace with Y.
307                 if (last2Char == 'A' && lastChar == 'Y') {
308                     key.deleteCharAt(key.length() - 2);
309                 }
310             }
311 
312             // If last character is A, remove it.
313             if (lastChar == 'A') {
314                 key.deleteCharAt(key.length() - 1);
315             }
316         }
317 
318         final String string = key.toString();
319         return isStrict() ? string.substring(0, Math.min(TRUE_LENGTH, string.length())) : string;
320     }
321 
322 }