View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
20  import org.apache.commons.codec.EncoderException;
21  import org.apache.commons.codec.StringEncoder;
22  
23  /**
24   * Encodes a string into a Metaphone value.
25   * <p>
26   * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
27   * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
28   * </p>
29   * <p>
30   * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
31   * p 39.</CITE>
32   * </p>
33   * <p>
34   * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
35   * </p>
36   * <ul>
37   * <li><a href="https://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>
38   *  (broken link 4/30/2013) </li>
39   * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a>
40   *  (link checked 4/30/2013) </li>
41   * </ul>
42   * <p>
43   * They have had undocumented changes from the originally published algorithm.
44   * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
45   * </p>
46   * <p>
47   * This class is conditionally thread-safe.
48   * The instance field for maximum code length is mutable {@link #setMaxCodeLen(int)}
49   * but is not volatile, and accesses are not synchronized.
50   * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
51   * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
52   * after initial setup.
53   * </p>
54   */
55  public class Metaphone implements StringEncoder {
56  
57      /**
58       * Five values in the English language
59       */
60      private static final String VOWELS = "AEIOU";
61  
62      /**
63       * Variable used in Metaphone algorithm
64       */
65      private static final String FRONTV = "EIY";
66  
67      /**
68       * Variable used in Metaphone algorithm
69       */
70      private static final String VARSON = "CSPTG";
71  
72      /**
73       * The max code length for metaphone is 4
74       */
75      private int maxCodeLen = 4;
76  
77      /**
78       * Constructs a new instance.
79       */
80      public Metaphone() {
81          // empty
82      }
83  
84      /**
85       * Encodes an Object using the metaphone algorithm.  This method
86       * is provided in order to satisfy the requirements of the
87       * Encoder interface, and will throw an EncoderException if the
88       * supplied object is not of type {@link String}.
89       *
90       * @param obj Object to encode
91       * @return An object (or type {@link String}) containing the
92       *         metaphone code which corresponds to the String supplied.
93       * @throws EncoderException if the parameter supplied is not
94       *                          of type {@link String}
95       */
96      @Override
97      public Object encode(final Object obj) throws EncoderException {
98          if (!(obj instanceof String)) {
99              throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
100         }
101         return metaphone((String) obj);
102     }
103 
104     /**
105      * Encodes a String using the Metaphone algorithm.
106      *
107      * @param str String object to encode
108      * @return The metaphone code corresponding to the String supplied
109      */
110     @Override
111     public String encode(final String str) {
112         return metaphone(str);
113     }
114 
115     /**
116      * Gets the maxCodeLen.
117      *
118      * @return the maxCodeLen.
119      */
120     public int getMaxCodeLen() {
121         return this.maxCodeLen;
122     }
123 
124     private boolean isLastChar(final int wdsz, final int n) {
125         return n + 1 == wdsz;
126     }
127 
128     /**
129      * Tests is the metaphones of two strings are identical.
130      *
131      * @param str1 First of two strings to compare
132      * @param str2 Second of two strings to compare
133      * @return {@code true} if the metaphones of these strings are identical,
134      *        {@code false} otherwise.
135      */
136     public boolean isMetaphoneEqual(final String str1, final String str2) {
137         return metaphone(str1).equals(metaphone(str2));
138     }
139 
140     private boolean isNextChar(final StringBuilder string, final int index, final char c) {
141         boolean matches = false;
142         if (index >= 0 && index < string.length() - 1) {
143             matches = string.charAt(index + 1) == c;
144         }
145         return matches;
146     }
147 
148     private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
149         boolean matches = false;
150         if (index > 0 && index < string.length()) {
151             matches = string.charAt(index - 1) == c;
152         }
153         return matches;
154     }
155 
156     private boolean isVowel(final StringBuilder string, final int index) {
157         return VOWELS.indexOf(string.charAt(index)) >= 0;
158     }
159 
160     /**
161      * Find the metaphone value of a String. This is similar to the
162      * soundex algorithm, but better at finding similar sounding words.
163      * All input is converted to upper case.
164      * Limitations: Input format is expected to be a single ASCII word
165      * with only characters in the A - Z range, no punctuation or numbers.
166      *
167      * @param txt String to find the metaphone code for
168      * @return A metaphone code corresponding to the String supplied
169      */
170     public String metaphone(final String txt) {
171         boolean hard = false;
172         final int txtLength;
173         if (txt == null || (txtLength = txt.length()) == 0) {
174             return "";
175         }
176         // single character is itself
177         if (txtLength == 1) {
178             return txt.toUpperCase(java.util.Locale.ENGLISH);
179         }
180 
181         final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
182 
183         final StringBuilder local = new StringBuilder(40); // manipulate
184         final StringBuilder code = new StringBuilder(10); // output
185         // handle initial 2 characters exceptions
186         switch (inwd[0]) {
187         case 'K':
188         case 'G':
189         case 'P': /* looking for KN, etc */
190             if (inwd[1] == 'N') {
191                 local.append(inwd, 1, inwd.length - 1);
192             } else {
193                 local.append(inwd);
194             }
195             break;
196         case 'A': /* looking for AE */
197             if (inwd[1] == 'E') {
198                 local.append(inwd, 1, inwd.length - 1);
199             } else {
200                 local.append(inwd);
201             }
202             break;
203         case 'W': /* looking for WR or WH */
204             if (inwd[1] == 'R') { // WR -> R
205                 local.append(inwd, 1, inwd.length - 1);
206                 break;
207             }
208             if (inwd[1] == 'H') {
209                 local.append(inwd, 1, inwd.length - 1);
210                 local.setCharAt(0, 'W'); // WH -> W
211             } else {
212                 local.append(inwd);
213             }
214             break;
215         case 'X': /* initial X becomes S */
216             inwd[0] = 'S';
217             local.append(inwd);
218             break;
219         default:
220             local.append(inwd);
221         } // now local has working string with initials fixed
222 
223         final int wdsz = local.length();
224         int n = 0;
225 
226         while (code.length() < getMaxCodeLen() && n < wdsz) { // max code size of 4 works well
227             final char symb = local.charAt(n);
228             // remove duplicate letters except C
229             if (symb != 'C' && isPreviousChar(local, n, symb)) {
230             } else { // not dup
231                 switch (symb) {
232                 case 'A':
233                 case 'E':
234                 case 'I':
235                 case 'O':
236                 case 'U':
237                     if (n == 0) {
238                         code.append(symb);
239                     }
240                     break; // only use vowel if leading char
241                 case 'B':
242                     if (isPreviousChar(local, n, 'M') && isLastChar(wdsz, n)) { // B is silent if word ends in MB
243                         break;
244                     }
245                     code.append(symb);
246                     break;
247                 case 'C': // lots of C special cases
248                     /* discard if SCI, SCE or SCY */
249                     if (isPreviousChar(local, n, 'S') && !isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
250                         break;
251                     }
252                     if (regionMatch(local, n, "CIA")) { // "CIA" -> X
253                         code.append('X');
254                         break;
255                     }
256                     if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
257                         code.append('S');
258                         break; // CI,CE,CY -> S
259                     }
260                     if (isPreviousChar(local, n, 'S') && isNextChar(local, n, 'H')) { // SCH->sk
261                         code.append('K');
262                         break;
263                     }
264                     if (!isNextChar(local, n, 'H') || (n == 0 && wdsz >= 3 && isVowel(local, 2))) { // CH consonant -> K consonant
265                         code.append('K');
266                     } else {
267                         code.append('X'); // CHvowel -> X
268                     }
269                     break;
270                 case 'D':
271                     if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'G') && FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
272                         code.append('J');
273                         n += 2;
274                     } else {
275                         code.append('T');
276                     }
277                     break;
278                 case 'G': // GH silent at end or before consonant
279                     if (isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H')) {
280                         break;
281                     }
282                     if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H') && !isVowel(local, n + 2)) {
283                         break;
284                     }
285                     if (n > 0 && (regionMatch(local, n, "GN") || regionMatch(local, n, "GNED"))) {
286                         break; // silent G
287                     }
288                     // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
289                     hard = isPreviousChar(local, n, 'G');
290                     if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0 && !hard) {
291                         code.append('J');
292                     } else {
293                         code.append('K');
294                     }
295                     break;
296                 case 'H':
297                     if (isLastChar(wdsz, n)) {
298                         break; // terminal H
299                     }
300                     if (n > 0 && VARSON.indexOf(local.charAt(n - 1)) >= 0) {
301                         break;
302                     }
303                     if (isVowel(local, n + 1)) {
304                         code.append('H'); // Hvowel
305                     }
306                     break;
307                 case 'F':
308                 case 'J':
309                 case 'L':
310                 case 'M':
311                 case 'N':
312                 case 'R':
313                     code.append(symb);
314                     break;
315                 case 'K':
316                     if (n > 0) { // not initial
317                         if (!isPreviousChar(local, n, 'C')) {
318                             code.append(symb);
319                         }
320                     } else {
321                         code.append(symb); // initial K
322                     }
323                     break;
324                 case 'P':
325                     if (isNextChar(local, n, 'H')) {
326                         // PH -> F
327                         code.append('F');
328                     } else {
329                         code.append(symb);
330                     }
331                     break;
332                 case 'Q':
333                     code.append('K');
334                     break;
335                 case 'S':
336                     if (regionMatch(local, n, "SH") || regionMatch(local, n, "SIO") || regionMatch(local, n, "SIA")) {
337                         code.append('X');
338                     } else {
339                         code.append('S');
340                     }
341                     break;
342                 case 'T':
343                     if (regionMatch(local, n, "TIA") || regionMatch(local, n, "TIO")) {
344                         code.append('X');
345                         break;
346                     }
347                     if (regionMatch(local, n, "TCH")) {
348                         // Silent if in "TCH"
349                         break;
350                     }
351                     // substitute numeral 0 for TH (resembles theta after all)
352                     if (regionMatch(local, n, "TH")) {
353                         code.append('0');
354                     } else {
355                         code.append('T');
356                     }
357                     break;
358                 case 'V':
359                     code.append('F');
360                     break;
361                 case 'W':
362                 case 'Y': // silent if not followed by vowel
363                     if (!isLastChar(wdsz, n) && isVowel(local, n + 1)) {
364                         code.append(symb);
365                     }
366                     break;
367                 case 'X':
368                     code.append('K');
369                     code.append('S');
370                     break;
371                 case 'Z':
372                     code.append('S');
373                     break;
374                 default:
375                     // do nothing
376                     break;
377                 } // end switch
378             } // end else from symb != 'C'
379             n++;
380             if (code.length() > getMaxCodeLen()) {
381                 code.setLength(getMaxCodeLen());
382             }
383         }
384         return code.toString();
385     }
386 
387     private boolean regionMatch(final StringBuilder string, final int index, final String test) {
388         boolean matches = false;
389         if (index >= 0 && index + test.length() - 1 < string.length()) {
390             final String substring = string.substring(index, index + test.length());
391             matches = substring.equals(test);
392         }
393         return matches;
394     }
395 
396     /**
397      * Sets the maxCodeLen.
398      *
399      * @param maxCodeLen The maxCodeLen to set.
400      */
401     public void setMaxCodeLen(final int maxCodeLen) {
402         this.maxCodeLen = maxCodeLen;
403     }
404 
405 }