View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
20  import org.apache.commons.codec.EncoderException;
21  import org.apache.commons.codec.StringEncoder;
22  
23  /**
24   * Encodes a string into a Metaphone value.
25   * <p>
26   * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
27   * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
28   * </p>
29   * <p>
30   * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
31   * p 39.</CITE>
32   * </p>
33   * <p>
34   * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
35   * </p>
36   * <ul>
37   * <li><a href="https://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>
38   *  (broken link 4/30/2013) </li>
39   * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a>
40   *  (link checked 4/30/2013) </li>
41   * </ul>
42   * <p>
43   * They have had undocumented changes from the originally published algorithm.
44   * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
45   * </p>
46   * <p>
47   * This class is conditionally thread-safe.
48   * The instance field for maximum code length is mutable {@link #setMaxCodeLen(int)}
49   * but is not volatile, and accesses are not synchronized.
50   * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
51   * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
52   * after initial setup.
53   * </p>
54   */
55  public class Metaphone implements StringEncoder {
56  
57      /**
58       * Five values in the English language
59       */
60      private static final String VOWELS = "AEIOU";
61  
62      /**
63       * Variable used in Metaphone algorithm
64       */
65      private static final String FRONTV = "EIY";
66  
67      /**
68       * Variable used in Metaphone algorithm
69       */
70      private static final String VARSON = "CSPTG";
71  
72      /**
73       * The max code length for metaphone is 4
74       */
75      private int maxCodeLen = 4;
76  
77      /**
78       * Encodes an Object using the metaphone algorithm.  This method
79       * is provided in order to satisfy the requirements of the
80       * Encoder interface, and will throw an EncoderException if the
81       * supplied object is not of type {@link String}.
82       *
83       * @param obj Object to encode
84       * @return An object (or type {@link String}) containing the
85       *         metaphone code which corresponds to the String supplied.
86       * @throws EncoderException if the parameter supplied is not
87       *                          of type {@link String}
88       */
89      @Override
90      public Object encode(final Object obj) throws EncoderException {
91          if (!(obj instanceof String)) {
92              throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
93          }
94          return metaphone((String) obj);
95      }
96  
97      /**
98       * Encodes a String using the Metaphone algorithm.
99       *
100      * @param str String object to encode
101      * @return The metaphone code corresponding to the String supplied
102      */
103     @Override
104     public String encode(final String str) {
105         return metaphone(str);
106     }
107 
108     /**
109      * Returns the maxCodeLen.
110      * @return int
111      */
112     public int getMaxCodeLen() { return this.maxCodeLen; }
113 
114     private boolean isLastChar(final int wdsz, final int n) {
115         return n + 1 == wdsz;
116     }
117 
118     /**
119      * Tests is the metaphones of two strings are identical.
120      *
121      * @param str1 First of two strings to compare
122      * @param str2 Second of two strings to compare
123      * @return {@code true} if the metaphones of these strings are identical,
124      *        {@code false} otherwise.
125      */
126     public boolean isMetaphoneEqual(final String str1, final String str2) {
127         return metaphone(str1).equals(metaphone(str2));
128     }
129 
130     private boolean isNextChar(final StringBuilder string, final int index, final char c) {
131         boolean matches = false;
132         if (index >= 0 && index < string.length() - 1) {
133             matches = string.charAt(index + 1) == c;
134         }
135         return matches;
136     }
137 
138     private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
139         boolean matches = false;
140         if (index > 0 && index < string.length()) {
141             matches = string.charAt(index - 1) == c;
142         }
143         return matches;
144     }
145 
146     private boolean isVowel(final StringBuilder string, final int index) {
147         return VOWELS.indexOf(string.charAt(index)) >= 0;
148     }
149 
150     /**
151      * Find the metaphone value of a String. This is similar to the
152      * soundex algorithm, but better at finding similar sounding words.
153      * All input is converted to upper case.
154      * Limitations: Input format is expected to be a single ASCII word
155      * with only characters in the A - Z range, no punctuation or numbers.
156      *
157      * @param txt String to find the metaphone code for
158      * @return A metaphone code corresponding to the String supplied
159      */
160     public String metaphone(final String txt) {
161         boolean hard = false;
162         final int txtLength;
163         if (txt == null || (txtLength = txt.length()) == 0) {
164             return "";
165         }
166         // single character is itself
167         if (txtLength == 1) {
168             return txt.toUpperCase(java.util.Locale.ENGLISH);
169         }
170 
171         final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
172 
173         final StringBuilder local = new StringBuilder(40); // manipulate
174         final StringBuilder code = new StringBuilder(10); // output
175         // handle initial 2 characters exceptions
176         switch (inwd[0]) {
177         case 'K':
178         case 'G':
179         case 'P': /* looking for KN, etc */
180             if (inwd[1] == 'N') {
181                 local.append(inwd, 1, inwd.length - 1);
182             } else {
183                 local.append(inwd);
184             }
185             break;
186         case 'A': /* looking for AE */
187             if (inwd[1] == 'E') {
188                 local.append(inwd, 1, inwd.length - 1);
189             } else {
190                 local.append(inwd);
191             }
192             break;
193         case 'W': /* looking for WR or WH */
194             if (inwd[1] == 'R') { // WR -> R
195                 local.append(inwd, 1, inwd.length - 1);
196                 break;
197             }
198             if (inwd[1] == 'H') {
199                 local.append(inwd, 1, inwd.length - 1);
200                 local.setCharAt(0, 'W'); // WH -> W
201             } else {
202                 local.append(inwd);
203             }
204             break;
205         case 'X': /* initial X becomes S */
206             inwd[0] = 'S';
207             local.append(inwd);
208             break;
209         default:
210             local.append(inwd);
211         } // now local has working string with initials fixed
212 
213         final int wdsz = local.length();
214         int n = 0;
215 
216         while (code.length() < getMaxCodeLen() && n < wdsz) { // max code size of 4 works well
217             final char symb = local.charAt(n);
218             // remove duplicate letters except C
219             if (symb != 'C' && isPreviousChar(local, n, symb)) {
220             } else { // not dup
221                 switch (symb) {
222                 case 'A':
223                 case 'E':
224                 case 'I':
225                 case 'O':
226                 case 'U':
227                     if (n == 0) {
228                         code.append(symb);
229                     }
230                     break; // only use vowel if leading char
231                 case 'B':
232                     if (isPreviousChar(local, n, 'M') && isLastChar(wdsz, n)) { // B is silent if word ends in MB
233                         break;
234                     }
235                     code.append(symb);
236                     break;
237                 case 'C': // lots of C special cases
238                     /* discard if SCI, SCE or SCY */
239                     if (isPreviousChar(local, n, 'S') && !isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
240                         break;
241                     }
242                     if (regionMatch(local, n, "CIA")) { // "CIA" -> X
243                         code.append('X');
244                         break;
245                     }
246                     if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
247                         code.append('S');
248                         break; // CI,CE,CY -> S
249                     }
250                     if (isPreviousChar(local, n, 'S') && isNextChar(local, n, 'H')) { // SCH->sk
251                         code.append('K');
252                         break;
253                     }
254                     if (!isNextChar(local, n, 'H') || (n == 0 && wdsz >= 3 && isVowel(local, 2))) { // CH consonant -> K consonant
255                         code.append('K');
256                     } else {
257                         code.append('X'); // CHvowel -> X
258                     }
259                     break;
260                 case 'D':
261                     if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'G') && FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
262                         code.append('J');
263                         n += 2;
264                     } else {
265                         code.append('T');
266                     }
267                     break;
268                 case 'G': // GH silent at end or before consonant
269                     if (isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H')) {
270                         break;
271                     }
272                     if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H') && !isVowel(local, n + 2)) {
273                         break;
274                     }
275                     if (n > 0 && (regionMatch(local, n, "GN") || regionMatch(local, n, "GNED"))) {
276                         break; // silent G
277                     }
278                     // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
279                     hard = isPreviousChar(local, n, 'G');
280                     if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0 && !hard) {
281                         code.append('J');
282                     } else {
283                         code.append('K');
284                     }
285                     break;
286                 case 'H':
287                     if (isLastChar(wdsz, n)) {
288                         break; // terminal H
289                     }
290                     if (n > 0 && VARSON.indexOf(local.charAt(n - 1)) >= 0) {
291                         break;
292                     }
293                     if (isVowel(local, n + 1)) {
294                         code.append('H'); // Hvowel
295                     }
296                     break;
297                 case 'F':
298                 case 'J':
299                 case 'L':
300                 case 'M':
301                 case 'N':
302                 case 'R':
303                     code.append(symb);
304                     break;
305                 case 'K':
306                     if (n > 0) { // not initial
307                         if (!isPreviousChar(local, n, 'C')) {
308                             code.append(symb);
309                         }
310                     } else {
311                         code.append(symb); // initial K
312                     }
313                     break;
314                 case 'P':
315                     if (isNextChar(local, n, 'H')) {
316                         // PH -> F
317                         code.append('F');
318                     } else {
319                         code.append(symb);
320                     }
321                     break;
322                 case 'Q':
323                     code.append('K');
324                     break;
325                 case 'S':
326                     if (regionMatch(local, n, "SH") || regionMatch(local, n, "SIO") || regionMatch(local, n, "SIA")) {
327                         code.append('X');
328                     } else {
329                         code.append('S');
330                     }
331                     break;
332                 case 'T':
333                     if (regionMatch(local, n, "TIA") || regionMatch(local, n, "TIO")) {
334                         code.append('X');
335                         break;
336                     }
337                     if (regionMatch(local, n, "TCH")) {
338                         // Silent if in "TCH"
339                         break;
340                     }
341                     // substitute numeral 0 for TH (resembles theta after all)
342                     if (regionMatch(local, n, "TH")) {
343                         code.append('0');
344                     } else {
345                         code.append('T');
346                     }
347                     break;
348                 case 'V':
349                     code.append('F');
350                     break;
351                 case 'W':
352                 case 'Y': // silent if not followed by vowel
353                     if (!isLastChar(wdsz, n) && isVowel(local, n + 1)) {
354                         code.append(symb);
355                     }
356                     break;
357                 case 'X':
358                     code.append('K');
359                     code.append('S');
360                     break;
361                 case 'Z':
362                     code.append('S');
363                     break;
364                 default:
365                     // do nothing
366                     break;
367                 } // end switch
368             } // end else from symb != 'C'
369             n++;
370             if (code.length() > getMaxCodeLen()) {
371                 code.setLength(getMaxCodeLen());
372             }
373         }
374         return code.toString();
375     }
376 
377     private boolean regionMatch(final StringBuilder string, final int index, final String test) {
378         boolean matches = false;
379         if (index >= 0 && index + test.length() - 1 < string.length()) {
380             final String substring = string.substring(index, index + test.length());
381             matches = substring.equals(test);
382         }
383         return matches;
384     }
385 
386     /**
387      * Sets the maxCodeLen.
388      * @param maxCodeLen The maxCodeLen to set
389      */
390     public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
391 
392 }