001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language;
019
020import org.apache.commons.codec.EncoderException;
021import org.apache.commons.codec.StringEncoder;
022
023/**
024 * Encodes a string into a Metaphone value.
025 * <p>
026 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
027 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
028 * </p>
029 * <p>
030 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
031 * p 39.</CITE>
032 * </p>
033 * <p>
034 * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
035 * </p>
036 * <ul>
037 * <li><a href="https://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>
038 *  (broken link 4/30/2013) </li>
039 * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a>
040 *  (link checked 4/30/2013) </li>
041 * </ul>
042 * <p>
043 * They have had undocumented changes from the originally published algorithm.
044 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
045 * </p>
046 * <p>
047 * This class is conditionally thread-safe.
048 * The instance field for maximum code length is mutable {@link #setMaxCodeLen(int)}
049 * but is not volatile, and accesses are not synchronized.
050 * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
051 * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
052 * after initial setup.
053 * </p>
054 */
055public class Metaphone implements StringEncoder {
056
057    /**
058     * Five values in the English language
059     */
060    private static final String VOWELS = "AEIOU";
061
062    /**
063     * Variable used in Metaphone algorithm
064     */
065    private static final String FRONTV = "EIY";
066
067    /**
068     * Variable used in Metaphone algorithm
069     */
070    private static final String VARSON = "CSPTG";
071
072    /**
073     * The max code length for metaphone is 4
074     */
075    private int maxCodeLen = 4;
076
077    /**
078     * Constructs a new instance.
079     */
080    public Metaphone() {
081        // empty
082    }
083
084    /**
085     * Encodes an Object using the metaphone algorithm.  This method
086     * is provided in order to satisfy the requirements of the
087     * Encoder interface, and will throw an EncoderException if the
088     * supplied object is not of type {@link String}.
089     *
090     * @param obj Object to encode
091     * @return An object (or type {@link String}) containing the
092     *         metaphone code which corresponds to the String supplied.
093     * @throws EncoderException if the parameter supplied is not
094     *                          of type {@link String}
095     */
096    @Override
097    public Object encode(final Object obj) throws EncoderException {
098        if (!(obj instanceof String)) {
099            throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
100        }
101        return metaphone((String) obj);
102    }
103
104    /**
105     * Encodes a String using the Metaphone algorithm.
106     *
107     * @param str String object to encode
108     * @return The metaphone code corresponding to the String supplied
109     */
110    @Override
111    public String encode(final String str) {
112        return metaphone(str);
113    }
114
115    /**
116     * Gets the maxCodeLen.
117     *
118     * @return the maxCodeLen.
119     */
120    public int getMaxCodeLen() {
121        return this.maxCodeLen;
122    }
123
124    private boolean isLastChar(final int wdsz, final int n) {
125        return n + 1 == wdsz;
126    }
127
128    /**
129     * Tests is the metaphones of two strings are identical.
130     *
131     * @param str1 First of two strings to compare
132     * @param str2 Second of two strings to compare
133     * @return {@code true} if the metaphones of these strings are identical,
134     *        {@code false} otherwise.
135     */
136    public boolean isMetaphoneEqual(final String str1, final String str2) {
137        return metaphone(str1).equals(metaphone(str2));
138    }
139
140    private boolean isNextChar(final StringBuilder string, final int index, final char c) {
141        boolean matches = false;
142        if (index >= 0 && index < string.length() - 1) {
143            matches = string.charAt(index + 1) == c;
144        }
145        return matches;
146    }
147
148    private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
149        boolean matches = false;
150        if (index > 0 && index < string.length()) {
151            matches = string.charAt(index - 1) == c;
152        }
153        return matches;
154    }
155
156    private boolean isVowel(final StringBuilder string, final int index) {
157        return VOWELS.indexOf(string.charAt(index)) >= 0;
158    }
159
160    /**
161     * Find the metaphone value of a String. This is similar to the
162     * soundex algorithm, but better at finding similar sounding words.
163     * All input is converted to upper case.
164     * Limitations: Input format is expected to be a single ASCII word
165     * with only characters in the A - Z range, no punctuation or numbers.
166     *
167     * @param txt String to find the metaphone code for
168     * @return A metaphone code corresponding to the String supplied
169     */
170    public String metaphone(final String txt) {
171        boolean hard = false;
172        final int txtLength;
173        if (txt == null || (txtLength = txt.length()) == 0) {
174            return "";
175        }
176        // single character is itself
177        if (txtLength == 1) {
178            return txt.toUpperCase(java.util.Locale.ENGLISH);
179        }
180
181        final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
182
183        final StringBuilder local = new StringBuilder(40); // manipulate
184        final StringBuilder code = new StringBuilder(10); // output
185        // handle initial 2 characters exceptions
186        switch (inwd[0]) {
187        case 'K':
188        case 'G':
189        case 'P': /* looking for KN, etc */
190            if (inwd[1] == 'N') {
191                local.append(inwd, 1, inwd.length - 1);
192            } else {
193                local.append(inwd);
194            }
195            break;
196        case 'A': /* looking for AE */
197            if (inwd[1] == 'E') {
198                local.append(inwd, 1, inwd.length - 1);
199            } else {
200                local.append(inwd);
201            }
202            break;
203        case 'W': /* looking for WR or WH */
204            if (inwd[1] == 'R') { // WR -> R
205                local.append(inwd, 1, inwd.length - 1);
206                break;
207            }
208            if (inwd[1] == 'H') {
209                local.append(inwd, 1, inwd.length - 1);
210                local.setCharAt(0, 'W'); // WH -> W
211            } else {
212                local.append(inwd);
213            }
214            break;
215        case 'X': /* initial X becomes S */
216            inwd[0] = 'S';
217            local.append(inwd);
218            break;
219        default:
220            local.append(inwd);
221        } // now local has working string with initials fixed
222
223        final int wdsz = local.length();
224        int n = 0;
225
226        while (code.length() < getMaxCodeLen() && n < wdsz) { // max code size of 4 works well
227            final char symb = local.charAt(n);
228            // remove duplicate letters except C
229            if (symb != 'C' && isPreviousChar(local, n, symb)) {
230            } else { // not dup
231                switch (symb) {
232                case 'A':
233                case 'E':
234                case 'I':
235                case 'O':
236                case 'U':
237                    if (n == 0) {
238                        code.append(symb);
239                    }
240                    break; // only use vowel if leading char
241                case 'B':
242                    if (isPreviousChar(local, n, 'M') && isLastChar(wdsz, n)) { // B is silent if word ends in MB
243                        break;
244                    }
245                    code.append(symb);
246                    break;
247                case 'C': // lots of C special cases
248                    /* discard if SCI, SCE or SCY */
249                    if (isPreviousChar(local, n, 'S') && !isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
250                        break;
251                    }
252                    if (regionMatch(local, n, "CIA")) { // "CIA" -> X
253                        code.append('X');
254                        break;
255                    }
256                    if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
257                        code.append('S');
258                        break; // CI,CE,CY -> S
259                    }
260                    if (isPreviousChar(local, n, 'S') && isNextChar(local, n, 'H')) { // SCH->sk
261                        code.append('K');
262                        break;
263                    }
264                    if (!isNextChar(local, n, 'H') || (n == 0 && wdsz >= 3 && isVowel(local, 2))) { // CH consonant -> K consonant
265                        code.append('K');
266                    } else {
267                        code.append('X'); // CHvowel -> X
268                    }
269                    break;
270                case 'D':
271                    if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'G') && FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
272                        code.append('J');
273                        n += 2;
274                    } else {
275                        code.append('T');
276                    }
277                    break;
278                case 'G': // GH silent at end or before consonant
279                    if (isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H')) {
280                        break;
281                    }
282                    if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H') && !isVowel(local, n + 2)) {
283                        break;
284                    }
285                    if (n > 0 && (regionMatch(local, n, "GN") || regionMatch(local, n, "GNED"))) {
286                        break; // silent G
287                    }
288                    // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
289                    hard = isPreviousChar(local, n, 'G');
290                    if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0 && !hard) {
291                        code.append('J');
292                    } else {
293                        code.append('K');
294                    }
295                    break;
296                case 'H':
297                    if (isLastChar(wdsz, n)) {
298                        break; // terminal H
299                    }
300                    if (n > 0 && VARSON.indexOf(local.charAt(n - 1)) >= 0) {
301                        break;
302                    }
303                    if (isVowel(local, n + 1)) {
304                        code.append('H'); // Hvowel
305                    }
306                    break;
307                case 'F':
308                case 'J':
309                case 'L':
310                case 'M':
311                case 'N':
312                case 'R':
313                    code.append(symb);
314                    break;
315                case 'K':
316                    if (n > 0) { // not initial
317                        if (!isPreviousChar(local, n, 'C')) {
318                            code.append(symb);
319                        }
320                    } else {
321                        code.append(symb); // initial K
322                    }
323                    break;
324                case 'P':
325                    if (isNextChar(local, n, 'H')) {
326                        // PH -> F
327                        code.append('F');
328                    } else {
329                        code.append(symb);
330                    }
331                    break;
332                case 'Q':
333                    code.append('K');
334                    break;
335                case 'S':
336                    if (regionMatch(local, n, "SH") || regionMatch(local, n, "SIO") || regionMatch(local, n, "SIA")) {
337                        code.append('X');
338                    } else {
339                        code.append('S');
340                    }
341                    break;
342                case 'T':
343                    if (regionMatch(local, n, "TIA") || regionMatch(local, n, "TIO")) {
344                        code.append('X');
345                        break;
346                    }
347                    if (regionMatch(local, n, "TCH")) {
348                        // Silent if in "TCH"
349                        break;
350                    }
351                    // substitute numeral 0 for TH (resembles theta after all)
352                    if (regionMatch(local, n, "TH")) {
353                        code.append('0');
354                    } else {
355                        code.append('T');
356                    }
357                    break;
358                case 'V':
359                    code.append('F');
360                    break;
361                case 'W':
362                case 'Y': // silent if not followed by vowel
363                    if (!isLastChar(wdsz, n) && isVowel(local, n + 1)) {
364                        code.append(symb);
365                    }
366                    break;
367                case 'X':
368                    code.append('K');
369                    code.append('S');
370                    break;
371                case 'Z':
372                    code.append('S');
373                    break;
374                default:
375                    // do nothing
376                    break;
377                } // end switch
378            } // end else from symb != 'C'
379            n++;
380            if (code.length() > getMaxCodeLen()) {
381                code.setLength(getMaxCodeLen());
382            }
383        }
384        return code.toString();
385    }
386
387    private boolean regionMatch(final StringBuilder string, final int index, final String test) {
388        boolean matches = false;
389        if (index >= 0 && index + test.length() - 1 < string.length()) {
390            final String substring = string.substring(index, index + test.length());
391            matches = substring.equals(test);
392        }
393        return matches;
394    }
395
396    /**
397     * Sets the maxCodeLen.
398     *
399     * @param maxCodeLen The maxCodeLen to set.
400     */
401    public void setMaxCodeLen(final int maxCodeLen) {
402        this.maxCodeLen = maxCodeLen;
403    }
404
405}