001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language; 019 020import org.apache.commons.codec.EncoderException; 021import org.apache.commons.codec.StringEncoder; 022 023/** 024 * Encodes a string into a Metaphone value. 025 * <p> 026 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. 027 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere. 028 * </p> 029 * <p> 030 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, 031 * p 39.</CITE> 032 * </p> 033 * <p> 034 * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations: 035 * </p> 036 * <ul> 037 * <li><a href="https://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a> 038 * (broken link 4/30/2013) </li> 039 * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a> 040 * (link checked 4/30/2013) </li> 041 * </ul> 042 * <p> 043 * They have had undocumented changes from the originally published algorithm. 044 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>. 045 * </p> 046 * <p> 047 * This class is conditionally thread-safe. 048 * The instance field for maximum code length is mutable {@link #setMaxCodeLen(int)} 049 * but is not volatile, and accesses are not synchronized. 050 * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization 051 * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} 052 * after initial setup. 053 * </p> 054 */ 055public class Metaphone implements StringEncoder { 056 057 /** 058 * Five values in the English language 059 */ 060 private static final String VOWELS = "AEIOU"; 061 062 /** 063 * Variable used in Metaphone algorithm 064 */ 065 private static final String FRONTV = "EIY"; 066 067 /** 068 * Variable used in Metaphone algorithm 069 */ 070 private static final String VARSON = "CSPTG"; 071 072 /** 073 * The max code length for metaphone is 4 074 */ 075 private int maxCodeLen = 4; 076 077 /** 078 * Encodes an Object using the metaphone algorithm. This method 079 * is provided in order to satisfy the requirements of the 080 * Encoder interface, and will throw an EncoderException if the 081 * supplied object is not of type {@link String}. 082 * 083 * @param obj Object to encode 084 * @return An object (or type {@link String}) containing the 085 * metaphone code which corresponds to the String supplied. 086 * @throws EncoderException if the parameter supplied is not 087 * of type {@link String} 088 */ 089 @Override 090 public Object encode(final Object obj) throws EncoderException { 091 if (!(obj instanceof String)) { 092 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); 093 } 094 return metaphone((String) obj); 095 } 096 097 /** 098 * Encodes a String using the Metaphone algorithm. 099 * 100 * @param str String object to encode 101 * @return The metaphone code corresponding to the String supplied 102 */ 103 @Override 104 public String encode(final String str) { 105 return metaphone(str); 106 } 107 108 /** 109 * Returns the maxCodeLen. 110 * @return int 111 */ 112 public int getMaxCodeLen() { return this.maxCodeLen; } 113 114 private boolean isLastChar(final int wdsz, final int n) { 115 return n + 1 == wdsz; 116 } 117 118 /** 119 * Tests is the metaphones of two strings are identical. 120 * 121 * @param str1 First of two strings to compare 122 * @param str2 Second of two strings to compare 123 * @return {@code true} if the metaphones of these strings are identical, 124 * {@code false} otherwise. 125 */ 126 public boolean isMetaphoneEqual(final String str1, final String str2) { 127 return metaphone(str1).equals(metaphone(str2)); 128 } 129 130 private boolean isNextChar(final StringBuilder string, final int index, final char c) { 131 boolean matches = false; 132 if (index >= 0 && index < string.length() - 1) { 133 matches = string.charAt(index + 1) == c; 134 } 135 return matches; 136 } 137 138 private boolean isPreviousChar(final StringBuilder string, final int index, final char c) { 139 boolean matches = false; 140 if (index > 0 && index < string.length()) { 141 matches = string.charAt(index - 1) == c; 142 } 143 return matches; 144 } 145 146 private boolean isVowel(final StringBuilder string, final int index) { 147 return VOWELS.indexOf(string.charAt(index)) >= 0; 148 } 149 150 /** 151 * Find the metaphone value of a String. This is similar to the 152 * soundex algorithm, but better at finding similar sounding words. 153 * All input is converted to upper case. 154 * Limitations: Input format is expected to be a single ASCII word 155 * with only characters in the A - Z range, no punctuation or numbers. 156 * 157 * @param txt String to find the metaphone code for 158 * @return A metaphone code corresponding to the String supplied 159 */ 160 public String metaphone(final String txt) { 161 boolean hard = false; 162 final int txtLength; 163 if (txt == null || (txtLength = txt.length()) == 0) { 164 return ""; 165 } 166 // single character is itself 167 if (txtLength == 1) { 168 return txt.toUpperCase(java.util.Locale.ENGLISH); 169 } 170 171 final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray(); 172 173 final StringBuilder local = new StringBuilder(40); // manipulate 174 final StringBuilder code = new StringBuilder(10); // output 175 // handle initial 2 characters exceptions 176 switch (inwd[0]) { 177 case 'K': 178 case 'G': 179 case 'P': /* looking for KN, etc */ 180 if (inwd[1] == 'N') { 181 local.append(inwd, 1, inwd.length - 1); 182 } else { 183 local.append(inwd); 184 } 185 break; 186 case 'A': /* looking for AE */ 187 if (inwd[1] == 'E') { 188 local.append(inwd, 1, inwd.length - 1); 189 } else { 190 local.append(inwd); 191 } 192 break; 193 case 'W': /* looking for WR or WH */ 194 if (inwd[1] == 'R') { // WR -> R 195 local.append(inwd, 1, inwd.length - 1); 196 break; 197 } 198 if (inwd[1] == 'H') { 199 local.append(inwd, 1, inwd.length - 1); 200 local.setCharAt(0, 'W'); // WH -> W 201 } else { 202 local.append(inwd); 203 } 204 break; 205 case 'X': /* initial X becomes S */ 206 inwd[0] = 'S'; 207 local.append(inwd); 208 break; 209 default: 210 local.append(inwd); 211 } // now local has working string with initials fixed 212 213 final int wdsz = local.length(); 214 int n = 0; 215 216 while (code.length() < getMaxCodeLen() && n < wdsz) { // max code size of 4 works well 217 final char symb = local.charAt(n); 218 // remove duplicate letters except C 219 if (symb != 'C' && isPreviousChar(local, n, symb)) { 220 } else { // not dup 221 switch (symb) { 222 case 'A': 223 case 'E': 224 case 'I': 225 case 'O': 226 case 'U': 227 if (n == 0) { 228 code.append(symb); 229 } 230 break; // only use vowel if leading char 231 case 'B': 232 if (isPreviousChar(local, n, 'M') && isLastChar(wdsz, n)) { // B is silent if word ends in MB 233 break; 234 } 235 code.append(symb); 236 break; 237 case 'C': // lots of C special cases 238 /* discard if SCI, SCE or SCY */ 239 if (isPreviousChar(local, n, 'S') && !isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) { 240 break; 241 } 242 if (regionMatch(local, n, "CIA")) { // "CIA" -> X 243 code.append('X'); 244 break; 245 } 246 if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) { 247 code.append('S'); 248 break; // CI,CE,CY -> S 249 } 250 if (isPreviousChar(local, n, 'S') && isNextChar(local, n, 'H')) { // SCH->sk 251 code.append('K'); 252 break; 253 } 254 if (!isNextChar(local, n, 'H') || (n == 0 && wdsz >= 3 && isVowel(local, 2))) { // CH consonant -> K consonant 255 code.append('K'); 256 } else { 257 code.append('X'); // CHvowel -> X 258 } 259 break; 260 case 'D': 261 if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'G') && FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J 262 code.append('J'); 263 n += 2; 264 } else { 265 code.append('T'); 266 } 267 break; 268 case 'G': // GH silent at end or before consonant 269 if (isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H')) { 270 break; 271 } 272 if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H') && !isVowel(local, n + 2)) { 273 break; 274 } 275 if (n > 0 && (regionMatch(local, n, "GN") || regionMatch(local, n, "GNED"))) { 276 break; // silent G 277 } 278 // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true 279 hard = isPreviousChar(local, n, 'G'); 280 if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0 && !hard) { 281 code.append('J'); 282 } else { 283 code.append('K'); 284 } 285 break; 286 case 'H': 287 if (isLastChar(wdsz, n)) { 288 break; // terminal H 289 } 290 if (n > 0 && VARSON.indexOf(local.charAt(n - 1)) >= 0) { 291 break; 292 } 293 if (isVowel(local, n + 1)) { 294 code.append('H'); // Hvowel 295 } 296 break; 297 case 'F': 298 case 'J': 299 case 'L': 300 case 'M': 301 case 'N': 302 case 'R': 303 code.append(symb); 304 break; 305 case 'K': 306 if (n > 0) { // not initial 307 if (!isPreviousChar(local, n, 'C')) { 308 code.append(symb); 309 } 310 } else { 311 code.append(symb); // initial K 312 } 313 break; 314 case 'P': 315 if (isNextChar(local, n, 'H')) { 316 // PH -> F 317 code.append('F'); 318 } else { 319 code.append(symb); 320 } 321 break; 322 case 'Q': 323 code.append('K'); 324 break; 325 case 'S': 326 if (regionMatch(local, n, "SH") || regionMatch(local, n, "SIO") || regionMatch(local, n, "SIA")) { 327 code.append('X'); 328 } else { 329 code.append('S'); 330 } 331 break; 332 case 'T': 333 if (regionMatch(local, n, "TIA") || regionMatch(local, n, "TIO")) { 334 code.append('X'); 335 break; 336 } 337 if (regionMatch(local, n, "TCH")) { 338 // Silent if in "TCH" 339 break; 340 } 341 // substitute numeral 0 for TH (resembles theta after all) 342 if (regionMatch(local, n, "TH")) { 343 code.append('0'); 344 } else { 345 code.append('T'); 346 } 347 break; 348 case 'V': 349 code.append('F'); 350 break; 351 case 'W': 352 case 'Y': // silent if not followed by vowel 353 if (!isLastChar(wdsz, n) && isVowel(local, n + 1)) { 354 code.append(symb); 355 } 356 break; 357 case 'X': 358 code.append('K'); 359 code.append('S'); 360 break; 361 case 'Z': 362 code.append('S'); 363 break; 364 default: 365 // do nothing 366 break; 367 } // end switch 368 } // end else from symb != 'C' 369 n++; 370 if (code.length() > getMaxCodeLen()) { 371 code.setLength(getMaxCodeLen()); 372 } 373 } 374 return code.toString(); 375 } 376 377 private boolean regionMatch(final StringBuilder string, final int index, final String test) { 378 boolean matches = false; 379 if (index >= 0 && index + test.length() - 1 < string.length()) { 380 final String substring = string.substring(index, index + test.length()); 381 matches = substring.equals(test); 382 } 383 return matches; 384 } 385 386 /** 387 * Sets the maxCodeLen. 388 * @param maxCodeLen The maxCodeLen to set 389 */ 390 public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; } 391 392}