001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language; 019 020import org.apache.commons.codec.EncoderException; 021import org.apache.commons.codec.StringEncoder; 022 023/** 024 * Encodes a string into a Metaphone value. 025 * <p> 026 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. 027 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere. 028 * </p> 029 * <p> 030 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, 031 * p 39.</CITE> 032 * </p> 033 * <p> 034 * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations: 035 * </p> 036 * <ul> 037 * <li><a href="https://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a> 038 * (broken link 4/30/2013) </li> 039 * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a> 040 * (link checked 4/30/2013) </li> 041 * </ul> 042 * <p> 043 * They have had undocumented changes from the originally published algorithm. 044 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>. 045 * </p> 046 * <p> 047 * This class is conditionally thread-safe. 048 * The instance field for maximum code length is mutable {@link #setMaxCodeLen(int)} 049 * but is not volatile, and accesses are not synchronized. 050 * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization 051 * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} 052 * after initial setup. 053 * </p> 054 */ 055public class Metaphone implements StringEncoder { 056 057 /** 058 * Five values in the English language 059 */ 060 private static final String VOWELS = "AEIOU"; 061 062 /** 063 * Variable used in Metaphone algorithm 064 */ 065 private static final String FRONTV = "EIY"; 066 067 /** 068 * Variable used in Metaphone algorithm 069 */ 070 private static final String VARSON = "CSPTG"; 071 072 /** 073 * The max code length for metaphone is 4 074 */ 075 private int maxCodeLen = 4; 076 077 /** 078 * Constructs a new instance. 079 */ 080 public Metaphone() { 081 // empty 082 } 083 084 /** 085 * Encodes an Object using the metaphone algorithm. This method 086 * is provided in order to satisfy the requirements of the 087 * Encoder interface, and will throw an EncoderException if the 088 * supplied object is not of type {@link String}. 089 * 090 * @param obj Object to encode 091 * @return An object (or type {@link String}) containing the 092 * metaphone code which corresponds to the String supplied. 093 * @throws EncoderException if the parameter supplied is not 094 * of type {@link String} 095 */ 096 @Override 097 public Object encode(final Object obj) throws EncoderException { 098 if (!(obj instanceof String)) { 099 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); 100 } 101 return metaphone((String) obj); 102 } 103 104 /** 105 * Encodes a String using the Metaphone algorithm. 106 * 107 * @param str String object to encode 108 * @return The metaphone code corresponding to the String supplied 109 */ 110 @Override 111 public String encode(final String str) { 112 return metaphone(str); 113 } 114 115 /** 116 * Gets the maxCodeLen. 117 * 118 * @return the maxCodeLen. 119 */ 120 public int getMaxCodeLen() { 121 return this.maxCodeLen; 122 } 123 124 private boolean isLastChar(final int wdsz, final int n) { 125 return n + 1 == wdsz; 126 } 127 128 /** 129 * Tests is the metaphones of two strings are identical. 130 * 131 * @param str1 First of two strings to compare 132 * @param str2 Second of two strings to compare 133 * @return {@code true} if the metaphones of these strings are identical, 134 * {@code false} otherwise. 135 */ 136 public boolean isMetaphoneEqual(final String str1, final String str2) { 137 return metaphone(str1).equals(metaphone(str2)); 138 } 139 140 private boolean isNextChar(final StringBuilder string, final int index, final char c) { 141 boolean matches = false; 142 if (index >= 0 && index < string.length() - 1) { 143 matches = string.charAt(index + 1) == c; 144 } 145 return matches; 146 } 147 148 private boolean isPreviousChar(final StringBuilder string, final int index, final char c) { 149 boolean matches = false; 150 if (index > 0 && index < string.length()) { 151 matches = string.charAt(index - 1) == c; 152 } 153 return matches; 154 } 155 156 private boolean isVowel(final StringBuilder string, final int index) { 157 return VOWELS.indexOf(string.charAt(index)) >= 0; 158 } 159 160 /** 161 * Find the metaphone value of a String. This is similar to the 162 * soundex algorithm, but better at finding similar sounding words. 163 * All input is converted to upper case. 164 * Limitations: Input format is expected to be a single ASCII word 165 * with only characters in the A - Z range, no punctuation or numbers. 166 * 167 * @param txt String to find the metaphone code for 168 * @return A metaphone code corresponding to the String supplied 169 */ 170 public String metaphone(final String txt) { 171 boolean hard = false; 172 final int txtLength; 173 if (txt == null || (txtLength = txt.length()) == 0) { 174 return ""; 175 } 176 // single character is itself 177 if (txtLength == 1) { 178 return txt.toUpperCase(java.util.Locale.ENGLISH); 179 } 180 181 final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray(); 182 183 final StringBuilder local = new StringBuilder(40); // manipulate 184 final StringBuilder code = new StringBuilder(10); // output 185 // handle initial 2 characters exceptions 186 switch (inwd[0]) { 187 case 'K': 188 case 'G': 189 case 'P': /* looking for KN, etc */ 190 if (inwd[1] == 'N') { 191 local.append(inwd, 1, inwd.length - 1); 192 } else { 193 local.append(inwd); 194 } 195 break; 196 case 'A': /* looking for AE */ 197 if (inwd[1] == 'E') { 198 local.append(inwd, 1, inwd.length - 1); 199 } else { 200 local.append(inwd); 201 } 202 break; 203 case 'W': /* looking for WR or WH */ 204 if (inwd[1] == 'R') { // WR -> R 205 local.append(inwd, 1, inwd.length - 1); 206 break; 207 } 208 if (inwd[1] == 'H') { 209 local.append(inwd, 1, inwd.length - 1); 210 local.setCharAt(0, 'W'); // WH -> W 211 } else { 212 local.append(inwd); 213 } 214 break; 215 case 'X': /* initial X becomes S */ 216 inwd[0] = 'S'; 217 local.append(inwd); 218 break; 219 default: 220 local.append(inwd); 221 } // now local has working string with initials fixed 222 223 final int wdsz = local.length(); 224 int n = 0; 225 226 while (code.length() < getMaxCodeLen() && n < wdsz) { // max code size of 4 works well 227 final char symb = local.charAt(n); 228 // remove duplicate letters except C 229 if (symb != 'C' && isPreviousChar(local, n, symb)) { 230 } else { // not dup 231 switch (symb) { 232 case 'A': 233 case 'E': 234 case 'I': 235 case 'O': 236 case 'U': 237 if (n == 0) { 238 code.append(symb); 239 } 240 break; // only use vowel if leading char 241 case 'B': 242 if (isPreviousChar(local, n, 'M') && isLastChar(wdsz, n)) { // B is silent if word ends in MB 243 break; 244 } 245 code.append(symb); 246 break; 247 case 'C': // lots of C special cases 248 /* discard if SCI, SCE or SCY */ 249 if (isPreviousChar(local, n, 'S') && !isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) { 250 break; 251 } 252 if (regionMatch(local, n, "CIA")) { // "CIA" -> X 253 code.append('X'); 254 break; 255 } 256 if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) { 257 code.append('S'); 258 break; // CI,CE,CY -> S 259 } 260 if (isPreviousChar(local, n, 'S') && isNextChar(local, n, 'H')) { // SCH->sk 261 code.append('K'); 262 break; 263 } 264 if (!isNextChar(local, n, 'H') || (n == 0 && wdsz >= 3 && isVowel(local, 2))) { // CH consonant -> K consonant 265 code.append('K'); 266 } else { 267 code.append('X'); // CHvowel -> X 268 } 269 break; 270 case 'D': 271 if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'G') && FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J 272 code.append('J'); 273 n += 2; 274 } else { 275 code.append('T'); 276 } 277 break; 278 case 'G': // GH silent at end or before consonant 279 if (isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H')) { 280 break; 281 } 282 if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H') && !isVowel(local, n + 2)) { 283 break; 284 } 285 if (n > 0 && (regionMatch(local, n, "GN") || regionMatch(local, n, "GNED"))) { 286 break; // silent G 287 } 288 // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true 289 hard = isPreviousChar(local, n, 'G'); 290 if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0 && !hard) { 291 code.append('J'); 292 } else { 293 code.append('K'); 294 } 295 break; 296 case 'H': 297 if (isLastChar(wdsz, n)) { 298 break; // terminal H 299 } 300 if (n > 0 && VARSON.indexOf(local.charAt(n - 1)) >= 0) { 301 break; 302 } 303 if (isVowel(local, n + 1)) { 304 code.append('H'); // Hvowel 305 } 306 break; 307 case 'F': 308 case 'J': 309 case 'L': 310 case 'M': 311 case 'N': 312 case 'R': 313 code.append(symb); 314 break; 315 case 'K': 316 if (n > 0) { // not initial 317 if (!isPreviousChar(local, n, 'C')) { 318 code.append(symb); 319 } 320 } else { 321 code.append(symb); // initial K 322 } 323 break; 324 case 'P': 325 if (isNextChar(local, n, 'H')) { 326 // PH -> F 327 code.append('F'); 328 } else { 329 code.append(symb); 330 } 331 break; 332 case 'Q': 333 code.append('K'); 334 break; 335 case 'S': 336 if (regionMatch(local, n, "SH") || regionMatch(local, n, "SIO") || regionMatch(local, n, "SIA")) { 337 code.append('X'); 338 } else { 339 code.append('S'); 340 } 341 break; 342 case 'T': 343 if (regionMatch(local, n, "TIA") || regionMatch(local, n, "TIO")) { 344 code.append('X'); 345 break; 346 } 347 if (regionMatch(local, n, "TCH")) { 348 // Silent if in "TCH" 349 break; 350 } 351 // substitute numeral 0 for TH (resembles theta after all) 352 if (regionMatch(local, n, "TH")) { 353 code.append('0'); 354 } else { 355 code.append('T'); 356 } 357 break; 358 case 'V': 359 code.append('F'); 360 break; 361 case 'W': 362 case 'Y': // silent if not followed by vowel 363 if (!isLastChar(wdsz, n) && isVowel(local, n + 1)) { 364 code.append(symb); 365 } 366 break; 367 case 'X': 368 code.append('K'); 369 code.append('S'); 370 break; 371 case 'Z': 372 code.append('S'); 373 break; 374 default: 375 // do nothing 376 break; 377 } // end switch 378 } // end else from symb != 'C' 379 n++; 380 if (code.length() > getMaxCodeLen()) { 381 code.setLength(getMaxCodeLen()); 382 } 383 } 384 return code.toString(); 385 } 386 387 private boolean regionMatch(final StringBuilder string, final int index, final String test) { 388 boolean matches = false; 389 if (index >= 0 && index + test.length() - 1 < string.length()) { 390 final String substring = string.substring(index, index + test.length()); 391 matches = substring.equals(test); 392 } 393 return matches; 394 } 395 396 /** 397 * Sets the maxCodeLen. 398 * 399 * @param maxCodeLen The maxCodeLen to set. 400 */ 401 public void setMaxCodeLen(final int maxCodeLen) { 402 this.maxCodeLen = maxCodeLen; 403 } 404 405}