1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.commons.codec.language;
19
20 import org.apache.commons.codec.EncoderException;
21 import org.apache.commons.codec.StringEncoder;
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55 public class Metaphone implements StringEncoder {
56
57
58
59
60 private static final String VOWELS = "AEIOU";
61
62
63
64
65 private static final String FRONTV = "EIY";
66
67
68
69
70 private static final String VARSON = "CSPTG";
71
72
73
74
75 private int maxCodeLen = 4;
76
77
78
79
80 public Metaphone() {
81
82 }
83
84
85
86
87
88
89
90
91
92
93
94
95
96 @Override
97 public Object encode(final Object obj) throws EncoderException {
98 if (!(obj instanceof String)) {
99 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
100 }
101 return metaphone((String) obj);
102 }
103
104
105
106
107
108
109
110 @Override
111 public String encode(final String str) {
112 return metaphone(str);
113 }
114
115
116
117
118
119
120 public int getMaxCodeLen() {
121 return this.maxCodeLen;
122 }
123
124 private boolean isLastChar(final int wdsz, final int n) {
125 return n + 1 == wdsz;
126 }
127
128
129
130
131
132
133
134
135
136 public boolean isMetaphoneEqual(final String str1, final String str2) {
137 return metaphone(str1).equals(metaphone(str2));
138 }
139
140 private boolean isNextChar(final StringBuilder string, final int index, final char c) {
141 boolean matches = false;
142 if (index >= 0 && index < string.length() - 1) {
143 matches = string.charAt(index + 1) == c;
144 }
145 return matches;
146 }
147
148 private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
149 boolean matches = false;
150 if (index > 0 && index < string.length()) {
151 matches = string.charAt(index - 1) == c;
152 }
153 return matches;
154 }
155
156 private boolean isVowel(final StringBuilder string, final int index) {
157 return VOWELS.indexOf(string.charAt(index)) >= 0;
158 }
159
160
161
162
163
164
165
166
167
168
169
170 public String metaphone(final String txt) {
171 boolean hard = false;
172 final int txtLength;
173 if (txt == null || (txtLength = txt.length()) == 0) {
174 return "";
175 }
176
177 if (txtLength == 1) {
178 return txt.toUpperCase(java.util.Locale.ENGLISH);
179 }
180
181 final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
182
183 final StringBuilder local = new StringBuilder(40);
184 final StringBuilder code = new StringBuilder(10);
185
186 switch (inwd[0]) {
187 case 'K':
188 case 'G':
189 case 'P':
190 if (inwd[1] == 'N') {
191 local.append(inwd, 1, inwd.length - 1);
192 } else {
193 local.append(inwd);
194 }
195 break;
196 case 'A':
197 if (inwd[1] == 'E') {
198 local.append(inwd, 1, inwd.length - 1);
199 } else {
200 local.append(inwd);
201 }
202 break;
203 case 'W':
204 if (inwd[1] == 'R') {
205 local.append(inwd, 1, inwd.length - 1);
206 break;
207 }
208 if (inwd[1] == 'H') {
209 local.append(inwd, 1, inwd.length - 1);
210 local.setCharAt(0, 'W');
211 } else {
212 local.append(inwd);
213 }
214 break;
215 case 'X':
216 inwd[0] = 'S';
217 local.append(inwd);
218 break;
219 default:
220 local.append(inwd);
221 }
222
223 final int wdsz = local.length();
224 int n = 0;
225
226 while (code.length() < getMaxCodeLen() && n < wdsz) {
227 final char symb = local.charAt(n);
228
229 if (symb != 'C' && isPreviousChar(local, n, symb)) {
230 } else {
231 switch (symb) {
232 case 'A':
233 case 'E':
234 case 'I':
235 case 'O':
236 case 'U':
237 if (n == 0) {
238 code.append(symb);
239 }
240 break;
241 case 'B':
242 if (isPreviousChar(local, n, 'M') && isLastChar(wdsz, n)) {
243 break;
244 }
245 code.append(symb);
246 break;
247 case 'C':
248
249 if (isPreviousChar(local, n, 'S') && !isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
250 break;
251 }
252 if (regionMatch(local, n, "CIA")) {
253 code.append('X');
254 break;
255 }
256 if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
257 code.append('S');
258 break;
259 }
260 if (isPreviousChar(local, n, 'S') && isNextChar(local, n, 'H')) {
261 code.append('K');
262 break;
263 }
264 if (!isNextChar(local, n, 'H') || (n == 0 && wdsz >= 3 && isVowel(local, 2))) {
265 code.append('K');
266 } else {
267 code.append('X');
268 }
269 break;
270 case 'D':
271 if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'G') && FRONTV.indexOf(local.charAt(n + 2)) >= 0) {
272 code.append('J');
273 n += 2;
274 } else {
275 code.append('T');
276 }
277 break;
278 case 'G':
279 if (isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H')) {
280 break;
281 }
282 if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H') && !isVowel(local, n + 2)) {
283 break;
284 }
285 if (n > 0 && (regionMatch(local, n, "GN") || regionMatch(local, n, "GNED"))) {
286 break;
287 }
288
289 hard = isPreviousChar(local, n, 'G');
290 if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0 && !hard) {
291 code.append('J');
292 } else {
293 code.append('K');
294 }
295 break;
296 case 'H':
297 if (isLastChar(wdsz, n)) {
298 break;
299 }
300 if (n > 0 && VARSON.indexOf(local.charAt(n - 1)) >= 0) {
301 break;
302 }
303 if (isVowel(local, n + 1)) {
304 code.append('H');
305 }
306 break;
307 case 'F':
308 case 'J':
309 case 'L':
310 case 'M':
311 case 'N':
312 case 'R':
313 code.append(symb);
314 break;
315 case 'K':
316 if (n > 0) {
317 if (!isPreviousChar(local, n, 'C')) {
318 code.append(symb);
319 }
320 } else {
321 code.append(symb);
322 }
323 break;
324 case 'P':
325 if (isNextChar(local, n, 'H')) {
326
327 code.append('F');
328 } else {
329 code.append(symb);
330 }
331 break;
332 case 'Q':
333 code.append('K');
334 break;
335 case 'S':
336 if (regionMatch(local, n, "SH") || regionMatch(local, n, "SIO") || regionMatch(local, n, "SIA")) {
337 code.append('X');
338 } else {
339 code.append('S');
340 }
341 break;
342 case 'T':
343 if (regionMatch(local, n, "TIA") || regionMatch(local, n, "TIO")) {
344 code.append('X');
345 break;
346 }
347 if (regionMatch(local, n, "TCH")) {
348
349 break;
350 }
351
352 if (regionMatch(local, n, "TH")) {
353 code.append('0');
354 } else {
355 code.append('T');
356 }
357 break;
358 case 'V':
359 code.append('F');
360 break;
361 case 'W':
362 case 'Y':
363 if (!isLastChar(wdsz, n) && isVowel(local, n + 1)) {
364 code.append(symb);
365 }
366 break;
367 case 'X':
368 code.append('K');
369 code.append('S');
370 break;
371 case 'Z':
372 code.append('S');
373 break;
374 default:
375
376 break;
377 }
378 }
379 n++;
380 if (code.length() > getMaxCodeLen()) {
381 code.setLength(getMaxCodeLen());
382 }
383 }
384 return code.toString();
385 }
386
387 private boolean regionMatch(final StringBuilder string, final int index, final String test) {
388 boolean matches = false;
389 if (index >= 0 && index + test.length() - 1 < string.length()) {
390 final String substring = string.substring(index, index + test.length());
391 matches = substring.equals(test);
392 }
393 return matches;
394 }
395
396
397
398
399
400
401 public void setMaxCodeLen(final int maxCodeLen) {
402 this.maxCodeLen = maxCodeLen;
403 }
404
405 }