1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.commons.codec.language;
19
20 import org.apache.commons.codec.EncoderException;
21 import org.apache.commons.codec.StringEncoder;
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55 public class Metaphone implements StringEncoder {
56
57
58
59
60 private static final String VOWELS = "AEIOU";
61
62
63
64
65 private static final String FRONTV = "EIY";
66
67
68
69
70 private static final String VARSON = "CSPTG";
71
72
73
74
75 private int maxCodeLen = 4;
76
77
78
79
80
81
82
83
84
85
86
87
88
89 @Override
90 public Object encode(final Object obj) throws EncoderException {
91 if (!(obj instanceof String)) {
92 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
93 }
94 return metaphone((String) obj);
95 }
96
97
98
99
100
101
102
103 @Override
104 public String encode(final String str) {
105 return metaphone(str);
106 }
107
108
109
110
111
112 public int getMaxCodeLen() { return this.maxCodeLen; }
113
114 private boolean isLastChar(final int wdsz, final int n) {
115 return n + 1 == wdsz;
116 }
117
118
119
120
121
122
123
124
125
126 public boolean isMetaphoneEqual(final String str1, final String str2) {
127 return metaphone(str1).equals(metaphone(str2));
128 }
129
130 private boolean isNextChar(final StringBuilder string, final int index, final char c) {
131 boolean matches = false;
132 if (index >= 0 && index < string.length() - 1) {
133 matches = string.charAt(index + 1) == c;
134 }
135 return matches;
136 }
137
138 private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
139 boolean matches = false;
140 if (index > 0 && index < string.length()) {
141 matches = string.charAt(index - 1) == c;
142 }
143 return matches;
144 }
145
146 private boolean isVowel(final StringBuilder string, final int index) {
147 return VOWELS.indexOf(string.charAt(index)) >= 0;
148 }
149
150
151
152
153
154
155
156
157
158
159
160 public String metaphone(final String txt) {
161 boolean hard = false;
162 final int txtLength;
163 if (txt == null || (txtLength = txt.length()) == 0) {
164 return "";
165 }
166
167 if (txtLength == 1) {
168 return txt.toUpperCase(java.util.Locale.ENGLISH);
169 }
170
171 final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
172
173 final StringBuilder local = new StringBuilder(40);
174 final StringBuilder code = new StringBuilder(10);
175
176 switch (inwd[0]) {
177 case 'K':
178 case 'G':
179 case 'P':
180 if (inwd[1] == 'N') {
181 local.append(inwd, 1, inwd.length - 1);
182 } else {
183 local.append(inwd);
184 }
185 break;
186 case 'A':
187 if (inwd[1] == 'E') {
188 local.append(inwd, 1, inwd.length - 1);
189 } else {
190 local.append(inwd);
191 }
192 break;
193 case 'W':
194 if (inwd[1] == 'R') {
195 local.append(inwd, 1, inwd.length - 1);
196 break;
197 }
198 if (inwd[1] == 'H') {
199 local.append(inwd, 1, inwd.length - 1);
200 local.setCharAt(0, 'W');
201 } else {
202 local.append(inwd);
203 }
204 break;
205 case 'X':
206 inwd[0] = 'S';
207 local.append(inwd);
208 break;
209 default:
210 local.append(inwd);
211 }
212
213 final int wdsz = local.length();
214 int n = 0;
215
216 while (code.length() < getMaxCodeLen() && n < wdsz) {
217 final char symb = local.charAt(n);
218
219 if (symb != 'C' && isPreviousChar(local, n, symb)) {
220 } else {
221 switch (symb) {
222 case 'A':
223 case 'E':
224 case 'I':
225 case 'O':
226 case 'U':
227 if (n == 0) {
228 code.append(symb);
229 }
230 break;
231 case 'B':
232 if (isPreviousChar(local, n, 'M') && isLastChar(wdsz, n)) {
233 break;
234 }
235 code.append(symb);
236 break;
237 case 'C':
238
239 if (isPreviousChar(local, n, 'S') && !isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
240 break;
241 }
242 if (regionMatch(local, n, "CIA")) {
243 code.append('X');
244 break;
245 }
246 if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
247 code.append('S');
248 break;
249 }
250 if (isPreviousChar(local, n, 'S') && isNextChar(local, n, 'H')) {
251 code.append('K');
252 break;
253 }
254 if (!isNextChar(local, n, 'H') || (n == 0 && wdsz >= 3 && isVowel(local, 2))) {
255 code.append('K');
256 } else {
257 code.append('X');
258 }
259 break;
260 case 'D':
261 if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'G') && FRONTV.indexOf(local.charAt(n + 2)) >= 0) {
262 code.append('J');
263 n += 2;
264 } else {
265 code.append('T');
266 }
267 break;
268 case 'G':
269 if (isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H')) {
270 break;
271 }
272 if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H') && !isVowel(local, n + 2)) {
273 break;
274 }
275 if (n > 0 && (regionMatch(local, n, "GN") || regionMatch(local, n, "GNED"))) {
276 break;
277 }
278
279 hard = isPreviousChar(local, n, 'G');
280 if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0 && !hard) {
281 code.append('J');
282 } else {
283 code.append('K');
284 }
285 break;
286 case 'H':
287 if (isLastChar(wdsz, n)) {
288 break;
289 }
290 if (n > 0 && VARSON.indexOf(local.charAt(n - 1)) >= 0) {
291 break;
292 }
293 if (isVowel(local, n + 1)) {
294 code.append('H');
295 }
296 break;
297 case 'F':
298 case 'J':
299 case 'L':
300 case 'M':
301 case 'N':
302 case 'R':
303 code.append(symb);
304 break;
305 case 'K':
306 if (n > 0) {
307 if (!isPreviousChar(local, n, 'C')) {
308 code.append(symb);
309 }
310 } else {
311 code.append(symb);
312 }
313 break;
314 case 'P':
315 if (isNextChar(local, n, 'H')) {
316
317 code.append('F');
318 } else {
319 code.append(symb);
320 }
321 break;
322 case 'Q':
323 code.append('K');
324 break;
325 case 'S':
326 if (regionMatch(local, n, "SH") || regionMatch(local, n, "SIO") || regionMatch(local, n, "SIA")) {
327 code.append('X');
328 } else {
329 code.append('S');
330 }
331 break;
332 case 'T':
333 if (regionMatch(local, n, "TIA") || regionMatch(local, n, "TIO")) {
334 code.append('X');
335 break;
336 }
337 if (regionMatch(local, n, "TCH")) {
338
339 break;
340 }
341
342 if (regionMatch(local, n, "TH")) {
343 code.append('0');
344 } else {
345 code.append('T');
346 }
347 break;
348 case 'V':
349 code.append('F');
350 break;
351 case 'W':
352 case 'Y':
353 if (!isLastChar(wdsz, n) && isVowel(local, n + 1)) {
354 code.append(symb);
355 }
356 break;
357 case 'X':
358 code.append('K');
359 code.append('S');
360 break;
361 case 'Z':
362 code.append('S');
363 break;
364 default:
365
366 break;
367 }
368 }
369 n++;
370 if (code.length() > getMaxCodeLen()) {
371 code.setLength(getMaxCodeLen());
372 }
373 }
374 return code.toString();
375 }
376
377 private boolean regionMatch(final StringBuilder string, final int index, final String test) {
378 boolean matches = false;
379 if (index >= 0 && index + test.length() - 1 < string.length()) {
380 final String substring = string.substring(index, index + test.length());
381 matches = substring.equals(test);
382 }
383 return matches;
384 }
385
386
387
388
389
390 public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
391
392 }