1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.commons.codec.language;
18
19 import java.util.Locale;
20
21 import org.apache.commons.codec.EncoderException;
22 import org.apache.commons.codec.StringEncoder;
23
24
25
26
27
28
29
30
31
32
33 public class MatchRatingApproachEncoder implements StringEncoder {
34
35 private static final String SPACE = " ";
36
37 private static final String EMPTY = "";
38
39
40
41
42 private static final String PLAIN_ASCII = "AaEeIiOoUu" +
43 "AaEeIiOoUuYy" +
44 "AaEeIiOoUuYy" +
45 "AaOoNn" +
46 "AaEeIiOoUuYy" +
47 "Aa" +
48 "Cc" +
49 "OoUu";
50
51
52
53
54 private static final String UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" +
55 "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" +
56 "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177" +
57 "\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" +
58 "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF" +
59 "\u00C5\u00E5" + "\u00C7\u00E7" + "\u0150\u0151\u0170\u0171";
60
61 private static final String[] DOUBLE_CONSONANT =
62 { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS",
63 "TT", "VV", "WW", "XX", "YY", "ZZ" };
64
65
66
67
68 public MatchRatingApproachEncoder() {
69
70 }
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85 String cleanName(final String name) {
86 String upperName = name.toUpperCase(Locale.ENGLISH);
87
88 final String[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" };
89 for (final String str : charsToTrim) {
90 upperName = upperName.replaceAll(str, EMPTY);
91 }
92
93 upperName = removeAccents(upperName);
94 return upperName.replaceAll("\\s+", EMPTY);
95 }
96
97
98
99
100
101
102
103
104
105
106
107
108 @Override
109 public final Object encode(final Object pObject) throws EncoderException {
110 if (!(pObject instanceof String)) {
111 throw new EncoderException(
112 "Parameter supplied to Match Rating Approach encoder is not of type java.lang.String");
113 }
114 return encode((String) pObject);
115 }
116
117
118
119
120
121
122
123
124 @Override
125 public final String encode(String name) {
126
127 if (name == null || EMPTY.equalsIgnoreCase(name) || SPACE.equalsIgnoreCase(name) || name.length() == 1) {
128 return EMPTY;
129 }
130
131
132 name = cleanName(name);
133
134
135 if (SPACE.equals(name) || name.isEmpty()) {
136 return EMPTY;
137 }
138
139
140
141 name = removeVowels(name);
142
143
144 if (SPACE.equals(name) || name.isEmpty()) {
145 return EMPTY;
146 }
147
148
149 name = removeDoubleConsonants(name);
150
151 return getFirst3Last3(name);
152 }
153
154
155
156
157
158
159
160
161
162
163
164
165
166 String getFirst3Last3(final String name) {
167 final int nameLength = name.length();
168
169 if (nameLength > 6) {
170 final String firstThree = name.substring(0, 3);
171 final String lastThree = name.substring(nameLength - 3, nameLength);
172 return firstThree + lastThree;
173 }
174 return name;
175 }
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190 int getMinRating(final int sumLength) {
191 int minRating = 0;
192
193 if (sumLength <= 4) {
194 minRating = 5;
195 } else if (sumLength <= 7) {
196 minRating = 4;
197 } else if (sumLength <= 11) {
198 minRating = 3;
199 } else if (sumLength == 12) {
200 minRating = 2;
201 } else {
202 minRating = 1;
203 }
204
205 return minRating;
206 }
207
208
209
210
211
212
213
214
215
216
217
218 public boolean isEncodeEquals(String name1, String name2) {
219
220 if (name1 == null || EMPTY.equalsIgnoreCase(name1) || SPACE.equalsIgnoreCase(name1)) {
221 return false;
222 }
223 if (name2 == null || EMPTY.equalsIgnoreCase(name2) || SPACE.equalsIgnoreCase(name2)) {
224 return false;
225 }
226 if (name1.length() == 1 || name2.length() == 1) {
227 return false;
228 }
229 if (name1.equalsIgnoreCase(name2)) {
230 return true;
231 }
232
233
234 name1 = cleanName(name1);
235 name2 = cleanName(name2);
236
237
238
239
240 name1 = removeVowels(name1);
241 name2 = removeVowels(name2);
242
243
244 name1 = removeDoubleConsonants(name1);
245 name2 = removeDoubleConsonants(name2);
246
247
248 name1 = getFirst3Last3(name1);
249 name2 = getFirst3Last3(name2);
250
251
252
253 if (Math.abs(name1.length() - name2.length()) >= 3) {
254 return false;
255 }
256
257
258
259 final int sumLength = Math.abs(name1.length() + name2.length());
260 final int minRating = getMinRating(sumLength);
261
262
263
264 final int count = leftToRightThenRightToLeftProcessing(name1, name2);
265
266
267
268 return count >= minRating;
269
270 }
271
272
273
274
275
276
277
278
279
280
281
282
283
284 boolean isVowel(final String letter) {
285 return letter.equalsIgnoreCase("E") || letter.equalsIgnoreCase("A") || letter.equalsIgnoreCase("O") ||
286 letter.equalsIgnoreCase("I") || letter.equalsIgnoreCase("U");
287 }
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302 int leftToRightThenRightToLeftProcessing(final String name1, final String name2) {
303 final char[] name1Char = name1.toCharArray();
304 final char[] name2Char = name2.toCharArray();
305
306 final int name1Size = name1.length() - 1;
307 final int name2Size = name2.length() - 1;
308
309 String name1LtRStart = EMPTY;
310 String name1LtREnd = EMPTY;
311
312 String name2RtLStart = EMPTY;
313 String name2RtLEnd = EMPTY;
314
315 for (int i = 0; i < name1Char.length; i++) {
316 if (i > name2Size) {
317 break;
318 }
319
320 name1LtRStart = name1.substring(i, i + 1);
321 name1LtREnd = name1.substring(name1Size - i, name1Size - i + 1);
322
323 name2RtLStart = name2.substring(i, i + 1);
324 name2RtLEnd = name2.substring(name2Size - i, name2Size - i + 1);
325
326
327 if (name1LtRStart.equals(name2RtLStart)) {
328 name1Char[i] = ' ';
329 name2Char[i] = ' ';
330 }
331
332
333 if (name1LtREnd.equals(name2RtLEnd)) {
334 name1Char[name1Size - i] = ' ';
335 name2Char[name2Size - i] = ' ';
336 }
337 }
338
339
340 final String strA = new String(name1Char).replaceAll("\\s+", EMPTY);
341 final String strB = new String(name2Char).replaceAll("\\s+", EMPTY);
342
343
344 if (strA.length() > strB.length()) {
345 return Math.abs(6 - strA.length());
346 }
347 return Math.abs(6 - strB.length());
348 }
349
350
351
352
353
354
355
356
357
358 String removeAccents(final String accentedWord) {
359 if (accentedWord == null) {
360 return null;
361 }
362
363 final StringBuilder sb = new StringBuilder();
364 final int n = accentedWord.length();
365
366 for (int i = 0; i < n; i++) {
367 final char c = accentedWord.charAt(i);
368 final int pos = UNICODE.indexOf(c);
369 if (pos > -1) {
370 sb.append(PLAIN_ASCII.charAt(pos));
371 } else {
372 sb.append(c);
373 }
374 }
375
376 return sb.toString();
377 }
378
379
380
381
382
383
384
385
386
387
388
389
390
391 String removeDoubleConsonants(final String name) {
392 String replacedName = name.toUpperCase(Locale.ENGLISH);
393 for (final String dc : DOUBLE_CONSONANT) {
394 if (replacedName.contains(dc)) {
395 final String singleLetter = dc.substring(0, 1);
396 replacedName = replacedName.replace(dc, singleLetter);
397 }
398 }
399 return replacedName;
400 }
401
402
403
404
405
406
407
408
409
410
411
412
413
414 String removeVowels(String name) {
415
416 final String firstLetter = name.substring(0, 1);
417
418 name = name.replace("A", EMPTY);
419 name = name.replace("E", EMPTY);
420 name = name.replace("I", EMPTY);
421 name = name.replace("O", EMPTY);
422 name = name.replace("U", EMPTY);
423
424 name = name.replaceAll("\\s{2,}\\b", SPACE);
425
426
427 if (isVowel(firstLetter)) {
428 return firstLetter + name;
429 }
430 return name;
431 }
432 }