1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.commons.codec.language;
18
19 import java.util.ArrayList;
20 import java.util.Arrays;
21 import java.util.Collections;
22 import java.util.HashMap;
23 import java.util.LinkedHashSet;
24 import java.util.List;
25 import java.util.Map;
26 import java.util.Scanner;
27 import java.util.Set;
28
29 import org.apache.commons.codec.CharEncoding;
30 import org.apache.commons.codec.EncoderException;
31 import org.apache.commons.codec.Resources;
32 import org.apache.commons.codec.StringEncoder;
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70 public class DaitchMokotoffSoundex implements StringEncoder {
71
72
73
74
75 private static final class Branch {
76 private final StringBuilder builder;
77 private String cachedString;
78 private String lastReplacement;
79
80 private Branch() {
81 builder = new StringBuilder();
82 lastReplacement = null;
83 cachedString = null;
84 }
85
86
87
88
89
90
91 public Branch createBranch() {
92 final Branch branch = new Branch();
93 branch.builder.append(toString());
94 branch.lastReplacement = this.lastReplacement;
95 return branch;
96 }
97
98 @Override
99 public boolean equals(final Object other) {
100 if (this == other) {
101 return true;
102 }
103 if (!(other instanceof Branch)) {
104 return false;
105 }
106
107 return toString().equals(((Branch) other).toString());
108 }
109
110
111
112
113 public void finish() {
114 while (builder.length() < MAX_LENGTH) {
115 builder.append('0');
116 cachedString = null;
117 }
118 }
119
120 @Override
121 public int hashCode() {
122 return toString().hashCode();
123 }
124
125
126
127
128
129
130
131
132
133 public void processNextReplacement(final String replacement, final boolean forceAppend) {
134 final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend;
135
136 if (append && builder.length() < MAX_LENGTH) {
137 builder.append(replacement);
138
139 if (builder.length() > MAX_LENGTH) {
140 builder.delete(MAX_LENGTH, builder.length());
141 }
142 cachedString = null;
143 }
144
145 lastReplacement = replacement;
146 }
147
148 @Override
149 public String toString() {
150 if (cachedString == null) {
151 cachedString = builder.toString();
152 }
153 return cachedString;
154 }
155 }
156
157
158
159
160 private static final class Rule {
161 private final String pattern;
162 private final String[] replacementAtStart;
163 private final String[] replacementBeforeVowel;
164 private final String[] replacementDefault;
165
166 protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
167 final String replacementDefault) {
168 this.pattern = pattern;
169 this.replacementAtStart = replacementAtStart.split("\\|");
170 this.replacementBeforeVowel = replacementBeforeVowel.split("\\|");
171 this.replacementDefault = replacementDefault.split("\\|");
172 }
173
174 public int getPatternLength() {
175 return pattern.length();
176 }
177
178 public String[] getReplacements(final String context, final boolean atStart) {
179 if (atStart) {
180 return replacementAtStart;
181 }
182
183 final int nextIndex = getPatternLength();
184 final boolean nextCharIsVowel = nextIndex < context.length() && isVowel(context.charAt(nextIndex));
185 if (nextCharIsVowel) {
186 return replacementBeforeVowel;
187 }
188
189 return replacementDefault;
190 }
191
192 private boolean isVowel(final char ch) {
193 return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
194 }
195
196 public boolean matches(final String context) {
197 return context.startsWith(pattern);
198 }
199
200 @Override
201 public String toString() {
202 return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
203 Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
204 }
205 }
206
207 private static final String COMMENT = "//";
208
209 private static final String DOUBLE_QUOTE = "\"";
210
211 private static final String MULTILINE_COMMENT_END = "*/";
212
213 private static final String MULTILINE_COMMENT_START = "/*";
214
215
216 private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt";
217
218
219 private static final int MAX_LENGTH = 6;
220
221
222 private static final Map<Character, List<Rule>> RULES = new HashMap<>();
223
224
225 private static final Map<Character, Character> FOLDINGS = new HashMap<>();
226
227 static {
228 try (final Scanner scanner = new Scanner(Resources.getInputStream(RESOURCE_FILE), CharEncoding.UTF_8)) {
229 parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
230 }
231
232
233 RULES.forEach((k, v) -> v.sort((rule1, rule2) -> rule2.getPatternLength() - rule1.getPatternLength()));
234 }
235
236 private static void parseRules(final Scanner scanner, final String location,
237 final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) {
238 int currentLine = 0;
239 boolean inMultilineComment = false;
240
241 while (scanner.hasNextLine()) {
242 currentLine++;
243 final String rawLine = scanner.nextLine();
244 String line = rawLine;
245
246 if (inMultilineComment) {
247 if (line.endsWith(MULTILINE_COMMENT_END)) {
248 inMultilineComment = false;
249 }
250 continue;
251 }
252
253 if (line.startsWith(MULTILINE_COMMENT_START)) {
254 inMultilineComment = true;
255 } else {
256
257 final int cmtI = line.indexOf(COMMENT);
258 if (cmtI >= 0) {
259 line = line.substring(0, cmtI);
260 }
261
262
263 line = line.trim();
264
265 if (line.isEmpty()) {
266 continue;
267 }
268
269 if (line.contains("=")) {
270
271 final String[] parts = line.split("=");
272 if (parts.length != 2) {
273 throw new IllegalArgumentException("Malformed folding statement split into " + parts.length +
274 " parts: " + rawLine + " in " + location);
275 }
276 final String leftCharacter = parts[0];
277 final String rightCharacter = parts[1];
278
279 if (leftCharacter.length() != 1 || rightCharacter.length() != 1) {
280 throw new IllegalArgumentException("Malformed folding statement - " +
281 "patterns are not single characters: " + rawLine + " in " + location);
282 }
283
284 asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
285 } else {
286
287 final String[] parts = line.split("\\s+");
288 if (parts.length != 4) {
289 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
290 " parts: " + rawLine + " in " + location);
291 }
292 try {
293 final String pattern = stripQuotes(parts[0]);
294 final String replacement1 = stripQuotes(parts[1]);
295 final String replacement2 = stripQuotes(parts[2]);
296 final String replacement3 = stripQuotes(parts[3]);
297
298 final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
299 final char patternKey = r.pattern.charAt(0);
300 final List<Rule> rules = ruleMapping.computeIfAbsent(patternKey, k -> new ArrayList<>());
301 rules.add(r);
302 } catch (final IllegalArgumentException e) {
303 throw new IllegalStateException(
304 "Problem parsing line '" + currentLine + "' in " + location, e);
305 }
306 }
307 }
308 }
309 }
310
311 private static String stripQuotes(String str) {
312 if (str.startsWith(DOUBLE_QUOTE)) {
313 str = str.substring(1);
314 }
315
316 if (str.endsWith(DOUBLE_QUOTE)) {
317 str = str.substring(0, str.length() - 1);
318 }
319
320 return str;
321 }
322
323
324 private final boolean folding;
325
326
327
328
329 public DaitchMokotoffSoundex() {
330 this(true);
331 }
332
333
334
335
336
337
338
339
340
341
342
343 public DaitchMokotoffSoundex(final boolean folding) {
344 this.folding = folding;
345 }
346
347
348
349
350
351
352
353
354
355
356
357 private String cleanup(final String input) {
358 final StringBuilder sb = new StringBuilder();
359 for (char ch : input.toCharArray()) {
360 if (Character.isWhitespace(ch)) {
361 continue;
362 }
363
364 ch = Character.toLowerCase(ch);
365 final Character character = FOLDINGS.get(ch);
366 if (folding && character != null) {
367 ch = character;
368 }
369 sb.append(ch);
370 }
371 return sb.toString();
372 }
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392 @Override
393 public Object encode(final Object obj) throws EncoderException {
394 if (!(obj instanceof String)) {
395 throw new EncoderException(
396 "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
397 }
398 return encode((String) obj);
399 }
400
401
402
403
404
405
406
407
408
409
410
411
412 @Override
413 public String encode(final String source) {
414 if (source == null) {
415 return null;
416 }
417 return soundex(source, false)[0];
418 }
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443 public String soundex(final String source) {
444 final String[] branches = soundex(source, true);
445 final StringBuilder sb = new StringBuilder();
446 int index = 0;
447 for (final String branch : branches) {
448 sb.append(branch);
449 if (++index < branches.length) {
450 sb.append('|');
451 }
452 }
453 return sb.toString();
454 }
455
456
457
458
459
460
461
462
463
464
465
466 private String[] soundex(final String source, final boolean branching) {
467 if (source == null) {
468 return null;
469 }
470
471 final String input = cleanup(source);
472
473 final Set<Branch> currentBranches = new LinkedHashSet<>();
474 currentBranches.add(new Branch());
475
476 char lastChar = '\0';
477 for (int index = 0; index < input.length(); index++) {
478 final char ch = input.charAt(index);
479
480
481 if (Character.isWhitespace(ch)) {
482 continue;
483 }
484
485 final String inputContext = input.substring(index);
486 final List<Rule> rules = RULES.get(ch);
487 if (rules == null) {
488 continue;
489 }
490
491
492 final List<Branch> nextBranches = branching ? new ArrayList<>() : Collections.emptyList();
493
494 for (final Rule rule : rules) {
495 if (rule.matches(inputContext)) {
496 if (branching) {
497 nextBranches.clear();
498 }
499 final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
500 final boolean branchingRequired = replacements.length > 1 && branching;
501
502 for (final Branch branch : currentBranches) {
503 for (final String nextReplacement : replacements) {
504
505 final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;
506
507
508 final boolean force = lastChar == 'm' && ch == 'n' || lastChar == 'n' && ch == 'm';
509
510 nextBranch.processNextReplacement(nextReplacement, force);
511
512 if (!branching) {
513 break;
514 }
515 nextBranches.add(nextBranch);
516 }
517 }
518
519 if (branching) {
520 currentBranches.clear();
521 currentBranches.addAll(nextBranches);
522 }
523 index += rule.getPatternLength() - 1;
524 break;
525 }
526 }
527
528 lastChar = ch;
529 }
530
531 final String[] result = new String[currentBranches.size()];
532 int index = 0;
533 for (final Branch branch : currentBranches) {
534 branch.finish();
535 result[index++] = branch.toString();
536 }
537
538 return result;
539 }
540 }