1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.commons.codec.language;
18
19 import java.util.ArrayList;
20 import java.util.Arrays;
21 import java.util.Collections;
22 import java.util.HashMap;
23 import java.util.LinkedHashSet;
24 import java.util.List;
25 import java.util.Map;
26 import java.util.Scanner;
27 import java.util.Set;
28
29 import org.apache.commons.codec.CharEncoding;
30 import org.apache.commons.codec.EncoderException;
31 import org.apache.commons.codec.Resources;
32 import org.apache.commons.codec.StringEncoder;
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69 public class DaitchMokotoffSoundex implements StringEncoder {
70
71
72
73
74 private static final class Branch {
75 private final StringBuilder builder;
76 private String cachedString;
77 private String lastReplacement;
78
79 private Branch() {
80 builder = new StringBuilder();
81 lastReplacement = null;
82 cachedString = null;
83 }
84
85
86
87
88
89
90 public Branch createBranch() {
91 final Branch branch = new Branch();
92 branch.builder.append(toString());
93 branch.lastReplacement = this.lastReplacement;
94 return branch;
95 }
96
97 @Override
98 public boolean equals(final Object other) {
99 if (this == other) {
100 return true;
101 }
102 if (!(other instanceof Branch)) {
103 return false;
104 }
105
106 return toString().equals(((Branch) other).toString());
107 }
108
109
110
111
112 public void finish() {
113 while (builder.length() < MAX_LENGTH) {
114 builder.append('0');
115 cachedString = null;
116 }
117 }
118
119 @Override
120 public int hashCode() {
121 return toString().hashCode();
122 }
123
124
125
126
127
128
129
130
131
132 public void processNextReplacement(final String replacement, final boolean forceAppend) {
133 final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend;
134
135 if (append && builder.length() < MAX_LENGTH) {
136 builder.append(replacement);
137
138 if (builder.length() > MAX_LENGTH) {
139 builder.delete(MAX_LENGTH, builder.length());
140 }
141 cachedString = null;
142 }
143
144 lastReplacement = replacement;
145 }
146
147 @Override
148 public String toString() {
149 if (cachedString == null) {
150 cachedString = builder.toString();
151 }
152 return cachedString;
153 }
154 }
155
156
157
158
159 private static final class Rule {
160 private final String pattern;
161 private final String[] replacementAtStart;
162 private final String[] replacementBeforeVowel;
163 private final String[] replacementDefault;
164
165 protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
166 final String replacementDefault) {
167 this.pattern = pattern;
168 this.replacementAtStart = replacementAtStart.split("\\|");
169 this.replacementBeforeVowel = replacementBeforeVowel.split("\\|");
170 this.replacementDefault = replacementDefault.split("\\|");
171 }
172
173 public int getPatternLength() {
174 return pattern.length();
175 }
176
177 public String[] getReplacements(final String context, final boolean atStart) {
178 if (atStart) {
179 return replacementAtStart;
180 }
181
182 final int nextIndex = getPatternLength();
183 final boolean nextCharIsVowel = nextIndex < context.length() && isVowel(context.charAt(nextIndex));
184 if (nextCharIsVowel) {
185 return replacementBeforeVowel;
186 }
187
188 return replacementDefault;
189 }
190
191 private boolean isVowel(final char ch) {
192 return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
193 }
194
195 public boolean matches(final String context) {
196 return context.startsWith(pattern);
197 }
198
199 @Override
200 public String toString() {
201 return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
202 Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
203 }
204 }
205
206 private static final String COMMENT = "//";
207
208 private static final String DOUBLE_QUOTE = "\"";
209
210 private static final String MULTILINE_COMMENT_END = "*/";
211
212 private static final String MULTILINE_COMMENT_START = "/*";
213
214
215 private static final String RESOURCE_FILE = "/org/apache/commons/codec/language/dmrules.txt";
216
217
218 private static final int MAX_LENGTH = 6;
219
220
221 private static final Map<Character, List<Rule>> RULES = new HashMap<>();
222
223
224 private static final Map<Character, Character> FOLDINGS = new HashMap<>();
225
226 static {
227 try (Scanner scanner = new Scanner(Resources.getInputStream(RESOURCE_FILE), CharEncoding.UTF_8)) {
228 parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
229 }
230
231
232 RULES.forEach((k, v) -> v.sort((rule1, rule2) -> rule2.getPatternLength() - rule1.getPatternLength()));
233 }
234
235 private static void parseRules(final Scanner scanner, final String location,
236 final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) {
237 int currentLine = 0;
238 boolean inMultilineComment = false;
239
240 while (scanner.hasNextLine()) {
241 currentLine++;
242 final String rawLine = scanner.nextLine();
243 String line = rawLine;
244
245 if (inMultilineComment) {
246 if (line.endsWith(MULTILINE_COMMENT_END)) {
247 inMultilineComment = false;
248 }
249 continue;
250 }
251
252 if (line.startsWith(MULTILINE_COMMENT_START)) {
253 inMultilineComment = true;
254 } else {
255
256 final int cmtI = line.indexOf(COMMENT);
257 if (cmtI >= 0) {
258 line = line.substring(0, cmtI);
259 }
260
261
262 line = line.trim();
263
264 if (line.isEmpty()) {
265 continue;
266 }
267
268 if (line.contains("=")) {
269
270 final String[] parts = line.split("=");
271 if (parts.length != 2) {
272 throw new IllegalArgumentException("Malformed folding statement split into " + parts.length +
273 " parts: " + rawLine + " in " + location);
274 }
275 final String leftCharacter = parts[0];
276 final String rightCharacter = parts[1];
277
278 if (leftCharacter.length() != 1 || rightCharacter.length() != 1) {
279 throw new IllegalArgumentException("Malformed folding statement - " +
280 "patterns are not single characters: " + rawLine + " in " + location);
281 }
282
283 asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
284 } else {
285
286 final String[] parts = line.split("\\s+");
287 if (parts.length != 4) {
288 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
289 " parts: " + rawLine + " in " + location);
290 }
291 try {
292 final String pattern = stripQuotes(parts[0]);
293 final String replacement1 = stripQuotes(parts[1]);
294 final String replacement2 = stripQuotes(parts[2]);
295 final String replacement3 = stripQuotes(parts[3]);
296
297 final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
298 final char patternKey = r.pattern.charAt(0);
299 final List<Rule> rules = ruleMapping.computeIfAbsent(patternKey, k -> new ArrayList<>());
300 rules.add(r);
301 } catch (final IllegalArgumentException e) {
302 throw new IllegalStateException(
303 "Problem parsing line '" + currentLine + "' in " + location, e);
304 }
305 }
306 }
307 }
308 }
309
310 private static String stripQuotes(String str) {
311 if (str.startsWith(DOUBLE_QUOTE)) {
312 str = str.substring(1);
313 }
314
315 if (str.endsWith(DOUBLE_QUOTE)) {
316 str = str.substring(0, str.length() - 1);
317 }
318
319 return str;
320 }
321
322
323 private final boolean folding;
324
325
326
327
328 public DaitchMokotoffSoundex() {
329 this(true);
330 }
331
332
333
334
335
336
337
338
339
340
341
342 public DaitchMokotoffSoundex(final boolean folding) {
343 this.folding = folding;
344 }
345
346
347
348
349
350
351
352
353
354
355
356 private String cleanup(final String input) {
357 final StringBuilder sb = new StringBuilder();
358 for (char ch : input.toCharArray()) {
359 if (Character.isWhitespace(ch)) {
360 continue;
361 }
362
363 ch = Character.toLowerCase(ch);
364 final Character character = FOLDINGS.get(ch);
365 if (folding && character != null) {
366 ch = character;
367 }
368 sb.append(ch);
369 }
370 return sb.toString();
371 }
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390 @Override
391 public Object encode(final Object obj) throws EncoderException {
392 if (!(obj instanceof String)) {
393 throw new EncoderException(
394 "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
395 }
396 return encode((String) obj);
397 }
398
399
400
401
402
403
404
405
406
407
408
409 @Override
410 public String encode(final String source) {
411 if (source == null) {
412 return null;
413 }
414 return soundex(source, false)[0];
415 }
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440 public String soundex(final String source) {
441 return String.join("|", soundex(source, true));
442 }
443
444
445
446
447
448
449
450
451
452
453
454 private String[] soundex(final String source, final boolean branching) {
455 if (source == null) {
456 return null;
457 }
458
459 final String input = cleanup(source);
460
461 final Set<Branch> currentBranches = new LinkedHashSet<>();
462 currentBranches.add(new Branch());
463
464 char lastChar = '\0';
465 for (int index = 0; index < input.length(); index++) {
466 final char ch = input.charAt(index);
467
468
469 if (Character.isWhitespace(ch)) {
470 continue;
471 }
472
473 final String inputContext = input.substring(index);
474 final List<Rule> rules = RULES.get(ch);
475 if (rules == null) {
476 continue;
477 }
478
479
480 final List<Branch> nextBranches = branching ? new ArrayList<>() : Collections.emptyList();
481
482 for (final Rule rule : rules) {
483 if (rule.matches(inputContext)) {
484 if (branching) {
485 nextBranches.clear();
486 }
487 final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
488 final boolean branchingRequired = replacements.length > 1 && branching;
489
490 for (final Branch branch : currentBranches) {
491 for (final String nextReplacement : replacements) {
492
493 final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;
494
495
496 final boolean force = lastChar == 'm' && ch == 'n' || lastChar == 'n' && ch == 'm';
497
498 nextBranch.processNextReplacement(nextReplacement, force);
499
500 if (!branching) {
501 break;
502 }
503 nextBranches.add(nextBranch);
504 }
505 }
506
507 if (branching) {
508 currentBranches.clear();
509 currentBranches.addAll(nextBranches);
510 }
511 index += rule.getPatternLength() - 1;
512 break;
513 }
514 }
515
516 lastChar = ch;
517 }
518
519 final String[] result = new String[currentBranches.size()];
520 int index = 0;
521 for (final Branch branch : currentBranches) {
522 branch.finish();
523 result[index++] = branch.toString();
524 }
525
526 return result;
527 }
528 }