1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.commons.codec.language.bm;
19
20 import java.util.ArrayList;
21 import java.util.Arrays;
22 import java.util.Collections;
23 import java.util.Comparator;
24 import java.util.EnumMap;
25 import java.util.HashMap;
26 import java.util.HashSet;
27 import java.util.List;
28 import java.util.Map;
29 import java.util.Scanner;
30 import java.util.Set;
31 import java.util.regex.Matcher;
32 import java.util.regex.Pattern;
33
34 import org.apache.commons.codec.Resources;
35 import org.apache.commons.codec.language.bm.Languages.LanguageSet;
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85 public class Rule {
86
87
88
89
90 public static final class Phoneme implements PhonemeExpr {
91
92
93
94
95 public static final Comparator<Phoneme> COMPARATOR = (o1, o2) -> {
96 final int o1Length = o1.phonemeText.length();
97 final int o2Length = o2.phonemeText.length();
98 for (int i = 0; i < o1Length; i++) {
99 if (i >= o2Length) {
100 return +1;
101 }
102 final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
103 if (c != 0) {
104 return c;
105 }
106 }
107
108 if (o1Length < o2Length) {
109 return -1;
110 }
111
112 return 0;
113 };
114
115 private final StringBuilder phonemeText;
116
117 private final Languages.LanguageSet languages;
118
119
120
121
122
123
124
125 public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
126 this.phonemeText = new StringBuilder(phonemeText);
127 this.languages = languages;
128 }
129
130
131
132
133
134
135
136 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) {
137 this(phonemeLeft.phonemeText, phonemeLeft.languages);
138 this.phonemeText.append(phonemeRight.phonemeText);
139 }
140
141
142
143
144
145
146
147
148 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) {
149 this(phonemeLeft.phonemeText, languages);
150 this.phonemeText.append(phonemeRight.phonemeText);
151 }
152
153
154
155
156
157
158
159 public Phoneme append(final CharSequence sequence) {
160 this.phonemeText.append(sequence);
161 return this;
162 }
163
164
165
166
167
168
169 public Languages.LanguageSet getLanguages() {
170 return this.languages;
171 }
172
173 @Override
174 public Iterable<Phoneme> getPhonemes() {
175 return Collections.singleton(this);
176 }
177
178
179
180
181
182
183 public CharSequence getPhonemeText() {
184 return this.phonemeText;
185 }
186
187
188
189
190
191
192
193
194 @Deprecated
195 public Phoneme join(final Phoneme right) {
196 return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
197 this.languages.restrictTo(right.languages));
198 }
199
200
201
202
203
204
205
206
207 public Phoneme mergeWithLanguage(final LanguageSet lang) {
208 return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang));
209 }
210
211 @Override
212 public int size() {
213 return 1;
214 }
215
216 @Override
217 public String toString() {
218 return phonemeText.toString() + "[" + languages + "]";
219 }
220 }
221
222
223
224
225 public interface PhonemeExpr {
226
227
228
229
230
231
232 Iterable<Phoneme> getPhonemes();
233
234
235
236
237
238
239
240 default int size() {
241
242 return (int) Math.min(getPhonemes().spliterator().getExactSizeIfKnown(), Integer.MAX_VALUE);
243 }
244 }
245
246
247
248
249 public static final class PhonemeList implements PhonemeExpr {
250
251 private final List<Phoneme> phonemeList;
252
253
254
255
256
257
258 public PhonemeList(final List<Phoneme> phonemes) {
259 this.phonemeList = phonemes;
260 }
261
262 @Override
263 public List<Phoneme> getPhonemes() {
264 return phonemeList;
265 }
266
267 @Override
268 public int size() {
269 return phonemeList.size();
270 }
271 }
272
273
274
275
/**
 * A minimal pattern abstraction: decides whether a character sequence matches.
 * <p>
 * This is a functional interface; instances in this file are created from lambdas.
 * </p>
 */
@FunctionalInterface
public interface RPattern {

    /**
     * Tests whether the input matches this pattern.
     *
     * @param input the character sequence to test
     * @return {@code true} if the input matches, {@code false} otherwise
     */
    boolean isMatch(CharSequence input);
}
286
287
288
289
/** A context pattern that accepts every input. */
public static final RPattern ALL_STRINGS_RMATCHER = input -> true;

/** The literal {@code "ALL"}. */
public static final String ALL = "ALL";

/** Quote character removed from rule fields by {@code stripQuotes}. */
private static final String DOUBLE_QUOTE = "\"";

/** Directive that pulls another rule resource into the one being parsed. */
private static final String HASH_INCLUDE = "#include";

/** Length of the include directive, used to cut it off the front of a line. */
private static final int HASH_INCLUDE_LENGTH = HASH_INCLUDE.length();

/**
 * All parsed rules: name type, then rule type, then language (or {@code "common"}), then the
 * first character of the rule pattern, mapping to the rules in parse order.
 */
private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES =
        new EnumMap<>(NameType.class);

// Eagerly loads every rule resource for every name type / rule type combination.
static {
    for (final NameType nameType : NameType.values()) {
        final Map<RuleType, Map<String, Map<String, List<Rule>>>> byRuleType =
                new EnumMap<>(RuleType.class);

        for (final RuleType ruleType : RuleType.values()) {
            final Map<String, Map<String, List<Rule>>> byLanguage = new HashMap<>();

            final Languages languages = Languages.getInstance(nameType);
            for (final String language : languages.getLanguages()) {
                try (Scanner scanner = createScanner(nameType, ruleType, language)) {
                    byLanguage.put(language,
                            parseRules(scanner, createResourceName(nameType, ruleType, language)));
                } catch (final IllegalStateException e) {
                    // Add the resource name so a parse failure can be traced to its file.
                    throw new IllegalStateException(
                            "Problem processing " + createResourceName(nameType, ruleType, language), e);
                }
            }

            // Every rule type except RULES additionally has a "common" resource.
            if (ruleType != RuleType.RULES) {
                try (Scanner scanner = createScanner(nameType, ruleType, "common")) {
                    byLanguage.put("common",
                            parseRules(scanner, createResourceName(nameType, ruleType, "common")));
                }
            }

            byRuleType.put(ruleType, Collections.unmodifiableMap(byLanguage));
        }

        RULES.put(nameType, Collections.unmodifiableMap(byRuleType));
    }
}
334
335 private static boolean contains(final CharSequence chars, final char input) {
336 return chars.chars().anyMatch(c -> c == input);
337 }
338
339 private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
340 return String.format("/org/apache/commons/codec/language/bm/%s_%s_%s.txt",
341 nameType.getName(), rt.getName(), lang);
342 }
343
344 @SuppressWarnings("resource")
345 private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
346 final String resName = createResourceName(nameType, rt, lang);
347 return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING);
348 }
349
350 @SuppressWarnings("resource")
351 private static Scanner createScanner(final String lang) {
352 final String resName = String.format("/org/apache/commons/codec/language/bm/%s.txt", lang);
353 return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING);
354 }
355
356 private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
357 final int suffixLength = suffix.length();
358 final int inputLength = input.length();
359
360 if (suffixLength > inputLength) {
361 return false;
362 }
363 for (int i = inputLength - 1, j = suffixLength - 1; j >= 0; i--, j--) {
364 if (input.charAt(i) != suffix.charAt(j)) {
365 return false;
366 }
367 }
368 return true;
369 }
370
371
372
373
374
375
376
377
378
379
380
381
382 public static List<Rule> getInstance(final NameType nameType, final RuleType rt,
383 final Languages.LanguageSet langs) {
384 final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs);
385 final List<Rule> allRules = new ArrayList<>();
386 ruleMap.values().forEach(rules -> allRules.addAll(rules));
387 return allRules;
388 }
389
390
391
392
393
394
395
396
397
398
399
400
401 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
402 return getInstance(nameType, rt, LanguageSet.from(new HashSet<>(Arrays.asList(lang))));
403 }
404
405
406
407
408
409
410
411
412
413
414
415
416
417 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
418 final Languages.LanguageSet langs) {
419 return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) :
420 getInstanceMap(nameType, rt, Languages.ANY);
421 }
422
423
424
425
426
427
428
429
430
431
432
433
434
435 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
436 final String lang) {
437 final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang);
438
439 if (rules == null) {
440 throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
441 nameType.getName(), rt.getName(), lang));
442 }
443
444 return rules;
445 }
446
447 private static Phoneme parsePhoneme(final String ph) {
448 final int open = ph.indexOf("[");
449 if (open >= 0) {
450 if (!ph.endsWith("]")) {
451 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
452 }
453 final String before = ph.substring(0, open);
454 final String in = ph.substring(open + 1, ph.length() - 1);
455 final Set<String> langs = new HashSet<>(Arrays.asList(in.split("[+]")));
456
457 return new Phoneme(before, Languages.LanguageSet.from(langs));
458 }
459 return new Phoneme(ph, Languages.ANY_LANGUAGE);
460 }
461
462 private static PhonemeExpr parsePhonemeExpr(final String ph) {
463 if (ph.startsWith("(")) {
464 if (!ph.endsWith(")")) {
465 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
466 }
467
468 final List<Phoneme> phs = new ArrayList<>();
469 final String body = ph.substring(1, ph.length() - 1);
470 for (final String part : body.split("[|]")) {
471 phs.add(parsePhoneme(part));
472 }
473 if (body.startsWith("|") || body.endsWith("|")) {
474 phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
475 }
476
477 return new PhonemeList(phs);
478 }
479 return parsePhoneme(ph);
480 }
481
482 private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) {
483 final Map<String, List<Rule>> lines = new HashMap<>();
484 int currentLine = 0;
485
486 boolean inMultilineComment = false;
487 while (scanner.hasNextLine()) {
488 currentLine++;
489 final String rawLine = scanner.nextLine();
490 String line = rawLine;
491
492 if (inMultilineComment) {
493 if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
494 inMultilineComment = false;
495 }
496 } else if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
497 inMultilineComment = true;
498 } else {
499
500 final int cmtI = line.indexOf(ResourceConstants.CMT);
501 if (cmtI >= 0) {
502 line = line.substring(0, cmtI);
503 }
504
505
506 line = line.trim();
507
508 if (line.isEmpty()) {
509 continue;
510 }
511
512 if (line.startsWith(HASH_INCLUDE)) {
513
514 final String incl = line.substring(HASH_INCLUDE_LENGTH).trim();
515 if (incl.contains(" ")) {
516 throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
517 location);
518 }
519 try (Scanner hashIncludeScanner = createScanner(incl)) {
520 lines.putAll(parseRules(hashIncludeScanner, location + "->" + incl));
521 }
522 } else {
523
524 final String[] parts = line.split("\\s+");
525 if (parts.length != 4) {
526 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
527 " parts: " + rawLine + " in " + location);
528 }
529 try {
530 final String pat = stripQuotes(parts[0]);
531 final String lCon = stripQuotes(parts[1]);
532 final String rCon = stripQuotes(parts[2]);
533 final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
534 final int cLine = currentLine;
535 final Rule r = new Rule(pat, lCon, rCon, ph) {
536 private final int myLine = cLine;
537 private final String loc = location;
538
539 @Override
540 public String toString() {
541 final StringBuilder sb = new StringBuilder();
542 sb.append("Rule");
543 sb.append("{line=").append(myLine);
544 sb.append(", loc='").append(loc).append('\'');
545 sb.append(", pat='").append(pat).append('\'');
546 sb.append(", lcon='").append(lCon).append('\'');
547 sb.append(", rcon='").append(rCon).append('\'');
548 sb.append('}');
549 return sb.toString();
550 }
551 };
552 final String patternKey = r.pattern.substring(0, 1);
553 final List<Rule> rules = lines.computeIfAbsent(patternKey, k -> new ArrayList<>());
554 rules.add(r);
555 } catch (final IllegalArgumentException e) {
556 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
557 location, e);
558 }
559 }
560 }
561 }
562
563 return lines;
564 }
565
566
567
568
569
570
571
572
573 private static RPattern pattern(final String regex) {
574 final boolean startsWith = regex.startsWith("^");
575 final boolean endsWith = regex.endsWith("$");
576 final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
577 final boolean boxes = content.contains("[");
578
579 if (!boxes) {
580 if (startsWith && endsWith) {
581
582 if (content.isEmpty()) {
583
584 return input -> input.length() == 0;
585 }
586 return input -> input.equals(content);
587 }
588 if ((startsWith || endsWith) && content.isEmpty()) {
589
590 return ALL_STRINGS_RMATCHER;
591 }
592 if (startsWith) {
593
594 return input -> startsWith(input, content);
595 }
596 if (endsWith) {
597
598 return input -> endsWith(input, content);
599 }
600 } else {
601 final boolean startsWithBox = content.startsWith("[");
602 final boolean endsWithBox = content.endsWith("]");
603
604 if (startsWithBox && endsWithBox) {
605 String boxContent = content.substring(1, content.length() - 1);
606 if (!boxContent.contains("[")) {
607
608 final boolean negate = boxContent.startsWith("^");
609 if (negate) {
610 boxContent = boxContent.substring(1);
611 }
612 final String bContent = boxContent;
613 final boolean shouldMatch = !negate;
614
615 if (startsWith && endsWith) {
616
617 return input -> input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
618 }
619 if (startsWith) {
620
621 return input -> input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
622 }
623 if (endsWith) {
624
625 return input -> input.length() > 0 &&
626 contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
627 }
628 }
629 }
630 }
631
632 return new RPattern() {
633 final Pattern pattern = Pattern.compile(regex);
634
635 @Override
636 public boolean isMatch(final CharSequence input) {
637 final Matcher matcher = pattern.matcher(input);
638 return matcher.find();
639 }
640 };
641 }
642
643 private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
644 if (prefix.length() > input.length()) {
645 return false;
646 }
647 for (int i = 0; i < prefix.length(); i++) {
648 if (input.charAt(i) != prefix.charAt(i)) {
649 return false;
650 }
651 }
652 return true;
653 }
654
655 private static String stripQuotes(String str) {
656 if (str.startsWith(DOUBLE_QUOTE)) {
657 str = str.substring(1);
658 }
659
660 if (str.endsWith(DOUBLE_QUOTE)) {
661 str = str.substring(0, str.length() - 1);
662 }
663
664 return str;
665 }
666
/** Matcher for the text to the left of the pattern. */
private final RPattern lContext;

/** The literal text this rule replaces. */
private final String pattern;

/** The phoneme expression this rule produces. */
private final PhonemeExpr phoneme;

/** Matcher for the text to the right of the pattern. */
private final RPattern rContext;

/**
 * Creates a rule.
 * <p>
 * The left context is compiled with {@code $} appended, so it has to match at the end of the
 * text before the pattern. The right context is compiled with {@code ^} prepended, so it has
 * to match at the start of the text after the pattern.
 * </p>
 *
 * @param pattern  the literal text to match
 * @param lContext regular expression for the text before the pattern
 * @param rContext regular expression for the text after the pattern
 * @param phoneme  the phoneme expression produced when the rule matches
 */
public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
    final String leftRegex = lContext + "$";
    final String rightRegex = "^" + rContext;

    this.pattern = pattern;
    this.phoneme = phoneme;
    this.lContext = pattern(leftRegex);
    this.rContext = pattern(rightRegex);
}
693
694
695
696
697
698
699 public RPattern getLContext() {
700 return this.lContext;
701 }
702
703
704
705
706
707
708 public String getPattern() {
709 return this.pattern;
710 }
711
712
713
714
715
716
717 public PhonemeExpr getPhoneme() {
718 return this.phoneme;
719 }
720
721
722
723
724
725
726 public RPattern getRContext() {
727 return this.rContext;
728 }
729
730
731
732
733
734
735
736
737
738
739
740
741 public boolean patternAndContextMatches(final CharSequence input, final int i) {
742 if (i < 0) {
743 throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
744 }
745
746 final int patternLength = this.pattern.length();
747 final int ipl = i + patternLength;
748
749 if (ipl > input.length()) {
750
751 return false;
752 }
753
754
755
756 if (!input.subSequence(i, ipl).equals(this.pattern)) {
757 return false;
758 }
759 if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
760 return false;
761 }
762 return this.lContext.isMatch(input.subSequence(0, i));
763 }
764 }