1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.commons.codec.language.bm;
19
20 import java.util.ArrayList;
21 import java.util.Arrays;
22 import java.util.Collections;
23 import java.util.Comparator;
24 import java.util.EnumMap;
25 import java.util.HashMap;
26 import java.util.HashSet;
27 import java.util.List;
28 import java.util.Map;
29 import java.util.Scanner;
30 import java.util.Set;
31 import java.util.regex.Matcher;
32 import java.util.regex.Pattern;
33
34 import org.apache.commons.codec.Resources;
35 import org.apache.commons.codec.language.bm.Languages.LanguageSet;
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85 public class Rule {
86
87 public static final class Phoneme implements PhonemeExpr {
88
89 public static final Comparator<Phoneme> COMPARATOR = (o1, o2) -> {
90 final int o1Length = o1.phonemeText.length();
91 final int o2Length = o2.phonemeText.length();
92 for (int i = 0; i < o1Length; i++) {
93 if (i >= o2Length) {
94 return +1;
95 }
96 final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
97 if (c != 0) {
98 return c;
99 }
100 }
101
102 if (o1Length < o2Length) {
103 return -1;
104 }
105
106 return 0;
107 };
108 private final StringBuilder phonemeText;
109
110 private final Languages.LanguageSet languages;
111
112 public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
113 this.phonemeText = new StringBuilder(phonemeText);
114 this.languages = languages;
115 }
116
117 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) {
118 this(phonemeLeft.phonemeText, phonemeLeft.languages);
119 this.phonemeText.append(phonemeRight.phonemeText);
120 }
121
122 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) {
123 this(phonemeLeft.phonemeText, languages);
124 this.phonemeText.append(phonemeRight.phonemeText);
125 }
126
127 public Phoneme append(final CharSequence str) {
128 this.phonemeText.append(str);
129 return this;
130 }
131
132 public Languages.LanguageSet getLanguages() {
133 return this.languages;
134 }
135
136 @Override
137 public Iterable<Phoneme> getPhonemes() {
138 return Collections.singleton(this);
139 }
140
141 public CharSequence getPhonemeText() {
142 return this.phonemeText;
143 }
144
145
146
147
148
149
150
151
152 @Deprecated
153 public Phoneme join(final Phoneme right) {
154 return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
155 this.languages.restrictTo(right.languages));
156 }
157
158
159
160
161
162
163
164
165 public Phoneme mergeWithLanguage(final LanguageSet lang) {
166 return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang));
167 }
168
169 @Override
170 public int size() {
171 return 1;
172 }
173
174 @Override
175 public String toString() {
176 return phonemeText.toString() + "[" + languages + "]";
177 }
178 }
179
180 public interface PhonemeExpr {
181 Iterable<Phoneme> getPhonemes();
182
183
184
185
186
187
188
189 default int size() {
190
191 return (int) Math.min(getPhonemes().spliterator().getExactSizeIfKnown(), Integer.MAX_VALUE);
192 }
193 }
194
195 public static final class PhonemeList implements PhonemeExpr {
196
197 private final List<Phoneme> phonemeList;
198
199 public PhonemeList(final List<Phoneme> phonemes) {
200 this.phonemeList = phonemes;
201 }
202
203 @Override
204 public List<Phoneme> getPhonemes() {
205 return phonemeList;
206 }
207
208 @Override
209 public int size() {
210 return phonemeList.size();
211 }
212 }
213
214
215
216
217 public interface RPattern {
218 boolean isMatch(CharSequence input);
219 }
220
221 public static final RPattern ALL_STRINGS_RMATCHER = input -> true;
222
223 public static final String ALL = "ALL";
224
225 private static final String DOUBLE_QUOTE = "\"";
226
227 private static final String HASH_INCLUDE = "#include";
228
229 private static final int HASH_INCLUDE_LENGTH = HASH_INCLUDE.length();
230
231 private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES =
232 new EnumMap<>(NameType.class);
233
234 static {
235 for (final NameType s : NameType.values()) {
236 final Map<RuleType, Map<String, Map<String, List<Rule>>>> rts =
237 new EnumMap<>(RuleType.class);
238
239 for (final RuleType rt : RuleType.values()) {
240 final Map<String, Map<String, List<Rule>>> rs = new HashMap<>();
241
242 final Languages ls = Languages.getInstance(s);
243 ls.getLanguages().forEach(l -> {
244 try (final Scanner scanner = createScanner(s, rt, l)) {
245 rs.put(l, parseRules(scanner, createResourceName(s, rt, l)));
246 } catch (final IllegalStateException e) {
247 throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
248 }
249 });
250 if (!rt.equals(RuleType.RULES)) {
251 try (final Scanner scanner = createScanner(s, rt, "common")) {
252 rs.put("common", parseRules(scanner, createResourceName(s, rt, "common")));
253 }
254 }
255
256 rts.put(rt, Collections.unmodifiableMap(rs));
257 }
258
259 RULES.put(s, Collections.unmodifiableMap(rts));
260 }
261 }
262
263 private static boolean contains(final CharSequence chars, final char input) {
264 return chars.chars().anyMatch(c -> c == input);
265 }
266
267 private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
268 return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt",
269 nameType.getName(), rt.getName(), lang);
270 }
271
272 @SuppressWarnings("resource")
273 private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
274 final String resName = createResourceName(nameType, rt, lang);
275 return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING);
276 }
277
278 @SuppressWarnings("resource")
279 private static Scanner createScanner(final String lang) {
280 final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang);
281 return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING);
282 }
283
284 private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
285 final int suffixLength = suffix.length();
286 final int inputLength = input.length();
287
288 if (suffixLength > inputLength) {
289 return false;
290 }
291 for (int i = inputLength - 1, j = suffixLength - 1; j >= 0; i--, j--) {
292 if (input.charAt(i) != suffix.charAt(j)) {
293 return false;
294 }
295 }
296 return true;
297 }
298
299
300
301
302
303
304
305
306
307
308
309
310 public static List<Rule> getInstance(final NameType nameType, final RuleType rt,
311 final Languages.LanguageSet langs) {
312 final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs);
313 final List<Rule> allRules = new ArrayList<>();
314 ruleMap.values().forEach(rules -> allRules.addAll(rules));
315 return allRules;
316 }
317
318
319
320
321
322
323
324
325
326
327
328
329 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
330 return getInstance(nameType, rt, LanguageSet.from(new HashSet<>(Arrays.asList(lang))));
331 }
332
333
334
335
336
337
338
339
340
341
342
343
344
345 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
346 final Languages.LanguageSet langs) {
347 return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) :
348 getInstanceMap(nameType, rt, Languages.ANY);
349 }
350
351
352
353
354
355
356
357
358
359
360
361
362
363 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
364 final String lang) {
365 final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang);
366
367 if (rules == null) {
368 throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
369 nameType.getName(), rt.getName(), lang));
370 }
371
372 return rules;
373 }
374
375 private static Phoneme parsePhoneme(final String ph) {
376 final int open = ph.indexOf("[");
377 if (open >= 0) {
378 if (!ph.endsWith("]")) {
379 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
380 }
381 final String before = ph.substring(0, open);
382 final String in = ph.substring(open + 1, ph.length() - 1);
383 final Set<String> langs = new HashSet<>(Arrays.asList(in.split("[+]")));
384
385 return new Phoneme(before, Languages.LanguageSet.from(langs));
386 }
387 return new Phoneme(ph, Languages.ANY_LANGUAGE);
388 }
389
390 private static PhonemeExpr parsePhonemeExpr(final String ph) {
391 if (ph.startsWith("(")) {
392 if (!ph.endsWith(")")) {
393 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
394 }
395
396 final List<Phoneme> phs = new ArrayList<>();
397 final String body = ph.substring(1, ph.length() - 1);
398 for (final String part : body.split("[|]")) {
399 phs.add(parsePhoneme(part));
400 }
401 if (body.startsWith("|") || body.endsWith("|")) {
402 phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
403 }
404
405 return new PhonemeList(phs);
406 }
407 return parsePhoneme(ph);
408 }
409
410 private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) {
411 final Map<String, List<Rule>> lines = new HashMap<>();
412 int currentLine = 0;
413
414 boolean inMultilineComment = false;
415 while (scanner.hasNextLine()) {
416 currentLine++;
417 final String rawLine = scanner.nextLine();
418 String line = rawLine;
419
420 if (inMultilineComment) {
421 if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
422 inMultilineComment = false;
423 }
424 } else if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
425 inMultilineComment = true;
426 } else {
427
428 final int cmtI = line.indexOf(ResourceConstants.CMT);
429 if (cmtI >= 0) {
430 line = line.substring(0, cmtI);
431 }
432
433
434 line = line.trim();
435
436 if (line.isEmpty()) {
437 continue;
438 }
439
440 if (line.startsWith(HASH_INCLUDE)) {
441
442 final String incl = line.substring(HASH_INCLUDE_LENGTH).trim();
443 if (incl.contains(" ")) {
444 throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
445 location);
446 }
447 try (final Scanner hashIncludeScanner = createScanner(incl)) {
448 lines.putAll(parseRules(hashIncludeScanner, location + "->" + incl));
449 }
450 } else {
451
452 final String[] parts = line.split("\\s+");
453 if (parts.length != 4) {
454 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
455 " parts: " + rawLine + " in " + location);
456 }
457 try {
458 final String pat = stripQuotes(parts[0]);
459 final String lCon = stripQuotes(parts[1]);
460 final String rCon = stripQuotes(parts[2]);
461 final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
462 final int cLine = currentLine;
463 final Rule r = new Rule(pat, lCon, rCon, ph) {
464 private final int myLine = cLine;
465 private final String loc = location;
466
467 @Override
468 public String toString() {
469 final StringBuilder sb = new StringBuilder();
470 sb.append("Rule");
471 sb.append("{line=").append(myLine);
472 sb.append(", loc='").append(loc).append('\'');
473 sb.append(", pat='").append(pat).append('\'');
474 sb.append(", lcon='").append(lCon).append('\'');
475 sb.append(", rcon='").append(rCon).append('\'');
476 sb.append('}');
477 return sb.toString();
478 }
479 };
480 final String patternKey = r.pattern.substring(0, 1);
481 final List<Rule> rules = lines.computeIfAbsent(patternKey, k -> new ArrayList<>());
482 rules.add(r);
483 } catch (final IllegalArgumentException e) {
484 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
485 location, e);
486 }
487 }
488 }
489 }
490
491 return lines;
492 }
493
494
495
496
497
498
499
500
501 private static RPattern pattern(final String regex) {
502 final boolean startsWith = regex.startsWith("^");
503 final boolean endsWith = regex.endsWith("$");
504 final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
505 final boolean boxes = content.contains("[");
506
507 if (!boxes) {
508 if (startsWith && endsWith) {
509
510 if (content.isEmpty()) {
511
512 return input -> input.length() == 0;
513 }
514 return input -> input.equals(content);
515 }
516 if ((startsWith || endsWith) && content.isEmpty()) {
517
518 return ALL_STRINGS_RMATCHER;
519 }
520 if (startsWith) {
521
522 return input -> startsWith(input, content);
523 }
524 if (endsWith) {
525
526 return input -> endsWith(input, content);
527 }
528 } else {
529 final boolean startsWithBox = content.startsWith("[");
530 final boolean endsWithBox = content.endsWith("]");
531
532 if (startsWithBox && endsWithBox) {
533 String boxContent = content.substring(1, content.length() - 1);
534 if (!boxContent.contains("[")) {
535
536 final boolean negate = boxContent.startsWith("^");
537 if (negate) {
538 boxContent = boxContent.substring(1);
539 }
540 final String bContent = boxContent;
541 final boolean shouldMatch = !negate;
542
543 if (startsWith && endsWith) {
544
545 return input -> input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
546 }
547 if (startsWith) {
548
549 return input -> input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
550 }
551 if (endsWith) {
552
553 return input -> input.length() > 0 &&
554 contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
555 }
556 }
557 }
558 }
559
560 return new RPattern() {
561 final Pattern pattern = Pattern.compile(regex);
562
563 @Override
564 public boolean isMatch(final CharSequence input) {
565 final Matcher matcher = pattern.matcher(input);
566 return matcher.find();
567 }
568 };
569 }
570
571 private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
572 if (prefix.length() > input.length()) {
573 return false;
574 }
575 for (int i = 0; i < prefix.length(); i++) {
576 if (input.charAt(i) != prefix.charAt(i)) {
577 return false;
578 }
579 }
580 return true;
581 }
582
583 private static String stripQuotes(String str) {
584 if (str.startsWith(DOUBLE_QUOTE)) {
585 str = str.substring(1);
586 }
587
588 if (str.endsWith(DOUBLE_QUOTE)) {
589 str = str.substring(0, str.length() - 1);
590 }
591
592 return str;
593 }
594
595 private final RPattern lContext;
596
597 private final String pattern;
598
599 private final PhonemeExpr phoneme;
600
601 private final RPattern rContext;
602
603
604
605
606
607
608
609
610
611
612
613
614
615 public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
616 this.pattern = pattern;
617 this.lContext = pattern(lContext + "$");
618 this.rContext = pattern("^" + rContext);
619 this.phoneme = phoneme;
620 }
621
622
623
624
625
626
627 public RPattern getLContext() {
628 return this.lContext;
629 }
630
631
632
633
634
635
636 public String getPattern() {
637 return this.pattern;
638 }
639
640
641
642
643
644
645 public PhonemeExpr getPhoneme() {
646 return this.phoneme;
647 }
648
649
650
651
652
653
654 public RPattern getRContext() {
655 return this.rContext;
656 }
657
658
659
660
661
662
663
664
665
666
667
668
669 public boolean patternAndContextMatches(final CharSequence input, final int i) {
670 if (i < 0) {
671 throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
672 }
673
674 final int patternLength = this.pattern.length();
675 final int ipl = i + patternLength;
676
677 if (ipl > input.length()) {
678
679 return false;
680 }
681
682
683
684 if (!input.subSequence(i, ipl).equals(this.pattern)) {
685 return false;
686 }
687 if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
688 return false;
689 }
690 return this.lContext.isMatch(input.subSequence(0, i));
691 }
692 }