001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language.bm; 019 020import java.util.ArrayList; 021import java.util.Arrays; 022import java.util.Collections; 023import java.util.Comparator; 024import java.util.EnumMap; 025import java.util.HashMap; 026import java.util.HashSet; 027import java.util.List; 028import java.util.Map; 029import java.util.Scanner; 030import java.util.Set; 031import java.util.regex.Matcher; 032import java.util.regex.Pattern; 033 034import org.apache.commons.codec.Resources; 035import org.apache.commons.codec.language.bm.Languages.LanguageSet; 036 037/** 038 * A phoneme rule. 039 * <p> 040 * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply 041 * and a logical flag indicating if all languages must be in play. A rule matches if: 042 * </p> 043 * <ul> 044 * <li>the pattern matches at the current position</li> 045 * <li>the string up until the beginning of the pattern matches the left context</li> 046 * <li>the string from the end of the pattern matches the right context</li> 047 * <li>logical is ALL and all languages are in scope; or</li> 048 * <li>logical is any other value and at least one language is in scope</li> 049 * </ul> 050 * <p> 051 * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user 052 * to explicitly construct their own. 053 * </p> 054 * <p> 055 * Rules are immutable and thread-safe. 056 * </p> 057 * <h2>Rules resources</h2> 058 * <p> 059 * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically 060 * named following the pattern: 061 * </p> 062 * <blockquote>/org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote> 063 * <p> 064 * The format of these resources is the following: 065 * </p> 066 * <ul> 067 * <li><strong>Rules:</strong> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these 068 * will be interpreted as: 069 * <ol> 070 * <li>pattern</li> 071 * <li>left context</li> 072 * <li>right context</li> 073 * <li>phoneme</li> 074 * </ol> 075 * </li> 076 * <li><strong>End-of-line comments:</strong> Any occurrence of '//' will cause all text following on that line to be discarded 077 * as a comment.</li> 078 * <li><strong>Multi-line comments:</strong> Any line starting with '/*' will start multi-line commenting mode. This will skip 079 * all content until a line ending in '*' and '/' is found.</li> 080 * <li><strong>Blank lines:</strong> All blank lines will be skipped.</li> 081 * </ul> 082 * 083 * @since 1.6 084 */ 085public class Rule { 086 087 /** 088 * A phoneme. 089 */ 090 public static final class Phoneme implements PhonemeExpr { 091 092 /** 093 * The Phoneme Comparator. 094 */ 095 public static final Comparator<Phoneme> COMPARATOR = (o1, o2) -> { 096 final int o1Length = o1.phonemeText.length(); 097 final int o2Length = o2.phonemeText.length(); 098 for (int i = 0; i < o1Length; i++) { 099 if (i >= o2Length) { 100 return +1; 101 } 102 final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i); 103 if (c != 0) { 104 return c; 105 } 106 } 107 108 if (o1Length < o2Length) { 109 return -1; 110 } 111 112 return 0; 113 }; 114 115 private final StringBuilder phonemeText; 116 117 private final Languages.LanguageSet languages; 118 119 /** 120 * Constructs a new instance. 121 * 122 * @param phonemeText The phoneme text. 123 * @param languages A language set. 124 */ 125 public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) { 126 this.phonemeText = new StringBuilder(phonemeText); 127 this.languages = languages; 128 } 129 130 /** 131 * Constructs a new instance. 132 * 133 * @param phonemeLeft The left phoneme text. 134 * @param phonemeRight The right phoneme text. 135 */ 136 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) { 137 this(phonemeLeft.phonemeText, phonemeLeft.languages); 138 this.phonemeText.append(phonemeRight.phonemeText); 139 } 140 141 /** 142 * Constructs a new instance. 143 * 144 * @param phonemeLeft The left phoneme text. 145 * @param phonemeRight The right phoneme text. 146 * @param languages A language set. 147 */ 148 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) { 149 this(phonemeLeft.phonemeText, languages); 150 this.phonemeText.append(phonemeRight.phonemeText); 151 } 152 153 /** 154 * Appends the sequence to the phone text. 155 * 156 * @param sequence The sequence to append. 157 * @return this instance. 158 */ 159 public Phoneme append(final CharSequence sequence) { 160 this.phonemeText.append(sequence); 161 return this; 162 } 163 164 /** 165 * Gets the language set. 166 * 167 * @return the language set. 168 */ 169 public Languages.LanguageSet getLanguages() { 170 return this.languages; 171 } 172 173 @Override 174 public Iterable<Phoneme> getPhonemes() { 175 return Collections.singleton(this); 176 } 177 178 /** 179 * Gets the phoneme text sequence. 180 * 181 * @return the phoneme text sequence. 182 */ 183 public CharSequence getPhonemeText() { 184 return this.phonemeText; 185 } 186 187 /** 188 * Deprecated since 1.9. 189 * 190 * @param right the Phoneme to join 191 * @return a new Phoneme 192 * @deprecated since 1.9 193 */ 194 @Deprecated 195 public Phoneme join(final Phoneme right) { 196 return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(), 197 this.languages.restrictTo(right.languages)); 198 } 199 200 /** 201 * Returns a new Phoneme with the same text but a union of its 202 * current language set and the given one. 203 * 204 * @param lang the language set to merge 205 * @return a new Phoneme 206 */ 207 public Phoneme mergeWithLanguage(final LanguageSet lang) { 208 return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang)); 209 } 210 211 @Override 212 public int size() { 213 return 1; 214 } 215 216 @Override 217 public String toString() { 218 return phonemeText.toString() + "[" + languages + "]"; 219 } 220 } 221 222 /** 223 * A phoneme expression. 224 */ 225 public interface PhonemeExpr { 226 227 /** 228 * Gets an iteration of phonemes. 229 * 230 * @return an iteration of phonemes. 231 */ 232 Iterable<Phoneme> getPhonemes(); 233 234 /** 235 * Gets the expression size in phonemes. 236 * 237 * @return the expression size in phonemes. 238 * @since 1.17.0 239 */ 240 default int size() { 241 // All implementations are int-bound. 242 return (int) Math.min(getPhonemes().spliterator().getExactSizeIfKnown(), Integer.MAX_VALUE); 243 } 244 } 245 246 /** 247 * A list of phonemes. 248 */ 249 public static final class PhonemeList implements PhonemeExpr { 250 251 private final List<Phoneme> phonemeList; 252 253 /** 254 * Constructs a new instance. 255 * 256 * @param phonemes the phoneme list. 257 */ 258 public PhonemeList(final List<Phoneme> phonemes) { 259 this.phonemeList = phonemes; 260 } 261 262 @Override 263 public List<Phoneme> getPhonemes() { 264 return phonemeList; 265 } 266 267 @Override 268 public int size() { 269 return phonemeList.size(); 270 } 271 } 272 273 /** 274 * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations. 275 */ 276 public interface RPattern { 277 278 /** 279 * Tests whether the given input matches this instance. 280 * 281 * @param input the input to test. 282 * @return whether the given input matches this instance. 283 */ 284 boolean isMatch(CharSequence input); 285 } 286 287 /** 288 * Always matches. 289 */ 290 public static final RPattern ALL_STRINGS_RMATCHER = input -> true; 291 292 /** 293 * Unused. 294 */ 295 public static final String ALL = "ALL"; 296 297 private static final String DOUBLE_QUOTE = "\""; 298 299 private static final String HASH_INCLUDE = "#include"; 300 301 private static final int HASH_INCLUDE_LENGTH = HASH_INCLUDE.length(); 302 303 private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES = 304 new EnumMap<>(NameType.class); 305 306 static { 307 for (final NameType s : NameType.values()) { 308 final Map<RuleType, Map<String, Map<String, List<Rule>>>> rts = 309 new EnumMap<>(RuleType.class); 310 311 for (final RuleType rt : RuleType.values()) { 312 final Map<String, Map<String, List<Rule>>> rs = new HashMap<>(); 313 314 final Languages ls = Languages.getInstance(s); 315 ls.getLanguages().forEach(l -> { 316 try (Scanner scanner = createScanner(s, rt, l)) { 317 rs.put(l, parseRules(scanner, createResourceName(s, rt, l))); 318 } catch (final IllegalStateException e) { 319 throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e); 320 } 321 }); 322 if (!rt.equals(RuleType.RULES)) { 323 try (Scanner scanner = createScanner(s, rt, "common")) { 324 rs.put("common", parseRules(scanner, createResourceName(s, rt, "common"))); 325 } 326 } 327 328 rts.put(rt, Collections.unmodifiableMap(rs)); 329 } 330 331 RULES.put(s, Collections.unmodifiableMap(rts)); 332 } 333 } 334 335 private static boolean contains(final CharSequence chars, final char input) { 336 return chars.chars().anyMatch(c -> c == input); 337 } 338 339 private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) { 340 return String.format("/org/apache/commons/codec/language/bm/%s_%s_%s.txt", 341 nameType.getName(), rt.getName(), lang); 342 } 343 344 @SuppressWarnings("resource") // Closing the Scanner closes the resource 345 private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) { 346 final String resName = createResourceName(nameType, rt, lang); 347 return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING); 348 } 349 350 @SuppressWarnings("resource") // Closing the Scanner closes the resource 351 private static Scanner createScanner(final String lang) { 352 final String resName = String.format("/org/apache/commons/codec/language/bm/%s.txt", lang); 353 return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING); 354 } 355 356 private static boolean endsWith(final CharSequence input, final CharSequence suffix) { 357 final int suffixLength = suffix.length(); 358 final int inputLength = input.length(); 359 360 if (suffixLength > inputLength) { 361 return false; 362 } 363 for (int i = inputLength - 1, j = suffixLength - 1; j >= 0; i--, j--) { 364 if (input.charAt(i) != suffix.charAt(j)) { 365 return false; 366 } 367 } 368 return true; 369 } 370 371 /** 372 * Gets rules for a combination of name type, rule type and languages. 373 * 374 * @param nameType 375 * the NameType to consider 376 * @param rt 377 * the RuleType to consider 378 * @param langs 379 * the set of languages to consider 380 * @return a list of Rules that apply 381 */ 382 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, 383 final Languages.LanguageSet langs) { 384 final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs); 385 final List<Rule> allRules = new ArrayList<>(); 386 ruleMap.values().forEach(rules -> allRules.addAll(rules)); 387 return allRules; 388 } 389 390 /** 391 * Gets rules for a combination of name type, rule type and a single language. 392 * 393 * @param nameType 394 * the NameType to consider 395 * @param rt 396 * the RuleType to consider 397 * @param lang 398 * the language to consider 399 * @return a list of Rules that apply 400 */ 401 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) { 402 return getInstance(nameType, rt, LanguageSet.from(new HashSet<>(Arrays.asList(lang)))); 403 } 404 405 /** 406 * Gets rules for a combination of name type, rule type and languages. 407 * 408 * @param nameType 409 * the NameType to consider 410 * @param rt 411 * the RuleType to consider 412 * @param langs 413 * the set of languages to consider 414 * @return a map containing all Rules that apply, grouped by the first character of the rule pattern 415 * @since 1.9 416 */ 417 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, 418 final Languages.LanguageSet langs) { 419 return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) : 420 getInstanceMap(nameType, rt, Languages.ANY); 421 } 422 423 /** 424 * Gets rules for a combination of name type, rule type and a single language. 425 * 426 * @param nameType 427 * the NameType to consider 428 * @param rt 429 * the RuleType to consider 430 * @param lang 431 * the language to consider 432 * @return a map containing all Rules that apply, grouped by the first character of the rule pattern 433 * @since 1.9 434 */ 435 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, 436 final String lang) { 437 final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang); 438 439 if (rules == null) { 440 throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.", 441 nameType.getName(), rt.getName(), lang)); 442 } 443 444 return rules; 445 } 446 447 private static Phoneme parsePhoneme(final String ph) { 448 final int open = ph.indexOf("["); 449 if (open >= 0) { 450 if (!ph.endsWith("]")) { 451 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'"); 452 } 453 final String before = ph.substring(0, open); 454 final String in = ph.substring(open + 1, ph.length() - 1); 455 final Set<String> langs = new HashSet<>(Arrays.asList(in.split("[+]"))); 456 457 return new Phoneme(before, Languages.LanguageSet.from(langs)); 458 } 459 return new Phoneme(ph, Languages.ANY_LANGUAGE); 460 } 461 462 private static PhonemeExpr parsePhonemeExpr(final String ph) { 463 if (ph.startsWith("(")) { // we have a bracketed list of options 464 if (!ph.endsWith(")")) { 465 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'"); 466 } 467 468 final List<Phoneme> phs = new ArrayList<>(); 469 final String body = ph.substring(1, ph.length() - 1); 470 for (final String part : body.split("[|]")) { 471 phs.add(parsePhoneme(part)); 472 } 473 if (body.startsWith("|") || body.endsWith("|")) { 474 phs.add(new Phoneme("", Languages.ANY_LANGUAGE)); 475 } 476 477 return new PhonemeList(phs); 478 } 479 return parsePhoneme(ph); 480 } 481 482 private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) { 483 final Map<String, List<Rule>> lines = new HashMap<>(); 484 int currentLine = 0; 485 486 boolean inMultilineComment = false; 487 while (scanner.hasNextLine()) { 488 currentLine++; 489 final String rawLine = scanner.nextLine(); 490 String line = rawLine; 491 492 if (inMultilineComment) { 493 if (line.endsWith(ResourceConstants.EXT_CMT_END)) { 494 inMultilineComment = false; 495 } 496 } else if (line.startsWith(ResourceConstants.EXT_CMT_START)) { 497 inMultilineComment = true; 498 } else { 499 // discard comments 500 final int cmtI = line.indexOf(ResourceConstants.CMT); 501 if (cmtI >= 0) { 502 line = line.substring(0, cmtI); 503 } 504 505 // trim leading-trailing whitespace 506 line = line.trim(); 507 508 if (line.isEmpty()) { 509 continue; // empty lines can be safely skipped 510 } 511 512 if (line.startsWith(HASH_INCLUDE)) { 513 // include statement 514 final String incl = line.substring(HASH_INCLUDE_LENGTH).trim(); 515 if (incl.contains(" ")) { 516 throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " + 517 location); 518 } 519 try (Scanner hashIncludeScanner = createScanner(incl)) { 520 lines.putAll(parseRules(hashIncludeScanner, location + "->" + incl)); 521 } 522 } else { 523 // rule 524 final String[] parts = line.split("\\s+"); 525 if (parts.length != 4) { 526 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + 527 " parts: " + rawLine + " in " + location); 528 } 529 try { 530 final String pat = stripQuotes(parts[0]); 531 final String lCon = stripQuotes(parts[1]); 532 final String rCon = stripQuotes(parts[2]); 533 final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3])); 534 final int cLine = currentLine; 535 final Rule r = new Rule(pat, lCon, rCon, ph) { 536 private final int myLine = cLine; 537 private final String loc = location; 538 539 @Override 540 public String toString() { 541 final StringBuilder sb = new StringBuilder(); 542 sb.append("Rule"); 543 sb.append("{line=").append(myLine); 544 sb.append(", loc='").append(loc).append('\''); 545 sb.append(", pat='").append(pat).append('\''); 546 sb.append(", lcon='").append(lCon).append('\''); 547 sb.append(", rcon='").append(rCon).append('\''); 548 sb.append('}'); 549 return sb.toString(); 550 } 551 }; 552 final String patternKey = r.pattern.substring(0, 1); 553 final List<Rule> rules = lines.computeIfAbsent(patternKey, k -> new ArrayList<>()); 554 rules.add(r); 555 } catch (final IllegalArgumentException e) { 556 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " + 557 location, e); 558 } 559 } 560 } 561 } 562 563 return lines; 564 } 565 566 /** 567 * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case. 568 * 569 * @param regex 570 * the regular expression to compile 571 * @return an RPattern that will match this regex 572 */ 573 private static RPattern pattern(final String regex) { 574 final boolean startsWith = regex.startsWith("^"); 575 final boolean endsWith = regex.endsWith("$"); 576 final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length()); 577 final boolean boxes = content.contains("["); 578 579 if (!boxes) { 580 if (startsWith && endsWith) { 581 // exact match 582 if (content.isEmpty()) { 583 // empty 584 return input -> input.length() == 0; 585 } 586 return input -> input.equals(content); 587 } 588 if ((startsWith || endsWith) && content.isEmpty()) { 589 // matches every string 590 return ALL_STRINGS_RMATCHER; 591 } 592 if (startsWith) { 593 // matches from start 594 return input -> startsWith(input, content); 595 } 596 if (endsWith) { 597 // matches from start 598 return input -> endsWith(input, content); 599 } 600 } else { 601 final boolean startsWithBox = content.startsWith("["); 602 final boolean endsWithBox = content.endsWith("]"); 603 604 if (startsWithBox && endsWithBox) { 605 String boxContent = content.substring(1, content.length() - 1); 606 if (!boxContent.contains("[")) { 607 // box containing alternatives 608 final boolean negate = boxContent.startsWith("^"); 609 if (negate) { 610 boxContent = boxContent.substring(1); 611 } 612 final String bContent = boxContent; 613 final boolean shouldMatch = !negate; 614 615 if (startsWith && endsWith) { 616 // exact match 617 return input -> input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch; 618 } 619 if (startsWith) { 620 // first char 621 return input -> input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch; 622 } 623 if (endsWith) { 624 // last char 625 return input -> input.length() > 0 && 626 contains(bContent, input.charAt(input.length() - 1)) == shouldMatch; 627 } 628 } 629 } 630 } 631 632 return new RPattern() { 633 final Pattern pattern = Pattern.compile(regex); 634 635 @Override 636 public boolean isMatch(final CharSequence input) { 637 final Matcher matcher = pattern.matcher(input); 638 return matcher.find(); 639 } 640 }; 641 } 642 643 private static boolean startsWith(final CharSequence input, final CharSequence prefix) { 644 if (prefix.length() > input.length()) { 645 return false; 646 } 647 for (int i = 0; i < prefix.length(); i++) { 648 if (input.charAt(i) != prefix.charAt(i)) { 649 return false; 650 } 651 } 652 return true; 653 } 654 655 private static String stripQuotes(String str) { 656 if (str.startsWith(DOUBLE_QUOTE)) { 657 str = str.substring(1); 658 } 659 660 if (str.endsWith(DOUBLE_QUOTE)) { 661 str = str.substring(0, str.length() - 1); 662 } 663 664 return str; 665 } 666 667 private final RPattern lContext; 668 669 private final String pattern; 670 671 private final PhonemeExpr phoneme; 672 673 private final RPattern rContext; 674 675 /** 676 * Creates a new rule. 677 * 678 * @param pattern 679 * the pattern 680 * @param lContext 681 * the left context 682 * @param rContext 683 * the right context 684 * @param phoneme 685 * the resulting phoneme 686 */ 687 public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) { 688 this.pattern = pattern; 689 this.lContext = pattern(lContext + "$"); 690 this.rContext = pattern("^" + rContext); 691 this.phoneme = phoneme; 692 } 693 694 /** 695 * Gets the left context. This is a regular expression that must match to the left of the pattern. 696 * 697 * @return the left context Pattern 698 */ 699 public RPattern getLContext() { 700 return this.lContext; 701 } 702 703 /** 704 * Gets the pattern. This is a string-literal that must exactly match. 705 * 706 * @return the pattern 707 */ 708 public String getPattern() { 709 return this.pattern; 710 } 711 712 /** 713 * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match. 714 * 715 * @return the phoneme 716 */ 717 public PhonemeExpr getPhoneme() { 718 return this.phoneme; 719 } 720 721 /** 722 * Gets the right context. This is a regular expression that must match to the right of the pattern. 723 * 724 * @return the right context Pattern 725 */ 726 public RPattern getRContext() { 727 return this.rContext; 728 } 729 730 /** 731 * Decides if the pattern and context match the input starting at a position. It is a match if the 732 * {@code lContext} matches {@code input} up to {@code i}, {@code pattern} matches at i and 733 * {@code rContext} matches from the end of the match of {@code pattern} to the end of {@code input}. 734 * 735 * @param input 736 * the input String 737 * @param i 738 * the int position within the input 739 * @return true if the pattern and left/right context match, false otherwise 740 */ 741 public boolean patternAndContextMatches(final CharSequence input, final int i) { 742 if (i < 0) { 743 throw new IndexOutOfBoundsException("Can not match pattern at negative indexes"); 744 } 745 746 final int patternLength = this.pattern.length(); 747 final int ipl = i + patternLength; 748 749 if (ipl > input.length()) { 750 // not enough room for the pattern to match 751 return false; 752 } 753 754 // evaluate the pattern, left context and right context 755 // fail early if any of the evaluations is not successful 756 if (!input.subSequence(i, ipl).equals(this.pattern)) { 757 return false; 758 } 759 if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) { 760 return false; 761 } 762 return this.lContext.isMatch(input.subSequence(0, i)); 763 } 764}