001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language.bm; 019 020import java.util.ArrayList; 021import java.util.Arrays; 022import java.util.Collections; 023import java.util.Comparator; 024import java.util.EnumMap; 025import java.util.HashMap; 026import java.util.HashSet; 027import java.util.List; 028import java.util.Map; 029import java.util.Scanner; 030import java.util.Set; 031import java.util.regex.Matcher; 032import java.util.regex.Pattern; 033 034import org.apache.commons.codec.Resources; 035import org.apache.commons.codec.language.bm.Languages.LanguageSet; 036 037/** 038 * A phoneme rule. 039 * <p> 040 * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply 041 * and a logical flag indicating if all languages must be in play. A rule matches if: 042 * </p> 043 * <ul> 044 * <li>the pattern matches at the current position</li> 045 * <li>the string up until the beginning of the pattern matches the left context</li> 046 * <li>the string from the end of the pattern matches the right context</li> 047 * <li>logical is ALL and all languages are in scope; or</li> 048 * <li>logical is any other value and at least one language is in scope</li> 049 * </ul> 050 * <p> 051 * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user 052 * to explicitly construct their own. 053 * </p> 054 * <p> 055 * Rules are immutable and thread-safe. 056 * </p> 057 * <h2>Rules resources</h2> 058 * <p> 059 * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically 060 * named following the pattern: 061 * </p> 062 * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote> 063 * <p> 064 * The format of these resources is the following: 065 * </p> 066 * <ul> 067 * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these 068 * will be interpreted as: 069 * <ol> 070 * <li>pattern</li> 071 * <li>left context</li> 072 * <li>right context</li> 073 * <li>phoneme</li> 074 * </ol> 075 * </li> 076 * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded 077 * as a comment.</li> 078 * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip 079 * all content until a line ending in '*' and '/' is found.</li> 080 * <li><b>Blank lines:</b> All blank lines will be skipped.</li> 081 * </ul> 082 * 083 * @since 1.6 084 */ 085public class Rule { 086 087 public static final class Phoneme implements PhonemeExpr { 088 089 public static final Comparator<Phoneme> COMPARATOR = (o1, o2) -> { 090 final int o1Length = o1.phonemeText.length(); 091 final int o2Length = o2.phonemeText.length(); 092 for (int i = 0; i < o1Length; i++) { 093 if (i >= o2Length) { 094 return +1; 095 } 096 final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i); 097 if (c != 0) { 098 return c; 099 } 100 } 101 102 if (o1Length < o2Length) { 103 return -1; 104 } 105 106 return 0; 107 }; 108 private final StringBuilder phonemeText; 109 110 private final Languages.LanguageSet languages; 111 112 public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) { 113 this.phonemeText = new StringBuilder(phonemeText); 114 this.languages = languages; 115 } 116 117 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) { 118 this(phonemeLeft.phonemeText, phonemeLeft.languages); 119 this.phonemeText.append(phonemeRight.phonemeText); 120 } 121 122 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) { 123 this(phonemeLeft.phonemeText, languages); 124 this.phonemeText.append(phonemeRight.phonemeText); 125 } 126 127 public Phoneme append(final CharSequence str) { 128 this.phonemeText.append(str); 129 return this; 130 } 131 132 public Languages.LanguageSet getLanguages() { 133 return this.languages; 134 } 135 136 @Override 137 public Iterable<Phoneme> getPhonemes() { 138 return Collections.singleton(this); 139 } 140 141 public CharSequence getPhonemeText() { 142 return this.phonemeText; 143 } 144 145 /** 146 * Deprecated since 1.9. 147 * 148 * @param right the Phoneme to join 149 * @return a new Phoneme 150 * @deprecated since 1.9 151 */ 152 @Deprecated 153 public Phoneme join(final Phoneme right) { 154 return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(), 155 this.languages.restrictTo(right.languages)); 156 } 157 158 /** 159 * Returns a new Phoneme with the same text but a union of its 160 * current language set and the given one. 161 * 162 * @param lang the language set to merge 163 * @return a new Phoneme 164 */ 165 public Phoneme mergeWithLanguage(final LanguageSet lang) { 166 return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang)); 167 } 168 169 @Override 170 public int size() { 171 return 1; 172 } 173 174 @Override 175 public String toString() { 176 return phonemeText.toString() + "[" + languages + "]"; 177 } 178 } 179 180 public interface PhonemeExpr { 181 Iterable<Phoneme> getPhonemes(); 182 183 /** 184 * Gets the expression size in phonemes. 185 * 186 * @return the expression size in phonemes. 187 * @since 1.17.0 188 */ 189 default int size() { 190 // All implementations are int-bound. 191 return (int) Math.min(getPhonemes().spliterator().getExactSizeIfKnown(), Integer.MAX_VALUE); 192 } 193 } 194 195 public static final class PhonemeList implements PhonemeExpr { 196 197 private final List<Phoneme> phonemeList; 198 199 public PhonemeList(final List<Phoneme> phonemes) { 200 this.phonemeList = phonemes; 201 } 202 203 @Override 204 public List<Phoneme> getPhonemes() { 205 return phonemeList; 206 } 207 208 @Override 209 public int size() { 210 return phonemeList.size(); 211 } 212 } 213 214 /** 215 * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations. 216 */ 217 public interface RPattern { 218 boolean isMatch(CharSequence input); 219 } 220 221 public static final RPattern ALL_STRINGS_RMATCHER = input -> true; 222 223 public static final String ALL = "ALL"; 224 225 private static final String DOUBLE_QUOTE = "\""; 226 227 private static final String HASH_INCLUDE = "#include"; 228 229 private static final int HASH_INCLUDE_LENGTH = HASH_INCLUDE.length(); 230 231 private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES = 232 new EnumMap<>(NameType.class); 233 234 static { 235 for (final NameType s : NameType.values()) { 236 final Map<RuleType, Map<String, Map<String, List<Rule>>>> rts = 237 new EnumMap<>(RuleType.class); 238 239 for (final RuleType rt : RuleType.values()) { 240 final Map<String, Map<String, List<Rule>>> rs = new HashMap<>(); 241 242 final Languages ls = Languages.getInstance(s); 243 ls.getLanguages().forEach(l -> { 244 try (final Scanner scanner = createScanner(s, rt, l)) { 245 rs.put(l, parseRules(scanner, createResourceName(s, rt, l))); 246 } catch (final IllegalStateException e) { 247 throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e); 248 } 249 }); 250 if (!rt.equals(RuleType.RULES)) { 251 try (final Scanner scanner = createScanner(s, rt, "common")) { 252 rs.put("common", parseRules(scanner, createResourceName(s, rt, "common"))); 253 } 254 } 255 256 rts.put(rt, Collections.unmodifiableMap(rs)); 257 } 258 259 RULES.put(s, Collections.unmodifiableMap(rts)); 260 } 261 } 262 263 private static boolean contains(final CharSequence chars, final char input) { 264 return chars.chars().anyMatch(c -> c == input); 265 } 266 267 private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) { 268 return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt", 269 nameType.getName(), rt.getName(), lang); 270 } 271 272 @SuppressWarnings("resource") // Closing the Scanner closes the resource 273 private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) { 274 final String resName = createResourceName(nameType, rt, lang); 275 return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING); 276 } 277 278 @SuppressWarnings("resource") // Closing the Scanner closes the resource 279 private static Scanner createScanner(final String lang) { 280 final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang); 281 return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING); 282 } 283 284 private static boolean endsWith(final CharSequence input, final CharSequence suffix) { 285 final int suffixLength = suffix.length(); 286 final int inputLength = input.length(); 287 288 if (suffixLength > inputLength) { 289 return false; 290 } 291 for (int i = inputLength - 1, j = suffixLength - 1; j >= 0; i--, j--) { 292 if (input.charAt(i) != suffix.charAt(j)) { 293 return false; 294 } 295 } 296 return true; 297 } 298 299 /** 300 * Gets rules for a combination of name type, rule type and languages. 301 * 302 * @param nameType 303 * the NameType to consider 304 * @param rt 305 * the RuleType to consider 306 * @param langs 307 * the set of languages to consider 308 * @return a list of Rules that apply 309 */ 310 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, 311 final Languages.LanguageSet langs) { 312 final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs); 313 final List<Rule> allRules = new ArrayList<>(); 314 ruleMap.values().forEach(rules -> allRules.addAll(rules)); 315 return allRules; 316 } 317 318 /** 319 * Gets rules for a combination of name type, rule type and a single language. 320 * 321 * @param nameType 322 * the NameType to consider 323 * @param rt 324 * the RuleType to consider 325 * @param lang 326 * the language to consider 327 * @return a list of Rules that apply 328 */ 329 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) { 330 return getInstance(nameType, rt, LanguageSet.from(new HashSet<>(Arrays.asList(lang)))); 331 } 332 333 /** 334 * Gets rules for a combination of name type, rule type and languages. 335 * 336 * @param nameType 337 * the NameType to consider 338 * @param rt 339 * the RuleType to consider 340 * @param langs 341 * the set of languages to consider 342 * @return a map containing all Rules that apply, grouped by the first character of the rule pattern 343 * @since 1.9 344 */ 345 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, 346 final Languages.LanguageSet langs) { 347 return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) : 348 getInstanceMap(nameType, rt, Languages.ANY); 349 } 350 351 /** 352 * Gets rules for a combination of name type, rule type and a single language. 353 * 354 * @param nameType 355 * the NameType to consider 356 * @param rt 357 * the RuleType to consider 358 * @param lang 359 * the language to consider 360 * @return a map containing all Rules that apply, grouped by the first character of the rule pattern 361 * @since 1.9 362 */ 363 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, 364 final String lang) { 365 final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang); 366 367 if (rules == null) { 368 throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.", 369 nameType.getName(), rt.getName(), lang)); 370 } 371 372 return rules; 373 } 374 375 private static Phoneme parsePhoneme(final String ph) { 376 final int open = ph.indexOf("["); 377 if (open >= 0) { 378 if (!ph.endsWith("]")) { 379 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'"); 380 } 381 final String before = ph.substring(0, open); 382 final String in = ph.substring(open + 1, ph.length() - 1); 383 final Set<String> langs = new HashSet<>(Arrays.asList(in.split("[+]"))); 384 385 return new Phoneme(before, Languages.LanguageSet.from(langs)); 386 } 387 return new Phoneme(ph, Languages.ANY_LANGUAGE); 388 } 389 390 private static PhonemeExpr parsePhonemeExpr(final String ph) { 391 if (ph.startsWith("(")) { // we have a bracketed list of options 392 if (!ph.endsWith(")")) { 393 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'"); 394 } 395 396 final List<Phoneme> phs = new ArrayList<>(); 397 final String body = ph.substring(1, ph.length() - 1); 398 for (final String part : body.split("[|]")) { 399 phs.add(parsePhoneme(part)); 400 } 401 if (body.startsWith("|") || body.endsWith("|")) { 402 phs.add(new Phoneme("", Languages.ANY_LANGUAGE)); 403 } 404 405 return new PhonemeList(phs); 406 } 407 return parsePhoneme(ph); 408 } 409 410 private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) { 411 final Map<String, List<Rule>> lines = new HashMap<>(); 412 int currentLine = 0; 413 414 boolean inMultilineComment = false; 415 while (scanner.hasNextLine()) { 416 currentLine++; 417 final String rawLine = scanner.nextLine(); 418 String line = rawLine; 419 420 if (inMultilineComment) { 421 if (line.endsWith(ResourceConstants.EXT_CMT_END)) { 422 inMultilineComment = false; 423 } 424 } else if (line.startsWith(ResourceConstants.EXT_CMT_START)) { 425 inMultilineComment = true; 426 } else { 427 // discard comments 428 final int cmtI = line.indexOf(ResourceConstants.CMT); 429 if (cmtI >= 0) { 430 line = line.substring(0, cmtI); 431 } 432 433 // trim leading-trailing whitespace 434 line = line.trim(); 435 436 if (line.isEmpty()) { 437 continue; // empty lines can be safely skipped 438 } 439 440 if (line.startsWith(HASH_INCLUDE)) { 441 // include statement 442 final String incl = line.substring(HASH_INCLUDE_LENGTH).trim(); 443 if (incl.contains(" ")) { 444 throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " + 445 location); 446 } 447 try (final Scanner hashIncludeScanner = createScanner(incl)) { 448 lines.putAll(parseRules(hashIncludeScanner, location + "->" + incl)); 449 } 450 } else { 451 // rule 452 final String[] parts = line.split("\\s+"); 453 if (parts.length != 4) { 454 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + 455 " parts: " + rawLine + " in " + location); 456 } 457 try { 458 final String pat = stripQuotes(parts[0]); 459 final String lCon = stripQuotes(parts[1]); 460 final String rCon = stripQuotes(parts[2]); 461 final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3])); 462 final int cLine = currentLine; 463 final Rule r = new Rule(pat, lCon, rCon, ph) { 464 private final int myLine = cLine; 465 private final String loc = location; 466 467 @Override 468 public String toString() { 469 final StringBuilder sb = new StringBuilder(); 470 sb.append("Rule"); 471 sb.append("{line=").append(myLine); 472 sb.append(", loc='").append(loc).append('\''); 473 sb.append(", pat='").append(pat).append('\''); 474 sb.append(", lcon='").append(lCon).append('\''); 475 sb.append(", rcon='").append(rCon).append('\''); 476 sb.append('}'); 477 return sb.toString(); 478 } 479 }; 480 final String patternKey = r.pattern.substring(0, 1); 481 final List<Rule> rules = lines.computeIfAbsent(patternKey, k -> new ArrayList<>()); 482 rules.add(r); 483 } catch (final IllegalArgumentException e) { 484 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " + 485 location, e); 486 } 487 } 488 } 489 } 490 491 return lines; 492 } 493 494 /** 495 * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case. 496 * 497 * @param regex 498 * the regular expression to compile 499 * @return an RPattern that will match this regex 500 */ 501 private static RPattern pattern(final String regex) { 502 final boolean startsWith = regex.startsWith("^"); 503 final boolean endsWith = regex.endsWith("$"); 504 final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length()); 505 final boolean boxes = content.contains("["); 506 507 if (!boxes) { 508 if (startsWith && endsWith) { 509 // exact match 510 if (content.isEmpty()) { 511 // empty 512 return input -> input.length() == 0; 513 } 514 return input -> input.equals(content); 515 } 516 if ((startsWith || endsWith) && content.isEmpty()) { 517 // matches every string 518 return ALL_STRINGS_RMATCHER; 519 } 520 if (startsWith) { 521 // matches from start 522 return input -> startsWith(input, content); 523 } 524 if (endsWith) { 525 // matches from start 526 return input -> endsWith(input, content); 527 } 528 } else { 529 final boolean startsWithBox = content.startsWith("["); 530 final boolean endsWithBox = content.endsWith("]"); 531 532 if (startsWithBox && endsWithBox) { 533 String boxContent = content.substring(1, content.length() - 1); 534 if (!boxContent.contains("[")) { 535 // box containing alternatives 536 final boolean negate = boxContent.startsWith("^"); 537 if (negate) { 538 boxContent = boxContent.substring(1); 539 } 540 final String bContent = boxContent; 541 final boolean shouldMatch = !negate; 542 543 if (startsWith && endsWith) { 544 // exact match 545 return input -> input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch; 546 } 547 if (startsWith) { 548 // first char 549 return input -> input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch; 550 } 551 if (endsWith) { 552 // last char 553 return input -> input.length() > 0 && 554 contains(bContent, input.charAt(input.length() - 1)) == shouldMatch; 555 } 556 } 557 } 558 } 559 560 return new RPattern() { 561 final Pattern pattern = Pattern.compile(regex); 562 563 @Override 564 public boolean isMatch(final CharSequence input) { 565 final Matcher matcher = pattern.matcher(input); 566 return matcher.find(); 567 } 568 }; 569 } 570 571 private static boolean startsWith(final CharSequence input, final CharSequence prefix) { 572 if (prefix.length() > input.length()) { 573 return false; 574 } 575 for (int i = 0; i < prefix.length(); i++) { 576 if (input.charAt(i) != prefix.charAt(i)) { 577 return false; 578 } 579 } 580 return true; 581 } 582 583 private static String stripQuotes(String str) { 584 if (str.startsWith(DOUBLE_QUOTE)) { 585 str = str.substring(1); 586 } 587 588 if (str.endsWith(DOUBLE_QUOTE)) { 589 str = str.substring(0, str.length() - 1); 590 } 591 592 return str; 593 } 594 595 private final RPattern lContext; 596 597 private final String pattern; 598 599 private final PhonemeExpr phoneme; 600 601 private final RPattern rContext; 602 603 /** 604 * Creates a new rule. 605 * 606 * @param pattern 607 * the pattern 608 * @param lContext 609 * the left context 610 * @param rContext 611 * the right context 612 * @param phoneme 613 * the resulting phoneme 614 */ 615 public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) { 616 this.pattern = pattern; 617 this.lContext = pattern(lContext + "$"); 618 this.rContext = pattern("^" + rContext); 619 this.phoneme = phoneme; 620 } 621 622 /** 623 * Gets the left context. This is a regular expression that must match to the left of the pattern. 624 * 625 * @return the left context Pattern 626 */ 627 public RPattern getLContext() { 628 return this.lContext; 629 } 630 631 /** 632 * Gets the pattern. This is a string-literal that must exactly match. 633 * 634 * @return the pattern 635 */ 636 public String getPattern() { 637 return this.pattern; 638 } 639 640 /** 641 * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match. 642 * 643 * @return the phoneme 644 */ 645 public PhonemeExpr getPhoneme() { 646 return this.phoneme; 647 } 648 649 /** 650 * Gets the right context. This is a regular expression that must match to the right of the pattern. 651 * 652 * @return the right context Pattern 653 */ 654 public RPattern getRContext() { 655 return this.rContext; 656 } 657 658 /** 659 * Decides if the pattern and context match the input starting at a position. It is a match if the 660 * {@code lContext} matches {@code input} up to {@code i}, {@code pattern} matches at i and 661 * {@code rContext} matches from the end of the match of {@code pattern} to the end of {@code input}. 662 * 663 * @param input 664 * the input String 665 * @param i 666 * the int position within the input 667 * @return true if the pattern and left/right context match, false otherwise 668 */ 669 public boolean patternAndContextMatches(final CharSequence input, final int i) { 670 if (i < 0) { 671 throw new IndexOutOfBoundsException("Can not match pattern at negative indexes"); 672 } 673 674 final int patternLength = this.pattern.length(); 675 final int ipl = i + patternLength; 676 677 if (ipl > input.length()) { 678 // not enough room for the pattern to match 679 return false; 680 } 681 682 // evaluate the pattern, left context and right context 683 // fail early if any of the evaluations is not successful 684 if (!input.subSequence(i, ipl).equals(this.pattern)) { 685 return false; 686 } 687 if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) { 688 return false; 689 } 690 return this.lContext.isMatch(input.subSequence(0, i)); 691 } 692}