Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language.bm;
019
020import java.util.ArrayList;
021import java.util.Arrays;
022import java.util.Collections;
023import java.util.Comparator;
024import java.util.EnumMap;
025import java.util.HashMap;
026import java.util.HashSet;
027import java.util.List;
028import java.util.Map;
029import java.util.Scanner;
030import java.util.Set;
031import java.util.regex.Matcher;
032import java.util.regex.Pattern;
033
034import org.apache.commons.codec.Resources;
035import org.apache.commons.codec.language.bm.Languages.LanguageSet;
036
037/**
038 * A phoneme rule.
039 * <p>
040 * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply
041 * and a logical flag indicating if all languages must be in play. A rule matches if:
042 * </p>
043 * <ul>
044 * <li>the pattern matches at the current position</li>
045 * <li>the string up until the beginning of the pattern matches the left context</li>
046 * <li>the string from the end of the pattern matches the right context</li>
047 * <li>logical is ALL and all languages are in scope; or</li>
048 * <li>logical is any other value and at least one language is in scope</li>
049 * </ul>
050 * <p>
051 * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user
052 * to explicitly construct their own.
053 * </p>
054 * <p>
055 * Rules are immutable and thread-safe.
056 * </p>
057 * <h2>Rules resources</h2>
058 * <p>
059 * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically
060 * named following the pattern:
061 * </p>
062 * <blockquote>/org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
063 * <p>
064 * The format of these resources is the following:
065 * </p>
066 * <ul>
067 * <li><strong>Rules:</strong> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these
068 * will be interpreted as:
069 * <ol>
070 * <li>pattern</li>
071 * <li>left context</li>
072 * <li>right context</li>
073 * <li>phoneme</li>
074 * </ol>
075 * </li>
076 * <li><strong>End-of-line comments:</strong> Any occurrence of '//' will cause all text following on that line to be discarded
077 * as a comment.</li>
078 * <li><strong>Multi-line comments:</strong> Any line starting with '/*' will start multi-line commenting mode. This will skip
079 * all content until a line ending in '*' and '/' is found.</li>
080 * <li><strong>Blank lines:</strong> All blank lines will be skipped.</li>
081 * </ul>
082 *
083 * @since 1.6
084 */
085public class Rule {
086
087    /**
088     * A phoneme.
089     */
090    public static final class Phoneme implements PhonemeExpr {
091
092        /**
093         * The Phoneme Comparator.
094         */
095        public static final Comparator<Phoneme> COMPARATOR = (o1, o2) -> {
096            final int o1Length = o1.phonemeText.length();
097            final int o2Length = o2.phonemeText.length();
098            for (int i = 0; i < o1Length; i++) {
099                if (i >= o2Length) {
100                    return +1;
101                }
102                final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
103                if (c != 0) {
104                    return c;
105                }
106            }
107
108            if (o1Length < o2Length) {
109                return -1;
110            }
111
112            return 0;
113        };
114
115        private final StringBuilder phonemeText;
116
117        private final Languages.LanguageSet languages;
118
119        /**
120         * Constructs a new instance.
121         *
122         * @param phonemeText The phoneme text.
123         * @param languages A language set.
124         */
125        public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
126            this.phonemeText = new StringBuilder(phonemeText);
127            this.languages = languages;
128        }
129
130        /**
131         * Constructs a new instance.
132         *
133         * @param phonemeLeft The left phoneme text.
134         * @param phonemeRight The right phoneme text.
135         */
136        public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) {
137            this(phonemeLeft.phonemeText, phonemeLeft.languages);
138            this.phonemeText.append(phonemeRight.phonemeText);
139        }
140
141        /**
142         * Constructs a new instance.
143         *
144         * @param phonemeLeft The left phoneme text.
145         * @param phonemeRight The right phoneme text.
146         * @param languages A language set.
147         */
148        public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) {
149            this(phonemeLeft.phonemeText, languages);
150            this.phonemeText.append(phonemeRight.phonemeText);
151        }
152
153        /**
154         * Appends the sequence to the phone text.
155         *
156         * @param sequence The sequence to append.
157         * @return this instance.
158         */
159        public Phoneme append(final CharSequence sequence) {
160            this.phonemeText.append(sequence);
161            return this;
162        }
163
164        /**
165         * Gets the language set.
166         *
167         * @return the language set.
168         */
169        public Languages.LanguageSet getLanguages() {
170            return this.languages;
171        }
172
173        @Override
174        public Iterable<Phoneme> getPhonemes() {
175            return Collections.singleton(this);
176        }
177
178        /**
179         * Gets the phoneme text sequence.
180         *
181         * @return the phoneme text sequence.
182         */
183        public CharSequence getPhonemeText() {
184            return this.phonemeText;
185        }
186
187        /**
188         * Deprecated since 1.9.
189         *
190         * @param right the Phoneme to join
191         * @return a new Phoneme
192         * @deprecated since 1.9
193         */
194        @Deprecated
195        public Phoneme join(final Phoneme right) {
196            return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
197                               this.languages.restrictTo(right.languages));
198        }
199
200        /**
201         * Returns a new Phoneme with the same text but a union of its
202         * current language set and the given one.
203         *
204         * @param lang the language set to merge
205         * @return a new Phoneme
206         */
207        public Phoneme mergeWithLanguage(final LanguageSet lang) {
208          return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang));
209        }
210
211        @Override
212        public int size() {
213            return 1;
214        }
215
216        @Override
217        public String toString() {
218          return phonemeText.toString() + "[" + languages + "]";
219        }
220    }
221
222    /**
223     * A phoneme expression.
224     */
225    public interface PhonemeExpr {
226
227        /**
228         * Gets an iteration of phonemes.
229         *
230         * @return an iteration of phonemes.
231         */
232        Iterable<Phoneme> getPhonemes();
233
234        /**
235         * Gets the expression size in phonemes.
236         *
237         * @return the expression size in phonemes.
238         * @since 1.17.0
239         */
240        default int size() {
241            // All implementations are int-bound.
242            return (int) Math.min(getPhonemes().spliterator().getExactSizeIfKnown(), Integer.MAX_VALUE);
243        }
244    }
245
246    /**
247     * A list of phonemes.
248     */
249    public static final class PhonemeList implements PhonemeExpr {
250
251        private final List<Phoneme> phonemeList;
252
253        /**
254         * Constructs a new instance.
255         *
256         * @param phonemes the phoneme list.
257         */
258        public PhonemeList(final List<Phoneme> phonemes) {
259            this.phonemeList = phonemes;
260        }
261
262        @Override
263        public List<Phoneme> getPhonemes() {
264            return phonemeList;
265        }
266
267        @Override
268        public int size() {
269            return phonemeList.size();
270        }
271    }
272
273    /**
274     * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations.
275     */
276    public interface RPattern {
277
278        /**
279         * Tests whether the given input matches this instance.
280         *
281         * @param input the input to test.
282         * @return whether the given input matches this instance.
283         */
284        boolean isMatch(CharSequence input);
285    }
286
    /**
     * An {@link RPattern} that reports a match for every input, whatever its content.
     */
    public static final RPattern ALL_STRINGS_RMATCHER = input -> true;

    /**
     * The string {@code "ALL"}. Unused: nothing else in this file references it.
     */
    public static final String ALL = "ALL";

    /** The quote character that {@code stripQuotes} removes from either end of a rule column. */
    private static final String DOUBLE_QUOTE = "\"";

    /** Prefix that marks a line of a rules resource as an include statement. */
    private static final String HASH_INCLUDE = "#include";

    /** Length of {@link #HASH_INCLUDE}; used to cut the directive off the front of an include line. */
    private static final int HASH_INCLUDE_LENGTH = HASH_INCLUDE.length();
302
303    private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES =
304            new EnumMap<>(NameType.class);
305
306    static {
307        for (final NameType s : NameType.values()) {
308            final Map<RuleType, Map<String, Map<String, List<Rule>>>> rts =
309                    new EnumMap<>(RuleType.class);
310
311            for (final RuleType rt : RuleType.values()) {
312                final Map<String, Map<String, List<Rule>>> rs = new HashMap<>();
313
314                final Languages ls = Languages.getInstance(s);
315                ls.getLanguages().forEach(l -> {
316                    try (Scanner scanner = createScanner(s, rt, l)) {
317                        rs.put(l, parseRules(scanner, createResourceName(s, rt, l)));
318                    } catch (final IllegalStateException e) {
319                        throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
320                    }
321                });
322                if (!rt.equals(RuleType.RULES)) {
323                    try (Scanner scanner = createScanner(s, rt, "common")) {
324                        rs.put("common", parseRules(scanner, createResourceName(s, rt, "common")));
325                    }
326                }
327
328                rts.put(rt, Collections.unmodifiableMap(rs));
329            }
330
331            RULES.put(s, Collections.unmodifiableMap(rts));
332        }
333    }
334
335    private static boolean contains(final CharSequence chars, final char input) {
336        return chars.chars().anyMatch(c -> c == input);
337    }
338
339    private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
340        return String.format("/org/apache/commons/codec/language/bm/%s_%s_%s.txt",
341                             nameType.getName(), rt.getName(), lang);
342    }
343
344    @SuppressWarnings("resource") // Closing the Scanner closes the resource
345    private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
346        final String resName = createResourceName(nameType, rt, lang);
347        return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING);
348    }
349
350    @SuppressWarnings("resource") // Closing the Scanner closes the resource
351    private static Scanner createScanner(final String lang) {
352        final String resName = String.format("/org/apache/commons/codec/language/bm/%s.txt", lang);
353        return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING);
354    }
355
356    private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
357        final int suffixLength = suffix.length();
358        final int inputLength = input.length();
359
360        if (suffixLength > inputLength) {
361            return false;
362        }
363        for (int i = inputLength - 1, j = suffixLength - 1; j >= 0; i--, j--) {
364            if (input.charAt(i) != suffix.charAt(j)) {
365                return false;
366            }
367        }
368        return true;
369    }
370
371    /**
372     * Gets rules for a combination of name type, rule type and languages.
373     *
374     * @param nameType
375     *            the NameType to consider
376     * @param rt
377     *            the RuleType to consider
378     * @param langs
379     *            the set of languages to consider
380     * @return a list of Rules that apply
381     */
382    public static List<Rule> getInstance(final NameType nameType, final RuleType rt,
383                                         final Languages.LanguageSet langs) {
384        final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs);
385        final List<Rule> allRules = new ArrayList<>();
386        ruleMap.values().forEach(rules -> allRules.addAll(rules));
387        return allRules;
388    }
389
390    /**
391     * Gets rules for a combination of name type, rule type and a single language.
392     *
393     * @param nameType
394     *            the NameType to consider
395     * @param rt
396     *            the RuleType to consider
397     * @param lang
398     *            the language to consider
399     * @return a list of Rules that apply
400     */
401    public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
402        return getInstance(nameType, rt, LanguageSet.from(new HashSet<>(Arrays.asList(lang))));
403    }
404
405    /**
406     * Gets rules for a combination of name type, rule type and languages.
407     *
408     * @param nameType
409     *            the NameType to consider
410     * @param rt
411     *            the RuleType to consider
412     * @param langs
413     *            the set of languages to consider
414     * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
415     * @since 1.9
416     */
417    public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
418                                                         final Languages.LanguageSet langs) {
419        return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) :
420                                     getInstanceMap(nameType, rt, Languages.ANY);
421    }
422
423    /**
424     * Gets rules for a combination of name type, rule type and a single language.
425     *
426     * @param nameType
427     *            the NameType to consider
428     * @param rt
429     *            the RuleType to consider
430     * @param lang
431     *            the language to consider
432     * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
433     * @since 1.9
434     */
435    public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
436                                                         final String lang) {
437        final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang);
438
439        if (rules == null) {
440            throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
441                                               nameType.getName(), rt.getName(), lang));
442        }
443
444        return rules;
445    }
446
447    private static Phoneme parsePhoneme(final String ph) {
448        final int open = ph.indexOf("[");
449        if (open >= 0) {
450            if (!ph.endsWith("]")) {
451                throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
452            }
453            final String before = ph.substring(0, open);
454            final String in = ph.substring(open + 1, ph.length() - 1);
455            final Set<String> langs = new HashSet<>(Arrays.asList(in.split("[+]")));
456
457            return new Phoneme(before, Languages.LanguageSet.from(langs));
458        }
459        return new Phoneme(ph, Languages.ANY_LANGUAGE);
460    }
461
462    private static PhonemeExpr parsePhonemeExpr(final String ph) {
463        if (ph.startsWith("(")) { // we have a bracketed list of options
464            if (!ph.endsWith(")")) {
465                throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
466            }
467
468            final List<Phoneme> phs = new ArrayList<>();
469            final String body = ph.substring(1, ph.length() - 1);
470            for (final String part : body.split("[|]")) {
471                phs.add(parsePhoneme(part));
472            }
473            if (body.startsWith("|") || body.endsWith("|")) {
474                phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
475            }
476
477            return new PhonemeList(phs);
478        }
479        return parsePhoneme(ph);
480    }
481
482    private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) {
483        final Map<String, List<Rule>> lines = new HashMap<>();
484        int currentLine = 0;
485
486        boolean inMultilineComment = false;
487        while (scanner.hasNextLine()) {
488            currentLine++;
489            final String rawLine = scanner.nextLine();
490            String line = rawLine;
491
492            if (inMultilineComment) {
493                if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
494                    inMultilineComment = false;
495                }
496            } else if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
497                inMultilineComment = true;
498            } else {
499                // discard comments
500                final int cmtI = line.indexOf(ResourceConstants.CMT);
501                if (cmtI >= 0) {
502                    line = line.substring(0, cmtI);
503                }
504
505                // trim leading-trailing whitespace
506                line = line.trim();
507
508                if (line.isEmpty()) {
509                    continue; // empty lines can be safely skipped
510                }
511
512                if (line.startsWith(HASH_INCLUDE)) {
513                    // include statement
514                    final String incl = line.substring(HASH_INCLUDE_LENGTH).trim();
515                    if (incl.contains(" ")) {
516                        throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
517                                                           location);
518                    }
519                    try (Scanner hashIncludeScanner = createScanner(incl)) {
520                        lines.putAll(parseRules(hashIncludeScanner, location + "->" + incl));
521                    }
522                } else {
523                    // rule
524                    final String[] parts = line.split("\\s+");
525                    if (parts.length != 4) {
526                        throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
527                                                           " parts: " + rawLine + " in " + location);
528                    }
529                    try {
530                        final String pat = stripQuotes(parts[0]);
531                        final String lCon = stripQuotes(parts[1]);
532                        final String rCon = stripQuotes(parts[2]);
533                        final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
534                        final int cLine = currentLine;
535                        final Rule r = new Rule(pat, lCon, rCon, ph) {
536                            private final int myLine = cLine;
537                            private final String loc = location;
538
539                            @Override
540                            public String toString() {
541                                final StringBuilder sb = new StringBuilder();
542                                sb.append("Rule");
543                                sb.append("{line=").append(myLine);
544                                sb.append(", loc='").append(loc).append('\'');
545                                sb.append(", pat='").append(pat).append('\'');
546                                sb.append(", lcon='").append(lCon).append('\'');
547                                sb.append(", rcon='").append(rCon).append('\'');
548                                sb.append('}');
549                                return sb.toString();
550                            }
551                        };
552                        final String patternKey = r.pattern.substring(0, 1);
553                        final List<Rule> rules = lines.computeIfAbsent(patternKey, k -> new ArrayList<>());
554                        rules.add(r);
555                    } catch (final IllegalArgumentException e) {
556                        throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
557                                                        location, e);
558                    }
559                }
560            }
561        }
562
563        return lines;
564    }
565
566    /**
567     * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case.
568     *
569     * @param regex
570     *            the regular expression to compile
571     * @return an RPattern that will match this regex
572     */
573    private static RPattern pattern(final String regex) {
574        final boolean startsWith = regex.startsWith("^");
575        final boolean endsWith = regex.endsWith("$");
576        final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
577        final boolean boxes = content.contains("[");
578
579        if (!boxes) {
580            if (startsWith && endsWith) {
581                // exact match
582                if (content.isEmpty()) {
583                    // empty
584                    return input -> input.length() == 0;
585                }
586                return input -> input.equals(content);
587            }
588            if ((startsWith || endsWith) && content.isEmpty()) {
589                // matches every string
590                return ALL_STRINGS_RMATCHER;
591            }
592            if (startsWith) {
593                // matches from start
594                return input -> startsWith(input, content);
595            }
596            if (endsWith) {
597                // matches from start
598                return input -> endsWith(input, content);
599            }
600        } else {
601            final boolean startsWithBox = content.startsWith("[");
602            final boolean endsWithBox = content.endsWith("]");
603
604            if (startsWithBox && endsWithBox) {
605                String boxContent = content.substring(1, content.length() - 1);
606                if (!boxContent.contains("[")) {
607                    // box containing alternatives
608                    final boolean negate = boxContent.startsWith("^");
609                    if (negate) {
610                        boxContent = boxContent.substring(1);
611                    }
612                    final String bContent = boxContent;
613                    final boolean shouldMatch = !negate;
614
615                    if (startsWith && endsWith) {
616                        // exact match
617                        return input -> input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
618                    }
619                    if (startsWith) {
620                        // first char
621                        return input -> input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
622                    }
623                    if (endsWith) {
624                        // last char
625                        return input -> input.length() > 0 &&
626                               contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
627                    }
628                }
629            }
630        }
631
632        return new RPattern() {
633            final Pattern pattern = Pattern.compile(regex);
634
635            @Override
636            public boolean isMatch(final CharSequence input) {
637                final Matcher matcher = pattern.matcher(input);
638                return matcher.find();
639            }
640        };
641    }
642
643    private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
644        if (prefix.length() > input.length()) {
645            return false;
646        }
647        for (int i = 0; i < prefix.length(); i++) {
648            if (input.charAt(i) != prefix.charAt(i)) {
649                return false;
650            }
651        }
652        return true;
653    }
654
655    private static String stripQuotes(String str) {
656        if (str.startsWith(DOUBLE_QUOTE)) {
657            str = str.substring(1);
658        }
659
660        if (str.endsWith(DOUBLE_QUOTE)) {
661            str = str.substring(0, str.length() - 1);
662        }
663
664        return str;
665    }
666
    /** Matcher for the text to the left of the pattern; built in the constructor from the left context plus a trailing '$'. */
    private final RPattern lContext;

    /** The literal text the rule applies to. */
    private final String pattern;

    /** The phoneme expression associated with this rule. */
    private final PhonemeExpr phoneme;

    /** Matcher for the text to the right of the pattern; built in the constructor from '^' plus the right context. */
    private final RPattern rContext;
674
675    /**
676     * Creates a new rule.
677     *
678     * @param pattern
679     *            the pattern
680     * @param lContext
681     *            the left context
682     * @param rContext
683     *            the right context
684     * @param phoneme
685     *            the resulting phoneme
686     */
687    public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
688        this.pattern = pattern;
689        this.lContext = pattern(lContext + "$");
690        this.rContext = pattern("^" + rContext);
691        this.phoneme = phoneme;
692    }
693
694    /**
695     * Gets the left context. This is a regular expression that must match to the left of the pattern.
696     *
697     * @return the left context Pattern
698     */
699    public RPattern getLContext() {
700        return this.lContext;
701    }
702
703    /**
704     * Gets the pattern. This is a string-literal that must exactly match.
705     *
706     * @return the pattern
707     */
708    public String getPattern() {
709        return this.pattern;
710    }
711
712    /**
713     * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
714     *
715     * @return the phoneme
716     */
717    public PhonemeExpr getPhoneme() {
718        return this.phoneme;
719    }
720
721    /**
722     * Gets the right context. This is a regular expression that must match to the right of the pattern.
723     *
724     * @return the right context Pattern
725     */
726    public RPattern getRContext() {
727        return this.rContext;
728    }
729
730    /**
731     * Decides if the pattern and context match the input starting at a position. It is a match if the
732     * {@code lContext} matches {@code input} up to {@code i}, {@code pattern} matches at i and
733     * {@code rContext} matches from the end of the match of {@code pattern} to the end of {@code input}.
734     *
735     * @param input
736     *            the input String
737     * @param i
738     *            the int position within the input
739     * @return true if the pattern and left/right context match, false otherwise
740     */
741    public boolean patternAndContextMatches(final CharSequence input, final int i) {
742        if (i < 0) {
743            throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
744        }
745
746        final int patternLength = this.pattern.length();
747        final int ipl = i + patternLength;
748
749        if (ipl > input.length()) {
750            // not enough room for the pattern to match
751            return false;
752        }
753
754        // evaluate the pattern, left context and right context
755        // fail early if any of the evaluations is not successful
756        if (!input.subSequence(i, ipl).equals(this.pattern)) {
757            return false;
758        }
759        if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
760            return false;
761        }
762        return this.lContext.isMatch(input.subSequence(0, i));
763    }
764}