001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language.bm;
019
020import java.util.ArrayList;
021import java.util.Arrays;
022import java.util.Collections;
023import java.util.Comparator;
024import java.util.EnumMap;
025import java.util.HashMap;
026import java.util.HashSet;
027import java.util.List;
028import java.util.Map;
029import java.util.Scanner;
030import java.util.Set;
031import java.util.regex.Matcher;
032import java.util.regex.Pattern;
033
034import org.apache.commons.codec.Resources;
035import org.apache.commons.codec.language.bm.Languages.LanguageSet;
036
037/**
038 * A phoneme rule.
039 * <p>
040 * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply
041 * and a logical flag indicating if all languages must be in play. A rule matches if:
042 * </p>
043 * <ul>
044 * <li>the pattern matches at the current position</li>
045 * <li>the string up until the beginning of the pattern matches the left context</li>
046 * <li>the string from the end of the pattern matches the right context</li>
047 * <li>logical is ALL and all languages are in scope; or</li>
048 * <li>logical is any other value and at least one language is in scope</li>
049 * </ul>
050 * <p>
051 * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user
052 * to explicitly construct their own.
053 * </p>
054 * <p>
055 * Rules are immutable and thread-safe.
056 * </p>
057 * <h2>Rules resources</h2>
058 * <p>
059 * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically
060 * named following the pattern:
061 * </p>
062 * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
063 * <p>
064 * The format of these resources is the following:
065 * </p>
066 * <ul>
067 * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these
068 * will be interpreted as:
069 * <ol>
070 * <li>pattern</li>
071 * <li>left context</li>
072 * <li>right context</li>
073 * <li>phoneme</li>
074 * </ol>
075 * </li>
076 * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded
077 * as a comment.</li>
078 * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip
079 * all content until a line ending in '*' and '/' is found.</li>
080 * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
081 * </ul>
082 *
083 * @since 1.6
084 */
085public class Rule {
086
087    public static final class Phoneme implements PhonemeExpr {
088
089        public static final Comparator<Phoneme> COMPARATOR = (o1, o2) -> {
090            final int o1Length = o1.phonemeText.length();
091            final int o2Length = o2.phonemeText.length();
092            for (int i = 0; i < o1Length; i++) {
093                if (i >= o2Length) {
094                    return +1;
095                }
096                final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
097                if (c != 0) {
098                    return c;
099                }
100            }
101
102            if (o1Length < o2Length) {
103                return -1;
104            }
105
106            return 0;
107        };
108        private final StringBuilder phonemeText;
109
110        private final Languages.LanguageSet languages;
111
112        public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
113            this.phonemeText = new StringBuilder(phonemeText);
114            this.languages = languages;
115        }
116
117        public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) {
118            this(phonemeLeft.phonemeText, phonemeLeft.languages);
119            this.phonemeText.append(phonemeRight.phonemeText);
120        }
121
122        public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) {
123            this(phonemeLeft.phonemeText, languages);
124            this.phonemeText.append(phonemeRight.phonemeText);
125        }
126
127        public Phoneme append(final CharSequence str) {
128            this.phonemeText.append(str);
129            return this;
130        }
131
132        public Languages.LanguageSet getLanguages() {
133            return this.languages;
134        }
135
136        @Override
137        public Iterable<Phoneme> getPhonemes() {
138            return Collections.singleton(this);
139        }
140
141        public CharSequence getPhonemeText() {
142            return this.phonemeText;
143        }
144
145        /**
146         * Deprecated since 1.9.
147         *
148         * @param right the Phoneme to join
149         * @return a new Phoneme
150         * @deprecated since 1.9
151         */
152        @Deprecated
153        public Phoneme join(final Phoneme right) {
154            return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
155                               this.languages.restrictTo(right.languages));
156        }
157
158        /**
159         * Returns a new Phoneme with the same text but a union of its
160         * current language set and the given one.
161         *
162         * @param lang the language set to merge
163         * @return a new Phoneme
164         */
165        public Phoneme mergeWithLanguage(final LanguageSet lang) {
166          return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang));
167        }
168
169        @Override
170        public int size() {
171            return 1;
172        }
173
174        @Override
175        public String toString() {
176          return phonemeText.toString() + "[" + languages + "]";
177        }
178    }
179
180    public interface PhonemeExpr {
181        Iterable<Phoneme> getPhonemes();
182
183        /**
184         * Gets the expression size in phonemes.
185         *
186         * @return the expression size in phonemes.
187         * @since 1.17.0
188         */
189        default int size() {
190            // All implementations are int-bound.
191            return (int) Math.min(getPhonemes().spliterator().getExactSizeIfKnown(), Integer.MAX_VALUE);
192        }
193    }
194
195    public static final class PhonemeList implements PhonemeExpr {
196
197        private final List<Phoneme> phonemeList;
198
199        public PhonemeList(final List<Phoneme> phonemes) {
200            this.phonemeList = phonemes;
201        }
202
203        @Override
204        public List<Phoneme> getPhonemes() {
205            return phonemeList;
206        }
207
208        @Override
209        public int size() {
210            return phonemeList.size();
211        }
212    }
213
214    /**
215     * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations.
216     */
217    public interface RPattern {
218        boolean isMatch(CharSequence input);
219    }
220
221    public static final RPattern ALL_STRINGS_RMATCHER = input -> true;
222
223    public static final String ALL = "ALL";
224
225    private static final String DOUBLE_QUOTE = "\"";
226
227    private static final String HASH_INCLUDE = "#include";
228
229    private static final int HASH_INCLUDE_LENGTH = HASH_INCLUDE.length();
230
231    private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES =
232            new EnumMap<>(NameType.class);
233
234    static {
235        for (final NameType s : NameType.values()) {
236            final Map<RuleType, Map<String, Map<String, List<Rule>>>> rts =
237                    new EnumMap<>(RuleType.class);
238
239            for (final RuleType rt : RuleType.values()) {
240                final Map<String, Map<String, List<Rule>>> rs = new HashMap<>();
241
242                final Languages ls = Languages.getInstance(s);
243                ls.getLanguages().forEach(l -> {
244                    try (final Scanner scanner = createScanner(s, rt, l)) {
245                        rs.put(l, parseRules(scanner, createResourceName(s, rt, l)));
246                    } catch (final IllegalStateException e) {
247                        throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
248                    }
249                });
250                if (!rt.equals(RuleType.RULES)) {
251                    try (final Scanner scanner = createScanner(s, rt, "common")) {
252                        rs.put("common", parseRules(scanner, createResourceName(s, rt, "common")));
253                    }
254                }
255
256                rts.put(rt, Collections.unmodifiableMap(rs));
257            }
258
259            RULES.put(s, Collections.unmodifiableMap(rts));
260        }
261    }
262
263    private static boolean contains(final CharSequence chars, final char input) {
264        return chars.chars().anyMatch(c -> c == input);
265    }
266
267    private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
268        return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt",
269                             nameType.getName(), rt.getName(), lang);
270    }
271
272    @SuppressWarnings("resource") // Closing the Scanner closes the resource
273    private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
274        final String resName = createResourceName(nameType, rt, lang);
275        return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING);
276    }
277
278    @SuppressWarnings("resource") // Closing the Scanner closes the resource
279    private static Scanner createScanner(final String lang) {
280        final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang);
281        return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING);
282    }
283
284    private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
285        final int suffixLength = suffix.length();
286        final int inputLength = input.length();
287
288        if (suffixLength > inputLength) {
289            return false;
290        }
291        for (int i = inputLength - 1, j = suffixLength - 1; j >= 0; i--, j--) {
292            if (input.charAt(i) != suffix.charAt(j)) {
293                return false;
294            }
295        }
296        return true;
297    }
298
299    /**
300     * Gets rules for a combination of name type, rule type and languages.
301     *
302     * @param nameType
303     *            the NameType to consider
304     * @param rt
305     *            the RuleType to consider
306     * @param langs
307     *            the set of languages to consider
308     * @return a list of Rules that apply
309     */
310    public static List<Rule> getInstance(final NameType nameType, final RuleType rt,
311                                         final Languages.LanguageSet langs) {
312        final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs);
313        final List<Rule> allRules = new ArrayList<>();
314        ruleMap.values().forEach(rules -> allRules.addAll(rules));
315        return allRules;
316    }
317
318    /**
319     * Gets rules for a combination of name type, rule type and a single language.
320     *
321     * @param nameType
322     *            the NameType to consider
323     * @param rt
324     *            the RuleType to consider
325     * @param lang
326     *            the language to consider
327     * @return a list of Rules that apply
328     */
329    public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
330        return getInstance(nameType, rt, LanguageSet.from(new HashSet<>(Arrays.asList(lang))));
331    }
332
333    /**
334     * Gets rules for a combination of name type, rule type and languages.
335     *
336     * @param nameType
337     *            the NameType to consider
338     * @param rt
339     *            the RuleType to consider
340     * @param langs
341     *            the set of languages to consider
342     * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
343     * @since 1.9
344     */
345    public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
346                                                         final Languages.LanguageSet langs) {
347        return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) :
348                                     getInstanceMap(nameType, rt, Languages.ANY);
349    }
350
351    /**
352     * Gets rules for a combination of name type, rule type and a single language.
353     *
354     * @param nameType
355     *            the NameType to consider
356     * @param rt
357     *            the RuleType to consider
358     * @param lang
359     *            the language to consider
360     * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
361     * @since 1.9
362     */
363    public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
364                                                         final String lang) {
365        final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang);
366
367        if (rules == null) {
368            throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
369                                               nameType.getName(), rt.getName(), lang));
370        }
371
372        return rules;
373    }
374
375    private static Phoneme parsePhoneme(final String ph) {
376        final int open = ph.indexOf("[");
377        if (open >= 0) {
378            if (!ph.endsWith("]")) {
379                throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
380            }
381            final String before = ph.substring(0, open);
382            final String in = ph.substring(open + 1, ph.length() - 1);
383            final Set<String> langs = new HashSet<>(Arrays.asList(in.split("[+]")));
384
385            return new Phoneme(before, Languages.LanguageSet.from(langs));
386        }
387        return new Phoneme(ph, Languages.ANY_LANGUAGE);
388    }
389
390    private static PhonemeExpr parsePhonemeExpr(final String ph) {
391        if (ph.startsWith("(")) { // we have a bracketed list of options
392            if (!ph.endsWith(")")) {
393                throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
394            }
395
396            final List<Phoneme> phs = new ArrayList<>();
397            final String body = ph.substring(1, ph.length() - 1);
398            for (final String part : body.split("[|]")) {
399                phs.add(parsePhoneme(part));
400            }
401            if (body.startsWith("|") || body.endsWith("|")) {
402                phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
403            }
404
405            return new PhonemeList(phs);
406        }
407        return parsePhoneme(ph);
408    }
409
410    private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) {
411        final Map<String, List<Rule>> lines = new HashMap<>();
412        int currentLine = 0;
413
414        boolean inMultilineComment = false;
415        while (scanner.hasNextLine()) {
416            currentLine++;
417            final String rawLine = scanner.nextLine();
418            String line = rawLine;
419
420            if (inMultilineComment) {
421                if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
422                    inMultilineComment = false;
423                }
424            } else if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
425                inMultilineComment = true;
426            } else {
427                // discard comments
428                final int cmtI = line.indexOf(ResourceConstants.CMT);
429                if (cmtI >= 0) {
430                    line = line.substring(0, cmtI);
431                }
432
433                // trim leading-trailing whitespace
434                line = line.trim();
435
436                if (line.isEmpty()) {
437                    continue; // empty lines can be safely skipped
438                }
439
440                if (line.startsWith(HASH_INCLUDE)) {
441                    // include statement
442                    final String incl = line.substring(HASH_INCLUDE_LENGTH).trim();
443                    if (incl.contains(" ")) {
444                        throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
445                                                           location);
446                    }
447                    try (final Scanner hashIncludeScanner = createScanner(incl)) {
448                        lines.putAll(parseRules(hashIncludeScanner, location + "->" + incl));
449                    }
450                } else {
451                    // rule
452                    final String[] parts = line.split("\\s+");
453                    if (parts.length != 4) {
454                        throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
455                                                           " parts: " + rawLine + " in " + location);
456                    }
457                    try {
458                        final String pat = stripQuotes(parts[0]);
459                        final String lCon = stripQuotes(parts[1]);
460                        final String rCon = stripQuotes(parts[2]);
461                        final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
462                        final int cLine = currentLine;
463                        final Rule r = new Rule(pat, lCon, rCon, ph) {
464                            private final int myLine = cLine;
465                            private final String loc = location;
466
467                            @Override
468                            public String toString() {
469                                final StringBuilder sb = new StringBuilder();
470                                sb.append("Rule");
471                                sb.append("{line=").append(myLine);
472                                sb.append(", loc='").append(loc).append('\'');
473                                sb.append(", pat='").append(pat).append('\'');
474                                sb.append(", lcon='").append(lCon).append('\'');
475                                sb.append(", rcon='").append(rCon).append('\'');
476                                sb.append('}');
477                                return sb.toString();
478                            }
479                        };
480                        final String patternKey = r.pattern.substring(0, 1);
481                        final List<Rule> rules = lines.computeIfAbsent(patternKey, k -> new ArrayList<>());
482                        rules.add(r);
483                    } catch (final IllegalArgumentException e) {
484                        throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
485                                                        location, e);
486                    }
487                }
488            }
489        }
490
491        return lines;
492    }
493
494    /**
495     * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case.
496     *
497     * @param regex
498     *            the regular expression to compile
499     * @return an RPattern that will match this regex
500     */
501    private static RPattern pattern(final String regex) {
502        final boolean startsWith = regex.startsWith("^");
503        final boolean endsWith = regex.endsWith("$");
504        final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
505        final boolean boxes = content.contains("[");
506
507        if (!boxes) {
508            if (startsWith && endsWith) {
509                // exact match
510                if (content.isEmpty()) {
511                    // empty
512                    return input -> input.length() == 0;
513                }
514                return input -> input.equals(content);
515            }
516            if ((startsWith || endsWith) && content.isEmpty()) {
517                // matches every string
518                return ALL_STRINGS_RMATCHER;
519            }
520            if (startsWith) {
521                // matches from start
522                return input -> startsWith(input, content);
523            }
524            if (endsWith) {
525                // matches from start
526                return input -> endsWith(input, content);
527            }
528        } else {
529            final boolean startsWithBox = content.startsWith("[");
530            final boolean endsWithBox = content.endsWith("]");
531
532            if (startsWithBox && endsWithBox) {
533                String boxContent = content.substring(1, content.length() - 1);
534                if (!boxContent.contains("[")) {
535                    // box containing alternatives
536                    final boolean negate = boxContent.startsWith("^");
537                    if (negate) {
538                        boxContent = boxContent.substring(1);
539                    }
540                    final String bContent = boxContent;
541                    final boolean shouldMatch = !negate;
542
543                    if (startsWith && endsWith) {
544                        // exact match
545                        return input -> input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
546                    }
547                    if (startsWith) {
548                        // first char
549                        return input -> input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
550                    }
551                    if (endsWith) {
552                        // last char
553                        return input -> input.length() > 0 &&
554                               contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
555                    }
556                }
557            }
558        }
559
560        return new RPattern() {
561            final Pattern pattern = Pattern.compile(regex);
562
563            @Override
564            public boolean isMatch(final CharSequence input) {
565                final Matcher matcher = pattern.matcher(input);
566                return matcher.find();
567            }
568        };
569    }
570
571    private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
572        if (prefix.length() > input.length()) {
573            return false;
574        }
575        for (int i = 0; i < prefix.length(); i++) {
576            if (input.charAt(i) != prefix.charAt(i)) {
577                return false;
578            }
579        }
580        return true;
581    }
582
583    private static String stripQuotes(String str) {
584        if (str.startsWith(DOUBLE_QUOTE)) {
585            str = str.substring(1);
586        }
587
588        if (str.endsWith(DOUBLE_QUOTE)) {
589            str = str.substring(0, str.length() - 1);
590        }
591
592        return str;
593    }
594
595    private final RPattern lContext;
596
597    private final String pattern;
598
599    private final PhonemeExpr phoneme;
600
601    private final RPattern rContext;
602
603    /**
604     * Creates a new rule.
605     *
606     * @param pattern
607     *            the pattern
608     * @param lContext
609     *            the left context
610     * @param rContext
611     *            the right context
612     * @param phoneme
613     *            the resulting phoneme
614     */
615    public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
616        this.pattern = pattern;
617        this.lContext = pattern(lContext + "$");
618        this.rContext = pattern("^" + rContext);
619        this.phoneme = phoneme;
620    }
621
622    /**
623     * Gets the left context. This is a regular expression that must match to the left of the pattern.
624     *
625     * @return the left context Pattern
626     */
627    public RPattern getLContext() {
628        return this.lContext;
629    }
630
631    /**
632     * Gets the pattern. This is a string-literal that must exactly match.
633     *
634     * @return the pattern
635     */
636    public String getPattern() {
637        return this.pattern;
638    }
639
640    /**
641     * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
642     *
643     * @return the phoneme
644     */
645    public PhonemeExpr getPhoneme() {
646        return this.phoneme;
647    }
648
649    /**
650     * Gets the right context. This is a regular expression that must match to the right of the pattern.
651     *
652     * @return the right context Pattern
653     */
654    public RPattern getRContext() {
655        return this.rContext;
656    }
657
658    /**
659     * Decides if the pattern and context match the input starting at a position. It is a match if the
660     * {@code lContext} matches {@code input} up to {@code i}, {@code pattern} matches at i and
661     * {@code rContext} matches from the end of the match of {@code pattern} to the end of {@code input}.
662     *
663     * @param input
664     *            the input String
665     * @param i
666     *            the int position within the input
667     * @return true if the pattern and left/right context match, false otherwise
668     */
669    public boolean patternAndContextMatches(final CharSequence input, final int i) {
670        if (i < 0) {
671            throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
672        }
673
674        final int patternLength = this.pattern.length();
675        final int ipl = i + patternLength;
676
677        if (ipl > input.length()) {
678            // not enough room for the pattern to match
679            return false;
680        }
681
682        // evaluate the pattern, left context and right context
683        // fail early if any of the evaluations is not successful
684        if (!input.subSequence(i, ipl).equals(this.pattern)) {
685            return false;
686        }
687        if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
688            return false;
689        }
690        return this.lContext.isMatch(input.subSequence(0, i));
691    }
692}