Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language.bm;
019
020import java.util.ArrayList;
021import java.util.Arrays;
022import java.util.Collections;
023import java.util.EnumMap;
024import java.util.HashSet;
025import java.util.List;
026import java.util.Locale;
027import java.util.Map;
028import java.util.Scanner;
029import java.util.Set;
030import java.util.regex.Pattern;
031
032import org.apache.commons.codec.Resources;
033
034/**
035 * Language guessing utility.
036 * <p>
037 * This class encapsulates rules used to guess the possible languages that a word originates from. This is
038 * done by reference to a whole series of rules distributed in resource files.
039 * </p>
040 * <p>
041 * Instances of this class are typically managed through the static factory method instance().
042 * Unless you are developing your own language guessing rules, you will not need to interact with this class directly.
043 * </p>
044 * <p>
045 * This class is intended to be immutable and thread-safe.
046 * </p>
047 * <h2>Lang resources</h2>
048 * <p>
049 * Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files.
050 * They are systematically named following the pattern:
051 * </p>
052 * <blockquote>org/apache/commons/codec/language/bm/lang.txt</blockquote>
053 * <p>
054 * The format of these resources is the following:
055 * </p>
056 * <ul>
057 * <li><strong>Rules:</strong> whitespace separated strings.
058 * There should be 3 columns to each row, and these will be interpreted as:
059 * <ol>
060 * <li>pattern: a regular expression.</li>
061 * <li>languages: a '+'-separated list of languages.</li>
062 * <li>acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.</li>
063 * </ol>
064 * </li>
065 * <li><strong>End-of-line comments:</strong> Any occurrence of '//' will cause all text following on that line to be
066 * discarded as a comment.</li>
067 * <li><strong>Multi-line comments:</strong> Any line starting with '/*' will start multi-line commenting mode.
068 * This will skip all content until a line ending in '*' and '/' is found.</li>
069 * <li><strong>Blank lines:</strong> All blank lines will be skipped.</li>
070 * </ul>
071 * <p>
072 * Port of lang.php
073 * </p>
074 *
075 * @since 1.6
076 */
077public class Lang {
078    // Implementation note: This class is divided into two sections. The first part is a static factory interface that
079    // exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that
080    // encapsulate a particular language-guessing rule table and the language guessing itself.
081    //
082    // It may make sense in the future to expose the private constructor to allow power users to build custom language-
083    // guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users
084    // should be strongly encouraged to use the static factory {@code instance} method to get their Lang instances.
085
086    private static final class LangRule {
087        private final boolean acceptOnMatch;
088        private final Set<String> languages;
089        private final Pattern pattern;
090
091        private LangRule(final Pattern pattern, final Set<String> languages, final boolean acceptOnMatch) {
092            this.pattern = pattern;
093            this.languages = languages;
094            this.acceptOnMatch = acceptOnMatch;
095        }
096
097        public boolean matches(final String txt) {
098            return this.pattern.matcher(txt).find();
099        }
100    }
101
102    private static final Map<NameType, Lang> LANGS = new EnumMap<>(NameType.class);
103
104    private static final String LANGUAGE_RULES_RN = "/org/apache/commons/codec/language/bm/%s_lang.txt";
105
106    static {
107        for (final NameType s : NameType.values()) {
108            LANGS.put(s, loadFromResource(String.format(LANGUAGE_RULES_RN, s.getName()), Languages.getInstance(s)));
109        }
110    }
111
112    /**
113     * Gets a Lang instance for one of the supported NameTypes.
114     *
115     * @param nameType
116     *            the NameType to look up
117     * @return a Lang encapsulating the language guessing rules for that name type
118     */
119    public static Lang instance(final NameType nameType) {
120        return LANGS.get(nameType);
121    }
122
123    /**
124     * Loads language rules from a resource.
125     * <p>
126     * In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method.
127     * You will only need to call this yourself if you are developing custom language mapping rules.
128     * </p>
129     *
130     * @param languageRulesResourceName
131     *            the fully-qualified resource name to load
132     * @param languages
133     *            the languages that these rules will support
134     * @return a Lang encapsulating the loaded language-guessing rules.
135     */
136    public static Lang loadFromResource(final String languageRulesResourceName, final Languages languages) {
137        final List<LangRule> rules = new ArrayList<>();
138        try (Scanner scanner = new Scanner(Resources.getInputStream(languageRulesResourceName),
139                ResourceConstants.ENCODING)) {
140            boolean inExtendedComment = false;
141            while (scanner.hasNextLine()) {
142                final String rawLine = scanner.nextLine();
143                String line = rawLine;
144                if (inExtendedComment) {
145                    // check for closing comment marker, otherwise discard doc comment line
146                    if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
147                        inExtendedComment = false;
148                    }
149                } else if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
150                    inExtendedComment = true;
151                } else {
152                    // discard comments
153                    final int cmtI = line.indexOf(ResourceConstants.CMT);
154                    if (cmtI >= 0) {
155                        line = line.substring(0, cmtI);
156                    }
157
158                    // trim leading-trailing whitespace
159                    line = line.trim();
160
161                    if (line.isEmpty()) {
162                        continue; // empty lines can be safely skipped
163                    }
164
165                    // split it up
166                    final String[] parts = line.split("\\s+");
167
168                    if (parts.length != 3) {
169                        throw new IllegalArgumentException("Malformed line '" + rawLine +
170                                "' in language resource '" + languageRulesResourceName + "'");
171                    }
172
173                    final Pattern pattern = Pattern.compile(parts[0]);
174                    final String[] langs = parts[1].split("\\+");
175                    final boolean accept = parts[2].equals("true");
176
177                    rules.add(new LangRule(pattern, new HashSet<>(Arrays.asList(langs)), accept));
178                }
179            }
180        }
181        return new Lang(rules, languages);
182    }
183
184    private final Languages languages;
185
186    private final List<LangRule> rules;
187
188    private Lang(final List<LangRule> rules, final Languages languages) {
189        this.rules = Collections.unmodifiableList(rules);
190        this.languages = languages;
191    }
192
193    /**
194     * Guesses the language of a word.
195     *
196     * @param text
197     *            the word
198     * @return the language that the word originates from or {@link Languages#ANY} if there was no unique match
199     */
200    public String guessLanguage(final String text) {
201        final Languages.LanguageSet ls = guessLanguages(text);
202        return ls.isSingleton() ? ls.getAny() : Languages.ANY;
203    }
204
205    /**
206     * Guesses the languages of a word.
207     *
208     * @param input
209     *            the word
210     * @return a Set of Strings of language names that are potential matches for the input word
211     */
212    public Languages.LanguageSet guessLanguages(final String input) {
213        final String text = input.toLowerCase(Locale.ENGLISH);
214        final Set<String> langs = new HashSet<>(this.languages.getLanguages());
215        rules.forEach(rule -> {
216            if (rule.matches(text)) {
217                if (rule.acceptOnMatch) {
218                    langs.retainAll(rule.languages);
219                } else {
220                    langs.removeAll(rule.languages);
221                }
222            }
223        });
224        final Languages.LanguageSet ls = Languages.LanguageSet.from(langs);
225        return ls.equals(Languages.NO_LANGUAGES) ? Languages.ANY_LANGUAGE : ls;
226    }
227}