View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language.bm;
19  
20  import java.util.ArrayList;
21  import java.util.Arrays;
22  import java.util.Collections;
23  import java.util.EnumMap;
24  import java.util.HashSet;
25  import java.util.List;
26  import java.util.Locale;
27  import java.util.Map;
28  import java.util.Scanner;
29  import java.util.Set;
30  import java.util.regex.Pattern;
31  
32  import org.apache.commons.codec.Resources;
33  
34  /**
35   * Language guessing utility.
36   * <p>
37   * This class encapsulates rules used to guess the possible languages that a word originates from. This is
38   * done by reference to a whole series of rules distributed in resource files.
39   * </p>
40   * <p>
41   * Instances of this class are typically managed through the static factory method instance().
42   * Unless you are developing your own language guessing rules, you will not need to interact with this class directly.
43   * </p>
44   * <p>
45   * This class is intended to be immutable and thread-safe.
46   * </p>
47   * <h2>Lang resources</h2>
48   * <p>
49   * Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files.
50   * They are systematically named following the pattern:
51   * </p>
52   * <blockquote>org/apache/commons/codec/language/bm/lang.txt</blockquote>
53   * <p>
54   * The format of these resources is the following:
55   * </p>
56   * <ul>
57   * <li><strong>Rules:</strong> whitespace separated strings.
58   * There should be 3 columns to each row, and these will be interpreted as:
59   * <ol>
60   * <li>pattern: a regular expression.</li>
61   * <li>languages: a '+'-separated list of languages.</li>
62   * <li>acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.</li>
63   * </ol>
64   * </li>
65   * <li><strong>End-of-line comments:</strong> Any occurrence of '//' will cause all text following on that line to be
66   * discarded as a comment.</li>
67   * <li><strong>Multi-line comments:</strong> Any line starting with '/*' will start multi-line commenting mode.
68   * This will skip all content until a line ending in '*' and '/' is found.</li>
69   * <li><strong>Blank lines:</strong> All blank lines will be skipped.</li>
70   * </ul>
71   * <p>
72   * Port of lang.php
73   * </p>
74   *
75   * @since 1.6
76   */
77  public class Lang {
78      // Implementation note: This class is divided into two sections. The first part is a static factory interface that
79      // exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that
80      // encapsulate a particular language-guessing rule table and the language guessing itself.
81      //
82      // It may make sense in the future to expose the private constructor to allow power users to build custom language-
83      // guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users
84      // should be strongly encouraged to use the static factory {@code instance} method to get their Lang instances.
85  
86      private static final class LangRule {
87          private final boolean acceptOnMatch;
88          private final Set<String> languages;
89          private final Pattern pattern;
90  
91          private LangRule(final Pattern pattern, final Set<String> languages, final boolean acceptOnMatch) {
92              this.pattern = pattern;
93              this.languages = languages;
94              this.acceptOnMatch = acceptOnMatch;
95          }
96  
97          public boolean matches(final String txt) {
98              return this.pattern.matcher(txt).find();
99          }
100     }
101 
102     private static final Map<NameType, Lang> LANGS = new EnumMap<>(NameType.class);
103 
104     private static final String LANGUAGE_RULES_RN = "/org/apache/commons/codec/language/bm/%s_lang.txt";
105 
106     static {
107         for (final NameType s : NameType.values()) {
108             LANGS.put(s, loadFromResource(String.format(LANGUAGE_RULES_RN, s.getName()), Languages.getInstance(s)));
109         }
110     }
111 
112     /**
113      * Gets a Lang instance for one of the supported NameTypes.
114      *
115      * @param nameType
116      *            the NameType to look up
117      * @return a Lang encapsulating the language guessing rules for that name type
118      */
119     public static Lang instance(final NameType nameType) {
120         return LANGS.get(nameType);
121     }
122 
123     /**
124      * Loads language rules from a resource.
125      * <p>
126      * In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method.
127      * You will only need to call this yourself if you are developing custom language mapping rules.
128      * </p>
129      *
130      * @param languageRulesResourceName
131      *            the fully-qualified resource name to load
132      * @param languages
133      *            the languages that these rules will support
134      * @return a Lang encapsulating the loaded language-guessing rules.
135      */
136     public static Lang loadFromResource(final String languageRulesResourceName, final Languages languages) {
137         final List<LangRule> rules = new ArrayList<>();
138         try (Scanner scanner = new Scanner(Resources.getInputStream(languageRulesResourceName),
139                 ResourceConstants.ENCODING)) {
140             boolean inExtendedComment = false;
141             while (scanner.hasNextLine()) {
142                 final String rawLine = scanner.nextLine();
143                 String line = rawLine;
144                 if (inExtendedComment) {
145                     // check for closing comment marker, otherwise discard doc comment line
146                     if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
147                         inExtendedComment = false;
148                     }
149                 } else if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
150                     inExtendedComment = true;
151                 } else {
152                     // discard comments
153                     final int cmtI = line.indexOf(ResourceConstants.CMT);
154                     if (cmtI >= 0) {
155                         line = line.substring(0, cmtI);
156                     }
157 
158                     // trim leading-trailing whitespace
159                     line = line.trim();
160 
161                     if (line.isEmpty()) {
162                         continue; // empty lines can be safely skipped
163                     }
164 
165                     // split it up
166                     final String[] parts = line.split("\\s+");
167 
168                     if (parts.length != 3) {
169                         throw new IllegalArgumentException("Malformed line '" + rawLine +
170                                 "' in language resource '" + languageRulesResourceName + "'");
171                     }
172 
173                     final Pattern pattern = Pattern.compile(parts[0]);
174                     final String[] langs = parts[1].split("\\+");
175                     final boolean accept = parts[2].equals("true");
176 
177                     rules.add(new LangRule(pattern, new HashSet<>(Arrays.asList(langs)), accept));
178                 }
179             }
180         }
181         return new Lang(rules, languages);
182     }
183 
184     private final Languages languages;
185 
186     private final List<LangRule> rules;
187 
188     private Lang(final List<LangRule> rules, final Languages languages) {
189         this.rules = Collections.unmodifiableList(rules);
190         this.languages = languages;
191     }
192 
193     /**
194      * Guesses the language of a word.
195      *
196      * @param text
197      *            the word
198      * @return the language that the word originates from or {@link Languages#ANY} if there was no unique match
199      */
200     public String guessLanguage(final String text) {
201         final Languages.LanguageSet ls = guessLanguages(text);
202         return ls.isSingleton() ? ls.getAny() : Languages.ANY;
203     }
204 
205     /**
206      * Guesses the languages of a word.
207      *
208      * @param input
209      *            the word
210      * @return a Set of Strings of language names that are potential matches for the input word
211      */
212     public Languages.LanguageSet guessLanguages(final String input) {
213         final String text = input.toLowerCase(Locale.ENGLISH);
214         final Set<String> langs = new HashSet<>(this.languages.getLanguages());
215         rules.forEach(rule -> {
216             if (rule.matches(text)) {
217                 if (rule.acceptOnMatch) {
218                     langs.retainAll(rule.languages);
219                 } else {
220                     langs.removeAll(rule.languages);
221                 }
222             }
223         });
224         final Languages.LanguageSet ls = Languages.LanguageSet.from(langs);
225         return ls.equals(Languages.NO_LANGUAGES) ? Languages.ANY_LANGUAGE : ls;
226     }
227 }