/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.language.bm;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.codec.Resources;

/**
 * Language guessing utility.
 * <p>
 * This class encapsulates rules used to guess the possible languages that a word originates from. This is
 * done by reference to a whole series of rules distributed in resource files.
 * </p>
 * <p>
 * Instances of this class are typically managed through the static factory method
 * {@link #instance(NameType)}. Unless you are developing your own language guessing rules, you will not need to
 * interact with this class directly.
 * </p>
 * <p>
 * This class is intended to be immutable and thread-safe.
 * </p>
 * <h2>Lang resources</h2>
 * <p>
 * Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files.
 * There is one file per {@link NameType}, and they are systematically named following the pattern:
 * </p>
 * <blockquote>org/apache/commons/codec/language/bm/${nameType}_lang.txt</blockquote>
 * <p>
 * The format of these resources is the following:
 * </p>
 * <ul>
 * <li><strong>Rules:</strong> whitespace separated strings.
 * There should be 3 columns to each row, and these will be interpreted as:
 * <ol>
 * <li>pattern: a regular expression.</li>
 * <li>languages: a '+'-separated list of languages.</li>
 * <li>acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.
 * Only the exact text 'true' is read as accept; any other value is read as 'false'.</li>
 * </ol>
 * </li>
 * <li><strong>End-of-line comments:</strong> Any occurrence of '//' will cause all text following on that line to be
 * discarded as a comment.</li>
 * <li><strong>Multi-line comments:</strong> Any line starting with '/*' will start multi-line commenting mode.
 * This will skip all content until a line ending in '*' and '/' is found. If the line that starts the comment
 * also ends in '*' and '/', the comment is confined to that single line.</li>
 * <li><strong>Blank lines:</strong> All blank lines will be skipped.</li>
 * </ul>
 * <p>
 * Port of lang.php
 * </p>
 *
 * @since 1.6
 */
public class Lang {
    // Implementation note: This class is divided into two sections. The first part is a static factory interface that
    // exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that
    // encapsulate a particular language-guessing rule table and the language guessing itself.
    //
    // It may make sense in the future to expose the private constructor to allow power users to build custom language-
    // guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users
    // should be strongly encouraged to use the static factory {@code instance} method to get their Lang instances.

    /**
     * A single language-guessing rule: a pattern, the languages it speaks about, and whether a match
     * rules those languages in or out.
     */
    private static final class LangRule {
        /** {@code true} if a match keeps only {@link #languages}; {@code false} if a match removes them. */
        private final boolean acceptOnMatch;

        /** The language names this rule applies to. */
        private final Set<String> languages;

        /** The regular expression searched for in the lower-cased input word. */
        private final Pattern pattern;

        private LangRule(final Pattern pattern, final Set<String> languages, final boolean acceptOnMatch) {
            this.pattern = pattern;
            this.languages = languages;
            this.acceptOnMatch = acceptOnMatch;
        }

        /**
         * Tests whether this rule's pattern occurs anywhere in the given text.
         *
         * @param txt
         *            the text to search
         * @return {@code true} if the pattern is found in {@code txt}
         */
        public boolean matches(final String txt) {
            return this.pattern.matcher(txt).find();
        }
    }

    /** One Lang per NameType, populated once by the static initializer below. */
    private static final Map<NameType, Lang> LANGS = new EnumMap<>(NameType.class);

    /** Format string for the per-NameType rule resource; {@code %s} is replaced with the NameType's name. */
    private static final String LANGUAGE_RULES_RN = "/org/apache/commons/codec/language/bm/%s_lang.txt";

    static {
        // Eagerly load the rule table of every NameType when the class is initialized.
        for (final NameType s : NameType.values()) {
            LANGS.put(s, loadFromResource(String.format(LANGUAGE_RULES_RN, s.getName()), Languages.getInstance(s)));
        }
    }

    /**
     * Gets a Lang instance for one of the supported NameTypes.
     *
     * @param nameType
     *            the NameType to look up
     * @return a Lang encapsulating the language guessing rules for that name type
     */
    public static Lang instance(final NameType nameType) {
        return LANGS.get(nameType);
    }

    /**
     * Loads language rules from a resource.
     * <p>
     * In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method.
     * You will only need to call this yourself if you are developing custom language mapping rules.
     * </p>
     *
     * @param languageRulesResourceName
     *            the fully-qualified resource name to load
     * @param languages
     *            the languages that these rules will support
     * @return a Lang encapsulating the loaded language-guessing rules.
     * @throws IllegalArgumentException
     *             if a rule line does not consist of exactly three whitespace-separated columns, or if its
     *             pattern column is not a valid regular expression
     */
    public static Lang loadFromResource(final String languageRulesResourceName, final Languages languages) {
        final List<LangRule> rules = new ArrayList<>();
        try (Scanner scanner = new Scanner(Resources.getInputStream(languageRulesResourceName),
                ResourceConstants.ENCODING)) {
            boolean inExtendedComment = false;
            while (scanner.hasNextLine()) {
                final String rawLine = scanner.nextLine();
                String line = rawLine;
                if (inExtendedComment) {
                    // check for closing comment marker, otherwise discard doc comment line
                    if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
                        inExtendedComment = false;
                    }
                } else if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
                    // A block comment may open and close on the same line; only stay in comment mode when
                    // the closing marker is absent, otherwise every following rule would be silently
                    // discarded. The length check keeps a line made of overlapping markers from being
                    // read as both an opener and a closer.
                    final boolean closedOnSameLine =
                            line.length() >= ResourceConstants.EXT_CMT_START.length()
                                    + ResourceConstants.EXT_CMT_END.length()
                            && line.endsWith(ResourceConstants.EXT_CMT_END);
                    inExtendedComment = !closedOnSameLine;
                } else {
                    // discard comments
                    final int cmtI = line.indexOf(ResourceConstants.CMT);
                    if (cmtI >= 0) {
                        line = line.substring(0, cmtI);
                    }

                    // trim leading-trailing whitespace
                    line = line.trim();

                    if (line.isEmpty()) {
                        continue; // empty lines can be safely skipped
                    }

                    // split it up
                    final String[] parts = line.split("\\s+");

                    if (parts.length != 3) {
                        throw new IllegalArgumentException("Malformed line '" + rawLine +
                                "' in language resource '" + languageRulesResourceName + "'");
                    }

                    final Pattern pattern = Pattern.compile(parts[0]);
                    final String[] langs = parts[1].split("\\+");
                    // only the exact text "true" accepts; every other value rejects
                    final boolean accept = parts[2].equals("true");

                    rules.add(new LangRule(pattern, new HashSet<>(Arrays.asList(langs)), accept));
                }
            }
        }
        return new Lang(rules, languages);
    }

    /** The full set of languages that guessing starts from. */
    private final Languages languages;

    /** The rules, applied in the order in which they were read from the resource. */
    private final List<LangRule> rules;

    /**
     * Creates a Lang over the given rule table.
     *
     * @param rules
     *            the rules to apply, in order; exposed internally through an unmodifiable view
     * @param languages
     *            the languages that these rules support
     */
    private Lang(final List<LangRule> rules, final Languages languages) {
        this.rules = Collections.unmodifiableList(rules);
        this.languages = languages;
    }

    /**
     * Guesses the language of a word.
     *
     * @param text
     *            the word
     * @return the language that the word originates from or {@link Languages#ANY} if there was no unique match
     */
    public String guessLanguage(final String text) {
        final Languages.LanguageSet ls = guessLanguages(text);
        return ls.isSingleton() ? ls.getAny() : Languages.ANY;
    }

    /**
     * Guesses the languages of a word.
     * <p>
     * The word is lower-cased using {@link Locale#ENGLISH} and every rule is then applied in order, starting
     * from the full set of supported languages: a matching accept rule keeps only that rule's languages, a
     * matching reject rule removes them.
     * </p>
     *
     * @param input
     *            the word
     * @return a LanguageSet of the language names that are potential matches for the input word, or
     *         {@link Languages#ANY_LANGUAGE} if the rules eliminated every language
     */
    public Languages.LanguageSet guessLanguages(final String input) {
        final String text = input.toLowerCase(Locale.ENGLISH);
        final Set<String> langs = new HashSet<>(this.languages.getLanguages());
        rules.forEach(rule -> {
            if (rule.matches(text)) {
                if (rule.acceptOnMatch) {
                    langs.retainAll(rule.languages);
                } else {
                    langs.removeAll(rule.languages);
                }
            }
        });
        final Languages.LanguageSet ls = Languages.LanguageSet.from(langs);
        // An empty result carries no information, so report "any language" instead.
        return ls.equals(Languages.NO_LANGUAGES) ? Languages.ANY_LANGUAGE : ls;
    }
}