1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.codec.language.bm;
19
20 import org.apache.commons.codec.EncoderException;
21 import org.apache.commons.codec.StringEncoder;
22
23 /**
24 * Encodes strings into their Beider-Morse phonetic encoding.
25 * <p>
26 * Beider-Morse phonetic encodings are optimized for family names. However, they may be useful for a wide range of
27 * words.
28 * </p>
29 * <p>
30 * This encoder is intentionally mutable to allow dynamic configuration through bean properties. As such, it is mutable,
31 * and may not be thread-safe. If you require a guaranteed thread-safe encoding then use {@link PhoneticEngine}
32 * directly.
33 * </p>
34 * <h2>Encoding overview</h2>
35 * <p>
36 * Beider-Morse phonetic encodings is a multi-step process. Firstly, a table of rules is consulted to guess what
37 * language the word comes from. For example, if it ends in "{@code ault}" then it infers that the word is French.
38 * Next, the word is translated into a phonetic representation using a language-specific phonetics table. Some runs of
39 * letters can be pronounced in multiple ways, and a single run of letters may be potentially broken up into phonemes at
40 * different places, so this stage results in a set of possible language-specific phonetic representations. Lastly, this
41 * language-specific phonetic representation is processed by a table of rules that re-writes it phonetically taking into
42 * account systematic pronunciation differences between languages, to move it towards a pan-indo-european phonetic
43 * representation. Again, sometimes there are multiple ways this could be done and sometimes things that can be
44 * pronounced in several ways in the source language have only one way to represent them in this average phonetic
45 * language, so the result is again a set of phonetic spellings.
46 * </p>
47 * <p>
48 * Some names are treated as having multiple parts. This can be due to two things. Firstly, they may be hyphenated. In
49 * this case, each individual hyphenated word is encoded, and then these are combined end-to-end for the final encoding.
50 * Secondly, some names have standard prefixes, for example, "{@code Mac/Mc}" in Scottish (English) names. As
51 * sometimes it is ambiguous whether the prefix is intended or is an accident of the spelling, the word is encoded once
52 * with the prefix and once without it. The resulting encoding contains one and then the other result.
53 * </p>
54 * <h2>Encoding format</h2>
55 * <p>
56 * Individual phonetic spellings of an input word are represented in upper- and lower-case roman characters. Where there
57 * are multiple possible phonetic representations, these are joined with a pipe ({@code |}) character. If multiple
58 * hyphenated words where found, or if the word may contain a name prefix, each encoded word is placed in ellipses and
59 * these blocks are then joined with hyphens. For example, "{@code d'ortley}" has a possible prefix. The form
60 * without prefix encodes to "{@code ortlaj|ortlej}", while the form with prefix encodes to "
61 * {@code dortlaj|dortlej}". Thus, the full, combined encoding is "{@code (ortlaj|ortlej)-(dortlaj|dortlej)}".
62 * </p>
63 * <p>
64 * The encoded forms are often quite a bit longer than the input strings. This is because a single input may have many
65 * potential phonetic interpretations. For example, "{@code Renault}" encodes to "
66 * {@code rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult}". The {@code APPROX} rules will tend to produce larger
67 * encodings as they consider a wider range of possible, approximate phonetic interpretations of the original word.
68 * Down-stream applications may wish to further process the encoding for indexing or lookup purposes, for example, by
69 * splitting on pipe ({@code |}) and indexing under each of these alternatives.
70 * </p>
71 * <p>
72 * <b>Note</b>: this version of the Beider-Morse encoding is equivalent with v3.4 of the reference implementation.
73 * </p>
74 * @see <a href="https://stevemorse.org/phonetics/bmpm.htm">Beider-Morse Phonetic Matching</a>
75 * @see <a href="https://stevemorse.org/phoneticinfo.htm">Reference implementation</a>
76 *
77 * <p>
78 * This class is Not ThreadSafe
79 * </p>
80 * @since 1.6
81 */
82 public class BeiderMorseEncoder implements StringEncoder {
83 // Implementation note: This class is a spring-friendly facade to PhoneticEngine. It allows read/write configuration
84 // of an immutable PhoneticEngine instance that will be delegated to for the actual encoding.
85
86 // a cached object
87 private PhoneticEngine engine = new PhoneticEngine(NameType.GENERIC, RuleType.APPROX, true);
88
89 @Override
90 public Object encode(final Object source) throws EncoderException {
91 if (!(source instanceof String)) {
92 throw new EncoderException("BeiderMorseEncoder encode parameter is not of type String");
93 }
94 return encode((String) source);
95 }
96
97 @Override
98 public String encode(final String source) throws EncoderException {
99 if (source == null) {
100 return null;
101 }
102 return this.engine.encode(source);
103 }
104
105 /**
106 * Gets the name type currently in operation.
107 *
108 * @return the NameType currently being used
109 */
110 public NameType getNameType() {
111 return this.engine.getNameType();
112 }
113
114 /**
115 * Gets the rule type currently in operation.
116 *
117 * @return the RuleType currently being used
118 */
119 public RuleType getRuleType() {
120 return this.engine.getRuleType();
121 }
122
123 /**
124 * Discovers if multiple possible encodings are concatenated.
125 *
126 * @return true if multiple encodings are concatenated, false if just the first one is returned
127 */
128 public boolean isConcat() {
129 return this.engine.isConcat();
130 }
131
132 /**
133 * Sets how multiple possible phonetic encodings are combined.
134 *
135 * @param concat
136 * true if multiple encodings are to be combined with a '|', false if just the first one is
137 * to be considered
138 */
139 public void setConcat(final boolean concat) {
140 this.engine = new PhoneticEngine(this.engine.getNameType(),
141 this.engine.getRuleType(),
142 concat,
143 this.engine.getMaxPhonemes());
144 }
145
146 /**
147 * Sets the number of maximum of phonemes that shall be considered by the engine.
148 *
149 * @param maxPhonemes
150 * the maximum number of phonemes returned by the engine
151 * @since 1.7
152 */
153 public void setMaxPhonemes(final int maxPhonemes) {
154 this.engine = new PhoneticEngine(this.engine.getNameType(),
155 this.engine.getRuleType(),
156 this.engine.isConcat(),
157 maxPhonemes);
158 }
159
160 /**
161 * Sets the type of name. Use {@link NameType#GENERIC} unless you specifically want phonetic encodings
162 * optimized for Ashkenazi or Sephardic Jewish family names.
163 *
164 * @param nameType
165 * the NameType in use
166 */
167 public void setNameType(final NameType nameType) {
168 this.engine = new PhoneticEngine(nameType,
169 this.engine.getRuleType(),
170 this.engine.isConcat(),
171 this.engine.getMaxPhonemes());
172 }
173
174 /**
175 * Sets the rule type to apply. This will widen or narrow the range of phonetic encodings considered.
176 *
177 * @param ruleType
178 * {@link RuleType#APPROX} or {@link RuleType#EXACT} for approximate or exact phonetic matches
179 */
180 public void setRuleType(final RuleType ruleType) {
181 this.engine = new PhoneticEngine(this.engine.getNameType(),
182 ruleType,
183 this.engine.isConcat(),
184 this.engine.getMaxPhonemes());
185 }
186
187 }