View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language.bm;
19  
20  import org.apache.commons.codec.EncoderException;
21  import org.apache.commons.codec.StringEncoder;
22  
23  /**
24   * Encodes strings into their Beider-Morse phonetic encoding.
25   * <p>
26   * Beider-Morse phonetic encodings are optimized for family names. However, they may be useful for a wide range of
27   * words.
28   * </p>
29   * <p>
30   * This encoder is intentionally mutable to allow dynamic configuration through bean properties. As such, it may
31   * not be thread-safe. If you require a guaranteed thread-safe encoding then use {@link PhoneticEngine}
32   * directly.
33   * </p>
34   * <h2>Encoding overview</h2>
35   * <p>
36   * Beider-Morse phonetic encoding is a multi-step process. Firstly, a table of rules is consulted to guess what
37   * language the word comes from. For example, if it ends in "{@code ault}" then it infers that the word is French.
38   * Next, the word is translated into a phonetic representation using a language-specific phonetics table. Some runs of
39   * letters can be pronounced in multiple ways, and a single run of letters may be potentially broken up into phonemes at
40   * different places, so this stage results in a set of possible language-specific phonetic representations. Lastly, this
41   * language-specific phonetic representation is processed by a table of rules that re-writes it phonetically taking into
42   * account systematic pronunciation differences between languages, to move it towards a pan-indo-european phonetic
43   * representation. Again, sometimes there are multiple ways this could be done and sometimes things that can be
44   * pronounced in several ways in the source language have only one way to represent them in this average phonetic
45   * language, so the result is again a set of phonetic spellings.
46   * </p>
47   * <p>
48   * Some names are treated as having multiple parts. This can be due to two things. Firstly, they may be hyphenated. In
49   * this case, each individual hyphenated word is encoded, and then these are combined end-to-end for the final encoding.
50   * Secondly, some names have standard prefixes, for example, "{@code Mac/Mc}" in Scottish (English) names. As
51   * sometimes it is ambiguous whether the prefix is intended or is an accident of the spelling, the word is encoded once
52   * with the prefix and once without it. The resulting encoding contains one and then the other result.
53   * </p>
54   * <h2>Encoding format</h2>
55   * <p>
56   * Individual phonetic spellings of an input word are represented in upper- and lower-case roman characters. Where there
57   * are multiple possible phonetic representations, these are joined with a pipe ({@code |}) character. If multiple
58   * hyphenated words were found, or if the word may contain a name prefix, each encoded word is placed in parentheses and
59   * these blocks are then joined with hyphens. For example, "{@code d'ortley}" has a possible prefix. The form
60   * without prefix encodes to "{@code ortlaj|ortlej}", while the form with prefix encodes to "
61   * {@code dortlaj|dortlej}". Thus, the full, combined encoding is "{@code (ortlaj|ortlej)-(dortlaj|dortlej)}".
62   * </p>
63   * <p>
64   * The encoded forms are often quite a bit longer than the input strings. This is because a single input may have many
65   * potential phonetic interpretations. For example, "{@code Renault}" encodes to "
66   * {@code rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult}". The {@code APPROX} rules will tend to produce larger
67   * encodings as they consider a wider range of possible, approximate phonetic interpretations of the original word.
68   * Down-stream applications may wish to further process the encoding for indexing or lookup purposes, for example, by
69   * splitting on pipe ({@code |}) and indexing under each of these alternatives.
70   * </p>
71   * <p>
72   * <strong>Note</strong>: this version of the Beider-Morse encoding is equivalent to v3.4 of the reference implementation.
73   * </p>
74   * @see <a href="https://stevemorse.org/phonetics/bmpm.htm">Beider-Morse Phonetic Matching</a>
75   * @see <a href="https://stevemorse.org/phoneticinfo.htm">Reference implementation</a>
76   *
77   * <p>
78   * This class is Not ThreadSafe.
79   * </p>
80   * @since 1.6
81   */
82  public class BeiderMorseEncoder implements StringEncoder {
83  
84      // Implementation note: This class is a spring-friendly facade to PhoneticEngine. It allows read/write configuration
85      // of an immutable PhoneticEngine instance that will be delegated to for the actual encoding.
86  
87      // a cached object
88      private PhoneticEngine engine = new PhoneticEngine(NameType.GENERIC, RuleType.APPROX, true);
89  
90      /**
91       * Constructs a new instance.
92       */
93      public BeiderMorseEncoder() {
94          // empty
95      }
96  
97      @Override
98      public Object encode(final Object source) throws EncoderException {
99          if (!(source instanceof String)) {
100             throw new EncoderException("BeiderMorseEncoder encode parameter is not of type String");
101         }
102         return encode((String) source);
103     }
104 
105     @Override
106     public String encode(final String source) throws EncoderException {
107         if (source == null) {
108             return null;
109         }
110         return this.engine.encode(source);
111     }
112 
113     /**
114      * Gets the name type currently in operation.
115      *
116      * @return the NameType currently being used
117      */
118     public NameType getNameType() {
119         return this.engine.getNameType();
120     }
121 
122     /**
123      * Gets the rule type currently in operation.
124      *
125      * @return the RuleType currently being used
126      */
127     public RuleType getRuleType() {
128         return this.engine.getRuleType();
129     }
130 
131     /**
132      * Discovers if multiple possible encodings are concatenated.
133      *
134      * @return true if multiple encodings are concatenated, false if just the first one is returned
135      */
136     public boolean isConcat() {
137         return this.engine.isConcat();
138     }
139 
140     /**
141      * Sets how multiple possible phonetic encodings are combined.
142      *
143      * @param concat
144      *            true if multiple encodings are to be combined with a '|', false if just the first one is
145      *            to be considered
146      */
147     public void setConcat(final boolean concat) {
148         this.engine = new PhoneticEngine(this.engine.getNameType(),
149                                          this.engine.getRuleType(),
150                                          concat,
151                                          this.engine.getMaxPhonemes());
152     }
153 
154     /**
155      * Sets the number of maximum of phonemes that shall be considered by the engine.
156      *
157      * @param maxPhonemes
158      *            the maximum number of phonemes returned by the engine
159      * @since 1.7
160      */
161     public void setMaxPhonemes(final int maxPhonemes) {
162         this.engine = new PhoneticEngine(this.engine.getNameType(),
163                                          this.engine.getRuleType(),
164                                          this.engine.isConcat(),
165                                          maxPhonemes);
166     }
167 
168     /**
169      * Sets the type of name. Use {@link NameType#GENERIC} unless you specifically want phonetic encodings
170      * optimized for Ashkenazi or Sephardic Jewish family names.
171      *
172      * @param nameType
173      *            the NameType in use
174      */
175     public void setNameType(final NameType nameType) {
176         this.engine = new PhoneticEngine(nameType,
177                                          this.engine.getRuleType(),
178                                          this.engine.isConcat(),
179                                          this.engine.getMaxPhonemes());
180     }
181 
182     /**
183      * Sets the rule type to apply. This will widen or narrow the range of phonetic encodings considered.
184      *
185      * @param ruleType
186      *            {@link RuleType#APPROX} or {@link RuleType#EXACT} for approximate or exact phonetic matches
187      */
188     public void setRuleType(final RuleType ruleType) {
189         this.engine = new PhoneticEngine(this.engine.getNameType(),
190                                          ruleType,
191                                          this.engine.isConcat(),
192                                          this.engine.getMaxPhonemes());
193     }
194 
195 }