1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * https://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 package org.apache.commons.codec.language; 19 20 import org.apache.commons.codec.EncoderException; 21 import org.apache.commons.codec.StringEncoder; 22 23 /** 24 * Utility methods for {@link Soundex} and {@link RefinedSoundex} classes. 25 * 26 * <p>This class is immutable and thread-safe.</p> 27 * 28 * @since 1.3 29 */ 30 final class SoundexUtils { 31 32 /** 33 * Cleans up the input string before Soundex processing by only returning 34 * upper case letters. 35 * 36 * @param str 37 * The String to clean. 38 * @return A clean String. 39 */ 40 static String clean(final String str) { 41 if (isEmpty(str)) { 42 return str; 43 } 44 final int len = str.length(); 45 final char[] chars = new char[len]; 46 int count = 0; 47 for (int i = 0; i < len; i++) { 48 if (Character.isLetter(str.charAt(i))) { 49 chars[count++] = str.charAt(i); 50 } 51 } 52 if (count == len) { 53 return str.toUpperCase(java.util.Locale.ENGLISH); 54 } 55 return new String(chars, 0, count).toUpperCase(java.util.Locale.ENGLISH); 56 } 57 58 /** 59 * Encodes the Strings and returns the number of characters in the two 60 * encoded Strings that are the same. 61 * <ul> 62 * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates 63 * little or no similarity, and 4 indicates strong similarity or identical 64 * values.</li> 65 * <li>For refined Soundex, the return value can be greater than 4.</li> 66 * </ul> 67 * 68 * @param encoder 69 * The encoder to use to encode the Strings. 70 * @param s1 71 * A String that will be encoded and compared. 72 * @param s2 73 * A String that will be encoded and compared. 74 * @return The number of characters in the two Soundex encoded Strings that 75 * are the same. 76 * 77 * @see #differenceEncoded(String,String) 78 * @see <a href="https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> 79 * MS T-SQL DIFFERENCE</a> 80 * 81 * @throws EncoderException 82 * if an error occurs encoding one of the strings 83 */ 84 static int difference(final StringEncoder encoder, final String s1, final String s2) throws EncoderException { 85 return differenceEncoded(encoder.encode(s1), encoder.encode(s2)); 86 } 87 88 /** 89 * Returns the number of characters in the two Soundex encoded Strings that 90 * are the same. 91 * <ul> 92 * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates 93 * little or no similarity, and 4 indicates strong similarity or identical 94 * values.</li> 95 * <li>For refined Soundex, the return value can be greater than 4.</li> 96 * </ul> 97 * 98 * @param es1 99 * An encoded String. 100 * @param es2 101 * An encoded String. 102 * @return The number of characters in the two Soundex encoded Strings that 103 * are the same. 104 * 105 * @see <a href="https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> 106 * MS T-SQL DIFFERENCE</a> 107 */ 108 static int differenceEncoded(final String es1, final String es2) { 109 110 if (es1 == null || es2 == null) { 111 return 0; 112 } 113 final int lengthToMatch = Math.min(es1.length(), es2.length()); 114 int diff = 0; 115 for (int i = 0; i < lengthToMatch; i++) { 116 if (es1.charAt(i) == es2.charAt(i)) { 117 diff++; 118 } 119 } 120 return diff; 121 } 122 123 /** 124 * <p>Checks if a CharSequence is empty ("") or null.</p> 125 * 126 * <pre> 127 * StringUtils.isEmpty(null) = true 128 * StringUtils.isEmpty("") = true 129 * StringUtils.isEmpty(" ") = false 130 * StringUtils.isEmpty("bob") = false 131 * StringUtils.isEmpty(" bob ") = false 132 * </pre> 133 * 134 * @param cs the CharSequence to check, may be null 135 * @return {@code true} if the CharSequence is empty or null 136 */ 137 static boolean isEmpty(final CharSequence cs) { 138 return cs == null || cs.length() == 0; 139 } 140 141 }