001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text.similarity;
018
019import java.util.Arrays;
020import java.util.Objects;
021
022/**
023 * A similarity algorithm indicating the percentage of matched characters between two character sequences.
024 *
025 * <p>
026 * The Jaro measure is the weighted sum of percentage of matched characters from each file and transposed characters. Winkler increased this measure for
027 * matching initial characters.
028 * </p>
029 * <p>
030 * This implementation is based on the Jaro Winkler similarity algorithm from <a href="https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance">
031 * https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance</a>.
032 * </p>
033 * <p>
034 * This code has been adapted from Apache Commons Lang 3.3.
035 * </p>
036 *
037 * @since 1.7
038 */
039public class JaroWinklerSimilarity implements SimilarityScore<Double> {
040
041    /**
042     * Singleton instance.
043     */
044    static final JaroWinklerSimilarity INSTANCE = new JaroWinklerSimilarity();
045
046    /**
047     * Computes the Jaro-Winkler string matches, half transpositions, prefix array.
048     *
049     * @param first  the first input to be matched.
050     * @param second the second input to be matched.
051     * @return mtp array containing: matches, half transpositions, and prefix.
052     */
053    protected static int[] matches(final CharSequence first, final CharSequence second) {
054        return matches(SimilarityInput.input(first), SimilarityInput.input(second));
055    }
056
057    /**
058     * Computes the Jaro-Winkler string matches, half transpositions, prefix array.
059     *
060     * @param <E> The type of similarity score unit.
061     * @param first  the first input to be matched.
062     * @param second the second input to be matched.
063     * @return mtp array containing: matches, half transpositions, and prefix.
064     * @since 1.13.0
065     */
066    protected static <E> int[] matches(final SimilarityInput<E> first, final SimilarityInput<E> second) {
067        final SimilarityInput<E> max;
068        final SimilarityInput<E> min;
069        if (first.length() > second.length()) {
070            max = first;
071            min = second;
072        } else {
073            max = second;
074            min = first;
075        }
076        final int range = Math.max(max.length() / 2 - 1, 0);
077        final int[] matchIndexes = new int[min.length()];
078        Arrays.fill(matchIndexes, -1);
079        final boolean[] matchFlags = new boolean[max.length()];
080        int matches = 0;
081        for (int mi = 0; mi < min.length(); mi++) {
082            final E c1 = min.at(mi);
083            for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max.length()); xi < xn; xi++) {
084                if (!matchFlags[xi] && c1.equals(max.at(xi))) {
085                    matchIndexes[mi] = xi;
086                    matchFlags[xi] = true;
087                    matches++;
088                    break;
089                }
090            }
091        }
092        final Object[] ms1 = new Object[matches];
093        final Object[] ms2 = new Object[matches];
094        for (int i = 0, si = 0; i < min.length(); i++) {
095            if (matchIndexes[i] != -1) {
096                ms1[si] = min.at(i);
097                si++;
098            }
099        }
100        for (int i = 0, si = 0; i < max.length(); i++) {
101            if (matchFlags[i]) {
102                ms2[si] = max.at(i);
103                si++;
104            }
105        }
106        int halfTranspositions = 0;
107        for (int mi = 0; mi < ms1.length; mi++) {
108            if (!ms1[mi].equals(ms2[mi])) {
109                halfTranspositions++;
110            }
111        }
112        int prefix = 0;
113        for (int mi = 0; mi < Math.min(4, min.length()); mi++) {
114            if (!first.at(mi).equals(second.at(mi))) {
115                break;
116            }
117            prefix++;
118        }
119        return new int[] { matches, halfTranspositions, prefix };
120    }
121
122    /**
123     * Computes the Jaro Winkler Similarity between two character sequences.
124     *
125     * <pre>
126     * sim.apply(null, null)          = IllegalArgumentException
127     * sim.apply("foo", null)         = IllegalArgumentException
128     * sim.apply(null, "foo")         = IllegalArgumentException
129     * sim.apply("", "")              = 1.0
130     * sim.apply("foo", "foo")        = 1.0
131     * sim.apply("foo", "foo ")       = 0.94
132     * sim.apply("foo", "foo  ")      = 0.91
133     * sim.apply("foo", " foo ")      = 0.87
134     * sim.apply("foo", "  foo")      = 0.51
135     * sim.apply("", "a")             = 0.0
136     * sim.apply("aaapppp", "")       = 0.0
137     * sim.apply("frog", "fog")       = 0.93
138     * sim.apply("fly", "ant")        = 0.0
139     * sim.apply("elephant", "hippo") = 0.44
140     * sim.apply("hippo", "elephant") = 0.44
141     * sim.apply("hippo", "zzzzzzzz") = 0.0
142     * sim.apply("hello", "hallo")    = 0.88
143     * sim.apply("ABC Corporation", "ABC Corp") = 0.91
144     * sim.apply("D N H Enterprises Inc", "D &amp; H Enterprises, Inc.") = 0.95
145     * sim.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.92
146     * sim.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.88
147     * </pre>
148     *
149     * @param left  the first input, must not be null.
150     * @param right the second input, must not be null.
151     * @return result similarity.
152     * @throws IllegalArgumentException if either CharSequence input is {@code null}.
153     */
154    @Override
155    public Double apply(final CharSequence left, final CharSequence right) {
156        return apply(SimilarityInput.input(left), SimilarityInput.input(right));
157    }
158
159    /**
160     * Computes the Jaro Winkler Similarity between two character sequences.
161     *
162     * <pre>
163     * sim.apply(null, null)          = IllegalArgumentException
164     * sim.apply("foo", null)         = IllegalArgumentException
165     * sim.apply(null, "foo")         = IllegalArgumentException
166     * sim.apply("", "")              = 1.0
167     * sim.apply("foo", "foo")        = 1.0
168     * sim.apply("foo", "foo ")       = 0.94
169     * sim.apply("foo", "foo  ")      = 0.91
170     * sim.apply("foo", " foo ")      = 0.87
171     * sim.apply("foo", "  foo")      = 0.51
172     * sim.apply("", "a")             = 0.0
173     * sim.apply("aaapppp", "")       = 0.0
174     * sim.apply("frog", "fog")       = 0.93
175     * sim.apply("fly", "ant")        = 0.0
176     * sim.apply("elephant", "hippo") = 0.44
177     * sim.apply("hippo", "elephant") = 0.44
178     * sim.apply("hippo", "zzzzzzzz") = 0.0
179     * sim.apply("hello", "hallo")    = 0.88
180     * sim.apply("ABC Corporation", "ABC Corp") = 0.91
181     * sim.apply("D N H Enterprises Inc", "D &amp; H Enterprises, Inc.") = 0.95
182     * sim.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.92
183     * sim.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.88
184     * </pre>
185     *
186     * @param <E> The type of similarity score unit.
187     * @param left  the first input, must not be null.
188     * @param right the second input, must not be null.
189     * @return result similarity.
190     * @throws IllegalArgumentException if either CharSequence input is {@code null}.
191     * @since 1.13.0
192     */
193    public <E> Double apply(final SimilarityInput<E> left, final SimilarityInput<E> right) {
194        final double defaultScalingFactor = 0.1;
195        if (left == null || right == null) {
196            throw new IllegalArgumentException("CharSequences must not be null");
197        }
198        if (Objects.equals(left, right)) {
199            return 1d;
200        }
201        final int[] mtp = matches(left, right);
202        final double m = mtp[0];
203        if (m == 0) {
204            return 0d;
205        }
206        final double j = (m / left.length() + m / right.length() + (m - (double) mtp[1] / 2) / m) / 3;
207        return j < 0.7d ? j : j + defaultScalingFactor * mtp[2] * (1d - j);
208    }
209
210}