001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.text.similarity; 018 019/** 020 * Measures the Jaro-Winkler distance of two character sequences. 021 * It is the complementary of Jaro-Winkler similarity. 022 * 023 * @since 1.0 024 */ 025public class JaroWinklerDistance implements EditDistance<Double> { 026 027 /** 028 * @deprecated Deprecated as of 1.7. This constant will be removed in 2.0. 029 */ 030 @Deprecated 031 public static final int INDEX_NOT_FOUND = -1; 032 033 /** 034 * Computes the Jaro-Winkler string matches, half transpositions, prefix array. 035 * 036 * @param first the first string to be matched. 037 * @param second the second string to be matched. 038 * @return array containing: matches, half transpositions, and prefix 039 * @deprecated Deprecated as of 1.7. This method will be removed in 2.0, and moved to a Jaro Winkler similarity 040 * class. TODO see TEXT-104. 041 */ 042 @Deprecated 043 protected static int[] matches(final CharSequence first, final CharSequence second) { 044 return JaroWinklerSimilarity.matches(first, second); 045 } 046 047 /** 048 * Computes the Jaro Winkler Distance between two character sequences. 049 * 050 * <pre> 051 * distance.apply(null, null) = IllegalArgumentException 052 * distance.apply("foo", null) = IllegalArgumentException 053 * distance.apply(null, "foo") = IllegalArgumentException 054 * distance.apply("", "") = 0.0 055 * distance.apply("foo", "foo") = 0.0 056 * distance.apply("foo", "foo ") = 0.06 057 * distance.apply("foo", "foo ") = 0.09 058 * distance.apply("foo", " foo ") = 0.13 059 * distance.apply("foo", " foo") = 0.49 060 * distance.apply("", "a") = 1.0 061 * distance.apply("aaapppp", "") = 1.0 062 * distance.apply("frog", "fog") = 0.07 063 * distance.apply("fly", "ant") = 1.0 064 * distance.apply("elephant", "hippo") = 0.56 065 * distance.apply("hippo", "elephant") = 0.56 066 * distance.apply("hippo", "zzzzzzzz") = 1.0 067 * distance.apply("hello", "hallo") = 0.12 068 * distance.apply("ABC Corporation", "ABC Corp") = 0.09 069 * distance.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.") = 0.05 070 * distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.08 071 * distance.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.12 072 * </pre> 073 * 074 * @param left the first input, must not be null. 075 * @param right the second input, must not be null. 076 * @return result distance. 077 * @throws IllegalArgumentException if either CharSequence input is {@code null} 078 */ 079 @Override 080 public Double apply(final CharSequence left, final CharSequence right) { 081 return apply(SimilarityInput.input(left), SimilarityInput.input(right)); 082 } 083 084 /** 085 * Computes the Jaro Winkler Distance between two character sequences. 086 * 087 * <pre> 088 * distance.apply(null, null) = IllegalArgumentException 089 * distance.apply("foo", null) = IllegalArgumentException 090 * distance.apply(null, "foo") = IllegalArgumentException 091 * distance.apply("", "") = 0.0 092 * distance.apply("foo", "foo") = 0.0 093 * distance.apply("foo", "foo ") = 0.06 094 * distance.apply("foo", "foo ") = 0.09 095 * distance.apply("foo", " foo ") = 0.13 096 * distance.apply("foo", " foo") = 0.49 097 * distance.apply("", "a") = 1.0 098 * distance.apply("aaapppp", "") = 1.0 099 * distance.apply("frog", "fog") = 0.07 100 * distance.apply("fly", "ant") = 1.0 101 * distance.apply("elephant", "hippo") = 0.56 102 * distance.apply("hippo", "elephant") = 0.56 103 * distance.apply("hippo", "zzzzzzzz") = 1.0 104 * distance.apply("hello", "hallo") = 0.12 105 * distance.apply("ABC Corporation", "ABC Corp") = 0.09 106 * distance.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.") = 0.05 107 * distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.08 108 * distance.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.12 109 * </pre> 110 * 111 * @param <E> The type of similarity score unit. 112 * @param left the first input, must not be null. 113 * @param right the second input, must not be null. 114 * @return result distance. 115 * @throws IllegalArgumentException if either CharSequence input is {@code null}. 116 * @since 1.13.0 117 */ 118 public <E> Double apply(final SimilarityInput<E> left, final SimilarityInput<E> right) { 119 if (left == null || right == null) { 120 throw new IllegalArgumentException("CharSequences must not be null"); 121 } 122 return 1 - JaroWinklerSimilarity.INSTANCE.apply(left, right); 123 } 124}