View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text.similarity;
18  
19  import java.util.Arrays;
20  
21  /**
22   * An algorithm for measuring the difference between two character sequences.
23   *
24   * <p>
25   * This is the number of changes needed to change one sequence into another,
26   * where each change is a single character modification (deletion, insertion
27   * or substitution).
28   * </p>
29   *
30   * @since 1.0
31   */
32  public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResults> {
33  
34      /**
35       * Singleton instance.
36       */
37      private static final LevenshteinDetailedDistance INSTANCE = new LevenshteinDetailedDistance();
38  
39      /**
40       * Finds count for each of the three [insert, delete, substitute] operations
41       * needed. This is based on the matrix formed based on the two character
42       * sequence.
43       *
44       * @param <E> The type of similarity score unit.
45       * @param left character sequence which need to be converted from
46       * @param right character sequence which need to be converted to
47       * @param matrix two dimensional array containing
48       * @param swapped tells whether the value for left character sequence and right
49       *            character sequence were swapped to save memory
50       * @return result object containing the count of insert, delete and substitute and total count needed
51       */
52      private static <E> LevenshteinResults findDetailedResults(final SimilarityInput<E> left,
53                                                            final SimilarityInput<E> right,
54                                                            final int[][] matrix,
55                                                            final boolean swapped) {
56  
57          int delCount = 0;
58          int addCount = 0;
59          int subCount = 0;
60  
61          int rowIndex = right.length();
62          int columnIndex = left.length();
63  
64          int dataAtLeft = 0;
65          int dataAtTop = 0;
66          int dataAtDiagonal = 0;
67          int data = 0;
68          boolean deleted = false;
69          boolean added = false;
70  
71          while (rowIndex >= 0 && columnIndex >= 0) {
72  
73              if (columnIndex == 0) {
74                  dataAtLeft = -1;
75              } else {
76                  dataAtLeft = matrix[rowIndex][columnIndex - 1];
77              }
78              if (rowIndex == 0) {
79                  dataAtTop = -1;
80              } else {
81                  dataAtTop = matrix[rowIndex - 1][columnIndex];
82              }
83              if (rowIndex > 0 && columnIndex > 0) {
84                  dataAtDiagonal = matrix[rowIndex - 1][columnIndex - 1];
85              } else {
86                  dataAtDiagonal = -1;
87              }
88              if (dataAtLeft == -1 && dataAtTop == -1 && dataAtDiagonal == -1) {
89                  break;
90              }
91              data = matrix[rowIndex][columnIndex];
92  
93              // case in which the character at left and right are the same,
94              // in this case none of the counters will be incremented.
95              if (columnIndex > 0 && rowIndex > 0 && left.at(columnIndex - 1).equals(right.at(rowIndex - 1))) {
96                  columnIndex--;
97                  rowIndex--;
98                  continue;
99              }
100 
101             // handling insert and delete cases.
102             deleted = false;
103             added = false;
104             if (data - 1 == dataAtLeft && data <= dataAtDiagonal && data <= dataAtTop
105                     || dataAtDiagonal == -1 && dataAtTop == -1) { // NOPMD
106                 columnIndex--;
107                 if (swapped) {
108                     addCount++;
109                     added = true;
110                 } else {
111                     delCount++;
112                     deleted = true;
113                 }
114             } else if (data - 1 == dataAtTop && data <= dataAtDiagonal && data <= dataAtLeft
115                     || dataAtDiagonal == -1 && dataAtLeft == -1) { // NOPMD
116                 rowIndex--;
117                 if (swapped) {
118                     delCount++;
119                     deleted = true;
120                 } else {
121                     addCount++;
122                     added = true;
123                 }
124             }
125 
126             // substituted case
127             if (!added && !deleted) {
128                 subCount++;
129                 columnIndex--;
130                 rowIndex--;
131             }
132         }
133         return new LevenshteinResults(addCount + delCount + subCount, addCount, delCount, subCount);
134     }
135 
136     /**
137      * Gets the default instance.
138      *
139      * @return The default instace
140      */
141     public static LevenshteinDetailedDistance getDefaultInstance() {
142         return INSTANCE;
143     }
144 
145     /**
146      * Finds the Levenshtein distance between two CharSequences if it's less than or
147      * equal to a given threshold.
148      *
149      * <p>
150      * This implementation follows from Algorithms on Strings, Trees and
151      * Sequences by Dan Gusfield and Chas Emerick's implementation of the
152      * Levenshtein distance algorithm from <a
153      * href="http://www.merriampark.com/ld.htm"
154      * >http://www.merriampark.com/ld.htm</a>
155      * </p>
156      *
157      * <pre>
158      * limitedCompare(null, *, *)             = IllegalArgumentException
159      * limitedCompare(*, null, *)             = IllegalArgumentException
160      * limitedCompare(*, *, -1)               = IllegalArgumentException
161      * limitedCompare("","", 0)               = 0
162      * limitedCompare("aaapppp", "", 8)       = 7
163      * limitedCompare("aaapppp", "", 7)       = 7
164      * limitedCompare("aaapppp", "", 6))      = -1
165      * limitedCompare("elephant", "hippo", 7) = 7
166      * limitedCompare("elephant", "hippo", 6) = -1
167      * limitedCompare("hippo", "elephant", 7) = 7
168      * limitedCompare("hippo", "elephant", 6) = -1
169      * </pre>
170      *
171      * @param <E> The type of similarity score unit.
172      * @param left the first CharSequence, must not be null
173      * @param right the second CharSequence, must not be null
174      * @param threshold the target threshold, must not be negative
175      * @return result distance, or -1
176      */
177     private static <E> LevenshteinResults limitedCompare(SimilarityInput<E> left, SimilarityInput<E> right, final int threshold) { //NOPMD
178         if (left == null || right == null) {
179             throw new IllegalArgumentException("CharSequences must not be null");
180         }
181         if (threshold < 0) {
182             throw new IllegalArgumentException("Threshold must not be negative");
183         }
184 
185         /*
186          * This implementation only computes the distance if it's less than or
187          * equal to the threshold value, returning -1 if it's greater. The
188          * advantage is performance: unbounded distance is O(nm), but a bound of
189          * k allows us to reduce it to O(km) time by only computing a diagonal
190          * stripe of width 2k + 1 of the cost table. It is also possible to use
191          * this to compute the unbounded Levenshtein distance by starting the
192          * threshold at 1 and doubling each time until the distance is found;
193          * this is O(dm), where d is the distance.
194          *
195          * One subtlety comes from needing to ignore entries on the border of
196          * our stripe eg. p[] = |#|#|#|* d[] = *|#|#|#| We must ignore the entry
197          * to the left of the leftmost member We must ignore the entry above the
198          * rightmost member
199          *
200          * Another subtlety comes from our stripe running off the matrix if the
201          * strings aren't of the same size. Since string s is always swapped to
202          * be the shorter of the two, the stripe will always run off to the
203          * upper right instead of the lower left of the matrix.
204          *
205          * As a concrete example, suppose s is of length 5, t is of length 7,
206          * and our threshold is 1. In this case we're going to walk a stripe of
207          * length 3. The matrix would look like so:
208          *
209          * <pre>
210          *    1 2 3 4 5
211          * 1 |#|#| | | |
212          * 2 |#|#|#| | |
213          * 3 | |#|#|#| |
214          * 4 | | |#|#|#|
215          * 5 | | | |#|#|
216          * 6 | | | | |#|
217          * 7 | | | | | |
218          * </pre>
219          *
220          * Note how the stripe leads off the table as there is no possible way
221          * to turn a string of length 5 into one of length 7 in edit distance of
222          * 1.
223          *
224          * Additionally, this implementation decreases memory usage by using two
225          * single-dimensional arrays and swapping them back and forth instead of
226          * allocating an entire n by m matrix. This requires a few minor
227          * changes, such as immediately returning when it's detected that the
228          * stripe has run off the matrix and initially filling the arrays with
229          * large values so that entries we don't compute are ignored.
230          *
231          * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for
232          * some discussion.
233          */
234 
235         int n = left.length(); // length of left
236         int m = right.length(); // length of right
237 
238         // if one string is empty, the edit distance is necessarily the length of the other
239         if (n == 0) {
240             return m <= threshold ? new LevenshteinResults(m, m, 0, 0) : new LevenshteinResults(-1, 0, 0, 0);
241         }
242         if (m == 0) {
243             return n <= threshold ? new LevenshteinResults(n, 0, n, 0) : new LevenshteinResults(-1, 0, 0, 0);
244         }
245 
246         boolean swapped = false;
247         if (n > m) {
248             // swap the two strings to consume less memory
249             final SimilarityInput<E> tmp = left;
250             left = right;
251             right = tmp;
252             n = m;
253             m = right.length();
254             swapped = true;
255         }
256 
257         int[] p = new int[n + 1]; // 'previous' cost array, horizontally
258         int[] d = new int[n + 1]; // cost array, horizontally
259         int[] tempD; // placeholder to assist in swapping p and d
260         final int[][] matrix = new int[m + 1][n + 1];
261 
262         //filling the first row and first column values in the matrix
263         for (int index = 0; index <= n; index++) {
264             matrix[0][index] = index;
265         }
266         for (int index = 0; index <= m; index++) {
267             matrix[index][0] = index;
268         }
269 
270         // fill in starting table values
271         final int boundary = Math.min(n, threshold) + 1;
272         for (int i = 0; i < boundary; i++) {
273             p[i] = i;
274         }
275         // these fills ensure that the value above the rightmost entry of our
276         // stripe will be ignored in following loop iterations
277         Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE);
278         Arrays.fill(d, Integer.MAX_VALUE);
279 
280         // iterates through t
281         for (int j = 1; j <= m; j++) {
282             final E rightJ = right.at(j - 1); // jth character of right
283             d[0] = j;
284 
285             // compute stripe indices, constrain to array size
286             final int min = Math.max(1, j - threshold);
287             final int max = j > Integer.MAX_VALUE - threshold ? n : Math.min(
288                     n, j + threshold);
289 
290             // the stripe may lead off of the table if s and t are of different sizes
291             if (min > max) {
292                 return new LevenshteinResults(-1, 0, 0, 0);
293             }
294 
295             // ignore entry left of leftmost
296             if (min > 1) {
297                 d[min - 1] = Integer.MAX_VALUE;
298             }
299 
300             // iterates through [min, max] in s
301             for (int i = min; i <= max; i++) {
302                 if (left.at(i - 1).equals(rightJ)) {
303                     // diagonally left and up
304                     d[i] = p[i - 1];
305                 } else {
306                     // 1 + minimum of cell to the left, to the top, diagonally left and up
307                     d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i - 1]);
308                 }
309                 matrix[j][i] = d[i];
310             }
311 
312             // copy current distance counts to 'previous row' distance counts
313             tempD = p;
314             p = d;
315             d = tempD;
316         }
317 
318         // if p[n] is greater than the threshold, there's no guarantee on it being the correct distance
319         if (p[n] <= threshold) {
320             return findDetailedResults(left, right, matrix, swapped);
321         }
322         return new LevenshteinResults(-1, 0, 0, 0);
323     }
324 
325     /**
326      * Finds the Levenshtein distance between two Strings.
327      *
328      * <p>A higher score indicates a greater distance.</p>
329      *
330      * <p>The previous implementation of the Levenshtein distance algorithm
331      * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p>
332      *
333      * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError
334      * which can occur when my Java implementation is used with very large strings.<br>
335      * This implementation of the Levenshtein distance algorithm
336      * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p>
337      *
338      * <pre>
339      * unlimitedCompare(null, *)             = IllegalArgumentException
340      * unlimitedCompare(*, null)             = IllegalArgumentException
341      * unlimitedCompare("","")               = 0
342      * unlimitedCompare("","a")              = 1
343      * unlimitedCompare("aaapppp", "")       = 7
344      * unlimitedCompare("frog", "fog")       = 1
345      * unlimitedCompare("fly", "ant")        = 3
346      * unlimitedCompare("elephant", "hippo") = 7
347      * unlimitedCompare("hippo", "elephant") = 7
348      * unlimitedCompare("hippo", "zzzzzzzz") = 8
349      * unlimitedCompare("hello", "hallo")    = 1
350      * </pre>
351      *
352      * @param <E> The type of similarity score unit.
353      * @param left the first CharSequence, must not be null
354      * @param right the second CharSequence, must not be null
355      * @return result distance, or -1
356      * @throws IllegalArgumentException if either CharSequence input is {@code null}
357      */
358     private static <E> LevenshteinResults unlimitedCompare(SimilarityInput<E> left, SimilarityInput<E> right) {
359         if (left == null || right == null) {
360             throw new IllegalArgumentException("CharSequences must not be null");
361         }
362 
363         /*
364            The difference between this impl. and the previous is that, rather
365            than creating and retaining a matrix of size s.length() + 1 by t.length() + 1,
366            we maintain two single-dimensional arrays of length s.length() + 1.  The first, d,
367            is the 'current working' distance array that maintains the newest distance cost
368            counts as we iterate through the characters of String s.  Each time we increment
369            the index of String t we are comparing, d is copied to p, the second int[].  Doing so
370            allows us to retain the previous cost counts as required by the algorithm (taking
371            the minimum of the cost count to the left, up one, and diagonally up and to the left
372            of the current cost count being calculated).  (Note that the arrays aren't really
373            copied anymore, just switched...this is clearly much better than cloning an array
374            or doing a System.arraycopy() each time  through the outer loop.)
375 
376            Effectively, the difference between the two implementations is this one does not
377            cause an out of memory condition when calculating the LD over two very large strings.
378          */
379 
380         int n = left.length(); // length of left
381         int m = right.length(); // length of right
382 
383         if (n == 0) {
384             return new LevenshteinResults(m, m, 0, 0);
385         }
386         if (m == 0) {
387             return new LevenshteinResults(n, 0, n, 0);
388         }
389         boolean swapped = false;
390         if (n > m) {
391             // swap the input strings to consume less memory
392             final SimilarityInput<E> tmp = left;
393             left = right;
394             right = tmp;
395             n = m;
396             m = right.length();
397             swapped = true;
398         }
399 
400         int[] p = new int[n + 1]; // 'previous' cost array, horizontally
401         int[] d = new int[n + 1]; // cost array, horizontally
402         int[] tempD; //placeholder to assist in swapping p and d
403         final int[][] matrix = new int[m + 1][n + 1];
404 
405         // filling the first row and first column values in the matrix
406         for (int index = 0; index <= n; index++) {
407             matrix[0][index] = index;
408         }
409         for (int index = 0; index <= m; index++) {
410             matrix[index][0] = index;
411         }
412 
413         // indexes into strings left and right
414         int i; // iterates through left
415         int j; // iterates through right
416 
417         E rightJ; // jth character of right
418 
419         int cost; // cost
420         for (i = 0; i <= n; i++) {
421             p[i] = i;
422         }
423 
424         for (j = 1; j <= m; j++) {
425             rightJ = right.at(j - 1);
426             d[0] = j;
427 
428             for (i = 1; i <= n; i++) {
429                 cost = left.at(i - 1).equals(rightJ) ? 0 : 1;
430                 // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
431                 d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
432                 //filling the matrix
433                 matrix[j][i] = d[i];
434             }
435 
436             // copy current distance counts to 'previous row' distance counts
437             tempD = p;
438             p = d;
439             d = tempD;
440         }
441         return findDetailedResults(left, right, matrix, swapped);
442     }
443 
444     /**
445      * Threshold.
446      */
447     private final Integer threshold;
448 
449     /**
450      * <p>
451      * This returns the default instance that uses a version
452      * of the algorithm that does not use a threshold parameter.
453      * </p>
454      *
455      * @see LevenshteinDetailedDistance#getDefaultInstance()
456      * @deprecated Use {@link #getDefaultInstance()}.
457      */
458     @Deprecated
459     public LevenshteinDetailedDistance() {
460         this(null);
461     }
462 
463     /**
464      * If the threshold is not null, distance calculations will be limited to a maximum length.
465      *
466      * <p>If the threshold is null, the unlimited version of the algorithm will be used.</p>
467      *
468      * @param threshold If this is null then distances calculations will not be limited. This may not be negative.
469      */
470     public LevenshteinDetailedDistance(final Integer threshold) {
471         if (threshold != null && threshold < 0) {
472             throw new IllegalArgumentException("Threshold must not be negative");
473         }
474         this.threshold = threshold;
475     }
476 
477     /**
478      * Computes the Levenshtein distance between two Strings.
479      *
480      * <p>A higher score indicates a greater distance.</p>
481      *
482      * <p>The previous implementation of the Levenshtein distance algorithm
483      * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p>
484      *
485      * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError
486      * which can occur when my Java implementation is used with very large strings.<br>
487      * This implementation of the Levenshtein distance algorithm
488      * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p>
489      *
490      * <pre>
491      * distance.apply(null, *)             = IllegalArgumentException
492      * distance.apply(*, null)             = IllegalArgumentException
493      * distance.apply("","")               = 0
494      * distance.apply("","a")              = 1
495      * distance.apply("aaapppp", "")       = 7
496      * distance.apply("frog", "fog")       = 1
497      * distance.apply("fly", "ant")        = 3
498      * distance.apply("elephant", "hippo") = 7
499      * distance.apply("hippo", "elephant") = 7
500      * distance.apply("hippo", "zzzzzzzz") = 8
501      * distance.apply("hello", "hallo")    = 1
502      * </pre>
503      *
504      * @param left the first input, must not be null
505      * @param right the second input, must not be null
506      * @return result distance, or -1
507      * @throws IllegalArgumentException if either String input {@code null}
508      */
509     @Override
510     public LevenshteinResults apply(final CharSequence left, final CharSequence right) {
511         return apply(SimilarityInput.input(left), SimilarityInput.input(right));
512     }
513 
514     /**
515      * Computes the Levenshtein distance between two Strings.
516      *
517      * <p>A higher score indicates a greater distance.</p>
518      *
519      * <p>The previous implementation of the Levenshtein distance algorithm
520      * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p>
521      *
522      * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError
523      * which can occur when my Java implementation is used with very large strings.<br>
524      * This implementation of the Levenshtein distance algorithm
525      * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p>
526      *
527      * <pre>
528      * distance.apply(null, *)             = IllegalArgumentException
529      * distance.apply(*, null)             = IllegalArgumentException
530      * distance.apply("","")               = 0
531      * distance.apply("","a")              = 1
532      * distance.apply("aaapppp", "")       = 7
533      * distance.apply("frog", "fog")       = 1
534      * distance.apply("fly", "ant")        = 3
535      * distance.apply("elephant", "hippo") = 7
536      * distance.apply("hippo", "elephant") = 7
537      * distance.apply("hippo", "zzzzzzzz") = 8
538      * distance.apply("hello", "hallo")    = 1
539      * </pre>
540      *
541      * @param <E> The type of similarity score unit.
542      * @param left the first input, must not be null
543      * @param right the second input, must not be null
544      * @return result distance, or -1
545      * @throws IllegalArgumentException if either String input {@code null}
546      * @since 1.13.0
547      */
548     public <E> LevenshteinResults apply(final SimilarityInput<E> left, final SimilarityInput<E> right) {
549         if (threshold != null) {
550             return limitedCompare(left, right, threshold);
551         }
552         return unlimitedCompare(left, right);
553     }
554 
555     /**
556      * Gets the distance threshold.
557      *
558      * @return The distance threshold
559      */
560     public Integer getThreshold() {
561         return threshold;
562     }
563 }