1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.commons.text.similarity; 18 19 import java.util.Arrays; 20 21 /** 22 * An algorithm for measuring the difference between two character sequences. 23 * 24 * <p> 25 * This is the number of changes needed to change one sequence into another, 26 * where each change is a single character modification (deletion, insertion 27 * or substitution). 28 * </p> 29 * 30 * @since 1.0 31 */ 32 public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResults> { 33 34 /** 35 * Singleton instance. 36 */ 37 private static final LevenshteinDetailedDistance INSTANCE = new LevenshteinDetailedDistance(); 38 39 /** 40 * Finds count for each of the three [insert, delete, substitute] operations 41 * needed. This is based on the matrix formed based on the two character 42 * sequence. 43 * 44 * @param left character sequence which need to be converted from 45 * @param right character sequence which need to be converted to 46 * @param matrix two dimensional array containing 47 * @param swapped tells whether the value for left character sequence and right 48 * character sequence were swapped to save memory 49 * @return result object containing the count of insert, delete and substitute and total count needed 50 */ 51 private static LevenshteinResults findDetailedResults(final CharSequence left, 52 final CharSequence right, 53 final int[][] matrix, 54 final boolean swapped) { 55 56 int delCount = 0; 57 int addCount = 0; 58 int subCount = 0; 59 60 int rowIndex = right.length(); 61 int columnIndex = left.length(); 62 63 int dataAtLeft = 0; 64 int dataAtTop = 0; 65 int dataAtDiagonal = 0; 66 int data = 0; 67 boolean deleted = false; 68 boolean added = false; 69 70 while (rowIndex >= 0 && columnIndex >= 0) { 71 72 if (columnIndex == 0) { 73 dataAtLeft = -1; 74 } else { 75 dataAtLeft = matrix[rowIndex][columnIndex - 1]; 76 } 77 if (rowIndex == 0) { 78 dataAtTop = -1; 79 } else { 80 dataAtTop = matrix[rowIndex - 1][columnIndex]; 81 } 82 if (rowIndex > 0 && columnIndex > 0) { 83 dataAtDiagonal = matrix[rowIndex - 1][columnIndex - 1]; 84 } else { 85 dataAtDiagonal = -1; 86 } 87 if (dataAtLeft == -1 && dataAtTop == -1 && dataAtDiagonal == -1) { 88 break; 89 } 90 data = matrix[rowIndex][columnIndex]; 91 92 // case in which the character at left and right are the same, 93 // in this case none of the counters will be incremented. 94 if (columnIndex > 0 && rowIndex > 0 && left.charAt(columnIndex - 1) == right.charAt(rowIndex - 1)) { 95 columnIndex--; 96 rowIndex--; 97 continue; 98 } 99 100 // handling insert and delete cases. 101 deleted = false; 102 added = false; 103 if (data - 1 == dataAtLeft && data <= dataAtDiagonal && data <= dataAtTop 104 || dataAtDiagonal == -1 && dataAtTop == -1) { // NOPMD 105 columnIndex--; 106 if (swapped) { 107 addCount++; 108 added = true; 109 } else { 110 delCount++; 111 deleted = true; 112 } 113 } else if (data - 1 == dataAtTop && data <= dataAtDiagonal && data <= dataAtLeft 114 || dataAtDiagonal == -1 && dataAtLeft == -1) { // NOPMD 115 rowIndex--; 116 if (swapped) { 117 delCount++; 118 deleted = true; 119 } else { 120 addCount++; 121 added = true; 122 } 123 } 124 125 // substituted case 126 if (!added && !deleted) { 127 subCount++; 128 columnIndex--; 129 rowIndex--; 130 } 131 } 132 return new LevenshteinResults(addCount + delCount + subCount, addCount, delCount, subCount); 133 } 134 135 /** 136 * Gets the default instance. 137 * 138 * @return The default instace 139 */ 140 public static LevenshteinDetailedDistance getDefaultInstance() { 141 return INSTANCE; 142 } 143 144 /** 145 * Finds the Levenshtein distance between two CharSequences if it's less than or 146 * equal to a given threshold. 147 * 148 * <p> 149 * This implementation follows from Algorithms on Strings, Trees and 150 * Sequences by Dan Gusfield and Chas Emerick's implementation of the 151 * Levenshtein distance algorithm from <a 152 * href="http://www.merriampark.com/ld.htm" 153 * >http://www.merriampark.com/ld.htm</a> 154 * </p> 155 * 156 * <pre> 157 * limitedCompare(null, *, *) = IllegalArgumentException 158 * limitedCompare(*, null, *) = IllegalArgumentException 159 * limitedCompare(*, *, -1) = IllegalArgumentException 160 * limitedCompare("","", 0) = 0 161 * limitedCompare("aaapppp", "", 8) = 7 162 * limitedCompare("aaapppp", "", 7) = 7 163 * limitedCompare("aaapppp", "", 6)) = -1 164 * limitedCompare("elephant", "hippo", 7) = 7 165 * limitedCompare("elephant", "hippo", 6) = -1 166 * limitedCompare("hippo", "elephant", 7) = 7 167 * limitedCompare("hippo", "elephant", 6) = -1 168 * </pre> 169 * 170 * @param left the first CharSequence, must not be null 171 * @param right the second CharSequence, must not be null 172 * @param threshold the target threshold, must not be negative 173 * @return result distance, or -1 174 */ 175 private static LevenshteinResults limitedCompare(CharSequence left, 176 CharSequence right, 177 final int threshold) { //NOPMD 178 if (left == null || right == null) { 179 throw new IllegalArgumentException("CharSequences must not be null"); 180 } 181 if (threshold < 0) { 182 throw new IllegalArgumentException("Threshold must not be negative"); 183 } 184 185 /* 186 * This implementation only computes the distance if it's less than or 187 * equal to the threshold value, returning -1 if it's greater. The 188 * advantage is performance: unbounded distance is O(nm), but a bound of 189 * k allows us to reduce it to O(km) time by only computing a diagonal 190 * stripe of width 2k + 1 of the cost table. It is also possible to use 191 * this to compute the unbounded Levenshtein distance by starting the 192 * threshold at 1 and doubling each time until the distance is found; 193 * this is O(dm), where d is the distance. 194 * 195 * One subtlety comes from needing to ignore entries on the border of 196 * our stripe eg. p[] = |#|#|#|* d[] = *|#|#|#| We must ignore the entry 197 * to the left of the leftmost member We must ignore the entry above the 198 * rightmost member 199 * 200 * Another subtlety comes from our stripe running off the matrix if the 201 * strings aren't of the same size. Since string s is always swapped to 202 * be the shorter of the two, the stripe will always run off to the 203 * upper right instead of the lower left of the matrix. 204 * 205 * As a concrete example, suppose s is of length 5, t is of length 7, 206 * and our threshold is 1. In this case we're going to walk a stripe of 207 * length 3. The matrix would look like so: 208 * 209 * <pre> 210 * 1 2 3 4 5 211 * 1 |#|#| | | | 212 * 2 |#|#|#| | | 213 * 3 | |#|#|#| | 214 * 4 | | |#|#|#| 215 * 5 | | | |#|#| 216 * 6 | | | | |#| 217 * 7 | | | | | | 218 * </pre> 219 * 220 * Note how the stripe leads off the table as there is no possible way 221 * to turn a string of length 5 into one of length 7 in edit distance of 222 * 1. 223 * 224 * Additionally, this implementation decreases memory usage by using two 225 * single-dimensional arrays and swapping them back and forth instead of 226 * allocating an entire n by m matrix. This requires a few minor 227 * changes, such as immediately returning when it's detected that the 228 * stripe has run off the matrix and initially filling the arrays with 229 * large values so that entries we don't compute are ignored. 230 * 231 * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for 232 * some discussion. 233 */ 234 235 int n = left.length(); // length of left 236 int m = right.length(); // length of right 237 238 // if one string is empty, the edit distance is necessarily the length of the other 239 if (n == 0) { 240 return m <= threshold ? new LevenshteinResults(m, m, 0, 0) : new LevenshteinResults(-1, 0, 0, 0); 241 } 242 if (m == 0) { 243 return n <= threshold ? new LevenshteinResults(n, 0, n, 0) : new LevenshteinResults(-1, 0, 0, 0); 244 } 245 246 boolean swapped = false; 247 if (n > m) { 248 // swap the two strings to consume less memory 249 final CharSequence tmp = left; 250 left = right; 251 right = tmp; 252 n = m; 253 m = right.length(); 254 swapped = true; 255 } 256 257 int[] p = new int[n + 1]; // 'previous' cost array, horizontally 258 int[] d = new int[n + 1]; // cost array, horizontally 259 int[] tempD; // placeholder to assist in swapping p and d 260 final int[][] matrix = new int[m + 1][n + 1]; 261 262 //filling the first row and first column values in the matrix 263 for (int index = 0; index <= n; index++) { 264 matrix[0][index] = index; 265 } 266 for (int index = 0; index <= m; index++) { 267 matrix[index][0] = index; 268 } 269 270 // fill in starting table values 271 final int boundary = Math.min(n, threshold) + 1; 272 for (int i = 0; i < boundary; i++) { 273 p[i] = i; 274 } 275 // these fills ensure that the value above the rightmost entry of our 276 // stripe will be ignored in following loop iterations 277 Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE); 278 Arrays.fill(d, Integer.MAX_VALUE); 279 280 // iterates through t 281 for (int j = 1; j <= m; j++) { 282 final char rightJ = right.charAt(j - 1); // jth character of right 283 d[0] = j; 284 285 // compute stripe indices, constrain to array size 286 final int min = Math.max(1, j - threshold); 287 final int max = j > Integer.MAX_VALUE - threshold ? n : Math.min( 288 n, j + threshold); 289 290 // the stripe may lead off of the table if s and t are of different sizes 291 if (min > max) { 292 return new LevenshteinResults(-1, 0, 0, 0); 293 } 294 295 // ignore entry left of leftmost 296 if (min > 1) { 297 d[min - 1] = Integer.MAX_VALUE; 298 } 299 300 // iterates through [min, max] in s 301 for (int i = min; i <= max; i++) { 302 if (left.charAt(i - 1) == rightJ) { 303 // diagonally left and up 304 d[i] = p[i - 1]; 305 } else { 306 // 1 + minimum of cell to the left, to the top, diagonally left and up 307 d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i - 1]); 308 } 309 matrix[j][i] = d[i]; 310 } 311 312 // copy current distance counts to 'previous row' distance counts 313 tempD = p; 314 p = d; 315 d = tempD; 316 } 317 318 // if p[n] is greater than the threshold, there's no guarantee on it being the correct distance 319 if (p[n] <= threshold) { 320 return findDetailedResults(left, right, matrix, swapped); 321 } 322 return new LevenshteinResults(-1, 0, 0, 0); 323 } 324 325 /** 326 * Finds the Levenshtein distance between two Strings. 327 * 328 * <p>A higher score indicates a greater distance.</p> 329 * 330 * <p>The previous implementation of the Levenshtein distance algorithm 331 * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> 332 * 333 * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError 334 * which can occur when my Java implementation is used with very large strings.<br> 335 * This implementation of the Levenshtein distance algorithm 336 * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> 337 * 338 * <pre> 339 * unlimitedCompare(null, *) = IllegalArgumentException 340 * unlimitedCompare(*, null) = IllegalArgumentException 341 * unlimitedCompare("","") = 0 342 * unlimitedCompare("","a") = 1 343 * unlimitedCompare("aaapppp", "") = 7 344 * unlimitedCompare("frog", "fog") = 1 345 * unlimitedCompare("fly", "ant") = 3 346 * unlimitedCompare("elephant", "hippo") = 7 347 * unlimitedCompare("hippo", "elephant") = 7 348 * unlimitedCompare("hippo", "zzzzzzzz") = 8 349 * unlimitedCompare("hello", "hallo") = 1 350 * </pre> 351 * 352 * @param left the first CharSequence, must not be null 353 * @param right the second CharSequence, must not be null 354 * @return result distance, or -1 355 * @throws IllegalArgumentException if either CharSequence input is {@code null} 356 */ 357 private static LevenshteinResults unlimitedCompare(CharSequence left, CharSequence right) { 358 if (left == null || right == null) { 359 throw new IllegalArgumentException("CharSequences must not be null"); 360 } 361 362 /* 363 The difference between this impl. and the previous is that, rather 364 than creating and retaining a matrix of size s.length() + 1 by t.length() + 1, 365 we maintain two single-dimensional arrays of length s.length() + 1. The first, d, 366 is the 'current working' distance array that maintains the newest distance cost 367 counts as we iterate through the characters of String s. Each time we increment 368 the index of String t we are comparing, d is copied to p, the second int[]. Doing so 369 allows us to retain the previous cost counts as required by the algorithm (taking 370 the minimum of the cost count to the left, up one, and diagonally up and to the left 371 of the current cost count being calculated). (Note that the arrays aren't really 372 copied anymore, just switched...this is clearly much better than cloning an array 373 or doing a System.arraycopy() each time through the outer loop.) 374 375 Effectively, the difference between the two implementations is this one does not 376 cause an out of memory condition when calculating the LD over two very large strings. 377 */ 378 379 int n = left.length(); // length of left 380 int m = right.length(); // length of right 381 382 if (n == 0) { 383 return new LevenshteinResults(m, m, 0, 0); 384 } 385 if (m == 0) { 386 return new LevenshteinResults(n, 0, n, 0); 387 } 388 boolean swapped = false; 389 if (n > m) { 390 // swap the input strings to consume less memory 391 final CharSequence tmp = left; 392 left = right; 393 right = tmp; 394 n = m; 395 m = right.length(); 396 swapped = true; 397 } 398 399 int[] p = new int[n + 1]; // 'previous' cost array, horizontally 400 int[] d = new int[n + 1]; // cost array, horizontally 401 int[] tempD; //placeholder to assist in swapping p and d 402 final int[][] matrix = new int[m + 1][n + 1]; 403 404 // filling the first row and first column values in the matrix 405 for (int index = 0; index <= n; index++) { 406 matrix[0][index] = index; 407 } 408 for (int index = 0; index <= m; index++) { 409 matrix[index][0] = index; 410 } 411 412 // indexes into strings left and right 413 int i; // iterates through left 414 int j; // iterates through right 415 416 char rightJ; // jth character of right 417 418 int cost; // cost 419 for (i = 0; i <= n; i++) { 420 p[i] = i; 421 } 422 423 for (j = 1; j <= m; j++) { 424 rightJ = right.charAt(j - 1); 425 d[0] = j; 426 427 for (i = 1; i <= n; i++) { 428 cost = left.charAt(i - 1) == rightJ ? 0 : 1; 429 // minimum of cell to the left+1, to the top+1, diagonally left and up +cost 430 d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost); 431 //filling the matrix 432 matrix[j][i] = d[i]; 433 } 434 435 // copy current distance counts to 'previous row' distance counts 436 tempD = p; 437 p = d; 438 d = tempD; 439 } 440 return findDetailedResults(left, right, matrix, swapped); 441 } 442 443 /** 444 * Threshold. 445 */ 446 private final Integer threshold; 447 448 /** 449 * <p> 450 * This returns the default instance that uses a version 451 * of the algorithm that does not use a threshold parameter. 452 * </p> 453 * 454 * @see LevenshteinDetailedDistance#getDefaultInstance() 455 */ 456 public LevenshteinDetailedDistance() { 457 this(null); 458 } 459 460 /** 461 * If the threshold is not null, distance calculations will be limited to a maximum length. 462 * 463 * <p>If the threshold is null, the unlimited version of the algorithm will be used.</p> 464 * 465 * @param threshold If this is null then distances calculations will not be limited. This may not be negative. 466 */ 467 public LevenshteinDetailedDistance(final Integer threshold) { 468 if (threshold != null && threshold < 0) { 469 throw new IllegalArgumentException("Threshold must not be negative"); 470 } 471 this.threshold = threshold; 472 } 473 474 /** 475 * Finds the Levenshtein distance between two Strings. 476 * 477 * <p>A higher score indicates a greater distance.</p> 478 * 479 * <p>The previous implementation of the Levenshtein distance algorithm 480 * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> 481 * 482 * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError 483 * which can occur when my Java implementation is used with very large strings.<br> 484 * This implementation of the Levenshtein distance algorithm 485 * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> 486 * 487 * <pre> 488 * distance.apply(null, *) = IllegalArgumentException 489 * distance.apply(*, null) = IllegalArgumentException 490 * distance.apply("","") = 0 491 * distance.apply("","a") = 1 492 * distance.apply("aaapppp", "") = 7 493 * distance.apply("frog", "fog") = 1 494 * distance.apply("fly", "ant") = 3 495 * distance.apply("elephant", "hippo") = 7 496 * distance.apply("hippo", "elephant") = 7 497 * distance.apply("hippo", "zzzzzzzz") = 8 498 * distance.apply("hello", "hallo") = 1 499 * </pre> 500 * 501 * @param left the first string, must not be null 502 * @param right the second string, must not be null 503 * @return result distance, or -1 504 * @throws IllegalArgumentException if either String input {@code null} 505 */ 506 @Override 507 public LevenshteinResults apply(final CharSequence left, final CharSequence right) { 508 if (threshold != null) { 509 return limitedCompare(left, right, threshold); 510 } 511 return unlimitedCompare(left, right); 512 } 513 514 /** 515 * Gets the distance threshold. 516 * 517 * @return The distance threshold 518 */ 519 public Integer getThreshold() { 520 return threshold; 521 } 522 }