001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.text.diff; 018 019/** 020 * <p> 021 * It is guaranteed that the comparisons will always be done as 022 * {@code o1.equals(o2)} where {@code o1} belongs to the first 023 * sequence and {@code o2} belongs to the second sequence. This can 024 * be important if subclassing is used for some elements in the first 025 * sequence and the {@code equals} method is specialized. 026 * </p> 027 * <p> 028 * Comparison can be seen from two points of view: either as giving the smallest 029 * modification allowing to transform the first sequence into the second one, or 030 * as giving the longest sequence which is a subsequence of both initial 031 * sequences. The {@code equals} method is used to compare objects, so any 032 * object can be put into sequences. Modifications include deleting, inserting 033 * or keeping one object, starting from the beginning of the first sequence. 034 * </p> 035 * <p> 036 * This class implements the comparison algorithm, which is the very efficient 037 * algorithm from Eugene W. Myers 038 * <a href="http://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps"> 039 * An O(ND) Difference Algorithm and Its Variations</a>. This algorithm produces 040 * the shortest possible {@link EditScript edit script} containing all the 041 * {@link EditCommand commands} needed to transform the first sequence into 042 * the second one. 043 * 044 * <p> 045 * This code has been adapted from Apache Commons Collections 4.0. 046 * </p> 047 * 048 * @see EditScript 049 * @see EditCommand 050 * @see CommandVisitor 051 * @since 1.0 052 */ 053public class StringsComparator { 054 055 /** 056 * This class is a simple placeholder to hold the end part of a path 057 * under construction in a {@link StringsComparator StringsComparator}. 058 */ 059 private static final class Snake { 060 061 /** Start index. */ 062 private final int start; 063 064 /** End index. */ 065 private final int end; 066 067 /** Diagonal number. */ 068 private final int diag; 069 070 /** 071 * Constructs a new instance of Snake with specified indices. 072 * 073 * @param start start index of the snake 074 * @param end end index of the snake 075 * @param diag diagonal number 076 */ 077 Snake(final int start, final int end, final int diag) { 078 this.start = start; 079 this.end = end; 080 this.diag = diag; 081 } 082 083 /** 084 * Gets the diagonal number of the snake. 085 * 086 * @return diagonal number of the snake 087 */ 088 public int getDiag() { 089 return diag; 090 } 091 092 /** 093 * Gets the end index of the snake. 094 * 095 * @return end index of the snake 096 */ 097 public int getEnd() { 098 return end; 099 } 100 101 /** 102 * Gets the start index of the snake. 103 * 104 * @return start index of the snake 105 */ 106 public int getStart() { 107 return start; 108 } 109 } 110 /** 111 * First character sequence. 112 */ 113 private final String left; 114 /** 115 * Second character sequence. 116 */ 117 private final String right; 118 /** 119 * Temporary array. 120 */ 121 private final int[] vDown; 122 123 /** 124 * Temporary array. 125 */ 126 private final int[] vUp; 127 128 /** 129 * Constructs a new instance of StringsComparator. 130 * <p> 131 * It is <em>guaranteed</em> that the comparisons will always be done as 132 * {@code o1.equals(o2)} where {@code o1} belongs to the first 133 * sequence and {@code o2} belongs to the second sequence. This can be 134 * important if subclassing is used for some elements in the first sequence 135 * and the {@code equals} method is specialized. 136 * </p> 137 * 138 * @param left first character sequence to be compared 139 * @param right second character sequence to be compared 140 */ 141 public StringsComparator(final String left, final String right) { 142 this.left = left; 143 this.right = right; 144 145 final int size = left.length() + right.length() + 2; 146 vDown = new int[size]; 147 vUp = new int[size]; 148 } 149 150 /** 151 * Builds an edit script. 152 * 153 * @param start1 the begin of the first sequence to be compared 154 * @param end1 the end of the first sequence to be compared 155 * @param start2 the begin of the second sequence to be compared 156 * @param end2 the end of the second sequence to be compared 157 * @param script the edited script 158 */ 159 private void buildScript(final int start1, final int end1, final int start2, final int end2, 160 final EditScript<Character> script) { 161 final Snake middle = getMiddleSnake(start1, end1, start2, end2); 162 163 if (middle == null 164 || middle.getStart() == end1 && middle.getDiag() == end1 - end2 165 || middle.getEnd() == start1 && middle.getDiag() == start1 - start2) { 166 167 int i = start1; 168 int j = start2; 169 while (i < end1 || j < end2) { 170 if (i < end1 && j < end2 && left.charAt(i) == right.charAt(j)) { 171 script.append(new KeepCommand<>(left.charAt(i))); 172 ++i; 173 ++j; 174 } else if (end1 - start1 > end2 - start2) { 175 script.append(new DeleteCommand<>(left.charAt(i))); 176 ++i; 177 } else { 178 script.append(new InsertCommand<>(right.charAt(j))); 179 ++j; 180 } 181 } 182 183 } else { 184 185 buildScript(start1, middle.getStart(), 186 start2, middle.getStart() - middle.getDiag(), 187 script); 188 for (int i = middle.getStart(); i < middle.getEnd(); ++i) { 189 script.append(new KeepCommand<>(left.charAt(i))); 190 } 191 buildScript(middle.getEnd(), end1, 192 middle.getEnd() - middle.getDiag(), end2, 193 script); 194 } 195 } 196 197 /** 198 * Builds a snake. 199 * 200 * @param start the value of the start of the snake 201 * @param diag the value of the diagonal of the snake 202 * @param end1 the value of the end of the first sequence to be compared 203 * @param end2 the value of the end of the second sequence to be compared 204 * @return The snake built 205 */ 206 private Snake buildSnake(final int start, final int diag, final int end1, final int end2) { 207 int end = start; 208 while (end - diag < end2 209 && end < end1 210 && left.charAt(end) == right.charAt(end - diag)) { 211 ++end; 212 } 213 return new Snake(start, end, diag); 214 } 215 216 /** 217 * Gets the middle snake corresponding to two subsequences of the 218 * main sequences. 219 * <p> 220 * The snake is found using the MYERS Algorithm (this algorithms has 221 * also been implemented in the GNU diff program). This algorithm is 222 * explained in Eugene Myers article: 223 * <a href="http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps"> 224 * An O(ND) Difference Algorithm and Its Variations</a>. 225 * </p> 226 * 227 * @param start1 the begin of the first sequence to be compared 228 * @param end1 the end of the first sequence to be compared 229 * @param start2 the begin of the second sequence to be compared 230 * @param end2 the end of the second sequence to be compared 231 * @return The middle snake 232 */ 233 private Snake getMiddleSnake(final int start1, final int end1, final int start2, final int end2) { 234 // Myers Algorithm 235 // Initialisations 236 final int m = end1 - start1; 237 final int n = end2 - start2; 238 if (m == 0 || n == 0) { 239 return null; 240 } 241 242 final int delta = m - n; 243 final int sum = n + m; 244 final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2; 245 vDown[1 + offset] = start1; 246 vUp[1 + offset] = end1 + 1; 247 248 for (int d = 0; d <= offset; ++d) { 249 // Down 250 for (int k = -d; k <= d; k += 2) { 251 // First step 252 253 final int i = k + offset; 254 if (k == -d || k != d && vDown[i - 1] < vDown[i + 1]) { 255 vDown[i] = vDown[i + 1]; 256 } else { 257 vDown[i] = vDown[i - 1] + 1; 258 } 259 260 int x = vDown[i]; 261 int y = x - start1 + start2 - k; 262 263 while (x < end1 && y < end2 && left.charAt(x) == right.charAt(y)) { 264 vDown[i] = ++x; 265 ++y; 266 } 267 // Second step 268 if (delta % 2 != 0 && delta - d <= k && k <= delta + d) { 269 if (vUp[i - delta] <= vDown[i]) { // NOPMD 270 return buildSnake(vUp[i - delta], k + start1 - start2, end1, end2); 271 } 272 } 273 } 274 275 // Up 276 for (int k = delta - d; k <= delta + d; k += 2) { 277 // First step 278 final int i = k + offset - delta; 279 if (k == delta - d 280 || k != delta + d && vUp[i + 1] <= vUp[i - 1]) { 281 vUp[i] = vUp[i + 1] - 1; 282 } else { 283 vUp[i] = vUp[i - 1]; 284 } 285 286 int x = vUp[i] - 1; 287 int y = x - start1 + start2 - k; 288 while (x >= start1 && y >= start2 289 && left.charAt(x) == right.charAt(y)) { 290 vUp[i] = x--; 291 y--; 292 } 293 // Second step 294 if (delta % 2 == 0 && -d <= k && k <= d) { 295 if (vUp[i] <= vDown[i + delta]) { // NOPMD 296 return buildSnake(vUp[i], k + start1 - start2, end1, end2); 297 } 298 } 299 } 300 } 301 302 // this should not happen 303 throw new IllegalStateException("Internal Error"); 304 } 305 306 /** 307 * Gets the {@link EditScript} object. 308 * <p> 309 * It is guaranteed that the objects embedded in the {@link InsertCommand 310 * insert commands} come from the second sequence and that the objects 311 * embedded in either the {@link DeleteCommand delete commands} or 312 * {@link KeepCommand keep commands} come from the first sequence. This can 313 * be important if subclassing is used for some elements in the first 314 * sequence and the {@code equals} method is specialized. 315 * </p> 316 * 317 * @return The edit script resulting from the comparison of the two 318 * sequences 319 */ 320 public EditScript<Character> getScript() { 321 final EditScript<Character> script = new EditScript<>(); 322 buildScript(0, left.length(), 0, right.length(), script); 323 return script; 324 } 325 326}