1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.collections4.sequence;
18
19 import java.util.List;
20
21 import org.apache.commons.collections4.Equator;
22 import org.apache.commons.collections4.functors.DefaultEquator;
23
24 /**
25 * This class allows to compare two objects sequences.
26 * <p>
27 * The two sequences can hold any object type, as only the {@code equals}
28 * method is used to compare the elements of the sequences. It is guaranteed
29 * that the comparisons will always be done as {@code o1.equals(o2)} where
30 * {@code o1} belongs to the first sequence and {@code o2} belongs to
31 * the second sequence. This can be important if subclassing is used for some
32 * elements in the first sequence and the {@code equals} method is
33 * specialized.
34 * </p>
35 * <p>
36 * Comparison can be seen from two points of view: either as giving the smallest
37 * modification allowing to transform the first sequence into the second one, or
38 * as giving the longest sequence which is a subsequence of both initial
39 * sequences. The {@code equals} method is used to compare objects, so any
40 * object can be put into sequences. Modifications include deleting, inserting
41 * or keeping one object, starting from the beginning of the first sequence.
42 * </p>
43 * <p>
44 * This class implements the comparison algorithm, which is the very efficient
45 * algorithm from Eugene W. Myers
46 * <a href="https://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps">
47 * An O(ND) Difference Algorithm and Its Variations</a>. This algorithm produces
48 * the shortest possible
49 * {@link EditScript edit script}
50 * containing all the
51 * {@link EditCommand commands}
52 * needed to transform the first sequence into the second one.
53 * </p>
54 *
55 * @param <T> the type of elements in the lists.
56 * @see EditScript
57 * @see EditCommand
58 * @see CommandVisitor
59 * @since 4.0
60 */
61 public class SequencesComparator<T> {
62
63 /**
64 * This class is a simple placeholder to hold the end part of a path
65 * under construction in a {@link SequencesComparator SequencesComparator}.
66 */
67 private static final class Snake {
68
69 /** Start index. */
70 private final int start;
71
72 /** End index. */
73 private final int end;
74
75 /** Diagonal number. */
76 private final int diag;
77
78 /**
79 * Simple constructor. Creates a new instance of Snake with specified indices.
80 *
81 * @param start start index of the snake
82 * @param end end index of the snake
83 * @param diag diagonal number
84 */
85 Snake(final int start, final int end, final int diag) {
86 this.start = start;
87 this.end = end;
88 this.diag = diag;
89 }
90
91 /**
92 * Gets the diagonal number of the snake.
93 *
94 * @return diagonal number of the snake
95 */
96 public int getDiag() {
97 return diag;
98 }
99
100 /**
101 * Gets the end index of the snake.
102 *
103 * @return end index of the snake
104 */
105 public int getEnd() {
106 return end;
107 }
108
109 /**
110 * Gets the start index of the snake.
111 *
112 * @return start index of the snake
113 */
114 public int getStart() {
115 return start;
116 }
117 }
118
119 /** First sequence. */
120 private final List<T> sequence1;
121
122 /** Second sequence. */
123 private final List<T> sequence2;
124
125 /** The equator used for testing object equality. */
126 private final Equator<? super T> equator;
127 /** Temporary variables. */
128 private final int[] vDown;
129
130 private final int[] vUp;
131
132 /**
133 * Simple constructor.
134 * <p>
135 * Creates a new instance of SequencesComparator using a {@link DefaultEquator}.
136 * <p>
137 * It is <em>guaranteed</em> that the comparisons will always be done as
138 * {@code o1.equals(o2)} where {@code o1} belongs to the first
139 * sequence and {@code o2} belongs to the second sequence. This can be
140 * important if subclassing is used for some elements in the first sequence
141 * and the {@code equals} method is specialized.
142 *
143 * @param sequence1 first sequence to be compared
144 * @param sequence2 second sequence to be compared
145 */
146 public SequencesComparator(final List<T> sequence1, final List<T> sequence2) {
147 this(sequence1, sequence2, DefaultEquator.defaultEquator());
148 }
149
150 /**
151 * Simple constructor.
152 * <p>
153 * Creates a new instance of SequencesComparator with a custom {@link Equator}.
154 * <p>
155 * It is <em>guaranteed</em> that the comparisons will always be done as
156 * {@code Equator.equate(o1, o2)} where {@code o1} belongs to the first
157 * sequence and {@code o2} belongs to the second sequence.
158 *
159 * @param sequence1 first sequence to be compared
160 * @param sequence2 second sequence to be compared
161 * @param equator the equator to use for testing object equality
162 */
163 public SequencesComparator(final List<T> sequence1, final List<T> sequence2, final Equator<? super T> equator) {
164 this.sequence1 = sequence1;
165 this.sequence2 = sequence2;
166 this.equator = equator;
167
168 final int size = sequence1.size() + sequence2.size() + 2;
169 vDown = new int[size];
170 vUp = new int[size];
171 }
172
173 /**
174 * Build an edit script.
175 *
176 * @param start1 the start of the first sequence to be compared
177 * @param end1 the end of the first sequence to be compared
178 * @param start2 the start of the second sequence to be compared
179 * @param end2 the end of the second sequence to be compared
180 * @param script the edited script
181 */
182 private void buildScript(final int start1, final int end1, final int start2, final int end2,
183 final EditScript<T> script) {
184
185 final Snake middle = getMiddleSnake(start1, end1, start2, end2);
186
187 if (middle == null
188 || middle.getStart() == end1 && middle.getDiag() == end1 - end2
189 || middle.getEnd() == start1 && middle.getDiag() == start1 - start2) {
190
191 int i = start1;
192 int j = start2;
193 while (i < end1 || j < end2) {
194 if (i < end1 && j < end2 && equator.equate(sequence1.get(i), sequence2.get(j))) {
195 script.append(new KeepCommand<>(sequence1.get(i)));
196 ++i;
197 ++j;
198 } else if (end1 - start1 > end2 - start2) {
199 script.append(new DeleteCommand<>(sequence1.get(i)));
200 ++i;
201 } else {
202 script.append(new InsertCommand<>(sequence2.get(j)));
203 ++j;
204 }
205 }
206
207 } else {
208
209 buildScript(start1, middle.getStart(),
210 start2, middle.getStart() - middle.getDiag(),
211 script);
212 for (int i = middle.getStart(); i < middle.getEnd(); ++i) {
213 script.append(new KeepCommand<>(sequence1.get(i)));
214 }
215 buildScript(middle.getEnd(), end1,
216 middle.getEnd() - middle.getDiag(), end2,
217 script);
218 }
219 }
220
221 /**
222 * Build a snake.
223 *
224 * @param start the value of the start of the snake
225 * @param diag the value of the diagonal of the snake
226 * @param end1 the value of the end of the first sequence to be compared
227 * @param end2 the value of the end of the second sequence to be compared
228 * @return the snake built
229 */
230 private Snake buildSnake(final int start, final int diag, final int end1, final int end2) {
231 int end = start;
232 while (end - diag < end2
233 && end < end1
234 && equator.equate(sequence1.get(end), sequence2.get(end - diag))) {
235 ++end;
236 }
237 return new Snake(start, end, diag);
238 }
239
240 /**
241 * Gets the middle snake corresponding to two subsequences of the
242 * main sequences.
243 * <p>
244 * The snake is found using the MYERS Algorithm (this algorithm has
245 * also been implemented in the GNU diff program). This algorithm is
246 * explained in Eugene Myers article:
247 * <a href="https://web.archive.org/web/20040719035900/http%3A//www.cs.arizona.edu/people/gene/PAPERS/diff.ps">
248 * An O(ND) Difference Algorithm and Its Variations</a>.
249 *
250 * @param start1 the start of the first sequence to be compared
251 * @param end1 the end of the first sequence to be compared
252 * @param start2 the start of the second sequence to be compared
253 * @param end2 the end of the second sequence to be compared
254 * @return the middle snake
255 */
256 private Snake getMiddleSnake(final int start1, final int end1, final int start2, final int end2) {
257 // Myers Algorithm
258 // Initialisations
259 final int m = end1 - start1;
260 final int n = end2 - start2;
261 if (m == 0 || n == 0) {
262 return null;
263 }
264
265 final int delta = m - n;
266 final int sum = n + m;
267 final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2;
268 vDown[1 + offset] = start1;
269 vUp[1 + offset] = end1 + 1;
270
271 for (int d = 0; d <= offset; ++d) {
272 // Down
273 for (int k = -d; k <= d; k += 2) {
274 // First step
275
276 final int i = k + offset;
277 if (k == -d || k != d && vDown[i - 1] < vDown[i + 1]) {
278 vDown[i] = vDown[i + 1];
279 } else {
280 vDown[i] = vDown[i - 1] + 1;
281 }
282
283 int x = vDown[i];
284 int y = x - start1 + start2 - k;
285
286 while (x < end1 && y < end2 && equator.equate(sequence1.get(x), sequence2.get(y))) {
287 vDown[i] = ++x;
288 ++y;
289 }
290 // Second step
291 if (delta % 2 != 0 && delta - d <= k && k <= delta + d && vUp[i - delta] <= vDown[i]) { // NOPMD
292 return buildSnake(vUp[i - delta], k + start1 - start2, end1, end2);
293 }
294 }
295
296 // Up
297 for (int k = delta - d; k <= delta + d; k += 2) {
298 // First step
299 final int i = k + offset - delta;
300 if (k == delta - d || k != delta + d && vUp[i + 1] <= vUp[i - 1]) {
301 vUp[i] = vUp[i + 1] - 1;
302 } else {
303 vUp[i] = vUp[i - 1];
304 }
305
306 int x = vUp[i] - 1;
307 int y = x - start1 + start2 - k;
308 while (x >= start1 && y >= start2 && equator.equate(sequence1.get(x), sequence2.get(y))) {
309 vUp[i] = x--;
310 y--;
311 }
312 // Second step
313 if (delta % 2 == 0 && -d <= k && k <= d && vUp[i] <= vDown[i + delta]) { // NOPMD
314 return buildSnake(vUp[i], k + start1 - start2, end1, end2);
315 }
316 }
317 }
318
319 // this should not happen
320 throw new IllegalStateException("Internal Error");
321 }
322
323 /**
324 * Gets the {@link EditScript} object.
325 * <p>
326 * It is guaranteed that the objects embedded in the {@link InsertCommand
327 * insert commands} come from the second sequence and that the objects
328 * embedded in either the {@link DeleteCommand delete commands} or
329 * {@link KeepCommand keep commands} come from the first sequence. This can
330 * be important if subclassing is used for some elements in the first
331 * sequence and the {@code equals} method is specialized.
332 *
333 * @return the edit script resulting from the comparison of the two
334 * sequences
335 */
336 public EditScript<T> getScript() {
337 final EditScript<T> script = new EditScript<>();
338 buildScript(0, sequence1.size(), 0, sequence2.size(), script);
339 return script;
340 }
341 }