View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.collections4.sequence;
18  
19  import java.util.List;
20  
21  import org.apache.commons.collections4.Equator;
22  import org.apache.commons.collections4.functors.DefaultEquator;
23  
24  /**
25   * This class allows to compare two objects sequences.
26   * <p>
27   * The two sequences can hold any object type, as only the {@code equals}
28   * method is used to compare the elements of the sequences. It is guaranteed
29   * that the comparisons will always be done as {@code o1.equals(o2)} where
30   * {@code o1} belongs to the first sequence and {@code o2} belongs to
31   * the second sequence. This can be important if subclassing is used for some
32   * elements in the first sequence and the {@code equals} method is
33   * specialized.
34   * </p>
35   * <p>
36   * Comparison can be seen from two points of view: either as giving the smallest
37   * modification allowing to transform the first sequence into the second one, or
38   * as giving the longest sequence which is a subsequence of both initial
39   * sequences. The {@code equals} method is used to compare objects, so any
40   * object can be put into sequences. Modifications include deleting, inserting
41   * or keeping one object, starting from the beginning of the first sequence.
42   * </p>
43   * <p>
44   * This class implements the comparison algorithm, which is the very efficient
45   * algorithm from Eugene W. Myers
46   * <a href="https://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps">
47   * An O(ND) Difference Algorithm and Its Variations</a>. This algorithm produces
48   * the shortest possible
49   * {@link EditScript edit script}
50   * containing all the
51   * {@link EditCommand commands}
52   * needed to transform the first sequence into the second one.
53   * </p>
54   *
55   * @param <T> the type of elements in the lists.
56   * @see EditScript
57   * @see EditCommand
58   * @see CommandVisitor
59   * @since 4.0
60   */
61  public class SequencesComparator<T> {
62  
63      /**
64       * This class is a simple placeholder to hold the end part of a path
65       * under construction in a {@link SequencesComparator SequencesComparator}.
66       */
67      private static final class Snake {
68  
69          /** Start index. */
70          private final int start;
71  
72          /** End index. */
73          private final int end;
74  
75          /** Diagonal number. */
76          private final int diag;
77  
78          /**
79           * Simple constructor. Creates a new instance of Snake with specified indices.
80           *
81           * @param start  start index of the snake
82           * @param end  end index of the snake
83           * @param diag  diagonal number
84           */
85          Snake(final int start, final int end, final int diag) {
86              this.start = start;
87              this.end   = end;
88              this.diag  = diag;
89          }
90  
91          /**
92           * Gets the diagonal number of the snake.
93           *
94           * @return diagonal number of the snake
95           */
96          public int getDiag() {
97              return diag;
98          }
99  
100         /**
101          * Gets the end index of the snake.
102          *
103          * @return end index of the snake
104          */
105         public int getEnd() {
106             return end;
107         }
108 
109         /**
110          * Gets the start index of the snake.
111          *
112          * @return start index of the snake
113          */
114         public int getStart() {
115             return start;
116         }
117     }
118 
119     /** First sequence. */
120     private final List<T> sequence1;
121 
122     /** Second sequence. */
123     private final List<T> sequence2;
124 
125     /** The equator used for testing object equality. */
126     private final Equator<? super T> equator;
127     /** Temporary variables. */
128     private final int[] vDown;
129 
130     private final int[] vUp;
131 
132     /**
133      * Simple constructor.
134      * <p>
135      * Creates a new instance of SequencesComparator using a {@link DefaultEquator}.
136      * <p>
137      * It is <em>guaranteed</em> that the comparisons will always be done as
138      * {@code o1.equals(o2)} where {@code o1} belongs to the first
139      * sequence and {@code o2} belongs to the second sequence. This can be
140      * important if subclassing is used for some elements in the first sequence
141      * and the {@code equals} method is specialized.
142      *
143      * @param sequence1  first sequence to be compared
144      * @param sequence2  second sequence to be compared
145      */
146     public SequencesComparator(final List<T> sequence1, final List<T> sequence2) {
147         this(sequence1, sequence2, DefaultEquator.defaultEquator());
148     }
149 
150     /**
151      * Simple constructor.
152      * <p>
153      * Creates a new instance of SequencesComparator with a custom {@link Equator}.
154      * <p>
155      * It is <em>guaranteed</em> that the comparisons will always be done as
156      * {@code Equator.equate(o1, o2)} where {@code o1} belongs to the first
157      * sequence and {@code o2} belongs to the second sequence.
158      *
159      * @param sequence1  first sequence to be compared
160      * @param sequence2  second sequence to be compared
161      * @param equator  the equator to use for testing object equality
162      */
163     public SequencesComparator(final List<T> sequence1, final List<T> sequence2, final Equator<? super T> equator) {
164         this.sequence1 = sequence1;
165         this.sequence2 = sequence2;
166         this.equator = equator;
167 
168         final int size = sequence1.size() + sequence2.size() + 2;
169         vDown = new int[size];
170         vUp   = new int[size];
171     }
172 
173     /**
174      * Build an edit script.
175      *
176      * @param start1  the start of the first sequence to be compared
177      * @param end1  the end of the first sequence to be compared
178      * @param start2  the start of the second sequence to be compared
179      * @param end2  the end of the second sequence to be compared
180      * @param script the edited script
181      */
182     private void buildScript(final int start1, final int end1, final int start2, final int end2,
183                              final EditScript<T> script) {
184 
185         final Snake middle = getMiddleSnake(start1, end1, start2, end2);
186 
187         if (middle == null
188                 || middle.getStart() == end1 && middle.getDiag() == end1 - end2
189                 || middle.getEnd() == start1 && middle.getDiag() == start1 - start2) {
190 
191             int i = start1;
192             int j = start2;
193             while (i < end1 || j < end2) {
194                 if (i < end1 && j < end2 && equator.equate(sequence1.get(i), sequence2.get(j))) {
195                     script.append(new KeepCommand<>(sequence1.get(i)));
196                     ++i;
197                     ++j;
198                 } else if (end1 - start1 > end2 - start2) {
199                     script.append(new DeleteCommand<>(sequence1.get(i)));
200                     ++i;
201                 } else {
202                     script.append(new InsertCommand<>(sequence2.get(j)));
203                     ++j;
204                 }
205             }
206 
207         } else {
208 
209             buildScript(start1, middle.getStart(),
210                         start2, middle.getStart() - middle.getDiag(),
211                         script);
212             for (int i = middle.getStart(); i < middle.getEnd(); ++i) {
213                 script.append(new KeepCommand<>(sequence1.get(i)));
214             }
215             buildScript(middle.getEnd(), end1,
216                         middle.getEnd() - middle.getDiag(), end2,
217                         script);
218         }
219     }
220 
221     /**
222      * Build a snake.
223      *
224      * @param start  the value of the start of the snake
225      * @param diag  the value of the diagonal of the snake
226      * @param end1  the value of the end of the first sequence to be compared
227      * @param end2  the value of the end of the second sequence to be compared
228      * @return the snake built
229      */
230     private Snake buildSnake(final int start, final int diag, final int end1, final int end2) {
231         int end = start;
232         while (end - diag < end2
233                 && end < end1
234                 && equator.equate(sequence1.get(end), sequence2.get(end - diag))) {
235             ++end;
236         }
237         return new Snake(start, end, diag);
238     }
239 
240     /**
241      * Gets the middle snake corresponding to two subsequences of the
242      * main sequences.
243      * <p>
244      * The snake is found using the MYERS Algorithm (this algorithm has
245      * also been implemented in the GNU diff program). This algorithm is
246      * explained in Eugene Myers article:
247      * <a href="https://web.archive.org/web/20040719035900/http%3A//www.cs.arizona.edu/people/gene/PAPERS/diff.ps">
248      * An O(ND) Difference Algorithm and Its Variations</a>.
249      *
250      * @param start1  the start of the first sequence to be compared
251      * @param end1  the end of the first sequence to be compared
252      * @param start2  the start of the second sequence to be compared
253      * @param end2  the end of the second sequence to be compared
254      * @return the middle snake
255      */
256     private Snake getMiddleSnake(final int start1, final int end1, final int start2, final int end2) {
257         // Myers Algorithm
258         // Initialisations
259         final int m = end1 - start1;
260         final int n = end2 - start2;
261         if (m == 0 || n == 0) {
262             return null;
263         }
264 
265         final int delta = m - n;
266         final int sum = n + m;
267         final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2;
268         vDown[1 + offset] = start1;
269         vUp[1 + offset] = end1 + 1;
270 
271         for (int d = 0; d <= offset; ++d) {
272             // Down
273             for (int k = -d; k <= d; k += 2) {
274                 // First step
275 
276                 final int i = k + offset;
277                 if (k == -d || k != d && vDown[i - 1] < vDown[i + 1]) {
278                     vDown[i] = vDown[i + 1];
279                 } else {
280                     vDown[i] = vDown[i - 1] + 1;
281                 }
282 
283                 int x = vDown[i];
284                 int y = x - start1 + start2 - k;
285 
286                 while (x < end1 && y < end2 && equator.equate(sequence1.get(x), sequence2.get(y))) {
287                     vDown[i] = ++x;
288                     ++y;
289                 }
290                 // Second step
291                 if (delta % 2 != 0 && delta - d <= k && k <= delta + d && vUp[i - delta] <= vDown[i]) { // NOPMD
292                     return buildSnake(vUp[i - delta], k + start1 - start2, end1, end2);
293                 }
294             }
295 
296             // Up
297             for (int k = delta - d; k <= delta + d; k += 2) {
298                 // First step
299                 final int i = k + offset - delta;
300                 if (k == delta - d || k != delta + d && vUp[i + 1] <= vUp[i - 1]) {
301                     vUp[i] = vUp[i + 1] - 1;
302                 } else {
303                     vUp[i] = vUp[i - 1];
304                 }
305 
306                 int x = vUp[i] - 1;
307                 int y = x - start1 + start2 - k;
308                 while (x >= start1 && y >= start2 && equator.equate(sequence1.get(x), sequence2.get(y))) {
309                     vUp[i] = x--;
310                     y--;
311                 }
312                 // Second step
313                 if (delta % 2 == 0 && -d <= k && k <= d && vUp[i] <= vDown[i + delta]) { // NOPMD
314                     return buildSnake(vUp[i], k + start1 - start2, end1, end2);
315                 }
316             }
317         }
318 
319         // this should not happen
320         throw new IllegalStateException("Internal Error");
321     }
322 
323     /**
324      * Gets the {@link EditScript} object.
325      * <p>
326      * It is guaranteed that the objects embedded in the {@link InsertCommand
327      * insert commands} come from the second sequence and that the objects
328      * embedded in either the {@link DeleteCommand delete commands} or
329      * {@link KeepCommand keep commands} come from the first sequence. This can
330      * be important if subclassing is used for some elements in the first
331      * sequence and the {@code equals} method is specialized.
332      *
333      * @return the edit script resulting from the comparison of the two
334      *         sequences
335      */
336     public EditScript<T> getScript() {
337         final EditScript<T> script = new EditScript<>();
338         buildScript(0, sequence1.size(), 0, sequence2.size(), script);
339         return script;
340     }
341 }