View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.rng.examples.stress;
18  
19  import java.io.Serializable;
20  import java.util.Comparator;
21  
22  /**
23   * Provides number sensitive sorting for character sequences.
24   *
25   * <p>Extracts sub-sequences of either numeric ({@code [0, 9]}) or non-numeric characters
26   * and compares them numerically or lexicographically. Leading zeros are ignored from
27   * numbers. Negative numbers are not supported.
28   *
29   * <pre>
30   * Traditional  AlphaNumeric
31   * z0200.html   z2.html
32   * z100.html    z100.html
33   * z2.html      z0200.html
34   * </pre>
35   *
36   * <p>This is based on ideas in the Alphanum algorithm by David Koelle.</p>
37   *
38   * <p>This implementation supports:</p>
39   *
40   * <ul>
41   *  <li>{@link CharSequence} comparison
42   *  <li>Direct use of input sequences for minimal memory consumption
43   *  <li>Numbers with leading zeros
44   * </ul>
45   *
46   * <p>Any null sequences are ordered before non-null sequences.</p>
47   *
48   * <p>Note: The comparator is thread-safe so can be used in a parallel sort.
49   *
50   * @see <a href="http://www.DaveKoelle.com">Alphanum Algorithm</a>
51   */
52  class AlphaNumericComparator implements Comparator<CharSequence>, Serializable {
53      /**
54       * An instance.
55       */
56      public static final AlphaNumericComparator INSTANCE = new AlphaNumericComparator();
57  
58      /**
59       * The serial version ID.
60       * Note: Comparators are recommended to be Serializable to allow serialization of
61       * collections constructed with a Comparator.
62       */
63      private static final long serialVersionUID = 1L;
64  
65      @Override
66      public int compare(CharSequence seq1, CharSequence seq2) {
67          // Null is less
68          if (seq1 == null) {
69              return -1;
70          }
71          if (seq2 == null) {
72              return 1;
73          }
74          if (seq1.equals(seq2)) {
75              return 0;
76          }
77  
78          int pos1 = 0;
79          int pos2 = 0;
80          final int length1 = seq1.length();
81          final int length2 = seq2.length();
82  
83          while (pos1 < length1 && pos2 < length2) {
84              final int end1 = nextSubSequenceEnd(seq1, pos1, length1);
85              final int end2 = nextSubSequenceEnd(seq2, pos2, length2);
86  
87              // If both sub-sequences contain numeric characters, sort them numerically
88              int result = 0;
89              if (isDigit(seq1.charAt(pos1)) && isDigit(seq2.charAt(pos2))) {
90                  result = compareNumerically(seq1, pos1, end1, seq2, pos2, end2);
91              } else {
92                  result = compareLexicographically(seq1, pos1, end1, seq2, pos2, end2);
93              }
94  
95              if (result != 0) {
96                  return result;
97              }
98  
99              pos1 = end1;
100             pos2 = end2;
101         }
102 
103         return length1 - length2;
104     }
105 
106     /**
107      * Get the end position of the next sub-sequence of either digits or non-digit
108      * characters starting from the start position.
109      *
110      * <p>The end position is exclusive such that the sub-sequence is the interval
111      * {@code [start, end)}.
112      *
113      * @param seq the character sequence
114      * @param start the start position
115      * @param length the sequence length
116      * @return the sub-sequence end position (exclusive)
117      */
118     private static int nextSubSequenceEnd(CharSequence seq, int start, int length) {
119         int pos = start;
120         // Set the sub-sequence type (digits or non-digits)
121         final boolean seqType = isDigit(seq.charAt(pos++));
122         while (pos < length && seqType == isDigit(seq.charAt(pos))) {
123             // Extend the sub-sequence
124             pos++;
125         }
126         return pos;
127     }
128 
129     /**
130      * Checks if the character is a digit.
131      *
132      * @param ch the character
133      * @return true if a digit
134      */
135     private static boolean isDigit(char ch) {
136         return ch >= 48 && ch <= 57;
137     }
138 
139     /**
140      * Compares two sub-sequences numerically. Ignores leading zeros. Assumes all
141      * characters are digits.
142      *
143      * @param seq1 the first sequence
144      * @param start1 the start of the first sub-sequence
145      * @param end1 the end of the first sub-sequence
146      * @param seq2 the second sequence
147      * @param start2 the start of the second sub-sequence
148      * @param end2 the end of the second sub-sequence sequence
149      * @return the value {@code 0} if the sub-sequences are equal; a value less than
150      * {@code 0} if sub-sequence 1 is numerically less than sub-sequence 2; and a value
151      * greater than {@code 0} if sub-sequence 1 is numerically greater than sub-sequence
152      * 2.
153      */
154     private static int compareNumerically(CharSequence seq1, int start1, int end1,
155                                           CharSequence seq2, int start2, int end2) {
156         // Ignore leading zeros in numbers
157         int pos1 = advancePastLeadingZeros(seq1, start1, end1);
158         int pos2 = advancePastLeadingZeros(seq2, start2, end2);
159 
160         // Simple comparison by length
161         final int result = (end1 - pos1) - (end2 - pos2);
162         // If equal, the first different number counts.
163         if (result == 0) {
164             while (pos1 < end1) {
165                 final char c1 = seq1.charAt(pos1++);
166                 final char c2 = seq2.charAt(pos2++);
167                 if (c1 != c2) {
168                     return c1 - c2;
169                 }
170             }
171         }
172         return result;
173     }
174 
175     /**
176      * Advances past leading zeros in the sub-sequence. Returns the index of the start
177      * character of the number.
178      *
179      * @param seq the sequence
180      * @param start the start of the sub-sequence
181      * @param end the end of the sub-sequence
182      * @return the start index of the number
183      */
184     private static int advancePastLeadingZeros(CharSequence seq, int start, int end) {
185         int pos = start;
186         // Ignore zeros only when there are further characters
187         while (pos < end - 1 && seq.charAt(pos) == '0') {
188             pos++;
189         }
190         return pos;
191     }
192 
193     /**
194      * Compares two sub-sequences lexicographically. This matches the compare function in
195      * {@link String} using extracted sub-sequences.
196      *
197      * @param seq1 the first sequence
198      * @param start1 the start of the first sub-sequence
199      * @param end1 the end of the first sub-sequence
200      * @param seq2 the second sequence
201      * @param start2 the start of the second sub-sequence
202      * @param end2 the end of the second sub-sequence sequence
203      * @return the value {@code 0} if the sub-sequences are equal; a value less than
204      * {@code 0} if sub-sequence 1 is lexicographically less than sub-sequence 2; and a
205      * value greater than {@code 0} if sub-sequence 1 is lexicographically greater than
206      * sub-sequence 2.
207      * @see String#compareTo(String)
208      */
209     private static int compareLexicographically(CharSequence seq1, int start1, int end1,
210                                                 CharSequence seq2, int start2, int end2) {
211         final int len1 = end1 - start1;
212         final int len2 = end2 - start2;
213         final int limit = Math.min(len1, len2);
214 
215         for (int i = 0; i < limit; i++) {
216             final char c1 = seq1.charAt(i + start1);
217             final char c2 = seq2.charAt(i + start2);
218             if (c1 != c2) {
219                 return c1 - c2;
220             }
221         }
222         return len1 - len2;
223     }
224 }