1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.commons.rng.examples.stress; 18 19 import java.io.Serializable; 20 import java.util.Comparator; 21 22 /** 23 * Provides number sensitive sorting for character sequences. 24 * 25 * <p>Extracts sub-sequences of either numeric ({@code [0, 9]}) or non-numeric characters 26 * and compares them numerically or lexicographically. Leading zeros are ignored from 27 * numbers. Negative numbers are not supported. 28 * 29 * <pre> 30 * Traditional AlphaNumeric 31 * z0200.html z2.html 32 * z100.html z100.html 33 * z2.html z0200.html 34 * </pre> 35 * 36 * <p>This is based on ideas in the Alphanum algorithm by David Koelle.</p> 37 * 38 * <p>This implementation supports:</p> 39 * 40 * <ul> 41 * <li>{@link CharSequence} comparison 42 * <li>Direct use of input sequences for minimal memory consumption 43 * <li>Numbers with leading zeros 44 * </ul> 45 * 46 * <p>Any null sequences are ordered before non-null sequences.</p> 47 * 48 * <p>Note: The comparator is thread-safe so can be used in a parallel sort. 49 * 50 * @see <a href="http://www.DaveKoelle.com">Alphanum Algorithm</a> 51 */ 52 class AlphaNumericComparator implements Comparator<CharSequence>, Serializable { 53 /** 54 * An instance. 55 */ 56 public static final AlphaNumericComparator INSTANCE = new AlphaNumericComparator(); 57 58 /** 59 * The serial version ID. 60 * Note: Comparators are recommended to be Serializable to allow serialization of 61 * collections constructed with a Comparator. 62 */ 63 private static final long serialVersionUID = 1L; 64 65 @Override 66 public int compare(CharSequence seq1, CharSequence seq2) { 67 // Null is less 68 if (seq1 == null) { 69 return -1; 70 } 71 if (seq2 == null) { 72 return 1; 73 } 74 if (seq1.equals(seq2)) { 75 return 0; 76 } 77 78 int pos1 = 0; 79 int pos2 = 0; 80 final int length1 = seq1.length(); 81 final int length2 = seq2.length(); 82 83 while (pos1 < length1 && pos2 < length2) { 84 final int end1 = nextSubSequenceEnd(seq1, pos1, length1); 85 final int end2 = nextSubSequenceEnd(seq2, pos2, length2); 86 87 // If both sub-sequences contain numeric characters, sort them numerically 88 int result = 0; 89 if (isDigit(seq1.charAt(pos1)) && isDigit(seq2.charAt(pos2))) { 90 result = compareNumerically(seq1, pos1, end1, seq2, pos2, end2); 91 } else { 92 result = compareLexicographically(seq1, pos1, end1, seq2, pos2, end2); 93 } 94 95 if (result != 0) { 96 return result; 97 } 98 99 pos1 = end1; 100 pos2 = end2; 101 } 102 103 return length1 - length2; 104 } 105 106 /** 107 * Get the end position of the next sub-sequence of either digits or non-digit 108 * characters starting from the start position. 109 * 110 * <p>The end position is exclusive such that the sub-sequence is the interval 111 * {@code [start, end)}. 112 * 113 * @param seq the character sequence 114 * @param start the start position 115 * @param length the sequence length 116 * @return the sub-sequence end position (exclusive) 117 */ 118 private static int nextSubSequenceEnd(CharSequence seq, int start, int length) { 119 int pos = start; 120 // Set the sub-sequence type (digits or non-digits) 121 final boolean seqType = isDigit(seq.charAt(pos++)); 122 while (pos < length && seqType == isDigit(seq.charAt(pos))) { 123 // Extend the sub-sequence 124 pos++; 125 } 126 return pos; 127 } 128 129 /** 130 * Checks if the character is a digit. 131 * 132 * @param ch the character 133 * @return true if a digit 134 */ 135 private static boolean isDigit(char ch) { 136 return ch >= 48 && ch <= 57; 137 } 138 139 /** 140 * Compares two sub-sequences numerically. Ignores leading zeros. Assumes all 141 * characters are digits. 142 * 143 * @param seq1 the first sequence 144 * @param start1 the start of the first sub-sequence 145 * @param end1 the end of the first sub-sequence 146 * @param seq2 the second sequence 147 * @param start2 the start of the second sub-sequence 148 * @param end2 the end of the second sub-sequence sequence 149 * @return the value {@code 0} if the sub-sequences are equal; a value less than 150 * {@code 0} if sub-sequence 1 is numerically less than sub-sequence 2; and a value 151 * greater than {@code 0} if sub-sequence 1 is numerically greater than sub-sequence 152 * 2. 153 */ 154 private static int compareNumerically(CharSequence seq1, int start1, int end1, 155 CharSequence seq2, int start2, int end2) { 156 // Ignore leading zeros in numbers 157 int pos1 = advancePastLeadingZeros(seq1, start1, end1); 158 int pos2 = advancePastLeadingZeros(seq2, start2, end2); 159 160 // Simple comparison by length 161 final int result = (end1 - pos1) - (end2 - pos2); 162 // If equal, the first different number counts. 163 if (result == 0) { 164 while (pos1 < end1) { 165 final char c1 = seq1.charAt(pos1++); 166 final char c2 = seq2.charAt(pos2++); 167 if (c1 != c2) { 168 return c1 - c2; 169 } 170 } 171 } 172 return result; 173 } 174 175 /** 176 * Advances past leading zeros in the sub-sequence. Returns the index of the start 177 * character of the number. 178 * 179 * @param seq the sequence 180 * @param start the start of the sub-sequence 181 * @param end the end of the sub-sequence 182 * @return the start index of the number 183 */ 184 private static int advancePastLeadingZeros(CharSequence seq, int start, int end) { 185 int pos = start; 186 // Ignore zeros only when there are further characters 187 while (pos < end - 1 && seq.charAt(pos) == '0') { 188 pos++; 189 } 190 return pos; 191 } 192 193 /** 194 * Compares two sub-sequences lexicographically. This matches the compare function in 195 * {@link String} using extracted sub-sequences. 196 * 197 * @param seq1 the first sequence 198 * @param start1 the start of the first sub-sequence 199 * @param end1 the end of the first sub-sequence 200 * @param seq2 the second sequence 201 * @param start2 the start of the second sub-sequence 202 * @param end2 the end of the second sub-sequence sequence 203 * @return the value {@code 0} if the sub-sequences are equal; a value less than 204 * {@code 0} if sub-sequence 1 is lexicographically less than sub-sequence 2; and a 205 * value greater than {@code 0} if sub-sequence 1 is lexicographically greater than 206 * sub-sequence 2. 207 * @see String#compareTo(String) 208 */ 209 private static int compareLexicographically(CharSequence seq1, int start1, int end1, 210 CharSequence seq2, int start2, int end2) { 211 final int len1 = end1 - start1; 212 final int len2 = end2 - start2; 213 final int limit = Math.min(len1, len2); 214 215 for (int i = 0; i < limit; i++) { 216 final char c1 = seq1.charAt(i + start1); 217 final char c2 = seq2.charAt(i + start2); 218 if (c1 != c2) { 219 return c1 - c2; 220 } 221 } 222 return len1 - len2; 223 } 224 }