View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.numbers.arrays;
18  
19  /**
20   * Support for creating {@link UpdatingInterval} implementations and validating indices.
21   *
22   * @since 1.2
23   */
24  final class IndexSupport {
25      /** The upper threshold to use a modified insertion sort to find unique indices. */
26      private static final int INSERTION_SORT_SIZE = 20;
27  
28      /** No instances. */
29      private IndexSupport() {}
30  
31      /**
32       * Returns an interval that covers the specified indices {@code k}.
33       *
34       * @param left Lower bound of data (inclusive).
35       * @param right Upper bound of data (inclusive).
36       * @param k Indices.
37       * @param n Count of indices (must be strictly positive).
38       * @throws IndexOutOfBoundsException if any index {@code k} is not within the
39       * sub-range {@code [left, right]}
40       * @return the interval
41       */
42      static UpdatingInterval createUpdatingInterval(int left, int right, int[] k, int n) {
43          // Note: A typical use case is to have a few indices. Thus the heuristics
44          // in this method should be very fast when n is small.
45          // We have a choice between a KeyUpdatingInterval which requires
46          // sorted keys or a BitIndexUpdatingInterval which handles keys in any order.
47          // The purpose of the heuristics is to avoid a very bad choice of data structure,
48          // rather than choosing the best data structure in all situations. As long as the
49          // choice is reasonable the speed will not impact a partition algorithm.
50  
51          // Simple cases
52          if (n == 2) {
53              if (k[0] == k[1]) {
54                  return newUpdatingInterval(k, 1);
55              }
56              if (k[1] < k[0]) {
57                  final int v = k[0];
58                  k[0] = k[1];
59                  k[1] = v;
60              }
61              return newUpdatingInterval(k, 2);
62          }
63  
64          // Strategy: Must be fast on already ascending data.
65          // Note: The recommended way to generate a lot of partition indices is to
66          // generate in sequence.
67  
68          // n <= small:
69          //   Modified insertion sort (naturally finds ascending data)
70          // n > small:
71          //   Look for ascending sequence and compact
72          // else:
73          //   Remove duplicates using an order(1) data structure and sort
74  
75          if (n <= INSERTION_SORT_SIZE) {
76              final int unique = Sorting.insertionSortIndices(k, n);
77              return newUpdatingInterval(k, unique);
78          }
79  
80          if (isAscending(k, n)) {
81              // For sorted keys the KeyUpdatingInterval is fast. It may be slower than the
82              // BitIndexUpdatingInterval depending on data length but not significantly
83              // slower and the difference is lost in the time taken for partitioning.
84              // So always use the keys.
85              final int unique = compressDuplicates(k, n);
86              return newUpdatingInterval(k, unique);
87          }
88  
89          // At least 20 indices that are partially unordered.
90  
91          // Find min/max to understand the range.
92          int min = k[n - 1];
93          int max = min;
94          for (int i = n - 1; --i >= 0;) {
95              min = Math.min(min, k[i]);
96              max = Math.max(max, k[i]);
97          }
98  
99          // Here we use a simple test based on the number of comparisons required
100         // to perform the expected next/previous look-ups after a split.
101         // It is expected that we can cut n keys a maximum of n-1 times.
102         // Each cut requires a scan next/previous to divide the interval into two intervals:
103         //
104         //            cut
105         //             |
106         //        k1--------k2---------k3---- ... ---------kn    initial interval
107         //         <--| find previous
108         //    find next |-->
109         //        k1        k2---------k3---- ... ---------kn    divided intervals
110         //
111         // An BitSet will scan from the cut location and find a match in time proportional to
112         // the index density. Average density is (size / n) and the scanning covers 64
113         // indices together: Order(2 * n * (size / n) / 64) = Order(size / 32)
114 
115         // Sorted keys: Sort time Order(n log(n)) : Splitting time Order(log(n)) (binary search approx)
116         // Bit keys   : Sort time Order(1)        : Splitting time Order(size / 32)
117 
118         // Transition when n * n ~ size / 32
119         // Benchmarking shows this is a reasonable approximation when size < 2^20.
120         // The speed of the bit keys is approximately independent of n and proportional to size.
121         // Large size observes degrading performance of the bit keys vs sorted keys.
122         // We introduce a penalty for each 4x increase over size = 2^20.
123         // n * n = size/32 * 2^log4(size / 2^20)
124         // The transition point still favours the bit keys when sorted keys would be faster.
125         // However the difference is held within 4x and the BitSet type structure is still fast
126         // enough to be negligible against the speed of partitioning.
127 
128         // Transition point: n = sqrt(size/32)
129         // size n
130         // 2^10 5.66
131         // 2^15 32.0
132         // 2^20 181.0
133 
134         // Transition point: n = sqrt(size/32 * 2^(log4(size/2^20))))
135         // size n
136         // 2^22 512.0
137         // 2^24 1448.2
138         // 2^28 11585
139         // 2^31 55108
140 
141         final int size = max - min + 1;
142 
143         // Divide by 32 is a shift of 5. This is reduced for each 4-fold size above 2^20.
144         // At 2^31 the shift reduces to 0.
145         int shift = 5;
146         if (size > (1 << 20)) {
147             // log4(size/2^20) == (log2(size) - 20) / 2
148             shift -= (ceilLog2(size) - 20) >>> 1;
149         }
150 
151         if ((long) n * n > (size >> shift)) {
152             final BitIndexUpdatingInterval interval = new BitIndexUpdatingInterval(min, max);
153             for (int i = n; --i >= 0;) {
154                 interval.set(k[i]);
155             }
156             return interval;
157         }
158 
159         // Sort with a hash set to filter indices
160         final int unique = Sorting.sortIndices(k, n);
161         return new KeyUpdatingInterval(k, unique);
162     }
163 
164     /**
165      * Test the data is in ascending order: {@code data[i] <= data[i+1]} for all {@code i}.
166      * Data is assumed to be at least length 1.
167      *
168      * @param data Data.
169      * @param n Length of data.
170      * @return true if ascending
171      */
172     private static boolean isAscending(int[] data, int n) {
173         for (int i = 0; ++i < n;) {
174             if (data[i] < data[i - 1]) {
175                 // descending
176                 return false;
177             }
178         }
179         return true;
180     }
181 
182     /**
183      * Compress duplicates in the ascending data.
184      *
185      * <p>Warning: Requires {@code n > 0}.
186      *
187      * @param data Indices.
188      * @param n Number of indices.
189      * @return the number of unique indices
190      */
191     private static int compressDuplicates(int[] data, int n) {
192         // Compress to remove duplicates
193         int last = 0;
194         int top = data[0];
195         for (int i = 0; ++i < n;) {
196             final int v = data[i];
197             if (v == top) {
198                 continue;
199             }
200             top = v;
201             data[++last] = v;
202         }
203         return last + 1;
204     }
205 
206     /**
207      * Compute {@code ceil(log2(x))}. This is valid for all strictly positive {@code x}.
208      *
209      * <p>Returns -1 for {@code x = 0} in place of -infinity.
210      *
211      * @param x Value.
212      * @return {@code ceil(log2(x))}
213      */
214     private static int ceilLog2(int x) {
215         return 32 - Integer.numberOfLeadingZeros(x - 1);
216     }
217 
218     /**
219      * Returns an interval that covers the specified indices {@code k}.
220      * The indices must be sorted.
221      *
222      * @param k Indices.
223      * @param n Count of indices (must be strictly positive).
224      * @throws IndexOutOfBoundsException if any index {@code k} is not within the
225      * sub-range {@code [left, right]}
226      * @return the interval
227      */
228     private static UpdatingInterval newUpdatingInterval(int[] k, int n) {
229         return new KeyUpdatingInterval(k, n);
230     }
231 
232     /**
233      * Count the number of indices. Returns a negative value if the indices are sorted.
234      *
235      * @param keys Keys.
236      * @param n Count of indices.
237      * @return the count of (sorted) indices
238      */
239     static int countIndices(UpdatingInterval keys, int n) {
240         if (keys instanceof KeyUpdatingInterval) {
241             return -((KeyUpdatingInterval) keys).size();
242         }
243         return n;
244     }
245 
246     /**
247      * Checks if the sub-range from fromIndex (inclusive) to toIndex (exclusive) is
248      * within the bounds of range from 0 (inclusive) to length (exclusive).
249      *
250      * <p>This function provides the functionality of
251      * {@code java.utils.Objects.checkFromToIndex} introduced in JDK 9. The <a
252      * href="https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/util/Objects.html#checkFromToIndex(int,int,int)">Objects</a>
253      * javadoc has been reproduced for reference. The return value has been changed
254      * to void.
255      *
256      * <p>The sub-range is defined to be out of bounds if any of the following
257      * inequalities is true:
258      * <ul>
259      * <li>{@code fromIndex < 0}
260      * <li>{@code fromIndex > toIndex}
261      * <li>{@code toIndex > length}
262      * <li>{@code length < 0}, which is implied from the former inequalities
263      * </ul>
264      *
265      * @param fromIndex Lower-bound (inclusive) of the sub-range.
266      * @param toIndex Upper-bound (exclusive) of the sub-range.
267      * @param length Upper-bound (exclusive) of the range.
268      * @throws IndexOutOfBoundsException if the sub-range is out of bounds
269      */
270     static void checkFromToIndex(int fromIndex, int toIndex, int length) {
271         // Checks as documented above
272         if (fromIndex < 0 || fromIndex > toIndex || toIndex > length) {
273             throw new IndexOutOfBoundsException(
274                 msgRangeOutOfBounds(fromIndex, toIndex, length));
275         }
276     }
277 
278     /**
279      * Checks if the {@code index} is within the half-open interval {@code [fromIndex, toIndex)}.
280      *
281      * @param fromIndex Lower-bound (inclusive) of the sub-range.
282      * @param toIndex Upper-bound (exclusive) of the sub-range.
283      * @param k Indices.
284      * @throws IndexOutOfBoundsException if any index is out of bounds
285      */
286     static void checkIndices(int fromIndex, int toIndex, int[] k) {
287         for (final int i : k) {
288             checkIndex(fromIndex, toIndex, i);
289         }
290     }
291 
292     /**
293      * Checks if the {@code index} is within the half-open interval {@code [fromIndex, toIndex)}.
294      *
295      * @param fromIndex Lower-bound (inclusive) of the sub-range.
296      * @param toIndex Upper-bound (exclusive) of the sub-range.
297      * @param index Index.
298      * @throws IndexOutOfBoundsException if the index is out of bounds
299      */
300     static void checkIndex(int fromIndex, int toIndex, int index) {
301         if (index < fromIndex || index >= toIndex) {
302             throw new IndexOutOfBoundsException(
303                 msgIndexOutOfBounds(fromIndex, toIndex, index));
304         }
305     }
306 
307     // Message formatting moved to separate methods to assist inlining of the validation methods.
308 
309     /**
310      * Format a message when range [from, to) is not entirely within the length.
311      *
312      * @param fromIndex Lower-bound (inclusive) of the sub-range.
313      * @param toIndex Upper-bound (exclusive) of the sub-range.
314      * @param length Upper-bound (exclusive) of the range.
315      * @return the message
316      */
317     private static String msgRangeOutOfBounds(int fromIndex, int toIndex, int length) {
318         return String.format("Range [%d, %d) out of bounds for length %d", fromIndex, toIndex, length);
319     }
320 
321     /**
322      * Format a message when index is not within range [from, to).
323      *
324      * @param fromIndex Lower-bound (inclusive) of the sub-range.
325      * @param toIndex Upper-bound (exclusive) of the sub-range.
326      * @param index Index.
327      * @return the message
328      */
329     private static String msgIndexOutOfBounds(int fromIndex, int toIndex, int index) {
330         return String.format("Index %d out of bounds for range [%d, %d)", index, fromIndex, toIndex);
331     }
332 }