View Javadoc
1   /*
2    *  Licensed to the Apache Software Foundation (ASF) under one or more
3    *  contributor license agreements.  See the NOTICE file distributed with
4    *  this work for additional information regarding copyright ownership.
5    *  The ASF licenses this file to You under the Apache License, Version 2.0
6    *  (the "License"); you may not use this file except in compliance with
7    *  the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   *  Unless required by applicable law or agreed to in writing, software
12   *  distributed under the License is distributed on an "AS IS" BASIS,
13   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   *  See the License for the specific language governing permissions and
15   *  limitations under the License.
16   */
17  package org.apache.commons.compress.harmony.pack200;
18  
19  import java.io.EOFException;
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.util.ArrayList;
23  import java.util.Arrays;
24  import java.util.List;
25  
26  import org.apache.commons.compress.utils.ExactMath;
27  
28  /**
29   * A BHSD codec is a means of encoding integer values as a sequence of bytes or vice versa using a specified "BHSD" encoding mechanism. It uses a
30   * variable-length encoding and a modified sign representation such that small numbers are represented as a single byte, whilst larger numbers take more bytes
31   * to encode. The number may be signed or unsigned; if it is unsigned, it can be weighted towards positive numbers or equally distributed using a one's
32   * complement. The Codec also supports delta coding, where a sequence of numbers is represented as a series of first-order differences. So a delta encoding of
33   * the integers [1..10] would be represented as a sequence of 10x1s. This allows the absolute value of a coded integer to fall outside of the 'small number'
34   * range, whilst still being encoded as a single byte.
35   * <p>
36   * A BHSD codec is configured with four parameters:
37   * </p>
38   * <dl>
39   * <dt>B</dt>
40   * <dd>The maximum number of bytes that each value is encoded as. B must be a value between [1..5]. For a pass-through coding (where each byte is encoded as
41   * itself, aka {@link #BYTE1}, B is 1 (each byte takes a maximum of 1 byte).</dd>
42   * <dt>H</dt>
43   * <dd>The radix of the integer. Values are defined as a sequence of values, where value {@code n} is multiplied by {@code H^<sup>n</sup>}. So the number 1234
44   * may be represented as the sequence 4 3 2 1 with a radix (H) of 10. Note that other permutations are also possible; 43 2 1 will also encode 1234. The
45   * co-parameter L is defined as 256-H. This is important because only the last value in a sequence may be &lt; L; all prior values must be &gt; L.</dd>
46   * <dt>S</dt>
47   * <dd>Whether the codec represents signed values (or not). This may have 3 values; 0 (unsigned), 1 (signed, one's complement) or 2 (signed, two's
48   * complement)</dd>
49   * <dt>D</dt>
50   * <dd>Whether the codec represents a delta encoding. This may be 0 (no delta) or 1 (delta encoding). A delta encoding of 1 indicates that values are
51   * cumulative; a sequence of {@code 1 1 1 1 1} will represent the sequence {@code 1 2 3 4 5}. For this reason, the codec supports two variants of decode; one
52   * {@link #decode(InputStream, long) with} and one {@link #decode(InputStream) without} a {@code last} parameter. If the codec is a non-delta encoding, then the
53   * value is ignored if passed. If the codec is a delta encoding, it is a run-time error to call the value without the extra parameter, and the previous value
54   * should be returned. (It was designed this way to support multi-threaded access without requiring a new instance of the Codec to be cloned for each use.)</dd>
55   * </dl>
56   * <p>
57   * Codecs are notated as (B,H,S,D) and either D or S,D may be omitted if zero. Thus {@link #BYTE1} is denoted (1,256,0,0) or (1,256). The {@link #toString()}
58   * method prints out the condensed form of the encoding. Often, the last character in the name ({@link #BYTE1}, {@link #UNSIGNED5}) gives a clue as to the B
59   * value. Those that start with U ({@link #UDELTA5}, {@link #UNSIGNED5}) are unsigned; otherwise, in most cases, they are signed. The presence of the word Delta
60   * ({@link #DELTA5}, {@link #UDELTA5}) indicates a delta encoding is used.
61   * </p>
62   */
63  public final class BHSDCodec extends Codec {
64  
65      /**
66       * The maximum number of bytes in each coding word. B must be a value between [1..5]. For a pass-through coding (where each byte is encoded as itself, aka
67       * {@link #BYTE1}, B is 1 (each byte takes a maximum of 1 byte).
68       */
69      private final int b;
70  
71      /**
72       * Whether delta encoding is used (0=false,1=true).
73       */
74      private final int d;
75  
76      /**
77       * The radix of the encoding.
78       */
79      private final int h;
80  
81      /**
82       * The co-parameter of h; 256-h.
83       */
84      private final int l;
85  
86      /**
87       * Represents signed numbers or not, 0 (unsigned), 1 (signed, one's complement) or 2 (signed, two's complement).
88       */
89      private final int s;
90  
91      private long cardinality;
92  
93      private final long smallest;
94  
95      private final long largest;
96  
97      /**
98       * radix^i powers
99       */
100     private final long[] powers;
101 
102     /**
103      * Constructs an unsigned, non-delta Codec with the given B and H values.
104      *
105      * @param b the maximum number of bytes that a value can be encoded as [1..5]
106      * @param h the radix of the encoding [1..256]
107      */
108     public BHSDCodec(final int b, final int h) {
109         this(b, h, 0, 0);
110     }
111 
112     /**
113      * Constructs a non-delta Codec with the given B, H and S values.
114      *
115      * @param b the maximum number of bytes that a value can be encoded as [1..5]
116      * @param h the radix of the encoding [1..256]
117      * @param s whether the encoding represents signed numbers (s=0 is unsigned; s=1 is signed with 1s complement; s=2 is signed with ?)
118      */
119     public BHSDCodec(final int b, final int h, final int s) {
120         this(b, h, s, 0);
121     }
122 
123     /**
124      * Constructs a Codec with the given B, H, S and D values.
125      *
126      * @param b the maximum number of bytes that a value can be encoded as [1..5]
127      * @param h the radix of the encoding [1..256]
128      * @param s whether the encoding represents signed numbers (s=0 is unsigned; s=1 is signed with 1s complement; s=2 is signed with ?)
129      * @param d whether this is a delta encoding (d=0 is non-delta; d=1 is delta)
130      */
131     public BHSDCodec(final int b, final int h, final int s, final int d) {
132         if (b < 1 || b > 5) {
133             throw new IllegalArgumentException("1<=b<=5");
134         }
135         if (h < 1 || h > 256) {
136             throw new IllegalArgumentException("1<=h<=256");
137         }
138         if (s < 0 || s > 2) {
139             throw new IllegalArgumentException("0<=s<=2");
140         }
141         if (d < 0 || d > 1) {
142             throw new IllegalArgumentException("0<=d<=1");
143         }
144         if (b == 1 && h != 256) {
145             throw new IllegalArgumentException("b=1 -> h=256");
146         }
147         if (h == 256 && b == 5) {
148             throw new IllegalArgumentException("h=256 -> b!=5");
149         }
150         this.b = b;
151         this.h = h;
152         this.s = s;
153         this.d = d;
154         this.l = 256 - h;
155         if (h == 1) {
156             cardinality = b * 255 + 1;
157         } else {
158             cardinality = (long) ((long) (l * (1 - Math.pow(h, b)) / (1 - h)) + Math.pow(h, b));
159         }
160         smallest = calculateSmallest();
161         largest = calculateLargest();
162 
163         powers = new long[b];
164         Arrays.setAll(powers, c -> (long) Math.pow(h, c));
165     }
166 
167     private long calculateLargest() {
168         long result;
169         // TODO This can probably be optimized into a better mathematical statement
170         if (d == 1) {
171             return new BHSDCodec(b, h).largest();
172         }
173         switch (s) {
174         case 0:
175             result = cardinality() - 1;
176             break;
177         case 1:
178             result = cardinality() / 2 - 1;
179             break;
180         case 2:
181             result = 3L * cardinality() / 4 - 1;
182             break;
183         default:
184             throw new Error("Unknown s value");
185         }
186         return Math.min((s == 0 ? (long) Integer.MAX_VALUE << 1 : Integer.MAX_VALUE) - 1, result);
187     }
188 
189     private long calculateSmallest() {
190         long result;
191         if (d == 1 || !isSigned()) {
192             if (cardinality >= 4294967296L) { // 2^32
193                 result = Integer.MIN_VALUE;
194             } else {
195                 result = 0;
196             }
197         } else {
198             result = Math.max(Integer.MIN_VALUE, -cardinality() / (1 << s));
199         }
200         return result;
201     }
202 
203     /**
204      * Returns the cardinality of this codec; that is, the number of distinct values that it can contain.
205      *
206      * @return the cardinality of this codec
207      */
208     public long cardinality() {
209         return cardinality;
210     }
211 
212     @Override
213     public int decode(final InputStream in) throws IOException, Pack200Exception {
214         if (d != 0) {
215             throw new Pack200Exception("Delta encoding used without passing in last value; this is a coding error");
216         }
217         return decode(in, 0);
218     }
219 
220     @Override
221     public int decode(final InputStream in, final long last) throws IOException, Pack200Exception {
222         int n = 0;
223         long z = 0;
224         long x = 0;
225 
226         do {
227             x = in.read();
228             lastBandLength++;
229             z += x * powers[n];
230             n++;
231         } while (x >= l && n < b);
232 
233         if (x == -1) {
234             throw new EOFException("End of stream reached whilst decoding");
235         }
236 
237         if (isSigned()) {
238             final int u = (1 << s) - 1;
239             if ((z & u) == u) {
240                 z = z >>> s ^ -1L;
241             } else {
242                 z -= z >>> s;
243             }
244         }
245         // This algorithm does the same thing, but is probably slower. Leaving
246         // in for now for readability
247         // if (isSigned()) {
248         // long u = z;
249         // long twoPowS = (long) Math.pow(2, s);
250         // double twoPowSMinusOne = twoPowS-1;
251         // if (u % twoPowS < twoPowSMinusOne) {
252         // if (cardinality < Math.pow(2, 32)) {
253         // z = (long) (u - (Math.floor(u/ twoPowS)));
254         // } else {
255         // z = cast32((long) (u - (Math.floor(u/ twoPowS))));
256         // }
257         // } else {
258         // z = (long) (-Math.floor(u/ twoPowS) - 1);
259         // }
260         // }
261         if (isDelta()) {
262             z += last;
263         }
264         return (int) z;
265     }
266 
267     // private long cast32(long u) {
268     // u = (long) ((long) ((u + Math.pow(2, 31)) % Math.pow(2, 32)) -
269     // Math.pow(2, 31));
270     // return u;
271     // }
272 
273     @Override
274     public int[] decodeInts(final int n, final InputStream in) throws IOException, Pack200Exception {
275         final int[] band = super.decodeInts(n, in);
276         if (isDelta()) {
277             for (int i = 0; i < band.length; i++) {
278                 while (band[i] > largest) {
279                     band[i] -= cardinality;
280                 }
281                 while (band[i] < smallest) {
282                     band[i] = ExactMath.add(band[i], cardinality);
283                 }
284             }
285         }
286         return band;
287     }
288 
289     @Override
290     public int[] decodeInts(final int n, final InputStream in, final int firstValue) throws IOException, Pack200Exception {
291         final int[] band = super.decodeInts(n, in, firstValue);
292         if (isDelta()) {
293             for (int i = 0; i < band.length; i++) {
294                 while (band[i] > largest) {
295                     band[i] -= cardinality;
296                 }
297                 while (band[i] < smallest) {
298                     band[i] = ExactMath.add(band[i], cardinality);
299                 }
300             }
301         }
302         return band;
303     }
304 
305     @Override
306     public byte[] encode(final int value) throws Pack200Exception {
307         return encode(value, 0);
308     }
309 
310     @Override
311     public byte[] encode(final int value, final int last) throws Pack200Exception {
312         if (!encodes(value)) {
313             throw new Pack200Exception("The codec " + this + " does not encode the value " + value);
314         }
315 
316         long z = value;
317         if (isDelta()) {
318             z -= last;
319         }
320         if (isSigned()) {
321             if (z < Integer.MIN_VALUE) {
322                 z += 4294967296L;
323             } else if (z > Integer.MAX_VALUE) {
324                 z -= 4294967296L;
325             }
326             if (z < 0) {
327                 z = (-z << s) - 1;
328             } else if (s == 1) {
329                 z = z << s;
330             } else {
331                 z += (z - z % 3) / 3;
332             }
333         } else if (z < 0) {
334             // Need to use integer overflow here to represent negatives.
335             // 4294967296L is the 1 << 32.
336             z += Math.min(cardinality, 4294967296L);
337         }
338         if (z < 0) {
339             throw new Pack200Exception("unable to encode");
340         }
341 
342         final List<Byte> byteList = new ArrayList<>();
343         for (int n = 0; n < b; n++) {
344             long byteN;
345             if (z < l) {
346                 byteN = z;
347             } else {
348                 byteN = z % h;
349                 while (byteN < l) {
350                     byteN += h;
351                 }
352             }
353             byteList.add(Byte.valueOf((byte) byteN));
354             if (byteN < l) {
355                 break;
356             }
357             z -= byteN;
358             z /= h;
359         }
360         final byte[] bytes = new byte[byteList.size()];
361         for (int i = 0; i < bytes.length; i++) {
362             bytes[i] = byteList.get(i).byteValue();
363         }
364         return bytes;
365     }
366 
367     /**
368      * True if this encoding can code the given value
369      *
370      * @param value the value to check
371      * @return {@code true} if the encoding can encode this value
372      */
373     public boolean encodes(final long value) {
374         return value >= smallest && value <= largest;
375     }
376 
377     @Override
378     public boolean equals(final Object o) {
379         if (o instanceof BHSDCodec) {
380             final BHSDCodec codec = (BHSDCodec) o;
381             return codec.b == b && codec.h == h && codec.s == s && codec.d == d;
382         }
383         return false;
384     }
385 
386     /**
387      * Gets the B.
388      *
389      * @return the b
390      */
391     public int getB() {
392         return b;
393     }
394 
395     /**
396      * Gets the H.
397      *
398      * @return the h
399      */
400     public int getH() {
401         return h;
402     }
403 
404     /**
405      * Gets the L.
406      *
407      * @return the l
408      */
409     public int getL() {
410         return l;
411     }
412 
413     /**
414      * Gets the S.
415      *
416      * @return the s
417      */
418     public int getS() {
419         return s;
420     }
421 
422     @Override
423     public int hashCode() {
424         return ((b * 37 + h) * 37 + s) * 37 + d;
425     }
426 
427     /**
428      * Returns true if this codec is a delta codec
429      *
430      * @return true if this codec is a delta codec
431      */
432     public boolean isDelta() {
433         return d != 0;
434     }
435 
436     /**
437      * Returns true if this codec is a signed codec
438      *
439      * @return true if this codec is a signed codec
440      */
441     public boolean isSigned() {
442         return s != 0;
443     }
444 
445     /**
446      * Returns the largest value that this codec can represent.
447      *
448      * @return the largest value that this codec can represent.
449      */
450     public long largest() {
451         return largest;
452     }
453 
454     /**
455      * Returns the smallest value that this codec can represent.
456      *
457      * @return the smallest value that this codec can represent.
458      */
459     public long smallest() {
460         return smallest;
461     }
462 
463     /**
464      * Returns the codec in the form (1,256) or (1,64,1,1). Note that trailing zero fields are not shown.
465      */
466     @Override
467     public String toString() {
468         final StringBuilder buffer = new StringBuilder(11);
469         buffer.append('(');
470         buffer.append(b);
471         buffer.append(',');
472         buffer.append(h);
473         if (s != 0 || d != 0) {
474             buffer.append(',');
475             buffer.append(s);
476         }
477         if (d != 0) {
478             buffer.append(',');
479             buffer.append(d);
480         }
481         buffer.append(')');
482         return buffer.toString();
483     }
484 }