001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.compress.harmony.pack200; 018 019import java.io.EOFException; 020import java.io.IOException; 021import java.io.InputStream; 022import java.util.ArrayList; 023import java.util.Arrays; 024import java.util.List; 025 026import org.apache.commons.compress.utils.ExactMath; 027 028/** 029 * A BHSD codec is a means of encoding integer values as a sequence of bytes or vice versa using a specified "BHSD" encoding mechanism. It uses a 030 * variable-length encoding and a modified sign representation such that small numbers are represented as a single byte, whilst larger numbers take more bytes 031 * to encode. The number may be signed or unsigned; if it is unsigned, it can be weighted towards positive numbers or equally distributed using a one's 032 * complement. The Codec also supports delta coding, where a sequence of numbers is represented as a series of first-order differences. So a delta encoding of 033 * the integers [1..10] would be represented as a sequence of 10x1s. This allows the absolute value of a coded integer to fall outside of the 'small number' 034 * range, whilst still being encoded as a single byte. 035 * <p> 036 * A BHSD codec is configured with four parameters: 037 * </p> 038 * <dl> 039 * <dt>B</dt> 040 * <dd>The maximum number of bytes that each value is encoded as. B must be a value between [1..5]. For a pass-through coding (where each byte is encoded as 041 * itself, aka {@link #BYTE1}, B is 1 (each byte takes a maximum of 1 byte).</dd> 042 * <dt>H</dt> 043 * <dd>The radix of the integer. Values are defined as a sequence of values, where value {@code n} is multiplied by {@code H^<sup>n</sup>}. So the number 1234 044 * may be represented as the sequence 4 3 2 1 with a radix (H) of 10. Note that other permutations are also possible; 43 2 1 will also encode 1234. The 045 * co-parameter L is defined as 256-H. This is important because only the last value in a sequence may be < L; all prior values must be > L.</dd> 046 * <dt>S</dt> 047 * <dd>Whether the codec represents signed values (or not). This may have 3 values; 0 (unsigned), 1 (signed, one's complement) or 2 (signed, two's 048 * complement)</dd> 049 * <dt>D</dt> 050 * <dd>Whether the codec represents a delta encoding. This may be 0 (no delta) or 1 (delta encoding). A delta encoding of 1 indicates that values are 051 * cumulative; a sequence of {@code 1 1 1 1 1} will represent the sequence {@code 1 2 3 4 5}. For this reason, the codec supports two variants of decode; one 052 * {@link #decode(InputStream, long) with} and one {@link #decode(InputStream) without} a {@code last} parameter. If the codec is a non-delta encoding, then the 053 * value is ignored if passed. If the codec is a delta encoding, it is a run-time error to call the value without the extra parameter, and the previous value 054 * should be returned. (It was designed this way to support multi-threaded access without requiring a new instance of the Codec to be cloned for each use.)</dd> 055 * </dl> 056 * <p> 057 * Codecs are notated as (B,H,S,D) and either D or S,D may be omitted if zero. Thus {@link #BYTE1} is denoted (1,256,0,0) or (1,256). The {@link #toString()} 058 * method prints out the condensed form of the encoding. Often, the last character in the name ({@link #BYTE1}, {@link #UNSIGNED5}) gives a clue as to the B 059 * value. Those that start with U ({@link #UDELTA5}, {@link #UNSIGNED5}) are unsigned; otherwise, in most cases, they are signed. The presence of the word Delta 060 * ({@link #DELTA5}, {@link #UDELTA5}) indicates a delta encoding is used. 061 * </p> 062 */ 063public final class BHSDCodec extends Codec { 064 065 /** 066 * The maximum number of bytes in each coding word. B must be a value between [1..5]. For a pass-through coding (where each byte is encoded as itself, aka 067 * {@link #BYTE1}, B is 1 (each byte takes a maximum of 1 byte). 068 */ 069 private final int b; 070 071 /** 072 * Whether delta encoding is used (0=false,1=true). 073 */ 074 private final int d; 075 076 /** 077 * The radix of the encoding. 078 */ 079 private final int h; 080 081 /** 082 * The co-parameter of h; 256-h. 083 */ 084 private final int l; 085 086 /** 087 * Represents signed numbers or not, 0 (unsigned), 1 (signed, one's complement) or 2 (signed, two's complement). 088 */ 089 private final int s; 090 091 private long cardinality; 092 093 private final long smallest; 094 095 private final long largest; 096 097 /** 098 * radix^i powers 099 */ 100 private final long[] powers; 101 102 /** 103 * Constructs an unsigned, non-delta Codec with the given B and H values. 104 * 105 * @param b the maximum number of bytes that a value can be encoded as [1..5] 106 * @param h the radix of the encoding [1..256] 107 */ 108 public BHSDCodec(final int b, final int h) { 109 this(b, h, 0, 0); 110 } 111 112 /** 113 * Constructs a non-delta Codec with the given B, H and S values. 114 * 115 * @param b the maximum number of bytes that a value can be encoded as [1..5] 116 * @param h the radix of the encoding [1..256] 117 * @param s whether the encoding represents signed numbers (s=0 is unsigned; s=1 is signed with 1s complement; s=2 is signed with ?) 118 */ 119 public BHSDCodec(final int b, final int h, final int s) { 120 this(b, h, s, 0); 121 } 122 123 /** 124 * Constructs a Codec with the given B, H, S and D values. 125 * 126 * @param b the maximum number of bytes that a value can be encoded as [1..5] 127 * @param h the radix of the encoding [1..256] 128 * @param s whether the encoding represents signed numbers (s=0 is unsigned; s=1 is signed with 1s complement; s=2 is signed with ?) 129 * @param d whether this is a delta encoding (d=0 is non-delta; d=1 is delta) 130 */ 131 public BHSDCodec(final int b, final int h, final int s, final int d) { 132 if (b < 1 || b > 5) { 133 throw new IllegalArgumentException("1<=b<=5"); 134 } 135 if (h < 1 || h > 256) { 136 throw new IllegalArgumentException("1<=h<=256"); 137 } 138 if (s < 0 || s > 2) { 139 throw new IllegalArgumentException("0<=s<=2"); 140 } 141 if (d < 0 || d > 1) { 142 throw new IllegalArgumentException("0<=d<=1"); 143 } 144 if (b == 1 && h != 256) { 145 throw new IllegalArgumentException("b=1 -> h=256"); 146 } 147 if (h == 256 && b == 5) { 148 throw new IllegalArgumentException("h=256 -> b!=5"); 149 } 150 this.b = b; 151 this.h = h; 152 this.s = s; 153 this.d = d; 154 this.l = 256 - h; 155 if (h == 1) { 156 cardinality = b * 255 + 1; 157 } else { 158 cardinality = (long) ((long) (l * (1 - Math.pow(h, b)) / (1 - h)) + Math.pow(h, b)); 159 } 160 smallest = calculateSmallest(); 161 largest = calculateLargest(); 162 163 powers = new long[b]; 164 Arrays.setAll(powers, c -> (long) Math.pow(h, c)); 165 } 166 167 private long calculateLargest() { 168 long result; 169 // TODO This can probably be optimized into a better mathematical statement 170 if (d == 1) { 171 return new BHSDCodec(b, h).largest(); 172 } 173 switch (s) { 174 case 0: 175 result = cardinality() - 1; 176 break; 177 case 1: 178 result = cardinality() / 2 - 1; 179 break; 180 case 2: 181 result = 3L * cardinality() / 4 - 1; 182 break; 183 default: 184 throw new Error("Unknown s value"); 185 } 186 return Math.min((s == 0 ? (long) Integer.MAX_VALUE << 1 : Integer.MAX_VALUE) - 1, result); 187 } 188 189 private long calculateSmallest() { 190 long result; 191 if (d == 1 || !isSigned()) { 192 if (cardinality >= 4294967296L) { // 2^32 193 result = Integer.MIN_VALUE; 194 } else { 195 result = 0; 196 } 197 } else { 198 result = Math.max(Integer.MIN_VALUE, -cardinality() / (1 << s)); 199 } 200 return result; 201 } 202 203 /** 204 * Returns the cardinality of this codec; that is, the number of distinct values that it can contain. 205 * 206 * @return the cardinality of this codec 207 */ 208 public long cardinality() { 209 return cardinality; 210 } 211 212 @Override 213 public int decode(final InputStream in) throws IOException, Pack200Exception { 214 if (d != 0) { 215 throw new Pack200Exception("Delta encoding used without passing in last value; this is a coding error"); 216 } 217 return decode(in, 0); 218 } 219 220 @Override 221 public int decode(final InputStream in, final long last) throws IOException, Pack200Exception { 222 int n = 0; 223 long z = 0; 224 long x = 0; 225 226 do { 227 x = in.read(); 228 lastBandLength++; 229 z += x * powers[n]; 230 n++; 231 } while (x >= l && n < b); 232 233 if (x == -1) { 234 throw new EOFException("End of stream reached whilst decoding"); 235 } 236 237 if (isSigned()) { 238 final int u = (1 << s) - 1; 239 if ((z & u) == u) { 240 z = z >>> s ^ -1L; 241 } else { 242 z -= z >>> s; 243 } 244 } 245 // This algorithm does the same thing, but is probably slower. Leaving 246 // in for now for readability 247 // if (isSigned()) { 248 // long u = z; 249 // long twoPowS = (long) Math.pow(2, s); 250 // double twoPowSMinusOne = twoPowS-1; 251 // if (u % twoPowS < twoPowSMinusOne) { 252 // if (cardinality < Math.pow(2, 32)) { 253 // z = (long) (u - (Math.floor(u/ twoPowS))); 254 // } else { 255 // z = cast32((long) (u - (Math.floor(u/ twoPowS)))); 256 // } 257 // } else { 258 // z = (long) (-Math.floor(u/ twoPowS) - 1); 259 // } 260 // } 261 if (isDelta()) { 262 z += last; 263 } 264 return (int) z; 265 } 266 267 // private long cast32(long u) { 268 // u = (long) ((long) ((u + Math.pow(2, 31)) % Math.pow(2, 32)) - 269 // Math.pow(2, 31)); 270 // return u; 271 // } 272 273 @Override 274 public int[] decodeInts(final int n, final InputStream in) throws IOException, Pack200Exception { 275 final int[] band = super.decodeInts(n, in); 276 if (isDelta()) { 277 for (int i = 0; i < band.length; i++) { 278 while (band[i] > largest) { 279 band[i] -= cardinality; 280 } 281 while (band[i] < smallest) { 282 band[i] = ExactMath.add(band[i], cardinality); 283 } 284 } 285 } 286 return band; 287 } 288 289 @Override 290 public int[] decodeInts(final int n, final InputStream in, final int firstValue) throws IOException, Pack200Exception { 291 final int[] band = super.decodeInts(n, in, firstValue); 292 if (isDelta()) { 293 for (int i = 0; i < band.length; i++) { 294 while (band[i] > largest) { 295 band[i] -= cardinality; 296 } 297 while (band[i] < smallest) { 298 band[i] = ExactMath.add(band[i], cardinality); 299 } 300 } 301 } 302 return band; 303 } 304 305 @Override 306 public byte[] encode(final int value) throws Pack200Exception { 307 return encode(value, 0); 308 } 309 310 @Override 311 public byte[] encode(final int value, final int last) throws Pack200Exception { 312 if (!encodes(value)) { 313 throw new Pack200Exception("The codec " + this + " does not encode the value " + value); 314 } 315 316 long z = value; 317 if (isDelta()) { 318 z -= last; 319 } 320 if (isSigned()) { 321 if (z < Integer.MIN_VALUE) { 322 z += 4294967296L; 323 } else if (z > Integer.MAX_VALUE) { 324 z -= 4294967296L; 325 } 326 if (z < 0) { 327 z = (-z << s) - 1; 328 } else if (s == 1) { 329 z = z << s; 330 } else { 331 z += (z - z % 3) / 3; 332 } 333 } else if (z < 0) { 334 // Need to use integer overflow here to represent negatives. 335 // 4294967296L is the 1 << 32. 336 z += Math.min(cardinality, 4294967296L); 337 } 338 if (z < 0) { 339 throw new Pack200Exception("unable to encode"); 340 } 341 342 final List<Byte> byteList = new ArrayList<>(); 343 for (int n = 0; n < b; n++) { 344 long byteN; 345 if (z < l) { 346 byteN = z; 347 } else { 348 byteN = z % h; 349 while (byteN < l) { 350 byteN += h; 351 } 352 } 353 byteList.add(Byte.valueOf((byte) byteN)); 354 if (byteN < l) { 355 break; 356 } 357 z -= byteN; 358 z /= h; 359 } 360 final byte[] bytes = new byte[byteList.size()]; 361 for (int i = 0; i < bytes.length; i++) { 362 bytes[i] = byteList.get(i).byteValue(); 363 } 364 return bytes; 365 } 366 367 /** 368 * True if this encoding can code the given value 369 * 370 * @param value the value to check 371 * @return {@code true} if the encoding can encode this value 372 */ 373 public boolean encodes(final long value) { 374 return value >= smallest && value <= largest; 375 } 376 377 @Override 378 public boolean equals(final Object o) { 379 if (o instanceof BHSDCodec) { 380 final BHSDCodec codec = (BHSDCodec) o; 381 return codec.b == b && codec.h == h && codec.s == s && codec.d == d; 382 } 383 return false; 384 } 385 386 /** 387 * Gets the B. 388 * 389 * @return the b 390 */ 391 public int getB() { 392 return b; 393 } 394 395 /** 396 * Gets the H. 397 * 398 * @return the h 399 */ 400 public int getH() { 401 return h; 402 } 403 404 /** 405 * Gets the L. 406 * 407 * @return the l 408 */ 409 public int getL() { 410 return l; 411 } 412 413 /** 414 * Gets the S. 415 * 416 * @return the s 417 */ 418 public int getS() { 419 return s; 420 } 421 422 @Override 423 public int hashCode() { 424 return ((b * 37 + h) * 37 + s) * 37 + d; 425 } 426 427 /** 428 * Returns true if this codec is a delta codec 429 * 430 * @return true if this codec is a delta codec 431 */ 432 public boolean isDelta() { 433 return d != 0; 434 } 435 436 /** 437 * Returns true if this codec is a signed codec 438 * 439 * @return true if this codec is a signed codec 440 */ 441 public boolean isSigned() { 442 return s != 0; 443 } 444 445 /** 446 * Returns the largest value that this codec can represent. 447 * 448 * @return the largest value that this codec can represent. 449 */ 450 public long largest() { 451 return largest; 452 } 453 454 /** 455 * Returns the smallest value that this codec can represent. 456 * 457 * @return the smallest value that this codec can represent. 458 */ 459 public long smallest() { 460 return smallest; 461 } 462 463 /** 464 * Returns the codec in the form (1,256) or (1,64,1,1). Note that trailing zero fields are not shown. 465 */ 466 @Override 467 public String toString() { 468 final StringBuilder buffer = new StringBuilder(11); 469 buffer.append('('); 470 buffer.append(b); 471 buffer.append(','); 472 buffer.append(h); 473 if (s != 0 || d != 0) { 474 buffer.append(','); 475 buffer.append(s); 476 } 477 if (d != 0) { 478 buffer.append(','); 479 buffer.append(d); 480 } 481 buffer.append(')'); 482 return buffer.toString(); 483 } 484}