1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one 3 * or more contributor license agreements. See the NOTICE file 4 * distributed with this work for additional information 5 * regarding copyright ownership. The ASF licenses this file 6 * to you under the Apache License, Version 2.0 (the 7 * "License"); you may not use this file except in compliance 8 * with the License. You may obtain a copy of the License at 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, 13 * software distributed under the License is distributed on an 14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 * KIND, either express or implied. See the License for the 16 * specific language governing permissions and limitations 17 * under the License. 18 */ 19 package org.apache.commons.compress.compressors.lz77support; 20 21 import java.io.IOException; 22 import java.io.InputStream; 23 import java.util.Arrays; 24 25 import org.apache.commons.compress.compressors.CompressorInputStream; 26 import org.apache.commons.compress.utils.ByteUtils; 27 import org.apache.commons.compress.utils.IOUtils; 28 import org.apache.commons.compress.utils.InputStreamStatistics; 29 import org.apache.commons.io.input.BoundedInputStream; 30 31 /** 32 * Encapsulates code common to LZ77 decompressors. 33 * 34 * <p> 35 * Assumes the stream consists of blocks of literal data and back-references (called copies) in any order. Of course the first block must be a literal block for 36 * the scheme to work - unless the {@link #prefill prefill} method has been used to provide initial data that is never returned by {@link #read read} but only 37 * used for back-references. 38 * </p> 39 * 40 * <p> 41 * Subclasses must override the three-arg {@link #read read} method as the no-arg version delegates to it and the default implementation delegates to the no-arg 42 * version, leading to infinite mutual recursion and a {@code StackOverflowError} otherwise. 43 * </p> 44 * 45 * <p> 46 * The contract for subclasses' {@code read} implementation is: 47 * </p> 48 * <ul> 49 * 50 * <li>keep track of the current state of the stream. Is it inside a literal block or a back-reference or in-between blocks?</li> 51 * 52 * <li>Use {@link #readOneByte} to access the underlying stream directly.</li> 53 * 54 * <li>If a new literal block starts, use {@link #startLiteral} to tell this class about it and read the literal data using {@link #readLiteral} until it 55 * returns {@code 0}. {@link #hasMoreDataInBlock} will return {@code false} before the next call to {@link #readLiteral} would return {@code 0}.</li> 56 * 57 * <li>If a new back-reference starts, use {@link #startBackReference} to tell this class about it and read the literal data using {@link #readBackReference} 58 * until it returns {@code 0}. {@link #hasMoreDataInBlock} will return {@code false} before the next call to {@link #readBackReference} would return 59 * {@code 0}.</li> 60 * 61 * <li>If the end of the stream has been reached, return {@code -1} as this class' methods will never do so themselves.</li> 62 * 63 * </ul> 64 * 65 * <p> 66 * {@link #readOneByte} and {@link #readLiteral} update the counter for bytes read. 67 * </p> 68 * 69 * @since 1.14 70 */ 71 public abstract class AbstractLZ77CompressorInputStream extends CompressorInputStream implements InputStreamStatistics { 72 73 /** Size of the window - must be bigger than the biggest offset expected. */ 74 private final int windowSize; 75 76 /** 77 * Buffer to write decompressed bytes to for back-references, will be three times windowSize big. 78 * 79 * <p> 80 * Three times so we can slide the whole buffer a windowSize to the left once we've read twice windowSize and still have enough data inside of it to satisfy 81 * back-references. 82 * </p> 83 */ 84 private final byte[] buf; 85 86 /** One behind the index of the last byte in the buffer that was written, i.e. the next position to write to */ 87 private int writeIndex; 88 89 /** Index of the next byte to be read. */ 90 private int readIndex; 91 92 /** The underlying stream to read compressed data from */ 93 private final BoundedInputStream in; 94 95 /** Number of bytes still to be read from the current literal or back-reference. */ 96 private long bytesRemaining; 97 98 /** Offset of the current back-reference. */ 99 private int backReferenceOffset; 100 101 /** Uncompressed size */ 102 private int size; 103 104 // used in no-arg read method 105 private final byte[] oneByte = new byte[1]; 106 107 /** 108 * Supplier that delegates to {@link #readOneByte}. 109 */ 110 protected final ByteUtils.ByteSupplier supplier = this::readOneByte; 111 112 /** 113 * Creates a new LZ77 input stream. 114 * 115 * @param is An InputStream to read compressed data from 116 * @param windowSize Size of the window kept for back-references, must be bigger than the biggest offset expected. 117 * 118 * @throws IllegalArgumentException if windowSize is not bigger than 0 119 */ 120 public AbstractLZ77CompressorInputStream(final InputStream is, final int windowSize) { 121 this.in = BoundedInputStream.builder().setInputStream(is).asSupplier().get(); 122 if (windowSize <= 0) { 123 throw new IllegalArgumentException("windowSize must be bigger than 0"); 124 } 125 this.windowSize = windowSize; 126 buf = new byte[3 * windowSize]; 127 writeIndex = readIndex = 0; 128 bytesRemaining = 0; 129 } 130 131 /** {@inheritDoc} */ 132 @Override 133 public int available() { 134 return writeIndex - readIndex; 135 } 136 137 /** {@inheritDoc} */ 138 @Override 139 public void close() throws IOException { 140 in.close(); 141 } 142 143 /** 144 * @since 1.17 145 */ 146 @Override 147 public long getCompressedCount() { 148 return in.getCount(); 149 } 150 151 /** 152 * Gets the uncompressed size of the stream 153 * 154 * @return the uncompressed size 155 */ 156 public int getSize() { 157 return size; 158 } 159 160 /** 161 * Is there still data remaining inside the current block? 162 * 163 * @return true if there is still data remaining inside the current block. 164 */ 165 protected final boolean hasMoreDataInBlock() { 166 return bytesRemaining > 0; 167 } 168 169 /** 170 * Adds some initial data to fill the window with. 171 * 172 * <p> 173 * This is used if the stream has been cut into blocks and back-references of one block may refer to data of the previous block(s). One such example is the 174 * LZ4 frame format using block dependency. 175 * </p> 176 * 177 * @param data the data to fill the window with. 178 * @throws IllegalStateException if the stream has already started to read data 179 */ 180 public void prefill(final byte[] data) { 181 if (writeIndex != 0) { 182 throw new IllegalStateException("The stream has already been read from, can't prefill anymore"); 183 } 184 // we don't need more data than the big offset could refer to, so cap it 185 final int len = Math.min(windowSize, data.length); 186 // we need the last data as we are dealing with *back*-references 187 System.arraycopy(data, data.length - len, buf, 0, len); 188 writeIndex += len; 189 readIndex += len; 190 } 191 192 /** {@inheritDoc} */ 193 @Override 194 public int read() throws IOException { 195 return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF; 196 } 197 198 /** 199 * Reads data from the current back-reference. 200 * 201 * @param b buffer to write data to 202 * @param off offset to start writing to 203 * @param len maximum amount of data to read 204 * @return number of bytes read, may be 0. Will never return -1 as EOF-detection is the responsibility of the subclass 205 * @throws NullPointerException if {@code b} is null 206 * @throws IndexOutOfBoundsException if {@code off} is negative, {@code len} is negative, or {@code len} is greater than {@code b.length - off} 207 */ 208 protected final int readBackReference(final byte[] b, final int off, final int len) { 209 final int avail = available(); 210 if (len > avail) { 211 tryToCopy(len - avail); 212 } 213 return readFromBuffer(b, off, len); 214 } 215 216 private int readFromBuffer(final byte[] b, final int off, final int len) { 217 final int readable = Math.min(len, available()); 218 if (readable > 0) { 219 System.arraycopy(buf, readIndex, b, off, readable); 220 readIndex += readable; 221 if (readIndex > 2 * windowSize) { 222 slideBuffer(); 223 } 224 } 225 size += readable; 226 return readable; 227 } 228 229 /** 230 * Reads data from the current literal block. 231 * 232 * @param b buffer to write data to 233 * @param off offset to start writing to 234 * @param len maximum amount of data to read 235 * @return number of bytes read, may be 0. Will never return -1 as EOF-detection is the responsibility of the subclass 236 * @throws IOException if the underlying stream throws or signals an EOF before the amount of data promised for the block have been read 237 * @throws NullPointerException if {@code b} is null 238 * @throws IndexOutOfBoundsException if {@code off} is negative, {@code len} is negative, or {@code len} is greater than {@code b.length - off} 239 */ 240 protected final int readLiteral(final byte[] b, final int off, final int len) throws IOException { 241 final int avail = available(); 242 if (len > avail) { 243 tryToReadLiteral(len - avail); 244 } 245 return readFromBuffer(b, off, len); 246 } 247 248 /** 249 * Reads a single byte from the real input stream and ensures the data is accounted for. 250 * 251 * @return the byte read as value between 0 and 255 or -1 if EOF has been reached. 252 * @throws IOException if the underlying stream throws 253 */ 254 protected final int readOneByte() throws IOException { 255 final int b = in.read(); 256 if (b != -1) { 257 count(1); 258 return b & 0xFF; 259 } 260 return -1; 261 } 262 263 private void slideBuffer() { 264 System.arraycopy(buf, windowSize, buf, 0, windowSize * 2); 265 writeIndex -= windowSize; 266 readIndex -= windowSize; 267 } 268 269 /** 270 * Used by subclasses to signal the next block contains a back-reference with the given coordinates. 271 * 272 * @param offset the offset of the back-reference 273 * @param length the length of the back-reference 274 * @throws IllegalArgumentException if offset not bigger than 0 or bigger than the number of bytes available for back-references or if length is negative 275 */ 276 protected final void startBackReference(final int offset, final long length) { 277 if (offset <= 0 || offset > writeIndex) { 278 throw new IllegalArgumentException("offset must be bigger than 0 but not bigger than the number" + " of bytes available for back-references"); 279 } 280 if (length < 0) { 281 throw new IllegalArgumentException("length must not be negative"); 282 } 283 backReferenceOffset = offset; 284 bytesRemaining = length; 285 } 286 287 /** 288 * Used by subclasses to signal the next block contains the given amount of literal data. 289 * 290 * @param length the length of the block 291 * @throws IllegalArgumentException if length is negative 292 */ 293 protected final void startLiteral(final long length) { 294 if (length < 0) { 295 throw new IllegalArgumentException("length must not be negative"); 296 } 297 bytesRemaining = length; 298 } 299 300 private void tryToCopy(final int bytesToCopy) { 301 // this will fit into the buffer without sliding and not 302 // require more than is available inside the back-reference 303 final int copy = Math.min((int) Math.min(bytesToCopy, bytesRemaining), buf.length - writeIndex); 304 if (copy == 0) { 305 // NOP 306 } else if (backReferenceOffset == 1) { // pretty common special case 307 final byte last = buf[writeIndex - 1]; 308 Arrays.fill(buf, writeIndex, writeIndex + copy, last); 309 writeIndex += copy; 310 } else if (copy < backReferenceOffset) { 311 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, copy); 312 writeIndex += copy; 313 } else { 314 // back-reference overlaps with the bytes created from it 315 // like go back two bytes and then copy six (by copying 316 // the last two bytes three time). 317 final int fullRots = copy / backReferenceOffset; 318 for (int i = 0; i < fullRots; i++) { 319 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, backReferenceOffset); 320 writeIndex += backReferenceOffset; 321 } 322 323 final int pad = copy - backReferenceOffset * fullRots; 324 if (pad > 0) { 325 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, pad); 326 writeIndex += pad; 327 } 328 } 329 bytesRemaining -= copy; 330 } 331 332 private void tryToReadLiteral(final int bytesToRead) throws IOException { 333 // min of "what is still inside the literal", "what does the user want" and "how much can fit into the buffer" 334 final int reallyTryToRead = Math.min((int) Math.min(bytesToRead, bytesRemaining), buf.length - writeIndex); 335 final int bytesRead = reallyTryToRead > 0 ? IOUtils.readFully(in, buf, writeIndex, reallyTryToRead) : 0 /* happens for bytesRemaining == 0 */; 336 count(bytesRead); 337 if (reallyTryToRead != bytesRead) { 338 throw new IOException("Premature end of stream reading literal"); 339 } 340 writeIndex += reallyTryToRead; 341 bytesRemaining -= reallyTryToRead; 342 } 343 }