View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   * http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.commons.compress.compressors.lz77support;
20  
21  import java.io.IOException;
22  import java.io.InputStream;
23  import java.util.Arrays;
24  
25  import org.apache.commons.compress.compressors.CompressorInputStream;
26  import org.apache.commons.compress.utils.ByteUtils;
27  import org.apache.commons.compress.utils.IOUtils;
28  import org.apache.commons.compress.utils.InputStreamStatistics;
29  import org.apache.commons.io.input.BoundedInputStream;
30  
31  /**
32   * Encapsulates code common to LZ77 decompressors.
33   *
34   * <p>
35   * Assumes the stream consists of blocks of literal data and back-references (called copies) in any order. Of course the first block must be a literal block for
36   * the scheme to work - unless the {@link #prefill prefill} method has been used to provide initial data that is never returned by {@link #read read} but only
37   * used for back-references.
38   * </p>
39   *
40   * <p>
41   * Subclasses must override the three-arg {@link #read read} method as the no-arg version delegates to it and the default implementation delegates to the no-arg
42   * version, leading to infinite mutual recursion and a {@code StackOverflowError} otherwise.
43   * </p>
44   *
45   * <p>
46   * The contract for subclasses' {@code read} implementation is:
47   * </p>
48   * <ul>
49   *
50   * <li>keep track of the current state of the stream. Is it inside a literal block or a back-reference or in-between blocks?</li>
51   *
52   * <li>Use {@link #readOneByte} to access the underlying stream directly.</li>
53   *
54   * <li>If a new literal block starts, use {@link #startLiteral} to tell this class about it and read the literal data using {@link #readLiteral} until it
55   * returns {@code 0}. {@link #hasMoreDataInBlock} will return {@code false} before the next call to {@link #readLiteral} would return {@code 0}.</li>
56   *
57   * <li>If a new back-reference starts, use {@link #startBackReference} to tell this class about it and read the literal data using {@link #readBackReference}
58   * until it returns {@code 0}. {@link #hasMoreDataInBlock} will return {@code false} before the next call to {@link #readBackReference} would return
59   * {@code 0}.</li>
60   *
61   * <li>If the end of the stream has been reached, return {@code -1} as this class' methods will never do so themselves.</li>
62   *
63   * </ul>
64   *
65   * <p>
66   * {@link #readOneByte} and {@link #readLiteral} update the counter for bytes read.
67   * </p>
68   *
69   * @since 1.14
70   */
71  public abstract class AbstractLZ77CompressorInputStream extends CompressorInputStream implements InputStreamStatistics {
72  
73      /** Size of the window - must be bigger than the biggest offset expected. */
74      private final int windowSize;
75  
76      /**
77       * Buffer to write decompressed bytes to for back-references, will be three times windowSize big.
78       *
79       * <p>
80       * Three times so we can slide the whole buffer a windowSize to the left once we've read twice windowSize and still have enough data inside of it to satisfy
81       * back-references.
82       * </p>
83       */
84      private final byte[] buf;
85  
86      /** One behind the index of the last byte in the buffer that was written, i.e. the next position to write to */
87      private int writeIndex;
88  
89      /** Index of the next byte to be read. */
90      private int readIndex;
91  
92      /** The underlying stream to read compressed data from */
93      private final BoundedInputStream in;
94  
95      /** Number of bytes still to be read from the current literal or back-reference. */
96      private long bytesRemaining;
97  
98      /** Offset of the current back-reference. */
99      private int backReferenceOffset;
100 
101     /** Uncompressed size */
102     private int size;
103 
104     // used in no-arg read method
105     private final byte[] oneByte = new byte[1];
106 
107     /**
108      * Supplier that delegates to {@link #readOneByte}.
109      */
110     protected final ByteUtils.ByteSupplier supplier = this::readOneByte;
111 
112     /**
113      * Creates a new LZ77 input stream.
114      *
115      * @param is         An InputStream to read compressed data from
116      * @param windowSize Size of the window kept for back-references, must be bigger than the biggest offset expected.
117      *
118      * @throws IllegalArgumentException if windowSize is not bigger than 0
119      */
120     public AbstractLZ77CompressorInputStream(final InputStream is, final int windowSize) {
121         this.in = BoundedInputStream.builder().setInputStream(is).asSupplier().get();
122         if (windowSize <= 0) {
123             throw new IllegalArgumentException("windowSize must be bigger than 0");
124         }
125         this.windowSize = windowSize;
126         buf = new byte[3 * windowSize];
127         writeIndex = readIndex = 0;
128         bytesRemaining = 0;
129     }
130 
131     /** {@inheritDoc} */
132     @Override
133     public int available() {
134         return writeIndex - readIndex;
135     }
136 
137     /** {@inheritDoc} */
138     @Override
139     public void close() throws IOException {
140         in.close();
141     }
142 
143     /**
144      * @since 1.17
145      */
146     @Override
147     public long getCompressedCount() {
148         return in.getCount();
149     }
150 
151     /**
152      * Gets the uncompressed size of the stream
153      *
154      * @return the uncompressed size
155      */
156     public int getSize() {
157         return size;
158     }
159 
160     /**
161      * Is there still data remaining inside the current block?
162      *
163      * @return true if there is still data remaining inside the current block.
164      */
165     protected final boolean hasMoreDataInBlock() {
166         return bytesRemaining > 0;
167     }
168 
169     /**
170      * Adds some initial data to fill the window with.
171      *
172      * <p>
173      * This is used if the stream has been cut into blocks and back-references of one block may refer to data of the previous block(s). One such example is the
174      * LZ4 frame format using block dependency.
175      * </p>
176      *
177      * @param data the data to fill the window with.
178      * @throws IllegalStateException if the stream has already started to read data
179      */
180     public void prefill(final byte[] data) {
181         if (writeIndex != 0) {
182             throw new IllegalStateException("The stream has already been read from, can't prefill anymore");
183         }
184         // we don't need more data than the big offset could refer to, so cap it
185         final int len = Math.min(windowSize, data.length);
186         // we need the last data as we are dealing with *back*-references
187         System.arraycopy(data, data.length - len, buf, 0, len);
188         writeIndex += len;
189         readIndex += len;
190     }
191 
192     /** {@inheritDoc} */
193     @Override
194     public int read() throws IOException {
195         return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
196     }
197 
198     /**
199      * Reads data from the current back-reference.
200      *
201      * @param b   buffer to write data to
202      * @param off offset to start writing to
203      * @param len maximum amount of data to read
204      * @return number of bytes read, may be 0. Will never return -1 as EOF-detection is the responsibility of the subclass
205      * @throws NullPointerException      if {@code b} is null
206      * @throws IndexOutOfBoundsException if {@code off} is negative, {@code len} is negative, or {@code len} is greater than {@code b.length - off}
207      */
208     protected final int readBackReference(final byte[] b, final int off, final int len) {
209         final int avail = available();
210         if (len > avail) {
211             tryToCopy(len - avail);
212         }
213         return readFromBuffer(b, off, len);
214     }
215 
216     private int readFromBuffer(final byte[] b, final int off, final int len) {
217         final int readable = Math.min(len, available());
218         if (readable > 0) {
219             System.arraycopy(buf, readIndex, b, off, readable);
220             readIndex += readable;
221             if (readIndex > 2 * windowSize) {
222                 slideBuffer();
223             }
224         }
225         size += readable;
226         return readable;
227     }
228 
229     /**
230      * Reads data from the current literal block.
231      *
232      * @param b   buffer to write data to
233      * @param off offset to start writing to
234      * @param len maximum amount of data to read
235      * @return number of bytes read, may be 0. Will never return -1 as EOF-detection is the responsibility of the subclass
236      * @throws IOException               if the underlying stream throws or signals an EOF before the amount of data promised for the block have been read
237      * @throws NullPointerException      if {@code b} is null
238      * @throws IndexOutOfBoundsException if {@code off} is negative, {@code len} is negative, or {@code len} is greater than {@code b.length - off}
239      */
240     protected final int readLiteral(final byte[] b, final int off, final int len) throws IOException {
241         final int avail = available();
242         if (len > avail) {
243             tryToReadLiteral(len - avail);
244         }
245         return readFromBuffer(b, off, len);
246     }
247 
248     /**
249      * Reads a single byte from the real input stream and ensures the data is accounted for.
250      *
251      * @return the byte read as value between 0 and 255 or -1 if EOF has been reached.
252      * @throws IOException if the underlying stream throws
253      */
254     protected final int readOneByte() throws IOException {
255         final int b = in.read();
256         if (b != -1) {
257             count(1);
258             return b & 0xFF;
259         }
260         return -1;
261     }
262 
263     private void slideBuffer() {
264         System.arraycopy(buf, windowSize, buf, 0, windowSize * 2);
265         writeIndex -= windowSize;
266         readIndex -= windowSize;
267     }
268 
269     /**
270      * Used by subclasses to signal the next block contains a back-reference with the given coordinates.
271      *
272      * @param offset the offset of the back-reference
273      * @param length the length of the back-reference
274      * @throws IllegalArgumentException if offset not bigger than 0 or bigger than the number of bytes available for back-references or if length is negative
275      */
276     protected final void startBackReference(final int offset, final long length) {
277         if (offset <= 0 || offset > writeIndex) {
278             throw new IllegalArgumentException("offset must be bigger than 0 but not bigger than the number" + " of bytes available for back-references");
279         }
280         if (length < 0) {
281             throw new IllegalArgumentException("length must not be negative");
282         }
283         backReferenceOffset = offset;
284         bytesRemaining = length;
285     }
286 
287     /**
288      * Used by subclasses to signal the next block contains the given amount of literal data.
289      *
290      * @param length the length of the block
291      * @throws IllegalArgumentException if length is negative
292      */
293     protected final void startLiteral(final long length) {
294         if (length < 0) {
295             throw new IllegalArgumentException("length must not be negative");
296         }
297         bytesRemaining = length;
298     }
299 
300     private void tryToCopy(final int bytesToCopy) {
301         // this will fit into the buffer without sliding and not
302         // require more than is available inside the back-reference
303         final int copy = Math.min((int) Math.min(bytesToCopy, bytesRemaining), buf.length - writeIndex);
304         if (copy == 0) {
305             // NOP
306         } else if (backReferenceOffset == 1) { // pretty common special case
307             final byte last = buf[writeIndex - 1];
308             Arrays.fill(buf, writeIndex, writeIndex + copy, last);
309             writeIndex += copy;
310         } else if (copy < backReferenceOffset) {
311             System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, copy);
312             writeIndex += copy;
313         } else {
314             // back-reference overlaps with the bytes created from it
315             // like go back two bytes and then copy six (by copying
316             // the last two bytes three time).
317             final int fullRots = copy / backReferenceOffset;
318             for (int i = 0; i < fullRots; i++) {
319                 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, backReferenceOffset);
320                 writeIndex += backReferenceOffset;
321             }
322 
323             final int pad = copy - backReferenceOffset * fullRots;
324             if (pad > 0) {
325                 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, pad);
326                 writeIndex += pad;
327             }
328         }
329         bytesRemaining -= copy;
330     }
331 
332     private void tryToReadLiteral(final int bytesToRead) throws IOException {
333         // min of "what is still inside the literal", "what does the user want" and "how much can fit into the buffer"
334         final int reallyTryToRead = Math.min((int) Math.min(bytesToRead, bytesRemaining), buf.length - writeIndex);
335         final int bytesRead = reallyTryToRead > 0 ? IOUtils.readFully(in, buf, writeIndex, reallyTryToRead) : 0 /* happens for bytesRemaining == 0 */;
336         count(bytesRead);
337         if (reallyTryToRead != bytesRead) {
338             throw new IOException("Premature end of stream reading literal");
339         }
340         writeIndex += reallyTryToRead;
341         bytesRemaining -= reallyTryToRead;
342     }
343 }