View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   * http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.commons.compress.compressors.gzip;
20  
21  import java.io.BufferedInputStream;
22  import java.io.ByteArrayOutputStream;
23  import java.io.DataInput;
24  import java.io.DataInputStream;
25  import java.io.EOFException;
26  import java.io.IOException;
27  import java.io.InputStream;
28  import java.util.zip.CRC32;
29  import java.util.zip.DataFormatException;
30  import java.util.zip.Deflater;
31  import java.util.zip.Inflater;
32  
33  import org.apache.commons.compress.compressors.CompressorInputStream;
34  import org.apache.commons.compress.utils.ByteUtils;
35  import org.apache.commons.compress.utils.InputStreamStatistics;
36  import org.apache.commons.io.input.BoundedInputStream;
37  
38  /**
39   * Input stream that decompresses .gz files.
40   *
41   * <p>
42   * This supports decompressing concatenated .gz files which is important when decompressing standalone .gz files.
43   * </p>
44   *
45   * <p>
46   * Instead of using {@code java.util.zip.GZIPInputStream}, this class has its own GZIP member decoder.
47   * The actual decompression is done with {@link java.util.zip.Inflater}.
48   * </p>
49   *
50   * <p>
51   * If you use the constructor {@code GzipCompressorInputStream(in)} or {@code GzipCompressorInputStream(in, false)},
52   * then {@link #read} will return -1 as soon as the first encoded GZIP member has been completely read. In this case,
53   * if the underlying input stream supports {@link InputStream#mark mark()} and {@link InputStream#reset reset()},
54   * then it will be left positioned just after the end of the encoded GZIP member; otherwise, some indeterminate number
55   * of extra bytes following the encoded GZIP member will have been consumed and discarded.
56   * </p>
57   *
58   * <p>
59   * If you use the constructor {@code GzipCompressorInputStream(in, true)} then {@link #read} will return -1 only after
60   * the entire input stream has been exhausted; any bytes that follow an encoded GZIP member must constitute a new encoded
61   * GZIP member, otherwise an {@link IOException} is thrown. The data read from a stream constructed this way will consist
62   * of the concatenated data of all of the encoded GZIP members in order.
63   * </p>
64   *
65   * @see "https://tools.ietf.org/html/rfc1952"
66   */
67  public class GzipCompressorInputStream extends CompressorInputStream implements InputStreamStatistics {
68  
69      // Header flags
70      // private static final int FTEXT = 0x01; // Uninteresting for us
71      private static final int FHCRC = 0x02;
72      private static final int FEXTRA = 0x04;
73      private static final int FNAME = 0x08;
74      private static final int FCOMMENT = 0x10;
75      private static final int FRESERVED = 0xE0;
76  
77      /**
78       * Checks if the signature matches what is expected for a .gz file.
79       *
80       * @param signature the bytes to check
81       * @param length    the number of bytes to check
82       * @return true if this is a .gz stream, false otherwise
83       *
84       * @since 1.1
85       */
86      public static boolean matches(final byte[] signature, final int length) {
87          return length >= 2 && signature[0] == 31 && signature[1] == -117;
88      }
89  
90      private static byte[] readToNull(final DataInput inData) throws IOException {
91          try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
92              int b;
93              while ((b = inData.readUnsignedByte()) != 0) { // NOPMD NOSONAR
94                  bos.write(b);
95              }
96              return bos.toByteArray();
97          }
98      }
99  
100     private final BoundedInputStream countingStream;
101 
102     // Compressed input stream, possibly wrapped in a
103     // BufferedInputStream, always wrapped in countingStream above
104     private final InputStream in;
105 
106     // True if decompressing multi member streams.
107     private final boolean decompressConcatenated;
108 
109     // Buffer to hold the input data
110     private final byte[] buf = new byte[8192];
111 
112     // Amount of data in buf.
113     private int bufUsed;
114 
115     // Decompressor
116     private Inflater inf = new Inflater(true);
117 
118     // CRC32 from uncompressed data
119     private final CRC32 crc = new CRC32();
120 
121     // True once everything has been decompressed
122     private boolean endReached;
123 
124     // used in no-arg read method
125     private final byte[] oneByte = new byte[1];
126 
127     private final GzipParameters parameters = new GzipParameters();
128 
129     /**
130      * Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.
131      * <p>
132      * This is equivalent to {@code GzipCompressorInputStream(inputStream, false)} and thus will not decompress concatenated .gz files.
133      *
134      * @param inputStream the InputStream from which this object should be created of
135      *
136      * @throws IOException if the stream could not be created
137      */
138     public GzipCompressorInputStream(final InputStream inputStream) throws IOException {
139         this(inputStream, false);
140     }
141 
142     /**
143      * Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.
144      * <p>
145      * If {@code decompressConcatenated} is {@code false}: This decompressor might read more input than it will actually use. If {@code inputStream} supports
146      * {@code mark} and {@code reset}, then the input position will be adjusted so that it is right after the last byte of the compressed stream. If
147      * {@code mark} isn't supported, the input position will be undefined.
148      *
149      * @param inputStream            the InputStream from which this object should be created of
150      * @param decompressConcatenated if true, decompress until the end of the input; if false, stop after the first .gz member
151      *
152      * @throws IOException if the stream could not be created
153      */
154     public GzipCompressorInputStream(final InputStream inputStream, final boolean decompressConcatenated) throws IOException {
155         countingStream = BoundedInputStream.builder().setInputStream(inputStream).get();
156         // Mark support is strictly needed for concatenated files only,
157         // but it's simpler if it is always available.
158         if (countingStream.markSupported()) {
159             in = countingStream;
160         } else {
161             in = new BufferedInputStream(countingStream);
162         }
163 
164         this.decompressConcatenated = decompressConcatenated;
165         init(true);
166     }
167 
168     /**
169      * Closes the input stream (unless it is System.in).
170      *
171      * @since 1.2
172      */
173     @Override
174     public void close() throws IOException {
175         if (inf != null) {
176             inf.end();
177             inf = null;
178         }
179 
180         if (this.in != System.in) {
181             this.in.close();
182         }
183     }
184 
185     /**
186      * @since 1.17
187      */
188     @Override
189     public long getCompressedCount() {
190         return countingStream.getCount();
191     }
192 
193     /**
194      * Provides the stream's meta data - may change with each stream when decompressing concatenated streams.
195      *
196      * @return the stream's meta data
197      * @since 1.8
198      */
199     public GzipParameters getMetaData() {
200         return parameters;
201     }
202 
203     private boolean init(final boolean isFirstMember) throws IOException {
204         if (!isFirstMember && !decompressConcatenated) { // at least one must be true
205             throw new IllegalStateException("Unexpected: isFirstMember and decompressConcatenated are both false!");
206         }
207 
208         // Check the magic bytes without a possibility of EOFException.
209         final int magic0 = in.read();
210 
211         // If end of input was reached after decompressing at least
212         // one .gz member, we have reached the end of the file successfully.
213         if (magic0 == -1 && !isFirstMember) {
214             return false;
215         }
216 
217         if (magic0 != 31 || in.read() != 139) {
218             throw new IOException(isFirstMember ? "Input is not in the .gz format" : "Garbage after a valid .gz stream");
219         }
220 
221         // Parsing the rest of the header may throw EOFException.
222         final DataInput inData = new DataInputStream(in);
223         final int method = inData.readUnsignedByte();
224         if (method != Deflater.DEFLATED) {
225             throw new IOException("Unsupported compression method " + method + " in the .gz header");
226         }
227 
228         final int flg = inData.readUnsignedByte();
229         if ((flg & FRESERVED) != 0) {
230             throw new IOException("Reserved flags are set in the .gz header");
231         }
232 
233         parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4) * 1000);
234         switch (inData.readUnsignedByte()) { // extra flags
235         case 2:
236             parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
237             break;
238         case 4:
239             parameters.setCompressionLevel(Deflater.BEST_SPEED);
240             break;
241         default:
242             // ignored for now
243             break;
244         }
245         parameters.setOperatingSystem(inData.readUnsignedByte());
246 
247         // Extra field, ignored
248         if ((flg & FEXTRA) != 0) {
249             int xlen = inData.readUnsignedByte();
250             xlen |= inData.readUnsignedByte() << 8;
251 
252             // This isn't as efficient as calling in.skip would be,
253             // but it's lazier to handle unexpected end of input this way.
254             // Most files don't have an extra field anyway.
255             while (xlen-- > 0) {
256                 inData.readUnsignedByte();
257             }
258         }
259 
260         // Original file name
261         if ((flg & FNAME) != 0) {
262             parameters.setFileName(new String(readToNull(inData), GzipUtils.GZIP_ENCODING));
263         }
264 
265         // Comment
266         if ((flg & FCOMMENT) != 0) {
267             parameters.setComment(new String(readToNull(inData), GzipUtils.GZIP_ENCODING));
268         }
269 
270         // Header "CRC16" which is actually a truncated CRC32 (which isn't
271         // as good as real CRC16). I don't know if any encoder implementation
272         // sets this, so it's not worth trying to verify it. GNU gzip 1.4
273         // doesn't support this field, but zlib seems to be able to at least
274         // skip over it.
275         if ((flg & FHCRC) != 0) {
276             inData.readShort();
277         }
278 
279         // Reset
280         inf.reset();
281         crc.reset();
282 
283         return true;
284     }
285 
286     @Override
287     public int read() throws IOException {
288         return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
289     }
290 
291     /**
292      * {@inheritDoc}
293      *
294      * @since 1.1
295      */
296     @Override
297     public int read(final byte[] b, int off, int len) throws IOException {
298         if (len == 0) {
299             return 0;
300         }
301         if (endReached) {
302             return -1;
303         }
304 
305         int size = 0;
306 
307         while (len > 0) {
308             if (inf.needsInput()) {
309                 // Remember the current position because we may need to
310                 // rewind after reading too much input.
311                 in.mark(buf.length);
312 
313                 bufUsed = in.read(buf);
314                 if (bufUsed == -1) {
315                     throw new EOFException();
316                 }
317 
318                 inf.setInput(buf, 0, bufUsed);
319             }
320 
321             final int ret;
322             try {
323                 ret = inf.inflate(b, off, len);
324             } catch (final DataFormatException e) { // NOSONAR
325                 throw new IOException("Gzip-compressed data is corrupt");
326             }
327 
328             crc.update(b, off, ret);
329             off += ret;
330             len -= ret;
331             size += ret;
332             count(ret);
333 
334             if (inf.finished()) {
335                 // We may have read too many bytes. Rewind the read
336                 // position to match the actual amount used.
337                 in.reset();
338 
339                 final int skipAmount = bufUsed - inf.getRemaining();
340                 if (org.apache.commons.io.IOUtils.skip(in, skipAmount) != skipAmount) {
341                     throw new IOException();
342                 }
343 
344                 bufUsed = 0;
345 
346                 final DataInput inData = new DataInputStream(in);
347 
348                 // CRC32
349                 final long crcStored = ByteUtils.fromLittleEndian(inData, 4);
350 
351                 if (crcStored != crc.getValue()) {
352                     throw new IOException("Gzip-compressed data is corrupt " + "(CRC32 error)");
353                 }
354 
355                 // Uncompressed size modulo 2^32 (ISIZE in the spec)
356                 final long isize = ByteUtils.fromLittleEndian(inData, 4);
357 
358                 if (isize != (inf.getBytesWritten() & 0xffffffffL)) {
359                     throw new IOException("Gzip-compressed data is corrupt" + "(uncompressed size mismatch)");
360                 }
361 
362                 // See if this is the end of the file.
363                 if (!decompressConcatenated || !init(false)) {
364                     inf.end();
365                     inf = null;
366                     endReached = true;
367                     return size == 0 ? -1 : size;
368                 }
369             }
370         }
371 
372         return size;
373     }
374 }