001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.gzip;
020
021import java.io.BufferedInputStream;
022import java.io.ByteArrayOutputStream;
023import java.io.DataInput;
024import java.io.DataInputStream;
025import java.io.EOFException;
026import java.io.IOException;
027import java.io.InputStream;
028import java.util.zip.CRC32;
029import java.util.zip.DataFormatException;
030import java.util.zip.Deflater;
031import java.util.zip.Inflater;
032
033import org.apache.commons.compress.compressors.CompressorInputStream;
034import org.apache.commons.compress.utils.ByteUtils;
035import org.apache.commons.compress.utils.InputStreamStatistics;
036import org.apache.commons.io.input.BoundedInputStream;
037
038/**
039 * Input stream that decompresses .gz files.
040 *
041 * <p>
042 * This supports decompressing concatenated .gz files which is important when decompressing standalone .gz files.
043 * </p>
044 *
045 * <p>
046 * Instead of using {@code java.util.zip.GZIPInputStream}, this class has its own GZIP member decoder.
047 * The actual decompression is done with {@link java.util.zip.Inflater}.
048 * </p>
049 *
050 * <p>
051 * If you use the constructor {@code GzipCompressorInputStream(in)} or {@code GzipCompressorInputStream(in, false)},
052 * then {@link #read} will return -1 as soon as the first encoded GZIP member has been completely read. In this case,
053 * if the underlying input stream supports {@link InputStream#mark mark()} and {@link InputStream#reset reset()},
054 * then it will be left positioned just after the end of the encoded GZIP member; otherwise, some indeterminate number
055 * of extra bytes following the encoded GZIP member will have been consumed and discarded.
056 * </p>
057 *
058 * <p>
059 * If you use the constructor {@code GzipCompressorInputStream(in, true)} then {@link #read} will return -1 only after
060 * the entire input stream has been exhausted; any bytes that follow an encoded GZIP member must constitute a new encoded
061 * GZIP member, otherwise an {@link IOException} is thrown. The data read from a stream constructed this way will consist
062 * of the concatenated data of all of the encoded GZIP members in order.
063 * </p>
064 *
065 * @see "https://tools.ietf.org/html/rfc1952"
066 */
067public class GzipCompressorInputStream extends CompressorInputStream implements InputStreamStatistics {
068
069    // Header flags
070    // private static final int FTEXT = 0x01; // Uninteresting for us
071    private static final int FHCRC = 0x02;
072    private static final int FEXTRA = 0x04;
073    private static final int FNAME = 0x08;
074    private static final int FCOMMENT = 0x10;
075    private static final int FRESERVED = 0xE0;
076
077    /**
078     * Checks if the signature matches what is expected for a .gz file.
079     *
080     * @param signature the bytes to check
081     * @param length    the number of bytes to check
082     * @return true if this is a .gz stream, false otherwise
083     *
084     * @since 1.1
085     */
086    public static boolean matches(final byte[] signature, final int length) {
087        return length >= 2 && signature[0] == 31 && signature[1] == -117;
088    }
089
090    private static byte[] readToNull(final DataInput inData) throws IOException {
091        try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
092            int b;
093            while ((b = inData.readUnsignedByte()) != 0) { // NOPMD NOSONAR
094                bos.write(b);
095            }
096            return bos.toByteArray();
097        }
098    }
099
100    private final BoundedInputStream countingStream;
101
102    // Compressed input stream, possibly wrapped in a
103    // BufferedInputStream, always wrapped in countingStream above
104    private final InputStream in;
105
106    // True if decompressing multi member streams.
107    private final boolean decompressConcatenated;
108
109    // Buffer to hold the input data
110    private final byte[] buf = new byte[8192];
111
112    // Amount of data in buf.
113    private int bufUsed;
114
115    // Decompressor
116    private Inflater inf = new Inflater(true);
117
118    // CRC32 from uncompressed data
119    private final CRC32 crc = new CRC32();
120
121    // True once everything has been decompressed
122    private boolean endReached;
123
124    // used in no-arg read method
125    private final byte[] oneByte = new byte[1];
126
127    private final GzipParameters parameters = new GzipParameters();
128
129    /**
130     * Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.
131     * <p>
132     * This is equivalent to {@code GzipCompressorInputStream(inputStream, false)} and thus will not decompress concatenated .gz files.
133     *
134     * @param inputStream the InputStream from which this object should be created of
135     *
136     * @throws IOException if the stream could not be created
137     */
138    public GzipCompressorInputStream(final InputStream inputStream) throws IOException {
139        this(inputStream, false);
140    }
141
142    /**
143     * Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.
144     * <p>
145     * If {@code decompressConcatenated} is {@code false}: This decompressor might read more input than it will actually use. If {@code inputStream} supports
146     * {@code mark} and {@code reset}, then the input position will be adjusted so that it is right after the last byte of the compressed stream. If
147     * {@code mark} isn't supported, the input position will be undefined.
148     *
149     * @param inputStream            the InputStream from which this object should be created of
150     * @param decompressConcatenated if true, decompress until the end of the input; if false, stop after the first .gz member
151     *
152     * @throws IOException if the stream could not be created
153     */
154    public GzipCompressorInputStream(final InputStream inputStream, final boolean decompressConcatenated) throws IOException {
155        countingStream = BoundedInputStream.builder().setInputStream(inputStream).get();
156        // Mark support is strictly needed for concatenated files only,
157        // but it's simpler if it is always available.
158        if (countingStream.markSupported()) {
159            in = countingStream;
160        } else {
161            in = new BufferedInputStream(countingStream);
162        }
163
164        this.decompressConcatenated = decompressConcatenated;
165        init(true);
166    }
167
168    /**
169     * Closes the input stream (unless it is System.in).
170     *
171     * @since 1.2
172     */
173    @Override
174    public void close() throws IOException {
175        if (inf != null) {
176            inf.end();
177            inf = null;
178        }
179
180        if (this.in != System.in) {
181            this.in.close();
182        }
183    }
184
185    /**
186     * @since 1.17
187     */
188    @Override
189    public long getCompressedCount() {
190        return countingStream.getCount();
191    }
192
193    /**
194     * Provides the stream's meta data - may change with each stream when decompressing concatenated streams.
195     *
196     * @return the stream's meta data
197     * @since 1.8
198     */
199    public GzipParameters getMetaData() {
200        return parameters;
201    }
202
203    private boolean init(final boolean isFirstMember) throws IOException {
204        if (!isFirstMember && !decompressConcatenated) { // at least one must be true
205            throw new IllegalStateException("Unexpected: isFirstMember and decompressConcatenated are both false!");
206        }
207
208        // Check the magic bytes without a possibility of EOFException.
209        final int magic0 = in.read();
210
211        // If end of input was reached after decompressing at least
212        // one .gz member, we have reached the end of the file successfully.
213        if (magic0 == -1 && !isFirstMember) {
214            return false;
215        }
216
217        if (magic0 != 31 || in.read() != 139) {
218            throw new IOException(isFirstMember ? "Input is not in the .gz format" : "Garbage after a valid .gz stream");
219        }
220
221        // Parsing the rest of the header may throw EOFException.
222        final DataInput inData = new DataInputStream(in);
223        final int method = inData.readUnsignedByte();
224        if (method != Deflater.DEFLATED) {
225            throw new IOException("Unsupported compression method " + method + " in the .gz header");
226        }
227
228        final int flg = inData.readUnsignedByte();
229        if ((flg & FRESERVED) != 0) {
230            throw new IOException("Reserved flags are set in the .gz header");
231        }
232
233        parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4) * 1000);
234        switch (inData.readUnsignedByte()) { // extra flags
235        case 2:
236            parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
237            break;
238        case 4:
239            parameters.setCompressionLevel(Deflater.BEST_SPEED);
240            break;
241        default:
242            // ignored for now
243            break;
244        }
245        parameters.setOperatingSystem(inData.readUnsignedByte());
246
247        // Extra field, ignored
248        if ((flg & FEXTRA) != 0) {
249            int xlen = inData.readUnsignedByte();
250            xlen |= inData.readUnsignedByte() << 8;
251
252            // This isn't as efficient as calling in.skip would be,
253            // but it's lazier to handle unexpected end of input this way.
254            // Most files don't have an extra field anyway.
255            while (xlen-- > 0) {
256                inData.readUnsignedByte();
257            }
258        }
259
260        // Original file name
261        if ((flg & FNAME) != 0) {
262            parameters.setFileName(new String(readToNull(inData), GzipUtils.GZIP_ENCODING));
263        }
264
265        // Comment
266        if ((flg & FCOMMENT) != 0) {
267            parameters.setComment(new String(readToNull(inData), GzipUtils.GZIP_ENCODING));
268        }
269
270        // Header "CRC16" which is actually a truncated CRC32 (which isn't
271        // as good as real CRC16). I don't know if any encoder implementation
272        // sets this, so it's not worth trying to verify it. GNU gzip 1.4
273        // doesn't support this field, but zlib seems to be able to at least
274        // skip over it.
275        if ((flg & FHCRC) != 0) {
276            inData.readShort();
277        }
278
279        // Reset
280        inf.reset();
281        crc.reset();
282
283        return true;
284    }
285
286    @Override
287    public int read() throws IOException {
288        return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
289    }
290
291    /**
292     * {@inheritDoc}
293     *
294     * @since 1.1
295     */
296    @Override
297    public int read(final byte[] b, int off, int len) throws IOException {
298        if (len == 0) {
299            return 0;
300        }
301        if (endReached) {
302            return -1;
303        }
304
305        int size = 0;
306
307        while (len > 0) {
308            if (inf.needsInput()) {
309                // Remember the current position because we may need to
310                // rewind after reading too much input.
311                in.mark(buf.length);
312
313                bufUsed = in.read(buf);
314                if (bufUsed == -1) {
315                    throw new EOFException();
316                }
317
318                inf.setInput(buf, 0, bufUsed);
319            }
320
321            final int ret;
322            try {
323                ret = inf.inflate(b, off, len);
324            } catch (final DataFormatException e) { // NOSONAR
325                throw new IOException("Gzip-compressed data is corrupt");
326            }
327
328            crc.update(b, off, ret);
329            off += ret;
330            len -= ret;
331            size += ret;
332            count(ret);
333
334            if (inf.finished()) {
335                // We may have read too many bytes. Rewind the read
336                // position to match the actual amount used.
337                in.reset();
338
339                final int skipAmount = bufUsed - inf.getRemaining();
340                if (org.apache.commons.io.IOUtils.skip(in, skipAmount) != skipAmount) {
341                    throw new IOException();
342                }
343
344                bufUsed = 0;
345
346                final DataInput inData = new DataInputStream(in);
347
348                // CRC32
349                final long crcStored = ByteUtils.fromLittleEndian(inData, 4);
350
351                if (crcStored != crc.getValue()) {
352                    throw new IOException("Gzip-compressed data is corrupt " + "(CRC32 error)");
353                }
354
355                // Uncompressed size modulo 2^32 (ISIZE in the spec)
356                final long isize = ByteUtils.fromLittleEndian(inData, 4);
357
358                if (isize != (inf.getBytesWritten() & 0xffffffffL)) {
359                    throw new IOException("Gzip-compressed data is corrupt" + "(uncompressed size mismatch)");
360                }
361
362                // See if this is the end of the file.
363                if (!decompressConcatenated || !init(false)) {
364                    inf.end();
365                    inf = null;
366                    endReached = true;
367                    return size == 0 ? -1 : size;
368                }
369            }
370        }
371
372        return size;
373    }
374}