SnappyCompressorInputStream.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.compressors.snappy;

import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.compress.compressors.lz77support.AbstractLZ77CompressorInputStream;
import org.apache.commons.compress.utils.ByteUtils;

/**
 * CompressorInputStream for the raw Snappy format.
 *
 * <p>
 * This implementation uses an internal buffer in order to handle the back-references that are at the heart of the LZ77 algorithm. The size of the buffer must
 * be at least as big as the biggest offset used in the compressed stream. The current version of the Snappy algorithm as defined by Google works on 32k blocks
 * and doesn't contain offsets bigger than 32k which is the default block size used by this class.
 * </p>
 *
 * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a>
 * @since 1.7
 */
public class SnappyCompressorInputStream extends AbstractLZ77CompressorInputStream {

    private enum State {
        NO_BLOCK, IN_LITERAL, IN_BACK_REFERENCE
    }

    /** Mask used to determine the type of "tag" is being processed */
    private static final int TAG_MASK = 0x03;

    /** Default block size */
    public static final int DEFAULT_BLOCK_SIZE = 32768;

    /** The size of the uncompressed data */
    private final int size;

    /** Number of uncompressed bytes still to be read. */
    private int uncompressedBytesRemaining;

    /** Current state of the stream */
    private State state = State.NO_BLOCK;

    private boolean endReached;

    /**
     * Constructor using the default buffer size of 32k.
     *
     * @param is An InputStream to read compressed data from
     *
     * @throws IOException if reading fails
     */
    public SnappyCompressorInputStream(final InputStream is) throws IOException {
        this(is, DEFAULT_BLOCK_SIZE);
    }

    /**
     * Constructor using a configurable buffer size.
     *
     * @param is        An InputStream to read compressed data from
     * @param blockSize The block size used in compression
     *
     * @throws IOException              if reading fails
     * @throws IllegalArgumentException if blockSize is not bigger than 0
     */
    public SnappyCompressorInputStream(final InputStream is, final int blockSize) throws IOException {
        super(is, blockSize);
        uncompressedBytesRemaining = size = (int) readSize();
    }

    /**
     * Try to fill the buffer with the next block of data.
     */
    private void fill() throws IOException {
        if (uncompressedBytesRemaining == 0) {
            endReached = true;
            return;
        }

        int b = readOneByte();
        if (b == -1) {
            throw new IOException("Premature end of stream reading block start");
        }
        int length = 0;
        int offset = 0;

        switch (b & TAG_MASK) {

        case 0x00:

            length = readLiteralLength(b);
            if (length < 0) {
                throw new IOException("Illegal block with a negative literal size found");
            }
            uncompressedBytesRemaining -= length;
            startLiteral(length);
            state = State.IN_LITERAL;
            break;

        case 0x01:

            /*
             * These elements can encode lengths between [4..11] bytes and offsets between [0..2047] bytes. (len-4) occupies three bits and is stored in bits
             * [2..4] of the tag byte. The offset occupies 11 bits, of which the upper three are stored in the upper three bits ([5..7]) of the tag byte, and
             * the lower eight are stored in a byte following the tag byte.
             */

            length = 4 + (b >> 2 & 0x07);
            uncompressedBytesRemaining -= length;
            offset = (b & 0xE0) << 3;
            b = readOneByte();
            if (b == -1) {
                throw new IOException("Premature end of stream reading back-reference length");
            }
            offset |= b;

            try {
                startBackReference(offset, length);
            } catch (final IllegalArgumentException ex) {
                throw new IOException("Illegal block with bad offset found", ex);
            }
            state = State.IN_BACK_REFERENCE;
            break;

        case 0x02:

            /*
             * These elements can encode lengths between [1..64] and offsets from [0..65535]. (len-1) occupies six bits and is stored in the upper six bits
             * ([2..7]) of the tag byte. The offset is stored as a little-endian 16-bit integer in the two bytes following the tag byte.
             */

            length = (b >> 2) + 1;
            if (length < 0) {
                throw new IOException("Illegal block with a negative match length found");
            }
            uncompressedBytesRemaining -= length;

            offset = (int) ByteUtils.fromLittleEndian(supplier, 2);

            try {
                startBackReference(offset, length);
            } catch (final IllegalArgumentException ex) {
                throw new IOException("Illegal block with bad offset found", ex);
            }
            state = State.IN_BACK_REFERENCE;
            break;

        case 0x03:

            /*
             * These are like the copies with 2-byte offsets (see previous subsection), except that the offset is stored as a 32-bit integer instead of a 16-bit
             * integer (and thus will occupy four bytes).
             */

            length = (b >> 2) + 1;
            if (length < 0) {
                throw new IOException("Illegal block with a negative match length found");
            }
            uncompressedBytesRemaining -= length;

            offset = (int) ByteUtils.fromLittleEndian(supplier, 4) & 0x7fffffff;

            try {
                startBackReference(offset, length);
            } catch (final IllegalArgumentException ex) {
                throw new IOException("Illegal block with bad offset found", ex);
            }
            state = State.IN_BACK_REFERENCE;
            break;
        default:
            // impossible as TAG_MASK is two bits and all four possible cases have been covered
            break;
        }
    }

    /**
     * Gets the uncompressed size of the stream
     *
     * @return the uncompressed size
     */
    @Override
    public int getSize() {
        return size;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public int read(final byte[] b, final int off, final int len) throws IOException {
        if (len == 0) {
            return 0;
        }
        if (endReached) {
            return -1;
        }
        switch (state) {
        case NO_BLOCK:
            fill();
            return read(b, off, len);
        case IN_LITERAL:
            final int litLen = readLiteral(b, off, len);
            if (!hasMoreDataInBlock()) {
                state = State.NO_BLOCK;
            }
            return litLen > 0 ? litLen : read(b, off, len);
        case IN_BACK_REFERENCE:
            final int backReferenceLen = readBackReference(b, off, len);
            if (!hasMoreDataInBlock()) {
                state = State.NO_BLOCK;
            }
            return backReferenceLen > 0 ? backReferenceLen : read(b, off, len);
        default:
            throw new IOException("Unknown stream state " + state);
        }
    }

    /*
     * For literals up to and including 60 bytes in length, the upper six bits of the tag byte contain (len-1). The literal follows immediately thereafter in
     * the bytestream. - For longer literals, the (len-1) value is stored after the tag byte, little-endian. The upper six bits of the tag byte describe how
     * many bytes are used for the length; 60, 61, 62 or 63 for 1-4 bytes, respectively. The literal itself follows after the length.
     */
    private int readLiteralLength(final int b) throws IOException {
        final int length;
        switch (b >> 2) {
        case 60:
            length = readOneByte();
            if (length == -1) {
                throw new IOException("Premature end of stream reading literal length");
            }
            break;
        case 61:
            length = (int) ByteUtils.fromLittleEndian(supplier, 2);
            break;
        case 62:
            length = (int) ByteUtils.fromLittleEndian(supplier, 3);
            break;
        case 63:
            length = (int) ByteUtils.fromLittleEndian(supplier, 4);
            break;
        default:
            length = b >> 2;
            break;
        }

        return length + 1;
    }

    /**
     * The stream starts with the uncompressed length (up to a maximum of 2^32 - 1), stored as a little-endian varint. Varints consist of a series of bytes,
     * where the lower 7 bits are data and the upper bit is set iff there are more bytes to be read. In other words, an uncompressed length of 64 would be
     * stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE) would be stored as 0xFE 0xFF 0x7F.
     *
     * @return The size of the uncompressed data
     *
     * @throws IOException Could not read a byte
     */
    private long readSize() throws IOException {
        int index = 0;
        long sz = 0;
        int b = 0;

        do {
            b = readOneByte();
            if (b == -1) {
                throw new IOException("Premature end of stream reading size");
            }
            sz |= (b & 0x7f) << index++ * 7;
        } while (0 != (b & 0x80));
        return sz;
    }
}