001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.snappy; 020 021import java.io.IOException; 022import java.io.InputStream; 023 024import org.apache.commons.compress.compressors.lz77support.AbstractLZ77CompressorInputStream; 025import org.apache.commons.compress.utils.ByteUtils; 026 027/** 028 * CompressorInputStream for the raw Snappy format. 029 * 030 * <p> 031 * This implementation uses an internal buffer in order to handle the back-references that are at the heart of the LZ77 algorithm. The size of the buffer must 032 * be at least as big as the biggest offset used in the compressed stream. The current version of the Snappy algorithm as defined by Google works on 32k blocks 033 * and doesn't contain offsets bigger than 32k which is the default block size used by this class. 034 * </p> 035 * 036 * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a> 037 * @since 1.7 038 */ 039public class SnappyCompressorInputStream extends AbstractLZ77CompressorInputStream { 040 041 private enum State { 042 NO_BLOCK, IN_LITERAL, IN_BACK_REFERENCE 043 } 044 045 /** Mask used to determine the type of "tag" is being processed */ 046 private static final int TAG_MASK = 0x03; 047 048 /** Default block size */ 049 public static final int DEFAULT_BLOCK_SIZE = 32768; 050 051 /** The size of the uncompressed data */ 052 private final int size; 053 054 /** Number of uncompressed bytes still to be read. */ 055 private int uncompressedBytesRemaining; 056 057 /** Current state of the stream */ 058 private State state = State.NO_BLOCK; 059 060 private boolean endReached; 061 062 /** 063 * Constructor using the default buffer size of 32k. 064 * 065 * @param is An InputStream to read compressed data from 066 * 067 * @throws IOException if reading fails 068 */ 069 public SnappyCompressorInputStream(final InputStream is) throws IOException { 070 this(is, DEFAULT_BLOCK_SIZE); 071 } 072 073 /** 074 * Constructor using a configurable buffer size. 075 * 076 * @param is An InputStream to read compressed data from 077 * @param blockSize The block size used in compression 078 * 079 * @throws IOException if reading fails 080 * @throws IllegalArgumentException if blockSize is not bigger than 0 081 */ 082 public SnappyCompressorInputStream(final InputStream is, final int blockSize) throws IOException { 083 super(is, blockSize); 084 uncompressedBytesRemaining = size = (int) readSize(); 085 } 086 087 /** 088 * Try to fill the buffer with the next block of data. 089 */ 090 private void fill() throws IOException { 091 if (uncompressedBytesRemaining == 0) { 092 endReached = true; 093 return; 094 } 095 096 int b = readOneByte(); 097 if (b == -1) { 098 throw new IOException("Premature end of stream reading block start"); 099 } 100 int length = 0; 101 int offset = 0; 102 103 switch (b & TAG_MASK) { 104 105 case 0x00: 106 107 length = readLiteralLength(b); 108 if (length < 0) { 109 throw new IOException("Illegal block with a negative literal size found"); 110 } 111 uncompressedBytesRemaining -= length; 112 startLiteral(length); 113 state = State.IN_LITERAL; 114 break; 115 116 case 0x01: 117 118 /* 119 * These elements can encode lengths between [4..11] bytes and offsets between [0..2047] bytes. (len-4) occupies three bits and is stored in bits 120 * [2..4] of the tag byte. The offset occupies 11 bits, of which the upper three are stored in the upper three bits ([5..7]) of the tag byte, and 121 * the lower eight are stored in a byte following the tag byte. 122 */ 123 124 length = 4 + (b >> 2 & 0x07); 125 uncompressedBytesRemaining -= length; 126 offset = (b & 0xE0) << 3; 127 b = readOneByte(); 128 if (b == -1) { 129 throw new IOException("Premature end of stream reading back-reference length"); 130 } 131 offset |= b; 132 133 try { 134 startBackReference(offset, length); 135 } catch (final IllegalArgumentException ex) { 136 throw new IOException("Illegal block with bad offset found", ex); 137 } 138 state = State.IN_BACK_REFERENCE; 139 break; 140 141 case 0x02: 142 143 /* 144 * These elements can encode lengths between [1..64] and offsets from [0..65535]. (len-1) occupies six bits and is stored in the upper six bits 145 * ([2..7]) of the tag byte. The offset is stored as a little-endian 16-bit integer in the two bytes following the tag byte. 146 */ 147 148 length = (b >> 2) + 1; 149 if (length < 0) { 150 throw new IOException("Illegal block with a negative match length found"); 151 } 152 uncompressedBytesRemaining -= length; 153 154 offset = (int) ByteUtils.fromLittleEndian(supplier, 2); 155 156 try { 157 startBackReference(offset, length); 158 } catch (final IllegalArgumentException ex) { 159 throw new IOException("Illegal block with bad offset found", ex); 160 } 161 state = State.IN_BACK_REFERENCE; 162 break; 163 164 case 0x03: 165 166 /* 167 * These are like the copies with 2-byte offsets (see previous subsection), except that the offset is stored as a 32-bit integer instead of a 16-bit 168 * integer (and thus will occupy four bytes). 169 */ 170 171 length = (b >> 2) + 1; 172 if (length < 0) { 173 throw new IOException("Illegal block with a negative match length found"); 174 } 175 uncompressedBytesRemaining -= length; 176 177 offset = (int) ByteUtils.fromLittleEndian(supplier, 4) & 0x7fffffff; 178 179 try { 180 startBackReference(offset, length); 181 } catch (final IllegalArgumentException ex) { 182 throw new IOException("Illegal block with bad offset found", ex); 183 } 184 state = State.IN_BACK_REFERENCE; 185 break; 186 default: 187 // impossible as TAG_MASK is two bits and all four possible cases have been covered 188 break; 189 } 190 } 191 192 /** 193 * Gets the uncompressed size of the stream 194 * 195 * @return the uncompressed size 196 */ 197 @Override 198 public int getSize() { 199 return size; 200 } 201 202 /** 203 * {@inheritDoc} 204 */ 205 @Override 206 public int read(final byte[] b, final int off, final int len) throws IOException { 207 if (len == 0) { 208 return 0; 209 } 210 if (endReached) { 211 return -1; 212 } 213 switch (state) { 214 case NO_BLOCK: 215 fill(); 216 return read(b, off, len); 217 case IN_LITERAL: 218 final int litLen = readLiteral(b, off, len); 219 if (!hasMoreDataInBlock()) { 220 state = State.NO_BLOCK; 221 } 222 return litLen > 0 ? litLen : read(b, off, len); 223 case IN_BACK_REFERENCE: 224 final int backReferenceLen = readBackReference(b, off, len); 225 if (!hasMoreDataInBlock()) { 226 state = State.NO_BLOCK; 227 } 228 return backReferenceLen > 0 ? backReferenceLen : read(b, off, len); 229 default: 230 throw new IOException("Unknown stream state " + state); 231 } 232 } 233 234 /* 235 * For literals up to and including 60 bytes in length, the upper six bits of the tag byte contain (len-1). The literal follows immediately thereafter in 236 * the bytestream. - For longer literals, the (len-1) value is stored after the tag byte, little-endian. The upper six bits of the tag byte describe how 237 * many bytes are used for the length; 60, 61, 62 or 63 for 1-4 bytes, respectively. The literal itself follows after the length. 238 */ 239 private int readLiteralLength(final int b) throws IOException { 240 final int length; 241 switch (b >> 2) { 242 case 60: 243 length = readOneByte(); 244 if (length == -1) { 245 throw new IOException("Premature end of stream reading literal length"); 246 } 247 break; 248 case 61: 249 length = (int) ByteUtils.fromLittleEndian(supplier, 2); 250 break; 251 case 62: 252 length = (int) ByteUtils.fromLittleEndian(supplier, 3); 253 break; 254 case 63: 255 length = (int) ByteUtils.fromLittleEndian(supplier, 4); 256 break; 257 default: 258 length = b >> 2; 259 break; 260 } 261 262 return length + 1; 263 } 264 265 /** 266 * The stream starts with the uncompressed length (up to a maximum of 2^32 - 1), stored as a little-endian varint. Varints consist of a series of bytes, 267 * where the lower 7 bits are data and the upper bit is set iff there are more bytes to be read. In other words, an uncompressed length of 64 would be 268 * stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE) would be stored as 0xFE 0xFF 0x7F. 269 * 270 * @return The size of the uncompressed data 271 * 272 * @throws IOException Could not read a byte 273 */ 274 private long readSize() throws IOException { 275 int index = 0; 276 long sz = 0; 277 int b = 0; 278 279 do { 280 b = readOneByte(); 281 if (b == -1) { 282 throw new IOException("Premature end of stream reading size"); 283 } 284 sz |= (b & 0x7f) << index++ * 7; 285 } while (0 != (b & 0x80)); 286 return sz; 287 } 288}