View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   * http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.commons.compress.compressors.lz4;
20  
21  import java.io.IOException;
22  import java.io.OutputStream;
23  import java.util.Arrays;
24  import java.util.Deque;
25  import java.util.Iterator;
26  import java.util.LinkedList;
27  
28  import org.apache.commons.compress.compressors.CompressorOutputStream;
29  import org.apache.commons.compress.compressors.lz77support.LZ77Compressor;
30  import org.apache.commons.compress.compressors.lz77support.Parameters;
31  import org.apache.commons.compress.utils.ByteUtils;
32  
33  /**
34   * CompressorOutputStream for the LZ4 block format.
35   *
36   * @see <a href="https://lz4.github.io/lz4/lz4_Block_format.html">LZ4 Block Format Description</a>
37   * @since 1.14
38   * @NotThreadSafe
39   */
40  public class BlockLZ4CompressorOutputStream extends CompressorOutputStream<OutputStream> {
41  
42      static final class Pair {
43  
44          private static int lengths(final int litLength, final int brLength) {
45              final int l = Math.min(litLength, 15);
46              final int br = brLength < 4 ? 0 : brLength < 19 ? brLength - 4 : 15;
47              return l << BlockLZ4CompressorInputStream.SIZE_BITS | br;
48          }
49  
50          private static void writeLength(int length, final OutputStream out) throws IOException {
51              while (length >= 255) {
52                  out.write(255);
53                  length -= 255;
54              }
55              out.write(length);
56          }
57  
58          private final Deque<byte[]> literals = new LinkedList<>();
59  
60          private int literalLength;
61  
62          private int brOffset, brLength;
63  
64          private boolean written;
65  
66          byte[] addLiteral(final LZ77Compressor.LiteralBlock block) {
67              final byte[] copy = Arrays.copyOfRange(block.getData(), block.getOffset(), block.getOffset() + block.getLength());
68              literals.add(copy);
69              literalLength += copy.length;
70              return copy;
71          }
72  
73          private int backReferenceLength() {
74              return brLength;
75          }
76  
77          boolean canBeWritten(final int lengthOfBlocksAfterThisPair) {
78              return hasBackReference() && lengthOfBlocksAfterThisPair >= MIN_OFFSET_OF_LAST_BACK_REFERENCE + MIN_BACK_REFERENCE_LENGTH;
79          }
80  
81          boolean hasBackReference() {
82              return brOffset > 0;
83          }
84  
85          private boolean hasBeenWritten() {
86              return written;
87          }
88  
89          int length() {
90              return literalLength() + brLength;
91          }
92  
93          private int literalLength() {
94              // This method is performance sensitive
95              if (literalLength != 0) {
96                  return literalLength;
97              }
98              int length = 0;
99              for (final byte[] b : literals) {
100                 length += b.length;
101             }
102             return literalLength = length;
103         }
104 
105         private void prependLiteral(final byte[] data) {
106             literals.addFirst(data);
107             literalLength += data.length;
108         }
109 
110         private void prependTo(final Pair other) {
111             final Iterator<byte[]> listBackwards = literals.descendingIterator();
112             while (listBackwards.hasNext()) {
113                 other.prependLiteral(listBackwards.next());
114             }
115         }
116 
117         void setBackReference(final LZ77Compressor.BackReference block) {
118             if (hasBackReference()) {
119                 throw new IllegalStateException();
120             }
121             brOffset = block.getOffset();
122             brLength = block.getLength();
123         }
124 
125         private Pair splitWithNewBackReferenceLengthOf(final int newBackReferenceLength) {
126             final Pair p = new Pair();
127             p.literals.addAll(literals);
128             p.brOffset = brOffset;
129             p.brLength = newBackReferenceLength;
130             return p;
131         }
132 
133         void writeTo(final OutputStream out) throws IOException {
134             final int litLength = literalLength();
135             out.write(lengths(litLength, brLength));
136             if (litLength >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) {
137                 writeLength(litLength - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out);
138             }
139             for (final byte[] b : literals) {
140                 out.write(b);
141             }
142             if (hasBackReference()) {
143                 ByteUtils.toLittleEndian(out, brOffset, 2);
144                 if (brLength - MIN_BACK_REFERENCE_LENGTH >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) {
145                     writeLength(brLength - MIN_BACK_REFERENCE_LENGTH - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out);
146                 }
147             }
148             written = true;
149         }
150     }
151 
152     private static final int MIN_BACK_REFERENCE_LENGTH = 4;
153 
154     /*
155      *
156      * The LZ4 block format has a few properties that make it less straight-forward than one would hope:
157      *
158      * literal blocks and back-references must come in pairs (except for the very last literal block), so consecutive literal blocks created by the compressor
159      * must be merged into a single block.
160      *
161      * the start of a literal/back-reference pair contains the length of the back-reference (at least some part of it) so we can't start writing the literal
162      * before we know how long the next back-reference is going to be.
163      *
164      * there are special rules for the final blocks
165      *
166      * > There are specific parsing rules to respect in order to remain > compatible with assumptions made by the decoder : > > 1. The last 5 bytes are always
167      * literals > > 2. The last match must start at least 12 bytes before end of > block. Consequently, a block with less than 13 bytes cannot be > compressed.
168      *
169      * which means any back-reference may need to get rewritten as a literal block unless we know the next block is at least of length 5 and the sum of this
170      * block's length and offset and the next block's length is at least twelve.
171      */
172 
173     private static final int MIN_OFFSET_OF_LAST_BACK_REFERENCE = 12;
174 
175     /**
176      * Returns a builder correctly configured for the LZ4 algorithm.
177      *
178      * @return a builder correctly configured for the LZ4 algorithm
179      */
180     public static Parameters.Builder createParameterBuilder() {
181         final int maxLen = BlockLZ4CompressorInputStream.WINDOW_SIZE - 1;
182         return Parameters.builder(BlockLZ4CompressorInputStream.WINDOW_SIZE).withMinBackReferenceLength(MIN_BACK_REFERENCE_LENGTH)
183                 .withMaxBackReferenceLength(maxLen).withMaxOffset(maxLen).withMaxLiteralLength(maxLen);
184     }
185 
186     private final LZ77Compressor compressor;
187 
188     // used in one-arg write method
189     private final byte[] oneByte = new byte[1];
190     private boolean finished;
191 
192     private final Deque<Pair> pairs = new LinkedList<>();
193 
194     // keeps track of the last window-size bytes (64k) in order to be
195     // able to expand back-references when needed
196     private final Deque<byte[]> expandedBlocks = new LinkedList<>();
197 
198     /**
199      * Creates a new LZ4 output stream.
200      *
201      * @param out An OutputStream to read compressed data from
202      */
203     public BlockLZ4CompressorOutputStream(final OutputStream out) {
204         this(out, createParameterBuilder().build());
205     }
206 
207     /**
208      * Creates a new LZ4 output stream.
209      *
210      * @param out     An OutputStream to read compressed data from
211      * @param params The parameters to use for LZ77 compression.
212      */
213     public BlockLZ4CompressorOutputStream(final OutputStream out, final Parameters params) {
214         super(out);
215         compressor = new LZ77Compressor(params, block -> {
216             switch (block.getType()) {
217             case LITERAL:
218                 addLiteralBlock((LZ77Compressor.LiteralBlock) block);
219                 break;
220             case BACK_REFERENCE:
221                 addBackReference((LZ77Compressor.BackReference) block);
222                 break;
223             case EOD:
224                 writeFinalLiteralBlock();
225                 break;
226             }
227         });
228     }
229 
230     private void addBackReference(final LZ77Compressor.BackReference block) throws IOException {
231         final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength());
232         last.setBackReference(block);
233         recordBackReference(block);
234         clearUnusedBlocksAndPairs();
235     }
236 
237     private void addLiteralBlock(final LZ77Compressor.LiteralBlock block) throws IOException {
238         final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength());
239         recordLiteral(last.addLiteral(block));
240         clearUnusedBlocksAndPairs();
241     }
242 
243     private void clearUnusedBlocks() {
244         int blockLengths = 0;
245         int blocksToKeep = 0;
246         for (final byte[] b : expandedBlocks) {
247             blocksToKeep++;
248             blockLengths += b.length;
249             if (blockLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) {
250                 break;
251             }
252         }
253         final int size = expandedBlocks.size();
254         for (int i = blocksToKeep; i < size; i++) {
255             expandedBlocks.removeLast();
256         }
257     }
258 
259     private void clearUnusedBlocksAndPairs() {
260         clearUnusedBlocks();
261         clearUnusedPairs();
262     }
263 
264     private void clearUnusedPairs() {
265         int pairLengths = 0;
266         int pairsToKeep = 0;
267         for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext();) {
268             final Pair p = it.next();
269             pairsToKeep++;
270             pairLengths += p.length();
271             if (pairLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) {
272                 break;
273             }
274         }
275         final int size = pairs.size();
276         for (int i = pairsToKeep; i < size; i++) {
277             final Pair p = pairs.peekFirst();
278             if (!p.hasBeenWritten()) {
279                 break;
280             }
281             pairs.removeFirst();
282         }
283     }
284 
285     @Override
286     public void close() throws IOException {
287         try {
288             finish();
289         } finally {
290             out.close();
291         }
292     }
293 
294     private byte[] expand(final int offset, final int length) {
295         final byte[] expanded = new byte[length];
296         if (offset == 1) { // surprisingly common special case
297             final byte[] block = expandedBlocks.peekFirst();
298             final byte b = block[block.length - 1];
299             if (b != 0) { // the fresh array contains 0s anyway
300                 Arrays.fill(expanded, b);
301             }
302         } else {
303             expandFromList(expanded, offset, length);
304         }
305         return expanded;
306     }
307 
308     private void expandFromList(final byte[] expanded, final int offset, final int length) {
309         int offsetRemaining = offset;
310         int lengthRemaining = length;
311         int writeOffset = 0;
312         while (lengthRemaining > 0) {
313             // find block that contains offsetRemaining
314             byte[] block = null;
315             final int copyLen;
316             final int copyOffset;
317             if (offsetRemaining > 0) {
318                 int blockOffset = 0;
319                 for (final byte[] b : expandedBlocks) {
320                     if (b.length + blockOffset >= offsetRemaining) {
321                         block = b;
322                         break;
323                     }
324                     blockOffset += b.length;
325                 }
326                 if (block == null) {
327                     // should not be possible
328                     throw new IllegalStateException("Failed to find a block containing offset " + offset);
329                 }
330                 copyOffset = blockOffset + block.length - offsetRemaining;
331                 copyLen = Math.min(lengthRemaining, block.length - copyOffset);
332             } else {
333                 // offsetRemaining is negative or 0 and points into the expanded bytes
334                 block = expanded;
335                 copyOffset = -offsetRemaining;
336                 copyLen = Math.min(lengthRemaining, writeOffset + offsetRemaining);
337             }
338             System.arraycopy(block, copyOffset, expanded, writeOffset, copyLen);
339             offsetRemaining -= copyLen;
340             lengthRemaining -= copyLen;
341             writeOffset += copyLen;
342         }
343     }
344 
345     /**
346      * Compresses all remaining data and writes it to the stream, doesn't close the underlying stream.
347      *
348      * @throws IOException if an error occurs
349      */
350     public void finish() throws IOException {
351         if (!finished) {
352             compressor.finish();
353             finished = true;
354         }
355     }
356 
357     /**
358      * Adds some initial data to fill the window with.
359      *
360      * @param data the data to fill the window with.
361      * @param off  offset of real data into the array
362      * @param len  amount of data
363      * @throws IllegalStateException if the stream has already started to write data
364      * @see LZ77Compressor#prefill
365      */
366     public void prefill(final byte[] data, final int off, final int len) {
367         if (len > 0) {
368             final byte[] b = Arrays.copyOfRange(data, off, off + len);
369             compressor.prefill(b);
370             recordLiteral(b);
371         }
372     }
373 
374     private void recordBackReference(final LZ77Compressor.BackReference block) {
375         expandedBlocks.addFirst(expand(block.getOffset(), block.getLength()));
376     }
377 
378     private void recordLiteral(final byte[] b) {
379         expandedBlocks.addFirst(b);
380     }
381 
382     private void rewriteLastPairs() {
383         final LinkedList<Pair> lastPairs = new LinkedList<>();
384         final LinkedList<Integer> pairLength = new LinkedList<>();
385         int offset = 0;
386         for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext();) {
387             final Pair p = it.next();
388             if (p.hasBeenWritten()) {
389                 break;
390             }
391             final int len = p.length();
392             pairLength.addFirst(len);
393             lastPairs.addFirst(p);
394             offset += len;
395             if (offset >= MIN_OFFSET_OF_LAST_BACK_REFERENCE) {
396                 break;
397             }
398         }
399         lastPairs.forEach(pairs::remove);
400         // lastPairs may contain between one and four Pairs:
401         // * the last pair may be a one byte literal
402         // * all other Pairs contain a back-reference which must be four bytes long at minimum
403         // we could merge them all into a single literal block but
404         // this may harm compression. For example compressing
405         // "bla.tar" from our tests yields a last block containing a
406         // back-reference of length > 2k and we'd end up with a last
407         // literal of that size rather than a 2k back-reference and a
408         // 12 byte literal at the end.
409 
410         // Instead we merge all but the first of lastPairs into a new
411         // literal-only Pair "replacement" and look at the
412         // back-reference in the first of lastPairs and see if we can
413         // split it. We can split it if it is longer than 16 -
414         // replacement.length (i.e. the minimal length of four is kept
415         // while making sure the last literal is at least twelve bytes
416         // long). If we can't split it, we expand the first of the pairs
417         // as well.
418 
419         // this is not optimal, we could get better compression
420         // results with more complex approaches as the last literal
421         // only needs to be five bytes long if the previous
422         // back-reference has an offset big enough
423 
424         final int lastPairsSize = lastPairs.size();
425         int toExpand = 0;
426         for (int i = 1; i < lastPairsSize; i++) {
427             toExpand += pairLength.get(i);
428         }
429         final Pair replacement = new Pair();
430         if (toExpand > 0) {
431             replacement.prependLiteral(expand(toExpand, toExpand));
432         }
433         final Pair splitCandidate = lastPairs.get(0);
434         final int stillNeeded = MIN_OFFSET_OF_LAST_BACK_REFERENCE - toExpand;
435         final int brLen = splitCandidate.hasBackReference() ? splitCandidate.backReferenceLength() : 0;
436         if (splitCandidate.hasBackReference() && brLen >= MIN_BACK_REFERENCE_LENGTH + stillNeeded) {
437             replacement.prependLiteral(expand(toExpand + stillNeeded, stillNeeded));
438             pairs.add(splitCandidate.splitWithNewBackReferenceLengthOf(brLen - stillNeeded));
439         } else {
440             if (splitCandidate.hasBackReference()) {
441                 replacement.prependLiteral(expand(toExpand + brLen, brLen));
442             }
443             splitCandidate.prependTo(replacement);
444         }
445         pairs.add(replacement);
446     }
447 
448     @Override
449     public void write(final byte[] data, final int off, final int len) throws IOException {
450         compressor.compress(data, off, len);
451     }
452 
453     @Override
454     public void write(final int b) throws IOException {
455         oneByte[0] = (byte) (b & 0xff);
456         write(oneByte);
457     }
458 
459     private Pair writeBlocksAndReturnUnfinishedPair(final int length) throws IOException {
460         writeWritablePairs(length);
461         Pair last = pairs.peekLast();
462         if (last == null || last.hasBackReference()) {
463             last = new Pair();
464             pairs.addLast(last);
465         }
466         return last;
467     }
468 
469     private void writeFinalLiteralBlock() throws IOException {
470         rewriteLastPairs();
471         for (final Pair p : pairs) {
472             if (!p.hasBeenWritten()) {
473                 p.writeTo(out);
474             }
475         }
476         pairs.clear();
477     }
478 
479     private void writeWritablePairs(final int lengthOfBlocksAfterLastPair) throws IOException {
480         int unwrittenLength = lengthOfBlocksAfterLastPair;
481         for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext();) {
482             final Pair p = it.next();
483             if (p.hasBeenWritten()) {
484                 break;
485             }
486             unwrittenLength += p.length();
487         }
488         for (final Pair p : pairs) {
489             if (p.hasBeenWritten()) {
490                 continue;
491             }
492             unwrittenLength -= p.length();
493             if (!p.canBeWritten(unwrittenLength)) {
494                 break;
495             }
496             p.writeTo(out);
497         }
498     }
499 }