View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   * http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.commons.compress.compressors.lz77support;
20  
21  import java.io.IOException;
22  import java.util.Objects;
23  
24  import org.apache.commons.lang3.ArrayFill;
25  
26  /**
27   * Helper class for compression algorithms that use the ideas of LZ77.
28   *
29   * <p>
30   * Most LZ77 derived algorithms split input data into blocks of uncompressed data (called literal blocks) and back-references (pairs of offsets and lengths)
31   * that state "add {@code length} bytes that are the same as those already written starting {@code offset} bytes before the current position. The details of how
32   * those blocks and back-references are encoded are quite different between the algorithms and some algorithms perform additional steps (Huffman encoding in the
33   * case of DEFLATE for example).
34   * </p>
35   *
36   * <p>
37   * This class attempts to extract the core logic - finding back-references - so it can be re-used. It follows the algorithm explained in section 4 of RFC 1951
38   * (DEFLATE) and currently doesn't implement the "lazy match" optimization. The three-byte hash function used in this class is the same as the one used by zlib
39   * and InfoZIP's ZIP implementation of DEFLATE. The whole class is strongly inspired by InfoZIP's implementation.
40   * </p>
41   *
42   * <p>
43   * LZ77 is used vaguely here (as well as many other places that talk about it :-), LZSS would likely be closer to the truth but LZ77 has become the synonym for
44   * a whole family of algorithms.
45   * </p>
46   *
47   * <p>
48   * The API consists of a compressor that is fed {@code byte}s and emits {@link Block}s to a registered callback where the blocks represent either
49   * {@link LiteralBlock literal blocks}, {@link BackReference back-references} or {@link EOD end of data markers}. In order to ensure the callback receives all
50   * information, the {@code #finish} method must be used once all data has been fed into the compressor.
51   * </p>
52   *
53   * <p>
54   * Several parameters influence the outcome of the "compression":
55   * </p>
56   * <dl>
57   *
58   * <dt>{@code windowSize}</dt>
59   * <dd>the size of the sliding window, must be a power of two - this determines the maximum offset a back-reference can take. The compressor maintains a buffer
60   * of twice of {@code windowSize} - real world values are in the area of 32k.</dd>
61   *
62   * <dt>{@code minBackReferenceLength}</dt>
63   * <dd>Minimal length of a back-reference found. A true minimum of 3 is hard-coded inside of this implementation but bigger lengths can be configured.</dd>
64   *
65   * <dt>{@code maxBackReferenceLength}</dt>
66   * <dd>Maximal length of a back-reference found.</dd>
67   *
68   * <dt>{@code maxOffset}</dt>
69   * <dd>Maximal offset of a back-reference.</dd>
70   *
71   * <dt>{@code maxLiteralLength}</dt>
72   * <dd>Maximal length of a literal block.</dd>
73   * </dl>
74   *
75   * @see "https://tools.ietf.org/html/rfc1951#section-4"
76   * @since 1.14
77   * @NotThreadSafe
78   */
79  public class LZ77Compressor {
80  
81      /**
82       * Represents a back-reference.
83       */
84      public static final class BackReference extends Block {
85          private final int offset, length;
86  
87          public BackReference(final int offset, final int length) {
88              this.offset = offset;
89              this.length = length;
90          }
91  
92          /**
93           * Provides the length of the back-reference.
94           *
95           * @return the length
96           */
97          public int getLength() {
98              return length;
99          }
100 
101         /**
102          * Provides the offset of the back-reference.
103          *
104          * @return the offset
105          */
106         public int getOffset() {
107             return offset;
108         }
109 
110         @Override
111         public BlockType getType() {
112             return BlockType.BACK_REFERENCE;
113         }
114 
115         @Override
116         public String toString() {
117             return "BackReference with offset " + offset + " and length " + length;
118         }
119     }
120 
121     /**
122      * Base class representing blocks the compressor may emit.
123      *
124      * <p>
125      * This class is not supposed to be subclassed by classes outside of Commons Compress so it is considered internal and changed that would break subclasses
126      * may get introduced with future releases.
127      * </p>
128      */
129     public abstract static class Block {
130         /** Enumeration of the block types the compressor may emit. */
131         public enum BlockType {
132             LITERAL, BACK_REFERENCE, EOD
133         }
134 
135         public abstract BlockType getType();
136     }
137 
138     /**
139      * Callback invoked while the compressor processes data.
140      *
141      * <p>
142      * The callback is invoked on the same thread that receives the bytes to compress and may be invoked multiple times during the execution of
143      * {@link #compress} or {@link #finish}.
144      * </p>
145      */
146     public interface Callback {
147         /**
148          * Consumes a block.
149          *
150          * @param b the block to consume
151          * @throws IOException in case of an error
152          */
153         void accept(Block b) throws IOException;
154     }
155 
156     /** A simple "we are done" marker. */
157     public static final class EOD extends Block {
158         @Override
159         public BlockType getType() {
160             return BlockType.EOD;
161         }
162     }
163 
164     /**
165      * Represents a literal block of data.
166      *
167      * <p>
168      * For performance reasons this encapsulates the real data, not a copy of it. Don't modify the data and process it inside of {@link Callback#accept}
169      * immediately as it will get overwritten sooner or later.
170      * </p>
171      */
172     public static final class LiteralBlock extends Block {
173         private final byte[] data;
174         private final int offset, length;
175 
176         public LiteralBlock(final byte[] data, final int offset, final int length) {
177             this.data = data;
178             this.offset = offset;
179             this.length = length;
180         }
181 
182         /**
183          * The literal data.
184          *
185          * <p>
186          * This returns a life view of the actual data in order to avoid copying, modify the array at your own risk.
187          * </p>
188          *
189          * @return the data
190          */
191         public byte[] getData() {
192             return data;
193         }
194 
195         /**
196          * Length of literal block.
197          *
198          * @return the length
199          */
200         public int getLength() {
201             return length;
202         }
203 
204         /**
205          * Offset into data where the literal block starts.
206          *
207          * @return the offset
208          */
209         public int getOffset() {
210             return offset;
211         }
212 
213         @Override
214         public BlockType getType() {
215             return BlockType.LITERAL;
216         }
217 
218         @Override
219         public String toString() {
220             return "LiteralBlock starting at " + offset + " with length " + length;
221         }
222     }
223 
224     private static final Block THE_EOD = new EOD();
225 
226     static final int NUMBER_OF_BYTES_IN_HASH = 3;
227     private static final int NO_MATCH = -1;
228 
229     // we use a 15 bit hash code as calculated in updateHash
230     private static final int HASH_SIZE = 1 << 15;
231     private static final int HASH_MASK = HASH_SIZE - 1;
232 
233     private static final int H_SHIFT = 5;
234     private final Parameters params;
235     private final Callback callback;
236 
237     // the sliding window, twice as big as "windowSize" parameter
238     private final byte[] window;
239 
240     // the head of hash-chain - indexed by hash-code, points to the
241     // location inside of window of the latest sequence of bytes with
242     // the given hash.
243     private final int[] head;
244     // for each window-location points to the latest earlier location
245     // with the same hash. Only stores values for the latest
246     // "windowSize" elements, the index is "window location modulo
247     // windowSize".
248     private final int[] prev;
249     // bit mask used when indexing into prev
250     private final int wMask;
251     private boolean initialized;
252     // the position inside of window that shall be encoded right now
253     private int currentPosition;
254     // the number of bytes available to compress including the one at
255     // currentPosition
256     private int lookahead;
257     // the hash of the three bytes stating at the current position
258     private int insertHash;
259 
260     // the position inside the window where the current literal
261     // block starts (in case we are inside a literal block).
262     private int blockStart;
263 
264     // position of the current match
265     private int matchStart = NO_MATCH;
266 
267     // number of missed insertString calls for the up to three last
268     // bytes of the last match that can only be performed once more
269     // data has been read
270     private int missedInserts;
271 
272     /**
273      * Initializes a compressor with parameters and a callback.
274      *
275      * @param params   the parameters
276      * @param callback the callback
277      * @throws NullPointerException if either parameter is {@code null}
278      */
279     public LZ77Compressor(final Parameters params, final Callback callback) {
280         Objects.requireNonNull(params, "params");
281         Objects.requireNonNull(callback, "callback");
282 
283         this.params = params;
284         this.callback = callback;
285 
286         final int wSize = params.getWindowSize();
287         window = new byte[wSize * 2];
288         wMask = wSize - 1;
289         head = ArrayFill.fill(new int[HASH_SIZE], NO_MATCH);
290         prev = new int[wSize];
291     }
292 
293     private void catchUpMissedInserts() {
294         while (missedInserts > 0) {
295             insertString(currentPosition - missedInserts--);
296         }
297     }
298 
299     private void compress() throws IOException {
300         final int minMatch = params.getMinBackReferenceLength();
301         final boolean lazy = params.getLazyMatching();
302         final int lazyThreshold = params.getLazyMatchingThreshold();
303 
304         while (lookahead >= minMatch) {
305             catchUpMissedInserts();
306             int matchLength = 0;
307             final int hashHead = insertString(currentPosition);
308             if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset()) {
309                 // sets matchStart as a side effect
310                 matchLength = longestMatch(hashHead);
311 
312                 if (lazy && matchLength <= lazyThreshold && lookahead > minMatch) {
313                     // try to find a longer match using the next position
314                     matchLength = longestMatchForNextPosition(matchLength);
315                 }
316             }
317             if (matchLength >= minMatch) {
318                 if (blockStart != currentPosition) {
319                     // emit preceding literal block
320                     flushLiteralBlock();
321                     blockStart = NO_MATCH;
322                 }
323                 flushBackReference(matchLength);
324                 insertStringsInMatch(matchLength);
325                 lookahead -= matchLength;
326                 currentPosition += matchLength;
327                 blockStart = currentPosition;
328             } else {
329                 // no match, append to current or start a new literal
330                 lookahead--;
331                 currentPosition++;
332                 if (currentPosition - blockStart >= params.getMaxLiteralLength()) {
333                     flushLiteralBlock();
334                     blockStart = currentPosition;
335                 }
336             }
337         }
338     }
339 
340     /**
341      * Feeds bytes into the compressor which in turn may emit zero or more blocks to the callback during the execution of this method.
342      *
343      * @param data the data to compress - must not be null
344      * @throws IOException if the callback throws an exception
345      */
346     public void compress(final byte[] data) throws IOException {
347         compress(data, 0, data.length);
348     }
349 
350     /**
351      * Feeds bytes into the compressor which in turn may emit zero or more blocks to the callback during the execution of this method.
352      *
353      * @param data the data to compress - must not be null
354      * @param off  the start offset of the data
355      * @param len  the number of bytes to compress
356      * @throws IOException if the callback throws an exception
357      */
358     public void compress(final byte[] data, int off, int len) throws IOException {
359         final int wSize = params.getWindowSize();
360         while (len > wSize) { // chop into windowSize sized chunks
361             doCompress(data, off, wSize);
362             off += wSize;
363             len -= wSize;
364         }
365         if (len > 0) {
366             doCompress(data, off, len);
367         }
368     }
369 
370     // performs the actual algorithm with the pre-condition len <= windowSize
371     private void doCompress(final byte[] data, final int off, final int len) throws IOException {
372         final int spaceLeft = window.length - currentPosition - lookahead;
373         if (len > spaceLeft) {
374             slide();
375         }
376         System.arraycopy(data, off, window, currentPosition + lookahead, len);
377         lookahead += len;
378         if (!initialized && lookahead >= params.getMinBackReferenceLength()) {
379             initialize();
380         }
381         if (initialized) {
382             compress();
383         }
384     }
385 
386     /**
387      * Tells the compressor to process all remaining data and signal end of data to the callback.
388      *
389      * <p>
390      * The compressor will in turn emit at least one block ({@link EOD}) but potentially multiple blocks to the callback during the execution of this method.
391      * </p>
392      *
393      * @throws IOException if the callback throws an exception
394      */
395     public void finish() throws IOException {
396         if (blockStart != currentPosition || lookahead > 0) {
397             currentPosition += lookahead;
398             flushLiteralBlock();
399         }
400         callback.accept(THE_EOD);
401     }
402 
403     private void flushBackReference(final int matchLength) throws IOException {
404         callback.accept(new BackReference(currentPosition - matchStart, matchLength));
405     }
406 
407     private void flushLiteralBlock() throws IOException {
408         callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart));
409     }
410 
411     private void initialize() {
412         for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) {
413             insertHash = nextHash(insertHash, window[i]);
414         }
415         initialized = true;
416     }
417 
418     /**
419      * Inserts the current three byte sequence into the dictionary and returns the previous head of the hash-chain.
420      *
421      * <p>
422      * Updates {@code insertHash} and {@code prev} as a side effect.
423      * </p>
424      */
425     private int insertString(final int pos) {
426         insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]);
427         final int hashHead = head[insertHash];
428         prev[pos & wMask] = hashHead;
429         head[insertHash] = pos;
430         return hashHead;
431     }
432 
433     private void insertStringsInMatch(final int matchLength) {
434         // inserts strings contained in current match
435         // insertString inserts the byte 2 bytes after position, which may not yet be available -> missedInserts
436         final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH);
437         // currentPosition has been inserted already
438         for (int i = 1; i <= stop; i++) {
439             insertString(currentPosition + i);
440         }
441         missedInserts = matchLength - stop - 1;
442     }
443 
444     /**
445      * Searches the hash chain for real matches and returns the length of the longest match (0 if none were found) that isn't too far away (WRT maxOffset).
446      *
447      * <p>
448      * Sets matchStart to the index of the start position of the longest match as a side effect.
449      * </p>
450      */
451     private int longestMatch(int matchHead) {
452         final int minLength = params.getMinBackReferenceLength();
453         int longestMatchLength = minLength - 1;
454         final int maxPossibleLength = Math.min(params.getMaxBackReferenceLength(), lookahead);
455         final int minIndex = Math.max(0, currentPosition - params.getMaxOffset());
456         final int niceBackReferenceLength = Math.min(maxPossibleLength, params.getNiceBackReferenceLength());
457         final int maxCandidates = params.getMaxCandidates();
458         for (int candidates = 0; candidates < maxCandidates && matchHead >= minIndex; candidates++) {
459             int currentLength = 0;
460             for (int i = 0; i < maxPossibleLength; i++) {
461                 if (window[matchHead + i] != window[currentPosition + i]) {
462                     break;
463                 }
464                 currentLength++;
465             }
466             if (currentLength > longestMatchLength) {
467                 longestMatchLength = currentLength;
468                 matchStart = matchHead;
469                 if (currentLength >= niceBackReferenceLength) {
470                     // no need to search any further
471                     break;
472                 }
473             }
474             matchHead = prev[matchHead & wMask];
475         }
476         return longestMatchLength; // < minLength if no matches have been found, will be ignored in compress()
477     }
478 
479     private int longestMatchForNextPosition(final int prevMatchLength) {
480         // save a bunch of values to restore them if the next match isn't better than the current one
481         final int prevMatchStart = matchStart;
482         final int prevInsertHash = insertHash;
483 
484         lookahead--;
485         currentPosition++;
486         final int hashHead = insertString(currentPosition);
487         final int prevHashHead = prev[currentPosition & wMask];
488         int matchLength = longestMatch(hashHead);
489 
490         if (matchLength <= prevMatchLength) {
491             // use the first match, as the next one isn't any better
492             matchLength = prevMatchLength;
493             matchStart = prevMatchStart;
494 
495             // restore modified values
496             head[insertHash] = prevHashHead;
497             insertHash = prevInsertHash;
498             currentPosition--;
499             lookahead++;
500         }
501         return matchLength;
502     }
503 
504     /**
505      * Assumes we are calculating the hash for three consecutive bytes as a rolling hash, i.e. for bytes ABCD if H is the hash of ABC the new hash for BCD is
506      * nextHash(H, D).
507      *
508      * <p>
509      * The hash is shifted by five bits on each update so all effects of A have been swapped after the third update.
510      * </p>
511      */
512     private int nextHash(final int oldHash, final byte nextByte) {
513         final int nextVal = nextByte & 0xFF;
514         return (oldHash << H_SHIFT ^ nextVal) & HASH_MASK;
515     }
516 
517     /**
518      * Adds some initial data to fill the window with.
519      *
520      * <p>
521      * This is used if the stream has been cut into blocks and back-references of one block may refer to data of the previous block(s). One such example is the
522      * LZ4 frame format using block dependency.
523      * </p>
524      *
525      * @param data the data to fill the window with.
526      * @throws IllegalStateException if the compressor has already started to accept data
527      */
528     public void prefill(final byte[] data) {
529         if (currentPosition != 0 || lookahead != 0) {
530             throw new IllegalStateException("The compressor has already started to accept data, can't prefill anymore");
531         }
532 
533         // don't need more than windowSize for back-references
534         final int len = Math.min(params.getWindowSize(), data.length);
535         System.arraycopy(data, data.length - len, window, 0, len);
536 
537         if (len >= NUMBER_OF_BYTES_IN_HASH) {
538             initialize();
539             final int stop = len - NUMBER_OF_BYTES_IN_HASH + 1;
540             for (int i = 0; i < stop; i++) {
541                 insertString(i);
542             }
543             missedInserts = NUMBER_OF_BYTES_IN_HASH - 1;
544         } else { // not enough data to hash anything
545             missedInserts = len;
546         }
547         blockStart = currentPosition = len;
548     }
549 
550     private void slide() throws IOException {
551         final int wSize = params.getWindowSize();
552         if (blockStart != currentPosition && blockStart < wSize) {
553             flushLiteralBlock();
554             blockStart = currentPosition;
555         }
556         System.arraycopy(window, wSize, window, 0, wSize);
557         currentPosition -= wSize;
558         matchStart -= wSize;
559         blockStart -= wSize;
560         for (int i = 0; i < HASH_SIZE; i++) {
561             final int h = head[i];
562             head[i] = h >= wSize ? h - wSize : NO_MATCH;
563         }
564         for (int i = 0; i < wSize; i++) {
565             final int p = prev[i];
566             prev[i] = p >= wSize ? p - wSize : NO_MATCH;
567         }
568     }
569 }