1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.codec.digest;
19
20 import org.apache.commons.codec.binary.StringUtils;
21
22 /**
23 * Implements the MurmurHash3 32-bit and 128-bit hash functions.
24 *
25 * <p>
26 * MurmurHash is a non-cryptographic hash function suitable for general hash-based lookup. The name comes from two basic
27 * operations, multiply (MU) and rotate (R), used in its inner loop. Unlike cryptographic hash functions, it is not
28 * specifically designed to be difficult to reverse by an adversary, making it unsuitable for cryptographic purposes.
29 * </p>
30 *
31 * <p>
32 * This contains a Java port of the 32-bit hash function {@code MurmurHash3_x86_32} and the 128-bit hash function
33 * {@code MurmurHash3_x64_128} from Austin Appleby's original {@code c++} code in SMHasher.
34 * </p>
35 *
36 * <p>
37 * This is public domain code with no copyrights. From home page of
38 * <a href="https://github.com/aappleby/smhasher">SMHasher</a>:
39 * </p>
40 *
41 * <blockquote> "All MurmurHash versions are public domain software, and the author disclaims all copyright to their
42 * code." </blockquote>
43 *
44 * <p>
45 * Original adaption from Apache Hive. That adaption contains a {@code hash64} method that is not part of the original
46 * MurmurHash3 code. It is not recommended to use these methods. They will be removed in a future release. To obtain a
47 * 64-bit hash use half of the bits from the {@code hash128x64} methods using the input data converted to bytes.
48 * </p>
49 *
50 * @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a>
51 * @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp"> Original MurmurHash3 c++
52 * code</a>
53 * @see <a href=
54 * "https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hive/common/util/Murmur3.java">
55 * Apache Hive Murmer3</a>
56 * @since 1.13
57 */
58 public final class MurmurHash3 {
59
60 /**
61 * Generates 32-bit hash from input bytes. Bytes can be added incrementally and the new
62 * hash computed.
63 *
64 * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
65 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p>
66 *
67 * <p>This implementation contains a sign-extension bug in the finalization step of
68 * any bytes left over from dividing the length by 4. This manifests if any of these
69 * bytes are negative.</p>
70 *
71 * @deprecated Use IncrementalHash32x86. This corrects the processing of trailing bytes.
72 */
73 @Deprecated
74 public static class IncrementalHash32 extends IncrementalHash32x86 {
75
76 /**
77 * {@inheritDoc}
78 *
79 * <p>This implementation contains a sign-extension bug in the finalization step of
80 * any bytes left over from dividing the length by 4. This manifests if any of these
81 * bytes are negative.<p>
82 *
83 * @deprecated Use IncrementalHash32x86. This corrects the processing of trailing bytes.
84 */
85 @Override
86 @Deprecated
87 int finalise(final int hash, final int unprocessedLength, final byte[] unprocessed, final int totalLen) {
88 int result = hash;
89 // ************
90 // Note: This fails to apply masking using 0xff to the 3 remaining bytes.
91 // ************
92 int k1 = 0;
93 switch (unprocessedLength) {
94 case 3:
95 k1 ^= unprocessed[2] << 16;
96 case 2:
97 k1 ^= unprocessed[1] << 8;
98 case 1:
99 k1 ^= unprocessed[0];
100
101 // mix functions
102 k1 *= C1_32;
103 k1 = Integer.rotateLeft(k1, R1_32);
104 k1 *= C2_32;
105 result ^= k1;
106 }
107
108 // finalization
109 result ^= totalLen;
110 return fmix32(result);
111 }
112 }
113
114 /**
115 * Generates 32-bit hash from input bytes. Bytes can be added incrementally and the new
116 * hash computed.
117 *
118 * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
119 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p>
120 *
121 * @since 1.14
122 */
123 public static class IncrementalHash32x86 {
124
125 /** The size of byte blocks that are processed together. */
126 private static final int BLOCK_SIZE = 4;
127
128 /**
129 * Combines the bytes using an Or operation ({@code | } in a little-endian representation
130 * of a 32-bit integer; byte 1 will be the least significant byte, byte 4 the most
131 * significant.
132 *
133 * @param b1 The first byte
134 * @param b2 The second byte
135 * @param b3 The third byte
136 * @param b4 The fourth byte
137 * @return The 32-bit integer
138 */
139 private static int orBytes(final byte b1, final byte b2, final byte b3, final byte b4) {
140 return b1 & 0xff | (b2 & 0xff) << 8 | (b3 & 0xff) << 16 | (b4 & 0xff) << 24;
141 }
142
143 /** Up to 3 unprocessed bytes from input data. */
144 private final byte[] unprocessed = new byte[3];
145
146 /** The number of unprocessed bytes in the tail data. */
147 private int unprocessedLength;
148
149 /** The total number of input bytes added since the start. */
150 private int totalLen;
151
152 /**
153 * The current running hash.
154 * This must be finalised to generate the 32-bit hash value.
155 */
156 private int hash;
157
158 /**
159 * Adds the byte array to the current incremental hash.
160 *
161 * @param data The input byte array
162 * @param offset The offset of data
163 * @param length The length of array
164 */
165 public final void add(final byte[] data, final int offset, final int length) {
166 if (length <= 0) {
167 // Nothing to add
168 return;
169 }
170 totalLen += length;
171
172 // Process the bytes in blocks of 4.
173 // New bytes must be added to any current unprocessed bytes,
174 // then processed in blocks of 4 and the remaining bytes saved:
175 //
176 // |--|---------------------------|--|
177 // unprocessed
178 // main block
179 // remaining
180
181 // Check if the unprocessed bytes and new bytes can fill a block of 4.
182 // Make this overflow safe in the event that length is Integer.MAX_VALUE.
183 // Equivalent to: (unprocessedLength + length < BLOCK_SIZE)
184 if (unprocessedLength + length - BLOCK_SIZE < 0) {
185 // Not enough so add to the unprocessed bytes
186 System.arraycopy(data, offset, unprocessed, unprocessedLength, length);
187 unprocessedLength += length;
188 return;
189 }
190
191 // Combine unprocessed bytes with new bytes.
192 final int newOffset;
193 final int newLength;
194 if (unprocessedLength > 0) {
195 int k = -1;
196 switch (unprocessedLength) {
197 case 1:
198 k = orBytes(unprocessed[0], data[offset], data[offset + 1], data[offset + 2]);
199 break;
200 case 2:
201 k = orBytes(unprocessed[0], unprocessed[1], data[offset], data[offset + 1]);
202 break;
203 case 3:
204 k = orBytes(unprocessed[0], unprocessed[1], unprocessed[2], data[offset]);
205 break;
206 default:
207 throw new IllegalStateException("Unprocessed length should be 1, 2, or 3: " + unprocessedLength);
208 }
209 hash = mix32(k, hash);
210 // Update the offset and length
211 final int consumed = BLOCK_SIZE - unprocessedLength;
212 newOffset = offset + consumed;
213 newLength = length - consumed;
214 } else {
215 newOffset = offset;
216 newLength = length;
217 }
218
219 // Main processing of blocks of 4 bytes
220 final int nblocks = newLength >> 2;
221
222 for (int i = 0; i < nblocks; i++) {
223 final int index = newOffset + (i << 2);
224 final int k = getLittleEndianInt(data, index);
225 hash = mix32(k, hash);
226 }
227
228 // Save left-over unprocessed bytes
229 final int consumed = nblocks << 2;
230 unprocessedLength = newLength - consumed;
231 if (unprocessedLength != 0) {
232 System.arraycopy(data, newOffset + consumed, unprocessed, 0, unprocessedLength);
233 }
234 }
235
236 /**
237 * Generates the 32-bit hash value. Repeat calls to this method with no additional data
238 * will generate the same hash value.
239 *
240 * @return The 32-bit hash
241 */
242 public final int end() {
243 // Allow calling end() again after adding no data to return the same result.
244 return finalise(hash, unprocessedLength, unprocessed, totalLen);
245 }
246
247 /**
248 * Finalizes the running hash to the output 32-bit hash by processing remaining bytes
249 * and performing final mixing.
250 *
251 * @param hash The running hash
252 * @param unprocessedLength The number of unprocessed bytes in the tail data.
253 * @param unprocessed Up to 3 unprocessed bytes from input data.
254 * @param totalLen The total number of input bytes added since the start.
255 * @return The 32-bit hash
256 */
257 int finalise(final int hash, final int unprocessedLength, final byte[] unprocessed, final int totalLen) {
258 int result = hash;
259 int k1 = 0;
260 switch (unprocessedLength) {
261 case 3:
262 k1 ^= (unprocessed[2] & 0xff) << 16;
263 case 2:
264 k1 ^= (unprocessed[1] & 0xff) << 8;
265 case 1:
266 k1 ^= unprocessed[0] & 0xff;
267
268 // mix functions
269 k1 *= C1_32;
270 k1 = Integer.rotateLeft(k1, R1_32);
271 k1 *= C2_32;
272 result ^= k1;
273 }
274
275 // finalization
276 result ^= totalLen;
277 return fmix32(result);
278 }
279
280 /**
281 * Starts a new incremental hash.
282 *
283 * @param seed The initial seed value
284 */
285 public final void start(final int seed) {
286 // Reset
287 unprocessedLength = totalLen = 0;
288 this.hash = seed;
289 }
290 }
291
292 /**
293 * A random number to use for a hash code.
294 *
295 * @deprecated This is not used internally and will be removed in a future release.
296 */
297 @Deprecated
298 public static final long NULL_HASHCODE = 2862933555777941757L;
299 /**
300 * A default seed to use for the murmur hash algorithm.
301 * Has the value {@code 104729}.
302 */
303 public static final int DEFAULT_SEED = 104729;
304 // Constants for 32-bit variant
305 private static final int C1_32 = 0xcc9e2d51;
306 private static final int C2_32 = 0x1b873593;
307 private static final int R1_32 = 15;
308 private static final int R2_32 = 13;
309
310 private static final int M_32 = 5;
311 private static final int N_32 = 0xe6546b64;
312 // Constants for 128-bit variant
313 private static final long C1 = 0x87c37b91114253d5L;
314 private static final long C2 = 0x4cf5ad432745937fL;
315 private static final int R1 = 31;
316 private static final int R2 = 27;
317 private static final int R3 = 33;
318 private static final int M = 5;
319
320 private static final int N1 = 0x52dce729;
321
322 private static final int N2 = 0x38495ab5;
323
324 /**
325 * Performs the final avalanche mix step of the 32-bit hash function {@code MurmurHash3_x86_32}.
326 *
327 * @param hash The current hash
328 * @return The final hash
329 */
330 private static int fmix32(int hash) {
331 hash ^= hash >>> 16;
332 hash *= 0x85ebca6b;
333 hash ^= hash >>> 13;
334 hash *= 0xc2b2ae35;
335 hash ^= hash >>> 16;
336 return hash;
337 }
338
339 /**
340 * Performs the final avalanche mix step of the 64-bit hash function {@code MurmurHash3_x64_128}.
341 *
342 * @param hash The current hash
343 * @return The final hash
344 */
345 private static long fmix64(long hash) {
346 hash ^= hash >>> 33;
347 hash *= 0xff51afd7ed558ccdL;
348 hash ^= hash >>> 33;
349 hash *= 0xc4ceb9fe1a85ec53L;
350 hash ^= hash >>> 33;
351 return hash;
352 }
353
354 /**
355 * Gets the little-endian int from 4 bytes starting at the specified index.
356 *
357 * @param data The data
358 * @param index The index
359 * @return The little-endian int
360 */
361 private static int getLittleEndianInt(final byte[] data, final int index) {
362 return data[index ] & 0xff |
363 (data[index + 1] & 0xff) << 8 |
364 (data[index + 2] & 0xff) << 16 |
365 (data[index + 3] & 0xff) << 24;
366 }
367
368 /**
369 * Gets the little-endian long from 8 bytes starting at the specified index.
370 *
371 * @param data The data
372 * @param index The index
373 * @return The little-endian long
374 */
375 private static long getLittleEndianLong(final byte[] data, final int index) {
376 return (long) data[index ] & 0xff |
377 ((long) data[index + 1] & 0xff) << 8 |
378 ((long) data[index + 2] & 0xff) << 16 |
379 ((long) data[index + 3] & 0xff) << 24 |
380 ((long) data[index + 4] & 0xff) << 32 |
381 ((long) data[index + 5] & 0xff) << 40 |
382 ((long) data[index + 6] & 0xff) << 48 |
383 ((long) data[index + 7] & 0xff) << 56;
384 }
385
386 /**
387 * Generates 128-bit hash from the byte array with a default seed.
388 * This is a helper method that will produce the same result as:
389 *
390 * <pre>
391 * int offset = 0;
392 * int seed = 104729;
393 * int hash = MurmurHash3.hash128(data, offset, data.length, seed);
394 * </pre>
395 *
396 * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect
397 * this result as the default seed is positive.</p>
398 *
399 * @param data The input byte array
400 * @return The 128-bit hash (2 longs)
401 * @see #hash128(byte[], int, int, int)
402 */
403 public static long[] hash128(final byte[] data) {
404 return hash128(data, 0, data.length, DEFAULT_SEED);
405 }
406
407 /**
408 * Generates 128-bit hash from the byte array with the given offset, length and seed.
409 *
410 * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128}
411 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p>
412 *
413 * <p>This implementation contains a sign-extension bug in the seed initialization.
414 * This manifests if the seed is negative.</p>
415 *
416 * @param data The input byte array
417 * @param offset The first element of array
418 * @param length The length of array
419 * @param seed The initial seed value
420 * @return The 128-bit hash (2 longs)
421 * @deprecated Use {@link #hash128x64(byte[], int, int, int)}. This corrects the seed initialization.
422 */
423 @Deprecated
424 public static long[] hash128(final byte[] data, final int offset, final int length, final int seed) {
425 // ************
426 // Note: This deliberately fails to apply masking using 0xffffffffL to the seed
427 // to maintain behavioral compatibility with the original version.
428 // The implicit conversion to a long will extend a negative sign
429 // bit through the upper 32-bits of the long seed. These should be zero.
430 // ************
431 return hash128x64Internal(data, offset, length, seed);
432 }
433
434 /**
435 * Generates 128-bit hash from a string with a default seed.
436 * <p>
437 * Before 1.14 the string was converted using default encoding.
438 * Since 1.14 the string is converted to bytes using UTF-8 encoding.
439 * </p>
440 * <p>
441 * This is a helper method that will produce the same result as:
442 * </p>
443 *
444 * <pre>
445 * int offset = 0;
446 * int seed = 104729;
447 * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
448 * int hash = MurmurHash3.hash128(bytes, offset, bytes.length, seed);
449 * </pre>
450 *
451 * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect
452 * this result as the default seed is positive.</p>
453 *
454 * @param data The input String
455 * @return The 128-bit hash (2 longs)
456 * @see #hash128(byte[], int, int, int)
457 * @deprecated Use {@link #hash128x64(byte[])} using the bytes returned from
458 * {@link String#getBytes(java.nio.charset.Charset)}.
459 */
460 @Deprecated
461 public static long[] hash128(final String data) {
462 final byte[] bytes = StringUtils.getBytesUtf8(data);
463 return hash128(bytes, 0, bytes.length, DEFAULT_SEED);
464 }
465
466 /**
467 * Generates 128-bit hash from the byte array with a seed of zero.
468 * This is a helper method that will produce the same result as:
469 *
470 * <pre>
471 * int offset = 0;
472 * int seed = 0;
473 * int hash = MurmurHash3.hash128x64(data, offset, data.length, seed);
474 * </pre>
475 *
476 * @param data The input byte array
477 * @return The 128-bit hash (2 longs)
478 * @see #hash128x64(byte[], int, int, int)
479 * @since 1.14
480 */
481 public static long[] hash128x64(final byte[] data) {
482 return hash128x64(data, 0, data.length, 0);
483 }
484
485 /**
486 * Generates 128-bit hash from the byte array with the given offset, length and seed.
487 *
488 * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128}
489 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p>
490 *
491 * @param data The input byte array
492 * @param offset The first element of array
493 * @param length The length of array
494 * @param seed The initial seed value
495 * @return The 128-bit hash (2 longs)
496 * @since 1.14
497 */
498 public static long[] hash128x64(final byte[] data, final int offset, final int length, final int seed) {
499 // Use an unsigned 32-bit integer as the seed
500 return hash128x64Internal(data, offset, length, seed & 0xffffffffL);
501 }
502
503 /**
504 * Generates 128-bit hash from the byte array with the given offset, length and seed.
505 *
506 * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128}
507 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p>
508 *
509 * @param data The input byte array
510 * @param offset The first element of array
511 * @param length The length of array
512 * @param seed The initial seed value
513 * @return The 128-bit hash (2 longs)
514 */
515 private static long[] hash128x64Internal(final byte[] data, final int offset, final int length, final long seed) {
516 long h1 = seed;
517 long h2 = seed;
518 final int nblocks = length >> 4;
519
520 // body
521 for (int i = 0; i < nblocks; i++) {
522 final int index = offset + (i << 4);
523 long k1 = getLittleEndianLong(data, index);
524 long k2 = getLittleEndianLong(data, index + 8);
525
526 // mix functions for k1
527 k1 *= C1;
528 k1 = Long.rotateLeft(k1, R1);
529 k1 *= C2;
530 h1 ^= k1;
531 h1 = Long.rotateLeft(h1, R2);
532 h1 += h2;
533 h1 = h1 * M + N1;
534
535 // mix functions for k2
536 k2 *= C2;
537 k2 = Long.rotateLeft(k2, R3);
538 k2 *= C1;
539 h2 ^= k2;
540 h2 = Long.rotateLeft(h2, R1);
541 h2 += h1;
542 h2 = h2 * M + N2;
543 }
544
545 // tail
546 long k1 = 0;
547 long k2 = 0;
548 final int index = offset + (nblocks << 4);
549 switch (offset + length - index) {
550 case 15:
551 k2 ^= ((long) data[index + 14] & 0xff) << 48;
552 case 14:
553 k2 ^= ((long) data[index + 13] & 0xff) << 40;
554 case 13:
555 k2 ^= ((long) data[index + 12] & 0xff) << 32;
556 case 12:
557 k2 ^= ((long) data[index + 11] & 0xff) << 24;
558 case 11:
559 k2 ^= ((long) data[index + 10] & 0xff) << 16;
560 case 10:
561 k2 ^= ((long) data[index + 9] & 0xff) << 8;
562 case 9:
563 k2 ^= data[index + 8] & 0xff;
564 k2 *= C2;
565 k2 = Long.rotateLeft(k2, R3);
566 k2 *= C1;
567 h2 ^= k2;
568
569 case 8:
570 k1 ^= ((long) data[index + 7] & 0xff) << 56;
571 case 7:
572 k1 ^= ((long) data[index + 6] & 0xff) << 48;
573 case 6:
574 k1 ^= ((long) data[index + 5] & 0xff) << 40;
575 case 5:
576 k1 ^= ((long) data[index + 4] & 0xff) << 32;
577 case 4:
578 k1 ^= ((long) data[index + 3] & 0xff) << 24;
579 case 3:
580 k1 ^= ((long) data[index + 2] & 0xff) << 16;
581 case 2:
582 k1 ^= ((long) data[index + 1] & 0xff) << 8;
583 case 1:
584 k1 ^= data[index] & 0xff;
585 k1 *= C1;
586 k1 = Long.rotateLeft(k1, R1);
587 k1 *= C2;
588 h1 ^= k1;
589 }
590
591 // finalization
592 h1 ^= length;
593 h2 ^= length;
594
595 h1 += h2;
596 h2 += h1;
597
598 h1 = fmix64(h1);
599 h2 = fmix64(h2);
600
601 h1 += h2;
602 h2 += h1;
603
604 return new long[] { h1, h2 };
605 }
606
607 /**
608 * Generates 32-bit hash from the byte array with a default seed.
609 * This is a helper method that will produce the same result as:
610 *
611 * <pre>
612 * int offset = 0;
613 * int seed = 104729;
614 * int hash = MurmurHash3.hash32(data, offset, data.length, seed);
615 * </pre>
616 *
617 * <p>This implementation contains a sign-extension bug in the finalization step of
618 * any bytes left over from dividing the length by 4. This manifests if any of these
619 * bytes are negative.</p>
620 *
621 * @param data The input byte array
622 * @return The 32-bit hash
623 * @see #hash32(byte[], int, int, int)
624 * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
625 */
626 @Deprecated
627 public static int hash32(final byte[] data) {
628 return hash32(data, 0, data.length, DEFAULT_SEED);
629 }
630
631 /**
632 * Generates 32-bit hash from the byte array with the given length and a default seed.
633 * This is a helper method that will produce the same result as:
634 *
635 * <pre>
636 * int offset = 0;
637 * int seed = 104729;
638 * int hash = MurmurHash3.hash32(data, offset, length, seed);
639 * </pre>
640 *
641 * <p>This implementation contains a sign-extension bug in the finalization step of
642 * any bytes left over from dividing the length by 4. This manifests if any of these
643 * bytes are negative.</p>
644 *
645 * @param data The input byte array
646 * @param length The length of array
647 * @return The 32-bit hash
648 * @see #hash32(byte[], int, int, int)
649 * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
650 */
651 @Deprecated
652 public static int hash32(final byte[] data, final int length) {
653 return hash32(data, length, DEFAULT_SEED);
654 }
655
656 /**
657 * Generates 32-bit hash from the byte array with the given length and seed. This is a
658 * helper method that will produce the same result as:
659 *
660 * <pre>
661 * int offset = 0;
662 * int hash = MurmurHash3.hash32(data, offset, length, seed);
663 * </pre>
664 *
665 * <p>This implementation contains a sign-extension bug in the finalization step of
666 * any bytes left over from dividing the length by 4. This manifests if any of these
667 * bytes are negative.</p>
668 *
669 * @param data The input byte array
670 * @param length The length of array
671 * @param seed The initial seed value
672 * @return The 32-bit hash
673 * @see #hash32(byte[], int, int, int)
674 * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
675 */
676 @Deprecated
677 public static int hash32(final byte[] data, final int length, final int seed) {
678 return hash32(data, 0, length, seed);
679 }
680
681 /**
682 * Generates 32-bit hash from the byte array with the given offset, length and seed.
683 *
684 * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
685 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p>
686 *
687 * <p>This implementation contains a sign-extension bug in the finalization step of
688 * any bytes left over from dividing the length by 4. This manifests if any of these
689 * bytes are negative.</p>
690 *
691 * @param data The input byte array
692 * @param offset The offset of data
693 * @param length The length of array
694 * @param seed The initial seed value
695 * @return The 32-bit hash
696 * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
697 */
698 @Deprecated
699 public static int hash32(final byte[] data, final int offset, final int length, final int seed) {
700 int hash = seed;
701 final int nblocks = length >> 2;
702
703 // body
704 for (int i = 0; i < nblocks; i++) {
705 final int index = offset + (i << 2);
706 final int k = getLittleEndianInt(data, index);
707 hash = mix32(k, hash);
708 }
709
710 // tail
711 // ************
712 // Note: This fails to apply masking using 0xff to the 3 remaining bytes.
713 // ************
714 final int index = offset + (nblocks << 2);
715 int k1 = 0;
716 switch (offset + length - index) {
717 case 3:
718 k1 ^= data[index + 2] << 16;
719 case 2:
720 k1 ^= data[index + 1] << 8;
721 case 1:
722 k1 ^= data[index];
723
724 // mix functions
725 k1 *= C1_32;
726 k1 = Integer.rotateLeft(k1, R1_32);
727 k1 *= C2_32;
728 hash ^= k1;
729 }
730
731 hash ^= length;
732 return fmix32(hash);
733 }
734
735 /**
736 * Generates 32-bit hash from a long with a default seed value.
737 * This is a helper method that will produce the same result as:
738 *
739 * <pre>
740 * int offset = 0;
741 * int seed = 104729;
742 * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(8)
743 * .putLong(data)
744 * .array(), offset, 8, seed);
745 * </pre>
746 *
747 * @param data The long to hash
748 * @return The 32-bit hash
749 * @see #hash32x86(byte[], int, int, int)
750 */
751 public static int hash32(final long data) {
752 return hash32(data, DEFAULT_SEED);
753 }
754
755 /**
756 * Generates 32-bit hash from a long with the given seed.
757 * This is a helper method that will produce the same result as:
758 *
759 * <pre>
760 * int offset = 0;
761 * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(8)
762 * .putLong(data)
763 * .array(), offset, 8, seed);
764 * </pre>
765 *
766 * @param data The long to hash
767 * @param seed The initial seed value
768 * @return The 32-bit hash
769 * @see #hash32x86(byte[], int, int, int)
770 */
771 public static int hash32(final long data, final int seed) {
772 int hash = seed;
773 final long r0 = Long.reverseBytes(data);
774
775 hash = mix32((int) r0, hash);
776 hash = mix32((int) (r0 >>> 32), hash);
777
778 hash ^= Long.BYTES;
779 return fmix32(hash);
780 }
781
782 /**
783 * Generates 32-bit hash from two longs with a default seed value.
784 * This is a helper method that will produce the same result as:
785 *
786 * <pre>
787 * int offset = 0;
788 * int seed = 104729;
789 * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(16)
790 * .putLong(data1)
791 * .putLong(data2)
792 * .array(), offset, 16, seed);
793 * </pre>
794 *
795 * @param data1 The first long to hash
796 * @param data2 The second long to hash
797 * @return The 32-bit hash
798 * @see #hash32x86(byte[], int, int, int)
799 */
800 public static int hash32(final long data1, final long data2) {
801 return hash32(data1, data2, DEFAULT_SEED);
802 }
803
804 /**
805 * Generates 32-bit hash from two longs with the given seed.
806 * This is a helper method that will produce the same result as:
807 *
808 * <pre>
809 * int offset = 0;
810 * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(16)
811 * .putLong(data1)
812 * .putLong(data2)
813 * .array(), offset, 16, seed);
814 * </pre>
815 *
816 * @param data1 The first long to hash
817 * @param data2 The second long to hash
818 * @param seed The initial seed value
819 * @return The 32-bit hash
820 * @see #hash32x86(byte[], int, int, int)
821 */
822 public static int hash32(final long data1, final long data2, final int seed) {
823 int hash = seed;
824 final long r0 = Long.reverseBytes(data1);
825 final long r1 = Long.reverseBytes(data2);
826
827 hash = mix32((int) r0, hash);
828 hash = mix32((int) (r0 >>> 32), hash);
829 hash = mix32((int) r1, hash);
830 hash = mix32((int) (r1 >>> 32), hash);
831
832 hash ^= Long.BYTES * 2;
833 return fmix32(hash);
834 }
835
836 /**
837 * Generates 32-bit hash from a string with a default seed.
838 * <p>
839 * Before 1.14 the string was converted using default encoding.
840 * Since 1.14 the string is converted to bytes using UTF-8 encoding.
841 * </p>
842 * This is a helper method that will produce the same result as:
843 *
844 * <pre>
845 * int offset = 0;
846 * int seed = 104729;
847 * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
848 * int hash = MurmurHash3.hash32(bytes, offset, bytes.length, seed);
849 * </pre>
850 *
851 * <p>This implementation contains a sign-extension bug in the finalization step of
852 * any bytes left over from dividing the length by 4. This manifests if any of these
853 * bytes are negative.</p>
854 *
855 * @param data The input string
856 * @return The 32-bit hash
857 * @see #hash32(byte[], int, int, int)
858 * @deprecated Use {@link #hash32x86(byte[], int, int, int)} with the bytes returned from
859 * {@link String#getBytes(java.nio.charset.Charset)}. This corrects the processing of trailing bytes.
860 */
861 @Deprecated
862 public static int hash32(final String data) {
863 final byte[] bytes = StringUtils.getBytesUtf8(data);
864 return hash32(bytes, 0, bytes.length, DEFAULT_SEED);
865 }
866
867 /**
868 * Generates 32-bit hash from the byte array with a seed of zero.
869 * This is a helper method that will produce the same result as:
870 *
871 * <pre>
872 * int offset = 0;
873 * int seed = 0;
874 * int hash = MurmurHash3.hash32x86(data, offset, data.length, seed);
875 * </pre>
876 *
877 * @param data The input byte array
878 * @return The 32-bit hash
879 * @see #hash32x86(byte[], int, int, int)
880 * @since 1.14
881 */
882 public static int hash32x86(final byte[] data) {
883 return hash32x86(data, 0, data.length, 0);
884 }
885
886 /**
887 * Generates 32-bit hash from the byte array with the given offset, length and seed.
888 *
889 * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
890 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p>
891 *
892 * @param data The input byte array
893 * @param offset The offset of data
894 * @param length The length of array
895 * @param seed The initial seed value
896 * @return The 32-bit hash
897 * @since 1.14
898 */
899 public static int hash32x86(final byte[] data, final int offset, final int length, final int seed) {
900 int hash = seed;
901 final int nblocks = length >> 2;
902
903 // body
904 for (int i = 0; i < nblocks; i++) {
905 final int index = offset + (i << 2);
906 final int k = getLittleEndianInt(data, index);
907 hash = mix32(k, hash);
908 }
909
910 // tail
911 final int index = offset + (nblocks << 2);
912 int k1 = 0;
913 switch (offset + length - index) {
914 case 3:
915 k1 ^= (data[index + 2] & 0xff) << 16;
916 case 2:
917 k1 ^= (data[index + 1] & 0xff) << 8;
918 case 1:
919 k1 ^= data[index] & 0xff;
920
921 // mix functions
922 k1 *= C1_32;
923 k1 = Integer.rotateLeft(k1, R1_32);
924 k1 *= C2_32;
925 hash ^= k1;
926 }
927
928 hash ^= length;
929 return fmix32(hash);
930 }
931
932 /**
933 * Generates 64-bit hash from a byte array with a default seed.
934 *
935 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p>
936 *
937 * <p>This is a Murmur3-like 64-bit variant.
938 * The method does not produce the same result as either half of the hash bytes from
939 * {@linkplain #hash128x64(byte[])} with the same byte data.
940 * This method will be removed in a future release.</p>
941 *
942 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect
943 * this result as the default seed is positive.</p>
944 *
945 * <p>This is a helper method that will produce the same result as:</p>
946 *
947 * <pre>
948 * int offset = 0;
949 * int seed = 104729;
950 * long hash = MurmurHash3.hash64(data, offset, data.length, seed);
951 * </pre>
952 *
953 * @param data The input byte array
954 * @return The 64-bit hash
955 * @see #hash64(byte[], int, int, int)
956 * @deprecated Not part of the MurmurHash3 implementation.
957 * Use half of the hash bytes from {@link #hash128x64(byte[])}.
958 */
959 @Deprecated
960 public static long hash64(final byte[] data) {
961 return hash64(data, 0, data.length, DEFAULT_SEED);
962 }
963
964 /**
965 * Generates 64-bit hash from a byte array with the given offset and length and a default seed.
966 *
967 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p>
968 *
969 * <p>This is a Murmur3-like 64-bit variant.
970 * The method does not produce the same result as either half of the hash bytes from
971 * {@linkplain #hash128x64(byte[])} with the same byte data.
972 * This method will be removed in a future release.</p>
973 *
974 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect
975 * this result as the default seed is positive.</p>
976 *
977 * <p>This is a helper method that will produce the same result as:</p>
978 *
979 * <pre>
980 * int seed = 104729;
981 * long hash = MurmurHash3.hash64(data, offset, length, seed);
982 * </pre>
983 *
984 * @param data The input byte array
985 * @param offset The offset of data
986 * @param length The length of array
987 * @return The 64-bit hash
988 * @see #hash64(byte[], int, int, int)
989 * @deprecated Not part of the MurmurHash3 implementation.
990 * Use half of the hash bytes from {@link #hash128x64(byte[], int, int, int)}.
991 */
992 @Deprecated
993 public static long hash64(final byte[] data, final int offset, final int length) {
994 return hash64(data, offset, length, DEFAULT_SEED);
995 }
996
997 /**
998 * Generates 64-bit hash from a byte array with the given offset, length and seed.
999 *
1000 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p>
1001 *
1002 * <p>This is a Murmur3-like 64-bit variant.
1003 * This method will be removed in a future release.</p>
1004 *
1005 * <p>This implementation contains a sign-extension bug in the seed initialization.
1006 * This manifests if the seed is negative.</p>
1007 *
1008 * <p>This algorithm processes 8 bytes chunks of data in a manner similar to the 16 byte chunks
1009 * of data processed in the MurmurHash3 {@code MurmurHash3_x64_128} method. However the hash
1010 * is not mixed with a hash chunk from the next 8 bytes of data. The method will not return
1011 * the same value as the first or second 64-bits of the function
1012 * {@link #hash128(byte[], int, int, int)}.</p>
1013 *
1014 * <p>Use of this method is not advised. Use the first long returned from
1015 * {@link #hash128x64(byte[], int, int, int)}.</p>
1016 *
1017 * @param data The input byte array
1018 * @param offset The offset of data
1019 * @param length The length of array
1020 * @param seed The initial seed value
1021 * @return The 64-bit hash
1022 * @deprecated Not part of the MurmurHash3 implementation.
1023 * Use half of the hash bytes from {@link #hash128x64(byte[], int, int, int)}.
1024 */
1025 @Deprecated
1026 public static long hash64(final byte[] data, final int offset, final int length, final int seed) {
1027 //
1028 // Note: This fails to apply masking using 0xffffffffL to the seed.
1029 //
1030 long hash = seed;
1031 final int nblocks = length >> 3;
1032
1033 // body
1034 for (int i = 0; i < nblocks; i++) {
1035 final int index = offset + (i << 3);
1036 long k = getLittleEndianLong(data, index);
1037
1038 // mix functions
1039 k *= C1;
1040 k = Long.rotateLeft(k, R1);
1041 k *= C2;
1042 hash ^= k;
1043 hash = Long.rotateLeft(hash, R2) * M + N1;
1044 }
1045
1046 // tail
1047 long k1 = 0;
1048 final int index = offset + (nblocks << 3);
1049 switch (offset + length - index) {
1050 case 7:
1051 k1 ^= ((long) data[index + 6] & 0xff) << 48;
1052 case 6:
1053 k1 ^= ((long) data[index + 5] & 0xff) << 40;
1054 case 5:
1055 k1 ^= ((long) data[index + 4] & 0xff) << 32;
1056 case 4:
1057 k1 ^= ((long) data[index + 3] & 0xff) << 24;
1058 case 3:
1059 k1 ^= ((long) data[index + 2] & 0xff) << 16;
1060 case 2:
1061 k1 ^= ((long) data[index + 1] & 0xff) << 8;
1062 case 1:
1063 k1 ^= (long) data[index] & 0xff;
1064 k1 *= C1;
1065 k1 = Long.rotateLeft(k1, R1);
1066 k1 *= C2;
1067 hash ^= k1;
1068 }
1069
1070 // finalization
1071 hash ^= length;
1072 return fmix64(hash);
1073 }
1074
1075 /**
1076 * Generates 64-bit hash from an int with a default seed.
1077 *
1078 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p>
1079 *
1080 * <p>This is a Murmur3-like 64-bit variant.
1081 * The method does not produce the same result as either half of the hash bytes from
1082 * {@linkplain #hash128x64(byte[])} with the same byte data from the {@code int}.
1083 * This method will be removed in a future release.</p>
1084 *
1085 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect
1086 * this result as the default seed is positive.</p>
1087 *
1088 * <p>This is a helper method that will produce the same result as:</p>
1089 *
1090 * <pre>
1091 * int offset = 0;
1092 * int seed = 104729;
1093 * long hash = MurmurHash3.hash64(ByteBuffer.allocate(4)
1094 * .putInt(data)
1095 * .array(), offset, 4, seed);
1096 * </pre>
1097 *
1098 * @param data The int to hash
1099 * @return The 64-bit hash
1100 * @see #hash64(byte[], int, int, int)
1101 * @deprecated Not part of the MurmurHash3 implementation.
1102 * Use half of the hash bytes from {@link #hash128x64(byte[])} with the bytes from the {@code int}.
1103 */
1104 @Deprecated
1105 public static long hash64(final int data) {
1106 long k1 = Integer.reverseBytes(data) & -1L >>> 32;
1107 long hash = DEFAULT_SEED;
1108 k1 *= C1;
1109 k1 = Long.rotateLeft(k1, R1);
1110 k1 *= C2;
1111 hash ^= k1;
1112 // finalization
1113 hash ^= Integer.BYTES;
1114 return fmix64(hash);
1115 }
1116
1117 /**
1118 * Generates 64-bit hash from a long with a default seed.
1119 *
1120 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p>
1121 *
1122 * <p>This is a Murmur3-like 64-bit variant.
1123 * The method does not produce the same result as either half of the hash bytes from
1124 * {@linkplain #hash128x64(byte[])} with the same byte data from the {@code long}.
1125 * This method will be removed in a future release.</p>
1126 *
1127 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect
1128 * this result as the default seed is positive.</p>
1129 *
1130 * <p>This is a helper method that will produce the same result as:</p>
1131 *
1132 * <pre>
1133 * int offset = 0;
1134 * int seed = 104729;
1135 * long hash = MurmurHash3.hash64(ByteBuffer.allocate(8)
1136 * .putLong(data)
1137 * .array(), offset, 8, seed);
1138 * </pre>
1139 *
1140 * @param data The long to hash
1141 * @return The 64-bit hash
1142 * @see #hash64(byte[], int, int, int)
1143 * @deprecated Not part of the MurmurHash3 implementation.
1144 * Use half of the hash bytes from {@link #hash128x64(byte[])} with the bytes from the {@code long}.
1145 */
1146 @Deprecated
1147 public static long hash64(final long data) {
1148 long hash = DEFAULT_SEED;
1149 long k = Long.reverseBytes(data);
1150 // mix functions
1151 k *= C1;
1152 k = Long.rotateLeft(k, R1);
1153 k *= C2;
1154 hash ^= k;
1155 hash = Long.rotateLeft(hash, R2) * M + N1;
1156 // finalization
1157 hash ^= Long.BYTES;
1158 return fmix64(hash);
1159 }
1160
1161 /**
1162 * Generates 64-bit hash from a short with a default seed.
1163 *
1164 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p>
1165 *
1166 * <p>This is a Murmur3-like 64-bit variant.
1167 * The method does not produce the same result as either half of the hash bytes from
1168 * {@linkplain #hash128x64(byte[])} with the same byte data from the {@code short}.
1169 * This method will be removed in a future release.</p>
1170 *
1171 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect
1172 * this result as the default seed is positive.</p>
1173 *
1174 * <p>This is a helper method that will produce the same result as:</p>
1175 *
1176 * <pre>
1177 * int offset = 0;
1178 * int seed = 104729;
1179 * long hash = MurmurHash3.hash64(ByteBuffer.allocate(2)
1180 * .putShort(data)
1181 * .array(), offset, 2, seed);
1182 * </pre>
1183 *
1184 * @param data The short to hash
1185 * @return The 64-bit hash
1186 * @see #hash64(byte[], int, int, int)
1187 * @deprecated Not part of the MurmurHash3 implementation.
1188 * Use half of the hash bytes from {@link #hash128x64(byte[])} with the bytes from the {@code short}.
1189 */
1190 @Deprecated
1191 public static long hash64(final short data) {
1192 long hash = DEFAULT_SEED;
1193 long k1 = 0;
1194 k1 ^= ((long) data & 0xff) << 8;
1195 k1 ^= (long) ((data & 0xFF00) >> 8) & 0xff;
1196 k1 *= C1;
1197 k1 = Long.rotateLeft(k1, R1);
1198 k1 *= C2;
1199 hash ^= k1;
1200
1201 // finalization
1202 hash ^= Short.BYTES;
1203 return fmix64(hash);
1204 }
1205
1206 /**
1207 * Performs the intermediate mix step of the 32-bit hash function {@code MurmurHash3_x86_32}.
1208 *
1209 * @param k The data to add to the hash
1210 * @param hash The current hash
1211 * @return The new hash
1212 */
1213 private static int mix32(int k, int hash) {
1214 k *= C1_32;
1215 k = Integer.rotateLeft(k, R1_32);
1216 k *= C2_32;
1217 hash ^= k;
1218 return Integer.rotateLeft(hash, R2_32) * M_32 + N_32;
1219 }
1220
1221 /** No instance methods. */
1222 private MurmurHash3() {
1223 }
1224 }