1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.codec.digest;
19
20 import org.apache.commons.codec.binary.StringUtils;
21
22 /**
23 * Implements the MurmurHash3 32-bit and 128-bit hash functions.
24 *
25 * <p>
26 * MurmurHash is a non-cryptographic hash function suitable for general hash-based lookup. The name comes from two basic
27 * operations, multiply (MU) and rotate (R), used in its inner loop. Unlike cryptographic hash functions, it is not
28 * specifically designed to be difficult to reverse by an adversary, making it unsuitable for cryptographic purposes.
29 * </p>
30 *
31 * <p>
32 * This contains a Java port of the 32-bit hash function {@code MurmurHash3_x86_32} and the 128-bit hash function
33 * {@code MurmurHash3_x64_128} from Austin Appleby's original {@code c++} code in SMHasher.
34 * </p>
35 *
36 * <p>
37 * This is public domain code with no copyrights. From home page of
38 * <a href="https://github.com/aappleby/smhasher">SMHasher</a>:
39 * </p>
40 *
41 * <blockquote> "All MurmurHash versions are public domain software, and the author disclaims all copyright to their
42 * code." </blockquote>
43 *
44 * <p>
45 * Original adaption from Apache Hive. That adaption contains a {@code hash64} method that is not part of the original
46 * MurmurHash3 code. It is not recommended to use these methods. They will be removed in a future release. To obtain a
47 * 64-bit hash use half of the bits from the {@code hash128x64} methods using the input data converted to bytes.
48 * </p>
49 *
50 * @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a>
51 * @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp"> Original MurmurHash3 c++
52 * code</a>
53 * @see <a href=
54 * "https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hive/common/util/Murmur3.java">
55 * Apache Hive Murmer3</a>
56 * @since 1.13
57 */
58 public final class MurmurHash3 {
59
60 /**
61 * Generates 32-bit hash from input bytes. Bytes can be added incrementally and the new
62 * hash computed.
63 *
64 * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
65 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p>
66 *
67 * <p>This implementation contains a sign-extension bug in the finalization step of
68 * any bytes left over from dividing the length by 4. This manifests if any of these
69 * bytes are negative.</p>
70 *
71 * @deprecated Use IncrementalHash32x86. This corrects the processing of trailing bytes.
72 */
73 @Deprecated
74 public static class IncrementalHash32 extends IncrementalHash32x86 {
75
76 /**
77 * Constructs a new instance.
78 */
79 public IncrementalHash32() {
80 // empty
81 }
82
83 /**
84 * {@inheritDoc}
85 *
86 * <p>This implementation contains a sign-extension bug in the finalization step of
87 * any bytes left over from dividing the length by 4. This manifests if any of these
88 * bytes are negative.<p>
89 *
90 * @deprecated Use IncrementalHash32x86. This corrects the processing of trailing bytes.
91 */
92 @Override
93 @Deprecated
94 int finalise(final int hash, final int unprocessedLength, final byte[] unprocessed, final int totalLen) {
95 int result = hash;
96 // ************
97 // Note: This fails to apply masking using 0xff to the 3 remaining bytes.
98 // ************
99 int k1 = 0;
100 switch (unprocessedLength) {
101 case 3:
102 k1 ^= unprocessed[2] << 16;
103 case 2:
104 k1 ^= unprocessed[1] << 8;
105 case 1:
106 k1 ^= unprocessed[0];
107
108 // mix functions
109 k1 *= C1_32;
110 k1 = Integer.rotateLeft(k1, R1_32);
111 k1 *= C2_32;
112 result ^= k1;
113 }
114
115 // finalization
116 result ^= totalLen;
117 return fmix32(result);
118 }
119 }
120
121 /**
122 * Generates 32-bit hash from input bytes. Bytes can be added incrementally and the new
123 * hash computed.
124 *
125 * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
126 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p>
127 *
128 * @since 1.14
129 */
130 public static class IncrementalHash32x86 {
131
132 /** The size of byte blocks that are processed together. */
133 private static final int BLOCK_SIZE = 4;
134
135 /**
136 * Combines the bytes using an Or operation ({@code | } in a little-endian representation
137 * of a 32-bit integer; byte 1 will be the least significant byte, byte 4 the most
138 * significant.
139 *
140 * @param b1 The first byte
141 * @param b2 The second byte
142 * @param b3 The third byte
143 * @param b4 The fourth byte
144 * @return The 32-bit integer
145 */
146 private static int orBytes(final byte b1, final byte b2, final byte b3, final byte b4) {
147 return b1 & 0xff | (b2 & 0xff) << 8 | (b3 & 0xff) << 16 | (b4 & 0xff) << 24;
148 }
149
150 /** Up to 3 unprocessed bytes from input data. */
151 private final byte[] unprocessed = new byte[3];
152
153 /** The number of unprocessed bytes in the tail data. */
154 private int unprocessedLength;
155
156 /** The total number of input bytes added since the start. */
157 private int totalLen;
158
159 /**
160 * The current running hash.
161 * This must be finalised to generate the 32-bit hash value.
162 */
163 private int hash;
164
165 /**
166 * Constructs a new instance.
167 */
168 public IncrementalHash32x86() {
169 // empty
170 }
171
172 /**
173 * Adds the byte array to the current incremental hash.
174 *
175 * @param data The input byte array
176 * @param offset The offset of data
177 * @param length The length of array
178 */
179 public final void add(final byte[] data, final int offset, final int length) {
180 if (length <= 0) {
181 // Nothing to add
182 return;
183 }
184 totalLen += length;
185
186 // Process the bytes in blocks of 4.
187 // New bytes must be added to any current unprocessed bytes,
188 // then processed in blocks of 4 and the remaining bytes saved:
189 //
190 // |--|---------------------------|--|
191 // unprocessed
192 // main block
193 // remaining
194
195 // Check if the unprocessed bytes and new bytes can fill a block of 4.
196 // Make this overflow safe in the event that length is Integer.MAX_VALUE.
197 // Equivalent to: (unprocessedLength + length < BLOCK_SIZE)
198 if (unprocessedLength + length - BLOCK_SIZE < 0) {
199 // Not enough so add to the unprocessed bytes
200 System.arraycopy(data, offset, unprocessed, unprocessedLength, length);
201 unprocessedLength += length;
202 return;
203 }
204
205 // Combine unprocessed bytes with new bytes.
206 final int newOffset;
207 final int newLength;
208 if (unprocessedLength > 0) {
209 int k = -1;
210 switch (unprocessedLength) {
211 case 1:
212 k = orBytes(unprocessed[0], data[offset], data[offset + 1], data[offset + 2]);
213 break;
214 case 2:
215 k = orBytes(unprocessed[0], unprocessed[1], data[offset], data[offset + 1]);
216 break;
217 case 3:
218 k = orBytes(unprocessed[0], unprocessed[1], unprocessed[2], data[offset]);
219 break;
220 default:
221 throw new IllegalStateException("Unprocessed length should be 1, 2, or 3: " + unprocessedLength);
222 }
223 hash = mix32(k, hash);
224 // Update the offset and length
225 final int consumed = BLOCK_SIZE - unprocessedLength;
226 newOffset = offset + consumed;
227 newLength = length - consumed;
228 } else {
229 newOffset = offset;
230 newLength = length;
231 }
232
233 // Main processing of blocks of 4 bytes
234 final int nblocks = newLength >> 2;
235
236 for (int i = 0; i < nblocks; i++) {
237 final int index = newOffset + (i << 2);
238 final int k = getLittleEndianInt(data, index);
239 hash = mix32(k, hash);
240 }
241
242 // Save left-over unprocessed bytes
243 final int consumed = nblocks << 2;
244 unprocessedLength = newLength - consumed;
245 if (unprocessedLength != 0) {
246 System.arraycopy(data, newOffset + consumed, unprocessed, 0, unprocessedLength);
247 }
248 }
249
250 /**
251 * Generates the 32-bit hash value. Repeat calls to this method with no additional data
252 * will generate the same hash value.
253 *
254 * @return The 32-bit hash
255 */
256 public final int end() {
257 // Allow calling end() again after adding no data to return the same result.
258 return finalise(hash, unprocessedLength, unprocessed, totalLen);
259 }
260
261 /**
262 * Finalizes the running hash to the output 32-bit hash by processing remaining bytes
263 * and performing final mixing.
264 *
265 * @param hash The running hash
266 * @param unprocessedLength The number of unprocessed bytes in the tail data.
267 * @param unprocessed Up to 3 unprocessed bytes from input data.
268 * @param totalLen The total number of input bytes added since the start.
269 * @return The 32-bit hash
270 */
271 int finalise(final int hash, final int unprocessedLength, final byte[] unprocessed, final int totalLen) {
272 int result = hash;
273 int k1 = 0;
274 switch (unprocessedLength) {
275 case 3:
276 k1 ^= (unprocessed[2] & 0xff) << 16;
277 case 2:
278 k1 ^= (unprocessed[1] & 0xff) << 8;
279 case 1:
280 k1 ^= unprocessed[0] & 0xff;
281
282 // mix functions
283 k1 *= C1_32;
284 k1 = Integer.rotateLeft(k1, R1_32);
285 k1 *= C2_32;
286 result ^= k1;
287 }
288
289 // finalization
290 result ^= totalLen;
291 return fmix32(result);
292 }
293
294 /**
295 * Starts a new incremental hash.
296 *
297 * @param seed The initial seed value
298 */
299 public final void start(final int seed) {
300 // Reset
301 unprocessedLength = totalLen = 0;
302 this.hash = seed;
303 }
304 }
305
306 /**
307 * A random number to use for a hash code.
308 *
309 * @deprecated This is not used internally and will be removed in a future release.
310 */
311 @Deprecated
312 public static final long NULL_HASHCODE = 2862933555777941757L;
313 /**
314 * A default seed to use for the murmur hash algorithm.
315 * Has the value {@code 104729}.
316 */
317 public static final int DEFAULT_SEED = 104729;
318 // Constants for 32-bit variant
319 private static final int C1_32 = 0xcc9e2d51;
320 private static final int C2_32 = 0x1b873593;
321 private static final int R1_32 = 15;
322 private static final int R2_32 = 13;
323
324 private static final int M_32 = 5;
325 private static final int N_32 = 0xe6546b64;
326 // Constants for 128-bit variant
327 private static final long C1 = 0x87c37b91114253d5L;
328 private static final long C2 = 0x4cf5ad432745937fL;
329 private static final int R1 = 31;
330 private static final int R2 = 27;
331 private static final int R3 = 33;
332 private static final int M = 5;
333
334 private static final int N1 = 0x52dce729;
335
336 private static final int N2 = 0x38495ab5;
337
338 /**
339 * Performs the final avalanche mix step of the 32-bit hash function {@code MurmurHash3_x86_32}.
340 *
341 * @param hash The current hash
342 * @return The final hash
343 */
344 private static int fmix32(int hash) {
345 hash ^= hash >>> 16;
346 hash *= 0x85ebca6b;
347 hash ^= hash >>> 13;
348 hash *= 0xc2b2ae35;
349 hash ^= hash >>> 16;
350 return hash;
351 }
352
353 /**
354 * Performs the final avalanche mix step of the 64-bit hash function {@code MurmurHash3_x64_128}.
355 *
356 * @param hash The current hash
357 * @return The final hash
358 */
359 private static long fmix64(long hash) {
360 hash ^= hash >>> 33;
361 hash *= 0xff51afd7ed558ccdL;
362 hash ^= hash >>> 33;
363 hash *= 0xc4ceb9fe1a85ec53L;
364 hash ^= hash >>> 33;
365 return hash;
366 }
367
368 /**
369 * Gets the little-endian int from 4 bytes starting at the specified index.
370 *
371 * @param data The data
372 * @param index The index
373 * @return The little-endian int
374 */
375 private static int getLittleEndianInt(final byte[] data, final int index) {
376 return data[index ] & 0xff |
377 (data[index + 1] & 0xff) << 8 |
378 (data[index + 2] & 0xff) << 16 |
379 (data[index + 3] & 0xff) << 24;
380 }
381
382 /**
383 * Gets the little-endian long from 8 bytes starting at the specified index.
384 *
385 * @param data The data
386 * @param index The index
387 * @return The little-endian long
388 */
389 private static long getLittleEndianLong(final byte[] data, final int index) {
390 return (long) data[index ] & 0xff |
391 ((long) data[index + 1] & 0xff) << 8 |
392 ((long) data[index + 2] & 0xff) << 16 |
393 ((long) data[index + 3] & 0xff) << 24 |
394 ((long) data[index + 4] & 0xff) << 32 |
395 ((long) data[index + 5] & 0xff) << 40 |
396 ((long) data[index + 6] & 0xff) << 48 |
397 ((long) data[index + 7] & 0xff) << 56;
398 }
399
400 /**
401 * Generates 128-bit hash from the byte array with a default seed.
402 * This is a helper method that will produce the same result as:
403 *
404 * <pre>
405 * int offset = 0;
406 * int seed = 104729;
407 * int hash = MurmurHash3.hash128(data, offset, data.length, seed);
408 * </pre>
409 *
410 * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect
411 * this result as the default seed is positive.</p>
412 *
413 * @param data The input byte array
414 * @return The 128-bit hash (2 longs)
415 * @see #hash128(byte[], int, int, int)
416 */
417 public static long[] hash128(final byte[] data) {
418 return hash128(data, 0, data.length, DEFAULT_SEED);
419 }
420
421 /**
422 * Generates 128-bit hash from the byte array with the given offset, length and seed.
423 *
424 * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128}
425 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p>
426 *
427 * <p>This implementation contains a sign-extension bug in the seed initialization.
428 * This manifests if the seed is negative.</p>
429 *
430 * @param data The input byte array
431 * @param offset The first element of array
432 * @param length The length of array
433 * @param seed The initial seed value
434 * @return The 128-bit hash (2 longs)
435 * @deprecated Use {@link #hash128x64(byte[], int, int, int)}. This corrects the seed initialization.
436 */
437 @Deprecated
438 public static long[] hash128(final byte[] data, final int offset, final int length, final int seed) {
439 // ************
440 // Note: This deliberately fails to apply masking using 0xffffffffL to the seed
441 // to maintain behavioral compatibility with the original version.
442 // The implicit conversion to a long will extend a negative sign
443 // bit through the upper 32-bits of the long seed. These should be zero.
444 // ************
445 return hash128x64Internal(data, offset, length, seed);
446 }
447
448 /**
449 * Generates 128-bit hash from a string with a default seed.
450 * <p>
451 * Before 1.14 the string was converted using default encoding.
452 * Since 1.14 the string is converted to bytes using UTF-8 encoding.
453 * </p>
454 * <p>
455 * This is a helper method that will produce the same result as:
456 * </p>
457 *
458 * <pre>
459 * int offset = 0;
460 * int seed = 104729;
461 * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
462 * int hash = MurmurHash3.hash128(bytes, offset, bytes.length, seed);
463 * </pre>
464 *
465 * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect
466 * this result as the default seed is positive.</p>
467 *
468 * @param data The input String
469 * @return The 128-bit hash (2 longs)
470 * @see #hash128(byte[], int, int, int)
471 * @deprecated Use {@link #hash128x64(byte[])} using the bytes returned from
472 * {@link String#getBytes(java.nio.charset.Charset)}.
473 */
474 @Deprecated
475 public static long[] hash128(final String data) {
476 final byte[] bytes = StringUtils.getBytesUtf8(data);
477 return hash128(bytes, 0, bytes.length, DEFAULT_SEED);
478 }
479
480 /**
481 * Generates 128-bit hash from the byte array with a seed of zero.
482 * This is a helper method that will produce the same result as:
483 *
484 * <pre>
485 * int offset = 0;
486 * int seed = 0;
487 * int hash = MurmurHash3.hash128x64(data, offset, data.length, seed);
488 * </pre>
489 *
490 * @param data The input byte array
491 * @return The 128-bit hash (2 longs)
492 * @see #hash128x64(byte[], int, int, int)
493 * @since 1.14
494 */
495 public static long[] hash128x64(final byte[] data) {
496 return hash128x64(data, 0, data.length, 0);
497 }
498
499 /**
500 * Generates 128-bit hash from the byte array with the given offset, length and seed.
501 *
502 * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128}
503 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p>
504 *
505 * @param data The input byte array
506 * @param offset The first element of array
507 * @param length The length of array
508 * @param seed The initial seed value
509 * @return The 128-bit hash (2 longs)
510 * @since 1.14
511 */
512 public static long[] hash128x64(final byte[] data, final int offset, final int length, final int seed) {
513 // Use an unsigned 32-bit integer as the seed
514 return hash128x64Internal(data, offset, length, seed & 0xffffffffL);
515 }
516
517 /**
518 * Generates 128-bit hash from the byte array with the given offset, length and seed.
519 *
520 * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128}
521 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p>
522 *
523 * @param data The input byte array
524 * @param offset The first element of array
525 * @param length The length of array
526 * @param seed The initial seed value
527 * @return The 128-bit hash (2 longs)
528 */
529 private static long[] hash128x64Internal(final byte[] data, final int offset, final int length, final long seed) {
530 long h1 = seed;
531 long h2 = seed;
532 final int nblocks = length >> 4;
533
534 // body
535 for (int i = 0; i < nblocks; i++) {
536 final int index = offset + (i << 4);
537 long k1 = getLittleEndianLong(data, index);
538 long k2 = getLittleEndianLong(data, index + 8);
539
540 // mix functions for k1
541 k1 *= C1;
542 k1 = Long.rotateLeft(k1, R1);
543 k1 *= C2;
544 h1 ^= k1;
545 h1 = Long.rotateLeft(h1, R2);
546 h1 += h2;
547 h1 = h1 * M + N1;
548
549 // mix functions for k2
550 k2 *= C2;
551 k2 = Long.rotateLeft(k2, R3);
552 k2 *= C1;
553 h2 ^= k2;
554 h2 = Long.rotateLeft(h2, R1);
555 h2 += h1;
556 h2 = h2 * M + N2;
557 }
558
559 // tail
560 long k1 = 0;
561 long k2 = 0;
562 final int index = offset + (nblocks << 4);
563 switch (offset + length - index) {
564 case 15:
565 k2 ^= ((long) data[index + 14] & 0xff) << 48;
566 case 14:
567 k2 ^= ((long) data[index + 13] & 0xff) << 40;
568 case 13:
569 k2 ^= ((long) data[index + 12] & 0xff) << 32;
570 case 12:
571 k2 ^= ((long) data[index + 11] & 0xff) << 24;
572 case 11:
573 k2 ^= ((long) data[index + 10] & 0xff) << 16;
574 case 10:
575 k2 ^= ((long) data[index + 9] & 0xff) << 8;
576 case 9:
577 k2 ^= data[index + 8] & 0xff;
578 k2 *= C2;
579 k2 = Long.rotateLeft(k2, R3);
580 k2 *= C1;
581 h2 ^= k2;
582
583 case 8:
584 k1 ^= ((long) data[index + 7] & 0xff) << 56;
585 case 7:
586 k1 ^= ((long) data[index + 6] & 0xff) << 48;
587 case 6:
588 k1 ^= ((long) data[index + 5] & 0xff) << 40;
589 case 5:
590 k1 ^= ((long) data[index + 4] & 0xff) << 32;
591 case 4:
592 k1 ^= ((long) data[index + 3] & 0xff) << 24;
593 case 3:
594 k1 ^= ((long) data[index + 2] & 0xff) << 16;
595 case 2:
596 k1 ^= ((long) data[index + 1] & 0xff) << 8;
597 case 1:
598 k1 ^= data[index] & 0xff;
599 k1 *= C1;
600 k1 = Long.rotateLeft(k1, R1);
601 k1 *= C2;
602 h1 ^= k1;
603 }
604
605 // finalization
606 h1 ^= length;
607 h2 ^= length;
608
609 h1 += h2;
610 h2 += h1;
611
612 h1 = fmix64(h1);
613 h2 = fmix64(h2);
614
615 h1 += h2;
616 h2 += h1;
617
618 return new long[] { h1, h2 };
619 }
620
621 /**
622 * Generates 32-bit hash from the byte array with a default seed.
623 * This is a helper method that will produce the same result as:
624 *
625 * <pre>
626 * int offset = 0;
627 * int seed = 104729;
628 * int hash = MurmurHash3.hash32(data, offset, data.length, seed);
629 * </pre>
630 *
631 * <p>This implementation contains a sign-extension bug in the finalization step of
632 * any bytes left over from dividing the length by 4. This manifests if any of these
633 * bytes are negative.</p>
634 *
635 * @param data The input byte array
636 * @return The 32-bit hash
637 * @see #hash32(byte[], int, int, int)
638 * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
639 */
640 @Deprecated
641 public static int hash32(final byte[] data) {
642 return hash32(data, 0, data.length, DEFAULT_SEED);
643 }
644
645 /**
646 * Generates 32-bit hash from the byte array with the given length and a default seed.
647 * This is a helper method that will produce the same result as:
648 *
649 * <pre>
650 * int offset = 0;
651 * int seed = 104729;
652 * int hash = MurmurHash3.hash32(data, offset, length, seed);
653 * </pre>
654 *
655 * <p>This implementation contains a sign-extension bug in the finalization step of
656 * any bytes left over from dividing the length by 4. This manifests if any of these
657 * bytes are negative.</p>
658 *
659 * @param data The input byte array
660 * @param length The length of array
661 * @return The 32-bit hash
662 * @see #hash32(byte[], int, int, int)
663 * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
664 */
665 @Deprecated
666 public static int hash32(final byte[] data, final int length) {
667 return hash32(data, length, DEFAULT_SEED);
668 }
669
670 /**
671 * Generates 32-bit hash from the byte array with the given length and seed. This is a
672 * helper method that will produce the same result as:
673 *
674 * <pre>
675 * int offset = 0;
676 * int hash = MurmurHash3.hash32(data, offset, length, seed);
677 * </pre>
678 *
679 * <p>This implementation contains a sign-extension bug in the finalization step of
680 * any bytes left over from dividing the length by 4. This manifests if any of these
681 * bytes are negative.</p>
682 *
683 * @param data The input byte array
684 * @param length The length of array
685 * @param seed The initial seed value
686 * @return The 32-bit hash
687 * @see #hash32(byte[], int, int, int)
688 * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
689 */
690 @Deprecated
691 public static int hash32(final byte[] data, final int length, final int seed) {
692 return hash32(data, 0, length, seed);
693 }
694
695 /**
696 * Generates 32-bit hash from the byte array with the given offset, length and seed.
697 *
698 * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
699 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p>
700 *
701 * <p>This implementation contains a sign-extension bug in the finalization step of
702 * any bytes left over from dividing the length by 4. This manifests if any of these
703 * bytes are negative.</p>
704 *
705 * @param data The input byte array
706 * @param offset The offset of data
707 * @param length The length of array
708 * @param seed The initial seed value
709 * @return The 32-bit hash
710 * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
711 */
712 @Deprecated
713 public static int hash32(final byte[] data, final int offset, final int length, final int seed) {
714 int hash = seed;
715 final int nblocks = length >> 2;
716
717 // body
718 for (int i = 0; i < nblocks; i++) {
719 final int index = offset + (i << 2);
720 final int k = getLittleEndianInt(data, index);
721 hash = mix32(k, hash);
722 }
723
724 // tail
725 // ************
726 // Note: This fails to apply masking using 0xff to the 3 remaining bytes.
727 // ************
728 final int index = offset + (nblocks << 2);
729 int k1 = 0;
730 switch (offset + length - index) {
731 case 3:
732 k1 ^= data[index + 2] << 16;
733 case 2:
734 k1 ^= data[index + 1] << 8;
735 case 1:
736 k1 ^= data[index];
737
738 // mix functions
739 k1 *= C1_32;
740 k1 = Integer.rotateLeft(k1, R1_32);
741 k1 *= C2_32;
742 hash ^= k1;
743 }
744
745 hash ^= length;
746 return fmix32(hash);
747 }
748
749 /**
750 * Generates 32-bit hash from a long with a default seed value.
751 * This is a helper method that will produce the same result as:
752 *
753 * <pre>
754 * int offset = 0;
755 * int seed = 104729;
756 * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(8)
757 * .putLong(data)
758 * .array(), offset, 8, seed);
759 * </pre>
760 *
761 * @param data The long to hash
762 * @return The 32-bit hash
763 * @see #hash32x86(byte[], int, int, int)
764 */
765 public static int hash32(final long data) {
766 return hash32(data, DEFAULT_SEED);
767 }
768
769 /**
770 * Generates 32-bit hash from a long with the given seed.
771 * This is a helper method that will produce the same result as:
772 *
773 * <pre>
774 * int offset = 0;
775 * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(8)
776 * .putLong(data)
777 * .array(), offset, 8, seed);
778 * </pre>
779 *
780 * @param data The long to hash
781 * @param seed The initial seed value
782 * @return The 32-bit hash
783 * @see #hash32x86(byte[], int, int, int)
784 */
785 public static int hash32(final long data, final int seed) {
786 int hash = seed;
787 final long r0 = Long.reverseBytes(data);
788
789 hash = mix32((int) r0, hash);
790 hash = mix32((int) (r0 >>> 32), hash);
791
792 hash ^= Long.BYTES;
793 return fmix32(hash);
794 }
795
796 /**
797 * Generates 32-bit hash from two longs with a default seed value.
798 * This is a helper method that will produce the same result as:
799 *
800 * <pre>
801 * int offset = 0;
802 * int seed = 104729;
803 * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(16)
804 * .putLong(data1)
805 * .putLong(data2)
806 * .array(), offset, 16, seed);
807 * </pre>
808 *
809 * @param data1 The first long to hash
810 * @param data2 The second long to hash
811 * @return The 32-bit hash
812 * @see #hash32x86(byte[], int, int, int)
813 */
814 public static int hash32(final long data1, final long data2) {
815 return hash32(data1, data2, DEFAULT_SEED);
816 }
817
818 /**
819 * Generates 32-bit hash from two longs with the given seed.
820 * This is a helper method that will produce the same result as:
821 *
822 * <pre>
823 * int offset = 0;
824 * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(16)
825 * .putLong(data1)
826 * .putLong(data2)
827 * .array(), offset, 16, seed);
828 * </pre>
829 *
830 * @param data1 The first long to hash
831 * @param data2 The second long to hash
832 * @param seed The initial seed value
833 * @return The 32-bit hash
834 * @see #hash32x86(byte[], int, int, int)
835 */
836 public static int hash32(final long data1, final long data2, final int seed) {
837 int hash = seed;
838 final long r0 = Long.reverseBytes(data1);
839 final long r1 = Long.reverseBytes(data2);
840
841 hash = mix32((int) r0, hash);
842 hash = mix32((int) (r0 >>> 32), hash);
843 hash = mix32((int) r1, hash);
844 hash = mix32((int) (r1 >>> 32), hash);
845
846 hash ^= Long.BYTES * 2;
847 return fmix32(hash);
848 }
849
850 /**
851 * Generates 32-bit hash from a string with a default seed.
852 * <p>
853 * Before 1.14 the string was converted using default encoding.
854 * Since 1.14 the string is converted to bytes using UTF-8 encoding.
855 * </p>
856 * This is a helper method that will produce the same result as:
857 *
858 * <pre>
859 * int offset = 0;
860 * int seed = 104729;
861 * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
862 * int hash = MurmurHash3.hash32(bytes, offset, bytes.length, seed);
863 * </pre>
864 *
865 * <p>This implementation contains a sign-extension bug in the finalization step of
866 * any bytes left over from dividing the length by 4. This manifests if any of these
867 * bytes are negative.</p>
868 *
869 * @param data The input string
870 * @return The 32-bit hash
871 * @see #hash32(byte[], int, int, int)
872 * @deprecated Use {@link #hash32x86(byte[], int, int, int)} with the bytes returned from
873 * {@link String#getBytes(java.nio.charset.Charset)}. This corrects the processing of trailing bytes.
874 */
875 @Deprecated
876 public static int hash32(final String data) {
877 final byte[] bytes = StringUtils.getBytesUtf8(data);
878 return hash32(bytes, 0, bytes.length, DEFAULT_SEED);
879 }
880
881 /**
882 * Generates 32-bit hash from the byte array with a seed of zero.
883 * This is a helper method that will produce the same result as:
884 *
885 * <pre>
886 * int offset = 0;
887 * int seed = 0;
888 * int hash = MurmurHash3.hash32x86(data, offset, data.length, seed);
889 * </pre>
890 *
891 * @param data The input byte array
892 * @return The 32-bit hash
893 * @see #hash32x86(byte[], int, int, int)
894 * @since 1.14
895 */
896 public static int hash32x86(final byte[] data) {
897 return hash32x86(data, 0, data.length, 0);
898 }
899
900 /**
901 * Generates 32-bit hash from the byte array with the given offset, length and seed.
902 *
903 * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
904 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p>
905 *
906 * @param data The input byte array
907 * @param offset The offset of data
908 * @param length The length of array
909 * @param seed The initial seed value
910 * @return The 32-bit hash
911 * @since 1.14
912 */
913 public static int hash32x86(final byte[] data, final int offset, final int length, final int seed) {
914 int hash = seed;
915 final int nblocks = length >> 2;
916
917 // body
918 for (int i = 0; i < nblocks; i++) {
919 final int index = offset + (i << 2);
920 final int k = getLittleEndianInt(data, index);
921 hash = mix32(k, hash);
922 }
923
924 // tail
925 final int index = offset + (nblocks << 2);
926 int k1 = 0;
927 switch (offset + length - index) {
928 case 3:
929 k1 ^= (data[index + 2] & 0xff) << 16;
930 case 2:
931 k1 ^= (data[index + 1] & 0xff) << 8;
932 case 1:
933 k1 ^= data[index] & 0xff;
934
935 // mix functions
936 k1 *= C1_32;
937 k1 = Integer.rotateLeft(k1, R1_32);
938 k1 *= C2_32;
939 hash ^= k1;
940 }
941
942 hash ^= length;
943 return fmix32(hash);
944 }
945
946 /**
947 * Generates 64-bit hash from a byte array with a default seed.
948 *
949 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p>
950 *
951 * <p>This is a Murmur3-like 64-bit variant.
952 * The method does not produce the same result as either half of the hash bytes from
953 * {@linkplain #hash128x64(byte[])} with the same byte data.
954 * This method will be removed in a future release.</p>
955 *
956 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect
957 * this result as the default seed is positive.</p>
958 *
959 * <p>This is a helper method that will produce the same result as:</p>
960 *
961 * <pre>
962 * int offset = 0;
963 * int seed = 104729;
964 * long hash = MurmurHash3.hash64(data, offset, data.length, seed);
965 * </pre>
966 *
967 * @param data The input byte array
968 * @return The 64-bit hash
969 * @see #hash64(byte[], int, int, int)
970 * @deprecated Not part of the MurmurHash3 implementation.
971 * Use half of the hash bytes from {@link #hash128x64(byte[])}.
972 */
973 @Deprecated
974 public static long hash64(final byte[] data) {
975 return hash64(data, 0, data.length, DEFAULT_SEED);
976 }
977
978 /**
979 * Generates 64-bit hash from a byte array with the given offset and length and a default seed.
980 *
981 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p>
982 *
983 * <p>This is a Murmur3-like 64-bit variant.
984 * The method does not produce the same result as either half of the hash bytes from
985 * {@linkplain #hash128x64(byte[])} with the same byte data.
986 * This method will be removed in a future release.</p>
987 *
988 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect
989 * this result as the default seed is positive.</p>
990 *
991 * <p>This is a helper method that will produce the same result as:</p>
992 *
993 * <pre>
994 * int seed = 104729;
995 * long hash = MurmurHash3.hash64(data, offset, length, seed);
996 * </pre>
997 *
998 * @param data The input byte array
999 * @param offset The offset of data
1000 * @param length The length of array
1001 * @return The 64-bit hash
1002 * @see #hash64(byte[], int, int, int)
1003 * @deprecated Not part of the MurmurHash3 implementation.
1004 * Use half of the hash bytes from {@link #hash128x64(byte[], int, int, int)}.
1005 */
1006 @Deprecated
1007 public static long hash64(final byte[] data, final int offset, final int length) {
1008 return hash64(data, offset, length, DEFAULT_SEED);
1009 }
1010
1011 /**
1012 * Generates 64-bit hash from a byte array with the given offset, length and seed.
1013 *
1014 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p>
1015 *
1016 * <p>This is a Murmur3-like 64-bit variant.
1017 * This method will be removed in a future release.</p>
1018 *
1019 * <p>This implementation contains a sign-extension bug in the seed initialization.
1020 * This manifests if the seed is negative.</p>
1021 *
1022 * <p>This algorithm processes 8 bytes chunks of data in a manner similar to the 16 byte chunks
1023 * of data processed in the MurmurHash3 {@code MurmurHash3_x64_128} method. However the hash
1024 * is not mixed with a hash chunk from the next 8 bytes of data. The method will not return
1025 * the same value as the first or second 64-bits of the function
1026 * {@link #hash128(byte[], int, int, int)}.</p>
1027 *
1028 * <p>Use of this method is not advised. Use the first long returned from
1029 * {@link #hash128x64(byte[], int, int, int)}.</p>
1030 *
1031 * @param data The input byte array
1032 * @param offset The offset of data
1033 * @param length The length of array
1034 * @param seed The initial seed value
1035 * @return The 64-bit hash
1036 * @deprecated Not part of the MurmurHash3 implementation.
1037 * Use half of the hash bytes from {@link #hash128x64(byte[], int, int, int)}.
1038 */
1039 @Deprecated
1040 public static long hash64(final byte[] data, final int offset, final int length, final int seed) {
1041 //
1042 // Note: This fails to apply masking using 0xffffffffL to the seed.
1043 //
1044 long hash = seed;
1045 final int nblocks = length >> 3;
1046
1047 // body
1048 for (int i = 0; i < nblocks; i++) {
1049 final int index = offset + (i << 3);
1050 long k = getLittleEndianLong(data, index);
1051
1052 // mix functions
1053 k *= C1;
1054 k = Long.rotateLeft(k, R1);
1055 k *= C2;
1056 hash ^= k;
1057 hash = Long.rotateLeft(hash, R2) * M + N1;
1058 }
1059
1060 // tail
1061 long k1 = 0;
1062 final int index = offset + (nblocks << 3);
1063 switch (offset + length - index) {
1064 case 7:
1065 k1 ^= ((long) data[index + 6] & 0xff) << 48;
1066 case 6:
1067 k1 ^= ((long) data[index + 5] & 0xff) << 40;
1068 case 5:
1069 k1 ^= ((long) data[index + 4] & 0xff) << 32;
1070 case 4:
1071 k1 ^= ((long) data[index + 3] & 0xff) << 24;
1072 case 3:
1073 k1 ^= ((long) data[index + 2] & 0xff) << 16;
1074 case 2:
1075 k1 ^= ((long) data[index + 1] & 0xff) << 8;
1076 case 1:
1077 k1 ^= (long) data[index] & 0xff;
1078 k1 *= C1;
1079 k1 = Long.rotateLeft(k1, R1);
1080 k1 *= C2;
1081 hash ^= k1;
1082 }
1083
1084 // finalization
1085 hash ^= length;
1086 return fmix64(hash);
1087 }
1088
1089 /**
1090 * Generates 64-bit hash from an int with a default seed.
1091 *
1092 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p>
1093 *
1094 * <p>This is a Murmur3-like 64-bit variant.
1095 * The method does not produce the same result as either half of the hash bytes from
1096 * {@linkplain #hash128x64(byte[])} with the same byte data from the {@code int}.
1097 * This method will be removed in a future release.</p>
1098 *
1099 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect
1100 * this result as the default seed is positive.</p>
1101 *
1102 * <p>This is a helper method that will produce the same result as:</p>
1103 *
1104 * <pre>
1105 * int offset = 0;
1106 * int seed = 104729;
1107 * long hash = MurmurHash3.hash64(ByteBuffer.allocate(4)
1108 * .putInt(data)
1109 * .array(), offset, 4, seed);
1110 * </pre>
1111 *
1112 * @param data The int to hash
1113 * @return The 64-bit hash
1114 * @see #hash64(byte[], int, int, int)
1115 * @deprecated Not part of the MurmurHash3 implementation.
1116 * Use half of the hash bytes from {@link #hash128x64(byte[])} with the bytes from the {@code int}.
1117 */
1118 @Deprecated
1119 public static long hash64(final int data) {
1120 long k1 = Integer.reverseBytes(data) & -1L >>> 32;
1121 long hash = DEFAULT_SEED;
1122 k1 *= C1;
1123 k1 = Long.rotateLeft(k1, R1);
1124 k1 *= C2;
1125 hash ^= k1;
1126 // finalization
1127 hash ^= Integer.BYTES;
1128 return fmix64(hash);
1129 }
1130
1131 /**
1132 * Generates 64-bit hash from a long with a default seed.
1133 *
1134 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p>
1135 *
1136 * <p>This is a Murmur3-like 64-bit variant.
1137 * The method does not produce the same result as either half of the hash bytes from
1138 * {@linkplain #hash128x64(byte[])} with the same byte data from the {@code long}.
1139 * This method will be removed in a future release.</p>
1140 *
1141 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect
1142 * this result as the default seed is positive.</p>
1143 *
1144 * <p>This is a helper method that will produce the same result as:</p>
1145 *
1146 * <pre>
1147 * int offset = 0;
1148 * int seed = 104729;
1149 * long hash = MurmurHash3.hash64(ByteBuffer.allocate(8)
1150 * .putLong(data)
1151 * .array(), offset, 8, seed);
1152 * </pre>
1153 *
1154 * @param data The long to hash
1155 * @return The 64-bit hash
1156 * @see #hash64(byte[], int, int, int)
1157 * @deprecated Not part of the MurmurHash3 implementation.
1158 * Use half of the hash bytes from {@link #hash128x64(byte[])} with the bytes from the {@code long}.
1159 */
1160 @Deprecated
1161 public static long hash64(final long data) {
1162 long hash = DEFAULT_SEED;
1163 long k = Long.reverseBytes(data);
1164 // mix functions
1165 k *= C1;
1166 k = Long.rotateLeft(k, R1);
1167 k *= C2;
1168 hash ^= k;
1169 hash = Long.rotateLeft(hash, R2) * M + N1;
1170 // finalization
1171 hash ^= Long.BYTES;
1172 return fmix64(hash);
1173 }
1174
1175 /**
1176 * Generates 64-bit hash from a short with a default seed.
1177 *
1178 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p>
1179 *
1180 * <p>This is a Murmur3-like 64-bit variant.
1181 * The method does not produce the same result as either half of the hash bytes from
1182 * {@linkplain #hash128x64(byte[])} with the same byte data from the {@code short}.
1183 * This method will be removed in a future release.</p>
1184 *
1185 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect
1186 * this result as the default seed is positive.</p>
1187 *
1188 * <p>This is a helper method that will produce the same result as:</p>
1189 *
1190 * <pre>
1191 * int offset = 0;
1192 * int seed = 104729;
1193 * long hash = MurmurHash3.hash64(ByteBuffer.allocate(2)
1194 * .putShort(data)
1195 * .array(), offset, 2, seed);
1196 * </pre>
1197 *
1198 * @param data The short to hash
1199 * @return The 64-bit hash
1200 * @see #hash64(byte[], int, int, int)
1201 * @deprecated Not part of the MurmurHash3 implementation.
1202 * Use half of the hash bytes from {@link #hash128x64(byte[])} with the bytes from the {@code short}.
1203 */
1204 @Deprecated
1205 public static long hash64(final short data) {
1206 long hash = DEFAULT_SEED;
1207 long k1 = 0;
1208 k1 ^= ((long) data & 0xff) << 8;
1209 k1 ^= (long) ((data & 0xFF00) >> 8) & 0xff;
1210 k1 *= C1;
1211 k1 = Long.rotateLeft(k1, R1);
1212 k1 *= C2;
1213 hash ^= k1;
1214
1215 // finalization
1216 hash ^= Short.BYTES;
1217 return fmix64(hash);
1218 }
1219
1220 /**
1221 * Performs the intermediate mix step of the 32-bit hash function {@code MurmurHash3_x86_32}.
1222 *
1223 * @param k The data to add to the hash
1224 * @param hash The current hash
1225 * @return The new hash
1226 */
1227 private static int mix32(int k, int hash) {
1228 k *= C1_32;
1229 k = Integer.rotateLeft(k, R1_32);
1230 k *= C2_32;
1231 hash ^= k;
1232 return Integer.rotateLeft(hash, R2_32) * M_32 + N_32;
1233 }
1234
1235 /** No instance methods. */
1236 private MurmurHash3() {
1237 }
1238 }