001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.digest; 019 020import org.apache.commons.codec.binary.StringUtils; 021 022/** 023 * Implements the MurmurHash3 32-bit and 128-bit hash functions. 024 * 025 * <p> 026 * MurmurHash is a non-cryptographic hash function suitable for general hash-based lookup. The name comes from two basic 027 * operations, multiply (MU) and rotate (R), used in its inner loop. Unlike cryptographic hash functions, it is not 028 * specifically designed to be difficult to reverse by an adversary, making it unsuitable for cryptographic purposes. 029 * </p> 030 * 031 * <p> 032 * This contains a Java port of the 32-bit hash function {@code MurmurHash3_x86_32} and the 128-bit hash function 033 * {@code MurmurHash3_x64_128} from Austin Appleby's original {@code c++} code in SMHasher. 034 * </p> 035 * 036 * <p> 037 * This is public domain code with no copyrights. From home page of 038 * <a href="https://github.com/aappleby/smhasher">SMHasher</a>: 039 * </p> 040 * 041 * <blockquote> "All MurmurHash versions are public domain software, and the author disclaims all copyright to their 042 * code." </blockquote> 043 * 044 * <p> 045 * Original adaption from Apache Hive. That adaption contains a {@code hash64} method that is not part of the original 046 * MurmurHash3 code. It is not recommended to use these methods. They will be removed in a future release. To obtain a 047 * 64-bit hash use half of the bits from the {@code hash128x64} methods using the input data converted to bytes. 048 * </p> 049 * 050 * @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a> 051 * @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp"> Original MurmurHash3 c++ 052 * code</a> 053 * @see <a href= 054 * "https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hive/common/util/Murmur3.java"> 055 * Apache Hive Murmer3</a> 056 * @since 1.13 057 */ 058public final class MurmurHash3 { 059 060 /** 061 * Generates 32-bit hash from input bytes. Bytes can be added incrementally and the new 062 * hash computed. 063 * 064 * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32} 065 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p> 066 * 067 * <p>This implementation contains a sign-extension bug in the finalization step of 068 * any bytes left over from dividing the length by 4. This manifests if any of these 069 * bytes are negative.</p> 070 * 071 * @deprecated Use IncrementalHash32x86. This corrects the processing of trailing bytes. 072 */ 073 @Deprecated 074 public static class IncrementalHash32 extends IncrementalHash32x86 { 075 076 /** 077 * Constructs a new instance. 078 */ 079 public IncrementalHash32() { 080 // empty 081 } 082 083 /** 084 * {@inheritDoc} 085 * 086 * <p>This implementation contains a sign-extension bug in the finalization step of 087 * any bytes left over from dividing the length by 4. This manifests if any of these 088 * bytes are negative.<p> 089 * 090 * @deprecated Use IncrementalHash32x86. This corrects the processing of trailing bytes. 091 */ 092 @Override 093 @Deprecated 094 int finalise(final int hash, final int unprocessedLength, final byte[] unprocessed, final int totalLen) { 095 int result = hash; 096 // ************ 097 // Note: This fails to apply masking using 0xff to the 3 remaining bytes. 098 // ************ 099 int k1 = 0; 100 switch (unprocessedLength) { 101 case 3: 102 k1 ^= unprocessed[2] << 16; 103 case 2: 104 k1 ^= unprocessed[1] << 8; 105 case 1: 106 k1 ^= unprocessed[0]; 107 108 // mix functions 109 k1 *= C1_32; 110 k1 = Integer.rotateLeft(k1, R1_32); 111 k1 *= C2_32; 112 result ^= k1; 113 } 114 115 // finalization 116 result ^= totalLen; 117 return fmix32(result); 118 } 119 } 120 121 /** 122 * Generates 32-bit hash from input bytes. Bytes can be added incrementally and the new 123 * hash computed. 124 * 125 * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32} 126 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p> 127 * 128 * @since 1.14 129 */ 130 public static class IncrementalHash32x86 { 131 132 /** The size of byte blocks that are processed together. */ 133 private static final int BLOCK_SIZE = 4; 134 135 /** 136 * Combines the bytes using an Or operation ({@code | } in a little-endian representation 137 * of a 32-bit integer; byte 1 will be the least significant byte, byte 4 the most 138 * significant. 139 * 140 * @param b1 The first byte 141 * @param b2 The second byte 142 * @param b3 The third byte 143 * @param b4 The fourth byte 144 * @return The 32-bit integer 145 */ 146 private static int orBytes(final byte b1, final byte b2, final byte b3, final byte b4) { 147 return b1 & 0xff | (b2 & 0xff) << 8 | (b3 & 0xff) << 16 | (b4 & 0xff) << 24; 148 } 149 150 /** Up to 3 unprocessed bytes from input data. */ 151 private final byte[] unprocessed = new byte[3]; 152 153 /** The number of unprocessed bytes in the tail data. */ 154 private int unprocessedLength; 155 156 /** The total number of input bytes added since the start. */ 157 private int totalLen; 158 159 /** 160 * The current running hash. 161 * This must be finalised to generate the 32-bit hash value. 162 */ 163 private int hash; 164 165 /** 166 * Constructs a new instance. 167 */ 168 public IncrementalHash32x86() { 169 // empty 170 } 171 172 /** 173 * Adds the byte array to the current incremental hash. 174 * 175 * @param data The input byte array 176 * @param offset The offset of data 177 * @param length The length of array 178 */ 179 public final void add(final byte[] data, final int offset, final int length) { 180 if (length <= 0) { 181 // Nothing to add 182 return; 183 } 184 totalLen += length; 185 186 // Process the bytes in blocks of 4. 187 // New bytes must be added to any current unprocessed bytes, 188 // then processed in blocks of 4 and the remaining bytes saved: 189 // 190 // |--|---------------------------|--| 191 // unprocessed 192 // main block 193 // remaining 194 195 // Check if the unprocessed bytes and new bytes can fill a block of 4. 196 // Make this overflow safe in the event that length is Integer.MAX_VALUE. 197 // Equivalent to: (unprocessedLength + length < BLOCK_SIZE) 198 if (unprocessedLength + length - BLOCK_SIZE < 0) { 199 // Not enough so add to the unprocessed bytes 200 System.arraycopy(data, offset, unprocessed, unprocessedLength, length); 201 unprocessedLength += length; 202 return; 203 } 204 205 // Combine unprocessed bytes with new bytes. 206 final int newOffset; 207 final int newLength; 208 if (unprocessedLength > 0) { 209 int k = -1; 210 switch (unprocessedLength) { 211 case 1: 212 k = orBytes(unprocessed[0], data[offset], data[offset + 1], data[offset + 2]); 213 break; 214 case 2: 215 k = orBytes(unprocessed[0], unprocessed[1], data[offset], data[offset + 1]); 216 break; 217 case 3: 218 k = orBytes(unprocessed[0], unprocessed[1], unprocessed[2], data[offset]); 219 break; 220 default: 221 throw new IllegalStateException("Unprocessed length should be 1, 2, or 3: " + unprocessedLength); 222 } 223 hash = mix32(k, hash); 224 // Update the offset and length 225 final int consumed = BLOCK_SIZE - unprocessedLength; 226 newOffset = offset + consumed; 227 newLength = length - consumed; 228 } else { 229 newOffset = offset; 230 newLength = length; 231 } 232 233 // Main processing of blocks of 4 bytes 234 final int nblocks = newLength >> 2; 235 236 for (int i = 0; i < nblocks; i++) { 237 final int index = newOffset + (i << 2); 238 final int k = getLittleEndianInt(data, index); 239 hash = mix32(k, hash); 240 } 241 242 // Save left-over unprocessed bytes 243 final int consumed = nblocks << 2; 244 unprocessedLength = newLength - consumed; 245 if (unprocessedLength != 0) { 246 System.arraycopy(data, newOffset + consumed, unprocessed, 0, unprocessedLength); 247 } 248 } 249 250 /** 251 * Generates the 32-bit hash value. Repeat calls to this method with no additional data 252 * will generate the same hash value. 253 * 254 * @return The 32-bit hash 255 */ 256 public final int end() { 257 // Allow calling end() again after adding no data to return the same result. 258 return finalise(hash, unprocessedLength, unprocessed, totalLen); 259 } 260 261 /** 262 * Finalizes the running hash to the output 32-bit hash by processing remaining bytes 263 * and performing final mixing. 264 * 265 * @param hash The running hash 266 * @param unprocessedLength The number of unprocessed bytes in the tail data. 267 * @param unprocessed Up to 3 unprocessed bytes from input data. 268 * @param totalLen The total number of input bytes added since the start. 269 * @return The 32-bit hash 270 */ 271 int finalise(final int hash, final int unprocessedLength, final byte[] unprocessed, final int totalLen) { 272 int result = hash; 273 int k1 = 0; 274 switch (unprocessedLength) { 275 case 3: 276 k1 ^= (unprocessed[2] & 0xff) << 16; 277 case 2: 278 k1 ^= (unprocessed[1] & 0xff) << 8; 279 case 1: 280 k1 ^= unprocessed[0] & 0xff; 281 282 // mix functions 283 k1 *= C1_32; 284 k1 = Integer.rotateLeft(k1, R1_32); 285 k1 *= C2_32; 286 result ^= k1; 287 } 288 289 // finalization 290 result ^= totalLen; 291 return fmix32(result); 292 } 293 294 /** 295 * Starts a new incremental hash. 296 * 297 * @param seed The initial seed value 298 */ 299 public final void start(final int seed) { 300 // Reset 301 unprocessedLength = totalLen = 0; 302 this.hash = seed; 303 } 304 } 305 306 /** 307 * A random number to use for a hash code. 308 * 309 * @deprecated This is not used internally and will be removed in a future release. 310 */ 311 @Deprecated 312 public static final long NULL_HASHCODE = 2862933555777941757L; 313 /** 314 * A default seed to use for the murmur hash algorithm. 315 * Has the value {@code 104729}. 316 */ 317 public static final int DEFAULT_SEED = 104729; 318 // Constants for 32-bit variant 319 private static final int C1_32 = 0xcc9e2d51; 320 private static final int C2_32 = 0x1b873593; 321 private static final int R1_32 = 15; 322 private static final int R2_32 = 13; 323 324 private static final int M_32 = 5; 325 private static final int N_32 = 0xe6546b64; 326 // Constants for 128-bit variant 327 private static final long C1 = 0x87c37b91114253d5L; 328 private static final long C2 = 0x4cf5ad432745937fL; 329 private static final int R1 = 31; 330 private static final int R2 = 27; 331 private static final int R3 = 33; 332 private static final int M = 5; 333 334 private static final int N1 = 0x52dce729; 335 336 private static final int N2 = 0x38495ab5; 337 338 /** 339 * Performs the final avalanche mix step of the 32-bit hash function {@code MurmurHash3_x86_32}. 340 * 341 * @param hash The current hash 342 * @return The final hash 343 */ 344 private static int fmix32(int hash) { 345 hash ^= hash >>> 16; 346 hash *= 0x85ebca6b; 347 hash ^= hash >>> 13; 348 hash *= 0xc2b2ae35; 349 hash ^= hash >>> 16; 350 return hash; 351 } 352 353 /** 354 * Performs the final avalanche mix step of the 64-bit hash function {@code MurmurHash3_x64_128}. 355 * 356 * @param hash The current hash 357 * @return The final hash 358 */ 359 private static long fmix64(long hash) { 360 hash ^= hash >>> 33; 361 hash *= 0xff51afd7ed558ccdL; 362 hash ^= hash >>> 33; 363 hash *= 0xc4ceb9fe1a85ec53L; 364 hash ^= hash >>> 33; 365 return hash; 366 } 367 368 /** 369 * Gets the little-endian int from 4 bytes starting at the specified index. 370 * 371 * @param data The data 372 * @param index The index 373 * @return The little-endian int 374 */ 375 private static int getLittleEndianInt(final byte[] data, final int index) { 376 return data[index ] & 0xff | 377 (data[index + 1] & 0xff) << 8 | 378 (data[index + 2] & 0xff) << 16 | 379 (data[index + 3] & 0xff) << 24; 380 } 381 382 /** 383 * Gets the little-endian long from 8 bytes starting at the specified index. 384 * 385 * @param data The data 386 * @param index The index 387 * @return The little-endian long 388 */ 389 private static long getLittleEndianLong(final byte[] data, final int index) { 390 return (long) data[index ] & 0xff | 391 ((long) data[index + 1] & 0xff) << 8 | 392 ((long) data[index + 2] & 0xff) << 16 | 393 ((long) data[index + 3] & 0xff) << 24 | 394 ((long) data[index + 4] & 0xff) << 32 | 395 ((long) data[index + 5] & 0xff) << 40 | 396 ((long) data[index + 6] & 0xff) << 48 | 397 ((long) data[index + 7] & 0xff) << 56; 398 } 399 400 /** 401 * Generates 128-bit hash from the byte array with a default seed. 402 * This is a helper method that will produce the same result as: 403 * 404 * <pre> 405 * int offset = 0; 406 * int seed = 104729; 407 * int hash = MurmurHash3.hash128(data, offset, data.length, seed); 408 * </pre> 409 * 410 * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect 411 * this result as the default seed is positive.</p> 412 * 413 * @param data The input byte array 414 * @return The 128-bit hash (2 longs) 415 * @see #hash128(byte[], int, int, int) 416 */ 417 public static long[] hash128(final byte[] data) { 418 return hash128(data, 0, data.length, DEFAULT_SEED); 419 } 420 421 /** 422 * Generates 128-bit hash from the byte array with the given offset, length and seed. 423 * 424 * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128} 425 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p> 426 * 427 * <p>This implementation contains a sign-extension bug in the seed initialization. 428 * This manifests if the seed is negative.</p> 429 * 430 * @param data The input byte array 431 * @param offset The first element of array 432 * @param length The length of array 433 * @param seed The initial seed value 434 * @return The 128-bit hash (2 longs) 435 * @deprecated Use {@link #hash128x64(byte[], int, int, int)}. This corrects the seed initialization. 436 */ 437 @Deprecated 438 public static long[] hash128(final byte[] data, final int offset, final int length, final int seed) { 439 // ************ 440 // Note: This deliberately fails to apply masking using 0xffffffffL to the seed 441 // to maintain behavioral compatibility with the original version. 442 // The implicit conversion to a long will extend a negative sign 443 // bit through the upper 32-bits of the long seed. These should be zero. 444 // ************ 445 return hash128x64Internal(data, offset, length, seed); 446 } 447 448 /** 449 * Generates 128-bit hash from a string with a default seed. 450 * <p> 451 * Before 1.14 the string was converted using default encoding. 452 * Since 1.14 the string is converted to bytes using UTF-8 encoding. 453 * </p> 454 * <p> 455 * This is a helper method that will produce the same result as: 456 * </p> 457 * 458 * <pre> 459 * int offset = 0; 460 * int seed = 104729; 461 * byte[] bytes = data.getBytes(StandardCharsets.UTF_8); 462 * int hash = MurmurHash3.hash128(bytes, offset, bytes.length, seed); 463 * </pre> 464 * 465 * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect 466 * this result as the default seed is positive.</p> 467 * 468 * @param data The input String 469 * @return The 128-bit hash (2 longs) 470 * @see #hash128(byte[], int, int, int) 471 * @deprecated Use {@link #hash128x64(byte[])} using the bytes returned from 472 * {@link String#getBytes(java.nio.charset.Charset)}. 473 */ 474 @Deprecated 475 public static long[] hash128(final String data) { 476 final byte[] bytes = StringUtils.getBytesUtf8(data); 477 return hash128(bytes, 0, bytes.length, DEFAULT_SEED); 478 } 479 480 /** 481 * Generates 128-bit hash from the byte array with a seed of zero. 482 * This is a helper method that will produce the same result as: 483 * 484 * <pre> 485 * int offset = 0; 486 * int seed = 0; 487 * int hash = MurmurHash3.hash128x64(data, offset, data.length, seed); 488 * </pre> 489 * 490 * @param data The input byte array 491 * @return The 128-bit hash (2 longs) 492 * @see #hash128x64(byte[], int, int, int) 493 * @since 1.14 494 */ 495 public static long[] hash128x64(final byte[] data) { 496 return hash128x64(data, 0, data.length, 0); 497 } 498 499 /** 500 * Generates 128-bit hash from the byte array with the given offset, length and seed. 501 * 502 * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128} 503 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p> 504 * 505 * @param data The input byte array 506 * @param offset The first element of array 507 * @param length The length of array 508 * @param seed The initial seed value 509 * @return The 128-bit hash (2 longs) 510 * @since 1.14 511 */ 512 public static long[] hash128x64(final byte[] data, final int offset, final int length, final int seed) { 513 // Use an unsigned 32-bit integer as the seed 514 return hash128x64Internal(data, offset, length, seed & 0xffffffffL); 515 } 516 517 /** 518 * Generates 128-bit hash from the byte array with the given offset, length and seed. 519 * 520 * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128} 521 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p> 522 * 523 * @param data The input byte array 524 * @param offset The first element of array 525 * @param length The length of array 526 * @param seed The initial seed value 527 * @return The 128-bit hash (2 longs) 528 */ 529 private static long[] hash128x64Internal(final byte[] data, final int offset, final int length, final long seed) { 530 long h1 = seed; 531 long h2 = seed; 532 final int nblocks = length >> 4; 533 534 // body 535 for (int i = 0; i < nblocks; i++) { 536 final int index = offset + (i << 4); 537 long k1 = getLittleEndianLong(data, index); 538 long k2 = getLittleEndianLong(data, index + 8); 539 540 // mix functions for k1 541 k1 *= C1; 542 k1 = Long.rotateLeft(k1, R1); 543 k1 *= C2; 544 h1 ^= k1; 545 h1 = Long.rotateLeft(h1, R2); 546 h1 += h2; 547 h1 = h1 * M + N1; 548 549 // mix functions for k2 550 k2 *= C2; 551 k2 = Long.rotateLeft(k2, R3); 552 k2 *= C1; 553 h2 ^= k2; 554 h2 = Long.rotateLeft(h2, R1); 555 h2 += h1; 556 h2 = h2 * M + N2; 557 } 558 559 // tail 560 long k1 = 0; 561 long k2 = 0; 562 final int index = offset + (nblocks << 4); 563 switch (offset + length - index) { 564 case 15: 565 k2 ^= ((long) data[index + 14] & 0xff) << 48; 566 case 14: 567 k2 ^= ((long) data[index + 13] & 0xff) << 40; 568 case 13: 569 k2 ^= ((long) data[index + 12] & 0xff) << 32; 570 case 12: 571 k2 ^= ((long) data[index + 11] & 0xff) << 24; 572 case 11: 573 k2 ^= ((long) data[index + 10] & 0xff) << 16; 574 case 10: 575 k2 ^= ((long) data[index + 9] & 0xff) << 8; 576 case 9: 577 k2 ^= data[index + 8] & 0xff; 578 k2 *= C2; 579 k2 = Long.rotateLeft(k2, R3); 580 k2 *= C1; 581 h2 ^= k2; 582 583 case 8: 584 k1 ^= ((long) data[index + 7] & 0xff) << 56; 585 case 7: 586 k1 ^= ((long) data[index + 6] & 0xff) << 48; 587 case 6: 588 k1 ^= ((long) data[index + 5] & 0xff) << 40; 589 case 5: 590 k1 ^= ((long) data[index + 4] & 0xff) << 32; 591 case 4: 592 k1 ^= ((long) data[index + 3] & 0xff) << 24; 593 case 3: 594 k1 ^= ((long) data[index + 2] & 0xff) << 16; 595 case 2: 596 k1 ^= ((long) data[index + 1] & 0xff) << 8; 597 case 1: 598 k1 ^= data[index] & 0xff; 599 k1 *= C1; 600 k1 = Long.rotateLeft(k1, R1); 601 k1 *= C2; 602 h1 ^= k1; 603 } 604 605 // finalization 606 h1 ^= length; 607 h2 ^= length; 608 609 h1 += h2; 610 h2 += h1; 611 612 h1 = fmix64(h1); 613 h2 = fmix64(h2); 614 615 h1 += h2; 616 h2 += h1; 617 618 return new long[] { h1, h2 }; 619 } 620 621 /** 622 * Generates 32-bit hash from the byte array with a default seed. 623 * This is a helper method that will produce the same result as: 624 * 625 * <pre> 626 * int offset = 0; 627 * int seed = 104729; 628 * int hash = MurmurHash3.hash32(data, offset, data.length, seed); 629 * </pre> 630 * 631 * <p>This implementation contains a sign-extension bug in the finalization step of 632 * any bytes left over from dividing the length by 4. This manifests if any of these 633 * bytes are negative.</p> 634 * 635 * @param data The input byte array 636 * @return The 32-bit hash 637 * @see #hash32(byte[], int, int, int) 638 * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes. 639 */ 640 @Deprecated 641 public static int hash32(final byte[] data) { 642 return hash32(data, 0, data.length, DEFAULT_SEED); 643 } 644 645 /** 646 * Generates 32-bit hash from the byte array with the given length and a default seed. 647 * This is a helper method that will produce the same result as: 648 * 649 * <pre> 650 * int offset = 0; 651 * int seed = 104729; 652 * int hash = MurmurHash3.hash32(data, offset, length, seed); 653 * </pre> 654 * 655 * <p>This implementation contains a sign-extension bug in the finalization step of 656 * any bytes left over from dividing the length by 4. This manifests if any of these 657 * bytes are negative.</p> 658 * 659 * @param data The input byte array 660 * @param length The length of array 661 * @return The 32-bit hash 662 * @see #hash32(byte[], int, int, int) 663 * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes. 664 */ 665 @Deprecated 666 public static int hash32(final byte[] data, final int length) { 667 return hash32(data, length, DEFAULT_SEED); 668 } 669 670 /** 671 * Generates 32-bit hash from the byte array with the given length and seed. This is a 672 * helper method that will produce the same result as: 673 * 674 * <pre> 675 * int offset = 0; 676 * int hash = MurmurHash3.hash32(data, offset, length, seed); 677 * </pre> 678 * 679 * <p>This implementation contains a sign-extension bug in the finalization step of 680 * any bytes left over from dividing the length by 4. This manifests if any of these 681 * bytes are negative.</p> 682 * 683 * @param data The input byte array 684 * @param length The length of array 685 * @param seed The initial seed value 686 * @return The 32-bit hash 687 * @see #hash32(byte[], int, int, int) 688 * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes. 689 */ 690 @Deprecated 691 public static int hash32(final byte[] data, final int length, final int seed) { 692 return hash32(data, 0, length, seed); 693 } 694 695 /** 696 * Generates 32-bit hash from the byte array with the given offset, length and seed. 697 * 698 * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32} 699 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p> 700 * 701 * <p>This implementation contains a sign-extension bug in the finalization step of 702 * any bytes left over from dividing the length by 4. This manifests if any of these 703 * bytes are negative.</p> 704 * 705 * @param data The input byte array 706 * @param offset The offset of data 707 * @param length The length of array 708 * @param seed The initial seed value 709 * @return The 32-bit hash 710 * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes. 711 */ 712 @Deprecated 713 public static int hash32(final byte[] data, final int offset, final int length, final int seed) { 714 int hash = seed; 715 final int nblocks = length >> 2; 716 717 // body 718 for (int i = 0; i < nblocks; i++) { 719 final int index = offset + (i << 2); 720 final int k = getLittleEndianInt(data, index); 721 hash = mix32(k, hash); 722 } 723 724 // tail 725 // ************ 726 // Note: This fails to apply masking using 0xff to the 3 remaining bytes. 727 // ************ 728 final int index = offset + (nblocks << 2); 729 int k1 = 0; 730 switch (offset + length - index) { 731 case 3: 732 k1 ^= data[index + 2] << 16; 733 case 2: 734 k1 ^= data[index + 1] << 8; 735 case 1: 736 k1 ^= data[index]; 737 738 // mix functions 739 k1 *= C1_32; 740 k1 = Integer.rotateLeft(k1, R1_32); 741 k1 *= C2_32; 742 hash ^= k1; 743 } 744 745 hash ^= length; 746 return fmix32(hash); 747 } 748 749 /** 750 * Generates 32-bit hash from a long with a default seed value. 751 * This is a helper method that will produce the same result as: 752 * 753 * <pre> 754 * int offset = 0; 755 * int seed = 104729; 756 * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(8) 757 * .putLong(data) 758 * .array(), offset, 8, seed); 759 * </pre> 760 * 761 * @param data The long to hash 762 * @return The 32-bit hash 763 * @see #hash32x86(byte[], int, int, int) 764 */ 765 public static int hash32(final long data) { 766 return hash32(data, DEFAULT_SEED); 767 } 768 769 /** 770 * Generates 32-bit hash from a long with the given seed. 771 * This is a helper method that will produce the same result as: 772 * 773 * <pre> 774 * int offset = 0; 775 * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(8) 776 * .putLong(data) 777 * .array(), offset, 8, seed); 778 * </pre> 779 * 780 * @param data The long to hash 781 * @param seed The initial seed value 782 * @return The 32-bit hash 783 * @see #hash32x86(byte[], int, int, int) 784 */ 785 public static int hash32(final long data, final int seed) { 786 int hash = seed; 787 final long r0 = Long.reverseBytes(data); 788 789 hash = mix32((int) r0, hash); 790 hash = mix32((int) (r0 >>> 32), hash); 791 792 hash ^= Long.BYTES; 793 return fmix32(hash); 794 } 795 796 /** 797 * Generates 32-bit hash from two longs with a default seed value. 798 * This is a helper method that will produce the same result as: 799 * 800 * <pre> 801 * int offset = 0; 802 * int seed = 104729; 803 * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(16) 804 * .putLong(data1) 805 * .putLong(data2) 806 * .array(), offset, 16, seed); 807 * </pre> 808 * 809 * @param data1 The first long to hash 810 * @param data2 The second long to hash 811 * @return The 32-bit hash 812 * @see #hash32x86(byte[], int, int, int) 813 */ 814 public static int hash32(final long data1, final long data2) { 815 return hash32(data1, data2, DEFAULT_SEED); 816 } 817 818 /** 819 * Generates 32-bit hash from two longs with the given seed. 820 * This is a helper method that will produce the same result as: 821 * 822 * <pre> 823 * int offset = 0; 824 * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(16) 825 * .putLong(data1) 826 * .putLong(data2) 827 * .array(), offset, 16, seed); 828 * </pre> 829 * 830 * @param data1 The first long to hash 831 * @param data2 The second long to hash 832 * @param seed The initial seed value 833 * @return The 32-bit hash 834 * @see #hash32x86(byte[], int, int, int) 835 */ 836 public static int hash32(final long data1, final long data2, final int seed) { 837 int hash = seed; 838 final long r0 = Long.reverseBytes(data1); 839 final long r1 = Long.reverseBytes(data2); 840 841 hash = mix32((int) r0, hash); 842 hash = mix32((int) (r0 >>> 32), hash); 843 hash = mix32((int) r1, hash); 844 hash = mix32((int) (r1 >>> 32), hash); 845 846 hash ^= Long.BYTES * 2; 847 return fmix32(hash); 848 } 849 850 /** 851 * Generates 32-bit hash from a string with a default seed. 852 * <p> 853 * Before 1.14 the string was converted using default encoding. 854 * Since 1.14 the string is converted to bytes using UTF-8 encoding. 855 * </p> 856 * This is a helper method that will produce the same result as: 857 * 858 * <pre> 859 * int offset = 0; 860 * int seed = 104729; 861 * byte[] bytes = data.getBytes(StandardCharsets.UTF_8); 862 * int hash = MurmurHash3.hash32(bytes, offset, bytes.length, seed); 863 * </pre> 864 * 865 * <p>This implementation contains a sign-extension bug in the finalization step of 866 * any bytes left over from dividing the length by 4. This manifests if any of these 867 * bytes are negative.</p> 868 * 869 * @param data The input string 870 * @return The 32-bit hash 871 * @see #hash32(byte[], int, int, int) 872 * @deprecated Use {@link #hash32x86(byte[], int, int, int)} with the bytes returned from 873 * {@link String#getBytes(java.nio.charset.Charset)}. This corrects the processing of trailing bytes. 874 */ 875 @Deprecated 876 public static int hash32(final String data) { 877 final byte[] bytes = StringUtils.getBytesUtf8(data); 878 return hash32(bytes, 0, bytes.length, DEFAULT_SEED); 879 } 880 881 /** 882 * Generates 32-bit hash from the byte array with a seed of zero. 883 * This is a helper method that will produce the same result as: 884 * 885 * <pre> 886 * int offset = 0; 887 * int seed = 0; 888 * int hash = MurmurHash3.hash32x86(data, offset, data.length, seed); 889 * </pre> 890 * 891 * @param data The input byte array 892 * @return The 32-bit hash 893 * @see #hash32x86(byte[], int, int, int) 894 * @since 1.14 895 */ 896 public static int hash32x86(final byte[] data) { 897 return hash32x86(data, 0, data.length, 0); 898 } 899 900 /** 901 * Generates 32-bit hash from the byte array with the given offset, length and seed. 902 * 903 * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32} 904 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p> 905 * 906 * @param data The input byte array 907 * @param offset The offset of data 908 * @param length The length of array 909 * @param seed The initial seed value 910 * @return The 32-bit hash 911 * @since 1.14 912 */ 913 public static int hash32x86(final byte[] data, final int offset, final int length, final int seed) { 914 int hash = seed; 915 final int nblocks = length >> 2; 916 917 // body 918 for (int i = 0; i < nblocks; i++) { 919 final int index = offset + (i << 2); 920 final int k = getLittleEndianInt(data, index); 921 hash = mix32(k, hash); 922 } 923 924 // tail 925 final int index = offset + (nblocks << 2); 926 int k1 = 0; 927 switch (offset + length - index) { 928 case 3: 929 k1 ^= (data[index + 2] & 0xff) << 16; 930 case 2: 931 k1 ^= (data[index + 1] & 0xff) << 8; 932 case 1: 933 k1 ^= data[index] & 0xff; 934 935 // mix functions 936 k1 *= C1_32; 937 k1 = Integer.rotateLeft(k1, R1_32); 938 k1 *= C2_32; 939 hash ^= k1; 940 } 941 942 hash ^= length; 943 return fmix32(hash); 944 } 945 946 /** 947 * Generates 64-bit hash from a byte array with a default seed. 948 * 949 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p> 950 * 951 * <p>This is a Murmur3-like 64-bit variant. 952 * The method does not produce the same result as either half of the hash bytes from 953 * {@linkplain #hash128x64(byte[])} with the same byte data. 954 * This method will be removed in a future release.</p> 955 * 956 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect 957 * this result as the default seed is positive.</p> 958 * 959 * <p>This is a helper method that will produce the same result as:</p> 960 * 961 * <pre> 962 * int offset = 0; 963 * int seed = 104729; 964 * long hash = MurmurHash3.hash64(data, offset, data.length, seed); 965 * </pre> 966 * 967 * @param data The input byte array 968 * @return The 64-bit hash 969 * @see #hash64(byte[], int, int, int) 970 * @deprecated Not part of the MurmurHash3 implementation. 971 * Use half of the hash bytes from {@link #hash128x64(byte[])}. 972 */ 973 @Deprecated 974 public static long hash64(final byte[] data) { 975 return hash64(data, 0, data.length, DEFAULT_SEED); 976 } 977 978 /** 979 * Generates 64-bit hash from a byte array with the given offset and length and a default seed. 980 * 981 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p> 982 * 983 * <p>This is a Murmur3-like 64-bit variant. 984 * The method does not produce the same result as either half of the hash bytes from 985 * {@linkplain #hash128x64(byte[])} with the same byte data. 986 * This method will be removed in a future release.</p> 987 * 988 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect 989 * this result as the default seed is positive.</p> 990 * 991 * <p>This is a helper method that will produce the same result as:</p> 992 * 993 * <pre> 994 * int seed = 104729; 995 * long hash = MurmurHash3.hash64(data, offset, length, seed); 996 * </pre> 997 * 998 * @param data The input byte array 999 * @param offset The offset of data 1000 * @param length The length of array 1001 * @return The 64-bit hash 1002 * @see #hash64(byte[], int, int, int) 1003 * @deprecated Not part of the MurmurHash3 implementation. 1004 * Use half of the hash bytes from {@link #hash128x64(byte[], int, int, int)}. 1005 */ 1006 @Deprecated 1007 public static long hash64(final byte[] data, final int offset, final int length) { 1008 return hash64(data, offset, length, DEFAULT_SEED); 1009 } 1010 1011 /** 1012 * Generates 64-bit hash from a byte array with the given offset, length and seed. 1013 * 1014 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p> 1015 * 1016 * <p>This is a Murmur3-like 64-bit variant. 1017 * This method will be removed in a future release.</p> 1018 * 1019 * <p>This implementation contains a sign-extension bug in the seed initialization. 1020 * This manifests if the seed is negative.</p> 1021 * 1022 * <p>This algorithm processes 8 bytes chunks of data in a manner similar to the 16 byte chunks 1023 * of data processed in the MurmurHash3 {@code MurmurHash3_x64_128} method. However the hash 1024 * is not mixed with a hash chunk from the next 8 bytes of data. The method will not return 1025 * the same value as the first or second 64-bits of the function 1026 * {@link #hash128(byte[], int, int, int)}.</p> 1027 * 1028 * <p>Use of this method is not advised. Use the first long returned from 1029 * {@link #hash128x64(byte[], int, int, int)}.</p> 1030 * 1031 * @param data The input byte array 1032 * @param offset The offset of data 1033 * @param length The length of array 1034 * @param seed The initial seed value 1035 * @return The 64-bit hash 1036 * @deprecated Not part of the MurmurHash3 implementation. 1037 * Use half of the hash bytes from {@link #hash128x64(byte[], int, int, int)}. 1038 */ 1039 @Deprecated 1040 public static long hash64(final byte[] data, final int offset, final int length, final int seed) { 1041 // 1042 // Note: This fails to apply masking using 0xffffffffL to the seed. 1043 // 1044 long hash = seed; 1045 final int nblocks = length >> 3; 1046 1047 // body 1048 for (int i = 0; i < nblocks; i++) { 1049 final int index = offset + (i << 3); 1050 long k = getLittleEndianLong(data, index); 1051 1052 // mix functions 1053 k *= C1; 1054 k = Long.rotateLeft(k, R1); 1055 k *= C2; 1056 hash ^= k; 1057 hash = Long.rotateLeft(hash, R2) * M + N1; 1058 } 1059 1060 // tail 1061 long k1 = 0; 1062 final int index = offset + (nblocks << 3); 1063 switch (offset + length - index) { 1064 case 7: 1065 k1 ^= ((long) data[index + 6] & 0xff) << 48; 1066 case 6: 1067 k1 ^= ((long) data[index + 5] & 0xff) << 40; 1068 case 5: 1069 k1 ^= ((long) data[index + 4] & 0xff) << 32; 1070 case 4: 1071 k1 ^= ((long) data[index + 3] & 0xff) << 24; 1072 case 3: 1073 k1 ^= ((long) data[index + 2] & 0xff) << 16; 1074 case 2: 1075 k1 ^= ((long) data[index + 1] & 0xff) << 8; 1076 case 1: 1077 k1 ^= (long) data[index] & 0xff; 1078 k1 *= C1; 1079 k1 = Long.rotateLeft(k1, R1); 1080 k1 *= C2; 1081 hash ^= k1; 1082 } 1083 1084 // finalization 1085 hash ^= length; 1086 return fmix64(hash); 1087 } 1088 1089 /** 1090 * Generates 64-bit hash from an int with a default seed. 1091 * 1092 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p> 1093 * 1094 * <p>This is a Murmur3-like 64-bit variant. 1095 * The method does not produce the same result as either half of the hash bytes from 1096 * {@linkplain #hash128x64(byte[])} with the same byte data from the {@code int}. 1097 * This method will be removed in a future release.</p> 1098 * 1099 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect 1100 * this result as the default seed is positive.</p> 1101 * 1102 * <p>This is a helper method that will produce the same result as:</p> 1103 * 1104 * <pre> 1105 * int offset = 0; 1106 * int seed = 104729; 1107 * long hash = MurmurHash3.hash64(ByteBuffer.allocate(4) 1108 * .putInt(data) 1109 * .array(), offset, 4, seed); 1110 * </pre> 1111 * 1112 * @param data The int to hash 1113 * @return The 64-bit hash 1114 * @see #hash64(byte[], int, int, int) 1115 * @deprecated Not part of the MurmurHash3 implementation. 1116 * Use half of the hash bytes from {@link #hash128x64(byte[])} with the bytes from the {@code int}. 1117 */ 1118 @Deprecated 1119 public static long hash64(final int data) { 1120 long k1 = Integer.reverseBytes(data) & -1L >>> 32; 1121 long hash = DEFAULT_SEED; 1122 k1 *= C1; 1123 k1 = Long.rotateLeft(k1, R1); 1124 k1 *= C2; 1125 hash ^= k1; 1126 // finalization 1127 hash ^= Integer.BYTES; 1128 return fmix64(hash); 1129 } 1130 1131 /** 1132 * Generates 64-bit hash from a long with a default seed. 1133 * 1134 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p> 1135 * 1136 * <p>This is a Murmur3-like 64-bit variant. 1137 * The method does not produce the same result as either half of the hash bytes from 1138 * {@linkplain #hash128x64(byte[])} with the same byte data from the {@code long}. 1139 * This method will be removed in a future release.</p> 1140 * 1141 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect 1142 * this result as the default seed is positive.</p> 1143 * 1144 * <p>This is a helper method that will produce the same result as:</p> 1145 * 1146 * <pre> 1147 * int offset = 0; 1148 * int seed = 104729; 1149 * long hash = MurmurHash3.hash64(ByteBuffer.allocate(8) 1150 * .putLong(data) 1151 * .array(), offset, 8, seed); 1152 * </pre> 1153 * 1154 * @param data The long to hash 1155 * @return The 64-bit hash 1156 * @see #hash64(byte[], int, int, int) 1157 * @deprecated Not part of the MurmurHash3 implementation. 1158 * Use half of the hash bytes from {@link #hash128x64(byte[])} with the bytes from the {@code long}. 1159 */ 1160 @Deprecated 1161 public static long hash64(final long data) { 1162 long hash = DEFAULT_SEED; 1163 long k = Long.reverseBytes(data); 1164 // mix functions 1165 k *= C1; 1166 k = Long.rotateLeft(k, R1); 1167 k *= C2; 1168 hash ^= k; 1169 hash = Long.rotateLeft(hash, R2) * M + N1; 1170 // finalization 1171 hash ^= Long.BYTES; 1172 return fmix64(hash); 1173 } 1174 1175 /** 1176 * Generates 64-bit hash from a short with a default seed. 1177 * 1178 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p> 1179 * 1180 * <p>This is a Murmur3-like 64-bit variant. 1181 * The method does not produce the same result as either half of the hash bytes from 1182 * {@linkplain #hash128x64(byte[])} with the same byte data from the {@code short}. 1183 * This method will be removed in a future release.</p> 1184 * 1185 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect 1186 * this result as the default seed is positive.</p> 1187 * 1188 * <p>This is a helper method that will produce the same result as:</p> 1189 * 1190 * <pre> 1191 * int offset = 0; 1192 * int seed = 104729; 1193 * long hash = MurmurHash3.hash64(ByteBuffer.allocate(2) 1194 * .putShort(data) 1195 * .array(), offset, 2, seed); 1196 * </pre> 1197 * 1198 * @param data The short to hash 1199 * @return The 64-bit hash 1200 * @see #hash64(byte[], int, int, int) 1201 * @deprecated Not part of the MurmurHash3 implementation. 1202 * Use half of the hash bytes from {@link #hash128x64(byte[])} with the bytes from the {@code short}. 1203 */ 1204 @Deprecated 1205 public static long hash64(final short data) { 1206 long hash = DEFAULT_SEED; 1207 long k1 = 0; 1208 k1 ^= ((long) data & 0xff) << 8; 1209 k1 ^= (long) ((data & 0xFF00) >> 8) & 0xff; 1210 k1 *= C1; 1211 k1 = Long.rotateLeft(k1, R1); 1212 k1 *= C2; 1213 hash ^= k1; 1214 1215 // finalization 1216 hash ^= Short.BYTES; 1217 return fmix64(hash); 1218 } 1219 1220 /** 1221 * Performs the intermediate mix step of the 32-bit hash function {@code MurmurHash3_x86_32}. 1222 * 1223 * @param k The data to add to the hash 1224 * @param hash The current hash 1225 * @return The new hash 1226 */ 1227 private static int mix32(int k, int hash) { 1228 k *= C1_32; 1229 k = Integer.rotateLeft(k, R1_32); 1230 k *= C2_32; 1231 hash ^= k; 1232 return Integer.rotateLeft(hash, R2_32) * M_32 + N_32; 1233 } 1234 1235 /** No instance methods. */ 1236 private MurmurHash3() { 1237 } 1238}