MurmurHash2.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.codec.digest;
import org.apache.commons.codec.binary.StringUtils;
/**
* Implements the MurmurHash2 32-bit and 64-bit hash functions.
*
* <p>MurmurHash is a non-cryptographic hash function suitable for general
* hash-based lookup. The name comes from two basic operations, multiply (MU)
* and rotate (R), used in its inner loop. Unlike cryptographic hash functions,
* it is not specifically designed to be difficult to reverse by an adversary,
* making it unsuitable for cryptographic purposes.</p>
*
* <p>This contains a Java port of the 32-bit hash function {@code MurmurHash2}
* and the 64-bit hash function {@code MurmurHash64A} from Austin Appleby's
* original {@code c++} code in SMHasher.</p>
*
* <p>This is a re-implementation of the original C code plus some additional
* features.</p>
*
* <p>This is public domain code with no copyrights. From home page of
* <a href="https://github.com/aappleby/smhasher">SMHasher</a>:</p>
*
* <blockquote>
* "All MurmurHash versions are public domain software, and the author
* disclaims all copyright to their code."
* </blockquote>
*
* @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a>
* @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp">
* Original MurmurHash2 c++ code</a>
* @since 1.13
*/
public final class MurmurHash2 {
// Constants for 32-bit variant
private static final int M32 = 0x5bd1e995;
private static final int R32 = 24;
// Constants for 64-bit variant
private static final long M64 = 0xc6a4a7935bd1e995L;
private static final int R64 = 47;
/**
* Gets the little-endian int from 4 bytes starting at the specified index.
*
* @param data The data
* @param index The index
* @return The little-endian int
*/
private static int getLittleEndianInt(final byte[] data, final int index) {
return data[index ] & 0xff |
(data[index + 1] & 0xff) << 8 |
(data[index + 2] & 0xff) << 16 |
(data[index + 3] & 0xff) << 24;
}
/**
* Gets the little-endian long from 8 bytes starting at the specified index.
*
* @param data The data
* @param index The index
* @return The little-endian long
*/
private static long getLittleEndianLong(final byte[] data, final int index) {
return (long) data[index ] & 0xff |
((long) data[index + 1] & 0xff) << 8 |
((long) data[index + 2] & 0xff) << 16 |
((long) data[index + 3] & 0xff) << 24 |
((long) data[index + 4] & 0xff) << 32 |
((long) data[index + 5] & 0xff) << 40 |
((long) data[index + 6] & 0xff) << 48 |
((long) data[index + 7] & 0xff) << 56;
}
/**
* Generates a 32-bit hash from byte array with the given length and a default seed value.
* This is a helper method that will produce the same result as:
*
* <pre>
* int seed = 0x9747b28c;
* int hash = MurmurHash2.hash32(data, length, seed);
* </pre>
*
* @param data The input byte array
* @param length The length of the array
* @return The 32-bit hash
* @see #hash32(byte[], int, int)
*/
public static int hash32(final byte[] data, final int length) {
return hash32(data, length, 0x9747b28c);
}
/**
* Generates a 32-bit hash from byte array with the given length and seed.
*
* @param data The input byte array
* @param length The length of the array
* @param seed The initial seed value
* @return The 32-bit hash
*/
public static int hash32(final byte[] data, final int length, final int seed) {
// Initialize the hash to a random value
int h = seed ^ length;
// Mix 4 bytes at a time into the hash
final int nblocks = length >> 2;
// body
for (int i = 0; i < nblocks; i++) {
final int index = i << 2;
int k = getLittleEndianInt(data, index);
k *= M32;
k ^= k >>> R32;
k *= M32;
h *= M32;
h ^= k;
}
// Handle the last few bytes of the input array
final int index = nblocks << 2;
switch (length - index) {
case 3:
h ^= (data[index + 2] & 0xff) << 16;
case 2:
h ^= (data[index + 1] & 0xff) << 8;
case 1:
h ^= data[index] & 0xff;
h *= M32;
}
// Do a few final mixes of the hash to ensure the last few
// bytes are well-incorporated.
h ^= h >>> 13;
h *= M32;
h ^= h >>> 15;
return h;
}
/**
* Generates a 32-bit hash from a string with a default seed.
* <p>
* Before 1.14 the string was converted using default encoding.
* Since 1.14 the string is converted to bytes using UTF-8 encoding.
* </p>
* This is a helper method that will produce the same result as:
*
* <pre>
* int seed = 0x9747b28c;
* byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
* int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
* </pre>
*
* @param text The input string
* @return The 32-bit hash
* @see #hash32(byte[], int, int)
*/
public static int hash32(final String text) {
final byte[] bytes = StringUtils.getBytesUtf8(text);
return hash32(bytes, bytes.length);
}
/**
* Generates a 32-bit hash from a substring with a default seed value.
* The string is converted to bytes using the default encoding.
* This is a helper method that will produce the same result as:
*
* <pre>
* int seed = 0x9747b28c;
* byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
* int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
* </pre>
*
* @param text The input string
* @param from The starting index
* @param length The length of the substring
* @return The 32-bit hash
* @see #hash32(byte[], int, int)
*/
public static int hash32(final String text, final int from, final int length) {
return hash32(text.substring(from, from + length));
}
/**
* Generates a 64-bit hash from byte array with given length and a default seed value.
* This is a helper method that will produce the same result as:
*
* <pre>
* int seed = 0xe17a1465;
* int hash = MurmurHash2.hash64(data, length, seed);
* </pre>
*
* @param data The input byte array
* @param length The length of the array
* @return The 64-bit hash
* @see #hash64(byte[], int, int)
*/
public static long hash64(final byte[] data, final int length) {
return hash64(data, length, 0xe17a1465);
}
/**
* Generates a 64-bit hash from byte array of the given length and seed.
*
* @param data The input byte array
* @param length The length of the array
* @param seed The initial seed value
* @return The 64-bit hash of the given array
*/
public static long hash64(final byte[] data, final int length, final int seed) {
long h = seed & 0xffffffffL ^ length * M64;
final int nblocks = length >> 3;
// body
for (int i = 0; i < nblocks; i++) {
final int index = i << 3;
long k = getLittleEndianLong(data, index);
k *= M64;
k ^= k >>> R64;
k *= M64;
h ^= k;
h *= M64;
}
final int index = nblocks << 3;
switch (length - index) {
case 7:
h ^= ((long) data[index + 6] & 0xff) << 48;
case 6:
h ^= ((long) data[index + 5] & 0xff) << 40;
case 5:
h ^= ((long) data[index + 4] & 0xff) << 32;
case 4:
h ^= ((long) data[index + 3] & 0xff) << 24;
case 3:
h ^= ((long) data[index + 2] & 0xff) << 16;
case 2:
h ^= ((long) data[index + 1] & 0xff) << 8;
case 1:
h ^= (long) data[index] & 0xff;
h *= M64;
}
h ^= h >>> R64;
h *= M64;
h ^= h >>> R64;
return h;
}
/**
* Generates a 64-bit hash from a string with a default seed.
* <p>
* Before 1.14 the string was converted using default encoding.
* Since 1.14 the string is converted to bytes using UTF-8 encoding.
* </p>
* <p>
* This is a helper method that will produce the same result as:
* </p>
*
* <pre>
* int seed = 0xe17a1465;
* byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
* int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
* </pre>
*
* @param text The input string
* @return The 64-bit hash
* @see #hash64(byte[], int, int)
*/
public static long hash64(final String text) {
final byte[] bytes = StringUtils.getBytesUtf8(text);
return hash64(bytes, bytes.length);
}
/**
* Generates a 64-bit hash from a substring with a default seed value.
* The string is converted to bytes using the default encoding.
* This is a helper method that will produce the same result as:
*
* <pre>
* int seed = 0xe17a1465;
* byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
* int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
* </pre>
*
* @param text The input string
* @param from The starting index
* @param length The length of the substring
* @return The 64-bit hash
* @see #hash64(byte[], int, int)
*/
public static long hash64(final String text, final int from, final int length) {
return hash64(text.substring(from, from + length));
}
/** No instance methods. */
private MurmurHash2() {
}
}