1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 package org.apache.commons.codec.digest; 19 20 import org.apache.commons.codec.binary.StringUtils; 21 22 /** 23 * Implements the MurmurHash2 32-bit and 64-bit hash functions. 24 * 25 * <p>MurmurHash is a non-cryptographic hash function suitable for general 26 * hash-based lookup. The name comes from two basic operations, multiply (MU) 27 * and rotate (R), used in its inner loop. Unlike cryptographic hash functions, 28 * it is not specifically designed to be difficult to reverse by an adversary, 29 * making it unsuitable for cryptographic purposes.</p> 30 * 31 * <p>This contains a Java port of the 32-bit hash function {@code MurmurHash2} 32 * and the 64-bit hash function {@code MurmurHash64A} from Austin Appleby's 33 * original {@code c++} code in SMHasher.</p> 34 * 35 * <p>This is a re-implementation of the original C code plus some additional 36 * features.</p> 37 * 38 * <p>This is public domain code with no copyrights. From home page of 39 * <a href="https://github.com/aappleby/smhasher">SMHasher</a>:</p> 40 * 41 * <blockquote> 42 * "All MurmurHash versions are public domain software, and the author 43 * disclaims all copyright to their code." 44 * </blockquote> 45 * 46 * @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a> 47 * @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp"> 48 * Original MurmurHash2 c++ code</a> 49 * @since 1.13 50 */ 51 public final class MurmurHash2 { 52 53 // Constants for 32-bit variant 54 private static final int M32 = 0x5bd1e995; 55 private static final int R32 = 24; 56 57 // Constants for 64-bit variant 58 private static final long M64 = 0xc6a4a7935bd1e995L; 59 private static final int R64 = 47; 60 61 /** 62 * Gets the little-endian int from 4 bytes starting at the specified index. 63 * 64 * @param data The data 65 * @param index The index 66 * @return The little-endian int 67 */ 68 private static int getLittleEndianInt(final byte[] data, final int index) { 69 return data[index ] & 0xff | 70 (data[index + 1] & 0xff) << 8 | 71 (data[index + 2] & 0xff) << 16 | 72 (data[index + 3] & 0xff) << 24; 73 } 74 75 /** 76 * Gets the little-endian long from 8 bytes starting at the specified index. 77 * 78 * @param data The data 79 * @param index The index 80 * @return The little-endian long 81 */ 82 private static long getLittleEndianLong(final byte[] data, final int index) { 83 return (long) data[index ] & 0xff | 84 ((long) data[index + 1] & 0xff) << 8 | 85 ((long) data[index + 2] & 0xff) << 16 | 86 ((long) data[index + 3] & 0xff) << 24 | 87 ((long) data[index + 4] & 0xff) << 32 | 88 ((long) data[index + 5] & 0xff) << 40 | 89 ((long) data[index + 6] & 0xff) << 48 | 90 ((long) data[index + 7] & 0xff) << 56; 91 } 92 93 /** 94 * Generates a 32-bit hash from byte array with the given length and a default seed value. 95 * This is a helper method that will produce the same result as: 96 * 97 * <pre> 98 * int seed = 0x9747b28c; 99 * int hash = MurmurHash2.hash32(data, length, seed); 100 * </pre> 101 * 102 * @param data The input byte array 103 * @param length The length of the array 104 * @return The 32-bit hash 105 * @see #hash32(byte[], int, int) 106 */ 107 public static int hash32(final byte[] data, final int length) { 108 return hash32(data, length, 0x9747b28c); 109 } 110 111 /** 112 * Generates a 32-bit hash from byte array with the given length and seed. 113 * 114 * @param data The input byte array 115 * @param length The length of the array 116 * @param seed The initial seed value 117 * @return The 32-bit hash 118 */ 119 public static int hash32(final byte[] data, final int length, final int seed) { 120 // Initialize the hash to a random value 121 int h = seed ^ length; 122 123 // Mix 4 bytes at a time into the hash 124 final int nblocks = length >> 2; 125 126 // body 127 for (int i = 0; i < nblocks; i++) { 128 final int index = i << 2; 129 int k = getLittleEndianInt(data, index); 130 k *= M32; 131 k ^= k >>> R32; 132 k *= M32; 133 h *= M32; 134 h ^= k; 135 } 136 137 // Handle the last few bytes of the input array 138 final int index = nblocks << 2; 139 switch (length - index) { 140 case 3: 141 h ^= (data[index + 2] & 0xff) << 16; 142 case 2: 143 h ^= (data[index + 1] & 0xff) << 8; 144 case 1: 145 h ^= data[index] & 0xff; 146 h *= M32; 147 } 148 149 // Do a few final mixes of the hash to ensure the last few 150 // bytes are well-incorporated. 151 h ^= h >>> 13; 152 h *= M32; 153 h ^= h >>> 15; 154 155 return h; 156 } 157 158 /** 159 * Generates a 32-bit hash from a string with a default seed. 160 * <p> 161 * Before 1.14 the string was converted using default encoding. 162 * Since 1.14 the string is converted to bytes using UTF-8 encoding. 163 * </p> 164 * This is a helper method that will produce the same result as: 165 * 166 * <pre> 167 * int seed = 0x9747b28c; 168 * byte[] bytes = data.getBytes(StandardCharsets.UTF_8); 169 * int hash = MurmurHash2.hash32(bytes, bytes.length, seed); 170 * </pre> 171 * 172 * @param text The input string 173 * @return The 32-bit hash 174 * @see #hash32(byte[], int, int) 175 */ 176 public static int hash32(final String text) { 177 final byte[] bytes = StringUtils.getBytesUtf8(text); 178 return hash32(bytes, bytes.length); 179 } 180 181 /** 182 * Generates a 32-bit hash from a substring with a default seed value. 183 * The string is converted to bytes using the default encoding. 184 * This is a helper method that will produce the same result as: 185 * 186 * <pre> 187 * int seed = 0x9747b28c; 188 * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8); 189 * int hash = MurmurHash2.hash32(bytes, bytes.length, seed); 190 * </pre> 191 * 192 * @param text The input string 193 * @param from The starting index 194 * @param length The length of the substring 195 * @return The 32-bit hash 196 * @see #hash32(byte[], int, int) 197 */ 198 public static int hash32(final String text, final int from, final int length) { 199 return hash32(text.substring(from, from + length)); 200 } 201 202 /** 203 * Generates a 64-bit hash from byte array with given length and a default seed value. 204 * This is a helper method that will produce the same result as: 205 * 206 * <pre> 207 * int seed = 0xe17a1465; 208 * int hash = MurmurHash2.hash64(data, length, seed); 209 * </pre> 210 * 211 * @param data The input byte array 212 * @param length The length of the array 213 * @return The 64-bit hash 214 * @see #hash64(byte[], int, int) 215 */ 216 public static long hash64(final byte[] data, final int length) { 217 return hash64(data, length, 0xe17a1465); 218 } 219 220 /** 221 * Generates a 64-bit hash from byte array of the given length and seed. 222 * 223 * @param data The input byte array 224 * @param length The length of the array 225 * @param seed The initial seed value 226 * @return The 64-bit hash of the given array 227 */ 228 public static long hash64(final byte[] data, final int length, final int seed) { 229 long h = seed & 0xffffffffL ^ length * M64; 230 231 final int nblocks = length >> 3; 232 233 // body 234 for (int i = 0; i < nblocks; i++) { 235 final int index = i << 3; 236 long k = getLittleEndianLong(data, index); 237 238 k *= M64; 239 k ^= k >>> R64; 240 k *= M64; 241 242 h ^= k; 243 h *= M64; 244 } 245 246 final int index = nblocks << 3; 247 switch (length - index) { 248 case 7: 249 h ^= ((long) data[index + 6] & 0xff) << 48; 250 case 6: 251 h ^= ((long) data[index + 5] & 0xff) << 40; 252 case 5: 253 h ^= ((long) data[index + 4] & 0xff) << 32; 254 case 4: 255 h ^= ((long) data[index + 3] & 0xff) << 24; 256 case 3: 257 h ^= ((long) data[index + 2] & 0xff) << 16; 258 case 2: 259 h ^= ((long) data[index + 1] & 0xff) << 8; 260 case 1: 261 h ^= (long) data[index] & 0xff; 262 h *= M64; 263 } 264 265 h ^= h >>> R64; 266 h *= M64; 267 h ^= h >>> R64; 268 269 return h; 270 } 271 272 /** 273 * Generates a 64-bit hash from a string with a default seed. 274 * <p> 275 * Before 1.14 the string was converted using default encoding. 276 * Since 1.14 the string is converted to bytes using UTF-8 encoding. 277 * </p> 278 * <p> 279 * This is a helper method that will produce the same result as: 280 * </p> 281 * 282 * <pre> 283 * int seed = 0xe17a1465; 284 * byte[] bytes = data.getBytes(StandardCharsets.UTF_8); 285 * int hash = MurmurHash2.hash64(bytes, bytes.length, seed); 286 * </pre> 287 * 288 * @param text The input string 289 * @return The 64-bit hash 290 * @see #hash64(byte[], int, int) 291 */ 292 public static long hash64(final String text) { 293 final byte[] bytes = StringUtils.getBytesUtf8(text); 294 return hash64(bytes, bytes.length); 295 } 296 297 /** 298 * Generates a 64-bit hash from a substring with a default seed value. 299 * The string is converted to bytes using the default encoding. 300 * This is a helper method that will produce the same result as: 301 * 302 * <pre> 303 * int seed = 0xe17a1465; 304 * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8); 305 * int hash = MurmurHash2.hash64(bytes, bytes.length, seed); 306 * </pre> 307 * 308 * @param text The input string 309 * @param from The starting index 310 * @param length The length of the substring 311 * @return The 64-bit hash 312 * @see #hash64(byte[], int, int) 313 */ 314 public static long hash64(final String text, final int from, final int length) { 315 return hash64(text.substring(from, from + length)); 316 } 317 318 /** No instance methods. */ 319 private MurmurHash2() { 320 } 321 }