001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.digest; 019 020import org.apache.commons.codec.binary.StringUtils; 021 022/** 023 * Implements the MurmurHash2 32-bit and 64-bit hash functions. 024 * 025 * <p>MurmurHash is a non-cryptographic hash function suitable for general 026 * hash-based lookup. The name comes from two basic operations, multiply (MU) 027 * and rotate (R), used in its inner loop. Unlike cryptographic hash functions, 028 * it is not specifically designed to be difficult to reverse by an adversary, 029 * making it unsuitable for cryptographic purposes.</p> 030 * 031 * <p>This contains a Java port of the 32-bit hash function {@code MurmurHash2} 032 * and the 64-bit hash function {@code MurmurHash64A} from Austin Appleby's 033 * original {@code c++} code in SMHasher.</p> 034 * 035 * <p>This is a re-implementation of the original C code plus some additional 036 * features.</p> 037 * 038 * <p>This is public domain code with no copyrights. From home page of 039 * <a href="https://github.com/aappleby/smhasher">SMHasher</a>:</p> 040 * 041 * <blockquote> 042 * "All MurmurHash versions are public domain software, and the author 043 * disclaims all copyright to their code." 044 * </blockquote> 045 * 046 * @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a> 047 * @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp"> 048 * Original MurmurHash2 c++ code</a> 049 * @since 1.13 050 */ 051public final class MurmurHash2 { 052 053 // Constants for 32-bit variant 054 private static final int M32 = 0x5bd1e995; 055 private static final int R32 = 24; 056 057 // Constants for 64-bit variant 058 private static final long M64 = 0xc6a4a7935bd1e995L; 059 private static final int R64 = 47; 060 061 /** 062 * Gets the little-endian int from 4 bytes starting at the specified index. 063 * 064 * @param data The data 065 * @param index The index 066 * @return The little-endian int 067 */ 068 private static int getLittleEndianInt(final byte[] data, final int index) { 069 return data[index ] & 0xff | 070 (data[index + 1] & 0xff) << 8 | 071 (data[index + 2] & 0xff) << 16 | 072 (data[index + 3] & 0xff) << 24; 073 } 074 075 /** 076 * Gets the little-endian long from 8 bytes starting at the specified index. 077 * 078 * @param data The data 079 * @param index The index 080 * @return The little-endian long 081 */ 082 private static long getLittleEndianLong(final byte[] data, final int index) { 083 return (long) data[index ] & 0xff | 084 ((long) data[index + 1] & 0xff) << 8 | 085 ((long) data[index + 2] & 0xff) << 16 | 086 ((long) data[index + 3] & 0xff) << 24 | 087 ((long) data[index + 4] & 0xff) << 32 | 088 ((long) data[index + 5] & 0xff) << 40 | 089 ((long) data[index + 6] & 0xff) << 48 | 090 ((long) data[index + 7] & 0xff) << 56; 091 } 092 093 /** 094 * Generates a 32-bit hash from byte array with the given length and a default seed value. 095 * This is a helper method that will produce the same result as: 096 * 097 * <pre> 098 * int seed = 0x9747b28c; 099 * int hash = MurmurHash2.hash32(data, length, seed); 100 * </pre> 101 * 102 * @param data The input byte array 103 * @param length The length of the array 104 * @return The 32-bit hash 105 * @see #hash32(byte[], int, int) 106 */ 107 public static int hash32(final byte[] data, final int length) { 108 return hash32(data, length, 0x9747b28c); 109 } 110 111 /** 112 * Generates a 32-bit hash from byte array with the given length and seed. 113 * 114 * @param data The input byte array 115 * @param length The length of the array 116 * @param seed The initial seed value 117 * @return The 32-bit hash 118 */ 119 public static int hash32(final byte[] data, final int length, final int seed) { 120 // Initialize the hash to a random value 121 int h = seed ^ length; 122 123 // Mix 4 bytes at a time into the hash 124 final int nblocks = length >> 2; 125 126 // body 127 for (int i = 0; i < nblocks; i++) { 128 final int index = i << 2; 129 int k = getLittleEndianInt(data, index); 130 k *= M32; 131 k ^= k >>> R32; 132 k *= M32; 133 h *= M32; 134 h ^= k; 135 } 136 137 // Handle the last few bytes of the input array 138 final int index = nblocks << 2; 139 switch (length - index) { 140 case 3: 141 h ^= (data[index + 2] & 0xff) << 16; 142 case 2: 143 h ^= (data[index + 1] & 0xff) << 8; 144 case 1: 145 h ^= data[index] & 0xff; 146 h *= M32; 147 } 148 149 // Do a few final mixes of the hash to ensure the last few 150 // bytes are well-incorporated. 151 h ^= h >>> 13; 152 h *= M32; 153 h ^= h >>> 15; 154 155 return h; 156 } 157 158 /** 159 * Generates a 32-bit hash from a string with a default seed. 160 * <p> 161 * Before 1.14 the string was converted using default encoding. 162 * Since 1.14 the string is converted to bytes using UTF-8 encoding. 163 * </p> 164 * This is a helper method that will produce the same result as: 165 * 166 * <pre> 167 * int seed = 0x9747b28c; 168 * byte[] bytes = data.getBytes(StandardCharsets.UTF_8); 169 * int hash = MurmurHash2.hash32(bytes, bytes.length, seed); 170 * </pre> 171 * 172 * @param text The input string 173 * @return The 32-bit hash 174 * @see #hash32(byte[], int, int) 175 */ 176 public static int hash32(final String text) { 177 final byte[] bytes = StringUtils.getBytesUtf8(text); 178 return hash32(bytes, bytes.length); 179 } 180 181 /** 182 * Generates a 32-bit hash from a substring with a default seed value. 183 * The string is converted to bytes using the default encoding. 184 * This is a helper method that will produce the same result as: 185 * 186 * <pre> 187 * int seed = 0x9747b28c; 188 * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8); 189 * int hash = MurmurHash2.hash32(bytes, bytes.length, seed); 190 * </pre> 191 * 192 * @param text The input string 193 * @param from The starting index 194 * @param length The length of the substring 195 * @return The 32-bit hash 196 * @see #hash32(byte[], int, int) 197 */ 198 public static int hash32(final String text, final int from, final int length) { 199 return hash32(text.substring(from, from + length)); 200 } 201 202 /** 203 * Generates a 64-bit hash from byte array with given length and a default seed value. 204 * This is a helper method that will produce the same result as: 205 * 206 * <pre> 207 * int seed = 0xe17a1465; 208 * int hash = MurmurHash2.hash64(data, length, seed); 209 * </pre> 210 * 211 * @param data The input byte array 212 * @param length The length of the array 213 * @return The 64-bit hash 214 * @see #hash64(byte[], int, int) 215 */ 216 public static long hash64(final byte[] data, final int length) { 217 return hash64(data, length, 0xe17a1465); 218 } 219 220 /** 221 * Generates a 64-bit hash from byte array of the given length and seed. 222 * 223 * @param data The input byte array 224 * @param length The length of the array 225 * @param seed The initial seed value 226 * @return The 64-bit hash of the given array 227 */ 228 public static long hash64(final byte[] data, final int length, final int seed) { 229 long h = seed & 0xffffffffL ^ length * M64; 230 231 final int nblocks = length >> 3; 232 233 // body 234 for (int i = 0; i < nblocks; i++) { 235 final int index = i << 3; 236 long k = getLittleEndianLong(data, index); 237 238 k *= M64; 239 k ^= k >>> R64; 240 k *= M64; 241 242 h ^= k; 243 h *= M64; 244 } 245 246 final int index = nblocks << 3; 247 switch (length - index) { 248 case 7: 249 h ^= ((long) data[index + 6] & 0xff) << 48; 250 case 6: 251 h ^= ((long) data[index + 5] & 0xff) << 40; 252 case 5: 253 h ^= ((long) data[index + 4] & 0xff) << 32; 254 case 4: 255 h ^= ((long) data[index + 3] & 0xff) << 24; 256 case 3: 257 h ^= ((long) data[index + 2] & 0xff) << 16; 258 case 2: 259 h ^= ((long) data[index + 1] & 0xff) << 8; 260 case 1: 261 h ^= (long) data[index] & 0xff; 262 h *= M64; 263 } 264 265 h ^= h >>> R64; 266 h *= M64; 267 h ^= h >>> R64; 268 269 return h; 270 } 271 272 /** 273 * Generates a 64-bit hash from a string with a default seed. 274 * <p> 275 * Before 1.14 the string was converted using default encoding. 276 * Since 1.14 the string is converted to bytes using UTF-8 encoding. 277 * </p> 278 * <p> 279 * This is a helper method that will produce the same result as: 280 * </p> 281 * 282 * <pre> 283 * int seed = 0xe17a1465; 284 * byte[] bytes = data.getBytes(StandardCharsets.UTF_8); 285 * int hash = MurmurHash2.hash64(bytes, bytes.length, seed); 286 * </pre> 287 * 288 * @param text The input string 289 * @return The 64-bit hash 290 * @see #hash64(byte[], int, int) 291 */ 292 public static long hash64(final String text) { 293 final byte[] bytes = StringUtils.getBytesUtf8(text); 294 return hash64(bytes, bytes.length); 295 } 296 297 /** 298 * Generates a 64-bit hash from a substring with a default seed value. 299 * The string is converted to bytes using the default encoding. 300 * This is a helper method that will produce the same result as: 301 * 302 * <pre> 303 * int seed = 0xe17a1465; 304 * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8); 305 * int hash = MurmurHash2.hash64(bytes, bytes.length, seed); 306 * </pre> 307 * 308 * @param text The input string 309 * @param from The starting index 310 * @param length The length of the substring 311 * @return The 64-bit hash 312 * @see #hash64(byte[], int, int) 313 */ 314 public static long hash64(final String text, final int from, final int length) { 315 return hash64(text.substring(from, from + length)); 316 } 317 318 /** No instance methods. */ 319 private MurmurHash2() { 320 } 321}