View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.digest;
19  
20  import org.apache.commons.codec.binary.StringUtils;
21  
22  /**
23   * Implements the MurmurHash2 32-bit and 64-bit hash functions.
24   *
25   * <p>MurmurHash is a non-cryptographic hash function suitable for general
26   * hash-based lookup. The name comes from two basic operations, multiply (MU)
27   * and rotate (R), used in its inner loop. Unlike cryptographic hash functions,
28   * it is not specifically designed to be difficult to reverse by an adversary,
29   * making it unsuitable for cryptographic purposes.</p>
30   *
31   * <p>This contains a Java port of the 32-bit hash function {@code MurmurHash2}
32   * and the 64-bit hash function {@code MurmurHash64A} from Austin Appleby's
33   * original {@code c++} code in SMHasher.</p>
34   *
35   * <p>This is a re-implementation of the original C code plus some additional
36   * features.</p>
37   *
38   * <p>This is public domain code with no copyrights. From home page of
39   * <a href="https://github.com/aappleby/smhasher">SMHasher</a>:</p>
40   *
41   * <blockquote>
42   * "All MurmurHash versions are public domain software, and the author
43   * disclaims all copyright to their code."
44   * </blockquote>
45   *
46   * @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a>
47   * @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp">
48   *   Original MurmurHash2 c++ code</a>
49   * @since 1.13
50   */
51  public final class MurmurHash2 {
52  
53      // Constants for 32-bit variant
54      private static final int M32 = 0x5bd1e995;
55      private static final int R32 = 24;
56  
57      // Constants for 64-bit variant
58      private static final long M64 = 0xc6a4a7935bd1e995L;
59      private static final int R64 = 47;
60  
61      /**
62       * Gets the little-endian int from 4 bytes starting at the specified index.
63       *
64       * @param data The data
65       * @param index The index
66       * @return The little-endian int
67       */
68      private static int getLittleEndianInt(final byte[] data, final int index) {
69          return data[index    ] & 0xff |
70                 (data[index + 1] & 0xff) <<  8 |
71                 (data[index + 2] & 0xff) << 16 |
72                 (data[index + 3] & 0xff) << 24;
73      }
74  
75      /**
76       * Gets the little-endian long from 8 bytes starting at the specified index.
77       *
78       * @param data The data
79       * @param index The index
80       * @return The little-endian long
81       */
82      private static long getLittleEndianLong(final byte[] data, final int index) {
83          return (long) data[index    ] & 0xff |
84                 ((long) data[index + 1] & 0xff) <<  8 |
85                 ((long) data[index + 2] & 0xff) << 16 |
86                 ((long) data[index + 3] & 0xff) << 24 |
87                 ((long) data[index + 4] & 0xff) << 32 |
88                 ((long) data[index + 5] & 0xff) << 40 |
89                 ((long) data[index + 6] & 0xff) << 48 |
90                 ((long) data[index + 7] & 0xff) << 56;
91      }
92  
93      /**
94       * Generates a 32-bit hash from byte array with the given length and a default seed value.
95       * This is a helper method that will produce the same result as:
96       *
97       * <pre>
98       * int seed = 0x9747b28c;
99       * int hash = MurmurHash2.hash32(data, length, seed);
100      * </pre>
101      *
102      * @param data The input byte array
103      * @param length The length of the array
104      * @return The 32-bit hash
105      * @see #hash32(byte[], int, int)
106      */
107     public static int hash32(final byte[] data, final int length) {
108         return hash32(data, length, 0x9747b28c);
109     }
110 
111     /**
112      * Generates a 32-bit hash from byte array with the given length and seed.
113      *
114      * @param data The input byte array
115      * @param length The length of the array
116      * @param seed The initial seed value
117      * @return The 32-bit hash
118      */
119     public static int hash32(final byte[] data, final int length, final int seed) {
120         // Initialize the hash to a random value
121         int h = seed ^ length;
122 
123         // Mix 4 bytes at a time into the hash
124         final int nblocks = length >> 2;
125 
126         // body
127         for (int i = 0; i < nblocks; i++) {
128             final int index = i << 2;
129             int k = getLittleEndianInt(data, index);
130             k *= M32;
131             k ^= k >>> R32;
132             k *= M32;
133             h *= M32;
134             h ^= k;
135         }
136 
137         // Handle the last few bytes of the input array
138         final int index = nblocks << 2;
139         switch (length - index) {
140         case 3:
141             h ^= (data[index + 2] & 0xff) << 16;
142         case 2:
143             h ^= (data[index + 1] & 0xff) << 8;
144         case 1:
145             h ^= data[index] & 0xff;
146             h *= M32;
147         }
148 
149         // Do a few final mixes of the hash to ensure the last few
150         // bytes are well-incorporated.
151         h ^= h >>> 13;
152         h *= M32;
153         h ^= h >>> 15;
154 
155         return h;
156     }
157 
158     /**
159      * Generates a 32-bit hash from a string with a default seed.
160      * <p>
161      * Before 1.14 the string was converted using default encoding.
162      * Since 1.14 the string is converted to bytes using UTF-8 encoding.
163      * </p>
164      * This is a helper method that will produce the same result as:
165      *
166      * <pre>
167      * int seed = 0x9747b28c;
168      * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
169      * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
170      * </pre>
171      *
172      * @param text The input string
173      * @return The 32-bit hash
174      * @see #hash32(byte[], int, int)
175      */
176     public static int hash32(final String text) {
177         final byte[] bytes = StringUtils.getBytesUtf8(text);
178         return hash32(bytes, bytes.length);
179     }
180 
181     /**
182      * Generates a 32-bit hash from a substring with a default seed value.
183      * The string is converted to bytes using the default encoding.
184      * This is a helper method that will produce the same result as:
185      *
186      * <pre>
187      * int seed = 0x9747b28c;
188      * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
189      * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
190      * </pre>
191      *
192      * @param text The input string
193      * @param from The starting index
194      * @param length The length of the substring
195      * @return The 32-bit hash
196      * @see #hash32(byte[], int, int)
197      */
198     public static int hash32(final String text, final int from, final int length) {
199         return hash32(text.substring(from, from + length));
200     }
201 
202     /**
203      * Generates a 64-bit hash from byte array with given length and a default seed value.
204      * This is a helper method that will produce the same result as:
205      *
206      * <pre>
207      * int seed = 0xe17a1465;
208      * int hash = MurmurHash2.hash64(data, length, seed);
209      * </pre>
210      *
211      * @param data The input byte array
212      * @param length The length of the array
213      * @return The 64-bit hash
214      * @see #hash64(byte[], int, int)
215      */
216     public static long hash64(final byte[] data, final int length) {
217         return hash64(data, length, 0xe17a1465);
218     }
219 
220     /**
221      * Generates a 64-bit hash from byte array of the given length and seed.
222      *
223      * @param data The input byte array
224      * @param length The length of the array
225      * @param seed The initial seed value
226      * @return The 64-bit hash of the given array
227      */
228     public static long hash64(final byte[] data, final int length, final int seed) {
229         long h = seed & 0xffffffffL ^ length * M64;
230 
231         final int nblocks = length >> 3;
232 
233         // body
234         for (int i = 0; i < nblocks; i++) {
235             final int index = i << 3;
236             long k = getLittleEndianLong(data, index);
237 
238             k *= M64;
239             k ^= k >>> R64;
240             k *= M64;
241 
242             h ^= k;
243             h *= M64;
244         }
245 
246         final int index = nblocks << 3;
247         switch (length - index) {
248         case 7:
249             h ^= ((long) data[index + 6] & 0xff) << 48;
250         case 6:
251             h ^= ((long) data[index + 5] & 0xff) << 40;
252         case 5:
253             h ^= ((long) data[index + 4] & 0xff) << 32;
254         case 4:
255             h ^= ((long) data[index + 3] & 0xff) << 24;
256         case 3:
257             h ^= ((long) data[index + 2] & 0xff) << 16;
258         case 2:
259             h ^= ((long) data[index + 1] & 0xff) << 8;
260         case 1:
261             h ^= (long) data[index] & 0xff;
262             h *= M64;
263         }
264 
265         h ^= h >>> R64;
266         h *= M64;
267         h ^= h >>> R64;
268 
269         return h;
270     }
271 
272     /**
273      * Generates a 64-bit hash from a string with a default seed.
274      * <p>
275      * Before 1.14 the string was converted using default encoding.
276      * Since 1.14 the string is converted to bytes using UTF-8 encoding.
277      * </p>
278      * <p>
279      * This is a helper method that will produce the same result as:
280      * </p>
281      *
282      * <pre>
283      * int seed = 0xe17a1465;
284      * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
285      * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
286      * </pre>
287      *
288      * @param text The input string
289      * @return The 64-bit hash
290      * @see #hash64(byte[], int, int)
291      */
292     public static long hash64(final String text) {
293         final byte[] bytes = StringUtils.getBytesUtf8(text);
294         return hash64(bytes, bytes.length);
295     }
296 
297     /**
298      * Generates a 64-bit hash from a substring with a default seed value.
299      * The string is converted to bytes using the default encoding.
300      * This is a helper method that will produce the same result as:
301      *
302      * <pre>
303      * int seed = 0xe17a1465;
304      * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
305      * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
306      * </pre>
307      *
308      * @param text The input string
309      * @param from The starting index
310      * @param length The length of the substring
311      * @return The 64-bit hash
312      * @see #hash64(byte[], int, int)
313      */
314     public static long hash64(final String text, final int from, final int length) {
315         return hash64(text.substring(from, from + length));
316     }
317 
318     /** No instance methods. */
319     private MurmurHash2() {
320     }
321 }