001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.digest;
019
020import org.apache.commons.codec.binary.StringUtils;
021
022/**
023 * Implements the MurmurHash2 32-bit and 64-bit hash functions.
024 *
025 * <p>MurmurHash is a non-cryptographic hash function suitable for general
026 * hash-based lookup. The name comes from two basic operations, multiply (MU)
027 * and rotate (R), used in its inner loop. Unlike cryptographic hash functions,
028 * it is not specifically designed to be difficult to reverse by an adversary,
029 * making it unsuitable for cryptographic purposes.</p>
030 *
031 * <p>This contains a Java port of the 32-bit hash function {@code MurmurHash2}
032 * and the 64-bit hash function {@code MurmurHash64A} from Austin Appleby's
033 * original {@code c++} code in SMHasher.</p>
034 *
035 * <p>This is a re-implementation of the original C code plus some additional
036 * features.</p>
037 *
038 * <p>This is public domain code with no copyrights. From home page of
039 * <a href="https://github.com/aappleby/smhasher">SMHasher</a>:</p>
040 *
041 * <blockquote>
042 * "All MurmurHash versions are public domain software, and the author
043 * disclaims all copyright to their code."
044 * </blockquote>
045 *
046 * @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a>
047 * @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp">
048 *   Original MurmurHash2 c++ code</a>
049 * @since 1.13
050 */
051public final class MurmurHash2 {
052
053    // Constants for 32-bit variant
054    private static final int M32 = 0x5bd1e995;
055    private static final int R32 = 24;
056
057    // Constants for 64-bit variant
058    private static final long M64 = 0xc6a4a7935bd1e995L;
059    private static final int R64 = 47;
060
061    /**
062     * Gets the little-endian int from 4 bytes starting at the specified index.
063     *
064     * @param data The data
065     * @param index The index
066     * @return The little-endian int
067     */
068    private static int getLittleEndianInt(final byte[] data, final int index) {
069        return data[index    ] & 0xff |
070               (data[index + 1] & 0xff) <<  8 |
071               (data[index + 2] & 0xff) << 16 |
072               (data[index + 3] & 0xff) << 24;
073    }
074
075    /**
076     * Gets the little-endian long from 8 bytes starting at the specified index.
077     *
078     * @param data The data
079     * @param index The index
080     * @return The little-endian long
081     */
082    private static long getLittleEndianLong(final byte[] data, final int index) {
083        return (long) data[index    ] & 0xff |
084               ((long) data[index + 1] & 0xff) <<  8 |
085               ((long) data[index + 2] & 0xff) << 16 |
086               ((long) data[index + 3] & 0xff) << 24 |
087               ((long) data[index + 4] & 0xff) << 32 |
088               ((long) data[index + 5] & 0xff) << 40 |
089               ((long) data[index + 6] & 0xff) << 48 |
090               ((long) data[index + 7] & 0xff) << 56;
091    }
092
093    /**
094     * Generates a 32-bit hash from byte array with the given length and a default seed value.
095     * This is a helper method that will produce the same result as:
096     *
097     * <pre>
098     * int seed = 0x9747b28c;
099     * int hash = MurmurHash2.hash32(data, length, seed);
100     * </pre>
101     *
102     * @param data The input byte array
103     * @param length The length of the array
104     * @return The 32-bit hash
105     * @see #hash32(byte[], int, int)
106     */
107    public static int hash32(final byte[] data, final int length) {
108        return hash32(data, length, 0x9747b28c);
109    }
110
111    /**
112     * Generates a 32-bit hash from byte array with the given length and seed.
113     *
114     * @param data The input byte array
115     * @param length The length of the array
116     * @param seed The initial seed value
117     * @return The 32-bit hash
118     */
119    public static int hash32(final byte[] data, final int length, final int seed) {
120        // Initialize the hash to a random value
121        int h = seed ^ length;
122
123        // Mix 4 bytes at a time into the hash
124        final int nblocks = length >> 2;
125
126        // body
127        for (int i = 0; i < nblocks; i++) {
128            final int index = i << 2;
129            int k = getLittleEndianInt(data, index);
130            k *= M32;
131            k ^= k >>> R32;
132            k *= M32;
133            h *= M32;
134            h ^= k;
135        }
136
137        // Handle the last few bytes of the input array
138        final int index = nblocks << 2;
139        switch (length - index) {
140        case 3:
141            h ^= (data[index + 2] & 0xff) << 16;
142        case 2:
143            h ^= (data[index + 1] & 0xff) << 8;
144        case 1:
145            h ^= data[index] & 0xff;
146            h *= M32;
147        }
148
149        // Do a few final mixes of the hash to ensure the last few
150        // bytes are well-incorporated.
151        h ^= h >>> 13;
152        h *= M32;
153        h ^= h >>> 15;
154
155        return h;
156    }
157
158    /**
159     * Generates a 32-bit hash from a string with a default seed.
160     * <p>
161     * Before 1.14 the string was converted using default encoding.
162     * Since 1.14 the string is converted to bytes using UTF-8 encoding.
163     * </p>
164     * This is a helper method that will produce the same result as:
165     *
166     * <pre>
167     * int seed = 0x9747b28c;
168     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
169     * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
170     * </pre>
171     *
172     * @param text The input string
173     * @return The 32-bit hash
174     * @see #hash32(byte[], int, int)
175     */
176    public static int hash32(final String text) {
177        final byte[] bytes = StringUtils.getBytesUtf8(text);
178        return hash32(bytes, bytes.length);
179    }
180
181    /**
182     * Generates a 32-bit hash from a substring with a default seed value.
183     * The string is converted to bytes using the default encoding.
184     * This is a helper method that will produce the same result as:
185     *
186     * <pre>
187     * int seed = 0x9747b28c;
188     * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
189     * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
190     * </pre>
191     *
192     * @param text The input string
193     * @param from The starting index
194     * @param length The length of the substring
195     * @return The 32-bit hash
196     * @see #hash32(byte[], int, int)
197     */
198    public static int hash32(final String text, final int from, final int length) {
199        return hash32(text.substring(from, from + length));
200    }
201
202    /**
203     * Generates a 64-bit hash from byte array with given length and a default seed value.
204     * This is a helper method that will produce the same result as:
205     *
206     * <pre>
207     * int seed = 0xe17a1465;
208     * int hash = MurmurHash2.hash64(data, length, seed);
209     * </pre>
210     *
211     * @param data The input byte array
212     * @param length The length of the array
213     * @return The 64-bit hash
214     * @see #hash64(byte[], int, int)
215     */
216    public static long hash64(final byte[] data, final int length) {
217        return hash64(data, length, 0xe17a1465);
218    }
219
220    /**
221     * Generates a 64-bit hash from byte array of the given length and seed.
222     *
223     * @param data The input byte array
224     * @param length The length of the array
225     * @param seed The initial seed value
226     * @return The 64-bit hash of the given array
227     */
228    public static long hash64(final byte[] data, final int length, final int seed) {
229        long h = seed & 0xffffffffL ^ length * M64;
230
231        final int nblocks = length >> 3;
232
233        // body
234        for (int i = 0; i < nblocks; i++) {
235            final int index = i << 3;
236            long k = getLittleEndianLong(data, index);
237
238            k *= M64;
239            k ^= k >>> R64;
240            k *= M64;
241
242            h ^= k;
243            h *= M64;
244        }
245
246        final int index = nblocks << 3;
247        switch (length - index) {
248        case 7:
249            h ^= ((long) data[index + 6] & 0xff) << 48;
250        case 6:
251            h ^= ((long) data[index + 5] & 0xff) << 40;
252        case 5:
253            h ^= ((long) data[index + 4] & 0xff) << 32;
254        case 4:
255            h ^= ((long) data[index + 3] & 0xff) << 24;
256        case 3:
257            h ^= ((long) data[index + 2] & 0xff) << 16;
258        case 2:
259            h ^= ((long) data[index + 1] & 0xff) << 8;
260        case 1:
261            h ^= (long) data[index] & 0xff;
262            h *= M64;
263        }
264
265        h ^= h >>> R64;
266        h *= M64;
267        h ^= h >>> R64;
268
269        return h;
270    }
271
272    /**
273     * Generates a 64-bit hash from a string with a default seed.
274     * <p>
275     * Before 1.14 the string was converted using default encoding.
276     * Since 1.14 the string is converted to bytes using UTF-8 encoding.
277     * </p>
278     * <p>
279     * This is a helper method that will produce the same result as:
280     * </p>
281     *
282     * <pre>
283     * int seed = 0xe17a1465;
284     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
285     * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
286     * </pre>
287     *
288     * @param text The input string
289     * @return The 64-bit hash
290     * @see #hash64(byte[], int, int)
291     */
292    public static long hash64(final String text) {
293        final byte[] bytes = StringUtils.getBytesUtf8(text);
294        return hash64(bytes, bytes.length);
295    }
296
297    /**
298     * Generates a 64-bit hash from a substring with a default seed value.
299     * The string is converted to bytes using the default encoding.
300     * This is a helper method that will produce the same result as:
301     *
302     * <pre>
303     * int seed = 0xe17a1465;
304     * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
305     * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
306     * </pre>
307     *
308     * @param text The input string
309     * @param from The starting index
310     * @param length The length of the substring
311     * @return The 64-bit hash
312     * @see #hash64(byte[], int, int)
313     */
314    public static long hash64(final String text, final int from, final int length) {
315        return hash64(text.substring(from, from + length));
316    }
317
318    /** No instance methods. */
319    private MurmurHash2() {
320    }
321}