001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.digest;
019
020import org.apache.commons.codec.binary.StringUtils;
021
022/**
023 * Implementation of the MurmurHash2 32-bit and 64-bit hash functions.
024 *
025 * <p>MurmurHash is a non-cryptographic hash function suitable for general
026 * hash-based lookup. The name comes from two basic operations, multiply (MU)
027 * and rotate (R), used in its inner loop. Unlike cryptographic hash functions,
028 * it is not specifically designed to be difficult to reverse by an adversary,
029 * making it unsuitable for cryptographic purposes.</p>
030 *
031 * <p>This contains a Java port of the 32-bit hash function {@code MurmurHash2}
032 * and the 64-bit hash function {@code MurmurHash64A} from Austin Applyby's
033 * original {@code c++} code in SMHasher.</p>
034 *
035 * <p>This is a re-implementation of the original C code plus some additional
036 * features.</p>
037 *
038 * <p>This is public domain code with no copyrights. From home page of
039 * <a href="https://github.com/aappleby/smhasher">SMHasher</a>:</p>
040 *
041 * <blockquote>
042 * "All MurmurHash versions are public domain software, and the author
043 * disclaims all copyright to their code."
044 * </blockquote>
045 *
046 * @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a>
047 * @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp">
048 *   Original MurmurHash2 c++ code</a>
049 * @since 1.13
050 */
051public final class MurmurHash2 {
052
053    // Constants for 32-bit variant
054    private static final int M32 = 0x5bd1e995;
055    private static final int R32 = 24;
056
057    // Constants for 64-bit variant
058    private static final long M64 = 0xc6a4a7935bd1e995L;
059    private static final int R64 = 47;
060
061    /** No instance methods. */
062    private MurmurHash2() {
063    }
064
065    /**
066     * Generates a 32-bit hash from byte array with the given length and seed.
067     *
068     * @param data The input byte array
069     * @param length The length of the array
070     * @param seed The initial seed value
071     * @return The 32-bit hash
072     */
073    public static int hash32(final byte[] data, final int length, final int seed) {
074        // Initialize the hash to a random value
075        int h = seed ^ length;
076
077        // Mix 4 bytes at a time into the hash
078        final int nblocks = length >> 2;
079
080        // body
081        for (int i = 0; i < nblocks; i++) {
082            final int index = (i << 2);
083            int k = getLittleEndianInt(data, index);
084            k *= M32;
085            k ^= k >>> R32;
086            k *= M32;
087            h *= M32;
088            h ^= k;
089        }
090
091        // Handle the last few bytes of the input array
092        final int index = (nblocks << 2);
093        switch (length - index) {
094        case 3:
095            h ^= (data[index + 2] & 0xff) << 16;
096        case 2:
097            h ^= (data[index + 1] & 0xff) << 8;
098        case 1:
099            h ^= (data[index] & 0xff);
100            h *= M32;
101        }
102
103        // Do a few final mixes of the hash to ensure the last few
104        // bytes are well-incorporated.
105        h ^= h >>> 13;
106        h *= M32;
107        h ^= h >>> 15;
108
109        return h;
110    }
111
112    /**
113     * Generates a 32-bit hash from byte array with the given length and a default seed value.
114     * This is a helper method that will produce the same result as:
115     *
116     * <pre>
117     * int seed = 0x9747b28c;
118     * int hash = MurmurHash2.hash32(data, length, seed);
119     * </pre>
120     *
121     * @param data The input byte array
122     * @param length The length of the array
123     * @return The 32-bit hash
124     * @see #hash32(byte[], int, int)
125     */
126    public static int hash32(final byte[] data, final int length) {
127        return hash32(data, length, 0x9747b28c);
128    }
129
130    /**
131     * Generates a 32-bit hash from a string with a default seed.
132     * <p>
133     * Before 1.14 the string was converted using default encoding.
134     * Since 1.14 the string is converted to bytes using UTF-8 encoding.
135     * </p>
136     * This is a helper method that will produce the same result as:
137     *
138     * <pre>
139     * int seed = 0x9747b28c;
140     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
141     * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
142     * </pre>
143     *
144     * @param text The input string
145     * @return The 32-bit hash
146     * @see #hash32(byte[], int, int)
147     */
148    public static int hash32(final String text) {
149        final byte[] bytes = StringUtils.getBytesUtf8(text);
150        return hash32(bytes, bytes.length);
151    }
152
153    /**
154     * Generates a 32-bit hash from a substring with a default seed value.
155     * The string is converted to bytes using the default encoding.
156     * This is a helper method that will produce the same result as:
157     *
158     * <pre>
159     * int seed = 0x9747b28c;
160     * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
161     * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
162     * </pre>
163     *
164     * @param text The input string
165     * @param from The starting index
166     * @param length The length of the substring
167     * @return The 32-bit hash
168     * @see #hash32(byte[], int, int)
169     */
170    public static int hash32(final String text, final int from, final int length) {
171        return hash32(text.substring(from, from + length));
172    }
173
174    /**
175     * Generates a 64-bit hash from byte array of the given length and seed.
176     *
177     * @param data The input byte array
178     * @param length The length of the array
179     * @param seed The initial seed value
180     * @return The 64-bit hash of the given array
181     */
182    public static long hash64(final byte[] data, final int length, final int seed) {
183        long h = (seed & 0xffffffffL) ^ (length * M64);
184
185        final int nblocks = length >> 3;
186
187        // body
188        for (int i = 0; i < nblocks; i++) {
189            final int index = (i << 3);
190            long k = getLittleEndianLong(data, index);
191
192            k *= M64;
193            k ^= k >>> R64;
194            k *= M64;
195
196            h ^= k;
197            h *= M64;
198        }
199
200        final int index = (nblocks << 3);
201        switch (length - index) {
202        case 7:
203            h ^= ((long) data[index + 6] & 0xff) << 48;
204        case 6:
205            h ^= ((long) data[index + 5] & 0xff) << 40;
206        case 5:
207            h ^= ((long) data[index + 4] & 0xff) << 32;
208        case 4:
209            h ^= ((long) data[index + 3] & 0xff) << 24;
210        case 3:
211            h ^= ((long) data[index + 2] & 0xff) << 16;
212        case 2:
213            h ^= ((long) data[index + 1] & 0xff) << 8;
214        case 1:
215            h ^= ((long) data[index] & 0xff);
216            h *= M64;
217        }
218
219        h ^= h >>> R64;
220        h *= M64;
221        h ^= h >>> R64;
222
223        return h;
224    }
225
226    /**
227     * Generates a 64-bit hash from byte array with given length and a default seed value.
228     * This is a helper method that will produce the same result as:
229     *
230     * <pre>
231     * int seed = 0xe17a1465;
232     * int hash = MurmurHash2.hash64(data, length, seed);
233     * </pre>
234     *
235     * @param data The input byte array
236     * @param length The length of the array
237     * @return The 64-bit hash
238     * @see #hash64(byte[], int, int)
239     */
240    public static long hash64(final byte[] data, final int length) {
241        return hash64(data, length, 0xe17a1465);
242    }
243
244    /**
245     * Generates a 64-bit hash from a string with a default seed.
246     * <p>
247     * Before 1.14 the string was converted using default encoding.
248     * Since 1.14 the string is converted to bytes using UTF-8 encoding.
249     * </p>
250     * This is a helper method that will produce the same result as:
251     *
252     * <pre>
253     * int seed = 0xe17a1465;
254     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
255     * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
256     * </pre>
257     *
258     * @param text The input string
259     * @return The 64-bit hash
260     * @see #hash64(byte[], int, int)
261     */
262    public static long hash64(final String text) {
263        final byte[] bytes = StringUtils.getBytesUtf8(text);
264        return hash64(bytes, bytes.length);
265    }
266
267    /**
268     * Generates a 64-bit hash from a substring with a default seed value.
269     * The string is converted to bytes using the default encoding.
270     * This is a helper method that will produce the same result as:
271     *
272     * <pre>
273     * int seed = 0xe17a1465;
274     * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
275     * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
276     * </pre>
277     *
278     * @param text The The input string
279     * @param from The starting index
280     * @param length The length of the substring
281     * @return The 64-bit hash
282     * @see #hash64(byte[], int, int)
283     */
284    public static long hash64(final String text, final int from, final int length) {
285        return hash64(text.substring(from, from + length));
286    }
287
288    /**
289     * Gets the little-endian int from 4 bytes starting at the specified index.
290     *
291     * @param data The data
292     * @param index The index
293     * @return The little-endian int
294     */
295    private static int getLittleEndianInt(final byte[] data, final int index) {
296        return ((data[index    ] & 0xff)      ) |
297               ((data[index + 1] & 0xff) <<  8) |
298               ((data[index + 2] & 0xff) << 16) |
299               ((data[index + 3] & 0xff) << 24);
300    }
301
302    /**
303     * Gets the little-endian long from 8 bytes starting at the specified index.
304     *
305     * @param data The data
306     * @param index The index
307     * @return The little-endian long
308     */
309    private static long getLittleEndianLong(final byte[] data, final int index) {
310        return (((long) data[index    ] & 0xff)      ) |
311               (((long) data[index + 1] & 0xff) <<  8) |
312               (((long) data[index + 2] & 0xff) << 16) |
313               (((long) data[index + 3] & 0xff) << 24) |
314               (((long) data[index + 4] & 0xff) << 32) |
315               (((long) data[index + 5] & 0xff) << 40) |
316               (((long) data[index + 6] & 0xff) << 48) |
317               (((long) data[index + 7] & 0xff) << 56);
318    }
319}