001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.digest;
019
020import org.apache.commons.codec.binary.StringUtils;
021
022/**
023 * Implementation of the MurmurHash3 32-bit and 128-bit hash functions.
024 *
025 * <p>
026 * MurmurHash is a non-cryptographic hash function suitable for general hash-based lookup. The name comes from two basic
027 * operations, multiply (MU) and rotate (R), used in its inner loop. Unlike cryptographic hash functions, it is not
028 * specifically designed to be difficult to reverse by an adversary, making it unsuitable for cryptographic purposes.
029 * </p>
030 *
031 * <p>
032 * This contains a Java port of the 32-bit hash function {@code MurmurHash3_x86_32} and the 128-bit hash function
033 * {@code MurmurHash3_x64_128} from Austin Applyby's original {@code c++} code in SMHasher.
034 * </p>
035 *
036 * <p>
037 * This is public domain code with no copyrights. From home page of
038 * <a href="https://github.com/aappleby/smhasher">SMHasher</a>:
039 * </p>
040 *
041 * <blockquote> "All MurmurHash versions are public domain software, and the author disclaims all copyright to their
042 * code." </blockquote>
043 *
044 * <p>
045 * Original adaption from Apache Hive. That adaption contains a {@code hash64} method that is not part of the original
046 * MurmurHash3 code. It is not recommended to use these methods. They will be removed in a future release. To obtain a
047 * 64-bit hash use half of the bits from the {@code hash128x64} methods using the input data converted to bytes.
048 * <p>
049 *
050 * @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a>
051 * @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp"> Original MurmurHash3 c++
052 *      code</a>
053 * @see <a href=
054 *      "https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hive/common/util/Murmur3.java">
055 *      Apache Hive Murmer3</a>
056 * @since 1.13
057 */
058public final class MurmurHash3 {
059
060    /**
061     * A random number to use for a hash code.
062     *
063     * @deprecated This is not used internally and will be removed in a future release.
064     */
065    @Deprecated
066    public static final long NULL_HASHCODE = 2862933555777941757L;
067
068    /**
069     * A default seed to use for the murmur hash algorithm.
070     * Has the value {@code 104729}.
071     */
072    public static final int DEFAULT_SEED = 104729;
073
074    /** TODO Replace on Java 8 with Long.BYTES. */
075    static final int LONG_BYTES = Long.SIZE / Byte.SIZE;
076
077    /** TODO Replace on Java 8 with Integer.BYTES. */
078    static final int INTEGER_BYTES = Integer.SIZE / Byte.SIZE;
079
080    /** TODO Replace on Java 8 with Short.BYTES. */
081    static final int SHORT_BYTES = Short.SIZE / Byte.SIZE;
082
083    // Constants for 32-bit variant
084    private static final int C1_32 = 0xcc9e2d51;
085    private static final int C2_32 = 0x1b873593;
086    private static final int R1_32 = 15;
087    private static final int R2_32 = 13;
088    private static final int M_32 = 5;
089    private static final int N_32 = 0xe6546b64;
090
091    // Constants for 128-bit variant
092    private static final long C1 = 0x87c37b91114253d5L;
093    private static final long C2 = 0x4cf5ad432745937fL;
094    private static final int R1 = 31;
095    private static final int R2 = 27;
096    private static final int R3 = 33;
097    private static final int M = 5;
098    private static final int N1 = 0x52dce729;
099    private static final int N2 = 0x38495ab5;
100
101    /** No instance methods. */
102    private MurmurHash3() {
103    }
104
105    /**
106     * Generates 32-bit hash from two longs with a default seed value.
107     * This is a helper method that will produce the same result as:
108     *
109     * <pre>
110     * int offset = 0;
111     * int seed = 104729;
112     * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(16)
113     *                                            .putLong(data1)
114     *                                            .putLong(data2)
115     *                                            .array(), offset, 16, seed);
116     * </pre>
117     *
118     * @param data1 The first long to hash
119     * @param data2 The second long to hash
120     * @return The 32-bit hash
121     * @see #hash32x86(byte[], int, int, int)
122     */
123    public static int hash32(final long data1, final long data2) {
124        return hash32(data1, data2, DEFAULT_SEED);
125    }
126
127    /**
128     * Generates 32-bit hash from two longs with the given seed.
129     * This is a helper method that will produce the same result as:
130     *
131     * <pre>
132     * int offset = 0;
133     * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(16)
134     *                                            .putLong(data1)
135     *                                            .putLong(data2)
136     *                                            .array(), offset, 16, seed);
137     * </pre>
138     *
139     * @param data1 The first long to hash
140     * @param data2 The second long to hash
141     * @param seed The initial seed value
142     * @return The 32-bit hash
143     * @see #hash32x86(byte[], int, int, int)
144     */
145    public static int hash32(final long data1, final long data2, final int seed) {
146        int hash = seed;
147        final long r0 = Long.reverseBytes(data1);
148        final long r1 = Long.reverseBytes(data2);
149
150        hash = mix32((int) r0, hash);
151        hash = mix32((int) (r0 >>> 32), hash);
152        hash = mix32((int) (r1), hash);
153        hash = mix32((int) (r1 >>> 32), hash);
154
155        hash ^= LONG_BYTES * 2;
156        return fmix32(hash);
157    }
158
159    /**
160     * Generates 32-bit hash from a long with a default seed value.
161     * This is a helper method that will produce the same result as:
162     *
163     * <pre>
164     * int offset = 0;
165     * int seed = 104729;
166     * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(8)
167     *                                            .putLong(data)
168     *                                            .array(), offset, 8, seed);
169     * </pre>
170     *
171     * @param data The long to hash
172     * @return The 32-bit hash
173     * @see #hash32x86(byte[], int, int, int)
174     */
175    public static int hash32(final long data) {
176        return hash32(data, DEFAULT_SEED);
177    }
178
179    /**
180     * Generates 32-bit hash from a long with the given seed.
181     * This is a helper method that will produce the same result as:
182     *
183     * <pre>
184     * int offset = 0;
185     * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(8)
186     *                                            .putLong(data)
187     *                                            .array(), offset, 8, seed);
188     * </pre>
189     *
190     * @param data The long to hash
191     * @param seed The initial seed value
192     * @return The 32-bit hash
193     * @see #hash32x86(byte[], int, int, int)
194     */
195    public static int hash32(final long data, final int seed) {
196        int hash = seed;
197        final long r0 = Long.reverseBytes(data);
198
199        hash = mix32((int) r0, hash);
200        hash = mix32((int) (r0 >>> 32), hash);
201
202        hash ^= LONG_BYTES;
203        return fmix32(hash);
204    }
205
206    /**
207     * Generates 32-bit hash from the byte array with a default seed.
208     * This is a helper method that will produce the same result as:
209     *
210     * <pre>
211     * int offset = 0;
212     * int seed = 104729;
213     * int hash = MurmurHash3.hash32(data, offset, data.length, seed);
214     * </pre>
215     *
216     * <p>This implementation contains a sign-extension bug in the finalization step of
217     * any bytes left over from dividing the length by 4. This manifests if any of these
218     * bytes are negative.<p>
219     *
220     * @param data The input byte array
221     * @return The 32-bit hash
222     * @see #hash32(byte[], int, int, int)
223     * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
224     */
225    @Deprecated
226    public static int hash32(final byte[] data) {
227        return hash32(data, 0, data.length, DEFAULT_SEED);
228    }
229
230    /**
231     * Generates 32-bit hash from a string with a default seed.
232     * <p>
233     * Before 1.14 the string was converted using default encoding.
234     * Since 1.14 the string is converted to bytes using UTF-8 encoding.
235     * </p>
236     * This is a helper method that will produce the same result as:
237     *
238     * <pre>
239     * int offset = 0;
240     * int seed = 104729;
241     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
242     * int hash = MurmurHash3.hash32(bytes, offset, bytes.length, seed);
243     * </pre>
244     *
245     * <p>This implementation contains a sign-extension bug in the finalization step of
246     * any bytes left over from dividing the length by 4. This manifests if any of these
247     * bytes are negative.<p>
248     *
249     * @param data The input string
250     * @return The 32-bit hash
251     * @see #hash32(byte[], int, int, int)
252     * @deprecated Use {@link #hash32x86(byte[], int, int, int)} with the bytes returned from
253     * {@link String#getBytes(java.nio.charset.Charset)}. This corrects the processing of trailing bytes.
254     */
255    @Deprecated
256    public static int hash32(final String data) {
257        final byte[] bytes = StringUtils.getBytesUtf8(data);
258        return hash32(bytes, 0, bytes.length, DEFAULT_SEED);
259    }
260
261    /**
262     * Generates 32-bit hash from the byte array with the given length and a default seed.
263     * This is a helper method that will produce the same result as:
264     *
265     * <pre>
266     * int offset = 0;
267     * int seed = 104729;
268     * int hash = MurmurHash3.hash32(data, offset, length, seed);
269     * </pre>
270     *
271     * <p>This implementation contains a sign-extension bug in the finalization step of
272     * any bytes left over from dividing the length by 4. This manifests if any of these
273     * bytes are negative.<p>
274     *
275     * @param data The input byte array
276     * @param length The length of array
277     * @return The 32-bit hash
278     * @see #hash32(byte[], int, int, int)
279     * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
280     */
281    @Deprecated
282    public static int hash32(final byte[] data, final int length) {
283        return hash32(data, length, DEFAULT_SEED);
284    }
285
286    /**
287     * Generates 32-bit hash from the byte array with the given length and seed. This is a
288     * helper method that will produce the same result as:
289     *
290     * <pre>
291     * int offset = 0;
292     * int hash = MurmurHash3.hash32(data, offset, length, seed);
293     * </pre>
294     *
295     * <p>This implementation contains a sign-extension bug in the finalization step of
296     * any bytes left over from dividing the length by 4. This manifests if any of these
297     * bytes are negative.<p>
298     *
299     * @param data The input byte array
300     * @param length The length of array
301     * @param seed The initial seed value
302     * @return The 32-bit hash
303     * @see #hash32(byte[], int, int, int)
304     * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
305     */
306    @Deprecated
307    public static int hash32(final byte[] data, final int length, final int seed) {
308        return hash32(data, 0, length, seed);
309    }
310
311    /**
312     * Generates 32-bit hash from the byte array with the given offset, length and seed.
313     *
314     * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
315     * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p>
316     *
317     * <p>This implementation contains a sign-extension bug in the finalization step of
318     * any bytes left over from dividing the length by 4. This manifests if any of these
319     * bytes are negative.<p>
320     *
321     * @param data The input byte array
322     * @param offset The offset of data
323     * @param length The length of array
324     * @param seed The initial seed value
325     * @return The 32-bit hash
326     * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes.
327     */
328    @Deprecated
329    public static int hash32(final byte[] data, final int offset, final int length, final int seed) {
330        int hash = seed;
331        final int nblocks = length >> 2;
332
333        // body
334        for (int i = 0; i < nblocks; i++) {
335            final int index = offset + (i << 2);
336            final int k = getLittleEndianInt(data, index);
337            hash = mix32(k, hash);
338        }
339
340        // tail
341        // ************
342        // Note: This fails to apply masking using 0xff to the 3 remaining bytes.
343        // ************
344        final int index = offset + (nblocks << 2);
345        int k1 = 0;
346        switch (offset + length - index) {
347        case 3:
348            k1 ^= data[index + 2] << 16;
349        case 2:
350            k1 ^= data[index + 1] << 8;
351        case 1:
352            k1 ^= data[index];
353
354            // mix functions
355            k1 *= C1_32;
356            k1 = Integer.rotateLeft(k1, R1_32);
357            k1 *= C2_32;
358            hash ^= k1;
359        }
360
361        hash ^= length;
362        return fmix32(hash);
363    }
364
365    /**
366     * Generates 32-bit hash from the byte array with a seed of zero.
367     * This is a helper method that will produce the same result as:
368     *
369     * <pre>
370     * int offset = 0;
371     * int seed = 0;
372     * int hash = MurmurHash3.hash32x86(data, offset, data.length, seed);
373     * </pre>
374     *
375     * @param data The input byte array
376     * @return The 32-bit hash
377     * @see #hash32x86(byte[], int, int, int)
378     * @since 1.14
379     */
380    public static int hash32x86(final byte[] data) {
381        return hash32x86(data, 0, data.length, 0);
382    }
383
384    /**
385     * Generates 32-bit hash from the byte array with the given offset, length and seed.
386     *
387     * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
388     * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p>
389     *
390     * @param data The input byte array
391     * @param offset The offset of data
392     * @param length The length of array
393     * @param seed The initial seed value
394     * @return The 32-bit hash
395     * @since 1.14
396     */
397    public static int hash32x86(final byte[] data, final int offset, final int length, final int seed) {
398        int hash = seed;
399        final int nblocks = length >> 2;
400
401        // body
402        for (int i = 0; i < nblocks; i++) {
403            final int index = offset + (i << 2);
404            final int k = getLittleEndianInt(data, index);
405            hash = mix32(k, hash);
406        }
407
408        // tail
409        final int index = offset + (nblocks << 2);
410        int k1 = 0;
411        switch (offset + length - index) {
412        case 3:
413            k1 ^= (data[index + 2] & 0xff) << 16;
414        case 2:
415            k1 ^= (data[index + 1] & 0xff) << 8;
416        case 1:
417            k1 ^= (data[index] & 0xff);
418
419            // mix functions
420            k1 *= C1_32;
421            k1 = Integer.rotateLeft(k1, R1_32);
422            k1 *= C2_32;
423            hash ^= k1;
424        }
425
426        hash ^= length;
427        return fmix32(hash);
428    }
429
430    /**
431     * Generates 64-bit hash from a long with a default seed.
432     *
433     * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p>
434     *
435     * <p>This is a Murmur3-like 64-bit variant.
436     * The method does not produce the same result as either half of the hash bytes from
437     * {@linkplain #hash128x64(byte[])} with the same byte data from the {@code long}.
438     * This method will be removed in a future release.</p>
439     *
440     * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect
441     * this result as the default seed is positive.</p>
442     *
443     * <p>This is a helper method that will produce the same result as:</p>
444     *
445     * <pre>
446     * int offset = 0;
447     * int seed = 104729;
448     * long hash = MurmurHash3.hash64(ByteBuffer.allocate(8)
449     *                                          .putLong(data)
450     *                                          .array(), offset, 8, seed);
451     * </pre>
452     *
453     * @param data The long to hash
454     * @return The 64-bit hash
455     * @see #hash64(byte[], int, int, int)
456     * @deprecated Not part of the MurmurHash3 implementation.
457     * Use half of the hash bytes from {@link #hash128x64(byte[])} with the bytes from the {@code long}.
458     */
459    @Deprecated
460    public static long hash64(final long data) {
461        long hash = DEFAULT_SEED;
462        long k = Long.reverseBytes(data);
463        final int length = LONG_BYTES;
464        // mix functions
465        k *= C1;
466        k = Long.rotateLeft(k, R1);
467        k *= C2;
468        hash ^= k;
469        hash = Long.rotateLeft(hash, R2) * M + N1;
470        // finalization
471        hash ^= length;
472        hash = fmix64(hash);
473        return hash;
474    }
475
476    /**
477     * Generates 64-bit hash from an int with a default seed.
478     *
479     * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p>
480     *
481     * <p>This is a Murmur3-like 64-bit variant.
482     * The method does not produce the same result as either half of the hash bytes from
483     * {@linkplain #hash128x64(byte[])} with the same byte data from the {@code int}.
484     * This method will be removed in a future release.</p>
485     *
486     * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect
487     * this result as the default seed is positive.</p>
488     *
489     * <p>This is a helper method that will produce the same result as:</p>
490     *
491     * <pre>
492     * int offset = 0;
493     * int seed = 104729;
494     * long hash = MurmurHash3.hash64(ByteBuffer.allocate(4)
495     *                                          .putInt(data)
496     *                                          .array(), offset, 4, seed);
497     * </pre>
498     *
499     * @param data The int to hash
500     * @return The 64-bit hash
501     * @see #hash64(byte[], int, int, int)
502     * @deprecated Not part of the MurmurHash3 implementation.
503     * Use half of the hash bytes from {@link #hash128x64(byte[])} with the bytes from the {@code int}.
504     */
505    @Deprecated
506    public static long hash64(final int data) {
507        long k1 = Integer.reverseBytes(data) & (-1L >>> 32);
508        final int length = INTEGER_BYTES;
509        long hash = DEFAULT_SEED;
510        k1 *= C1;
511        k1 = Long.rotateLeft(k1, R1);
512        k1 *= C2;
513        hash ^= k1;
514        // finalization
515        hash ^= length;
516        hash = fmix64(hash);
517        return hash;
518    }
519
520    /**
521     * Generates 64-bit hash from a short with a default seed.
522     *
523     * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p>
524     *
525     * <p>This is a Murmur3-like 64-bit variant.
526     * The method does not produce the same result as either half of the hash bytes from
527     * {@linkplain #hash128x64(byte[])} with the same byte data from the {@code short}.
528     * This method will be removed in a future release.</p>
529     *
530     * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect
531     * this result as the default seed is positive.</p>
532     *
533     * <p>This is a helper method that will produce the same result as:</p>
534     *
535     * <pre>
536     * int offset = 0;
537     * int seed = 104729;
538     * long hash = MurmurHash3.hash64(ByteBuffer.allocate(2)
539     *                                          .putShort(data)
540     *                                          .array(), offset, 2, seed);
541     * </pre>
542     *
543     * @param data The short to hash
544     * @return The 64-bit hash
545     * @see #hash64(byte[], int, int, int)
546     * @deprecated Not part of the MurmurHash3 implementation.
547     * Use half of the hash bytes from {@link #hash128x64(byte[])} with the bytes from the {@code short}.
548     */
549    @Deprecated
550    public static long hash64(final short data) {
551        long hash = DEFAULT_SEED;
552        long k1 = 0;
553        k1 ^= ((long) data & 0xff) << 8;
554        k1 ^= ((long) ((data & 0xFF00) >> 8) & 0xff);
555        k1 *= C1;
556        k1 = Long.rotateLeft(k1, R1);
557        k1 *= C2;
558        hash ^= k1;
559
560        // finalization
561        hash ^= SHORT_BYTES;
562        hash = fmix64(hash);
563        return hash;
564    }
565
566    /**
567     * Generates 64-bit hash from a byte array with a default seed.
568     *
569     * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p>
570     *
571     * <p>This is a Murmur3-like 64-bit variant.
572     * The method does not produce the same result as either half of the hash bytes from
573     * {@linkplain #hash128x64(byte[])} with the same byte data.
574     * This method will be removed in a future release.</p>
575     *
576     * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect
577     * this result as the default seed is positive.</p>
578     *
579     * <p>This is a helper method that will produce the same result as:</p>
580     *
581     * <pre>
582     * int offset = 0;
583     * int seed = 104729;
584     * long hash = MurmurHash3.hash64(data, offset, data.length, seed);
585     * </pre>
586     *
587     * @param data The input byte array
588     * @return The 64-bit hash
589     * @see #hash64(byte[], int, int, int)
590     * @deprecated Not part of the MurmurHash3 implementation.
591     * Use half of the hash bytes from {@link #hash128x64(byte[])}.
592     */
593    @Deprecated
594    public static long hash64(final byte[] data) {
595        return hash64(data, 0, data.length, DEFAULT_SEED);
596    }
597
598    /**
599     * Generates 64-bit hash from a byte array with the given offset and length and a default seed.
600     *
601     * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p>
602     *
603     * <p>This is a Murmur3-like 64-bit variant.
604     * The method does not produce the same result as either half of the hash bytes from
605     * {@linkplain #hash128x64(byte[])} with the same byte data.
606     * This method will be removed in a future release.</p>
607     *
608     * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect
609     * this result as the default seed is positive.</p>
610     *
611     * <p>This is a helper method that will produce the same result as:</p>
612     *
613     * <pre>
614     * int seed = 104729;
615     * long hash = MurmurHash3.hash64(data, offset, length, seed);
616     * </pre>
617     *
618     * @param data The input byte array
619     * @param offset The offset of data
620     * @param length The length of array
621     * @return The 64-bit hash
622     * @see #hash64(byte[], int, int, int)
623     * @deprecated Not part of the MurmurHash3 implementation.
624     * Use half of the hash bytes from {@link #hash128x64(byte[], int, int, int)}.
625     */
626    @Deprecated
627    public static long hash64(final byte[] data, final int offset, final int length) {
628        return hash64(data, offset, length, DEFAULT_SEED);
629    }
630
631    /**
632     * Generates 64-bit hash from a byte array with the given offset, length and seed.
633     *
634     * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p>
635     *
636     * <p>This is a Murmur3-like 64-bit variant.
637     * This method will be removed in a future release.</p>
638     *
639     * <p>This implementation contains a sign-extension bug in the seed initialization.
640     * This manifests if the seed is negative.</p>
641     *
642     * <p>This algorithm processes 8 bytes chunks of data in a manner similar to the 16 byte chunks
643     * of data processed in the MurmurHash3 {@code MurmurHash3_x64_128} method. However the hash
644     * is not mixed with a hash chunk from the next 8 bytes of data. The method will not return
645     * the same value as the first or second 64-bits of the function
646     * {@link #hash128(byte[], int, int, int)}.</p>
647     *
648     * <p>Use of this method is not advised. Use the first long returned from
649     * {@link #hash128x64(byte[], int, int, int)}.<p>
650     *
651     * @param data The input byte array
652     * @param offset The offset of data
653     * @param length The length of array
654     * @param seed The initial seed value
655     * @return The 64-bit hash
656     * @deprecated Not part of the MurmurHash3 implementation.
657     * Use half of the hash bytes from {@link #hash128x64(byte[], int, int, int)}.
658     */
659    @Deprecated
660    public static long hash64(final byte[] data, final int offset, final int length, final int seed) {
661        // ************
662        // Note: This fails to apply masking using 0xffffffffL to the seed.
663        // ************
664        long hash = seed;
665        final int nblocks = length >> 3;
666
667        // body
668        for (int i = 0; i < nblocks; i++) {
669            final int index = offset + (i << 3);
670            long k = getLittleEndianLong(data, index);
671
672            // mix functions
673            k *= C1;
674            k = Long.rotateLeft(k, R1);
675            k *= C2;
676            hash ^= k;
677            hash = Long.rotateLeft(hash, R2) * M + N1;
678        }
679
680        // tail
681        long k1 = 0;
682        final int index = offset + (nblocks << 3);
683        switch (offset + length - index) {
684        case 7:
685            k1 ^= ((long) data[index + 6] & 0xff) << 48;
686        case 6:
687            k1 ^= ((long) data[index + 5] & 0xff) << 40;
688        case 5:
689            k1 ^= ((long) data[index + 4] & 0xff) << 32;
690        case 4:
691            k1 ^= ((long) data[index + 3] & 0xff) << 24;
692        case 3:
693            k1 ^= ((long) data[index + 2] & 0xff) << 16;
694        case 2:
695            k1 ^= ((long) data[index + 1] & 0xff) << 8;
696        case 1:
697            k1 ^= ((long) data[index] & 0xff);
698            k1 *= C1;
699            k1 = Long.rotateLeft(k1, R1);
700            k1 *= C2;
701            hash ^= k1;
702        }
703
704        // finalization
705        hash ^= length;
706        hash = fmix64(hash);
707
708        return hash;
709    }
710
711    /**
712     * Generates 128-bit hash from the byte array with a default seed.
713     * This is a helper method that will produce the same result as:
714     *
715     * <pre>
716     * int offset = 0;
717     * int seed = 104729;
718     * int hash = MurmurHash3.hash128(data, offset, data.length, seed);
719     * </pre>
720     *
721     * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect
722     * this result as the default seed is positive.</p>
723     *
724     * @param data The input byte array
725     * @return The 128-bit hash (2 longs)
726     * @see #hash128(byte[], int, int, int)
727     */
728    public static long[] hash128(final byte[] data) {
729        return hash128(data, 0, data.length, DEFAULT_SEED);
730    }
731
732    /**
733     * Generates 128-bit hash from the byte array with a seed of zero.
734     * This is a helper method that will produce the same result as:
735     *
736     * <pre>
737     * int offset = 0;
738     * int seed = 0;
739     * int hash = MurmurHash3.hash128x64(data, offset, data.length, seed);
740     * </pre>
741     *
742     * @param data The input byte array
743     * @return The 128-bit hash (2 longs)
744     * @see #hash128x64(byte[], int, int, int)
745     * @since 1.14
746     */
747    public static long[] hash128x64(final byte[] data) {
748        return hash128x64(data, 0, data.length, 0);
749    }
750
751    /**
752     * Generates 128-bit hash from a string with a default seed.
753     * <p>
754     * Before 1.14 the string was converted using default encoding.
755     * Since 1.14 the string is converted to bytes using UTF-8 encoding.
756     * </p>
757     * This is a helper method that will produce the same result as:
758     *
759     * <pre>
760     * int offset = 0;
761     * int seed = 104729;
762     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
763     * int hash = MurmurHash3.hash128(bytes, offset, bytes.length, seed);
764     * </pre>
765     *
766     * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect
767     * this result as the default seed is positive.</p>
768     *
769     * @param data The input String
770     * @return The 128-bit hash (2 longs)
771     * @see #hash128(byte[], int, int, int)
772     * @deprecated Use {@link #hash128x64(byte[])} using the bytes returned from
773     * {@link String#getBytes(java.nio.charset.Charset)}.
774     */
775    @Deprecated
776    public static long[] hash128(final String data) {
777        final byte[] bytes = StringUtils.getBytesUtf8(data);
778        return hash128(bytes, 0, bytes.length, DEFAULT_SEED);
779    }
780
781    /**
782     * Generates 128-bit hash from the byte array with the given offset, length and seed.
783     *
784     * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128}
785     * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p>
786     *
787     * <p>This implementation contains a sign-extension bug in the seed initialization.
788     * This manifests if the seed is negative.<p>
789     *
790     * @param data The input byte array
791     * @param offset The first element of array
792     * @param length The length of array
793     * @param seed The initial seed value
794     * @return The 128-bit hash (2 longs)
795     * @deprecated Use {@link #hash128x64(byte[], int, int, int)}. This corrects the seed initialization.
796     */
797    @Deprecated
798    public static long[] hash128(final byte[] data, final int offset, final int length, final int seed) {
799        // ************
800        // Note: This deliberately fails to apply masking using 0xffffffffL to the seed
801        // to maintain behavioral compatibility with the original version.
802        // The implicit conversion to a long will extend a negative sign
803        // bit through the upper 32-bits of the long seed. These should be zero.
804        // ************
805        return hash128x64Internal(data, offset, length, seed);
806    }
807
808    /**
809     * Generates 128-bit hash from the byte array with the given offset, length and seed.
810     *
811     * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128}
812     * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p>
813     *
814     * @param data The input byte array
815     * @param offset The first element of array
816     * @param length The length of array
817     * @param seed The initial seed value
818     * @return The 128-bit hash (2 longs)
819     * @since 1.14
820     */
821    public static long[] hash128x64(final byte[] data, final int offset, final int length, final int seed) {
822        // Use an unsigned 32-bit integer as the seed
823        return hash128x64Internal(data, offset, length, seed & 0xffffffffL);
824    }
825
826    /**
827     * Generates 128-bit hash from the byte array with the given offset, length and seed.
828     *
829     * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128}
830     * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p>
831     *
832     * @param data The input byte array
833     * @param offset The first element of array
834     * @param length The length of array
835     * @param seed The initial seed value
836     * @return The 128-bit hash (2 longs)
837     */
838    private static long[] hash128x64Internal(final byte[] data, final int offset, final int length, final long seed) {
839        long h1 = seed;
840        long h2 = seed;
841        final int nblocks = length >> 4;
842
843        // body
844        for (int i = 0; i < nblocks; i++) {
845            final int index = offset + (i << 4);
846            long k1 = getLittleEndianLong(data, index);
847            long k2 = getLittleEndianLong(data, index + 8);
848
849            // mix functions for k1
850            k1 *= C1;
851            k1 = Long.rotateLeft(k1, R1);
852            k1 *= C2;
853            h1 ^= k1;
854            h1 = Long.rotateLeft(h1, R2);
855            h1 += h2;
856            h1 = h1 * M + N1;
857
858            // mix functions for k2
859            k2 *= C2;
860            k2 = Long.rotateLeft(k2, R3);
861            k2 *= C1;
862            h2 ^= k2;
863            h2 = Long.rotateLeft(h2, R1);
864            h2 += h1;
865            h2 = h2 * M + N2;
866        }
867
868        // tail
869        long k1 = 0;
870        long k2 = 0;
871        final int index = offset + (nblocks << 4);
872        switch (offset + length - index) {
873        case 15:
874            k2 ^= ((long) data[index + 14] & 0xff) << 48;
875        case 14:
876            k2 ^= ((long) data[index + 13] & 0xff) << 40;
877        case 13:
878            k2 ^= ((long) data[index + 12] & 0xff) << 32;
879        case 12:
880            k2 ^= ((long) data[index + 11] & 0xff) << 24;
881        case 11:
882            k2 ^= ((long) data[index + 10] & 0xff) << 16;
883        case 10:
884            k2 ^= ((long) data[index + 9] & 0xff) << 8;
885        case 9:
886            k2 ^= data[index + 8] & 0xff;
887            k2 *= C2;
888            k2 = Long.rotateLeft(k2, R3);
889            k2 *= C1;
890            h2 ^= k2;
891
892        case 8:
893            k1 ^= ((long) data[index + 7] & 0xff) << 56;
894        case 7:
895            k1 ^= ((long) data[index + 6] & 0xff) << 48;
896        case 6:
897            k1 ^= ((long) data[index + 5] & 0xff) << 40;
898        case 5:
899            k1 ^= ((long) data[index + 4] & 0xff) << 32;
900        case 4:
901            k1 ^= ((long) data[index + 3] & 0xff) << 24;
902        case 3:
903            k1 ^= ((long) data[index + 2] & 0xff) << 16;
904        case 2:
905            k1 ^= ((long) data[index + 1] & 0xff) << 8;
906        case 1:
907            k1 ^= data[index] & 0xff;
908            k1 *= C1;
909            k1 = Long.rotateLeft(k1, R1);
910            k1 *= C2;
911            h1 ^= k1;
912        }
913
914        // finalization
915        h1 ^= length;
916        h2 ^= length;
917
918        h1 += h2;
919        h2 += h1;
920
921        h1 = fmix64(h1);
922        h2 = fmix64(h2);
923
924        h1 += h2;
925        h2 += h1;
926
927        return new long[] { h1, h2 };
928    }
929
930    /**
931     * Gets the little-endian long from 8 bytes starting at the specified index.
932     *
933     * @param data The data
934     * @param index The index
935     * @return The little-endian long
936     */
937    private static long getLittleEndianLong(final byte[] data, final int index) {
938        return (((long) data[index    ] & 0xff)      ) |
939               (((long) data[index + 1] & 0xff) <<  8) |
940               (((long) data[index + 2] & 0xff) << 16) |
941               (((long) data[index + 3] & 0xff) << 24) |
942               (((long) data[index + 4] & 0xff) << 32) |
943               (((long) data[index + 5] & 0xff) << 40) |
944               (((long) data[index + 6] & 0xff) << 48) |
945               (((long) data[index + 7] & 0xff) << 56);
946    }
947
948    /**
949     * Gets the little-endian int from 4 bytes starting at the specified index.
950     *
951     * @param data The data
952     * @param index The index
953     * @return The little-endian int
954     */
955    private static int getLittleEndianInt(final byte[] data, final int index) {
956        return ((data[index    ] & 0xff)      ) |
957               ((data[index + 1] & 0xff) <<  8) |
958               ((data[index + 2] & 0xff) << 16) |
959               ((data[index + 3] & 0xff) << 24);
960    }
961
962    /**
963     * Performs the intermediate mix step of the 32-bit hash function {@code MurmurHash3_x86_32}.
964     *
965     * @param k The data to add to the hash
966     * @param hash The current hash
967     * @return The new hash
968     */
969    private static int mix32(int k, int hash) {
970        k *= C1_32;
971        k = Integer.rotateLeft(k, R1_32);
972        k *= C2_32;
973        hash ^= k;
974        return Integer.rotateLeft(hash, R2_32) * M_32 + N_32;
975    }
976
977    /**
978     * Performs the final avalanche mix step of the 32-bit hash function {@code MurmurHash3_x86_32}.
979     *
980     * @param hash The current hash
981     * @return The final hash
982     */
983    private static int fmix32(int hash) {
984        hash ^= (hash >>> 16);
985        hash *= 0x85ebca6b;
986        hash ^= (hash >>> 13);
987        hash *= 0xc2b2ae35;
988        hash ^= (hash >>> 16);
989        return hash;
990    }
991
992    /**
993     * Performs the final avalanche mix step of the 64-bit hash function {@code MurmurHash3_x64_128}.
994     *
995     * @param hash The current hash
996     * @return The final hash
997     */
998    private static long fmix64(long hash) {
999        hash ^= (hash >>> 33);
1000        hash *= 0xff51afd7ed558ccdL;
1001        hash ^= (hash >>> 33);
1002        hash *= 0xc4ceb9fe1a85ec53L;
1003        hash ^= (hash >>> 33);
1004        return hash;
1005    }
1006
1007    /**
1008     * Generates 32-bit hash from input bytes. Bytes can be added incrementally and the new
1009     * hash computed.
1010     *
1011     * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
1012     * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p>
1013     *
1014     * @since 1.14
1015     */
1016    public static class IncrementalHash32x86 {
1017
1018        /** The size of byte blocks that are processed together. */
1019        private static final int BLOCK_SIZE = 4;
1020
1021        /** Up to 3 unprocessed bytes from input data. */
1022        private final byte[] unprocessed = new byte[3];
1023
1024        /** The number of unprocessed bytes in the tail data. */
1025        private int unprocessedLength;
1026
1027        /** The total number of input bytes added since the start. */
1028        private int totalLen;
1029
1030        /**
1031         * The current running hash.
1032         * This must be finalised to generate the 32-bit hash value.
1033         */
1034        private int hash;
1035
1036        /**
1037         * Starts a new incremental hash.
1038         *
1039         * @param seed The initial seed value
1040         */
1041        public final void start(final int seed) {
1042            // Reset
1043            unprocessedLength = totalLen = 0;
1044            this.hash = seed;
1045        }
1046
1047        /**
1048         * Adds the byte array to the current incremental hash.
1049         *
1050         * @param data The input byte array
1051         * @param offset The offset of data
1052         * @param length The length of array
1053         */
1054        public final void add(final byte[] data, final int offset, final int length) {
1055            if (length <= 0) {
1056                // Nothing to add
1057                return;
1058            }
1059            totalLen += length;
1060
1061            // Process the bytes in blocks of 4.
1062            // New bytes must be added to any current unprocessed bytes,
1063            // then processed in blocks of 4 and the remaining bytes saved:
1064            //
1065            //    |--|---------------------------|--|
1066            // unprocessed
1067            //                main block
1068            //                                remaining
1069
1070            // Check if the unprocessed bytes and new bytes can fill a block of 4.
1071            // Make this overflow safe in the event that length is Integer.MAX_VALUE.
1072            // Equivalent to: (unprocessedLength + length < BLOCK_SIZE)
1073            if (unprocessedLength + length - BLOCK_SIZE < 0) {
1074                // Not enough so add to the unprocessed bytes
1075                System.arraycopy(data, offset, unprocessed, unprocessedLength, length);
1076                unprocessedLength += length;
1077                return;
1078            }
1079
1080            // Combine unprocessed bytes with new bytes.
1081            int newOffset;
1082            int newLength;
1083            if (unprocessedLength > 0) {
1084                int k = -1;
1085                switch (unprocessedLength) {
1086                case 1:
1087                    k = orBytes(unprocessed[0], data[offset], data[offset + 1], data[offset + 2]);
1088                    break;
1089                case 2:
1090                    k = orBytes(unprocessed[0], unprocessed[1], data[offset], data[offset + 1]);
1091                    break;
1092                case 3:
1093                    k = orBytes(unprocessed[0], unprocessed[1], unprocessed[2], data[offset]);
1094                    break;
1095                default:
1096                    throw new IllegalStateException("Unprocessed length should be 1, 2, or 3: " + unprocessedLength);
1097                }
1098                hash = mix32(k, hash);
1099                // Update the offset and length
1100                final int consumed = BLOCK_SIZE - unprocessedLength;
1101                newOffset = offset + consumed;
1102                newLength = length - consumed;
1103            } else {
1104                newOffset = offset;
1105                newLength = length;
1106            }
1107
1108            // Main processing of blocks of 4 bytes
1109            final int nblocks = newLength >> 2;
1110
1111            for (int i = 0; i < nblocks; i++) {
1112                final int index = newOffset + (i << 2);
1113                final int k = getLittleEndianInt(data, index);
1114                hash = mix32(k, hash);
1115            }
1116
1117            // Save left-over unprocessed bytes
1118            final int consumed = (nblocks << 2);
1119            unprocessedLength = newLength - consumed;
1120            if (unprocessedLength != 0) {
1121                System.arraycopy(data, newOffset + consumed, unprocessed, 0, unprocessedLength);
1122            }
1123        }
1124
1125        /**
1126         * Generate the 32-bit hash value. Repeat calls to this method with no additional data
1127         * will generate the same hash value.
1128         *
1129         * @return The 32-bit hash
1130         */
1131        public final int end() {
1132            // Allow calling end() again after adding no data to return the same result.
1133            return finalise(hash, unprocessedLength, unprocessed, totalLen);
1134        }
1135
1136        /**
1137         * Finalize the running hash to the output 32-bit hash by processing remaining bytes
1138         * and performing final mixing.
1139         *
1140         * @param hash The running hash
1141         * @param unprocessedLength The number of unprocessed bytes in the tail data.
1142         * @param unprocessed Up to 3 unprocessed bytes from input data.
1143         * @param totalLen The total number of input bytes added since the start.
1144         * @return The 32-bit hash
1145         */
1146        int finalise(final int hash, final int unprocessedLength, final byte[] unprocessed, final int totalLen) {
1147            int result = hash;
1148            int k1 = 0;
1149            switch (unprocessedLength) {
1150            case 3:
1151                k1 ^= (unprocessed[2] & 0xff) << 16;
1152            case 2:
1153                k1 ^= (unprocessed[1] & 0xff) << 8;
1154            case 1:
1155                k1 ^= (unprocessed[0] & 0xff);
1156
1157                // mix functions
1158                k1 *= C1_32;
1159                k1 = Integer.rotateLeft(k1, R1_32);
1160                k1 *= C2_32;
1161                result ^= k1;
1162            }
1163
1164            // finalization
1165            result ^= totalLen;
1166            return fmix32(result);
1167        }
1168
1169        /**
1170         * Combines the bytes using an Or operation ({@code | } in a little-endian representation
1171         * of a 32-bit integer; byte 1 will be the least significant byte, byte 4 the most
1172         * significant.
1173         *
1174         * @param b1 The first byte
1175         * @param b2 The second byte
1176         * @param b3 The third byte
1177         * @param b4 The fourth byte
1178         * @return The 32-bit integer
1179         */
1180        private static int orBytes(final byte b1, final byte b2, final byte b3, final byte b4) {
1181            return (b1 & 0xff) | ((b2 & 0xff) << 8) | ((b3 & 0xff) << 16) | ((b4 & 0xff) << 24);
1182        }
1183    }
1184
1185    /**
1186     * Generates 32-bit hash from input bytes. Bytes can be added incrementally and the new
1187     * hash computed.
1188     *
1189     * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32}
1190     * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p>
1191     *
1192     * <p>This implementation contains a sign-extension bug in the finalization step of
1193     * any bytes left over from dividing the length by 4. This manifests if any of these
1194     * bytes are negative.<p>
1195     *
1196     * @deprecated Use IncrementalHash32x86. This corrects the processing of trailing bytes.
1197     */
1198    @Deprecated
1199    public static class IncrementalHash32 extends IncrementalHash32x86 {
1200
1201        /**
1202         * {@inheritDoc}
1203         *
1204         * <p>This implementation contains a sign-extension bug in the finalization step of
1205         * any bytes left over from dividing the length by 4. This manifests if any of these
1206         * bytes are negative.<p>
1207         *
1208         * @deprecated Use IncrementalHash32x86. This corrects the processing of trailing bytes.
1209         */
1210        @Override
1211        @Deprecated
1212        int finalise(final int hash, final int unprocessedLength, final byte[] unprocessed, final int totalLen) {
1213            int result = hash;
1214            // ************
1215            // Note: This fails to apply masking using 0xff to the 3 remaining bytes.
1216            // ************
1217            int k1 = 0;
1218            switch (unprocessedLength) {
1219            case 3:
1220                k1 ^= unprocessed[2] << 16;
1221            case 2:
1222                k1 ^= unprocessed[1] << 8;
1223            case 1:
1224                k1 ^= unprocessed[0];
1225
1226                // mix functions
1227                k1 *= C1_32;
1228                k1 = Integer.rotateLeft(k1, R1_32);
1229                k1 *= C2_32;
1230                result ^= k1;
1231            }
1232
1233            // finalization
1234            result ^= totalLen;
1235            return fmix32(result);
1236        }
1237    }
1238}