001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.digest; 019 020import org.apache.commons.codec.binary.StringUtils; 021 022/** 023 * Implementation of the MurmurHash3 32-bit and 128-bit hash functions. 024 * 025 * <p> 026 * MurmurHash is a non-cryptographic hash function suitable for general hash-based lookup. The name comes from two basic 027 * operations, multiply (MU) and rotate (R), used in its inner loop. Unlike cryptographic hash functions, it is not 028 * specifically designed to be difficult to reverse by an adversary, making it unsuitable for cryptographic purposes. 029 * </p> 030 * 031 * <p> 032 * This contains a Java port of the 32-bit hash function {@code MurmurHash3_x86_32} and the 128-bit hash function 033 * {@code MurmurHash3_x64_128} from Austin Applyby's original {@code c++} code in SMHasher. 034 * </p> 035 * 036 * <p> 037 * This is public domain code with no copyrights. From home page of 038 * <a href="https://github.com/aappleby/smhasher">SMHasher</a>: 039 * </p> 040 * 041 * <blockquote> "All MurmurHash versions are public domain software, and the author disclaims all copyright to their 042 * code." </blockquote> 043 * 044 * <p> 045 * Original adaption from Apache Hive. That adaption contains a {@code hash64} method that is not part of the original 046 * MurmurHash3 code. It is not recommended to use these methods. They will be removed in a future release. To obtain a 047 * 64-bit hash use half of the bits from the {@code hash128x64} methods using the input data converted to bytes. 048 * <p> 049 * 050 * @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a> 051 * @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp"> Original MurmurHash3 c++ 052 * code</a> 053 * @see <a href= 054 * "https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hive/common/util/Murmur3.java"> 055 * Apache Hive Murmer3</a> 056 * @since 1.13 057 */ 058public final class MurmurHash3 { 059 060 /** 061 * A random number to use for a hash code. 062 * 063 * @deprecated This is not used internally and will be removed in a future release. 064 */ 065 @Deprecated 066 public static final long NULL_HASHCODE = 2862933555777941757L; 067 068 /** 069 * A default seed to use for the murmur hash algorithm. 070 * Has the value {@code 104729}. 071 */ 072 public static final int DEFAULT_SEED = 104729; 073 074 /** TODO Replace on Java 8 with Long.BYTES. */ 075 static final int LONG_BYTES = Long.SIZE / Byte.SIZE; 076 077 /** TODO Replace on Java 8 with Integer.BYTES. */ 078 static final int INTEGER_BYTES = Integer.SIZE / Byte.SIZE; 079 080 /** TODO Replace on Java 8 with Short.BYTES. */ 081 static final int SHORT_BYTES = Short.SIZE / Byte.SIZE; 082 083 // Constants for 32-bit variant 084 private static final int C1_32 = 0xcc9e2d51; 085 private static final int C2_32 = 0x1b873593; 086 private static final int R1_32 = 15; 087 private static final int R2_32 = 13; 088 private static final int M_32 = 5; 089 private static final int N_32 = 0xe6546b64; 090 091 // Constants for 128-bit variant 092 private static final long C1 = 0x87c37b91114253d5L; 093 private static final long C2 = 0x4cf5ad432745937fL; 094 private static final int R1 = 31; 095 private static final int R2 = 27; 096 private static final int R3 = 33; 097 private static final int M = 5; 098 private static final int N1 = 0x52dce729; 099 private static final int N2 = 0x38495ab5; 100 101 /** No instance methods. */ 102 private MurmurHash3() { 103 } 104 105 /** 106 * Generates 32-bit hash from two longs with a default seed value. 107 * This is a helper method that will produce the same result as: 108 * 109 * <pre> 110 * int offset = 0; 111 * int seed = 104729; 112 * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(16) 113 * .putLong(data1) 114 * .putLong(data2) 115 * .array(), offset, 16, seed); 116 * </pre> 117 * 118 * @param data1 The first long to hash 119 * @param data2 The second long to hash 120 * @return The 32-bit hash 121 * @see #hash32x86(byte[], int, int, int) 122 */ 123 public static int hash32(final long data1, final long data2) { 124 return hash32(data1, data2, DEFAULT_SEED); 125 } 126 127 /** 128 * Generates 32-bit hash from two longs with the given seed. 129 * This is a helper method that will produce the same result as: 130 * 131 * <pre> 132 * int offset = 0; 133 * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(16) 134 * .putLong(data1) 135 * .putLong(data2) 136 * .array(), offset, 16, seed); 137 * </pre> 138 * 139 * @param data1 The first long to hash 140 * @param data2 The second long to hash 141 * @param seed The initial seed value 142 * @return The 32-bit hash 143 * @see #hash32x86(byte[], int, int, int) 144 */ 145 public static int hash32(final long data1, final long data2, final int seed) { 146 int hash = seed; 147 final long r0 = Long.reverseBytes(data1); 148 final long r1 = Long.reverseBytes(data2); 149 150 hash = mix32((int) r0, hash); 151 hash = mix32((int) (r0 >>> 32), hash); 152 hash = mix32((int) (r1), hash); 153 hash = mix32((int) (r1 >>> 32), hash); 154 155 hash ^= LONG_BYTES * 2; 156 return fmix32(hash); 157 } 158 159 /** 160 * Generates 32-bit hash from a long with a default seed value. 161 * This is a helper method that will produce the same result as: 162 * 163 * <pre> 164 * int offset = 0; 165 * int seed = 104729; 166 * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(8) 167 * .putLong(data) 168 * .array(), offset, 8, seed); 169 * </pre> 170 * 171 * @param data The long to hash 172 * @return The 32-bit hash 173 * @see #hash32x86(byte[], int, int, int) 174 */ 175 public static int hash32(final long data) { 176 return hash32(data, DEFAULT_SEED); 177 } 178 179 /** 180 * Generates 32-bit hash from a long with the given seed. 181 * This is a helper method that will produce the same result as: 182 * 183 * <pre> 184 * int offset = 0; 185 * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(8) 186 * .putLong(data) 187 * .array(), offset, 8, seed); 188 * </pre> 189 * 190 * @param data The long to hash 191 * @param seed The initial seed value 192 * @return The 32-bit hash 193 * @see #hash32x86(byte[], int, int, int) 194 */ 195 public static int hash32(final long data, final int seed) { 196 int hash = seed; 197 final long r0 = Long.reverseBytes(data); 198 199 hash = mix32((int) r0, hash); 200 hash = mix32((int) (r0 >>> 32), hash); 201 202 hash ^= LONG_BYTES; 203 return fmix32(hash); 204 } 205 206 /** 207 * Generates 32-bit hash from the byte array with a default seed. 208 * This is a helper method that will produce the same result as: 209 * 210 * <pre> 211 * int offset = 0; 212 * int seed = 104729; 213 * int hash = MurmurHash3.hash32(data, offset, data.length, seed); 214 * </pre> 215 * 216 * <p>This implementation contains a sign-extension bug in the finalization step of 217 * any bytes left over from dividing the length by 4. This manifests if any of these 218 * bytes are negative.<p> 219 * 220 * @param data The input byte array 221 * @return The 32-bit hash 222 * @see #hash32(byte[], int, int, int) 223 * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes. 224 */ 225 @Deprecated 226 public static int hash32(final byte[] data) { 227 return hash32(data, 0, data.length, DEFAULT_SEED); 228 } 229 230 /** 231 * Generates 32-bit hash from a string with a default seed. 232 * <p> 233 * Before 1.14 the string was converted using default encoding. 234 * Since 1.14 the string is converted to bytes using UTF-8 encoding. 235 * </p> 236 * This is a helper method that will produce the same result as: 237 * 238 * <pre> 239 * int offset = 0; 240 * int seed = 104729; 241 * byte[] bytes = data.getBytes(StandardCharsets.UTF_8); 242 * int hash = MurmurHash3.hash32(bytes, offset, bytes.length, seed); 243 * </pre> 244 * 245 * <p>This implementation contains a sign-extension bug in the finalization step of 246 * any bytes left over from dividing the length by 4. This manifests if any of these 247 * bytes are negative.<p> 248 * 249 * @param data The input string 250 * @return The 32-bit hash 251 * @see #hash32(byte[], int, int, int) 252 * @deprecated Use {@link #hash32x86(byte[], int, int, int)} with the bytes returned from 253 * {@link String#getBytes(java.nio.charset.Charset)}. This corrects the processing of trailing bytes. 254 */ 255 @Deprecated 256 public static int hash32(final String data) { 257 final byte[] bytes = StringUtils.getBytesUtf8(data); 258 return hash32(bytes, 0, bytes.length, DEFAULT_SEED); 259 } 260 261 /** 262 * Generates 32-bit hash from the byte array with the given length and a default seed. 263 * This is a helper method that will produce the same result as: 264 * 265 * <pre> 266 * int offset = 0; 267 * int seed = 104729; 268 * int hash = MurmurHash3.hash32(data, offset, length, seed); 269 * </pre> 270 * 271 * <p>This implementation contains a sign-extension bug in the finalization step of 272 * any bytes left over from dividing the length by 4. This manifests if any of these 273 * bytes are negative.<p> 274 * 275 * @param data The input byte array 276 * @param length The length of array 277 * @return The 32-bit hash 278 * @see #hash32(byte[], int, int, int) 279 * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes. 280 */ 281 @Deprecated 282 public static int hash32(final byte[] data, final int length) { 283 return hash32(data, length, DEFAULT_SEED); 284 } 285 286 /** 287 * Generates 32-bit hash from the byte array with the given length and seed. This is a 288 * helper method that will produce the same result as: 289 * 290 * <pre> 291 * int offset = 0; 292 * int hash = MurmurHash3.hash32(data, offset, length, seed); 293 * </pre> 294 * 295 * <p>This implementation contains a sign-extension bug in the finalization step of 296 * any bytes left over from dividing the length by 4. This manifests if any of these 297 * bytes are negative.<p> 298 * 299 * @param data The input byte array 300 * @param length The length of array 301 * @param seed The initial seed value 302 * @return The 32-bit hash 303 * @see #hash32(byte[], int, int, int) 304 * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes. 305 */ 306 @Deprecated 307 public static int hash32(final byte[] data, final int length, final int seed) { 308 return hash32(data, 0, length, seed); 309 } 310 311 /** 312 * Generates 32-bit hash from the byte array with the given offset, length and seed. 313 * 314 * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32} 315 * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p> 316 * 317 * <p>This implementation contains a sign-extension bug in the finalization step of 318 * any bytes left over from dividing the length by 4. This manifests if any of these 319 * bytes are negative.<p> 320 * 321 * @param data The input byte array 322 * @param offset The offset of data 323 * @param length The length of array 324 * @param seed The initial seed value 325 * @return The 32-bit hash 326 * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes. 327 */ 328 @Deprecated 329 public static int hash32(final byte[] data, final int offset, final int length, final int seed) { 330 int hash = seed; 331 final int nblocks = length >> 2; 332 333 // body 334 for (int i = 0; i < nblocks; i++) { 335 final int index = offset + (i << 2); 336 final int k = getLittleEndianInt(data, index); 337 hash = mix32(k, hash); 338 } 339 340 // tail 341 // ************ 342 // Note: This fails to apply masking using 0xff to the 3 remaining bytes. 343 // ************ 344 final int index = offset + (nblocks << 2); 345 int k1 = 0; 346 switch (offset + length - index) { 347 case 3: 348 k1 ^= data[index + 2] << 16; 349 case 2: 350 k1 ^= data[index + 1] << 8; 351 case 1: 352 k1 ^= data[index]; 353 354 // mix functions 355 k1 *= C1_32; 356 k1 = Integer.rotateLeft(k1, R1_32); 357 k1 *= C2_32; 358 hash ^= k1; 359 } 360 361 hash ^= length; 362 return fmix32(hash); 363 } 364 365 /** 366 * Generates 32-bit hash from the byte array with a seed of zero. 367 * This is a helper method that will produce the same result as: 368 * 369 * <pre> 370 * int offset = 0; 371 * int seed = 0; 372 * int hash = MurmurHash3.hash32x86(data, offset, data.length, seed); 373 * </pre> 374 * 375 * @param data The input byte array 376 * @return The 32-bit hash 377 * @see #hash32x86(byte[], int, int, int) 378 * @since 1.14 379 */ 380 public static int hash32x86(final byte[] data) { 381 return hash32x86(data, 0, data.length, 0); 382 } 383 384 /** 385 * Generates 32-bit hash from the byte array with the given offset, length and seed. 386 * 387 * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32} 388 * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p> 389 * 390 * @param data The input byte array 391 * @param offset The offset of data 392 * @param length The length of array 393 * @param seed The initial seed value 394 * @return The 32-bit hash 395 * @since 1.14 396 */ 397 public static int hash32x86(final byte[] data, final int offset, final int length, final int seed) { 398 int hash = seed; 399 final int nblocks = length >> 2; 400 401 // body 402 for (int i = 0; i < nblocks; i++) { 403 final int index = offset + (i << 2); 404 final int k = getLittleEndianInt(data, index); 405 hash = mix32(k, hash); 406 } 407 408 // tail 409 final int index = offset + (nblocks << 2); 410 int k1 = 0; 411 switch (offset + length - index) { 412 case 3: 413 k1 ^= (data[index + 2] & 0xff) << 16; 414 case 2: 415 k1 ^= (data[index + 1] & 0xff) << 8; 416 case 1: 417 k1 ^= (data[index] & 0xff); 418 419 // mix functions 420 k1 *= C1_32; 421 k1 = Integer.rotateLeft(k1, R1_32); 422 k1 *= C2_32; 423 hash ^= k1; 424 } 425 426 hash ^= length; 427 return fmix32(hash); 428 } 429 430 /** 431 * Generates 64-bit hash from a long with a default seed. 432 * 433 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p> 434 * 435 * <p>This is a Murmur3-like 64-bit variant. 436 * The method does not produce the same result as either half of the hash bytes from 437 * {@linkplain #hash128x64(byte[])} with the same byte data from the {@code long}. 438 * This method will be removed in a future release.</p> 439 * 440 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect 441 * this result as the default seed is positive.</p> 442 * 443 * <p>This is a helper method that will produce the same result as:</p> 444 * 445 * <pre> 446 * int offset = 0; 447 * int seed = 104729; 448 * long hash = MurmurHash3.hash64(ByteBuffer.allocate(8) 449 * .putLong(data) 450 * .array(), offset, 8, seed); 451 * </pre> 452 * 453 * @param data The long to hash 454 * @return The 64-bit hash 455 * @see #hash64(byte[], int, int, int) 456 * @deprecated Not part of the MurmurHash3 implementation. 457 * Use half of the hash bytes from {@link #hash128x64(byte[])} with the bytes from the {@code long}. 458 */ 459 @Deprecated 460 public static long hash64(final long data) { 461 long hash = DEFAULT_SEED; 462 long k = Long.reverseBytes(data); 463 final int length = LONG_BYTES; 464 // mix functions 465 k *= C1; 466 k = Long.rotateLeft(k, R1); 467 k *= C2; 468 hash ^= k; 469 hash = Long.rotateLeft(hash, R2) * M + N1; 470 // finalization 471 hash ^= length; 472 hash = fmix64(hash); 473 return hash; 474 } 475 476 /** 477 * Generates 64-bit hash from an int with a default seed. 478 * 479 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p> 480 * 481 * <p>This is a Murmur3-like 64-bit variant. 482 * The method does not produce the same result as either half of the hash bytes from 483 * {@linkplain #hash128x64(byte[])} with the same byte data from the {@code int}. 484 * This method will be removed in a future release.</p> 485 * 486 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect 487 * this result as the default seed is positive.</p> 488 * 489 * <p>This is a helper method that will produce the same result as:</p> 490 * 491 * <pre> 492 * int offset = 0; 493 * int seed = 104729; 494 * long hash = MurmurHash3.hash64(ByteBuffer.allocate(4) 495 * .putInt(data) 496 * .array(), offset, 4, seed); 497 * </pre> 498 * 499 * @param data The int to hash 500 * @return The 64-bit hash 501 * @see #hash64(byte[], int, int, int) 502 * @deprecated Not part of the MurmurHash3 implementation. 503 * Use half of the hash bytes from {@link #hash128x64(byte[])} with the bytes from the {@code int}. 504 */ 505 @Deprecated 506 public static long hash64(final int data) { 507 long k1 = Integer.reverseBytes(data) & (-1L >>> 32); 508 final int length = INTEGER_BYTES; 509 long hash = DEFAULT_SEED; 510 k1 *= C1; 511 k1 = Long.rotateLeft(k1, R1); 512 k1 *= C2; 513 hash ^= k1; 514 // finalization 515 hash ^= length; 516 hash = fmix64(hash); 517 return hash; 518 } 519 520 /** 521 * Generates 64-bit hash from a short with a default seed. 522 * 523 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p> 524 * 525 * <p>This is a Murmur3-like 64-bit variant. 526 * The method does not produce the same result as either half of the hash bytes from 527 * {@linkplain #hash128x64(byte[])} with the same byte data from the {@code short}. 528 * This method will be removed in a future release.</p> 529 * 530 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect 531 * this result as the default seed is positive.</p> 532 * 533 * <p>This is a helper method that will produce the same result as:</p> 534 * 535 * <pre> 536 * int offset = 0; 537 * int seed = 104729; 538 * long hash = MurmurHash3.hash64(ByteBuffer.allocate(2) 539 * .putShort(data) 540 * .array(), offset, 2, seed); 541 * </pre> 542 * 543 * @param data The short to hash 544 * @return The 64-bit hash 545 * @see #hash64(byte[], int, int, int) 546 * @deprecated Not part of the MurmurHash3 implementation. 547 * Use half of the hash bytes from {@link #hash128x64(byte[])} with the bytes from the {@code short}. 548 */ 549 @Deprecated 550 public static long hash64(final short data) { 551 long hash = DEFAULT_SEED; 552 long k1 = 0; 553 k1 ^= ((long) data & 0xff) << 8; 554 k1 ^= ((long) ((data & 0xFF00) >> 8) & 0xff); 555 k1 *= C1; 556 k1 = Long.rotateLeft(k1, R1); 557 k1 *= C2; 558 hash ^= k1; 559 560 // finalization 561 hash ^= SHORT_BYTES; 562 hash = fmix64(hash); 563 return hash; 564 } 565 566 /** 567 * Generates 64-bit hash from a byte array with a default seed. 568 * 569 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p> 570 * 571 * <p>This is a Murmur3-like 64-bit variant. 572 * The method does not produce the same result as either half of the hash bytes from 573 * {@linkplain #hash128x64(byte[])} with the same byte data. 574 * This method will be removed in a future release.</p> 575 * 576 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect 577 * this result as the default seed is positive.</p> 578 * 579 * <p>This is a helper method that will produce the same result as:</p> 580 * 581 * <pre> 582 * int offset = 0; 583 * int seed = 104729; 584 * long hash = MurmurHash3.hash64(data, offset, data.length, seed); 585 * </pre> 586 * 587 * @param data The input byte array 588 * @return The 64-bit hash 589 * @see #hash64(byte[], int, int, int) 590 * @deprecated Not part of the MurmurHash3 implementation. 591 * Use half of the hash bytes from {@link #hash128x64(byte[])}. 592 */ 593 @Deprecated 594 public static long hash64(final byte[] data) { 595 return hash64(data, 0, data.length, DEFAULT_SEED); 596 } 597 598 /** 599 * Generates 64-bit hash from a byte array with the given offset and length and a default seed. 600 * 601 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p> 602 * 603 * <p>This is a Murmur3-like 64-bit variant. 604 * The method does not produce the same result as either half of the hash bytes from 605 * {@linkplain #hash128x64(byte[])} with the same byte data. 606 * This method will be removed in a future release.</p> 607 * 608 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect 609 * this result as the default seed is positive.</p> 610 * 611 * <p>This is a helper method that will produce the same result as:</p> 612 * 613 * <pre> 614 * int seed = 104729; 615 * long hash = MurmurHash3.hash64(data, offset, length, seed); 616 * </pre> 617 * 618 * @param data The input byte array 619 * @param offset The offset of data 620 * @param length The length of array 621 * @return The 64-bit hash 622 * @see #hash64(byte[], int, int, int) 623 * @deprecated Not part of the MurmurHash3 implementation. 624 * Use half of the hash bytes from {@link #hash128x64(byte[], int, int, int)}. 625 */ 626 @Deprecated 627 public static long hash64(final byte[] data, final int offset, final int length) { 628 return hash64(data, offset, length, DEFAULT_SEED); 629 } 630 631 /** 632 * Generates 64-bit hash from a byte array with the given offset, length and seed. 633 * 634 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p> 635 * 636 * <p>This is a Murmur3-like 64-bit variant. 637 * This method will be removed in a future release.</p> 638 * 639 * <p>This implementation contains a sign-extension bug in the seed initialization. 640 * This manifests if the seed is negative.</p> 641 * 642 * <p>This algorithm processes 8 bytes chunks of data in a manner similar to the 16 byte chunks 643 * of data processed in the MurmurHash3 {@code MurmurHash3_x64_128} method. However the hash 644 * is not mixed with a hash chunk from the next 8 bytes of data. The method will not return 645 * the same value as the first or second 64-bits of the function 646 * {@link #hash128(byte[], int, int, int)}.</p> 647 * 648 * <p>Use of this method is not advised. Use the first long returned from 649 * {@link #hash128x64(byte[], int, int, int)}.<p> 650 * 651 * @param data The input byte array 652 * @param offset The offset of data 653 * @param length The length of array 654 * @param seed The initial seed value 655 * @return The 64-bit hash 656 * @deprecated Not part of the MurmurHash3 implementation. 657 * Use half of the hash bytes from {@link #hash128x64(byte[], int, int, int)}. 658 */ 659 @Deprecated 660 public static long hash64(final byte[] data, final int offset, final int length, final int seed) { 661 // ************ 662 // Note: This fails to apply masking using 0xffffffffL to the seed. 663 // ************ 664 long hash = seed; 665 final int nblocks = length >> 3; 666 667 // body 668 for (int i = 0; i < nblocks; i++) { 669 final int index = offset + (i << 3); 670 long k = getLittleEndianLong(data, index); 671 672 // mix functions 673 k *= C1; 674 k = Long.rotateLeft(k, R1); 675 k *= C2; 676 hash ^= k; 677 hash = Long.rotateLeft(hash, R2) * M + N1; 678 } 679 680 // tail 681 long k1 = 0; 682 final int index = offset + (nblocks << 3); 683 switch (offset + length - index) { 684 case 7: 685 k1 ^= ((long) data[index + 6] & 0xff) << 48; 686 case 6: 687 k1 ^= ((long) data[index + 5] & 0xff) << 40; 688 case 5: 689 k1 ^= ((long) data[index + 4] & 0xff) << 32; 690 case 4: 691 k1 ^= ((long) data[index + 3] & 0xff) << 24; 692 case 3: 693 k1 ^= ((long) data[index + 2] & 0xff) << 16; 694 case 2: 695 k1 ^= ((long) data[index + 1] & 0xff) << 8; 696 case 1: 697 k1 ^= ((long) data[index] & 0xff); 698 k1 *= C1; 699 k1 = Long.rotateLeft(k1, R1); 700 k1 *= C2; 701 hash ^= k1; 702 } 703 704 // finalization 705 hash ^= length; 706 hash = fmix64(hash); 707 708 return hash; 709 } 710 711 /** 712 * Generates 128-bit hash from the byte array with a default seed. 713 * This is a helper method that will produce the same result as: 714 * 715 * <pre> 716 * int offset = 0; 717 * int seed = 104729; 718 * int hash = MurmurHash3.hash128(data, offset, data.length, seed); 719 * </pre> 720 * 721 * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect 722 * this result as the default seed is positive.</p> 723 * 724 * @param data The input byte array 725 * @return The 128-bit hash (2 longs) 726 * @see #hash128(byte[], int, int, int) 727 */ 728 public static long[] hash128(final byte[] data) { 729 return hash128(data, 0, data.length, DEFAULT_SEED); 730 } 731 732 /** 733 * Generates 128-bit hash from the byte array with a seed of zero. 734 * This is a helper method that will produce the same result as: 735 * 736 * <pre> 737 * int offset = 0; 738 * int seed = 0; 739 * int hash = MurmurHash3.hash128x64(data, offset, data.length, seed); 740 * </pre> 741 * 742 * @param data The input byte array 743 * @return The 128-bit hash (2 longs) 744 * @see #hash128x64(byte[], int, int, int) 745 * @since 1.14 746 */ 747 public static long[] hash128x64(final byte[] data) { 748 return hash128x64(data, 0, data.length, 0); 749 } 750 751 /** 752 * Generates 128-bit hash from a string with a default seed. 753 * <p> 754 * Before 1.14 the string was converted using default encoding. 755 * Since 1.14 the string is converted to bytes using UTF-8 encoding. 756 * </p> 757 * This is a helper method that will produce the same result as: 758 * 759 * <pre> 760 * int offset = 0; 761 * int seed = 104729; 762 * byte[] bytes = data.getBytes(StandardCharsets.UTF_8); 763 * int hash = MurmurHash3.hash128(bytes, offset, bytes.length, seed); 764 * </pre> 765 * 766 * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect 767 * this result as the default seed is positive.</p> 768 * 769 * @param data The input String 770 * @return The 128-bit hash (2 longs) 771 * @see #hash128(byte[], int, int, int) 772 * @deprecated Use {@link #hash128x64(byte[])} using the bytes returned from 773 * {@link String#getBytes(java.nio.charset.Charset)}. 774 */ 775 @Deprecated 776 public static long[] hash128(final String data) { 777 final byte[] bytes = StringUtils.getBytesUtf8(data); 778 return hash128(bytes, 0, bytes.length, DEFAULT_SEED); 779 } 780 781 /** 782 * Generates 128-bit hash from the byte array with the given offset, length and seed. 783 * 784 * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128} 785 * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p> 786 * 787 * <p>This implementation contains a sign-extension bug in the seed initialization. 788 * This manifests if the seed is negative.<p> 789 * 790 * @param data The input byte array 791 * @param offset The first element of array 792 * @param length The length of array 793 * @param seed The initial seed value 794 * @return The 128-bit hash (2 longs) 795 * @deprecated Use {@link #hash128x64(byte[], int, int, int)}. This corrects the seed initialization. 796 */ 797 @Deprecated 798 public static long[] hash128(final byte[] data, final int offset, final int length, final int seed) { 799 // ************ 800 // Note: This deliberately fails to apply masking using 0xffffffffL to the seed 801 // to maintain behavioral compatibility with the original version. 802 // The implicit conversion to a long will extend a negative sign 803 // bit through the upper 32-bits of the long seed. These should be zero. 804 // ************ 805 return hash128x64Internal(data, offset, length, seed); 806 } 807 808 /** 809 * Generates 128-bit hash from the byte array with the given offset, length and seed. 810 * 811 * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128} 812 * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p> 813 * 814 * @param data The input byte array 815 * @param offset The first element of array 816 * @param length The length of array 817 * @param seed The initial seed value 818 * @return The 128-bit hash (2 longs) 819 * @since 1.14 820 */ 821 public static long[] hash128x64(final byte[] data, final int offset, final int length, final int seed) { 822 // Use an unsigned 32-bit integer as the seed 823 return hash128x64Internal(data, offset, length, seed & 0xffffffffL); 824 } 825 826 /** 827 * Generates 128-bit hash from the byte array with the given offset, length and seed. 828 * 829 * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128} 830 * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p> 831 * 832 * @param data The input byte array 833 * @param offset The first element of array 834 * @param length The length of array 835 * @param seed The initial seed value 836 * @return The 128-bit hash (2 longs) 837 */ 838 private static long[] hash128x64Internal(final byte[] data, final int offset, final int length, final long seed) { 839 long h1 = seed; 840 long h2 = seed; 841 final int nblocks = length >> 4; 842 843 // body 844 for (int i = 0; i < nblocks; i++) { 845 final int index = offset + (i << 4); 846 long k1 = getLittleEndianLong(data, index); 847 long k2 = getLittleEndianLong(data, index + 8); 848 849 // mix functions for k1 850 k1 *= C1; 851 k1 = Long.rotateLeft(k1, R1); 852 k1 *= C2; 853 h1 ^= k1; 854 h1 = Long.rotateLeft(h1, R2); 855 h1 += h2; 856 h1 = h1 * M + N1; 857 858 // mix functions for k2 859 k2 *= C2; 860 k2 = Long.rotateLeft(k2, R3); 861 k2 *= C1; 862 h2 ^= k2; 863 h2 = Long.rotateLeft(h2, R1); 864 h2 += h1; 865 h2 = h2 * M + N2; 866 } 867 868 // tail 869 long k1 = 0; 870 long k2 = 0; 871 final int index = offset + (nblocks << 4); 872 switch (offset + length - index) { 873 case 15: 874 k2 ^= ((long) data[index + 14] & 0xff) << 48; 875 case 14: 876 k2 ^= ((long) data[index + 13] & 0xff) << 40; 877 case 13: 878 k2 ^= ((long) data[index + 12] & 0xff) << 32; 879 case 12: 880 k2 ^= ((long) data[index + 11] & 0xff) << 24; 881 case 11: 882 k2 ^= ((long) data[index + 10] & 0xff) << 16; 883 case 10: 884 k2 ^= ((long) data[index + 9] & 0xff) << 8; 885 case 9: 886 k2 ^= data[index + 8] & 0xff; 887 k2 *= C2; 888 k2 = Long.rotateLeft(k2, R3); 889 k2 *= C1; 890 h2 ^= k2; 891 892 case 8: 893 k1 ^= ((long) data[index + 7] & 0xff) << 56; 894 case 7: 895 k1 ^= ((long) data[index + 6] & 0xff) << 48; 896 case 6: 897 k1 ^= ((long) data[index + 5] & 0xff) << 40; 898 case 5: 899 k1 ^= ((long) data[index + 4] & 0xff) << 32; 900 case 4: 901 k1 ^= ((long) data[index + 3] & 0xff) << 24; 902 case 3: 903 k1 ^= ((long) data[index + 2] & 0xff) << 16; 904 case 2: 905 k1 ^= ((long) data[index + 1] & 0xff) << 8; 906 case 1: 907 k1 ^= data[index] & 0xff; 908 k1 *= C1; 909 k1 = Long.rotateLeft(k1, R1); 910 k1 *= C2; 911 h1 ^= k1; 912 } 913 914 // finalization 915 h1 ^= length; 916 h2 ^= length; 917 918 h1 += h2; 919 h2 += h1; 920 921 h1 = fmix64(h1); 922 h2 = fmix64(h2); 923 924 h1 += h2; 925 h2 += h1; 926 927 return new long[] { h1, h2 }; 928 } 929 930 /** 931 * Gets the little-endian long from 8 bytes starting at the specified index. 932 * 933 * @param data The data 934 * @param index The index 935 * @return The little-endian long 936 */ 937 private static long getLittleEndianLong(final byte[] data, final int index) { 938 return (((long) data[index ] & 0xff) ) | 939 (((long) data[index + 1] & 0xff) << 8) | 940 (((long) data[index + 2] & 0xff) << 16) | 941 (((long) data[index + 3] & 0xff) << 24) | 942 (((long) data[index + 4] & 0xff) << 32) | 943 (((long) data[index + 5] & 0xff) << 40) | 944 (((long) data[index + 6] & 0xff) << 48) | 945 (((long) data[index + 7] & 0xff) << 56); 946 } 947 948 /** 949 * Gets the little-endian int from 4 bytes starting at the specified index. 950 * 951 * @param data The data 952 * @param index The index 953 * @return The little-endian int 954 */ 955 private static int getLittleEndianInt(final byte[] data, final int index) { 956 return ((data[index ] & 0xff) ) | 957 ((data[index + 1] & 0xff) << 8) | 958 ((data[index + 2] & 0xff) << 16) | 959 ((data[index + 3] & 0xff) << 24); 960 } 961 962 /** 963 * Performs the intermediate mix step of the 32-bit hash function {@code MurmurHash3_x86_32}. 964 * 965 * @param k The data to add to the hash 966 * @param hash The current hash 967 * @return The new hash 968 */ 969 private static int mix32(int k, int hash) { 970 k *= C1_32; 971 k = Integer.rotateLeft(k, R1_32); 972 k *= C2_32; 973 hash ^= k; 974 return Integer.rotateLeft(hash, R2_32) * M_32 + N_32; 975 } 976 977 /** 978 * Performs the final avalanche mix step of the 32-bit hash function {@code MurmurHash3_x86_32}. 979 * 980 * @param hash The current hash 981 * @return The final hash 982 */ 983 private static int fmix32(int hash) { 984 hash ^= (hash >>> 16); 985 hash *= 0x85ebca6b; 986 hash ^= (hash >>> 13); 987 hash *= 0xc2b2ae35; 988 hash ^= (hash >>> 16); 989 return hash; 990 } 991 992 /** 993 * Performs the final avalanche mix step of the 64-bit hash function {@code MurmurHash3_x64_128}. 994 * 995 * @param hash The current hash 996 * @return The final hash 997 */ 998 private static long fmix64(long hash) { 999 hash ^= (hash >>> 33); 1000 hash *= 0xff51afd7ed558ccdL; 1001 hash ^= (hash >>> 33); 1002 hash *= 0xc4ceb9fe1a85ec53L; 1003 hash ^= (hash >>> 33); 1004 return hash; 1005 } 1006 1007 /** 1008 * Generates 32-bit hash from input bytes. Bytes can be added incrementally and the new 1009 * hash computed. 1010 * 1011 * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32} 1012 * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p> 1013 * 1014 * @since 1.14 1015 */ 1016 public static class IncrementalHash32x86 { 1017 1018 /** The size of byte blocks that are processed together. */ 1019 private static final int BLOCK_SIZE = 4; 1020 1021 /** Up to 3 unprocessed bytes from input data. */ 1022 private final byte[] unprocessed = new byte[3]; 1023 1024 /** The number of unprocessed bytes in the tail data. */ 1025 private int unprocessedLength; 1026 1027 /** The total number of input bytes added since the start. */ 1028 private int totalLen; 1029 1030 /** 1031 * The current running hash. 1032 * This must be finalised to generate the 32-bit hash value. 1033 */ 1034 private int hash; 1035 1036 /** 1037 * Starts a new incremental hash. 1038 * 1039 * @param seed The initial seed value 1040 */ 1041 public final void start(final int seed) { 1042 // Reset 1043 unprocessedLength = totalLen = 0; 1044 this.hash = seed; 1045 } 1046 1047 /** 1048 * Adds the byte array to the current incremental hash. 1049 * 1050 * @param data The input byte array 1051 * @param offset The offset of data 1052 * @param length The length of array 1053 */ 1054 public final void add(final byte[] data, final int offset, final int length) { 1055 if (length <= 0) { 1056 // Nothing to add 1057 return; 1058 } 1059 totalLen += length; 1060 1061 // Process the bytes in blocks of 4. 1062 // New bytes must be added to any current unprocessed bytes, 1063 // then processed in blocks of 4 and the remaining bytes saved: 1064 // 1065 // |--|---------------------------|--| 1066 // unprocessed 1067 // main block 1068 // remaining 1069 1070 // Check if the unprocessed bytes and new bytes can fill a block of 4. 1071 // Make this overflow safe in the event that length is Integer.MAX_VALUE. 1072 // Equivalent to: (unprocessedLength + length < BLOCK_SIZE) 1073 if (unprocessedLength + length - BLOCK_SIZE < 0) { 1074 // Not enough so add to the unprocessed bytes 1075 System.arraycopy(data, offset, unprocessed, unprocessedLength, length); 1076 unprocessedLength += length; 1077 return; 1078 } 1079 1080 // Combine unprocessed bytes with new bytes. 1081 int newOffset; 1082 int newLength; 1083 if (unprocessedLength > 0) { 1084 int k = -1; 1085 switch (unprocessedLength) { 1086 case 1: 1087 k = orBytes(unprocessed[0], data[offset], data[offset + 1], data[offset + 2]); 1088 break; 1089 case 2: 1090 k = orBytes(unprocessed[0], unprocessed[1], data[offset], data[offset + 1]); 1091 break; 1092 case 3: 1093 k = orBytes(unprocessed[0], unprocessed[1], unprocessed[2], data[offset]); 1094 break; 1095 default: 1096 throw new IllegalStateException("Unprocessed length should be 1, 2, or 3: " + unprocessedLength); 1097 } 1098 hash = mix32(k, hash); 1099 // Update the offset and length 1100 final int consumed = BLOCK_SIZE - unprocessedLength; 1101 newOffset = offset + consumed; 1102 newLength = length - consumed; 1103 } else { 1104 newOffset = offset; 1105 newLength = length; 1106 } 1107 1108 // Main processing of blocks of 4 bytes 1109 final int nblocks = newLength >> 2; 1110 1111 for (int i = 0; i < nblocks; i++) { 1112 final int index = newOffset + (i << 2); 1113 final int k = getLittleEndianInt(data, index); 1114 hash = mix32(k, hash); 1115 } 1116 1117 // Save left-over unprocessed bytes 1118 final int consumed = (nblocks << 2); 1119 unprocessedLength = newLength - consumed; 1120 if (unprocessedLength != 0) { 1121 System.arraycopy(data, newOffset + consumed, unprocessed, 0, unprocessedLength); 1122 } 1123 } 1124 1125 /** 1126 * Generate the 32-bit hash value. Repeat calls to this method with no additional data 1127 * will generate the same hash value. 1128 * 1129 * @return The 32-bit hash 1130 */ 1131 public final int end() { 1132 // Allow calling end() again after adding no data to return the same result. 1133 return finalise(hash, unprocessedLength, unprocessed, totalLen); 1134 } 1135 1136 /** 1137 * Finalize the running hash to the output 32-bit hash by processing remaining bytes 1138 * and performing final mixing. 1139 * 1140 * @param hash The running hash 1141 * @param unprocessedLength The number of unprocessed bytes in the tail data. 1142 * @param unprocessed Up to 3 unprocessed bytes from input data. 1143 * @param totalLen The total number of input bytes added since the start. 1144 * @return The 32-bit hash 1145 */ 1146 int finalise(final int hash, final int unprocessedLength, final byte[] unprocessed, final int totalLen) { 1147 int result = hash; 1148 int k1 = 0; 1149 switch (unprocessedLength) { 1150 case 3: 1151 k1 ^= (unprocessed[2] & 0xff) << 16; 1152 case 2: 1153 k1 ^= (unprocessed[1] & 0xff) << 8; 1154 case 1: 1155 k1 ^= (unprocessed[0] & 0xff); 1156 1157 // mix functions 1158 k1 *= C1_32; 1159 k1 = Integer.rotateLeft(k1, R1_32); 1160 k1 *= C2_32; 1161 result ^= k1; 1162 } 1163 1164 // finalization 1165 result ^= totalLen; 1166 return fmix32(result); 1167 } 1168 1169 /** 1170 * Combines the bytes using an Or operation ({@code | } in a little-endian representation 1171 * of a 32-bit integer; byte 1 will be the least significant byte, byte 4 the most 1172 * significant. 1173 * 1174 * @param b1 The first byte 1175 * @param b2 The second byte 1176 * @param b3 The third byte 1177 * @param b4 The fourth byte 1178 * @return The 32-bit integer 1179 */ 1180 private static int orBytes(final byte b1, final byte b2, final byte b3, final byte b4) { 1181 return (b1 & 0xff) | ((b2 & 0xff) << 8) | ((b3 & 0xff) << 16) | ((b4 & 0xff) << 24); 1182 } 1183 } 1184 1185 /** 1186 * Generates 32-bit hash from input bytes. Bytes can be added incrementally and the new 1187 * hash computed. 1188 * 1189 * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32} 1190 * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p> 1191 * 1192 * <p>This implementation contains a sign-extension bug in the finalization step of 1193 * any bytes left over from dividing the length by 4. This manifests if any of these 1194 * bytes are negative.<p> 1195 * 1196 * @deprecated Use IncrementalHash32x86. This corrects the processing of trailing bytes. 1197 */ 1198 @Deprecated 1199 public static class IncrementalHash32 extends IncrementalHash32x86 { 1200 1201 /** 1202 * {@inheritDoc} 1203 * 1204 * <p>This implementation contains a sign-extension bug in the finalization step of 1205 * any bytes left over from dividing the length by 4. This manifests if any of these 1206 * bytes are negative.<p> 1207 * 1208 * @deprecated Use IncrementalHash32x86. This corrects the processing of trailing bytes. 1209 */ 1210 @Override 1211 @Deprecated 1212 int finalise(final int hash, final int unprocessedLength, final byte[] unprocessed, final int totalLen) { 1213 int result = hash; 1214 // ************ 1215 // Note: This fails to apply masking using 0xff to the 3 remaining bytes. 1216 // ************ 1217 int k1 = 0; 1218 switch (unprocessedLength) { 1219 case 3: 1220 k1 ^= unprocessed[2] << 16; 1221 case 2: 1222 k1 ^= unprocessed[1] << 8; 1223 case 1: 1224 k1 ^= unprocessed[0]; 1225 1226 // mix functions 1227 k1 *= C1_32; 1228 k1 = Integer.rotateLeft(k1, R1_32); 1229 k1 *= C2_32; 1230 result ^= k1; 1231 } 1232 1233 // finalization 1234 result ^= totalLen; 1235 return fmix32(result); 1236 } 1237 } 1238}