001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.digest; 019 020import org.apache.commons.codec.binary.StringUtils; 021 022/** 023 * Implementation of the MurmurHash2 32-bit and 64-bit hash functions. 024 * 025 * <p>MurmurHash is a non-cryptographic hash function suitable for general 026 * hash-based lookup. The name comes from two basic operations, multiply (MU) 027 * and rotate (R), used in its inner loop. Unlike cryptographic hash functions, 028 * it is not specifically designed to be difficult to reverse by an adversary, 029 * making it unsuitable for cryptographic purposes.</p> 030 * 031 * <p>This contains a Java port of the 32-bit hash function {@code MurmurHash2} 032 * and the 64-bit hash function {@code MurmurHash64A} from Austin Applyby's 033 * original {@code c++} code in SMHasher.</p> 034 * 035 * <p>This is a re-implementation of the original C code plus some additional 036 * features.</p> 037 * 038 * <p>This is public domain code with no copyrights. From home page of 039 * <a href="https://github.com/aappleby/smhasher">SMHasher</a>:</p> 040 * 041 * <blockquote> 042 * "All MurmurHash versions are public domain software, and the author 043 * disclaims all copyright to their code." 044 * </blockquote> 045 * 046 * @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a> 047 * @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp"> 048 * Original MurmurHash2 c++ code</a> 049 * @since 1.13 050 */ 051public final class MurmurHash2 { 052 053 // Constants for 32-bit variant 054 private static final int M32 = 0x5bd1e995; 055 private static final int R32 = 24; 056 057 // Constants for 64-bit variant 058 private static final long M64 = 0xc6a4a7935bd1e995L; 059 private static final int R64 = 47; 060 061 /** No instance methods. */ 062 private MurmurHash2() { 063 } 064 065 /** 066 * Generates a 32-bit hash from byte array with the given length and seed. 067 * 068 * @param data The input byte array 069 * @param length The length of the array 070 * @param seed The initial seed value 071 * @return The 32-bit hash 072 */ 073 public static int hash32(final byte[] data, final int length, final int seed) { 074 // Initialize the hash to a random value 075 int h = seed ^ length; 076 077 // Mix 4 bytes at a time into the hash 078 final int nblocks = length >> 2; 079 080 // body 081 for (int i = 0; i < nblocks; i++) { 082 final int index = (i << 2); 083 int k = getLittleEndianInt(data, index); 084 k *= M32; 085 k ^= k >>> R32; 086 k *= M32; 087 h *= M32; 088 h ^= k; 089 } 090 091 // Handle the last few bytes of the input array 092 final int index = (nblocks << 2); 093 switch (length - index) { 094 case 3: 095 h ^= (data[index + 2] & 0xff) << 16; 096 case 2: 097 h ^= (data[index + 1] & 0xff) << 8; 098 case 1: 099 h ^= (data[index] & 0xff); 100 h *= M32; 101 } 102 103 // Do a few final mixes of the hash to ensure the last few 104 // bytes are well-incorporated. 105 h ^= h >>> 13; 106 h *= M32; 107 h ^= h >>> 15; 108 109 return h; 110 } 111 112 /** 113 * Generates a 32-bit hash from byte array with the given length and a default seed value. 114 * This is a helper method that will produce the same result as: 115 * 116 * <pre> 117 * int seed = 0x9747b28c; 118 * int hash = MurmurHash2.hash32(data, length, seed); 119 * </pre> 120 * 121 * @param data The input byte array 122 * @param length The length of the array 123 * @return The 32-bit hash 124 * @see #hash32(byte[], int, int) 125 */ 126 public static int hash32(final byte[] data, final int length) { 127 return hash32(data, length, 0x9747b28c); 128 } 129 130 /** 131 * Generates a 32-bit hash from a string with a default seed. 132 * <p> 133 * Before 1.14 the string was converted using default encoding. 134 * Since 1.14 the string is converted to bytes using UTF-8 encoding. 135 * </p> 136 * This is a helper method that will produce the same result as: 137 * 138 * <pre> 139 * int seed = 0x9747b28c; 140 * byte[] bytes = data.getBytes(StandardCharsets.UTF_8); 141 * int hash = MurmurHash2.hash32(bytes, bytes.length, seed); 142 * </pre> 143 * 144 * @param text The input string 145 * @return The 32-bit hash 146 * @see #hash32(byte[], int, int) 147 */ 148 public static int hash32(final String text) { 149 final byte[] bytes = StringUtils.getBytesUtf8(text); 150 return hash32(bytes, bytes.length); 151 } 152 153 /** 154 * Generates a 32-bit hash from a substring with a default seed value. 155 * The string is converted to bytes using the default encoding. 156 * This is a helper method that will produce the same result as: 157 * 158 * <pre> 159 * int seed = 0x9747b28c; 160 * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8); 161 * int hash = MurmurHash2.hash32(bytes, bytes.length, seed); 162 * </pre> 163 * 164 * @param text The input string 165 * @param from The starting index 166 * @param length The length of the substring 167 * @return The 32-bit hash 168 * @see #hash32(byte[], int, int) 169 */ 170 public static int hash32(final String text, final int from, final int length) { 171 return hash32(text.substring(from, from + length)); 172 } 173 174 /** 175 * Generates a 64-bit hash from byte array of the given length and seed. 176 * 177 * @param data The input byte array 178 * @param length The length of the array 179 * @param seed The initial seed value 180 * @return The 64-bit hash of the given array 181 */ 182 public static long hash64(final byte[] data, final int length, final int seed) { 183 long h = (seed & 0xffffffffL) ^ (length * M64); 184 185 final int nblocks = length >> 3; 186 187 // body 188 for (int i = 0; i < nblocks; i++) { 189 final int index = (i << 3); 190 long k = getLittleEndianLong(data, index); 191 192 k *= M64; 193 k ^= k >>> R64; 194 k *= M64; 195 196 h ^= k; 197 h *= M64; 198 } 199 200 final int index = (nblocks << 3); 201 switch (length - index) { 202 case 7: 203 h ^= ((long) data[index + 6] & 0xff) << 48; 204 case 6: 205 h ^= ((long) data[index + 5] & 0xff) << 40; 206 case 5: 207 h ^= ((long) data[index + 4] & 0xff) << 32; 208 case 4: 209 h ^= ((long) data[index + 3] & 0xff) << 24; 210 case 3: 211 h ^= ((long) data[index + 2] & 0xff) << 16; 212 case 2: 213 h ^= ((long) data[index + 1] & 0xff) << 8; 214 case 1: 215 h ^= ((long) data[index] & 0xff); 216 h *= M64; 217 } 218 219 h ^= h >>> R64; 220 h *= M64; 221 h ^= h >>> R64; 222 223 return h; 224 } 225 226 /** 227 * Generates a 64-bit hash from byte array with given length and a default seed value. 228 * This is a helper method that will produce the same result as: 229 * 230 * <pre> 231 * int seed = 0xe17a1465; 232 * int hash = MurmurHash2.hash64(data, length, seed); 233 * </pre> 234 * 235 * @param data The input byte array 236 * @param length The length of the array 237 * @return The 64-bit hash 238 * @see #hash64(byte[], int, int) 239 */ 240 public static long hash64(final byte[] data, final int length) { 241 return hash64(data, length, 0xe17a1465); 242 } 243 244 /** 245 * Generates a 64-bit hash from a string with a default seed. 246 * <p> 247 * Before 1.14 the string was converted using default encoding. 248 * Since 1.14 the string is converted to bytes using UTF-8 encoding. 249 * </p> 250 * This is a helper method that will produce the same result as: 251 * 252 * <pre> 253 * int seed = 0xe17a1465; 254 * byte[] bytes = data.getBytes(StandardCharsets.UTF_8); 255 * int hash = MurmurHash2.hash64(bytes, bytes.length, seed); 256 * </pre> 257 * 258 * @param text The input string 259 * @return The 64-bit hash 260 * @see #hash64(byte[], int, int) 261 */ 262 public static long hash64(final String text) { 263 final byte[] bytes = StringUtils.getBytesUtf8(text); 264 return hash64(bytes, bytes.length); 265 } 266 267 /** 268 * Generates a 64-bit hash from a substring with a default seed value. 269 * The string is converted to bytes using the default encoding. 270 * This is a helper method that will produce the same result as: 271 * 272 * <pre> 273 * int seed = 0xe17a1465; 274 * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8); 275 * int hash = MurmurHash2.hash64(bytes, bytes.length, seed); 276 * </pre> 277 * 278 * @param text The The input string 279 * @param from The starting index 280 * @param length The length of the substring 281 * @return The 64-bit hash 282 * @see #hash64(byte[], int, int) 283 */ 284 public static long hash64(final String text, final int from, final int length) { 285 return hash64(text.substring(from, from + length)); 286 } 287 288 /** 289 * Gets the little-endian int from 4 bytes starting at the specified index. 290 * 291 * @param data The data 292 * @param index The index 293 * @return The little-endian int 294 */ 295 private static int getLittleEndianInt(final byte[] data, final int index) { 296 return ((data[index ] & 0xff) ) | 297 ((data[index + 1] & 0xff) << 8) | 298 ((data[index + 2] & 0xff) << 16) | 299 ((data[index + 3] & 0xff) << 24); 300 } 301 302 /** 303 * Gets the little-endian long from 8 bytes starting at the specified index. 304 * 305 * @param data The data 306 * @param index The index 307 * @return The little-endian long 308 */ 309 private static long getLittleEndianLong(final byte[] data, final int index) { 310 return (((long) data[index ] & 0xff) ) | 311 (((long) data[index + 1] & 0xff) << 8) | 312 (((long) data[index + 2] & 0xff) << 16) | 313 (((long) data[index + 3] & 0xff) << 24) | 314 (((long) data[index + 4] & 0xff) << 32) | 315 (((long) data[index + 5] & 0xff) << 40) | 316 (((long) data[index + 6] & 0xff) << 48) | 317 (((long) data[index + 7] & 0xff) << 56); 318 } 319}