small key hashes - optimization for 128-bit MurmurHash for 64-bit x86 platform

Nicolae_P_Intel · ‎05-06-2021

Recently I looked at the implementation of the MurmurHash for the x64 architecture. In the case of the 128bit version of this hash function I found out an optimized way to implement the part that deals with the small key hashes (smaller than 15 bytes). The optimization works for all input cases however the advantages will be lower. The implementation can be found on github (https://github.com/aappleby/smhasher/pull/87/commits/7d8b5ae14f4fe74922d284a2986908a98909e5d5).

The original switch case

switch(len & 15)

{

case 15: k2 ^= ((uint64_t)tail[14]) << 48;

case 14: k2 ^= ((uint64_t)tail[13]) << 40;

case 13: k2 ^= ((uint64_t)tail[12]) << 32;

case 12: k2 ^= ((uint64_t)tail[11]) << 24;

case 11: k2 ^= ((uint64_t)tail[10]) << 16;

case 10: k2 ^= ((uint64_t)tail[ 9]) << 8;

case 9: k2 ^= ((uint64_t)tail[ 8]) << 0;

k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;

case 8: k1 ^= ((uint64_t)tail[ 7]) << 56;

case 7: k1 ^= ((uint64_t)tail[ 6]) << 48;

case 6: k1 ^= ((uint64_t)tail[ 5]) << 40;

case 5: k1 ^= ((uint64_t)tail[ 4]) << 32;

case 4: k1 ^= ((uint64_t)tail[ 3]) << 24;

case 3: k1 ^= ((uint64_t)tail[ 2]) << 16;

case 2: k1 ^= ((uint64_t)tail[ 1]) << 8;

case 1: k1 ^= ((uint64_t)tail[ 0]) << 0;

k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;

};

can be implemented more effective in the following way

c = len & 15;

if (c!=0)

{

uint64_t pIntData = *(const uint64_t *)(data + nblocks * 16 + 8);

pIntData &= (1ULL<<(((c>>3)*(c%8))*8))-1;

k2_02 ^= pIntData; k2_02 *= c2; k2_02 = ROTL64(k2_02,33); k2_02 *= c1; h2 ^= k2_02;

uint64_t pIntData2 = *(const uint64_t *)(data + nblocks * 16);

pIntData2 &= (1ULL<<(c*8))-1;

k1_02 ^= pIntData2; k1_02 *= c1; k1_02 = ROTL64(k1_02,31);k1_02 *= c2; h1 ^= k1_02;

}