<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic small key hashes - optimization for 128-bit MurmurHash for 64-bit x86 platform in Software Tuning, Performance Optimization &amp; Platform Monitoring</title>
    <link>https://community.intel.com/t5/Software-Tuning-Performance/small-key-hashes-optimization-for-128-bit-MurmurHash-for-64-bit/m-p/1279465#M7873</link>
    <description>&lt;P&gt;&lt;EM&gt;Recently I looked at the implementation of the MurmurHash for the x64 architecture. In the case of the 128bit version of this hash function I found out an optimized way to implement the part that deals with the small key hashes (smaller than 15 bytes). The optimization works for all input cases however the advantages will be lower. The implementation can be found on github (&lt;A href="https://github.com/aappleby/smhasher/pull/87/commits/7d8b5ae14f4fe74922d284a2986908a98909e5d5" target="_blank"&gt;https://github.com/aappleby/smhasher/pull/87/commits/7d8b5ae14f4fe74922d284a2986908a98909e5d5&lt;/A&gt;).&lt;/EM&gt;&lt;/P&gt;
&lt;P&gt;&lt;EM&gt;The original switch case&amp;nbsp;&lt;/EM&gt;&lt;/P&gt;
&lt;P&gt;switch(len &amp;amp; 15)&lt;/P&gt;
&lt;P&gt;&amp;nbsp; {&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case 15: k2 ^= ((uint64_t)tail[14]) &amp;lt;&amp;lt; 48;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case 14: k2 ^= ((uint64_t)tail[13]) &amp;lt;&amp;lt; 40;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case 13: k2 ^= ((uint64_t)tail[12]) &amp;lt;&amp;lt; 32;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case 12: k2 ^= ((uint64_t)tail[11]) &amp;lt;&amp;lt; 24;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case 11: k2 ^= ((uint64_t)tail[10]) &amp;lt;&amp;lt; 16;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case 10: k2 ^= ((uint64_t)tail[ 9]) &amp;lt;&amp;lt; 8;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case&amp;nbsp; 9: k2 ^= ((uint64_t)tail[ 8]) &amp;lt;&amp;lt; 0;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; k2 *= c2; k2&amp;nbsp; = ROTL64(k2,33); k2 *= c1; h2 ^= k2;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;case&amp;nbsp; 8: k1 ^= ((uint64_t)tail[ 7]) &amp;lt;&amp;lt; 56;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case&amp;nbsp; 7: k1 ^= ((uint64_t)tail[ 6]) &amp;lt;&amp;lt; 48;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case&amp;nbsp; 6: k1 ^= ((uint64_t)tail[ 5]) &amp;lt;&amp;lt; 40;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case&amp;nbsp; 5: k1 ^= ((uint64_t)tail[ 4]) &amp;lt;&amp;lt; 32;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case&amp;nbsp; 4: k1 ^= ((uint64_t)tail[ 3]) &amp;lt;&amp;lt; 24;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case&amp;nbsp; 3: k1 ^= ((uint64_t)tail[ 2]) &amp;lt;&amp;lt; 16;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case&amp;nbsp; 2: k1 ^= ((uint64_t)tail[ 1]) &amp;lt;&amp;lt; 8;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case&amp;nbsp; 1: k1 ^= ((uint64_t)tail[ 0]) &amp;lt;&amp;lt; 0;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; k1 *= c1; k1&amp;nbsp; = ROTL64(k1,31); k1 *= c2; h1 ^= k1;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; };&lt;/P&gt;
&lt;P&gt;can be implemented more effective in the following way&lt;/P&gt;
&lt;P&gt;&amp;nbsp;c = len &amp;amp; 15;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; if (c!=0)&lt;/P&gt;
&lt;P&gt;&amp;nbsp; {&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; uint64_t pIntData = *(const uint64_t *)(data + nblocks * 16 + 8);&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; pIntData &amp;amp;= (1ULL&amp;lt;&amp;lt;(((c&amp;gt;&amp;gt;3)*(c%8))*8))-1;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; k2_02 ^= pIntData; k2_02 *= c2; k2_02&amp;nbsp; = ROTL64(k2_02,33); k2_02 *= c1; h2 ^= k2_02;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; uint64_t pIntData2 = *(const uint64_t *)(data + nblocks * 16);&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; pIntData2 &amp;amp;= (1ULL&amp;lt;&amp;lt;(c*8))-1;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; k1_02 ^= pIntData2; k1_02 *= c1; k1_02&amp;nbsp; = ROTL64(k1_02,31);k1_02 *= c2; h1 ^= k1_02;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; }&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Thu, 06 May 2021 16:16:17 GMT</pubDate>
    <dc:creator>Nicolae_P_Intel</dc:creator>
    <dc:date>2021-05-06T16:16:17Z</dc:date>
    <item>
      <title>small key hashes - optimization for 128-bit MurmurHash for 64-bit x86 platform</title>
      <link>https://community.intel.com/t5/Software-Tuning-Performance/small-key-hashes-optimization-for-128-bit-MurmurHash-for-64-bit/m-p/1279465#M7873</link>
      <description>&lt;P&gt;&lt;EM&gt;Recently I looked at the implementation of the MurmurHash for the x64 architecture. In the case of the 128bit version of this hash function I found out an optimized way to implement the part that deals with the small key hashes (smaller than 15 bytes). The optimization works for all input cases however the advantages will be lower. The implementation can be found on github (&lt;A href="https://github.com/aappleby/smhasher/pull/87/commits/7d8b5ae14f4fe74922d284a2986908a98909e5d5" target="_blank"&gt;https://github.com/aappleby/smhasher/pull/87/commits/7d8b5ae14f4fe74922d284a2986908a98909e5d5&lt;/A&gt;).&lt;/EM&gt;&lt;/P&gt;
&lt;P&gt;&lt;EM&gt;The original switch case&amp;nbsp;&lt;/EM&gt;&lt;/P&gt;
&lt;P&gt;switch(len &amp;amp; 15)&lt;/P&gt;
&lt;P&gt;&amp;nbsp; {&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case 15: k2 ^= ((uint64_t)tail[14]) &amp;lt;&amp;lt; 48;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case 14: k2 ^= ((uint64_t)tail[13]) &amp;lt;&amp;lt; 40;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case 13: k2 ^= ((uint64_t)tail[12]) &amp;lt;&amp;lt; 32;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case 12: k2 ^= ((uint64_t)tail[11]) &amp;lt;&amp;lt; 24;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case 11: k2 ^= ((uint64_t)tail[10]) &amp;lt;&amp;lt; 16;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case 10: k2 ^= ((uint64_t)tail[ 9]) &amp;lt;&amp;lt; 8;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case&amp;nbsp; 9: k2 ^= ((uint64_t)tail[ 8]) &amp;lt;&amp;lt; 0;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; k2 *= c2; k2&amp;nbsp; = ROTL64(k2,33); k2 *= c1; h2 ^= k2;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;case&amp;nbsp; 8: k1 ^= ((uint64_t)tail[ 7]) &amp;lt;&amp;lt; 56;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case&amp;nbsp; 7: k1 ^= ((uint64_t)tail[ 6]) &amp;lt;&amp;lt; 48;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case&amp;nbsp; 6: k1 ^= ((uint64_t)tail[ 5]) &amp;lt;&amp;lt; 40;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case&amp;nbsp; 5: k1 ^= ((uint64_t)tail[ 4]) &amp;lt;&amp;lt; 32;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case&amp;nbsp; 4: k1 ^= ((uint64_t)tail[ 3]) &amp;lt;&amp;lt; 24;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case&amp;nbsp; 3: k1 ^= ((uint64_t)tail[ 2]) &amp;lt;&amp;lt; 16;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case&amp;nbsp; 2: k1 ^= ((uint64_t)tail[ 1]) &amp;lt;&amp;lt; 8;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; case&amp;nbsp; 1: k1 ^= ((uint64_t)tail[ 0]) &amp;lt;&amp;lt; 0;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; k1 *= c1; k1&amp;nbsp; = ROTL64(k1,31); k1 *= c2; h1 ^= k1;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; };&lt;/P&gt;
&lt;P&gt;can be implemented more effective in the following way&lt;/P&gt;
&lt;P&gt;&amp;nbsp;c = len &amp;amp; 15;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; if (c!=0)&lt;/P&gt;
&lt;P&gt;&amp;nbsp; {&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; uint64_t pIntData = *(const uint64_t *)(data + nblocks * 16 + 8);&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; pIntData &amp;amp;= (1ULL&amp;lt;&amp;lt;(((c&amp;gt;&amp;gt;3)*(c%8))*8))-1;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; k2_02 ^= pIntData; k2_02 *= c2; k2_02&amp;nbsp; = ROTL64(k2_02,33); k2_02 *= c1; h2 ^= k2_02;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; uint64_t pIntData2 = *(const uint64_t *)(data + nblocks * 16);&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; pIntData2 &amp;amp;= (1ULL&amp;lt;&amp;lt;(c*8))-1;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; k1_02 ^= pIntData2; k1_02 *= c1; k1_02&amp;nbsp; = ROTL64(k1_02,31);k1_02 *= c2; h1 ^= k1_02;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; }&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 06 May 2021 16:16:17 GMT</pubDate>
      <guid>https://community.intel.com/t5/Software-Tuning-Performance/small-key-hashes-optimization-for-128-bit-MurmurHash-for-64-bit/m-p/1279465#M7873</guid>
      <dc:creator>Nicolae_P_Intel</dc:creator>
      <dc:date>2021-05-06T16:16:17Z</dc:date>
    </item>
  </channel>
</rss>

