<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Results Interpreting in Analyzers</title>
    <link>https://community.intel.com/t5/Analyzers/Results-Interpreting/m-p/1085404#M15377</link>
    <description>&lt;P&gt;&amp;nbsp;&lt;/P&gt;

&lt;P&gt;My application processes a huge amount of data in a short loop. Right now the application is single threaded and I am trying to speed up the single-threaded version as far as possible before moving to multiple threads.&lt;/P&gt;

&lt;P&gt;General Exploration Summary page says&lt;/P&gt;

&lt;BLOCKQUOTE&gt;
	&lt;P&gt;&lt;BR /&gt;
		Elapsed Time:&amp;nbsp;10.346s&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp; Clockticks:&amp;nbsp;25,370,400,000&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp; Instructions Retired:&amp;nbsp;21,055,200,000&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp; CPI Rate:&amp;nbsp;1.205&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp; MUX Reliability:&amp;nbsp;0.917&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp; Front-End Bound:&amp;nbsp;3.6%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp; Bad Speculation:&amp;nbsp;0.2%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp; Back-End Bound:&amp;nbsp;77.2%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Memory Bound:&amp;nbsp;46.8%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; L1 Bound:&amp;nbsp;0.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; L2 Bound:&amp;nbsp;4.3%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; L3 Bound:&amp;nbsp;0.8%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Contested Accesses:&amp;nbsp;0.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Data Sharing:&amp;nbsp;0.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; L3 Latency:&amp;nbsp;15.3%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; SQ Full:&amp;nbsp;16.2%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; DRAM Bound:&amp;nbsp;32.4%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Memory Bandwidth:&amp;nbsp;26.2%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Memory Latency:&amp;nbsp;73.8%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; LLC Miss:&amp;nbsp;76.4%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Store Bound:&amp;nbsp;0.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Store Latency:&amp;nbsp;21.1%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; False Sharing:&amp;nbsp;0.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Split Stores:&amp;nbsp;0.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; DTLB Store Overhead:&amp;nbsp;0.2%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Core Bound:&amp;nbsp;30.3%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Divider:&amp;nbsp;0.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Port Utilization:&amp;nbsp;23.5%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Cycles of 0 Ports Utilized:&amp;nbsp;43.5%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Cycles of 1 Port Utilized:&amp;nbsp;16.2%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Cycles of 2 Ports Utilized:&amp;nbsp;12.6%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Cycles of 3+ Ports Utilized:&amp;nbsp;8.9%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Port 0:&amp;nbsp;18.4%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Port 1:&amp;nbsp;16.2%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Port 2:&amp;nbsp;21.5%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Port 3:&amp;nbsp;22.2%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Port 4:&amp;nbsp;3.3%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Port 5:&amp;nbsp;20.5%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp; Retiring:&amp;nbsp;19.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; General Retirement:&amp;nbsp;19.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; FP Arithmetic:&amp;nbsp;30.6%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; FP x87:&amp;nbsp;0.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; FP Scalar:&amp;nbsp;0.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; FP Vector:&amp;nbsp;30.6%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Other:&amp;nbsp;69.4%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Microcode Sequencer:&amp;nbsp;0.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Assists:&amp;nbsp;0.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp; Total Thread Count:&amp;nbsp;1&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp; Paused Time:&amp;nbsp;3.071s&lt;/P&gt;
&lt;/BLOCKQUOTE&gt;

&lt;P&gt;As far as I can see the issue is at the Back-End, i.e. all instructions are fetched from DRAM by the Front-End but the CPU idles at the&amp;nbsp;stage where these instructions are executed. The code is consecutive, without conditions. I see that&amp;nbsp;the code is DRAM bound and, especially, Memory Latency&amp;nbsp;bound.&lt;/P&gt;

&lt;P&gt;Does it mean that it is impossible to speed up the code because I am limited by DRAM parameters?&lt;/P&gt;</description>
    <pubDate>Sun, 13 Nov 2016 19:40:26 GMT</pubDate>
    <dc:creator>Ayrat_S_</dc:creator>
    <dc:date>2016-11-13T19:40:26Z</dc:date>
    <item>
      <title>Results Interpreting</title>
      <link>https://community.intel.com/t5/Analyzers/Results-Interpreting/m-p/1085404#M15377</link>
      <description>&lt;P&gt;&amp;nbsp;&lt;/P&gt;

&lt;P&gt;My application processes a huge amount of data in a short loop. Right now the application is single threaded and I am trying to speed up the single-threaded version as far as possible before moving to multiple threads.&lt;/P&gt;

&lt;P&gt;General Exploration Summary page says&lt;/P&gt;

&lt;BLOCKQUOTE&gt;
	&lt;P&gt;&lt;BR /&gt;
		Elapsed Time:&amp;nbsp;10.346s&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp; Clockticks:&amp;nbsp;25,370,400,000&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp; Instructions Retired:&amp;nbsp;21,055,200,000&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp; CPI Rate:&amp;nbsp;1.205&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp; MUX Reliability:&amp;nbsp;0.917&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp; Front-End Bound:&amp;nbsp;3.6%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp; Bad Speculation:&amp;nbsp;0.2%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp; Back-End Bound:&amp;nbsp;77.2%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Memory Bound:&amp;nbsp;46.8%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; L1 Bound:&amp;nbsp;0.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; L2 Bound:&amp;nbsp;4.3%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; L3 Bound:&amp;nbsp;0.8%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Contested Accesses:&amp;nbsp;0.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Data Sharing:&amp;nbsp;0.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; L3 Latency:&amp;nbsp;15.3%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; SQ Full:&amp;nbsp;16.2%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; DRAM Bound:&amp;nbsp;32.4%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Memory Bandwidth:&amp;nbsp;26.2%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Memory Latency:&amp;nbsp;73.8%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; LLC Miss:&amp;nbsp;76.4%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Store Bound:&amp;nbsp;0.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Store Latency:&amp;nbsp;21.1%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; False Sharing:&amp;nbsp;0.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Split Stores:&amp;nbsp;0.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; DTLB Store Overhead:&amp;nbsp;0.2%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Core Bound:&amp;nbsp;30.3%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Divider:&amp;nbsp;0.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Port Utilization:&amp;nbsp;23.5%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Cycles of 0 Ports Utilized:&amp;nbsp;43.5%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Cycles of 1 Port Utilized:&amp;nbsp;16.2%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Cycles of 2 Ports Utilized:&amp;nbsp;12.6%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Cycles of 3+ Ports Utilized:&amp;nbsp;8.9%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Port 0:&amp;nbsp;18.4%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Port 1:&amp;nbsp;16.2%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Port 2:&amp;nbsp;21.5%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Port 3:&amp;nbsp;22.2%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Port 4:&amp;nbsp;3.3%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Port 5:&amp;nbsp;20.5%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp; Retiring:&amp;nbsp;19.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; General Retirement:&amp;nbsp;19.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; FP Arithmetic:&amp;nbsp;30.6%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; FP x87:&amp;nbsp;0.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; FP Scalar:&amp;nbsp;0.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; FP Vector:&amp;nbsp;30.6%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Other:&amp;nbsp;69.4%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Microcode Sequencer:&amp;nbsp;0.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Assists:&amp;nbsp;0.0%&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp; Total Thread Count:&amp;nbsp;1&lt;BR /&gt;
		&amp;nbsp;&amp;nbsp;&amp;nbsp; Paused Time:&amp;nbsp;3.071s&lt;/P&gt;
&lt;/BLOCKQUOTE&gt;

&lt;P&gt;As far as I can see the issue is at the Back-End, i.e. all instructions are fetched from DRAM by the Front-End but the CPU idles at the&amp;nbsp;stage where these instructions are executed. The code is consecutive, without conditions. I see that&amp;nbsp;the code is DRAM bound and, especially, Memory Latency&amp;nbsp;bound.&lt;/P&gt;

&lt;P&gt;Does it mean that it is impossible to speed up the code because I am limited by DRAM parameters?&lt;/P&gt;</description>
      <pubDate>Sun, 13 Nov 2016 19:40:26 GMT</pubDate>
      <guid>https://community.intel.com/t5/Analyzers/Results-Interpreting/m-p/1085404#M15377</guid>
      <dc:creator>Ayrat_S_</dc:creator>
      <dc:date>2016-11-13T19:40:26Z</dc:date>
    </item>
    <item>
      <title>Does it mean that it is</title>
      <link>https://community.intel.com/t5/Analyzers/Results-Interpreting/m-p/1085405#M15378</link>
      <description>&lt;BLOCKQUOTE&gt;
	&lt;P&gt;&lt;SPAN style="font-size: 12px;"&gt;Does it mean that it is impossible to speed up the code because I am limited by DRAM parameters?&lt;/SPAN&gt;&lt;/P&gt;
&lt;/BLOCKQUOTE&gt;

&lt;P&gt;No it doesn't. It means that if you'd make data access more sequential (helping HW prefetcher to do its work), then DRAM latency can be decreased and overall performance of the execution will improve. You can use Memory Access analysis for determining which data objects create most of latency.&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Mon, 14 Nov 2016 15:52:07 GMT</pubDate>
      <guid>https://community.intel.com/t5/Analyzers/Results-Interpreting/m-p/1085405#M15378</guid>
      <dc:creator>Vladimir_T_Intel</dc:creator>
      <dc:date>2016-11-14T15:52:07Z</dc:date>
    </item>
    <item>
      <title>Thank you for the hint, I</title>
      <link>https://community.intel.com/t5/Analyzers/Results-Interpreting/m-p/1085406#M15379</link>
      <description>&lt;P&gt;Thank you for the hint, I modified my code and the situation&amp;nbsp;became much better.&lt;/P&gt;

&lt;P&gt;It seems the issue now is divider and sqrt&amp;nbsp;operations (I do vector normalization). Is it possible to do anything with this?&lt;/P&gt;

&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper" image-alt="ge.png"&gt;&lt;img src="https://community.intel.com/t5/image/serverpage/image-id/9227i114A44BA3F5D0B0D/image-size/large?v=v2&amp;amp;px=999&amp;amp;whitelist-exif-data=Orientation%2CResolution%2COriginalDefaultFinalSize%2CCopyright" role="button" title="ge.png" alt="ge.png" /&gt;&lt;/span&gt;&lt;/P&gt;

&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper" image-alt="ge_code.png"&gt;&lt;img src="https://community.intel.com/t5/image/serverpage/image-id/9228i3E666A84A9EDC914/image-size/large?v=v2&amp;amp;px=999&amp;amp;whitelist-exif-data=Orientation%2CResolution%2COriginalDefaultFinalSize%2CCopyright" role="button" title="ge_code.png" alt="ge_code.png" /&gt;&lt;/span&gt;&lt;/P&gt;

&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper" image-alt="ma.png"&gt;&lt;img src="https://community.intel.com/t5/image/serverpage/image-id/9229iD69F86D93E69EBAE/image-size/large?v=v2&amp;amp;px=999&amp;amp;whitelist-exif-data=Orientation%2CResolution%2COriginalDefaultFinalSize%2CCopyright" role="button" title="ma.png" alt="ma.png" /&gt;&lt;/span&gt;&lt;/P&gt;

&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper" image-alt="ma_code.png"&gt;&lt;img src="https://community.intel.com/t5/image/serverpage/image-id/9230i2C60912B03F3FE18/image-size/large?v=v2&amp;amp;px=999&amp;amp;whitelist-exif-data=Orientation%2CResolution%2COriginalDefaultFinalSize%2CCopyright" role="button" title="ma_code.png" alt="ma_code.png" /&gt;&lt;/span&gt;&lt;/P&gt;</description>
      <pubDate>Mon, 14 Nov 2016 18:59:57 GMT</pubDate>
      <guid>https://community.intel.com/t5/Analyzers/Results-Interpreting/m-p/1085406#M15379</guid>
      <dc:creator>Ayrat_S_</dc:creator>
      <dc:date>2016-11-14T18:59:57Z</dc:date>
    </item>
    <item>
      <title>From the profile summary i</title>
      <link>https://community.intel.com/t5/Analyzers/Results-Interpreting/m-p/1085407#M15380</link>
      <description>&lt;P&gt;From the profile summary i see that the FP ops are scalar - you could prove it looking at the Assembly view in the Source Viewer. If you manage to make your division operations vectorized, you could have significant improvement in time spent in Core execution unit.&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 15 Nov 2016 10:34:45 GMT</pubDate>
      <guid>https://community.intel.com/t5/Analyzers/Results-Interpreting/m-p/1085407#M15380</guid>
      <dc:creator>Vladimir_T_Intel</dc:creator>
      <dc:date>2016-11-15T10:34:45Z</dc:date>
    </item>
    <item>
      <title> </title>
      <link>https://community.intel.com/t5/Analyzers/Results-Interpreting/m-p/1085408#M15381</link>
      <description>&lt;P&gt;&amp;nbsp;&lt;/P&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;

&lt;P&gt;I managed to speed up my code by 20%&amp;nbsp;by using&amp;nbsp;SSE instructions.&lt;/P&gt;

&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper" image-alt="mixed_t1_ge.png"&gt;&lt;img src="https://community.intel.com/t5/image/serverpage/image-id/9231i5010486B4E1231D3/image-size/large?v=v2&amp;amp;px=999&amp;amp;whitelist-exif-data=Orientation%2CResolution%2COriginalDefaultFinalSize%2CCopyright" role="button" title="mixed_t1_ge.png" alt="mixed_t1_ge.png" /&gt;&lt;/span&gt;&lt;/P&gt;

&lt;P&gt;DRAM bandwidth utilization is increased also.&lt;/P&gt;

&lt;P&gt;Below are charts for 1-, 2-, 3-, 4-threaded versions respectively.&amp;nbsp;Almost 100% of the time the DRAM Bandwidth was about 25GB/sec.&amp;nbsp;Max DRAM System Bandwidth is&amp;nbsp;27 GB/sec.&lt;/P&gt;

&lt;P&gt;Is it possible to say that it is impossible to speed up the code due to DRAM limitations?&lt;/P&gt;

&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper" image-alt="mixed_t1_ma.png"&gt;&lt;img src="https://community.intel.com/t5/image/serverpage/image-id/9232i3059ABC82E823B66/image-size/large?v=v2&amp;amp;px=999&amp;amp;whitelist-exif-data=Orientation%2CResolution%2COriginalDefaultFinalSize%2CCopyright" role="button" title="mixed_t1_ma.png" alt="mixed_t1_ma.png" /&gt;&lt;/span&gt;&lt;/P&gt;

&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper" image-alt="mixed_t2_ma.png"&gt;&lt;img src="https://community.intel.com/t5/image/serverpage/image-id/9233iEBE1117036538D84/image-size/large?v=v2&amp;amp;px=999&amp;amp;whitelist-exif-data=Orientation%2CResolution%2COriginalDefaultFinalSize%2CCopyright" role="button" title="mixed_t2_ma.png" alt="mixed_t2_ma.png" /&gt;&lt;/span&gt;&lt;/P&gt;

&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper" image-alt="mixed_t3_ma.png"&gt;&lt;img src="https://community.intel.com/t5/image/serverpage/image-id/9234iD03D78AD6CC10BBD/image-size/large?v=v2&amp;amp;px=999&amp;amp;whitelist-exif-data=Orientation%2CResolution%2COriginalDefaultFinalSize%2CCopyright" role="button" title="mixed_t3_ma.png" alt="mixed_t3_ma.png" /&gt;&lt;/span&gt;&lt;/P&gt;

&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper" image-alt="mixed_t4_ma.png"&gt;&lt;img src="https://community.intel.com/t5/image/serverpage/image-id/9235i14E8D13CEF37A820/image-size/large?v=v2&amp;amp;px=999&amp;amp;whitelist-exif-data=Orientation%2CResolution%2COriginalDefaultFinalSize%2CCopyright" role="button" title="mixed_t4_ma.png" alt="mixed_t4_ma.png" /&gt;&lt;/span&gt;&lt;/P&gt;</description>
      <pubDate>Wed, 16 Nov 2016 19:26:29 GMT</pubDate>
      <guid>https://community.intel.com/t5/Analyzers/Results-Interpreting/m-p/1085408#M15381</guid>
      <dc:creator>Ayrat_S_</dc:creator>
      <dc:date>2016-11-16T19:26:29Z</dc:date>
    </item>
    <item>
      <title>It would be nice if you copy</title>
      <link>https://community.intel.com/t5/Analyzers/Results-Interpreting/m-p/1085409#M15382</link>
      <description>&lt;P&gt;It would be nice if you copy a screenshot of asm view for loop.&lt;/P&gt;

&lt;P&gt;With SSE instructions you increased BW requirements and you suffer more from not always contiguous memory access. You might want to focus on that. But anyway, your perf profile looks good already.&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 18 Nov 2016 13:15:52 GMT</pubDate>
      <guid>https://community.intel.com/t5/Analyzers/Results-Interpreting/m-p/1085409#M15382</guid>
      <dc:creator>Vladimir_T_Intel</dc:creator>
      <dc:date>2016-11-18T13:15:52Z</dc:date>
    </item>
    <item>
      <title>Here is assembly code along</title>
      <link>https://community.intel.com/t5/Analyzers/Results-Interpreting/m-p/1085410#M15383</link>
      <description>&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper" image-alt="assembly.png"&gt;&lt;img src="https://community.intel.com/t5/image/serverpage/image-id/9240iD26F0DCBEF848DAE/image-size/large?v=v2&amp;amp;px=999&amp;amp;whitelist-exif-data=Orientation%2CResolution%2COriginalDefaultFinalSize%2CCopyright" role="button" title="assembly.png" alt="assembly.png" /&gt;&lt;/span&gt;&lt;/P&gt;

&lt;P&gt;Here is assembly code along with source code. Several comments on the code&lt;/P&gt;

&lt;UL&gt;
	&lt;LI&gt;​​Commented-out lines (131, 135, 137)&amp;nbsp; explain the lines below&lt;/LI&gt;
	&lt;LI&gt;Lines 131, 132: p1 and p2 point on adjacent memory regions&amp;nbsp;&amp;nbsp;(p1 + 1 = p2). In real application it will be not always the case, but for this test it is as I said.&lt;/LI&gt;
&lt;/UL&gt;

&lt;P&gt;Yes, you are correct: not all memory reads are from contiguous​ memory areas. The code accesses three memory buffers: 2 for read and one for write. In&amp;nbsp; each iteration the&amp;nbsp;code&lt;/P&gt;

&lt;UL&gt;
	&lt;LI&gt;reads 4*4=16 bytes from the first memory buffer (tasklets)&lt;/LI&gt;
	&lt;LI&gt;reads 2*4 * 4&amp;nbsp;=&amp;nbsp;32 bytes from the second memory buffer (points)&lt;/LI&gt;
	&lt;LI&gt;writes​ 4*4 = 16 bytes to &amp;nbsp;the third memory buffer (projections)&lt;/LI&gt;
&lt;/UL&gt;

&lt;P&gt;​Hence, provided I am unable to load the CPU due to DRAM limitations, is that a reason to perform computations on a GPU?&lt;/P&gt;

&lt;P&gt;Thank you&lt;/P&gt;

&lt;P&gt;Ayrat&lt;/P&gt;</description>
      <pubDate>Fri, 18 Nov 2016 18:07:30 GMT</pubDate>
      <guid>https://community.intel.com/t5/Analyzers/Results-Interpreting/m-p/1085410#M15383</guid>
      <dc:creator>Ayrat_S_</dc:creator>
      <dc:date>2016-11-18T18:07:30Z</dc:date>
    </item>
    <item>
      <title>Assuming your write buffer is</title>
      <link>https://community.intel.com/t5/Analyzers/Results-Interpreting/m-p/1085411#M15384</link>
      <description>&lt;P&gt;Assuming your write buffer is not used very frequently, you might want to implement non-temporal store instruction for writing data. This might free cache L1 resources for data load operations. Sometimes the "&lt;SPAN style="color: rgb(0, 0, 0); font-family: Consolas, &amp;quot;Bitstream Vera Sans Mono&amp;quot;, &amp;quot;Courier New&amp;quot;, Courier, monospace; font-size: 13.008px;"&gt;restrict&lt;/SPAN&gt;" keyword against function parameters helps compiler to generate movntps instruction.&amp;nbsp;&lt;/P&gt;

&lt;P&gt;As for GPU, I suppose you'll face with even more painful memory latency problems, until you have a small sets of data being handled by massively parallel and quite small execution kernels.&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Mon, 21 Nov 2016 12:37:00 GMT</pubDate>
      <guid>https://community.intel.com/t5/Analyzers/Results-Interpreting/m-p/1085411#M15384</guid>
      <dc:creator>Vladimir_T_Intel</dc:creator>
      <dc:date>2016-11-21T12:37:00Z</dc:date>
    </item>
    <item>
      <title>Thank you for the hint!</title>
      <link>https://community.intel.com/t5/Analyzers/Results-Interpreting/m-p/1085412#M15385</link>
      <description>&lt;P&gt;Thank you for the hint!&lt;/P&gt;

&lt;P&gt;movntps (_mm_stream_ps​) provided nearly 10% speedup for 1-threaded code and more than 30% speedup for 2-threaded code.&lt;/P&gt;</description>
      <pubDate>Tue, 22 Nov 2016 19:23:14 GMT</pubDate>
      <guid>https://community.intel.com/t5/Analyzers/Results-Interpreting/m-p/1085412#M15385</guid>
      <dc:creator>Ayrat_S_</dc:creator>
      <dc:date>2016-11-22T19:23:14Z</dc:date>
    </item>
    <item>
      <title>Would you share profiling</title>
      <link>https://community.intel.com/t5/Analyzers/Results-Interpreting/m-p/1085413#M15386</link>
      <description>&lt;P&gt;Would you share profiling results and the asm view screenshot?&lt;/P&gt;</description>
      <pubDate>Tue, 22 Nov 2016 22:49:17 GMT</pubDate>
      <guid>https://community.intel.com/t5/Analyzers/Results-Interpreting/m-p/1085413#M15386</guid>
      <dc:creator>Vladimir_T_Intel</dc:creator>
      <dc:date>2016-11-22T22:49:17Z</dc:date>
    </item>
    <item>
      <title>The screenshots and profiling</title>
      <link>https://community.intel.com/t5/Analyzers/Results-Interpreting/m-p/1085414#M15387</link>
      <description>&lt;P&gt;The screenshots and profiling results are in the attached file.&lt;/P&gt;</description>
      <pubDate>Wed, 23 Nov 2016 16:56:56 GMT</pubDate>
      <guid>https://community.intel.com/t5/Analyzers/Results-Interpreting/m-p/1085414#M15387</guid>
      <dc:creator>Ayrat_S_</dc:creator>
      <dc:date>2016-11-23T16:56:56Z</dc:date>
    </item>
  </channel>
</rss>

