<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic at first glance, this is not in Intel® oneAPI Math Kernel Library</title>
    <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/unexpected-copy-in-dsyrk/m-p/1136713#M26079</link>
    <description>&lt;P&gt;at first glance, this is not expected behavior... as syrk shouldn't allocate any buffers internally.&amp;nbsp; in any case, you may try to set the&amp;nbsp;MKL_DISABLE_FAST_MM&amp;nbsp;environment variable to 1 and check whether the profile stays the same.&lt;/P&gt;</description>
    <pubDate>Wed, 13 Nov 2019 04:23:48 GMT</pubDate>
    <dc:creator>Gennady_F_Intel</dc:creator>
    <dc:date>2019-11-13T04:23:48Z</dc:date>
    <item>
      <title>unexpected copy in dsyrk</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/unexpected-copy-in-dsyrk/m-p/1136712#M26078</link>
      <description>&lt;PRE class="brush:cpp; class-name:dark;"&gt;I wrote my function to compute the pseudo-inverse for skinny matrices, using the following function and armadillo

&amp;nbsp;&amp;nbsp; &amp;nbsp;inline void inverse_sym_matrix_lapack(double * a, const MKL_INT n, char uplo = 'L') {
&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;const auto ipv = static_cast&amp;lt;int*&amp;gt;(malloc(n * sizeof(int)));
&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;MKL_INT &amp;nbsp;lwork = -1;
&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;//Querying, finding optimal lwork
&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;double wkopt;
&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;MKL_INT info;
&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;dsytrf(&amp;amp;uplo, &amp;amp;n, a, &amp;amp;n, ipv, &amp;amp;wkopt, &amp;amp;lwork, &amp;amp;info);
&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;lwork = static_cast&amp;lt;MKL_INT&amp;gt;(wkopt);
&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;const auto work = static_cast&amp;lt;double*&amp;gt;(malloc(lwork * sizeof(double)));
&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;dsytrf(&amp;amp;uplo, &amp;amp;n, a, &amp;amp;n, ipv, work, &amp;amp;lwork, &amp;amp;info);
&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;if (info != 0)
&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;{
&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;std::cout &amp;lt;&amp;lt; info &amp;lt;&amp;lt; "\n";
&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;throw std::runtime_error("dsytrf failed");
&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;}
&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;dsytri(&amp;amp;uplo, &amp;amp;n, a, &amp;amp;n, ipv, work, &amp;amp;info);
&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;if (info != 0)
&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;{
&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;std::cout &amp;lt;&amp;lt; info &amp;lt;&amp;lt; "\n";
&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;throw std::runtime_error("dsytri failed");
&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;}
&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;free(ipv);
&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;free(work);
&amp;nbsp;&amp;nbsp; &amp;nbsp;}

	inline arma::mat pinvDirectSkinny(const arma::mat&amp;amp; source)
	{
		MKL_INT n_rows = source.n_rows;
		MKL_INT n_cols = source.n_cols;
		if (!(n_cols &amp;lt;= n_rows))
		{
			throw std::runtime_error("Size error: the matrix must be skinny!");
		}
		arma::mat inv(n_cols, n_cols);
		const auto inv_ptr = inv.memptr();
		const double alpha = 1;
		const double beta = 0;
		const char uplo = 'L';
		//inv is now source.t() * source;
		dsyrk(&amp;amp;uplo,"T" , &amp;amp;n_cols, &amp;amp;n_rows , &amp;amp;alpha, source.memptr(), &amp;amp;n_rows, &amp;amp;beta, inv_ptr, &amp;amp;n_cols );
		//inv is now inv(source.t() * source);
		inverse_sym_matrix_lapack(inv_ptr,n_cols, uplo);
		//Mirroring the lower triangle into the upper one so that dgemm gets the full symmetric matrix
		for (int col = 1; col &amp;lt; n_cols; col++)
		{
			for (int row = 0; row &amp;lt; col; row++)
			{
				inv(row, col) = inv(col, row);
			}
		}
		//Generic LAPACK
		//inverse_generic_matrix_lapack(inv);
		//inv= inv(source.t() * source) * source.t()
		const MKL_INT inv_n_rows = n_cols;
		const MKL_INT inv_n_cols = n_cols;
		arma::mat res(n_cols, n_rows);
		//res=inv(source.t() * source) * source.t();
		dgemm("N", "T", &amp;amp;inv_n_rows, &amp;amp;n_rows, &amp;amp;inv_n_cols, &amp;amp;alpha, inv_ptr, &amp;amp;inv_n_rows, source.memptr(), &amp;amp;n_rows, &amp;amp;beta, res.memptr(), &amp;amp;inv_n_rows);
		return res;
	}
	&lt;/PRE&gt;

&lt;P&gt;and after some profiling (attached), it seems that most of the time is spent by dsyrk&amp;nbsp; freeing and allocating memory (see attachment)&lt;/P&gt;
&lt;P&gt;Is that working as intended?&lt;/P&gt;
&lt;P&gt;Is there a way to avoid that?&lt;/P&gt;
&lt;P&gt;I observed that with VS2019 and intel mkl 2019 u4.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 08 Nov 2019 02:13:07 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/unexpected-copy-in-dsyrk/m-p/1136712#M26078</guid>
      <dc:creator>ferrazzano__vincenz1</dc:creator>
      <dc:date>2019-11-08T02:13:07Z</dc:date>
    </item>
    <item>
      <title>at first glance, this is not</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/unexpected-copy-in-dsyrk/m-p/1136713#M26079</link>
      <description>&lt;P&gt;at first glance, this is not expected behavior... as syrk shouldn't allocate any buffers internally.&amp;nbsp; in any case, you may try to set the&amp;nbsp;MKL_DISABLE_FAST_MM&amp;nbsp;environment variable to 1 and check whether the profile stays the same.&lt;/P&gt;</description>
      <pubDate>Wed, 13 Nov 2019 04:23:48 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/unexpected-copy-in-dsyrk/m-p/1136713#M26079</guid>
      <dc:creator>Gennady_F_Intel</dc:creator>
      <dc:date>2019-11-13T04:23:48Z</dc:date>
    </item>
  </channel>
</rss>

