unexpected copy in dsyrk

ferrazzano__vincenz1 · ‎11-07-2019

I wrote my function to compute the pseudo-inverse for skinny matrices, using the following function and armadillo

    inline void inverse_sym_matrix_lapack(double * a, const MKL_INT n, char uplo = 'L') {
        const auto ipv = static_cast<int*>(malloc(n * sizeof(int)));
        MKL_INT  lwork = -1;
        //Querying, finding optimal lwork
        double wkopt;
        MKL_INT info;
        dsytrf(&uplo, &n, a, &n, ipv, &wkopt, &lwork, &info);
        lwork = static_cast<MKL_INT>(wkopt);
        const auto work = static_cast<double*>(malloc(lwork * sizeof(double)));
        dsytrf(&uplo, &n, a, &n, ipv, work, &lwork, &info);
        if (info != 0)
        {
            std::cout << info << "\n";
            throw std::runtime_error("dsytrf failed");
        }
        dsytri(&uplo, &n, a, &n, ipv, work, &info);
        if (info != 0)
        {
            std::cout << info << "\n";
            throw std::runtime_error("dsytri failed");
        }
        free(ipv);
        free(work);
    }

	inline arma::mat pinvDirectSkinny(const arma::mat& source)
	{
		MKL_INT n_rows = source.n_rows;
		MKL_INT n_cols = source.n_cols;
		if (!(n_cols <= n_rows))
		{
			throw std::runtime_error("Size error: the matrix must be skinny!");
		}
		arma::mat inv(n_cols, n_cols);
		const auto inv_ptr = inv.memptr();
		const double alpha = 1;
		const double beta = 0;
		const char uplo = 'L';
		//inv is now source.t() * source;
		dsyrk(&uplo,"T" , &n_cols, &n_rows , &alpha, source.memptr(), &n_rows, &beta, inv_ptr, &n_cols );
		//inv is now inv(source.t() * source);
		inverse_sym_matrix_lapack(inv_ptr,n_cols, uplo);
		//Filling the diagonal to use dgemm
		for (int col = 1; col < n_cols; col++)
		{
			for (int row = 0; row < col; row++)
			{
				inv(row, col) = inv(col, row);
			}
		}
		//Generic LAPACK
		//inverse_generic_matrix_lapack(inv);
		//inv= inv(source.t() * source) * source.t()
		const MKL_INT inv_n_rows = n_cols;
		const MKL_INT inv_n_cols = n_cols;
		arma::mat res(n_cols, n_rows);
		//res=inv(source.t() * source) * source.t();
		dgemm("N", "T", &inv_n_rows, &n_rows, &inv_n_cols, &alpha, inv_ptr, &inv_n_rows, source.memptr(), &n_rows, &beta, res.memptr(), &inv_n_rows);
		return res;
	}

and after some profiling (attached), it seems the most time is spend by dsyrk freeing a allocating memory (see attachment)

Is that working as intended?

Is there a way to avoid that?

I observed that with VS2019 and intel mkl 2019 u4.

Gennady_F_Intel · ‎11-12-2019

at first glance, this is not expected behavior.... as syrk shouldn't internally allocate some buffers. in any case, you may try to set MKL_DISABLE_FAST_MM environment variable to 1 and check if the profile would be the same.