Auto-suggest helps you quickly narrow down your search results by suggesting possible matches as you type.

Showing results for

- Intel Community
- Software
- Software Development SDKs and Libraries
- Intel® oneAPI Math Kernel Library
- The rate function dormtr Intel MKL

- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Mute
- Printer Friendly Page

yuriisig

Beginner

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Email to a Friend
- Report Inappropriate Content

09-14-2010
01:59 AM

152 Views

The rate function dormtr Intel MKL

I wrote on my page several years ago that the algorithm for the completion of Householder's method in the diagonalization of real symmetric matrices, implemented in the **Intel MKL** package, can be improved significantly. This improvement has now been introduced, and it was done cunningly: it is hidden inside **dormql**.

P.S.

As these improvements are included in the code of the **Intel MKL** package, it is appropriate to give information about the necessary changes in the **LAPACK** (**CLAPACK**) code, which is the basis of the **Intel MKL** package. The code given below — related to the case of storing the initial matrix in the lower triangle — in **dlarfb.c**:

work_dim1 = *ldwork;

work_offset = 1 + work_dim1;

work -= work_offset;/* Function Body */

if (*m <= 0 || *n <= 0) {

return 0;

}if (lsame_(trans, "N")) {

*(unsigned char *)transt = 'T';

} else {

*(unsigned char *)transt = 'N';

}if (lsame_(storev, "C")) {

if (lsame_(direct, "F")) {

/* Let V = ( V1 ) (first K rows) */

/* ( V2 ) */

/* where V1 is unit lower triangular. */if (lsame_(side, "L")) {

/* Form H * C or H' * C where C = ( C1 ) */

/* ( C2 ) *//* Computing MAX */

i__1 = *k, i__2 = iladlr_(m, k, &v[v_offset], ldv);

lastv = max(i__1,i__2);

lastc = iladlc_(&lastv, n, &c__[c_offset], ldc);/* W := C' * V = (C1'*V1 + C2'*V2) (stored in WORK) */

/* W := C1' */

i__1 = *k;

for (j = 1; j <= i__1; ++j) {

dcopy_(&lastc, &c__[j + c_dim1], ldc, &work[j * work_dim1

+ 1], &c__1);

/* L10: */

}/* W := W * V1 */

dtrmm_("Right", "Lower", "No transpose", "Unit", &lastc, k, &

c_b14, &v[v_offset], ldv, &work[work_offset], ldwork);

if (lastv > *k) {/* W := W + C2'*V2 */

i__1 = lastv - *k;

dgemm_("Transpose", "No transpose", &lastc, k, &i__1, &

c_b14, &c__[*k + 1 + c_dim1], ldc, &v[*k + 1 +

v_dim1], ldv, &c_b14, &work[work_offset], ldwork);

}/* W := W * T' or W * T */

dtrmm_("Right", "Upper", transt, "Non-unit", &lastc, k, &

c_b14, &t[t_offset], ldt, &work[work_offset], ldwork);/* C := C - V * W' */

if (lastv > *k) {

/* C2 := C2 - V2 * W' */

i__1 = lastv - *k;

dgemm_("No transpose", "Transpose", &i__1, &lastc, k, &

c_b25, &v[*k + 1 + v_dim1], ldv, &work[

work_offset], ldwork, &c_b14, &c__[*k + 1 +

c_dim1], ldc);

}/* W := W * V1' */

dtrmm_("Right", "Lower", "Transpose", "Unit", &lastc, k, &

c_b14, &v[v_offset], ldv, &work[work_offset], ldwork);/* C1 := C1 - W' */

i__1 = *k;

for (j = 1; j <= i__1; ++j) {

i__2 = lastc;

for (i__ = 1; i__ <= i__2; ++i__) {

c__[j + i__ * c_dim1] -= work[i__ + j * work_dim1];

/* L20: */

}

/* L30: */

}} else if (lsame_(side, "R")) {

should be replaced by (I have not edited comments):

Make these minor changes, and you will reach the same speed as in **Intel MKL**:

work_dim1 = *k;

work_offset = 1 + work_dim1;

work -= work_offset;

/* Function Body */

if (*m <= 0 || *n <= 0) {

return 0;

}

if (lsame_(trans, "N")) {

*(unsigned char *)transt = 'N';

} else {

*(unsigned char *)transt = 'T';

}

if (lsame_(storev, "C")) {

if (lsame_(direct, "F")) {

/* Let V = ( V1 ) (first K rows) */

/* ( V2 ) */

/* where V1 is unit lower triangular. */

if (lsame_(side, "L")) {

/* Form H * C or H' * C where C = ( C1 ) */

/* ( C2 ) */

/* Computing MAX */

i__1 = *k, i__2 = iladlr_(m, k, &v[v_offset], ldv);

lastv = max(i__1,i__2);

lastc = iladlc_(&lastv, n, &c__[c_offset], ldc);

/*W := C' * V = (C1'*V1 + C2'*V2) (stored in WORK) */

/* W := C1' */

i__1 = lastc;

for (j = 1; j <= i__1; ++j) {

i__2 = *k;

for (i__ = 1; i__ <= i__2; ++i__) {

work[i__ + j * work_dim1] = c__[i__ + j * c_dim1];

}

}

/* W := W * V1 */

dtrmm_("Left", "Lower", "Transpose", "Unit", k, &lastc, &

c_b14, &v[v_offset], ldv, &work[work_offset], k);

if (lastv > *k) {

/* W := W + C2'*V2 */

i__1 = lastv - *k;

dgemm_("Transpose", "No transpose", k, &lastc, &i__1, &

c_b14, &v[*k + 1 + v_dim1], ldv,

&c__[*k + 1 + c_dim1], ldc, &c_b14, &work[work_offset], k);

}

/* W := W * T' or W * T */

dtrmm_("Left", "Upper", transt, "Non-unit", k, &lastc, &

c_b14, &t[t_offset], ldt, &work[work_offset], k);

/* C := C - V * W' */

if (lastv > *k) {

/*C2 := C2 - V2 * W' */

i__1 = lastv - *k;

dgemm_("No transpose", "No transpose", &i__1, &lastc, k, &

c_b25, &v[*k + 1 + v_dim1], ldv, &work[

work_offset], k, &c_b14, &c__[*k + 1 +

c_dim1], ldc);

}

/* W := W * V1' */

dtrmm_("Left", "Lower", "No transpose", "Unit", k, &lastc, &

c_b14, &v[v_offset], ldv, &work[work_offset], k);

/* C1 := C1 - W' */

i__1 = lastc;

#pragma omp parallel for num_threads(4) private(j, i__)

for (i__ = 1; i__ <= i__1; ++i__) {i__2 = *k;

for (j = 1; j <= i__2; ++j) {c__[j + i__ * c_dim1] -= work[j + i__ * work_dim1];

/* L20: */

}

/* L30: */

}

} else if (lsame_(side, "R")) {

Link Copied

3 Replies

yuriisig

Beginner

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Email to a Friend
- Report Inappropriate Content

10-20-2010
06:04 AM

152 Views

Quoting yuriisig

...it is appropriate to give the information about the necessary changes in **LAPACK** (**CLAPACK**) code...

The article on this topic will appear in November.

yuriisig

Beginner

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Email to a Friend
- Report Inappropriate Content

11-08-2010
12:47 AM

152 Views

Here are excerpts from this article:

Modification of Diagonalization Algorithms.

..........

..........

..........

To overcome limitations imposed by the 32-bit architecture of the processor, the theory presented in [5] was developed. It required considerable revision of many functions of LAPACK package. These modifications enhanced the performance of diagonalization algorithms in [13] with respect to two parameters: calculation rate (by up to 80%) and demand for computer resources (by a factor of 6). A decrease in the computer resource used allowed calculation of more complex systems. New diagonalization algorithms are also efficient on 64-bit processors.

..........

..........

..........

However, in the LAPACK code, certain nontrivial inaccuracies concerning efficient use of block methods for matrix processing have not yet been eliminated. They considerably reduce the performance of the code obtained by assembling the programs that use the LAPACK package.

Let us consider as example the required modification of the dlarfb function from the LAPACK package (the latest version, 3.2.2). The performance of this function is mainly determined by two calls of the matrix multiplication function dgemm. The choice of this example is governed by the fact that the dlarfb function has numerous applications, not restricted to diagonalization of real symmetric matrices. Let us isolate from the code of the dlarfb function the block responsible for setting the initial symmetric real matrix in the lower triangle of a square matrix, and let us extract from this block the call to the dgemm function responsible for slowing down of the diagonalization program.

CALL DGEMM('Transpose', 'No transpose',

$ LASTC, K, LASTV-K,

$ ONE, C(K+1, 1), LDC, V(K+1, 1), LDV,

$ ONE, WORK, LDWORK)

Here two matrices are multiplied: In the first matrix of dimension LASTC × (LASTV−K), the number of rows is smaller than the number of columns, and in the second matrix of dimension (LASTV−K) × K, the number of columns K is very small (64–128 at optimal use of the dlarfb function). Because of the small number of columns in the second matrix, the processor cache is used extremely inefficiently. The problem can be solved by passing to multiplication of the transposed matrices (correspondingly, the multipliers switch places). Then the multiplication of the matrices will be set by the following code:

CALL DGEMM('Transpose', 'No transpose',

$ K, LASTC, LASTV-K,

$ ONE, V(K+1, 1), LDV, C(K+1, 1), LDC,

$ ONE, WORK, K)

Now the number of columns in the second matrix, in the course of calculation of the eigenvectors of the initial real symmetric matrix from the eigenvectors of the tridiagonal matrix, will be constant and equal to LASTC, which is much greater than K. In this case, efficient use of the processor cache is ensured.

..........

..........

..........

Such modification of the code, with proper matrix multiplication strategy, increases the multiplication rate by no less than 35% for the highly optimized code (on old processor, the gain will be still more significant). For the initial dgemm code from the LAPACK package, the difference reaches 45%.

..........

..........

..........

The developers of the LAPACK package and the INTEL Company using the LAPACK package in its work were informed on the possibility of considerable acceleration of the diagonalization procedure.

..........

..........

..........

1. Semenov, S.G. and Sigolaev, Yu.F., Zh. Obshch. Khim., 2006, vol. 76, no. 6, p. 1006.

2. Semenov, S.G. and Sigolaev, Yu.F., Zh. Obshch. Khim., 2006, vol. 76, no. 11, p. 1809.

3. Semenov, S.G. and Sigolaev, Yu.F., Zh. Obshch. Khim., 2007, vol. 77, no. 5, p. 780.

..........

..........

..........

5. Wilkinson, J.H., The Algebraic Eigenvalue Problem, Oxford: Clarendon, 1965.

..........

..........

..........

yuriisig

Beginner

- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Email to a Friend
- Report Inappropriate Content

02-25-2011
01:16 AM

152 Views

See article: http://depositfiles.com/files/tr3vox0pp

- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page

For more complete information about compiler optimizations, see our Optimization Notice.