#define n1 102 #define n2 102 #define niter 20000 // #define CFDfptype float #define CFDfptype double _RTALIGN16 CFDfptype U0[n1][n2]; _RTALIGN16 CFDfptype U1[n1][n2]; _RTALIGN16 CFDfptype U2[n1][n2]; _RTALIGN16 CFDfptype *cU0[n1][n2]; int i, j, t; CFDfptype Dx, Dy, Lx, Ly; CFDfptype InvDxDx, InvDyDy, Dt, alpha; CFDfptype totaltime, Stab, DtAlpha, DxDx, DyDy; clock_t time0, time1; FILE *f1 = NULL; CFDfptype InvDxDx2, InvDyDy2; alpha = 0.4; // Initializations totaltime = 1.0; Dt = totaltime / ( ( niter-1 ) * 1.0 ); Lx = 1.0; Ly = 1.0; Dx = Lx / ( (n1-1) * 1.0 ); Dy = Ly / ( (n2-1) * 1.0 ); InvDxDx = 1.0 / ( Dx*Dx ); InvDyDy = 1.0 / ( Dy*Dy ); DxDx = Dx*Dx; DyDy = Dy*Dy; Stab = alpha * Dt * ( InvDxDx + InvDyDy ); DtAlpha = Dt * alpha; // Stability if result <= 0.5 printf( "Stability factor : %f\n", Stab ); printf( "DtAlpha : %f\n", DtAlpha ); printf( "InvDxDx : %f\n", InvDxDx ); printf( "InvDyDy : %f\n", InvDyDy ); InvDxDx2 = ( InvDxDx * DtAlpha ); InvDyDy2 = ( InvDyDy * DtAlpha ); printf( "InvDxDx * DtAlpha: %f\n", InvDxDx2 ); printf( "InvDyDy * DtAlpha: %f\n", InvDyDy2 ); CFDfptype InvDD = InvDxDx2; HANDLE hProcess = NULL; hProcess = ::GetCurrentProcess(); // Test-Case 1 - Optimized without SIMD { for( i = 0; i < n1; i++ ) { for( j = 0; j < n2; j++ ) { U0[i][j] = 0.0; U1[i][j] = 0.0; } } for( i = 0; i < n1; i++ ) { U0[i][0] = 1.0; U1[i][0] = 1.0; } printf("Initialization Completed\n"); ::SetPriorityClass( hProcess, REALTIME_PRIORITY_CLASS ); time0 = clock(); // Core Processing for( t = 0; t < niter; t++ ) { // Version 1 - Original // for( i = 1; i < n1-1; i++ ) // Even // { // for( j = 1; j < n2-1; j++ ) // { // U1[i][j] = U0[i][j] + DtAlpha * // ( ( U0[i+1][j] - 2.0 * U0[i][j] + U0[i-1][j] ) * InvDxDx + ( U0[i][j+1] - 2.0 * U0[i][j] + U0[i][j-1] ) * InvDyDy ); // } // } // for( i = 1; i < n1-1; i++ ) // Odd // { // for( j = 1; j < n2-1; j++ ) // { // U0[i][j] = U1[i][j] + DtAlpha * // ( ( U1[i+1][j] - 2.0 * U1[i][j] + U1[i-1][j] ) * InvDxDx + ( U1[i][j+1] - 2.0 * U1[i][j] +U1[i][j-1] ) * InvDyDy ); // } // } // Version 2 - UnRolled Loops - 1-in-1 // for( i = 1; i < ( n1-1 ); i++ ) // { // for( j = 1; j < ( n2-1 ); j++ ) // { // U1[i][j] = U0[i][j] + InvDD * // ( ( U0[i+1][j] - 2.0 * U0[i][j] + U0[i-1][j] ) + ( U0[i][j+1] - 2.0 * U0[i][j] + U0[i][j-1] ) ); // U0[i][j] = U1[i][j] + InvDD * // ( ( U1[i+1][j] - 2.0 * U1[i][j] + U1[i-1][j] ) + ( U1[i][j+1] - 2.0 * U1[i][j] + U1[i][j-1] ) ); // } // } // Version 3 - UnRolled Loops - 2-in-1 // for( i = 1; i < ( n1-1 ); i++ ) // { // for( j = 1; j < ( n2-1 ); j+=2 ) // { // U1[i][j ] = U0[i][j ] + InvDD * // ( ( U0[i+1][j ] - 2.0 * U0[i][j ] + U0[i-1][j ] ) + ( U0[i][j+1 ] - 2.0 * U0[i][j ] + U0[i][j-1 ] ) ); // U0[i][j ] = U1[i][j] + InvDD * // ( ( U1[i+1][j ] - 2.0 * U1[i][j ] + U1[i-1][j ] ) + ( U1[i][j+1 ] - 2.0 * U1[i][j ] + U1[i][j-1 ] ) ); // U1[i][j+1] = U0[i][j+1] + InvDD * // ( ( U0[i+1][j+1] - 2.0 * U0[i][j+1] + U0[i-1][j+1] ) + ( U0[i][j+1+1] - 2.0 * U0[i][j+1] + U0[i][j-1+1] ) ); // U0[i][j+1] = U1[i][j+1] + InvDD * // ( ( U1[i+1][j+1] - 2.0 * U1[i][j+1] + U1[i-1][j+1] ) + ( U1[i][j+1+1] - 2.0 * U1[i][j+1] + U1[i][j-1+1] ) ); // } // } // Version 4 - UnRolled Loops - 3-in-1 for( i = 1; i < ( n1-1 ); i++ ) { j = 1; U1[i][j ] = U0[i][j ] + InvDD * ( ( U0[i+1][j ] - 2.0 * U0[i][j ] + U0[i-1][j ] ) + ( U0[i][j+1 ] - 2.0 * U0[i][j ] + U0[i][j-1 ] ) ); U0[i][j ] = U1[i][j] + InvDD * ( ( U1[i+1][j ] - 2.0 * U1[i][j ] + U1[i-1][j ] ) + ( U1[i][j+1 ] - 2.0 * U1[i][j ] + U1[i][j-1 ] ) ); for( j = 2; j < ( n2-1 ); j+=3 ) { U1[i][j ] = U0[i][j ] + InvDD * ( ( U0[i+1][j ] - 2.0 * U0[i][j ] + U0[i-1][j ] ) + ( U0[i][j+1 ] - 2.0 * U0[i][j ] + U0[i][j-1 ] ) ); U0[i][j ] = U1[i][j] + InvDD * ( ( U1[i+1][j ] - 2.0 * U1[i][j ] + U1[i-1][j ] ) + ( U1[i][j+1 ] - 2.0 * U1[i][j ] + U1[i][j-1 ] ) ); U1[i][j+1] = U0[i][j+1] + InvDD * ( ( U0[i+1][j+1] - 2.0 * U0[i][j+1] + U0[i-1][j+1] ) + ( U0[i][j+1+1] - 2.0 * U0[i][j+1] + U0[i][j-1+1] ) ); U0[i][j+1] = U1[i][j+1] + InvDD * ( ( U1[i+1][j+1] - 2.0 * U1[i][j+1] + U1[i-1][j+1] ) + ( U1[i][j+1+1] - 2.0 * U1[i][j+1] + U1[i][j-1+1] ) ); U1[i][j+2] = U0[i][j+2] + InvDD * ( ( U0[i+1][j+2] - 2.0 * U0[i][j+2] + U0[i-1][j+2] ) + ( U0[i][j+1+2] - 2.0 * U0[i][j+2] + U0[i][j-1+2] ) ); U0[i][j+2] = U1[i][j+2] + InvDD * ( ( U1[i+1][j+2] - 2.0 * U1[i][j+2] + U1[i-1][j+2] ) + ( U1[i][j+1+2] - 2.0 * U1[i][j+2] + U1[i][j-1+2] ) ); } } } time1 = clock(); ::SetPriorityClass( hProcess, NORMAL_PRIORITY_CLASS ); printf("Test-Case 1 - Total time : %f\n", ( double )( time1 - time0 ) ); f1 = fopen( "C:\\TestCase1Results.dat", "wt" ); for( i = 1; i < n1-1; i++ ) { for( j = 1; j < n2-1; j++ ) { fprintf( f1, "%d\t%d\t%f\n", i, j, U0[i][j] ); } } }