So if I call a function in win32k.sys directly, will it be faster than the corresponding gdi32.dll function?
(http://msdn.microsoft.com/en-us/library/windows/hardware/ff564185%28v=vs.85%29.aspx)
I also learned that a .sys file is like a .dll, because I was able to open win32k.sys in http://www.nirsoft.net/utils/dll_export_viewer.html after changing its extension to win32k.dll.
As for uploading the dump file of the failed process, I can't do that right now (reinstalling VS2013).
And I have good news: I have found a new way to do the rotation (thanks to http://abreojosensamblador.net/Productos/AOWG/html/Pags_en/Chap04.html, sub-chapter 4.1.7.2.4, "Relative to the three axes").
This way, I do only 6 calculations instead of the 20 needed by the previous approach:
; Byte offsets of each field inside the 4-point array.
; Each point takes 16 bytes: x, y, z and color, 4 bytes apiece,
; so point N starts at (N - 1) * 16.
%define _1x 0
%define _1y 4
%define _1z 8
%define _1color 12
%define _2x 16
%define _2y 20
%define _2z 24
%define _2color 28
%define _3x 32
%define _3y 36
%define _3z 40
%define _3color 44
%define _4x 48
%define _4y 52
%define _4z 56
%define _4color 60
; Load the six rotation coefficients, replicating each single float into
; all 8 lanes of a ymm register:
;   ymm0 = cos(x)   ymm1 = cos(y)   ymm2 = cos(z)
;   ymm3 = sin(x)   ymm4 = sin(y)   ymm5 = sin(z)
; make_rotate uses ymm0-ymm5 as multiplicands and never writes them.
; NOTE(review): fsincosps_cos / fsincosps_sin and the _x / _y / _z offsets
; are not defined in this listing; presumably they are per-axis tables of
; precomputed cosines/sines - confirm against the rest of the source.
; Broadcast cos(x)
vbroadcastss ymm0, [fsincosps_cos + _x]
; Broadcast cos(y)
vbroadcastss ymm1, [fsincosps_cos + _y]
; Broadcast cos(z)
vbroadcastss ymm2, [fsincosps_cos + _z]
; Broadcast sin(x)
vbroadcastss ymm3, [fsincosps_sin + _x]
; Broadcast sin(y)
vbroadcastss ymm4, [fsincosps_sin + _y]
; Broadcast sin(z)
vbroadcastss ymm5, [fsincosps_sin + _z]
; make_rotate: rotate four 3-D points about the X axis, then Y, then Z.
;
; Reads  : x, y, z of the four points from the rotate_rsi buffer
;          (offsets _1x .. _4z). ymm0-ymm2 are used as cos(x), cos(y), cos(z)
;          and ymm3-ymm5 as sin(x), sin(y), sin(z), one value in every lane.
; Writes : rotated x, y, z to [rbx + _1x] .. [rbx + _4z].
;          The color fields are neither read nor written.
; Scratch: ymm6-ymm10, moveobject_tmp and the rotate_x_* / rotate_y_* /
;          rotate_z_* buffers.
;
; Method : every pass builds two 8-float vectors holding one (a, b) pair per
;          point and the same pairs swapped (b, a), multiplies the first by
;          cos and the second by sin, then uses vaddsubps, which subtracts in
;          even lanes and adds in odd lanes:
;              even lane: a*cos - b*sin      odd lane: b*cos + a*sin
;          That is 2 multiplies + 1 addsub per axis for all four points.
;
; The box below lists the formulas per pass. The two right-hand columns
; appear to be the lane number (1-8) and the byte offset (0-28) of each
; result inside the vector; "End." presumably marks a final output value.
make_rotate:
;--------------------------------------------------------.
; X-axis ;|
; y' = (y * cos(phi_x)) - (z * sin(phi_x)) ;| 1 0
; z' = (z * cos(phi_x)) + (y * sin(phi_x)) ;| 2 4
; 2y' = (2y * cos(phi_x)) - (2z * sin(phi_x)) ;| 3 8
; 2z' = (2z * cos(phi_x)) + (2y * sin(phi_x)) ;| 4 12
; 3y' = (3y * cos(phi_x)) - (3z * sin(phi_x)) ;| 5 16
; 3z' = (3z * cos(phi_x)) + (3y * sin(phi_x)) ;| 6 20
; 4y' = (4y * cos(phi_x)) - (4z * sin(phi_x)) ;| 7 24
; 4z' = (4z * cos(phi_x)) + (4y * sin(phi_x)) ;| 8 28
; ;|
; y = y' 3y = 3y' ;|
; z = z' 3z = 3z' ;|
; 2y = 2y' 4y = 4y' ;|
; 2z = 2z' 4z = 4z' ;|
;-------------------------------------------------------;|
; ;|
;-------------------------------------------------------;|
; Y-axis ;|
; End. z = z' = (z * cos(phi_y)) - (x * sin(phi_y)) ;| 1 0
; x' = (x * cos(phi_y)) + (z * sin(phi_y)) ;| 2 4
; End.2z = 2z' = (2z * cos(phi_y)) - (2x * sin(phi_y)) ;| 3 8
; 2x' = (2x * cos(phi_y)) + (2z * sin(phi_y)) ;| 4 12
; End.3z = 3z' = (3z * cos(phi_y)) - (3x * sin(phi_y)) ;| 5 16
; 3x' = (3x * cos(phi_y)) + (3z * sin(phi_y)) ;| 6 20
; End.4z = 4z' = (4z * cos(phi_y)) - (4x * sin(phi_y)) ;| 7 24
; 4x' = (4x * cos(phi_y)) + (4z * sin(phi_y)) ;| 8 28
; ;|
; x = x' ;|
; 2x = 2x' ;|
; 3x = 3x' ;|
; 4x = 4x' ;|
;-------------------------------------------------------;|
; ;|
;-------------------------------------------------------;|
; Z-axis ;|
; End. x = x' = (x * cos(phi_z)) - (y * sin(phi_z)) ;| 1 0
; End. y = y' = (y * cos(phi_z)) + (x * sin(phi_z)) ;| 2 4
; End.2x = 2x' = (2x * cos(phi_z)) - (2y * sin(phi_z)) ;| 3 8
; End.2y = 2y' = (2y * cos(phi_z)) + (2x * sin(phi_z)) ;| 4 12
; End.3x = 3x' = (3x * cos(phi_z)) - (3y * sin(phi_z)) ;| 5 16
; End.3y = 3y' = (3y * cos(phi_z)) + (3x * sin(phi_z)) ;| 6 20
; End.4x = 4x' = (4x * cos(phi_z)) - (4y * sin(phi_z)) ;| 7 24
; End.4y = 4y' = (4y * cos(phi_z)) + (4x * sin(phi_z)) ;| 8 28
;--------------------------------------------------------.
; ---- X pass: gather the operands ------------------------------------
; Each 8-byte load fetches the adjacent (y, z) pair of one point.
vmovlps xmm6, [rotate_rsi + _1y]
vmovlps xmm7, [rotate_rsi + _2y]
vmovlps xmm8, [rotate_rsi + _3y]
vmovlps xmm9, [rotate_rsi + _4y]
; rotate_x_yz = { y1, z1, y2, z2, y3, z3, y4, z4 }
vmovlps [rotate_x_yz + 0], xmm6
vmovlps [rotate_x_yz + 8], xmm7
vmovlps [rotate_x_yz + 16], xmm8
vmovlps [rotate_x_yz + 24], xmm9
; rotate_x_zy = { z1, y1, z2, y2, z3, y3, z4, y4 } (same pairs, swapped):
; element 1 of each register (z) goes to the even slot, element 0 (y) to
; the odd slot.
; NOTE(review): an in-register swap of adjacent floats (e.g. vpermilps with
; imm8 0xB1) might replace this store/reload sequence - verify before use.
vextractps dword [rotate_x_zy + 0], xmm6, 1 ; _1z
vmovss dword [rotate_x_zy + 4], xmm6 ; _1y
vextractps dword [rotate_x_zy + 8], xmm7, 1 ; _2z
vmovss dword [rotate_x_zy + 12], xmm7 ; _2y
vextractps dword [rotate_x_zy + 16], xmm8, 1 ; _3z
vmovss dword [rotate_x_zy + 20], xmm8 ; _3y
vextractps dword [rotate_x_zy + 24], xmm9, 1 ; _4z
vmovss dword [rotate_x_zy + 28], xmm9 ; _4y
; ---- X pass: compute --------------------------------------------------
; y' = (y * cos(phi_x)) - (z * sin(phi_x))   -> even lanes of ymm6
; z' = (z * cos(phi_x)) + (y * sin(phi_x))   -> odd lanes of ymm6
vmulps ymm6, ymm0, [rotate_x_yz] ; ymm6 = {y, z, ...} * cos(x)
vmulps ymm7, ymm3, [rotate_x_zy] ; ymm7 = {z, y, ...} * sin(x)
vaddsubps ymm6, ymm7
; Keep { y1', z1', ..., y4', z4' }: the y' values are reloaded from here
; when the Z-pass operands are built.
vmovups [moveobject_tmp], ymm6
; ---- Y pass: gather the operands ------------------------------------
; ymm6 still holds the (y', z') pairs; fetch the original x of each point.
vmovss xmm7, [rotate_rsi + _1x]
vmovss xmm8, [rotate_rsi + _2x]
vmovss xmm9, [rotate_rsi + _3x]
vmovss xmm10, [rotate_rsi + _4x]
; rotate_y_zx = { z1', x1, z2', x2, z3', x3, z4', x4 }
; Storing the whole vector one dword early puts every z' in an even slot;
; the y' values fall into the pad dword before the buffer and into the odd
; slots, where the scalar stores below overwrite them with x (slot +28 is
; not covered by the vector store and is filled by the last vmovss).
vmovups [rotate_y_zx - 4], ymm6
vmovss [rotate_y_zx + 4], xmm7
vmovss [rotate_y_zx + 12], xmm8
vmovss [rotate_y_zx + 20], xmm9
vmovss [rotate_y_zx + 28], xmm10
; rotate_y_xz = { x1, z1', x2, z2', x3, z3', x4, z4' }
; Stored in place: z' is already in the odd slots, so only the even slots
; (currently y') are overwritten with x.
vmovups [rotate_y_xz + 0], ymm6
vmovss [rotate_y_xz + 0], xmm7
vmovss [rotate_y_xz + 8], xmm8
vmovss [rotate_y_xz + 16], xmm9
vmovss [rotate_y_xz + 24], xmm10
; ---- Y pass: compute --------------------------------------------------
; z' = (z * cos(phi_y)) - (x * sin(phi_y))   -> even lanes of ymm6
; x' = (x * cos(phi_y)) + (z * sin(phi_y))   -> odd lanes of ymm6
vmulps ymm6, ymm1, [rotate_y_zx] ; ymm6 = {z, x, ...} * cos(y)
vmulps ymm7, ymm4, [rotate_y_xz] ; ymm7 = {x, z, ...} * sin(y)
vaddsubps ymm6, ymm7
; ---- Z pass: gather the operands ------------------------------------
; Reload y' of each point: the even dwords of the X-pass result that was
; saved in moveobject_tmp.
vmovss xmm7 , [moveobject_tmp]
vmovss xmm8 , [moveobject_tmp + 8]
vmovss xmm9 , [moveobject_tmp + 16]
vmovss xmm10, [moveobject_tmp + 24]
; rotate_z_xy = { x1', y1', x2', y2', x3', y3', x4', y4' }
; Same one-dword-early trick: ymm6 is { z, x', ... }, so shifting by -4
; puts every x' in an even slot; the odd slots are then filled with y'.
vmovups [rotate_z_xy - 4], ymm6
vmovss [rotate_z_xy + 4], xmm7
vmovss [rotate_z_xy + 12], xmm8
vmovss [rotate_z_xy + 20], xmm9
vmovss [rotate_z_xy + 28], xmm10
; rotate_z_yx = { y1', x1', y2', x2', y3', x3', y4', x4' }
; Stored in place: x' is already in the odd slots; even slots get y'.
vmovups [rotate_z_yx + 0], ymm6
vmovss [rotate_z_yx + 0], xmm7
vmovss [rotate_z_yx + 8], xmm8
vmovss [rotate_z_yx + 16], xmm9
vmovss [rotate_z_yx + 24], xmm10
; ---- Write back the final z values ------------------------------------
; The even lanes of the Y-pass result are the final z of each point; spill
; the vector and copy dwords 0, 8, 16, 24 to the output points.
vmovups [moveobject_tmp], ymm6
vmovss xmm6, [moveobject_tmp + 0]
vmovss xmm7, [moveobject_tmp + 8]
vmovss xmm8, [moveobject_tmp + 16]
vmovss xmm9, [moveobject_tmp + 24]
vmovss [rbx + _1z], xmm6
vmovss [rbx + _2z], xmm7
vmovss [rbx + _3z], xmm8
vmovss [rbx + _4z], xmm9
; ---- Z pass: compute --------------------------------------------------
; x' = (x * cos(phi_z)) - (y * sin(phi_z))   -> even lanes of ymm6
; y' = (y * cos(phi_z)) + (x * sin(phi_z))   -> odd lanes of ymm6
vmulps ymm6, ymm2, [rotate_z_xy] ; ymm6 = {x, y, ...} * cos(z)
vmulps ymm7, ymm5, [rotate_z_yx] ; ymm7 = {y, x, ...} * sin(z)
vaddsubps ymm6, ymm7
; ---- Write back the final x, y values ---------------------------------
; Each 8-byte (x, y) pair is copied to the matching output point; x and y
; are adjacent in the point layout, so one vmovlps store covers both.
vmovups [moveobject_tmp], ymm6
vmovlps xmm6, [moveobject_tmp + 0]
vmovlps xmm7, [moveobject_tmp + 8]
vmovlps xmm8, [moveobject_tmp + 16]
vmovlps xmm9, [moveobject_tmp + 24]
vmovlps [rbx + _1x], xmm6
vmovlps [rbx + _2x], xmm7
vmovlps [rbx + _3x], xmm8
vmovlps [rbx + _4x], xmm9
; NOTE(review): no ret or jump follows the last store in this listing; as
; pasted, execution would run on into the data that comes next. Confirm that
; the real source ends the routine here.
; rotate_rsi: input buffer read by make_rotate - four points of 16 bytes
; each, laid out as x, y, z, color (see the _1x .. _4color offsets).
; NOTE(review): the code that fills this buffer is not part of this listing.
rotate_rsi: dd 0 ; p1 x
dd 0 ; p1 y
dd 0 ; p1 z
dd 0 ; p1 color
dd 0 ; p2 x
dd 0 ; p2 y
dd 0 ; p2 z
dd 0 ; p2 color
dd 0 ; p3 x
dd 0 ; p3 y
dd 0 ; p3 z
dd 0 ; p3 color
dd 0 ; p4 x
dd 0 ; p4 y
dd 0 ; p4 z
dd 0 ; p4 color
; moveobject_tmp: 32-byte scratch area. make_rotate spills a whole ymm
; register here (8 floats, one pair per point) and then reads individual
; dwords or 8-byte pairs back from it.
moveobject_tmp: dd 0 ; pair 1, even lane (offset 0)
dd 0 ; pair 1, odd lane  (offset 4)
dd 0 ; pair 2, even lane (offset 8)
dd 0 ; pair 2, odd lane  (offset 12)
dd 0 ; pair 3, even lane (offset 16)
dd 0 ; pair 3, odd lane  (offset 20)
dd 0 ; pair 4, even lane (offset 24)
dd 0 ; pair 4, odd lane  (offset 28)
; X-axis operands (8 floats each). Right-hand number = byte offset.
; rotate_x_yz is multiplied by cos(x), rotate_x_zy by sin(x).
rotate_x_yz: dd 0 ; y 0
dd 0 ; z 4
dd 0 ; 2y 8
dd 0 ; 2z 12
dd 0 ; 3y 16
dd 0 ; 3z 20
dd 0 ; 4y 24
dd 0 ; 4z 28
; Same pairs as rotate_x_yz with the two members of each pair swapped.
rotate_x_zy: dd 0 ; z 0
dd 0 ; y 4
dd 0 ; 2z 8
dd 0 ; 2y 12
dd 0 ; 3z 16
dd 0 ; 3y 20
dd 0 ; 4z 24
dd 0 ; 4y 28
; Y-axis operands (8 floats each). Right-hand number = byte offset.
; rotate_y_zx is multiplied by cos(y), rotate_y_xz by sin(y).
; Pad dword - must stay directly in front of rotate_y_zx: make_rotate does
; a 32-byte store to [rotate_y_zx - 4], whose first float lands here.
dd 0
rotate_y_zx: dd 0 ; z 0
dd 0 ; x 4
dd 0 ; 2z 8
dd 0 ; 2x 12
dd 0 ; 3z 16
dd 0 ; 3x 20
dd 0 ; 4z 24
dd 0 ; 4x 28
; Same pairs as rotate_y_zx with the two members of each pair swapped.
rotate_y_xz: dd 0 ; x 0
dd 0 ; z 4
dd 0 ; 2x 8
dd 0 ; 2z 12
dd 0 ; 3x 16
dd 0 ; 3z 20
dd 0 ; 4x 24
dd 0 ; 4z 28
; Z-axis operands (8 floats each). Right-hand number = byte offset.
; rotate_z_xy is multiplied by cos(z), rotate_z_yx by sin(z).
; Pad dword - must stay directly in front of rotate_z_xy: make_rotate does
; a 32-byte store to [rotate_z_xy - 4], whose first float lands here.
dd 0
rotate_z_xy: dd 0 ; x 0
dd 0 ; y 4
dd 0 ; 2x 8
dd 0 ; 2y 12
dd 0 ; 3x 16
dd 0 ; 3y 20
dd 0 ; 4x 24
dd 0 ; 4y 28
; Same pairs as rotate_z_xy with the two members of each pair swapped.
rotate_z_yx: dd 0 ; y 0
dd 0 ; x 4
dd 0 ; 2y 8
dd 0 ; 2x 12
dd 0 ; 3y 16
dd 0 ; 3x 20
dd 0 ; 4y 24
dd 0 ; 4x 28
But as you can see, I use a lot of vmovss, because I am emulating a special instruction (swap E.0 with E.1, E.2 with E.3, ...) which, as far as I know, does not exist on the CPU :/
And, surprise: I calculate 4 pixels per call to this function.
PS: thanks for your programs.