Finally, when I run it in Visual Studio, I don't understand why the fps is 180 oO. Here is the asm code of put pixel:
put_object PROC
;-----------------------------------------------------------------------
; put_object -- MSVC x64 listing, scalar SSE build (MASM / Intel syntax).
; Loops `size` times over records of 4 floats (x, y, z, color) starting
; at `object`; the body of put_pixel is inlined into the loop (see the
; "; 191" source comments). Each point is translated by repere[0..2],
; rotated with the six sin/cos values computed once before the loop,
; perspective-divided, and its color is stored into WindowsD when the
; bounds checks pass.
; In:    rcx = object, edx = size, r8 = repere
; Out:   nothing is returned in a register; results go to WindowsD and
;        to the global `az`
; Saved: rbx, rsi, rdi, xmm6-xmm15 are saved in the prologue and
;        restored in the epilogue (consistent with the Microsoft x64
;        convention -- NOTE(review): ABI is inferred, not stated here)
; Note:  every FP instruction inside the loop is a scalar *ss form, so
;        one float is processed per instruction.
;-----------------------------------------------------------------------
; 188 : {
$LN16:
; 189 : while (size--)
; size == 0: return immediately, before any register is saved.
test edx, edx
je $LN14@put_object
; rax = rsp at entry; every save slot below is addressed relative to it.
; rbx and rsi go above the return address (presumably the caller's
; shadow space), rdi is pushed, then 192 bytes of frame are reserved.
mov rax, rsp
mov QWORD PTR [rax+8], rbx
mov QWORD PTR [rax+24], rsi
push rdi
sub rsp, 192 ; 000000c0H
movaps XMMWORD PTR [rax-24], xmm6
; xmm6 = double constant multiplied into each angle below (the DEG2RAD
; factor of source lines 52-57; its bit pattern is in the symbol name).
movsdx xmm6, QWORD PTR __real@3f91df469963e11d
movaps XMMWORD PTR [rax-40], xmm7
; 188 : {
; Move the arguments out of rcx/edx/r8 before the calls to cos/sin:
; rsi = repere, ebx = size (loop counter), rdi = object.
mov rsi, r8
mov ebx, edx
mov rdi, rcx
movaps XMMWORD PTR [rax-56], xmm8
movaps XMMWORD PTR [rax-72], xmm9
movaps XMMWORD PTR [rax-88], xmm10
; xmm9 = (double)rotation_object[0] * xmm6; kept for the sin call later.
movss xmm9, DWORD PTR rotation_object
cvtps2pd xmm9, xmm9
movaps XMMWORD PTR [rax-104], xmm11
movaps XMMWORD PTR [rax-120], xmm12
movaps XMMWORD PTR [rsp+64], xmm13
mulsd xmm9, xmm6
; 52 : az.cosX = cos(DEG2RAD(rotation_object[0]));
movaps xmm0, xmm9
movaps XMMWORD PTR [rsp+48], xmm14
movaps XMMWORD PTR [rsp+32], xmm15
call cos
; xmm14 = cosX (double result in xmm0 narrowed to float).
; xmm8 = (double)rotation_object[1] * xmm6.
movss xmm8, DWORD PTR rotation_object+4
xorps xmm14, xmm14
cvtps2pd xmm8, xmm8
cvtsd2ss xmm14, xmm0
mulsd xmm8, xmm6
; 53 : az.cosY = cos(DEG2RAD(rotation_object[1]));
movaps xmm0, xmm8
call cos
; xmm15 = cosY. xmm7 = (double)rotation_object[2] * xmm6.
movss xmm7, DWORD PTR rotation_object+8
xorps xmm15, xmm15
cvtps2pd xmm7, xmm7
cvtsd2ss xmm15, xmm0
mulsd xmm7, xmm6
; 54 : az.cosZ = cos(DEG2RAD(rotation_object[2]));
movaps xmm0, xmm7
call cos
; xmm12 = cosZ.
xorps xmm12, xmm12
cvtsd2ss xmm12, xmm0
; 55 : az.sinX = sin(DEG2RAD(rotation_object[0]));
movaps xmm0, xmm9
call sin
; sinX is converted in xmm3 and spilled to the stack slot az$4$[rsp].
xorps xmm3, xmm3
cvtsd2ss xmm3, xmm0
; 56 : az.sinY = sin(DEG2RAD(rotation_object[1]));
movaps xmm0, xmm8
movss DWORD PTR az$4$[rsp], xmm3
call sin
; xmm6 = sinY (the multiplier constant is no longer needed); a copy is
; spilled to az$5$[rsp] because xmm6 is reused inside the loop.
xorps xmm6, xmm6
cvtsd2ss xmm6, xmm0
; 57 : az.sinZ = sin(DEG2RAD(rotation_object[2]));
movaps xmm0, xmm7
movss DWORD PTR az$5$[rsp], xmm6
call sin
; 52 : az.cosX = cos(DEG2RAD(rotation_object[0]));
; Loop-invariant setup. Bounds for the float range checks, decoded from
; the bit patterns in the symbol names:
;   xmm11 = 512.0f, xmm9 = -512.0f, xmm10 = 400.0f, xmm8 = -400.0f
; rcx = object + 8 bytes, so the loop reads x/y/z/color at
; [rcx-8], [rcx-4], [rcx], [rcx+4]. r9 = base address of WindowsD.
; xmm13 = sinZ (result of the last sin call).
movss xmm11, DWORD PTR __real@44000000
movss xmm9, DWORD PTR __real@c4000000
xorps xmm13, xmm13
lea rcx, QWORD PTR [rdi+8]
lea r9, OFFSET FLAT:WindowsD
movss xmm10, DWORD PTR __real@43c80000
movss xmm8, DWORD PTR __real@c3c80000
cvtsd2ss xmm13, xmm0
npad 8
; Loop register roles at the top of each iteration:
;   xmm14 = cosX, xmm15 = cosY, xmm12 = cosZ, xmm13 = sinZ,
;   xmm6 = sinY (reloaded at $LN5), sinX is read from az$4$[rsp],
;   ebx = remaining iterations, rsi = repere, rcx = current record + 8.
$LL2@put_object:
; 190 : {
; 191 : put_pixel(*(object + 0), *(object + 1), *(object + 2), *(object + 3), repere);
; xmm5 = z, xmm3 = y, xmm4 = x.
movss xmm5, DWORD PTR [rcx]
movss xmm3, DWORD PTR [rcx-4]
movss xmm4, DWORD PTR [rcx-8]
; 66 : az.y_end = az.x * ((az.cosX * az.sinZ) - (az.sinX * az.sinY * az.cosZ)) + az.y * ((az.sinX * az.sinY * az.sinZ) + (az.cosX * az.cosZ)) - az.z * (az.sinX * az.cosY);
movaps xmm1, xmm6
movaps xmm0, xmm13
movaps xmm7, xmm12
; 189 : while (size--)
; The counter is decremented here; its zero test is at the loop bottom.
dec ebx
; 58 :
; 59 : az.x = x;
; 60 : az.y = y;
; 61 : az.z = z;
; 62 : az.x += repere[0];
; 63 : az.y += repere[1];
; 64 : az.z += repere[2];
; 65 : az.x_end = az.x * ((az.cosY * az.cosZ)) - az.y * ((az.cosY * az.sinZ)) - az.z * (az.sinY);
mulss xmm0, xmm15
mulss xmm7, xmm15
; 190 : {
; 191 : put_pixel(*(object + 0), *(object + 1), *(object + 2), *(object + 3), repere);
; r8d = color: the 4th float of the record, truncated to an integer.
cvttss2si r8d, DWORD PTR [rcx+4]
; 55 : az.sinX = sin(DEG2RAD(rotation_object[0]));
movss xmm2, DWORD PTR az$4$[rsp]
; 56 : az.sinY = sin(DEG2RAD(rotation_object[1]));
movss DWORD PTR az+28, xmm6
; 66 : az.y_end = az.x * ((az.cosX * az.sinZ) - (az.sinX * az.sinY * az.cosZ)) + az.y * ((az.sinX * az.sinY * az.sinZ) + (az.cosX * az.cosZ)) - az.z * (az.sinX * az.cosY);
mulss xmm1, xmm2
; Every field of the global `az` (x, y, z and the six sin/cos values)
; is written to memory on each iteration, although the sin/cos values
; never change inside the loop.
movss DWORD PTR az+24, xmm2
movss DWORD PTR az, xmm4
movss DWORD PTR az+4, xmm3
movss DWORD PTR az+8, xmm5
movss DWORD PTR az+12, xmm14
movss DWORD PTR az+16, xmm15
movss DWORD PTR az+20, xmm12
movss DWORD PTR az+32, xmm13
; Translate by repere: xmm4 = x + repere[0], xmm3 = y + repere[1],
; xmm5 = z + repere[2]; the x_end terms are accumulated in xmm7.
addss xmm4, DWORD PTR [rsi]
mulss xmm7, xmm4
movss DWORD PTR az, xmm4
addss xmm3, DWORD PTR [rsi+4]
movss DWORD PTR az+4, xmm3
addss xmm5, DWORD PTR [rsi+8]
mulss xmm0, xmm3
subss xmm7, xmm0
movaps xmm0, xmm5
mulss xmm0, xmm6
; xmm6 (sinY) is overwritten here and reused as the y_end accumulator;
; that is why sinY is reloaded from az$5$[rsp] at $LN5.
movaps xmm6, xmm13
mulss xmm6, xmm14
subss xmm7, xmm0
movaps xmm0, xmm1
mulss xmm0, xmm12
mulss xmm1, xmm13
; az+36 = x_end.
movss DWORD PTR az+36, xmm7
subss xmm6, xmm0
movaps xmm0, xmm12
; 67 : az.z_end = az.x * ((az.cosX * az.sinY * az.cosZ) + (az.sinX * az.sinZ)) + az.y * ((az.sinX * az.cosZ) - (az.cosX * az.sinY * az.sinZ)) + az.z * (az.cosX * az.cosY);
; 68 :
; 69 : az.x = az.x_end;
; 70 : az.y = az.y_end;
; 71 : az.z = az.z_end;
; 72 :
; 73 : az.x = (az.x * FOCALE) / (az.z + PROFONDEUR);
; __real@44480000 is the bit pattern of 800.0f (FOCALE in the source).
mulss xmm7, DWORD PTR __real@44480000
mulss xmm0, xmm14
mulss xmm6, xmm4
addss xmm1, xmm0
movaps xmm0, xmm2
mulss xmm0, xmm15
mulss xmm1, xmm3
mulss xmm2, xmm12
addss xmm6, xmm1
movss xmm1, DWORD PTR az$5$[rsp]
mulss xmm0, xmm5
mulss xmm1, xmm14
subss xmm6, xmm0
movaps xmm0, xmm1
mulss xmm0, xmm13
mulss xmm1, xmm12
; az+40 = y_end.
movss DWORD PTR az+40, xmm6
subss xmm2, xmm0
movaps xmm0, xmm13
mulss xmm0, DWORD PTR az$4$[rsp]
; 74 : az.y = (az.y * FOCALE) / (az.z + PROFONDEUR);
mulss xmm6, DWORD PTR __real@44480000
mulss xmm2, xmm3
addss xmm1, xmm0
movaps xmm0, xmm15
mulss xmm1, xmm4
mulss xmm0, xmm14
addss xmm2, xmm1
mulss xmm0, xmm5
addss xmm2, xmm0
; az+44 = z_end, also copied to az.z (az+8).
movss DWORD PTR az+44, xmm2
movss DWORD PTR az+8, xmm2
; xmm2 = z_end + 2000.0f (bit pattern 44fa0000, PROFONDEUR in the
; source); it is the divisor shared by both projected coordinates.
addss xmm2, DWORD PTR __real@44fa0000
divss xmm6, xmm2
divss xmm7, xmm2
; 75 :
; 76 : offset_pixel = REPERE - (LENGTH * (int)az.y) + (int)az.x;
; eax = (int)y, edx = (int)x; `shl eax, 10` multiplies by 1024 (LENGTH),
; then edx = 409087 - 1024*(int)y + (int)x.
cvttss2si eax, xmm6
cvttss2si edx, xmm7
movss DWORD PTR az, xmm7
shl eax, 10
sub edx, eax
add edx, 409087 ; 00063dffH
movss DWORD PTR az+4, xmm6
; 77 :
; 78 : if (offset_pixel < LENGTH*WIDTH && offset_pixel >= 0)
; One unsigned compare covers both conditions: a negative offset is a
; huge unsigned value, so `ja` also rejects offset_pixel < 0.
cmp edx, 819199 ; 000c7fffH
ja SHORT $LN5@put_object
; 79 : if ((az.x <= LENGTH / 2 && az.x >= -LENGTH / 2) && (az.y <= WIDTH / 2 && az.y >= -WIDTH / 2))
; Skip the store when x > 512, x < -512, y > 400 or y < -400.
comiss xmm11, xmm7
jb SHORT $LN5@put_object
comiss xmm7, xmm9
jb SHORT $LN5@put_object
comiss xmm10, xmm6
jb SHORT $LN5@put_object
comiss xmm6, xmm8
jb SHORT $LN5@put_object
; 80 : WindowsD[offset_pixel] = color;
; 4-byte store at WindowsD + offset_pixel*4.
movsxd rax, edx
mov DWORD PTR [r9+rax*4], r8d
$LN5@put_object:
; 189 : while (size--)
; Restore xmm6 = sinY for the next iteration.
movss xmm6, DWORD PTR az$5$[rsp]
; 192 : object += 4;
; Advance by one record (4 floats = 16 bytes); loop while ebx != 0.
add rcx, 16
test ebx, ebx
jne $LL2@put_object
; 193 : }
; 194 : }
; Epilogue: restore xmm6-xmm15, rbx and rsi from the slots written in
; the prologue (r11 = rsp + 192 = entry rsp - 8), then pop rdi.
movaps xmm15, XMMWORD PTR [rsp+32]
movaps xmm14, XMMWORD PTR [rsp+48]
movaps xmm13, XMMWORD PTR [rsp+64]
lea r11, QWORD PTR [rsp+192]
mov rbx, QWORD PTR [r11+16]
mov rsi, QWORD PTR [r11+32]
movaps xmm12, XMMWORD PTR [rsp+80]
movaps xmm11, XMMWORD PTR [rsp+96]
movaps xmm10, XMMWORD PTR [rsp+112]
movaps xmm9, XMMWORD PTR [rsp+128]
movaps xmm8, XMMWORD PTR [rsp+144]
movaps xmm7, XMMWORD PTR [rsp+160]
movaps xmm6, XMMWORD PTR [rsp+176]
mov rsp, r11
pop rdi
$LN14@put_object:
ret 0
put_object ENDP
I really don't get it, and if I enable the /arch:AVX option, I drop to 160 fps, 20 frames fewer :/
And it's the same thing: it still only works on one single-precision value at a time (scalar, not packed):
put_object PROC
;-----------------------------------------------------------------------
; put_object -- MSVC x64 listing, /arch:AVX build (MASM / Intel syntax).
; Loops `size` times over records of 4 floats (x, y, z, color) starting
; at `object`; the body of put_pixel is inlined into the loop (see the
; "; 191" source comments). Each point is translated by repere[0..2],
; rotated with the six sin/cos values computed once before the loop,
; perspective-divided, and its color is stored into WindowsD when the
; bounds checks pass.
; In:    rcx = object, edx = size, r8 = repere
; Out:   nothing is returned in a register; results go to WindowsD and
;        to the global `az`
; Saved: rbx, rsi, rdi, xmm6-xmm15 are saved in the prologue and
;        restored in the epilogue (consistent with the Microsoft x64
;        convention -- NOTE(review): ABI is inferred, not stated here)
; Note:  the arithmetic uses VEX-encoded three-operand forms, but every
;        FP instruction in the loop is still a scalar *ss form: one
;        float is processed per instruction.
; NOTE(review): `_vcvtsd2ss2` below is not a standard AVX mnemonic;
;        presumably it is how this listing prints the double->float
;        conversion (vcvtsd2ss) -- confirm before assembling this text.
;-----------------------------------------------------------------------
; 188 : {
$LN16:
; 189 : while (size--)
; size == 0: return immediately, before any register is saved.
test edx, edx
je $LN14@put_object
; rax = rsp at entry; every save slot below is addressed relative to it.
; rbx and rsi go above the return address (presumably the caller's
; shadow space), rdi is pushed, then 208 bytes of frame are reserved.
mov rax, rsp
mov QWORD PTR [rax+8], rbx
mov QWORD PTR [rax+24], rsi
push rdi
sub rsp, 208 ; 000000d0H
vmovss xmm0, DWORD PTR rotation_object
vmovaps XMMWORD PTR [rax-24], xmm6
; xmm6 = double constant multiplied into each angle below (the DEG2RAD
; factor of source lines 52-57; its bit pattern is in the symbol name).
vmovsd xmm6, QWORD PTR __real@3f91df469963e11d
vmovaps XMMWORD PTR [rax-40], xmm7
vmovaps XMMWORD PTR [rax-56], xmm8
vmovaps XMMWORD PTR [rax-72], xmm9
; 188 : {
; Move the arguments out of rcx/edx/r8 before the calls to cos/sin:
; rsi = repere, ebx = size (loop counter), rdi = object.
mov rsi, r8
mov ebx, edx
mov rdi, rcx
vcvtps2pd xmm0, xmm0
vmovaps XMMWORD PTR [rax-88], xmm10
vmovaps XMMWORD PTR [rax-104], xmm11
vmovaps XMMWORD PTR [rax-120], xmm12
; xmm8 = (double)rotation_object[0] * xmm6; kept for the sin call later.
vmulsd xmm8, xmm0, xmm6
vmovaps XMMWORD PTR [rsp+80], xmm13
; 52 : az.cosX = cos(DEG2RAD(rotation_object[0]));
vmovaps xmm0, xmm8
vmovaps XMMWORD PTR [rsp+64], xmm14
vmovaps XMMWORD PTR [rsp+48], xmm15
call cos
; xmm15 = cosX. xmm7 = (double)rotation_object[1] * xmm6.
vmovss xmm1, DWORD PTR rotation_object+4
vcvtps2pd xmm1, xmm1
_vcvtsd2ss2 xmm15, xmm0
vmulsd xmm7, xmm1, xmm6
; 53 : az.cosY = cos(DEG2RAD(rotation_object[1]));
vmovups xmm0, xmm7
call cos
; cosY is not kept in a register: it is spilled to az$2$[rsp] and
; reloaded into xmm4 at the top of every loop iteration.
_vcvtsd2ss2 xmm1, xmm0
vmovss DWORD PTR az$2$[rsp], xmm1
vmovss xmm1, DWORD PTR rotation_object+8
vcvtps2pd xmm1, xmm1
; xmm6 = (double)rotation_object[2] * xmm6; this overwrites the
; multiplier constant, which is not needed after this point.
vmulsd xmm6, xmm1, xmm6
; 54 : az.cosZ = cos(DEG2RAD(rotation_object[2]));
vmovups xmm0, xmm6
call cos
; xmm13 = cosZ.
_vcvtsd2ss2 xmm13, xmm0
; 55 : az.sinX = sin(DEG2RAD(rotation_object[0]));
vmovups xmm0, xmm8
call sin
; xmm8 = sinX, with a copy spilled to az$4$[rsp].
_vcvtsd2ss2 xmm8, xmm0
; 56 : az.sinY = sin(DEG2RAD(rotation_object[1]));
vmovups xmm0, xmm7
vmovss DWORD PTR az$4$[rsp], xmm8
call sin
; xmm7 = sinY, with a copy spilled to az$5$[rsp].
_vcvtsd2ss2 xmm7, xmm0
; 57 : az.sinZ = sin(DEG2RAD(rotation_object[2]));
vmovups xmm0, xmm6
vmovss DWORD PTR az$5$[rsp], xmm7
call sin
; 52 : az.cosX = cos(DEG2RAD(rotation_object[0]));
; Loop-invariant setup. Bounds for the float range checks, decoded from
; the bit patterns in the symbol names:
;   xmm12 = 512.0f, xmm10 = -512.0f, xmm11 = 400.0f, xmm9 = -400.0f
; rcx = object + 8 bytes, so the loop reads x/y/z/color at
; [rcx-8], [rcx-4], [rcx], [rcx+4]. r9 = base address of WindowsD.
; xmm14 = sinZ (result of the last sin call).
vmovss xmm12, DWORD PTR __real@44000000
vmovss xmm10, DWORD PTR __real@c4000000
lea rcx, QWORD PTR [rdi+8]
lea r9, OFFSET FLAT:WindowsD
vmovss xmm11, DWORD PTR __real@43c80000
vmovss xmm9, DWORD PTR __real@c3c80000
_vcvtsd2ss2 xmm14, xmm0
npad 13
; Loop register roles at the top of each iteration:
;   xmm15 = cosX, xmm13 = cosZ, xmm14 = sinZ,
;   xmm8 = sinX and xmm7 = sinY (both reloaded at $LN5),
;   cosY is read from az$2$[rsp],
;   ebx = remaining iterations, rsi = repere, rcx = current record + 8.
$LL2@put_object:
; 190 : {
; 191 : put_pixel(*(object + 0), *(object + 1), *(object + 2), *(object + 3), repere);
; xmm1 = y, xmm2 = z, xmm0 = x; the loop counter is decremented here
; and tested at the loop bottom. r8d = color: the 4th float of the
; record, truncated to an integer.
vmovss xmm1, DWORD PTR [rcx-4]
vmovss xmm2, DWORD PTR [rcx]
vmovss xmm0, DWORD PTR [rcx-8]
dec ebx
vcvttss2si r8d, DWORD PTR [rcx+4]
; 53 : az.cosY = cos(DEG2RAD(rotation_object[1]));
vmovss xmm4, DWORD PTR az$2$[rsp]
; 58 :
; 59 : az.x = x;
; 60 : az.y = y;
; Every field of the global `az` (x, y, z and the six sin/cos values)
; is written to memory on each iteration, although the sin/cos values
; never change inside the loop.
vmovss DWORD PTR az+4, xmm1
vmovss DWORD PTR az, xmm0
; 61 : az.z = z;
vmovss DWORD PTR az+8, xmm2
vmovss DWORD PTR az+16, xmm4
vmovss DWORD PTR az+12, xmm15
vmovss DWORD PTR az+20, xmm13
vmovss DWORD PTR az+24, xmm8
vmovss DWORD PTR az+28, xmm7
vmovss DWORD PTR az+32, xmm14
; 62 : az.x += repere[0];
; xmm8 (sinX) is overwritten with the translated x; from here on sinX
; is read from az$4$[rsp].
vaddss xmm8, xmm0, DWORD PTR [rsi]
; 63 : az.y += repere[1];
; 64 : az.z += repere[2];
; 65 : az.x_end = az.x * ((az.cosY * az.cosZ)) - az.y * ((az.cosY * az.sinZ)) - az.z * (az.sinY);
vmulss xmm0, xmm13, xmm4
vmulss xmm3, xmm0, xmm8
vmovss DWORD PTR az, xmm8
; xmm5 = y + repere[1].
vaddss xmm5, xmm1, DWORD PTR [rsi+4]
vmulss xmm1, xmm14, xmm4
vmovss DWORD PTR az+4, xmm5
; xmm7 (sinY) is overwritten with z + repere[2]; from here on sinY is
; read from az$5$[rsp].
vaddss xmm7, xmm2, DWORD PTR [rsi+8]
vmulss xmm2, xmm1, xmm5
vmovss xmm1, DWORD PTR az$5$[rsp]
vsubss xmm3, xmm3, xmm2
; 66 : az.y_end = az.x * ((az.cosX * az.sinZ) - (az.sinX * az.sinY * az.cosZ)) + az.y * ((az.sinX * az.sinY * az.sinZ) + (az.cosX * az.cosZ)) - az.z * (az.sinX * az.cosY);
vmulss xmm2, xmm1, DWORD PTR az$4$[rsp]
vmulss xmm0, xmm7, xmm1
vmulss xmm1, xmm14, xmm15
; xmm6 = x_end, stored to az+36.
vsubss xmm6, xmm3, xmm0
vmovss DWORD PTR az+36, xmm6
vmulss xmm0, xmm2, xmm13
vmulss xmm2, xmm2, xmm14
vsubss xmm1, xmm1, xmm0
vmulss xmm3, xmm1, xmm8
vmulss xmm0, xmm13, xmm15
vaddss xmm1, xmm2, xmm0
vmulss xmm0, xmm4, DWORD PTR az$4$[rsp]
vmulss xmm2, xmm1, xmm5
vaddss xmm3, xmm3, xmm2
; 67 : az.z_end = az.x * ((az.cosX * az.sinY * az.cosZ) + (az.sinX * az.sinZ)) + az.y * ((az.sinX * az.cosZ) - (az.cosX * az.sinY * az.sinZ)) + az.z * (az.cosX * az.cosY);
vmulss xmm2, xmm15, DWORD PTR az$5$[rsp]
vmulss xmm1, xmm0, xmm7
; xmm4 = y_end, stored to az+40.
vsubss xmm4, xmm3, xmm1
vmovss DWORD PTR az+40, xmm4
vmulss xmm1, xmm13, DWORD PTR az$4$[rsp]
vmulss xmm0, xmm2, xmm14
vmulss xmm2, xmm2, xmm13
vsubss xmm1, xmm1, xmm0
vmulss xmm0, xmm14, DWORD PTR az$4$[rsp]
vmulss xmm3, xmm1, xmm5
vaddss xmm1, xmm2, xmm0
vmulss xmm0, xmm15, DWORD PTR az$2$[rsp]
vmulss xmm2, xmm1, xmm8
vaddss xmm3, xmm3, xmm2
vmulss xmm1, xmm0, xmm7
; 68 :
; 69 : az.x = az.x_end;
; 70 : az.y = az.y_end;
; 71 : az.z = az.z_end;
; 72 :
; 73 : az.x = (az.x * FOCALE) / (az.z + PROFONDEUR);
; __real@44480000 is the bit pattern of 800.0f (FOCALE in the source)
; and __real@44fa0000 that of 2000.0f (PROFONDEUR). xmm2 = z_end,
; xmm3 = z_end + 2000.0f is the divisor shared by both coordinates.
vmulss xmm0, xmm6, DWORD PTR __real@44480000
vaddss xmm2, xmm3, xmm1
vaddss xmm3, xmm2, DWORD PTR __real@44fa0000
vmovss DWORD PTR az+44, xmm2
vmovss DWORD PTR az+8, xmm2
; 74 : az.y = (az.y * FOCALE) / (az.z + PROFONDEUR);
; xmm5 = projected x, xmm0 = projected y.
vmulss xmm1, xmm4, DWORD PTR __real@44480000
vdivss xmm5, xmm0, xmm3
vdivss xmm0, xmm1, xmm3
; 75 :
; 76 : offset_pixel = REPERE - (LENGTH * (int)az.y) + (int)az.x;
; eax = (int)y, edx = (int)x; `shl eax, 10` multiplies by 1024 (LENGTH),
; then edx = 409087 - 1024*(int)y + (int)x.
vcvttss2si eax, xmm0
vcvttss2si edx, xmm5
vmovss DWORD PTR az, xmm5
shl eax, 10
sub edx, eax
add edx, 409087 ; 00063dffH
vmovss DWORD PTR az+4, xmm0
; 77 :
; 78 : if (offset_pixel < LENGTH*WIDTH && offset_pixel >= 0)
; One unsigned compare covers both conditions: a negative offset is a
; huge unsigned value, so `ja` also rejects offset_pixel < 0.
cmp edx, 819199 ; 000c7fffH
ja SHORT $LN5@put_object
; 79 : if ((az.x <= LENGTH / 2 && az.x >= -LENGTH / 2) && (az.y <= WIDTH / 2 && az.y >= -WIDTH / 2))
; Skip the store when x > 512, x < -512, y > 400 or y < -400.
vcomiss xmm12, xmm5
jb SHORT $LN5@put_object
vcomiss xmm5, xmm10
jb SHORT $LN5@put_object
vcomiss xmm11, xmm0
jb SHORT $LN5@put_object
vcomiss xmm0, xmm9
jb SHORT $LN5@put_object
; 80 : WindowsD[offset_pixel] = color;
; 4-byte store at WindowsD + offset_pixel*4.
movsxd rax, edx
mov DWORD PTR [r9+rax*4], r8d
$LN5@put_object:
; 189 : while (size--)
; Restore xmm8 = sinX and xmm7 = sinY for the next iteration.
vmovss xmm8, DWORD PTR az$4$[rsp]
vmovss xmm7, DWORD PTR az$5$[rsp]
; 192 : object += 4;
; Advance by one record (4 floats = 16 bytes); loop while ebx != 0.
add rcx, 16
test ebx, ebx
jne $LL2@put_object
; 193 : }
; 194 : }
; Epilogue: restore xmm6-xmm15, rbx and rsi from the slots written in
; the prologue (r11 = rsp + 208 = entry rsp - 8), then pop rdi.
vmovaps xmm15, XMMWORD PTR [rsp+48]
vmovaps xmm14, XMMWORD PTR [rsp+64]
vmovaps xmm13, XMMWORD PTR [rsp+80]
lea r11, QWORD PTR [rsp+208]
mov rbx, QWORD PTR [r11+16]
mov rsi, QWORD PTR [r11+32]
vmovaps xmm12, XMMWORD PTR [rsp+96]
vmovaps xmm11, XMMWORD PTR [rsp+112]
vmovaps xmm10, XMMWORD PTR [rsp+128]
vmovaps xmm9, XMMWORD PTR [rsp+144]
vmovaps xmm8, XMMWORD PTR [rsp+160]
vmovaps xmm7, XMMWORD PTR [rsp+176]
vmovaps xmm6, XMMWORD PTR [rsp+192]
mov rsp, r11
pop rdi
$LN14@put_object:
ret 0
put_object ENDP
But anyway, the code produced by Visual Studio is more readable than GCC's asm :o and faster: 180 fps vs 30.
I will integrate the Intel compiler later.
And as for C++, I don't know what will happen if I translate it into C++.