I want to talk about how the Intel compiler implements the sin and cos functions. I tried to find out how it does it, and I saw this:
call __libm_sse2_sincos
Since I only knew fsincos, I wondered what this function does; as I could not find its source code, I disassembled it.
This is what I saw:
; Disassembly of __libm_sse2_sincos as shown by a debugger (a listing, not assemblable source).
; Only the entry sequence and the straight-line path taken when the range check passes are shown;
; the other path is elided by the author.
; NOTE(review): the immediate in "sub rsp, 68" is presumably hexadecimal (0x68), like the other
; numbers in this listing - confirm against the debugger settings.
sub rsp, 68
; xmm7/xmm6 are overwritten below, so they are saved first (presumably because they are
; callee-saved in the Windows x64 convention - confirm). The input double in xmm0 is spilled too.
movaps ss:[rsp+0x40], xmm7
movaps ss:[rsp+0x30], xmm6
movsd ss:[rsp+0x70], xmm0
; Range check on the input: take bits 48..63 of the double (sign, exponent, top mantissa bits),
; drop the sign bit, and leave this path when (ax - 0x3030) is above 0x10C5 as an unsigned
; 16-bit value. Values below the window wrap around in the subtraction and also branch away.
pextrw eax, xmm0, 3
and ax, 0x7FFF
sub ax, 0x3030
cmp ax, 0x10C5
ja libmmd.1800FDC1F
;{
; The input is duplicated into both 64-bit lanes, so every packed-double instruction below
; works on two values at once (presumably one lane per result, sin and cos - confirm).
unpcklpd xmm0, xmm0
; NOTE(review): the constants at 0x18020EAF0..0x18020EB70 are not part of the listing.
; The input is scaled by one of them and converted to an integer in edx; adding and then
; subtracting the same constant (xmm2) looks like the usual rounding trick - confirm.
movapd xmm1, ds:[0x18020EB70]
mulpd xmm1, xmm0
movapd xmm2, ds:[0x18020EB60]
cvtsd2si edx, xmm1
addpd xmm1, xmm2
movapd xmm3, ds:[0x18020EB50]
subpd xmm1, xmm2
movapd xmm2, ds:[0x18020EB40]
mulpd xmm3, xmm1
; Table lookup: the integer is biased, reduced to 6 bits (0..63) and multiplied by 64, so rax
; points at one 64-byte entry (four 16-byte vectors at +0x00, +0x10, +0x20, +0x30) of the
; table at 0x18020D9E0.
add rdx, 0x1C7600
movapd xmm4, xmm0
and rdx, 0x3F
movapd xmm5, ds:[0x18020EB30]
lea rax, qword ds:[0x18020D9E0]
shl rdx, 6
add rax, rdx
; From here on the multiples computed above are subtracted from the input in several steps and
; the differences are combined with the table entry and the remaining constants.
; NOTE(review): this looks like a multi-part argument reduction followed by a polynomial
; correction around the table values - confirm; the listing alone does not prove it.
mulpd xmm2, xmm1
subpd xmm0, xmm3
mulpd xmm1, ds:[0x18020EB20]
subpd xmm4, xmm3
movapd xmm7, ds:[rax+0x10]
movapd xmm3, xmm4
subpd xmm4, xmm2
mulpd xmm5, xmm0
subpd xmm0, xmm2
movapd xmm6, ds:[0x18020EB10]
mulpd xmm7, xmm4
subpd xmm3, xmm4
mulpd xmm5, xmm0
mulpd xmm0, xmm0
subpd xmm3, xmm2
movapd xmm2, ds:[rax]
subpd xmm1, xmm3
movapd xmm3, ds:[rax+0x30]
addpd xmm2, xmm3
subpd xmm7, xmm2
mulpd xmm1, xmm7
movapd xmm7, ds:[rax+0x10]
mulpd xmm2, xmm4
mulpd xmm6, xmm0
mulpd xmm3, xmm4
mulpd xmm2, xmm0
mulpd xmm7, xmm0
mulpd xmm0, xmm0
addpd xmm5, ds:[0x18020EB00]
mulpd xmm4, ds:[rax]
addpd xmm6, ds:[0x18020EAF0]
mulpd xmm5, xmm0
movapd xmm0, xmm3
addpd xmm3, ds:[rax+0x10]
addpd xmm6, xmm5
movq xmm5, xmm6
unpckhpd xmm6, xmm6
unpcklpd xmm5, xmm5
mulpd xmm6, xmm7
mulpd xmm2, xmm5
movapd xmm7, xmm4
addpd xmm4, xmm3
movapd xmm5, ds:[rax+0x10]
subpd xmm5, xmm3
subpd xmm3, xmm4
; Final summation: all partial terms are accumulated into xmm1.
addpd xmm1, ds:[rax+0x20]
addpd xmm5, xmm0
addpd xmm3, xmm7
addpd xmm1, xmm5
addpd xmm1, xmm3
addpd xmm1, xmm2
addpd xmm1, xmm6
addpd xmm1, xmm4
; The two lanes of the sum are split: low lane to xmm0, high lane to xmm1.
; NOTE(review): which lane is sin and which is cos cannot be told from the listing.
movq xmm0, xmm1
unpckhpd xmm1, xmm1
;} jmp libmmd.1800FDE1E
; ... Prepares the data to be processed, I think.
libmmd.1800FDC1F:
; ... Contains as many SIMD instructions as the first block.
ret
Later I found another function that looks somewhat similar at this URL:
https://github.com/mario007/renmas/blob/master/renmas3/asm/sincosps.py
I ported this code to my project (SDL, assembly), but my frame rate dropped by about 20 FPS compared with fsincos. Is that normal?
;=============================================================================================================
; float sin[4], cos[4] sincosps (float angle_radians[4])
; Computes the sine and cosine of the 4 packed single-precision angles in angle_radians[4].
; Input:    [sincosps_angle_radians] = 4 angles, in radians
; Output:   [sincosps_sin] = 4 sines, [sincosps_cos] = 4 cosines
; Clobbers: xmm0 - xmm7 (no general-purpose register is touched)
; Method:   t = |angle| * 2/pi is split into an integer quadrant q and a fraction f in [0, 1].
;           sin uses f in even quadrants and 1 - f in odd ones (cos the other way round), and
;           both evaluate  s * (p0 + s^2 * (p1 + s^2 * (p2 + s^2 * p3))).
;           The result signs come from bit 1 of q (sin) and bit 1 of q + 1 (cos).
; DATA:
_ps_am_inv_sign_mask dd 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
_ps_am_sign_mask dd 0x80000000, 0x80000000, 0x80000000, 0x80000000
_ps_am_pi_o_2 dd 1.57079632679, 1.57079632679, 1.57079632679, 1.57079632679
_ps_am_2_o_pi dd 0.63661977236, 0.63661977236, 0.63661977236, 0.63661977236
; _epi32_1 / _epi32_2 are INTEGER masks used by pand/paddd/pcmpeqd on the quadrant number.
; They must be written as integers: "dd 1.0" would emit the float bit pattern 0x3F800000.
_epi32_1 dd 1, 1, 1, 1
_ps_am_1 dd 1.0, 1.0, 1.0, 1.0
_epi32_2 dd 2, 2, 2, 2
_ps_sincos_p3 dd -0.00468175413, -0.00468175413, -0.00468175413, -0.00468175413
_ps_sincos_p2 dd 0.0796926262, 0.0796926262, 0.0796926262, 0.0796926262
_ps_sincos_p1 dd -0.64596409750621,-0.64596409750621,-0.64596409750621,-0.64596409750621
_ps_sincos_p0 dd 1.570796326794896, 1.570796326794896, 1.570796326794896, 1.570796326794896
;=============================================================================================================
sincosps:
        ; Load the angles into xmm0 as well as xmm7, so the routine does not depend on
        ; whatever the caller happened to leave in xmm0.
        movups  xmm0, [sincosps_angle_radians]
        movaps  xmm7, xmm0
        movups  xmm1, [_ps_am_inv_sign_mask]
        andps   xmm0, xmm1                      ; xmm0 = |angle|
        movups  xmm1, [_ps_am_sign_mask]
        andps   xmm7, xmm1                      ; xmm7 = sign bit of angle
        movups  xmm1, [_ps_am_2_o_pi]
        mulps   xmm0, xmm1                      ; xmm0 = t = |angle| * 2/pi

        ; Quadrant number and parity mask.
        pxor    xmm3, xmm3
        movups  xmm5, [_epi32_1]
        movups  xmm4, [_ps_am_1]
        cvttps2dq xmm2, xmm0                    ; xmm2 = q = trunc(t), as integers
        pand    xmm5, xmm2
        pcmpeqd xmm5, xmm3                      ; xmm5 = all ones where q is even
        movups  xmm3, [_epi32_1]
        movups  xmm1, [_epi32_2]
        cvtdq2ps xmm6, xmm2                     ; xmm6 = (float) q
        paddd   xmm3, xmm2                      ; xmm3 = q + 1
        pand    xmm2, xmm1                      ; xmm2 = q & 2
        pand    xmm3, xmm1                      ; xmm3 = (q + 1) & 2
        subps   xmm0, xmm6                      ; xmm0 = f = t - q
        pslld   xmm2, 30                        ; bit 1 -> bit 31: sign bit for sin
        minps   xmm0, xmm4                      ; clamp f to at most 1.0
        subps   xmm4, xmm0                      ; xmm4 = 1 - f
        pslld   xmm3, 30                        ; bit 1 -> bit 31: sign bit for cos

        ; Select the polynomial argument per lane according to the quadrant parity.
        movaps  xmm6, xmm4
        xorps   xmm2, xmm7                      ; sin sign ^= sign of the input angle
        movaps  xmm7, xmm5
        andps   xmm6, xmm7                      ; (1 - f) where q is even
        andnps  xmm7, xmm0                      ; f       where q is odd
        andps   xmm0, xmm5                      ; f       where q is even
        andnps  xmm5, xmm4                      ; (1 - f) where q is odd
        movups  xmm4, [_ps_sincos_p3]
        orps    xmm6, xmm7                      ; xmm6 = cos argument
        orps    xmm0, xmm5                      ; xmm0 = sin argument
        movups  xmm5, [_ps_sincos_p2]

        ; Evaluate both polynomials in parallel (xmm0/xmm2 for sin, xmm6/xmm3 for cos).
        movaps  xmm1, xmm0
        movaps  xmm7, xmm6
        mulps   xmm0, xmm0                      ; s^2
        mulps   xmm6, xmm6
        orps    xmm1, xmm2                      ; signed sin argument
        orps    xmm7, xmm3                      ; signed cos argument
        movaps  xmm2, xmm0
        movaps  xmm3, xmm6
        mulps   xmm0, xmm4                      ; p3 * s^2
        mulps   xmm6, xmm4
        movups  xmm4, [_ps_sincos_p1]
        addps   xmm0, xmm5                      ; + p2
        addps   xmm6, xmm5
        movups  xmm5, [_ps_sincos_p0]
        mulps   xmm0, xmm2                      ; * s^2
        mulps   xmm6, xmm3
        addps   xmm0, xmm4                      ; + p1
        addps   xmm6, xmm4
        mulps   xmm0, xmm2                      ; * s^2
        mulps   xmm6, xmm3
        addps   xmm0, xmm5                      ; + p0
        addps   xmm6, xmm5
        mulps   xmm0, xmm1                      ; * signed argument
        mulps   xmm6, xmm7

        movups  [sincosps_sin], xmm0            ; 4 sines
        movups  [sincosps_cos], xmm6            ; 4 cosines
        ret                                     ; return to the caller instead of running into the code below
; to add in put_object
; Converts the rotation angles from degrees to radians, calls sincosps and scatters the
; results into the three tables _xmm0 (x), _xmm2 (y) and _xmm1 (z):
; cosine at offsets +4 and +8, sine at offsets +0, +12 and +20.
; NOTE(review): assumes the 16 bytes at deg_rotation_x hold the x, y and z angles in lanes
; 0, 1 and 2, as the sine stores below imply - confirm against the data definition.
        movups  xmm0, [deg_rotation_x]
        movups  xmm1, [pi_180]
        mulps   xmm0, xmm1                      ; degrees -> radians
        movups  [sincosps_angle_radians], xmm0
        call    sincosps

        ; Cosines: one 16-byte load, then duplicate the even lanes and the odd lanes.
        ; The old code stored the high half of the movsldup result (lane 2 = cos z) as cos(y)
        ; and loaded 16 bytes from sincosps_cos + 8, past the end of the result.
        movups  xmm1, [sincosps_cos]            ; xmm1 = cos x, cos y, cos z, (lane 3)
        movsldup xmm0, xmm1                     ; xmm0 = cos x, cos x, cos z, cos z
        movsd   [_xmm0 + 4], xmm0               ; save cos(x)
        movhps  [_xmm1 + 4], xmm0               ; save cos(z)
        movshdup xmm0, xmm1                     ; xmm0 = cos y, cos y, (lane 3), (lane 3)
        movsd   [_xmm2 + 4], xmm0               ; save cos(y)

        ; Sines: scalar loads, so nothing is read beyond the 16-byte result buffer.
        ; save sin(x)
        movss   xmm0, [sincosps_sin]
        movss   [_xmm0 + 0], xmm0
        movss   [_xmm0 + 12], xmm0
        movss   [_xmm0 + 20], xmm0
        ; save sin(y)
        movss   xmm0, [sincosps_sin + 4]
        movss   [_xmm2 + 0], xmm0
        movss   [_xmm2 + 12], xmm0
        movss   [_xmm2 + 20], xmm0
        ; save sin(z)
        movss   xmm0, [sincosps_sin + 8]
        movss   [_xmm1 + 0], xmm0
        movss   [_xmm1 + 12], xmm0
        movss   [_xmm1 + 20], xmm0
        ret
;=============================================================================================================
; / sincosps
;=============================================================================================================
I saw in the optimization reference manual that fsincos has a latency of 119, while the latencies of the SIMD instructions are around 5-6.
Is latency the same thing as clock cycles?