I want to talk about how the Intel compiler implements the sin and cos functions. I tried to find out how it does this, and I saw the following call:
call      __libm_sse2_sincos 
Since I only knew about fsincos, I wondered what this function does. I could not find its source code, so I disassembled it.
This is what I saw:
; Disassembly excerpt of __libm_sse2_sincos (module name in the labels: libmmd).
; Only the entry code and the first (in-range) path are listed; the other
; paths are elided by the "..." comments near the end.
; NOTE(review): every ds:[0x18020....] operand is a constant/table inside the
; DLL whose contents are not shown here, so the meaning attributed to those
; constants below is an assumption to be confirmed.
;
; The immediate below has no radix prefix. Read as hex (0x68) it leaves rsp
; 16-byte aligned for the movaps spills that follow, given the usual
; "rsp % 16 == 8 on entry" rule; read as decimal it would not.
sub 		rsp, 68
	; Spill xmm6/xmm7 before they are overwritten further down
	; (presumably because they are callee-saved in the Microsoft x64 ABI).
	movaps		ss:[rsp+0x40], xmm7
	movaps		ss:[rsp+0x30], xmm6
	; Keep a copy of the input double in memory.
	movsd		ss:[rsp+0x70], xmm0
	; Range check on |x|: take the top 16 bits of the double (sign, exponent,
	; 4 mantissa bits), clear the sign, and test with one unsigned compare
	; whether the value lies in 0x3030 .. 0x3030+0x10C5. Anything outside
	; that window branches to the other path.
	pextrw 		eax, xmm0, 3
	and 		ax, 0x7FFF
	sub 		ax, 0x3030
	cmp 		ax, 0x10C5
	ja libmmd.1800FDC1F
	;{
		; Duplicate x into both lanes; the packed code below processes two
		; values at once.
		unpcklpd	xmm0, xmm0
		; Scale x by a constant and convert to an integer (edx). The
		; add-then-subtract of the same constant (xmm2) looks like the usual
		; "add a large magic number" rounding trick - TODO confirm against
		; the constant values.
		movapd 		xmm1, ds:[0x18020EB70]
		mulpd		xmm1, xmm0
		movapd		xmm2, ds:[0x18020EB60]
		cvtsd2si	edx, xmm1
		addpd		xmm1, xmm2
		movapd		xmm3, ds:[0x18020EB50]
		subpd		xmm1, xmm2
		movapd		xmm2, ds:[0x18020EB40]
		mulpd		xmm3, xmm1
		; Table lookup address: index = (edx + 0x1C7600) & 0x3F, i.e. one of
		; 64 entries; each entry is 64 bytes (shl 6); rax = table base + offset.
		add			rdx, 0x1C7600
		movapd		xmm4, xmm0
		and			rdx, 0x3F
		movapd		xmm5, ds:[0x18020EB30]
		lea			rax, qword  ds:[0x18020D9E0]
		shl			rdx, 6
		add 		rax, rdx
		; Subtract multiples of the rounded value from x in several steps
		; (xmm3, then xmm2, with xmm1 tracking a correction term). Presumably
		; a multi-part argument reduction that keeps the rounding error -
		; verify against the constants.
		mulpd		xmm2, xmm1
		subpd 		xmm0, xmm3
		mulpd 		xmm1, ds:[0x18020EB20]
		subpd 		xmm4, xmm3
		movapd 		xmm7, ds:[rax+0x10]
		movapd	 	xmm3, xmm4
		subpd 		xmm4, xmm2
		mulpd		xmm5, xmm0
		subpd 		xmm0, xmm2
		movapd 		xmm6, ds:[0x18020EB10]
		mulpd 		xmm7, xmm4
		subpd		xmm3, xmm4
		mulpd 		xmm5, xmm0
		mulpd 		xmm0, xmm0
		subpd 		xmm3, xmm2
		; Combine the reduced argument with the four 16-byte fields of the
		; selected table entry ([rax], [rax+0x10], [rax+0x20], [rax+0x30]).
		movapd 		xmm2, ds:[rax]
		subpd 		xmm1, xmm3
		movapd 		xmm3, ds:[rax+0x30]
		addpd 		xmm2, xmm3
		subpd 		xmm7, xmm2
		mulpd 		xmm1, xmm7
		movapd 		xmm7, ds:[rax+0x10]
		mulpd 		xmm2, xmm4
		mulpd 		xmm6, xmm0
		mulpd 		xmm3, xmm4
		mulpd 		xmm2, xmm0
		mulpd 		xmm7, xmm0
		mulpd 		xmm0, xmm0
		; Polynomial terms in powers of the squared reduced argument
		; (xmm0 was squared above and is squared again here).
		addpd 		xmm5, ds:[0x18020EB00]
		mulpd 		xmm4, ds:[rax]
		addpd		xmm6, ds:[0x18020EAF0]
		mulpd		xmm5, xmm0
		movapd 		xmm0, xmm3
		addpd 		xmm3, ds:[rax+0x10]
		addpd		xmm6, xmm5
		; Split the packed polynomial result: xmm5 = low lane broadcast,
		; xmm6 = high lane broadcast.
		movq 		xmm5, xmm6
		unpckhpd	xmm6, xmm6
		unpcklpd	xmm5, xmm5
		mulpd		xmm6, xmm7
		mulpd 		xmm2, xmm5
		movapd 		xmm7, xmm4
		addpd		xmm4, xmm3
		movapd		xmm5, ds:[rax+0x10]
		subpd 		xmm5, xmm3
		subpd		xmm3, xmm4
		; Accumulate all partial terms into xmm1.
		addpd 		xmm1, ds:[rax+0x20]
		addpd 		xmm5, xmm0
		addpd 		xmm3, xmm7
		addpd 		xmm1, xmm5
		addpd 		xmm1, xmm3
		addpd 		xmm1, xmm2
		addpd 		xmm1, xmm6
		addpd 		xmm1, xmm4
		; Unpack the two lanes of the packed result: xmm0 = low lane,
		; xmm1 = high lane (presumably one is sin and the other cos; which is
		; which cannot be determined from this listing).
		movq 		xmm0, xmm1
		unpckhpd 	xmm1, xmm1
	;} jmp libmmd.1800FDE1E
	; ... Prepares the data to be processed, in my opinion.
	libmmd.1800FDC1F:
	; ... Contains as many SIMD instructions as the first block.
	ret
Later I found another function that looks somewhat similar, at this URL:
https://github.com/mario007/renmas/blob/master/renmas3/asm/sincosps.py
I ported this code to my project (SDL, assembly), but my frame rate dropped by about 20 fps compared with using fsincos. Is that normal?
	;=============================================================================================================
	 ; float sin[4], cos[4] sincosps (float angle_radians[4])
	 ; Computes the sine and cosine of the 4 angles held in angle_radians[4].
	 ; Input : angle_radians[4]
	 ; Output: sin[4] and cos[4]
	 ; Destroyed: xmm0 - xmm7 (the routine writes no general-purpose register)
	 ; DATA:
	 ;   _ps_*    : four packed single-precision floats (or float bit masks).
	 ;   _epi32_* : four packed 32-bit INTEGERS. sincosps combines them with
	 ;              pand / paddd / pslld on the integer produced by cvttps2dq,
	 ;              so they must be the integers 1 and 2. Writing "dd 1.0" /
	 ;              "dd 2.0" would emit the float bit patterns 0x3F800000 /
	 ;              0x40000000, and the parity and sign-bit computations would
	 ;              then always yield zero.
	 ;   _ps_sincos_p0.._p3 : polynomial coefficients, multiplied by successive
	 ;              odd powers of the reduced argument (p0 is pi/2).
			align 16								; lets callers use aligned loads; movups works either way
			_ps_am_inv_sign_mask	dd	0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF	; clears the float sign bit
			_ps_am_sign_mask		dd	0x80000000, 0x80000000, 0x80000000, 0x80000000	; isolates the float sign bit
			_ps_am_pi_o_2			dd	1.57079632679, 1.57079632679, 1.57079632679, 1.57079632679
			_ps_am_2_o_pi			dd	0.63661977236, 0.63661977236, 0.63661977236, 0.63661977236
			_epi32_1				dd	1, 1, 1, 1				; integer 1
			_ps_am_1 				dd	1.0, 1.0, 1.0, 1.0		; float 1.0
			_epi32_2 				dd	2, 2, 2, 2				; integer 2
			_ps_sincos_p3			dd	-0.00468175413, -0.00468175413, -0.00468175413, -0.00468175413
			_ps_sincos_p2 			dd	0.0796926262, 0.0796926262, 0.0796926262, 0.0796926262
			_ps_sincos_p1 			dd	-0.64596409750621,-0.64596409750621,-0.64596409750621,-0.64596409750621
			_ps_sincos_p0 			dd	1.570796326794896, 1.570796326794896, 1.570796326794896, 1.570796326794896

	;=============================================================================================================
	;-------------------------------------------------------------------------------------------------------------
	; sincosps
	; In   : [sincosps_angle_radians] = 4 packed floats (radians)
	; Out  : [sincosps_sin], [sincosps_cos] = 4 packed floats each
	; Clobb: xmm0 - xmm7
	; Method: t = |x| * 2/pi, q = trunc(t), f = t - q. Depending on the parity
	;         of q, either f or 1 - f is fed to an odd polynomial
	;         a * (p0 + a^2 * (p1 + a^2 * (p2 + a^2 * p3))); the result sign
	;         comes from bit 1 of q (sin, also xor'ed with the sign of x) and
	;         bit 1 of q + 1 (cos).
	;         _epi32_1 / _epi32_2 are expected to hold the integers 1 and 2.
	;-------------------------------------------------------------------------------------------------------------
	sincosps:
			; Load the angles here instead of relying on whatever the caller
			; left in xmm0: xmm0 is the working copy, xmm7 keeps the sign.
			movups 		xmm0, [sincosps_angle_radians]
			movaps 		xmm7, xmm0
			movups		xmm1, [_ps_am_inv_sign_mask]
			andps 		xmm0, xmm1				; xmm0 = |x|
			movups		xmm1, [_ps_am_sign_mask]
			andps 		xmm7, xmm1				; xmm7 = sign bit of x
			movups		xmm1, [_ps_am_2_o_pi]
			mulps 		xmm0, xmm1				; xmm0 = t = |x| * 2/pi
			pxor 		xmm3, xmm3
			movups 		xmm5, [_epi32_1]
			movups 		xmm4, [_ps_am_1]
			cvttps2dq 	xmm2, xmm0				; xmm2 = q = trunc(t), as integers
			pand 		xmm5, xmm2				; q & 1
			pcmpeqd		xmm5, xmm3				; xmm5 = all-ones where q is even
			movups 		xmm3, [_epi32_1]
			movups 		xmm1, [_epi32_2]
			cvtdq2ps 	xmm6, xmm2				; xmm6 = (float) q
			paddd 		xmm3, xmm2				; q + 1
			pand		xmm2, xmm1				; q & 2
			pand 		xmm3, xmm1				; (q + 1) & 2
			subps		xmm0, xmm6				; xmm0 = f = t - q
			pslld 		xmm2, 30				; bit 1 -> bit 31: sign for sin
			minps 		xmm0, xmm4				; clamp f to at most 1.0
			subps 		xmm4, xmm0				; xmm4 = 1 - f
			pslld 		xmm3, 30				; bit 1 -> bit 31: sign for cos
			movaps 		xmm6, xmm4
			xorps 		xmm2, xmm7				; sin sign also flips with the sign of x
			; Select per lane: even q -> sin uses f, cos uses 1 - f; odd q -> swapped.
			movaps		xmm7, xmm5
			andps 		xmm6, xmm7
			andnps 		xmm7, xmm0
			andps 		xmm0, xmm5
			andnps		xmm5, xmm4
			movups 		xmm4, [_ps_sincos_p3]
			orps 		xmm6, xmm7				; xmm6 = cos argument
			orps 		xmm0, xmm5				; xmm0 = sin argument
			movups 		xmm5, [_ps_sincos_p2]
			movaps 		xmm1, xmm0
			movaps 		xmm7, xmm6
			mulps 		xmm0, xmm0				; squared sin argument
			mulps 		xmm6, xmm6				; squared cos argument
			orps 		xmm1, xmm2				; xmm1 = signed sin argument
			orps 		xmm7, xmm3				; xmm7 = signed cos argument
			movaps 		xmm2, xmm0
			movaps 		xmm3, xmm6
			; Horner evaluation, both polynomials interleaved.
			mulps 		xmm0, xmm4
			mulps 		xmm6, xmm4
			movups 		xmm4, [_ps_sincos_p1]
			addps 		xmm0, xmm5
			addps 		xmm6, xmm5
			movups 		xmm5, [_ps_sincos_p0]
			mulps 		xmm0, xmm2
			mulps 		xmm6, xmm3
			addps 		xmm0, xmm4
			addps 		xmm6, xmm4
			mulps 		xmm0, xmm2
			mulps 		xmm6, xmm3
			addps 		xmm0, xmm5
			addps 		xmm6, xmm5
			mulps 		xmm0, xmm1
			mulps		xmm6, xmm7
			movups		[sincosps_sin], xmm0 	; sin of the 4 angles
			movups		[sincosps_cos], xmm6 	; cos of the 4 angles
			; Return here. Without this ret the routine would fall through into
			; the caller fragment below and call itself without end.
			ret

		; ---------------------------------------------------------------------------------------------------------
		; Caller-side fragment: to add in put_object.
		; It is not reachable from this position (it follows the ret above and
		; has no label); it is listed here only to show how sincosps is used.
		; Lanes 0, 1, 2 of the angle vector are treated as x, y, z.
		; ---------------------------------------------------------------------------------------------------------
			movups		xmm0, [deg_rotation_x]
			movups		xmm1, [pi_180]
			mulps		xmm0, xmm1					; degrees -> radians
			movups		[sincosps_angle_radians], xmm0
			call		sincosps

			; One 16-byte load of the cos results, then duplicate lanes:
			;   movsldup -> (c0, c0, c2, c2)   movshdup -> (c1, c1, c3, c3)
			; so no load extends past the end of the 16-byte buffer.
			movups		xmm1, [sincosps_cos]
			movsldup	xmm0, xmm1
			movshdup	xmm2, xmm1
			movsd 		[_xmm0 + 4], xmm0			; save cos(x), twice
			movsd 		[_xmm2 + 4], xmm2			; save cos(y), twice (lane 1, not lane 2)
			movhps 		[_xmm1 + 4], xmm0			; save cos(z), twice

			; One 16-byte load of the sin results; y and z are moved to lane 0
			; with pshufd instead of loading from +4 / +8.
			movups		xmm0, [sincosps_sin]
			pshufd		xmm1, xmm0, 0x55			; lane 1 in every lane: sin(y)
			pshufd		xmm2, xmm0, 0xAA			; lane 2 in every lane: sin(z)

			; save sin(x)
			movss		[_xmm0 + 0], xmm0
			movss		[_xmm0 + 12], xmm0
			movss		[_xmm0 + 20], xmm0

			; save sin(y)
			movss		[_xmm2 + 0], xmm1
			movss		[_xmm2 + 12], xmm1
			movss		[_xmm2 + 20], xmm1

			; save sin(z)
			movss		[_xmm1 + 0], xmm2
			movss		[_xmm1 + 12], xmm2
			movss		[_xmm1 + 20], xmm2
	;=============================================================================================================
	; / sincosps
	;=============================================================================================================
I saw in the optimization reference manual that fsincos has a latency of 119, while the SIMD instructions have latencies of around 5-6.
Is latency the same thing as a number of clock cycles?