Finnaly for the bug (crash when i want more pixel), isn't the fault to code, just i erase some important BIOS DATA  and cause reset of CPU :/.
Actually i put data over 0x10_0000 (Mo) of RAM :/, i guess i will rearrange all fragment BIOS DATA AREA in RAM one day :o (why this fragmentation !)
And finally by your advice, i translate my engine in C, i use SDL for initializes video mode, and here the result:
for 1_000_000 voxels, my engine run under 7 fps.
for 125_000 voxels, it's under 50 fps.
But the asm write by GCC without built-in is still fill by fpu instruction. And i have take decision to integrate some assembler code (SMID instruction) in line to C, but have a pb with it (not declaration of matrix's data):
    asm ("       jmp        skip_data           \n"
         "       translation:   .long   0       \n"
         "       translation_x:	.long   0       \n"
         "       translation_y:	.long   0       \n"
         "       translation_z:	.long   0       \n"
         "                      .long   0       \n"         //; reserved: color
         "    angle:            .long   0       \n"
         "        rotation_x:   .long   0       \n"
         "        _xmm0:        .long   0       \n"			//; sin.x	0
         "                      .long   1,0     \n"			//; cos.x	4
         "                      .long   1,0     \n"			//; cos.x	8
         "                      .long   0       \n"			//; sin.x	12
         "                      .long   1,0     \n"			//; 1		16
         "                      .long   0       \n"			//; sin.y	20
         "        rotation_z:   .long	0       \n"
         "        _xmm1:        .long	0       \n"			//; sin.z	0
         "                      .long	1,0     \n"			//; cos.z 4
         "                      .long   1,0      \n"			//; cos.z	8
         "                      .long   0        \n"			//; sin.z	12
         "                      .long	1,0     \n"			//; 1		16
         "                      .long   0        \n"			//; sin.y	20
         "        rotation_y:   .long	0       \n"
         "        _xmm2:        .long   0        \n"			//; sin.y	0
         "                      .long   1,0      \n"			//; cos.y 4
         "                      .long   1,0      \n"			//; cos.y 8
         "                      .long   0        \n"			//; sin.y	12
         "                      .long   1,0      \n"			//; 1		16
         "                      .long   0        \n"			//; sin.y	20
         "    color_pixel:		.long	0       \n"
         "    coordonee:                        \n"
         "            x:		.long	0       \n"
         "            y:		.long	0       \n"
         "            z:		.long	0       \n"
         "                      .long	0       \n"			//; reserved: color
         "            _x:       .long	0       \n"
         "            _y:       .long   0       \n"
         "   conv_signe:        .long   -0,0        \n"
         "   rapport:           .long	1,3333333   \n"
         "    skip_data:                            \n"
			 );
    asm (
              "make_rotations:                        \n"
                //;=============
		// yaw
		//=============
                // y
				// On applique la rotation au point	|[esi + 0] = x
				//									|[esi + 4] = y
				//									|[esi + 8] = z
				// On calcule x = x.cos(phi.y) * cos(phi.z) - y.cos(phi.y) * sin(phi.z) - z.sin(phi.y)
				//
				// On calcule  A = x.cos(phi.y), B = y.cos(phi.y) et C = z.sin(phi.y)
		"			movups	xmm0, [_xmm2 + 4]       \n"
		"			movups	xmm1, [coordonee]       \n"
		"			mulps	xmm0, xmm1              \n"
				// On calcule D = A * cos(phi.z), E = B * sin(phi.z) et C = C * 1
		"			movups	xmm1, [_xmm1 + 8]       \n"
		"			mulps	xmm0, xmm1              \n"
				// On calcule F = D - E, C = C - 0
		"			hsubps	xmm0, xmm0              \n"
				// On calcule xmm0 = F - C
		"			hsubps	xmm0, xmm0              \n"
				// On modifie x selon selon le rapport entre x et y pour que x soit proportionnelle à y
		"			movd	xmm1, [rapport]         \n"
		"			divps	xmm0, xmm1              \n"
				// On save la new coordonée
		"			movd	[_x], xmm0              \n"
		//=============
		// / yaw
		//=============
		//=============
		// pitch
		//=============
                // x
				// On applique la rotation au point	|[esi + 0] = x
				//									|[esi + 4] = y
				//									|[esi + 8] = z
				// On calcule y = x.(cos(phi.x) * sin(phi.z) - sin(phi.x) * cos(phi.z) * sin(phi.y)) +
				//				 y.(sin(phi.x) * sin(phi.z) * sin(phi.y) + cos(phi.x) * cos(phi.z)) -
				//				 z.(sin(phi.x) * cos(phi.y))
				//
				// On calcule A = cos(phi.x) * sin(phi.z), B = sin(phi.x) * cos(phi.z), E = cos(phi.x) * cos(phi.z) et F = sin(phi.x) * sin(phi.z)
		"			movddup xmm0, [_xmm0 + 8]       \n"
		"			movups 	xmm1, [_xmm1]           \n"
		"			mulps	xmm0, xmm1              \n"
				// on sauve xmm0 dans xmm7 pour le copier dans xmm0 de Roll car l'equation de y ressemblent a l'equation de z mis a part que la valeur sin(phi.y) est
				// multiplié par d'autres equations
				// On calcule C' = A' * sin(phi.y) et G' = E' * sin(phi.y)
		"			movddup	xmm7, [_xmm2 + 12]       \n"
		"			mulps	xmm7, xmm0              \n"
				// On calcule C = B * sin(phi.y) et G = F * sin(phi.y)
		"			movddup	xmm2, [_xmm2 + 16]      \n"
		"			mulps	xmm0, xmm2              \n"
				// Copie le contenu du haut (64..127) d'un paquet de valeurs réel de simple précision (4*32 bits) dans sa partie basse (0..31).
				// En somme on separe les deux partie x et y:	xmm0 =	A) cos(phi.x) * sin(phi.z)								xmm0 =	cos(phi.x) * sin(phi.z)
				//											 			C) sin(phi.x) * cos(phi.z) * sin(phi.y) 			=>			sin(phi.x) * sin(phi.y) * cos(phi.z)
				//														E) cos(phi.x) * cos(phi.z)								xmm1 =	cos(phi.x) * cos(phi.z)
				//														G) sin(phi.x) * sin(phi.z) * sin(phi.y)							sin(phi.x) * sin(phi.y) * sin(phi.z)
		"			movhlps xmm1, xmm0          \n"
				// On calcule D = A - C
		"			hsubps xmm0, xmm0           \n"
				// On calcule H = E + G
		"			haddps xmm1, xmm1           \n"
				// On calcule sin(phi.x) * cos(phi.y) et cos(phi.x) * cos(phi.y)
				//
				// On calcule I.roll = cos(phi.x) * cos(phi.y) et I.Pitch = sin(phi.x) * cos(phi.y)
		"			movlps		xmm3, [_xmm0 + 8]       \n"
		"			movlps		xmm2, [_xmm2 + 4]       \n"
		"			mulps		xmm2, xmm3              \n"
		"			movshdup 	xmm3, xmm2              \n"
				// On calcule x.D + y.H - z.I
				//
				// On calcule J = x.D, K = y.H et L = z.I
		"			movups		xmm5, [coordonee]       \n"
		"			movsldup	xmm4, xmm1              \n"    //; y.H
		"			movss		xmm4, xmm0              \n"    //; x.D
		"			movlhps 	xmm4, xmm3              \n"    //; z.I.Pitch
		"			mulps		xmm4, xmm5              \n"
				// On calcule M = J + K
		"			haddps	xmm4, xmm4       \n"
				// On calcule N = M - L
		"			hsubps	xmm4, xmm4       \n"
				// On save la new coordonée
		"			movd	[_y], xmm4       \n"
		//=============
		// / pitch
		//=============
		//=============
		// roll
		//=============
                // z
				// On applique la rotation au point	|[esi + 0] = x
				//									|[esi + 4] = y
				//									|[esi + 8] = z
				// On calcule z' = x.(cos(phi.x) * cos(phi.z) * sin(phi.y) + sin(phi.x) * sin(phi.z)) +
				//				  y.(sin(phi.x) * cos(phi.z) - cos(phi.x) * sin(phi.z) * sin(phi.y)) +
				//				  z.(cos(phi.x) * cos(phi.y))
				//
				// Copie le contenu du haut (64..127) d'un paquet de valeurs réel de simple précision (4*32 bits) dans sa partie basse (0..31).
				// En somme on separe les deux partie x et y:	xmm7 =	C') cos(phi.x) * sin(phi.z) * sin(phi.y)				xmm7 =	C') cos(phi.x) * sin(phi.z) * sin(phi.y))
				//											 			B') sin(phi.x) * cos(phi.z)						 =>				B') sin(phi.x) * cos(phi.z)
				//														G') cos(phi.x) * cos(phi.z) * sin(phi.y)				xmm1 =	G') cos(phi.x) * cos(phi.z) * sin(phi.y)
				//														F') sin(phi.x) * sin(phi.z)										F') sin(phi.x) * sin(phi.z
		"			movhlps xmm1, xmm7          \n"
				// On calcule D' = -B' + C'
		"			movd	xmm6, [conv_signe]  \n"
		"			orps	xmm7, xmm6          \n"
		"			haddps	xmm7, xmm7          \n"
				// On calcule H' = G' + F'
		"			haddps	xmm1, xmm1          \n"
				// On calcule x.D' + y.H' + z.I'
				//
				// On calcule J = x.D', K = y.H' et L = z.I'
		"			movups		xmm3, [coordonee]       \n"
		"			movsldup	xmm4, xmm7              \n"    // y.D'
		"			movss		xmm4, xmm1              \n"    // x.H'
		"			movlhps 	xmm4, xmm2              \n"    // z.I'
		"			mulps		xmm4, xmm3              \n"
				// On calcule M' = J' + K'
		"			haddps	xmm4, xmm4       \n"
				// On calcule N' = M' + L'
		"			haddps	xmm4, xmm4       \n"
		//=============
		// / roll
		//=============
			 );
i have debug this manualy, and i found only roll block run normally, other make my program crash, strange (code return 0x3) i guess i will go compile it with intel compiler.
Do you know any IDE with icc ? i use codeblock but it's seem complicate for configue another compiler than gcc, and honnestly i don't like how gcc say to me how to code in assembly (dd -> .long, can't do:      (value), need to put ' : ' and other i guess) just detail but still very embarrassing if i multiply asm inline :/
When i clear the screen by SDL_FillRect(screen,NULL,0);, it's very fast and i don't see the executable of this function, very fast relative of mine:
	;=============
	 ; void clear_screen (void)
	 ; Clear screen
	 ; Entrée : None
	 ; Sotie: Screen
	 ; Destroyed: edi
	;=============	 
	clear_screen:
		mov		edi, [PhysBasePtr]
		mov		ecx, (WIDTH*LENGTH*4)/16 	;	mov		ecx, (WIDTH*LENGTH)/8
		xorps	xmm0, xmm0					;	vxorps	ymm1, ymm1		; 256 bit instruction !
		clear_s:
			movdqu 	[edi], xmm0				;	vmovapd	[edi], ymm1		; 256 bit instruction !
			add		edi, 16					;	add		edi, 32
		loop	clear_s
	ret
	;===============
	; / clear_screen
	;===============	
Do SDL interact with gpu for execute the clearing of screen ? i'm lost
Here's my code for count  FPS, is it good, or i can do it faster ?
        start_time = SDL_GetTicks();
    while (1)
    {
         ....
        calculate_fps:
            current_time = SDL_GetTicks();
            if (current_time - start_time >= 1000)
            {
                fps = compteur_boucle;
                compteur_boucle = 0;
                start_time = SDL_GetTicks();
            }
            else
                compteur_boucle++;
            printf("FPS = %d\n", fps);
Thanks
PS: sorry for comment, still in french, i'm just lazy to translate it ^^