Finally, about the bug (the crash when I asked for more pixels): the code was not at fault, I was just overwriting some important BIOS data, which caused a CPU reset :/.
Currently I put my data above 0x10_0000 (1 MB) in RAM :/. I guess I will rearrange things around all the fragments of the BIOS data area in RAM one day :o (why this fragmentation!)
And finally, following your advice, I translated my engine to C. I use SDL to initialize the video mode, and here is the result:
for 1_000_000 voxels, my engine runs at under 7 fps.
for 125_000 voxels, it runs at under 50 fps.
But the asm generated by GCC without built-ins is still full of FPU instructions. So I decided to integrate some assembler code (SIMD instructions) inline in the C code, but I have a problem with it (not with the declaration of the matrix data):
/*
 * Working data for make_rotations (GAS directives inside GCC basic asm).
 *
 * Changes compared with the previous version of this table:
 *  - The table is emitted into the writable .data section.  It used to sit
 *    in the code stream (skipped with a jmp); the code section is normally
 *    mapped read-only, so stores such as `movd [_x], xmm0` fault there.
 *  - Float constants use `.float`.  GAS has no decimal comma: `.long 1,0`
 *    emits the two integers 1 and 0 (8 bytes), which both shifted every
 *    following offset and stored the wrong bit pattern for 1.0.
 *  - conv_signe is the IEEE-754 single-precision sign bit (the bit pattern
 *    of -0.0); `.long -0,0` emitted two zero integers.
 *
 * `.data` / `.text` are used instead of `.pushsection` / `.popsection`
 * because the latter are ELF-only.  This assumes the asm statement is
 * assembled while `.text` is the current section (the default without
 * -ffunction-sections).
 *
 * The offsets in the per-line comments are byte offsets from the _xmmN label.
 * skip_data is kept as a label so existing references still resolve.
 */
asm (" .data \n"
" .balign 16 \n"
" translation: .long 0 \n"
" translation_x: .long 0 \n"
" translation_y: .long 0 \n"
" translation_z: .long 0 \n"
" .long 0 \n"                      // reserved: color
" angle: .long 0 \n"
" rotation_x: .long 0 \n"
" _xmm0: .float 0.0 \n"            // sin.x   0
" .float 1.0 \n"                   // cos.x   4
" .float 1.0 \n"                   // cos.x   8
" .float 0.0 \n"                   // sin.x  12
" .float 1.0 \n"                   // 1      16
" .float 0.0 \n"                   // sin.y  20
" rotation_z: .long 0 \n"
" _xmm1: .float 0.0 \n"            // sin.z   0
" .float 1.0 \n"                   // cos.z   4
" .float 1.0 \n"                   // cos.z   8
" .float 0.0 \n"                   // sin.z  12
" .float 1.0 \n"                   // 1      16
" .float 0.0 \n"                   // sin.y  20
" rotation_y: .long 0 \n"
" _xmm2: .float 0.0 \n"            // sin.y   0
" .float 1.0 \n"                   // cos.y   4
" .float 1.0 \n"                   // cos.y   8
" .float 0.0 \n"                   // sin.y  12
" .float 1.0 \n"                   // 1      16
" .float 0.0 \n"                   // sin.y  20
" color_pixel: .long 0 \n"
" coordonee: \n"
" x: .long 0 \n"
" y: .long 0 \n"
" z: .long 0 \n"
" .long 0 \n"                      // reserved: color
" _x: .long 0 \n"
" _y: .long 0 \n"
" conv_signe: .long 0x80000000 \n" // sign-bit mask (bit pattern of -0.0f)
" rapport: .float 1.3333333 \n"    // x/y ratio used to scale x
" .text \n"
" skip_data: \n"
);
/*
 * make_rotations -- rotate the point stored at `coordonee` (x, y, z, 0).
 *
 * Intel-syntax SSE/SSE3 code (hsubps, haddps, movddup, movsldup, movshdup).
 * Reads : coordonee, the _xmm0/_xmm1/_xmm2 sine/cosine tables, rapport,
 *         conv_signe.
 * Writes: _x (first block) and _y (second block).  Both labels must live in
 *         writable memory, otherwise the stores fault.
 * Leaves: the third block's result in xmm4; it is not stored anywhere.
 * Uses  : xmm0-xmm7.  NOTE(review): this is basic asm, so no clobbers are
 *         declared to the compiler -- confirm that this is safe where the
 *         statement is placed.
 *
 * Fixes in this version:
 *  - `divss` instead of `divps` for the x/y ratio: only lane 0 of the divisor
 *    is loaded by movd, so divps divided lanes 1..3 by zero.
 *  - `xorps` instead of `orps` with the sign mask: OR forces the sign bit
 *    (giving -|v|), XOR flips it (giving -v), which is what the formula needs.
 */
asm (
"make_rotations: \n"
//=============
// yaw
//=============
// y
// Apply the rotation to the point |[esi + 0] = x
//                                 |[esi + 4] = y
//                                 |[esi + 8] = z
// Compute x = x.cos(phi.y) * cos(phi.z) - y.cos(phi.y) * sin(phi.z) - z.sin(phi.y)
//
// Compute A = x.cos(phi.y), B = y.cos(phi.y) and C = z.sin(phi.y)
" movups xmm0, [_xmm2 + 4] \n"
" movups xmm1, [coordonee] \n"
" mulps xmm0, xmm1 \n"
// Compute D = A * cos(phi.z), E = B * sin(phi.z) and C = C * 1
" movups xmm1, [_xmm1 + 8] \n"
" mulps xmm0, xmm1 \n"
// Compute F = D - E, C = C - 0
" hsubps xmm0, xmm0 \n"
// Compute xmm0 = F - C
" hsubps xmm0, xmm0 \n"
// Scale x by the x/y ratio so that x stays proportional to y.
// Scalar divide: movd only fills lane 0 of xmm1, the other lanes are zero.
" movd xmm1, [rapport] \n"
" divss xmm0, xmm1 \n"
// Save the new coordinate
" movd [_x], xmm0 \n"
//=============
// / yaw
//=============
//=============
// pitch
//=============
// x
// Apply the rotation to the point |[esi + 0] = x
//                                 |[esi + 4] = y
//                                 |[esi + 8] = z
// Compute y = x.(cos(phi.x) * sin(phi.z) - sin(phi.x) * cos(phi.z) * sin(phi.y)) +
//             y.(sin(phi.x) * sin(phi.z) * sin(phi.y) + cos(phi.x) * cos(phi.z)) -
//             z.(sin(phi.x) * cos(phi.y))
//
// Compute A = cos(phi.x) * sin(phi.z), B = sin(phi.x) * cos(phi.z),
//         E = cos(phi.x) * cos(phi.z) and F = sin(phi.x) * sin(phi.z)
" movddup xmm0, [_xmm0 + 8] \n"
" movups xmm1, [_xmm1] \n"
" mulps xmm0, xmm1 \n"
// Keep a variant of xmm0 in xmm7 for the roll block: the equation for y
// resembles the equation for z, except that sin(phi.y) multiplies the other
// terms.
// Compute C' = A' * sin(phi.y) and G' = E' * sin(phi.y)
" movddup xmm7, [_xmm2 + 12] \n"
" mulps xmm7, xmm0 \n"
// Compute C = B * sin(phi.y) and G = F * sin(phi.y)
" movddup xmm2, [_xmm2 + 16] \n"
" mulps xmm0, xmm2 \n"
// movhlps copies the high half (bits 64..127) of a packed single-precision
// value (4 * 32 bits) into the low half of the destination.
// In short, this separates the x part and the y part:
//   xmm0 = A) cos(phi.x) * sin(phi.z)                  xmm0 = cos(phi.x) * sin(phi.z)
//          C) sin(phi.x) * cos(phi.z) * sin(phi.y)  =>        sin(phi.x) * sin(phi.y) * cos(phi.z)
//          E) cos(phi.x) * cos(phi.z)                  xmm1 = cos(phi.x) * cos(phi.z)
//          G) sin(phi.x) * sin(phi.z) * sin(phi.y)            sin(phi.x) * sin(phi.y) * sin(phi.z)
" movhlps xmm1, xmm0 \n"
// Compute D = A - C
" hsubps xmm0, xmm0 \n"
// Compute H = E + G
" haddps xmm1, xmm1 \n"
// Compute sin(phi.x) * cos(phi.y) and cos(phi.x) * cos(phi.y)
//
// Compute I.roll = cos(phi.x) * cos(phi.y) and I.Pitch = sin(phi.x) * cos(phi.y)
" movlps xmm3, [_xmm0 + 8] \n"
" movlps xmm2, [_xmm2 + 4] \n"
" mulps xmm2, xmm3 \n"
" movshdup xmm3, xmm2 \n"
// Compute x.D + y.H - z.I
//
// Compute J = x.D, K = y.H and L = z.I
" movups xmm5, [coordonee] \n"
" movsldup xmm4, xmm1 \n" // y.H
" movss xmm4, xmm0 \n"    // x.D
" movlhps xmm4, xmm3 \n"  // z.I.Pitch
" mulps xmm4, xmm5 \n"
// Compute M = J + K
" haddps xmm4, xmm4 \n"
// Compute N = M - L
" hsubps xmm4, xmm4 \n"
// Save the new coordinate
" movd [_y], xmm4 \n"
//=============
// / pitch
//=============
//=============
// roll
//=============
// z
// Apply the rotation to the point |[esi + 0] = x
//                                 |[esi + 4] = y
//                                 |[esi + 8] = z
// Compute z' = x.(cos(phi.x) * cos(phi.z) * sin(phi.y) + sin(phi.x) * sin(phi.z)) +
//              y.(sin(phi.x) * cos(phi.z) - cos(phi.x) * sin(phi.z) * sin(phi.y)) +
//              z.(cos(phi.x) * cos(phi.y))
//
// movhlps copies the high half (bits 64..127) of xmm7 into the low half of xmm1.
// In short, this separates the two parts:
//   xmm7 = C') cos(phi.x) * sin(phi.z) * sin(phi.y)     xmm7 = C') cos(phi.x) * sin(phi.z) * sin(phi.y)
//          B') sin(phi.x) * cos(phi.z)               =>        B') sin(phi.x) * cos(phi.z)
//          G') cos(phi.x) * cos(phi.z) * sin(phi.y)     xmm1 = G') cos(phi.x) * cos(phi.z) * sin(phi.y)
//          F') sin(phi.x) * sin(phi.z)                         F') sin(phi.x) * sin(phi.z)
" movhlps xmm1, xmm7 \n"
// Compute D' = B' - C': flip the sign of lane 0 (C') with the sign-bit mask,
// then add horizontally.  XOR negates; OR would only force the sign negative.
" movd xmm6, [conv_signe] \n"
" xorps xmm7, xmm6 \n"
" haddps xmm7, xmm7 \n"
// Compute H' = G' + F'
" haddps xmm1, xmm1 \n"
// Compute x.H' + y.D' + z.I'
//
// Compute J' = x.H', K' = y.D' and L' = z.I'
" movups xmm3, [coordonee] \n"
" movsldup xmm4, xmm7 \n" // y.D'
" movss xmm4, xmm1 \n"    // x.H'
" movlhps xmm4, xmm2 \n"  // z.I'
" mulps xmm4, xmm3 \n"
// Compute M' = J' + K'
" haddps xmm4, xmm4 \n"
// Compute N' = M' + L'
" haddps xmm4, xmm4 \n"
// NOTE(review): the rotated z is left in xmm4 and never written to memory --
// confirm whether a store is missing here.
//=============
// / roll
//=============
);
I debugged this manually and found that only the roll block runs normally; the others make my program crash, which is strange (return code 0x3). I guess I will try compiling it with the Intel compiler.
Do you know any IDE that works with icc? I use Code::Blocks, but it seems complicated to configure a compiler other than gcc, and honestly I don't like how gcc dictates the way I write assembly (dd -> .long, can't write (value), need to put ':', and others I guess). These are just details, but still very annoying if I multiply the inline asm blocks :/
When I clear the screen with SDL_FillRect(screen,NULL,0); it's very fast and I don't even notice this function executing; it's very fast compared to mine:
;=============
; void clear_screen (void)
; Clear the screen: zero-fill the buffer whose address is stored at
; [PhysBasePtr], 16 bytes per iteration with unaligned SSE stores.
; Input    : none (reads PhysBasePtr and the constants WIDTH and LENGTH)
; Output   : (WIDTH*LENGTH*4) bytes at [PhysBasePtr] set to zero
;            (presumably 4 bytes per pixel -- confirm against the video mode)
; Destroyed: edi, ecx (0 on return), xmm0 (0 on return), flags
; Assumes  : WIDTH*LENGTH*4 is a non-zero multiple of 16.
; 256-bit variant (AVX, 32 bytes per iteration):
;            mov ecx, (WIDTH*LENGTH)/8 / vxorps ymm1, ymm1 /
;            vmovapd [edi], ymm1 / add edi, 32
;=============
clear_screen:
        mov     edi, [PhysBasePtr]          ; edi = write cursor
        mov     ecx, (WIDTH*LENGTH*4)/16    ; ecx = number of 16-byte stores
        xorps   xmm0, xmm0                  ; xmm0 = 16 zero bytes
clear_s:
        movdqu  [edi], xmm0                 ; unaligned store: no alignment assumed
        add     edi, 16
        dec     ecx                         ; dec/jnz instead of the slow legacy `loop`
        jnz     clear_s
        ret
;===============
; / clear_screen
;===============
Does SDL use the GPU to clear the screen? I'm lost.
Here's my code for counting FPS. Is it good, or can I do it faster?
/* Frames-per-second counter: counts loop iterations and publishes the count
 * once every 1000 ms of SDL ticks. */
start_time = SDL_GetTicks();
while (1)
{
    ....
calculate_fps:
    /* Count every frame, including the one on which the second elapses
     * (it used to be skipped, so the result was one frame short). */
    compteur_boucle++;
    current_time = SDL_GetTicks();
    if (current_time - start_time >= 1000)
    {
        fps = compteur_boucle;
        compteur_boucle = 0;
        /* Restart from the timestamp already read: a second SDL_GetTicks()
         * call would drop the time spent in between from the next window. */
        start_time = current_time;
        /* Print once per measurement instead of once per frame: console
         * output on every frame costs time and lowers the measured rate. */
        printf("FPS = %d\n", fps);
    }
Thanks
PS: sorry for the comments, they are still in French; I'm just too lazy to translate them ^^