Why does a newer processor (microarchitecture) run slower than an older processor?

Nguyen__Brian · ‎03-25-2020

- I'm currently making a game and having fun with assembly.

- My development computer uses a socket 771 Xeon processor: the E5440 (Intel Core 2 microarchitecture).

- Today I tested my game on a Core i7 3770 (Sandy Bridge microarchitecture).

- The results are ridiculous when compared with each other:

+ E5440 and other socket 771 processors: 128-136 cycles

+ Core i7 3770: 450-500 cycles

- I know Sandy Bridge has a feature called AVX, but I don't really know how it affects SSE. Even if it does affect SSE, the result is still unacceptable, and I don't even use SSE much in my code segment.

- Does anyone know what caused this effect? (All other conditions are identical, e.g. RAM, OS, HDD, and so on.)

Here is my simple code segment:

/*
 * update_character_sub - advance the character's position and texture
 * coordinates by one step, entirely in inline assembly.
 *
 * The asm statement declares no operands and no clobber list; it is written
 * in Intel operand order (presumably built with -masm=intel - TODO confirm).
 * All state is exchanged through registers and memory that this block does
 * not set up itself:
 *   - mm6        read at entry, rewritten before exit: low 32 bits hold two
 *                16-bit position words, high 32 bits hold a counter that the
 *                LEFT/RIGHT cases step through 0..7 (presumably an animation
 *                frame - confirm against the caller).
 *   - stateChar  32-bit value loaded from memory and compared with 5, 7, 8.
 *   - r14        base address; 8 bytes are written at [r14+64] and 16 bytes
 *                (aligned store) at [r14+16].
 *   - r11        every path ends with "jmp r11", so control leaves through
 *                that register rather than through the end of the asm block.
 *
 * NOTE(review): the labels are plain global names; if this statement is
 * emitted more than once (e.g. inlined at several call sites) the assembler
 * would presumably report duplicate labels - confirm.
 * NOTE(review): the code mixes 8-, 16- and 32-bit writes to the same
 * registers (al/ah/ax/eax, bl/bh/bx/ebx).
 */
static void inline update_character_sub(void)
{
	asm (
			// Build the structure-of-arrays position vector x,x,x,x,y,y,y,y
			// (eight 16-bit words) in xmm0.
			// r13 = mm6; ecx = low 32 bits (packed position); r13 >>= 32
			// leaves the high 32 bits (the counter) in r13.
			"movq r13, mm6;"
			"mov ecx, r13d;"
			"shr r13, 32;"
			";"
			// ebx = ((x + 64) << 16) | x, where x is the low word of ecx.
			"xor eax, eax;"
			"mov ax, cx;"
			"lea bx, [eax+64];"
			"shl ebx, 16;"
			"mov bx, cx;"
			"movd mm0, ebx;"
			// mm0 words (low to high) become x, x+64, x+64, x.
			"pshufw mm0, mm0, 0b00010100;"
			";"
			// Same for y, the high word of the packed position:
			// ebx = ((y + 64) << 16) | y.
			"shr ecx, 16;"
			"xor eax, eax;"
			"mov ax, cx;"
			"lea bx, [eax+64];"
			"shl ebx, 16;"
			"mov bx, cx;"
			"movd mm1, ebx;"
			// mm1 words (low to high) become y, y, y+64, y+64.
			"pshufw mm1, mm1, 0b01010000;"
			";"
			// Move both MMX values through general registers into xmm0/xmm1
			// and combine them: xmm0 = [mm0 (x words) | mm1 (y words)].
			"movq rax, mm0;"
			"movq xmm0, rax;"
			"movq rbx, mm1;"
			"movq xmm1, rbx;"
			"shufps xmm0, xmm1, 0b01000100;"
			";"
			// xmm2 will hold the per-word displacement; start from zero.
			"pxor xmm2, xmm2;"
			// Dispatch on stateChar: 7 -> left, 8 -> right, 5 -> down.
			"mov r12d, [stateChar];"
			"cmp    r12d,0x7;"
			"je     case_CHARACTER_MOVE_LEFT;"
			"cmp    r12d,0x8;"
			"je     case_CHARACTER_MOVE_RIGHT;"
			"cmp    r12d,0x5;"
			"je     case_CHARACTER_MOVE_DOWN;"
			// The test for state 6 (up) is disabled, so the label
			// case_CHARACTER_MOVE_UP below has no jump targeting it here.
//			"cmp    r12d,0x6;"
//			"je     case_CHARACTER_MOVE_UP;"
			// Fall-through path for every other state.
			//"; texture coord;"
			"mov ecx, 2;"
			"mov edx, 7;"
			//"; pos;"
			// +4 in each of the four y words (upper two dwords of xmm2).
			"mov r8d, 0x00040004;"
			"movd xmm2, r8d;"
			"pshufd xmm2, xmm2, 0b00000101;"
			";"
			// NOTE(review): this path jumps straight to r11 and skips
			// CAL_DONE, so the ecx/edx/xmm2 values set above are not
			// consumed inside this block - confirm this is intended.
			"jmp r11;"
			";"
			//"; post calculate;"
			// Common tail for the move cases. Expects ecx/edx = texture
			// cell indices and xmm2 = displacement to add to xmm0.
			"CAL_DONE:;"
			";"
			//"; texture coord;"
			//"; multiply with one_part_size;"
			// ecx *= 32, edx *= 32 (start of the cell); edi/esi = start + 31
			// (last texel of the cell).
			"shl ecx, 5;"
			"shl edx, 5;"
			"mov edi, ecx;"
			"mov esi, edx;"
			//"; it'll be one_part_size - 1;"
			"add edi, 32-1;"
			"add esi, 32-1;"
			";"
			// Pack the first two byte pairs into r10d, low byte first:
			// (cl, dl), (dil, dl), i.e. (u0, v0), (u1, v0).
			"xor eax, eax;"
			"mov ah, dl;"
			"mov al, cl;"
			"mov bh, dl;"
			"mov bl, dil;"
			"shl ebx, 16;"
			"or  ebx, eax;"
			"mov r10d, ebx;"
			";"
			// Switch dl to the end row (esi) and pack the other two pairs
			// into ebx, low byte first: (dil, dl), (cl, dl) = (u1, v1), (u0, v1).
			"mov edx, esi;"
			"mov ah, dl;"
			"mov al, dil;"
			"mov bh, dl; ;"
			"mov bl, cl;"
			"shl ebx, 16;"
			"or  ebx, eax;"
			";"
			// Combine both halves and store the 8 bytes at [r14+64]
			// (presumably the four texture corners of a quad - confirm).
			"shl rbx, 32;"
			"or rbx, r10;"
			"mov [r14+64], rbx;"
			";"
			//"; character pos;"
			// Apply the displacement to all eight position words.
			"paddw xmm0, xmm2;"
			// Convert from x,x,x,x,y,y,y,y to interleaved x,y pairs.
			// After pshufd the dwords are (x0,x1), (y0,y1), (x2,x3), (y2,y3).
			"movdqa xmm1, xmm0;"
			"pshufd xmm1, xmm1, 0b11011000;"
			// Low half -> mm0, reordered to x0, y0, x1, y1.
			"movdq2q mm0, xmm1;"
			"pshufw mm0, mm0, 0b11011000;"
			// High half moved down -> mm1, reordered to x2, y2, x3, y3.
			"movhlps xmm1, xmm1;"
			//";      pshufd xmm1, xmm1, 0b11101110;"
			"movdq2q mm1, xmm1;"
			"pshufw mm1, mm1, 0b11011000;"
			// Reassemble in xmm1 = [mm0 | mm1] = x0,y0,x1,y1,x2,y2,x3,y3.
			"movq  r8, mm0;"
			"movq  r9, mm1;"
			"movq xmm1, r8;"
			"movq xmm2, r9;"
			"shufps xmm1, xmm2, 0b01000100;"
			//";      pshufd xmm2, xmm2, 0b01000100;"
			//";      movq xmm2, xmm1;"
			// eax = first (x, y) pair = the updated packed position.
			"movd eax, xmm1;"
			";"
			//"; update memory;"
			// Rebuild mm6: counter in the high 32 bits, new position low.
			"shl r13, 32;"
			"or r13, rax;"
			"movq mm6, r13;"
			";"
			// Store the four (x, y) pairs; movdqa requires [r14+16] to be
			// 16-byte aligned.
			"movdqa [r14+16], xmm1;"
			";"
			"jmp r11;"
			";"
			//"; cases character state;"
			// State 5: cell (1, 7); add 0xfffc (-4 as a signed word) to the
			// four y words.
			"case_CHARACTER_MOVE_DOWN:;"
			//"; texture coord;"
			"mov ecx, 1;"
			"mov edx, 7;"
			//"; pos;"
			"mov r8d, 0xfffcfffc;"
			"movd xmm2, r8d;"
			"pshufd xmm2, xmm2, 0b00000101;"
			"jmp CAL_DONE;"
			";"
			// Cell (2, 7); add +4 to the four y words. Not reached from the
			// dispatch above while the state-6 test stays commented out.
			"case_CHARACTER_MOVE_UP:;"
			//"; texture coord;"
			"mov ecx, 2;"
			"mov edx, 7;"
			//"; pos;"
			"mov r8d, 0x00040004;"
			"movd xmm2, r8d;"
			"pshufd xmm2, xmm2, 0b00000101;"
			"jmp CAL_DONE;"
			";"
			// State 7: step the counter, use it as the cell column in row 6,
			// and add 0xfffc (-4) to the four x words.
			"case_CHARACTER_MOVE_LEFT:;"
			//"; texture coord;"
			// Branch-free wrap: r13d = (r13d + 1 < 8) ? r13d + 1 : 0.
			// "sub ebx, 8" borrows when the value is below 8, so
			// "sbb edx, edx" yields an all-ones mask in that case, else 0.
			"inc r13d;"
			"mov ebx, r13d;"
			"sub ebx, 8;"
			"sbb edx, edx;"
			"and r13d, edx;"
			"mov ecx, r13d;"
			"mov edx, 6;"
			// Reference pattern for the sub/sbb/and idiom:
			//"; sub eax, ebx ; = a - b;"
			//"; sbb edx, edx ; = (b > a) ? 0xFFFFFFFF : 0;"
			//"; and edx, eax ; = (b > a) ? a - b : 0;"
			//"; add ebx, edx ; b = (b > a) ? b + (a - b) : b + 0;"
			//"; pos;"
			"mov r8d, 0xfffcfffc;"
			"movd xmm2, r8d;"
			// Displacement goes to the lower two dwords (the x words).
			"pshufd xmm2, xmm2, 0b11100000;"
			"jmp CAL_DONE;"
			";"
			// State 8: same counter stepping as LEFT, but row 7 and +4 in
			// the four x words.
			"case_CHARACTER_MOVE_RIGHT:;"
			//"; texture coord;"
			"inc r13d;"
			"mov ebx, r13d;"
			"sub ebx, 8;"
			"sbb edx, edx;"
			"and r13d, edx;"
			"mov ecx, r13d;"
			"mov edx, 7;"
			//"; pos;"
			"mov r8d, 0x00040004;"
			"movd xmm2, r8d;"
			"pshufd xmm2, xmm2, 0b11100000;"
			"jmp CAL_DONE;"
			";"
	);
}

talmi__amos · ‎04-24-2020

Your code uses the partial registers AH, AL, BH and BL, and mixes them with code using AX, BX and so on.

This is very inefficient. Modern CPUs are geared and optimized toward 64-bit code and 64-bit registers (perhaps even 128-bit ones).

Support for ancient 8086 machine instructions is included for compatibility's sake, at a price.

Logically, AL and AH are parts of AX, AX is a part of EAX, and EAX is the lower half of RAX. I suspect that in reality the AH register is a separate hardware register, not a part of the EAX register. Keeping them synchronized is probably done in microcode, and this takes time.

The more advanced the CPU is, the heavier the penalty is.

In any book about x86 optimization, you will find a whole paragraph, or even a chapter, about this bad practice.