Hi,
I am looking for a way to convert between an 8xFloat64 vector and an 8xInt64 vector. I was looking at the intrinsics (in the 14.0 reference and in zmmintrin.h) and found only this:
__m512i __cdecl _mm512_cvtfxpnt_roundpd_epu32lo(__m512d v2, int rc);
but what I'm looking for is rather something like:
__m512i __cdecl _mm512_cvtfxpnt_roundpd_epu64(__m512d v2, int rc);
Is there any trick for doing this kind of conversion?
Thanks,
Przemek.
One of our expert developers shared the following code, which performs the conversion by relying on compiler vectorization:
__m512i _vec512_cvtpd_epi64(__m512 d) {
  __m512i res;
  int i;
  for (i = 0; i < 8; i++) {
    ((__int64*)&res)[i] = (__int64)(((double*)&d)[i]);
  }
  return res;
}
Hope this helps.
I think there is a typo. The input parameter must be a __m512d type.
How is it possible to vectorize that code if there is no corresponding vector conversion from float64 to int64?
I tried it myself with the following code:
#include <iostream>
#include <immintrin.h>
inline __m512i _vec512_cvtpd_epi64( __m512d d ) {
__m512i res;
#pragma ivdep
for ( int i = 0; i < 8; ++i ) {
( (__int64*)&res )[i] = (__int64)( ((double*)&d)[i] );
}
return res;
}
int main( int argc, char **argv ) {
double *array_d = ( double*)_mm_malloc( 16 * sizeof( double), 64 );
__int64 *array_i = (__int64*)_mm_malloc( 16 * sizeof(__int64), 64 );
for ( int i = 0; i < 16; ++i ) {
array_d[i] = 12.123456789 + i;
array_i[i] = 0;
}
__m512d d_;
__m512i i_;
asm("start__");
for ( int i = 0; i < 2; ++i ) {
d_ = _mm512_load_pd( array_d + 8*i );
asm("start_cnvt");
i_ = _vec512_cvtpd_epi64( d_ );
asm("end_cnvt");
_mm512_store_epi64( array_i + 8*i, i_ );
}
asm("end__");
for ( int i = 0; i < 16; ++i ) {
std::cout << array_i[i] << " ";
}
std::cout << std::endl;
_mm_free( array_d );
_mm_free( array_i );
return 0;
}
I compiled it with -O3. The compiler generates the following assembly code:
# Begin ASM
# Begin ASM
start_cnvt
# End ASM #36.0
# End ASM
# LOE rbx r12 r13 r15 r14b
..B1.22: # Preds ..B1.23 Latency 153
vmovaps 64(%rsp), %zmm0 #37.8 c1
movl $21845, %eax #37.8 c1
vbroadcastsd .L_2il0floatpacket.8(%rip), %zmm2 #37.8 c5
vpandq .L_2il0floatpacket.7(%rip){1to8}, %zmm0, %zmm14 #37.8 c9
vpandnq %zmm14, %zmm2, %zmm17 #37.8 c13
kmov %eax, %k6 #37.8 c13
vpxord %zmm19, %zmm19, %zmm19 #37.8 c17
movl $43690, %eax #37.8 c17
vmovaps %zmm19, %zmm3 #37.8 c21
kmov %eax, %k1 #37.8 c21
vmovdqa32 %zmm17{cdab}, %zmm3{%k6} #37.8 c25
vpbroadcastq .L_2il0floatpacket.10(%rip), %zmm5 #37.8 c29
vpsrld $20, %zmm3, %zmm4 #37.8 c33
vpsubd %zmm5, %zmm4, %zmm6 #37.8 c37
vpandq .L_2il0floatpacket.8(%rip){1to8}, %zmm0, %zmm1 #37.8 c41
vpshufd $160, %zmm6, %zmm11 #37.8 c45
vporq .L_2il0floatpacket.9(%rip){1to8}, %zmm1, %zmm9 #37.8 c49
vpsubrd .L_2il0floatpacket.12(%rip){1to16}, %zmm11, %zmm10 #37.8 c53
vpcmpltd .L_2il0floatpacket.12(%rip){1to16}, %zmm11, %k4 #37.8 c57
vpcmpd $4, %zmm19, %zmm11, %k3 #37.8 c61
vbroadcastsd .L_2il0floatpacket.11(%rip), %zmm16 #37.8 c65
vpsrlvd %zmm10, %zmm9, %zmm12 #37.8 c69
kand %k4, %k3 #37.8 c69
vpsllvd %zmm11, %zmm9, %zmm13 #37.8 c73
kand %k1, %k3 #37.8 c73
vaddpd {rz}, %zmm16, %zmm14, %zmm15 #37.8 c77
vmovaps %zmm19, %zmm7 #37.8 c81
vmovdqa32 %zmm9{cdab}, %zmm7{%k1} #37.8 c85
vpsubd .L_2il0floatpacket.12(%rip){1to16}, %zmm11, %zmm8 #37.8 c89
vpord %zmm12{cdab}, %zmm13, %zmm13{%k3} #37.8 c93
vpsubd %zmm16, %zmm15, %zmm18 #37.8 c97
vcmpltpd .L_2il0floatpacket.11(%rip){1to8}, %zmm17, %k5 #37.8 c101
vpsllvd %zmm8, %zmm7, %zmm20 #37.8 c105
vmovdqa32 %zmm13, %zmm20{%k4} #37.8 c109
vmovdqa64 %zmm18, %zmm20{%k5} #37.8 c113
vcmpltpd %zmm19, %zmm0, %k2 #37.8 c117
vpsubsetbd %zmm20, %k7, %zmm19{%k6} #37.8 c121
nop #37.8 c125
kmov %k7, %edx #37.8 c129
addl %edx, %edx #37.8 c133
kmov %edx, %k3 #37.8 c137
vpsbbd %zmm20, %k3, %zmm19{%k1} #37.8 c141
vmovapd %zmm19, %zmm20{%k2} #37.8 c145
nop #37.8 c149
vmovaps %zmm20, (%rsp) #37.8 c153
# LOE rbx r12 r13 r15 r14b
..B1.21: # Preds ..B1.22 Latency 0
# Begin ASM
# Begin ASM
end_cnvt
# End ASM #38.0
# End ASM
This doesn't look very efficient to me compared to, e.g., a conversion from float to int32.
Hi,
thanks for the tip. I've checked it, and I'm a little bit disappointed with the result (in terms of performance)... I was hoping that there might be some piece of documentation missing about int64 vector intrinsics. The main reason I don't want to rely on compiler optimizations is that I'm doing instruction-level performance measurements. Otherwise I see no good reason for using intrinsics...
I've made some examples (see attachments). Both of them were created with:
icc -mmic -O2 -S XXX.cpp
This is the assembly I'm getting in test.cpp when using that piece of code (are the # Preds labels correct? E.g., shouldn't ..B1.1 be ..B1.7?):
# LOE rbx r12 r14
..B1.7: # Preds ..B1.1 Latency 163
stmxcsr (%rsp) #21.1 c1
movl $il0_peep_printf_format_0, %edi #35.3 c2
orl $32832, (%rsp) #21.1 c2
ldmxcsr (%rsp) #21.1 c6
vbroadcastsd .L_2il0floatpacket.2(%rip), %zmm0 #22.16 c7
vpxord %zmm19, %zmm19, %zmm19 #30.8 c11
.byte 15 #28.17 c11
.byte 49 #28.17
vmovaps %zmm19, %zmm3 #30.8 c15
movq %rax, %r13 #28.17 c15
vbroadcastsd .L_2il0floatpacket.4(%rip), %zmm2 #30.8 c19
vpandq .L_2il0floatpacket.3(%rip){1to8}, %zmm0, %zmm14 #30.8 c23
vpandnq %zmm14, %zmm2, %zmm17 #30.8 c27
movl $21845, %eax #30.8 c27
vpbroadcastq .L_2il0floatpacket.6(%rip), %zmm5 #30.8 c31
vpandq .L_2il0floatpacket.4(%rip){1to8}, %zmm0, %zmm1 #30.8 c35
vporq .L_2il0floatpacket.5(%rip){1to8}, %zmm1, %zmm9 #30.8 c39
vbroadcastsd .L_2il0floatpacket.7(%rip), %zmm16 #30.8 c43
vaddpd {rz}, %zmm16, %zmm14, %zmm15 #30.8 c47
kmov %eax, %k6 #30.8 c47
vmovdqa32 %zmm17{cdab}, %zmm3{%k6} #30.8 c51
movl $43690, %eax #30.8 c51
vpsrld $20, %zmm3, %zmm4 #30.8 c55
kmov %eax, %k1 #30.8 c55
vpsubd %zmm5, %zmm4, %zmm6 #30.8 c59
vmovaps %zmm19, %zmm7 #30.8 c63
vpshufd $160, %zmm6, %zmm11 #30.8 c67
vpsubrd .L_2il0floatpacket.8(%rip){1to16}, %zmm11, %zmm10 #30.8 c71
vpcmpltd .L_2il0floatpacket.8(%rip){1to16}, %zmm11, %k4 #30.8 c75
vpcmpd $4, %zmm19, %zmm11, %k3 #30.8 c79
vpsrlvd %zmm10, %zmm9, %zmm12 #30.8 c83
vpsllvd %zmm11, %zmm9, %zmm13 #30.8 c87
kand %k4, %k3 #30.8 c87
vmovdqa32 %zmm9{cdab}, %zmm7{%k1} #30.8 c91
kand %k1, %k3 #30.8 c91
vpsubd .L_2il0floatpacket.8(%rip){1to16}, %zmm11, %zmm8 #30.8 c95
vpord %zmm12{cdab}, %zmm13, %zmm13{%k3} #30.8 c99
vpsubd %zmm16, %zmm15, %zmm18 #30.8 c103
vcmpltpd .L_2il0floatpacket.7(%rip){1to8}, %zmm17, %k5 #30.8 c107
vpsllvd %zmm8, %zmm7, %zmm1 #30.8 c111
.byte 15 #32.16 c111
.byte 49 #32.16
vmovdqa32 %zmm13, %zmm1{%k4} #30.8 c115
movq %rdx, %r15 #32.16 c115
vmovdqa64 %zmm18, %zmm1{%k5} #30.8 c119
vcmpltpd %zmm19, %zmm0, %k2 #30.8 c123
vpsubsetbd %zmm1, %k7, %zmm19{%k6} #30.8 c127
nop #30.8 c131
kmov %k7, %ecx #30.8 c135
addl %ecx, %ecx #30.8 c139
kmov %ecx, %k3 #30.8 c143
vpsbbd %zmm1, %k3, %zmm19{%k1} #30.8 c147
vmovapd %zmm19, %zmm1{%k2} #30.8 c151
nop #35.3 c155
vmovaps %zmm1, 64(%rsp) #35.3 c159
call puts #35.3 c163
# LOE rbx r12 r13 r14 r15
..B1.2: # Preds ..B1.7 Latency 9
For comparison, this is the code for _mm512_cvtfxpnt_roundpd_epu32lo in test2.cpp:
# LOE r12 r13 r14
..B1.7: # Preds ..B1.1 Latency 35
stmxcsr 64(%rsp) #21.1 c1
movl $il0_peep_printf_format_0, %edi #35.3 c2
orl $32832, 64(%rsp) #21.1 c2
ldmxcsr 64(%rsp) #21.1 c6
vbroadcastsd .L_2il0floatpacket.0(%rip), %zmm0 #22.16 c7
.byte 15 #28.17 c11
.byte 49 #28.17
vcvtfxpntpd2udq $0, %zmm0, %zmm1 #30.8 c15
movq %rax, %rbx #28.17 c15
nop #30.8 c19
vmovaps %zmm1, (%rsp) #30.8 c23
movb %al, %al #32.16 c23
.byte 15 #32.16 c27
.byte 49 #32.16
movq %rdx, %r15 #32.16 c31
call puts #35.3 c35
# LOE rbx r12 r13 r14 r15
..B1.2: # Preds ..B1.7 Latency 9
Thanks,
Przemek.
Przemek,
I will outline the scheme, and let you select the _mm512_... intrinsics.
If the input vector d contains negative numbers, produce a mask of the negative numbers in the incoming vector d.
Using XOR (abs, or another fast means), remove the sign bit from the input vector d (the sign bit is the leftmost bit of each 64-bit element). At this point d holds the absolute values of the input vector d.
Now, for both kinds of input (+/-, now abs(d)), perform a vector-wide add of (double)((__int64)1 << 51). The result now has right-justified integer values combined with a non-zero exponent.
Next, remove the exponent with a vector-wide AND mask of (((__int64)1 << 51) - 1). You now have a vector of int64's. These are truncated. If you want rounding, add (or subtract) 0.5 as a separate step prior to adding (double)((__int64)1 << 51). Alternatively, after removing the sign bit you can add (double)((__int64)1 << 50), then add +1, then divide by 2. More steps.
If the input vector could potentially contain negative numbers, use the mask obtained earlier to perform the negates/merge.
In the case where the input contains only positive numbers, this should reduce to one ADD and one AND (with appropriate casts); see the sketch below.
In the case where the input may contain negative numbers, it will be a bit more complicated to produce properly signed results; however, it will be completely vectorized and should be faster than the element-by-element conversion, which has to go through RAM/cache rather than staying in registers.
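To make the idea concrete, here is a minimal scalar sketch of the non-negative case, assuming round-to-nearest mode and inputs well below 2^52. Note it uses the 52-bit mantissa width for the shift and mask constants, whereas the outline above says 51, so verify the constants against your value range.
#include <cstdint>
#include <cstring>
#include <iostream>
// Scalar illustration of the magic-number trick for one non-negative double.
// Assumes round-to-nearest FP mode; the FP add therefore rounds (not truncates).
// Constants use the 52-bit double mantissa width (the outline above says 51).
int64_t cvt_pd_epi64_scalar(double x) {
    const double kshift = (double)(1LL << 52);     // Kshift: forces a fixed exponent
    double shifted = x + kshift;                   // integer bits now sit at the lsb
    uint64_t bits;
    std::memcpy(&bits, &shifted, sizeof(bits));    // reinterpret the double as raw bits
    return (int64_t)(bits & ((1ULL << 52) - 1));   // mask off sign/exponent, keep the mantissa
}
int main() {
    std::cout << cvt_pd_epi64_scalar(12.123456789) << " "   // prints 12 (rounded)
              << cvt_pd_epi64_scalar(1000000.0) << std::endl;
}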
Jim Dempsey
Further feedback from the Developer is:
The assembly code for the double->int64 conversion shown in the forum is the expected instruction sequence.
The difference between float->int32 and double->int64 conversion is that vector float->int32 has hardware support, so the compiler generates a single instruction for that conversion. However, there is no single vector instruction for double->int64 conversion on KNC, so the compiler generates a fairly long instruction sequence to emulate it.
The "# Preds" labels appear correct. They identify from which basic blocks the current basic block can be reached. The only reference I can find is this one, which is dated but still useful: http://www.intel.in/content/dam/www/public/us/en/documents/white-papers/ia-32-64-assembly-lang-paper.pdf (see pages 16-17).
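For reference, the 32-bit-result conversion quoted at the top of the thread is the one that does map to a single instruction (the vcvtfxpntpd2udq seen in the test2.cpp listing). A hypothetical one-liner, with rc = 0 mirroring the $0 immediate in that listing:
#include <immintrin.h>
// Hypothetical usage of the intrinsic quoted earlier in the thread; rc = 0
// matches the "$0" immediate on the vcvtfxpntpd2udq line in test2.cpp.
__m512i convert_pd_to_lo32(__m512d v) {
    return _mm512_cvtfxpnt_roundpd_epu32lo(v, 0);
}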
After a re-think, pseudo code:
always positive inputs, truncated
out = and(in+Kshift, maskMantissa)
always positive inputs, rounded
out = and((in+.5) + Kshift, maskMantissa)
perform (in+.5) as a separate _mm512 operation
signed inputs, truncated
signs = and(in, maskSigns)
out = or(and(xor(in,signs)+Kshift, maskMantissa), signs)
signed input rounded
signs = and(in, maskSigns)
out = or(and((xor(in,signs)+.5)+Kshift, maskMantissa), signs)
Where:
Kshift is a vector of doubles that, when added to the input value x (its absolute value if necessary), shifts the lsb of the integer portion of the number to the lsb of the 64-bit double. *** There may be an issue where in+Kshift rounds the number; you will have to decide whether you want rounding or truncation. If the +Kshift rounds .AND. you want rounded numbers, you can omit the +0.5 step, since the hardware does it for you.
maskMantissa is a bitmask with 51 one-bits in the low (lsb) positions of the 64-bit integer.
maskSigns is a bitmask with only the msb (sign bit) set in the 64-bit int (and in the double).
Assuming in+Kshift produces the correctly right-shifted number (with the rounding behavior you want), the signed-input rounded case should reduce to
signs = and(in, maskSigns)
out = or(and(xor(in,signs)+Kshift, maskMantissa), signs)
or 5 instructions to convert the 8 doubles to signed int64
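A minimal sketch of the "always positive inputs" case above, written with AVX-512F intrinsic names from immintrin.h. This thread targets KNC (-mmic), whose zmmintrin.h intrinsic set differs in places, so treat it as a data-flow sketch rather than code verified against the KNC toolchain; as in the scalar sketch earlier, the constants use the 52-bit mantissa width.
#include <immintrin.h>
// Sketch of: out = and(in + Kshift, maskMantissa), for non-negative inputs only.
// With round-to-nearest in effect the FP add rounds, so this is the "rounded"
// variant unless the +0.5/rounding handling described above is applied instead.
static inline __m512i vec512_cvtpd_epi64_positive(__m512d in) {
    const __m512d kshift        = _mm512_set1_pd((double)(1LL << 52));     // Kshift
    const __m512i mask_mantissa = _mm512_set1_epi64((1LL << 52) - 1);      // maskMantissa
    __m512d shifted = _mm512_add_pd(in, kshift);                           // in + Kshift
    return _mm512_and_epi64(_mm512_castpd_si512(shifted), mask_mantissa);  // and(..., maskMantissa)
}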
YMMV
Jim Dempsey
Ignore the signed case for now; what I showed won't work. Sorry for any inconvenience this may have caused you.
Jim Dempsey