OpenCL* for CPU
Ask questions and share information on Intel® SDK for OpenCL™ Applications and OpenCL™ implementations for Intel® CPU.
Announcements
This forum covers OpenCL* for CPU only. OpenCL* for GPU questions can be asked in the GPU Compute Software forum. Intel® FPGA SDK for OpenCL™ questions can be ask in the FPGA Intel® High Level Design forum.
1719 Discussions

Error compiling kernel for GPU

peastman
Beginner
789 Views
I'm testing my code on an Ivy Bridge processor under Windows 7. When I tell it to use the CPU device, it works correctly. When I try to run on the GPU device, clCreateProgramWithSource() fails with the following error:

igdfcl32.dll successfully completed build.
error: Cannot yet select: 0x2fe3f10: i8 = any_extend 0x3d24138 [ID=66]
0x3d24138: i32 = and 0x3d240b0, 0x3e5ac30 [ID=63]
0x3d240b0: i32 = xor 0x2fe31c8, 0x3d233f0 [ID=61]
0x2fe31c8: i32 = GHAL3DISD::GHAL3DSETCC 0x3e5c078, 0x3e5ab20, 0x3d23f18 [ID=59]
0x3e5c078: f32,ch = llvm.GHAL3D.fabsf 0x2fe2f48, 0x3e5b0f8, 0x3d23830 [ORD=123] [ID=57]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3e5b0f8: i32 = Constant<50> [ID=18]
0x3d23830: f32 = fdiv 0x3e5acb8, 0x3e5c430 [ORD=103] [ID=55]
0x3e5acb8: f32 = fmul 0x3e5aa98, 0x2fe3690 [ORD=102] [ID=54]
0x3e5aa98: f32 = fmul 0x3e5b4c8, 0x3d23258 [ORD=101] [ID=37]
0x3e5b4c8: f32,ch = CopyFromReg 0x2fe2f48, 0x2fe3be0 [ORD=101] [ID=25]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x2fe3be0: f32 = Register %reg1038 [ORD=101] [ID=6]
0x3d23258: f32 = ConstantFP<-2.000000e+000> [ORD=101] [ID=7]
0x2fe3690: f32 = fadd 0x3d23a50, 0x2fe3cf0 [ORD=100] [ID=53]
0x3d23a50: f32 = fadd 0x3d23500, 0x2fe3608 [ORD=96] [ID=52]
0x3d23500: f32 = fmul 0x2fe3e00, 0x3e5c100 [ORD=92] [ID=47]
0x2fe3e00: f32 = GHAL3DISD::EXTRACTX 0x3d5ab58 [ID=40]
0x3d5ab58: v4f32 = fsub 0x3e5ae50, 0x3d5aad0 [ORD=89] [ID=33]
0x3e5ae50: v4f32,ch = CopyFromReg 0x2fe2f48, 0x3d5a828 [ORD=89] [ID=22]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3d5a828: v4f32 = Register %reg1057 [ORD=89] [ID=3]
0x3d5aad0: v4f32,ch = CopyFromReg 0x2fe2f48, 0x3e5a878 [ORD=89] [ID=23]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3e5a878: v4f32 = Register %reg1056 [ORD=89] [ID=4]
0x3e5c100: f32 = GHAL3DISD::EXTRACTX 0x3d5abe0 [ID=34]
0x3d5abe0: v4f32,ch = CopyFromReg 0x2fe2f48, 0x2fe3580 [ORD=91] [ID=24]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x2fe3580: v4f32 = Register %reg1048 [ORD=91] [ID=5]
0x2fe3608: f32 = fmul 0x3d232e0, 0x2fe3938 [ORD=95] [ID=48]
0x3d232e0: f32 = GHAL3DISD::EXTRACTY 0x3d5ab58 [ID=41]
0x3d5ab58: v4f32 = fsub 0x3e5ae50, 0x3d5aad0 [ORD=89] [ID=33]
0x3e5ae50: v4f32,ch = CopyFromReg 0x2fe2f48, 0x3d5a828 [ORD=89] [ID=22]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3d5a828: v4f32 = Register %reg1057 [ORD=89] [ID=3]
0x3d5aad0: v4f32,ch = CopyFromReg 0x2fe2f48, 0x3e5a878 [ORD=89] [ID=23]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3e5a878: v4f32 = Register %reg1056 [ORD=89] [ID=4]
0x2fe3938: f32 = GHAL3DISD::EXTRACTY 0x3d5abe0 [ID=35]
0x3d5abe0: v4f32,ch = CopyFromReg 0x2fe2f48, 0x2fe3580 [ORD=91] [ID=24]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x2fe3580: v4f32 = Register %reg1048 [ORD=91] [ID=5]
0x2fe3cf0: f32 = fmul 0x3d23610, 0x2fe37a0 [ORD=99] [ID=49]
0x3d23610: f32 = GHAL3DISD::EXTRACTZ 0x3d5ab58 [ID=42]
0x3d5ab58: v4f32 = fsub 0x3e5ae50, 0x3d5aad0 [ORD=89] [ID=33]
0x3e5ae50: v4f32,ch = CopyFromReg 0x2fe2f48, 0x3d5a828 [ORD=89] [ID=22]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3d5a828: v4f32 = Register %reg1057 [ORD=89] [ID=3]
0x3d5aad0: v4f32,ch = CopyFromReg 0x2fe2f48, 0x3e5a878 [ORD=89] [ID=23]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3e5a878: v4f32 = Register %reg1056 [ORD=89] [ID=4]
0x2fe37a0: f32 = GHAL3DISD::EXTRACTZ 0x3d5abe0 [ID=36]
0x3d5abe0: v4f32,ch = CopyFromReg 0x2fe2f48, 0x2fe3580 [ORD=91] [ID=24]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x2fe3580: v4f32 = Register %reg1048 [ORD=91] [ID=5]
0x3e5c430: f32,ch = CopyFromReg 0x2fe2f48, 0x3e5a6e0 [ORD=103] [ID=26]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3e5a6e0: f32 = Register %reg1051 [ORD=103] [ID=8]
0x3e5ab20: f32,ch = CopyFromReg 0x2fe2f48, 0x3e5a988 [ORD=124] [ID=29]
0x2fe2f48: ch = EntryToken [ORD=88] [ID=0]
0x3e5a988: f32 = Register %reg1025 [ORD=124] [ID=11]
0x3d23f18: ch = setgt [ID=20]
0x3d233f0: i32 = Constant<-1> [ORD=88] [ID=2]
0x3e5ac30: i32 = Constant<1> [ID=17]
error: midlevel compiler failed build.

Any idea what this means?

Peter
0 Kudos
5 Replies
Brijender_B_Intel
789 Views

Its hard to know from the error log. If you can share the code that will be great. Or I will suggest start deleting the code from kernel to pin point to a small set of lines which causes this issue.

0 Kudos
peastman
Beginner
789 Views
Here is the source code for the kernel. This is part of OpenMM (https://simtk.org/home/openmm):

#define EXP native_exp
#define LOG native_log
#define RECIP native_recip
#define RSQRT native_rsqrt
#define SQRT native_sqrt
#define WORK_GROUP_SIZE 64

#define CONSTRAIN_VELOCITIES 1

/**
 * Enforce constraints on SHAKE clusters
 */

__kernel void applyShakeToHydrogens(int numClusters, float tol, __global const float4* restrict oldPos, __global float4* restrict posDelta, __global const int4* restrict clusterAtoms, __global const float4* restrict clusterParams) {
    int index = get_global_id(0);
    while (index < numClusters) {
        // Load the data for this cluster.

        int4 atoms = clusterAtoms[index];
        float4 params = clusterParams[index];
        float4 pos = oldPos[atoms.x];
        float4 xpi = posDelta[atoms.x];
        float4 pos1 = oldPos[atoms.y];
        float4 xpj1 = posDelta[atoms.y];
        float4 pos2 = {0.0f, 0.0f, 0.0f, 0.0f};
        float4 xpj2 = {0.0f, 0.0f, 0.0f, 0.0f};
        float invMassCentral = params.x;
        float avgMass = params.y;
        float d2 = params.z;
        float invMassPeripheral = params.w;
        if (atoms.z != -1) {
            pos2 = oldPos[atoms.z];
            xpj2 = posDelta[atoms.z];
        }
        float4 pos3 = {0.0f, 0.0f, 0.0f, 0.0f};
        float4 xpj3 = {0.0f, 0.0f, 0.0f, 0.0f};
        if (atoms.w != -1) {
            pos3 = oldPos[atoms.w];
            xpj3 = posDelta[atoms.w];
        }

        // Precompute quantities.

        float4 rij1 = pos-pos1;
        float4 rij2 = pos-pos2;
        float4 rij3 = pos-pos3;
        float rij1sq = rij1.x*rij1.x + rij1.y*rij1.y + rij1.z*rij1.z;
        float rij2sq = rij2.x*rij2.x + rij2.y*rij2.y + rij2.z*rij2.z;
        float rij3sq = rij3.x*rij3.x + rij3.y*rij3.y + rij3.z*rij3.z;
        float ld1 = d2-rij1sq;
        float ld2 = d2-rij2sq;
        float ld3 = d2-rij3sq;

        // Iterate until convergence.

        bool converged = false;
        int iteration = 0;
        while (iteration < 15 && !converged) {
            converged = true;
#ifdef CONSTRAIN_VELOCITIES
            float4 rpij = xpi-xpj1;
            float rrpr = rpij.x*rij1.x + rpij.y*rij1.y + rpij.z*rij1.z;
            float delta = -2.0f*avgMass*rrpr/rij1sq;
            float4 dr = rij1*delta;
            xpi.xyz += dr.xyz*invMassCentral;
            xpj1.xyz -= dr.xyz*invMassPeripheral;
            if (fabs(delta) > tol)
                converged = false;
            if (atoms.z != -1) {
                rpij = xpi-xpj2;
                rrpr = rpij.x*rij2.x + rpij.y*rij2.y + rpij.z*rij2.z;
                delta = -2.0f*avgMass*rrpr/rij2sq;
                dr = rij2*delta;
                xpi.xyz += dr.xyz*invMassCentral;
                xpj2.xyz -= dr.xyz*invMassPeripheral;
                if (fabs(delta) > tol)
                    converged = false;
            }
            if (atoms.w != -1) {
                rpij = xpi-xpj3;
                rrpr = rpij.x*rij3.x + rpij.y*rij3.y + rpij.z*rij3.z;
                delta = -2.0f*avgMass*rrpr/rij3sq;
                dr = rij3*delta;
                xpi.xyz += dr.xyz*invMassCentral;
                xpj3.xyz -= dr.xyz*invMassPeripheral;
                if (fabs(delta) > tol)
                    converged = false;
            }
#else
            float4 rpij = xpi-xpj1;
            float rpsqij = rpij.x*rpij.x + rpij.y*rpij.y + rpij.z*rpij.z;
            float rrpr = rij1.x*rpij.x + rij1.y*rpij.y + rij1.z*rpij.z;
            float diff = fabs(ld1-2.0f*rrpr-rpsqij) / (d2*tol);
            if (diff >= 1.0f) {
                float acor  = (ld1-2.0f*rrpr-rpsqij)*avgMass / (rrpr+rij1sq);
                float4 dr = rij1*acor;
                xpi.xyz += dr.xyz*invMassCentral;
                xpj1.xyz -= dr.xyz*invMassPeripheral;
                converged = false;
            }
            if (atoms.z != -1) {
                rpij.xyz = xpi.xyz-xpj2.xyz;
                rpsqij = rpij.x*rpij.x + rpij.y*rpij.y + rpij.z*rpij.z;
                rrpr = rij2.x*rpij.x + rij2.y*rpij.y + rij2.z*rpij.z;
                diff = fabs(ld2-2.0f*rrpr-rpsqij) / (d2*tol);
                if (diff >= 1.0f) {
                    float acor  = (ld2 - 2.0f*rrpr - rpsqij)*avgMass / (rrpr + rij2sq);
                    float4 dr = rij2*acor;
                    xpi.xyz += dr.xyz*invMassCentral;
                    xpj2.xyz -= dr.xyz*invMassPeripheral;
                    converged = false;
                }
            }
            if (atoms.w != -1) {
                rpij.xyz = xpi.xyz-xpj3.xyz;
                rpsqij = rpij.x*rpij.x + rpij.y*rpij.y + rpij.z*rpij.z;
                rrpr = rij3.x*rpij.x + rij3.y*rpij.y + rij3.z*rpij.z;
                diff = fabs(ld3 - 2.0f*rrpr - rpsqij) / (d2*tol);
                if (diff >= 1.0f) {
                    float acor  = (ld3-2.0f*rrpr-rpsqij)*avgMass / (rrpr+rij3sq);
                    float4 dr = rij3*acor;
                    xpi.xyz += dr.xyz*invMassCentral;
                    xpj3.xyz -= dr.xyz*invMassPeripheral;
                    converged = false;
                }
            }
#endif
            iteration++;
        }

        // Record the new positions.

        posDelta[atoms.x] = xpi;
        posDelta[atoms.y] = xpj1;
        if (atoms.z != -1)
            posDelta[atoms.z] = xpj2;
        if (atoms.w != -1)
            posDelta[atoms.w] = xpj3;
        index += get_global_size(0);
    }
}

Peter

0 Kudos
Raghupathi_M_Intel
789 Views
Hi Peter,

Thanks for providing the kernel. I am able to reproduce the build error on the GPU using the 2012SDK. I'll get back to you on my findings after debug it more.

Thanks,
Raghu
0 Kudos
Raghupathi_M_Intel
789 Views
I was able to narrow down the test case to a few lines causing the build to fail. I have filed a bug and will let you know if development suggests a fix or a workaround.

Thanks,
Raghu
0 Kudos
peastman
Beginner
789 Views
Great, thank you for the update.

Peter
0 Kudos
Reply