- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Crashing on build
/*
Kernel is attempting to average float8 values between neighboring elements of inputBuffer and then multiply them by associated 64 elements in matrixBuffer Crash occurs when DIM is 2 or 3, both multiplies are performed, and both stateL and stateR are read from. Setting DIM to 1 avoids the crash. Reading only stateR avoids the crash. */ #define DIM 3 __kernel void calcFlux( __global float8* resultBuffer, const __global float8* inputBuffer, const __global float8* matrixBuffer) { #if DIM == 1 int4 i = (int4)(get_global_id(0), 0, 0, 0); if (i.x < 1 || i.x >= 64) return; #elif DIM == 2 int4 i = (int4)(get_global_id(0), get_global_id(1), 0, 0); if (i.x < 1 || i.x >= 64 || i.y < 1 || i.y >= 64) return; #elif DIM == 3 int4 i = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); if (i.x < 1 || i.x >= 64 || i.y < 1 || i.y >= 64 || i.z < 1 || i.z >= 64) return; #else #error unsupported DIM #endif int index = i.x + 64 * (i.y + 64 * i.z); for (int side = 0; side < DIM; ++side) { int4 iPrev = i; --iPrev[side]; int indexPrev = iPrev.x + 64 * (iPrev.y + 64 * iPrev.z); float8 stateL = inputBuffer[indexPrev]; float8 stateR = inputBuffer[index]; int interfaceIndex = side + DIM * index; float8 stateAvg = (stateR + stateL) * .5f; #if 1 //crashes float8 result; for (int i = 0; i < 8; ++i) { float8 mrow = matrixBuffer[i + 8 * interfaceIndex]; result = dot(mrow.s0123, stateAvg.s0123) + dot(mrow.s4567, stateAvg.s4567); } resultBuffer[interfaceIndex] = result; #endif #if 0 //crashes float8 result = (float8)(0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f); for (int i = 0; i < 8; ++i) { float8 mrow = matrixBuffer[i + 8 * interfaceIndex]; for (int j = 0; j < 8; ++j) { result += mrow* stateAvg ; } } resultBuffer[interfaceIndex] = result; #endif #if 0 //crashes float8 ma = matrixBuffer[0 + 8 * interfaceIndex]; float8 mb = matrixBuffer[1 + 8 * interfaceIndex]; float8 mc = matrixBuffer[2 + 8 * interfaceIndex]; float8 md = matrixBuffer[3 + 8 * interfaceIndex]; float8 me = matrixBuffer[4 + 8 * interfaceIndex]; float8 mf = matrixBuffer[5 + 8 * interfaceIndex]; float8 mg = matrixBuffer[6 + 8 * interfaceIndex]; float8 mh = matrixBuffer[7 + 8 * interfaceIndex]; float ra = dot(ma.s0123, stateAvg.s0123) + dot(ma.s4567, stateAvg.s4567); float rb = dot(mb.s0123, stateAvg.s0123) + dot(mb.s4567, stateAvg.s4567); float rc = dot(mc.s0123, stateAvg.s0123) + dot(mc.s4567, stateAvg.s4567); float rd = dot(md.s0123, stateAvg.s0123) + dot(md.s4567, stateAvg.s4567); float re = dot(me.s0123, stateAvg.s0123) + dot(me.s4567, stateAvg.s4567); float rf = dot(mf.s0123, stateAvg.s0123) + dot(mf.s4567, stateAvg.s4567); float rg = dot(mg.s0123, stateAvg.s0123) + dot(mg.s4567, stateAvg.s4567); float rh = dot(mh.s0123, stateAvg.s0123) + dot(mh.s4567, stateAvg.s4567); resultBuffer[interfaceIndex] = (float8)(ra, rb, rc, rd, re, rf, rg, rh); #endif } }
stack trace:
Exception Type: EXC_BAD_ACCESS (SIGSEGV)
Exception Codes: KERN_INVALID_ADDRESS at 0x000000000000000c
VM Regions Near 0xc:
-->
__TEXT 00000001071e6000-00000001071f4000 [ 56K] r-x/rwx SM=COW /Users/USER/*
Thread 0 Crashed:: Dispatch queue: com.apple.main-thread
0 com.apple.driver.AppleIntelHD4000GraphicsGLDriver 0x00001234003fbaa8 gldBuildComputeProgram + 84
1 com.apple.opencl 0x00007fff91c302a4 0x7fff91c2c000 + 17060
2 com.apple.opencl 0x00007fff91c3e574 clBuildProgram + 2072
3 test 0x00000001071e9e0f cl::Program::build(std::__1::vector<cl::Device, std::__1::allocator<cl::Device> > const&, char const*, void (*)(_cl_program*, void*), void*) const + 255
4 test 0x00000001071e90f1 main + 6033
5 libdyld.dylib 0x00007fff91bb95fd start + 1
CL device properties:
CL_DEVICE_NAME: HD Graphics 4000 CL_DEVICE_VENDOR: Intel CL_DEVICE_VERSION: OpenCL 1.2 CL_DRIVER_VERSION: 1.2(May 5 2014 20:39:17) CL_DEVICE_VENDOR_ID: 16925696 CL_DEVICE_PLATFORM: 0x7fff0000 CL_DEVICE_AVAILABLE: 1 CL_DEVICE_COMPILER_AVAILABLE: 1 CL_DEVICE_MAX_CLOCK_FREQUENCY: 1200 CL_DEVICE_MAX_COMPUTE_UNITS: 16 CL_DEVICE_TYPE: 4 CL_DEVICE_ADDRESS_BITS: CL_FP_SOFT_FLOAT CL_DEVICE_HALF_FP_CONFIG: -failed- CL_DEVICE_SINGLE_FP_CONFIG: CL_FP_INF_NAN CL_FP_ROUND_TO_NEAREST CL_FP_ROUND_TO_ZERO CL_FP_ROUND_TO_INF CL_DEVICE_ENDIAN_LITTLE: 1 CL_DEVICE_EXECUTION_CAPABILITIES: CL_EXEC_KERNEL CL_DEVICE_ADDRESS_BITS: 64 CL_DEVICE_GLOBAL_MEM_SIZE: 1073741824 CL_DEVICE_GLOBAL_MEM_CACHE_SIZE: 0 CL_DEVICE_GLOBAL_MEM_CACHE_TYPE: 0 CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE: 0 CL_DEVICE_LOCAL_MEM_SIZE: 65536 CL_DEVICE_LOCAL_MEM_TYPE: 1 CL_DEVICE_MEM_BASE_ADDR_ALIGN: 1024 CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE: 128 CL_DEVICE_IMAGE_SUPPORT: 1 CL_DEVICE_IMAGE2D_MAX_WIDTH: 16384 CL_DEVICE_IMAGE2D_MAX_HEIGHT: 16384 CL_DEVICE_IMAGE3D_MAX_WIDTH: 2048 CL_DEVICE_IMAGE3D_MAX_HEIGHT: 2048 CL_DEVICE_IMAGE3D_MAX_DEPTH: 2048 CL_DEVICE_MAX_CONSTANT_ARGS: 8 CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: -failed- CL_DEVICE_MAX_MEM_ALLOC_SIZE: 268435456 CL_DEVICE_MAX_PARAMETER_SIZE: 1024 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8 CL_DEVICE_MAX_SAMPLERS: 16 CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR: 1 CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT: 1 CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT: 1 CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG: 1 CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT: 1 CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE: 0 CL_DEVICE_MAX_WORK_GROUP_SIZE: 512 CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: 3 CL_DEVICE_MAX_WORK_ITEM_SIZES: (512, 512, 512) CL_DEVICE_PROFILE: FULL_PROFILE CL_DEVICE_PROFILING_TIMER_RESOLUTION: 80 CL_DEVICE_QUEUE_PROPERTIES: 2
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
The for-loop is the problem.
A #pragma unroll does not fix it -- the segfault still occurs upon building the program.
However manually unrolling the loop or keeping the loop and moving the loop body into a separate function does fix it.
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page