OpenCL* for CPU
Ask questions and share information on Intel® SDK for OpenCL™ Applications and OpenCL™ implementations for Intel® CPU.
Announcements
This forum covers OpenCL* for CPU only. OpenCL* for GPU questions can be asked in the GPU Compute Software forum. Intel® FPGA SDK for OpenCL™ questions can be ask in the FPGA Intel® High Level Design forum.
1718 Discussions

getting a crash when building a program

Moore__Christopher
296 Views

Crashing on build

/*

Kernel is attempting to average float8 values between neighboring elements of inputBuffer and then multiply them by associated 64 elements in matrixBuffer
Crash occurs when DIM is 2 or 3, both multiplies are performed, and both stateL and stateR are read from.
Setting DIM to 1 avoids the crash.
Reading only stateR avoids the crash.
*/

#define DIM 3

__kernel void calcFlux(
	__global float8* resultBuffer,
	const __global float8* inputBuffer,
	const __global float8* matrixBuffer)
{
#if DIM == 1
	int4 i = (int4)(get_global_id(0), 0, 0, 0);
	if (i.x < 1 || i.x >= 64) return;
#elif DIM == 2
	int4 i = (int4)(get_global_id(0), get_global_id(1), 0, 0);
	if (i.x < 1 || i.x >= 64 || i.y < 1 || i.y >= 64) return;
#elif DIM == 3
	int4 i = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
	if (i.x < 1 || i.x >= 64 || i.y < 1 || i.y >= 64 || i.z < 1 || i.z >= 64) return;
#else
#error unsupported DIM
#endif
	
	int index = i.x + 64 * (i.y + 64 * i.z);
	
	for (int side = 0; side < DIM; ++side) {
		int4 iPrev = i;
		--iPrev[side];
		int indexPrev = iPrev.x + 64 * (iPrev.y + 64 * iPrev.z);

		float8 stateL = inputBuffer[indexPrev];
		float8 stateR = inputBuffer[index];

		int interfaceIndex = side + DIM * index;

		float8 stateAvg = (stateR + stateL) * .5f;
	
#if 1	//crashes
		float8 result;
		for (int i = 0; i < 8; ++i) {
			float8 mrow = matrixBuffer[i + 8 * interfaceIndex];
			result = dot(mrow.s0123, stateAvg.s0123) + dot(mrow.s4567, stateAvg.s4567);
		}
		resultBuffer[interfaceIndex] = result;
#endif

#if 0	//crashes
		float8 result = (float8)(0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f);
		for (int i = 0; i < 8; ++i) {
			float8 mrow = matrixBuffer[i + 8 * interfaceIndex];
			for (int j = 0; j < 8; ++j) {
				result += mrow * stateAvg;
			}
		}
		resultBuffer[interfaceIndex] = result;
#endif

#if 0	//crashes
		float8 ma = matrixBuffer[0 + 8 * interfaceIndex];
		float8 mb = matrixBuffer[1 + 8 * interfaceIndex];
		float8 mc = matrixBuffer[2 + 8 * interfaceIndex];
		float8 md = matrixBuffer[3 + 8 * interfaceIndex];
		float8 me = matrixBuffer[4 + 8 * interfaceIndex];
		float8 mf = matrixBuffer[5 + 8 * interfaceIndex];
		float8 mg = matrixBuffer[6 + 8 * interfaceIndex];
		float8 mh = matrixBuffer[7 + 8 * interfaceIndex];
		float ra = dot(ma.s0123, stateAvg.s0123) + dot(ma.s4567, stateAvg.s4567);
		float rb = dot(mb.s0123, stateAvg.s0123) + dot(mb.s4567, stateAvg.s4567);
		float rc = dot(mc.s0123, stateAvg.s0123) + dot(mc.s4567, stateAvg.s4567);
		float rd = dot(md.s0123, stateAvg.s0123) + dot(md.s4567, stateAvg.s4567);
		float re = dot(me.s0123, stateAvg.s0123) + dot(me.s4567, stateAvg.s4567);
		float rf = dot(mf.s0123, stateAvg.s0123) + dot(mf.s4567, stateAvg.s4567);
		float rg = dot(mg.s0123, stateAvg.s0123) + dot(mg.s4567, stateAvg.s4567);
		float rh = dot(mh.s0123, stateAvg.s0123) + dot(mh.s4567, stateAvg.s4567);
		resultBuffer[interfaceIndex] = (float8)(ra, rb, rc, rd, re, rf, rg, rh);
#endif
	}
}

 

stack trace:

Exception Type:  EXC_BAD_ACCESS (SIGSEGV)
Exception Codes: KERN_INVALID_ADDRESS at 0x000000000000000c

VM Regions Near 0xc:
--> 
    __TEXT                 00000001071e6000-00000001071f4000 [   56K] r-x/rwx SM=COW  /Users/USER/*

Thread 0 Crashed:: Dispatch queue: com.apple.main-thread
0   com.apple.driver.AppleIntelHD4000GraphicsGLDriver    0x00001234003fbaa8 gldBuildComputeProgram + 84
1   com.apple.opencl                  0x00007fff91c302a4 0x7fff91c2c000 + 17060
2   com.apple.opencl                  0x00007fff91c3e574 clBuildProgram + 2072
3   test                              0x00000001071e9e0f cl::Program::build(std::__1::vector<cl::Device, std::__1::allocator<cl::Device> > const&, char const*, void (*)(_cl_program*, void*), void*) const + 255
4   test                              0x00000001071e90f1 main + 6033
5   libdyld.dylib                     0x00007fff91bb95fd start + 1

 

CL device properties:

CL_DEVICE_NAME:	HD Graphics 4000
CL_DEVICE_VENDOR:	Intel
CL_DEVICE_VERSION:	OpenCL 1.2 
CL_DRIVER_VERSION:	1.2(May  5 2014 20:39:17)
CL_DEVICE_VENDOR_ID:	16925696
CL_DEVICE_PLATFORM:	0x7fff0000
CL_DEVICE_AVAILABLE:	1
CL_DEVICE_COMPILER_AVAILABLE:	1
CL_DEVICE_MAX_CLOCK_FREQUENCY:	1200
CL_DEVICE_MAX_COMPUTE_UNITS:	16
CL_DEVICE_TYPE:	4
CL_DEVICE_ADDRESS_BITS:	
	CL_FP_SOFT_FLOAT
CL_DEVICE_HALF_FP_CONFIG:	-failed-
CL_DEVICE_SINGLE_FP_CONFIG:	
	CL_FP_INF_NAN
	CL_FP_ROUND_TO_NEAREST
	CL_FP_ROUND_TO_ZERO
	CL_FP_ROUND_TO_INF
CL_DEVICE_ENDIAN_LITTLE:	1
CL_DEVICE_EXECUTION_CAPABILITIES:	
	CL_EXEC_KERNEL
CL_DEVICE_ADDRESS_BITS:	64
CL_DEVICE_GLOBAL_MEM_SIZE:	1073741824
CL_DEVICE_GLOBAL_MEM_CACHE_SIZE:	0
CL_DEVICE_GLOBAL_MEM_CACHE_TYPE:	0
CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE:	0
CL_DEVICE_LOCAL_MEM_SIZE:	65536
CL_DEVICE_LOCAL_MEM_TYPE:	1
CL_DEVICE_MEM_BASE_ADDR_ALIGN:	1024
CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE:	128
CL_DEVICE_IMAGE_SUPPORT:	1
CL_DEVICE_IMAGE2D_MAX_WIDTH:	16384
CL_DEVICE_IMAGE2D_MAX_HEIGHT:	16384
CL_DEVICE_IMAGE3D_MAX_WIDTH:	2048
CL_DEVICE_IMAGE3D_MAX_HEIGHT:	2048
CL_DEVICE_IMAGE3D_MAX_DEPTH:	2048
CL_DEVICE_MAX_CONSTANT_ARGS:	8
CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:	-failed-
CL_DEVICE_MAX_MEM_ALLOC_SIZE:	268435456
CL_DEVICE_MAX_PARAMETER_SIZE:	1024
CL_DEVICE_MAX_READ_IMAGE_ARGS:	128
CL_DEVICE_MAX_WRITE_IMAGE_ARGS:	8
CL_DEVICE_MAX_SAMPLERS:	16
CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR:	1
CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT:	1
CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT:	1
CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG:	1
CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT:	1
CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE:	0
CL_DEVICE_MAX_WORK_GROUP_SIZE:	512
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:	3
CL_DEVICE_MAX_WORK_ITEM_SIZES:	(512, 512, 512)
CL_DEVICE_PROFILE:	FULL_PROFILE
CL_DEVICE_PROFILING_TIMER_RESOLUTION:	80
CL_DEVICE_QUEUE_PROPERTIES:	2

 

0 Kudos
1 Reply
Moore__Christopher
296 Views

The for-loop is the problem.

A #pragma unroll does not fix it -- the segfault still occurs upon building the program.

However manually unrolling the loop or keeping the loop and moving the loop body into a separate function does fix it.

0 Kudos
Reply