OpenCL* for CPU
Ask questions and share information on Intel® SDK for OpenCL™ Applications and OpenCL™ implementations for Intel® CPU.

Crash in Intel(R) OpenCL Offline Compiler (x64)

Alex_Tar
Beginner

The following code crashes the Offline Compiler:

[bash]
void sum_reduce_and_store(__local float *sdata, __global float *store_arr, float value, int store_off)
{
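	/* tree-reduce each work-item's value in local memory, then have
	   work-item 0 store the work-group sum; assumes a power-of-two
	   work-group size of at most 512 */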
	unsigned int lsz = get_local_size(0);
	unsigned int lid = get_local_id(0);
	sdata[lid] = value;
	barrier(CLK_LOCAL_MEM_FENCE);
	if (lsz != 1) {
		if (lsz >= 512) { if (lid < 256) { sdata[lid] += sdata[lid + 256]; } barrier(CLK_LOCAL_MEM_FENCE); }
		if (lsz >= 256) { if (lid < 128) { sdata[lid] += sdata[lid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); }
		if (lsz >= 128) { if (lid <  64) { sdata[lid] += sdata[lid +  64]; } barrier(CLK_LOCAL_MEM_FENCE); }
		if (lid <  32) { sdata[lid] += sdata[lid +  32]; } barrier(CLK_LOCAL_MEM_FENCE);
		if (lid <  16) { sdata[lid] += sdata[lid +  16]; } barrier(CLK_LOCAL_MEM_FENCE);
		if (lid <  8) { sdata[lid] += sdata[lid +  8]; } barrier(CLK_LOCAL_MEM_FENCE);
		if (lid <  4) { sdata[lid] += sdata[lid +  4]; } barrier(CLK_LOCAL_MEM_FENCE);
		if (lid <  2) { sdata[lid] += sdata[lid +  2]; } barrier(CLK_LOCAL_MEM_FENCE);
		if (lid <  1) { sdata[lid] += sdata[lid +  1]; } barrier(CLK_LOCAL_MEM_FENCE);
	}
	if (lid == 0) store_arr[store_off] = sdata[0];
	barrier(CLK_LOCAL_MEM_FENCE);
}
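/* derivative of the activation function selected by act_func; the
   nested ternaries inline clamping of outputs (cases 3-6) and the
   absolute value of sums (cases 10-11) */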
float activation_derived(float steepness, int act_func, __global float *outputs, __global float *sums, int o_i)
{
	switch (act_func)
	{
		case 0:
		case 12:
		case 13:
			return (steepness);
		case 3:
		case 4:
			return (2.0f * steepness
				* (((outputs[o_i]) < (0.01f)) ? (0.01f) : (((outputs[o_i]) > (0.99f)) ? (0.99f) : (outputs[o_i])))
				* (1.0f - (((outputs[o_i]) < (0.01f)) ? (0.01f) : (((outputs[o_i]) > (0.99f)) ? (0.99f) : (outputs[o_i])))));
		case 5:
		case 6:
			return steepness * (1.0f
				- ((((outputs[o_i]) < (-0.98f)) ? (-0.98f) : (((outputs[o_i]) > (0.98f)) ? (0.98f) : (outputs[o_i])))
					* (((outputs[o_i]) < (-0.98f)) ? (-0.98f) : (((outputs[o_i]) > (0.98f)) ? (0.98f) : (outputs[o_i])))));
		case 7:
			return (-2.0f * sums[o_i] * outputs[o_i] * steepness * steepness);
		case 8:
			return (-2.0f * sums[o_i] * (outputs[o_i]+1.0f) * steepness * steepness);
		case 10:
			return (steepness * 1.0f / (2.0f
				* (1.0f + (((sums[o_i]) > 0) ? (sums[o_i]) : -(sums[o_i])))
				* (1.0f + (((sums[o_i]) > 0) ? (sums[o_i]) : -(sums[o_i])))));
		case 11:
			return (steepness * 1.0f / ((1.0f + (((sums[o_i]) > 0) ? (sums[o_i]) : -(sums[o_i])))
				* (1.0f + (((sums[o_i]) > 0) ? (sums[o_i]) : -(sums[o_i])))));
		case 14:
			return (steepness*cos(steepness*sums[o_i]));
		case 15:
			return (steepness*-sin(steepness*sums[o_i]));
		case 16:
			return (steepness*cos(steepness*sums[o_i])/2.0f);
		case 17:
			return (steepness*-sin(steepness*sums[o_i])/2.0f);
		case 2: //This should be an error
		case 1: //This should be an error
		case 9: //FIXME
			return -99.0f;
		default: return 0;
	}
}
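/* back-propagates the training error through the layers; sizes[]
   apparently packs the network geometry (per-layer strides/counts),
   with sizes[5] the number of inputs and sizes[7] the work-group size */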
void backpropagate_MSE(__constant unsigned int *sizes,
			__global unsigned int *num_layers,
			__global unsigned int *num_neurons, 
			__global unsigned int *num_inputs,
			__global unsigned int *num_outputs,
			__global float *steepness,
			__global int *activation,
			__global float *weights,
			__global float *inputs,
			__global float *sums,
			__global float *outputs,
			__global float *train_errors,
			__global float *weights_deltas,
			__local float *steep_s,
			__local int *act_s,
			__local float *weights_s,
			__local float *reduce_s )
{
	unsigned int input_id = get_global_id(0);
	unsigned int lid = get_local_id(0);
	unsigned int lsz = get_local_size(0);
	unsigned int gnum;
	unsigned int gid = get_group_id(0);
	int l;
	if (sizes[5] % sizes[7])
		gnum = 1 + (sizes[5] / sizes[7]);
	else
		gnum = sizes[5] / sizes[7];
	for(l = num_layers[get_global_id(1)]-1; l >= 0; --l) {
		unsigned int part_layer_off = get_global_id(1)*sizes[1]+l;
		unsigned int num_neurons_l = num_neurons[part_layer_off];
		unsigned int n_layer_off = sizes[2]*part_layer_off;
		unsigned int o_layer_off = sizes[4]*part_layer_off;
		unsigned int output_off = o_layer_off-sizes[4];
		unsigned int n;
		barrier(CLK_LOCAL_MEM_FENCE);
		for(n = 0; n < num_neurons_l; n += lsz) {
			unsigned int neuron_num = n+lid;
			if (neuron_num < num_neurons[part_layer_off]){
				steep_s[neuron_num] = steepness[n_layer_off+neuron_num];
				act_s[neuron_num] = activation[n_layer_off+neuron_num];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		for(n = 0; n < num_neurons_l && l != 0; ++n) {
			unsigned int num_outputs_l = num_outputs[n_layer_off+n];
			unsigned int o;
			for(o = 0; o < num_outputs_l; ++o) {
				if (sizes[5] > input_id)
					train_errors[output_off*sizes[5]+input_id] = 0.0f;
				++output_off;
			}
		}
		output_off = o_layer_off;
		for(n = 0; n < num_neurons[part_layer_off]; ++n) {
			unsigned int num_outputs_l = num_outputs[n_layer_off+n];
			unsigned int num_inputs_l  = num_inputs[n_layer_off+n];
			unsigned int o;
			for(o = 0; o < num_outputs_l; ++o) {
				unsigned int i;
				unsigned int o_i = output_off*sizes[5]+input_id;
				float error;
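				/* note: steep_s and act_s are __local pointers, but
				   activation_derived declares scalar float/int parameters;
				   steep_s[n] and act_s[n] were probably intended */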
				if (sizes[5] > input_id)
					train_errors[o_i] = error =
					train_errors[o_i]*activation_derived(steep_s, act_s, outputs, sums, o_i);
				for(i = 0; i < num_inputs_l; ++i) {
					unsigned int weights_i = 0;
					unsigned int prev_output_i = 0;
					float delta = 0.0f;
					if (l != 0) {
						weights_i = (sizes[3]*o+i) % lsz;
						if (weights_i == 0) {
							barrier(CLK_LOCAL_MEM_FENCE);
							if (sizes[3]*o+i+lid < sizes[3]*num_outputs_l)
								weights_s[lid] = weights[output_off*sizes[3]+i+lid];
							barrier(CLK_LOCAL_MEM_FENCE);
						}
					}
					if (sizes[5] > input_id) {
						if(i == num_inputs_l-1){
							prev_output_i = (o_layer_off-sizes[4]+i)*sizes[5]+input_id;
							delta = error;
						} else if(l == 0) {
							delta = inputs[i*sizes[5]+input_id] * error;
						} else {
							prev_output_i = (o_layer_off-sizes[4]+i)*sizes[5]+input_id;
							delta = outputs[prev_output_i] * error;
						}
					}
					sum_reduce_and_store(reduce_s, weights_deltas, delta,
						(output_off*sizes[3]+i)*gnum+gid);
					if(l != 0 && sizes[5] > input_id)
						train_errors[prev_output_i] += error * weights_s[weights_i];
				}
				++output_off;
			}
		}
	}
}

__kernel void train_batch(
		__constant unsigned int *sizes,
		__global float *f_params,
		__global unsigned int *num_layers,
		__global unsigned int *num_neurons,
		__global unsigned int *num_inputs,
		__global unsigned int *num_outputs,
		__global float *steepness,
		__global int *activation,
		__global float *weights,
		__global float *inputs,
		__global float *sums,
		__global float *outputs,
		__global float *train_errors,
		__global float *actual_outputs,
		__global float *MSE_values,
		__global float *num_bit_fail,
		__global float *weights_deltas,
		__local float *steep_s,
		__local int *act_s,
		__local float *weights_s,
		__local float *reduce_s)
{
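	/* kernel entry point; currently it runs only the back-propagation pass */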
	backpropagate_MSE(sizes, num_layers, num_neurons, num_inputs, num_outputs,
		steepness, activation, weights, inputs, sums,
		outputs, train_errors, weights_deltas,
		steep_s, act_s, weights_s, reduce_s);
}

[/bash]
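One more observation that may help with triage: the call to activation_derived inside backpropagate_MSE passes the __local pointers steep_s and act_s where the function declares scalar float/int parameters. That probably should be steep_s[n] and act_s[n]; either way, I'd expect the compiler to report a diagnostic rather than crash. A minimal sketch of the pattern I believe was intended (scale_by_steepness and demo are hypothetical stand-ins, not my original code):

[cpp]
/* Sketch only: pass the dereferenced per-neuron elements, not the
   raw __local pointers. scale_by_steepness is a hypothetical
   stand-in for activation_derived. */
float scale_by_steepness(float steepness, int act_func, float x)
{
	return (act_func == 0) ? steepness : steepness * x;
}

__kernel void demo(__global const float *steepness,
		__global const int *activation,
		__global float *out,
		__local float *steep_s,
		__local int *act_s)
{
	unsigned int gid = get_global_id(0);
	unsigned int lid = get_local_id(0);
	/* stage per-neuron scalars into local memory, as in the code above */
	steep_s[lid] = steepness[gid];
	act_s[lid] = activation[gid];
	barrier(CLK_LOCAL_MEM_FENCE);
	/* scalar elements steep_s[lid] / act_s[lid], not the pointers */
	out[gid] = scale_by_steepness(steep_s[lid], act_s[lid], out[gid]);
}
[/cpp]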
Michael_Downey
Beginner
I expect you are running into the same bug I am: http://software.intel.com/en-us/forums/showthread.php?t=83665&o=a&s=lr

Did you run your code with the latest release? The release notes don't list my bug as fixed, and I haven't installed the latest release yet to check, so you'll save me some time if you have.
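If you want to check outside the offline tool, here's a minimal host harness (a sketch; it assumes the kernel source is saved as crash.cl) that feeds the same source to the runtime compiler via clBuildProgram and prints the build log, so you can see whether it trips over the same source:

[cpp]
/* Minimal sketch: build crash.cl with the OpenCL runtime compiler
   and dump the build log. crash.cl is an assumed file name. */
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>

int main(void)
{
	FILE *f = fopen("crash.cl", "rb");
	long len;
	char *src, *log;
	cl_platform_id platform;
	cl_device_id device;
	cl_context ctx;
	cl_program prog;
	cl_int err;
	size_t log_len = 0;

	if (!f) { fprintf(stderr, "cannot open crash.cl\n"); return 1; }
	fseek(f, 0, SEEK_END);
	len = ftell(f);
	fseek(f, 0, SEEK_SET);
	src = malloc(len + 1);
	if (fread(src, 1, len, f) != (size_t)len) { fclose(f); return 1; }
	src[len] = '\0';
	fclose(f);

	clGetPlatformIDs(1, &platform, NULL);
	clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
	ctx = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
	prog = clCreateProgramWithSource(ctx, 1, (const char **)&src, NULL, NULL);
	err = clBuildProgram(prog, 1, &device, "", NULL, NULL);

	clGetProgramBuildInfo(prog, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_len);
	log = malloc(log_len + 1);
	clGetProgramBuildInfo(prog, device, CL_PROGRAM_BUILD_LOG, log_len, log, NULL);
	log[log_len] = '\0';
	printf("clBuildProgram returned %d\nbuild log:\n%s\n", (int)err, log);
	free(log);
	free(src);
	return 0;
}
[/cpp]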
Alex_Tar
Beginner

Yes, I'm using the latest release. What makes it stranger is that this code compiled with the beta version.

Uri_L_Intel
Employee

Hello Alex,

We've managed to reproduce the failure, and the issue was submitted to the compiler team to find the root cause.

We'll work to fix it in our next release.

Thanks,

Uri Levy
