OpenCL* for CPU
Ask questions and share information on Intel® SDK for OpenCL™ Applications and OpenCL™ implementations for Intel® CPU.
This forum covers OpenCL* for CPU only. OpenCL* for GPU questions can be asked in the GPU Compute Software forum. Intel® FPGA SDK for OpenCL™ questions can be asked in the FPGA Intel® High Level Design forum.

Different output results after the driver update



I have been using the HD4600.
After updating the graphics driver,
the results produced by my OpenCL kernel changed.

Is there a way to avoid this?

(attached the resources)

· OpenCL SDK
Intel SDK for OpenCL Applications 2016

· GPU driver version
before: (2014/03/11)
after: (2015/08/04)

clGetDeviceIDs() -
cl_device_type device_type = CL_DEVICE_TYPE_GPU; // (Intel(R) HD Graphics 4600)

const char *options = NULL

1. Have you tried the latest driver?

2. Could you please provide a complete example and steps to reproduce?

3. What did you get before? What are you getting now?

4. What are the sizes of your input/output buffers? What is the size of your enqueue? global/local sizes? Complete example would help a lot!


BTW, on the latest and greatest production driver (4380) on Windows 10, I am getting the results I expect from your kernel. The assembly for your kernel looks correct as well. So it could be that what you got previously is in error.

Dear Robert,
Thank you for your kind reply.

1. Have you tried the latest driver?
  I tried it, but the results did not change.

2. Could you please provide a complete example and steps to reproduce?
3. What did you get before? What are you getting now?
4. What are the sizes of your input/output buffers? What is the size of your enqueue? global/local sizes? Complete example would help a lot!
  I have uploaded all of the data.
  Please check the readme.txt.

BTW, on the latest and greatest production driver (4380) on Windows 10, I am getting the results I expect from your kernel. The assembly for your kernel looks correct as well. So it could be that what you got previously is in error.
  I cannot update to Windows 10 right now. However, I plan to update later.

Please let me ask my question again.
Below are a new program and the results of running it.
The problem appears to depend on local_work_size.

GPU:Intel(R) HD Graphics 4600
OpenCL SDK: 2016
OpenCL Code Builder:

#include "stdafx.h"
#include <Windows.h>

#include <stdio.h>
#include <string.h>

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include "CL/cl.h"
#endif

cl_device_id  g_DeviceID  = 0;
cl_context   g_Context  = 0;
cl_command_queue g_CommandQueue = 0;
cl_program   clProgram  = 0;
cl_kernel   clKernel  = 0;
cl_mem    g_InputBinBuf = 0;
cl_mem    g_OutputBuf  = 0;

// Substring looked for in CL_DEVICE_NAME when choosing the device.
const char *DRIVER_NAME =
    "Intel(R) HD Graphics 4600";

// Dimensions of the 2-D test grid, in elements.
#define INPUT_BIN_WIDTH  (8)
#define INPUT_BIN_HEIGHT (4)

// Total number of elements in the grid. DATA_SIZE and the host arrays
// depend on it, so provide it here if nothing else has defined it.
#ifndef DATA_NUM
#define DATA_NUM    (INPUT_BIN_WIDTH * INPUT_BIN_HEIGHT)
#endif

// Size of the grid in bytes; elements are unsigned int on the host side.
#define DATA_SIZE   (DATA_NUM * sizeof(unsigned int))

// Test Data
// Host-side input grid, stored row by row (8 values per row, 4 rows).
// Only row 1, column 0 is non-zero: 0x00000102 has low byte 0x02 and
// second byte 0x01, so TestKernel should write 0x01 + 0x02 = 3 there.
unsigned int InputData[DATA_NUM] = {
    0,          0, 0, 0, 0, 0, 0, 0,
    0x00000102, 0, 0, 0, 0, 0, 0, 0,
    0,          0, 0, 0, 0, 0, 0, 0,
    0,          0, 0, 0, 0, 0, 0, 0
};

// Kernel Code
// OpenCL C source for TestKernel, handed to clCreateProgramWithSource()
// in Init() and built at run time.
//
// For the work-item at (gx, gy) the kernel:
//   - computes the linear index pos = gy * iImageWidth + gx,
//   - reads Input = iRgbaImage[pos],
//   - sets result to the low byte of Input,
//   - if the second byte (bits 8..15) is non-zero, sets result to
//     (second byte + low byte) instead,
//   - stores result to oBuffer[pos].
//
// Example: Input = 0x00000102 gives result = 0x01 + 0x02 = 3.
const char* kernel_code = ""
"__kernel void TestKernel(   __global unsigned int* oBuffer,  \n"
"       const __global unsigned int* iRgbaImage, \n"
"       const int iImageWidth      \n"
")                  \n"
"{                  \n"
" int gx = get_global_id(0);           \n"
"    int gy = get_global_id(1);           \n"
"                  \n"
" unsigned int Input;             \n"
" unsigned int result;            \n"
" int pos;               \n"
"                  \n"
" pos  = gy * iImageWidth + gx; // ReadAddress     \n"
" Input = iRgbaImage[pos];   // TargetData     \n"
"                  \n"
" result = Input & 0xff;            \n"
" if ( ((Input>>8)&0xff) != 0){          \n"
"   result = ((Input>>8)&0xff) + (Input&0xff);      \n"
" }                 \n"
"                  \n"
" oBuffer[pos] = result;            \n"
"}                  \n";

// Init Resource
void Init(void) 
 cl_uint numPlatforms;
 cl_int ret;

 // CreateOpenCL
    ret = clGetPlatformIDs(0, NULL, &numPlatforms);
 cl_platform_id *platform_id = new cl_platform_id[numPlatforms];
 cl_device_id device_id;
    ret = clGetPlatformIDs(numPlatforms, platform_id, NULL);

 static const int INFO_MAX = 128;
    char device_name[INFO_MAX];

 for(cl_uint i = 0; i < numPlatforms; ++i){
  clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
  clGetDeviceInfo( device_id, CL_DEVICE_NAME, INFO_MAX, device_name, NULL);
  if(NULL != strstr(device_name, DRIVER_NAME)){
   g_DeviceID = device_id;
 if(g_DeviceID == 0){
  printf("no match driver");
 g_Context  = clCreateContext( NULL, 1, &g_DeviceID, NULL, NULL, NULL);
 g_CommandQueue = clCreateCommandQueue(g_Context, g_DeviceID, 0, NULL);
 delete[] platform_id;

 size_t source_size = strlen(kernel_code);
 clProgram = clCreateProgramWithSource(g_Context, 1, (const char **)&kernel_code, (const size_t *)&source_size, &ret);
 ret   = clBuildProgram(clProgram, 1, &g_DeviceID, NULL, NULL, NULL);
 clKernel = clCreateKernel(clProgram, "TestKernel", &ret);

 // Create Buffer
 g_InputBinBuf = clCreateBuffer(g_Context, CL_MEM_READ_WRITE, DATA_SIZE, NULL, &ret);
 g_OutputBuf  = clCreateBuffer(g_Context, CL_MEM_READ_WRITE, DATA_SIZE, NULL, &ret);

// Call Kernel
void NDRangeKernel(const UINT32* image, size_t* local_work_size)
 // Copy InputBin To g_InputBinBuf
 clEnqueueWriteBuffer(g_CommandQueue, g_InputBinBuf, CL_TRUE, 0, DATA_SIZE, image, 0, NULL, NULL);

 // Kernl
 int data_width = INPUT_BIN_WIDTH;
 clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void*)&g_OutputBuf); // output
 clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void*)&g_InputBinBuf); // input
 clSetKernelArg(clKernel, 2, sizeof(int), (void *)&data_width);

 size_t global_item_size[] = {INPUT_BIN_WIDTH, INPUT_BIN_HEIGHT};
 clEnqueueNDRangeKernel( g_CommandQueue,  // 
       clKernel,   // 
       2,     // dim
       NULL,    // global_work_offset
       0, NULL, NULL

// Clear Resource
void End(void) 



// main
int _tmain(int argc, _TCHAR* argv[])
 // Init Resource

 // Test Main
 UINT32 ResultBuf[2][DATA_NUM];

 // NDRangeKernel LocalWork=NULL
 NDRangeKernel(InputData, NULL);
 clEnqueueReadBuffer(g_CommandQueue, g_OutputBuf, CL_TRUE, 0, DATA_SIZE, ResultBuf[0], 0, NULL, NULL);

#if 0 // another error case
 NDRangeKernel(InputData, NULL);
 size_t full_item_size[] = {INPUT_BIN_WIDTH, INPUT_BIN_HEIGHT};
 NDRangeKernel(InputData, full_item_size);
 clEnqueueReadBuffer(g_CommandQueue, g_OutputBuf, CL_TRUE, 0, DATA_SIZE, ResultBuf[0], 0, NULL, NULL);

 // NDRangeKernel LocalWork={1,1}
 size_t local_item_size[] = {1,1};
 NDRangeKernel(InputData, local_item_size);
 clEnqueueReadBuffer(g_CommandQueue, g_OutputBuf, CL_TRUE, 0, DATA_SIZE, ResultBuf[1], 0, NULL, NULL);

 // Print Result
 for(int i = 0; i < INPUT_BIN_HEIGHT; i++){
  for(int j = 0; j < INPUT_BIN_WIDTH; j++) printf("%d ",ResultBuf[0][i*INPUT_BIN_WIDTH + j]);

 for(int i = 0; i < INPUT_BIN_HEIGHT; i++){
  for(int j = 0; j < INPUT_BIN_WIDTH; j++) printf("%d ",ResultBuf[1][i*INPUT_BIN_WIDTH + j]);

 // Clear Resource

 return 0;



0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0

0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0

