OpenCL* for CPU
Ask questions and share information on Intel® SDK for OpenCL™ Applications and OpenCL™ implementations for Intel® CPU.
Announcements
This forum covers OpenCL* for CPU only. OpenCL* for GPU questions can be asked in the GPU Compute Software forum. Intel® FPGA SDK for OpenCL™ questions can be asked in the FPGA Intel® High Level Design forum.

Different output results after the driver update

tomohiro_m_
Beginner
512 Views

Hi,

I have been using the HD4600.
After upgrading the graphics driver,
the output of my OpenCL kernel changed.

Is there a way to avoid this?

(attached the resources)

· OpenCL SDK
Intel SDK for OpenCL Applications 2016

· GPU driver version
before: 10.18.10.3496 (2014/03/11)
after: 10.18.14.4264 (2015/08/04)

・clCreateContext()
clGetDeviceIDs() -
cl_device_type device_type = CL_DEVICE_TYPE_GPU; // (Intel(R) HD Graphics 4600)

・clBuildProgram()
const char *options = NULL

0 Kudos
4 Replies
Robert_I_Intel
Employee
512 Views

1. Have you tried https://downloadcenter.intel.com/download/25588/Intel-Graphics-Driver-for-Windows-7-8-1-15-36-  driver?

2. Could you please provide a complete example and steps to reproduce?

3. What did you get before? What are you getting now?

4. What are the sizes of your input/output buffers? What is the size of your enqueue? global/local sizes? Complete example would help a lot!

 

0 Kudos
Robert_I_Intel
Employee
512 Views

BTW, on the latest and greatest production driver (4380) on Windows 10, I am getting the results I expect from your kernel. The assembly for your kernel looks correct as well. So it could be that what you got previously is in error.

0 Kudos
tomohiro_m_
Beginner
512 Views

Dear Robert,
Thank you for your kind reply.

1. Have you tried https://downloadcenter.intel.com/download/25588/Intel-Graphics-Driver-for-Windows-7-8-1-15-36-  driver?
  I tried. But the results did not change

2. Could you please provide a complete example and steps to reproduce?
3. What did you get before? What are you getting now?
4. What are the sizes of your input/output buffers? What is the size of your enqueue? global/local sizes? Complete example would help a lot!
  Sorry
  Upload all of the data
  Please check the readme.txt

BTW, on the latest and greatest production driver (4380) on Windows 10, I am getting the results I expect from your kernel. The assembly for your kernel looks correct as well. So it could be that what you got previously is in error.
  Now I can not update to Windows10. However, there are plans to update

0 Kudos
tomohiro_m_
Beginner
512 Views

Hi,

Please let me ask my question again.
Below are a new test program and the results of running it.
The problem appears to depend on local_work_size.

OS:Windows7
GPU:Intel(R) HD Graphics 4600
Driver:10.18.14.4332
OpenCL SDK: 2016
OpenCL Code Builder:6.0.0.1049

#include "stdafx.h"
#include <Windows.h>

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include "CL/cl.h"
#endif

cl_device_id  g_DeviceID  = 0;
cl_context   g_Context  = 0;
cl_command_queue g_CommandQueue = 0;
cl_program   clProgram  = 0;
cl_kernel   clKernel  = 0;
cl_mem    g_InputBinBuf = 0;
cl_mem    g_OutputBuf  = 0;

const char* DRIVER_NAME = "Intel(R) HD Graphics 4600";

// Dimensions of the test image, in elements.
#define INPUT_BIN_WIDTH  (8)
#define INPUT_BIN_HEIGHT (4)
// Total number of elements in the test image.
#define DATA_NUM   (INPUT_BIN_WIDTH * INPUT_BIN_HEIGHT)
// Size of the test image in bytes. Derived from the element type of
// InputData instead of a hard-coded 4 so the two cannot drift apart.
#define DATA_SIZE  (DATA_NUM * sizeof(unsigned int))

// Test Data
// Row-major INPUT_BIN_WIDTH x INPUT_BIN_HEIGHT image. Every element is zero
// except the first element of the second row (0x00000102), for which the
// kernel's arithmetic yields 0x01 + 0x02 = 3.
unsigned int InputData[DATA_NUM] = {
    0,          0, 0, 0, 0, 0, 0, 0,
    0x00000102, 0, 0, 0, 0, 0, 0, 0,
    0,          0, 0, 0, 0, 0, 0, 0,
    0,          0, 0, 0, 0, 0, 0, 0
};

// Kernel Code
// OpenCL C source for TestKernel, passed to clCreateProgramWithSource() and
// compiled at run time by Init().
//
// Each work-item handles one element of a row-major image that is
// iImageWidth elements wide: it reads iRgbaImage[gy * iImageWidth + gx] and
// writes to the same index of oBuffer. The value written is the low byte of
// the input; when bits 8..15 of the input are non-zero, it is instead the sum
// of that byte and the low byte (e.g. 0x00000102 -> 0x01 + 0x02 = 3).
// The kernel does no bounds checking on the computed index.
const char* kernel_code = ""
"__kernel void TestKernel(   __global unsigned int* oBuffer,  \n"
"       const __global unsigned int* iRgbaImage, \n"
"       const int iImageWidth      \n"
")                  \n"
"{                  \n"
" int gx = get_global_id(0);           \n"
"    int gy = get_global_id(1);           \n"
"                  \n"
" unsigned int Input;             \n"
" unsigned int result;            \n"
" int pos;               \n"
"                  \n"
" pos  = gy * iImageWidth + gx; // ReadAddress     \n"
" Input = iRgbaImage[pos];   // TargetData     \n"
"                  \n"
" result = Input & 0xff;            \n"
" if ( ((Input>>8)&0xff) != 0){          \n"
"   result = ((Input>>8)&0xff) + (Input&0xff);      \n"
" }                 \n"
"                  \n"
" oBuffer[pos] = result;            \n"
"}                  \n";

//-----------------------------------
// Init Resource
//-----------------------------------
void Init(void) 
{
 cl_uint numPlatforms;
 cl_int ret;

 // CreateOpenCL
    ret = clGetPlatformIDs(0, NULL, &numPlatforms);
 cl_platform_id *platform_id = new cl_platform_id[numPlatforms];
 cl_device_id device_id;
    ret = clGetPlatformIDs(numPlatforms, platform_id, NULL);

 static const int INFO_MAX = 128;
    char device_name[INFO_MAX];

 for(cl_uint i = 0; i < numPlatforms; ++i){
  clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
  clGetDeviceInfo( device_id, CL_DEVICE_NAME, INFO_MAX, device_name, NULL);
  if(NULL != strstr(device_name, DRIVER_NAME)){
   g_DeviceID = device_id;
   break;
  }
 }
 if(g_DeviceID == 0){
  printf("no match driver");
  exit(0);
 }
 g_Context  = clCreateContext( NULL, 1, &g_DeviceID, NULL, NULL, NULL);
 g_CommandQueue = clCreateCommandQueue(g_Context, g_DeviceID, 0, NULL);
 delete[] platform_id;

 size_t source_size = strlen(kernel_code);
 clProgram = clCreateProgramWithSource(g_Context, 1, (const char **)&kernel_code, (const size_t *)&source_size, &ret);
 ret   = clBuildProgram(clProgram, 1, &g_DeviceID, NULL, NULL, NULL);
 clKernel = clCreateKernel(clProgram, "TestKernel", &ret);

 // Create Buffer
 g_InputBinBuf = clCreateBuffer(g_Context, CL_MEM_READ_WRITE, DATA_SIZE, NULL, &ret);
 g_OutputBuf  = clCreateBuffer(g_Context, CL_MEM_READ_WRITE, DATA_SIZE, NULL, &ret);
}

//-----------------------------------
// Call Kernel
//-----------------------------------
void NDRangeKernel(const UINT32* image, size_t* local_work_size)
{
 // Copy InputBin To g_InputBinBuf
 clEnqueueWriteBuffer(g_CommandQueue, g_InputBinBuf, CL_TRUE, 0, DATA_SIZE, image, 0, NULL, NULL);

 // Kernl
 int data_width = INPUT_BIN_WIDTH;
 clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void*)&g_OutputBuf); // output
 clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void*)&g_InputBinBuf); // input
 clSetKernelArg(clKernel, 2, sizeof(int), (void *)&data_width);

 size_t global_item_size[] = {INPUT_BIN_WIDTH, INPUT_BIN_HEIGHT};
 clEnqueueNDRangeKernel( g_CommandQueue,  // 
       clKernel,   // 
       2,     // dim
       NULL,    // global_work_offset
       global_item_size,
       local_work_size,
       0, NULL, NULL
      );
}

//-----------------------------------
// Clear Resource
//-----------------------------------
void End(void) 
{
 clReleaseMemObject(g_InputBinBuf);
 clReleaseMemObject(g_OutputBuf);

 clReleaseKernel(clKernel);
 clReleaseProgram(clProgram);

 clFlush(g_CommandQueue);
 clFinish(g_CommandQueue);
 clReleaseCommandQueue(g_CommandQueue);
 clReleaseContext(g_Context);
}

//================================================
// main
//================================================
int _tmain(int argc, _TCHAR* argv[])
{
 //-------------
 // Init Resource
 Init();

 //-------------
 // Test Main
 UINT32 ResultBuf[2][DATA_NUM];

 // NDRangeKernel LocalWork=NULL
 NDRangeKernel(InputData, NULL);
 clEnqueueReadBuffer(g_CommandQueue, g_OutputBuf, CL_TRUE, 0, DATA_SIZE, ResultBuf[0], 0, NULL, NULL);

#if 0 // another error case
 NDRangeKernel(InputData, NULL);
 size_t full_item_size[] = {INPUT_BIN_WIDTH, INPUT_BIN_HEIGHT};
 NDRangeKernel(InputData, full_item_size);
 clEnqueueReadBuffer(g_CommandQueue, g_OutputBuf, CL_TRUE, 0, DATA_SIZE, ResultBuf[0], 0, NULL, NULL);
#endif

 // NDRangeKernel LocalWork={1,1}
 size_t local_item_size[] = {1,1};
 NDRangeKernel(InputData, local_item_size);
 clEnqueueReadBuffer(g_CommandQueue, g_OutputBuf, CL_TRUE, 0, DATA_SIZE, ResultBuf[1], 0, NULL, NULL);

 //-------------
 // Print Result
 printf("LocalWork=NULL\n");
 for(int i = 0; i < INPUT_BIN_HEIGHT; i++){
  for(int j = 0; j < INPUT_BIN_WIDTH; j++) printf("%d ",ResultBuf[0][i*INPUT_BIN_WIDTH + j]);
  printf("\n");
 }
 printf("\n");

 printf("LocalWork={1,1}\n");
 for(int i = 0; i < INPUT_BIN_HEIGHT; i++){
  for(int j = 0; j < INPUT_BIN_WIDTH; j++) printf("%d ",ResultBuf[1][i*INPUT_BIN_WIDTH + j]);
  printf("\n");
 }
 printf("\n");

 //-------------
 // Clear Resource
 End();

 system("pause");
 return 0;
}

 

result

LocalWork=NULL
0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0

LocalWork={1,1}
0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0

0 Kudos
Reply