Intel® C++ Compiler
Community support and assistance for creating C++ code that runs on platforms based on Intel® processors.

FATAL ERROR: could not create surface....why?

Ben_Rush
Beginner
530 Views

I'm just starting off coding for the GPGPU using the Intel compiler, so there's probably a lesson to be learned here. But can someone look at this code and tell me why I get this output when running it? Please note that the function isn't called (in this code example); I've commented it out and am executing the code within the cilk_for loop instead. 

Here is the code: 
 

// IntelTestBed2.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"
#include<gfx/gfx_rt.h>
#include "cilk/cilk.h"
#include <iostream>
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>

// One pixel: three unsigned char colour channels laid out as red, green, blue.
struct rgb
{
    unsigned char red, green, blue;
};

// Writes a sepia-style copy of one pixel.
//
// originalImage: pixel whose channels are read.
// modifiedImage: pixel whose channels are overwritten.
//
// Each output channel is a weighted sum of the three input channels, capped
// at 255 before it is stored into the unsigned char channel.
// Declared with target(gfx) so the compiler also builds it for gfx offload.
__declspec (target(gfx))
void process_image(rgb &originalImage, rgb &modifiedImage) {

    // The input channels are deliberately re-read for every output channel,
    // in the same order as before, so the result does not change even when
    // both references name the same pixel.
    float sepiaRed = 0.393f * originalImage.red + 0.769f * originalImage.green + 0.189f * originalImage.blue;
    if (sepiaRed > 255.f) {
        sepiaRed = 255.f;
    }
    modifiedImage.red = sepiaRed;

    float sepiaGreen = 0.349f * originalImage.red + 0.686f * originalImage.green + 0.168f * originalImage.blue;
    if (sepiaGreen > 255.f) {
        sepiaGreen = 255.f;
    }
    modifiedImage.green = sepiaGreen;

    float sepiaBlue = 0.272f * originalImage.red + 0.534f * originalImage.green + 0.131f * originalImage.blue;
    if (sepiaBlue > 255.f) {
        sepiaBlue = 255.f;
    }
    modifiedImage.blue = sepiaBlue;
}

// Entry point: allocates a 320x240 source and destination image, then runs
// the per-pixel sepia-style conversion inside an offloaded cilk_for loop.
int main()
{
    rgb* originalImage = new rgb[320 * 240]; 
    rgb* modifiedImage = new rgb[320 * 240]; 

    // NOTE(review): this pragma is kept exactly as posted because it is what
    // reproduces "could not create surface". Per the follow-up in this thread,
    // the length should not be multiplied by sizeof(rgb).
#pragma offload target(gfx) pin(originalImage,modifiedImage:length(320*240*sizeof(rgb)))
    cilk_for(int c = 0; c < 320 * 240; c++)
    {
        //process_image(originalImage[c], modifiedImage[c]); 

        // Both buffers are pointers, so each iteration must address its own
        // pixel with [c]; member access on the bare pointer does not compile.
        float temp;
        temp = (0.393f * originalImage[c].red) + (0.769f * originalImage[c].green) + (0.189f * originalImage[c].blue);
        modifiedImage[c].red = (temp > 255.f) ? 255.f : temp;
        temp = (0.349f * originalImage[c].red) + (0.686f * originalImage[c].green) + (0.168f * originalImage[c].blue);
        modifiedImage[c].green = (temp > 255.f) ? 255.f : temp;
        temp = (0.272f * originalImage[c].red) + (0.534f * originalImage[c].green) + (0.131f * originalImage[c].blue);
        modifiedImage[c].blue = (temp > 255.f) ? 255.f : temp;
    }

    // Blocks until a character is read from stdin (presumably to keep the
    // console window open so the log can be read).
    getchar(); 

    return 0;
}



Here is the output (note the error is bold below):

GFX(10:39:45): Linked with dynamic gfx-runtime library
GFX(10:39:45): Library loaded C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2016\windows\redist\intel64\compiler\libgfxoffload.dll
GFX(10:39:45): Library version 13.0.0.0
GFX(10:39:45): Found OS: Windows 8
GFX(10:39:45):
GFX(10:39:45): Environment vars:
GFX(10:39:45):      GFX_SLM_LIMIT                  = 65536
GFX(10:39:45):      GFX_L3_CONFIG                  = -2
GFX(10:39:45):      GFX_NOTIFY_DEBUGGER            = 0
GFX(10:39:45):      GFX_THREAD_SPACE_HEIGHT        = 0
GFX(10:39:45):      GFX_THREAD_SPACE_WIDTH         = 0
GFX(10:39:45):      GFX_THREAD_GROUP_HEIGHT        = 8
GFX(10:39:45):      GFX_THREAD_GROUP_WIDTH         = 1
GFX(10:39:45):      GFX_STACK_SIZE                 = 33554432
GFX(10:39:45):      GFX_SPECIAL_EXIT               = 0
GFX(10:39:45):      GFX_DUMP_CISA                  = 0
GFX(10:39:45):      GFX_DUMP_RESOURCE              = 0
GFX(10:39:45):      GFX_OFFLOAD_DISABLE            = 0
GFX(10:39:45):      GFX_SIM_MODE                   = 0
GFX(10:39:45):      GFX_DX_MODE                    = UNDEFINED
GFX(10:39:45):      GFX_PROG_OPTIONS               =
GFX(10:39:45):      GFX_UNION_PTRS                 = 8
GFX(10:39:45):      GFX_ALIGN_PINS                 = 1
GFX(10:39:45):    * GFX_DEBUG                      = 1
GFX(10:39:45):      GFX_CPU_BACKUP                 = 2
GFX(10:39:45):      GFX_USE_MEDIA_WALKER           = 0
GFX(10:39:45):      GFX_CACHE_KERNELS              = 0
GFX(10:39:45):      GFX_LOOP_MAPPING               = 2
GFX(10:39:45):      GFX_OFFLOAD_TIMEOUT_FUNC       = -1
GFX(10:39:45):      GFX_OFFLOAD_TIMEOUT            = 60
GFX(10:39:45):      GFX_USE_BUFFER_UP              = 1
GFX(10:39:45):    * GFX_SHOW_TIME                  = 1
GFX(10:39:45):    * GFX_PRINT_DIAG                 = 1
GFX(10:39:45):      GFX_MAX_THREAD_COUNT           = -1
GFX(10:39:45):    * GFX_LOG_OFFLOAD                = 1
GFX(10:39:45): Library loaded C:\WINDOWS\SYSTEM32\igfx11cmrt64.dll
GFX(10:39:46): Library version CmRT(5.0.0.1133), CmJIT(5.0.0.1133)
GFX(10:39:46): Selected DirectX mode: DX11
GFX(10:39:46): Simulator mode: OFF
GFX(10:39:46): TDR info: Graphics timeout set to default (2 seconds)
GFX(10:39:46):
GFX(10:39:46): Device capabilites:
GFX(10:39:46):     CAP_KERNEL_COUNT_PER_TASK                = 16
GFX(10:39:46):     CAP_KERNEL_BINARY_SIZE                   = 65536
GFX(10:39:46):     CAP_SAMPLER_COUNT                        = 64
GFX(10:39:46):     CAP_SAMPLER_COUNT_PER_KERNEL             = 16
GFX(10:39:46):     CAP_BUFFER_COUNT                         = 256
GFX(10:39:46):     CAP_SURFACE2D_COUNT                      = 256
GFX(10:39:46):     CAP_SURFACE3D_COUNT                      = 64
GFX(10:39:46):     CAP_SURFACE_COUNT_PER_KERNEL             = 255
GFX(10:39:46):     CAP_ARG_COUNT_PER_KERNEL                 = 255
GFX(10:39:46):     CAP_ARG_SIZE_PER_KERNEL                  = 2016
GFX(10:39:46):     CAP_USER_DEFINED_THREAD_COUNT_PER_TASK   = 262144
GFX(10:39:46):     CAP_HW_THREAD_COUNT                      = 161
GFX(10:39:46):     CAP_SURFACE2D_FORMAT_COUNT               = 23
GFX(10:39:46):     CAP_SURFACE3D_FORMAT_COUNT               = 2
GFX(10:39:46):     CAP_VME_STATE_G6_COUNT                   = 8
GFX(10:39:46):     CAP_GPU_PLATFORM                         = <future platform1>
GFX(10:39:46):     CAP_GT_PLATFORM                          = GT2
GFX(10:39:46):     CAP_MIN_FREQUENCY                        = 350
GFX(10:39:46):     CAP_MAX_FREQUENCY                        = 1200
GFX(10:39:46):     CAP_GPU_CURRENT_FREQUENCY                = 0
GFX(10:39:46): The number of available hardware threads: 161
GFX(10:39:46):
GFX(10:39:46): GFX image loaded from executable: resource name 2500414GFX6.gfx
GFX(10:39:46): Program created
GFX(10:39:46):     GFX image size: 36232
GFX(10:39:46):     CISA count: 1
GFX(10:39:46):
GFX(10:39:46): CISA #0:
GFX(10:39:46):     version 3.1
GFX(10:39:46):     num_kernels 1
GFX(10:39:46):         kernel #0: 'L_main_IntelTestBed2_cpp_35_35__par_region0_2'
GFX(10:39:46):         num_binaries 0
GFX(10:39:46):     num_functions 1
GFX(10:39:46):         function #0: '_process_image__YAXAEAUrgb__0_Z'
GFX(10:39:46):
GFX(10:39:46): LoadProgram (cisa #0)...
GFX(10:39:46):     LoadProgram (cisa #0)...success
GFX(10:39:46): WARNING: surface creation failed

GFX(10:39:46): FATAL ERROR: could not create surface

GFX performance timers with non-zero value (milliseconds, activation counter):
                   Offload Total = 13.83, 1
                 Device Creation = 36.70, 1
                 Kernel Creation = 10.82, 1
                 Buffer Creation = 0.06, 1
       Iteration Space Splitting = 0.01, 1
                  Argument Setup = 0.01, 1
                     ELF Parsing = 2.02, 1
                 Program Loading = 11.80, 1
Press any key to continue . . .

0 Kudos
6 Replies
Ben_Rush
Beginner
530 Views

...it seems like I'm in the habit of answering my own questions these days. 

The issue was the pragma: #pragma offload target(gfx) pin(originalImage,modifiedImage:length(320*240*sizeof(rgb)))

I didn't need to multiply it by the size of the rgb structure. That was allocating too much memory to be pinned effectively, it seems. Sometimes it worked, others it didn't (why it worked sometimes and not others I do not know). 

 

0 Kudos
Kittur_G_Intel
Employee
530 Views

Hi Ben!
I just got to read this issue and I do agree that the issue is related to the pragma you noted. I'll touch base with our gen offload dev expert and get back to you on why it didn't work sometimes and see if memory is a factor here. Appreciate your patience till then.

Kittur

0 Kudos
Anoop_M_Intel
Employee
530 Views

I can reproduce this problem. I see that when the surface size is less than 128KB, the surface creation is always successful but once it exceeds 128KB, the program works sometimes. Could you please execute the gfx_sys_check.exe and share the output with us. This will give us more information on processor and driver versions installed on your machine. We have increased the surface size with the latest drivers. Will post more details on this soon.

Thanks and Regards
Anoop

0 Kudos
Ben_Rush
Beginner
530 Views

Anoop Madhusoodhanan Prabha (Intel) wrote:

Could you please execute the gfx_sys_check.exe and share the output with us.

Certainly, 

C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2016.2.180\windows\bin\intel64>gfx_sys_check.exe

Checking CPU

Checking OS
 Windows 8 x64

Checking display
 device:
       Provider: Intel Corporation
    Description: Intel(R) HD Graphics 530
        Version: 20.19.15.4380
 device:
       Provider: NVIDIA
    Description: NVIDIA GeForce GT 420
        Version: 10.18.13.5582

Checking Intel HD Graphics Driver
  RT Dll version: (5.0.0.1133)
 JIT Dll version: (5.0.0.1133)
 GPU architecture: skylake
     vISA support: visa3.2

C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2016.2.180\windows\bin\intel64>

0 Kudos
Kittur_G_Intel
Employee
530 Views

Thanks Ben for the output and Anoop will investigate further and get back to you accordingly - appreciate much.

Kittur

0 Kudos
Zhongqi_Zhang
Novice
530 Views

Anoop Madhusoodhanan Prabha (Intel) wrote:

I can reproduce this problem. I see that when the surface size is less than 128KB, the surface creation is always successful but once it exceeds 128KB, the program works sometimes. Could you please execute the gfx_sys_check.exe and share the output with us. This will give us more information on processor and driver versions installed on your machine. We have increased the surface size with the latest drivers. Will post more details on this soon.

Thanks and Regards
Anoop

Dear Anoop,

I have run into this problem in recent days. What I need is to pin an image of about 520K in size.

The main code block looks like below:

// Number of floats in the buffer; parenthesised so the macro expands safely
// inside larger expressions.
#define AVXLENGTH (190*88*8)
    float* fKernel = NULL;
    fKernel = (float*)malloc(AVXLENGTH * 4);
    for (int i = 0; i < AVXLENGTH; ++i)
    {
        // Fill each element; assigning to the bare pointer would not compile.
        fKernel[i] = rand() / 1000.0f;
    }
    __m256 fv1;

#pragma offload target(gfx) if (true)\
    pin(fKernel:length(AVXLENGTH*4)) /// about 520K
    _Cilk_for(int i = 0; i < AVXLENGTH; i += 8)
    {
        _Cilk_for(int j = 0; j < 4096; ++j)
        {
            fv1 = _mm256_load_ps(fKernel + i);
            fv1 = _mm256_exp_ps(fv1);
            /// do sth...
        }
    }

This block works well on Intel(R) HD Graphics 4600; however, when I moved it to a computer with Intel(R) HD Graphics P530, a fatal error occurred: "FATAL ERROR: could not create surface", the same one Ben ran into. When I reduce the pointer length to a very small number, say 10K, it works the same on both platforms.

I am wondering whether the pointer fKernel is too large to pin. But 128K is not enough for my algorithm, and I have no idea why the same code behaves so differently on these two HD Graphics devices. Is there any way to pin 520K of data at one time?

Thanks and Regards

Zhongqi.Zhang

0 Kudos
Reply