Intel® Distribution of OpenVINO™ Toolkit
Community assistance about the Intel® Distribution of OpenVINO™ toolkit, OpenCV, and all aspects of computer vision-related on Intel® platforms.

Scale Image using Openvx target_vx GPU

Prasad__Nagendra
Beginner
671 Views

Hi,

I am trying to scale images with OpenVX. With the target of every node (target_vx) set to the GPU, the program uses only 9% GPU but a much higher 64% CPU on an Intel Core M3 stick.

I created a BGR file of 3171 frames at 1280x720 resolution and am trying to scale it down to 640x360 resolution.

Reading the BGR file takes 24% CPU, but the image scaling itself still consumes 40% (64 - 24) CPU, even though I set the node target to GPU.

Can we reduce the CPU usage?

here is my code,

#include <stdio.h>
#include <time.h>

#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <vector>

#include <opencv2/opencv.hpp>
#include <VX/vx.h>
#include <VX/vx_intel_volatile.h>

// Evaluates COMMAND (an expression yielding a vx_status) exactly once.
// If the result is anything other than VX_SUCCESS, prints the status code,
// the stringified expression, the file and the line to std::cerr and then
// terminates the process with std::exit(1).
//
// The body is wrapped in do { ... } while(0) so that the expansion is a single
// statement: `if (x) CHECK_VX_STATUS(y); else ...` now parses as expected.
// Every use must therefore be followed by a semicolon.
// The local is not named with a leading double underscore because such
// identifiers are reserved for the implementation.
#define CHECK_VX_STATUS(COMMAND)                                \
    do                                                          \
    {                                                           \
        const vx_status checkVxStatus_ = (COMMAND);             \
        if(checkVxStatus_ != VX_SUCCESS)                        \
        {                                                       \
            std::cerr                                           \
                << "[ ERROR ] VX API call failed with "         \
                << checkVxStatus_ << "\n"                       \
                << "    expression: " << #COMMAND << "\n"       \
                << "    file:       " << __FILE__ << "\n"       \
                << "    line:       " << __LINE__ << "\n";      \
            std::exit(1);                                       \
        }                                                       \
    } while(0)
// OpenVX log callback (registered through vxRegisterLogCallback in main).
// Writes the message text supplied by the runtime to stdout behind an
// "[ ERROR ]" prefix. The context, reference and status arguments are not used.
inline void VX_CALLBACK errorReceiver (vx_context context, vx_reference ref, vx_status statux, const vx_char string[])
{
    (void)context;
    (void)ref;
    (void)statux;

    fprintf(stdout, "[ ERROR ] OpenVX error callback: %s\n", string);
}

// Returns the current CLOCK_MONOTONIC reading converted to microseconds.
//
// The seconds field is widened to uint64_t BEFORE it is multiplied; doing the
// multiplication in time_t width overflows on platforms where time_t is
// 32 bits once the clock passes roughly 2147 seconds.
// Returns 0 if clock_gettime() fails, instead of reading an uninitialised
// timespec.
uint64_t sr_clock_gettime_microseconds()
{
    struct timespec ts;
    if(clock_gettime(CLOCK_MONOTONIC, &ts) != 0)
    {
        return 0;
    }

    const uint64_t wholeSecondsUs = static_cast<uint64_t>(ts.tv_sec) * 1000000u;
    const uint64_t fractionUs = static_cast<uint64_t>(ts.tv_nsec) / 1000u;
    return wholeSecondsUs + fractionUs;
}

// Forward declaration; the definition is further down in this file.
// Wraps the caller-supplied pixel buffer `buf` in an OpenVX image of
// `width` x `height` pixels with format `color` (via vxCreateImageFromHandle).
vx_image createVxImage(vx_context context,
                       uint32_t width,
                       uint32_t height,
                       vx_df_image_e color,
                       uint8_t *buf);


// Forward declaration; the definition is further down in this file.
// Builds the OpenVX graph that splits in_img into three U8 planes, scales each
// plane to w/2 x h/2 with bilinear interpolation and combines them into
// out_img. Every node is assigned to target_vx and appended to vxNodes.
vx_graph createGraphBGR(vx_context  vxContext,
                     vx_image in_img,
                     vx_image out_img,
                     vx_enum target_vx,
                     int w, int h, std::vector<vx_node> &vxNodes);

int main(int argc, char *argv[])
{
    printf("main\n");

    if(argc != 5)
    {
        std::cerr<<"./test <input-file> <output-file> <w> <h>"<<std::endl;
        return -1;
    }

    const char *input_file=argv[1];
    const char *output_file=argv[2];
    int W=atoi(argv[3]);
    int H=atoi(argv[4]);

    printf("input_file=%s\n", input_file);
    printf("output_file=%s\n", output_file);
    printf("W=%d\n", W);
    printf("H=%d\n", H);

    FILE *inFile = fopen(input_file, "rb");
    if(inFile == NULL)
    {
        printf("inFile cant be opened\n");
        return -1;
    }

    FILE *outFile = fopen(output_file, "wb");
    if(outFile == NULL)
    {
        printf("outFile cant be opened\n");
        return -1;
    }

    fseek(inFile, 0, SEEK_END);
    size_t size = ftell(inFile);
    fseek(inFile, 0, SEEK_SET);
    if(size == 0)
    {
        fclose(inFile);
        return 0;
    }

    size_t in_buffer_size;
    size_t out_buffer_size;
    in_buffer_size = W*H*3;
    out_buffer_size = W/2*H/2*3;

    printf("in_buffer_size=%lu\n", in_buffer_size);
    printf("out_buffer_size=%lu\n", out_buffer_size);
    uint8_t **in_buffers = new uint8_t*[2];
    in_buffers[0] = new uint8_t[in_buffer_size];
    in_buffers[1] = new uint8_t[in_buffer_size];
    uint8_t **out_buffers = new uint8_t*[2];
    out_buffers[0] = new uint8_t[out_buffer_size];
    out_buffers[1] = new uint8_t[out_buffer_size];

    int in_bufferIndx = 0;
    int out_bufferIndx = 0;

    uint64_t numOfFrames=0;
    size_t readBytes=0;
    while(readBytes < size)
    {
        int ret = fread(in_buffers[in_bufferIndx], 1, in_buffer_size, inFile);
        if(ret <= 0)
        {
            return -1;
        }
        readBytes += in_buffer_size;
        break;
    }

    vx_context m_vxContext = vxCreateContext();
    vxRegisterLogCallback(m_vxContext, errorReceiver, vx_true_e);

    vx_image in_img = createVxImage(m_vxContext, W, H, VX_DF_IMAGE_RGB, in_buffers[in_bufferIndx]);
    vx_image out_img = createVxImage(m_vxContext, W/2, H/2, VX_DF_IMAGE_RGB, out_buffers[out_bufferIndx]);
    vx_enum target_vx = VX_TARGET_GPU_INTEL;


    std::vector<vx_node> vxNodes;
    vx_graph vxGraph= createGraphBGR(m_vxContext, in_img, out_img, target_vx, W, H, vxNodes);
    vx_status status_verifyGraph = vxVerifyGraph(vxGraph);
    if(status_verifyGraph != VX_SUCCESS) {
        printf("graph verify failed %d \n", status_verifyGraph);
        return -1;
    }

    uint64_t t1 = sr_clock_gettime_microseconds();

    while(readBytes < size)
    {
        if(in_bufferIndx==0) {
            in_bufferIndx=1;
        } else{
            in_bufferIndx=0;
        }
        if(out_bufferIndx==0) {
            out_bufferIndx=1;
        } else{
            out_bufferIndx=0;
        }
        int ret = fread(in_buffers[in_bufferIndx], 1, in_buffer_size, inFile);
        if(ret <= 0)
        {
            break;
        }
        readBytes += in_buffer_size;
        numOfFrames++;

        void *prev_videoFrameBuf;
        vx_status status_swapImg = vxSwapImageHandle(in_img, (void**)&in_buffers[in_bufferIndx], &prev_videoFrameBuf, 1);
        CHECK_VX_STATUS(status_swapImg);

        vxProcessGraph(vxGraph);

        uint8_t *out_buf;
        status_swapImg = vxSwapImageHandle(out_img, (void **) &out_buffers[out_bufferIndx], (void **) &out_buf, 1);
        CHECK_VX_STATUS(status_swapImg);

        fwrite(out_buf, out_buffer_size, 1, outFile);

    }
    uint64_t t2 = sr_clock_gettime_microseconds();

    fclose(inFile);
    fclose(outFile);
    printf("done\n");
    printf("numOfFrames=%llu\n", numOfFrames);
    printf("t2-t1=%llu micro secs\n", t2-t1);

    for (int i = 0; i < vxNodes.size(); i++)
    CHECK_VX_STATUS(vxReleaseNode(&vxNodes[i]));

    CHECK_VX_STATUS(vxReleaseGraph(&vxGraph));

    CHECK_VX_STATUS(vxReleaseContext(&m_vxContext));

    vxNodes.clear();

    return 0;
}

// Creates an OpenVX image that wraps the caller-owned pixel buffer `buf`
// (no copy is made here; the buffer is handed to vxCreateImageFromHandle as
// VX_MEMORY_TYPE_HOST memory).
//
// context  OpenVX context the image is created in.
// width    image width in pixels.
// height   image height in pixels.
// color    pixel format; supported values are VX_DF_IMAGE_YUYV,
//          VX_DF_IMAGE_UYVY, VX_DF_IMAGE_RGB, VX_DF_IMAGE_RGBX and
//          VX_DF_IMAGE_U8.
// buf      start of the pixel data, laid out as tightly packed rows
//          (row stride = width * bytes per pixel).
//
// Returns the new image. Throws a `const char *` message for an unsupported
// format. If OpenVX reports an error for the created image, CHECK_VX_STATUS
// terminates the process.
vx_image createVxImage(vx_context context,
                       uint32_t width,
                       uint32_t height,
                       vx_df_image_e color,
                       uint8_t *buf)
{
    uint32_t bytesPerPixel = 0;
    switch(color)
    {
        case VX_DF_IMAGE_YUYV:
        case VX_DF_IMAGE_UYVY:
            bytesPerPixel = 2;
            break;
        case VX_DF_IMAGE_RGB:
            bytesPerPixel = 3;
            break;
        case VX_DF_IMAGE_RGBX:
            bytesPerPixel = 4;
            break;
        case VX_DF_IMAGE_U8:
            bytesPerPixel = 1;
            break;
        default:
            printf("invalid img format\n");
            // The previous `"invalid img format " + color` was pointer
            // arithmetic on the literal (undefined behaviour), not string
            // concatenation. The thrown type stays `const char *`.
            throw "invalid img format";
    }

    // Describe the memory layout of `buf` for vxCreateImageFromHandle.
    // Zero-initialise first so no field of the struct is left indeterminate.
    vx_imagepatch_addressing_t frameFormat = {};
    frameFormat.dim_x = width;
    frameFormat.dim_y = height;
    frameFormat.stride_x = bytesPerPixel;          // bytes from one pixel to the next
    frameFormat.stride_y = width * bytesPerPixel;  // bytes from one row to the next
    frameFormat.scale_x = VX_SCALE_UNITY;
    frameFormat.scale_y = VX_SCALE_UNITY;
    frameFormat.step_x = 1;
    frameFormat.step_y = 1;

    // Single-plane image: the plane pointer array has exactly one entry.
    void *planes[1] = { buf };

    vx_image vxImg = vxCreateImageFromHandle(
            context,
            color,
            &frameFormat,
            planes,
            VX_MEMORY_TYPE_HOST
    );

    CHECK_VX_STATUS(vxGetStatus((vx_reference)vxImg));
    return vxImg;
}

vx_graph createGraphBGR(vx_context  vxContext,
                     vx_image in_img,
                     vx_image out_img,
                     vx_enum target_vx,
                     int w, int h, std::vector<vx_node> &vxNodes)
{


    vx_graph graph = vxCreateGraph(vxContext);
    CHECK_VX_STATUS(vxGetStatus((vx_reference)graph));
    std::vector<vx_image> vxImages;

    vx_image vxChannelBImg = vxCreateVirtualImage(graph, w,h, VX_DF_IMAGE_U8);
    CHECK_VX_STATUS(vxGetStatus((vx_reference)vxChannelBImg));
    vxImages.push_back(vxChannelBImg);

    vx_image vxChannelGImg = vxCreateVirtualImage(graph, w,h, VX_DF_IMAGE_U8);
    CHECK_VX_STATUS(vxGetStatus((vx_reference)vxChannelGImg));
    vxImages.push_back(vxChannelGImg);

    vx_image vxChannelRImg = vxCreateVirtualImage(graph, w,h, VX_DF_IMAGE_U8);
    CHECK_VX_STATUS(vxGetStatus((vx_reference)vxChannelRImg));
    vxImages.push_back(vxChannelRImg);

    vxNodes.push_back(vxChannelExtractNode(graph, in_img, VX_CHANNEL_B, vxChannelBImg));
    CHECK_VX_STATUS(vxGetStatus((vx_reference)vxNodes.back()));
    CHECK_VX_STATUS(vxSetNodeTarget(vxNodes.back(), target_vx, NULL));

    vxNodes.push_back(vxChannelExtractNode(graph, in_img, VX_CHANNEL_G, vxChannelGImg));
    CHECK_VX_STATUS(vxGetStatus((vx_reference)vxNodes.back()));
    CHECK_VX_STATUS(vxSetNodeTarget(vxNodes.back(), target_vx, NULL));

    vxNodes.push_back(vxChannelExtractNode(graph, in_img, VX_CHANNEL_R, vxChannelRImg));
    CHECK_VX_STATUS(vxGetStatus((vx_reference)vxNodes.back()));
    CHECK_VX_STATUS(vxSetNodeTarget(vxNodes.back(), target_vx, NULL));

    vx_image vxChannelB2Img = vxCreateVirtualImage(graph, w/2,h/2, VX_DF_IMAGE_U8);
    CHECK_VX_STATUS(vxGetStatus((vx_reference)vxChannelB2Img));
    vxImages.push_back(vxChannelB2Img);

    vx_image vxChannelG2Img = vxCreateVirtualImage(graph, w/2,h/2, VX_DF_IMAGE_U8);
    CHECK_VX_STATUS(vxGetStatus((vx_reference)vxChannelG2Img));
    vxImages.push_back(vxChannelG2Img);

    vx_image vxChannelR2Img = vxCreateVirtualImage(graph, w/2,h/2, VX_DF_IMAGE_U8);
    CHECK_VX_STATUS(vxGetStatus((vx_reference)vxChannelR2Img));
    vxImages.push_back(vxChannelR2Img);

    vxNodes.push_back(vxScaleImageNode(graph, vxChannelBImg, vxChannelB2Img, VX_INTERPOLATION_BILINEAR));
    CHECK_VX_STATUS(vxGetStatus((vx_reference)vxNodes.back()));
    CHECK_VX_STATUS(vxSetNodeTarget(vxNodes.back(), target_vx, NULL));
    vxNodes.push_back(vxScaleImageNode(graph, vxChannelGImg, vxChannelG2Img, VX_INTERPOLATION_BILINEAR));
    CHECK_VX_STATUS(vxGetStatus((vx_reference)vxNodes.back()));
    CHECK_VX_STATUS(vxSetNodeTarget(vxNodes.back(), target_vx, NULL));
    vxNodes.push_back(vxScaleImageNode(graph, vxChannelRImg, vxChannelR2Img, VX_INTERPOLATION_BILINEAR));
    CHECK_VX_STATUS(vxGetStatus((vx_reference)vxNodes.back()));
    CHECK_VX_STATUS(vxSetNodeTarget(vxNodes.back(), target_vx, NULL));

    vxNodes.push_back(vxChannelCombineNode(graph, vxChannelB2Img, vxChannelG2Img, vxChannelR2Img, NULL, out_img));
    CHECK_VX_STATUS(vxGetStatus((vx_reference)vxNodes.back()));
    CHECK_VX_STATUS(vxSetNodeTarget(vxNodes.back(), target_vx, NULL));

    for (int i = 0; i < vxImages.size(); i++)
    CHECK_VX_STATUS(vxReleaseImage(&vxImages[i]));

    return graph;
}

RUN:

./mytest 1280x720x9.bgr 1280x720x9_s.bgr 1280 720

OUTPUT:

main
input_file=1280x720x9.bgr
output_file=1280x720x9_s.bgr
W=1280
H=720
in_buffer_size=2764800
out_buffer_size=691200
done
numOfFrames=3171
t2-t1=90636292 micro secs

CPU usage 64%, GPU usage 9%

 

 

 

0 Kudos
4 Replies
Ryan_M_Intel1
Employee
671 Views

Hello,

Which OS & OpenCL version are you using for this test? Also, what are you using to measure the CPU & GPU utilization %?

One potential optimization is to make sure your input & output image buffer pointers are 4K-byte aligned. If they are not, there would be an implicit memcpy from the non-aligned buffers into mapped GPU buffers.

Regards,

Ryan 

0 Kudos
Prasad__Nagendra
Beginner
671 Views

Hi Ryan,

Thanks for your reply.

OS: Ubuntu 16.04.1 LTS

OpenCL version: opencl-1.2-sdk-6.3.0.1914

I am using the top command to measure CPU usage and the intel_gpu_top command to measure GPU usage.

I changed the buffer pointers to be 4K-byte aligned, but the CPU usage is still the same.

My understanding of OpenVX is this: when I set the target of all nodes to GPU, it should only use the CPU to copy the buffers to the GPU at the beginning and back to the CPU at the end. In my case that is an RGB input of size 1280x720x3 and an RGB output of size 640x360x3, so it should take 4 to 5 percent CPU at most.

is my understanding correct??

Thanks.

 

0 Kudos
Shubha_R_Intel
Employee
671 Views

 

Hi Nagendra. I'd like to point you to a few articles:

https://software.intel.com/en-us/openvino-ovx-guide-scheduling-individual-nodes-to-different-targets

I noticed in your code you are doing  

vx_enum target_vx = VX_TARGET_GPU_INTEL;

But the above article prefers to use  vxSetNodeTarget  followed by checking the status code. Perhaps setting the GPU target as you did  in your code failed and therefore didn't take effect.

More good articles for you to study:

https://software.intel.com/en-us/openvino-ovx-guide-striving-for-performance

https://software.intel.com/en-us/openvino-ovx-guide-openvx-performance-tips

Excerpt from 2nd article:

  • Provide enough parallel slack to the scheduler- do not break work (for example, images) into too many tiny pieces. Consider kernel fusion.

Also did you try vxScheduleGraph instead of vxProcessGraph ?

Finally, to narrow things down I'd try to get rid of unnecessary loops in the code which could definitely be CPU intensive.  For instance, instead of 

 while(readBytes < size)
    {
        int ret = fread(in_buffers[in_bufferIndx], 1, in_buffer_size, inFile);
        if(ret <= 0)
        {
            return -1;
        }
        readBytes += in_buffer_size;
        break;
    }

you can do something like this (reference https://stackoverflow.com/questions/238603/how-can-i-get-a-files-size-in-c )

fseek(fp, 0L, SEEK_END);
sz = ftell(fp);
You can then seek back, e.g.:

fseek(fp, 0L, SEEK_SET);
or (if seeking to go to the beginning)

rewind(fp);

Hope these tips help you. Thank you for using OpenVino !

Shubha

0 Kudos
Prasad__Nagendra
Beginner
671 Views

Hi Shubha,

Thanks for your reply.

I am using vxSetNodeTarget, as you can see in the code:

  vxNodes.push_back(vxChannelExtractNode(graph, in_img, VX_CHANNEL_R, vxChannelRImg));
  CHECK_VX_STATUS(vxGetStatus((vx_reference)vxNodes.back()));
  CHECK_VX_STATUS(vxSetNodeTarget(vxNodes.back(), target_vx, NULL));

I didn't try vxScheduleGraph. I will try it and check.

Thanks.

 

0 Kudos
Reply