Hi
I have created an OpenVINO remote context using a DirectX device. Does compile_model support only NV12 texture tensors for a DirectX device? Compiling the model fails with an exception when I pass an RGBA 2D texture to it.
Any insights would be helpful. I am attaching my code below.
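For reference, the NV12 two-plane path described in the OpenVINO GPU RemoteTensor documentation is roughly the following (a sketch only; `nv12_texture`, `width`, and `height` stand in for an NV12 ID3D11Texture2D and its dimensions, while `core`, `model`, and `device` are as in my full code below):

```cpp
using namespace ov::preprocess;
auto p = PrePostProcessor(model);
p.input().tensor().set_element_type(ov::element::u8)
    .set_color_format(ColorFormat::NV12_TWO_PLANES, {"y", "uv"})
    .set_memory_type(ov::intel_gpu::memory_type::surface);
p.input().preprocess().convert_color(ColorFormat::BGR);
p.input().model().set_layout("NCHW");
model = p.build(); // the single NV12 input becomes two inputs (Y and UV planes)

auto context = ov::intel_gpu::ocl::D3DContext(core, device);
auto compiled = core.compile_model(model, context);
auto request = compiled.create_infer_request();

// Wrap the NV12 surface: create_tensor_nv12 returns a {Y, UV} tensor pair.
auto nv12 = context.create_tensor_nv12(height, width, nv12_texture);
request.set_tensor(model->input(0).get_any_name(), nv12.first);
request.set_tensor(model->input(1).get_any_name(), nv12.second);
request.infer();
```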
Thanks
"
#include <windows.h>
#include <d3d11.h>
#include <dxgi1_2.h>
#include <iostream>
#include <opencv2/opencv.hpp>
#include <openvino/openvino.hpp>
#include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
#include <openvino/runtime/intel_gpu/ocl/dx.hpp>
#include "WICTextureLoader.h"
#include <fstream>
#include "openvino/core/core_visibility.hpp"
#include "openvino/core/preprocess/input_info.hpp"
#include "openvino/core/preprocess/output_info.hpp"
#define file_name_t std::string
#define imread_t cv::imread
////////// For Model Output Inference - Post Processing /////////////////////////
#define NMS_THRESH 0.45
#define BBOX_CONF_THRESH 0.3
#define PADDING_VALUE 114
static const int INPUT_W = 416;
static const int INPUT_H = 416;
static const int NUM_CLASSES = 8; // 8 classes; modify this value for your own dataset.
struct Object
{
cv::Rect_<float> rect;
int label;
float prob;
};
struct GridAndStride
{
int grid0;
int grid1;
int stride;
};
static void generate_grids_and_stride(const int target_w, const int target_h,
std::vector<int>& strides, std::vector<GridAndStride>& grid_strides)
{
for (auto stride : strides)
{
int num_grid_w = target_w / stride;
int num_grid_h = target_h / stride;
for (int g1 = 0; g1 < num_grid_h; g1++)
{
for (int g0 = 0; g0 < num_grid_w; g0++)
{
grid_strides.push_back({ g0, g1, stride });
}
}
}
}
static void generate_yolox_proposals(std::vector<GridAndStride> grid_strides,
const float* feat_ptr, float prob_threshold, std::vector<Object>& objects)
{
const int num_anchors = grid_strides.size();
for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++)
{
const int grid0 = grid_strides[anchor_idx].grid0;
const int grid1 = grid_strides[anchor_idx].grid1;
const int stride = grid_strides[anchor_idx].stride;
const int basic_pos = anchor_idx * (NUM_CLASSES + 5);
// yolox/models/yolo_head.py decode logic
// outputs[..., :2] = (outputs[..., :2] + grids) * strides
// outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides
float x_center = (feat_ptr[basic_pos + 0] + grid0) * stride;
float y_center = (feat_ptr[basic_pos + 1] + grid1) * stride;
float w = exp(feat_ptr[basic_pos + 2]) * stride;
float h = exp(feat_ptr[basic_pos + 3]) * stride;
float x0 = x_center - w * 0.5f;
float y0 = y_center - h * 0.5f;
float box_objectness = feat_ptr[basic_pos + 4];
for (int class_idx = 0; class_idx < NUM_CLASSES; class_idx++)
{
float box_cls_score = feat_ptr[basic_pos + 5 + class_idx];
float box_prob = box_objectness * box_cls_score;
if (box_prob > prob_threshold)
{
Object obj;
obj.rect.x = x0;
obj.rect.y = y0;
obj.rect.width = w;
obj.rect.height = h;
obj.label = class_idx;
obj.prob = box_prob;
objects.push_back(obj);
}
} // class loop
} // point anchor loop
}
static inline float intersection_area(const Object& a, const Object& b)
{
cv::Rect_<float> inter = a.rect & b.rect;
return inter.area();
}
static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
{
int i = left;
int j = right;
float p = faceobjects[(left + right) / 2].prob;
while (i <= j)
{
while (faceobjects[i].prob > p)
i++;
while (faceobjects[j].prob < p)
j--;
if (i <= j)
{
// swap
std::swap(faceobjects[i], faceobjects[j]);
i++;
j--;
}
}
#pragma omp parallel sections
{
#pragma omp section
{
if (left < j) qsort_descent_inplace(faceobjects, left, j);
}
#pragma omp section
{
if (i < right) qsort_descent_inplace(faceobjects, i, right);
}
}
}
static void qsort_descent_inplace(std::vector<Object>& objects)
{
if (objects.empty())
return;
qsort_descent_inplace(objects, 0, objects.size() - 1);
}
static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold)
{
picked.clear();
const int n = faceobjects.size();
std::vector<float> areas(n);
for (int i = 0; i < n; i++)
{
areas[i] = faceobjects[i].rect.area();
}
for (int i = 0; i < n; i++)
{
const Object& a = faceobjects[i];
int keep = 1;
for (int j = 0; j < (int)picked.size(); j++)
{
const Object& b = faceobjects[picked[j]];
// intersection over union
float inter_area = intersection_area(a, b);
float union_area = areas[i] + areas[picked[j]] - inter_area;
// float IoU = inter_area / union_area
if (inter_area / union_area > nms_threshold)
keep = 0;
}
if (keep)
picked.push_back(i);
}
}
static void decode_outputs(const float* prob, std::vector<Object>& objects, float scale, const int img_w, const int img_h) {
std::vector<Object> proposals;
std::vector<int> strides = { 8, 16, 32 };
std::vector<GridAndStride> grid_strides;
generate_grids_and_stride(INPUT_W, INPUT_H, strides, grid_strides);
generate_yolox_proposals(grid_strides, prob, BBOX_CONF_THRESH,
proposals);
qsort_descent_inplace(proposals);
std::vector<int> picked;
nms_sorted_bboxes(proposals, picked, NMS_THRESH);
int count = picked.size();
std::cout << "picked size " << picked.size() << std::endl;
objects.resize(count);
for (int i = 0; i < count; i++)
{
objects[i] = proposals[picked[i]];
// adjust offset to original unpadded
float x0 = (objects[i].rect.x) / scale;
float y0 = (objects[i].rect.y) / scale;
float x1 = (objects[i].rect.x + objects[i].rect.width) / scale;
float y1 = (objects[i].rect.y + objects[i].rect.height) / scale;
// clip
x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
objects[i].rect.x = x0;
objects[i].rect.y = y0;
objects[i].rect.width = x1 - x0;
objects[i].rect.height = y1 - y0;
}
}
const float color_list[8][3] =
{
{0.000, 0.447, 0.741},
{0.850, 0.325, 0.098},
{0.929, 0.694, 0.125},
{0.494, 0.184, 0.556},
{0.466, 0.674, 0.188},
{0.301, 0.745, 0.933},
{0.635, 0.078, 0.184},
{0.300, 0.300, 0.300},
};
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
std::cout << "Inside draw_object" << std::endl;
static const char* class_names[] = {
"TBC_LOSS", "CRACK", "RUB", "DENT_NICK", "TEAR", "EROSION", "DEPOSIT", "OXIDATION"
};
cv::Mat image = bgr.clone();
cv::imshow("draw_object", image);
cv::waitKey(0);
std::cout << "obje size" << objects.size() << std::endl;
for (size_t i = 0; i < objects.size(); i++)
{
const Object& obj = objects[i];
fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
cv::Scalar color = cv::Scalar(color_list[obj.label][0], color_list[obj.label][1], color_list[obj.label][2]);
float c_mean = cv::mean(color)[0];
cv::Scalar txt_color;
if (c_mean > 0.5) {
txt_color = cv::Scalar(0, 0, 0);
}
else {
txt_color = cv::Scalar(255, 255, 255);
}
cv::rectangle(image, obj.rect, color * 255, 2);
char text[256];
sprintf_s(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
int baseLine = 0;
cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine);
cv::Scalar txt_bk_color = color * 0.7 * 255;
int x = obj.rect.x;
int y = obj.rect.y + 1;
//int y = obj.rect.y - label_size.height - baseLine;
if (y > image.rows)
y = image.rows;
//if (x + label_size.width > image.cols)
//x = image.cols - label_size.width;
cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
txt_bk_color, -1);
cv::putText(image, text, cv::Point(x, y + label_size.height),
cv::FONT_HERSHEY_SIMPLEX, 0.4, txt_color, 1);
}
cv::imwrite("_demo.jpg", image);
fprintf(stderr, "save vis file\n");
cv::imshow("image", image);
cv::waitKey(0);
}
////////// For Model Output Inference - Post Processing /////////////////////////
// Function to create a Direct3D 11 device with the specified adapter
ID3D11Device* CreateD3DDeviceWithAdapter(IDXGIAdapter* adapter)
{
D3D_FEATURE_LEVEL featureLevels[] = { D3D_FEATURE_LEVEL_11_0, D3D_FEATURE_LEVEL_10_1 };
ID3D11Device* device = nullptr;
ID3D11DeviceContext* context = nullptr;
HRESULT hr = D3D11CreateDevice(
adapter, // Specify the adapter (Intel GPU)
D3D_DRIVER_TYPE_UNKNOWN, // Driver type (unknown because adapter is specified)
NULL, // Software rasterizer module
0, // Device creation flags
featureLevels, // Feature levels array
ARRAYSIZE(featureLevels), // Number of feature levels in array
D3D11_SDK_VERSION, // SDK version
&device, // OUT: Device
NULL, // OUT: Selected feature level
&context // OUT: Device context
);
if (FAILED(hr)) {
std::cerr << "Failed to create Direct3D 11 device." << std::endl;
return nullptr;
}
if (context) context->Release(); // the device keeps its immediate context alive
return device;
}
IDXGIAdapter* SelectIntelGPU(IDXGIFactory* factory) {
IDXGIAdapter* intelAdapter = nullptr;
// Enumerate adapters (GPUs)
for (UINT i = 0; factory->EnumAdapters(i, &intelAdapter) != DXGI_ERROR_NOT_FOUND; ++i) {
DXGI_ADAPTER_DESC adapterDesc;
intelAdapter->GetDesc(&adapterDesc);
// Check for Intel GPU
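// (Exact string match on the adapter description; adapters with other Intel names will be skipped.)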
if (wcscmp(adapterDesc.Description, L"Intel(R) UHD Graphics") == 0) {
std::wcout << L"Found Intel GPU: " << adapterDesc.Description << std::endl;
return intelAdapter;
}
intelAdapter->Release();
intelAdapter = nullptr;
}
return nullptr; // Intel GPU not found
}
// Function to load a texture from a PNG or JPG file
HRESULT LoadTextureFromFile(ID3D11Device* device, ID3D11DeviceContext* context,const wchar_t* filePath,ID3D11Resource** texture, ID3D11ShaderResourceView** textureView)
{
HRESULT hr = CreateWICTextureFromFile(device, nullptr, DXGI_FORMAT_UNKNOWN, filePath, texture, textureView, 0);
if (FAILED(hr)) {
std::cerr << "Failed to load texture from file: " << std::hex << hr << std::endl;
}
return hr;
}
ID3D11Texture2D* ConvertResourceToTexture2D(ID3D11Resource* resource) {
if (!resource) {
return nullptr;
}
ID3D11Texture2D* texture2D = nullptr;
HRESULT hr = resource->QueryInterface(__uuidof(ID3D11Texture2D), (void**)&texture2D);
if (FAILED(hr)) {
std::cerr << "Failed to query ID3D11Texture2D interface from resource." << std::endl;
return nullptr;
}
return texture2D;
}
void PrintTexture2DProperties(ID3D11Texture2D* texture) {
if (!texture) {
std::cerr << "Invalid texture." << std::endl;
return;
}
D3D11_TEXTURE2D_DESC desc;
texture->GetDesc(&desc);
std::cout << "Texture2D Properties:" << std::endl;
std::cout << "Width: " << desc.Width << std::endl;
std::cout << "Height: " << desc.Height << std::endl;
std::cout << "MipLevels: " << desc.MipLevels << std::endl;
std::cout << "ArraySize: " << desc.ArraySize << std::endl;
std::cout << "Format: " << desc.Format << std::endl;
std::cout << "SampleDesc.Count: " << desc.SampleDesc.Count << std::endl;
std::cout << "SampleDesc.Quality: " << desc.SampleDesc.Quality << std::endl;
std::cout << "Usage: " << desc.Usage << std::endl;
std::cout << "BindFlags: " << desc.BindFlags << std::endl;
std::cout << "CPUAccessFlags: " << desc.CPUAccessFlags << std::endl;
std::cout << "MiscFlags: " << desc.MiscFlags << std::endl;
}
ID3D11Texture2D* CreateNV12Texture(ID3D11Device* device, int width, int height)
{
D3D11_TEXTURE2D_DESC desc = {};
desc.Width = width;
desc.Height = height; // Y-plane height; the driver allocates the interleaved UV plane (NV12 requires even width/height)
desc.MipLevels = 1;
desc.ArraySize = 1;
desc.Format = DXGI_FORMAT_NV12;
desc.SampleDesc.Count = 1;
desc.Usage = D3D11_USAGE_DYNAMIC;
desc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
ID3D11Texture2D* texture = nullptr;
HRESULT hr = device->CreateTexture2D(&desc, nullptr, &texture);
if (FAILED(hr)) {
std::cerr << "Failed to create NV12 texture." << std::endl;
return nullptr;
}
return texture;
}
int main(int argc, char* argv[])
{
if (argc < 3) {
std::cerr << "Usage" << argv[0] << "model.xml" << "input image" << std::endl;
return -1;
}
std::string modelPath = argv[1];
const file_name_t input_image_path{ argv[2] };
//std::string cv_imagePath = argv[2];
std::wstring imagePath = std::wstring(argv[2], argv[2] + strlen(argv[2]));
// const wchar_t* filepath = argv[2];
//const wchar_t* modelPath = argv[1];
HRESULT hr = CoInitializeEx(NULL, COINIT_APARTMENTTHREADED);
IDXGIFactory* factory = nullptr;
IDXGIAdapter* intelAdapter = nullptr;
ID3D11Device* device = nullptr;
ID3D11Texture2D* nv12Texture = nullptr;
// Create DXGI factory
hr = CreateDXGIFactory(__uuidof(IDXGIFactory), (void**)&factory);
if (FAILED(hr)) {
std::cerr << "Failed to create DXGI factory." << std::endl;
return -1;
}
std::cout << "DirectX DXGI Factory created successfully." << std::endl;
// Select Intel GPU
intelAdapter = SelectIntelGPU(factory);
if (!intelAdapter) {
std::cerr << "Failed to find Intel GPU." << std::endl;
factory->Release();
return -1;
}
std::cout << "Intel GPU selected successfully." << std::endl;
// Create Direct3D 11 device with Intel adapter
device = CreateD3DDeviceWithAdapter(intelAdapter);
if (!device) {
std::cerr << "Failed to create Direct3D 11 device." << std::endl;
factory->Release();
intelAdapter->Release();
return -1;
}
std::cout << "Direct3D 11 device created successfully." << std::endl;
//Load texture from file
ID3D11Resource* texResource = nullptr;
hr = LoadTextureFromFile(device, nullptr, imagePath.c_str(), &texResource, nullptr);
ID3D11Texture2D* imgTexture = ConvertResourceToTexture2D(texResource);
PrintTexture2DProperties(imgTexture);
cv::Mat bgrImage = cv::imread(input_image_path);
if (bgrImage.empty()) {
std::cerr << "Failed to load image: " << input_image_path << std::endl;
device->Release();
factory->Release();
intelAdapter->Release();
return -1;
}
cv::imshow("input_image", bgrImage);
// cv::waitKey(0);
size_t input_width = bgrImage.cols;
size_t input_height = bgrImage.rows;
std::cout << "Image loaded successfully." << std::endl;
// -------- Step 1. Initialize OpenVINO Runtime Core ---------
ov::Core core;
// -------- Step 2. Read a model --------
std::shared_ptr<ov::Model> model = core.read_model(modelPath);
OPENVINO_ASSERT(model->inputs().size() == 1, "Sample supports models with 1 input only");
OPENVINO_ASSERT(model->outputs().size() == 1, "Sample supports models with 1 output only");
std::string input_tensor_name = model->input().get_any_name();
std::string output_tensor_name = model->output().get_any_name();
// Validate the model inputs and outputs
if (model->inputs().size() != 1 || model->outputs().size() != 1) {
std::cerr << "Error: This sample supports models with 1 input and 1 output only." << std::endl;
return EXIT_FAILURE;
}
/* Take information about all topology inputs */
auto inputs = model->inputs();
/* Take information about all topology outputs */
auto outputs = model->outputs();
//std::string output_tensor_name = model->output().get_any_name();
std::cout << "inside main" << std::endl;
//input and output structure of the model
// std::wcout << "Input Node Name: " << inputs.get_node() << std::endl;
for (const auto& inputNode : inputs) {
std::cout << "Input node name: " << inputNode.get_any_name() << std::endl;
}
for (const auto& outNode : outputs) {
std::cout << "Output node name: " << outNode.get_any_name() << std::endl;
}
std::cout << "inside main" << std::endl;
//input and output structure of the model
// std::wcout << "Input Node Name: " << inputs.get_node() << std::endl;
std::cout << "model input size " << model->inputs().size() << std::endl;
std::cout << "Model output size " << model->outputs().size() << std::endl;
// -------- Step 3. Configure preprocessing --------
using namespace ov::preprocess;
auto p = PrePostProcessor(model);
ov::preprocess::InputInfo& input = p.input(input_tensor_name);
// InputInfo& inputs = p.input(input_tensor_name);
p.input().tensor().set_element_type(ov::element::u8)
// .set_spatial_static_shape(input_height, input_width) //height - 640
// .set_color_format(ColorFormat::NV12_SINGLE_PLANE)
.set_spatial_static_shape(input_height, input_width)
.set_color_format(ColorFormat::RGBX)
.set_memory_type(ov::intel_gpu::memory_type::surface);
p.input().preprocess().convert_color(ov::preprocess::ColorFormat::BGR).resize(ResizeAlgorithm::RESIZE_LINEAR);
p.input().model().set_layout("NCHW");
model = p.build();
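// The tensor() settings above declare how the input actually arrives:
// u8 RGBX data living in a GPU surface (ov::intel_gpu::memory_type::surface),
// while model() describes what the network itself expects (NCHW). The
// preprocessor inserts the color conversion and resize between the two.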
std::cout << "ppp" << p << std::endl;
// ------------step 4. Creation of RemoteContext from Native Handle ------------
int tile_id = -1;
// Create GPU context for a specific tile
auto gpu_context = ov::intel_gpu::ocl::D3DContext(core, device, tile_id);
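// D3DContext shares the ID3D11Device with the GPU plugin (OpenCL/D3D11
// interop); a tile_id of -1 targets the whole device rather than a
// specific tile on multi-tile GPUs.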
// ov::intel_gpu::ocl::D3DContext gpu_context(core, device);
// Proceed with the rest of your OpenVINO operations...
const auto t0 = std::chrono::high_resolution_clock::now();
// -------- Step 5. Loading a model to the device --------
ov::CompiledModel compiled_model = core.compile_model(model, gpu_context);
std::cout << "---Load model";
const auto t1 = std::chrono::high_resolution_clock::now();
std::cout << " - " << std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count() << "ms\n";
// -------- Step 6. Create an infer request --------
auto infer_request = compiled_model.create_infer_request();
std::cout << "---Create an infer request";
const auto t2 = std::chrono::high_resolution_clock::now();
std::cout << " - " << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() * 0.001 << "ms\n";
auto input0 = model->get_parameters().at(0);
auto shape = input0->get_shape();
size_t width = shape[1];
size_t height = shape[2];
size_t channel = shape[3];
std::cout << "width" << width << std::endl;
std::cout << "height" << height << std::endl;
std::cout << "channel " << channel << std::endl;
PrintTexture2DProperties(imgTexture);
auto input_yuv = model->input(0);
//auto nv12Tensor = gpu_context.create_tensor(ov::element::f32, shape, nv12Texture);
// -------- Step 7. Prepare input data --------
std::cout << "Use remote tensor API and set_tensor\n";
const auto t_k_start = std::chrono::high_resolution_clock::now();
ov::intel_gpu::ocl::D3DSurface2DTensor nv12Tensor = gpu_context.create_tensor(ov::element::u8, shape, imgTexture);
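// create_tensor wraps the shared ID3D11Texture2D as a remote (zero-copy)
// tensor. For NV12 surfaces the documented helper is
// create_tensor_nv12(height, width, texture), which instead returns
// separate Y- and UV-plane tensors.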
// std::cout << "inside try " << std::endl;
//ov::intel_gpu::ocl::D3DSurface2DTensor nv12Tensor = nv12Texture;
//auto nv12Tensor = gpu_context.create_tensor_nv12(416, 416, nv12Texture);
infer_request.set_tensor(input_yuv.get_any_name(), nv12Tensor);
// std::cout << "inside try " << std::endl;
const auto t_buffer = std::chrono::high_resolution_clock::now();
std::cout << " - " << std::chrono::duration_cast<std::chrono::microseconds>(t_buffer - t_k_start).count() * 0.001 << "ms\n";
const auto t3 = std::chrono::high_resolution_clock::now();
// -------- Step 8. Do inference --------
// infer_request.start_async();
// infer_request.wait();
infer_request.infer();
std::cout << "---Run infer req";
const auto t4 = std::chrono::high_resolution_clock::now();
std::cout << " - " << std::chrono::duration_cast<std::chrono::microseconds>(t4 - t3).count() * 0.001 << "ms\n";
// Access the output tensor
auto output0 = model->get_results().at(0);
// -------- Step 9. Process output --------
ov::Tensor output_tensor = infer_request.get_tensor(output_tensor_name);
// Print the output tensor details
std::cout << "Output tensor element type: " << output_tensor.get_element_type() << std::endl;
std::cout << "Output tensor shape: ";
for (const auto& dim : output_tensor.get_shape()) {
std::cout << dim << " ";
}
std::cout << std::endl;
auto output_data = output_tensor.data<float>();
// If you want to access the data, you can map it to a memory pointer
if (output_tensor.get_element_type() == ov::element::f32)
{
std::cout << "Output data: ";
for (size_t i = 0; i < output_tensor.get_size(); ++i)
{
// std::cout << output_data[i] << '\n';
}
std::cout << std::endl;
}
else
{
std::cerr << "Unexpected output tensor element type: " << output_tensor.get_element_type() << std::endl;
}
int img_w = bgrImage.cols;
int img_h = bgrImage.rows;
//float scale = std::min(INPUT_W / (image_input.cols * 1.0), INPUT_H / (image_input.rows * 1.0));;
float scale = 1.0;
std::vector<Object> objects;
decode_outputs(output_data, objects, scale, img_w, img_h);
draw_objects(bgrImage, objects);
// Cleanup
if (factory) factory->Release();
if (intelAdapter) intelAdapter->Release();
if (device) device->Release();
if (nv12Texture) nv12Texture->Release();
return 0;
}"
Hi Team
I have attached my sample code here.
Hi,
To further investigate this, could you share:
- The model that you are trying to infer (framework, files, etc.)
- Whether you have tried the official OpenVINO hello_nv12_input_classification sample application, and if so, whether it works (an example invocation is shown after this list)
- Your model conversion steps
- General OpenVINO setup steps that you did
- Your error messages
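For reference, the sample is typically invoked along these lines (the model path, image path, image size, and device name are placeholders):

```
hello_nv12_input_classification <path_to_model>.xml <path_to_image>.yuv 416x416 GPU
```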
Cordially,
Iffa
Hi,
Thank you for your question. If you need any additional information from Intel, please submit a new question as Intel is no longer monitoring this thread.
Cordially,
Iffa
