Only one class predicted regardless of input image in Image Classification

Shelke__Sagar · ‎07-16-2019

I have trained a MobileNet to classify 5 gestures. Whatever gesture is performed, the neural network always predicts class 3.

Due to preprocessing, input image is FP32

NOTE* NN is properly trained and it predicts correctly when used in OpenCV DNN API with same steps.

I think I am going wrong somewhere in setting input with openVINO. Following are the steps I have followed

1. Initialize IE

// 1. Load the plugin for the target device and register the CPU extension library
	InferenceEnginePluginPtr _plugin = PluginDispatcher({""}).getPluginByDevice(TARGET_HDWARE);
	InferencePlugin plugin(_plugin);
	//TODO: Write shell to make in 'ie_cpu_extension'
	// make generates 'libcpu_extension.so'
	string s_ext_plugin = "./ie_cpu_extension/libcpu_extension.so";
	auto extension_ptr = make_so_pointer<InferenceEngine::IExtension>(s_ext_plugin);
	plugin.AddExtension(extension_ptr);

	// 2. Create an IR reader and read the network topology (.xml) and weights (.bin)
	CNNNetReader network_reader;
	network_reader.ReadNetwork("../mnet_v6/openvino_fp32/mobilenet_v2_336x448.xml");
	network_reader.ReadWeights("../mnet_v6/openvino_fp32/mobilenet_v2_336x448.bin");

	CNNNetwork network = network_reader.getNetwork();
	/** Set the network batch size to BATCH_SIZE and read it back **/
	network.setBatchSize(BATCH_SIZE);
	size_t batchSize = network.getBatchSize();

	// 3. Configure input and output

	/** Get NN input information: every input is set to FP32 precision, NCHW layout **/
	InferenceEngine::InputsDataMap input_info(network.getInputsInfo());
	InferenceEngine::SizeVector inputDims;

	for (auto &item : input_info) {
		auto input_data = item.second;
		input_data->setPrecision(Precision::FP32);
		input_data->setLayout(Layout::NCHW);
		// If the network has several inputs, the dims of the last one visited win.
		inputDims = input_data->getDims();
	}
	// Print each dimension; the vector must be indexed, it cannot be cast as a whole.
	cout << "inputDims=";
	for (size_t i = 0; i < inputDims.size(); i++) {
		cout << inputDims[i] << " ";
	}
	cout << endl;
	// NOTE(review): this indexing assumes getDims() reports width, height, channels
	// in that order - confirm against the Inference Engine version in use.
	infer_width = inputDims[0];
	infer_height = inputDims[1];
	num_channels = inputDims[2];

	/** Get NN output information: every output is set to FP32 precision, NC layout **/

	InferenceEngine::OutputsDataMap output_info(network.getOutputsInfo());
	InferenceEngine::SizeVector outputDims;
	for (auto &item : output_info) {
		auto output_data = item.second;
		output_data->setPrecision(Precision::FP32);
		output_data->setLayout(Layout::NC);
		outputDims = output_data->getDims();
	}
	cout << "outputDims=";
	for (size_t i = 0; i < outputDims.size(); i++) {
		cout << outputDims[i] << " ";
	}
	// 4. Load model to plugin (empty configuration map)
	ExecutableNetwork executable_network = plugin.LoadNetwork(network, {});

	// 5. Create infer request
	InferRequest infer_request = executable_network.CreateInferRequest();

2. Get input/output buffer information

// Get the input blob and a raw float pointer to its buffer.
// Pointers start as nullptr so an empty info map cannot leave them uninitialized.
	float *input_buffer = nullptr;
	float *input_buffer_current_image = nullptr;
	for (auto &item : nn_read_data.input_info) {
		input_name = item.first;
		input = nn_read_data.infer_request.GetBlob(input_name);
		// With several inputs, the blob/pointer of the last one visited is kept.
		input_buffer = input->buffer().as<PrecisionTrait<Precision::FP32>::value_type *>();
	}

	// Get the output blob and a raw float pointer to its buffer.
	float *output_buffer = nullptr;
	float *output_buffer_current_image = nullptr;
	for (auto &item : nn_read_data.output_info) {
		auto output_name = item.first;
		output = nn_read_data.infer_request.GetBlob(output_name);
		output_buffer = output->buffer().as<PrecisionTrait<Precision::FP32>::value_type *>();
	}

3. Feed in the image and run inference

image3ch_gesture = // code to generate image
imshow("original image", image3ch_gesture);
									
                                        preprocess_image(image3ch_gesture);
									
										// The code below indexes the blob dims as {N, C, H, W}:
										// dims[1] is the channel count, dims[2] * dims[3] the pixels per channel.
										auto dims = input->getTensorDesc().getDims();
										const size_t channels_number = dims[1];
										const size_t image_size = dims[3] * dims[2];
										// The image is 32-bit float after preprocessing, so each pixel must be
										// read as cv::Vec3f. Reading it as cv::Vec3b reinterprets the float bytes
										// as 8-bit values and feeds garbage to the network.
										// NOTE(review): assumes the image is already resized to the blob's
										// height x width and has 3 channels - confirm in preprocess_image().
										for (size_t pid = 0; pid < image_size; ++pid) {
											const cv::Vec3f &pixel = image3ch_gesture.at<cv::Vec3f>(static_cast<int>(pid));
											// Interleaved (HWC) pixel -> planar (CHW) blob layout.
											for (size_t ch = 0; ch < channels_number; ++ch) {
												input_buffer[ch * image_size + pid] = pixel[ch];
											}
										}

										// Perform inference. Infer() is the synchronous (blocking) call;
										// the asynchronous path would be StartAsync() followed by Wait().
										nn_read_data.infer_request.Infer();

										// Get classification output using the output blob.
										// For 5 class classification with batch size of N,
										// output is Nx5
										vector<unsigned> results;
										TopResults(2, *output, results);

NOTE* imshow() shows me the correct processed image.

My guess is that I am going wrong somewhere in the 1st or 2nd stage. The 3rd stage seems fine, because when I feed a 32F Mat of all zeros, the NN classifies it as background, as expected.

For all other images, class 3 is always predicted.

Thanks.