MobileNet SSD C++ implementation gives wrong detections on NCSDK v2

idata · ‎09-17-2018

Hi guys, I am having trouble porting the live object detector sample from ncappzoo v1 to C++ for NCSDK v2. Could you please look at my code or share a working example? I have tried row-major and column-major pixel ordering, changing the tensor data type, RGB/BGR conversion, and everything else I could think of. I am new to this and would be grateful for any help. It always returns only one or two detections, for "tv monitor" and "train", and nothing else.

//the problem is somewhere here
/**
 * Scales an image so that it fully covers the network input size while keeping
 * its aspect ratio, then center-crops it to exactly
 * NETWORK_IMAGE_WIDTH x NETWORK_IMAGE_HEIGHT.
 *
 * No color conversion is performed; the channel order of the source is kept.
 *
 * @param src_image_mat           Source image. Must not be empty.
 * @param preprocessed_image_mat  Receives the cropped image as an independent,
 *                                continuous copy. Left untouched on failure.
 * @return true on success, false if the source image is empty.
 */
bool preprocess_image(const cv::Mat& src_image_mat, cv::Mat& preprocessed_image_mat)
{
    // An empty frame would cause a division by zero below and make cv::resize throw.
    if (src_image_mat.empty()) {
        return false;
    }

    // find the ratios needed to make width and height fit the network input
    const double width_ratio = static_cast<double>(NETWORK_IMAGE_WIDTH) / static_cast<double>(src_image_mat.cols);
    const double height_ratio = static_cast<double>(NETWORK_IMAGE_HEIGHT) / static_cast<double>(src_image_mat.rows);

    // the largest ratio is the one to use for scaling both height and width,
    // so that the scaled image covers the whole network input.
    const double largest_ratio = (width_ratio > height_ratio) ? width_ratio : height_ratio;

    // Compute the target size explicitly and never let rounding make it smaller
    // than the crop rectangle; otherwise the ROI below would be out of bounds.
    int scaled_cols = cvRound(src_image_mat.cols * largest_ratio);
    int scaled_rows = cvRound(src_image_mat.rows * largest_ratio);
    if (scaled_cols < NETWORK_IMAGE_WIDTH) {
        scaled_cols = NETWORK_IMAGE_WIDTH;
    }
    if (scaled_rows < NETWORK_IMAGE_HEIGHT) {
        scaled_rows = NETWORK_IMAGE_HEIGHT;
    }

    cv::Mat scaled_image_mat;
    cv::resize(src_image_mat, scaled_image_mat, cv::Size(scaled_cols, scaled_rows), 0, 0, CV_INTER_AREA);

    // center crop
    const int x_start = (scaled_image_mat.cols / 2) - (NETWORK_IMAGE_WIDTH / 2);
    const int y_start = (scaled_image_mat.rows / 2) - (NETWORK_IMAGE_HEIGHT / 2);
    const cv::Rect roi(x_start, y_start, NETWORK_IMAGE_WIDTH, NETWORK_IMAGE_HEIGHT);

    // scaled_image_mat(roi) is only a view into the larger image: its rows are
    // separated by the stride of the full scaled image. clone() produces a
    // tightly packed copy, so callers that walk the raw data pointer see the
    // pixels in the order they expect.
    preprocessed_image_mat = scaled_image_mat(roi).clone();

    return true;
}

std::shared_ptr<list<networkResults>> getInferenceResults(cv::Mat inputMat,
    struct ncGraphHandle_t* graphHandle, struct ncFifoHandle_t* fifoIn,
    struct ncFifoHandle_t* fifoOut)
{
    cv::Mat preprocessed_image_mat;
    preprocess_image(inputMat, preprocessed_image_mat);
    if (preprocessed_image_mat.rows != NETWORK_IMAGE_HEIGHT ||
        preprocessed_image_mat.cols != NETWORK_IMAGE_WIDTH) {
        cout << "Error - preprocessed image is unexpected size!" << endl;
        return 0;
    }

    float_t tensor32[NETWORK_IMAGE_WIDTH * NETWORK_IMAGE_HEIGHT * 3];

    uint8_t* image_data_ptr = (uint8_t*)preprocessed_image_mat.data;
    int chan = preprocessed_image_mat.channels();

    int tensor_index = 0;
    for (int col = 0; col < preprocessed_image_mat.cols; col++) {
        for (int row = 0; row < preprocessed_image_mat.rows; row++) {

            int pixel_start_index = col * (preprocessed_image_mat.rows + 0) * chan + row * chan; // TODO: don't hard code

            // assuming the image is in BGR format here
            uint8_t blue = image_data_ptr[pixel_start_index + 0];
            uint8_t green = image_data_ptr[pixel_start_index + 1];
            uint8_t red = image_data_ptr[pixel_start_index + 2];

            tensor32[tensor_index++] = (((float_t)blue - networkMean) * networkStd);
            tensor32[tensor_index++] = (((float_t)green - networkMean) * networkStd);
            tensor32[tensor_index++] = (((float_t)red - networkMean) * networkStd);

        }
    }

    // queue for inference
    unsigned int inputTensorLength = NETWORK_IMAGE_HEIGHT * NETWORK_IMAGE_WIDTH * 3 * sizeof(float_t);
    retCode = ncGraphQueueInferenceWithFifoElem(graphHandle, fifoIn, fifoOut, tensor32, &inputTensorLength, 0);
    if (retCode != NC_OK) {
        cout << "Error[" << retCode << "] - could not queue inference." << endl;
        return 0;
    }

    // get the size of the result
    unsigned int res_length;
    unsigned int option_length = sizeof(res_length);
    retCode = ncFifoGetOption(fifoOut, NC_RO_FIFO_ELEMENT_DATA_SIZE, &res_length, &option_length);
    if (retCode != NC_OK) {
        cout << "Error[" << retCode << "] - could not get output result size." << endl;
        return 0;
    }

    float_t result_buf[res_length];
    retCode = ncFifoReadElem(fifoOut, result_buf, &res_length, NULL);
    if (retCode != NC_OK) {
        cout << "Error[" << retCode << "] - could not get output result." << endl;
        return 0;
    }

    list<networkResult> *objectInferenceResults = new list<networkResult>();

    float number = *result_buf;

    for (int n = 0; n < number; ++n)
    {
        float* pointer = result_buf + (7 + n * 7);
        if (pointer[2] > CONFIDENCE_THRESHOLD)
        {
            networkResult r;
            r.class_ID = pointer[1];
            r.confidence = pointer[2] * 100;
            r.x1 = pointer[3] * NETWORK_IMAGE_WIDTH;
            r.y1 = pointer[4] * NETWORK_IMAGE_HEIGHT;
            r.x2 = pointer[5] * NETWORK_IMAGE_WIDTH;
            r.y2 = pointer[6] * NETWORK_IMAGE_HEIGHT;
            objectInferenceResults->push_back(r);
        }
    }
    return std::shared_ptr<list<networkResult>>(objectInferenceResults);
}

int main(int argc, char** argv) {
    // Camera and image frames
    VideoCapture capture;
    Mat imgIn;

    // Key to escape from main loop and close program
    const int breakKey = 27;  // esc == 27
    int key;

    // Struct that will hold inference results
    std::shared_ptr<list<networkResult>>(Result);

    // Set up the camera
    capture.open(CAM_SOURCE);
    capture.set(CV_CAP_PROP_FRAME_WIDTH, WINDOW_WIDTH);
    capture.set(CV_CAP_PROP_FRAME_HEIGHT, WINDOW_HEIGHT);

    // Set up the display window
    namedWindow(WINDOW_NAME, WINDOW_NORMAL);
    resizeWindow(WINDOW_NAME, WINDOW_WIDTH, WINDOW_HEIGHT);
    setWindowProperty(WINDOW_NAME, CV_WND_PROP_ASPECTRATIO, CV_WINDOW_KEEPRATIO);
    moveWindow(WINDOW_NAME, 0, 0);
    Point winTextOrigin(0, 20);

    // Initialize the NCS device(s) and network graphs and FIFO queues
    initNCS();

    unsigned int frame_count = 0;

    //Initialize Mobilenet SSD graph with IO FIFOs
    initSSD();

    // main loop
    while (true) {
        // If the user presses the break key exit the loop
        key = waitKey(1);
        if ((key & 0xFF) == breakKey) {
            break;
        }

        // Get a frame from the camera
        capture >> imgIn;
        if (frame_count++ >= SKIP_AFTER)
        {
            capture >> imgIn;
            frame_count = 0;
        }

        // Flip the image horizontally
        //flip(imgIn, imgIn, 1);

        //Inference
        Result = getInferenceResults(imgIn, ssd_graph_handle, ssd_fifo_in, ssd_fifo_out);

        // Draw labels and rectangles on the image
        putText(imgIn, "Press ESC to exit", winTextOrigin, FONT, 2, GREEN, 2);
        for (list<networkResult>::iterator it = Result->begin(); it != Result->end();  it++) {
            // Draw a rectangle around the detected face
            rectangle(imgIn, Point(it->x1, it->y1), Point(it->x2, it->y2), RED, 1, 8, 0);

            // print the age and gender text to the window
            putText(imgIn, LABELS[it->class_ID], Point(it->x2, it->y2), FONT, FONT_SIZE, BLACK, 3);
        }


        // Show the image in the window
        imshow(WINDOW_NAME, imgIn);

    } // end main while loop

    //Close device and deallocate graph
    closeNCS();
    return 0;
}