OpenVINO inference time speedup

Pugach__Yaroslav · ‎05-02-2020

I am running the following script to compare the performance of the SSD Lite MobileNet V2 COCO model with and without OpenVINO. Surprisingly, the test shows that OpenVINO performs inference about 25 times faster than the original model. I am using an Intel Xeon 2.3 GHz CPU and no GPU/TPU/VPU accelerators. What could be the reason for such a huge improvement? Or can you perhaps see some errors in the code?

import tensorflow as tf
from openvino.inference_engine import IENetwork, IECore
import numpy as np 
import time
import os
import cv2


def benchmark_tf(input, saved_model_path,
                 length_limit, dsize,
                 confidence=0.5, class_id=1):
    """Run a TensorFlow SavedModel detector over a video and time inference.

    Args:
        input: Anything accepted by ``cv2.VideoCapture`` (e.g. a file path).
        saved_model_path: Directory of the SavedModel to load.
        length_limit: Maximum number of frames to process; values <= 0
            mean "process the whole video".
        dsize: ``(width, height)`` passed to ``cv2.resize``, or a falsy
            value to feed frames at their native size.
        confidence: A frame counts as a detection only if some box of
            ``class_id`` scores strictly above this value.
        class_id: Class label to look for.

    Returns:
        ``(detections, inference_time)`` where ``detections`` is a list with
        one bool per processed frame and ``inference_time`` is the summed
        wall-clock time, in seconds, spent inside ``sess.run``.

    Raises:
        AssertionError: If the input cannot be opened, or the class and
            score outputs disagree in length.
    """
    print('benchmark_tf:', saved_model_path)

    signature_key = tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY

    # Load every model into its own graph. With the shared default graph, a
    # second call in the same process imports a second model next to the
    # first one; colliding node names can then make the signature's tensor
    # names resolve to the previously loaded model.
    sess = tf.Session(graph=tf.Graph())
    cap = None
    try:
        meta_graph_def = tf.saved_model.loader.load(
            sess,
            [tf.saved_model.SERVING],
            saved_model_path)
        signature = meta_graph_def.signature_def[signature_key]

        x = sess.graph.get_tensor_by_name(signature.inputs['inputs'].name)
        outs = signature.outputs
        # Each output key maps to its own tensor info, so it must be
        # looked up per key.
        y = {k: sess.graph.get_tensor_by_name(outs[k].name) for k in outs}

        cap = cv2.VideoCapture(input)
        assert cap.isOpened(), "Failed to open the input"
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        detections = []
        frames_processed = 0
        inference_time = 0.0

        while cap.isOpened():
            # Check the limit first so no extra frame is decoded.
            if length_limit > 0 and frames_processed >= length_limit:
                break

            captured, im = cap.read()
            if not captured:
                break

            frames_processed += 1
            print('processed: {}/{}'.format(frames_processed, frame_count))

            im_resized = cv2.resize(im, dsize=dsize) if dsize else im

            # NOTE(review): OpenCV decodes frames as BGR while the IR used
            # by the OpenVINO path was converted with
            # --reverse_input_channels; confirm whether this model expects
            # RGB and needs a channel swap here.
            im_batch = im_resized[tf.newaxis, ..., :3]

            # NOTE(review): the first sess.run call presumably also pays
            # one-off initialisation costs; consider an untimed warm-up
            # run for a fair comparison.
            started = time.perf_counter()
            y_out = sess.run(y, {x: im_batch})
            inference_time += time.perf_counter() - started

            classes = y_out['detection_classes']
            scores = y_out['detection_scores']
            assert classes.shape[1] == scores.shape[1], \
                "Classes and scores mismatch"

            # Only the first num_detections entries of the batch row are
            # inspected; a box must match both the class and the score.
            valid = int(np.ravel(y_out['num_detections'])[0])
            hits = ((classes[0, :valid] == class_id)
                    & (scores[0, :valid] > confidence))
            detections.append(bool(np.any(hits)))

        return detections, inference_time
    finally:
        # Release the video handle and the session even on failure.
        if cap is not None:
            cap.release()
        sess.close()


def benchmark_openvino(input, model_path,
                       length_limit, dsize,
                       confidence=0.5, class_id=1):
    """Run an OpenVINO IR detector over a video and time inference.

    Args:
        input: Anything accepted by ``cv2.VideoCapture`` (e.g. a file path).
        model_path: Path to the IR ``.xml`` file; the ``.bin`` weights are
            expected next to it with the same base name.
        length_limit: Maximum number of frames to process; values <= 0
            mean "process the whole video".
        dsize: ``(width, height)`` passed to ``cv2.resize``, or a falsy
            value to feed frames at their native size.
        confidence: A frame counts as a detection only if some box of
            ``class_id`` scores strictly above this value.
        class_id: Class label to look for.

    Returns:
        ``(detections, inference_time)`` where ``detections`` is a list with
        one bool per processed frame and ``inference_time`` is the summed
        wall-clock time, in seconds, between starting each asynchronous
        request and its ``wait`` returning.

    Raises:
        AssertionError: If the input cannot be opened or the network has no
            4-D (image) input.
    """
    print('benchmark_openvino:', model_path)

    # Initialize the Inference Engine
    plugin = IECore()

    # Load the CPU extension library only when it is present.
    # NOTE(review): newer OpenVINO releases reportedly ship these kernels
    # inside the CPU plugin itself - confirm against the release notes.
    cpu_extension = (
        '/opt/intel/openvino/deployment_tools/inference_engine'
        '/lib/intel64/libcpu_extension_sse4.so')
    if os.path.isfile(cpu_extension):
        plugin.add_extension(cpu_extension, 'CPU')

    model_xml = model_path
    model_bin = os.path.splitext(model_xml)[0] + '.bin'
    network = IENetwork(model=model_xml, weights=model_bin)

    image_tensor_blob = None
    image_info_blob = None

    # Inputs are told apart by rank: 4-D is the image, 2-D the image info.
    for input_key, input_val in network.inputs.items():
        if len(input_val.shape) == 4:
            image_tensor_blob = input_key
        elif len(input_val.shape) == 2:
            image_info_blob = input_key

    assert image_tensor_blob is not None, \
        "Failed to find the input image specification"

    output_blob = next(iter(network.outputs))

    exec_network = plugin.load_network(
        network=network, device_name='CPU',
        num_requests=1)

    cap = cv2.VideoCapture(input)
    try:
        assert cap.isOpened(), "Failed to open the input"
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        detections = []
        frames_processed = 0
        inference_time = 0.0

        while cap.isOpened():
            # Same ">=" limit test as benchmark_tf, so both benchmarks
            # process the same number of frames.
            if length_limit > 0 and frames_processed >= length_limit:
                break

            captured, frame = cap.read()
            if not captured:
                break

            frames_processed += 1
            print('processed: {}/{}'.format(frames_processed, frame_count))

            frame_resized = cv2.resize(frame, dsize=dsize) if dsize else frame

            # Take the size from the frame that is actually fed, so this
            # also works when dsize is None.
            height, width = frame_resized.shape[:2]

            # Move the channel axis first and add a leading batch axis.
            batch = frame_resized.transpose(2, 0, 1)[None, ...]
            input_dict = {image_tensor_blob: batch}

            # Faster RCNN additionally needs image info
            if image_info_blob:
                input_dict[image_info_blob] = np.array(
                    [[height, width, 1]], dtype=np.float32)

            started = time.perf_counter()
            request_handle = exec_network.start_async(
                request_id=0,
                inputs=input_dict)
            infer_status = request_handle.wait(-1)
            inference_time += time.perf_counter() - started

            detected = False
            if infer_status == 0:
                out = request_handle.outputs[output_blob]
                # Column 1 is compared to class_id and column 2 to the
                # threshold (presumably label and score - TODO confirm).
                rows = np.asarray(out[0, 0, ...])
                hits = (rows[:, 1] == class_id) & (rows[:, 2] > confidence)
                detected = bool(np.any(hits))
            else:
                # Report failures rather than silently counting the frame
                # as "not detected".
                print('inference failed on frame {} (status {})'.format(
                    frames_processed, infer_status))

            detections.append(detected)

        return detections, inference_time
    finally:
        cap.release()


# Maximal number of frames from the input video. If negative,
# the whole video will be processed.
max_frames = -1

# Faster R-CNN serves as the reference: its per-frame detections are the
# "ground truth" the two SSD Lite runs are scored against.
detections_faster_rcnn, _ = benchmark_tf(
    'resources/Pedestrian_Detect_2_1_1.mp4',
    'models/faster_rcnn_inception_v2_coco_2018_01_28/saved_model',
    length_limit=max_frames, dsize=None
)

detections_ssdlite_tf, t_ssdlite_tf = benchmark_tf(
    'resources/Pedestrian_Detect_2_1_1.mp4',
    'models/ssdlite_mobilenet_v2_coco_2018_05_09/saved_model',
    length_limit=max_frames, dsize=(300, 300)
)

detections_ssdlite_openvino, t_ssdlite_openvino = benchmark_openvino(
    'resources/Pedestrian_Detect_2_1_1.mp4',
    'models/ssdlite_mobilenet_v2_coco.xml',
    length_limit=max_frames, dsize=(300, 300)
)

# The length of the output corresponds to the number of processed frames
n = len(detections_faster_rcnn)

# Compare frame by frame; comparing the whole lists on every iteration
# would only ever yield an accuracy of 0 or 1.
correct_ssdlite_tf = sum(
    ref == got
    for ref, got in zip(detections_faster_rcnn, detections_ssdlite_tf))
correct_ssdlite_openvino = sum(
    ref == got
    for ref, got in zip(detections_faster_rcnn, detections_ssdlite_openvino))

# Average each run over its own frame count and guard against an empty
# video so the report never divides by zero.
n_ssdlite_tf = len(detections_ssdlite_tf)
n_ssdlite_openvino = len(detections_ssdlite_openvino)
nan = float('nan')

print('SSD Lite (TF)')
print('Accuracy:', correct_ssdlite_tf / n if n else nan)
print('Total inference time:', t_ssdlite_tf, 'seconds')
print('Average inference time per frame:',
      t_ssdlite_tf / n_ssdlite_tf if n_ssdlite_tf else nan, 'seconds')

print('SSD Lite (OpenVINO)')
print('Accuracy:', correct_ssdlite_openvino / n if n else nan)
print('Total inference time:', t_ssdlite_openvino, 'seconds')
print('Average inference time per frame:',
      t_ssdlite_openvino / n_ssdlite_openvino if n_ssdlite_openvino else nan,
      'seconds')

The TF model was converted to IR with this command:

/opt/intel/openvino/deployment_tools/model_optimizer/mo_tf.py --model_name ssdlite_mobilenet_v2_coco --input_model  ssdlite_mobilenet_v2_coco_2018_05_09/frozen_inference_graph.pb --tensorflow_use_custom_operations_config /opt/intel/openvino/deployment_tools/model_optimizer/extensions/front/tf/ssd_v2_support.json --tensorflow_object_detection_api_pipeline_config ssdlite_mobilenet_v2_coco_2018_05_09/pipeline.config --reverse_input_channels

Munesh_Intel · ‎05-04-2020

Hi Yaroslav,

Thank you for reaching out to OpenVINO forum.

OpenVINO speeds up performance by optimizing pre-trained neural network models using Model Optimizer.

Model Optimizer is a cross-platform command-line tool that facilitates the transition between the training and deployment environment, performs static model analysis, and adjusts deep learning models for optimal execution on end-point target devices.

If a group of operations can be represented as a single mathematical operation, and thus as a single operation node in a model graph, the Model Optimizer recognizes such patterns and replaces the group of operation nodes with a single operation node.

For example, certain primitives, such as linear operations (BatchNorm and ScaleShift), are automatically fused into convolutions.

Apart from that, Model Optimizer also removes layers that are important for training, but are useless during inference, such as the Dropout layer.

The result is an Intermediate Representation that has fewer operation nodes than the original model, which decreases the inference time.

The two links below will provide more information related to the performance improvements.

The first link is regarding Model Optimizer Knobs Related to Performance, given as follows:

https://docs.openvinotoolkit.org/latest/_docs_optimization_guide_dldt_optimization_guide.html#mo-knobs-related-to-performance

The second link is regarding performance benchmarks, given as follows:

https://docs.openvinotoolkit.org/latest/_docs_performance_benchmarks.html

Regards,

Munesh

Pugach__Yaroslav · ‎05-04-2020

Ok, yes, I understand that the model is optimized by OpenVINO, but I would expect a small improvement, whereas here inference is ~25 times faster. So I was wondering whether that is possible.