- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I am running the following script to compare the performance of the SSD Lite MobileNet V2 COCO model with and without OpenVINO. Surprisingly, the test shows that OpenVINO performs inference about 25 times faster than the original model. I am using an Intel Xeon 2.3 GHz CPU and no GPU/TPU/VPU accelerators. What could be the reason for such a huge improvement? Or can you perhaps spot any errors in the code?
import os
import time

import cv2
import numpy as np
import tensorflow as tf
from openvino.inference_engine import IENetwork, IECore


def benchmark_tf(input, saved_model_path, length_limit, dsize,
                 confidence=0.5, class_id=1):
    """Run a TensorFlow SavedModel detector over a video and time inference.

    Args:
        input: Path (or device id) handed to ``cv2.VideoCapture``. The name
            shadows the builtin but is kept for backward compatibility.
        saved_model_path: Directory of the SavedModel to load.
        length_limit: Maximal number of frames to process; a non-positive
            value means the whole video is processed.
        dsize: ``(width, height)`` passed to ``cv2.resize``, or a falsy value
            to feed frames at their native size.
        confidence: Minimal score for a detection to count.
        class_id: Class id that is looked for in every frame.

    Returns:
        A tuple ``(detections, inference_time)``: one bool per processed
        frame telling whether ``class_id`` was detected above ``confidence``,
        and the accumulated wall-clock seconds spent inside ``sess.run``.
    """
    print('benchmark_tf:', saved_model_path)
    signature_key = tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY

    detections = []
    inference_time = 0.0

    # Every call gets its own graph. With the shared default graph, a second
    # call would import its model next to the first one, and looking tensors
    # up by the names stored in the signature could then resolve to the
    # previously loaded model instead of the one being benchmarked.
    with tf.Session(graph=tf.Graph()) as sess:
        meta_graph_def = tf.saved_model.loader.load(
            sess, [tf.saved_model.SERVING], saved_model_path)
        signature = meta_graph_def.signature_def[signature_key]

        x = sess.graph.get_tensor_by_name(signature.inputs['inputs'].name)
        y = {
            key: sess.graph.get_tensor_by_name(info.name)
            for key, info in signature.outputs.items()
        }

        cap = cv2.VideoCapture(input)
        assert cap.isOpened(), "Failed to open the input"
        try:
            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            frames_processed = 0
            while cap.isOpened():
                if length_limit > 0 and frames_processed >= length_limit:
                    break
                captured, im = cap.read()
                if not captured:
                    break
                frames_processed += 1
                print('processed: {}/{}'.format(frames_processed, frame_count))

                im_resized = cv2.resize(im, dsize=dsize) if dsize else im
                # OpenCV decodes frames as BGR. The OpenVINO IR was converted
                # with --reverse_input_channels and therefore takes BGR, but
                # the original TF model is fed RGB here so that both paths
                # see equivalent input.
                im_rgb = cv2.cvtColor(im_resized, cv2.COLOR_BGR2RGB)
                im_batch = im_rgb[np.newaxis, ...]

                # NOTE(review): the first sess.run presumably includes one-off
                # initialization cost; consider a warm-up run before timing.
                started = time.perf_counter()
                y_out = sess.run(y, {x: im_batch})
                inference_time += time.perf_counter() - started

                detected = False
                num_detections = int(y_out['num_detections'][0])
                if num_detections >= 1:
                    classes = y_out['detection_classes']
                    scores = y_out['detection_scores']
                    assert classes.shape[1] == scores.shape[1], \
                        "Classes and scores mismatch"
                    # Compare element-wise per detection, not whole rows.
                    hits = ((classes[0, :num_detections] == class_id)
                            & (scores[0, :num_detections] > confidence))
                    detected = bool(np.any(hits))
                detections.append(detected)
        finally:
            cap.release()

    return detections, inference_time


def benchmark_openvino(input, model_path, length_limit, dsize,
                       confidence=0.5, class_id=1):
    """Run an OpenVINO IR detector over a video and time inference on CPU.

    Args:
        input: Path (or device id) handed to ``cv2.VideoCapture``. The name
            shadows the builtin but is kept for backward compatibility.
        model_path: Path to the IR ``.xml`` file; the ``.bin`` weights file
            is expected next to it with the same base name.
        length_limit: Maximal number of frames to process; a non-positive
            value means the whole video is processed.
        dsize: ``(width, height)`` passed to ``cv2.resize``, or a falsy value
            to feed frames at their native size.
        confidence: Minimal score for a detection to count.
        class_id: Class id that is looked for in every frame.

    Returns:
        A tuple ``(detections, inference_time)``: one bool per processed
        frame telling whether ``class_id`` was detected above ``confidence``,
        and the accumulated wall-clock seconds spent between starting the
        asynchronous request and the end of ``wait``.
    """
    print('benchmark_openvino:', model_path)

    # Initialize the Inference Engine
    plugin = IECore()
    plugin.add_extension(
        '/opt/intel/openvino/deployment_tools/inference_engine'
        '/lib/intel64/libcpu_extension_sse4.so',
        'CPU')

    model_xml = model_path
    model_bin = os.path.splitext(model_xml)[0] + '.bin'
    network = IENetwork(model=model_xml, weights=model_bin)

    image_tensor_blob = None
    image_info_blob = None
    for input_key, input_val in network.inputs.items():
        if len(input_val.shape) == 4:  # image tensor
            image_tensor_blob = input_key
        elif len(input_val.shape) == 2:  # image info
            image_info_blob = input_key
    assert image_tensor_blob is not None, \
        "Failed to find the input image specification"

    output_blob = next(iter(network.outputs))

    exec_network = plugin.load_network(
        network=network, device_name='CPU', num_requests=1)

    detections = []
    inference_time = 0.0

    cap = cv2.VideoCapture(input)
    assert cap.isOpened(), "Failed to open the input"
    try:
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        frames_processed = 0
        while cap.isOpened():
            # Same stop condition as benchmark_tf (">=", not ">") so both
            # benchmarks process the same number of frames.
            if length_limit > 0 and frames_processed >= length_limit:
                break
            captured, frame = cap.read()
            if not captured:
                break
            frames_processed += 1
            print('processed: {}/{}'.format(frames_processed, frame_count))

            frame_resized = cv2.resize(frame, dsize=dsize) if dsize else frame
            height, width = frame_resized.shape[:2]
            # HWC -> CHW, then add the batch axis.
            batch = frame_resized.transpose(2, 0, 1)[None, ...]
            input_dict = {image_tensor_blob: batch}

            # Faster RCNN additionally needs image info. It is derived from
            # the actual frame so that dsize=None does not crash.
            if image_info_blob:
                input_dict[image_info_blob] = (height, width, 1)

            # Start inference
            started = time.perf_counter()
            request_handle = exec_network.start_async(
                request_id=0, inputs=input_dict)
            infer_status = request_handle.wait(-1)
            inference_time += time.perf_counter() - started

            detected = False
            if infer_status == 0:
                out = request_handle.outputs[output_blob]
                rows = out[0, 0, ...]
                # Column 1 is compared to the class id, column 2 to the score.
                hits = (rows[:, 1] == class_id) & (rows[:, 2] > confidence)
                detected = bool(np.any(hits))
            detections.append(detected)
    finally:
        cap.release()

    return detections, inference_time


def _agreement(reference, candidate):
    """Return the fraction of frames where candidate equals reference."""
    pairs = list(zip(reference, candidate))
    if not pairs:
        return float('nan')
    return sum(1 for ref, got in pairs if ref == got) / len(pairs)


def _report(title, reference, detections, total_time):
    """Print accuracy against the reference and timing for one benchmark."""
    frames = len(detections)
    average = total_time / frames if frames else float('nan')
    print(title)
    print('Accuracy:', _agreement(reference, detections))
    print('Total inference time:', total_time, 'seconds')
    print('Average inference time per frame:', average, 'seconds')


# Maximal number of frames from the input video. If negative,
# the whole video will be processed.
max_frames = -1

# Faster R-CNN serves as the reference the SSD Lite results are scored
# against; its timing is discarded.
detections_faster_rcnn, _ = benchmark_tf(
    'resources/Pedestrian_Detect_2_1_1.mp4',
    'models/faster_rcnn_inception_v2_coco_2018_01_28/saved_model',
    length_limit=max_frames, dsize=None
)

detections_ssdlite_tf, t_ssdlite_tf = benchmark_tf(
    'resources/Pedestrian_Detect_2_1_1.mp4',
    'models/ssdlite_mobilenet_v2_coco_2018_05_09/saved_model',
    length_limit=max_frames, dsize=(300, 300)
)

detections_ssdlite_openvino, t_ssdlite_openvino = benchmark_openvino(
    'resources/Pedestrian_Detect_2_1_1.mp4',
    'models/ssdlite_mobilenet_v2_coco.xml',
    length_limit=max_frames, dsize=(300, 300))

_report('SSD Lite (TF)', detections_faster_rcnn,
        detections_ssdlite_tf, t_ssdlite_tf)
_report('SSD Lite (OpenVINO)', detections_faster_rcnn,
        detections_ssdlite_openvino, t_ssdlite_openvino)
TF model was converted to IR by this command:
/opt/intel/openvino/deployment_tools/model_optimizer/mo_tf.py --model_name ssdlite_mobilenet_v2_coco --input_model ssdlite_mobilenet_v2_coco_2018_05_09/frozen_inference_graph.pb --tensorflow_use_custom_operations_config /opt/intel/openvino/deployment_tools/model_optimizer/extensions/front/tf/ssd_v2_support.json --tensorflow_object_detection_api_pipeline_config ssdlite_mobilenet_v2_coco_2018_05_09/pipeline.config --reverse_input_channels
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi Yaroslav,
Thank you for reaching out to OpenVINO forum.
OpenVINO speeds up performance by optimizing pre-trained neural network models using Model Optimizer.
Model Optimizer is a cross-platform command-line tool that facilitates the transition between the training and deployment environment, performs static model analysis, and adjusts deep learning models for optimal execution on end-point target devices.
If a group of operations can be represented as a single mathematical operation, and thus as a single operation node in a model graph, the Model Optimizer recognizes such patterns and replaces the group of operation nodes with a single operation node.
For example, certain primitives such as linear operations (BatchNorm and ScaleShift) are automatically fused into convolutions.
Apart from that, Model Optimizer also removes layers that are important for training, but are useless during inference, such as the Dropout layer.
The result is an Intermediate Representation that has fewer operation nodes than the original model, which decreases the inference time.
The two links below will provide more information related to the performance improvements.
The first link is regarding Model Optimizer Knobs Related to Performance, given as follows:
The second link is regarding performance benchmarks, given as follows:
https://docs.openvinotoolkit.org/latest/_docs_performance_benchmarks.html
Regards,
Munesh
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
OK, yes, I understand that the model is optimized by OpenVINO, but I would expect a small improvement, whereas here I got a ~25x speedup. So I was wondering whether that is plausible.
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page