<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re:mnist_multi_worker_strategy distribution error in Intel® Optimized AI Frameworks</title>
    <link>https://community.intel.com/t5/Intel-Optimized-AI-Frameworks/mnist-multi-worker-strategy-distribution-error/m-p/1285011#M231</link>
    <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;We are glad that your issue is resolved. For the multiple worker crash issue, Intel Optimized Tensorflow 2.5.0 has included the fix. You can use the command "pip install intel-tensorflow==2.5.0" to install the latest Intel Optimized Tensorflow.&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt; Thanks and Regards&lt;/P&gt;&lt;P&gt; Rahul&lt;/P&gt;&lt;BR /&gt;</description>
    <pubDate>Thu, 27 May 2021 06:11:55 GMT</pubDate>
    <dc:creator>RahulU_Intel</dc:creator>
    <dc:date>2021-05-27T06:11:55Z</dc:date>
    <item>
      <title>mnist_multi_worker_strategy distribution error</title>
      <link>https://community.intel.com/t5/Intel-Optimized-AI-Frameworks/mnist-multi-worker-strategy-distribution-error/m-p/1280839#M221</link>
      <description>&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;STRONG&gt;System information&lt;/STRONG&gt;&lt;/P&gt;
&lt;UL&gt;
&lt;LI&gt;I can run this multi-node case with tensorflow 2.2.0&amp;nbsp; successfully. But with intel&amp;nbsp;tensorflow 2.2.0, an error related to MKL occurs.&lt;/LI&gt;
&lt;LI&gt;OS Platform and Distribution : Linux CentOS 8.2&lt;/LI&gt;
&lt;LI&gt;TensorFlow installed with pip&amp;nbsp;&amp;nbsp;on both machines.&lt;/LI&gt;
&lt;/UL&gt;
&lt;P&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;pip install intel-tensorflow==2.2.0&lt;/P&gt;
&lt;UL&gt;
&lt;LI&gt;Python version: 3.7&lt;/LI&gt;
&lt;LI&gt;Bazel version (if compiling from source): 1.1.0&lt;/LI&gt;
&lt;/UL&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;multi-nodes code&lt;/P&gt;
&lt;P&gt;mnist_multi_worker_strategy.py :&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;#!/usr/bin/python&lt;BR /&gt;# -*-coding:utf-8 -*-&lt;/P&gt;
&lt;P&gt;import os&lt;BR /&gt;import json&lt;BR /&gt;import argparse&lt;/P&gt;
&lt;P&gt;import tensorflow as tf&lt;BR /&gt;from tensorflow.keras import datasets&lt;BR /&gt;from tensorflow.keras import layers, models&lt;BR /&gt;from tensorflow.keras import optimizers&lt;/P&gt;
&lt;P&gt;&lt;BR /&gt;def set_strategy(args):&lt;BR /&gt;if args.job_name != 'worker':&lt;BR /&gt;raise ValueError(&lt;BR /&gt;"Multi strategy only support worker mode, please check job name")&lt;/P&gt;
&lt;P&gt;tf_config = args.worker_hosts.split(',')&lt;BR /&gt;os.environ["TF_CONFIG"] = json.dumps({&lt;BR /&gt;'cluster': {&lt;BR /&gt;'worker': tf_config&lt;BR /&gt;},&lt;BR /&gt;'task': {'type': args.job_name, 'index': args.task_index}&lt;BR /&gt;})&lt;BR /&gt;print(os.environ["TF_CONFIG"])&lt;/P&gt;
&lt;P&gt;strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()&lt;/P&gt;
&lt;P&gt;return strategy&lt;/P&gt;
&lt;P&gt;&lt;BR /&gt;# create cnn model&lt;BR /&gt;class Net(object):&lt;BR /&gt;def __init__(self):&lt;BR /&gt;model = models.Sequential()&lt;BR /&gt;model.add(layers.Conv2D(&lt;BR /&gt;32, (3, 3), activation='relu', input_shape=(28, 28, 1)))&lt;BR /&gt;model.add(layers.MaxPooling2D((2, 2)))&lt;BR /&gt;model.add(layers.Conv2D(64, (3, 3), activation='relu'))&lt;BR /&gt;model.add(layers.MaxPooling2D((2, 2)))&lt;BR /&gt;model.add(layers.Conv2D(64, (3, 3), activation='relu'))&lt;/P&gt;
&lt;P&gt;model.add(layers.Flatten())&lt;BR /&gt;model.add(layers.Dense(64, activation='relu'))&lt;BR /&gt;model.add(layers.Dense(10, activation='softmax'))&lt;/P&gt;
&lt;P&gt;model.summary()&lt;/P&gt;
&lt;P&gt;self.model = model&lt;/P&gt;
&lt;P&gt;&lt;BR /&gt;# inital dateset&lt;BR /&gt;class DataSet(object):&lt;BR /&gt;def __init__(self):&lt;BR /&gt;data_path = os.path.dirname(os.path.realpath(__file__)) \&lt;BR /&gt;+ '/../../datasets/mnist/mnist.npz'&lt;BR /&gt;(train_images, train_labels), (test_images, test_labels) = \&lt;BR /&gt;datasets.mnist.load_data(path=data_path)&lt;BR /&gt;train_images = train_images.reshape((60000, 28, 28, 1))&lt;BR /&gt;test_images = test_images.reshape((10000, 28, 28, 1))&lt;/P&gt;
&lt;P&gt;train_images, test_images = train_images / 255.0, test_images / 255.0&lt;/P&gt;
&lt;P&gt;self.train_images, self.train_labels = train_images, train_labels&lt;BR /&gt;self.test_images, self.test_labels = test_images, test_labels&lt;/P&gt;
&lt;P&gt;&lt;BR /&gt;# train and val&lt;BR /&gt;class Train:&lt;BR /&gt;def __init__(self):&lt;BR /&gt;self.data = DataSet()&lt;/P&gt;
&lt;P&gt;def train(self, args, strategy):&lt;BR /&gt;# Define the checkpoint directory to store the checkpoints&lt;BR /&gt;checkpoint_dir = args.train_dir&lt;BR /&gt;# Name of the checkpoint files&lt;BR /&gt;checkpoint_path = os.path.join(checkpoint_dir, "ckpt_{epoch}")&lt;/P&gt;
&lt;P&gt;callbacks = [&lt;BR /&gt;tf.keras.callbacks.TensorBoard(&lt;BR /&gt;log_dir=args.train_dir, histogram_freq=1),&lt;BR /&gt;tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,&lt;BR /&gt;save_weights_only=True),&lt;BR /&gt;]&lt;/P&gt;
&lt;P&gt;with strategy.scope():&lt;BR /&gt;model = Net().model&lt;/P&gt;
&lt;P&gt;model.compile(optimizer=optimizers.Adam(),&lt;BR /&gt;loss='sparse_categorical_crossentropy',&lt;BR /&gt;metrics=['accuracy'])&lt;/P&gt;
&lt;P&gt;model.fit(self.data.train_images, self.data.train_labels,&lt;BR /&gt;batch_size=args.batch_size,&lt;BR /&gt;epochs=args.epochs,&lt;BR /&gt;callbacks=callbacks,&lt;BR /&gt;validation_data=(self.data.test_images, self.data.test_labels))&lt;/P&gt;
&lt;P&gt;# EVAL&lt;BR /&gt;model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))&lt;BR /&gt;eval_loss, eval_acc = model.evaluate(&lt;BR /&gt;self.data.test_images, self.data.test_labels, verbose=2)&lt;BR /&gt;print('Eval loss: {}, Eval Accuracy: {}'.format(eval_loss, eval_acc))&lt;/P&gt;
&lt;P&gt;&lt;BR /&gt;def main():&lt;BR /&gt;# training params settings&lt;BR /&gt;parser = argparse.ArgumentParser(description='Tensorflow 2.0 MNIST Example,'&lt;BR /&gt;' use Mirrorstrategy')&lt;BR /&gt;parser.add_argument('--train_dir', '-td', type=str, default='./train_dir',&lt;BR /&gt;help='the folder of svaing model')&lt;BR /&gt;parser.add_argument('--batch_size', '-b', type=int, default=64,&lt;BR /&gt;help='input batch size for training (default: 64)')&lt;BR /&gt;parser.add_argument('--test_batchsize', '-tb', type=int, default=1000,&lt;BR /&gt;help='input batch size for testing (default: 1000)')&lt;BR /&gt;parser.add_argument('--epochs', '-e', type=int, default=10,&lt;BR /&gt;help='number of epochs to train (default: 10)')&lt;BR /&gt;parser.add_argument('--gpu_nums', '-g', type=int, default=0,&lt;BR /&gt;help='number of gpus')&lt;BR /&gt;parser.add_argument('--cpu_nums', '-c', type=int, default=0,&lt;BR /&gt;help='number of cpus')&lt;BR /&gt;parser.add_argument('--learning_rate', '-lr', type=float, default=0.01,&lt;BR /&gt;help='learning rate (default: 0.01)')&lt;BR /&gt;parser.add_argument('--momentum', type=float, default=0.5,&lt;BR /&gt;help='SGD momentum (default: 0.5)')&lt;BR /&gt;parser.add_argument('--log_interval', type=int, default=10,&lt;BR /&gt;help='how many batches to wait before logging training status')&lt;BR /&gt;parser.add_argument('--save_model', '-sm', action='store_true', default=False,&lt;BR /&gt;help='For Saving the current Model')&lt;BR /&gt;parser.add_argument('--worker_hosts', '-wh', type=str, required=True,&lt;BR /&gt;help='Comma-separated list of hostname:port pairs')&lt;BR /&gt;parser.add_argument('--job_name', '-j', type=str, default='worker',&lt;BR /&gt;help='Ps or worker')&lt;BR /&gt;parser.add_argument('--task_index', '-i', type=int, required=True,&lt;BR /&gt;help='Index of task within the job')&lt;/P&gt;
&lt;P&gt;args = parser.parse_args()&lt;/P&gt;
&lt;P&gt;strategy = set_strategy(args)&lt;/P&gt;
&lt;P&gt;app = Train()&lt;BR /&gt;app.train(args, strategy)&lt;/P&gt;
&lt;P&gt;&lt;BR /&gt;if __name__ == "__main__":&lt;BR /&gt;main()&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;(/home/hpcadmin/wj/intel_tf2.2) [root@c1 ai]# python /home/hpcadmin/lico-demo/ai/tensorflow2/mnist_multi_worker_strategy.py --worker_hosts=c1:27481,c2:27252 --job_name=worker --task_index=0&lt;BR /&gt;{"cluster": {"worker": ["c1:27481", "c2:27252"]}, "task": {"type": "worker", "index": 0}}&lt;/P&gt;
&lt;P&gt;User settings:&lt;/P&gt;
&lt;P&gt;KMP_AFFINITY=granularity=fine,verbose,compact,1,0&lt;BR /&gt;KMP_BLOCKTIME=0&lt;BR /&gt;KMP_SETTINGS=1&lt;/P&gt;
&lt;P&gt;Effective settings:&lt;/P&gt;
&lt;P&gt;KMP_ABORT_DELAY=0&lt;BR /&gt;KMP_ADAPTIVE_LOCK_PROPS='1,1024'&lt;BR /&gt;KMP_ALIGN_ALLOC=64&lt;BR /&gt;KMP_ALL_THREADPRIVATE=128&lt;BR /&gt;KMP_ATOMIC_MODE=2&lt;BR /&gt;KMP_BLOCKTIME=0&lt;BR /&gt;KMP_CPUINFO_FILE: value is not defined&lt;BR /&gt;KMP_DETERMINISTIC_REDUCTION=false&lt;BR /&gt;KMP_DEVICE_THREAD_LIMIT=2147483647&lt;BR /&gt;KMP_DISP_HAND_THREAD=false&lt;BR /&gt;KMP_DISP_NUM_BUFFERS=7&lt;BR /&gt;KMP_DUPLICATE_LIB_OK=false&lt;BR /&gt;KMP_ENABLE_TASK_THROTTLING=true&lt;BR /&gt;KMP_FORCE_MONOTONIC_DYNAMIC_SCHEDULE=false&lt;BR /&gt;KMP_FORCE_REDUCTION: value is not defined&lt;BR /&gt;KMP_FOREIGN_THREADS_THREADPRIVATE=true&lt;BR /&gt;KMP_FORKJOIN_BARRIER='2,2'&lt;BR /&gt;KMP_FORKJOIN_BARRIER_PATTERN='hyper,hyper'&lt;BR /&gt;KMP_FORKJOIN_FRAMES=true&lt;BR /&gt;KMP_FORKJOIN_FRAMES_MODE=3&lt;BR /&gt;KMP_GTID_MODE=3&lt;BR /&gt;KMP_HANDLE_SIGNALS=false&lt;BR /&gt;KMP_HOT_TEAMS_MAX_LEVEL=1&lt;BR /&gt;KMP_HOT_TEAMS_MODE=0&lt;BR /&gt;KMP_INIT_AT_FORK=true&lt;BR /&gt;KMP_ITT_PREPARE_DELAY=0&lt;BR /&gt;KMP_LIBRARY=throughput&lt;BR /&gt;KMP_LOCK_KIND=queuing&lt;BR /&gt;KMP_MALLOC_POOL_INCR=1M&lt;BR /&gt;KMP_MWAIT_HINTS=0&lt;BR /&gt;KMP_NUM_LOCKS_IN_BLOCK=1&lt;BR /&gt;KMP_PLAIN_BARRIER='2,2'&lt;BR /&gt;KMP_PLAIN_BARRIER_PATTERN='hyper,hyper'&lt;BR /&gt;KMP_REDUCTION_BARRIER='1,1'&lt;BR /&gt;KMP_REDUCTION_BARRIER_PATTERN='hyper,hyper'&lt;BR /&gt;KMP_SCHEDULE='static,balanced;guided,iterative'&lt;BR /&gt;KMP_SETTINGS=true&lt;BR /&gt;KMP_SPIN_BACKOFF_PARAMS='4096,100'&lt;BR /&gt;KMP_STACKOFFSET=64&lt;BR /&gt;KMP_STACKPAD=0&lt;BR /&gt;KMP_STACKSIZE=8M&lt;BR /&gt;KMP_STORAGE_MAP=false&lt;BR /&gt;KMP_TASKING=2&lt;BR /&gt;KMP_TASKLOOP_MIN_TASKS=0&lt;BR /&gt;KMP_TASK_STEALING_CONSTRAINT=1&lt;BR /&gt;KMP_TEAMS_THREAD_LIMIT=32&lt;BR /&gt;KMP_TOPOLOGY_METHOD=all&lt;BR /&gt;KMP_USER_LEVEL_MWAIT=false&lt;BR /&gt;KMP_USE_YIELD=1&lt;BR /&gt;KMP_VERSION=false&lt;BR /&gt;KMP_WARNINGS=true&lt;BR /&gt;OMP_AFFINITY_FORMAT='OMP: pid %P tid %i thread %n bound to OS proc set 
{%A}'&lt;BR /&gt;OMP_ALLOCATOR=omp_default_mem_alloc&lt;BR /&gt;OMP_CANCELLATION=false&lt;BR /&gt;OMP_DEBUG=disabled&lt;BR /&gt;OMP_DEFAULT_DEVICE=0&lt;BR /&gt;OMP_DISPLAY_AFFINITY=false&lt;BR /&gt;OMP_DISPLAY_ENV=false&lt;BR /&gt;OMP_DYNAMIC=false&lt;BR /&gt;OMP_MAX_ACTIVE_LEVELS=1&lt;BR /&gt;OMP_MAX_TASK_PRIORITY=0&lt;BR /&gt;OMP_NESTED: deprecated; max-active-levels-var=1&lt;BR /&gt;OMP_NUM_TEAMS=0&lt;BR /&gt;OMP_NUM_THREADS: value is not defined&lt;BR /&gt;OMP_PLACES='threads'&lt;BR /&gt;OMP_PROC_BIND='intel'&lt;BR /&gt;OMP_SCHEDULE='static'&lt;BR /&gt;OMP_STACKSIZE=8M&lt;BR /&gt;OMP_TARGET_OFFLOAD=DEFAULT&lt;BR /&gt;OMP_TEAMS_THREAD_LIMIT=0&lt;BR /&gt;OMP_THREAD_LIMIT=2147483647&lt;BR /&gt;OMP_TOOL=enabled&lt;BR /&gt;OMP_TOOL_LIBRARIES: value is not defined&lt;BR /&gt;OMP_TOOL_VERBOSE_LOAD=disabled&lt;BR /&gt;OMP_WAIT_POLICY=PASSIVE&lt;BR /&gt;KMP_AFFINITY='verbose,warnings,respect,granularity=thread,compact,1,0'&lt;/P&gt;
&lt;P&gt;2021-05-12 18:10:11.273100: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2599980000 Hz&lt;BR /&gt;2021-05-12 18:10:11.277218: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x5583f4f889d0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:&lt;BR /&gt;2021-05-12 18:10:11.277242: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version&lt;BR /&gt;2021-05-12 18:10:11.277396: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.&lt;BR /&gt;2021-05-12 18:10:11.291073: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job worker -&amp;gt; {0 -&amp;gt; c1:27481, 1 -&amp;gt; c2:27252}&lt;BR /&gt;2021-05-12 18:10:11.295355: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:405] Started server with target: grpc://c1:27481&lt;BR /&gt;2021-05-12 18:10:11.780776: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session started.&lt;BR /&gt;Model: "sequential"&lt;BR /&gt;_________________________________________________________________&lt;BR /&gt;Layer (type) Output Shape Param # &lt;BR /&gt;=================================================================&lt;BR /&gt;conv2d (Conv2D) (None, 26, 26, 32) 320 &lt;BR /&gt;_________________________________________________________________&lt;BR /&gt;max_pooling2d (MaxPooling2D) (None, 13, 13, 32) 0 &lt;BR /&gt;_________________________________________________________________&lt;BR /&gt;conv2d_1 (Conv2D) (None, 11, 11, 64) 18496 &lt;BR /&gt;_________________________________________________________________&lt;BR /&gt;max_pooling2d_1 (MaxPooling2 (None, 5, 5, 64) 0 &lt;BR /&gt;_________________________________________________________________&lt;BR /&gt;conv2d_2 (Conv2D) (None, 3, 3, 64) 36928 &lt;BR 
/&gt;_________________________________________________________________&lt;BR /&gt;flatten (Flatten) (None, 576) 0 &lt;BR /&gt;_________________________________________________________________&lt;BR /&gt;dense (Dense) (None, 64) 36928 &lt;BR /&gt;_________________________________________________________________&lt;BR /&gt;dense_1 (Dense) (None, 10) 650 &lt;BR /&gt;=================================================================&lt;BR /&gt;Total params: 93,322&lt;BR /&gt;Trainable params: 93,322&lt;BR /&gt;Non-trainable params: 0&lt;BR /&gt;_________________________________________________________________&lt;BR /&gt;WARNING:tensorflow:`eval_fn` is not passed in. The `worker_fn` will be used if an "evaluator" task exists in the cluster.&lt;BR /&gt;WARNING:tensorflow:`eval_strategy` is not passed in. No distribution strategy will be used for evaluation.&lt;BR /&gt;2021-05-12 18:10:43.333950: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:521] In AUTO-mode, and switching to DATA-based sharding, instead of FILE-based sharding as we cannot find appropriate reader dataset op(s) to shard. Error: Did not find a shardable source, walked to a node which is not a dataset: name: "FlatMapDataset/_9"&lt;BR /&gt;op: "FlatMapDataset"&lt;BR /&gt;input: "PrefetchDataset/_8"&lt;BR /&gt;attr {&lt;BR /&gt;key: "Targuments"&lt;BR /&gt;value {&lt;BR /&gt;list {&lt;BR /&gt;}&lt;BR /&gt;}&lt;BR /&gt;}&lt;BR /&gt;attr {&lt;BR /&gt;key: "f"&lt;BR /&gt;value {&lt;BR /&gt;func {&lt;BR /&gt;name: "__inference_Dataset_flat_map_slice_batch_indices_245"&lt;BR /&gt;}&lt;BR /&gt;}&lt;BR /&gt;}&lt;BR /&gt;attr {&lt;BR /&gt;key: "output_shapes"&lt;BR /&gt;value {&lt;BR /&gt;list {&lt;BR /&gt;shape {&lt;BR /&gt;dim {&lt;BR /&gt;size: -1&lt;BR /&gt;}&lt;BR /&gt;}&lt;BR /&gt;}&lt;BR /&gt;}&lt;BR /&gt;}&lt;BR /&gt;attr {&lt;BR /&gt;key: "output_types"&lt;BR /&gt;value {&lt;BR /&gt;list {&lt;BR /&gt;type: DT_INT64&lt;BR /&gt;}&lt;BR /&gt;}&lt;BR /&gt;}&lt;BR /&gt;. 
Consider either turning off auto-sharding or switching the auto_shard_policy to DATA to shard this dataset. You can do this by creating a new `tf.data.Options()` object then setting `options.experimental_distribute.auto_shard_policy = AutoShardPolicy.DATA` before applying the options object to the dataset via `dataset.with_options(options)`.&lt;BR /&gt;Epoch 1/10&lt;BR /&gt;WARNING:tensorflow:From /home/hpcadmin/wj/intel_tf2.4.1_pip/lib/python3.7/site-packages/tensorflow/python/data/ops/multi_device_iterator_ops.py:601: get_next_as_optional (from tensorflow.python.data.ops.iterator_ops) is deprecated and will be removed in a future version.&lt;BR /&gt;Instructions for updating:&lt;BR /&gt;Use `tf.data.Iterator.get_next_as_optional()` instead.&lt;BR /&gt;2021-05-12 18:10:46.623967: E tensorflow/core/common_runtime/ring_alg.cc:274] Aborting RingReduce with Cancelled: [_Derived_]Cancelled&lt;BR /&gt;Additional GRPC error information from remote target /job:worker/replica:0/task:1:&lt;BR /&gt;:{"created":"@1620814246.623910026","description":"Error received from peer ipv4:10.240.212.98:27252","file":"external/com_github_grpc_grpc/src/core/lib/surface/call.cc","file_line":1056,"grpc_message":"Cancelled","grpc_status":1}&lt;BR /&gt;2021-05-12 18:10:46.624121: W tensorflow/core/framework/op_kernel.cc:1767] OP_REQUIRES failed at collective_ops.cc:257 : Cancelled: [_Derived_]Cancelled&lt;BR /&gt;Additional GRPC error information from remote target /job:worker/replica:0/task:1:&lt;BR /&gt;:{"created":"@1620814246.623910026","description":"Error received from peer ipv4:10.240.212.98:27252","file":"external/com_github_grpc_grpc/src/core/lib/surface/call.cc","file_line":1056,"grpc_message":"Cancelled","grpc_status":1}&lt;BR /&gt;Traceback (most recent call last):&lt;BR /&gt;File "/home/hpcadmin/lico-demo/ai/tensorflow2/mnist_multi_worker_strategy.py", line 147, in &amp;lt;module&amp;gt;&lt;BR /&gt;main()&lt;BR /&gt;File 
"/home/hpcadmin/lico-demo/ai/tensorflow2/mnist_multi_worker_strategy.py", line 143, in main&lt;BR /&gt;app.train(args, strategy)&lt;BR /&gt;File "/home/hpcadmin/lico-demo/ai/tensorflow2/mnist_multi_worker_strategy.py", line 98, in train&lt;BR /&gt;validation_data=(self.data.test_images, self.data.test_labels))&lt;BR /&gt;File "/home/hpcadmin/wj/intel_tf2.4.1_pip/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py", line 117, in _method_wrapper&lt;BR /&gt;mode=dc.CoordinatorMode.INDEPENDENT_WORKER)&lt;BR /&gt;File "/home/hpcadmin/wj/intel_tf2.4.1_pip/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_coordinator.py", line 860, in run_distribute_coordinator&lt;BR /&gt;task_id, session_config, rpc_layer)&lt;BR /&gt;File "/home/hpcadmin/wj/intel_tf2.4.1_pip/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_coordinator.py", line 360, in _run_single_worker&lt;BR /&gt;return worker_fn(strategy)&lt;BR /&gt;File "/home/hpcadmin/wj/intel_tf2.4.1_pip/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py", line 115, in &amp;lt;lambda&amp;gt;&lt;BR /&gt;lambda _: method(self, *args, **kwargs),&lt;BR /&gt;File "/home/hpcadmin/wj/intel_tf2.4.1_pip/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py", line 1098, in fit&lt;BR /&gt;tmp_logs = train_function(iterator)&lt;BR /&gt;File "/home/hpcadmin/wj/intel_tf2.4.1_pip/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py", line 780, in __call__&lt;BR /&gt;result = self._call(*args, **kwds)&lt;BR /&gt;File "/home/hpcadmin/wj/intel_tf2.4.1_pip/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py", line 840, in _call&lt;BR /&gt;return self._stateless_fn(*args, **kwds)&lt;BR /&gt;File "/home/hpcadmin/wj/intel_tf2.4.1_pip/lib/python3.7/site-packages/tensorflow/python/eager/function.py", line 2829, in __call__&lt;BR /&gt;return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access&lt;BR /&gt;File 
"/home/hpcadmin/wj/intel_tf2.4.1_pip/lib/python3.7/site-packages/tensorflow/python/eager/function.py", line 1848, in _filtered_call&lt;BR /&gt;cancellation_manager=cancellation_manager)&lt;BR /&gt;File "/home/hpcadmin/wj/intel_tf2.4.1_pip/lib/python3.7/site-packages/tensorflow/python/eager/function.py", line 1924, in _call_flat&lt;BR /&gt;ctx, args, cancellation_manager=cancellation_manager))&lt;BR /&gt;File "/home/hpcadmin/wj/intel_tf2.4.1_pip/lib/python3.7/site-packages/tensorflow/python/eager/function.py", line 550, in call&lt;BR /&gt;ctx=ctx)&lt;BR /&gt;File "/home/hpcadmin/wj/intel_tf2.4.1_pip/lib/python3.7/site-packages/tensorflow/python/eager/execute.py", line 60, in quick_execute&lt;BR /&gt;inputs, attrs, num_outputs)&lt;BR /&gt;tensorflow.python.framework.errors_impl.InvalidArgumentError: Upper bound check fail for input 5 from node Mkl2Tf/_47 to node scoped_allocator_concat_1_8 input bounds = [0x7f602004fc40, 0x7f602004fd40] backing_tensor bounds = [0x7f5d11a56540, 0x7f5d11ab1780]&lt;BR /&gt;[[{{node scoped_allocator_concat_1_8}}]] [Op:__inference_train_function_1152]&lt;/P&gt;
&lt;P&gt;Function call stack:&lt;BR /&gt;train_function&lt;/P&gt;
&lt;P&gt;2021-05-12 18:10:46.797979: W tensorflow/core/common_runtime/eager/context.cc:566] Unable to destroy server_ object, so releasing instead. Servers don't support clean shutdown.&lt;/P&gt;
&lt;P&gt;tensorflow.python.framework.errors_impl.InvalidArgumentError: Upper bound check fail for input 5 from node Mkl2Tf/_47 to node scoped_allocator_concat_1_8 input bounds = [0x7f602004fc40, 0x7f602004fd40] backing_tensor bounds = [0x7f5d11a56540, 0x7f5d11ab1780]&lt;/P&gt;</description>
      <pubDate>Wed, 12 May 2021 10:36:53 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Optimized-AI-Frameworks/mnist-multi-worker-strategy-distribution-error/m-p/1280839#M221</guid>
      <dc:creator>wangjian</dc:creator>
      <dc:date>2021-05-12T10:36:53Z</dc:date>
    </item>
    <item>
      <title>Re:mnist_multi_worker_strategy distribution error</title>
      <link>https://community.intel.com/t5/Intel-Optimized-AI-Frameworks/mnist-multi-worker-strategy-distribution-error/m-p/1280854#M222</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;Thanks for posting in Intel Forums. We will try to reproduce your issue from our side and let you know the updates.&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;Thanks,&lt;/P&gt;&lt;P&gt;Rahul&lt;/P&gt;&lt;BR /&gt;</description>
      <pubDate>Wed, 12 May 2021 12:33:06 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Optimized-AI-Frameworks/mnist-multi-worker-strategy-distribution-error/m-p/1280854#M222</guid>
      <dc:creator>RahulU_Intel</dc:creator>
      <dc:date>2021-05-12T12:33:06Z</dc:date>
    </item>
    <item>
      <title>Re:mnist_multi_worker_strategy distribution error</title>
      <link>https://community.intel.com/t5/Intel-Optimized-AI-Frameworks/mnist-multi-worker-strategy-distribution-error/m-p/1281846#M224</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;We tried the command you sent, but we could not reproduce the error. Can you reconfirm the command you sent to execute the python file?&lt;/P&gt;&lt;P&gt;Also can you attach the python file so that it will be easier for us to execute the same?&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;Thanks&lt;/P&gt;&lt;P&gt;Rahul&lt;/P&gt;&lt;BR /&gt;</description>
      <pubDate>Mon, 17 May 2021 07:13:50 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Optimized-AI-Frameworks/mnist-multi-worker-strategy-distribution-error/m-p/1281846#M224</guid>
      <dc:creator>RahulU_Intel</dc:creator>
      <dc:date>2021-05-17T07:13:50Z</dc:date>
    </item>
    <item>
      <title>Re:mnist_multi_worker_strategy distribution error</title>
      <link>https://community.intel.com/t5/Intel-Optimized-AI-Frameworks/mnist-multi-worker-strategy-distribution-error/m-p/1282350#M225</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;We tried executing your code using Intel optimized tensorflow(2.2.0), but we were not able to reproduce the error. Can you attach the python script as a file you used to execute the same?&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;Thanks&lt;/P&gt;&lt;P&gt;Rahul&lt;/P&gt;&lt;BR /&gt;</description>
      <pubDate>Tue, 18 May 2021 13:14:29 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Optimized-AI-Frameworks/mnist-multi-worker-strategy-distribution-error/m-p/1282350#M225</guid>
      <dc:creator>RahulU_Intel</dc:creator>
      <dc:date>2021-05-18T13:14:29Z</dc:date>
    </item>
    <item>
      <title>Re:mnist_multi_worker_strategy distribution error</title>
      <link>https://community.intel.com/t5/Intel-Optimized-AI-Frameworks/mnist-multi-worker-strategy-distribution-error/m-p/1283157#M228</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN style="font-size: 11pt;"&gt;We looked into your case. It’s a known bug in the multiple worker model. It has been fixed in the latest Tensorflow. Stock TF 2.5.0 has included the fix.&lt;/SPAN&gt;&lt;SPAN style="font-size: 12pt;"&gt; &lt;/SPAN&gt;&lt;SPAN style="font-size: 11pt;"&gt;You could install Google's official TF 2.5.0, and enable Intel optimization of Tensorflow by setting the environment variable &lt;/SPAN&gt;&lt;SPAN style="font-size: 10pt;"&gt;TF_ENABLE_ONEDNN_OPTS=1&lt;/SPAN&gt;&lt;SPAN style="font-size: 11pt;"&gt;. Can you try executing your code by upgrading your Tensorflow and let us know your results?&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;Thanks&lt;/P&gt;&lt;P&gt;Rahul&lt;/P&gt;&lt;BR /&gt;</description>
      <pubDate>Thu, 20 May 2021 11:51:48 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Optimized-AI-Frameworks/mnist-multi-worker-strategy-distribution-error/m-p/1283157#M228</guid>
      <dc:creator>RahulU_Intel</dc:creator>
      <dc:date>2021-05-20T11:51:48Z</dc:date>
    </item>
    <item>
      <title>Re: mnist_multi_worker_strategy distribution error</title>
      <link>https://community.intel.com/t5/Intel-Optimized-AI-Frameworks/mnist-multi-worker-strategy-distribution-error/m-p/1283411#M229</link>
      <description>&lt;P&gt;Yes, I have executed the code under TF 2.5.0 successfully. Thank you for your support. BTW, I want to know: do you have a plan to merge this fix into Intel optimized TF?&lt;/P&gt;</description>
      <pubDate>Fri, 21 May 2021 03:24:39 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Optimized-AI-Frameworks/mnist-multi-worker-strategy-distribution-error/m-p/1283411#M229</guid>
      <dc:creator>wangjian</dc:creator>
      <dc:date>2021-05-21T03:24:39Z</dc:date>
    </item>
    <item>
      <title>Re:mnist_multi_worker_strategy distribution error</title>
      <link>https://community.intel.com/t5/Intel-Optimized-AI-Frameworks/mnist-multi-worker-strategy-distribution-error/m-p/1285011#M231</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;We are glad that your issue is resolved. For the multiple worker crash issue, Intel Optimized Tensorflow 2.5.0 has included the fix. You can use the command "pip install intel-tensorflow==2.5.0" to install the latest Intel Optimized Tensorflow.&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt; Thanks and Regards&lt;/P&gt;&lt;P&gt; Rahul&lt;/P&gt;&lt;BR /&gt;</description>
      <pubDate>Thu, 27 May 2021 06:11:55 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Optimized-AI-Frameworks/mnist-multi-worker-strategy-distribution-error/m-p/1285011#M231</guid>
      <dc:creator>RahulU_Intel</dc:creator>
      <dc:date>2021-05-27T06:11:55Z</dc:date>
    </item>
    <item>
      <title>Re:mnist_multi_worker_strategy distribution error</title>
      <link>https://community.intel.com/t5/Intel-Optimized-AI-Frameworks/mnist-multi-worker-strategy-distribution-error/m-p/1287077#M233</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;We haven't heard back anything from you. Could you please confirm if the issue is resolved?&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;Thanks&lt;/P&gt;&lt;BR /&gt;</description>
      <pubDate>Fri, 04 Jun 2021 04:11:57 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Optimized-AI-Frameworks/mnist-multi-worker-strategy-distribution-error/m-p/1287077#M233</guid>
      <dc:creator>RahulU_Intel</dc:creator>
      <dc:date>2021-06-04T04:11:57Z</dc:date>
    </item>
    <item>
      <title>Re:mnist_multi_worker_strategy distribution error</title>
      <link>https://community.intel.com/t5/Intel-Optimized-AI-Frameworks/mnist-multi-worker-strategy-distribution-error/m-p/1289502#M236</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;I have not heard back from you, so I will close this inquiry now. If you need further assistance, please post a new question.&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;Thanks and Regards&lt;/P&gt;&lt;BR /&gt;</description>
      <pubDate>Mon, 14 Jun 2021 03:44:44 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Optimized-AI-Frameworks/mnist-multi-worker-strategy-distribution-error/m-p/1289502#M236</guid>
      <dc:creator>RahulU_Intel</dc:creator>
      <dc:date>2021-06-14T03:44:44Z</dc:date>
    </item>
    <item>
      <title>Re: mnist_multi_worker_strategy distribution error</title>
      <link>https://community.intel.com/t5/Intel-Optimized-AI-Frameworks/mnist-multi-worker-strategy-distribution-error/m-p/1293132#M247</link>
      <description>&lt;P&gt;I have no problem. Thank you very much.&lt;/P&gt;</description>
      <pubDate>Fri, 25 Jun 2021 03:00:18 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Optimized-AI-Frameworks/mnist-multi-worker-strategy-distribution-error/m-p/1293132#M247</guid>
      <dc:creator>wangjian</dc:creator>
      <dc:date>2021-06-25T03:00:18Z</dc:date>
    </item>
  </channel>
</rss>

