alex@thor|~/mpi_mic> setenv I_MPI_MIC 1
alex@thor|~/mpi_mic> setenv I_MPI_MIC_PREFIX $I_MPI_ROOT/mic/bin/
alex@thor|~/mpi_mic> setenv I_MPI_DEBUG 100
alex@thor|~/mpi_mic> setenv I_MPI_FABRICS shm:tcp
alex@thor|~/mpi_mic> mpirun -ppn 1 -n 2 -hosts thor,mic0 IMB-MPI1 pingpong
[0] MPI startup(): Intel(R) MPI Library, Version 5.0 Update 3  Build 20150128 (build id: 11250)
[0] MPI startup(): Copyright (C) 2003-2015 Intel Corporation.  All rights reserved.
[0] MPI startup(): Multi-threaded optimized library
[0] MPID_nem_impi_create_numa_nodes_map(): NUMA map->self_id = 0
[0] MPID_nem_impi_create_numa_nodes_map(): NUMA map->devices_num = 2
[0] MPID_nem_impi_create_numa_nodes_map(): NUMA map->nodes_num = 2
[0] MPID_nem_impi_create_numa_nodes_map(): NUMA map->devices:
[0] MPID_nem_impi_create_numa_nodes_map():     mic0:1
[0] MPID_nem_impi_create_numa_nodes_map():     mic1:1
[0] MPID_nem_impi_create_numa_nodes_map(): NUMA map->distances:
[0] MPID_nem_impi_create_numa_nodes_map():     0 -> 0 = 10
[0] MPID_nem_impi_create_numa_nodes_map():     0 -> 1 = 21
[1] MPID_nem_impi_create_numa_nodes_map(): Fetching extra numa information from /etc/ofed-mic.map
[0] MPI startup(): shm and tcp data transfer modes
[1] MPI startup(): shm and tcp data transfer modes
[1] MPI startup(): Recognition mode: 2, selected platform: 64 own platform: 64
[0] MPI startup(): Recognition mode: 2, selected platform: 64 own platform: 16
[0] MPI startup(): Device_reset_idx=1
[0] MPI startup(): Allgather: 1: 0-2147483647 & 0-2
[0] MPI startup(): Allgather: 1: 0-8192 & 0-2147483647
[0] MPI startup(): Allgather: 1: 0-131072 & 0-4
[0] MPI startup(): Allgather: 3: 0-2147483647 & 0-2147483647
[0] MPI startup(): Allgatherv: 0: 0-2147483647 & 0-2147483647
[0] MPI startup(): Allreduce: 0: 4194300-2147483647 & 0-3
[0] MPI startup(): Allreduce: 1: 0-1024 & 0-2147483647
[0] MPI startup(): Allreduce: 1: 0-2147483647 & 0-2
[0] MPI startup(): Allreduce: 1: 0-16384 & 0-4
[0] MPI startup(): Allreduce: 1: 0-8182 & 0-8
[0] MPI startup(): Allreduce: 1: 0-4096 & 0-16
[0] MPI startup(): Allreduce: 2: 0-2147483647 & 0-2147483647
[0] MPI startup(): Alltoall: 1: 0-16 & 9-2147483647
[0] MPI startup(): Alltoall: 1: 0-32 & 17-2147483647
[0] MPI startup(): Alltoall: 1: 4097-16384 & 0-2
[0] MPI startup(): Alltoall: 2: 0-2147483647 & 0-2
[0] MPI startup(): Alltoall: 2: 0-8192 & 0-2147483647
[0] MPI startup(): Alltoall: 2: 0-32768 & 0-16
[0] MPI startup(): Alltoall: 2: 0-262144 & 0-8
[0] MPI startup(): Alltoall: 2: 1048576-2147483647 & 0-4
[0] MPI startup(): Alltoall: 4: 32768-2147483647 & 3-16
[0] MPI startup(): Alltoall: 3: 0-2147483647 & 0-2147483647
[0] MPI startup(): Alltoallv: 1: 0-2147483647 & 0-2147483647
[0] MPI startup(): Alltoallw: 0: 0-2147483647 & 0-2147483647
[0] MPI startup(): Barrier: 1: 0-2147483647 & 0-2
[0] MPI startup(): Barrier: 2: 0-2147483647 & 0-2147483647
[0] MPI startup(): Bcast: 1: 0-2147483647 & 0-2
[0] MPI startup(): Bcast: 1: 0-1024 & 0-2147483647
[0] MPI startup(): Bcast: 1: 0-8192 & 0-4
[0] MPI startup(): Bcast: 7: 0-2147483647 & 0-2147483647
[0] MPI startup(): Exscan: 0: 0-2147483647 & 0-2147483647
[0] MPI startup(): Gather: 1: 0-1048576 & 0-2
[0] MPI startup(): Gather: 1: 262145-1048576 & 9-16
[0] MPI startup(): Gather: 3: 0-1024 & 0-2147483647
[0] MPI startup(): Gather: 0: 0-2147483647 & 0-2147483647
[0] MPI startup(): Gatherv: 1: 0-2147483647 & 0-2147483647
[0] MPI startup(): Reduce_scatter: 1: 0-32768 & 0-2147483647
[0] MPI startup(): Reduce_scatter: 1: 0-65536 & 0-2
[0] MPI startup(): Reduce_scatter: 1: 0-65536 & 9-2147483647
[0] MPI startup(): Reduce_scatter: 2: 0-2147483647 & 0-2147483647
[0] MPI startup(): Reduce: 1: 0-2147483647 & 0-2147483647
[0] MPI startup(): Scan: 0: 0-2147483647 & 0-2147483647
[0] MPI startup(): Scatter: 1: 0-524288 & 0-2
[0] MPI startup(): Scatter: 3: 0-1024 & 0-2147483647
[0] MPI startup(): Scatter: 2: 0-2147483647 & 0-2147483647
[0] MPI startup(): Scatterv: 1: 0-2147483647 & 0-2147483647
[0] MPI startup(): Rank    Pid      Node name  Pin cpu
[0] MPI startup(): 0       25433    thor       {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23}
[0] MPI startup(): 1       7658     thor-mic0  {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227}
[0] MPI startup(): Recognition=2 Platform(code=64 ippn=0 dev=5) Fabric(intra=1 inter=6 flags=0x0)
[0] MPI startup(): Topology split mode = 1
| rank | node | space=2
|    0 |    0 |
|    1 |    1 |
[0] MPI startup(): I_MPI_DEBUG=100
[0] MPI startup(): I_MPI_FABRICS=shm:tcp
[1] MPI startup(): Recognition=2 Platform(code=64 ippn=0 dev=5) Fabric(intra=1 inter=6 flags=0x0)
[0] MPI startup(): I_MPI_INFO_BRAND=Intel(R) Xeon(R)
[0] MPI startup(): I_MPI_INFO_CACHE1=0,1,2,3,4,5,16,17,18,19,20,21,0,1,2,3,4,5,16,17,18,19,20,21
[0] MPI startup(): I_MPI_INFO_CACHE2=0,1,2,3,4,5,16,17,18,19,20,21,0,1,2,3,4,5,16,17,18,19,20,21
[0] MPI startup(): I_MPI_INFO_CACHE3=0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,1,1,1,1,1,1
[0] MPI startup(): I_MPI_INFO_CACHES=3
[0] MPI startup(): I_MPI_INFO_CACHE_SHARE=2,2,32
[0] MPI startup(): I_MPI_INFO_CACHE_SIZE=32768,262144,15728640
[0] MPI startup(): I_MPI_INFO_CORE=0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3,4,5
[0] MPI startup(): I_MPI_INFO_C_NAME=Unknown
[0] MPI startup(): I_MPI_INFO_DESC=1342177285
[0] MPI startup(): I_MPI_INFO_FLGB=641
[0] MPI startup(): I_MPI_INFO_FLGC=2143216639
[0] MPI startup(): I_MPI_INFO_FLGD=-1075053569
[0] MPI startup(): I_MPI_INFO_LCPU=24
[0] MPI startup(): I_MPI_INFO_MODE=775
[0] MPI startup(): I_MPI_INFO_NUMA_NODE_DIST=10,21,21,10
[0] MPI startup(): I_MPI_INFO_NUMA_NODE_MAP=mic0:1,mic1:1
[0] MPI startup(): I_MPI_INFO_NUMA_NODE_NUM=2
[0] MPI startup(): I_MPI_INFO_PACK=0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,1,1,1,1,1,1
[0] MPI startup(): I_MPI_INFO_SERIAL=E5-2620 v2
[0] MPI startup(): I_MPI_INFO_SIGN=198372
[0] MPI startup(): I_MPI_INFO_STATE=0
[0] MPI startup(): I_MPI_INFO_THREAD=0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1
[0] MPI startup(): I_MPI_INFO_VEND=1
[0] MPI startup(): I_MPI_MIC=1
[0] MPI startup(): I_MPI_PIN_INFO=x0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
[0] MPI startup(): I_MPI_PIN_MAPPING=1:0 0
benchmarks to run pingpong
#------------------------------------------------------------
#    Intel (R) MPI Benchmarks 4.0 Update 1, MPI-1 part
#------------------------------------------------------------
# Date                  : Mon Jan 18 13:14:59 2016
# Machine               : x86_64
# System                : Linux
# Release               : 3.10.0-327.4.4.el7.x86_64
# Version               : #1 SMP Tue Jan 5 16:07:00 UTC 2016
# MPI Version           : 3.0
# MPI Thread Environment:

# New default behavior from Version 3.2 on:
# the number of iterations per message size is cut down
# dynamically when a certain run time (per message size sample)
# is expected to be exceeded. Time limit is defined by variable
# "SECS_PER_SAMPLE" (=> IMB_settings.h)
# or through the flag => -time

# Calling sequence was:
# IMB-MPI1 pingpong

# Minimum message length in bytes:   0
# Maximum message length in bytes:   4194304
#
# MPI_Datatype                   :   MPI_BYTE
# MPI_Datatype for reductions    :   MPI_FLOAT
# MPI_Op                         :   MPI_SUM
#
#

# List of Benchmarks to run:
# PingPong

#---------------------------------------------------
# Benchmarking PingPong
# #processes = 2
#---------------------------------------------------
       #bytes #repetitions      t[usec]   Mbytes/sec
            0         1000       215.20         0.00
            1         1000       185.85         0.01
            2         1000       126.51         0.02
            4         1000       179.30         0.02
            8         1000       128.98         0.06
           16         1000       158.92         0.10
           32         1000       130.78         0.23
           64         1000       186.40         0.33
          128         1000       261.80         0.47
          256         1000       270.87         0.90
          512         1000       248.77         1.96
         1024         1000       129.55         7.54
         2048         1000       134.49        14.52
         4096         1000       136.24        28.67
         8192         1000       162.11        48.19
        16384         1000       197.71        79.03
        32768         1000       281.52       111.00
        65536          640       399.17       156.58
       131072          320       617.67       202.37
       262144          160      1761.63       141.91
       524288           80      2908.96       171.88
      1048576           40      4502.54       222.10
      2097152           20      7349.97       272.11
      4194304           10     12400.15       322.58

# All processes entering MPI_Finalize
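For reference, the communication pattern behind each row of the table is a simple ping-pong between the two ranks: rank 0 sends a message of the given size to rank 1, rank 1 sends it back, and the reported t[usec] is half the average round-trip time. Note that IMB counts Mbytes as 2^20 bytes, which is why the last row works out to 4194304 B / 2^20 = 4 Mbytes over 12400.15 usec, i.e. 322.58 Mbytes/sec. The code below is a minimal sketch of that pattern for a single message size, not the IMB source; MSG_BYTES and REPS are illustrative constants chosen here, and IMB additionally sweeps all sizes from 0 to 4194304 bytes and scales down the repetition count for large messages.

/* Minimal ping-pong sketch of what IMB-MPI1 "pingpong" measures.
 * Assumes exactly 2 ranks; MSG_BYTES and REPS are illustrative. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

#define MSG_BYTES 4096   /* one of the message sizes IMB sweeps */
#define REPS      1000   /* IMB uses 1000 repetitions at small sizes */

int main(int argc, char **argv)
{
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    if (size != 2) {
        if (rank == 0) fprintf(stderr, "run with exactly 2 ranks\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    char *buf = malloc(MSG_BYTES);

    MPI_Barrier(MPI_COMM_WORLD);
    double t0 = MPI_Wtime();
    for (int i = 0; i < REPS; i++) {
        if (rank == 0) {            /* rank 0: send, then wait for the echo */
            MPI_Send(buf, MSG_BYTES, MPI_BYTE, 1, 0, MPI_COMM_WORLD);
            MPI_Recv(buf, MSG_BYTES, MPI_BYTE, 1, 0, MPI_COMM_WORLD,
                     MPI_STATUS_IGNORE);
        } else {                    /* rank 1: receive, then echo back */
            MPI_Recv(buf, MSG_BYTES, MPI_BYTE, 0, 0, MPI_COMM_WORLD,
                     MPI_STATUS_IGNORE);
            MPI_Send(buf, MSG_BYTES, MPI_BYTE, 0, 0, MPI_COMM_WORLD);
        }
    }
    double t1 = MPI_Wtime();

    if (rank == 0) {
        /* half round trip in seconds, as IMB reports it */
        double t_half = (t1 - t0) / (2.0 * REPS);
        /* IMB's Mbytes/sec uses 2^20-byte Mbytes */
        printf("%d bytes: t = %.2f usec, %.2f Mbytes/sec\n",
               MSG_BYTES, t_half * 1e6, (MSG_BYTES / 1048576.0) / t_half);
    }

    free(buf);
    MPI_Finalize();
    return 0;
}

Built for both architectures (e.g. with mpiicc, once with and once without -mmic) and launched the same way as IMB above, mpirun -ppn 1 -n 2 -hosts thor,mic0 ./pingpong, this reproduces the host-to-coprocessor exchange that the benchmark times.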