Community
cancel
Showing results for 
Search instead for 
Did you mean: 
alexkvashnin
Beginner
118 Views

InfiniBand doesn't work when using Intel MPI

I have tried to run a parallel program using the Intel MPI Library, but when I started it like this:
mpirun -hosts node01,node02 -np 36 ./prog
it terminated without starting, with the following error message:
node02-ib0:56fa:17a5a440: 705 us(705 us):  open_hca: get lid ERR for mlx4_0 port=2, err=Invalid argument
node01-ib0:4bd1:ceb2a440: 590 us(590 us):  open_hca: get lid ERR for mlx4_0 port=2, err=Invalid argument
node02-ib0:56fa:17a5a440: 522 us(522 us):  open_hca: getaddr_netdev ERROR: No such device. Is ib1 configured?
node02-ib0:56fa:17a5a440: 7180 us(6475 us):  open_hca: device mthca0 not found
node02-ib0:56fa:17a5a440: 7403 us(223 us):  open_hca: device mthca0 not found
node02-ib0:56fa:17a5a440: 7613 us(210 us):  open_hca: device ipath0 not found
node02-ib0:56fa:17a5a440: 7810 us(197 us):  open_hca: device ipath0 not found
node02-ib0:56fa:17a5a440: 8128 us(318 us):  open_hca: device ehca0 not found
node02-ib0:56fa:17a5a440: 1994 us(1472 us):  open_hca: getaddr_netdev ERROR: No such device. Is eth2 configured?
node01-ib0:4bd1:ceb2a440: 596 us(596 us):  open_hca: getaddr_netdev ERROR: No such device. Is ib1 configured?
node01-ib0:4bd1:ceb2a440: 7904 us(7314 us):  open_hca: device mthca0 not found
node01-ib0:4bd1:ceb2a440: 8221 us(317 us):  open_hca: device mthca0 not found
node01-ib0:4bd1:ceb2a440: 8524 us(303 us):  open_hca: device ipath0 not found
node02-ib0:56fa:17a5a440: 869 us(869 us):  ucm_create_services: ERR Cannot allocate memory
node01-ib0:4bd1:ceb2a440: 8784 us(260 us):  open_hca: device ipath0 not found
node01-ib0:4bd1:ceb2a440: 9042 us(258 us):  open_hca: device ehca0 not found
node01-ib0:4bd1:ceb2a440: 2440 us(1844 us):  open_hca: getaddr_netdev ERROR: No such device. Is eth2 configured?
node01-ib0:4bd1:ceb2a440: 859 us(859 us):  ucm_create_services: ERR Cannot allocate memory
APPLICATION TERMINATED WITH THE EXIT STRING: Hangup (signal 1)
The InfiniBand devices work, and everything seems to be OK. The people I asked said that this error is related to permissions on the devices, because when I ran it as root it worked well, but when I ran it as a regular user I got this error. Could you please help me?
[root@tisnum-head1 ~]# ibstat
CA 'mlx4_0'
        CA type: MT26428
        Number of ports: 2
        Firmware version: 2.9.1000
        Hardware version: b0
        Node GUID: 0x0002c90300565970
        System image GUID: 0x0002c90300565973
        Port 1:
                State: Down
                Physical state: Polling
                Rate: 70
                Base lid: 0
                LMC: 0
                SM lid: 0
                Capability mask: 0x02510868
                Port GUID: 0x0002c90300565971
                Link layer: InfiniBand
        Port 2:
                State: Active
                Physical state: LinkUp
                Rate: 40
                Base lid: 1
                LMC: 0
                SM lid: 1
                Capability mask: 0x0251086a
                Port GUID: 0x0002c90300565972
                Link layer: InfiniBand
[root@tisnum-head1 ~]# lsmod
Module                  Size  Used by
rdma_ucm               12586  0
ib_ucm                 12255  0
rdma_cm                35175  1 rdma_ucm
iw_cm                   8836  1 rdma_cm
ib_addr                 6321  1 rdma_cm
ib_ipoib               84890  0
ib_cm                  38085  3 ib_ucm,rdma_cm,ib_ipoib
ib_sa                  44401  4 rdma_ucm,rdma_cm,ib_ipoib,ib_cm
ib_uverbs              39637  2 rdma_ucm,ib_ucm
ib_umad                12477  6
iw_nes                192353  0
iw_cxgb3              133047  0
cxgb3                 196233  1 iw_cxgb3
mlx4_ib                80171  0
mlx4_en                97664  0
mlx4_core             185193  2 mlx4_ib,mlx4_en
ib_mthca              141407  0
ib_mad                 40497  5 ib_cm,ib_sa,ib_umad,mlx4_ib,ib_mthca
ib_core                69979  14 rdma_ucm,ib_ucm,rdma_cm,iw_cm,ib_ipoib,ib_cm,ib_sa,ib_uverbs,ib_umad,iw_nes,iw_cxgb3,mlx4_ib,ib_mthca,ib_mad
mpt2sas               173216  0
scsi_transport_sas     35070  1 mpt2sas
raid_class              4804  1 mpt2sas
mptctl                 31976  0
mptbase                93845  1 mptctl
nfsd                  305799  13
lockd                  74270  1 nfsd
nfs_acl                 2647  1 nfsd
auth_rpcgss            44895  1 nfsd
exportfs                4236  1 nfsd
autofs4                26888  3
ipmi_devintf            8049  0
ipmi_si                42401  0
ipmi_msghandler        35992  2 ipmi_devintf,ipmi_si
sunrpc                243758  26 nfsd,lockd,nfs_acl,auth_rpcgss
8021q                  23575  0
garp                    7344  1 8021q
stp                     2173  1 garp
llc                     5642  2 garp,stp
ipv6                  322029  134 ib_addr,ib_ipoib
libcrc32c               1246  1 iw_nes
nls_utf8                1455  1
ext3                  235341  1
jbd                    80337  1 ext3
sg                     30124  0
igb                   157825  0
dca                     7197  1 igb
microcode             112594  0
sr_mod                 16228  0
cdrom                  39771  1 sr_mod
serio_raw               4818  0
amd64_edac_mod         21461  0
edac_core              46773  6 amd64_edac_mod
edac_mce_amd           15488  1 amd64_edac_mod
i2c_piix4              12608  0
i2c_core               31276  1 i2c_piix4
shpchp                 33482  0
ext4                  364410  3
mbcache                 8144  2 ext3,ext4
jbd2                   88738  1 ext4
sd_mod                 39488  7
crc_t10dif              1541  1 sd_mod
usb_storage            49452  0
megaraid_sas           77090  5
ata_generic             3837  0
pata_acpi               3701  0
pata_atiixp             4211  0
ahci                   40455  0
dm_mirror              14101  0
dm_region_hash         12170  1 dm_mirror
dm_log                 10122  2 dm_mirror,dm_region_hash
dm_mod                 81500  2 dm_mirror,dm_log
Thank you!
--
Alexander Kvashnin
0 Kudos
0 Replies
Reply