- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I have tried to run a parallel program using the Intel MPI Library, but when I started it like this:
mpirun -hosts node01,node02 -np 36 ./prog
it exited without actually starting, with the following error messages:
node02-ib0:56fa:17a5a440: 705 us(705 us): open_hca: get lid ERR for mlx4_0 port=2, err=Invalid argument
node01-ib0:4bd1:ceb2a440: 590 us(590 us): open_hca: get lid ERR for mlx4_0 port=2, err=Invalid argument
node02-ib0:56fa:17a5a440: 522 us(522 us): open_hca: getaddr_netdev ERROR: No such device. Is ib1 configured?
node02-ib0:56fa:17a5a440: 7180 us(6475 us): open_hca: device mthca0 not found
node02-ib0:56fa:17a5a440: 7403 us(223 us): open_hca: device mthca0 not found
node02-ib0:56fa:17a5a440: 7613 us(210 us): open_hca: device ipath0 not found
node02-ib0:56fa:17a5a440: 7810 us(197 us): open_hca: device ipath0 not found
node02-ib0:56fa:17a5a440: 8128 us(318 us): open_hca: device ehca0 not found
node02-ib0:56fa:17a5a440: 1994 us(1472 us): open_hca: getaddr_netdev ERROR: No such device. Is eth2 configured?
node01-ib0:4bd1:ceb2a440: 596 us(596 us): open_hca: getaddr_netdev ERROR: No such device. Is ib1 configured?
node01-ib0:4bd1:ceb2a440: 7904 us(7314 us): open_hca: device mthca0 not found
node01-ib0:4bd1:ceb2a440: 8221 us(317 us): open_hca: device mthca0 not found
node01-ib0:4bd1:ceb2a440: 8524 us(303 us): open_hca: device ipath0 not found
node02-ib0:56fa:17a5a440: 869 us(869 us): ucm_create_services: ERR Cannot allocate memory
node01-ib0:4bd1:ceb2a440: 8784 us(260 us): open_hca: device ipath0 not found
node01-ib0:4bd1:ceb2a440: 9042 us(258 us): open_hca: device ehca0 not found
node01-ib0:4bd1:ceb2a440: 2440 us(1844 us): open_hca: getaddr_netdev ERROR: No such device. Is eth2 configured?
node01-ib0:4bd1:ceb2a440: 859 us(859 us): ucm_create_services: ERR Cannot allocate memory
APPLICATION TERMINATED WITH THE EXIT STRING: Hangup (signal 1)
The InfiniBand devices work, and everything seems to be OK. The people I asked said that this error is related to permissions on the devices, because when I ran the program as root it worked well, but when I ran it as a regular user I got this error. Could you please help me?
[root@tisnum-head1 ~]# ibstat
CA 'mlx4_0'
CA type: MT26428
Number of ports: 2
Firmware version: 2.9.1000
Hardware version: b0
Node GUID: 0x0002c90300565970
System image GUID: 0x0002c90300565973
Port 1:
State: Down
Physical state: Polling
Rate: 70
Base lid: 0
LMC: 0
SM lid: 0
Capability mask: 0x02510868
Port GUID: 0x0002c90300565971
Link layer: InfiniBand
Port 2:
State: Active
Physical state: LinkUp
Rate: 40
Base lid: 1
LMC: 0
SM lid: 1
Capability mask: 0x0251086a
Port GUID: 0x0002c90300565972
Link layer: InfiniBand
[root@tisnum-head1 ~]# lsmod
Module Size Used by
rdma_ucm 12586 0
ib_ucm 12255 0
rdma_cm 35175 1 rdma_ucm
iw_cm 8836 1 rdma_cm
ib_addr 6321 1 rdma_cm
ib_ipoib 84890 0
ib_cm 38085 3 ib_ucm,rdma_cm,ib_ipoib
ib_sa 44401 4 rdma_ucm,rdma_cm,ib_ipoib,ib_cm
ib_uverbs 39637 2 rdma_ucm,ib_ucm
ib_umad 12477 6
iw_nes 192353 0
iw_cxgb3 133047 0
cxgb3 196233 1 iw_cxgb3
mlx4_ib 80171 0
mlx4_en 97664 0
mlx4_core 185193 2 mlx4_ib,mlx4_en
ib_mthca 141407 0
ib_mad 40497 5 ib_cm,ib_sa,ib_umad,mlx4_ib,ib_mthca
ib_core 69979 14 rdma_ucm,ib_ucm,rdma_cm,iw_cm,ib_ipoib,ib_cm,ib_sa,ib_uverbs,ib_umad,iw_nes,iw_cxgb3,mlx4_ib,ib_mthca,ib_mad
mpt2sas 173216 0
scsi_transport_sas 35070 1 mpt2sas
raid_class 4804 1 mpt2sas
mptctl 31976 0
mptbase 93845 1 mptctl
nfsd 305799 13
lockd 74270 1 nfsd
nfs_acl 2647 1 nfsd
auth_rpcgss 44895 1 nfsd
exportfs 4236 1 nfsd
autofs4 26888 3
ipmi_devintf 8049 0
ipmi_si 42401 0
ipmi_msghandler 35992 2 ipmi_devintf,ipmi_si
sunrpc 243758 26 nfsd,lockd,nfs_acl,auth_rpcgss
8021q 23575 0
garp 7344 1 8021q
stp 2173 1 garp
llc 5642 2 garp,stp
ipv6 322029 134 ib_addr,ib_ipoib
libcrc32c 1246 1 iw_nes
nls_utf8 1455 1
ext3 235341 1
jbd 80337 1 ext3
sg 30124 0
igb 157825 0
dca 7197 1 igb
microcode 112594 0
sr_mod 16228 0
cdrom 39771 1 sr_mod
serio_raw 4818 0
amd64_edac_mod 21461 0
edac_core 46773 6 amd64_edac_mod
edac_mce_amd 15488 1 amd64_edac_mod
i2c_piix4 12608 0
i2c_core 31276 1 i2c_piix4
shpchp 33482 0
ext4 364410 3
mbcache 8144 2 ext3,ext4
jbd2 88738 1 ext4
sd_mod 39488 7
crc_t10dif 1541 1 sd_mod
usb_storage 49452 0
megaraid_sas 77090 5
ata_generic 3837 0
pata_acpi 3701 0
pata_atiixp 4211 0
ahci 40455 0
dm_mirror 14101 0
dm_region_hash 12170 1 dm_mirror
dm_log 10122 2 dm_mirror,dm_region_hash
dm_mod 81500 2 dm_mirror,dm_log
Thank you!
--
Alexander Kvashnin
Link Copied
0 Replies

Reply
Topic Options
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page