- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
A simple MPI hello-world program crashes when using shm:dapl mode and the MLNX OFED 2.1-1.0.0 IB stack. shm:ofa works fine. shm:dapl mode used to work fine with MLNX OFED 1.5.3, but the latest el6.5 kernel requires version 2.1-1.0.0.
intelmpi/4.1.3
I_MPI_FABRICS=shm:dapl srun -pdebug -n2 -N2 ~/mpi/intelhellog
srun: job 830396 queued and waiting for resources
srun: job 830396 has been allocated resources
c578: Before MPI_INIT
c577: Before MPI_INIT
srun: error: c578: task 1: Segmentation fault (core dumped)
srun: Terminating job step 830396.0
srun: error: c577: task 0: Segmentation fault (core dumped)
(gdb) bt
#0 0x00007f222c9ad85c in I_MPI_dat_extension_op (handle=0x1e5e770, ext_op=6)
at ./../../include/I_MPI_wrap_dat.h:352
#1 0x00007f222c9b0b56 in dapl_rc_init_infostructure_20 (proc=0x7f222d02e040,
param=0x7fffe3b5d6c8, rc_proc=0x7f222d02db00, rc_param=0x7fffe3b5d650,
rc_secondary_proc_params=0x7f222d02dfe0, p_ia_attr=0x7fffe3b5d068,
p_provider_attr=0x7fffe3b5d318) at ../../dapl_init_rc.c:948
#2 0x00007f222c9aea39 in MPID_nem_dapl_rc_init_20 (params=0x7fffe3b5d6c8,
rc_params=0x7fffe3b5d650, pg_p=0x1e41480, pg_rank=1, fallback_device=0,
provider_found=0x7fffe3b5d64c) at ../../dapl_init_rc.c:207
#3 0x00007f222ca10b0c in MPID_nem_dapl_init (pg_p=0x1e41480, pg_rank=1,
bc_val_p=0x7fffe3b5d980, val_max_sz_p=0x7fffe3b5da98) at ../../dapls_module_init.c:437
#4 0x00007f222caf41a7 in MPID_nem_impi_netmod_init (pg_rank=1, pg_p=0x1e41480,
ckpt_restart=0, bc_val=0x1e41bb0 "", pval_max_remaining=0x7fffe3b5da98,
num_processes=2) at ../../mpid_nem_init.c:146
#5 0x00007f222caf7482 in MPID_nem_init_ckpt (pg_rank=1, pg_p=0x1e41480, ckpt_restart=0,
has_parent=0) at ../../mpid_nem_init.c:859
#6 0x00007f222caf563e in MPID_nem_init (pg_rank=1, pg_p=0x1e41480, has_parent=0)
at ../../mpid_nem_init.c:490
#7 0x00007f222c8f6a2a in MPIDI_CH3_Init (has_parent=0, pg_p=0x1e41480, pg_rank=1)
at ../../ch3_init.c:64
---Type <return> to continue, or q <return> to quit---
#8 0x00007f222cad5244 in MPID_Init (argc=0x7fffe3b5e8a8, argv=0x7fffe3b5e8b0,
requested=0, provided=0x7fffe3b5e4c0, has_args=0x7fffe3b5e4c4, has_env=0x7fffe3b5e4c8)
at ../../mpid_init.c:193
#9 0x00007f222caa3019 in MPIR_Init_thread (argc=0x7fffe3b5e8a8, argv=0x7fffe3b5e8b0,
required=0, provided=0x7fffe3b5e6bc) at ../../initthread.c:539
#10 0x00007f222ca90190 in PMPI_Init (argc=0x7fffe3b5e8a8, argv=0x7fffe3b5e8b0)
at ../../init.c:195
#11 0x0000000000400a81 in main (argc=1, argv=0x7fffe3b5e9c8) at hello.c:26
(gdb) bt full
#0 0x00007f222c9ad85c in I_MPI_dat_extension_op (handle=0x1e5e770, ext_op=6)
at ./../../include/I_MPI_wrap_dat.h:352
ret = 0
args = {{<No data fields>}}
dapl_handle = 0x1
#1 0x00007f222c9b0b56 in dapl_rc_init_infostructure_20 (proc=0x7f222d02e040,
param=0x7fffe3b5d6c8, rc_proc=0x7f222d02db00, rc_param=0x7fffe3b5d650,
rc_secondary_proc_params=0x7f222d02dfe0, p_ia_attr=0x7fffe3b5d068,
p_provider_attr=0x7fffe3b5d318) at ../../dapl_init_rc.c:948
ret = 0
region = {for_va = 0x1ea3b78, for_lmr_handle = 0x1ea3b78, for_shared_memory = {
virtual_address = 0x1ea3b78, shared_memory_id = 0x7f2229656260}}
dummy_registered_size = 4096
dummy_registered_addr = 32127864
dummy_rkey = 805377050
fcname = 0x0
#2 0x00007f222c9aea39 in MPID_nem_dapl_rc_init_20 (params=0x7fffe3b5d6c8,
rc_params=0x7fffe3b5d650, pg_p=0x1e41480, pg_rank=1, fallback_device=0,
provider_found=0x7fffe3b5d64c) at ../../dapl_init_rc.c:207
mpi_error = 1
---Type <return> to continue, or q <return> to quit---
i = 1
vbuf_header_size = 1
rtc_err = 1
pg_size = 2
ia_attr = {adapter_name = "mlx4_0", '\000' <repeats 249 times>,
vendor_name = '\000' <repeats 255 times>, hardware_version_major = 1,
hardware_version_minor = 0, firmware_version_major = 0,
firmware_version_minor = 0, ia_address_ptr = 0x1e42168, max_eps = 393080,
max_dto_per_ep = 16351, max_rdma_read_per_ep_in = 16,
max_rdma_read_per_ep_out = 128, max_evds = 65408, max_evd_qlen = 4194303,
max_iov_segments_per_dto = 32, max_lmrs = 524032,
max_lmr_block_size = 4294967295, max_lmr_virtual_address = 18446744073709551615,
max_pzs = 32764, max_message_size = 1073741824, max_rdma_size = 1073741824,
max_rmrs = 0, max_rmr_target_address = 18446744073709551615, max_srqs = 0,
max_ep_per_srq = 0, max_recv_per_srq = 0, max_iov_segments_per_rdma_read = 32,
max_iov_segments_per_rdma_write = 32, max_rdma_read_in = 16,
max_rdma_read_out = 128, max_rdma_read_per_ep_in_guaranteed = DAT_TRUE,
max_rdma_read_per_ep_out_guaranteed = DAT_TRUE, zb_supported = DAT_FALSE,
extension_supported = DAT_EXTENSION_IB, extension_version = 207,
num_transport_attr = 0, transport_attr = 0x0, num_vendor_attr = 0,
---Type <return> to continue, or q <return> to quit---
vendor_attr = 0x0}
provider_attr = {provider_name = "ofa-v2-mlx4_0-1", '\000' <repeats 240 times>,
provider_version_major = 2, provider_version_minor = 0, dapl_version_major = 2,
dapl_version_minor = 0, lmr_mem_types_supported = 3,
iov_ownership_on_return = DAT_IOV_CONSUMER,
dat_qos_supported = DAT_QOS_BEST_EFFORT,
completion_flags_supported = DAT_COMPLETION_DEFAULT_FLAG,
is_thread_safe = DAT_FALSE, max_private_data_size = 118,
supports_multipath = DAT_FALSE, ep_creator = DAT_PSP_CREATES_EP_NEVER,
pz_support = DAT_PZ_UNIQUE, optimal_buffer_alignment = 256,
evd_stream_merging_supported = {{DAT_TRUE, DAT_TRUE, DAT_TRUE, DAT_TRUE,
DAT_TRUE, DAT_FALSE}, {DAT_TRUE, DAT_TRUE, DAT_TRUE, DAT_TRUE, DAT_TRUE,
DAT_FALSE}, {DAT_TRUE, DAT_TRUE, DAT_TRUE, DAT_FALSE, DAT_TRUE, DAT_FALSE}, {
DAT_TRUE, DAT_TRUE, DAT_FALSE, DAT_TRUE, DAT_TRUE, DAT_FALSE}, {DAT_TRUE,
DAT_TRUE, DAT_TRUE, DAT_TRUE, DAT_TRUE, DAT_FALSE}, {DAT_FALSE, DAT_FALSE,
DAT_FALSE, DAT_FALSE, DAT_FALSE, DAT_FALSE}}, srq_supported = 2147483647,
srq_watermarks_supported = 0, srq_ep_pz_difference_supported = DAT_TRUE,
srq_info_supported = 0, ep_recv_info_supported = 31726976,
lmr_sync_req = DAT_FALSE, dto_async_return_guaranteed = 18,
rdma_write_for_rdma_read_req = 32767,
---Type <return> to continue, or q <return> to quit---
rdma_read_lmr_rmr_context_exposure = DAT_FALSE, rmr_scope_supported = 32767,
is_signal_safe = 751259856, ha_supported = 32546, ha_loadbalancing = 751259872,
num_provider_specific_attr = 12, provider_specific_attr = 0x7f21e832da20}
local_ia_addr = {sa_family = 1,
sa_data = "\000\000\001\000\000\000\000\000\000\000\377\177\000"}
rtc_attr = {struct_size = 5, max_memory_size = 8182128, max_cache_entries = 0,
lazy_mem_unregister = 0, min_memblock_size = 0,
dont_use_overlapping = RTC_OVERLAP_ALLOWED, avl_block_size = 0,
use_avl_tree = 746536960, rtc_register_fn = 0x7f222d26dff0 <_dl_fixup+224>,
rtc_unregister_fn = 0x200000005, info = 0x0}
dapl_index = 0
proc = 0x7f222d02e040
rc_secondary_proc_params = 0x7f222d02dfe0
rc_proc = 0x7f222d02db00
__I_MPI_dat2_lib_names__ = {0x7f222cc750d0 "libdat2.so.2",
0x7f222cc750e0 "libdat2.so"}
libdat_names = 0x7fffe3b5d050
nlibdat = 2
#3 0x00007f222ca10b0c in MPID_nem_dapl_init (pg_p=0x1e41480, pg_rank=1,
bc_val_p=0x7fffe3b5d980, val_max_sz_p=0x7fffe3b5da98) at ../../dapls_module_init.c:437
---Type <return> to continue, or q <return> to quit---
mpi_errno = 0
fallback_device = 0
num_providers_in_datconf = 27
provider_found = 0
i = 0
check_provider_from_all_processes = 0
nproviders = 1
proc = 0x7f222d02e040
default_params = {{create_conn_qual = 1, rdma_use_mreg_cache = 1,
vbuf_total_size = 23808, rtc_max_cache_entries = 1024,
rtc_max_memory_size = 0, rtc_use_avl_tree = 0, rtc_avl_block = 32546,
req_evd_qlen = 2000, conn_evd_qlen = 36, rdma_optimal_buffer_align = 0,
rdma_dont_use_rtc_overlapping = 0}, {create_conn_qual = -474623584,
rdma_use_mreg_cache = 32767, vbuf_total_size = 705030728,
rtc_max_cache_entries = 32546, rtc_max_memory_size = 140737013734464,
rtc_use_avl_tree = -474622112, rtc_avl_block = 32767,
req_evd_qlen = 749592018, conn_evd_qlen = 32546,
rdma_optimal_buffer_align = 0, rdma_dont_use_rtc_overlapping = 0}, {
create_conn_qual = 735235728, rdma_use_mreg_cache = 0,
vbuf_total_size = 696639508, rtc_max_cache_entries = 0,
---Type <return> to continue, or q <return> to quit---
rtc_max_memory_size = 140733193388033, rtc_use_avl_tree = 0,
rtc_avl_block = 32767, req_evd_qlen = 4196624, conn_evd_qlen = 0,
rdma_optimal_buffer_align = 3820349888, rdma_dont_use_rtc_overlapping = 32767}}
default_ud_params = {ud_default_recv_evd_qlen = 48769090,
ud_recv_buf_total_size = 0, ud_send_buf_total_size = -474622312,
rndv_ep_ud_max_dtos = 32767, can_use_ud_ext = 1}
default_rc_params = {rdma_default_max_wqe = 400, rdma_default_max_recv_wqe = 8,
rdma_req_wqe_reserve = 32, rdma_check_max_rdma_attr = 0,
rdma_eager_msg_aggregation = 0, dapl_scalable_read_progress = 0}
#4 0x00007f222caf41a7 in MPID_nem_impi_netmod_init (pg_rank=1, pg_p=0x1e41480,
ckpt_restart=0, bc_val=0x1e41bb0 "", pval_max_remaining=0x7fffe3b5da98,
num_processes=2) at ../../mpid_nem_init.c:146
mpi_errno = 0
netmod_configuration = 1
netmod_index = 1
i = 0
#5 0x00007f222caf7482 in MPID_nem_init_ckpt (pg_rank=1, pg_p=0x1e41480, ckpt_restart=0,
has_parent=0) at ../../mpid_nem_init.c:859
mpi_errno = 0
num_procs = 2
---Type <return> to continue, or q <return> to quit---
ret = 0
num_local = 1
local_procs = 0x1e410b0
local_rank = 0
index = 128
i = 32546
publish_bc_orig = 0x1e41bb0 ""
bc_val = 0x1e41bb0 ""
val_max_remaining = 256
grank = 2
fastboxes_p = 0x7f222985e080
cells_p = 0x7f222985e0c0
network_cells_p = 0x0
recv_queues_p = 0x7f222a05e140
free_queues_p = 0x7f222a05e0c0
envvar = 0x7f22fbad8001 <Address 0x7f22fbad8001 out of bounds>
envname = 0x7f222ba9cda0 "H9\\$8t\bH\213T$(\306\002"
num_cpus = 0
netmod_configuration = 729385136
check_fabrics_compatibility = 0
---Type <return> to continue, or q <return> to quit---
sshmqs_p = 0x0
sshmconns_p = 0x0
mpiu_chkpmem_stk_ = {0x1e41cc0, 0x1e41090, 0x1e41070, 0x1e41d40, 0x1e40e90,
0x7fffe3b5dd00, 0x1e418f0, 0x7fffe3b5ddf0, 0x1}
mpiu_chkpmem_stk_sp_ = 5
mpiu_chkpmem_stk_sz_ = 9
#6 0x00007f222caf563e in MPID_nem_init (pg_rank=1, pg_p=0x1e41480, has_parent=0)
at ../../mpid_nem_init.c:490
No locals.
#7 0x00007f222c8f6a2a in MPIDI_CH3_Init (has_parent=0, pg_p=0x1e41480, pg_rank=1)
at ../../ch3_init.c:64
mpi_errno = 0
i = 0
#8 0x00007f222cad5244 in MPID_Init (argc=0x7fffe3b5e8a8, argv=0x7fffe3b5e8b0,
requested=0, provided=0x7fffe3b5e4c0, has_args=0x7fffe3b5e4c4, has_env=0x7fffe3b5e4c8)
at ../../mpid_init.c:193
mpi_errno = 0
has_parent = 0
pg = 0x1e41480
pg_rank = 1
---Type <return> to continue, or q <return> to quit---
pg_size = 2
comm = 0x7f222d47d358
p = 746568848
envvar = 0x0
envname = 0x7f222ccf8ed4 "I_MPI_IPROBE_SPIN_COUNT"
attr_val = 0x0
#9 0x00007f222caa3019 in MPIR_Init_thread (argc=0x7fffe3b5e8a8, argv=0x7fffe3b5e8b0,
required=0, provided=0x7fffe3b5e6bc) at ../../initthread.c:539
mpi_errno = 0
has_args = 1
has_env = 1
thread_provided = 0
exit_init_cs_on_failure = 1
envvar = 0x100000000 <Address 0x100000000 out of bounds>
envname = 0x100000000 <Address 0x100000000 out of bounds>
interaction = 0x0
#10 0x00007f222ca90190 in PMPI_Init (argc=0x7fffe3b5e8a8, argv=0x7fffe3b5e8b0)
at ../../init.c:195
FCNAME = "MPI_Init"
mpi_errno = 0
---Type <return> to continue, or q <return> to quit---
rc = 0
threadLevel = 0
provided = 32546
dtm = 0
#11 0x0000000000400a81 in main (argc=1, argv=0x7fffe3b5e9c8) at hello.c:26
rank = 32546
size = 0
len = 0
name = "\001\000\000\000\000\000\000\000\210\021H-\"\177\000\000\210\350\265\343\377\177\000\000v\000\000\000\000\000\000\000\t\000\000\000\000\000\000\000\236\350\265\343\377\177\000\000\000\000\000\000\000\000\000\000\340\024H-\"\177\000\000\260\350\265\343\377\177\000\000\207֢+\"\177\000\000\310\350\265\343\377\177\000\000\211W\251+\001\000\000\000\000\000H-\"\177\000\000\006\006@\000\000\000\000\000\302\000\000\000\000\000\000\000\236\350\265\343\377\177\000"
to_wait = 0
sleep_diff = 0
max_limit = 0
sleep_start = 0
sleep_now = 0
hostname = "c578\000\000\000\000\000\000\000\000\001\000\000\000\371\a\000\000\001\0---Type <return> to continue, or q <return> to quit---
00\000\000Z\236&-\"\177\000\000\000\000\000\000\000\000\000\000\340\024H-\"\177\000\000\000\351\265\343\377\177\000\000(\351\265\343\377\177\000\000\210\021H-\"\177\000\000X\327F-\"\177\000\000.N=\366\000\000\000\000Z\236&-\"\177\000\000\000\000\000\000\000\000\000\000X\327F-\"\177\000\000\001", '\000' <repeats 14 times>
(gdb) bt full
#0 0x00007f222c9ad85c in I_MPI_dat_extension_op (handle=0x1e5e770, ext_op=6)
at ./../../include/I_MPI_wrap_dat.h:352
ret = 0
args = {{<No data fields>}}
dapl_handle = 0x1
#1 0x00007f222c9b0b56 in dapl_rc_init_infostructure_20 (proc=0x7f222d02e040,
param=0x7fffe3b5d6c8, rc_proc=0x7f222d02db00, rc_param=0x7fffe3b5d650,
rc_secondary_proc_params=0x7f222d02dfe0, p_ia_attr=0x7fffe3b5d068,
p_provider_attr=0x7fffe3b5d318) at ../../dapl_init_rc.c:948
ret = 0
region = {for_va = 0x1ea3b78, for_lmr_handle = 0x1ea3b78, for_shared_memory = {
virtual_address = 0x1ea3b78, shared_memory_id = 0x7f2229656260}}
dummy_registered_size = 4096
dummy_registered_addr = 32127864
dummy_rkey = 805377050
fcname = 0x0
#2 0x00007f222c9aea39 in MPID_nem_dapl_rc_init_20 (params=0x7fffe3b5d6c8,
rc_params=0x7fffe3b5d650, pg_p=0x1e41480, pg_rank=1, fallback_device=0,
provider_found=0x7fffe3b5d64c) at ../../dapl_init_rc.c:207
mpi_error = 1
---Type <return> to continue, or q <return> to quit---
i = 1
vbuf_header_size = 1
rtc_err = 1
pg_size = 2
ia_attr = {adapter_name = "mlx4_0", '\000' <repeats 249 times>,
vendor_name = '\000' <repeats 255 times>, hardware_version_major = 1,
hardware_version_minor = 0, firmware_version_major = 0,
firmware_version_minor = 0, ia_address_ptr = 0x1e42168, max_eps = 393080,
max_dto_per_ep = 16351, max_rdma_read_per_ep_in = 16,
max_rdma_read_per_ep_out = 128, max_evds = 65408, max_evd_qlen = 4194303,
max_iov_segments_per_dto = 32, max_lmrs = 524032,
max_lmr_block_size = 4294967295, max_lmr_virtual_address = 18446744073709551615,
max_pzs = 32764, max_message_size = 1073741824, max_rdma_size = 1073741824,
max_rmrs = 0, max_rmr_target_address = 18446744073709551615, max_srqs = 0,
max_ep_per_srq = 0, max_recv_per_srq = 0, max_iov_segments_per_rdma_read = 32,
max_iov_segments_per_rdma_write = 32, max_rdma_read_in = 16,
max_rdma_read_out = 128, max_rdma_read_per_ep_in_guaranteed = DAT_TRUE,
max_rdma_read_per_ep_out_guaranteed = DAT_TRUE, zb_supported = DAT_FALSE,
extension_supported = DAT_EXTENSION_IB, extension_version = 207,
num_transport_attr = 0, transport_attr = 0x0, num_vendor_attr = 0,
---Type <return> to continue, or q <return> to quit---
vendor_attr = 0x0}
provider_attr = {provider_name = "ofa-v2-mlx4_0-1", '\000' <repeats 240 times>,
provider_version_major = 2, provider_version_minor = 0, dapl_version_major = 2,
dapl_version_minor = 0, lmr_mem_types_supported = 3,
iov_ownership_on_return = DAT_IOV_CONSUMER,
dat_qos_supported = DAT_QOS_BEST_EFFORT,
completion_flags_supported = DAT_COMPLETION_DEFAULT_FLAG,
is_thread_safe = DAT_FALSE, max_private_data_size = 118,
supports_multipath = DAT_FALSE, ep_creator = DAT_PSP_CREATES_EP_NEVER,
pz_support = DAT_PZ_UNIQUE, optimal_buffer_alignment = 256,
evd_stream_merging_supported = {{DAT_TRUE, DAT_TRUE, DAT_TRUE, DAT_TRUE,
DAT_TRUE, DAT_FALSE}, {DAT_TRUE, DAT_TRUE, DAT_TRUE, DAT_TRUE, DAT_TRUE,
DAT_FALSE}, {DAT_TRUE, DAT_TRUE, DAT_TRUE, DAT_FALSE, DAT_TRUE, DAT_FALSE}, {
DAT_TRUE, DAT_TRUE, DAT_FALSE, DAT_TRUE, DAT_TRUE, DAT_FALSE}, {DAT_TRUE,
DAT_TRUE, DAT_TRUE, DAT_TRUE, DAT_TRUE, DAT_FALSE}, {DAT_FALSE, DAT_FALSE,
DAT_FALSE, DAT_FALSE, DAT_FALSE, DAT_FALSE}}, srq_supported = 2147483647,
srq_watermarks_supported = 0, srq_ep_pz_difference_supported = DAT_TRUE,
srq_info_supported = 0, ep_recv_info_supported = 31726976,
lmr_sync_req = DAT_FALSE, dto_async_return_guaranteed = 18,
rdma_write_for_rdma_read_req = 32767,
---Type <return> to continue, or q <return> to quit---
rdma_read_lmr_rmr_context_exposure = DAT_FALSE, rmr_scope_supported = 32767,
is_signal_safe = 751259856, ha_supported = 32546, ha_loadbalancing = 751259872,
num_provider_specific_attr = 12, provider_specific_attr = 0x7f21e832da20}
local_ia_addr = {sa_family = 1,
sa_data = "\000\000\001\000\000\000\000\000\000\000\377\177\000"}
rtc_attr = {struct_size = 5, max_memory_size = 8182128, max_cache_entries = 0,
lazy_mem_unregister = 0, min_memblock_size = 0,
dont_use_overlapping = RTC_OVERLAP_ALLOWED, avl_block_size = 0,
use_avl_tree = 746536960, rtc_register_fn = 0x7f222d26dff0 <_dl_fixup+224>,
rtc_unregister_fn = 0x200000005, info = 0x0}
dapl_index = 0
proc = 0x7f222d02e040
rc_secondary_proc_params = 0x7f222d02dfe0
rc_proc = 0x7f222d02db00
__I_MPI_dat2_lib_names__ = {0x7f222cc750d0 "libdat2.so.2",
0x7f222cc750e0 "libdat2.so"}
libdat_names = 0x7fffe3b5d050
nlibdat = 2
#3 0x00007f222ca10b0c in MPID_nem_dapl_init (pg_p=0x1e41480, pg_rank=1,
bc_val_p=0x7fffe3b5d980, val_max_sz_p=0x7fffe3b5da98) at ../../dapls_module_init.c:437
---Type <return> to continue, or q <return> to quit---
mpi_errno = 0
fallback_device = 0
num_providers_in_datconf = 27
provider_found = 0
i = 0
check_provider_from_all_processes = 0
nproviders = 1
proc = 0x7f222d02e040
default_params = {{create_conn_qual = 1, rdma_use_mreg_cache = 1,
vbuf_total_size = 23808, rtc_max_cache_entries = 1024,
rtc_max_memory_size = 0, rtc_use_avl_tree = 0, rtc_avl_block = 32546,
req_evd_qlen = 2000, conn_evd_qlen = 36, rdma_optimal_buffer_align = 0,
rdma_dont_use_rtc_overlapping = 0}, {create_conn_qual = -474623584,
rdma_use_mreg_cache = 32767, vbuf_total_size = 705030728,
rtc_max_cache_entries = 32546, rtc_max_memory_size = 140737013734464,
rtc_use_avl_tree = -474622112, rtc_avl_block = 32767,
req_evd_qlen = 749592018, conn_evd_qlen = 32546,
rdma_optimal_buffer_align = 0, rdma_dont_use_rtc_overlapping = 0}, {
create_conn_qual = 735235728, rdma_use_mreg_cache = 0,
vbuf_total_size = 696639508, rtc_max_cache_entries = 0,
---Type <return> to continue, or q <return> to quit---
rtc_max_memory_size = 140733193388033, rtc_use_avl_tree = 0,
rtc_avl_block = 32767, req_evd_qlen = 4196624, conn_evd_qlen = 0,
rdma_optimal_buffer_align = 3820349888, rdma_dont_use_rtc_overlapping = 32767}}
default_ud_params = {ud_default_recv_evd_qlen = 48769090,
ud_recv_buf_total_size = 0, ud_send_buf_total_size = -474622312,
rndv_ep_ud_max_dtos = 32767, can_use_ud_ext = 1}
default_rc_params = {rdma_default_max_wqe = 400, rdma_default_max_recv_wqe = 8,
rdma_req_wqe_reserve = 32, rdma_check_max_rdma_attr = 0,
rdma_eager_msg_aggregation = 0, dapl_scalable_read_progress = 0}
#4 0x00007f222caf41a7 in MPID_nem_impi_netmod_init (pg_rank=1, pg_p=0x1e41480,
ckpt_restart=0, bc_val=0x1e41bb0 "", pval_max_remaining=0x7fffe3b5da98,
num_processes=2) at ../../mpid_nem_init.c:146
mpi_errno = 0
netmod_configuration = 1
netmod_index = 1
i = 0
#5 0x00007f222caf7482 in MPID_nem_init_ckpt (pg_rank=1, pg_p=0x1e41480, ckpt_restart=0,
has_parent=0) at ../../mpid_nem_init.c:859
mpi_errno = 0
num_procs = 2
---Type <return> to continue, or q <return> to quit---
ret = 0
num_local = 1
local_procs = 0x1e410b0
local_rank = 0
index = 128
i = 32546
publish_bc_orig = 0x1e41bb0 ""
bc_val = 0x1e41bb0 ""
val_max_remaining = 256
grank = 2
fastboxes_p = 0x7f222985e080
cells_p = 0x7f222985e0c0
network_cells_p = 0x0
recv_queues_p = 0x7f222a05e140
free_queues_p = 0x7f222a05e0c0
envvar = 0x7f22fbad8001 <Address 0x7f22fbad8001 out of bounds>
envname = 0x7f222ba9cda0 "H9\\$8t\bH\213T$(\306\002"
num_cpus = 0
netmod_configuration = 729385136
check_fabrics_compatibility = 0
---Type <return> to continue, or q <return> to quit---
sshmqs_p = 0x0
sshmconns_p = 0x0
mpiu_chkpmem_stk_ = {0x1e41cc0, 0x1e41090, 0x1e41070, 0x1e41d40, 0x1e40e90,
0x7fffe3b5dd00, 0x1e418f0, 0x7fffe3b5ddf0, 0x1}
mpiu_chkpmem_stk_sp_ = 5
mpiu_chkpmem_stk_sz_ = 9
#6 0x00007f222caf563e in MPID_nem_init (pg_rank=1, pg_p=0x1e41480, has_parent=0)
at ../../mpid_nem_init.c:490
No locals.
#7 0x00007f222c8f6a2a in MPIDI_CH3_Init (has_parent=0, pg_p=0x1e41480, pg_rank=1)
at ../../ch3_init.c:64
mpi_errno = 0
i = 0
#8 0x00007f222cad5244 in MPID_Init (argc=0x7fffe3b5e8a8, argv=0x7fffe3b5e8b0,
requested=0, provided=0x7fffe3b5e4c0, has_args=0x7fffe3b5e4c4, has_env=0x7fffe3b5e4c8)
at ../../mpid_init.c:193
mpi_errno = 0
has_parent = 0
pg = 0x1e41480
pg_rank = 1
---Type <return> to continue, or q <return> to quit---
pg_size = 2
comm = 0x7f222d47d358
p = 746568848
envvar = 0x0
envname = 0x7f222ccf8ed4 "I_MPI_IPROBE_SPIN_COUNT"
attr_val = 0x0
#9 0x00007f222caa3019 in MPIR_Init_thread (argc=0x7fffe3b5e8a8, argv=0x7fffe3b5e8b0,
required=0, provided=0x7fffe3b5e6bc) at ../../initthread.c:539
mpi_errno = 0
has_args = 1
has_env = 1
thread_provided = 0
exit_init_cs_on_failure = 1
envvar = 0x100000000 <Address 0x100000000 out of bounds>
envname = 0x100000000 <Address 0x100000000 out of bounds>
interaction = 0x0
#10 0x00007f222ca90190 in PMPI_Init (argc=0x7fffe3b5e8a8, argv=0x7fffe3b5e8b0)
at ../../init.c:195
FCNAME = "MPI_Init"
mpi_errno = 0
---Type <return> to continue, or q <return> to quit---
rc = 0
threadLevel = 0
provided = 32546
dtm = 0
#11 0x0000000000400a81 in main (argc=1, argv=0x7fffe3b5e9c8) at hello.c:26
rank = 32546
size = 0
len = 0
name = "\001\000\000\000\000\000\000\000\210\021H-\"\177\000\000\210\350\265\343\377\177\000\000v\000\000\000\000\000\000\000\t\000\000\000\000\000\000\000\236\350\265\343\377\177\000\000\000\000\000\000\000\000\000\000\340\024H-\"\177\000\000\260\350\265\343\377\177\000\000\207֢+\"\177\000\000\310\350\265\343\377\177\000\000\211W\251+\001\000\000\000\000\000H-\"\177\000\000\006\006@\000\000\000\000\000\302\000\000\000\000\000\000\000\236\350\265\343\377\177\000"
to_wait = 0
sleep_diff = 0
max_limit = 0
sleep_start = 0
sleep_now = 0
hostname = "c578\000\000\000\000\000\000\000\000\001\000\000\000\371\a\000\000\001\0---Type <return> to continue, or q <return> to quit---
00\000\000Z\236&-\"\177\000\000\000\000\000\000\000\000\000\000\340\024H-\"\177\000\000\000\351\265\343\377\177\000\000(\351\265\343\377\177\000\000\210\021H-\"\177\000\000X\327F-\"\177\000\000.N=\366\000\000\000\000Z\236&-\"\177\000\000\000\000\000\000\000\000\000\000X\327F-\"\177\000\000\001", '\000' <repeats 14 times>
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi Tommi,
Thank you for this report. I have submitted this to our developers for further investigation.
Sincerely,
James Tullos
Technical Consulting Engineer
Intel® Cluster Tools
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Sorry, I forgot to update the solution here. The latest Mellanox OFED_LINUX v2.1-1.0.6 does not crash anymore; Mellanox compiled DAPL without the proprietary FCA support.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi Tommi,
Thanks for the update. I'll let our developers and other users know.
Sincerely,
James Tullos
Technical Consulting Engineer
Intel® Cluster Tools
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page