Intel® MPI Library
Get help with building, analyzing, optimizing, and scaling high-performance computing (HPC) applications.

mpirun hangs

psing51
New Contributor I
2,841 Views

Hi,
While trying to run Intel MPI on a single node, the application gets stuck.

Here is the mpirun version - 
Intel(R) MPI Library for Linux* OS, Version 2019 Update 4 Build 20190430 (id: cbdd16069)
Copyright 2003-2019, Intel Corporation.


Here are the logs when I trigger mpirun - 
I_MPI_DEBUG=16 I_MPI_HYDRA_DEBUG=on FI_LOG_LEVEL=debug /usr/diags/mpi/impi/2019.4.243/intel64/bin/mpirun -np 1 /bin/date
[mpiexec@cf-icex-82-1] Launch arguments: /usr/diags/mpi/impi/2019.4.243//intel64/bin//hydra_bstrap_proxy --upstream-host cf-icex-82-1 --upstream-port 45883 --pgid 0 --launcher ssh --launcher-number 0 --base-path /usr/diags/mpi/impi/2019.4.243//intel64/bin/ --tree-width 16 --tree-level 1 --time-left -1 --collective-launch 1 --debug --proxy-id 0 --node-id 0 --subtree-size 1 --upstream-fd 7 /usr/diags/mpi/impi/2019.4.243//intel64/bin//hydra_pmi_proxy --usize -1 --auto-cleanup 1 --abort-signal 9

I have to terminate mpirun by pressing Ctrl+C.

Here is what I see in top - 
  PID USER  PR  NI  VIRT  RES  SHR S  %CPU  %MEM    TIME+ COMMAND
77177 root  20   0 17228 3056 2812 R 100.0 0.006 42:57.39 hydra_pmi_proxy

OS - SUSE Linux Enterprise Server 15 SP4

I tried with FI_PROVIDER=tcp and FI_PROVIDER=verbs; the result is the same.
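
For completeness, the provider variants I tried were roughly of this form (same binary and node as above; only FI_PROVIDER changes, and the debug variables are optional):

FI_PROVIDER=tcp I_MPI_DEBUG=16 I_MPI_HYDRA_DEBUG=on /usr/diags/mpi/impi/2019.4.243/intel64/bin/mpirun -np 1 /bin/date
FI_PROVIDER=verbs I_MPI_DEBUG=16 I_MPI_HYDRA_DEBUG=on /usr/diags/mpi/impi/2019.4.243/intel64/bin/mpirun -np 1 /bin/date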





5 Replies
psing51
New Contributor I
2,833 Views

Here is what I have in strace - 
access("/usr/diags/mpi/impi/2019.4.243/intel64/bin/mpiexec.hydra", R_OK) = 0
stat("/usr/diags/mpi/impi/2019.4.243/intel64/bin/mpiexec.hydra", {st_mode=S_IFREG|0755, st_size=2972372, ...}) = 0
stat("/usr/diags/mpi/impi/2019.4.243/intel64/bin/mpiexec.hydra", {st_mode=S_IFREG|0755, st_size=2972372, ...}) = 0
geteuid() = 0
getegid() = 0
getuid() = 0
getgid() = 0
access("/usr/diags/mpi/impi/2019.4.243/intel64/bin/mpiexec.hydra", X_OK) = 0
stat("/usr/diags/mpi/impi/2019.4.243/intel64/bin/mpiexec.hydra", {st_mode=S_IFREG|0755, st_size=2972372, ...}) = 0
geteuid() = 0
getegid() = 0
getuid() = 0
getgid() = 0
access("/usr/diags/mpi/impi/2019.4.243/intel64/bin/mpiexec.hydra", R_OK) = 0
rt_sigprocmask(SIG_BLOCK, [INT CHLD], [], = 0
rt_sigprocmask(SIG_BLOCK, [CHLD], [INT CHLD], = 0
rt_sigprocmask(SIG_SETMASK, [INT CHLD], NULL, = 0
lseek(255, -515, SEEK_CUR) = 3270
clone(child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f6fb33626d0) = 21419
rt_sigprocmask(SIG_SETMASK, [], NULL, = 0
rt_sigprocmask(SIG_BLOCK, [CHLD], [], = 0
rt_sigprocmask(SIG_SETMASK, [], NULL, = 0
rt_sigprocmask(SIG_BLOCK, [CHLD], [], = 0
rt_sigaction(SIGINT, {sa_handler=0x55d1f7cb8d10, sa_mask=[], sa_flags=SA_RESTORER, sa_restorer=0x7f6fb294cd50}, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=SA_RESTORER, sa_restorer=0x7f6fb294cd50}, = 0
wait4(-1,

psing51
New Contributor I
2,833 Views

When I run mpiexec.hydra -np 1 hostname, here is what I get - 
write(6, "\2\0\0\0\6\0\0\0\0\0\0\0\0\0\0\0\20\314\32\320", 20) = 20
write(6, "/root\0", 6) = 6
write(6, "\3\0\0\0\31\0\0\0\1\0\0\0\25\0\0\0\4\0\0\0", 20) = 20
write(6, "\1\0\0\0\r\0\0\0/bin/hostname\0\0\0\0", 25) = 25
write(6, "\10\0\0\0\21\0\0\0\360>m\0\0\0\0\0 m\0", 20) = 20
write(6, "(vector,(0,1,1))\0", 17) = 17
getpid() = 21442
write(6, "\4\0\0\0\f\0\0\0\37\242@\0\0\0\0\0\10\0\0\0", 20) = 20
write(6, "kvs_21442_0\0", 12) = 12
write(6, "\5\0\0\0\0\0\0\0\37\242@\0\0\0\0\0\10\0\0\0", 20) = 20
poll([{fd=4, events=POLLIN}, {fd=6, events=POLLIN}, {fd=10, events=POLLIN}, {fd=12, events=POLLIN}, {fd=0, events=POLLIN}], 5, -1
) = 1 ([{fd=0, revents=POLLIN}])
read(0, "\n", 16384) = 1
write(6, "\20\0\0\0\1\0\0\0\320\24\22\2\0\0\0\0@\23\22\2", 20) = 20
write(6, "\n", 1) = 1
poll([{fd=4, events=POLLIN}, {fd=6, events=POLLIN}, {fd=10, events=POLLIN}, {fd=12, events=POLLIN}, {fd=0, events=POLLIN}], 5, -1
) = 1 ([{fd=0, revents=POLLIN}])
read(0, "\n", 16384) = 1
write(6, "\20\0\0\0\1\0\0\0\320\24\22\2\0\0\0\0@\23\22\2", 20) = 20
write(6, "\n", 1) = 1
poll([{fd=4, events=POLLIN}, {fd=6, events=POLLIN}, {fd=10, events=POLLIN}, {fd=12, events=POLLIN}, {fd=0, events=POLLIN}], 5, -1
) = 1 ([{fd=0, revents=POLLIN}])
read(0, "\n", 16384) = 1
write(6, "\20\0\0\0\1\0\0\0\320\24\22\2\0\0\0\0@\23\22\2", 20) = 20
write(6, "\n", 1) = 1
poll([{fd=4, events=POLLIN}, {fd=6, events=POLLIN}, {fd=10, events=POLLIN}, {fd=12, events=POLLIN}, {fd=0, events=POLLIN}], 5, -1
) = 1 ([{fd=0, revents=POLLIN}])
read(0, "\n", 16384) = 1
write(6, "\20\0\0\0\1\0\0\0\320\24\22\2\0\0\0\0@\23\22\2", 20) = 20
write(6, "\n", 1) = 1
poll([{fd=4, events=POLLIN}, {fd=6, events=POLLIN}, {fd=10, events=POLLIN}, {fd=12, events=POLLIN}, {fd=0, events=POLLIN}], 5, -1
) = 1 ([{fd=0, revents=POLLIN}])
read(0, "\n", 16384) = 1
write(6, "\20\0\0\0\1\0\0\0\320\24\22\2\0\0\0\0@\23\22\2", 20) = 20
write(6, "\n", 1) = 1
poll([{fd=4, events=POLLIN}, {fd=6, events=POLLIN}, {fd=10, events=POLLIN}, {fd=12, events=POLLIN}, {fd=0, events=POLLIN}], 5, -1
) = 1 ([{fd=0, revents=POLLIN}])
read(0, "\n", 16384) = 1
write(6, "\20\0\0\0\1\0\0\0\320\24\22\2\0\0\0\0@\23\22\2", 20) = 20
write(6, "\n", 1) = 1
poll([{fd=4, events=POLLIN}, {fd=6, events=POLLIN}, {fd=10, events=POLLIN}, {fd=12, events=POLLIN}, {fd=0, events=POLLIN}], 5, -1
) = 1 ([{fd=0, revents=POLLIN}])

VarshaS_Intel
Moderator
2,813 Views

Hi,

 

Thanks for posting in Intel Communities.

 

Could you please provide us with the complete debug log after running the command below at your end?

I_MPI_DEBUG=30 I_MPI_HYDRA_DEBUG=on FI_LOG_LEVEL=debug /usr/diags/mpi/impi/2019.4.243/intel64/bin/mpirun -n 2 ./hello

Please find the hello_mpi.cpp code attached below.
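
For reference, hello_mpi.cpp is a minimal MPI hello-world program along the following lines (a sketch; the actual attachment may differ slightly):

// hello_mpi.cpp - minimal MPI hello world (sketch; the attached file may differ)
#include <mpi.h>
#include <cstdio>

int main(int argc, char* argv[]) {
    MPI_Init(&argc, &argv);

    int rank = 0, size = 0;
    char name[MPI_MAX_PROCESSOR_NAME];
    int name_len = 0;

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);    // rank of this process
    MPI_Comm_size(MPI_COMM_WORLD, &size);    // total number of ranks
    MPI_Get_processor_name(name, &name_len); // host this rank runs on

    std::printf("Hello from rank %d of %d on %s\n", rank, size, name);

    MPI_Finalize();
    return 0;
}

It can be compiled with mpiicpc hello_mpi.cpp -o hello (or mpicxx when using the GNU compiler wrappers) before running the mpirun command above.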

 

Thanks & Regards,

Varsha

 

VarshaS_Intel
Moderator
2,768 Views

Hi,


We have not heard back from you. Could you please provide an update on your issue?


Thanks & Regards,

Varsha


VarshaS_Intel
Moderator
2,739 Views

Hi,


We have not heard back from you. This thread will no longer be monitored by Intel. If you need further assistance, please post a new question.


Thanks & Regards,

Varsha

