- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
While trying to run Intel MPI on a single node, the application gets stuck.
Here is the mpirun version -
Intel(R) MPI Library for Linux* OS, Version 2019 Update 4 Build 20190430 (id: cbdd16069)
Copyright 2003-2019, Intel Corporation.
Here are the logs when I trigger mpirun:
I_MPI_DEBUG=16 I_MPI_HYDRA_DEBUG=on FI_LOG_LEVEL=debug /usr/diags/mpi/impi/2019.4.243/intel64/bin/mpirun -np 1 /bin/date
[mpiexec@cf-icex-82-1] Launch arguments: /usr/diags/mpi/impi/2019.4.243//intel64/bin//hydra_bstrap_proxy --upstream-host cf-icex-82-1 --upstream-port 45883 --pgid 0 --launcher ssh --launcher-number 0 --base-path /usr/diags/mpi/impi/2019.4.243//intel64/bin/ --tree-width 16 --tree-level 1 --time-left -1 --collective-launch 1 --debug --proxy-id 0 --node-id 0 --subtree-size 1 --upstream-fd 7 /usr/diags/mpi/impi/2019.4.243//intel64/bin//hydra_pmi_proxy --usize -1 --auto-cleanup 1 --abort-signal 9
I have to terminate mpirun by pressing Ctrl+C, and
here's what I see in top - PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
77177 root 20 0 17228 3056 2812 R 100.0 0.006 42:57.39 hydra_pmi_proxy
OS - SUSE Linux Enterprise Server 15 SP4
I tried with FI_PROVIDER=TCP and verbs; the result is the same.
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Here's what I have in strace:
access("/usr/diags/mpi/impi/2019.4.243/intel64/bin/mpiexec.hydra", R_OK) = 0
stat("/usr/diags/mpi/impi/2019.4.243/intel64/bin/mpiexec.hydra", {st_mode=S_IFREG|0755, st_size=2972372, ...}) = 0
stat("/usr/diags/mpi/impi/2019.4.243/intel64/bin/mpiexec.hydra", {st_mode=S_IFREG|0755, st_size=2972372, ...}) = 0
geteuid() = 0
getegid() = 0
getuid() = 0
getgid() = 0
access("/usr/diags/mpi/impi/2019.4.243/intel64/bin/mpiexec.hydra", X_OK) = 0
stat("/usr/diags/mpi/impi/2019.4.243/intel64/bin/mpiexec.hydra", {st_mode=S_IFREG|0755, st_size=2972372, ...}) = 0
geteuid() = 0
getegid() = 0
getuid() = 0
getgid() = 0
access("/usr/diags/mpi/impi/2019.4.243/intel64/bin/mpiexec.hydra", R_OK) = 0
rt_sigprocmask(SIG_BLOCK, [INT CHLD], [],
rt_sigprocmask(SIG_BLOCK, [CHLD], [INT CHLD],
rt_sigprocmask(SIG_SETMASK, [INT CHLD], NULL,
lseek(255, -515, SEEK_CUR) = 3270
clone(child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f6fb33626d0) = 21419
rt_sigprocmask(SIG_SETMASK, [], NULL,
rt_sigprocmask(SIG_BLOCK, [CHLD], [],
rt_sigprocmask(SIG_SETMASK, [], NULL,
rt_sigprocmask(SIG_BLOCK, [CHLD], [],
rt_sigaction(SIGINT, {sa_handler=0x55d1f7cb8d10, sa_mask=[], sa_flags=SA_RESTORER, sa_restorer=0x7f6fb294cd50}, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=SA_RESTORER, sa_restorer=0x7f6fb294cd50},
wait4(-1,
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
When I run mpiexec.hydra -np 1 hostname, here's what I get:
write(6, "\2\0\0\0\6\0\0\0\0\0\0\0\0\0\0\0\20\314\32\320", 20) = 20
write(6, "/root\0", 6) = 6
write(6, "\3\0\0\0\31\0\0\0\1\0\0\0\25\0\0\0\4\0\0\0", 20) = 20
write(6, "\1\0\0\0\r\0\0\0/bin/hostname\0\0\0\0", 25) = 25
write(6, "\10\0\0\0\21\0\0\0\360>m\0\0\0\0\0 m\0", 20) = 20
write(6, "(vector,(0,1,1))\0", 17) = 17
getpid() = 21442
write(6, "\4\0\0\0\f\0\0\0\37\242@\0\0\0\0\0\10\0\0\0", 20) = 20
write(6, "kvs_21442_0\0", 12) = 12
write(6, "\5\0\0\0\0\0\0\0\37\242@\0\0\0\0\0\10\0\0\0", 20) = 20
poll([{fd=4, events=POLLIN}, {fd=6, events=POLLIN}, {fd=10, events=POLLIN}, {fd=12, events=POLLIN}, {fd=0, events=POLLIN}], 5, -1
) = 1 ([{fd=0, revents=POLLIN}])
read(0, "\n", 16384) = 1
write(6, "\20\0\0\0\1\0\0\0\320\24\22\2\0\0\0\0@\23\22\2", 20) = 20
write(6, "\n", 1) = 1
poll([{fd=4, events=POLLIN}, {fd=6, events=POLLIN}, {fd=10, events=POLLIN}, {fd=12, events=POLLIN}, {fd=0, events=POLLIN}], 5, -1
) = 1 ([{fd=0, revents=POLLIN}])
read(0, "\n", 16384) = 1
write(6, "\20\0\0\0\1\0\0\0\320\24\22\2\0\0\0\0@\23\22\2", 20) = 20
write(6, "\n", 1) = 1
poll([{fd=4, events=POLLIN}, {fd=6, events=POLLIN}, {fd=10, events=POLLIN}, {fd=12, events=POLLIN}, {fd=0, events=POLLIN}], 5, -1
) = 1 ([{fd=0, revents=POLLIN}])
read(0, "\n", 16384) = 1
write(6, "\20\0\0\0\1\0\0\0\320\24\22\2\0\0\0\0@\23\22\2", 20) = 20
write(6, "\n", 1) = 1
poll([{fd=4, events=POLLIN}, {fd=6, events=POLLIN}, {fd=10, events=POLLIN}, {fd=12, events=POLLIN}, {fd=0, events=POLLIN}], 5, -1
) = 1 ([{fd=0, revents=POLLIN}])
read(0, "\n", 16384) = 1
write(6, "\20\0\0\0\1\0\0\0\320\24\22\2\0\0\0\0@\23\22\2", 20) = 20
write(6, "\n", 1) = 1
poll([{fd=4, events=POLLIN}, {fd=6, events=POLLIN}, {fd=10, events=POLLIN}, {fd=12, events=POLLIN}, {fd=0, events=POLLIN}], 5, -1
) = 1 ([{fd=0, revents=POLLIN}])
read(0, "\n", 16384) = 1
write(6, "\20\0\0\0\1\0\0\0\320\24\22\2\0\0\0\0@\23\22\2", 20) = 20
write(6, "\n", 1) = 1
poll([{fd=4, events=POLLIN}, {fd=6, events=POLLIN}, {fd=10, events=POLLIN}, {fd=12, events=POLLIN}, {fd=0, events=POLLIN}], 5, -1
) = 1 ([{fd=0, revents=POLLIN}])
read(0, "\n", 16384) = 1
write(6, "\20\0\0\0\1\0\0\0\320\24\22\2\0\0\0\0@\23\22\2", 20) = 20
write(6, "\n", 1) = 1
poll([{fd=4, events=POLLIN}, {fd=6, events=POLLIN}, {fd=10, events=POLLIN}, {fd=12, events=POLLIN}, {fd=0, events=POLLIN}], 5, -1
) = 1 ([{fd=0, revents=POLLIN}])
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
Thanks for posting in Intel Communities.
Could you please provide us with the complete debug log after running the below command at your end?
I_MPI_DEBUG=30 I_MPI_HYDRA_DEBUG=on FI_LOG_LEVEL=debug /usr/diags/mpi/impi/2019.4.243/intel64/bin/mpirun -n 2 ./hello
Please find the hello_mpi.cpp code is attached below.
Thanks & Regards,
Varsha
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
We have not heard back from you. Could you please provide an update on your issue?
Thanks & Regards,
Varsha
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
We have not heard back from you. This thread will no longer be monitored by Intel. If you need further assistance, please post a new question.
Thanks & Regards,
Varsha
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page