Hi, Ivan,
Thank you for your reply.
Our distribution is Open MPI 3.1.6. gdb is available on my HPC.
Below are the job script and the R code I ran on the HPC, followed by the output:
Linux script:
======================
#!/bin/bash
#PJM -L "rscunit=ito-a"
#PJM -L "rscgrp=ito-s-dbg"
#PJM -L "vnode=4"
#PJM -L "vnode-core=36"
#PJM -L "elapse=00:05:00"
#PJM -j
#PJM -X
module use /home/exp/modulefiles
module load gcc/10.2.0
module load exp-openmpi/3.1.6-nocuda-gcc10.2.0
module load exp-R/4.0.5-gcc10.2.0-openmpi-debug
module load exp-libtool/2.4.6
echo {with the command \(without gdb option\):}
mpirun -np 4 --map-by ppr:1:node --bind-to none -mca plm_rsh_agent
/bin/pjrsh -machinefile ${PJM_O_NODEINF} R --no-save -q -f
/home/usr6/q70176a/testReturnCoreMememory.R
echo {with the command \(with gdb option\)}:
mpirun -np 4 --map-by ppr:1:node --bind-to none -mca plm_rsh_agent
/bin/pjrsh -machinefile ${PJM_O_NODEINF} R -d gdb --no-save -q -f
/home/usr6/q70176a/testReturnCoreMememory.R
======================
R code:
======================
library(Rmpi)
mpi.spawn.Rslaves(nslaves=4)
tailslave.log()
mpi.remote.exec(rnorm(10))
mpi.close.Rslaves()
======================
Results:
======================
{with the command (without gdb option):}
> > > > library(Rmpi)
library(Rmpi)
library(Rmpi)
library(Rmpi)
*** caught segfault ***
address 0x1, cause 'memory not mapped'
*** caught segfault ***
address 0x1, cause 'memory not mapped'
*** caught segfault ***
address 0x1, cause 'memory not mapped'
*** caught segfault ***
address 0x1, cause 'memory not mapped'
Traceback:
1: fun(libname, pkgname)
2: doTryCatch(return(expr), name, parentenv, handler)
3: tryCatchOne(expr, names, parentenv, handlers[[1L]])
4: tryCatchList(expr, classes, parentenv, handlers)
5: tryCatch(fun(libname, pkgname), error = identity)
6: runHook(".onLoad", env, package.lib, package)
7: loadNamespace(package, lib.loc)
8: doTryCatch(return(expr), name, parentenv, handler)
9: tryCatchOne(expr, names, parentenv, handlers[[1L]])
10: tryCatchList(expr, classes, parentenv, handlers)
11: tryCatch({ attr(package, "LibPath") <- which.lib.loc ns
<-
loadNamespace(package, lib.loc) env <- attachNamespace(ns, pos = pos,
deps, exclude, include.only)}, error = function(e) { P <- if
(!is.null(cc <- conditionCall(e))) paste(" in",
deparse(cc)[1L])
else "" msg <- gettextf("package or namespace load failed
for %s%s:\n
%s", sQuote(package), P, conditionMessage(e)) if
(logical.return) message(paste("Error:", msg), domain = NA)
else
stop(msg, call. = FALSE, domain = NA)})
12: library(Rmpi)
An irrecoverable exception occurred. R is aborting now ...
Traceback:
1: fun(libname, pkgname)
2: doTryCatch(return(expr), name, parentenv, handler)
3: tryCatchOne(expr, names, parentenv, handlers[[1L]])
4: tryCatchList(expr, classes, parentenv, handlers)
5: tryCatch(fun(libname, pkgname), error = identity)
6: runHook(".onLoad", env, package.lib, package)
7: loadNamespace(package, lib.loc)
8: doTryCatch(return(expr), name, parentenv, handler)
9: tryCatchOne(expr, names, parentenv, handlers[[1L]])
10: tryCatchList(expr, classes, parentenv, handlers)
11: tryCatch({ attr(package, "LibPath") <- which.lib.loc ns
<-
loadNamespace(package, lib.loc) env <- attachNamespace(ns, pos = pos,
deps, exclude, include.only)}, error = function(e) { P <- if
(!is.null(cc <- conditionCall(e))) paste(" in",
deparse(cc)[1L])
else "" msg <- gettextf("package or namespace load failed
for %s%s:\n
%s", sQuote(package), P, conditionMessage(e)) if
(logical.return) message(paste("Error:", msg), domain = NA)
else
stop(msg, call. = FALSE, domain = NA)})
12: library(Rmpi)
An irrecoverable exception occurred. R is aborting now ...
Traceback:
1: fun(libname, pkgname)
2: doTryCatch(return(expr), name, parentenv, handler)
3: tryCatchOne(expr, names, parentenv, handlers[[1L]])
4: tryCatchList(expr, classes, parentenv, handlers)
5: tryCatch(fun(libname, pkgname), error = identity)
6: runHook(".onLoad", env, package.lib, package)
7: loadNamespace(package, lib.loc)
8: doTryCatch(return(expr), name, parentenv, handler)
9: tryCatchOne(expr, names, parentenv, handlers[[1L]])
10: tryCatchList(expr, classes, parentenv, handlers)
11: tryCatch({ attr(package, "LibPath") <- which.lib.loc ns
<-
loadNamespace(package, lib.loc) env <- attachNamespace(ns, pos = pos,
deps, exclude, include.only)}, error = function(e) { P <- if
(!is.null(cc <- conditionCall(e))) paste(" in",
deparse(cc)[1L])
else "" msg <- gettextf("package or namespace load failed
for %s%s:\n
%s", sQuote(package), P, conditionMessage(e)) if
(logical.return) message(paste("Error:", msg), domain = NA)
else
stop(msg, call. = FALSE, domain = NA)})
12: library(Rmpi)
An irrecoverable exception occurred. R is aborting now ...
Traceback:
1: fun(libname, pkgname)
2: doTryCatch(return(expr), name, parentenv, handler)
3: tryCatchOne(expr, names, parentenv, handlers[[1L]])
4: tryCatchList(expr, classes, parentenv, handlers)
5: tryCatch(fun(libname, pkgname), error = identity)
6: runHook(".onLoad", env, package.lib, package)
7: loadNamespace(package, lib.loc)
8: doTryCatch(return(expr), name, parentenv, handler)
9: tryCatchOne(expr, names, parentenv, handlers[[1L]])
10: tryCatchList(expr, classes, parentenv, handlers)
11: tryCatch({ attr(package, "LibPath") <- which.lib.loc ns
<-
loadNamespace(package, lib.loc) env <- attachNamespace(ns, pos = pos,
deps, exclude, include.only)}, error = function(e) { P <- if
(!is.null(cc <- conditionCall(e))) paste(" in",
deparse(cc)[1L])
else "" msg <- gettextf("package or namespace load failed
for %s%s:\n
%s", sQuote(package), P, conditionMessage(e)) if
(logical.return) message(paste("Error:", msg), domain = NA)
else
stop(msg, call. = FALSE, domain = NA)})
12: library(Rmpi)
An irrecoverable exception occurred. R is aborting now ...
--------------------------------------------------------------------------
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 0 on node sca0171 exited on
signal 11 (Segmentation fault).
--------------------------------------------------------------------------
3 total processes killed (some possibly by mpirun during cleanup)
{with the command (with gdb option)}:
[?1034hGNU gdb (GDB) Red Hat Enterprise Linux 7.6.1-120.el7
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later
<http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show
copying"
and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
[?1034hGNU gdb (GDB) Red Hat Enterprise Linux 7.6.1-120.el7
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later
<http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show
copying"
and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
[?1034hGNU gdb (GDB) Red Hat Enterprise Linux 7.6.1-120.el7
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later
<http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show
copying"
and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
[?1034hGNU gdb (GDB) Red Hat Enterprise Linux 7.6.1-120.el7
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later
<http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show
copying"
and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
Reading symbols from
/home/exp/R/4.0.5-gcc10.2.0-openmpi-debug/lib64/R/bin/exec/R...Reading
symbols from
/home/exp/R/4.0.5-gcc10.2.0-openmpi-debug/lib64/R/bin/exec/R...Reading
symbols from
/home/exp/R/4.0.5-gcc10.2.0-openmpi-debug/lib64/R/bin/exec/R...Reading
symbols from
/home/exp/R/4.0.5-gcc10.2.0-openmpi-debug/lib64/R/bin/exec/R...done.
done.
done.
(gdb) quit
done.
(gdb) quit
(gdb) quit
(gdb) quit
======================
Thank you
Best Regards
On 14 Sep 2022 at 16:35, Ivan Krylov <krylov.r00t at gmail.com> wrote:
> On Wed, 14 Sep 2022 15:00:45 +0900
> James Li <jamesli200116 at gmail.com> wrote:
>
> > However, when he tried a simple R script with only one line:
> > library(Rmpi)
> > the job caused "Segmentation Fault" error.
>
> Is there a job submission system on the HPC that is required for
> programs that need MPI? Does the error still happen if you use it,
> sacrificing the ability to run R with Rmpi interactively?
>
> Do you have a debugger (probably gdb) available to you? Debugging
> segmentation faults without a debugger is, in theory, possible, but
> requires a lot of experience and effort. We'll need at least a
> backtrace in order to start debugging, and for that we'll need the
> debugging symbols.
>
> What does the HPC sysadmin say about this problem? You'll probably have
> to contact this person anyway in order to get access to a debugger and
> ensure that the debugging symbols are available.
>
> --
> Best regards,
> Ivan
>
--
---------------------------------------------------
James Li
This is a machine email!
[[alternative HTML version deleted]]