I am attempting to run lustre-manager on a Dell 2850 (2 physical CPUs,
4 logical CPUs with hyperthreading enabled) and the lmd agent crashes.
It works fine on a 2-CPU box (same version of the OS and Lustre).
My guess is that lmd is hard-coded to handle only 2 CPUs.
We use RHEL 4 (4.2, the current release) and Lustre 1.4.5.
Thanks,
Steve
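Here is an strace of the crash. lmd is reading the per-CPU lines from
/proc/stat, and the segfault occurs immediately after it finishes reading
the "cpu2" line (the third per-CPU entry). If you need more info, let me
know: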
execve("/usr/sbin/lmd", ["lmd"], [/* 24 vars */]) = 0
uname({sys="Linux", node="csia7mds01", ...}) = 0
brk(0) = 0x804f000
access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or
directory)
open("/etc/ld.so.cache", O_RDONLY) = 3
fstat64(3, {st_mode=S_IFREG|0644, st_size=37550, ...}) = 0
old_mmap(NULL, 37550, PROT_READ, MAP_PRIVATE, 3, 0) = 0x656000
close(3) = 0
open("/lib/tls/libpthread.so.0", O_RDONLY) = 3
read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\20\10F"...,
512) = 512
fstat64(3, {st_mode=S_IFREG|0755, st_size=93865, ...}) = 0
old_mmap(0x45c000, 70108, PROT_READ|PROT_EXEC,
MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x45c000
old_mmap(0x46a000, 8192, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0xd000) = 0x46a000
old_mmap(0x46c000, 4572, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x46c000
close(3) = 0
open("/lib/tls/libc.so.6", O_RDONLY) = 3
read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\20\257"...,
512) = 512
fstat64(3, {st_mode=S_IFREG|0755, st_size=1454462, ...}) = 0
old_mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS,
-1, 0) = 0x879000
old_mmap(0x286000, 1219772, PROT_READ|PROT_EXEC,
MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x286000
old_mmap(0x3aa000, 16384, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x124000) = 0x3aa000
old_mmap(0x3ae000, 7356, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x3ae000
close(3) = 0
old_mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS,
-1, 0) = 0xa9c000
mprotect(0x3aa000, 4096, PROT_READ) = 0
mprotect(0x282000, 4096, PROT_READ) = 0
set_thread_area({entry_number:-1 -> 6, base_addr:0xa9c6c0,
limit:1048575, seg_32bit:1, contents:0, read_exec_only:0,
limit_in_pages:1, seg_not_present:0, useable:1}) = 0
munmap(0x656000, 37550) = 0
set_tid_address(0xa9c708) = 30960
rt_sigaction(SIGRTMIN, {0x460380, [], SA_RESTORER|SA_SIGINFO, 0x4677c0},
NULL, 8) = 0
rt_sigaction(SIGRT_1, {0x4603f0, [], SA_RESTORER|SA_RESTART|SA_SIGINFO,
0x4677c0}, NULL, 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0
getrlimit(RLIMIT_STACK, {rlim_cur=10240*1024, rlim_max=RLIM_INFINITY}) 0
_sysctl({{CTL_KERN, KERN_VERSION, 0, 20ca9, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, 2, 0xbffff75c,
35, (nil), 0}) = 0
open("/tmp/lmd.debug", O_WRONLY|O_CREAT|O_TRUNC, 0644) = 3
close(0) = 0
dup2(3, 1) = 1
dup2(3, 2) = 2
close(3) = 0
clone(Process 30961 attached
child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD,
child_tidptr=0xa9c708) = 30961
[pid 30961] setsid( <unfinished ...>
[pid 30960] exit_group(0) = ?
[pid 30961] <... setsid resumed> ) = 30961
[pid 30961] chdir("/"Process 30960 detached
) = 0
uname({sys="Linux", node="csia7mds01", ...}) = 0
socket(PF_INET, SOCK_DGRAM, IPPROTO_IP) = 0
fcntl64(0, F_GETFL) = 0x2 (flags O_RDWR)
brk(0) = 0x804f000
brk(0x8070000) = 0x8070000
fstat64(0, {st_mode=S_IFSOCK|0777, st_size=0, ...}) = 0
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1,
0) = 0x9c3000
_llseek(0, 0, 0xbffff374, SEEK_CUR) = -1 ESPIPE (Illegal seek)
bind(0, {sa_family=AF_INET, sin_port=htons(9988),
sin_addr=inet_addr("0.0.0.0")}, 16) = 0
mmap2(NULL, 10489856, PROT_READ|PROT_WRITE|PROT_EXEC,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xa9d000
mprotect(0xa9d000, 4096, PROT_NONE) = 0
clone(Process 30962 attached
child_stack=0x149d4c4,
flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYS
VSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID|CLONE_DETACHE
D, parent_tidptr=0x149dbf8, {entry_number:6, base_addr:0x149dbb0,
limit:1048575, seg_32bit:1, contents:0, read_exec_only:0,
limit_in_pages:1, seg_not_present:0, useable:1}, child_tidptr=0x149dbf8)
= 30962
[pid 30961] mmap2(NULL, 10489856, PROT_READ|PROT_WRITE|PROT_EXEC,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0 <unfinished ...>
[pid 30962] select(0, NULL, NULL, NULL, {5, 0} <unfinished ...>
[pid 30961] <... mmap2 resumed> ) = 0x5896000
[pid 30961] mprotect(0x5896000, 4096, PROT_NONE) = 0
[pid 30961] clone(Process 30963 attached
child_stack=0x62964c4,
flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYS
VSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID|CLONE_DETACHE
D, parent_tidptr=0x6296bf8, {entry_number:6, base_addr:0x6296bb0,
limit:1048575, seg_32bit:1, contents:0, read_exec_only:0,
limit_in_pages:1, seg_not_present:0, useable:1}, child_tidptr=0x6296bf8)
= 30963
[pid 30961] select(1, [0], NULL, NULL, {5, 0} <unfinished ...>
[pid 30963] open("/proc/stat", O_RDONLY) = 3
[pid 30963] fstat64(1, {st_mode=S_IFREG|0644, st_size=0, ...}) = 0
[pid 30963] mmap2(NULL, 4096, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x111000
[pid 30963] read(3, "c", 1) = 1
[pid 30963] read(3, "p", 1) = 1
[pid 30963] read(3, "u", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "1", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, "6", 1) = 1
[pid 30963] read(3, "0", 1) = 1
[pid 30963] read(3, "7", 1) = 1
[pid 30963] read(3, "7", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "2", 1) = 1
[pid 30963] read(3, "7", 1) = 1
[pid 30963] read(3, "7", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, "6", 1) = 1
[pid 30963] read(3, "6", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, "5", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, "9", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "2", 1) = 1
[pid 30963] read(3, "0", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, "1", 1) = 1
[pid 30963] read(3, "0", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, "8", 1) = 1
[pid 30963] read(3, "7", 1) = 1
[pid 30963] read(3, "5", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "1", 1) = 1
[pid 30963] read(3, "1", 1) = 1
[pid 30963] read(3, "8", 1) = 1
[pid 30963] read(3, "7", 1) = 1
[pid 30963] read(3, "6", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, "8", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "6", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, "9", 1) = 1
[pid 30963] read(3, "1", 1) = 1
[pid 30963] read(3, "0", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "2", 1) = 1
[pid 30963] read(3, "0", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, "7", 1) = 1
[pid 30963] read(3, "\n", 1) = 1
[pid 30963] gettimeofday({1130266886, 103174}, NULL) = 0
[pid 30963] read(3, "c", 1) = 1
[pid 30963] read(3, "p", 1) = 1
[pid 30963] read(3, "u", 1) = 1
[pid 30963] read(3, "0", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, "4", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, "0", 1) = 1
[pid 30963] read(3, "1", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "4", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "9", 1) = 1
[pid 30963] read(3, "5", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, "7", 1) = 1
[pid 30963] read(3, "0", 1) = 1
[pid 30963] read(3, "1", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "5", 1) = 1
[pid 30963] read(3, "0", 1) = 1
[pid 30963] read(3, "6", 1) = 1
[pid 30963] read(3, "8", 1) = 1
[pid 30963] read(3, "4", 1) = 1
[pid 30963] read(3, "5", 1) = 1
[pid 30963] read(3, "2", 1) = 1
[pid 30963] read(3, "9", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "1", 1) = 1
[pid 30963] read(3, "6", 1) = 1
[pid 30963] read(3, "9", 1) = 1
[pid 30963] read(3, "2", 1) = 1
[pid 30963] read(3, "6", 1) = 1
[pid 30963] read(3, "6", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, "8", 1) = 1
[pid 30963] read(3, "0", 1) = 1
[pid 30963] read(3, "5", 1) = 1
[pid 30963] read(3, "9", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "1", 1) = 1
[pid 30963] read(3, "9", 1) = 1
[pid 30963] read(3, "2", 1) = 1
[pid 30963] read(3, "0", 1) = 1
[pid 30963] read(3, "8", 1) = 1
[pid 30963] read(3, "1", 1) = 1
[pid 30963] read(3, "\n", 1) = 1
[pid 30963] read(3, "c", 1) = 1
[pid 30963] read(3, "p", 1) = 1
[pid 30963] read(3, "u", 1) = 1
[pid 30963] read(3, "1", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, "9", 1) = 1
[pid 30963] read(3, "5", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, "1", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "4", 1) = 1
[pid 30963] read(3, "1", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "1", 1) = 1
[pid 30963] read(3, "5", 1) = 1
[pid 30963] read(3, "9", 1) = 1
[pid 30963] read(3, "0", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, "1", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "4", 1) = 1
[pid 30963] read(3, "9", 1) = 1
[pid 30963] read(3, "7", 1) = 1
[pid 30963] read(3, "2", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, "2", 1) = 1
[pid 30963] read(3, "2", 1) = 1
[pid 30963] read(3, "5", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "7", 1) = 1
[pid 30963] read(3, "0", 1) = 1
[pid 30963] read(3, "1", 1) = 1
[pid 30963] read(3, "4", 1) = 1
[pid 30963] read(3, "0", 1) = 1
[pid 30963] read(3, "1", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "8", 1) = 1
[pid 30963] read(3, "4", 1) = 1
[pid 30963] read(3, "5", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "8", 1) = 1
[pid 30963] read(3, "9", 1) = 1
[pid 30963] read(3, "8", 1) = 1
[pid 30963] read(3, "4", 1) = 1
[pid 30963] read(3, "\n", 1) = 1
[pid 30963] gettimeofday({1130266886, 107284}, NULL) = 0
[pid 30963] read(3, "c", 1) = 1
[pid 30963] read(3, "p", 1) = 1
[pid 30963] read(3, "u", 1) = 1
[pid 30963] read(3, "2", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, "0", 1) = 1
[pid 30963] read(3, "1", 1) = 1
[pid 30963] read(3, "4", 1) = 1
[pid 30963] read(3, "5", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "1", 1) = 1
[pid 30963] read(3, "0", 1) = 1
[pid 30963] read(3, "0", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "5", 1) = 1
[pid 30963] read(3, "5", 1) = 1
[pid 30963] read(3, "1", 1) = 1
[pid 30963] read(3, "7", 1) = 1
[pid 30963] read(3, "4", 1) = 1
[pid 30963] read(3, "8", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "5", 1) = 1
[pid 30963] read(3, "1", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, "7", 1) = 1
[pid 30963] read(3, "2", 1) = 1
[pid 30963] read(3, "4", 1) = 1
[pid 30963] read(3, "5", 1) = 1
[pid 30963] read(3, "8", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "1", 1) = 1
[pid 30963] read(3, "4", 1) = 1
[pid 30963] read(3, "4", 1) = 1
[pid 30963] read(3, "3", 1) = 1
[pid 30963] read(3, "0", 1) = 1
[pid 30963] read(3, "8", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "7", 1) = 1
[pid 30963] read(3, "5", 1) = 1
[pid 30963] read(3, "1", 1) = 1
[pid 30963] read(3, "7", 1) = 1
[pid 30963] read(3, " ", 1) = 1
[pid 30963] read(3, "1", 1) = 1
[pid 30963] read(3, "0", 1) = 1
[pid 30963] read(3, "7", 1) = 1
[pid 30963] read(3, "2", 1) = 1
[pid 30963] read(3, "\n", 1) = 1
[pid 30963] --- SIGSEGV (Segmentation fault) @ 0 (0) ---
Process 30961 detached
Process 30963 detached
[pid 30962] <... select resumed> ) = ? ERESTARTNOHAND (To be
restarted)
[pid 30962] +++ killed by SIGSEGV +++
PANIC: handle_group_exit: 30962 leader 30961
Process 30962 detached
Process 30961 detached