I am attempting to run lustre-manager on a Dell 2850 (2 physical CPUs, 4 logical CPUs with hyperthreading) and the lmd agent crashes. It works fine on a 2-CPU box (same version of OS and Lustre). My guess is that lmd is hard-coded to know about only 2 CPUs? We use RHEL 4 (4.2 current release) and Lustre 1.4.5. Thanks, Steve Here is an strace of the crash (it happens while lmd is reading CPU data from /proc/stat). If you need more info, let me know: execve("/usr/sbin/lmd", ["lmd"], [/* 24 vars */]) = 0 uname({sys="Linux", node="csia7mds01", ...}) = 0 brk(0) = 0x804f000 access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory) open("/etc/ld.so.cache", O_RDONLY) = 3 fstat64(3, {st_mode=S_IFREG|0644, st_size=37550, ...}) = 0 old_mmap(NULL, 37550, PROT_READ, MAP_PRIVATE, 3, 0) = 0x656000 close(3) = 0 open("/lib/tls/libpthread.so.0", O_RDONLY) = 3 read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\20\10F"..., 512) = 512 fstat64(3, {st_mode=S_IFREG|0755, st_size=93865, ...}) = 0 old_mmap(0x45c000, 70108, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x45c000 old_mmap(0x46a000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0xd000) = 0x46a000 old_mmap(0x46c000, 4572, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x46c000 close(3) = 0 open("/lib/tls/libc.so.6", O_RDONLY) = 3 read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\20\257"..., 512) = 512 fstat64(3, {st_mode=S_IFREG|0755, st_size=1454462, ...}) = 0 old_mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x879000 old_mmap(0x286000, 1219772, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x286000 old_mmap(0x3aa000, 16384, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x124000) = 0x3aa000 old_mmap(0x3ae000, 7356, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x3ae000 close(3) = 0 old_mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xa9c000 mprotect(0x3aa000, 4096, PROT_READ) = 0 
mprotect(0x282000, 4096, PROT_READ) = 0 set_thread_area({entry_number:-1 -> 6, base_addr:0xa9c6c0, limit:1048575, seg_32bit:1, contents:0, read_exec_only:0, limit_in_pages:1, seg_not_present:0, useable:1}) = 0 munmap(0x656000, 37550) = 0 set_tid_address(0xa9c708) = 30960 rt_sigaction(SIGRTMIN, {0x460380, [], SA_RESTORER|SA_SIGINFO, 0x4677c0}, NULL, 8) = 0 rt_sigaction(SIGRT_1, {0x4603f0, [], SA_RESTORER|SA_RESTART|SA_SIGINFO, 0x4677c0}, NULL, 8) = 0 rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0 getrlimit(RLIMIT_STACK, {rlim_cur=10240*1024, rlim_max=RLIM_INFINITY}) = 0 _sysctl({{CTL_KERN, KERN_VERSION, 0, 20ca9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, 2, 0xbffff75c, 35, (nil), 0}) = 0 open("/tmp/lmd.debug", O_WRONLY|O_CREAT|O_TRUNC, 0644) = 3 close(0) = 0 dup2(3, 1) = 1 dup2(3, 2) = 2 close(3) = 0 clone(Process 30961 attached child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0xa9c708) = 30961 [pid 30961] setsid( <unfinished ...> [pid 30960] exit_group(0) = ? [pid 30961] <... 
setsid resumed> ) = 30961 [pid 30961] chdir("/"Process 30960 detached ) = 0 uname({sys="Linux", node="csia7mds01", ...}) = 0 socket(PF_INET, SOCK_DGRAM, IPPROTO_IP) = 0 fcntl64(0, F_GETFL) = 0x2 (flags O_RDWR) brk(0) = 0x804f000 brk(0x8070000) = 0x8070000 fstat64(0, {st_mode=S_IFSOCK|0777, st_size=0, ...}) = 0 mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x9c3000 _llseek(0, 0, 0xbffff374, SEEK_CUR) = -1 ESPIPE (Illegal seek) bind(0, {sa_family=AF_INET, sin_port=htons(9988), sin_addr=inet_addr("0.0.0.0")}, 16) = 0 mmap2(NULL, 10489856, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xa9d000 mprotect(0xa9d000, 4096, PROT_NONE) = 0 clone(Process 30962 attached child_stack=0x149d4c4, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID|CLONE_DETACHED, parent_tidptr=0x149dbf8, {entry_number:6, base_addr:0x149dbb0, limit:1048575, seg_32bit:1, contents:0, read_exec_only:0, limit_in_pages:1, seg_not_present:0, useable:1}, child_tidptr=0x149dbf8) = 30962 [pid 30961] mmap2(NULL, 10489856, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0 <unfinished ...> [pid 30962] select(0, NULL, NULL, NULL, {5, 0} <unfinished ...> [pid 30961] <... 
mmap2 resumed> ) = 0x5896000 [pid 30961] mprotect(0x5896000, 4096, PROT_NONE) = 0 [pid 30961] clone(Process 30963 attached child_stack=0x62964c4, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID|CLONE_DETACHED, parent_tidptr=0x6296bf8, {entry_number:6, base_addr:0x6296bb0, limit:1048575, seg_32bit:1, contents:0, read_exec_only:0, limit_in_pages:1, seg_not_present:0, useable:1}, child_tidptr=0x6296bf8) = 30963 [pid 30961] select(1, [0], NULL, NULL, {5, 0} <unfinished ...> [pid 30963] open("/proc/stat", O_RDONLY) = 3 [pid 30963] fstat64(1, {st_mode=S_IFREG|0644, st_size=0, ...}) = 0 [pid 30963] mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x111000 [pid 30963] read(3, "c", 1) = 1 [pid 30963] read(3, "p", 1) = 1 [pid 30963] read(3, "u", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "1", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, "6", 1) = 1 [pid 30963] read(3, "0", 1) = 1 [pid 30963] read(3, "7", 1) = 1 [pid 30963] read(3, "7", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "2", 1) = 1 [pid 30963] read(3, "7", 1) = 1 [pid 30963] read(3, "7", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, "6", 1) = 1 [pid 30963] read(3, "6", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, "5", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, "9", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "2", 1) = 1 [pid 30963] read(3, "0", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, "1", 1) = 1 [pid 30963] read(3, "0", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, "8", 1) = 1 [pid 30963] read(3, "7", 1) = 1 [pid 30963] read(3, "5", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "1", 1) = 1 [pid 30963] read(3, "1", 1) = 1 [pid 30963] read(3, "8", 1) = 1 [pid 30963] read(3, "7", 1) = 1 [pid 30963] 
read(3, "6", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, "8", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "6", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, "9", 1) = 1 [pid 30963] read(3, "1", 1) = 1 [pid 30963] read(3, "0", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "2", 1) = 1 [pid 30963] read(3, "0", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, "7", 1) = 1 [pid 30963] read(3, "\n", 1) = 1 [pid 30963] gettimeofday({1130266886, 103174}, NULL) = 0 [pid 30963] read(3, "c", 1) = 1 [pid 30963] read(3, "p", 1) = 1 [pid 30963] read(3, "u", 1) = 1 [pid 30963] read(3, "0", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, "4", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, "0", 1) = 1 [pid 30963] read(3, "1", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "4", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "9", 1) = 1 [pid 30963] read(3, "5", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, "7", 1) = 1 [pid 30963] read(3, "0", 1) = 1 [pid 30963] read(3, "1", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "5", 1) = 1 [pid 30963] read(3, "0", 1) = 1 [pid 30963] read(3, "6", 1) = 1 [pid 30963] read(3, "8", 1) = 1 [pid 30963] read(3, "4", 1) = 1 [pid 30963] read(3, "5", 1) = 1 [pid 30963] read(3, "2", 1) = 1 [pid 30963] read(3, "9", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "1", 1) = 1 [pid 30963] read(3, "6", 1) = 1 [pid 30963] read(3, "9", 1) = 1 [pid 30963] read(3, "2", 1) = 1 [pid 30963] read(3, "6", 1) = 1 [pid 30963] read(3, "6", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, "8", 1) = 1 [pid 30963] read(3, "0", 1) = 1 [pid 30963] read(3, "5", 1) = 1 [pid 30963] read(3, "9", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "1", 1) = 1 
[pid 30963] read(3, "9", 1) = 1 [pid 30963] read(3, "2", 1) = 1 [pid 30963] read(3, "0", 1) = 1 [pid 30963] read(3, "8", 1) = 1 [pid 30963] read(3, "1", 1) = 1 [pid 30963] read(3, "\n", 1) = 1 [pid 30963] read(3, "c", 1) = 1 [pid 30963] read(3, "p", 1) = 1 [pid 30963] read(3, "u", 1) = 1 [pid 30963] read(3, "1", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, "9", 1) = 1 [pid 30963] read(3, "5", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, "1", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "4", 1) = 1 [pid 30963] read(3, "1", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "1", 1) = 1 [pid 30963] read(3, "5", 1) = 1 [pid 30963] read(3, "9", 1) = 1 [pid 30963] read(3, "0", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, "1", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "4", 1) = 1 [pid 30963] read(3, "9", 1) = 1 [pid 30963] read(3, "7", 1) = 1 [pid 30963] read(3, "2", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, "2", 1) = 1 [pid 30963] read(3, "2", 1) = 1 [pid 30963] read(3, "5", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "7", 1) = 1 [pid 30963] read(3, "0", 1) = 1 [pid 30963] read(3, "1", 1) = 1 [pid 30963] read(3, "4", 1) = 1 [pid 30963] read(3, "0", 1) = 1 [pid 30963] read(3, "1", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "8", 1) = 1 [pid 30963] read(3, "4", 1) = 1 [pid 30963] read(3, "5", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "8", 1) = 1 [pid 30963] read(3, "9", 1) = 1 [pid 30963] read(3, "8", 1) = 1 [pid 30963] read(3, "4", 1) = 1 [pid 30963] read(3, "\n", 1) = 1 [pid 30963] gettimeofday({1130266886, 107284}, NULL) = 0 [pid 30963] read(3, "c", 1) = 1 [pid 30963] read(3, "p", 1) = 1 [pid 30963] read(3, "u", 1) = 1 [pid 30963] read(3, "2", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, 
"0", 1) = 1 [pid 30963] read(3, "1", 1) = 1 [pid 30963] read(3, "4", 1) = 1 [pid 30963] read(3, "5", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "1", 1) = 1 [pid 30963] read(3, "0", 1) = 1 [pid 30963] read(3, "0", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "5", 1) = 1 [pid 30963] read(3, "5", 1) = 1 [pid 30963] read(3, "1", 1) = 1 [pid 30963] read(3, "7", 1) = 1 [pid 30963] read(3, "4", 1) = 1 [pid 30963] read(3, "8", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "5", 1) = 1 [pid 30963] read(3, "1", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, "7", 1) = 1 [pid 30963] read(3, "2", 1) = 1 [pid 30963] read(3, "4", 1) = 1 [pid 30963] read(3, "5", 1) = 1 [pid 30963] read(3, "8", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "1", 1) = 1 [pid 30963] read(3, "4", 1) = 1 [pid 30963] read(3, "4", 1) = 1 [pid 30963] read(3, "3", 1) = 1 [pid 30963] read(3, "0", 1) = 1 [pid 30963] read(3, "8", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "7", 1) = 1 [pid 30963] read(3, "5", 1) = 1 [pid 30963] read(3, "1", 1) = 1 [pid 30963] read(3, "7", 1) = 1 [pid 30963] read(3, " ", 1) = 1 [pid 30963] read(3, "1", 1) = 1 [pid 30963] read(3, "0", 1) = 1 [pid 30963] read(3, "7", 1) = 1 [pid 30963] read(3, "2", 1) = 1 [pid 30963] read(3, "\n", 1) = 1 [pid 30963] --- SIGSEGV (Segmentation fault) @ 0 (0) --- Process 30961 detached Process 30963 detached [pid 30962] <... select resumed> ) = ? ERESTARTNOHAND (To be restarted) [pid 30962] +++ killed by SIGSEGV +++ PANIC: handle_group_exit: 30962 leader 30961 Process 30962 detached Process 30961 detached