Occasionally we see a process get stuck in an unkillable state and
the only solution is a hard reboot.
Occasionally == once every two weeks across 60+ servers, which are spread
across the globe in customer sites. We have no remote access to these boxes.
The process that most often that gets stuck, but not limited to, is a large
scale Ping/SNMP poller. It is a fairly simplistic C program that just fires
out lots of ping (raw ICMP socket) and SNMP (UDP socket) requests
asynchronously.
We've managed to trap the problem a few times on a test server running in
VirtualBox, but it also occurs on customer sites who run VMware, Hyper-V,
QEMU and on bare metal.
We raise this PR
https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=204081
but suspect it is a similar/same issue as
https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=200992
This is the info we've gathered from the most recent time it has occurred:
# uname -a
FreeBSD shed153.akips.com 10.2-RELEASE-p12 FreeBSD 10.2-RELEASE-p12 #0 r295070:
Sat Jan 30 20:03:44 UTC 2016 root at
shed21.akips.com:/usr/obj/usr/src/sys/GENERIC amd64
The nm-poller has no state in top for some reason ??
last pid: 1847; load averages: 0.62, 1.20, 1.33 up 13+16:06:04 13:36:46
103 processes: 1 running, 102 sleeping
CPU: 1.0% user, 0.0% nice, 4.3% system, 0.0% interrupt, 94.7% idle
Mem: 650M Active, 541M Inact, 2527M Wired, 16M Cache, 417M Buf, 217M Free
ARC: 2087M Total, 102M MFU, 1968M MRU, 18K Anon, 9409K Header, 9088K Other
Swap: 4096M Total, 256M Used, 3840M Free, 6% Inuse
PID USERNAME THR PRI NICE SIZE RES STATE C TIME WCPU COMMAND
1013 akips 1 20 0 74076K 5544K select 1 195:41 0.59% nm-httpd
1003 akips 1 20 0 164M 54328K select 0 236:18 0.49%
nm-flow-collector
888 root 1 20 0 101M 14920K select 0 163:56 0.39% nm-joatd
885 akips 1 20 0 74004K 3092K nanslp 1 116:52 0.29% nm-timed
1014 akips 1 4 0 851M 104M 0 18.0H 0.00% nm-poller
1086 akips 1 20 0 21940K 2680K nanslp 0 66:25 0.00% top
1015 akips 1 20 0 819M 256M nanslp 1 56:45 0.00%
nm-poller-db
1023 akips 1 20 0 114M 44760K select 0 55:00 0.00%
nm-flow-meter
1005 akips 1 20 0 159M 4172K select 0 51:00 0.00% nm-msgd
1025 akips 1 20 0 114M 45644K select 0 44:22 0.00%
nm-flow-meter
1012 akips 1 20 0 60360K 5132K piperd 1 20:08 0.00% perl
1027 akips 1 20 0 110M 34564K select 1 18:58 0.00%
nm-flow-meter
997 akips 1 20 0 819M 27600K select 0 12:59 0.00%
nm-snmp-trapd
991 akips 1 20 0 78104K 5384K select 1 10:53 0.00%
nm-fifo-tee
989 akips 1 20 0 78104K 5764K select 1 10:34 0.00%
nm-fifo-tee
990 akips 1 20 0 78104K 5496K select 0 10:31 0.00%
nm-fifo-tee
1047 akips 1 20 0 102M 29108K select 0 10:25 0.00%
nm-flow-meter
1111 akips 1 20 0 102M 36000K select 0 9:18 0.00%
nm-flow-meter
1231 akips 1 20 0 102M 35952K select 1 9:17 0.00%
nm-flow-meter
1239 akips 1 20 0 102M 33132K select 0 8:51 0.00%
nm-flow-meter
1240 akips 1 20 0 102M 33132K select 1 8:51 0.00%
nm-flow-meter
1002 akips 1 20 0 74016K 3480K select 1 8:50 0.00% nm-syslogd
1234 akips 1 20 0 102M 35920K select 1 8:49 0.00%
nm-flow-meter
1243 akips 1 20 0 102M 33148K select 0 8:46 0.00%
nm-flow-meter
1039 akips 1 20 0 820M 31388K select 0 8:46 0.00% nm-db
1233 akips 1 20 0 102M 31256K select 0 8:43 0.00%
nm-flow-meter
1237 akips 1 20 0 102M 33168K select 0 8:43 0.00%
nm-flow-meter
1235 akips 1 20 0 102M 29040K select 1 8:41 0.00%
nm-flow-meter
1259 akips 1 20 0 102M 29096K select 0 8:40 0.00%
nm-flow-meter
1255 akips 1 20 0 102M 31756K select 1 8:40 0.00%
nm-flow-meter
1232 akips 1 20 0 102M 31780K select 1 8:39 0.00%
nm-flow-meter
1041 akips 1 20 0 820M 45284K select 0 8:34 0.00% nm-db
1044 akips 1 20 0 820M 26172K select 1 8:28 0.00% nm-db
1060 akips 1 20 0 74008K 3380K select 1 8:22 0.00% nm-syslog
1077 akips 1 20 0 820M 26076K select 1 8:22 0.00% nm-db
1048 akips 1 20 0 820M 26076K select 1 8:16 0.00% nm-db
1045 akips 1 20 0 820M 27056K select 1 8:16 0.00% nm-db
1046 akips 1 20 0 820M 26156K select 1 8:16 0.00% nm-db
22541 akips 1 20 0 820M 26092K select 1 8:16 0.00% nm-db
1049 root 1 20 0 820M 26076K select 0 8:15 0.00% nm-db
1043 akips 1 20 0 820M 26076K select 0 8:15 0.00% nm-db
1006 akips 1 20 0 74004K 3364K nanslp 1 8:04 0.00%
nm-timed-watcher
62906 akips 1 20 0 102M 33488K select 0 7:50 0.00%
nm-flow-meter
89368 akips 1 20 0 96556K 28228K select 0 5:57 0.00%
nm-flow-meter
9528 akips 1 20 0 96556K 27828K select 1 5:35 0.00%
nm-flow-meter
56009 akips 1 20 0 96556K 26120K select 0 5:03 0.00%
nm-flow-meter
985 akips 1 20 0 68684K 5248K nanslp 1 3:52 0.00% perl
986 akips 1 20 0 50112K 5596K nanslp 1 3:04 0.00% perl
22909 akips 1 20 0 88364K 19352K select 0 2:34 0.00%
nm-flow-meter
993 akips 1 20 0 68684K 5036K nanslp 0 2:10 0.00% perl
55564 akips 1 20 0 88364K 17916K select 0 2:05 0.00%
nm-flow-meter
995 akips 1 20 0 68684K 4904K nanslp 0 2:04 0.00% perl
561 root 1 20 0 21880K 13788K select 0 1:24 0.00% ntpd
994 akips 1 20 0 68684K 4552K piperd 1 0:24 0.00% perl
999 akips 1 20 0 68684K 7888K piperd 1 0:10 0.00% perl
996 akips 1 20 0 68684K 7720K piperd 1 0:09 0.00% perl
648 root 1 20 0 30336K 840K kqread 1 0:08 0.00% master
90631 akips 1 20 0 80172K 12048K select 0 0:07 0.00%
nm-flow-meter
920 root 1 20 0 16612K 684K nanslp 1 0:05 0.00% cron
945 akips 1 20 0 21940K 3008K select 0 0:04 0.00% top
412 root 1 20 0 14512K 1644K select 1 0:03 0.00% syslogd
998 akips 1 20 0 68684K 4160K piperd 1 0:02 0.00% perl
98732 akips 1 20 0 820M 28780K select 1 0:02 0.00% nm-db
98844 akips 1 20 0 820M 28780K select 1 0:02 0.00% nm-db
650 postfix 1 20 0 30460K 796K kqread 1 0:01 0.00% qmgr
323 root 1 20 0 13624K 540K select 0 0:01 0.00% devd
368 root 1 20 0 14456K 1560K select 1 0:01 0.00% rtsold
1037 akips 1 20 0 86492K 6444K select 1 0:00 0.00% sshd
928 akips 1 20 0 86492K 6408K select 1 0:00 0.00% sshd
1058 akips 1 52 0 58304K 3688K piperd 0 0:00 0.00% perl
22538 akips 1 52 0 58304K 3688K piperd 0 0:00 0.00% perl
98843 akips 1 48 0 58304K 16864K piperd 1 0:00 0.00% perl
98731 akips 1 48 0 58304K 16864K piperd 0 0:00 0.00% perl
1570 root 1 20 0 35756K 12644K ttyin 1 0:00 0.00% vi
1506 akips 1 20 0 86492K 6552K select 1 0:00 0.00% sshd
1803 root 1 20 0 21940K 3132K CPU0 0 0:00 0.00% top
1096 root 1 20 0 17088K 2676K wait 1 0:00 0.00% sh
926 root 1 25 0 86492K 6352K select 0 0:00 0.00% sshd
1503 root 1 22 0 86492K 6552K select 0 0:00 0.00% sshd
1258 root 1 24 0 86492K 6452K select 0 0:00 0.00% sshd
1034 root 1 23 0 86492K 6456K select 0 0:00 0.00% sshd
1261 akips 1 20 0 86492K 6452K select 0 0:00 0.00% sshd
965 root 1 52 0 14508K 1680K ttyin 1 0:00 0.00% getty
1038 akips 1 20 0 17088K 0K wait 1 0:00 0.00% <sh>
1527 akips 1 26 0 47732K 2684K wait 0 0:00 0.00% su
1095 akips 1 21 0 47728K 0K wait 0 0:00 0.00% <su>
1278 akips 1 39 0 47728K 0K wait 1 0:00 0.00% <su>
929 akips 1 20 0 17088K 0K wait 1 0:00 0.00% <sh>
916 root 1 20 0 61224K 4300K select 0 0:00 0.00% sshd
1507 akips 1 37 0 17088K 2636K wait 1 0:00 0.00% sh
964 root 1 52 0 14508K 1680K ttyin 1 0:00 0.00% getty
1439 postfix 1 20 0 30404K 3848K kqread 1 0:00 0.00% pickup
# ps auxww | grep nm-poller
akips 1014 0.0 2.6 871820 106540 - Ds 10Feb16 1078:59.06 nm-poller
# procstat -k 1014
PID TID COMM TDNAME KSTACK
1014 100365 nm-poller - mi_switch sleepq_timedwait_sig
_cv_timedwait_sig_sbt seltdwait kern_select sys_select amd64_syscall
Xfast_syscall
# ps axHS -o pid,lwp,paddr,tdaddr 1014
PID LWP PADDR TDADDR
1014 100365 fffff800bb819000 fffff800bb963940
(kgdb) set print pretty on
(kgdb) p *(struct proc *)0xfffff800bb819000
$4 = {
p_list = {
le_next = 0xfffff800bb6829d0,
le_prev = 0xfffff800bb8199d0
},
p_threads = {
tqh_first = 0xfffff800bb963940,
tqh_last = 0xfffff800bb963950
},
p_slock = {
lock_object = {
lo_name = 0xffffffff80fc8245 "process slock",
lo_flags = 720896,
lo_data = 0,
lo_witness = 0x0
},
mtx_lock = 4
},
p_ucred = 0xfffff800bb55f200,
p_fd = 0xfffff800bb840000,
p_fdtol = 0x0,
p_stats = 0xfffff800bb408600,
p_limit = 0xfffff8001339c900,
p_limco = {
c_links = {
le = {
le_next = 0x0,
le_prev = 0x0
},
sle = {
sle_next = 0x0
},
tqe = {
tqe_next = 0x0,
tqe_prev = 0x0
}
},
c_time = 0,
c_precision = 0,
c_arg = 0x0,
c_func = 0,
c_lock = 0xfffff800bb819100,
c_flags = 0,
c_iflags = 0,
c_cpu = 0
},
p_sigacts = 0xfffff800bba1b000,
p_flag = 268435712,
p_flag2 = 0,
p_state = PRS_NORMAL,
p_pid = 1014,
p_hash = {
le_next = 0x0,
le_prev = 0xfffffe00006effb0
},
p_pglist = {
le_next = 0x0,
le_prev = 0xfffff800bb2c6290
},
p_pptr = 0xfffff800029724e8,
p_sibling = {
le_next = 0xfffff800bb458000,
le_prev = 0xfffff800bb819ab8
},
p_children = {
lh_first = 0x0
},
p_mtx = {
lock_object = {
lo_name = 0xffffffff80fc8238 "process lock",
lo_flags = 21168128,
lo_data = 0,
lo_witness = 0x0
},
mtx_lock = 4
},
p_ksi = 0xfffff800133020e0,
p_sigqueue = {
sq_signals = {
__bits = {0, 0, 0, 0}
},
sq_kill = {
__bits = {0, 0, 0, 0}
},
sq_list = {
tqh_first = 0x0,
tqh_last = 0xfffff800bb819148
},
sq_proc = 0xfffff800bb819000,
sq_flags = 1
},
p_oppid = 0,
p_vmspace = 0xfffff800bb9668c0,
p_swtick = 24753,
p_realtimer = {
it_interval = {
tv_sec = 0,
tv_usec = 0
},
it_value = {
tv_sec = 0,
tv_usec = 0
}
},
p_ru = {
ru_utime = {
tv_sec = 0,
tv_usec = 0
},
ru_stime = {
tv_sec = 0,
tv_usec = 0
},
ru_maxrss = 0,
ru_ixrss = 0,
ru_idrss = 0,
ru_isrss = 0,
ru_minflt = 0,
ru_majflt = 0,
ru_nswap = 0,
ru_inblock = 0,
ru_oublock = 0,
ru_msgsnd = 0,
ru_msgrcv = 0,
ru_nsignals = 0,
ru_nvcsw = 0,
ru_nivcsw = 0
},
p_rux = {
rux_runtime = 109968287043644,
rux_uticks = 1450275,
rux_sticks = 6625562,
rux_iticks = 0,
rux_uu = 11625969653,
rux_su = 53113087346,
rux_tu = 64739057000
},
p_crux = {
rux_runtime = 0,
rux_uticks = 0,
rux_sticks = 0,
rux_iticks = 0,
rux_uu = 0,
rux_su = 0,
rux_tu = 0
},
p_profthreads = 0,
p_exitthreads = 0,
p_traceflag = 0,
p_tracevp = 0x0,
p_tracecred = 0x0,
p_textvp = 0xfffff800bb42b3b0,
p_lock = 0,
p_sigiolst = {
slh_first = 0x0
},
p_sigparent = 20,
p_sig = 0,
p_code = 0,
p_stops = 0,
p_stype = 0,
p_step = 0 '\0',
p_pfsflags = 0 '\0',
p_nlminfo = 0x0,
p_aioinfo = 0x0,
p_singlethread = 0x0,
p_suspcount = 0,
p_xthread = 0x0,
p_boundary_count = 0,
p_pendingcnt = 0,
p_itimers = 0x0,
p_procdesc = 0x0,
p_magic = 3203398350,
p_osrel = 1002000,
p_comm = "nm-poller\000\000\000\000\000\000\000\000\000\000",
p_pgrp = 0xfffff800bb2c6280,
p_sysent = 0xffffffff814d4388,
p_args = 0xfffff800133796c0,
p_cpulimit = 9223372036854775807,
p_nice = 0 '\0',
p_fibnum = 0,
p_xstat = 0,
p_klist = {
kl_list = {
slh_first = 0x0
},
kl_lock = 0xffffffff80907dc0 <knlist_mtx_lock>,
kl_unlock = 0xffffffff80907e00 <knlist_mtx_unlock>,
kl_assert_locked = 0xffffffff80907e40 <knlist_mtx_assert_locked>,
kl_assert_unlocked = 0xffffffff80907e50 <knlist_mtx_assert_unlocked>,
kl_lockarg = 0xfffff800bb819100
},
p_numthreads = 1,
p_md = {
md_ldt = 0x0,
md_ldt_sd = {
sd_lolimit = 0,
sd_lobase = 0,
sd_type = 0,
sd_dpl = 0,
sd_p = 0,
sd_hilimit = 0,
sd_xx0 = 0,
sd_gran = 0,
sd_hibase = 0,
sd_xx1 = 0,
sd_mbz = 0,
sd_xx2 = 0
}
},
p_itcallout = {
c_links = {
le = {
le_next = 0x0,
le_prev = 0x0
},
sle = {
sle_next = 0x0
},
tqe = {
tqe_next = 0x0,
tqe_prev = 0x0
}
},
c_time = 0,
c_precision = 0,
c_arg = 0x0,
c_func = 0,
c_lock = 0xfffff800bb819100,
c_flags = 0,
c_iflags = 0,
c_cpu = 0
},
p_acflag = 1,
p_peers = 0x0,
p_leader = 0xfffff800bb819000,
p_emuldata = 0x0,
p_label = 0x0,
p_sched = 0xfffff800bb8194e8,
p_ktr = {
stqh_first = 0x0,
stqh_last = 0xfffff800bb819448
},
p_mqnotifier = {
lh_first = 0x0
},
p_dtrace = 0xfffff800bb502a40,
p_pwait = {
cv_description = 0xffffffff80fc8c39 "ppwait",
cv_waiters = 0
},
p_dbgwait = {
cv_description = 0xffffffff80fc8c40 "dbgwait",
cv_waiters = 0
},
p_prev_runtime = 0,
p_racct = 0x0,
p_throttled = 0 '\0',
p_orphan = {
le_next = 0x0,
le_prev = 0x0
},
p_orphans = {
lh_first = 0x0
},
p_treeflag = 0,
p_reaper = 0xfffff800029724e8,
p_reaplist = {
lh_first = 0x0
},
p_reapsibling = {
le_next = 0xfffff800bb6829d0,
le_prev = 0xfffff800bb819ea0
},
p_reapsubtree = 16
}
(kgdb) p *(struct thread *)0xfffff800bb963940
$6 = {
td_lock = 0xffffffff815cb380,
td_proc = 0xfffff800bb819000,
td_plist = {
tqe_next = 0x0,
tqe_prev = 0xfffff800bb819010
},
td_runq = {
tqe_next = 0x0,
tqe_prev = 0xffffffff815cb5c8
},
td_slpq = {
tqe_next = 0x0,
tqe_prev = 0xfffff800027d0a00
},
td_lockq = {
tqe_next = 0x0,
tqe_prev = 0xfffffe0122ba3428
},
td_hash = {
le_next = 0x0,
le_prev = 0xfffffe0000866068
},
td_cpuset = 0xfffff8000296c3a8,
td_sel = 0xfffff80013530f00,
td_sleepqueue = 0xfffff800027d0a00,
td_turnstile = 0xfffff800bbbbdc00,
td_rlqe = 0xfffff800134bad20,
td_umtxq = 0xfffff800bb2c6480,
td_tid = 100365,
td_sigqueue = {
sq_signals = {
__bits = {0, 0, 0, 0}
},
sq_kill = {
__bits = {0, 0, 0, 0}
},
sq_list = {
tqh_first = 0x0,
tqh_last = 0xfffff800bb9639f8
},
sq_proc = 0xfffff800bb819000,
sq_flags = 1
},
td_lend_user_pri = 255 '?',
td_flags = 20,
td_inhibitors = 2,
td_pflags = 0,
td_dupfd = 0,
td_sqqueue = 0,
td_wchan = 0x0,
td_wmesg = 0x0,
td_lastcpu = 0 '\0',
td_oncpu = 255 '?',
td_owepreempt = 0 '\0',
td_tsqueue = 0 '\0',
td_locks = 29149,
td_rw_rlocks = 0,
td_lk_slocks = 0,
td_stopsched = 0,
td_blocked = 0x0,
td_lockname = 0x0,
td_contested = {
lh_first = 0x0
},
td_sleeplocks = 0x0,
td_intr_nesting_level = 0,
td_pinned = 0,
td_ucred = 0xfffff800bb55f200,
td_estcpu = 0,
td_slptick = 0,
td_blktick = 0,
td_swvoltick = 1092393710,
td_cow = 0,
td_ru = {
ru_utime = {
tv_sec = 0,
tv_usec = 0
},
ru_stime = {
tv_sec = 0,
tv_usec = 0
},
ru_maxrss = 218024,
ru_ixrss = 4102525196,
ru_idrss = 256714706556,
ru_isrss = 1033707136,
ru_minflt = 768721,
ru_majflt = 3262,
ru_nswap = 0,
ru_inblock = 573,
ru_oublock = 1,
ru_msgsnd = 395465017,
ru_msgrcv = 178667178,
ru_nsignals = 0,
ru_nvcsw = 213744212,
ru_nivcsw = 56200635
},
td_rux = {
rux_runtime = 109968287043644,
rux_uticks = 1450275,
rux_sticks = 6625562,
rux_iticks = 0,
rux_uu = 11625969653,
rux_su = 53113087346,
rux_tu = 64739057000
},
td_incruntime = 0,
td_runtime = 109968287043644,
td_pticks = 0,
td_sticks = 0,
td_iticks = 0,
td_uticks = 0,
td_intrval = 0,
td_oldsigmask = {
__bits = {0, 0, 0, 0}
},
td_generation = 269944847,
td_sigstk = {
ss_sp = 0x0,
ss_size = 0,
ss_flags = 4
},
td_xsig = 0,
td_profil_addr = 0,
td_profil_ticks = 0,
td_name = "nm-poller\000\000\000\000\000\000\000\000\000\000",
td_fpop = 0x0,
td_dbgflags = 0,
td_dbgksi = {
ksi_link = {
tqe_next = 0x0,
tqe_prev = 0x0
},
ksi_info = {
si_signo = 0,
si_errno = 0,
si_code = 0,
si_pid = 0,
si_uid = 0,
si_status = 0,
si_addr = 0x0,
si_value = {
sival_int = 0,
sival_ptr = 0x0,
sigval_int = 0,
sigval_ptr = 0x0
},
_reason = {
_fault = {
_trapno = 0
},
_timer = {
_timerid = 0,
_overrun = 0
},
_mesgq = {
_mqd = 0
},
_poll = {
_band = 0
},
__spare__ = {
__spare1__ = 0,
__spare2__ = {0, 0, 0, 0, 0, 0, 0}
}
}
},
ksi_flags = 0,
ksi_sigq = 0x0
},
td_ng_outbound = 0,
td_osd = {
osd_nslots = 0,
osd_slots = 0x0,
osd_next = {
le_next = 0x0,
le_prev = 0x0
}
},
td_map_def_user = 0x0,
td_dbg_forked = 0,
td_vp_reserv = 0,
td_no_sleeping = 0,
td_dom_rr_idx = 0,
td_sigmask = {
__bits = {1611157507, 0, 0, 0}
},
td_rqindex = 26 '\032',
td_base_pri = 104 'h',
td_priority = 104 'h',
td_pri_class = 3 '\003',
td_user_pri = 124 '|',
td_base_user_pri = 124 '|',
td_pcb = 0xfffffe0122abdcc0,
td_state = TDS_INHIBITED,
td_retval = {0, 0},
td_slpcallout = {
c_links = {
le = {
le_next = 0x0,
le_prev = 0xfffffe000087d5b8
},
sle = {
sle_next = 0x0
},
tqe = {
tqe_next = 0x0,
tqe_prev = 0xfffffe000087d5b8
}
},
c_time = 4691782411130129,
c_precision = 407753,
c_arg = 0xfffff800bb963940,
c_func = 0xffffffff809902b0 <sleepq_timeout>,
c_lock = 0x0,
c_flags = 0,
c_iflags = 272,
c_cpu = 1
},
td_frame = 0xfffffe0122abdc00,
td_kstack_obj = 0xfffff800bb837700,
td_kstack = 18446741879562936320,
td_kstack_pages = 4,
td_critnest = 1,
td_md = {
md_spinlock_count = 1,
md_saved_flags = 582,
md_spurflt_addr = 0
},
td_sched = 0xfffff800bb963db0,
td_ar = 0x0,
td_lprof = {{
lh_first = 0x0
}, {
lh_first = 0x0
}},
td_dtrace = 0xfffff80013b46700,
td_errno = 0,
td_vnet = 0x0,
td_vnet_lpush = 0x0,
td_intr_frame = 0x0,
td_rfppwait_p = 0x0,
td_ma = 0x0,
td_ma_cnt = 0,
td_su = 0x0
}
Paul.
--
Paul Koch | Founder, CEO
AKIPS Network Monitor | akips.com
Brisbane, Australia