kern/143521
2010/2/2 pluknet <pluknet@gmail.com>:> Hi.
>
> I've got NMI on an almost idle system - FreeBSD 7.2-R amd64.
> I guess the reason may be in (hardware?) binary garbage
> seen once in a while on serial port (loader, then ttyd0).
> Ask me for more details.
>
> Tracing command swi4: clock sio pid 20 tid 100011 td 0xffffff000144e370
> cpustop_handler() at cpustop_handler+64
> ipi_nmi_handler() at ipi_nmi_handler+48
> trap() at trap+796
> nmi_calltrap() at nmi_calltrap+8
> --- trap 19, rip = 18446744071567390785, rsp = 18446744067267268592,
> rbp = 18446744067267558272 ---
> _mtx_lock_spin() at _mtx_lock_spin+113
> siopoll() at siopoll+206
> ithread_loop() at ithread_loop+384
> fork_exit() at fork_exit+287
> fork_trampoline() at fork_trampoline+14
> --- trap 0, rip = 0, rsp = 18446744067267558704, rbp = 0 ---
>
> (kgdb) thr 13
> [Switching to thread 13 (Thread 100011)]#0 ?cpustop_handler () at
atomic.h:264
> 264 ? ? atomic.h: No such file or directory.
> ? ? ? ?in atomic.h
> (kgdb) bt
> #0 ?cpustop_handler () at atomic.h:264
> #1 ?0xffffffff807ec400 in ipi_nmi_handler ()
> ? ?at /usr/src/sys/amd64/amd64/mp_machdep.c:1144
> #2 ?0xffffffff807fceec in trap (frame=0xfffffffe80028f40)
> ? ?at /usr/src/sys/amd64/amd64/trap.c:198
> #3 ?0xffffffff807e0aeb in nmi_calltrap ()
> ? ?at /usr/src/sys/amd64/amd64/exception.S:427
> #4 ?0xffffffff80513841 in _mtx_lock_spin (m=0xffffffff80bb3d00,
> ? ?tid=18446742974219215728, opts=Variable "opts" is not
available.
> ) at /usr/src/sys/kern/kern_mutex.c:474
> #5 ?0xffffffff8082b96e in siopoll (dummy=Variable "dummy" is not
available.
> ) at /usr/src/sys/dev/sio/sio.c:1703
> #6 ?0xffffffff804ff940 in ithread_loop (arg=0xffffff000142bac0)
> ? ?at /usr/src/sys/kern/kern_intr.c:1088
> #7 ?0xffffffff804fc1df in fork_exit (
> ? ?callout=0xffffffff804ff7c0 <ithread_loop>, arg=0xffffff000142bac0,
> ? ?frame=0xfffffffe8006fc80) at /usr/src/sys/kern/kern_fork.c:834
> #8 ?0xffffffff807e0b5e in fork_trampoline ()
> ? ?at /usr/src/sys/amd64/amd64/exception.S:455
> #9 ?0x0000000000000000 in ?? ()
> #10 0x0000000000000000 in ?? ()
> #11 0x0000000000000001 in ?? ()
> #12 0x0000000000000000 in ?? ()
> #13 0x0000000000000000 in ?? ()
> #14 0x0000000000000000 in ?? ()
> ---Type <return> to continue, or q <return> to quit---q
> Quit
> (kgdb) f 5
> #5 ?0xffffffff8082b96e in siopoll (dummy=Variable "dummy" is not
available.
> ) at /usr/src/sys/dev/sio/sio.c:1703
> 1703 ? ? ? ? ? ? ? ? ? ? ? ? ? ?mtx_lock_spin(&sio_lock);
> (kgdb) i loc
> _tid = Variable "_tid" is not available.
> (kgdb) list
> 1698 ? ? ? ? ? ? ? ? ? ? ? ? ? ?com_events -= incc;
> 1699 ? ? ? ? ? ? ? ? ? ? ? ? ? ?mtx_unlock_spin(&sio_lock);
> 1700 ? ? ? ? ? ? ? ? ? ? ? ? ? ?continue;
> 1701 ? ? ? ? ? ? ? ? ? ?}
> 1702 ? ? ? ? ? ? ? ? ? ?if (com->iptr != com->ibuf) {
> 1703 ? ? ? ? ? ? ? ? ? ? ? ? ? ?mtx_lock_spin(&sio_lock);
> 1704 ? ? ? ? ? ? ? ? ? ? ? ? ? ?sioinput(com);
> 1705 ? ? ? ? ? ? ? ? ? ? ? ? ? ?mtx_unlock_spin(&sio_lock);
> 1706 ? ? ? ? ? ? ? ? ? ?}
> 1707 ? ? ? ? ? ? ? ? ? ?if (com->state & CS_CHECKMSR) {
> (kgdb) p sio_lock
> $1 = {lock_object = {lo_name = 0xffffffff80b15380 "sio",
> ? ?lo_type = 0xffffffff80b15380 "sio", lo_flags = 458752,
lo_witness_data = {
> ? ? ?lod_list = {stqe_next = 0x0}, lod_witness = 0x0}},
> ?mtx_lock = 18446742974219094608, mtx_recurse = 0}
> (kgdb) p/x sio_lock->mtx_lock
> $10 = 0xffffff0001430a50 ?# == td for pid 17 tid 100008
>
>
> Binary garbage is like below (not touching anything on k/board atm).
>
> login:
> FreeBSD/amd64 (ho
> FreeBSD/amd64 (host) (ttyd0)
>
> login: <<|?
> FreeBSD
> FreeBSD
> FreeBS
> FreeBSD
> Free
> FreeBSD/amd64 (host) (ttyd0)
>
> login:
> FreeBSD/amd64 (host) (ttyd0)
>
> login:
> FreeBSD
> Free
>
> FreeBS
> FreeBSD
> FreeBS
> FreeBSD
> FreeBSD
> FreeBS
> FreeBSD
> FreeBS
> FreeBSD
> FreeBSD
> FreeBS
> FreeBSD
> FreeBS??H?? ? ? M5
> FreeBSD
> FreeBS
> FreeBSD
> FreeBSD
> FreeBS
> FreeBSD
> FreeBS??H?? ? ? M5
> FreeBSD
> FreeBS
> FreeBSD
> FreeB
> FreeBSD/amd6
> FreeBS
> FreeBS
> FreeBSD
> FreeBSD
> FreeBS
> FreeBSD
> FreeBS
> FreeBSD
> FreeBSD
> FreeBS
> FreeBSD
> FreeBS??H?? ? ? M5
> FreeBSD
> FreeBS
> FreeBSD
> [..a little later..]
>
> [root@host ~]#
<<<<<<<<<<<><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<8<<<<<8<<<<
>
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<8<<<<<<<><
>
> Other useful stuff.
>
> (kgdb) f 4
> #4 ?0xffffffff80513841 in _mtx_lock_spin (m=0xffffffff80bb3d00,
> ? ?tid=18446742974219215728, opts=Variable "opts" is not
available.
> ) at /usr/src/sys/kern/kern_mutex.c:474
> 474 ? ? ? ? ? ? ? ? ? ? while (m->mtx_lock != MTX_UNOWNED) {
> (kgdb) list
> 469 ? ? ? ? ? ? lock_profile_obtain_lock_failed(&m->lock_object,
> &contested, &waittime);
> 470 ? ? ? ? ? ? while (!_obtain_lock(m, tid)) {
> 471
> 472 ? ? ? ? ? ? ? ? ? ? /* Give interrupts a chance while we spin. */
> 473 ? ? ? ? ? ? ? ? ? ? spinlock_exit();
> 474 ? ? ? ? ? ? ? ? ? ? while (m->mtx_lock != MTX_UNOWNED) {
> 475 ? ? ? ? ? ? ? ? ? ? ? ? ? ? if (i++ < 10000000) {
> 476 ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? cpu_spinwait();
> 477 ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? continue;
> 478 ? ? ? ? ? ? ? ? ? ? ? ? ? ? }
> (kgdb) f 3
> #3 ?0xffffffff807e0aeb in nmi_calltrap ()
> ? ?at /usr/src/sys/amd64/amd64/exception.S:427
> 427 ? ? ? ? ? ? call ? ?trap
> Current language: ?auto; currently asm
> (kgdb) list
> 422 ? ? ? ? ? ? swapgs
> 423 ? ? /* Note: this label is also used by ddb and gdb: */
> 424 ? ? nmi_calltrap:
> 425 ? ? ? ? ? ? FAKE_MCOUNT(TF_RIP(%rsp))
> 426 ? ? ? ? ? ? movq ? ?%rsp, %rdi
> 427 ? ? ? ? ? ? call ? ?trap
> 428 ? ? ? ? ? ? MEXITCOUNT
> 429 ? ? ? ? ? ? testl ? %ebx,%ebx
> 430 ? ? ? ? ? ? jz ? ? ?nmi_restoreregs
> 431 ? ? ? ? ? ? swapgs
> (kgdb) f 2
> #2 ?0xffffffff807fceec in trap (frame=0xfffffffe80028f40)
> ? ?at /usr/src/sys/amd64/amd64/trap.c:198
> 198 ? ? ? ? ? ? ? ? ? ? ?if (ipi_nmi_handler() == 0)
> Current language: ?auto; currently c
> (kgdb) list
> 193
> 194 ? ? #ifdef SMP
> 195 ? ? #ifdef STOP_NMI
> 196 ? ? ? ? ? ? /* Handler for NMI IPIs used for stopping CPUs. */
> 197 ? ? ? ? ? ? if (type == T_NMI) {
> 198 ? ? ? ? ? ? ? ? ? ? ?if (ipi_nmi_handler() == 0)
> 199 ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?goto out;
> 200 ? ? ? ? ? ? }
> 201 ? ? #endif /* STOP_NMI */
> 202 ? ? #endif /* SMP */
>
>
> db> bt
> Tracing pid 17 tid 100008 td 0xffffff0001430a50
> kdb_enter_why() at kdb_enter_why+0x3d
> siointr1() at siointr1+0x2c5
> siointr() at siointr+0x58
> intr_execute_handlers() at intr_execute_handlers+0x8b
> Xapic_isr1() at Xapic_isr1+0x7f
> --- interrupt, rip = 0xffffffff807d8bd6, rsp = 0xfffffffe8005fb90, rbp
> = 0xfffffffe8005fba0 ---
> acpi_cpu_c1() at acpi_cpu_c1+0x6
> acpi_cpu_idle() at acpi_cpu_idle+0x19c
> sched_idletd() at sched_idletd+0x46
> fork_exit() at fork_exit+0x11f
> fork_trampoline() at fork_trampoline+0xe
> --- trap 0, rip = 0, rsp = 0xfffffffe8005fd30, rbp = 0 ---
>
> (kgdb) p (struct thread) *sio_lock->mtx_lock
> $15 = {td_lock = 0xffffffff80b7bc40, td_proc = 0xffffff000142e478, td_plist
= {
> ? ?tqe_next = 0x0, tqe_prev = 0xffffff000142e488}, td_slpq = {tqe_next =
0x0,
> ? ?tqe_prev = 0x0}, td_lockq = {tqe_next = 0x0, tqe_prev = 0x0}, td_selq =
{
> ? ?tqh_first = 0x0, tqh_last = 0x0}, td_sleepqueue = 0xffffff0001418c00,
> ?td_turnstile = 0xffffff00012f9a00, td_umtxq = 0xffffff000142a380,
> ?td_tid = 100008, td_sigqueue = {sq_signals = {__bits = {0, 0, 0, 0}},
> ? ?sq_kill = {__bits = {0, 0, 0, 0}}, sq_list = {tqh_first = 0x0,
> ? ? ?tqh_last = 0xffffff0001430ae0}, sq_proc = 0xffffff000142e478,
> ? ?sq_flags = 1}, td_flags = 65572, td_inhibitors = 0, td_pflags = 0,
> ?td_dupfd = 0, td_sqqueue = 0, td_wchan = 0x0, td_wmesg = 0x0,
> ?td_lastcpu = 1 '\001', td_oncpu = 1 '\001', td_owepreempt
= 0 '\0',
> ?td_locks = 0, td_tsqueue = 0 '\0', td_blocked = 0x0, td_lockname =
0x0,
> ?td_contested = {lh_first = 0x0}, td_sleeplocks = 0x0,
> ?td_intr_nesting_level = 1, td_pinned = 0, td_mailbox = 0x0,
> ?td_ucred = 0xffffff0001300e00, td_standin = 0x0, td_upcall = 0x0,
> ?td_estcpu = 0, td_slptick = 0, td_ru = {ru_utime = {tv_sec = 0,
> ? ? ?tv_usec = 0}, ru_stime = {tv_sec = 0, tv_usec = 0}, ru_maxrss = 0,
> ? ?ru_ixrss = 0, ru_idrss = 0, ru_isrss = 0, ru_minflt = 0, ru_majflt = 0,
> ? ?ru_nswap = 0, ru_inblock = 0, ru_oublock = 0, ru_msgsnd = 0,
> ? ?ru_msgrcv = 0, ru_nsignals = 0, ru_nvcsw = 6414, ru_nivcsw = 38607927},
> ?td_runtime = 2179322361251400, td_pticks = 145105307, td_sticks = 7704049,
> ?td_iticks = 0, td_uticks = 0, td_uuticks = 0, td_usticks = 0,
> ?td_intrval = 0, td_oldsigmask = {__bits = {0, 0, 0, 0}}, td_sigmask = {
> ---Type <return> to continue, or q <return> to quit---
> ? ?__bits = {0, 0, 0, 0}}, td_generation = 38614341, td_sigstk = {
> ? ?ss_sp = 0x0, ss_size = 0, ss_flags = 0}, td_kflags = 0, td_xsig = 0,
> ?td_profil_addr = 0, td_profil_ticks = 0, td_name = '\0'
<repeats 19 times>,
> ?td_base_pri = 255 '?', td_priority = 255 '?', td_pri_class
= 4 '\004',
> ?td_user_pri = 180 '?', td_base_user_pri = 180 '?',
> ?td_pcb = 0xfffffffe8005fd40, td_state = TDS_RUNNING, td_retval = {0, 0},
> ?td_slpcallout = {c_links = {sle = {sle_next = 0x0}, tqe = {tqe_next = 0x0,
> ? ? ? ?tqe_prev = 0x0}}, c_time = 0, c_arg = 0x0, c_func = 0, c_mtx = 0x0,
> ? ?c_flags = 16}, td_frame = 0xfffffffe8005fc80,
> ?td_kstack_obj = 0xffffff0001431740, td_kstack = 18446744067267477504,
> ?td_kstack_pages = 4, td_altkstack_obj = 0x0, td_altkstack = 0,
> ?td_altkstack_pages = 0, td_critnest = 2, td_md = {md_spinlock_count = 1,
> ? ?md_saved_flags = 70}, td_sched = 0xffffff0001430d80, td_ar = 0x0,
> ?td_syscalls = 0, td_incruntime = 115662355444194,
> ?td_cpuset = 0xffffff0001424dc8, td_fpop = 0x0, td_dtrace = 0x0, td_errno =
0}
>
> (kgdb) p (struct proc) *0xffffff000142e478
> $16 = {p_list = {le_next = 0xffffff000142e8f0, le_prev =
0xffffff00014458f0},
> ?p_threads = {tqh_first = 0xffffff0001430a50, tqh_last =
0xffffff0001430a60},
> ?p_upcalls = {tqh_first = 0x0, tqh_last = 0xffffff000142e498}, p_slock = {
> ? ?lock_object = {lo_name = 0xffffffff808da2a3 "process slock",
> ? ? ?lo_type = 0xffffffff808da2a3 "process slock", lo_flags =
720896,
> ? ? ?lo_witness_data = {lod_list = {stqe_next = 0x0}, lod_witness = 0x0}},
> ? ?mtx_lock = 4, mtx_recurse = 0}, p_ucred = 0xffffff0001300e00,
> ?p_fd = 0xffffff0001443400, p_fdtol = 0x0, p_stats = 0xffffff00012ff600,
> ?p_limit = 0xffffff0001300c00, p_limco = {c_links = {sle = {sle_next =
0x0},
> ? ? ?tqe = {tqe_next = 0x0, tqe_prev = 0x0}}, c_time = 0, c_arg = 0x0,
> ? ?c_func = 0, c_mtx = 0xffffff000142e590, c_flags = 0},
> ?p_sigacts = 0xffffff000143e000, p_flag = 268435980, p_state = PRS_NORMAL,
> ?p_pid = 17, p_hash = {le_next = 0x0, le_prev = 0xfffffffe8021c088},
> ?p_pglist = {le_next = 0xffffff000142e8f0, le_prev = 0xffffff00014459d8},
> ?p_pptr = 0xffffffff80b64640, p_sibling = {le_next = 0xffffff000142e8f0,
> ? ?le_prev = 0xffffff00014459f0}, p_children = {lh_first = 0x0}, p_mtx = {
> ? ?lock_object = {lo_name = 0xffffffff808da296 "process lock",
> ? ? ?lo_type = 0xffffffff808da296 "process lock", lo_flags =
21168128,
> ? ? ?lo_witness_data = {lod_list = {stqe_next = 0x0}, lod_witness = 0x0}},
> ? ?mtx_lock = 4, mtx_recurse = 0}, p_ksi = 0x0, p_sigqueue = {sq_signals =
{
> ? ? ?__bits = {0, 0, 0, 0}}, sq_kill = {__bits = {0, 0, 0, 0}}, sq_list = {
> ? ? ?tqh_first = 0x0, tqh_last = 0xffffff000142e5e8},
> ? ?sq_proc = 0xffffff000142e478, sq_flags = 1}, p_oppid = 0,
> ---Type <return> to continue, or q <return> to quit---
> ?p_vmspace = 0xffffffff80b64e00, p_swtick = 0, p_realtimer = {it_interval =
{
> ? ? ?tv_sec = 0, tv_usec = 0}, it_value = {tv_sec = 0, tv_usec = 0}}, p_ru
= {
> ? ?ru_utime = {tv_sec = 0, tv_usec = 0}, ru_stime = {tv_sec = 0,
> ? ? ?tv_usec = 0}, ru_maxrss = 0, ru_ixrss = 0, ru_idrss = 0, ru_isrss = 0,
> ? ?ru_minflt = 0, ru_majflt = 0, ru_nswap = 0, ru_inblock = 0,
> ? ?ru_oublock = 0, ru_msgsnd = 0, ru_msgrcv = 0, ru_nsignals = 0,
> ? ?ru_nvcsw = 0, ru_nivcsw = 0}, p_rux = {rux_runtime = 2063660005807206,
> ? ?rux_uticks = 0, rux_sticks = 137401258, rux_iticks = 0, rux_uu = 0,
> ? ?rux_su = 371936150861, rux_tu = 1034416043011}, p_crux = {rux_runtime =
0,
> ? ?rux_uticks = 0, rux_sticks = 0, rux_iticks = 0, rux_uu = 0, rux_su = 0,
> ? ?rux_tu = 0}, p_profthreads = 0, p_exitthreads = 0, p_traceflag = 0,
> ?p_tracevp = 0x0, p_tracecred = 0x0, p_textvp = 0x0, p_lock = 0
'\0',
> ?p_sigiolst = {slh_first = 0x0}, p_sigparent = 20, p_sig = 0, p_code = 0,
> ?p_stops = 0, p_stype = 0, p_step = 0 '\0', p_pfsflags = 0
'\0',
> ?p_nlminfo = 0x0, p_aioinfo = 0x0, p_singlethread = 0x0, p_suspcount = 0,
> ?p_xthread = 0x0, p_boundary_count = 0, p_pendingcnt = 0, p_itimers = 0x0,
> ?p_numupcalls = 0, p_upsleeps = 0, p_completed = 0x0, p_nextupcall = 0,
> ?p_upquantum = 0, p_magic = 3203398350, p_osrel = 702000,
> ?p_comm = "idle: cpu1\000\000\000\000\000\000\000\000\000",
> ?p_pgrp = 0xffffffff80b65060, p_sysent = 0xffffffff80ad4d80, p_args = 0x0,
> ?p_cpulimit = 9223372036854775807, p_nice = 0 '\0', p_fibnum = 0,
> ?p_xstat = 0, p_klist = {kl_list = {slh_first = 0x0},
> ? ?kl_lock = 0xffffffff804f65d0 <knlist_mtx_lock>,
> ---Type <return> to continue, or q <return> to quit---
> ? ?kl_unlock = 0xffffffff804f5ff0 <knlist_mtx_unlock>,
> ? ?kl_locked = 0xffffffff804f5fd0 <knlist_mtx_locked>,
> ? ?kl_lockarg = 0xffffff000142e590}, p_numthreads = 1,
> ?p_md = <incomplete type>, p_itcallout = {c_links = {sle = {sle_next
= 0x0},
> ? ? ?tqe = {tqe_next = 0x0, tqe_prev = 0x0}}, c_time = 0, c_arg = 0x0,
> ? ?c_func = 0, c_mtx = 0x0, c_flags = 16}, p_acflag = 1, p_peers = 0x0,
> ?p_leader = 0xffffff000142e478, p_emuldata = 0x0, p_label = 0x0,
> ?p_sched = 0xffffff000142e8f0, p_ktr = {stqh_first = 0x0,
> ? ?stqh_last = 0xffffff000142e8d0}, p_mqnotifier = {lh_first = 0x0},
> ?p_dtrace = 0x0}
>
>
> db> show allpcpu
> Current CPU: 1
>
> cpuid ? ? ? ?= 0
> curthread ? ?= 0xffffff00014306e0: pid 18 "idle: cpu0"
> curpcb ? ? ? = 0xfffffffe80064d40
> fpcurthread ?= none
> idlethread ? = 0xffffff00014306e0: pid 18 "idle: cpu0"
>
> cpuid ? ? ? ?= 1
> curthread ? ?= 0xffffff0001430a50: pid 17 "idle: cpu1"
> curpcb ? ? ? = 0xfffffffe8005fd40
> fpcurthread ?= none
> idlethread ? = 0xffffff0001430a50: pid 17 "idle: cpu1"
>
> cpuid ? ? ? ?= 2
> curthread ? ?= 0xffffff000143c000: pid 16 "idle: cpu2"
> curpcb ? ? ? = 0xfffffffe8005ad40
> fpcurthread ?= none
> idlethread ? = 0xffffff000143c000: pid 16 "idle: cpu2"
>
> cpuid ? ? ? ?= 3
> curthread ? ?= 0xffffff000143c370: pid 15 "idle: cpu3"
> curpcb ? ? ? = 0xfffffffe80055d40
> fpcurthread ?= none
> idlethread ? = 0xffffff000143c370: pid 15 "idle: cpu3"
>
> cpuid ? ? ? ?= 4
> curthread ? ?= 0xffffff000143c6e0: pid 14 "idle: cpu4"
> curpcb ? ? ? = 0xfffffffe80050d40
> fpcurthread ?= none
> idlethread ? = 0xffffff000143c6e0: pid 14 "idle: cpu4"
>
> cpuid ? ? ? ?= 5
> curthread ? ?= 0xffffff000144e370: pid 20 "swi4: clock sio"
> curpcb ? ? ? = 0xfffffffe8006fd40
> fpcurthread ?= none
> idlethread ? = 0xffffff000142f000: pid 13 "idle: cpu5"
>
> cpuid ? ? ? ?= 6
> curthread ? ?= 0xffffff000142f370: pid 12 "idle: cpu6"
> curpcb ? ? ? = 0xfffffffe80046d40
> fpcurthread ?= none
> idlethread ? = 0xffffff000142f370: pid 12 "idle: cpu6"
>
> cpuid ? ? ? ?= 7
> curthread ? ?= 0xffffff000142f6e0: pid 11 "idle: cpu7"
> curpcb ? ? ? = 0xfffffffe80041d40
> fpcurthread ?= none
> idlethread ? = 0xffffff000142f6e0: pid 11 "idle: cpu7"
>
>
> --
> wbr,
> pluknet
>
--
wbr,
pluknet