Vitaly Magerya
2014-Jan-21 10:22 UTC
Any news about "msk0 watchdog timeout" regression in 10-RELEASE?
Hi, folks. I've just upgraded to 10.0-RELEASE, and my msk(4) card stopped working; it works for a few minutes, and then it starts printing "msk0 watchdog timeout" messages with an interrupt storm accompanying it. I think this problem was described earlier this month in [1]. My question is: has a workaround been found, or should I just downgrade back to 9.2? [1] https://lists.freebsd.org/pipermail/freebsd-stable/2014-January/076676.html
Curtis Villamizar
2014-Jan-21 19:56 UTC
Any news about "msk0 watchdog timeout" regression in 10-RELEASE?
In message <52DE4A69.9090304 at gmail.com> Vitaly Magerya writes: > Hi, folks. I've just upgraded to 10.0-RELEASE, and my msk(4) card > stopped working; it would work for a few minutes, and then it will > start printing "msk0 watchdog timeout" messages with an interrupt > storm accompanying it. I think this problem was described earlier > this month in [1]. > > My question is: was there a workaround found, or should I just > downgrade back to 9.2? > > [1] https://lists.freebsd.org/pipermail/freebsd-stable/2014-January/076676.html I have mine working, but I haven't done a lot of reboots to see if it is a "fix" or luck. There is a lot of junk that you won't need in the code that is running well for me. But here it is, as-is, warts and all. I've been swamped lately and haven't had time to look at this further. Curtis Notes: 1. The change in the watchdog code has no effect (not hit when working, does not fix things when not working). 2. Lots of printf thingies in there that you can delete if you like. If things work you don't hit this code. 3. Changes to the interrupt handler also seem to do nothing (good or bad) if things are working. 4. Why this is working for me is at this point a mystery, but whether it works for you gives us another data point. 
Index: if_msk.c ==================================================================--- if_msk.c (revision 260441) +++ if_msk.c (working copy) @@ -2161,6 +2161,10 @@ count = imin(4096, roundup2(count, 1024)); sc->msk_stat_count = count; stat_sz = count * sizeof(struct msk_stat_desc); +#if 1 + device_printf(sc->msk_dev, + "msk_status_dma_alloc: %d %lu\n", count, stat_sz); +#endif error = bus_dma_tag_create( bus_get_dma_tag(sc->msk_dev), /* parent */ MSK_STAT_ALIGN, 0, /* alignment, boundary */ @@ -2975,6 +2979,14 @@ } } +#if 1 +static uint32_t msk_last_status = 0; +static uint16_t last_stat_put_idx = 0; +static uint32_t last_msk_control = 0; +static uint16_t last_good_stat_put_idx = 0; +static uint32_t last_good_msk_control = 0; +#endif + static void msk_watchdog(struct msk_if_softc *sc_if) { @@ -2995,7 +3007,70 @@ return; } - if_printf(ifp, "watchdog timeout\n"); +#if 1 + if_printf(ifp, +"watchdog timeout: 0x%08x\n (0x%04x 0x%08x) (0x%04x 0x%08x) 0x%08x 0x%08x\n", + msk_last_status, + sc_if->msk_softc->msk_stat_cons, last_msk_control, + last_good_stat_put_idx, last_good_msk_control, + last_stat_put_idx, sc_if->msk_softc->msk_stat_count); + { + struct msk_softc *sc = sc_if->msk_softc; + uint16_t cons, count; + struct msk_stat_desc *sd; + uint32_t control; +#if 0 + char linebuf[8192]; + char *pt = linebuf; + size_t bytes = 8192; + size_t used; + + count = sc->msk_stat_count; + for (cons = 0; cons < count; ++cons) { + if ((cons > 0) && ((cons & 0xff) == 0)) { + if_printf(ifp, "%s\n", linebuf); + pt = linebuf; + bytes = sizeof(linebuf); + } + if ((cons & 7) == 0) { + snprintf(pt, bytes - 1, "\n%03x ", cons); + used = strlen(pt); pt += used; bytes -= used; + } else if ((cons & 3) == 0) { + snprintf(pt, bytes - 1, " "); + used = strlen(pt); pt += used; bytes -= used; + } + sd = &sc->msk_stat_ring[cons]; + control = le32toh(sd->msk_control); + snprintf(pt, bytes - 1, " %08x", control); + used = strlen(pt); pt += used; bytes -= used; + } + if_printf(ifp, "%s\n\n", linebuf); 
+#endif + /* bump the count if we got stuck on HW_OWNER */ + if (((msk_last_status & Y2_IS_STAT_BMU) != 0) + && (sc->msk_stat_cons != last_stat_put_idx) + && ((last_msk_control & HW_OWNER) == 0)) { + /* Sync status LEs. */ + bus_dmamap_sync(sc->msk_stat_tag, sc->msk_stat_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + cons = sc->msk_stat_cons; + count = sc->msk_stat_count; + do { + MSK_INC(cons, count); + sd = &sc->msk_stat_ring[cons]; + control = le32toh(sd->msk_control); + } while ((cons != sc->msk_stat_cons) + && ((control & HW_OWNER) == 0)); + if (cons != sc->msk_stat_cons) { + if_printf(ifp, "msk_stat_cons changed 0x%04x -> 0x%04x\n", + sc->msk_stat_cons, cons); + sc->msk_stat_cons = cons; + } + } + } +#else + if_printf(ifp, "watchdog timeout: status\n"); +#endif ifp->if_oerrors++; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; msk_init_locked(sc_if); @@ -3599,8 +3674,12 @@ int rxput[2]; struct msk_stat_desc *sd; uint32_t control, status; - int cons, len, port, rxprog; + int len, port, rxprog; + uint16_t cons; +#if 1 + last_stat_put_idx = CSR_READ_2(sc, STAT_PUT_IDX); +#endif if (sc->msk_stat_cons == CSR_READ_2(sc, STAT_PUT_IDX)) return (0); @@ -3614,8 +3693,15 @@ for (;;) { sd = &sc->msk_stat_ring[cons]; control = le32toh(sd->msk_control); +#if 1 + last_msk_control = control; +#endif if ((control & HW_OWNER) == 0) break; +#if 1 + last_good_stat_put_idx = cons; + last_good_msk_control = control; +#endif control &= ~HW_OWNER; sd->msk_control = htole32(control); status = le32toh(sd->msk_status); @@ -3689,6 +3775,11 @@ if (rxput[MSK_PORT_B] > 0) msk_rxput(sc->msk_if[MSK_PORT_B]); +#if 1 + last_stat_put_idx = CSR_READ_2(sc, STAT_PUT_IDX); +#endif + if ((control & HW_OWNER) == 0) + return 1; return (sc->msk_stat_cons != CSR_READ_2(sc, STAT_PUT_IDX)); } @@ -3742,8 +3833,11 @@ CSR_WRITE_4(sc, B0_IMSK, sc->msk_intrmask); CSR_READ_4(sc, B0_IMSK); } - if ((status & Y2_IS_HW_ERR) != 0) + if ((status & Y2_IS_HW_ERR) != 0) { msk_intr_hwerr(sc); + device_printf(sc->msk_dev, 
+ "Y2_IS_HW_ERR is set: status 0x%x\n", status); + } domore = msk_handle_events(sc); if ((status & Y2_IS_STAT_BMU) != 0 && domore == 0) @@ -3762,6 +3856,17 @@ !IFQ_DRV_IS_EMPTY(&ifp1->if_snd)) msk_start_locked(ifp1); +#if 1 +#define Y2_IS_OTHER_INTR \ + (Y2_IS_ASF | Y2_IS_POLL_CHK | Y2_IS_IRQ_SW | Y2_IS_TIMINT | \ + Y2_IS_CHK_TXS2 | Y2_IS_PSM_ACK | Y2_IS_PTP_TIST | Y2_IS_CHK_TXS1) + if ((status & (Y2_IS_OTHER_INTR)) != 0) { + device_printf(sc->msk_dev, "unknown interupt bits 0x%x\n", + status & (Y2_IS_OTHER_INTR)); + } + msk_last_status = status; +#endif + MSK_UNLOCK(sc); } Index: if_mskreg.h ==================================================================--- if_mskreg.h (revision 260441) +++ if_mskreg.h (working copy) @@ -156,7 +156,7 @@ #define DEVICEID_DLINK_DGE560SX 0x4002 #define DEVICEID_DLINK_DGE560T 0x4b00 -#define BIT_31 (1 << 31) +#define BIT_31 (1U << 31) #define BIT_30 (1 << 30) #define BIT_29 (1 << 29) #define BIT_28 (1 << 28) @@ -2329,8 +2329,13 @@ */ #if (BUS_SPACE_MAXADDR > 0xFFFFFFFF) #define MSK_64BIT_DMA +#if 1 +#define MSK_TX_RING_CNT 256 +#define MSK_RX_RING_CNT 256 +#else #define MSK_TX_RING_CNT 384 #define MSK_RX_RING_CNT 512 +#endif #else #undef MSK_64BIT_DMA #define MSK_TX_RING_CNT 256 @@ -2539,8 +2544,8 @@ bus_addr_t msk_stat_ring_paddr; int msk_int_holdoff; int msk_process_limit; - int msk_stat_cons; - int msk_stat_count; + uint16_t msk_stat_cons; + uint16_t msk_stat_count; struct mtx msk_mtx; };