Jesper Dangaard Brouer
2006-Jun-14 09:40 UTC
[PATCH 2/2] NET: Accurate packet scheduling for ATM/ADSL (userspace)
The Linux traffic''s control engine inaccurately calculates transmission times for packets sent over ADSL links. For some packet sizes the error rises to over 50%. This occurs because ADSL uses ATM as its link layer transport, and ATM transmits packets in fixed sized 53 byte cells. This changes the userspace tool iproute2/tc by adding an option to calculate traffic transmission times (rate table) over all ATM links, including ADSL, with perfect accuracy. A longer presentation of the patch, its rational, what it does and how to use it can be found here: http://www.stuart.id.au/russell/files/tc/tc-atm/ A earlier version of the patch, and a _detailed_ empirical investigation of its effects can be found here: http://www.adsl-optimizer.dk/ Signed-off-by: Jesper Dangaard Brouer <hawk@comx.dk> Signed-off-by: Russell Stuart <russell-tcatm@stuart.id.au> --- diff -Nurp iproute2.orig/include/linux/pkt_sched.h iproute2/include/linux/pkt_sched.h --- iproute2.orig/include/linux/pkt_sched.h 2005-12-10 09:27:44.000000000 +1000 +++ iproute2/include/linux/pkt_sched.h 2006-06-13 11:53:27.000000000 +1000 @@ -77,8 +77,9 @@ struct tc_ratespec { unsigned char cell_log; unsigned char __reserved; - unsigned short feature; - short addend; + unsigned short feature; /* Always 0 in pre-atm patch kernels */ + char cell_align; /* Always 0 in pre-atm patch kernels */ + unsigned char __unused; unsigned short mpu; __u32 rate; }; diff -Nurp iproute2.orig/tc/m_police.c iproute2/tc/m_police.c --- iproute2.orig/tc/m_police.c 2005-01-19 08:11:58.000000000 +1000 +++ iproute2/tc/m_police.c 2006-06-13 11:53:27.000000000 +1000 @@ -35,7 +35,7 @@ struct action_util police_action_util = static void explain(void) { fprintf(stderr, "Usage: ... police rate BPS burst BYTES[/BYTES] [ mtu BYTES[/BYTES] ]\n"); - fprintf(stderr, " [ peakrate BPS ] [ avrate BPS ]\n"); + fprintf(stderr, " [ peakrate BPS ] [ avrate BPS ] [ overhead OVERHEAD ] [ atm ]\n"); fprintf(stderr, " [ ACTIONTERM ]\n"); fprintf(stderr, "Old Syntax ACTIONTERM := action <EXCEEDACT>[/NOTEXCEEDACT] \n"); fprintf(stderr, "New Syntax ACTIONTERM := conform-exceed <EXCEEDACT>[/NOTEXCEEDACT] \n"); @@ -134,7 +134,10 @@ int act_parse_police(struct action_util __u32 ptab[256]; __u32 avrate = 0; int presult = 0; - unsigned buffer=0, mtu=0, mpu=0; + unsigned buffer=0, mtu=0; + __u8 mpu=0; + __s8 overhead=0; + int atm=0; int Rcell_log=-1, Pcell_log = -1; struct rtattr *tail; @@ -184,7 +187,7 @@ int act_parse_police(struct action_util fprintf(stderr, "Double \"mpu\" spec\n"); return -1; } - if (get_size(&mpu, *argv)) { + if (get_u8(&mpu, *argv, 10)) { explain1("mpu"); return -1; } @@ -198,6 +201,18 @@ int act_parse_police(struct action_util explain1("rate"); return -1; } + } else if (strcmp(*argv, "overhead") == 0) { + NEXT_ARG(); + if (p.rate.rate) { + fprintf(stderr, "Double \"overhead\" spec\n"); + return -1; + } + if (get_s8(&overhead, *argv, 10)) { + explain1("overhead"); + return -1; + } + } else if (strcmp(*argv, "atm") == 0) { + atm = 1; } else if (strcmp(*argv, "avrate") == 0) { NEXT_ARG(); if (avrate) { @@ -264,22 +279,12 @@ int act_parse_police(struct action_util } if (p.rate.rate) { - if ((Rcell_log = tc_calc_rtable(p.rate.rate, rtab, Rcell_log, mtu, mpu)) < 0) { - fprintf(stderr, "TBF: failed to calculate rate table.\n"); - return -1; - } + tc_calc_ratespec(&p.rate, rtab, p.rate.rate, Rcell_log, mtu, mpu, atm, overhead); p.burst = tc_calc_xmittime(p.rate.rate, buffer); - p.rate.cell_log = Rcell_log; - p.rate.mpu = mpu; } p.mtu = mtu; if (p.peakrate.rate) { - if ((Pcell_log = tc_calc_rtable(p.peakrate.rate, ptab, Pcell_log, mtu, mpu)) < 0) { - fprintf(stderr, "POLICE: failed to calculate peak rate table.\n"); - return -1; - } - p.peakrate.cell_log = Pcell_log; - p.peakrate.mpu = mpu; + tc_calc_ratespec(&p.peakrate, ptab, p.peakrate.rate, Pcell_log, mtu, mpu, atm, overhead); } tail = NLMSG_TAIL(n); diff -Nurp iproute2.orig/tc/q_cbq.c iproute2/tc/q_cbq.c --- iproute2.orig/tc/q_cbq.c 2005-07-06 08:37:15.000000000 +1000 +++ iproute2/tc/q_cbq.c 2006-06-13 11:53:27.000000000 +1000 @@ -32,6 +32,7 @@ static void explain_class(void) fprintf(stderr, " [ prio NUMBER ] [ cell BYTES ] [ ewma LOG ]\n"); fprintf(stderr, " [ estimator INTERVAL TIME_CONSTANT ]\n"); fprintf(stderr, " [ split CLASSID ] [ defmap MASK/CHANGE ]\n"); + fprintf(stderr, " [ overhead BYTES ] [ atm ]\n"); } static void explain(void) @@ -52,7 +53,10 @@ static int cbq_parse_opt(struct qdisc_ut struct tc_ratespec r; struct tc_cbq_lssopt lss; __u32 rtab[256]; - unsigned mpu=0, avpkt=0, allot=0; + unsigned avpkt=0, allot=0; + __u8 mpu=0; + __s8 overhead=0; + int atm=0; int cell_log=-1; int ewma_log=-1; struct rtattr *tail; @@ -102,7 +106,7 @@ static int cbq_parse_opt(struct qdisc_ut } } else if (strcmp(*argv, "mpu") == 0) { NEXT_ARG(); - if (get_size(&mpu, *argv)) { + if (get_u8(&mpu, *argv, 10)) { explain1("mpu"); return -1; } @@ -113,6 +117,14 @@ static int cbq_parse_opt(struct qdisc_ut explain1("allot"); return -1; } + } else if (strcmp(*argv, "overhead") == 0) { + NEXT_ARG(); + if (get_s8(&overhead, *argv, 10)) { + explain1("overhead"); + return -1; + } + } else if (strcmp(*argv, "atm") == 0) { + atm = 1; } else if (strcmp(*argv, "help") == 0) { explain(); return -1; @@ -137,12 +149,7 @@ static int cbq_parse_opt(struct qdisc_ut if (allot < (avpkt*3)/2) allot = (avpkt*3)/2; - if ((cell_log = tc_calc_rtable(r.rate, rtab, cell_log, allot, mpu)) < 0) { - fprintf(stderr, "CBQ: failed to calculate rate table.\n"); - return -1; - } - r.cell_log = cell_log; - r.mpu = mpu; + tc_calc_ratespec(&r, rtab, r.rate, cell_log, allot, mpu, atm, overhead); if (ewma_log < 0) ewma_log = TC_CBQ_DEF_EWMA; @@ -175,7 +182,9 @@ static int cbq_parse_class_opt(struct qd struct tc_cbq_fopt fopt; struct tc_cbq_ovl ovl; __u32 rtab[256]; - unsigned mpu=0; + __u8 mpu=0; + __s8 overhead = 0; + int atm = 0; int cell_log=-1; int ewma_log=-1; unsigned bndw = 0; @@ -289,10 +298,18 @@ static int cbq_parse_class_opt(struct qd lss.change |= TCF_CBQ_LSS_AVPKT; } else if (strcmp(*argv, "mpu") == 0) { NEXT_ARG(); - if (get_size(&mpu, *argv)) { + if (get_u8(&mpu, *argv, 10)) { explain1("mpu"); return -1; } + } else if (strcmp(*argv, "overhead") == 0) { + NEXT_ARG(); + if (get_s8(&overhead, *argv, 10)) { + explain1("overhead"); + return -1; + } + } else if (strcmp(*argv, "atm") == 0) { + atm = 1; } else if (strcmp(*argv, "weight") == 0) { NEXT_ARG(); if (get_size(&wrr.weight, *argv)) { @@ -336,12 +353,7 @@ static int cbq_parse_class_opt(struct qd unsigned pktsize = wrr.allot; if (wrr.allot < (lss.avpkt*3)/2) wrr.allot = (lss.avpkt*3)/2; - if ((cell_log = tc_calc_rtable(r.rate, rtab, cell_log, pktsize, mpu)) < 0) { - fprintf(stderr, "CBQ: failed to calculate rate table.\n"); - return -1; - } - r.cell_log = cell_log; - r.mpu = mpu; + tc_calc_ratespec(&r, rtab, r.rate, cell_log, pktsize, mpu, atm, overhead); } if (ewma_log < 0) ewma_log = TC_CBQ_DEF_EWMA; @@ -463,8 +475,12 @@ static int cbq_print_opt(struct qdisc_ut fprintf(f, "rate %s ", buf); if (show_details) { fprintf(f, "cell %ub ", 1<<r->cell_log); - if (r->mpu) - fprintf(f, "mpu %ub ", r->mpu); + if (r->mpu & 0xff) + fprintf(f, "mpu %ub ", (__u8)r->mpu); + if ((r->mpu >> 8)) + fprintf(f, "overhead %db ", (__s8)(r->mpu >> 8)); + if (r->feature & 0x0001) + fprintf(f, "atm "); } } if (lss && lss->flags) { diff -Nurp iproute2.orig/tc/q_htb.c iproute2/tc/q_htb.c --- iproute2.orig/tc/q_htb.c 2005-01-19 08:11:58.000000000 +1000 +++ iproute2/tc/q_htb.c 2006-06-13 11:53:27.000000000 +1000 @@ -34,14 +34,14 @@ static void explain(void) " default minor id of class to which unclassified packets are sent {0}\n" " r2q DRR quantums are computed as rate in Bps/r2q {10}\n" " debug string of 16 numbers each 0-3 {0}\n\n" - "... class add ... htb rate R1 [burst B1] [mpu B] [overhead O]\n" + "... class add ... htb rate R1 [burst B1] [mpu B] [overhead O] [atm]\n" " [prio P] [slot S] [pslot PS]\n" " [ceil R2] [cburst B2] [mtu MTU] [quantum Q]\n" " rate rate allocated to this class (class can still borrow)\n" " burst max bytes burst which can be accumulated during idle period {computed}\n" " mpu minimum packet size used in rate computations\n" " overhead per-packet size overhead used in rate computations\n" - + " atm include atm cell tax in rate computations\n" " ceil definite upper class rate (no borrows) {rate}\n" " cburst burst but for ceil {computed}\n" " mtu max packet size we create rate map for {1600}\n" @@ -107,8 +107,10 @@ static int htb_parse_class_opt(struct qd __u32 rtab[256],ctab[256]; unsigned buffer=0,cbuffer=0; int cell_log=-1,ccell_log = -1; - unsigned mtu, mpu; - unsigned char mpu8 = 0, overhead = 0; + unsigned mtu; + __u8 mpu8=0; + __s8 overhead=0; + int atm=0; struct rtattr *tail; memset(&opt, 0, sizeof(opt)); mtu = 1600; /* eth packet len */ @@ -132,9 +134,11 @@ static int htb_parse_class_opt(struct qd } } else if (matches(*argv, "overhead") == 0) { NEXT_ARG(); - if (get_u8(&overhead, *argv, 10)) { + if (get_s8(&overhead, *argv, 10)) { explain1("overhead"); return -1; } + } else if (matches(*argv, "atm") == 0) { + atm = 1; } else if (matches(*argv, "quantum") == 0) { NEXT_ARG(); if (get_u32(&opt.quantum, *argv, 10)) { @@ -206,23 +210,11 @@ static int htb_parse_class_opt(struct qd if (!buffer) buffer = opt.rate.rate / get_hz() + mtu; if (!cbuffer) cbuffer = opt.ceil.rate / get_hz() + mtu; -/* encode overhead and mpu, 8 bits each, into lower 16 bits */ - mpu = (unsigned)mpu8 | (unsigned)overhead << 8; - opt.ceil.mpu = mpu; opt.rate.mpu = mpu; - - if ((cell_log = tc_calc_rtable(opt.rate.rate, rtab, cell_log, mtu, mpu)) < 0) { - fprintf(stderr, "htb: failed to calculate rate table.\n"); - return -1; - } + /* encode overhead and mpu, 8 bits each, into lower 16 bits */ + tc_calc_ratespec(&opt.rate, rtab, opt.rate.rate, cell_log, mtu, mpu8, atm, overhead); + tc_calc_ratespec(&opt.ceil, ctab, opt.ceil.rate, cell_log, mtu, mpu8, atm, overhead); opt.buffer = tc_calc_xmittime(opt.rate.rate, buffer); - opt.rate.cell_log = cell_log; - - if ((ccell_log = tc_calc_rtable(opt.ceil.rate, ctab, cell_log, mtu, mpu)) < 0) { - fprintf(stderr, "htb: failed to calculate ceil rate table.\n"); - return -1; - } opt.cbuffer = tc_calc_xmittime(opt.ceil.rate, cbuffer); - opt.ceil.cell_log = ccell_log; tail = NLMSG_TAIL(n); addattr_l(n, 1024, TCA_OPTIONS, NULL, 0); @@ -267,12 +259,16 @@ static int htb_print_opt(struct qdisc_ut sprint_size(buffer, b1), 1<<hopt->rate.cell_log, sprint_size(hopt->rate.mpu&0xFF, b2), - sprint_size((hopt->rate.mpu>>8)&0xFF, b3)); + sprint_size((__s8)(hopt->rate.mpu>>8), b3)); + if (hopt->rate.feature & 0x0001) + fprintf(f, "atm "); fprintf(f, "cburst %s/%u mpu %s overhead %s ", sprint_size(cbuffer, b1), 1<<hopt->ceil.cell_log, sprint_size(hopt->ceil.mpu&0xFF, b2), - sprint_size((hopt->ceil.mpu>>8)&0xFF, b3)); + sprint_size((__s8)(hopt->ceil.mpu>>8), b3)); + if (hopt->ceil.feature & 0x0001) + fprintf(f, "atm "); fprintf(f, "level %d ", (int)hopt->level); } else { fprintf(f, "burst %s ", sprint_size(buffer, b1)); diff -Nurp iproute2.orig/tc/q_tbf.c iproute2/tc/q_tbf.c --- iproute2.orig/tc/q_tbf.c 2005-01-19 08:11:58.000000000 +1000 +++ iproute2/tc/q_tbf.c 2006-06-13 11:53:27.000000000 +1000 @@ -26,7 +26,7 @@ static void explain(void) { fprintf(stderr, "Usage: ... tbf limit BYTES burst BYTES[/BYTES] rate KBPS [ mtu BYTES[/BYTES] ]\n"); - fprintf(stderr, " [ peakrate KBPS ] [ latency TIME ]\n"); + fprintf(stderr, " [ peakrate KBPS ] [ latency TIME ] [ overhead OVERHEAD ] [ atm ]\n"); } static void explain1(char *arg) @@ -43,7 +43,10 @@ static int tbf_parse_opt(struct qdisc_ut struct tc_tbf_qopt opt; __u32 rtab[256]; __u32 ptab[256]; - unsigned buffer=0, mtu=0, mpu=0, latency=0; + unsigned buffer=0, mtu=0, latency=0; + __u8 mpu=0; + __s8 overhead=0; + int atm=0; int Rcell_log=-1, Pcell_log = -1; struct rtattr *tail; @@ -103,7 +106,7 @@ static int tbf_parse_opt(struct qdisc_ut fprintf(stderr, "Double \"mpu\" spec\n"); return -1; } - if (get_size(&mpu, *argv)) { + if (get_u8(&mpu, *argv, 10)) { explain1("mpu"); return -1; } @@ -119,6 +122,20 @@ static int tbf_parse_opt(struct qdisc_ut return -1; } ok++; + } else if (strcmp(*argv, "overhead") == 0) { + NEXT_ARG(); + if (overhead) { + fprintf(stderr, "Double \"overhead\" spec\n"); + return -1; + } + if (get_s8(&overhead, *argv, 10)) { + explain1("overhead"); + return -1; + } + ok++; + } else if (strcmp(*argv, "atm") == 0) { + atm = 1; + ok++; } else if (matches(*argv, "peakrate") == 0) { NEXT_ARG(); if (opt.peakrate.rate) { @@ -170,21 +187,11 @@ static int tbf_parse_opt(struct qdisc_ut opt.limit = lim; } - if ((Rcell_log = tc_calc_rtable(opt.rate.rate, rtab, Rcell_log, mtu, mpu)) < 0) { - fprintf(stderr, "TBF: failed to calculate rate table.\n"); - return -1; - } + tc_calc_ratespec(&opt.rate, rtab, opt.rate.rate, Rcell_log, mtu, mpu, atm, overhead); opt.buffer = tc_calc_xmittime(opt.rate.rate, buffer); - opt.rate.cell_log = Rcell_log; - opt.rate.mpu = mpu; if (opt.peakrate.rate) { - if ((Pcell_log = tc_calc_rtable(opt.peakrate.rate, ptab, Pcell_log, mtu, mpu)) < 0) { - fprintf(stderr, "TBF: failed to calculate peak rate table.\n"); - return -1; - } + tc_calc_ratespec(&opt.peakrate, ptab, opt.peakrate.rate, Pcell_log, mtu, mpu, atm, overhead); opt.mtu = tc_calc_xmittime(opt.peakrate.rate, mtu); - opt.peakrate.cell_log = Pcell_log; - opt.peakrate.mpu = mpu; } tail = NLMSG_TAIL(n); @@ -220,8 +227,12 @@ static int tbf_print_opt(struct qdisc_ut fprintf(f, "rate %s ", sprint_rate(qopt->rate.rate, b1)); buffer = ((double)qopt->rate.rate*tc_core_tick2usec(qopt->buffer))/1000000; if (show_details) { - fprintf(f, "burst %s/%u mpu %s ", sprint_size(buffer, b1), - 1<<qopt->rate.cell_log, sprint_size(qopt->rate.mpu, b2)); + fprintf(f, "burst %s/%u mpu %s overhead %d ", sprint_size(buffer, b1), + 1<<qopt->rate.cell_log, + sprint_size(qopt->rate.mpu & 0xFF, b2), + (__s8)(qopt->rate.mpu >> 8)); + if (qopt->rate.feature & 0x0001) + fprintf(f, "atm "); } else { fprintf(f, "burst %s ", sprint_size(buffer, b1)); } @@ -232,8 +243,12 @@ static int tbf_print_opt(struct qdisc_ut if (qopt->mtu || qopt->peakrate.mpu) { mtu = ((double)qopt->peakrate.rate*tc_core_tick2usec(qopt->mtu))/1000000; if (show_details) { - fprintf(f, "mtu %s/%u mpu %s ", sprint_size(mtu, b1), - 1<<qopt->peakrate.cell_log, sprint_size(qopt->peakrate.mpu, b2)); + fprintf(f, "mtu %s/%u mpu %s overhead %d ", sprint_size(mtu, b1), + 1<<qopt->peakrate.cell_log, + sprint_size(qopt->peakrate.mpu & 0xFF, b2), + (__s8)(qopt->peakrate.mpu >> 8)); + if (qopt->peakrate.feature & 0x0001) + fprintf(f, "atm "); } else { fprintf(f, "minburst %s ", sprint_size(mtu, b1)); } diff -Nurp iproute2.orig/tc/tc_core.c iproute2/tc/tc_core.c --- iproute2.orig/tc/tc_core.c 2004-07-31 06:26:15.000000000 +1000 +++ iproute2/tc/tc_core.c 2006-06-13 11:53:27.000000000 +1000 @@ -23,6 +23,9 @@ #include "tc_core.h" +#define ATM_CELL_SIZE 53 +#define ATM_CELL_PAYLOAD 48 + static __u32 t2us=1; static __u32 us2t=1; static double tick_in_usec = 1; @@ -43,33 +46,124 @@ unsigned tc_calc_xmittime(unsigned rate, } /* - rtab[pkt_len>>cell_log] = pkt_xmit_time + * Calculate the ATM cell overhead. ATM sends each packet in 48 byte + * chunks, the last chunk being padded if necessary. Each chunk carries + * an additional 5 byte overhead - the ATM header. */ +static int tc_align_to_cells(int size) +{ + int cells; + + cells = size / ATM_CELL_PAYLOAD; + if (size % ATM_CELL_PAYLOAD != 0) + cells++; + return cells * ATM_CELL_SIZE; +} -int tc_calc_rtable(unsigned bps, __u32 *rtab, int cell_log, unsigned mtu, - unsigned mpu) +/* + * The number this function calculates is subtle. Ignore it and just believe + * it works if you have a choice, otherwise .. + * + * If there we are calculating the ATM cell overhead the kernel calculations + * will be out sometimes if the range of packet sizes spanned by one + * rate table element crosses an ATM cell boundary. Consider these three + * senarios: + * (a) the packet is sent across the ATM link without addition + * overheads the kernel doesn''t know about, and + * (b) a packet that has 1 byte of additional overhead the kernel + * doesn''t know about. Here + * (c) a packet that has 2 bytes of additional overhead the + * kernel doesn''t know about. + * The table below presents what happens. Each row is for a single rate + * table element. The "Sizes" column shows what packet sizes the rate table + * element will be used for. This packet size includes the "unknown to + * kernel" overhead, but does not include overhead incurred by breaking the + * packet up into ATM cells. This ATM cell overhead consists of the 5 byte + * header per ATM cell, plus the padding in the last cell. The "ATM" column + * shows how many bytes are actually sent across the ATM link, ie it does + * include the ATM cell overhead. + * + * RateTable Entry Sizes(a) ATM(a) Sizes(b) ATM(b) Sizes(c) ATM(c) + * ratetable[0] 0..7 53 1..8 53 2..9 53 + * ratetable[1] 8..15 53 9..16 53 2..17 53 + * ratetable[2] 16..23 53 17..24 53 18..25 53 + * ratetable[3] 24..31 53 25..32 53 26..33 53 + * ratetable[4] 32..39 53 33..40 53 34..41 53 + * ratetable[5] 40..47 53 41..48 53 42..49 53,106 + * ratetable[6] 48..55 53,106 49..56 106 50..57 106 + * + * For senario (a), the ratetable[6] entry covers two cases: one were a single + * ATM cell is needed to transmit the data, and one where two ATM cells are + * required. It can''t be right for both. Unfortunately the error is large. + * The same problem arises in senario (c) for ratetable[5]. The problem + * doesn''t happen for senario (b), because the boundary between rate table + * entries happens to match the boundary between ATM cells. + * + * What we would like to do is ensure that ratetable boundaries always match + * the ATM cells. If we do this the error goes away. The solution is to make + * the kernel add a small bias to the packet size. (Small because the bias + * will always be smaller than cell_log.) Adding this small bias will in + * effect slide the ratetable along a bit, so the boundaries match. The code + * below calculates that bias. Provided the MTU is less than 4092, doing + * this can always eliminate the error. + * + * Old kernels won''t add this bias, so they will have the error described above + * in most cases. In the worst case senario, considering all possible ATM cell + * sizes (1..48), for 7 of these sizes the old kernel will calculate the rate + * wrongly - ie, be out by 53 bytes. + */ +static int tc_calc_cell_align(int atm_cell_tax, char overhead, int cell_log) +{ + int cell_size; + + if (!atm_cell_tax) + return 0; + cell_size = 1 << cell_log; + return (overhead + cell_size - 2) % cell_size - cell_size + 1; +} + + +/* + * A constructor for a tc_ratespec. + */ +void tc_calc_ratespec(struct tc_ratespec* spec, __u32* rtab, unsigned bps, + int cell_log, unsigned mtu, unsigned char mpu, int atm_cell_tax, + char overhead) { int i; - unsigned overhead = (mpu >> 8) & 0xFF; - mpu = mpu & 0xFF; if (mtu == 0) mtu = 2047; + /* rtab[pkt_len>>cell_log] = pkt_xmit_time */ if (cell_log < 0) { cell_log = 0; while ((mtu>>cell_log) > 255) cell_log++; } + for (i=0; i<256; i++) { - unsigned sz = (i<<cell_log); - if (overhead) - sz += overhead; + /* + * sz is the length of packet we will use for this ratetable + * entry. The time taken to send a packet of this length will + * be used for all packet lengths this ratetable entry applies + * to. As underestimating how long it will take to transmit a + * packet is a worse error than overestimating it, the longest + * packet this rate table entry applies to is used. + */ + int sz = ((i+1)<<cell_log) - 1 + overhead; if (sz < mpu) sz = mpu; - rtab[i] = tc_core_usec2tick(1000000*((double)sz/bps)); + if (atm_cell_tax) + sz = tc_align_to_cells(sz); + rtab[i] = tc_calc_xmittime(bps, sz); } - return cell_log; + + spec->cell_align = tc_calc_cell_align(atm_cell_tax, overhead, cell_log); + spec->cell_log = cell_log; + spec->feature = 0x8000 | (atm_cell_tax ? 1 : 0); + spec->mpu = mpu | (unsigned)(overhead << 8); + spec->rate = bps; } int tc_core_init() diff -Nurp iproute2.orig/tc/tc_core.h iproute2/tc/tc_core.h --- iproute2.orig/tc/tc_core.h 2005-03-15 05:02:41.000000000 +1000 +++ iproute2/tc/tc_core.h 2006-06-13 11:53:27.000000000 +1000 @@ -7,7 +7,9 @@ long tc_core_usec2tick(long usec); long tc_core_tick2usec(long tick); unsigned tc_calc_xmittime(unsigned rate, unsigned size); -int tc_calc_rtable(unsigned bps, __u32 *rtab, int cell_log, unsigned mtu, unsigned mpu); +void tc_calc_ratespec(struct tc_ratespec* spec, __u32* rtab, unsigned bps, + int cell_log, unsigned mtu, unsigned char mpu, int atm_cell_tax, + char overhead); int tc_setup_estimator(unsigned A, unsigned time_const, struct tc_estimator *est);
Alan Cox
2006-Jun-14 10:57 UTC
Re: [PATCH 2/2] NET: Accurate packet scheduling for ATM/ADSL (userspace)
Ar Mer, 2006-06-14 am 11:40 +0200, ysgrifennodd Jesper Dangaard Brouer:> option to calculate traffic transmission times (rate table) > over all ATM links, including ADSL, with perfect accuracy.<Pedant> Only if the lowest level is encoded in a time linear manner. If you are using NRZ, NRZI etc at the bottom level then you may still be out... </Pedant> The other problem I see with this code is it is very tightly tied to ATM cell sizes, not to solving the generic question of packetisation. I''m not sure if that matters but for modern processors I''m also sceptical that the clever computation is actually any faster than just doing the maths, especially if something cache intensive is also running. Alan - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Jesper Dangaard Brouer
2006-Jun-14 13:18 UTC
Re: [PATCH 2/2] NET: Accurate packet scheduling for ATM/ADSL (userspace)
On Wed, 2006-06-14 at 11:57 +0100, Alan Cox wrote:> Ar Mer, 2006-06-14 am 11:40 +0200, ysgrifennodd Jesper Dangaard Brouer: > > option to calculate traffic transmission times (rate table) > > over all ATM links, including ADSL, with perfect accuracy. > > The other problem I see with this code is it is very tightly tied to ATM > cell sizes, not to solving the generic question of packetisation.Well, we did consider to do so, but we though that it would be harder to get it into the kernel. Actually thats the reason for the defines: #define ATM_CELL_SIZE 53 #define ATM_CELL_PAYLOAD 48 Changing these should should make it possible to adapt to any other SAR (Segment And Reasembly) link layer.> I''m > not sure if that matters but for modern processors I''m also sceptical > that the clever computation is actually any faster than just doing the > maths, especially if something cache intensive is also running.I guess you are refering to the rate table lookup system, that is based upon array lookups. I do think that the rate table array lookup system has been outdated, as memory access is the bottleneck on modern CPUs. But its design by Alexey for a long time ago where the hardware restrictions were different. It also avoids floting point operations in the kernel. Thanks for your comments. -- Med venlig hilsen / Best regards Jesper Brouer ComX Networks A/S Linux Network developer Cand. Scient Datalog / MSc. Author of http://adsl-optimizer.dk
Jesper Dangaard Brouer
2006-Jun-16 08:27 UTC
[PATCH 2/2] NET: Accurate packet scheduling for ATM/ADSL (userspace)
(Resend message bounced to LARTC) The Linux traffic''s control engine inaccurately calculates transmission times for packets sent over ADSL links. For some packet sizes the error rises to over 50%. This occurs because ADSL uses ATM as its link layer transport, and ATM transmits packets in fixed sized 53 byte cells. This changes the userspace tool iproute2/tc by adding an option to calculate traffic transmission times (rate table) over all ATM links, including ADSL, with perfect accuracy. A longer presentation of the patch, its rational, what it does and how to use it can be found here: http://www.stuart.id.au/russell/files/tc/tc-atm/ A earlier version of the patch, and a _detailed_ empirical investigation of its effects can be found here: http://www.adsl-optimizer.dk/ Signed-off-by: Jesper Dangaard Brouer <hawk@comx.dk> Signed-off-by: Russell Stuart <russell-tcatm@stuart.id.au> --- diff -Nurp iproute2.orig/include/linux/pkt_sched.h iproute2/include/linux/pkt_sched.h --- iproute2.orig/include/linux/pkt_sched.h 2005-12-10 09:27:44.000000000 +1000 +++ iproute2/include/linux/pkt_sched.h 2006-06-13 11:53:27.000000000 +1000 @@ -77,8 +77,9 @@ struct tc_ratespec { unsigned char cell_log; unsigned char __reserved; - unsigned short feature; - short addend; + unsigned short feature; /* Always 0 in pre-atm patch kernels */ + char cell_align; /* Always 0 in pre-atm patch kernels */ + unsigned char __unused; unsigned short mpu; __u32 rate; }; diff -Nurp iproute2.orig/tc/m_police.c iproute2/tc/m_police.c --- iproute2.orig/tc/m_police.c 2005-01-19 08:11:58.000000000 +1000 +++ iproute2/tc/m_police.c 2006-06-13 11:53:27.000000000 +1000 @@ -35,7 +35,7 @@ struct action_util police_action_util static void explain(void) { fprintf(stderr, "Usage: ... police rate BPS burst BYTES[/BYTES] [ mtu BYTES[/BYTES] ]\n"); - fprintf(stderr, " [ peakrate BPS ] [ avrate BPS ]\n"); + fprintf(stderr, " [ peakrate BPS ] [ avrate BPS ] [ overhead OVERHEAD ] [ atm ]\n"); fprintf(stderr, " [ ACTIONTERM ]\n"); fprintf(stderr, "Old Syntax ACTIONTERM := action <EXCEEDACT>[/NOTEXCEEDACT] \n"); fprintf(stderr, "New Syntax ACTIONTERM := conform-exceed <EXCEEDACT>[/NOTEXCEEDACT] \n"); @@ -134,7 +134,10 @@ int act_parse_police(struct action_util __u32 ptab[256]; __u32 avrate = 0; int presult = 0; - unsigned buffer=0, mtu=0, mpu=0; + unsigned buffer=0, mtu=0; + __u8 mpu=0; + __s8 overhead=0; + int atm=0; int Rcell_log=-1, Pcell_log = -1; struct rtattr *tail; @@ -184,7 +187,7 @@ int act_parse_police(struct action_util fprintf(stderr, "Double \"mpu\" spec\n"); return -1; } - if (get_size(&mpu, *argv)) { + if (get_u8(&mpu, *argv, 10)) { explain1("mpu"); return -1; } @@ -198,6 +201,18 @@ int act_parse_police(struct action_util explain1("rate"); return -1; } + } else if (strcmp(*argv, "overhead") == 0) { + NEXT_ARG(); + if (p.rate.rate) { + fprintf(stderr, "Double \"overhead\" spec\n"); + return -1; + } + if (get_s8(&overhead, *argv, 10)) { + explain1("overhead"); + return -1; + } + } else if (strcmp(*argv, "atm") == 0) { + atm = 1; } else if (strcmp(*argv, "avrate") == 0) { NEXT_ARG(); if (avrate) { @@ -264,22 +279,12 @@ int act_parse_police(struct action_util } if (p.rate.rate) { - if ((Rcell_log = tc_calc_rtable(p.rate.rate, rtab, Rcell_log, mtu, mpu)) < 0) { - fprintf(stderr, "TBF: failed to calculate rate table.\n"); - return -1; - } + tc_calc_ratespec(&p.rate, rtab, p.rate.rate, Rcell_log, mtu, mpu, atm, overhead); p.burst = tc_calc_xmittime(p.rate.rate, buffer); - p.rate.cell_log = Rcell_log; - p.rate.mpu = mpu; } p.mtu = mtu; if (p.peakrate.rate) { - if ((Pcell_log = tc_calc_rtable(p.peakrate.rate, ptab, Pcell_log, mtu, mpu)) < 0) { - fprintf(stderr, "POLICE: failed to calculate peak rate table.\n"); - return -1; - } - p.peakrate.cell_log = Pcell_log; - p.peakrate.mpu = mpu; + tc_calc_ratespec(&p.peakrate, ptab, p.peakrate.rate, Pcell_log, mtu, mpu, atm, overhead); } tail = NLMSG_TAIL(n); diff -Nurp iproute2.orig/tc/q_cbq.c iproute2/tc/q_cbq.c --- iproute2.orig/tc/q_cbq.c 2005-07-06 08:37:15.000000000 +1000 +++ iproute2/tc/q_cbq.c 2006-06-13 11:53:27.000000000 +1000 @@ -32,6 +32,7 @@ static void explain_class(void) fprintf(stderr, " [ prio NUMBER ] [ cell BYTES ] [ ewma LOG ]\n"); fprintf(stderr, " [ estimator INTERVAL TIME_CONSTANT ]\n"); fprintf(stderr, " [ split CLASSID ] [ defmap MASK/CHANGE ]\n"); + fprintf(stderr, " [ overhead BYTES ] [ atm ]\n"); } static void explain(void) @@ -52,7 +53,10 @@ static int cbq_parse_opt(struct qdisc_ut struct tc_ratespec r; struct tc_cbq_lssopt lss; __u32 rtab[256]; - unsigned mpu=0, avpkt=0, allot=0; + unsigned avpkt=0, allot=0; + __u8 mpu=0; + __s8 overhead=0; + int atm=0; int cell_log=-1; int ewma_log=-1; struct rtattr *tail; @@ -102,7 +106,7 @@ static int cbq_parse_opt(struct qdisc_ut } } else if (strcmp(*argv, "mpu") == 0) { NEXT_ARG(); - if (get_size(&mpu, *argv)) { + if (get_u8(&mpu, *argv, 10)) { explain1("mpu"); return -1; } @@ -113,6 +117,14 @@ static int cbq_parse_opt(struct qdisc_ut explain1("allot"); return -1; } + } else if (strcmp(*argv, "overhead") == 0) { + NEXT_ARG(); + if (get_s8(&overhead, *argv, 10)) { + explain1("overhead"); + return -1; + } + } else if (strcmp(*argv, "atm") == 0) { + atm = 1; } else if (strcmp(*argv, "help") == 0) { explain(); return -1; @@ -137,12 +149,7 @@ static int cbq_parse_opt(struct qdisc_ut if (allot < (avpkt*3)/2) allot = (avpkt*3)/2; - if ((cell_log = tc_calc_rtable(r.rate, rtab, cell_log, allot, mpu)) < 0) { - fprintf(stderr, "CBQ: failed to calculate rate table.\n"); - return -1; - } - r.cell_log = cell_log; - r.mpu = mpu; + tc_calc_ratespec(&r, rtab, r.rate, cell_log, allot, mpu, atm, overhead); if (ewma_log < 0) ewma_log = TC_CBQ_DEF_EWMA; @@ -175,7 +182,9 @@ static int cbq_parse_class_opt(struct qd struct tc_cbq_fopt fopt; struct tc_cbq_ovl ovl; __u32 rtab[256]; - unsigned mpu=0; + __u8 mpu=0; + __s8 overhead = 0; + int atm = 0; int cell_log=-1; int ewma_log=-1; unsigned bndw = 0; @@ -289,10 +298,18 @@ static int cbq_parse_class_opt(struct qd lss.change |= TCF_CBQ_LSS_AVPKT; } else if (strcmp(*argv, "mpu") == 0) { NEXT_ARG(); - if (get_size(&mpu, *argv)) { + if (get_u8(&mpu, *argv, 10)) { explain1("mpu"); return -1; } + } else if (strcmp(*argv, "overhead") == 0) { + NEXT_ARG(); + if (get_s8(&overhead, *argv, 10)) { + explain1("overhead"); + return -1; + } + } else if (strcmp(*argv, "atm") == 0) { + atm = 1; } else if (strcmp(*argv, "weight") == 0) { NEXT_ARG(); if (get_size(&wrr.weight, *argv)) { @@ -336,12 +353,7 @@ static int cbq_parse_class_opt(struct qd unsigned pktsize = wrr.allot; if (wrr.allot < (lss.avpkt*3)/2) wrr.allot = (lss.avpkt*3)/2; - if ((cell_log = tc_calc_rtable(r.rate, rtab, cell_log, pktsize, mpu)) < 0) { - fprintf(stderr, "CBQ: failed to calculate rate table.\n"); - return -1; - } - r.cell_log = cell_log; - r.mpu = mpu; + tc_calc_ratespec(&r, rtab, r.rate, cell_log, pktsize, mpu, atm, overhead); } if (ewma_log < 0) ewma_log = TC_CBQ_DEF_EWMA; @@ -463,8 +475,12 @@ static int cbq_print_opt(struct qdisc_ut fprintf(f, "rate %s ", buf); if (show_details) { fprintf(f, "cell %ub ", 1<<r->cell_log); - if (r->mpu) - fprintf(f, "mpu %ub ", r->mpu); + if (r->mpu & 0xff) + fprintf(f, "mpu %ub ", (__u8)r->mpu); + if ((r->mpu >> 8)) + fprintf(f, "overhead %db ", (__s8)(r->mpu >> 8)); + if (r->feature & 0x0001) + fprintf(f, "atm "); } } if (lss && lss->flags) { diff -Nurp iproute2.orig/tc/q_htb.c iproute2/tc/q_htb.c --- iproute2.orig/tc/q_htb.c 2005-01-19 08:11:58.000000000 +1000 +++ iproute2/tc/q_htb.c 2006-06-13 11:53:27.000000000 +1000 @@ -34,14 +34,14 @@ static void explain(void) " default minor id of class to which unclassified packets are sent {0}\n" " r2q DRR quantums are computed as rate in Bps/r2q {10}\n" " debug string of 16 numbers each 0-3 {0}\n\n" - "... class add ... htb rate R1 [burst B1] [mpu B] [overhead O]\n" + "... class add ... htb rate R1 [burst B1] [mpu B] [overhead O] [atm]\n" " [prio P] [slot S] [pslot PS]\n" " [ceil R2] [cburst B2] [mtu MTU] [quantum Q]\n" " rate rate allocated to this class (class can still borrow)\n" " burst max bytes burst which can be accumulated during idle period {computed}\n" " mpu minimum packet size used in rate computations\n" " overhead per-packet size overhead used in rate computations\n" - + " atm include atm cell tax in rate computations\n" " ceil definite upper class rate (no borrows) {rate}\n" " cburst burst but for ceil {computed}\n" " mtu max packet size we create rate map for {1600}\n" @@ -107,8 +107,10 @@ static int htb_parse_class_opt(struct qd __u32 rtab[256],ctab[256]; unsigned buffer=0,cbuffer=0; int cell_log=-1,ccell_log = -1; - unsigned mtu, mpu; - unsigned char mpu8 = 0, overhead = 0; + unsigned mtu; + __u8 mpu8=0; + __s8 overhead=0; + int atm=0; struct rtattr *tail; memset(&opt, 0, sizeof(opt)); mtu = 1600; /* eth packet len */ @@ -132,9 +134,11 @@ static int htb_parse_class_opt(struct qd } } else if (matches(*argv, "overhead") == 0) { NEXT_ARG(); - if (get_u8(&overhead, *argv, 10)) { + if (get_s8(&overhead, *argv, 10)) { explain1("overhead"); return -1; } + } else if (matches(*argv, "atm") == 0) { + atm = 1; } else if (matches(*argv, "quantum") == 0) { NEXT_ARG(); if (get_u32(&opt.quantum, *argv, 10)) { @@ -206,23 +210,11 @@ static int htb_parse_class_opt(struct qd if (!buffer) buffer = opt.rate.rate / get_hz() + mtu; if (!cbuffer) cbuffer = opt.ceil.rate / get_hz() + mtu; -/* encode overhead and mpu, 8 bits each, into lower 16 bits */ - mpu = (unsigned)mpu8 | (unsigned)overhead << 8; - opt.ceil.mpu = mpu; opt.rate.mpu = mpu; - - if ((cell_log = tc_calc_rtable(opt.rate.rate, rtab, cell_log, mtu, mpu)) < 0) { - fprintf(stderr, "htb: failed to calculate rate table.\n"); - return -1; - } + /* encode overhead and mpu, 8 bits each, into lower 16 bits */ + tc_calc_ratespec(&opt.rate, rtab, opt.rate.rate, cell_log, mtu, mpu8, atm, overhead); + tc_calc_ratespec(&opt.ceil, ctab, opt.ceil.rate, cell_log, mtu, mpu8, atm, overhead); opt.buffer = tc_calc_xmittime(opt.rate.rate, buffer); - opt.rate.cell_log = cell_log; - - if ((ccell_log = tc_calc_rtable(opt.ceil.rate, ctab, cell_log, mtu, mpu)) < 0) { - fprintf(stderr, "htb: failed to calculate ceil rate table.\n"); - return -1; - } opt.cbuffer = tc_calc_xmittime(opt.ceil.rate, cbuffer); - opt.ceil.cell_log = ccell_log; tail = NLMSG_TAIL(n); addattr_l(n, 1024, TCA_OPTIONS, NULL, 0); @@ -267,12 +259,16 @@ static int htb_print_opt(struct qdisc_ut sprint_size(buffer, b1), 1<<hopt->rate.cell_log, sprint_size(hopt->rate.mpu&0xFF, b2), - sprint_size((hopt->rate.mpu>>8)&0xFF, b3)); + sprint_size((__s8)(hopt->rate.mpu>>8), b3)); + if (hopt->rate.feature & 0x0001) + fprintf(f, "atm "); fprintf(f, "cburst %s/%u mpu %s overhead %s ", sprint_size(cbuffer, b1), 1<<hopt->ceil.cell_log, sprint_size(hopt->ceil.mpu&0xFF, b2), - sprint_size((hopt->ceil.mpu>>8)&0xFF, b3)); + sprint_size((__s8)(hopt->ceil.mpu>>8), b3)); + if (hopt->ceil.feature & 0x0001) + fprintf(f, "atm "); fprintf(f, "level %d ", (int)hopt->level); } else { fprintf(f, "burst %s ", sprint_size(buffer, b1)); diff -Nurp iproute2.orig/tc/q_tbf.c iproute2/tc/q_tbf.c --- iproute2.orig/tc/q_tbf.c 2005-01-19 08:11:58.000000000 +1000 +++ iproute2/tc/q_tbf.c 2006-06-13 11:53:27.000000000 +1000 @@ -26,7 +26,7 @@ static void explain(void) { fprintf(stderr, "Usage: ... tbf limit BYTES burst BYTES[/BYTES] rate KBPS [ mtu BYTES[/BYTES] ]\n"); - fprintf(stderr, " [ peakrate KBPS ] [ latency TIME ]\n"); + fprintf(stderr, " [ peakrate KBPS ] [ latency TIME ] [ overhead OVERHEAD ] [ atm ]\n"); } static void explain1(char *arg) @@ -43,7 +43,10 @@ static int tbf_parse_opt(struct qdisc_ut struct tc_tbf_qopt opt; __u32 rtab[256]; __u32 ptab[256]; - unsigned buffer=0, mtu=0, mpu=0, latency=0; + unsigned buffer=0, mtu=0, latency=0; + __u8 mpu=0; + __s8 overhead=0; + int atm=0; int Rcell_log=-1, Pcell_log = -1; struct rtattr *tail; @@ -103,7 +106,7 @@ static int tbf_parse_opt(struct qdisc_ut fprintf(stderr, "Double \"mpu\" spec\n"); return -1; } - if (get_size(&mpu, *argv)) { + if (get_u8(&mpu, *argv, 10)) { explain1("mpu"); return -1; } @@ -119,6 +122,20 @@ static int tbf_parse_opt(struct qdisc_ut return -1; } ok++; + } else if (strcmp(*argv, "overhead") == 0) { + NEXT_ARG(); + if (overhead) { + fprintf(stderr, "Double \"overhead\" spec\n"); + return -1; + } + if (get_s8(&overhead, *argv, 10)) { + explain1("overhead"); + return -1; + } + ok++; + } else if (strcmp(*argv, "atm") == 0) { + atm = 1; + ok++; } else if (matches(*argv, "peakrate") == 0) { NEXT_ARG(); if (opt.peakrate.rate) { @@ -170,21 +187,11 @@ static int tbf_parse_opt(struct qdisc_ut opt.limit = lim; } - if ((Rcell_log = tc_calc_rtable(opt.rate.rate, rtab, Rcell_log, mtu, mpu)) < 0) { - fprintf(stderr, "TBF: failed to calculate rate table.\n"); - return -1; - } + tc_calc_ratespec(&opt.rate, rtab, opt.rate.rate, Rcell_log, mtu, mpu, atm, overhead); opt.buffer = tc_calc_xmittime(opt.rate.rate, buffer); - opt.rate.cell_log = Rcell_log; - opt.rate.mpu = mpu; if (opt.peakrate.rate) { - if ((Pcell_log = tc_calc_rtable(opt.peakrate.rate, ptab, Pcell_log, mtu, mpu)) < 0) { - fprintf(stderr, "TBF: failed to calculate peak rate table.\n"); - return -1; - } + tc_calc_ratespec(&opt.peakrate, ptab, opt.peakrate.rate, Pcell_log, mtu, mpu, atm, overhead); opt.mtu = tc_calc_xmittime(opt.peakrate.rate, mtu); - opt.peakrate.cell_log = Pcell_log; - opt.peakrate.mpu = mpu; } tail = NLMSG_TAIL(n); @@ -220,8 +227,12 @@ static int tbf_print_opt(struct qdisc_ut fprintf(f, "rate %s ", sprint_rate(qopt->rate.rate, b1)); buffer = ((double)qopt->rate.rate*tc_core_tick2usec(qopt->buffer))/1000000; if (show_details) { - fprintf(f, "burst %s/%u mpu %s ", sprint_size(buffer, b1), - 1<<qopt->rate.cell_log, sprint_size(qopt->rate.mpu, b2)); + fprintf(f, "burst %s/%u mpu %s overhead %d ", sprint_size(buffer, b1), + 1<<qopt->rate.cell_log, + sprint_size(qopt->rate.mpu & 0xFF, b2), + (__s8)(qopt->rate.mpu >> 8)); + if (qopt->rate.feature & 0x0001) + fprintf(f, "atm "); } else { fprintf(f, "burst %s ", sprint_size(buffer, b1)); } @@ -232,8 +243,12 @@ static int tbf_print_opt(struct qdisc_ut if (qopt->mtu || qopt->peakrate.mpu) { mtu = ((double)qopt->peakrate.rate*tc_core_tick2usec(qopt->mtu))/1000000; if (show_details) { - fprintf(f, "mtu %s/%u mpu %s ", sprint_size(mtu, b1), - 1<<qopt->peakrate.cell_log, sprint_size(qopt->peakrate.mpu, b2)); + fprintf(f, "mtu %s/%u mpu %s overhead %d ", sprint_size(mtu, b1), + 1<<qopt->peakrate.cell_log, + sprint_size(qopt->peakrate.mpu & 0xFF, b2), + (__s8)(qopt->peakrate.mpu >> 8)); + if (qopt->peakrate.feature & 0x0001) + fprintf(f, "atm "); } else { fprintf(f, "minburst %s ", sprint_size(mtu, b1)); } diff -Nurp iproute2.orig/tc/tc_core.c iproute2/tc/tc_core.c --- iproute2.orig/tc/tc_core.c 2004-07-31 06:26:15.000000000 +1000 +++ iproute2/tc/tc_core.c 2006-06-13 11:53:27.000000000 +1000 @@ -23,6 +23,9 @@ #include "tc_core.h" +#define ATM_CELL_SIZE 53 +#define ATM_CELL_PAYLOAD 48 + static __u32 t2us=1; static __u32 us2t=1; static double tick_in_usec = 1; @@ -43,33 +46,124 @@ unsigned tc_calc_xmittime(unsigned rate, } /* - rtab[pkt_len>>cell_log] = pkt_xmit_time + * Calculate the ATM cell overhead. ATM sends each packet in 48 byte + * chunks, the last chunk being padded if necessary. Each chunk carries + * an additional 5 byte overhead - the ATM header. */ +static int tc_align_to_cells(int size) +{ + int cells; + + cells = size / ATM_CELL_PAYLOAD; + if (size % ATM_CELL_PAYLOAD != 0) + cells++; + return cells * ATM_CELL_SIZE; +} -int tc_calc_rtable(unsigned bps, __u32 *rtab, int cell_log, unsigned mtu, - unsigned mpu) +/* + * The number this function calculates is subtle. Ignore it and just believe + * it works if you have a choice, otherwise .. + * + * If there we are calculating the ATM cell overhead the kernel calculations + * will be out sometimes if the range of packet sizes spanned by one + * rate table element crosses an ATM cell boundary. Consider these three + * senarios: + * (a) the packet is sent across the ATM link without addition + * overheads the kernel doesn''t know about, and + * (b) a packet that has 1 byte of additional overhead the kernel + * doesn''t know about. Here + * (c) a packet that has 2 bytes of additional overhead the + * kernel doesn''t know about. + * The table below presents what happens. Each row is for a single rate + * table element. The "Sizes" column shows what packet sizes the rate table + * element will be used for. This packet size includes the "unknown to + * kernel" overhead, but does not include overhead incurred by breaking the + * packet up into ATM cells. This ATM cell overhead consists of the 5 byte + * header per ATM cell, plus the padding in the last cell. The "ATM" column + * shows how many bytes are actually sent across the ATM link, ie it does + * include the ATM cell overhead. + * + * RateTable Entry Sizes(a) ATM(a) Sizes(b) ATM(b) Sizes(c) ATM(c) + * ratetable[0] 0..7 53 1..8 53 2..9 53 + * ratetable[1] 8..15 53 9..16 53 2..17 53 + * ratetable[2] 16..23 53 17..24 53 18..25 53 + * ratetable[3] 24..31 53 25..32 53 26..33 53 + * ratetable[4] 32..39 53 33..40 53 34..41 53 + * ratetable[5] 40..47 53 41..48 53 42..49 53,106 + * ratetable[6] 48..55 53,106 49..56 106 50..57 106 + * + * For senario (a), the ratetable[6] entry covers two cases: one were a single + * ATM cell is needed to transmit the data, and one where two ATM cells are + * required. It can''t be right for both. Unfortunately the error is large. + * The same problem arises in senario (c) for ratetable[5]. The problem + * doesn''t happen for senario (b), because the boundary between rate table + * entries happens to match the boundary between ATM cells. + * + * What we would like to do is ensure that ratetable boundaries always match + * the ATM cells. If we do this the error goes away. The solution is to make + * the kernel add a small bias to the packet size. (Small because the bias + * will always be smaller than cell_log.) Adding this small bias will in + * effect slide the ratetable along a bit, so the boundaries match. The code + * below calculates that bias. Provided the MTU is less than 4092, doing + * this can always eliminate the error. + * + * Old kernels won''t add this bias, so they will have the error described above + * in most cases. In the worst case senario, considering all possible ATM cell + * sizes (1..48), for 7 of these sizes the old kernel will calculate the rate + * wrongly - ie, be out by 53 bytes. + */ +static int tc_calc_cell_align(int atm_cell_tax, char overhead, int cell_log) +{ + int cell_size; + + if (!atm_cell_tax) + return 0; + cell_size = 1 << cell_log; + return (overhead + cell_size - 2) % cell_size - cell_size + 1; +} + + +/* + * A constructor for a tc_ratespec. + */ +void tc_calc_ratespec(struct tc_ratespec* spec, __u32* rtab, unsigned bps, + int cell_log, unsigned mtu, unsigned char mpu, int atm_cell_tax, + char overhead) { int i; - unsigned overhead = (mpu >> 8) & 0xFF; - mpu = mpu & 0xFF; if (mtu == 0) mtu = 2047; + /* rtab[pkt_len>>cell_log] = pkt_xmit_time */ if (cell_log < 0) { cell_log = 0; while ((mtu>>cell_log) > 255) cell_log++; } + for (i=0; i<256; i++) { - unsigned sz = (i<<cell_log); - if (overhead) - sz += overhead; + /* + * sz is the length of packet we will use for this ratetable + * entry. The time taken to send a packet of this length will + * be used for all packet lengths this ratetable entry applies + * to. As underestimating how long it will take to transmit a + * packet is a worse error than overestimating it, the longest + * packet this rate table entry applies to is used. + */ + int sz = ((i+1)<<cell_log) - 1 + overhead; if (sz < mpu) sz = mpu; - rtab[i] = tc_core_usec2tick(1000000*((double)sz/bps)); + if (atm_cell_tax) + sz = tc_align_to_cells(sz); + rtab[i] = tc_calc_xmittime(bps, sz); } - return cell_log; + + spec->cell_align = tc_calc_cell_align(atm_cell_tax, overhead, cell_log); + spec->cell_log = cell_log; + spec->feature = 0x8000 | (atm_cell_tax ? 1 : 0); + spec->mpu = mpu | (unsigned)(overhead << 8); + spec->rate = bps; } int tc_core_init() diff -Nurp iproute2.orig/tc/tc_core.h iproute2/tc/tc_core.h --- iproute2.orig/tc/tc_core.h 2005-03-15 05:02:41.000000000 +1000 +++ iproute2/tc/tc_core.h 2006-06-13 11:53:27.000000000 +1000 @@ -7,7 +7,9 @@ long tc_core_usec2tick(long usec); long tc_core_tick2usec(long tick); unsigned tc_calc_xmittime(unsigned rate, unsigned size); -int tc_calc_rtable(unsigned bps, __u32 *rtab, int cell_log, unsigned mtu, unsigned mpu); +void tc_calc_ratespec(struct tc_ratespec* spec, __u32* rtab, unsigned bps, + int cell_log, unsigned mtu, unsigned char mpu, int atm_cell_tax, + char overhead); int tc_setup_estimator(unsigned A, unsigned time_const, struct tc_estimator *est);
Jesper Dangaard Brouer
2006-Jun-19 19:31 UTC
Re: [PATCH 2/2] NET: Accurate packet scheduling for ATM/ADSL (userspace)
On Thu, 15 Jun 2006, jamal wrote:> On Thu, 2006-15-06 at 10:47 +1000, Russell Stuart wrote: >> On Wed, 2006-06-14 at 11:57 +0100, Alan Cox wrote: >>> The other problem I see with this code is it is very tightly tied to ATM >>> cell sizes, not to solving the generic question of packetisation. >> >> Others have made this point also. I can''t speak for Jesper, >> but I did consider making it generic.I also have considered to make it generic, but choose to make my patch as non-intrusive as possible to the kernel (and try to handle as much in userspace as possible). Actually I do think that the kernel patch part is very generic. The patch simply allow us to align the rate table/array. With the kernel patch in place, we can work on the userspace TC program to support more and more types of exotic link layer modeling.>> The issue was that >> doing so would add more code, but I don''t personally know >> of any real world situation that would use the generic >> solution. I didn''t fancy the thought of arguing on these >> lists for code that no one would actually use.;-)>> If someone could put up their hand and say "Hey, I need >> this," then expanding the patch to accommodate them would >> be a pleasure. I like generic code too. >> > > It is probably doable by just looking at netdevice->type and figuring > the link layer technology. Totally in user space and building the > compensated for tables there before telling the kernel (advantage is no > kernel changes and therefore it would work with older kernels as well).I think you have got the setup all wrong. The linux middlebox/router has two ethernet interfaces, one of the ethernet interfaces is connected to the ADSL modem. Thus, the linux ethernet card cannot determine that it is connected to an ADSL line. The patch is the solution to the classical problem people have when tryng to configure traffic control on an ADSL link? Q: The packet scheduling does not work all the time? A: Try to decrease to bandwidth. The issue here is, that ATM does not have fixed overhead (due to alignment and padding). This means that a fixed reduction of the bandwidth is not the solution. We could reduce the bandwidth to the worst-case overhead, which is 62%, I do not think that is a good solution... With the patch, you can now simply configure HTB to use the rate that was specified by the ISP. Please read chapter 6 ("Achieving Queue Control") page 55-65, where I demonstrate that the naive approach of reducing bandwidth does not work, when the packet distribution change on the link. http://www.adsl-optimizer.dk/thesis/ Cheers, Jesper Brouer -- ------------------------------------------------------------------- MSc. Master of Computer Science Dept. of Computer Science, University of Copenhagen Author of http://www.adsl-optimizer.dk ------------------------------------------------------------------- - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html