Stephen Hemminger
2004-Jul-01 18:33 UTC
[PATCH 2.6] update to network emulation QOS scheduler
This patch updates the network emulation packet scheduler. * name changed from delay to netem since it does more than just delay * merges Catalin's code to do packet reordering * uses socket queues directly rather than layering on qdisc(fifo), because this is used in performance tests. * adds placeholders in the API for future enhancements (rate and duplicate). Signed-off-by: Stephen Hemminger <shemminger@osdl.org> diff -urNp -X dontdiff linux-2.6/include/linux/pkt_sched.h sched-2.6/include/linux/pkt_sched.h --- linux-2.6/include/linux/pkt_sched.h 2004-06-24 08:52:58.000000000 -0700 +++ sched-2.6/include/linux/pkt_sched.h 2004-07-01 03:53:31.185482832 -0700 @@ -439,11 +439,14 @@ enum { #define TCA_ATM_MAX TCA_ATM_STATE -/* Delay section */ -struct tc_dly_qopt +/* Network emulator */ +struct tc_netem_qopt { - __u32 latency; - __u32 limit; - __u32 loss; + __u32 latency; /* added delay (us) */ + __u32 limit; /* fifo limit (packets) */ + __u32 loss; /* random packet loss (0=none ~0=100%) */ + __u32 gap; /* re-ordering gap (0 for delay all) */ + __u32 duplicate; /* random packet dup (0=none ~0=100%) */ + __u32 rate; /* maximum transmit rate (bytes/sec) */ }; #endif diff -urNp -X dontdiff linux-2.6/net/sched/Kconfig sched-2.6/net/sched/Kconfig --- linux-2.6/net/sched/Kconfig 2004-06-25 09:41:00.000000000 -0700 +++ sched-2.6/net/sched/Kconfig 2004-06-28 09:17:19.000000000 -0700 @@ -164,12 +164,12 @@ config NET_SCH_DSMARK To compile this code as a module, choose M here: the module will be called sch_dsmark. -config NET_SCH_DELAY - tristate "Delay simulator" +config NET_SCH_NETEM + tristate "Network emulator" depends on NET_SCHED help - Say Y if you want to delay packets by a fixed amount of - time. This is often useful to simulate network delay when + Say Y if you want to emulate network delay, loss, and packet + re-ordering. This is often useful to simulate networks when testing applications or protocols. 
To compile this driver as a module, choose M here: the module diff -urNp -X dontdiff linux-2.6/net/sched/Makefile sched-2.6/net/sched/Makefile --- linux-2.6/net/sched/Makefile 2004-06-24 08:52:58.000000000 -0700 +++ sched-2.6/net/sched/Makefile 2004-06-28 09:17:49.000000000 -0700 @@ -24,7 +24,7 @@ obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o -obj-$(CONFIG_NET_SCH_DELAY) += sch_delay.o +obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o obj-$(CONFIG_NET_CLS_FW) += cls_fw.o diff -urNp -X dontdiff linux-2.6/net/sched/sch_delay.c sched-2.6/net/sched/sch_delay.c --- linux-2.6/net/sched/sch_delay.c 2004-06-21 09:23:15.000000000 -0700 +++ sched-2.6/net/sched/sch_delay.c 1969-12-31 16:00:00.000000000 -0800 @@ -1,281 +0,0 @@ -/* - * net/sched/sch_delay.c Simple constant delay - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Authors: Stephen Hemminger <shemminger@osdl.org> - */ - -#include <linux/config.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> - -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> -#include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> -#include <net/ip.h> -#include <net/route.h> -#include <linux/skbuff.h> -#include <net/sock.h> -#include <net/pkt_sched.h> - -/* Network delay simulator - This scheduler adds a fixed delay to all packets. - Similar to NISTnet and BSD Dummynet. 
- - It uses byte fifo underneath similar to TBF */ -struct dly_sched_data { - u32 latency; - u32 limit; - u32 loss; - struct timer_list timer; - struct Qdisc *qdisc; -}; - -/* Time stamp put into socket buffer control block */ -struct dly_skb_cb { - psched_time_t queuetime; -}; - -/* Enqueue packets with underlying discipline (fifo) - * but mark them with current time first. - */ -static int dly_enqueue(struct sk_buff *skb, struct Qdisc *sch) -{ - struct dly_sched_data *q = (struct dly_sched_data *)sch->data; - struct dly_skb_cb *cb = (struct dly_skb_cb *)skb->cb; - int ret; - - /* Random packet drop 0 => none, ~0 => all */ - if (q->loss >= net_random()) { - sch->stats.drops++; - return 0; /* lie about loss so TCP doesn''t know */ - } - - PSCHED_GET_TIME(cb->queuetime); - - /* Queue to underlying scheduler */ - ret = q->qdisc->enqueue(skb, q->qdisc); - if (ret) - sch->stats.drops++; - else { - sch->q.qlen++; - sch->stats.bytes += skb->len; - sch->stats.packets++; - } - return ret; -} - -/* Requeue packets but don''t change time stamp */ -static int dly_requeue(struct sk_buff *skb, struct Qdisc *sch) -{ - struct dly_sched_data *q = (struct dly_sched_data *)sch->data; - int ret; - - ret = q->qdisc->ops->requeue(skb, q->qdisc); - if (ret == 0) - sch->q.qlen++; - return ret; -} - -static unsigned int dly_drop(struct Qdisc *sch) -{ - struct dly_sched_data *q = (struct dly_sched_data *)sch->data; - unsigned int len; - - len = q->qdisc->ops->drop(q->qdisc); - if (len) { - sch->q.qlen--; - sch->stats.drops++; - } - return len; -} - -/* Dequeue packet. - * If packet needs to be held up, then stop the - * queue and set timer to wakeup later. 
- */ -static struct sk_buff *dly_dequeue(struct Qdisc *sch) -{ - struct dly_sched_data *q = (struct dly_sched_data *)sch->data; - struct sk_buff *skb; - - retry: - skb = q->qdisc->dequeue(q->qdisc); - if (skb) { - struct dly_skb_cb *cb = (struct dly_skb_cb *)skb->cb; - psched_time_t now; - long diff, delay; - - PSCHED_GET_TIME(now); - diff = q->latency - PSCHED_TDIFF(now, cb->queuetime); - - if (diff <= 0) { - sch->q.qlen--; - sch->flags &= ~TCQ_F_THROTTLED; - return skb; - } - - if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) { - sch->q.qlen--; - sch->stats.drops++; - goto retry; - } - - delay = PSCHED_US2JIFFIE(diff); - if (delay <= 0) - delay = 1; - mod_timer(&q->timer, jiffies+delay); - - sch->flags |= TCQ_F_THROTTLED; - } - return NULL; -} - -static void dly_reset(struct Qdisc *sch) -{ - struct dly_sched_data *q = (struct dly_sched_data *)sch->data; - - qdisc_reset(q->qdisc); - sch->q.qlen = 0; - sch->flags &= ~TCQ_F_THROTTLED; - del_timer(&q->timer); -} - -static void dly_timer(unsigned long arg) -{ - struct Qdisc *sch = (struct Qdisc *)arg; - - sch->flags &= ~TCQ_F_THROTTLED; - netif_schedule(sch->dev); -} - -/* Tell Fifo the new limit. 
*/ -static int change_limit(struct Qdisc *q, u32 limit) -{ - struct rtattr *rta; - int ret; - - rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); - if (!rta) - return -ENOMEM; - - rta->rta_type = RTM_NEWQDISC; - rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt)); - ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit; - ret = q->ops->change(q, rta); - kfree(rta); - - return ret; -} - -/* Setup underlying FIFO discipline */ -static int dly_change(struct Qdisc *sch, struct rtattr *opt) -{ - struct dly_sched_data *q = (struct dly_sched_data *)sch->data; - struct tc_dly_qopt *qopt = RTA_DATA(opt); - int err; - - if (q->qdisc == &noop_qdisc) { - struct Qdisc *child - = qdisc_create_dflt(sch->dev, &bfifo_qdisc_ops); - if (!child) - return -EINVAL; - q->qdisc = child; - } - - err = change_limit(q->qdisc, qopt->limit); - if (err) { - qdisc_destroy(q->qdisc); - q->qdisc = &noop_qdisc; - } else { - q->latency = qopt->latency; - q->limit = qopt->limit; - q->loss = qopt->loss; - } - return err; -} - -static int dly_init(struct Qdisc *sch, struct rtattr *opt) -{ - struct dly_sched_data *q = (struct dly_sched_data *)sch->data; - - if (!opt) - return -EINVAL; - - init_timer(&q->timer); - q->timer.function = dly_timer; - q->timer.data = (unsigned long) sch; - q->qdisc = &noop_qdisc; - - return dly_change(sch, opt); -} - -static void dly_destroy(struct Qdisc *sch) -{ - struct dly_sched_data *q = (struct dly_sched_data *)sch->data; - - del_timer(&q->timer); - qdisc_destroy(q->qdisc); - q->qdisc = &noop_qdisc; -} - -static int dly_dump(struct Qdisc *sch, struct sk_buff *skb) -{ - struct dly_sched_data *q = (struct dly_sched_data *)sch->data; - unsigned char *b = skb->tail; - struct tc_dly_qopt qopt; - - qopt.latency = q->latency; - qopt.limit = q->limit; - qopt.loss = q->loss; - - RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); - - return skb->len; - -rtattr_failure: - skb_trim(skb, b - skb->data); - return -1; -} - -static struct Qdisc_ops dly_qdisc_ops = { - 
.id = "delay", - .priv_size = sizeof(struct dly_sched_data), - .enqueue = dly_enqueue, - .dequeue = dly_dequeue, - .requeue = dly_requeue, - .drop = dly_drop, - .init = dly_init, - .reset = dly_reset, - .destroy = dly_destroy, - .change = dly_change, - .dump = dly_dump, - .owner = THIS_MODULE, -}; - - -static int __init dly_module_init(void) -{ - return register_qdisc(&dly_qdisc_ops); -} -static void __exit dly_module_exit(void) -{ - unregister_qdisc(&dly_qdisc_ops); -} -module_init(dly_module_init) -module_exit(dly_module_exit) -MODULE_LICENSE("GPL"); diff -urNp -X dontdiff linux-2.6/net/sched/sch_netem.c sched-2.6/net/sched/sch_netem.c --- linux-2.6/net/sched/sch_netem.c 1969-12-31 16:00:00.000000000 -0800 +++ sched-2.6/net/sched/sch_netem.c 2004-06-30 14:05:13.000000000 -0700 @@ -0,0 +1,255 @@ +/* + * net/sched/sch_netem.c Network emulator + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Stephen Hemminger <shemminger@osdl.org> + * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro> + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> + +#include <net/pkt_sched.h> + +/* Network emulator + * + * This scheduler can alters spacing and order + * Similar to NISTnet and BSD Dummynet. + */ + +struct netem_sched_data { + struct sk_buff_head qnormal; + struct sk_buff_head qdelay; + struct timer_list timer; + + u32 latency; + u32 loss; + u32 counter; + u32 gap; +}; + +/* Time stamp put into socket buffer control block */ +struct netem_skb_cb { + psched_time_t time_to_send; +}; + +/* Enqueue packets with underlying discipline (fifo) + * but mark them with current time first. 
+ */ +static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct netem_sched_data *q = (struct netem_sched_data *)sch->data; + struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb; + + pr_debug("netem_enqueue skb=%p @%lu\n", skb, jiffies); + + /* Random packet drop 0 => none, ~0 => all */ + if (q->loss >= net_random()) { + sch->stats.drops++; + return 0; /* lie about loss so TCP doesn''t know */ + } + + if (q->qnormal.qlen < sch->dev->tx_queue_len) { + PSCHED_GET_TIME(cb->time_to_send); + PSCHED_TADD(cb->time_to_send, q->latency); + + __skb_queue_tail(&q->qnormal, skb); + sch->q.qlen++; + sch->stats.bytes += skb->len; + sch->stats.packets++; + return 0; + } + + sch->stats.drops++; + kfree_skb(skb); + return NET_XMIT_DROP; +} + +/* Requeue packets but don''t change time stamp */ +static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct netem_sched_data *q = (struct netem_sched_data *)sch->data; + + __skb_queue_head(&q->qnormal, skb); + sch->q.qlen++; + return 0; +} + +/* + * Check the look aside buffer list, and see if any freshly baked buffers. + * If head of queue is not baked, set timer. + */ +static struct sk_buff *netem_get_delayed(struct netem_sched_data *q) +{ + struct sk_buff *skb; + psched_time_t now; + long delay; + + skb = skb_peek(&q->qdelay); + if (skb) { + const struct netem_skb_cb *cb + = (const struct netem_skb_cb *)skb->cb; + + PSCHED_GET_TIME(now); + delay = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now)); + pr_debug("netem_dequeue: delay queue %p@%lu %ld\n", + skb, jiffies, delay); + + /* it''s baked enough */ + if (delay <= 0) { + __skb_unlink(skb, &q->qdelay); + del_timer(&q->timer); + return skb; + } + + if (!timer_pending(&q->timer)) { + q->timer.expires = jiffies + delay; + add_timer(&q->timer); + } + } + return NULL; +} + +/* Dequeue packet. + * If packet needs to be held up, then put in the delay + * queue and set timer to wakeup later. 
+ */ +static struct sk_buff *netem_dequeue(struct Qdisc *sch) +{ + struct netem_sched_data *q = (struct netem_sched_data *)sch->data; + struct sk_buff *skb; + + skb = netem_get_delayed(q); + if (!skb && (skb = __skb_dequeue(&q->qnormal))) { + /* are we doing out of order packet skip? */ + if (q->counter < q->gap) { + pr_debug("netem_dequeue: send %p normally\n", skb); + q->counter++; + } else { + /* don''t send now hold for later */ + pr_debug("netem_dequeue: hold [%p]@%lu\n", skb, jiffies); + __skb_queue_tail(&q->qdelay, skb); + q->counter = 0; + skb = netem_get_delayed(q); + } + } + + if (skb) + sch->q.qlen--; + return skb; +} + +static void netem_timer(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc *)arg; + + pr_debug("netem_timer: fired @%lu\n", jiffies); + netif_schedule(sch->dev); +} + +static void netem_reset(struct Qdisc *sch) +{ + struct netem_sched_data *q = (struct netem_sched_data *)sch->data; + + skb_queue_purge(&q->qnormal); + skb_queue_purge(&q->qdelay); + + sch->q.qlen = 0; + del_timer_sync(&q->timer); +} + +static int netem_change(struct Qdisc *sch, struct rtattr *opt) +{ + struct netem_sched_data *q = (struct netem_sched_data *)sch->data; + struct tc_netem_qopt *qopt = RTA_DATA(opt); + + if (qopt->limit) + sch->dev->tx_queue_len = qopt->limit; + + q->gap = qopt->gap; + q->loss = qopt->loss; + q->latency = qopt->latency; + + return 0; +} + +static int netem_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct netem_sched_data *q = (struct netem_sched_data *)sch->data; + + if (!opt) + return -EINVAL; + + skb_queue_head_init(&q->qnormal); + skb_queue_head_init(&q->qdelay); + init_timer(&q->timer); + q->timer.function = netem_timer; + q->timer.data = (unsigned long) sch; + q->counter = 0; + + return netem_change(sch, opt); +} + +static void netem_destroy(struct Qdisc *sch) +{ + struct netem_sched_data *q = (struct netem_sched_data *)sch->data; + + del_timer_sync(&q->timer); +} + +static int netem_dump(struct Qdisc *sch, struct sk_buff 
*skb) +{ + struct netem_sched_data *q = (struct netem_sched_data *)sch->data; + unsigned char *b = skb->tail; + struct tc_netem_qopt qopt; + + qopt.latency = q->latency; + qopt.limit = sch->dev->tx_queue_len; + qopt.loss = q->loss; + qopt.gap = q->gap; + + RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static struct Qdisc_ops netem_qdisc_ops = { + .id = "netem", + .priv_size = sizeof(struct netem_sched_data), + .enqueue = netem_enqueue, + .dequeue = netem_dequeue, + .requeue = netem_requeue, + .init = netem_init, + .reset = netem_reset, + .destroy = netem_destroy, + .change = netem_change, + .dump = netem_dump, + .owner = THIS_MODULE, +}; + + +static int __init netem_module_init(void) +{ + return register_qdisc(&netem_qdisc_ops); +} +static void __exit netem_module_exit(void) +{ + unregister_qdisc(&netem_qdisc_ops); +} +module_init(netem_module_init) +module_exit(netem_module_exit) +MODULE_LICENSE("GPL");
Stephen Hemminger
2004-Jul-01 20:11 UTC
[PATCH 2.4] update to network emulation QOS scheduler
This is the 2.4 version of the conversion of the simple network delay scheduler to the network emulator. Signed-off-by: Stephen Hemminger <shemminger@osdl.org> diff -Nru a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h --- a/include/linux/pkt_sched.h 2004-07-01 13:06:36 -07:00 +++ b/include/linux/pkt_sched.h 2004-07-01 13:06:36 -07:00 @@ -432,12 +432,15 @@ #define TCA_ATM_MAX TCA_ATM_STATE -/* Delay section */ -struct tc_dly_qopt +/* Network emulator */ +struct tc_netem_qopt { - __u32 latency; - __u32 limit; - __u32 loss; + __u32 latency; /* added delay (us) */ + __u32 limit; /* fifo limit (packets) */ + __u32 loss; /* random packet loss (0=none ~0=100%) */ + __u32 gap; /* re-ordering gap (0 for delay all) */ + __u32 duplicate; /* random packet dup (0=none ~0=100%) */ + __u32 rate; /* maximum transmit rate (bytes/sec) */ }; #endif diff -Nru a/net/sched/Config.in b/net/sched/Config.in --- a/net/sched/Config.in 2004-07-01 13:06:36 -07:00 +++ b/net/sched/Config.in 2004-07-01 13:06:36 -07:00 @@ -15,7 +15,7 @@ tristate ' TEQL queue' CONFIG_NET_SCH_TEQL tristate ' TBF queue' CONFIG_NET_SCH_TBF tristate ' GRED queue' CONFIG_NET_SCH_GRED -tristate ' Network delay simulator' CONFIG_NET_SCH_DELAY +tristate ' Network emulator' CONFIG_NET_SCH_NETEM tristate ' Diffserv field marker' CONFIG_NET_SCH_DSMARK if [ "$CONFIG_NETFILTER" = "y" ]; then tristate ' Ingress Qdisc' CONFIG_NET_SCH_INGRESS diff -Nru a/net/sched/Makefile b/net/sched/Makefile --- a/net/sched/Makefile 2004-07-01 13:06:36 -07:00 +++ b/net/sched/Makefile 2004-07-01 13:06:36 -07:00 @@ -14,7 +14,7 @@ obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_CSZ) += sch_csz.o -obj-$(CONFIG_NET_SCH_DELAY) += sch_delay.o +obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o obj-$(CONFIG_NET_SCH_HPFQ) += sch_hpfq.o obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o diff -Nru a/net/sched/sch_delay.c b/net/sched/sch_delay.c --- 
a/net/sched/sch_delay.c 2004-07-01 13:06:36 -07:00 +++ /dev/null Wed Dec 31 16:00:00 196900 @@ -1,289 +0,0 @@ -/* - * net/sched/sch_delay.c Simple constant delay - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Authors: Stephen Hemminger <shemminger@osdl.org> - */ - -#include <linux/config.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> - -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> -#include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> -#include <net/ip.h> -#include <net/route.h> -#include <linux/skbuff.h> -#include <net/sock.h> -#include <net/pkt_sched.h> - -/* Network delay simulator - This scheduler adds a fixed delay to all packets. - Similar to NISTnet and BSD Dummynet. - - It uses byte fifo underneath similar to TBF */ -struct dly_sched_data { - u32 latency; - u32 limit; - u32 loss; - struct timer_list timer; - struct Qdisc *qdisc; -}; - -/* Time stamp put into socket buffer control block */ -struct dly_skb_cb { - psched_time_t queuetime; -}; - -/* Enqueue packets with underlying discipline (fifo) - * but mark them with current time first. 
- */ -static int dly_enqueue(struct sk_buff *skb, struct Qdisc *sch) -{ - struct dly_sched_data *q = (struct dly_sched_data *)sch->data; - struct dly_skb_cb *cb = (struct dly_skb_cb *)skb->cb; - int ret; - - /* Random packet drop 0 => none, ~0 => all */ - if (q->loss >= net_random()) { - sch->stats.drops++; - return 0; /* lie about loss so TCP doesn''t know */ - } - - PSCHED_GET_TIME(cb->queuetime); - - /* Queue to underlying scheduler */ - ret = q->qdisc->enqueue(skb, q->qdisc); - if (ret) - sch->stats.drops++; - else { - sch->q.qlen++; - sch->stats.bytes += skb->len; - sch->stats.packets++; - } - return ret; -} - -/* Requeue packets but don''t change time stamp */ -static int dly_requeue(struct sk_buff *skb, struct Qdisc *sch) -{ - struct dly_sched_data *q = (struct dly_sched_data *)sch->data; - int ret; - - ret = q->qdisc->ops->requeue(skb, q->qdisc); - if (ret == 0) - sch->q.qlen++; - return ret; -} - -static unsigned int dly_drop(struct Qdisc *sch) -{ - struct dly_sched_data *q = (struct dly_sched_data *)sch->data; - unsigned int len; - - len = q->qdisc->ops->drop(q->qdisc); - if (len) { - sch->q.qlen--; - sch->stats.drops++; - } - return len; -} - -/* Dequeue packet. - * If packet needs to be held up, then stop the - * queue and set timer to wakeup later. 
- */ -static struct sk_buff *dly_dequeue(struct Qdisc *sch) -{ - struct dly_sched_data *q = (struct dly_sched_data *)sch->data; - struct sk_buff *skb; - - retry: - skb = q->qdisc->dequeue(q->qdisc); - if (skb) { - struct dly_skb_cb *cb = (struct dly_skb_cb *)skb->cb; - psched_time_t now; - long diff, delay; - - PSCHED_GET_TIME(now); - diff = q->latency - PSCHED_TDIFF(now, cb->queuetime); - - if (diff <= 0) { - sch->q.qlen--; - sch->flags &= ~TCQ_F_THROTTLED; - return skb; - } - - if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) { - sch->q.qlen--; - sch->stats.drops++; - goto retry; - } - - delay = PSCHED_US2JIFFIE(diff); - if (delay <= 0) - delay = 1; - mod_timer(&q->timer, jiffies+delay); - - sch->flags |= TCQ_F_THROTTLED; - } - return NULL; -} - -static void dly_reset(struct Qdisc *sch) -{ - struct dly_sched_data *q = (struct dly_sched_data *)sch->data; - - qdisc_reset(q->qdisc); - sch->q.qlen = 0; - sch->flags &= ~TCQ_F_THROTTLED; - del_timer(&q->timer); -} - -static void dly_timer(unsigned long arg) -{ - struct Qdisc *sch = (struct Qdisc *)arg; - - sch->flags &= ~TCQ_F_THROTTLED; - netif_schedule(sch->dev); -} - -/* Tell Fifo the new limit. 
*/ -static int change_limit(struct Qdisc *q, u32 limit) -{ - struct rtattr *rta; - int ret; - - rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); - if (!rta) - return -ENOMEM; - - rta->rta_type = RTM_NEWQDISC; - rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt)); - ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit; - ret = q->ops->change(q, rta); - kfree(rta); - - return ret; -} - -/* Setup underlying FIFO discipline */ -static int dly_change(struct Qdisc *sch, struct rtattr *opt) -{ - struct dly_sched_data *q = (struct dly_sched_data *)sch->data; - struct tc_dly_qopt *qopt = RTA_DATA(opt); - int err; - - if (q->qdisc == &noop_qdisc) { - struct Qdisc *child - = qdisc_create_dflt(sch->dev, &bfifo_qdisc_ops); - if (!child) - return -EINVAL; - q->qdisc = child; - } - - err = change_limit(q->qdisc, qopt->limit); - if (err) { - qdisc_destroy(q->qdisc); - q->qdisc = &noop_qdisc; - } else { - q->latency = qopt->latency; - q->limit = qopt->limit; - q->loss = qopt->loss; - } - return err; -} - -static int dly_init(struct Qdisc *sch, struct rtattr *opt) -{ - struct dly_sched_data *q = (struct dly_sched_data *)sch->data; - int err; - - if (!opt) - return -EINVAL; - - MOD_INC_USE_COUNT; - - init_timer(&q->timer); - q->timer.function = dly_timer; - q->timer.data = (unsigned long) sch; - q->qdisc = &noop_qdisc; - - err = dly_change(sch, opt); - if (err) - MOD_DEC_USE_COUNT; - - return err; -} - -static void dly_destroy(struct Qdisc *sch) -{ - struct dly_sched_data *q = (struct dly_sched_data *)sch->data; - - del_timer(&q->timer); - qdisc_destroy(q->qdisc); - q->qdisc = &noop_qdisc; - - MOD_DEC_USE_COUNT; -} - -static int dly_dump(struct Qdisc *sch, struct sk_buff *skb) -{ - struct dly_sched_data *q = (struct dly_sched_data *)sch->data; - unsigned char *b = skb->tail; - struct tc_dly_qopt qopt; - - qopt.latency = q->latency; - qopt.limit = q->limit; - qopt.loss = q->loss; - - RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); - - return skb->len; - 
-rtattr_failure: - skb_trim(skb, b - skb->data); - return -1; -} - -struct Qdisc_ops dly_qdisc_ops = { - .id = "delay", - .priv_size = sizeof(struct dly_sched_data), - .enqueue = dly_enqueue, - .dequeue = dly_dequeue, - .requeue = dly_requeue, - .drop = dly_drop, - .init = dly_init, - .reset = dly_reset, - .destroy = dly_destroy, - .change = dly_change, - .dump = dly_dump, -}; - -#ifdef MODULE -int init_module(void) -{ - return register_qdisc(&dly_qdisc_ops); -} - -void cleanup_module(void) -{ - unregister_qdisc(&dly_qdisc_ops); -} -#endif -MODULE_LICENSE("GPL"); diff -Nru a/net/sched/sch_netem.c b/net/sched/sch_netem.c --- /dev/null Wed Dec 31 16:00:00 196900 +++ b/net/sched/sch_netem.c 2004-07-01 13:06:36 -07:00 @@ -0,0 +1,255 @@ +/* + * net/sched/sch_netem.c Network emulator + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Stephen Hemminger <shemminger@osdl.org> + * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro> + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> + +#include <net/pkt_sched.h> + +/* Network emulator + * + * This scheduler can alters spacing and order + * Similar to NISTnet and BSD Dummynet. + */ + +struct netem_sched_data { + struct sk_buff_head qnormal; + struct sk_buff_head qdelay; + struct timer_list timer; + + u32 latency; + u32 loss; + u32 counter; + u32 gap; +}; + +/* Time stamp put into socket buffer control block */ +struct netem_skb_cb { + psched_time_t time_to_send; +}; + +/* Enqueue packets with underlying discipline (fifo) + * but mark them with current time first. 
+ */ +static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct netem_sched_data *q = (struct netem_sched_data *)sch->data; + struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb; + + pr_debug("netem_enqueue skb=%p @%lu\n", skb, jiffies); + + /* Random packet drop 0 => none, ~0 => all */ + if (q->loss >= net_random()) { + sch->stats.drops++; + return 0; /* lie about loss so TCP doesn''t know */ + } + + if (q->qnormal.qlen < sch->dev->tx_queue_len) { + PSCHED_GET_TIME(cb->time_to_send); + PSCHED_TADD(cb->time_to_send, q->latency); + + __skb_queue_tail(&q->qnormal, skb); + sch->q.qlen++; + sch->stats.bytes += skb->len; + sch->stats.packets++; + return 0; + } + + sch->stats.drops++; + kfree_skb(skb); + return NET_XMIT_DROP; +} + +/* Requeue packets but don''t change time stamp */ +static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct netem_sched_data *q = (struct netem_sched_data *)sch->data; + + __skb_queue_head(&q->qnormal, skb); + sch->q.qlen++; + return 0; +} + +/* + * Check the look aside buffer list, and see if any freshly baked buffers. + * If head of queue is not baked, set timer. + */ +static struct sk_buff *netem_get_delayed(struct netem_sched_data *q) +{ + struct sk_buff *skb; + psched_time_t now; + long delay; + + skb = skb_peek(&q->qdelay); + if (skb) { + const struct netem_skb_cb *cb + = (const struct netem_skb_cb *)skb->cb; + + PSCHED_GET_TIME(now); + delay = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now)); + pr_debug("netem_dequeue: delay queue %p@%lu %ld\n", + skb, jiffies, delay); + + /* it''s baked enough */ + if (delay <= 0) { + __skb_unlink(skb, &q->qdelay); + del_timer(&q->timer); + return skb; + } + + if (!timer_pending(&q->timer)) { + q->timer.expires = jiffies + delay; + add_timer(&q->timer); + } + } + return NULL; +} + +/* Dequeue packet. + * If packet needs to be held up, then put in the delay + * queue and set timer to wakeup later. 
+ */ +static struct sk_buff *netem_dequeue(struct Qdisc *sch) +{ + struct netem_sched_data *q = (struct netem_sched_data *)sch->data; + struct sk_buff *skb; + + skb = netem_get_delayed(q); + if (!skb && (skb = __skb_dequeue(&q->qnormal))) { + /* are we doing out of order packet skip? */ + if (q->counter < q->gap) { + pr_debug("netem_dequeue: send %p normally\n", skb); + q->counter++; + } else { + /* don''t send now hold for later */ + pr_debug("netem_dequeue: hold [%p]@%lu\n", skb, jiffies); + __skb_queue_tail(&q->qdelay, skb); + q->counter = 0; + skb = netem_get_delayed(q); + } + } + + if (skb) + sch->q.qlen--; + return skb; +} + +static void netem_timer(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc *)arg; + + pr_debug("netem_timer: fired @%lu\n", jiffies); + netif_schedule(sch->dev); +} + +static void netem_reset(struct Qdisc *sch) +{ + struct netem_sched_data *q = (struct netem_sched_data *)sch->data; + + skb_queue_purge(&q->qnormal); + skb_queue_purge(&q->qdelay); + + sch->q.qlen = 0; + del_timer_sync(&q->timer); +} + +static int netem_change(struct Qdisc *sch, struct rtattr *opt) +{ + struct netem_sched_data *q = (struct netem_sched_data *)sch->data; + struct tc_netem_qopt *qopt = RTA_DATA(opt); + + if (qopt->limit) + sch->dev->tx_queue_len = qopt->limit; + + q->gap = qopt->gap; + q->loss = qopt->loss; + q->latency = qopt->latency; + + return 0; +} + +static int netem_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct netem_sched_data *q = (struct netem_sched_data *)sch->data; + + if (!opt) + return -EINVAL; + + skb_queue_head_init(&q->qnormal); + skb_queue_head_init(&q->qdelay); + init_timer(&q->timer); + q->timer.function = netem_timer; + q->timer.data = (unsigned long) sch; + q->counter = 0; + + return netem_change(sch, opt); +} + +static void netem_destroy(struct Qdisc *sch) +{ + struct netem_sched_data *q = (struct netem_sched_data *)sch->data; + + del_timer_sync(&q->timer); +} + +static int netem_dump(struct Qdisc *sch, struct sk_buff 
*skb) +{ + struct netem_sched_data *q = (struct netem_sched_data *)sch->data; + unsigned char *b = skb->tail; + struct tc_netem_qopt qopt; + + qopt.latency = q->latency; + qopt.limit = sch->dev->tx_queue_len; + qopt.loss = q->loss; + qopt.gap = q->gap; + + RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static struct Qdisc_ops netem_qdisc_ops = { + .id = "netem", + .priv_size = sizeof(struct netem_sched_data), + .enqueue = netem_enqueue, + .dequeue = netem_dequeue, + .requeue = netem_requeue, + .init = netem_init, + .reset = netem_reset, + .destroy = netem_destroy, + .change = netem_change, + .dump = netem_dump, +}; + + +static int __init netem_module_init(void) +{ + return register_qdisc(&netem_qdisc_ops); +} +static void __exit netem_module_exit(void) +{ + unregister_qdisc(&netem_qdisc_ops); +} +module_init(netem_module_init) +module_exit(netem_module_exit) +MODULE_LICENSE("GPL");
Stephen Hemminger
2004-Jul-02 20:44 UTC
Re: [PATCH 2.6] update to network emulation QOS scheduler
Here is an enhancement to netem to allow emulating lower-speed networks. The resolution is close, but obviously limited by the granularity of timers and the size of packets. It also fixes an rtnetlink dependency which showed up in some configurations, and optimizes for the non-loss case by avoiding the net_random call. Signed-off-by: Stephen Hemminger <shemminger@osdl.org> diff -Nru a/net/sched/sch_netem.c b/net/sched/sch_netem.c --- a/net/sched/sch_netem.c 2004-07-02 13:40:11 -07:00 +++ b/net/sched/sch_netem.c 2004-07-02 13:40:11 -07:00 @@ -18,6 +18,7 @@ #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/skbuff.h> +#include <linux/rtnetlink.h> #include <net/pkt_sched.h> @@ -31,11 +32,13 @@ struct sk_buff_head qnormal; struct sk_buff_head qdelay; struct timer_list timer; + psched_time_t last; u32 latency; u32 loss; u32 counter; u32 gap; + u32 rate; }; /* Time stamp put into socket buffer control block */ @@ -54,13 +57,23 @@ pr_debug("netem_enqueue skb=%p @%lu\n", skb, jiffies); /* Random packet drop 0 => none, ~0 => all */ - if (q->loss >= net_random()) { + if (q->loss && q->loss >= net_random()) { sch->stats.drops++; return 0; /* lie about loss so TCP doesn't know */ } if (q->qnormal.qlen < sch->dev->tx_queue_len) { PSCHED_GET_TIME(cb->time_to_send); + if (q->rate) { + if (!PSCHED_IS_PASTPERFECT(q->last) && + PSCHED_TLESS(cb->time_to_send, q->last)) + cb->time_to_send = q->last; + + PSCHED_TADD(cb->time_to_send, + (USEC_PER_SEC * skb->len) / q->rate); + q->last = cb->time_to_send; + } + PSCHED_TADD(cb->time_to_send, q->latency); __skb_queue_tail(&q->qnormal, skb); @@ -179,6 +192,7 @@ q->gap = qopt->gap; q->loss = qopt->loss; q->latency = qopt->latency; + q->rate = qopt->rate; return 0; } @@ -196,6 +210,7 @@ q->timer.function = netem_timer; q->timer.data = (unsigned long) sch; q->counter = 0; + PSCHED_SET_PASTPERFECT(q->last); return netem_change(sch, opt); } @@ -217,6 +232,7 @@ qopt.limit = sch->dev->tx_queue_len; qopt.loss = q->loss; qopt.gap = q->gap; 
+ qopt.rate = q->rate; RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
On Fri, 2004-07-02 at 16:44, Stephen Hemminger wrote:> Here is an enhancement to netem to do allow emulating lower speed > networks. The resolution is close, but obviously limited by the > granularity of timers and size of packets. > > Also, fixes a rtnetlink dependency which showed up in some configurations > and optimizes for the non-loss case by avoiding net_random call. >I think its time i illustrate my comments earlier with some example hopefully this will curb the amount of features on this qdisc. I do think theres value in having this thing do delay and jitter, but you have gone waay beyond that now; Let illustrate things which apply to what you are trying to do in network condituions emulation. Although i show ingress qdisc , this applies to egress just the same. #drop 1 out 10 packets randomly using the netrand generator tc filter add dev eth0 parent ffff: protocol ip prio 6 u32 \ match ip src 10.0.0.21/32 flowid 1:16 \ action drop random netrand ok 0xa Note, you can plugin another randomization algorithm (a point i was trying to make earlier; currently supporting netrandom and deterministic algorithms only) #deterministically accept every second packet, drop the rest tc filter add dev eth0 parent ffff: protocol ip prio 6 u32 \ match ip src 10.0.0.22/32 flowid 1:16 \ action drop random determ ok 2 # deterministically duplicate every second packet # tc filter add dev eth0 parent ffff: protocol ip prio 6 u32 \ match ip src 10.0.0.21/32 flowid 1:16 \ action ok random determ pipe 2 \ action mirred egress mirror dev dummy0 Interesting thing is depending on what you have attached on the dummy0 device you could reorder or delay. Example you could hookup that qdisc to delay. Lets do something more interesting ... # # deterministically duplicate every second packet that # exceeds 100Kbit for packets coming in from 10.0.0.31 # and give it a fwmark of 2. 
Dummy could do something # with that info # # tc filter add dev eth0 parent ffff: protocol ip prio 6 u32 \ match ip src 10.0.0.31/32 flowid 1:16 \ action ipt -j mark --set-mark 1 \ action police rate 100kbit burst 90k pipe \ action ok random determ pipe 2 \ action ipt -j mark --set-mark 2 \ action mirred egress mirror dev dummy0 Lets do something even more interesting .. tc filter add dev eth0 parent ffff: protocol ip prio 6 u32 \ match ip src 10.0.0.31/32 flowid 1:16 \ action police rate 100kbit burst 90k pipe \ action pedit munge offset 4 u32 set 0x12341234 \ action ok random determ pipe 2 \ action mirred egress mirror dev dummy0 Note: we introduced some error in the packet bits by randomly setting some value in some offset; checksums etc will be wrong as a result. This happens for all packets exceeding 100Kbits. Every second packet which has been trampled is also duplicated... I hope this makes my point clearer. For testing or emulating conditions just create actions and serialize them for the flows and in/egress device you want them on. Create new ones that dont exist. cheers, jamal
Ed Wildgoose
2004-Jul-03 13:32 UTC
Re: Re: [PATCH 2.6] update to network emulation QOS scheduler
>Here is an enhancement to netem to do allow emulating lower speed >networks. The resolution is close, but obviously limited by the >granularity of timers and size of packets. > > Hi Stephen, This looks extremely useful. I have a need for a simulator for the Iridium satellite phone network; this is at most 2,400 baud with perhaps 1 sec or more latency. Do you expect this scheduler to be able to go this slow? On a related note, for better accuracy I currently need a bit of stochastic variability on the latency, i.e. assume a 1 sec minimum delay, but also vary that a little over time. Have you considered adding that as a feature? Thanks Ed W _______________________________________________ LARTC mailing list / LARTC@mailman.ds9a.nl http://mailman.ds9a.nl/mailman/listinfo/lartc HOWTO: http://lartc.org/
David S. Miller
2004-Jul-06 02:49 UTC
Re: [PATCH 2.6] update to network emulation QOS scheduler
I'm going to hold off on Stephen's patches until Jamal and he have a chance to fight it out :-)
On Mon, 2004-07-05 at 22:49, David S. Miller wrote: > I'm going to hold off on Stephen's patches until Jamal and he has > a chance to fight it out :-) Actually i would be fine with it if Stephen gets rid of the new "rate" thing. cheers, jamal
Stephen Hemminger
2004-Jul-06 16:09 UTC
Re: [PATCH 2.6] update to network emulation QOS scheduler
On 02 Jul 2004 23:13:52 -0400 jamal <hadi@cyberus.ca> wrote:> On Fri, 2004-07-02 at 16:44, Stephen Hemminger wrote: > > Here is an enhancement to netem to do allow emulating lower speed > > networks. The resolution is close, but obviously limited by the > > granularity of timers and size of packets. > > > > Also, fixes a rtnetlink dependency which showed up in some configurations > > and optimizes for the non-loss case by avoiding net_random call. > > > > I think its time i illustrate my comments earlier with some example > hopefully this will curb the amount of features on this qdisc. > I do think theres value in having this thing do delay and jitter, but > you have gone waay beyond that now; > Let illustrate things which apply to what you are trying to do in > network condituions emulation. Although i show ingress qdisc , this > applies to egress just the same. > > #drop 1 out 10 packets randomly using the netrand generator > tc filter add dev eth0 parent ffff: protocol ip prio 6 u32 \ > match ip src 10.0.0.21/32 flowid 1:16 \ > action drop random netrand ok 0xaYour examples made me think about this more. The netfilter seem best suited to things that effect the flow of packets (dropping, reordering, even corrupting), and the qdisc seems best when the timing needs to change. The limit match in netfilter is not the same as the rate in the qdisc. The netem scheduler acts as if the link is a slow fixed rate. The netfilter limit is usually targeted to drop packets over the rate which is not the same. Reordering is also hard without going out to a user log or building a custom target. So, you have convinced me that loss is unnecessary but not the rate, or delay. If we can figure out how to re-ordering with netfilter then that could go too, which would make it possible to use a layered qdisc again.
David S. Miller
2004-Jul-06 21:02 UTC
Re: [PATCH 2.6] update to network emulation QOS scheduler
On Thu, 1 Jul 2004 11:33:12 -0700 Stephen Hemminger <shemminger@osdl.org> wrote: > This patch updates the network emulation packet scheduler. > * name changed from delay to netem since it does more than just delay > * Catalin's merged code to do packet reordering > * uses socket queues directly rather than layering on qdisc(fifo) > because this is used in performance tests. > * adds placeholder in API for future enhancements (rate and duplicate). > > Signed-off-by: Stephen Hemminger <shemminger@osdl.org> Applied, thanks Stephen.
David S. Miller
2004-Jul-06 21:06 UTC
Re: [PATCH 2.4] update to network emulation QOS scheduler
On Thu, 1 Jul 2004 13:11:01 -0700 Stephen Hemminger <shemminger@osdl.org> wrote: > This is the 2.4 version of the conversion of simple network delay scheduler to > network emulator. > > Signed-off-by: Stephen Hemminger <shemminger@osdl.org> Also applied.
David S. Miller
2004-Jul-06 21:41 UTC
Re: [PATCH 2.6] update to network emulation QOS scheduler
On 06 Jul 2004 09:04:50 -0400 jamal <hadi@cyberus.ca> wrote:> On Mon, 2004-07-05 at 22:49, David S. Miller wrote: > > I''m going to hold off on Stephen''s patches until Jamal and he has > > a chance to fight it out :-) > > Actually i would be fine with it if Stephen gets rid of the new "rate" > thing.Ok, so for now I''m going to just put in this part of Stephen''s patch which just adds the rtnetlink.h include and the loss optimization. To be honest, the rate feature is such a tiny amount of code... it''s not the end of the world if we put it in :-) # This is a BitKeeper generated diff -Nru style patch. # # ChangeSet # 2004/07/06 14:35:36-07:00 shemminger@osdl.org # [PKT_SCHED]: Two small netem fixes. # # - rtnetlink.h needs including # - optimize loss test so that net_random() call is not done # when no-loss is indicated # # Signed-off-by: Stephen Hemminger <shemminger@osdl.org> # Signed-off-by: David S. Miller <davem@redhat.com> # # net/sched/sch_netem.c # 2004/07/06 14:31:50-07:00 shemminger@osdl.org +2 -1 # [PKT_SCHED]: Two small netem fixes. # # - rtnetlink.h needs including # - optimize loss test so that net_random() call is not done # when no-loss is indicated # # Signed-off-by: Stephen Hemminger <shemminger@osdl.org> # Signed-off-by: David S. Miller <davem@redhat.com> # diff -Nru a/net/sched/sch_netem.c b/net/sched/sch_netem.c --- a/net/sched/sch_netem.c 2004-07-06 14:36:05 -07:00 +++ b/net/sched/sch_netem.c 2004-07-06 14:36:05 -07:00 @@ -18,6 +18,7 @@ #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/skbuff.h> +#include <linux/rtnetlink.h> #include <net/pkt_sched.h> @@ -54,7 +55,7 @@ pr_debug("netem_enqueue skb=%p @%lu\n", skb, jiffies); /* Random packet drop 0 => none, ~0 => all */ - if (q->loss >= net_random()) { + if (q->loss && q->loss >= net_random()) { sch->stats.drops++; return 0; /* lie about loss so TCP doesn''t know */ }
On Tue, 2004-07-06 at 12:09, Stephen Hemminger wrote:> Your examples made me think about this more. The netfilter seem best > suited to things that effect the flow of packets (dropping, reordering, > even corrupting), and the qdisc seems best when the timing needs to change.Some of the attributes you are trying to control need queueing; no doubt the best spot to do queueing is on a qdisc. Delays, and reordering for example are ideal. Rate control as well fits here. There are other qdiscs which have done a really good job at rate control hence my arguement against you doing it - you will either not do a better job at it or if you do a good job you will be replicating what they already did; just stash your qdisc in another qdisc which can do a good rate control job (CBQ, TBF, HFSC, HTB) - we are flexible enough in Linux. Depending on where you want to do things, netfilter may be a good candidate (example IP protocol) or things that dont need queueing. The examples i gave are more powerful than anything netfilter can do at the moment though with only caveat theres only two "hooks".> The limit match in netfilter is not the same as the rate in the qdisc. > The netem scheduler acts as if the link is a slow fixed rate. The netfilter > limit is usually targeted to drop packets over the rate which is not the same. > Reordering is also hard without going out to a user log or building a custom > target.Not sure what the netfilter limit target is - i suspect its something that limits based on a group of flows. You can still do that with a fwamrk at the qdisc level. Reordering needs a queue. Even the example i gave uses a queue that resides on the dummy device.> So, you have convinced me that loss is unnecessary but not the rate, or delay. > If we can figure out how to re-ordering with netfilter then that could go too, > which would make it possible to use a layered qdisc again.I think keep the reordering aspect of it unless it is very complex. The delay is a must. 
If you can add configurable jitter to it that would be a big bonus. Keep the randomization. Duplication, dropping, bit error injection, and rate control are the ones i didnt see belonging there mostly because they can be done better elsewhere. Again this is just opinion, if you think that theres no complexity in the architecture, by all means keep all those features - my recommendation is to pick a few things that will work well and implement them well. cheers, jamal
David S. Miller
2004-Jul-07 01:51 UTC
Re: [PATCH 2.6] update to network emulation QOS scheduler
On 06 Jul 2004 21:36:20 -0400 jamal <hadi@cyberus.ca> wrote: > Not sure what the netfilter limit target is - i suspect its something > that limits based on a group of flows. You can still do that with a > fwmark at the qdisc level. Reordering needs a queue. Even the example i > gave uses a queue that resides on the dummy device. It's a netfilter iptables module that essentially uses sch_tbf.c's simple token bucket filter algorithm. See net/ipv4/netfilter/ipt_limit.c for details.
Catalin BOIE
2004-Jul-07 06:39 UTC
Re: [PATCH 2.6] update to network emulation QOS scheduler
On Tue, 6 Jul 2004, jamal wrote: > On Mon, 2004-07-05 at 22:49, David S. Miller wrote: >> I'm going to hold off on Stephen's patches until Jamal and he has >> a chance to fight it out :-) > > Actually i would be fine with it if Stephen gets rid of the new "rate" > thing. I expect that duplicates of packets will not be going to sch_netem, right? I'm asking because I have a patch pending. Thank you. > > cheers, > jamal > --- Catalin(ux aka Dino) BOIE catab at deuroconsult.ro http://kernel.umbrella.ro/
Catalin BOIE
2004-Jul-07 07:01 UTC
Re: [PATCH 2.6] update to network emulation QOS scheduler
On Wed, 6 Jul 2004, jamal wrote:> On Tue, 2004-07-06 at 12:09, Stephen Hemminger wrote: > >> Your examples made me think about this more. The netfilter seem best >> suited to things that effect the flow of packets (dropping, reordering, >> even corrupting), and the qdisc seems best when the timing needs to change. > > Some of the attributes you are trying to control need queueing; no doubt > the best spot to do queueing is on a qdisc. Delays, and reordering for > example are ideal. Rate control as well fits here. There are other > qdiscs which have done a really good job at rate control hence my > arguement against you doing it - you will either not do a better job at > it or if you do a good job you will be replicating what they already > did; just stash your qdisc in another qdisc which can do a good rate > control job (CBQ, TBF, HFSC, HTB) - we are flexible enough in Linux. > > Depending on where you want to do things, netfilter may be a good > candidate (example IP protocol) or things that dont need queueing. > The examples i gave are more powerful than anything netfilter can do at > the moment though with only caveat theres only two "hooks". > >> The limit match in netfilter is not the same as the rate in the qdisc. >> The netem scheduler acts as if the link is a slow fixed rate. The netfilter >> limit is usually targeted to drop packets over the rate which is not the same. >> Reordering is also hard without going out to a user log or building a custom >> target. > > Not sure what the netfilter limit target is - i suspect its something > that limits based on a group of flows. You can still do that with a > fwamrk at the qdisc level. Reordering needs a queue. Even the example i > gave uses a queue that resides on the dummy device. > >> So, you have convinced me that loss is unnecessary but not the rate, or delay. >> If we can figure out how to re-ordering with netfilter then that could go too, >> which would make it possible to use a layered qdisc again. 
> > I think keep the reordering aspect of it unless it is very complex. The > delay is a must. If you can add configurable jitter to it that would be > a big bonus. Keep the randomization. Duplication, dropping, bit error > injection, and rate control are the ones i didnt see belonging there > mostly because they can be done better elsewhere. > Again this is just opinion, if you think that theres no complexity in > the architecture, by all means keep all those features - my > recommendation is to pick a few things that will work well and implement > them well. > > cheers, > jamalI suggest to keep duplication because: 1. Adds 5-10 lines of code and no complexity 2. It''s very easy to use it attached directly to a device. Thank you. --- Catalin(ux aka Dino) BOIE catab at deuroconsult.ro http://kernel.umbrella.ro/ _______________________________________________ LARTC mailing list / LARTC@mailman.ds9a.nl http://mailman.ds9a.nl/mailman/listinfo/lartc HOWTO: http://lartc.org/
Stephen Hemminger
2004-Jul-07 18:10 UTC
Re: [PATCH 2.6] update to network emulation QOS scheduler
Ok, I'll bite: how would you do this? Rate limit packet egress on an Ethernet device (eth0) so it looks like a slow DSL link (25 Kbps), not by dropping packets but by pacing the data.
On Wed, 2004-07-07 at 03:01, Catalin BOIE wrote: > I expect that duplicates of packet will not going to sch_netem, right? > I'm asking because I have a pactch pending. I don't think that should stop you from adding the feature. To answer your question, yes, duplicates can be sent to that qdisc using the tc extensions. > > I suggest to keep duplication because: > 1. Adds 5-10 lines of code and no complexity > 2. It's very easy to use it attached directly to a device. Go ahead. The tc extension creation of duplicates can be used with other qdiscs as well. cheers, jamal
I seem to have hit the jackpot - all my emails to netdev are showing up and on time too. On Wed, 2004-07-07 at 14:10, Stephen Hemminger wrote: > Ok, I'll bite how would you do: > > Rate limit packet egress on a ethernet device (eth0) so it looks like a slow DSL link (25 Kbps) > by not dropping packets but by pacing the data. Doesn't TBF work? rate 25kbit burst 90k should probably do it. Maybe i misunderstood the question. You may be able to avoid dropping but i don't think you can guarantee it simply because you have finite buffers. At some point you will congest that queue and packets will be dropped; and if you don't limit your queue buffer size, sooner rather than later you are bound to hog all the system memory. Having said that, i have never seen a good argument for why pacing traffic vs dropping to initiate a slowdown is better than the other. So in that case, a policer/meter should suffice. cheers, jamal
Stephen Hemminger
2004-Jul-07 20:58 UTC
Re: [PATCH 2.6] update to network emulation QOS scheduler
On 07 Jul 2004 14:57:48 -0400 jamal <hadi@cyberus.ca> wrote:> I seem to have hit the jackpot - all my emails to netdev are showing > up and on time too. > > On Wed, 2004-07-07 at 14:10, Stephen Hemminger wrote: > > Ok, I''ll bite how would you do: > > > > Rate limit packet egress on a ethernet device (eth0) so it looks like a slow DSL link (25 Kbps) > > by not dropping packets but by pacing the data. > > Doesnt TBF work? > rate 25kbit burst 90k should probably do it. Maybe i misunderstood the > question.TBF works but since the sender (on the same local machine) may go over it''s allocation, it will drop packets. For example, if I use tbf to simulate a slow 33k bits/sec link then TCP test never completes, it just hangs! TBF does work for intermediate sizes. But if I use the pacing simulation it works.> > You may be able to avoid dropping but dont think you can guarantee it > simply because you have finite buffers. At some point you will congest > that queue and packets will be dropped; and if you dont limit your queue > buffer size, sooner than later you are bound to hog all the system > memory.I understand that, every queue has to have a limit.> Having said that, i have never seen a good arguement for why pacing > traffic vs dropping to initiate a slowdown is better than the other. > So in that case, a policer/meter should suffice._______________________________________________ LARTC mailing list / LARTC@mailman.ds9a.nl http://mailman.ds9a.nl/mailman/listinfo/lartc HOWTO: http://lartc.org/
On Wed, 2004-07-07 at 16:58, Stephen Hemminger wrote: > TBF works but since the sender (on the same local machine) may go over > its allocation, it will drop packets. As should any queue that gets congested. > For example, if I use tbf to simulate a slow 33k bits/sec link then > TCP test never > completes, it just hangs! TBF does work for intermediate sizes. > > But if I use the pacing simulation it works. I am not sure i follow; is this because of the return code from the enqueue? cheers, jamal
Stephen Hemminger
2004-Jul-07 21:22 UTC
Re: [PATCH 2.6] update to network emulation QOS scheduler
On 07 Jul 2004 17:11:39 -0400 jamal <hadi@cyberus.ca> wrote: > On Wed, 2004-07-07 at 16:58, Stephen Hemminger wrote: > > > TBF works but since the sender (on the same local machine) may go over > > its allocation, it will drop packets. > > As should any queue that gets congested. > > > For example, if I use tbf to simulate a slow 33k bits/sec link then > > TCP test never > > completes, it just hangs! TBF does work for intermediate sizes. > > > > But if I use the pacing simulation it works. > > I am not sure i follow; is this because of the return code from the > enqueue? Actually, the problem only occurs if burst is set large (like 2mb). I think it gets stuck waiting for that much data. _______________________________________________ LARTC mailing list / LARTC@mailman.ds9a.nl http://mailman.ds9a.nl/mailman/listinfo/lartc HOWTO: http://lartc.org/