List:       linux-netdev
Subject:    [RFC] QoS: new per flow queue
From:       Wang Jian <lark () linux ! net ! cn>
Date:       2005-04-05 15:25:28
Message-ID: 20050405224956.0258.LARK () linux ! net ! cn

Hi,

I have written a per-flow rate control qdisc. I posted it to the LARTC list,
and some discussion about it is here:

    http://mailman.ds9a.nl/pipermail/lartc/2005q2/015381.html

I think I need more feedback and suggestions on it, so I am reposting the
patch here. Please read the thread to get a picture of why and how.

The kernel patch is against kernel 2.6.11; the iproute2 patch is against
iproute2-2.6.11-050314.

The test scenario is like this

      www server <- [ eth0   eth1 ] -> www clients

The attached t.sh is used to generate the test rules. Clients download a
big ISO file from the www server, so each flow's rate can be estimated by
watching the download progress.

I have done some testing on it and it works well. It provides good fairness.
When all slots are in use, the real rate stays at the specified guaranteed
rate most of the time. But I know it needs more testing.
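
As a rough sanity check of that claim, here is a small standalone program
(not part of the patch) that redoes the aggregate budget arithmetic with the
t.sh parameters, using the grate formula from the kernel patch
(grate = (rate + rate/20) * limit) and assuming tc's kbps unit of 1024
bytes/s:

#include <stdio.h>

int main(void)
{
	unsigned int rate  = 10 * 1024;   /* perflow rate 10kbps          */
	unsigned int limit = 7;           /* perflow limit 7              */
	unsigned int htb   = 80 * 1024;   /* enclosing HTB class, 80kbps  */

	/* aggregate guaranteed budget, as computed in perflow_change() */
	unsigned int grate = (rate + rate / 20) * limit;

	printf("aggregate budget (grate): %u bytes/s\n", grate);
	printf("enclosing HTB class:      %u bytes/s\n", htb);
	printf("headroom:                 %u bytes/s\n", htb - grate);
	return 0;
}

With these numbers the guaranteed aggregate (75264 bytes/s) fits inside the
80kbps HTB class with some headroom, which is consistent with the observation
that per-flow rates hold at the guaranteed value when all slots are in use.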

I have some considerations, though:

1. In testing, sometimes a pair of streams is unbalanced and does not become
balanced quickly; one stream gets 8.4kbps while the other gets 11.5kbps. How
can I find the flow with the highest traffic and punish it the most? (A
sketch of one possible approach follows this list.)

2. The default ceil equals rate. Should I calculate it as
   ceil = rate * 1.05 * limit, or
   ceil = rate * 1.05?

3. When flow slots are full, should untraceable traffic optionally be
   reclassified into another specified class instead of being dropped?
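
Regarding question 1, one direction (just a sketch, not part of the patch,
and along the lines of the "sort the borrow list" idea in the algorithm
notes) would be to punish the borrower with the largest used count instead
of the round-robin head of the borrow list. Assuming the perflow_entry and
borrow-list definitions from the attached sch_perflow.c, perflow_punish()
could look roughly like this:

static void perflow_punish(int penalty, struct list_head *borrow)
{
	struct perflow_entry *e, *worst = NULL;
	struct list_head *h;

	/* pick the borrower that has used the most tokens */
	list_for_each(h, borrow) {
		e = list_entry(h, struct perflow_entry, borrow);
		if (worst == NULL || e->used > worst->used)
			worst = e;
	}
	if (worst) {
		list_del_init(&worst->borrow);
		worst->penalty += penalty;
	}
}

This trades the O(1) round-robin pick for an O(n) scan of the borrow list,
which should be acceptable as long as the list stays short.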

TODO:

1. The rtnetlink-related code should be improved.
2. Implement dump() and dump_stats().


Regards
-- 
  lark

["t.sh" (application/octet-stream)]

./tc qdisc del dev eth1 root
./tc qdisc add dev eth1 root handle 1: htb default 10
./tc class add dev eth1 parent 1: classid 1:1 htb rate 100Mbit ceil 100Mbit

./tc class add dev eth1 parent 1:1 classid 1:10 htb rate 80kbps ceil 80kbps

./tc class add dev eth1 parent 1:1 classid 1:12 htb rate 80kbps ceil 80kbps
./tc qdisc add dev eth1 parent 1:12 handle 20: perflow rate 10kbps ceil 15kbps limit 7
./tc filter add dev eth1 parent 1:0 protocol ip u32 match ip sport 80 0xffff flowid 1:12

["iproute2-2.6.11-050314-perflow.diff" (application/octet-stream)]

Index: iproute2-2.6.11-w/include/linux/pkt_sched.h
===================================================================
--- iproute2-2.6.11-w/include/linux/pkt_sched.h	(revision 1)
+++ iproute2-2.6.11-w/include/linux/pkt_sched.h	(working copy)
@@ -272,6 +272,28 @@
 	__u32 ctokens;
 };
 
+
+/* PERFLOW section */
+
+#define TC_PERFLOW_DEFAULTLIMIT	1024
+
+struct tc_perflow_qopt
+{
+	struct tc_ratespec 	rate;
+	struct tc_ratespec 	ceil;
+	__u32  limit;
+	__u32  qlen;
+};
+enum
+{
+	TCA_PERFLOW_UNSPEC,
+	TCA_PERFLOW_PARMS,
+	__TCA_PERFLOW_MAX,
+};
+
+#define TCA_PERFLOW_MAX	(__TCA_PERFLOW_MAX - 1)
+
+
 /* HFSC section */
 
 struct tc_hfsc_qopt
Index: iproute2-2.6.11-w/tc/q_perflow.c
===================================================================
--- iproute2-2.6.11-w/tc/q_perflow.c	(revision 0)
+++ iproute2-2.6.11-w/tc/q_perflow.c	(revision 0)
@@ -0,0 +1,153 @@
+/*
+ * q_perflow.c		PERFLOW.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Wang Jian <lark@linux.net.cn>, 2005
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tc_util.h"
+
+static void explain(void)
+{
+	fprintf(stderr,
+"Usage: ... perflow rate RATE [ ceil CEIL ] [ limit LIMIT ] [ qlen QLEN ]\n"
+"\n"
+"rate    rate allocated to each flow (flow can still borrow)\n"
+"ceil    upper limit for each flow (flow cannot borrow beyond it)\n"
+"limit   maximum concurrent flows\n"
+"qlen    maximum queue length\n"
+	);
+}
+
+#define usage() return(-1)
+
+static int perflow_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n)
+{
+	struct tc_perflow_qopt opt;
+	struct rtattr *tail;
+	int ok = 0;
+
+	memset(&opt, 0, sizeof(opt));
+
+	while (argc > 0) {
+		if (strcmp(*argv, "rate") == 0) {
+			NEXT_ARG();
+			if (opt.rate.rate) {
+				fprintf(stderr, "Double \"rate\" spec\n");
+				return -1;
+			}
+			if (get_rate(&opt.rate.rate, *argv)) {
+				fprintf(stderr, "Illegal \"rate\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "ceil") == 0) {
+			NEXT_ARG();
+			if (opt.ceil.rate) {
+				fprintf(stderr, "Double \"ceil\" spec\n");
+				return -1;
+			}
+			if (get_rate(&opt.ceil.rate, *argv)) {
+				fprintf(stderr, "Illegal \"ceil\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "limit") == 0) {
+			NEXT_ARG();
+			if (get_size(&opt.limit, *argv)) {
+				fprintf(stderr, "Illegal \"limit\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "qlen") == 0) {
+			NEXT_ARG();
+			if (get_size(&opt.qlen, *argv)) {
+				fprintf(stderr, "Illegal \"qlen\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+		argc--; argv++;
+	}
+
+	if (!ok)
+		return 0;
+
+	if (opt.rate.rate == 0) {
+		fprintf(stderr, "\"rate\" is required.\n");
+		return -1;
+	}
+
+	if (opt.ceil.rate == 0)
+		opt.ceil = opt.rate;
+
+	if (opt.ceil.rate < opt.rate.rate) {
+		fprintf(stderr, "\"ceil\" must be >= \"rate\".\n");
+		return -1;
+	}
+
+	if (opt.limit == 0)
+		opt.limit = TC_PERFLOW_DEFAULTLIMIT;
+
+	if (opt.qlen > 0 && opt.qlen < 5) {
+		fprintf(stderr, "\"qlen\" must be >= 5.\n");
+		return -1;
+	}
+
+	tail = NLMSG_TAIL(n);
+	//addattr_l(n, 1024, TCA_OPTIONS, NULL, 0);
+	//addattr_l(n, 1024, TCA_PERFLOW_PARMS, &opt, sizeof(opt));
+	addattr_l(n, 1024, TCA_OPTIONS, &opt, sizeof(opt));
+	tail->rta_len = (void *) NLMSG_TAIL(n) - (void *) tail;
+
+	return 0;
+}
+
+static int perflow_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+	struct tc_perflow_qopt *qopt;
+
+	if (opt == NULL)
+		return 0;
+
+	if (RTA_PAYLOAD(opt) < sizeof(*qopt))
+		return -1;
+
+	qopt = RTA_DATA(opt);
+
+	SPRINT_BUF(b1);
+	fprintf(f, "rate %s ", sprint_rate(qopt->rate.rate, b1));
+	fprintf(f, "ceil %s ", sprint_rate(qopt->ceil.rate, b1));
+	fprintf(f, "limit %s ", sprint_size(qopt->limit, b1));
+
+	return 0;
+}
+
+struct qdisc_util perflow_qdisc_util = {
+	.id		= "perflow",
+	.parse_qopt	= perflow_parse_opt,
+	.print_qopt	= perflow_print_opt,
+};
Index: iproute2-2.6.11-w/tc/Makefile
===================================================================
--- iproute2-2.6.11-w/tc/Makefile	(revision 1)
+++ iproute2-2.6.11-w/tc/Makefile	(working copy)
@@ -7,6 +7,7 @@
 TCMODULES += q_fifo.o
 TCMODULES += q_sfq.o
 TCMODULES += q_red.o
+TCMODULES += q_perflow.o
 TCMODULES += q_prio.o
 TCMODULES += q_tbf.o
 TCMODULES += q_cbq.o

["linux-2.6.11-perflow-r3.diff" (application/octet-stream)]

Index: linux-2.6.11-w/include/linux/pkt_sched.h
===================================================================
--- linux-2.6.11-w/include/linux/pkt_sched.h	(revision 2)
+++ linux-2.6.11-w/include/linux/pkt_sched.h	(revision 3)
@@ -272,6 +272,28 @@
 	__u32 ctokens;
 };
 
+
+/* PERFLOW section */
+
+#define TC_PERFLOW_DEFAULTLIMIT	1024
+
+struct tc_perflow_qopt
+{
+	struct tc_ratespec 	rate;
+	struct tc_ratespec 	ceil;
+	__u32  limit;
+	__u32  qlen;
+};
+enum
+{
+	TCA_PERFLOW_UNSPEC,
+	TCA_PERFLOW_PARMS,
+	__TCA_PERFLOW_MAX,
+};
+
+#define TCA_PERFLOW_MAX	(__TCA_PERFLOW_MAX - 1)
+
+
 /* HFSC section */
 
 struct tc_hfsc_qopt
Index: linux-2.6.11-w/net/sched/Kconfig
===================================================================
--- linux-2.6.11-w/net/sched/Kconfig	(revision 2)
+++ linux-2.6.11-w/net/sched/Kconfig	(revision 3)
@@ -192,6 +192,15 @@
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_gred.
 
+config NET_SCH_PERFLOW
+	tristate "PERFLOW queue"
+	depends on NET_SCHED
+	---help---
+	  Say Y here if you want to use per flow rate control.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_perflow.
+
 config NET_SCH_DSMARK
 	tristate "Diffserv field marker"
 	depends on NET_SCHED
Index: linux-2.6.11-w/net/sched/Makefile
===================================================================
--- linux-2.6.11-w/net/sched/Makefile	(revision 2)
+++ linux-2.6.11-w/net/sched/Makefile	(revision 3)
@@ -19,6 +19,7 @@
 obj-$(CONFIG_NET_SCH_HFSC)	+= sch_hfsc.o
 obj-$(CONFIG_NET_SCH_RED)	+= sch_red.o
 obj-$(CONFIG_NET_SCH_GRED)	+= sch_gred.o
+obj-$(CONFIG_NET_SCH_PERFLOW)	+= sch_perflow.o
 obj-$(CONFIG_NET_SCH_INGRESS)	+= sch_ingress.o 
 obj-$(CONFIG_NET_SCH_DSMARK)	+= sch_dsmark.o
 obj-$(CONFIG_NET_SCH_SFQ)	+= sch_sfq.o
Index: linux-2.6.11-w/net/sched/sch_perflow.c
===================================================================
--- linux-2.6.11-w/net/sched/sch_perflow.c	(revision 0)
+++ linux-2.6.11-w/net/sched/sch_perflow.c	(revision 3)
@@ -0,0 +1,555 @@
+/*
+ * net/sched/sch_perflow.c	Per flow rate control queue.
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Wang Jian <lark@linux.net.cn>, 2005
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/notifier.h>
+#include <linux/jhash.h>
+#include <linux/timer.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/dsfield.h>
+
+/* Per flow rate control algorithm
+
+Description
+-----------
+
+	When a new packet arrives, we look it up in the table to see if it
+	belongs to a flow that is already being traced. If not, we create a
+	new entry for it.
+
+	For a packet belonging to a traced flow, we check whether enough
+	tokens are available to send it out.
+
+	'rate' and 'ceil' have the same meaning as in the HTB qdisc. The
+	'limit' parameter defines how many flows we will trace; it defaults
+	to 1024.
+
+	Any flow entry without traffic for LIFETIME seconds is wiped to
+	free its slot.
+
+
+Algorithm
+---------
+
+	The algorithm is simple and naive. We have a token bucket which
+	generates grate/10 (= rate * limit / 10) tokens every 1/10 second.
+
+	A flow can always send while it is under rate. When a flow is over
+	rate but under ceil, every time it borrows it is put into the
+	borrow list, so it may receive a penalty.
+
+	When an under-rate flow sends but tokens are short, a penalty is
+	added to one of the flows in the borrow list, and that punished
+	flow is removed from the borrow list.
+
+	When a flow carrying a penalty has a packet, the penalty is added
+	to its used tokens. This means the flow can borrow less than it
+	did last time (if in a new time slot), or can only borrow
+	(ceil - rate - penalty).
+
+	There are other rules that handle fairness and low-rate situations.
+	See the code for details.
+
+	NOTE: the implementation of the algorithm is not timer driven, but
+	packet driven.
+
+	The ideas behind this algorithm are:
+
+	1. We assume the perflow qdisc has rate * limit guaranteed.
+	2. We can't affect the past; we can only affect the future.
+	3. If a flow's borrowing leads to an overlimit, we let this flow
+	   borrow less in the future.
+	4. With a round-robin punishment style, the more often a flow
+	   borrows, the more time it spends in the borrow list, and so the
+	   more penalty it receives. (But this deserves more thought; I
+	   think fairness could be improved by sorting the borrow list.)
+
+
+Security Consideration
+----------------------
+
+	It is dangerous to create a new entry for every packet that does
+	not belong to a flow already being traced. For example, a SYN
+	flood attack on a single port may cause thousands of entries to be
+	created. The 'limit' parameter sets an upper bound on how many
+	entries we will create.
+
+	But even with 'limit', a port scan can fill the slots so that a
+	valid new flow finds no free slot and is not traced.
+
+	One solution is to use the netfilter MARK target for classification
+	(i.e. cls_fw.c). Only established sessions are given a fw mark, so
+	a port scan using a spoofed source address gets no fw mark.
+
+
+Application
+-----------
+
+	You should use HTB or another classful qdisc to enclose this qdisc.
+
+ */
+
+
+#define PERFLOW_HSIZE		251
+#define PERFLOW_LIFETIME	(10*HZ)
+
+struct perflow_entry
+{
+	u32	src;
+	u32	dst;
+	u16	sport;
+	u16	dport;
+
+	struct	list_head hlist;
+	struct	list_head borrow; 
+	struct	timer_list timer;
+	struct	perflow_sched_data *q;
+	u32	jiffies;
+	u32	used;
+	u32	penalty;
+
+	u8	protocol;
+};
+
+struct perflow_sched_data
+{
+	/* parameters */
+	u32	rate;			/* guaranteed rate */
+	u32	ceil;			/* upper bound */
+	u32	limit;			/* maximum flows we trace */
+	u32	qlen;			/* maximum queue length */
+
+	/* variables */
+	u32	grate;			/* aggregate rate */
+
+	u32	jiffies;
+	u32	token;
+	u32	flow_count;		/* how many flows we are tracing */
+	struct	list_head ht[PERFLOW_HSIZE];	/* hash table */
+	struct	list_head borrow;	/* lists of borrowing flow */
+};
+
+struct commonhdr
+{
+	u16	source;
+	u16	dest;
+};
+
+static __inline__ u32 perflow_hash(struct perflow_entry *tuple)
+{
+	return (jhash_3words(tuple->src ^ tuple->protocol,
+				tuple->dst,
+				(tuple->sport << 16 | tuple->dport),
+				0x5E83AD03) % PERFLOW_HSIZE);
+}
+
+static __inline__ int perflow_is_valid(struct sk_buff *skb)
+{
+	struct iphdr *iph = skb->nh.iph;
+
+	if (skb->protocol != __constant_htons(ETH_P_IP))
+		return 0;
+
+	if (iph->protocol != IPPROTO_TCP && iph->protocol != IPPROTO_UDP
+			&& iph->protocol != IPPROTO_SCTP)
+		return 0;
+
+	return 1;
+}
+
+static __inline__ void perflow_flow_timer(unsigned long arg)
+{
+	struct perflow_entry *e = (struct perflow_entry *) arg;
+
+	e->q->flow_count--;
+	list_del_init(&e->hlist);
+	if (!list_empty(&e->borrow))
+		list_del_init(&e->borrow);
+	kfree(e);
+}
+
+static __inline__ struct perflow_entry *
+perflow_new_flow(u32 hash, struct perflow_entry *tuple, struct Qdisc *sch)
+{
+	struct perflow_sched_data *q = qdisc_priv(sch);
+	struct perflow_entry *e;
+
+	if (q->flow_count >= q->limit)
+		return NULL;
+
+	/* enqueue runs in softirq context, so this allocation must not sleep */
+	e = kmalloc(sizeof(*e), GFP_ATOMIC);
+	if (!e)
+		return NULL;
+
+	q->flow_count++;
+
+	memset(e, 0, sizeof(*e));
+
+	e->src = tuple->src;
+	e->dst = tuple->dst;
+	e->sport = tuple->sport;
+	e->dport = tuple->dport;
+	e->protocol = tuple->protocol;
+
+	INIT_LIST_HEAD(&e->hlist);
+	INIT_LIST_HEAD(&e->borrow);
+	init_timer(&e->timer);
+	e->timer.function = perflow_flow_timer;
+	e->timer.data = (unsigned long) e;
+	/* sync flow's tick to queue's tick */
+	e->jiffies = q->jiffies;
+	e->q = q;
+
+	list_add(&e->hlist, q->ht + hash);
+
+	return e;
+}
+
+static __inline__ void
+perflow_fill_tuple(struct perflow_entry *tuple, struct sk_buff *skb)
+{
+	struct iphdr *iph = skb->nh.iph;
+	struct commonhdr *ch;
+
+	memset(tuple, 0, sizeof(struct perflow_entry));
+#if 1
+	/* not a traceable flow, do nothing */
+	if (!perflow_is_valid(skb))
+		return;
+#endif
+
+	ch = (struct commonhdr *)((void *)iph + (iph->ihl << 2));
+	tuple->src = iph->saddr;
+	tuple->dst = iph->daddr;
+	tuple->sport = ch->source;
+	tuple->dport = ch->dest;
+	tuple->protocol = iph->protocol;
+}
+
+static struct perflow_entry *
+perflow_find_flow(u32 *hash, struct perflow_entry *flow, struct Qdisc *sch)
+{
+	struct perflow_sched_data *q = qdisc_priv(sch);
+	struct list_head *h, *head;
+	struct perflow_entry *e;
+       
+	*hash = perflow_hash(flow);
+	head = q->ht + (*hash);
+
+	list_for_each(h, head) {
+		e = list_entry(h, struct perflow_entry, hlist);
+		if (e->src == flow->src
+				&& e->dst == flow->dst
+				&& e->sport == flow->sport
+				&& e->dport == flow->dport
+				&& e->protocol == flow->protocol) {
+			del_timer(&e->timer);
+			return e;
+		}
+	}
+	return NULL;
+}
+
+static void perflow_punish(int penalty, struct list_head *borrow)
+{
+	struct perflow_entry *e;
+	struct list_head *h;
+
+	if (!list_empty(borrow)) {
+		h = borrow->next;
+		list_del_init(h);
+		e = list_entry(h, struct perflow_entry, borrow);
+		e->penalty += penalty;
+	}
+}
+
+static __inline__ void
+perflow_adjust_used(struct perflow_entry *flow, u32 rate, u32 ceil)
+{
+	int cycles;
+
+	/* still in this time slot */
+	if ((jiffies - flow->jiffies) <= HZ) {
+		if ((ceil - flow->used) > flow->penalty) {
+			flow->used += flow->penalty;
+			flow->penalty = 0;
+		} else {
+			/* absorb only as much penalty as fits below ceil */
+			flow->penalty -= ceil - flow->used;
+			flow->used = ceil;
+		}
+		return;
+	}
+
+	/* the packet arrives after cycles slots */
+	cycles = (jiffies - flow->jiffies) / HZ;
+	flow->jiffies += cycles * HZ;
+
+	if ((cycles * ceil - flow->used) > flow->penalty) {
+		flow->used = 0;
+		flow->penalty = 0;
+	} else {
+		flow->used = flow->penalty + flow->used - cycles * ceil;
+		if (flow->used > ceil) {
+			flow->used = ceil;
+			flow->penalty -= ceil;
+		}
+	}
+}
+
+static int perflow_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct perflow_sched_data *q = qdisc_priv(sch);
+	struct perflow_entry tuple, *flow;
+	u32 hash;
+
+	/* not a traceable flow */
+	if (!perflow_is_valid(skb))
+		goto drop;
+
+	/* if max qlen is set and current queue is full, drop */
+	if (q->qlen && sch->q.qlen >= q->qlen) {
+		printk(KERN_WARNING "qlen overlimit drop\n");
+		goto drop;
+	}
+
+	perflow_fill_tuple(&tuple, skb);
+	flow = perflow_find_flow(&hash, &tuple, sch);
+
+	if (flow == NULL) {
+		flow = perflow_new_flow(hash, &tuple, sch);
+		if (flow == NULL)
+			goto drop;
+	}
+
+	/* renew timer for flow */
+	flow->timer.expires = jiffies + PERFLOW_LIFETIME;
+	add_timer(&flow->timer);
+
+	if ((jiffies - q->jiffies) > HZ) {
+		q->token = q->grate;
+		q->jiffies += (jiffies - q->jiffies) / HZ * HZ;
+	}
+
+	/* renew used for this flow */
+	perflow_adjust_used(flow, q->rate, q->ceil);
+
+	/* we always satisfy flow under rate */
+	if (flow->used < q->rate) {
+		flow->used += skb->len;
+		if (flow->used > q->rate) {
+			/* if we borrow, we may receive penalty */
+			if (list_empty(&flow->borrow))
+				list_add_tail(&flow->borrow, &q->borrow);
+		}
+		if (flow->used > q->ceil)
+			flow->penalty += flow->used - q->ceil;
+		if (q->token < skb->len) {
+			/* punish a borrower by the token shortfall */
+			perflow_punish(skb->len - q->token, &q->borrow);
+			q->token = 0;
+		} else {
+			q->token -= skb->len;
+		}
+	} else {
+		if (q->token < skb->len || flow->used >= q->ceil)
+			goto drop;
+
+		flow->used += skb->len;
+		q->token -= skb->len;
+
+		if (list_empty(&flow->borrow))
+			list_add_tail(&flow->borrow, &q->borrow);
+		else
+			list_move(&flow->borrow, &q->borrow);
+
+		if (flow->used > q->ceil)
+			flow->penalty += flow->used - q->ceil;
+	}
+
+enqueue:
+	sch->qstats.backlog += skb->len;
+	sch->bstats.bytes += skb->len;
+	sch->bstats.packets++;
+	__skb_queue_tail(&sch->q, skb);
+	return NET_XMIT_SUCCESS;
+
+drop:
+	sch->qstats.overlimits++;
+	kfree_skb(skb);
+	sch->qstats.drops++;
+	return NET_XMIT_CN;
+}
+
+static int perflow_requeue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	__skb_queue_head(&sch->q, skb);
+	sch->qstats.backlog += skb->len;
+	sch->qstats.requeues++;
+	return 0;
+}
+
+static struct sk_buff *perflow_dequeue(struct Qdisc *sch)
+{
+	struct sk_buff *skb;
+
+	skb = __skb_dequeue(&sch->q);
+	if (skb) {
+		sch->qstats.backlog -= skb->len;
+	}
+	return skb;
+}
+
+static unsigned int perflow_drop(struct Qdisc *sch)
+{
+	struct sk_buff *skb;
+
+	skb = __skb_dequeue_tail(&sch->q);
+	if (skb) {
+		unsigned int len = skb->len;
+		sch->qstats.backlog -= len;
+		sch->qstats.drops++;
+		kfree_skb(skb);
+		return len;
+	}
+	return 0;
+}
+
+static void perflow_reset(struct Qdisc *sch)
+{
+	skb_queue_purge(&sch->q);
+	sch->qstats.backlog = 0;
+}
+
+static int perflow_change(struct Qdisc *sch, struct rtattr *opt)
+{
+	struct perflow_sched_data *q = qdisc_priv(sch);
+	struct tc_perflow_qopt *ctl;
+
+	/* If limit and qlen are lowered while we are in an over-limit
+	 * state, we still don't drop flows or packets: sch->q will
+	 * shrink to the allowed length on its own, and q->flow_count
+	 * will decrease because no new flows are traced.
+	 */
+	if (opt == NULL) {
+		return -EINVAL;
+	} else {
+		ctl = RTA_DATA(opt);
+		if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
+			return -EINVAL;
+
+		q->rate = ctl->rate.rate;
+		q->ceil = ctl->ceil.rate;
+		q->limit = ctl->limit;
+		q->qlen = ctl->qlen;
+
+		if (q->limit == 0)
+			q->limit = 1024;
+		/* aggregate rate is (rate * 1.05 * limit) */
+		q->grate = (q->rate + q->rate/20) * q->limit;
+
+		q->jiffies = jiffies;
+	}
+
+	return 0;
+}
+
+static int perflow_init(struct Qdisc *sch, struct rtattr *opt)
+{
+	struct perflow_sched_data *q = qdisc_priv(sch);
+	int i, ret;
+
+	ret = perflow_change(sch, opt);
+	if (ret)
+		return ret;
+
+	INIT_LIST_HEAD(&q->borrow);
+
+	for (i = 0; i < PERFLOW_HSIZE; i++)
+		INIT_LIST_HEAD(q->ht + i);
+
+	return 0;
+}
+
+static void perflow_destroy(struct Qdisc *sch)
+{
+	struct perflow_sched_data *q = qdisc_priv(sch);
+	struct list_head *h, *n;
+	struct perflow_entry *e;
+	int i;
+
+	for (i = 0; i < PERFLOW_HSIZE; i++) {
+		list_for_each_safe (h, n, q->ht + i) {
+			e = list_entry(h, struct perflow_entry, hlist);
+			del_timer(&e->timer);
+			if (!list_empty(&e->borrow))
+				list_del_init(&e->borrow);
+			list_del(h);
+			kfree(e);
+		}
+	}
+}
+
+static int perflow_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	return -1;
+}
+
+static struct Qdisc_ops perflow_qdisc_ops = {
+	.next		= NULL,
+	.cl_ops		= NULL,
+	.id		= "perflow",
+	.priv_size	= sizeof(struct perflow_sched_data),
+	.enqueue	= perflow_enqueue,
+	.dequeue	= perflow_dequeue,
+	.requeue	= perflow_requeue,
+	.drop		= perflow_drop,
+	.init		= perflow_init,
+	.reset		= perflow_reset,
+	.destroy	= perflow_destroy,
+	.change		= perflow_change,
+	.dump		= perflow_dump,
+	.owner		= THIS_MODULE,
+};
+
+static int __init perflow_module_init(void)
+{
+	return register_qdisc(&perflow_qdisc_ops);
+}
+
+static void __exit perflow_module_exit(void)
+{
+	unregister_qdisc(&perflow_qdisc_ops);
+}
+
+module_init(perflow_module_init);
+module_exit(perflow_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Wang Jian <lark@linux.net.cn>");

