Below is a fix for the current problem of checksum offload not working in a NAT''ed network. The cause is the NAT/iptables code incorrectly modifying the TCP/UDP checksum (for the checksum offload case). The original code assumes a valid checksum, which is not the case for checksum offload packets (which has a complimented, partial checksum for the hardware to use). The fix is to compliment the new address and not compliment the old address (which is complimented in the partial checksum), and roll that with the ip_nat_cheat_check function. There are two "versions" of the patch below. The first version is a diff to show the actual changes made to the ip_nat_proto_udp.c and ip_nat_proto_tcp.c file (as it is difficult/impossible to tell from the second patch). The second version is the one to commit to the tree (which creates 2 new files in the sparse directory). Thanks, Jon --- ../xen-unstable.hg/linux-2.6.12-xen0/net/ipv4/netfilter/ip_nat_proto_udp.c 2005-06-17 14:48:29.000000000 -0500 +++ linux-2.6-xen-sparse/net/ipv4/netfilter/ip_nat_proto_udp.c 2005-10-14 15:17:53.000000000 -0500 @@ -112,11 +112,19 @@ udp_manip_pkt(struct sk_buff **pskb, newport = tuple->dst.u.udp.port; portptr = &hdr->dest; } - if (hdr->check) /* 0 is a special case meaning no checksum */ - hdr->check = ip_nat_cheat_check(~oldip, newip, + + if (hdr->check) { /* 0 is a special case meaning no checksum */ + if ((*pskb)->proto_csum_blank) { + hdr->check = ip_nat_cheat_check(oldip, ~newip, + ip_nat_cheat_check(*portptr ^ 0xFFFF, + newport, hdr->check)); + } else { + hdr->check = ip_nat_cheat_check(~oldip, newip, ip_nat_cheat_check(*portptr ^ 0xFFFF, newport, hdr->check)); + } + } *portptr = newport; return 1; } --- ../xen-unstable.hg/linux-2.6.12-xen0/net/ipv4/netfilter/ip_nat_proto_tcp.c 2005-06-17 14:48:29.000000000 -0500 +++ linux-2.6-xen-sparse/net/ipv4/netfilter/ip_nat_proto_tcp.c 2005-10-14 16:41:20.000000000 -0500 @@ -127,10 +127,16 @@ tcp_manip_pkt(struct sk_buff **pskb, if (hdrsize < sizeof(*hdr)) return 1; - hdr->check = ip_nat_cheat_check(~oldip, newip, + if ((*pskb)->proto_csum_blank) { + hdr->check = ip_nat_cheat_check(oldip, ~newip, + ip_nat_cheat_check(oldport ^ 0xFFFF, + newport, hdr->check)); + } else { + hdr->check = ip_nat_cheat_check(~oldip, newip, ip_nat_cheat_check(oldport ^ 0xFFFF, newport, hdr->check)); + } return 1; } Signed-off-by: Jon Mason <jdmason@us.ibm.com> # HG changeset patch # User root@pentium4 # Node ID 51129b413cc4dd097b415d89c81965e51bfe3e0e # Parent b7a71e4db06d0235d67fef66db1e78cd2238c240 Fix NAT for domU checksum offload. diff -r b7a71e4db06d -r 51129b413cc4 linux-2.6-xen-sparse/net/ipv4/netfilter/ip_nat_proto_tcp.c --- /dev/null Tue Oct 11 21:44:55 2005 +++ b/linux-2.6-xen-sparse/net/ipv4/netfilter/ip_nat_proto_tcp.c Fri Oct 14 21:52:41 2005 @@ -0,0 +1,184 @@ +/* (C) 1999-2001 Paul `Rusty'' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/netfilter.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/if.h> +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ip_nat_protocol.h> +#include <linux/netfilter_ipv4/ip_nat_core.h> + +static int +tcp_in_range(const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype, + const union ip_conntrack_manip_proto *min, + const union ip_conntrack_manip_proto *max) +{ + u_int16_t port; + + if (maniptype == IP_NAT_MANIP_SRC) + port = tuple->src.u.tcp.port; + else + port = tuple->dst.u.tcp.port; + + return ntohs(port) >= ntohs(min->tcp.port) + && ntohs(port) <= ntohs(max->tcp.port); +} + +static int +tcp_unique_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_nat_range *range, + enum ip_nat_manip_type maniptype, + const struct ip_conntrack *conntrack) +{ + static u_int16_t port, *portptr; + unsigned int range_size, min, i; + + if (maniptype == IP_NAT_MANIP_SRC) + portptr = &tuple->src.u.tcp.port; + else + portptr = &tuple->dst.u.tcp.port; + + /* If no range specified... */ + if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { + /* If it''s dst rewrite, can''t change port */ + if (maniptype == IP_NAT_MANIP_DST) + return 0; + + /* Map privileged onto privileged. */ + if (ntohs(*portptr) < 1024) { + /* Loose convention: >> 512 is credential passing */ + if (ntohs(*portptr)<512) { + min = 1; + range_size = 511 - min + 1; + } else { + min = 600; + range_size = 1023 - min + 1; + } + } else { + min = 1024; + range_size = 65535 - 1024 + 1; + } + } else { + min = ntohs(range->min.tcp.port); + range_size = ntohs(range->max.tcp.port) - min + 1; + } + + for (i = 0; i < range_size; i++, port++) { + *portptr = htons(min + port % range_size); + if (!ip_nat_used_tuple(tuple, conntrack)) { + return 1; + } + } + return 0; +} + +static int +tcp_manip_pkt(struct sk_buff **pskb, + unsigned int iphdroff, + const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype) +{ + struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff); + struct tcphdr *hdr; + unsigned int hdroff = iphdroff + iph->ihl*4; + u32 oldip, newip; + u16 *portptr, newport, oldport; + int hdrsize = 8; /* TCP connection tracking guarantees this much */ + + /* this could be a inner header returned in icmp packet; in such + cases we cannot update the checksum field since it is outside of + the 8 bytes of transport layer headers we are guaranteed */ + if ((*pskb)->len >= hdroff + sizeof(struct tcphdr)) + hdrsize = sizeof(struct tcphdr); + + if (!skb_ip_make_writable(pskb, hdroff + hdrsize)) + return 0; + + iph = (struct iphdr *)((*pskb)->data + iphdroff); + hdr = (struct tcphdr *)((*pskb)->data + hdroff); + + if (maniptype == IP_NAT_MANIP_SRC) { + /* Get rid of src ip and src pt */ + oldip = iph->saddr; + newip = tuple->src.ip; + newport = tuple->src.u.tcp.port; + portptr = &hdr->source; + } else { + /* Get rid of dst ip and dst pt */ + oldip = iph->daddr; + newip = tuple->dst.ip; + newport = tuple->dst.u.tcp.port; + portptr = &hdr->dest; + } + + oldport = *portptr; + *portptr = newport; + + if (hdrsize < sizeof(*hdr)) + return 1; + + if ((*pskb)->proto_csum_blank) { + hdr->check = ip_nat_cheat_check(oldip, ~newip, + ip_nat_cheat_check(oldport ^ 0xFFFF, + newport, hdr->check)); + } else { + hdr->check = ip_nat_cheat_check(~oldip, newip, + ip_nat_cheat_check(oldport ^ 0xFFFF, + newport, + hdr->check)); + } + return 1; +} + +static unsigned int +tcp_print(char *buffer, + const struct ip_conntrack_tuple *match, + const struct ip_conntrack_tuple *mask) +{ + unsigned int len = 0; + + if (mask->src.u.tcp.port) + len += sprintf(buffer + len, "srcpt=%u ", + ntohs(match->src.u.tcp.port)); + + + if (mask->dst.u.tcp.port) + len += sprintf(buffer + len, "dstpt=%u ", + ntohs(match->dst.u.tcp.port)); + + return len; +} + +static unsigned int +tcp_print_range(char *buffer, const struct ip_nat_range *range) +{ + if (range->min.tcp.port != 0 || range->max.tcp.port != 0xFFFF) { + if (range->min.tcp.port == range->max.tcp.port) + return sprintf(buffer, "port %u ", + ntohs(range->min.tcp.port)); + else + return sprintf(buffer, "ports %u-%u ", + ntohs(range->min.tcp.port), + ntohs(range->max.tcp.port)); + } + else return 0; +} + +struct ip_nat_protocol ip_nat_protocol_tcp += { "TCP", IPPROTO_TCP, + tcp_manip_pkt, + tcp_in_range, + tcp_unique_tuple, + tcp_print, + tcp_print_range +}; diff -r b7a71e4db06d -r 51129b413cc4 linux-2.6-xen-sparse/net/ipv4/netfilter/ip_nat_proto_udp.c --- /dev/null Tue Oct 11 21:44:55 2005 +++ b/linux-2.6-xen-sparse/net/ipv4/netfilter/ip_nat_proto_udp.c Fri Oct 14 21:52:41 2005 @@ -0,0 +1,173 @@ +/* (C) 1999-2001 Paul `Rusty'' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/netfilter.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <linux/if.h> + +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_core.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ip_nat_protocol.h> + +static int +udp_in_range(const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype, + const union ip_conntrack_manip_proto *min, + const union ip_conntrack_manip_proto *max) +{ + u_int16_t port; + + if (maniptype == IP_NAT_MANIP_SRC) + port = tuple->src.u.udp.port; + else + port = tuple->dst.u.udp.port; + + return ntohs(port) >= ntohs(min->udp.port) + && ntohs(port) <= ntohs(max->udp.port); +} + +static int +udp_unique_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_nat_range *range, + enum ip_nat_manip_type maniptype, + const struct ip_conntrack *conntrack) +{ + static u_int16_t port, *portptr; + unsigned int range_size, min, i; + + if (maniptype == IP_NAT_MANIP_SRC) + portptr = &tuple->src.u.udp.port; + else + portptr = &tuple->dst.u.udp.port; + + /* If no range specified... */ + if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { + /* If it''s dst rewrite, can''t change port */ + if (maniptype == IP_NAT_MANIP_DST) + return 0; + + if (ntohs(*portptr) < 1024) { + /* Loose convention: >> 512 is credential passing */ + if (ntohs(*portptr)<512) { + min = 1; + range_size = 511 - min + 1; + } else { + min = 600; + range_size = 1023 - min + 1; + } + } else { + min = 1024; + range_size = 65535 - 1024 + 1; + } + } else { + min = ntohs(range->min.udp.port); + range_size = ntohs(range->max.udp.port) - min + 1; + } + + for (i = 0; i < range_size; i++, port++) { + *portptr = htons(min + port % range_size); + if (!ip_nat_used_tuple(tuple, conntrack)) + return 1; + } + return 0; +} + +static int +udp_manip_pkt(struct sk_buff **pskb, + unsigned int iphdroff, + const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype) +{ + struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff); + struct udphdr *hdr; + unsigned int hdroff = iphdroff + iph->ihl*4; + u32 oldip, newip; + u16 *portptr, newport; + + if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) + return 0; + + iph = (struct iphdr *)((*pskb)->data + iphdroff); + hdr = (struct udphdr *)((*pskb)->data + hdroff); + + if (maniptype == IP_NAT_MANIP_SRC) { + /* Get rid of src ip and src pt */ + oldip = iph->saddr; + newip = tuple->src.ip; + newport = tuple->src.u.udp.port; + portptr = &hdr->source; + } else { + /* Get rid of dst ip and dst pt */ + oldip = iph->daddr; + newip = tuple->dst.ip; + newport = tuple->dst.u.udp.port; + portptr = &hdr->dest; + } + + if (hdr->check) { /* 0 is a special case meaning no checksum */ + if ((*pskb)->proto_csum_blank) { + hdr->check = ip_nat_cheat_check(oldip, ~newip, + ip_nat_cheat_check(*portptr ^ 0xFFFF, + newport, hdr->check)); + } else { + hdr->check = ip_nat_cheat_check(~oldip, newip, + ip_nat_cheat_check(*portptr ^ 0xFFFF, + newport, + hdr->check)); + } + } + *portptr = newport; + return 1; +} + +static unsigned int +udp_print(char *buffer, + const struct ip_conntrack_tuple *match, + const struct ip_conntrack_tuple *mask) +{ + unsigned int len = 0; + + if (mask->src.u.udp.port) + len += sprintf(buffer + len, "srcpt=%u ", + ntohs(match->src.u.udp.port)); + + + if (mask->dst.u.udp.port) + len += sprintf(buffer + len, "dstpt=%u ", + ntohs(match->dst.u.udp.port)); + + return len; +} + +static unsigned int +udp_print_range(char *buffer, const struct ip_nat_range *range) +{ + if (range->min.udp.port != 0 || range->max.udp.port != 0xFFFF) { + if (range->min.udp.port == range->max.udp.port) + return sprintf(buffer, "port %u ", + ntohs(range->min.udp.port)); + else + return sprintf(buffer, "ports %u-%u ", + ntohs(range->min.udp.port), + ntohs(range->max.udp.port)); + } + else return 0; +} + +struct ip_nat_protocol ip_nat_protocol_udp += { "UDP", IPPROTO_UDP, + udp_manip_pkt, + udp_in_range, + udp_unique_tuple, + udp_print, + udp_print_range +}; _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
> Below is a fix for the current problem of checksum offload > not working in a NAT''ed network. The cause is the > NAT/iptables code incorrectly modifying the TCP/UDP checksum > (for the checksum offload case). The original code assumes a > valid checksum, which is not the case for checksum offload > packets (which has a complimented, partial checksum for the > hardware to use). The fix is to compliment the new address > and not compliment the old address (which is complimented in > the partial checksum), and roll that with the > ip_nat_cheat_check function.Thanks for looking into this -- this issue has been nagging away for a long time.> There are two "versions" of the patch below. The first > version is a diff to show the actual changes made to the > ip_nat_proto_udp.c and ip_nat_proto_tcp.c file (as it is > difficult/impossible to tell from the second patch). The > second version is the one to commit to the tree (which > creates 2 new files in the sparse directory).Would we be better off committing the first patch to the patches directory rather than adding to the sparse tree. Do you think you could send this upstream via davem? [Today has been a good day for vanquishing bugs. We''re working on a few save/restore fixes and have a list of tools issues, but 32bit isn''t in too bad shape right now.] Ian _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On Sat, Oct 15, 2005 at 12:09:36AM +0100, Ian Pratt wrote:> > Below is a fix for the current problem of checksum offload > > not working in a NAT''ed network. The cause is the > > NAT/iptables code incorrectly modifying the TCP/UDP checksum > > (for the checksum offload case). The original code assumes a > > valid checksum, which is not the case for checksum offload > > packets (which has a complimented, partial checksum for the > > hardware to use). The fix is to compliment the new address > > and not compliment the old address (which is complimented in > > the partial checksum), and roll that with the > > ip_nat_cheat_check function. > > Thanks for looking into this -- this issue has been nagging away for a > long time.Sorry it took me so long. Hopefully, I can knock out the IPSec one faster.> > There are two "versions" of the patch below. The first > > version is a diff to show the actual changes made to the > > ip_nat_proto_udp.c and ip_nat_proto_tcp.c file (as it is > > difficult/impossible to tell from the second patch). The > > second version is the one to commit to the tree (which > > creates 2 new files in the sparse directory). > > Would we be better off committing the first patch to the patches > directory rather than adding to the sparse tree.You are right. Patch to follow.> Do you think you could send this upstream via davem?I can send this to DaveM, but it is very Xen specific. Should we wait for the big Xen/Linux merge for this, or is he currently going through the changes?> [Today has been a good day for vanquishing bugs. We''re working on a few > save/restore fixes and have a list of tools issues, but 32bit isn''t in > too bad shape right now.] > > Ian > > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xensource.com > http://lists.xensource.com/xen-devel >_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
I added the patch to the already existing "net-csum.patch", as it seemed to be the right fit. Signed-off-by: Jon Mason <jdmason@us.ibm.com> # HG changeset patch # User root@pentium4 # Node ID db72430136444d143a9ad0849922621e9ad7c9aa # Parent f9b300fab36e2a7fef2160ca2e6ab0db1f1b3280 Fix NAT for domU checksum offload diff -r f9b300fab36e -r db7243013644 patches/linux-2.6.12/net-csum.patch --- a/patches/linux-2.6.12/net-csum.patch Fri Oct 14 17:27:25 2005 +++ b/patches/linux-2.6.12/net-csum.patch Fri Oct 14 23:42:24 2005 @@ -9,3 +9,48 @@ && csum_tcpudp_magic(iph->saddr, iph->daddr, udplen, IPPROTO_UDP, skb->ip_summed == CHECKSUM_HW ? skb->csum : skb_checksum(skb, iph->ihl*4, udplen, 0))) { + +--- ../xen-unstable.hg/linux-2.6.12-xen0/net/ipv4/netfilter/ip_nat_proto_udp.c 2005-06-17 14:48:29.000000000 -0500 ++++ linux-2.6-xen-sparse/net/ipv4/netfilter/ip_nat_proto_udp.c 2005-10-14 15:17:53.000000000 -0500 +@@ -112,11 +112,19 @@ udp_manip_pkt(struct sk_buff **pskb, + newport = tuple->dst.u.udp.port; + portptr = &hdr->dest; + } +- if (hdr->check) /* 0 is a special case meaning no checksum */ +- hdr->check = ip_nat_cheat_check(~oldip, newip, ++ ++ if (hdr->check) { /* 0 is a special case meaning no checksum */ ++ if ((*pskb)->proto_csum_blank) { ++ hdr->check = ip_nat_cheat_check(oldip, ~newip, ++ ip_nat_cheat_check(*portptr ^ 0xFFFF, ++ newport, hdr->check)); ++ } else { ++ hdr->check = ip_nat_cheat_check(~oldip, newip, + ip_nat_cheat_check(*portptr ^ 0xFFFF, + newport, + hdr->check)); ++ } ++ } + *portptr = newport; + return 1; + } +--- ../xen-unstable.hg/linux-2.6.12-xen0/net/ipv4/netfilter/ip_nat_proto_tcp.c 2005-06-17 14:48:29.000000000 -0500 ++++ linux-2.6-xen-sparse/net/ipv4/netfilter/ip_nat_proto_tcp.c 2005-10-14 16:41:20.000000000 -0500 +@@ -127,10 +127,16 @@ tcp_manip_pkt(struct sk_buff **pskb, + if (hdrsize < sizeof(*hdr)) + return 1; + +- hdr->check = ip_nat_cheat_check(~oldip, newip, ++ if ((*pskb)->proto_csum_blank) { ++ hdr->check = ip_nat_cheat_check(oldip, ~newip, ++ ip_nat_cheat_check(oldport ^ 0xFFFF, ++ newport, hdr->check)); ++ } else { ++ hdr->check = ip_nat_cheat_check(~oldip, newip, + ip_nat_cheat_check(oldport ^ 0xFFFF, + newport, + hdr->check)); ++ } + return 1; + } + _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Keir Fraser
2005-Oct-15 07:24 UTC
Re: [Xen-devel] [PATCH] Fix NAT for domU checksum offload
On 15 Oct 2005, at 00:49, Jon Mason wrote:> I added the patch to the already existing "net-csum.patch", as it > seemed > to be the right fit.I''ve applied the patch to the patch, but we should bear in mind that this will break non-Xen builds. If this were to be pushed upstream then we''d need to send up some of the csum_blank/csum_valid logic also. -- Keir _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
> > I added the patch to the already existing "net-csum.patch", as it > > seemed to be the right fit. > > I''ve applied the patch to the patch, but we should bear in > mind that this will break non-Xen builds. If this were to be > pushed upstream then we''d need to send up some of the > csum_blank/csum_valid logic also.Good point. The patch needs to be made self-contained, and will interfere with the merge work as is. It might be best to keep this as a second patch applied after the first in the meantime. Ian _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel