GIT 684a28105211367d9040ee945cead597912cc49e master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6.14.git --- Signed-off-by: Andrew Morton --- Documentation/feature-removal-schedule.txt | 12 drivers/atm/nicstar.c | 157 - drivers/atm/nicstar.h | 16 drivers/atm/zatm.c | 8 drivers/block/aoe/aoenet.c | 2 drivers/bluetooth/bfusb.c | 8 drivers/char/random.c | 34 drivers/ieee1394/ieee1394_core.c | 4 drivers/isdn/act2000/capi.c | 2 drivers/isdn/i4l/isdn_net.c | 1 drivers/isdn/i4l/isdn_ppp.c | 1 drivers/net/bonding/bond_3ad.c | 11 drivers/net/bonding/bond_3ad.h | 2 drivers/net/bonding/bond_alb.c | 5 drivers/net/hamradio/bpqether.c | 4 drivers/net/ppp_generic.c | 1 drivers/net/pppoe.c | 6 drivers/net/rrunner.c | 3 drivers/net/shaper.c | 50 drivers/net/tg3.c | 325 ++ drivers/net/tg3.h | 10 drivers/net/wan/hdlc_generic.c | 2 drivers/net/wan/lapbether.c | 2 drivers/net/wan/sdla_fr.c | 22 drivers/net/wan/syncppp.c | 2 drivers/usb/net/usbnet.c | 21 drivers/w1/w1_int.c | 2 fs/smbfs/sock.c | 2 include/asm-alpha/socket.h | 2 include/asm-arm/socket.h | 2 include/asm-arm26/socket.h | 2 include/asm-cris/socket.h | 2 include/asm-frv/socket.h | 2 include/asm-h8300/socket.h | 2 include/asm-i386/socket.h | 2 include/asm-ia64/socket.h | 2 include/asm-m32r/socket.h | 2 include/asm-m68k/socket.h | 2 include/asm-mips/socket.h | 2 include/asm-parisc/socket.h | 2 include/asm-ppc/socket.h | 2 include/asm-ppc64/socket.h | 2 include/asm-s390/socket.h | 2 include/asm-sh/socket.h | 2 include/asm-sparc/socket.h | 2 include/asm-sparc64/socket.h | 2 include/asm-v850/socket.h | 2 include/asm-x86_64/socket.h | 2 include/asm-xtensa/socket.h | 2 include/linux/dccp.h | 431 +++ include/linux/hippidevice.h | 5 include/linux/if_vlan.h | 1 include/linux/in.h | 1 include/linux/ip.h | 2 include/linux/ipv6.h | 47 include/linux/list.h | 13 include/linux/net.h | 4 include/linux/netdevice.h | 10 include/linux/netfilter.h | 81 include/linux/netfilter/nfnetlink.h | 148 + include/linux/netfilter/nfnetlink_conntrack.h | 124 + include/linux/netfilter/nfnetlink_log.h | 85 include/linux/netfilter/nfnetlink_queue.h | 86 include/linux/netfilter_decnet.h | 3 include/linux/netfilter_ipv4.h | 8 include/linux/netfilter_ipv4/ip_conntrack.h | 189 + include/linux/netfilter_ipv4/ip_conntrack_core.h | 18 include/linux/netfilter_ipv4/ip_conntrack_helper.h | 2 include/linux/netfilter_ipv4/ip_conntrack_protocol.h | 24 include/linux/netfilter_ipv4/ip_nat_protocol.h | 25 include/linux/netfilter_ipv4/ipt_LOG.h | 1 include/linux/netfilter_ipv4/ipt_NFQUEUE.h | 16 include/linux/netfilter_ipv6.h | 6 include/linux/netfilter_ipv6/ip6t_LOG.h | 1 include/linux/netlink.h | 2 include/linux/random.h | 2 include/linux/skbuff.h | 43 include/linux/socket.h | 1 include/linux/tcp.h | 75 include/net/act_api.h | 2 include/net/arp.h | 2 include/net/ax25.h | 2 include/net/datalink.h | 2 include/net/dn.h | 1 include/net/inet_common.h | 6 include/net/inet_connection_sock.h | 251 ++ include/net/inet_hashtables.h | 427 +++ include/net/inet_timewait_sock.h | 154 + include/net/ip.h | 4 include/net/ip6_route.h | 1 include/net/ip_vs.h | 1 include/net/ipv6.h | 4 include/net/llc.h | 8 include/net/p8022.h | 3 include/net/pkt_cls.h | 6 include/net/psnap.h | 2 include/net/raw.h | 2 include/net/rawv6.h | 5 include/net/request_sock.h | 14 include/net/route.h | 4 include/net/sctp/constants.h | 2 include/net/sock.h | 80 include/net/tcp.h | 601 ----- include/net/tcp_ecn.h | 2 include/net/tcp_states.h | 34 include/net/x25.h | 2 include/net/x25device.h | 1 include/net/xfrm.h | 1 kernel/audit.c | 3 lib/kobject_uevent.c | 3 net/802/hippi.c | 4 net/802/p8022.c | 3 net/802/psnap.c | 7 net/8021q/vlan.h | 2 net/8021q/vlan_dev.c | 2 net/Kconfig | 3 net/Makefile | 2 net/appletalk/aarp.c | 2 net/appletalk/ddp.c | 8 net/atm/ipcommon.c | 3 net/ax25/af_ax25.c | 2 net/ax25/ax25_ds_in.c | 2 net/ax25/ax25_ds_timer.c | 2 net/ax25/ax25_in.c | 10 net/ax25/ax25_std_in.c | 2 net/ax25/ax25_std_timer.c | 2 net/ax25/ax25_subr.c | 4 net/bridge/netfilter/ebt_mark.c | 5 net/bridge/netfilter/ebt_ulog.c | 2 net/core/datagram.c | 6 net/core/dev.c | 45 net/core/netfilter.c | 305 +- net/core/request_sock.c | 28 net/core/rtnetlink.c | 2 net/core/skbuff.c | 66 net/core/sock.c | 133 + net/dccp/Kconfig | 24 net/dccp/Makefile | 5 net/dccp/ccid.c | 139 + net/dccp/ccid.h | 156 + net/dccp/ccids/Kconfig | 25 net/dccp/ccids/Makefile | 3 net/dccp/ccids/ccid3.c | 2156 +++++++++++++++++++ net/dccp/ccids/ccid3.h | 137 + net/dccp/dccp.h | 423 +++ net/dccp/input.c | 510 ++++ net/dccp/ipv4.c | 1291 +++++++++++ net/dccp/minisocks.c | 199 + net/dccp/options.c | 763 ++++++ net/dccp/output.c | 409 +++ net/dccp/proto.c | 818 +++++++ net/dccp/timer.c | 245 ++ net/decnet/af_decnet.c | 6 net/decnet/dn_nsp_in.c | 2 net/decnet/dn_nsp_out.c | 2 net/decnet/dn_route.c | 2 net/decnet/netfilter/dn_rtmsg.c | 4 net/econet/af_econet.c | 6 net/ethernet/eth.c | 1 net/ipv4/Makefile | 5 net/ipv4/af_inet.c | 134 + net/ipv4/arp.c | 4 net/ipv4/datagram.c | 2 net/ipv4/fib_frontend.c | 3 net/ipv4/inet_connection_sock.c | 635 +++++ net/ipv4/inet_hashtables.c | 165 + net/ipv4/inet_timewait_sock.c | 114 + net/ipv4/inetpeer.c | 2 net/ipv4/ip_input.c | 6 net/ipv4/ip_options.c | 3 net/ipv4/ip_output.c | 22 net/ipv4/ip_sockglue.c | 2 net/ipv4/ipconfig.c | 8 net/ipv4/ipvs/ip_vs_app.c | 1 net/ipv4/ipvs/ip_vs_core.c | 9 net/ipv4/ipvs/ip_vs_xmit.c | 2 net/ipv4/multipath_drr.c | 2 net/ipv4/netfilter.c | 139 + net/ipv4/netfilter/Kconfig | 23 net/ipv4/netfilter/Makefile | 5 net/ipv4/netfilter/ip_conntrack_core.c | 376 ++- net/ipv4/netfilter/ip_conntrack_ftp.c | 12 net/ipv4/netfilter/ip_conntrack_netlink.c | 1579 +++++++++++++ net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 73 net/ipv4/netfilter/ip_conntrack_proto_sctp.c | 9 net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 48 net/ipv4/netfilter/ip_conntrack_proto_udp.c | 14 net/ipv4/netfilter/ip_conntrack_standalone.c | 49 net/ipv4/netfilter/ip_nat_core.c | 104 net/ipv4/netfilter/ip_nat_helper.c | 8 net/ipv4/netfilter/ip_nat_proto_icmp.c | 23 net/ipv4/netfilter/ip_nat_proto_tcp.c | 24 net/ipv4/netfilter/ip_nat_proto_udp.c | 23 net/ipv4/netfilter/ip_nat_proto_unknown.c | 13 net/ipv4/netfilter/ip_nat_snmp_basic.c | 2 net/ipv4/netfilter/ip_nat_standalone.c | 4 net/ipv4/netfilter/ip_queue.c | 36 net/ipv4/netfilter/ip_tables.c | 1 net/ipv4/netfilter/ipt_CLASSIFY.c | 4 net/ipv4/netfilter/ipt_CLUSTERIP.c | 2 net/ipv4/netfilter/ipt_CONNMARK.c | 15 net/ipv4/netfilter/ipt_DSCP.c | 3 net/ipv4/netfilter/ipt_ECN.c | 6 net/ipv4/netfilter/ipt_LOG.c | 86 net/ipv4/netfilter/ipt_MARK.c | 22 net/ipv4/netfilter/ipt_NFQUEUE.c | 70 net/ipv4/netfilter/ipt_REJECT.c | 1 net/ipv4/netfilter/ipt_TCPMSS.c | 3 net/ipv4/netfilter/ipt_TOS.c | 3 net/ipv4/netfilter/ipt_ULOG.c | 36 net/ipv4/netfilter/ipt_connmark.c | 7 net/ipv4/netfilter/ipt_mark.c | 7 net/ipv4/protocol.c | 1 net/ipv4/raw.c | 7 net/ipv4/route.c | 8 net/ipv4/syncookies.c | 2 net/ipv4/tcp.c | 375 --- net/ipv4/tcp_diag.c | 60 net/ipv4/tcp_input.c | 302 +- net/ipv4/tcp_ipv4.c | 910 +------- net/ipv4/tcp_minisocks.c | 313 -- net/ipv4/tcp_output.c | 99 net/ipv4/tcp_timer.c | 229 -- net/ipv4/udp.c | 3 net/ipv4/xfrm4_state.c | 2 net/ipv6/Makefile | 2 net/ipv6/addrconf.c | 4 net/ipv6/af_inet6.c | 38 net/ipv6/datagram.c | 1 net/ipv6/icmp.c | 3 net/ipv6/ip6_input.c | 15 net/ipv6/ip6_output.c | 52 net/ipv6/ipv6_sockglue.c | 15 net/ipv6/ipv6_syms.c | 3 net/ipv6/netfilter.c | 105 net/ipv6/netfilter/Kconfig | 11 net/ipv6/netfilter/Makefile | 1 net/ipv6/netfilter/ip6_queue.c | 32 net/ipv6/netfilter/ip6_tables.c | 1 net/ipv6/netfilter/ip6t_LOG.c | 93 net/ipv6/netfilter/ip6t_MARK.c | 5 net/ipv6/netfilter/ip6t_NFQUEUE.c | 70 net/ipv6/raw.c | 17 net/ipv6/tcp_ipv6.c | 254 +- net/ipv6/udp.c | 1 net/ipx/af_ipx.c | 4 net/ipx/ipx_proc.c | 2 net/irda/af_irda.c | 2 net/irda/irlap_frame.c | 8 net/irda/irmod.c | 2 net/lapb/lapb_subr.c | 2 net/llc/af_llc.c | 4 net/llc/llc_conn.c | 8 net/llc/llc_core.c | 3 net/llc/llc_if.c | 2 net/llc/llc_input.c | 4 net/llc/llc_sap.c | 2 net/netfilter/Kconfig | 24 net/netfilter/Makefile | 3 net/netfilter/nfnetlink.c | 376 +++ net/netfilter/nfnetlink_log.c | 996 ++++++++ net/netfilter/nfnetlink_queue.c | 1072 +++++++++ net/netlink/af_netlink.c | 108 net/netrom/af_netrom.c | 2 net/netrom/nr_dev.c | 2 net/netrom/nr_in.c | 2 net/netrom/nr_subr.c | 4 net/netrom/nr_timer.c | 2 net/packet/af_packet.c | 6 net/rose/af_rose.c | 2 net/rose/rose_in.c | 2 net/rose/rose_route.c | 2 net/rose/rose_subr.c | 4 net/rose/rose_timer.c | 2 net/sched/act_api.c | 7 net/sched/gact.c | 2 net/sched/ipt.c | 2 net/sched/mirred.c | 2 net/sched/pedit.c | 2 net/sched/police.c | 3 net/sched/simple.c | 2 net/sctp/ipv6.c | 7 net/sctp/protocol.c | 4 net/sctp/socket.c | 4 net/sctp/ulpqueue.c | 63 net/sunrpc/svcsock.c | 3 net/unix/af_unix.c | 2 net/unix/garbage.c | 14 net/wanrouter/af_wanpipe.c | 2 net/x25/af_x25.c | 2 net/x25/x25_dev.c | 2 net/x25/x25_in.c | 2 net/x25/x25_subr.c | 4 net/x25/x25_timer.c | 2 net/xfrm/xfrm_user.c | 4 security/selinux/netlink.c | 2 296 files changed, 17935 insertions(+), 3656 deletions(-) diff -puN Documentation/feature-removal-schedule.txt~git-net Documentation/feature-removal-schedule.txt --- devel/Documentation/feature-removal-schedule.txt~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/Documentation/feature-removal-schedule.txt 2005-08-06 23:39:09.000000000 -0700 @@ -135,3 +135,15 @@ Why: With the 16-bit PCMCIA subsystem no pcmciautils package available at http://kernel.org/pub/linux/utils/kernel/pcmcia/ Who: Dominik Brodowski + +--------------------------- + +What: ip_queue and ip6_queue (old ipv4-only and ipv6-only netfilter queue) +When: December 2005 +Why: This interface has been obsoleted by the new layer3-independent + "nfnetlink_queue". The Kernel interface is compatible, so the old + ip[6]tables "QUEUE" targets still work and will transparently handle + all packets into nfnetlink queue number 0. Userspace users will have + to link against API-compatible library on top of libnfnetlink_queue + instead of the current 'libipq'. +Who: Harald Welte diff -puN drivers/atm/nicstar.c~git-net drivers/atm/nicstar.c --- devel/drivers/atm/nicstar.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/atm/nicstar.c 2005-08-06 23:39:09.000000000 -0700 @@ -214,8 +214,7 @@ static int __devinit ns_init_card(int i, static void __devinit ns_init_card_error(ns_dev *card, int error); static scq_info *get_scq(int size, u32 scd); static void free_scq(scq_info *scq, struct atm_vcc *vcc); -static void push_rxbufs(ns_dev *card, u32 type, u32 handle1, u32 addr1, - u32 handle2, u32 addr2); +static void push_rxbufs(ns_dev *, struct sk_buff *); static irqreturn_t ns_irq_handler(int irq, void *dev_id, struct pt_regs *regs); static int ns_open(struct atm_vcc *vcc); static void ns_close(struct atm_vcc *vcc); @@ -766,6 +765,7 @@ static int __devinit ns_init_card(int i, ns_init_card_error(card, error); return error; } + NS_SKB_CB(hb)->buf_type = BUF_NONE; skb_queue_tail(&card->hbpool.queue, hb); card->hbpool.count++; } @@ -786,9 +786,10 @@ static int __devinit ns_init_card(int i, ns_init_card_error(card, error); return error; } + NS_SKB_CB(lb)->buf_type = BUF_LG; skb_queue_tail(&card->lbpool.queue, lb); skb_reserve(lb, NS_SMBUFSIZE); - push_rxbufs(card, BUF_LG, (u32) lb, (u32) virt_to_bus(lb->data), 0, 0); + push_rxbufs(card, lb); /* Due to the implementation of push_rxbufs() this is 1, not 0 */ if (j == 1) { @@ -822,9 +823,10 @@ static int __devinit ns_init_card(int i, ns_init_card_error(card, error); return error; } + NS_SKB_CB(sb)->buf_type = BUF_SM; skb_queue_tail(&card->sbpool.queue, sb); skb_reserve(sb, NS_AAL0_HEADER); - push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), 0, 0); + push_rxbufs(card, sb); } /* Test for strange behaviour which leads to crashes */ if ((bcount = ns_stat_sfbqc_get(readl(card->membase + STAT))) < card->sbnr.min) @@ -852,6 +854,7 @@ static int __devinit ns_init_card(int i, ns_init_card_error(card, error); return error; } + NS_SKB_CB(iovb)->buf_type = BUF_NONE; skb_queue_tail(&card->iovpool.queue, iovb); card->iovpool.count++; } @@ -1078,12 +1081,18 @@ static void free_scq(scq_info *scq, stru /* The handles passed must be pointers to the sk_buff containing the small or large buffer(s) cast to u32. */ -static void push_rxbufs(ns_dev *card, u32 type, u32 handle1, u32 addr1, - u32 handle2, u32 addr2) +static void push_rxbufs(ns_dev *card, struct sk_buff *skb) { + struct ns_skb_cb *cb = NS_SKB_CB(skb); + u32 handle1, addr1; + u32 handle2, addr2; u32 stat; unsigned long flags; + /* *BARF* */ + handle2 = addr2 = 0; + handle1 = (u32)skb; + addr1 = (u32)virt_to_bus(skb->data); #ifdef GENERAL_DEBUG if (!addr1) @@ -1093,7 +1102,7 @@ static void push_rxbufs(ns_dev *card, u3 stat = readl(card->membase + STAT); card->sbfqc = ns_stat_sfbqc_get(stat); card->lbfqc = ns_stat_lfbqc_get(stat); - if (type == BUF_SM) + if (cb->buf_type == BUF_SM) { if (!addr2) { @@ -1111,7 +1120,7 @@ static void push_rxbufs(ns_dev *card, u3 } } } - else /* type == BUF_LG */ + else /* buf_type == BUF_LG */ { if (!addr2) { @@ -1132,26 +1141,26 @@ static void push_rxbufs(ns_dev *card, u3 if (addr2) { - if (type == BUF_SM) + if (cb->buf_type == BUF_SM) { if (card->sbfqc >= card->sbnr.max) { - skb_unlink((struct sk_buff *) handle1); + skb_unlink((struct sk_buff *) handle1, &card->sbpool.queue); dev_kfree_skb_any((struct sk_buff *) handle1); - skb_unlink((struct sk_buff *) handle2); + skb_unlink((struct sk_buff *) handle2, &card->sbpool.queue); dev_kfree_skb_any((struct sk_buff *) handle2); return; } else card->sbfqc += 2; } - else /* (type == BUF_LG) */ + else /* (buf_type == BUF_LG) */ { if (card->lbfqc >= card->lbnr.max) { - skb_unlink((struct sk_buff *) handle1); + skb_unlink((struct sk_buff *) handle1, &card->lbpool.queue); dev_kfree_skb_any((struct sk_buff *) handle1); - skb_unlink((struct sk_buff *) handle2); + skb_unlink((struct sk_buff *) handle2, &card->lbpool.queue); dev_kfree_skb_any((struct sk_buff *) handle2); return; } @@ -1166,12 +1175,12 @@ static void push_rxbufs(ns_dev *card, u3 writel(handle2, card->membase + DR2); writel(addr1, card->membase + DR1); writel(handle1, card->membase + DR0); - writel(NS_CMD_WRITE_FREEBUFQ | (u32) type, card->membase + CMD); + writel(NS_CMD_WRITE_FREEBUFQ | cb->buf_type, card->membase + CMD); spin_unlock_irqrestore(&card->res_lock, flags); XPRINTK("nicstar%d: Pushing %s buffers at 0x%x and 0x%x.\n", card->index, - (type == BUF_SM ? "small" : "large"), addr1, addr2); + (cb->buf_type == BUF_SM ? "small" : "large"), addr1, addr2); } if (!card->efbie && card->sbfqc >= card->sbnr.min && @@ -1322,9 +1331,10 @@ static irqreturn_t ns_irq_handler(int ir card->efbie = 0; break; } + NS_SKB_CB(sb)->buf_type = BUF_SM; skb_queue_tail(&card->sbpool.queue, sb); skb_reserve(sb, NS_AAL0_HEADER); - push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), 0, 0); + push_rxbufs(card, sb); } card->sbfqc = i; process_rsq(card); @@ -1348,9 +1358,10 @@ static irqreturn_t ns_irq_handler(int ir card->efbie = 0; break; } + NS_SKB_CB(lb)->buf_type = BUF_LG; skb_queue_tail(&card->lbpool.queue, lb); skb_reserve(lb, NS_SMBUFSIZE); - push_rxbufs(card, BUF_LG, (u32) lb, (u32) virt_to_bus(lb->data), 0, 0); + push_rxbufs(card, lb); } card->lbfqc = i; process_rsq(card); @@ -2227,6 +2238,7 @@ static void dequeue_rx(ns_dev *card, ns_ recycle_rx_buf(card, skb); return; } + NS_SKB_CB(iovb)->buf_type = BUF_NONE; } else if (--card->iovpool.count < card->iovnr.min) @@ -2234,6 +2246,7 @@ static void dequeue_rx(ns_dev *card, ns_ struct sk_buff *new_iovb; if ((new_iovb = alloc_skb(NS_IOVBUFSIZE, GFP_ATOMIC)) != NULL) { + NS_SKB_CB(iovb)->buf_type = BUF_NONE; skb_queue_tail(&card->iovpool.queue, new_iovb); card->iovpool.count++; } @@ -2264,7 +2277,7 @@ static void dequeue_rx(ns_dev *card, ns_ if (NS_SKB(iovb)->iovcnt == 1) { - if (skb->list != &card->sbpool.queue) + if (NS_SKB_CB(skb)->buf_type != BUF_SM) { printk("nicstar%d: Expected a small buffer, and this is not one.\n", card->index); @@ -2278,7 +2291,7 @@ static void dequeue_rx(ns_dev *card, ns_ } else /* NS_SKB(iovb)->iovcnt >= 2 */ { - if (skb->list != &card->lbpool.queue) + if (NS_SKB_CB(skb)->buf_type != BUF_LG) { printk("nicstar%d: Expected a large buffer, and this is not one.\n", card->index); @@ -2322,8 +2335,7 @@ static void dequeue_rx(ns_dev *card, ns_ /* skb points to a small buffer */ if (!atm_charge(vcc, skb->truesize)) { - push_rxbufs(card, BUF_SM, (u32) skb, (u32) virt_to_bus(skb->data), - 0, 0); + push_rxbufs(card, skb); atomic_inc(&vcc->stats->rx_drop); } else @@ -2350,8 +2362,7 @@ static void dequeue_rx(ns_dev *card, ns_ { if (!atm_charge(vcc, sb->truesize)) { - push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), - 0, 0); + push_rxbufs(card, sb); atomic_inc(&vcc->stats->rx_drop); } else @@ -2367,16 +2378,14 @@ static void dequeue_rx(ns_dev *card, ns_ atomic_inc(&vcc->stats->rx); } - push_rxbufs(card, BUF_LG, (u32) skb, - (u32) virt_to_bus(skb->data), 0, 0); + push_rxbufs(card, skb); } else /* len > NS_SMBUFSIZE, the usual case */ { if (!atm_charge(vcc, skb->truesize)) { - push_rxbufs(card, BUF_LG, (u32) skb, - (u32) virt_to_bus(skb->data), 0, 0); + push_rxbufs(card, skb); atomic_inc(&vcc->stats->rx_drop); } else @@ -2394,8 +2403,7 @@ static void dequeue_rx(ns_dev *card, ns_ atomic_inc(&vcc->stats->rx); } - push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), - 0, 0); + push_rxbufs(card, sb); } @@ -2430,6 +2438,7 @@ static void dequeue_rx(ns_dev *card, ns_ card->hbpool.count++; } } + NS_SKB_CB(hb)->buf_type = BUF_NONE; } else if (--card->hbpool.count < card->hbnr.min) @@ -2437,6 +2446,7 @@ static void dequeue_rx(ns_dev *card, ns_ struct sk_buff *new_hb; if ((new_hb = dev_alloc_skb(NS_HBUFSIZE)) != NULL) { + NS_SKB_CB(new_hb)->buf_type = BUF_NONE; skb_queue_tail(&card->hbpool.queue, new_hb); card->hbpool.count++; } @@ -2444,6 +2454,7 @@ static void dequeue_rx(ns_dev *card, ns_ { if ((new_hb = dev_alloc_skb(NS_HBUFSIZE)) != NULL) { + NS_SKB_CB(new_hb)->buf_type = BUF_NONE; skb_queue_tail(&card->hbpool.queue, new_hb); card->hbpool.count++; } @@ -2473,8 +2484,7 @@ static void dequeue_rx(ns_dev *card, ns_ remaining = len - iov->iov_len; iov++; /* Free the small buffer */ - push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), - 0, 0); + push_rxbufs(card, sb); /* Copy all large buffers to the huge buffer and free them */ for (j = 1; j < NS_SKB(iovb)->iovcnt; j++) @@ -2485,8 +2495,7 @@ static void dequeue_rx(ns_dev *card, ns_ skb_put(hb, tocopy); iov++; remaining -= tocopy; - push_rxbufs(card, BUF_LG, (u32) lb, - (u32) virt_to_bus(lb->data), 0, 0); + push_rxbufs(card, lb); } #ifdef EXTRA_DEBUG if (remaining != 0 || hb->len != len) @@ -2527,9 +2536,10 @@ static void ns_sb_destructor(struct sk_b sb = __dev_alloc_skb(NS_SMSKBSIZE, GFP_KERNEL); if (sb == NULL) break; + NS_SKB_CB(sb)->buf_type = BUF_SM; skb_queue_tail(&card->sbpool.queue, sb); skb_reserve(sb, NS_AAL0_HEADER); - push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), 0, 0); + push_rxbufs(card, sb); } while (card->sbfqc < card->sbnr.min); } @@ -2550,9 +2560,10 @@ static void ns_lb_destructor(struct sk_b lb = __dev_alloc_skb(NS_LGSKBSIZE, GFP_KERNEL); if (lb == NULL) break; + NS_SKB_CB(lb)->buf_type = BUF_LG; skb_queue_tail(&card->lbpool.queue, lb); skb_reserve(lb, NS_SMBUFSIZE); - push_rxbufs(card, BUF_LG, (u32) lb, (u32) virt_to_bus(lb->data), 0, 0); + push_rxbufs(card, lb); } while (card->lbfqc < card->lbnr.min); } @@ -2569,6 +2580,7 @@ static void ns_hb_destructor(struct sk_b hb = __dev_alloc_skb(NS_HBUFSIZE, GFP_KERNEL); if (hb == NULL) break; + NS_SKB_CB(hb)->buf_type = BUF_NONE; skb_queue_tail(&card->hbpool.queue, hb); card->hbpool.count++; } @@ -2577,45 +2589,25 @@ static void ns_hb_destructor(struct sk_b #endif /* NS_USE_DESTRUCTORS */ - static void recycle_rx_buf(ns_dev *card, struct sk_buff *skb) { - if (skb->list == &card->sbpool.queue) - push_rxbufs(card, BUF_SM, (u32) skb, (u32) virt_to_bus(skb->data), 0, 0); - else if (skb->list == &card->lbpool.queue) - push_rxbufs(card, BUF_LG, (u32) skb, (u32) virt_to_bus(skb->data), 0, 0); - else - { - printk("nicstar%d: What kind of rx buffer is this?\n", card->index); - dev_kfree_skb_any(skb); - } -} + struct ns_skb_cb *cb = NS_SKB_CB(skb); + if (unlikely(cb->buf_type == BUF_NONE)) { + printk("nicstar%d: What kind of rx buffer is this?\n", card->index); + dev_kfree_skb_any(skb); + } else + push_rxbufs(card, skb); +} static void recycle_iovec_rx_bufs(ns_dev *card, struct iovec *iov, int count) { - struct sk_buff *skb; - - for (; count > 0; count--) - { - skb = (struct sk_buff *) (iov++)->iov_base; - if (skb->list == &card->sbpool.queue) - push_rxbufs(card, BUF_SM, (u32) skb, (u32) virt_to_bus(skb->data), - 0, 0); - else if (skb->list == &card->lbpool.queue) - push_rxbufs(card, BUF_LG, (u32) skb, (u32) virt_to_bus(skb->data), - 0, 0); - else - { - printk("nicstar%d: What kind of rx buffer is this?\n", card->index); - dev_kfree_skb_any(skb); - } - } + while (count-- > 0) + recycle_rx_buf(card, (struct sk_buff *) (iov++)->iov_base); } - static void recycle_iov_buf(ns_dev *card, struct sk_buff *iovb) { if (card->iovpool.count < card->iovnr.max) @@ -2631,7 +2623,7 @@ static void recycle_iov_buf(ns_dev *card static void dequeue_sm_buf(ns_dev *card, struct sk_buff *sb) { - skb_unlink(sb); + skb_unlink(sb, &card->sbpool.queue); #ifdef NS_USE_DESTRUCTORS if (card->sbfqc < card->sbnr.min) #else @@ -2640,10 +2632,10 @@ static void dequeue_sm_buf(ns_dev *card, struct sk_buff *new_sb; if ((new_sb = dev_alloc_skb(NS_SMSKBSIZE)) != NULL) { + NS_SKB_CB(new_sb)->buf_type = BUF_SM; skb_queue_tail(&card->sbpool.queue, new_sb); skb_reserve(new_sb, NS_AAL0_HEADER); - push_rxbufs(card, BUF_SM, (u32) new_sb, - (u32) virt_to_bus(new_sb->data), 0, 0); + push_rxbufs(card, new_sb); } } if (card->sbfqc < card->sbnr.init) @@ -2652,10 +2644,10 @@ static void dequeue_sm_buf(ns_dev *card, struct sk_buff *new_sb; if ((new_sb = dev_alloc_skb(NS_SMSKBSIZE)) != NULL) { + NS_SKB_CB(new_sb)->buf_type = BUF_SM; skb_queue_tail(&card->sbpool.queue, new_sb); skb_reserve(new_sb, NS_AAL0_HEADER); - push_rxbufs(card, BUF_SM, (u32) new_sb, - (u32) virt_to_bus(new_sb->data), 0, 0); + push_rxbufs(card, new_sb); } } } @@ -2664,7 +2656,7 @@ static void dequeue_sm_buf(ns_dev *card, static void dequeue_lg_buf(ns_dev *card, struct sk_buff *lb) { - skb_unlink(lb); + skb_unlink(lb, &card->lbpool.queue); #ifdef NS_USE_DESTRUCTORS if (card->lbfqc < card->lbnr.min) #else @@ -2673,10 +2665,10 @@ static void dequeue_lg_buf(ns_dev *card, struct sk_buff *new_lb; if ((new_lb = dev_alloc_skb(NS_LGSKBSIZE)) != NULL) { + NS_SKB_CB(new_lb)->buf_type = BUF_LG; skb_queue_tail(&card->lbpool.queue, new_lb); skb_reserve(new_lb, NS_SMBUFSIZE); - push_rxbufs(card, BUF_LG, (u32) new_lb, - (u32) virt_to_bus(new_lb->data), 0, 0); + push_rxbufs(card, new_lb); } } if (card->lbfqc < card->lbnr.init) @@ -2685,10 +2677,10 @@ static void dequeue_lg_buf(ns_dev *card, struct sk_buff *new_lb; if ((new_lb = dev_alloc_skb(NS_LGSKBSIZE)) != NULL) { + NS_SKB_CB(new_lb)->buf_type = BUF_LG; skb_queue_tail(&card->lbpool.queue, new_lb); skb_reserve(new_lb, NS_SMBUFSIZE); - push_rxbufs(card, BUF_LG, (u32) new_lb, - (u32) virt_to_bus(new_lb->data), 0, 0); + push_rxbufs(card, new_lb); } } } @@ -2880,9 +2872,10 @@ static int ns_ioctl(struct atm_dev *dev, sb = __dev_alloc_skb(NS_SMSKBSIZE, GFP_KERNEL); if (sb == NULL) return -ENOMEM; + NS_SKB_CB(sb)->buf_type = BUF_SM; skb_queue_tail(&card->sbpool.queue, sb); skb_reserve(sb, NS_AAL0_HEADER); - push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), 0, 0); + push_rxbufs(card, sb); } break; @@ -2894,9 +2887,10 @@ static int ns_ioctl(struct atm_dev *dev, lb = __dev_alloc_skb(NS_LGSKBSIZE, GFP_KERNEL); if (lb == NULL) return -ENOMEM; + NS_SKB_CB(lb)->buf_type = BUF_LG; skb_queue_tail(&card->lbpool.queue, lb); skb_reserve(lb, NS_SMBUFSIZE); - push_rxbufs(card, BUF_LG, (u32) lb, (u32) virt_to_bus(lb->data), 0, 0); + push_rxbufs(card, lb); } break; @@ -2923,6 +2917,7 @@ static int ns_ioctl(struct atm_dev *dev, hb = __dev_alloc_skb(NS_HBUFSIZE, GFP_KERNEL); if (hb == NULL) return -ENOMEM; + NS_SKB_CB(hb)->buf_type = BUF_NONE; ns_grab_int_lock(card, flags); skb_queue_tail(&card->hbpool.queue, hb); card->hbpool.count++; @@ -2953,6 +2948,7 @@ static int ns_ioctl(struct atm_dev *dev, iovb = alloc_skb(NS_IOVBUFSIZE, GFP_KERNEL); if (iovb == NULL) return -ENOMEM; + NS_SKB_CB(iovb)->buf_type = BUF_NONE; ns_grab_int_lock(card, flags); skb_queue_tail(&card->iovpool.queue, iovb); card->iovpool.count++; @@ -2979,17 +2975,12 @@ static int ns_ioctl(struct atm_dev *dev, } - static void which_list(ns_dev *card, struct sk_buff *skb) { - printk("It's a %s buffer.\n", skb->list == &card->sbpool.queue ? - "small" : skb->list == &card->lbpool.queue ? "large" : - skb->list == &card->hbpool.queue ? "huge" : - skb->list == &card->iovpool.queue ? "iovec" : "unknown"); + printk("skb buf_type: 0x%08x\n", NS_SKB_CB(skb)->buf_type); } - static void ns_poll(unsigned long arg) { int i; diff -puN drivers/atm/nicstar.h~git-net drivers/atm/nicstar.h --- devel/drivers/atm/nicstar.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/atm/nicstar.h 2005-08-06 23:39:09.000000000 -0700 @@ -103,8 +103,14 @@ #define NS_IOREMAP_SIZE 4096 -#define BUF_SM 0x00000000 /* These two are used for push_rxbufs() */ -#define BUF_LG 0x00000001 /* CMD, Write_FreeBufQ, LBUF bit */ +/* + * BUF_XX distinguish the Rx buffers depending on their (small/large) size. + * BUG_SM and BUG_LG are both used by the driver and the device. + * BUF_NONE is only used by the driver. + */ +#define BUF_SM 0x00000000 /* These two are used for push_rxbufs() */ +#define BUF_LG 0x00000001 /* CMD, Write_FreeBufQ, LBUF bit */ +#define BUF_NONE 0xffffffff /* Software only: */ #define NS_HBUFSIZE 65568 /* Size of max. AAL5 PDU */ #define NS_MAX_IOVECS (2 + (65568 - NS_SMBUFSIZE) / \ @@ -684,6 +690,12 @@ enum ns_regs /* Device driver structures ***************************************************/ +struct ns_skb_cb { + u32 buf_type; /* BUF_SM/BUF_LG/BUF_NONE */ +}; + +#define NS_SKB_CB(skb) ((struct ns_skb_cb *)((skb)->cb)) + typedef struct tsq_info { void *org; diff -puN drivers/atm/zatm.c~git-net drivers/atm/zatm.c --- devel/drivers/atm/zatm.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/atm/zatm.c 2005-08-06 23:39:09.000000000 -0700 @@ -417,10 +417,12 @@ printk("dummy: 0x%08lx, 0x%08lx\n",dummy chan = (here[3] & uPD98401_AAL5_CHAN) >> uPD98401_AAL5_CHAN_SHIFT; if (chan < zatm_dev->chans && zatm_dev->rx_map[chan]) { + int pos = ZATM_VCC(vcc)->pool; + vcc = zatm_dev->rx_map[chan]; - if (skb == zatm_dev->last_free[ZATM_VCC(vcc)->pool]) - zatm_dev->last_free[ZATM_VCC(vcc)->pool] = NULL; - skb_unlink(skb); + if (skb == zatm_dev->last_free[pos]) + zatm_dev->last_free[pos] = NULL; + skb_unlink(skb, zatm_dev->pool + pos); } else { printk(KERN_ERR DEV_LABEL "(itf %d): RX indication " diff -puN drivers/block/aoe/aoenet.c~git-net drivers/block/aoe/aoenet.c --- devel/drivers/block/aoe/aoenet.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/block/aoe/aoenet.c 2005-08-06 23:39:09.000000000 -0700 @@ -120,7 +120,7 @@ aoenet_xmit(struct sk_buff *sl) * (1) len doesn't include the header by default. I want this. */ static int -aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt) +aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev) { struct aoe_hdr *h; u32 n; diff -puN drivers/bluetooth/bfusb.c~git-net drivers/bluetooth/bfusb.c --- devel/drivers/bluetooth/bfusb.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/bluetooth/bfusb.c 2005-08-06 23:39:09.000000000 -0700 @@ -158,7 +158,7 @@ static int bfusb_send_bulk(struct bfusb if (err) { BT_ERR("%s bulk tx submit failed urb %p err %d", bfusb->hdev->name, urb, err); - skb_unlink(skb); + skb_unlink(skb, &bfusb->pending_q); usb_free_urb(urb); } else atomic_inc(&bfusb->pending_tx); @@ -212,7 +212,7 @@ static void bfusb_tx_complete(struct urb read_lock(&bfusb->lock); - skb_unlink(skb); + skb_unlink(skb, &bfusb->pending_q); skb_queue_tail(&bfusb->completed_q, skb); bfusb_tx_wakeup(bfusb); @@ -253,7 +253,7 @@ static int bfusb_rx_submit(struct bfusb if (err) { BT_ERR("%s bulk rx submit failed urb %p err %d", bfusb->hdev->name, urb, err); - skb_unlink(skb); + skb_unlink(skb, &bfusb->pending_q); kfree_skb(skb); usb_free_urb(urb); } @@ -398,7 +398,7 @@ static void bfusb_rx_complete(struct urb buf += len; } - skb_unlink(skb); + skb_unlink(skb, &bfusb->pending_q); kfree_skb(skb); bfusb_rx_submit(bfusb, urb); diff -puN drivers/char/random.c~git-net drivers/char/random.c --- devel/drivers/char/random.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/char/random.c 2005-08-06 23:39:09.000000000 -0700 @@ -1589,6 +1589,40 @@ u32 secure_tcpv6_port_ephemeral(const __ EXPORT_SYMBOL(secure_tcpv6_port_ephemeral); #endif +#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) +/* Similar to secure_tcp_sequence_number but generate a 48 bit value + * bit's 32-47 increase every key exchange + * 0-31 hash(source, dest) + */ +u64 secure_dccp_sequence_number(__u32 saddr, __u32 daddr, + __u16 sport, __u16 dport) +{ + struct timeval tv; + u64 seq; + __u32 hash[4]; + struct keydata *keyptr = get_keyptr(); + + hash[0] = saddr; + hash[1] = daddr; + hash[2] = (sport << 16) + dport; + hash[3] = keyptr->secret[11]; + + seq = half_md4_transform(hash, keyptr->secret); + seq |= ((u64)keyptr->count) << (32 - HASH_BITS); + + do_gettimeofday(&tv); + seq += tv.tv_usec + tv.tv_sec * 1000000; + seq &= (1ull << 48) - 1; +#if 0 + printk("dccp init_seq(%lx, %lx, %d, %d) = %d\n", + saddr, daddr, sport, dport, seq); +#endif + return seq; +} + +EXPORT_SYMBOL(secure_dccp_sequence_number); +#endif + #endif /* CONFIG_INET */ diff -puN drivers/ieee1394/ieee1394_core.c~git-net drivers/ieee1394/ieee1394_core.c --- devel/drivers/ieee1394/ieee1394_core.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/ieee1394/ieee1394_core.c 2005-08-06 23:39:09.000000000 -0700 @@ -681,7 +681,7 @@ static void handle_packet_response(struc return; } - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &host->pending_packet_queue); if (packet->state == hpsb_queued) { packet->sendtime = jiffies; @@ -989,7 +989,7 @@ void abort_timedouts(unsigned long __opa packet = (struct hpsb_packet *)skb->data; if (time_before(packet->sendtime + expire, jiffies)) { - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &host->pending_packet_queue); packet->state = hpsb_complete; packet->ack_code = ACKX_TIMEOUT; queue_packet_complete(packet); diff -puN drivers/isdn/act2000/capi.c~git-net drivers/isdn/act2000/capi.c --- devel/drivers/isdn/act2000/capi.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/isdn/act2000/capi.c 2005-08-06 23:39:09.000000000 -0700 @@ -606,7 +606,7 @@ handle_ack(act2000_card *card, act2000_c if ((((m->msg.data_b3_req.fakencci >> 8) & 0xff) == chan->ncci) && (m->msg.data_b3_req.blocknr == blocknr)) { /* found corresponding DATA_B3_REQ */ - skb_unlink(tmp); + skb_unlink(tmp, &card->ackq); chan->queued -= m->msg.data_b3_req.datalen; if (m->msg.data_b3_req.flags) ret = m->msg.data_b3_req.datalen; diff -puN drivers/isdn/i4l/isdn_net.c~git-net drivers/isdn/i4l/isdn_net.c --- devel/drivers/isdn/i4l/isdn_net.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/isdn/i4l/isdn_net.c 2005-08-06 23:39:09.000000000 -0700 @@ -1786,7 +1786,6 @@ isdn_net_receive(struct net_device *ndev lp->stats.rx_bytes += skb->len; } skb->dev = ndev; - skb->input_dev = ndev; skb->pkt_type = PACKET_HOST; skb->mac.raw = skb->data; #ifdef ISDN_DEBUG_NET_DUMP diff -puN drivers/isdn/i4l/isdn_ppp.c~git-net drivers/isdn/i4l/isdn_ppp.c --- devel/drivers/isdn/i4l/isdn_ppp.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/isdn/i4l/isdn_ppp.c 2005-08-06 23:39:09.000000000 -0700 @@ -1177,7 +1177,6 @@ isdn_ppp_push_higher(isdn_net_dev * net_ mlp->huptimer = 0; #endif /* CONFIG_IPPP_FILTER */ skb->dev = dev; - skb->input_dev = dev; skb->mac.raw = skb->data; netif_rx(skb); /* net_dev->local->stats.rx_packets++; done in isdn_net.c */ diff -puN drivers/net/bonding/bond_3ad.c~git-net drivers/net/bonding/bond_3ad.c --- devel/drivers/net/bonding/bond_3ad.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/net/bonding/bond_3ad.c 2005-08-06 23:39:09.000000000 -0700 @@ -2419,22 +2419,19 @@ out: return 0; } -int bond_3ad_lacpdu_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type* ptype) +int bond_3ad_lacpdu_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type* ptype, struct net_device *orig_dev) { struct bonding *bond = dev->priv; struct slave *slave = NULL; int ret = NET_RX_DROP; - if (!(dev->flags & IFF_MASTER)) { + if (!(dev->flags & IFF_MASTER)) goto out; - } read_lock(&bond->lock); - slave = bond_get_slave_by_dev((struct bonding *)dev->priv, - skb->real_dev); - if (slave == NULL) { + slave = bond_get_slave_by_dev((struct bonding *)dev->priv, orig_dev); + if (!slave) goto out_unlock; - } bond_3ad_rx_indication((struct lacpdu *) skb->data, slave, skb->len); diff -puN drivers/net/bonding/bond_3ad.h~git-net drivers/net/bonding/bond_3ad.h --- devel/drivers/net/bonding/bond_3ad.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/net/bonding/bond_3ad.h 2005-08-06 23:39:09.000000000 -0700 @@ -295,6 +295,6 @@ void bond_3ad_adapter_duplex_changed(str void bond_3ad_handle_link_change(struct slave *slave, char link); int bond_3ad_get_active_agg_info(struct bonding *bond, struct ad_info *ad_info); int bond_3ad_xmit_xor(struct sk_buff *skb, struct net_device *dev); -int bond_3ad_lacpdu_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type* ptype); +int bond_3ad_lacpdu_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type* ptype, struct net_device *orig_dev); #endif //__BOND_3AD_H__ diff -puN drivers/net/bonding/bond_alb.c~git-net drivers/net/bonding/bond_alb.c --- devel/drivers/net/bonding/bond_alb.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/net/bonding/bond_alb.c 2005-08-06 23:39:09.000000000 -0700 @@ -354,15 +354,14 @@ static void rlb_update_entry_from_arp(st _unlock_rx_hashtbl(bond); } -static int rlb_arp_recv(struct sk_buff *skb, struct net_device *bond_dev, struct packet_type *ptype) +static int rlb_arp_recv(struct sk_buff *skb, struct net_device *bond_dev, struct packet_type *ptype, struct net_device *orig_dev) { struct bonding *bond = bond_dev->priv; struct arp_pkt *arp = (struct arp_pkt *)skb->data; int res = NET_RX_DROP; - if (!(bond_dev->flags & IFF_MASTER)) { + if (!(bond_dev->flags & IFF_MASTER)) goto out; - } if (!arp) { dprintk("Packet has no ARP data\n"); diff -puN drivers/net/hamradio/bpqether.c~git-net drivers/net/hamradio/bpqether.c --- devel/drivers/net/hamradio/bpqether.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/net/hamradio/bpqether.c 2005-08-06 23:39:09.000000000 -0700 @@ -98,7 +98,7 @@ static char bcast_addr[6]={0xFF,0xFF,0xF static char bpq_eth_addr[6]; -static int bpq_rcv(struct sk_buff *, struct net_device *, struct packet_type *); +static int bpq_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); static int bpq_device_event(struct notifier_block *, unsigned long, void *); static const char *bpq_print_ethaddr(const unsigned char *); @@ -165,7 +165,7 @@ static inline int dev_is_ethdev(struct n /* * Receive an AX.25 frame via an ethernet interface. */ -static int bpq_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype) +static int bpq_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev) { int len; char * ptr; diff -puN drivers/net/ppp_generic.c~git-net drivers/net/ppp_generic.c --- devel/drivers/net/ppp_generic.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/net/ppp_generic.c 2005-08-06 23:39:09.000000000 -0700 @@ -1657,7 +1657,6 @@ ppp_receive_nonmp_frame(struct ppp *ppp, skb->dev = ppp->dev; skb->protocol = htons(npindex_to_ethertype[npi]); skb->mac.raw = skb->data; - skb->input_dev = ppp->dev; netif_rx(skb); ppp->dev->last_rx = jiffies; } diff -puN drivers/net/pppoe.c~git-net drivers/net/pppoe.c --- devel/drivers/net/pppoe.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/net/pppoe.c 2005-08-06 23:39:09.000000000 -0700 @@ -377,7 +377,8 @@ abort_kfree: ***********************************************************************/ static int pppoe_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) + struct packet_type *pt, + struct net_device *orig_dev) { struct pppoe_hdr *ph; @@ -426,7 +427,8 @@ out: ***********************************************************************/ static int pppoe_disc_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) + struct packet_type *pt, + struct net_device *orig_dev) { struct pppoe_hdr *ph; diff -puN drivers/net/rrunner.c~git-net drivers/net/rrunner.c --- devel/drivers/net/rrunner.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/net/rrunner.c 2005-08-06 23:39:09.000000000 -0700 @@ -1429,6 +1429,7 @@ static int rr_start_xmit(struct sk_buff { struct rr_private *rrpriv = netdev_priv(dev); struct rr_regs __iomem *regs = rrpriv->regs; + struct hippi_cb *hcb = (struct hippi_cb *) skb->cb; struct ring_ctrl *txctrl; unsigned long flags; u32 index, len = skb->len; @@ -1460,7 +1461,7 @@ static int rr_start_xmit(struct sk_buff ifield = (u32 *)skb_push(skb, 8); ifield[0] = 0; - ifield[1] = skb->private.ifield; + ifield[1] = hcb->ifield; /* * We don't need the lock before we are actually going to start diff -puN drivers/net/shaper.c~git-net drivers/net/shaper.c --- devel/drivers/net/shaper.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/net/shaper.c 2005-08-06 23:39:09.000000000 -0700 @@ -156,52 +156,6 @@ static int shaper_start_xmit(struct sk_b SHAPERCB(skb)->shapelen= shaper_clocks(shaper,skb); -#ifdef SHAPER_COMPLEX /* and broken.. */ - - while(ptr && ptr!=(struct sk_buff *)&shaper->sendq) - { - if(ptr->pripri - && jiffies - SHAPERCB(ptr)->shapeclock < SHAPER_MAXSLIP) - { - struct sk_buff *tmp=ptr->prev; - - /* - * It goes before us therefore we slip the length - * of the new frame. - */ - - SHAPERCB(ptr)->shapeclock+=SHAPERCB(skb)->shapelen; - SHAPERCB(ptr)->shapelatency+=SHAPERCB(skb)->shapelen; - - /* - * The packet may have slipped so far back it - * fell off. - */ - if(SHAPERCB(ptr)->shapelatency > SHAPER_LATENCY) - { - skb_unlink(ptr); - dev_kfree_skb(ptr); - } - ptr=tmp; - } - else - break; - } - if(ptr==NULL || ptr==(struct sk_buff *)&shaper->sendq) - skb_queue_head(&shaper->sendq,skb); - else - { - struct sk_buff *tmp; - /* - * Set the packet clock out time according to the - * frames ahead. Im sure a bit of thought could drop - * this loop. - */ - for(tmp=skb_peek(&shaper->sendq); tmp!=NULL && tmp!=ptr; tmp=tmp->next) - SHAPERCB(skb)->shapeclock+=tmp->shapelen; - skb_append(ptr,skb); - } -#else { struct sk_buff *tmp; /* @@ -220,7 +174,7 @@ static int shaper_start_xmit(struct sk_b } else skb_queue_tail(&shaper->sendq, skb); } -#endif + if(sh_debug) printk("Frame queued.\n"); if(skb_queue_len(&shaper->sendq)>SHAPER_QLEN) @@ -302,7 +256,7 @@ static void shaper_kick(struct shaper *s * Pull the frame and get interrupts back on. */ - skb_unlink(skb); + skb_unlink(skb, &shaper->sendq); if (shaper->recovery < SHAPERCB(skb)->shapeclock + SHAPERCB(skb)->shapelen) shaper->recovery = SHAPERCB(skb)->shapeclock + SHAPERCB(skb)->shapelen; diff -puN drivers/net/tg3.c~git-net drivers/net/tg3.c --- devel/drivers/net/tg3.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/net/tg3.c 2005-08-06 23:39:09.000000000 -0700 @@ -340,41 +340,92 @@ static struct { static void tg3_write_indirect_reg32(struct tg3 *tp, u32 off, u32 val) { - if ((tp->tg3_flags & TG3_FLAG_PCIX_TARGET_HWBUG) != 0) { - spin_lock_bh(&tp->indirect_lock); - pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off); - pci_write_config_dword(tp->pdev, TG3PCI_REG_DATA, val); - spin_unlock_bh(&tp->indirect_lock); - } else { - writel(val, tp->regs + off); - if ((tp->tg3_flags & TG3_FLAG_5701_REG_WRITE_BUG) != 0) - readl(tp->regs + off); + unsigned long flags; + + spin_lock_irqsave(&tp->indirect_lock, flags); + pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off); + pci_write_config_dword(tp->pdev, TG3PCI_REG_DATA, val); + spin_unlock_irqrestore(&tp->indirect_lock, flags); +} + +static void tg3_write_flush_reg32(struct tg3 *tp, u32 off, u32 val) +{ + writel(val, tp->regs + off); + readl(tp->regs + off); +} + +static u32 tg3_read_indirect_reg32(struct tg3 *tp, u32 off) +{ + unsigned long flags; + u32 val; + + spin_lock_irqsave(&tp->indirect_lock, flags); + pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off); + pci_read_config_dword(tp->pdev, TG3PCI_REG_DATA, &val); + spin_unlock_irqrestore(&tp->indirect_lock, flags); + return val; +} + +static void tg3_write_indirect_mbox(struct tg3 *tp, u32 off, u32 val) +{ + unsigned long flags; + + if (off == (MAILBOX_RCVRET_CON_IDX_0 + TG3_64BIT_REG_LOW)) { + pci_write_config_dword(tp->pdev, TG3PCI_RCV_RET_RING_CON_IDX + + TG3_64BIT_REG_LOW, val); + return; + } + if (off == (MAILBOX_RCV_STD_PROD_IDX + TG3_64BIT_REG_LOW)) { + pci_write_config_dword(tp->pdev, TG3PCI_STD_RING_PROD_IDX + + TG3_64BIT_REG_LOW, val); + return; + } + + spin_lock_irqsave(&tp->indirect_lock, flags); + pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off + 0x5600); + pci_write_config_dword(tp->pdev, TG3PCI_REG_DATA, val); + spin_unlock_irqrestore(&tp->indirect_lock, flags); + + /* In indirect mode when disabling interrupts, we also need + * to clear the interrupt bit in the GRC local ctrl register. + */ + if ((off == (MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW)) && + (val == 0x1)) { + pci_write_config_dword(tp->pdev, TG3PCI_MISC_LOCAL_CTRL, + tp->grc_local_ctrl|GRC_LCLCTRL_CLEARINT); } } +static u32 tg3_read_indirect_mbox(struct tg3 *tp, u32 off) +{ + unsigned long flags; + u32 val; + + spin_lock_irqsave(&tp->indirect_lock, flags); + pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off + 0x5600); + pci_read_config_dword(tp->pdev, TG3PCI_REG_DATA, &val); + spin_unlock_irqrestore(&tp->indirect_lock, flags); + return val; +} + static void _tw32_flush(struct tg3 *tp, u32 off, u32 val) { - if ((tp->tg3_flags & TG3_FLAG_PCIX_TARGET_HWBUG) != 0) { - spin_lock_bh(&tp->indirect_lock); - pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off); - pci_write_config_dword(tp->pdev, TG3PCI_REG_DATA, val); - spin_unlock_bh(&tp->indirect_lock); - } else { - void __iomem *dest = tp->regs + off; - writel(val, dest); - readl(dest); /* always flush PCI write */ - } + tp->write32(tp, off, val); + if (!(tp->tg3_flags & TG3_FLAG_PCIX_TARGET_HWBUG) && + !(tp->tg3_flags & TG3_FLAG_5701_REG_WRITE_BUG) && + !(tp->tg3_flags2 & TG3_FLG2_ICH_WORKAROUND)) + tp->read32(tp, off); /* flush */ } -static inline void _tw32_rx_mbox(struct tg3 *tp, u32 off, u32 val) +static inline void tw32_mailbox_flush(struct tg3 *tp, u32 off, u32 val) { - void __iomem *mbox = tp->regs + off; - writel(val, mbox); - if (tp->tg3_flags & TG3_FLAG_MBOX_WRITE_REORDER) - readl(mbox); + tp->write32_mbox(tp, off, val); + if (!(tp->tg3_flags & TG3_FLAG_MBOX_WRITE_REORDER) && + !(tp->tg3_flags2 & TG3_FLG2_ICH_WORKAROUND)) + tp->read32_mbox(tp, off); } -static inline void _tw32_tx_mbox(struct tg3 *tp, u32 off, u32 val) +static void tg3_write32_tx_mbox(struct tg3 *tp, u32 off, u32 val) { void __iomem *mbox = tp->regs + off; writel(val, mbox); @@ -384,46 +435,57 @@ static inline void _tw32_tx_mbox(struct readl(mbox); } -#define tw32_mailbox(reg, val) writel(((val) & 0xffffffff), tp->regs + (reg)) -#define tw32_rx_mbox(reg, val) _tw32_rx_mbox(tp, reg, val) -#define tw32_tx_mbox(reg, val) _tw32_tx_mbox(tp, reg, val) +static void tg3_write32(struct tg3 *tp, u32 off, u32 val) +{ + writel(val, tp->regs + off); +} + +static u32 tg3_read32(struct tg3 *tp, u32 off) +{ + return (readl(tp->regs + off)); +} + +#define tw32_mailbox(reg, val) tp->write32_mbox(tp, reg, val) +#define tw32_mailbox_f(reg, val) tw32_mailbox_flush(tp, (reg), (val)) +#define tw32_rx_mbox(reg, val) tp->write32_rx_mbox(tp, reg, val) +#define tw32_tx_mbox(reg, val) tp->write32_tx_mbox(tp, reg, val) +#define tr32_mailbox(reg) tp->read32_mbox(tp, reg) -#define tw32(reg,val) tg3_write_indirect_reg32(tp,(reg),(val)) +#define tw32(reg,val) tp->write32(tp, reg, val) #define tw32_f(reg,val) _tw32_flush(tp,(reg),(val)) -#define tw16(reg,val) writew(((val) & 0xffff), tp->regs + (reg)) -#define tw8(reg,val) writeb(((val) & 0xff), tp->regs + (reg)) -#define tr32(reg) readl(tp->regs + (reg)) -#define tr16(reg) readw(tp->regs + (reg)) -#define tr8(reg) readb(tp->regs + (reg)) +#define tr32(reg) tp->read32(tp, reg) static void tg3_write_mem(struct tg3 *tp, u32 off, u32 val) { - spin_lock_bh(&tp->indirect_lock); + unsigned long flags; + + spin_lock_irqsave(&tp->indirect_lock, flags); pci_write_config_dword(tp->pdev, TG3PCI_MEM_WIN_BASE_ADDR, off); pci_write_config_dword(tp->pdev, TG3PCI_MEM_WIN_DATA, val); /* Always leave this as zero. */ pci_write_config_dword(tp->pdev, TG3PCI_MEM_WIN_BASE_ADDR, 0); - spin_unlock_bh(&tp->indirect_lock); + spin_unlock_irqrestore(&tp->indirect_lock, flags); } static void tg3_read_mem(struct tg3 *tp, u32 off, u32 *val) { - spin_lock_bh(&tp->indirect_lock); + unsigned long flags; + + spin_lock_irqsave(&tp->indirect_lock, flags); pci_write_config_dword(tp->pdev, TG3PCI_MEM_WIN_BASE_ADDR, off); pci_read_config_dword(tp->pdev, TG3PCI_MEM_WIN_DATA, val); /* Always leave this as zero. */ pci_write_config_dword(tp->pdev, TG3PCI_MEM_WIN_BASE_ADDR, 0); - spin_unlock_bh(&tp->indirect_lock); + spin_unlock_irqrestore(&tp->indirect_lock, flags); } static void tg3_disable_ints(struct tg3 *tp) { tw32(TG3PCI_MISC_HOST_CTRL, (tp->misc_host_ctrl | MISC_HOST_CTRL_MASK_PCI_INT)); - tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0x00000001); - tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW); + tw32_mailbox_f(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0x00000001); } static inline void tg3_cond_int(struct tg3 *tp) @@ -439,9 +501,8 @@ static void tg3_enable_ints(struct tg3 * tw32(TG3PCI_MISC_HOST_CTRL, (tp->misc_host_ctrl & ~MISC_HOST_CTRL_MASK_PCI_INT)); - tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, - (tp->last_tag << 24)); - tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW); + tw32_mailbox_f(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, + (tp->last_tag << 24)); tg3_cond_int(tp); } @@ -472,8 +533,6 @@ static inline unsigned int tg3_has_work( */ static void tg3_restart_ints(struct tg3 *tp) { - tw32(TG3PCI_MISC_HOST_CTRL, - (tp->misc_host_ctrl & ~MISC_HOST_CTRL_MASK_PCI_INT)); tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, tp->last_tag << 24); mmiowb(); @@ -3278,9 +3337,8 @@ static irqreturn_t tg3_interrupt(int irq /* No work, shared interrupt perhaps? re-enable * interrupts, and flush that PCI write */ - tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, + tw32_mailbox_f(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0x00000000); - tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW); } } else { /* shared interrupt */ handled = 0; @@ -3323,9 +3381,8 @@ static irqreturn_t tg3_interrupt_tagged( /* no work, shared interrupt perhaps? re-enable * interrupts, and flush that PCI write */ - tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, - tp->last_tag << 24); - tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW); + tw32_mailbox_f(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, + tp->last_tag << 24); } } else { /* shared interrupt */ handled = 0; @@ -4216,7 +4273,7 @@ static void tg3_stop_fw(struct tg3 *); static int tg3_chip_reset(struct tg3 *tp) { u32 val; - u32 flags_save; + void (*write_op)(struct tg3 *, u32, u32); int i; if (!(tp->tg3_flags2 & TG3_FLG2_SUN_570X)) @@ -4228,8 +4285,9 @@ static int tg3_chip_reset(struct tg3 *tp * fun things. So, temporarily disable the 5701 * hardware workaround, while we do the reset. */ - flags_save = tp->tg3_flags; - tp->tg3_flags &= ~TG3_FLAG_5701_REG_WRITE_BUG; + write_op = tp->write32; + if (write_op == tg3_write_flush_reg32) + tp->write32 = tg3_write32; /* do the reset */ val = GRC_MISC_CFG_CORECLK_RESET; @@ -4248,8 +4306,8 @@ static int tg3_chip_reset(struct tg3 *tp val |= GRC_MISC_CFG_KEEP_GPHY_POWER; tw32(GRC_MISC_CFG, val); - /* restore 5701 hardware bug workaround flag */ - tp->tg3_flags = flags_save; + /* restore 5701 hardware bug workaround write method */ + tp->write32 = write_op; /* Unfortunately, we have to delay before the PCI read back. * Some 575X chips even will not respond to a PCI cfg access @@ -4635,7 +4693,6 @@ static int tg3_load_firmware_cpu(struct int cpu_scratch_size, struct fw_info *info) { int err, i; - u32 orig_tg3_flags = tp->tg3_flags; void (*write_op)(struct tg3 *, u32, u32); if (cpu_base == TX_CPU_BASE && @@ -4651,11 +4708,6 @@ static int tg3_load_firmware_cpu(struct else write_op = tg3_write_indirect_reg32; - /* Force use of PCI config space for indirect register - * write calls. - */ - tp->tg3_flags |= TG3_FLAG_PCIX_TARGET_HWBUG; - /* It is possible that bootcode is still loading at this point. * Get the nvram lock first before halting the cpu. */ @@ -4691,7 +4743,6 @@ static int tg3_load_firmware_cpu(struct err = 0; out: - tp->tg3_flags = orig_tg3_flags; return err; } @@ -5808,8 +5859,7 @@ static int tg3_reset_hw(struct tg3 *tp) tw32_f(GRC_LOCAL_CTRL, tp->grc_local_ctrl); udelay(100); - tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0); - tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW); + tw32_mailbox_f(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0); tp->last_tag = 0; if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) { @@ -6198,7 +6248,8 @@ static int tg3_test_interrupt(struct tg3 HOSTCC_MODE_NOW); for (i = 0; i < 5; i++) { - int_mbox = tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW); + int_mbox = tr32_mailbox(MAILBOX_INTERRUPT_0 + + TG3_64BIT_REG_LOW); if (int_mbox != 0) break; msleep(10); @@ -6598,10 +6649,10 @@ static int tg3_open(struct net_device *d /* Mailboxes */ printk("DEBUG: SNDHOST_PROD[%08x%08x] SNDNIC_PROD[%08x%08x]\n", - tr32(MAILBOX_SNDHOST_PROD_IDX_0 + 0x0), - tr32(MAILBOX_SNDHOST_PROD_IDX_0 + 0x4), - tr32(MAILBOX_SNDNIC_PROD_IDX_0 + 0x0), - tr32(MAILBOX_SNDNIC_PROD_IDX_0 + 0x4)); + tr32_mailbox(MAILBOX_SNDHOST_PROD_IDX_0 + 0x0), + tr32_mailbox(MAILBOX_SNDHOST_PROD_IDX_0 + 0x4), + tr32_mailbox(MAILBOX_SNDNIC_PROD_IDX_0 + 0x0), + tr32_mailbox(MAILBOX_SNDNIC_PROD_IDX_0 + 0x4)); /* NIC side send descriptors. */ for (i = 0; i < 6; i++) { @@ -7903,7 +7954,7 @@ static int tg3_test_loopback(struct tg3 num_pkts++; tw32_tx_mbox(MAILBOX_SNDHOST_PROD_IDX_0 + TG3_64BIT_REG_LOW, send_idx); - tr32(MAILBOX_SNDHOST_PROD_IDX_0 + TG3_64BIT_REG_LOW); + tr32_mailbox(MAILBOX_SNDHOST_PROD_IDX_0 + TG3_64BIT_REG_LOW); udelay(10); @@ -9153,14 +9204,6 @@ static int __devinit tg3_is_sun_570X(str static int __devinit tg3_get_invariants(struct tg3 *tp) { static struct pci_device_id write_reorder_chipsets[] = { - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801AA_8) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801AB_8) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801BA_11) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801BA_6) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_FE_GATE_700C) }, { }, @@ -9177,7 +9220,7 @@ static int __devinit tg3_get_invariants( tp->tg3_flags2 |= TG3_FLG2_SUN_570X; #endif - /* If we have an AMD 762 or Intel ICH/ICH0/ICH2 chipset, write + /* If we have an AMD 762 chipset, write * reordering to the mailbox registers done by the host * controller can cause major troubles. We read back from * every mailbox register write to force the writes to be @@ -9215,6 +9258,69 @@ static int __devinit tg3_get_invariants( if (tp->pci_chip_rev_id == CHIPREV_ID_5752_A0_HW) tp->pci_chip_rev_id = CHIPREV_ID_5752_A0; + /* If we have 5702/03 A1 or A2 on certain ICH chipsets, + * we need to disable memory and use config. cycles + * only to access all registers. The 5702/03 chips + * can mistakenly decode the special cycles from the + * ICH chipsets as memory write cycles, causing corruption + * of register and memory space. Only certain ICH bridges + * will drive special cycles with non-zero data during the + * address phase which can fall within the 5703's address + * range. This is not an ICH bug as the PCI spec allows + * non-zero address during special cycles. However, only + * these ICH bridges are known to drive non-zero addresses + * during special cycles. + * + * Since special cycles do not cross PCI bridges, we only + * enable this workaround if the 5703 is on the secondary + * bus of these ICH bridges. + */ + if ((tp->pci_chip_rev_id == CHIPREV_ID_5703_A1) || + (tp->pci_chip_rev_id == CHIPREV_ID_5703_A2)) { + static struct tg3_dev_id { + u32 vendor; + u32 device; + u32 rev; + } ich_chipsets[] = { + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801AA_8, + PCI_ANY_ID }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801AB_8, + PCI_ANY_ID }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801BA_11, + 0xa }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801BA_6, + PCI_ANY_ID }, + { }, + }; + struct tg3_dev_id *pci_id = &ich_chipsets[0]; + struct pci_dev *bridge = NULL; + + while (pci_id->vendor != 0) { + bridge = pci_get_device(pci_id->vendor, pci_id->device, + bridge); + if (!bridge) { + pci_id++; + continue; + } + if (pci_id->rev != PCI_ANY_ID) { + u8 rev; + + pci_read_config_byte(bridge, PCI_REVISION_ID, + &rev); + if (rev > pci_id->rev) + continue; + } + if (bridge->subordinate && + (bridge->subordinate->number == + tp->pdev->bus->number)) { + + tp->tg3_flags2 |= TG3_FLG2_ICH_WORKAROUND; + pci_dev_put(bridge); + break; + } + } + } + /* Find msi capability. */ if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5780) tp->msi_cap = pci_find_capability(tp->pdev, PCI_CAP_ID_MSI); @@ -9302,6 +9408,12 @@ static int __devinit tg3_get_invariants( } } + /* 5700 BX chips need to have their TX producer index mailboxes + * written twice to workaround a bug. + */ + if (GET_CHIP_REV(tp->pci_chip_rev_id) == CHIPREV_5700_BX) + tp->tg3_flags |= TG3_FLAG_TXD_MBOX_HWBUG; + /* Back to back register writes can cause problems on this chip, * the workaround is to read back all reg writes except those to * mailbox regs. See tg3_write_indirect_reg32(). @@ -9325,6 +9437,43 @@ static int __devinit tg3_get_invariants( pci_write_config_dword(tp->pdev, TG3PCI_PCISTATE, pci_state_reg); } + /* Default fast path register access methods */ + tp->read32 = tg3_read32; + tp->write32 = tg3_write32; + tp->read32_mbox = tg3_read32; + tp->write32_mbox = tg3_write32; + tp->write32_tx_mbox = tg3_write32; + tp->write32_rx_mbox = tg3_write32; + + /* Various workaround register access methods */ + if (tp->tg3_flags & TG3_FLAG_PCIX_TARGET_HWBUG) + tp->write32 = tg3_write_indirect_reg32; + else if (tp->tg3_flags & TG3_FLAG_5701_REG_WRITE_BUG) + tp->write32 = tg3_write_flush_reg32; + + if ((tp->tg3_flags & TG3_FLAG_TXD_MBOX_HWBUG) || + (tp->tg3_flags & TG3_FLAG_MBOX_WRITE_REORDER)) { + tp->write32_tx_mbox = tg3_write32_tx_mbox; + if (tp->tg3_flags & TG3_FLAG_MBOX_WRITE_REORDER) + tp->write32_rx_mbox = tg3_write_flush_reg32; + } + + if (tp->tg3_flags2 & TG3_FLG2_ICH_WORKAROUND) { + tp->read32 = tg3_read_indirect_reg32; + tp->write32 = tg3_write_indirect_reg32; + tp->read32_mbox = tg3_read_indirect_mbox; + tp->write32_mbox = tg3_write_indirect_mbox; + tp->write32_tx_mbox = tg3_write_indirect_mbox; + tp->write32_rx_mbox = tg3_write_indirect_mbox; + + iounmap(tp->regs); + tp->regs = 0; + + pci_read_config_word(tp->pdev, PCI_COMMAND, &pci_cmd); + pci_cmd &= ~PCI_COMMAND_MEMORY; + pci_write_config_word(tp->pdev, PCI_COMMAND, pci_cmd); + } + /* Get eeprom hw config before calling tg3_set_power_state(). * In particular, the TG3_FLAG_EEPROM_WRITE_PROT flag must be * determined before calling tg3_set_power_state() so that @@ -9539,14 +9688,6 @@ static int __devinit tg3_get_invariants( else tp->tg3_flags &= ~TG3_FLAG_POLL_SERDES; - /* 5700 BX chips need to have their TX producer index mailboxes - * written twice to workaround a bug. - */ - if (GET_CHIP_REV(tp->pci_chip_rev_id) == CHIPREV_5700_BX) - tp->tg3_flags |= TG3_FLAG_TXD_MBOX_HWBUG; - else - tp->tg3_flags &= ~TG3_FLAG_TXD_MBOX_HWBUG; - /* It seems all chips can get confused if TX buffers * straddle the 4GB address boundary in some cases. */ @@ -10469,7 +10610,10 @@ static int __devinit tg3_init_one(struct return 0; err_out_iounmap: - iounmap(tp->regs); + if (tp->regs) { + iounmap(tp->regs); + tp->regs = 0; + } err_out_free_dev: free_netdev(dev); @@ -10491,7 +10635,10 @@ static void __devexit tg3_remove_one(str struct tg3 *tp = netdev_priv(dev); unregister_netdev(dev); - iounmap(tp->regs); + if (tp->regs) { + iounmap(tp->regs); + tp->regs = 0; + } free_netdev(dev); pci_release_regions(pdev); pci_disable_device(pdev); diff -puN drivers/net/tg3.h~git-net drivers/net/tg3.h --- devel/drivers/net/tg3.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/net/tg3.h 2005-08-06 23:39:09.000000000 -0700 @@ -2049,6 +2049,11 @@ struct tg3 { spinlock_t lock; spinlock_t indirect_lock; + u32 (*read32) (struct tg3 *, u32); + void (*write32) (struct tg3 *, u32, u32); + u32 (*read32_mbox) (struct tg3 *, u32); + void (*write32_mbox) (struct tg3 *, u32, + u32); void __iomem *regs; struct net_device *dev; struct pci_dev *pdev; @@ -2060,6 +2065,8 @@ struct tg3 { u32 msg_enable; /* begin "tx thread" cacheline section */ + void (*write32_tx_mbox) (struct tg3 *, u32, + u32); u32 tx_prod; u32 tx_cons; u32 tx_pending; @@ -2071,6 +2078,8 @@ struct tg3 { dma_addr_t tx_desc_mapping; /* begin "rx thread" cacheline section */ + void (*write32_rx_mbox) (struct tg3 *, u32, + u32); u32 rx_rcb_ptr; u32 rx_std_ptr; u32 rx_jumbo_ptr; @@ -2165,6 +2174,7 @@ struct tg3 { #define TG3_FLG2_ANY_SERDES (TG3_FLG2_PHY_SERDES | \ TG3_FLG2_MII_SERDES) #define TG3_FLG2_PARALLEL_DETECT 0x01000000 +#define TG3_FLG2_ICH_WORKAROUND 0x02000000 u32 split_mode_max_reqs; #define SPLIT_MODE_5704_MAX_REQ 3 diff -puN drivers/net/wan/hdlc_generic.c~git-net drivers/net/wan/hdlc_generic.c --- devel/drivers/net/wan/hdlc_generic.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/net/wan/hdlc_generic.c 2005-08-06 23:39:09.000000000 -0700 @@ -61,7 +61,7 @@ static struct net_device_stats *hdlc_get static int hdlc_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *p) + struct packet_type *p, struct net_device *orig_dev) { hdlc_device *hdlc = dev_to_hdlc(dev); if (hdlc->proto.netif_rx) diff -puN drivers/net/wan/lapbether.c~git-net drivers/net/wan/lapbether.c --- devel/drivers/net/wan/lapbether.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/net/wan/lapbether.c 2005-08-06 23:39:09.000000000 -0700 @@ -86,7 +86,7 @@ static __inline__ int dev_is_ethdev(stru /* * Receive a LAPB frame via an ethernet interface. */ -static int lapbeth_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype) +static int lapbeth_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev) { int len, err; struct lapbethdev *lapbeth; diff -puN drivers/net/wan/sdla_fr.c~git-net drivers/net/wan/sdla_fr.c --- devel/drivers/net/wan/sdla_fr.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/net/wan/sdla_fr.c 2005-08-06 23:39:09.000000000 -0700 @@ -445,7 +445,7 @@ void s508_s514_unlock(sdla_t *card, uns void s508_s514_lock(sdla_t *card, unsigned long *smp_flags); unsigned short calc_checksum (char *, int); -static int setup_fr_header(struct sk_buff** skb, +static int setup_fr_header(struct sk_buff *skb, struct net_device* dev, char op_mode); @@ -1372,7 +1372,7 @@ static int if_send(struct sk_buff* skb, /* Move the if_header() code to here. By inserting frame * relay header in if_header() we would break the * tcpdump and other packet sniffers */ - chan->fr_header_len = setup_fr_header(&skb,dev,chan->common.usedby); + chan->fr_header_len = setup_fr_header(skb,dev,chan->common.usedby); if (chan->fr_header_len < 0 ){ ++chan->ifstats.tx_dropped; ++card->wandev.stats.tx_dropped; @@ -1597,8 +1597,6 @@ static int setup_for_delayed_transmit(st return 1; } - skb_unlink(skb); - chan->transmit_length = len; chan->delay_skb = skb; @@ -4871,18 +4869,15 @@ static void unconfig_fr (sdla_t *card) } } -static int setup_fr_header(struct sk_buff **skb_orig, struct net_device* dev, +static int setup_fr_header(struct sk_buff *skb, struct net_device* dev, char op_mode) { - struct sk_buff *skb = *skb_orig; fr_channel_t *chan=dev->priv; - if (op_mode == WANPIPE){ - + if (op_mode == WANPIPE) { chan->fr_header[0]=Q922_UI; switch (htons(skb->protocol)){ - case ETH_P_IP: chan->fr_header[1]=NLPID_IP; break; @@ -4894,16 +4889,14 @@ static int setup_fr_header(struct sk_buf } /* If we are in bridging mode, we must apply - * an Ethernet header */ - if (op_mode == BRIDGE || op_mode == BRIDGE_NODE){ - - + * an Ethernet header + */ + if (op_mode == BRIDGE || op_mode == BRIDGE_NODE) { /* Encapsulate the packet as a bridged Ethernet frame. */ #ifdef DEBUG printk(KERN_INFO "%s: encapsulating skb for frame relay\n", dev->name); #endif - chan->fr_header[0] = 0x03; chan->fr_header[1] = 0x00; chan->fr_header[2] = 0x80; @@ -4916,7 +4909,6 @@ static int setup_fr_header(struct sk_buf /* Yuck. */ skb->protocol = ETH_P_802_3; return 8; - } return 0; diff -puN drivers/net/wan/syncppp.c~git-net drivers/net/wan/syncppp.c --- devel/drivers/net/wan/syncppp.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/net/wan/syncppp.c 2005-08-06 23:39:09.000000000 -0700 @@ -1447,7 +1447,7 @@ static void sppp_print_bytes (u_char *p, * after interrupt servicing to process frames queued via netif_rx. */ -static int sppp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *p) +static int sppp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *p, struct net_device *orig_dev) { if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) return NET_RX_DROP; diff -puN drivers/usb/net/usbnet.c~git-net drivers/usb/net/usbnet.c --- devel/drivers/usb/net/usbnet.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/usb/net/usbnet.c 2005-08-06 23:39:09.000000000 -0700 @@ -2903,19 +2903,18 @@ static struct net_device_stats *usbnet_g * completion callbacks. 2.5 should have fixed those bugs... */ -static void defer_bh (struct usbnet *dev, struct sk_buff *skb) +static void defer_bh(struct usbnet *dev, struct sk_buff *skb, struct sk_buff_head *list) { - struct sk_buff_head *list = skb->list; unsigned long flags; - spin_lock_irqsave (&list->lock, flags); - __skb_unlink (skb, list); - spin_unlock (&list->lock); - spin_lock (&dev->done.lock); - __skb_queue_tail (&dev->done, skb); + spin_lock_irqsave(&list->lock, flags); + __skb_unlink(skb, list); + spin_unlock(&list->lock); + spin_lock(&dev->done.lock); + __skb_queue_tail(&dev->done, skb); if (dev->done.qlen == 1) - tasklet_schedule (&dev->bh); - spin_unlock_irqrestore (&dev->done.lock, flags); + tasklet_schedule(&dev->bh); + spin_unlock_irqrestore(&dev->done.lock, flags); } /* some work can't be done in tasklets, so we use keventd @@ -3120,7 +3119,7 @@ block: break; } - defer_bh (dev, skb); + defer_bh(dev, skb, &dev->rxq); if (urb) { if (netif_running (dev->net) @@ -3490,7 +3489,7 @@ static void tx_complete (struct urb *urb urb->dev = NULL; entry->state = tx_done; - defer_bh (dev, skb); + defer_bh(dev, skb, &dev->txq); } /*-------------------------------------------------------------------------*/ diff -puN drivers/w1/w1_int.c~git-net drivers/w1/w1_int.c --- devel/drivers/w1/w1_int.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/drivers/w1/w1_int.c 2005-08-06 23:39:09.000000000 -0700 @@ -221,3 +221,5 @@ void w1_remove_master_device(struct w1_b EXPORT_SYMBOL(w1_add_master_device); EXPORT_SYMBOL(w1_remove_master_device); + +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_W1); diff -puN fs/smbfs/sock.c~git-net fs/smbfs/sock.c --- devel/fs/smbfs/sock.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/fs/smbfs/sock.c 2005-08-06 23:39:09.000000000 -0700 @@ -15,12 +15,12 @@ #include #include #include -#include #include #include #include #include #include +#include #include #include diff -puN include/asm-alpha/socket.h~git-net include/asm-alpha/socket.h --- devel/include/asm-alpha/socket.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/asm-alpha/socket.h 2005-08-06 23:39:09.000000000 -0700 @@ -25,6 +25,8 @@ #define SO_ERROR 0x1007 #define SO_SNDBUF 0x1001 #define SO_RCVBUF 0x1002 +#define SO_SNDBUFFORCE 0x100a +#define SO_RCVBUFFORCE 0x100b #define SO_RCVLOWAT 0x1010 #define SO_SNDLOWAT 0x1011 #define SO_RCVTIMEO 0x1012 diff -puN include/asm-arm26/socket.h~git-net include/asm-arm26/socket.h --- devel/include/asm-arm26/socket.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/asm-arm26/socket.h 2005-08-06 23:39:09.000000000 -0700 @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff -puN include/asm-arm/socket.h~git-net include/asm-arm/socket.h --- devel/include/asm-arm/socket.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/asm-arm/socket.h 2005-08-06 23:39:09.000000000 -0700 @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff -puN include/asm-cris/socket.h~git-net include/asm-cris/socket.h --- devel/include/asm-cris/socket.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/asm-cris/socket.h 2005-08-06 23:39:09.000000000 -0700 @@ -16,6 +16,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff -puN include/asm-frv/socket.h~git-net include/asm-frv/socket.h --- devel/include/asm-frv/socket.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/asm-frv/socket.h 2005-08-06 23:39:09.000000000 -0700 @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff -puN include/asm-h8300/socket.h~git-net include/asm-h8300/socket.h --- devel/include/asm-h8300/socket.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/asm-h8300/socket.h 2005-08-06 23:39:09.000000000 -0700 @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff -puN include/asm-i386/socket.h~git-net include/asm-i386/socket.h --- devel/include/asm-i386/socket.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/asm-i386/socket.h 2005-08-06 23:39:09.000000000 -0700 @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff -puN include/asm-ia64/socket.h~git-net include/asm-ia64/socket.h --- devel/include/asm-ia64/socket.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/asm-ia64/socket.h 2005-08-06 23:39:09.000000000 -0700 @@ -23,6 +23,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff -puN include/asm-m32r/socket.h~git-net include/asm-m32r/socket.h --- devel/include/asm-m32r/socket.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/asm-m32r/socket.h 2005-08-06 23:39:09.000000000 -0700 @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff -puN include/asm-m68k/socket.h~git-net include/asm-m68k/socket.h --- devel/include/asm-m68k/socket.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/asm-m68k/socket.h 2005-08-06 23:39:09.000000000 -0700 @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff -puN include/asm-mips/socket.h~git-net include/asm-mips/socket.h --- devel/include/asm-mips/socket.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/asm-mips/socket.h 2005-08-06 23:39:09.000000000 -0700 @@ -37,6 +37,8 @@ To add: #define SO_REUSEPORT 0x0200 /* A #define SO_ERROR 0x1007 /* get error status and clear */ #define SO_SNDBUF 0x1001 /* Send buffer size. */ #define SO_RCVBUF 0x1002 /* Receive buffer. */ +#define SO_SNDBUFFORCE 0x100a +#define SO_RCVBUFFORCE 0x100b #define SO_SNDLOWAT 0x1003 /* send low-water mark */ #define SO_RCVLOWAT 0x1004 /* receive low-water mark */ #define SO_SNDTIMEO 0x1005 /* send timeout */ diff -puN include/asm-parisc/socket.h~git-net include/asm-parisc/socket.h --- devel/include/asm-parisc/socket.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/asm-parisc/socket.h 2005-08-06 23:39:09.000000000 -0700 @@ -16,6 +16,8 @@ /* To add :#define SO_REUSEPORT 0x0200 */ #define SO_SNDBUF 0x1001 #define SO_RCVBUF 0x1002 +#define SO_SNDBUFFORCE 0x100a +#define SO_RCVBUFFORCE 0x100b #define SO_SNDLOWAT 0x1003 #define SO_RCVLOWAT 0x1004 #define SO_SNDTIMEO 0x1005 diff -puN include/asm-ppc64/socket.h~git-net include/asm-ppc64/socket.h --- devel/include/asm-ppc64/socket.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/asm-ppc64/socket.h 2005-08-06 23:39:09.000000000 -0700 @@ -21,6 +21,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff -puN include/asm-ppc/socket.h~git-net include/asm-ppc/socket.h --- devel/include/asm-ppc/socket.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/asm-ppc/socket.h 2005-08-06 23:39:09.000000000 -0700 @@ -20,6 +20,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff -puN include/asm-s390/socket.h~git-net include/asm-s390/socket.h --- devel/include/asm-s390/socket.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/asm-s390/socket.h 2005-08-06 23:39:09.000000000 -0700 @@ -22,6 +22,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff -puN include/asm-sh/socket.h~git-net include/asm-sh/socket.h --- devel/include/asm-sh/socket.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/asm-sh/socket.h 2005-08-06 23:39:09.000000000 -0700 @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_RCVBUFFORCE 32 +#define SO_SNDBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff -puN include/asm-sparc64/socket.h~git-net include/asm-sparc64/socket.h --- devel/include/asm-sparc64/socket.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/asm-sparc64/socket.h 2005-08-06 23:39:09.000000000 -0700 @@ -29,6 +29,8 @@ #define SO_SNDBUF 0x1001 #define SO_RCVBUF 0x1002 +#define SO_SNDBUFFORCE 0x100a +#define SO_RCVBUFFORCE 0x100b #define SO_ERROR 0x1007 #define SO_TYPE 0x1008 diff -puN include/asm-sparc/socket.h~git-net include/asm-sparc/socket.h --- devel/include/asm-sparc/socket.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/asm-sparc/socket.h 2005-08-06 23:39:09.000000000 -0700 @@ -29,6 +29,8 @@ #define SO_SNDBUF 0x1001 #define SO_RCVBUF 0x1002 +#define SO_SNDBUFFORCE 0x100a +#define SO_RCVBUFFORCE 0x100b #define SO_ERROR 0x1007 #define SO_TYPE 0x1008 diff -puN include/asm-v850/socket.h~git-net include/asm-v850/socket.h --- devel/include/asm-v850/socket.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/asm-v850/socket.h 2005-08-06 23:39:09.000000000 -0700 @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff -puN include/asm-x86_64/socket.h~git-net include/asm-x86_64/socket.h --- devel/include/asm-x86_64/socket.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/asm-x86_64/socket.h 2005-08-06 23:39:09.000000000 -0700 @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff -puN include/asm-xtensa/socket.h~git-net include/asm-xtensa/socket.h --- devel/include/asm-xtensa/socket.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/asm-xtensa/socket.h 2005-08-06 23:39:09.000000000 -0700 @@ -24,6 +24,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff -puN /dev/null include/linux/dccp.h --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/include/linux/dccp.h 2005-08-06 23:39:09.000000000 -0700 @@ -0,0 +1,431 @@ +#ifndef _LINUX_DCCP_H +#define _LINUX_DCCP_H + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* FIXME: this is utterly wrong */ +struct sockaddr_dccp { + struct sockaddr_in in; + unsigned int service; +}; + +enum dccp_state { + DCCP_OPEN = TCP_ESTABLISHED, + DCCP_REQUESTING = TCP_SYN_SENT, + DCCP_PARTOPEN = TCP_FIN_WAIT1, /* FIXME: + This mapping is horrible, but TCP has + no matching state for DCCP_PARTOPEN, + as TCP_SYN_RECV is already used by + DCCP_RESPOND, why don't stop using TCP + mapping of states? OK, now we don't use + sk_stream_sendmsg anymore, so doesn't + seem to exist any reason for us to + do the TCP mapping here */ + DCCP_LISTEN = TCP_LISTEN, + DCCP_RESPOND = TCP_SYN_RECV, + DCCP_CLOSING = TCP_CLOSING, + DCCP_TIME_WAIT = TCP_TIME_WAIT, + DCCP_CLOSED = TCP_CLOSE, + DCCP_MAX_STATES = TCP_MAX_STATES, +}; + +#define DCCP_STATE_MASK 0xf +#define DCCP_ACTION_FIN (1<<7) + +enum { + DCCPF_OPEN = TCPF_ESTABLISHED, + DCCPF_REQUESTING = TCPF_SYN_SENT, + DCCPF_PARTOPEN = TCPF_FIN_WAIT1, + DCCPF_LISTEN = TCPF_LISTEN, + DCCPF_RESPOND = TCPF_SYN_RECV, + DCCPF_CLOSING = TCPF_CLOSING, + DCCPF_TIME_WAIT = TCPF_TIME_WAIT, + DCCPF_CLOSED = TCPF_CLOSE, +}; + +/** + * struct dccp_hdr - generic part of DCCP packet header + * + * @dccph_sport - Relevant port on the endpoint that sent this packet + * @dccph_dport - Relevant port on the other endpoint + * @dccph_doff - Data Offset from the start of the DCCP header, in 32-bit words + * @dccph_ccval - Used by the HC-Sender CCID + * @dccph_cscov - Parts of the packet that are covered by the Checksum field + * @dccph_checksum - Internet checksum, depends on dccph_cscov + * @dccph_x - 0 = 24 bit sequence number, 1 = 48 + * @dccph_type - packet type, see DCCP_PKT_ prefixed macros + * @dccph_seq - sequence number high or low order 24 bits, depends on dccph_x + */ +struct dccp_hdr { + __u16 dccph_sport, + dccph_dport; + __u8 dccph_doff; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 dccph_cscov:4, + dccph_ccval:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 dccph_ccval:4, + dccph_cscov:4; +#else +#error "Adjust your defines" +#endif + __u16 dccph_checksum; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u32 dccph_x:1, + dccph_type:4, + dccph_reserved:3, + dccph_seq:24; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u32 dccph_reserved:3, + dccph_type:4, + dccph_x:1, + dccph_seq:24; +#else +#error "Adjust your defines" +#endif +}; + +static inline struct dccp_hdr *dccp_hdr(const struct sk_buff *skb) +{ + return (struct dccp_hdr *)skb->h.raw; +} + +/** + * struct dccp_hdr_ext - the low bits of a 48 bit seq packet + * + * @dccph_seq_low - low 24 bits of a 48 bit seq packet + */ +struct dccp_hdr_ext { + __u32 dccph_seq_low; +}; + +static inline struct dccp_hdr_ext *dccp_hdrx(const struct sk_buff *skb) +{ + return (struct dccp_hdr_ext *)(skb->h.raw + sizeof(struct dccp_hdr)); +} + +static inline unsigned int dccp_basic_hdr_len(const struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); + return sizeof(*dh) + (dh->dccph_x ? sizeof(struct dccp_hdr_ext) : 0); +} + +static inline __u64 dccp_hdr_seq(const struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 seq_nr = ntohl(dh->dccph_seq << 8); +#elif defined(__BIG_ENDIAN_BITFIELD) + __u64 seq_nr = ntohl(dh->dccph_seq); +#else +#error "Adjust your defines" +#endif + + if (dh->dccph_x != 0) + seq_nr = (seq_nr << 32) + ntohl(dccp_hdrx(skb)->dccph_seq_low); + + return seq_nr; +} + +/** + * struct dccp_hdr_request - Conection initiation request header + * + * @dccph_req_service - Service to which the client app wants to connect + * @dccph_req_options - list of options (must be a multiple of 32 bits + */ +struct dccp_hdr_request { + __u32 dccph_req_service; +}; + +static inline struct dccp_hdr_request *dccp_hdr_request(struct sk_buff *skb) +{ + return (struct dccp_hdr_request *)(skb->h.raw + dccp_basic_hdr_len(skb)); +} + +/** + * struct dccp_hdr_ack_bits - acknowledgment bits common to most packets + * + * @dccph_resp_ack_nr_high - 48 bit ack number high order bits, contains GSR + * @dccph_resp_ack_nr_low - 48 bit ack number low order bits, contains GSR + */ +struct dccp_hdr_ack_bits { + __u32 dccph_reserved1:8, + dccph_ack_nr_high:24; + __u32 dccph_ack_nr_low; +}; + +static inline struct dccp_hdr_ack_bits *dccp_hdr_ack_bits(const struct sk_buff *skb) +{ + return (struct dccp_hdr_ack_bits *)(skb->h.raw + dccp_basic_hdr_len(skb)); +} + +static inline u64 dccp_hdr_ack_seq(const struct sk_buff *skb) +{ + const struct dccp_hdr_ack_bits *dhack = dccp_hdr_ack_bits(skb); +#if defined(__LITTLE_ENDIAN_BITFIELD) + return (((u64)ntohl(dhack->dccph_ack_nr_high << 8)) << 32) + ntohl(dhack->dccph_ack_nr_low); +#elif defined(__BIG_ENDIAN_BITFIELD) + return (((u64)ntohl(dhack->dccph_ack_nr_high)) << 32) + ntohl(dhack->dccph_ack_nr_low); +#else +#error "Adjust your defines" +#endif +} + +/** + * struct dccp_hdr_response - Conection initiation response header + * + * @dccph_resp_ack_nr_high - 48 bit ack number high order bits, contains GSR + * @dccph_resp_ack_nr_low - 48 bit ack number low order bits, contains GSR + * @dccph_resp_service - Echoes the Service Code on a received DCCP-Request + * @dccph_resp_options - list of options (must be a multiple of 32 bits + */ +struct dccp_hdr_response { + struct dccp_hdr_ack_bits dccph_resp_ack; + __u32 dccph_resp_service; +}; + +static inline struct dccp_hdr_response *dccp_hdr_response(struct sk_buff *skb) +{ + return (struct dccp_hdr_response *)(skb->h.raw + dccp_basic_hdr_len(skb)); +} + +/** + * struct dccp_hdr_reset - Unconditionally shut down a connection + * + * @dccph_reset_service - Echoes the Service Code on a received DCCP-Request + * @dccph_reset_options - list of options (must be a multiple of 32 bits + */ +struct dccp_hdr_reset { + struct dccp_hdr_ack_bits dccph_reset_ack; + __u8 dccph_reset_code, + dccph_reset_data[3]; +}; + +static inline struct dccp_hdr_reset *dccp_hdr_reset(struct sk_buff *skb) +{ + return (struct dccp_hdr_reset *)(skb->h.raw + dccp_basic_hdr_len(skb)); +} + +enum dccp_pkt_type { + DCCP_PKT_REQUEST = 0, + DCCP_PKT_RESPONSE, + DCCP_PKT_DATA, + DCCP_PKT_ACK, + DCCP_PKT_DATAACK, + DCCP_PKT_CLOSEREQ, + DCCP_PKT_CLOSE, + DCCP_PKT_RESET, + DCCP_PKT_SYNC, + DCCP_PKT_SYNCACK, + DCCP_PKT_INVALID, +}; + +#define DCCP_NR_PKT_TYPES DCCP_PKT_INVALID + +static inline unsigned int dccp_packet_hdr_len(const __u8 type) +{ + if (type == DCCP_PKT_DATA) + return 0; + if (type == DCCP_PKT_DATAACK || + type == DCCP_PKT_ACK || + type == DCCP_PKT_SYNC || + type == DCCP_PKT_SYNCACK || + type == DCCP_PKT_CLOSE || + type == DCCP_PKT_CLOSEREQ) + return sizeof(struct dccp_hdr_ack_bits); + if (type == DCCP_PKT_REQUEST) + return sizeof(struct dccp_hdr_request); + if (type == DCCP_PKT_RESPONSE) + return sizeof(struct dccp_hdr_response); + return sizeof(struct dccp_hdr_reset); +} + +static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) +{ + return dccp_basic_hdr_len(skb) + + dccp_packet_hdr_len(dccp_hdr(skb)->dccph_type); +} + +enum dccp_reset_codes { + DCCP_RESET_CODE_UNSPECIFIED = 0, + DCCP_RESET_CODE_CLOSED, + DCCP_RESET_CODE_ABORTED, + DCCP_RESET_CODE_NO_CONNECTION, + DCCP_RESET_CODE_PACKET_ERROR, + DCCP_RESET_CODE_OPTION_ERROR, + DCCP_RESET_CODE_MANDATORY_ERROR, + DCCP_RESET_CODE_CONNECTION_REFUSED, + DCCP_RESET_CODE_BAD_SERVICE_CODE, + DCCP_RESET_CODE_TOO_BUSY, + DCCP_RESET_CODE_BAD_INIT_COOKIE, + DCCP_RESET_CODE_AGGRESSION_PENALTY, +}; + +/* DCCP options */ +enum { + DCCPO_PADDING = 0, + DCCPO_MANDATORY = 1, + DCCPO_MIN_RESERVED = 3, + DCCPO_MAX_RESERVED = 31, + DCCPO_NDP_COUNT = 37, + DCCPO_ACK_VECTOR_0 = 38, + DCCPO_ACK_VECTOR_1 = 39, + DCCPO_TIMESTAMP = 41, + DCCPO_TIMESTAMP_ECHO = 42, + DCCPO_ELAPSED_TIME = 43, + DCCPO_MAX = 45, + DCCPO_MIN_CCID_SPECIFIC = 128, + DCCPO_MAX_CCID_SPECIFIC = 255, +}; + +/* DCCP features */ +enum { + DCCPF_RESERVED = 0, + DCCPF_SEQUENCE_WINDOW = 3, + DCCPF_SEND_ACK_VECTOR = 6, + DCCPF_SEND_NDP_COUNT = 7, + /* 10-127 reserved */ + DCCPF_MIN_CCID_SPECIFIC = 128, + DCCPF_MAX_CCID_SPECIFIC = 255, +}; + +/* initial values for each feature */ +#define DCCPF_INITIAL_SEQUENCE_WINDOW 100 +/* FIXME: for now we're using CCID 3 (TFRC) */ +#define DCCPF_INITIAL_CCID 3 +#define DCCPF_INITIAL_SEND_ACK_VECTOR 0 +/* FIXME: for now we're default to 1 but it should really be 0 */ +#define DCCPF_INITIAL_SEND_NDP_COUNT 1 + +#define DCCP_NDP_LIMIT 0xFFFFFF + +/** + * struct dccp_options - option values for a DCCP connection + * @dccpo_sequence_window - Sequence Window Feature (section 7.5.2) + * @dccpo_ccid - Congestion Control Id (CCID) (section 10) + * @dccpo_send_ack_vector - Send Ack Vector Feature (section 11.5) + * @dccpo_send_ndp_count - Send NDP Count Feature (7.7.2) + */ +struct dccp_options { + __u64 dccpo_sequence_window; + __u8 dccpo_ccid; + __u8 dccpo_send_ack_vector; + __u8 dccpo_send_ndp_count; +}; + +extern void __dccp_options_init(struct dccp_options *dccpo); +extern void dccp_options_init(struct dccp_options *dccpo); +extern int dccp_parse_options(struct sock *sk, struct sk_buff *skb); + +struct dccp_request_sock { + struct inet_request_sock dreq_inet_rsk; + __u64 dreq_iss; + __u64 dreq_isr; + __u32 dreq_service; +}; + +static inline struct dccp_request_sock *dccp_rsk(const struct request_sock *req) +{ + return (struct dccp_request_sock *)req; +} + +/* Read about the ECN nonce to see why it is 253 */ +#define DCCP_MAX_ACK_VECTOR_LEN 253 + +struct dccp_options_received { + u32 dccpor_ndp:24, + dccpor_ack_vector_len:8; + u32 dccpor_ack_vector_idx:10; + /* 22 bits hole, try to pack */ + u32 dccpor_timestamp; + u32 dccpor_timestamp_echo; + u32 dccpor_elapsed_time; +}; + +struct ccid; + +enum dccp_role { + DCCP_ROLE_UNDEFINED, + DCCP_ROLE_LISTEN, + DCCP_ROLE_CLIENT, + DCCP_ROLE_SERVER, +}; + +/** + * struct dccp_sock - DCCP socket state + * + * @dccps_swl - sequence number window low + * @dccps_swh - sequence number window high + * @dccps_awl - acknowledgement number window low + * @dccps_awh - acknowledgement number window high + * @dccps_iss - initial sequence number sent + * @dccps_isr - initial sequence number received + * @dccps_osr - first OPEN sequence number received + * @dccps_gss - greatest sequence number sent + * @dccps_gsr - greatest valid sequence number received + * @dccps_gar - greatest valid ack number received on a non-Sync; initialized to %dccps_iss + * @dccps_timestamp_time - time of latest TIMESTAMP option + * @dccps_timestamp_echo - latest timestamp received on a TIMESTAMP option + * @dccps_ext_header_len - network protocol overhead (IP/IPv6 options) + * @dccps_pmtu_cookie - Last pmtu seen by socket + * @dccps_avg_packet_size - FIXME: has to be set by the app thru some setsockopt or ioctl, CCID3 uses it + * @dccps_role - Role of this sock, one of %dccp_role + * @dccps_ndp_count - number of Non Data Packets since last data packet + * @dccps_hc_rx_ackpkts - receiver half connection acked packets + */ +struct dccp_sock { + /* inet_connection_sock has to be the first member of dccp_sock */ + struct inet_connection_sock dccps_inet_connection; + __u64 dccps_swl; + __u64 dccps_swh; + __u64 dccps_awl; + __u64 dccps_awh; + __u64 dccps_iss; + __u64 dccps_isr; + __u64 dccps_osr; + __u64 dccps_gss; + __u64 dccps_gsr; + __u64 dccps_gar; + unsigned long dccps_service; + unsigned long dccps_timestamp_time; + __u32 dccps_timestamp_echo; + __u32 dccps_avg_packet_size; + unsigned long dccps_ndp_count; + __u16 dccps_ext_header_len; + __u32 dccps_pmtu_cookie; + __u32 dccps_mss_cache; + struct dccp_options dccps_options; + struct dccp_ackpkts *dccps_hc_rx_ackpkts; + void *dccps_hc_rx_ccid_private; + void *dccps_hc_tx_ccid_private; + struct ccid *dccps_hc_rx_ccid; + struct ccid *dccps_hc_tx_ccid; + struct dccp_options_received dccps_options_received; + enum dccp_role dccps_role:2; +}; + +static inline struct dccp_sock *dccp_sk(const struct sock *sk) +{ + return (struct dccp_sock *)sk; +} + +static inline const char *dccp_role(const struct sock *sk) +{ + switch (dccp_sk(sk)->dccps_role) { + case DCCP_ROLE_UNDEFINED: return "undefined"; + case DCCP_ROLE_LISTEN: return "listen"; + case DCCP_ROLE_SERVER: return "server"; + case DCCP_ROLE_CLIENT: return "client"; + } +} + +#endif /* _LINUX_DCCP_H */ diff -puN include/linux/hippidevice.h~git-net include/linux/hippidevice.h --- devel/include/linux/hippidevice.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/linux/hippidevice.h 2005-08-06 23:39:09.000000000 -0700 @@ -26,6 +26,11 @@ #include #ifdef __KERNEL__ + +struct hippi_cb { + __u32 ifield; +}; + extern unsigned short hippi_type_trans(struct sk_buff *skb, struct net_device *dev); diff -puN include/linux/if_vlan.h~git-net include/linux/if_vlan.h --- devel/include/linux/if_vlan.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/linux/if_vlan.h 2005-08-06 23:39:09.000000000 -0700 @@ -155,7 +155,6 @@ static inline int __vlan_hwaccel_rx(stru { struct net_device_stats *stats; - skb->real_dev = skb->dev; skb->dev = grp->vlan_devices[vlan_tag & VLAN_VID_MASK]; if (skb->dev == NULL) { dev_kfree_skb_any(skb); diff -puN include/linux/in.h~git-net include/linux/in.h --- devel/include/linux/in.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/linux/in.h 2005-08-06 23:39:09.000000000 -0700 @@ -32,6 +32,7 @@ enum { IPPROTO_PUP = 12, /* PUP protocol */ IPPROTO_UDP = 17, /* User Datagram Protocol */ IPPROTO_IDP = 22, /* XNS IDP protocol */ + IPPROTO_DCCP = 33, /* Datagram Congestion Control Protocol */ IPPROTO_RSVP = 46, /* RSVP protocol */ IPPROTO_GRE = 47, /* Cisco GRE tunnels (rfc 1701,1702) */ diff -puN include/linux/ip.h~git-net include/linux/ip.h --- devel/include/linux/ip.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/linux/ip.h 2005-08-06 23:39:09.000000000 -0700 @@ -196,6 +196,8 @@ static inline void inet_sk_copy_descenda #endif #endif +extern int inet_sk_rebuild_header(struct sock *sk); + struct iphdr { #if defined(__LITTLE_ENDIAN_BITFIELD) __u8 ihl:4, diff -puN include/linux/ipv6.h~git-net include/linux/ipv6.h --- devel/include/linux/ipv6.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/linux/ipv6.h 2005-08-06 23:39:09.000000000 -0700 @@ -308,6 +308,36 @@ static inline void inet_sk_copy_descenda #define __ipv6_only_sock(sk) (inet6_sk(sk)->ipv6only) #define ipv6_only_sock(sk) ((sk)->sk_family == PF_INET6 && __ipv6_only_sock(sk)) + +#include + +struct tcp6_timewait_sock { + struct tcp_timewait_sock tw_v6_sk; + struct in6_addr tw_v6_daddr; + struct in6_addr tw_v6_rcv_saddr; +}; + +static inline struct tcp6_timewait_sock *tcp6_twsk(const struct sock *sk) +{ + return (struct tcp6_timewait_sock *)sk; +} + +static inline struct in6_addr *__tcp_v6_rcv_saddr(const struct sock *sk) +{ + return likely(sk->sk_state != TCP_TIME_WAIT) ? + &inet6_sk(sk)->rcv_saddr : &tcp6_twsk(sk)->tw_v6_rcv_saddr; +} + +static inline struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk) +{ + return sk->sk_family == AF_INET6 ? __tcp_v6_rcv_saddr(sk) : NULL; +} + +static inline int inet_v6_ipv6only(const struct sock *sk) +{ + return likely(sk->sk_state != TCP_TIME_WAIT) ? + ipv6_only_sock(sk) : inet_twsk(sk)->tw_ipv6only; +} #else #define __ipv6_only_sock(sk) 0 #define ipv6_only_sock(sk) 0 @@ -322,8 +352,19 @@ static inline struct raw6_sock *raw6_sk( return NULL; } -#endif +#define __tcp_v6_rcv_saddr(__sk) NULL +#define tcp_v6_rcv_saddr(__sk) NULL +#define tcp_twsk_ipv6only(__sk) 0 +#define inet_v6_ipv6only(__sk) 0 +#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ + +#define INET6_MATCH(__sk, __saddr, __daddr, __ports, __dif) \ + (((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \ + ((__sk)->sk_family == AF_INET6) && \ + ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr)) && \ + ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr)) && \ + (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) -#endif +#endif /* __KERNEL__ */ -#endif +#endif /* _IPV6_H */ diff -puN include/linux/list.h~git-net include/linux/list.h --- devel/include/linux/list.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/linux/list.h 2005-08-06 23:39:09.000000000 -0700 @@ -419,6 +419,19 @@ static inline void list_splice_init(stru pos = n, n = list_entry(n->member.next, typeof(*n), member)) /** + * list_for_each_entry_safe_continue - iterate over list of given type + * continuing after existing point safe against removal of list entry + * @pos: the type * to use as a loop counter. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry_safe_continue(pos, n, head, member) \ + for (pos = n, n = list_entry(n->member.next, typeof(*n), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +/** * list_for_each_rcu - iterate over an rcu-protected list * @pos: the &struct list_head to use as a loop counter. * @head: the head for your list. diff -puN include/linux/netdevice.h~git-net include/linux/netdevice.h --- devel/include/linux/netdevice.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/linux/netdevice.h 2005-08-06 23:39:09.000000000 -0700 @@ -497,10 +497,12 @@ static inline void *netdev_priv(struct n #define SET_NETDEV_DEV(net, pdev) ((net)->class_dev.dev = (pdev)) struct packet_type { - __be16 type; /* This is really htons(ether_type). */ - struct net_device *dev; /* NULL is wildcarded here */ - int (*func) (struct sk_buff *, struct net_device *, - struct packet_type *); + __be16 type; /* This is really htons(ether_type). */ + struct net_device *dev; /* NULL is wildcarded here */ + int (*func) (struct sk_buff *, + struct net_device *, + struct packet_type *, + struct net_device *); void *af_packet_priv; struct list_head list; }; diff -puN include/linux/netfilter_decnet.h~git-net include/linux/netfilter_decnet.h --- devel/include/linux/netfilter_decnet.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/linux/netfilter_decnet.h 2005-08-06 23:39:09.000000000 -0700 @@ -9,6 +9,8 @@ #include +/* only for userspace compatibility */ +#ifndef __KERNEL__ /* IP Cache bits. */ /* Src IP address. */ #define NFC_DN_SRC 0x0001 @@ -18,6 +20,7 @@ #define NFC_DN_IF_IN 0x0004 /* Output device. */ #define NFC_DN_IF_OUT 0x0008 +#endif /* ! __KERNEL__ */ /* DECnet Hooks */ /* After promisc drops, checksum checks. */ diff -puN include/linux/netfilter.h~git-net include/linux/netfilter.h --- devel/include/linux/netfilter.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/linux/netfilter.h 2005-08-06 23:39:09.000000000 -0700 @@ -21,10 +21,23 @@ #define NF_STOP 5 #define NF_MAX_VERDICT NF_STOP +/* we overload the higher bits for encoding auxiliary data such as the queue + * number. Not nice, but better than additional function arguments. */ +#define NF_VERDICT_MASK 0x0000ffff +#define NF_VERDICT_BITS 16 + +#define NF_VERDICT_QMASK 0xffff0000 +#define NF_VERDICT_QBITS 16 + +#define NF_QUEUE_NR(x) ((x << NF_VERDICT_QBITS) & NF_VERDICT_QMASK || NF_QUEUE) + +/* only for userspace compatibility */ +#ifndef __KERNEL__ /* Generic cache responses from hook functions. <= 0x2000 is used for protocol-flags. */ #define NFC_UNKNOWN 0x4000 #define NFC_ALTERED 0x8000 +#endif #ifdef __KERNEL__ #include @@ -101,15 +114,51 @@ void nf_unregister_sockopt(struct nf_soc extern struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS]; -typedef void nf_logfn(unsigned int hooknum, +/* those NF_LOG_* defines and struct nf_loginfo are legacy definitios that will + * disappear once iptables is replaced with pkttables. Please DO NOT use them + * for any new code! */ +#define NF_LOG_TCPSEQ 0x01 /* Log TCP sequence numbers */ +#define NF_LOG_TCPOPT 0x02 /* Log TCP options */ +#define NF_LOG_IPOPT 0x04 /* Log IP options */ +#define NF_LOG_UID 0x08 /* Log UID owning local socket */ +#define NF_LOG_MASK 0x0f + +#define NF_LOG_TYPE_LOG 0x01 +#define NF_LOG_TYPE_ULOG 0x02 + +struct nf_loginfo { + u_int8_t type; + union { + struct { + u_int32_t copy_len; + u_int16_t group; + u_int16_t qthreshold; + } ulog; + struct { + u_int8_t level; + u_int8_t logflags; + } log; + } u; +}; + +typedef void nf_logfn(unsigned int pf, + unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, + const struct nf_loginfo *li, const char *prefix); +struct nf_logger { + struct module *me; + nf_logfn *logfn; + char *name; +}; + /* Function to register/unregister log function. */ -int nf_log_register(int pf, nf_logfn *logfn); -void nf_log_unregister(int pf, nf_logfn *logfn); +int nf_log_register(int pf, struct nf_logger *logger); +void nf_log_unregister_pf(int pf); +void nf_log_unregister_logger(struct nf_logger *logger); /* Calls the registered backend logging function */ void nf_log_packet(int pf, @@ -117,6 +166,7 @@ void nf_log_packet(int pf, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, + struct nf_loginfo *li, const char *fmt, ...); /* Activate hook; either okfn or kfree_skb called, unless a hook @@ -176,10 +226,12 @@ int nf_getsockopt(struct sock *sk, int p /* Packet queuing */ typedef int (*nf_queue_outfn_t)(struct sk_buff *skb, - struct nf_info *info, void *data); + struct nf_info *info, + unsigned int queuenum, void *data); extern int nf_register_queue_handler(int pf, nf_queue_outfn_t outfn, void *data); extern int nf_unregister_queue_handler(int pf); +extern void nf_unregister_queue_handlers(nf_queue_outfn_t outfn); extern void nf_reinject(struct sk_buff *skb, struct nf_info *info, unsigned int verdict); @@ -190,6 +242,27 @@ extern void nf_ct_attach(struct sk_buff /* FIXME: Before cache is ever used, this must be implemented for real. */ extern void nf_invalidate_cache(int pf); +/* Call this before modifying an existing packet: ensures it is + modifiable and linear to the point you care about (writable_len). + Returns true or false. */ +extern int skb_make_writable(struct sk_buff **pskb, unsigned int writable_len); + +struct nf_queue_rerouter { + void (*save)(const struct sk_buff *skb, struct nf_info *info); + int (*reroute)(struct sk_buff **skb, const struct nf_info *info); + int rer_size; +}; + +#define nf_info_reroute(x) ((void *)x + sizeof(struct nf_info)) + +extern int nf_register_queue_rerouter(int pf, struct nf_queue_rerouter *rer); +extern int nf_unregister_queue_rerouter(int pf); + +#ifdef CONFIG_PROC_FS +#include +extern struct proc_dir_entry *proc_net_netfilter; +#endif + #else /* !CONFIG_NETFILTER */ #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb) static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} diff -puN include/linux/netfilter_ipv4.h~git-net include/linux/netfilter_ipv4.h --- devel/include/linux/netfilter_ipv4.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/linux/netfilter_ipv4.h 2005-08-06 23:39:09.000000000 -0700 @@ -8,6 +8,8 @@ #include #include +/* only for userspace compatibility */ +#ifndef __KERNEL__ /* IP Cache bits. */ /* Src IP address. */ #define NFC_IP_SRC 0x0001 @@ -35,6 +37,7 @@ #define NFC_IP_DST_PT 0x0400 /* Something else about the proto */ #define NFC_IP_PROTO_UNKNOWN 0x2000 +#endif /* ! __KERNEL__ */ /* IP Hooks */ /* After promisc drops, checksum checks. */ @@ -77,11 +80,6 @@ enum nf_ip_hook_priorities { #ifdef __KERNEL__ extern int ip_route_me_harder(struct sk_buff **pskb); -/* Call this before modifying an existing IP packet: ensures it is - modifiable and linear to the point you care about (writable_len). - Returns true or false. */ -extern int skb_ip_make_writable(struct sk_buff **pskb, - unsigned int writable_len); #endif /*__KERNEL__*/ #endif /*__LINUX_IP_NETFILTER_H*/ diff -puN include/linux/netfilter_ipv4/ip_conntrack_core.h~git-net include/linux/netfilter_ipv4/ip_conntrack_core.h --- devel/include/linux/netfilter_ipv4/ip_conntrack_core.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/linux/netfilter_ipv4/ip_conntrack_core.h 2005-08-06 23:39:09.000000000 -0700 @@ -2,6 +2,9 @@ #define _IP_CONNTRACK_CORE_H #include +#define MAX_IP_CT_PROTO 256 +extern struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO]; + /* This header is used to share core functionality between the standalone connection tracking module, and the compatibility layer's use of connection tracking. */ @@ -38,12 +41,19 @@ extern int __ip_conntrack_confirm(struct /* Confirm a connection: returns NF_DROP if packet must be dropped. */ static inline int ip_conntrack_confirm(struct sk_buff **pskb) { - if ((*pskb)->nfct - && !is_confirmed((struct ip_conntrack *)(*pskb)->nfct)) - return __ip_conntrack_confirm(pskb); - return NF_ACCEPT; + struct ip_conntrack *ct = (struct ip_conntrack *)(*pskb)->nfct; + int ret = NF_ACCEPT; + + if (ct) { + if (!is_confirmed(ct)) + ret = __ip_conntrack_confirm(pskb); + ip_ct_deliver_cached_events(ct); + } + return ret; } +extern void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp); + extern struct list_head *ip_conntrack_hash; extern struct list_head ip_conntrack_expect_list; extern rwlock_t ip_conntrack_lock; diff -puN include/linux/netfilter_ipv4/ip_conntrack.h~git-net include/linux/netfilter_ipv4/ip_conntrack.h --- devel/include/linux/netfilter_ipv4/ip_conntrack.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/linux/netfilter_ipv4/ip_conntrack.h 2005-08-06 23:39:09.000000000 -0700 @@ -65,6 +65,63 @@ enum ip_conntrack_status { /* Both together */ IPS_NAT_DONE_MASK = (IPS_DST_NAT_DONE | IPS_SRC_NAT_DONE), + + /* Connection is dying (removed from lists), can not be unset. */ + IPS_DYING_BIT = 9, + IPS_DYING = (1 << IPS_DYING_BIT), +}; + +/* Connection tracking event bits */ +enum ip_conntrack_events +{ + /* New conntrack */ + IPCT_NEW_BIT = 0, + IPCT_NEW = (1 << IPCT_NEW_BIT), + + /* Expected connection */ + IPCT_RELATED_BIT = 1, + IPCT_RELATED = (1 << IPCT_RELATED_BIT), + + /* Destroyed conntrack */ + IPCT_DESTROY_BIT = 2, + IPCT_DESTROY = (1 << IPCT_DESTROY_BIT), + + /* Timer has been refreshed */ + IPCT_REFRESH_BIT = 3, + IPCT_REFRESH = (1 << IPCT_REFRESH_BIT), + + /* Status has changed */ + IPCT_STATUS_BIT = 4, + IPCT_STATUS = (1 << IPCT_STATUS_BIT), + + /* Update of protocol info */ + IPCT_PROTOINFO_BIT = 5, + IPCT_PROTOINFO = (1 << IPCT_PROTOINFO_BIT), + + /* Volatile protocol info */ + IPCT_PROTOINFO_VOLATILE_BIT = 6, + IPCT_PROTOINFO_VOLATILE = (1 << IPCT_PROTOINFO_VOLATILE_BIT), + + /* New helper for conntrack */ + IPCT_HELPER_BIT = 7, + IPCT_HELPER = (1 << IPCT_HELPER_BIT), + + /* Update of helper info */ + IPCT_HELPINFO_BIT = 8, + IPCT_HELPINFO = (1 << IPCT_HELPINFO_BIT), + + /* Volatile helper info */ + IPCT_HELPINFO_VOLATILE_BIT = 9, + IPCT_HELPINFO_VOLATILE = (1 << IPCT_HELPINFO_VOLATILE_BIT), + + /* NAT info */ + IPCT_NATINFO_BIT = 10, + IPCT_NATINFO = (1 << IPCT_NATINFO_BIT), +}; + +enum ip_conntrack_expect_events { + IPEXP_NEW_BIT = 0, + IPEXP_NEW = (1 << IPEXP_NEW_BIT), }; #ifdef __KERNEL__ @@ -152,6 +209,9 @@ struct ip_conntrack /* Current number of expected connections */ unsigned int expecting; + /* Unique ID that identifies this conntrack*/ + unsigned int id; + /* Helper, if any. */ struct ip_conntrack_helper *helper; @@ -171,7 +231,7 @@ struct ip_conntrack #endif /* CONFIG_IP_NF_NAT_NEEDED */ #if defined(CONFIG_IP_NF_CONNTRACK_MARK) - unsigned long mark; + u_int32_t mark; #endif /* Traversed often, so hopefully in different cacheline to top */ @@ -200,6 +260,9 @@ struct ip_conntrack_expect /* Usage count. */ atomic_t use; + /* Unique ID */ + unsigned int id; + #ifdef CONFIG_IP_NF_NAT_NEEDED /* This is the original per-proto part, used to map the * expected connection the way the recipient expects. */ @@ -239,7 +302,12 @@ ip_conntrack_get(const struct sk_buff *s } /* decrement reference count on a conntrack */ -extern void ip_conntrack_put(struct ip_conntrack *ct); +static inline void +ip_conntrack_put(struct ip_conntrack *ct) +{ + IP_NF_ASSERT(ct); + nf_conntrack_put(&ct->ct_general); +} /* call to create an explicit dependency on ip_conntrack. */ extern void need_ip_conntrack(void); @@ -274,12 +342,50 @@ extern void ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *data), void *data); +extern struct ip_conntrack_helper * +__ip_conntrack_helper_find_byname(const char *); +extern struct ip_conntrack_helper * +ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple); +extern void ip_conntrack_helper_put(struct ip_conntrack_helper *helper); + +extern struct ip_conntrack_protocol * +__ip_conntrack_proto_find(u_int8_t protocol); +extern struct ip_conntrack_protocol * +ip_conntrack_proto_find_get(u_int8_t protocol); +extern void ip_conntrack_proto_put(struct ip_conntrack_protocol *proto); + +extern void ip_ct_remove_expectations(struct ip_conntrack *ct); + +extern struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *, + struct ip_conntrack_tuple *); + +extern void ip_conntrack_free(struct ip_conntrack *ct); + +extern void ip_conntrack_hash_insert(struct ip_conntrack *ct); + +extern struct ip_conntrack_expect * +__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple); + +extern struct ip_conntrack_expect * +ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple); + +extern struct ip_conntrack_tuple_hash * +__ip_conntrack_find(const struct ip_conntrack_tuple *tuple, + const struct ip_conntrack *ignored_conntrack); + +extern void ip_conntrack_flush(void); + /* It's confirmed if it is, or has been in the hash table. */ static inline int is_confirmed(struct ip_conntrack *ct) { return test_bit(IPS_CONFIRMED_BIT, &ct->status); } +static inline int is_dying(struct ip_conntrack *ct) +{ + return test_bit(IPS_DYING_BIT, &ct->status); +} + extern unsigned int ip_conntrack_htable_size; struct ip_conntrack_stat @@ -303,6 +409,85 @@ struct ip_conntrack_stat #define CONNTRACK_STAT_INC(count) (__get_cpu_var(ip_conntrack_stat).count++) +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +#include +#include + +struct ip_conntrack_ecache { + struct ip_conntrack *ct; + unsigned int events; +}; +DECLARE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache); + +#define CONNTRACK_ECACHE(x) (__get_cpu_var(ip_conntrack_ecache).x) + +extern struct notifier_block *ip_conntrack_chain; +extern struct notifier_block *ip_conntrack_expect_chain; + +static inline int ip_conntrack_register_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&ip_conntrack_chain, nb); +} + +static inline int ip_conntrack_unregister_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&ip_conntrack_chain, nb); +} + +static inline int +ip_conntrack_expect_register_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&ip_conntrack_expect_chain, nb); +} + +static inline int +ip_conntrack_expect_unregister_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&ip_conntrack_expect_chain, nb); +} + +extern void ip_ct_deliver_cached_events(const struct ip_conntrack *ct); +extern void __ip_ct_event_cache_init(struct ip_conntrack *ct); + +static inline void +ip_conntrack_event_cache(enum ip_conntrack_events event, + const struct sk_buff *skb) +{ + struct ip_conntrack *ct = (struct ip_conntrack *)skb->nfct; + struct ip_conntrack_ecache *ecache; + + local_bh_disable(); + ecache = &__get_cpu_var(ip_conntrack_ecache); + if (ct != ecache->ct) + __ip_ct_event_cache_init(ct); + ecache->events |= event; + local_bh_enable(); +} + +static inline void ip_conntrack_event(enum ip_conntrack_events event, + struct ip_conntrack *ct) +{ + if (is_confirmed(ct) && !is_dying(ct)) + notifier_call_chain(&ip_conntrack_chain, event, ct); +} + +static inline void +ip_conntrack_expect_event(enum ip_conntrack_expect_events event, + struct ip_conntrack_expect *exp) +{ + notifier_call_chain(&ip_conntrack_expect_chain, event, exp); +} +#else /* CONFIG_IP_NF_CONNTRACK_EVENTS */ +static inline void ip_conntrack_event_cache(enum ip_conntrack_events event, + const struct sk_buff *skb) {} +static inline void ip_conntrack_event(enum ip_conntrack_events event, + struct ip_conntrack *ct) {} +static inline void ip_ct_deliver_cached_events(const struct ip_conntrack *ct) {} +static inline void +ip_conntrack_expect_event(enum ip_conntrack_expect_events event, + struct ip_conntrack_expect *exp) {} +#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */ + #ifdef CONFIG_IP_NF_NAT_NEEDED static inline int ip_nat_initialized(struct ip_conntrack *conntrack, enum ip_nat_manip_type manip) diff -puN include/linux/netfilter_ipv4/ip_conntrack_helper.h~git-net include/linux/netfilter_ipv4/ip_conntrack_helper.h --- devel/include/linux/netfilter_ipv4/ip_conntrack_helper.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/linux/netfilter_ipv4/ip_conntrack_helper.h 2005-08-06 23:39:09.000000000 -0700 @@ -24,6 +24,8 @@ struct ip_conntrack_helper int (*help)(struct sk_buff **pskb, struct ip_conntrack *ct, enum ip_conntrack_info conntrackinfo); + + int (*to_nfattr)(struct sk_buff *skb, const struct ip_conntrack *ct); }; extern int ip_conntrack_helper_register(struct ip_conntrack_helper *); diff -puN include/linux/netfilter_ipv4/ip_conntrack_protocol.h~git-net include/linux/netfilter_ipv4/ip_conntrack_protocol.h --- devel/include/linux/netfilter_ipv4/ip_conntrack_protocol.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/linux/netfilter_ipv4/ip_conntrack_protocol.h 2005-08-06 23:39:10.000000000 -0700 @@ -2,6 +2,7 @@ #ifndef _IP_CONNTRACK_PROTOCOL_H #define _IP_CONNTRACK_PROTOCOL_H #include +#include struct seq_file; @@ -47,22 +48,22 @@ struct ip_conntrack_protocol int (*error)(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, unsigned int hooknum); + /* convert protoinfo to nfnetink attributes */ + int (*to_nfattr)(struct sk_buff *skb, struct nfattr *nfa, + const struct ip_conntrack *ct); + + int (*tuple_to_nfattr)(struct sk_buff *skb, + const struct ip_conntrack_tuple *t); + int (*nfattr_to_tuple)(struct nfattr *tb[], + struct ip_conntrack_tuple *t); + /* Module (if any) which this is connected to. */ struct module *me; }; -#define MAX_IP_CT_PROTO 256 -extern struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO]; - /* Protocol registration. */ extern int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto); extern void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto); - -static inline struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol) -{ - return ip_ct_protos[protocol]; -} - /* Existing built-in protocols */ extern struct ip_conntrack_protocol ip_conntrack_protocol_tcp; extern struct ip_conntrack_protocol ip_conntrack_protocol_udp; @@ -73,6 +74,11 @@ extern int ip_conntrack_protocol_tcp_ini /* Log invalid packets */ extern unsigned int ip_ct_log_invalid; +extern int ip_ct_port_tuple_to_nfattr(struct sk_buff *, + const struct ip_conntrack_tuple *); +extern int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[], + struct ip_conntrack_tuple *); + #ifdef CONFIG_SYSCTL #ifdef DEBUG_INVALID_PACKETS #define LOG_INVALID(proto) \ diff -puN include/linux/netfilter_ipv4/ip_nat_protocol.h~git-net include/linux/netfilter_ipv4/ip_nat_protocol.h --- devel/include/linux/netfilter_ipv4/ip_nat_protocol.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/linux/netfilter_ipv4/ip_nat_protocol.h 2005-08-06 23:39:10.000000000 -0700 @@ -4,6 +4,9 @@ #include #include +#include +#include + struct iphdr; struct ip_nat_range; @@ -15,6 +18,8 @@ struct ip_nat_protocol /* Protocol number. */ unsigned int protonum; + struct module *me; + /* Translate a packet to the target according to manip type. Return true if succeeded. */ int (*manip_pkt)(struct sk_buff **pskb, @@ -43,19 +48,20 @@ struct ip_nat_protocol unsigned int (*print_range)(char *buffer, const struct ip_nat_range *range); -}; -#define MAX_IP_NAT_PROTO 256 -extern struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; + int (*range_to_nfattr)(struct sk_buff *skb, + const struct ip_nat_range *range); + + int (*nfattr_to_range)(struct nfattr *tb[], + struct ip_nat_range *range); +}; /* Protocol registration. */ extern int ip_nat_protocol_register(struct ip_nat_protocol *proto); extern void ip_nat_protocol_unregister(struct ip_nat_protocol *proto); -static inline struct ip_nat_protocol *ip_nat_find_proto(u_int8_t protocol) -{ - return ip_nat_protos[protocol]; -} +extern struct ip_nat_protocol *ip_nat_proto_find_get(u_int8_t protocol); +extern void ip_nat_proto_put(struct ip_nat_protocol *proto); /* Built-in protocols. */ extern struct ip_nat_protocol ip_nat_protocol_tcp; @@ -67,4 +73,9 @@ extern int init_protocols(void) __init; extern void cleanup_protocols(void); extern struct ip_nat_protocol *find_nat_proto(u_int16_t protonum); +extern int ip_nat_port_range_to_nfattr(struct sk_buff *skb, + const struct ip_nat_range *range); +extern int ip_nat_port_nfattr_to_range(struct nfattr *tb[], + struct ip_nat_range *range); + #endif /*_IP_NAT_PROTO_H*/ diff -puN include/linux/netfilter_ipv4/ipt_LOG.h~git-net include/linux/netfilter_ipv4/ipt_LOG.h --- devel/include/linux/netfilter_ipv4/ipt_LOG.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/linux/netfilter_ipv4/ipt_LOG.h 2005-08-06 23:39:10.000000000 -0700 @@ -1,6 +1,7 @@ #ifndef _IPT_LOG_H #define _IPT_LOG_H +/* make sure not to change this without changing netfilter.h:NF_LOG_* (!) */ #define IPT_LOG_TCPSEQ 0x01 /* Log TCP sequence numbers */ #define IPT_LOG_TCPOPT 0x02 /* Log TCP options */ #define IPT_LOG_IPOPT 0x04 /* Log IP options */ diff -puN /dev/null include/linux/netfilter_ipv4/ipt_NFQUEUE.h --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/include/linux/netfilter_ipv4/ipt_NFQUEUE.h 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,16 @@ +/* iptables module for using NFQUEUE mechanism + * + * (C) 2005 Harald Welte + * + * This software is distributed under GNU GPL v2, 1991 + * +*/ +#ifndef _IPT_NFQ_TARGET_H +#define _IPT_NFQ_TARGET_H + +/* target info */ +struct ipt_NFQ_info { + u_int16_t queuenum; +}; + +#endif /* _IPT_DSCP_TARGET_H */ diff -puN include/linux/netfilter_ipv6.h~git-net include/linux/netfilter_ipv6.h --- devel/include/linux/netfilter_ipv6.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/linux/netfilter_ipv6.h 2005-08-06 23:39:10.000000000 -0700 @@ -10,6 +10,8 @@ #include +/* only for userspace compatibility */ +#ifndef __KERNEL__ /* IP Cache bits. */ /* Src IP address. */ #define NFC_IP6_SRC 0x0001 @@ -38,6 +40,7 @@ #define NFC_IP6_DST_PT 0x0400 /* Something else about the proto */ #define NFC_IP6_PROTO_UNKNOWN 0x2000 +#endif /* ! __KERNEL__ */ /* IP6 Hooks */ @@ -68,4 +71,7 @@ enum nf_ip6_hook_priorities { NF_IP6_PRI_LAST = INT_MAX, }; +int ipv6_netfilter_init(void); +void ipv6_netfilter_fini(void); + #endif /*__LINUX_IP6_NETFILTER_H*/ diff -puN include/linux/netfilter_ipv6/ip6t_LOG.h~git-net include/linux/netfilter_ipv6/ip6t_LOG.h --- devel/include/linux/netfilter_ipv6/ip6t_LOG.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/linux/netfilter_ipv6/ip6t_LOG.h 2005-08-06 23:39:10.000000000 -0700 @@ -1,6 +1,7 @@ #ifndef _IP6T_LOG_H #define _IP6T_LOG_H +/* make sure not to change this without changing netfilter.h:NF_LOG_* (!) */ #define IP6T_LOG_TCPSEQ 0x01 /* Log TCP sequence numbers */ #define IP6T_LOG_TCPOPT 0x02 /* Log TCP options */ #define IP6T_LOG_IPOPT 0x04 /* Log IP options */ diff -puN /dev/null include/linux/netfilter/nfnetlink_conntrack.h --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/include/linux/netfilter/nfnetlink_conntrack.h 2005-08-06 23:39:09.000000000 -0700 @@ -0,0 +1,124 @@ +#ifndef _IPCONNTRACK_NETLINK_H +#define _IPCONNTRACK_NETLINK_H +#include + +enum cntl_msg_types { + IPCTNL_MSG_CT_NEW, + IPCTNL_MSG_CT_GET, + IPCTNL_MSG_CT_DELETE, + IPCTNL_MSG_CT_GET_CTRZERO, + + IPCTNL_MSG_MAX +}; + +enum ctnl_exp_msg_types { + IPCTNL_MSG_EXP_NEW, + IPCTNL_MSG_EXP_GET, + IPCTNL_MSG_EXP_DELETE, + + IPCTNL_MSG_EXP_MAX +}; + + +enum ctattr_type { + CTA_UNSPEC, + CTA_TUPLE_ORIG, + CTA_TUPLE_REPLY, + CTA_STATUS, + CTA_PROTOINFO, + CTA_HELP, + CTA_NAT, + CTA_TIMEOUT, + CTA_MARK, + CTA_COUNTERS_ORIG, + CTA_COUNTERS_REPLY, + CTA_USE, + CTA_ID, + __CTA_MAX +}; +#define CTA_MAX (__CTA_MAX - 1) + +enum ctattr_tuple { + CTA_TUPLE_UNSPEC, + CTA_TUPLE_IP, + CTA_TUPLE_PROTO, + __CTA_TUPLE_MAX +}; +#define CTA_TUPLE_MAX (__CTA_TUPLE_MAX - 1) + +enum ctattr_ip { + CTA_IP_UNSPEC, + CTA_IP_V4_SRC, + CTA_IP_V4_DST, + CTA_IP_V6_SRC, + CTA_IP_V6_DST, + __CTA_IP_MAX +}; +#define CTA_IP_MAX (__CTA_IP_MAX - 1) + +enum ctattr_l4proto { + CTA_PROTO_UNSPEC, + CTA_PROTO_NUM, + CTA_PROTO_SRC_PORT, + CTA_PROTO_DST_PORT, + CTA_PROTO_ICMP_ID, + CTA_PROTO_ICMP_TYPE, + CTA_PROTO_ICMP_CODE, + __CTA_PROTO_MAX +}; +#define CTA_PROTO_MAX (__CTA_PROTO_MAX - 1) + +enum ctattr_protoinfo { + CTA_PROTOINFO_UNSPEC, + CTA_PROTOINFO_TCP_STATE, + __CTA_PROTOINFO_MAX +}; +#define CTA_PROTOINFO_MAX (__CTA_PROTOINFO_MAX - 1) + +enum ctattr_counters { + CTA_COUNTERS_UNSPEC, + CTA_COUNTERS_PACKETS, + CTA_COUNTERS_BYTES, + __CTA_COUNTERS_MAX +}; +#define CTA_COUNTERS_MAX (__CTA_COUNTERS_MAX - 1) + +enum ctattr_nat { + CTA_NAT_UNSPEC, + CTA_NAT_MINIP, + CTA_NAT_MAXIP, + CTA_NAT_PROTO, + __CTA_NAT_MAX +}; +#define CTA_NAT_MAX (__CTA_NAT_MAX - 1) + +enum ctattr_protonat { + CTA_PROTONAT_UNSPEC, + CTA_PROTONAT_PORT_MIN, + CTA_PROTONAT_PORT_MAX, + __CTA_PROTONAT_MAX +}; +#define CTA_PROTONAT_MAX (__CTA_PROTONAT_MAX - 1) + +enum ctattr_expect { + CTA_EXPECT_UNSPEC, + CTA_EXPECT_MASTER, + CTA_EXPECT_TUPLE, + CTA_EXPECT_MASK, + CTA_EXPECT_TIMEOUT, + CTA_EXPECT_ID, + CTA_EXPECT_HELP_NAME, + __CTA_EXPECT_MAX +}; +#define CTA_EXPECT_MAX (__CTA_EXPECT_MAX - 1) + +enum ctattr_help { + CTA_HELP_UNSPEC, + CTA_HELP_NAME, + __CTA_HELP_MAX +}; +#define CTA_HELP_MAX (__CTA_HELP_MAX - 1) + +#define CTA_HELP_MAXNAMESIZE 32 + +#endif /* _IPCONNTRACK_NETLINK_H */ diff -puN /dev/null include/linux/netfilter/nfnetlink.h --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/include/linux/netfilter/nfnetlink.h 2005-08-06 23:39:09.000000000 -0700 @@ -0,0 +1,148 @@ +#ifndef _NFNETLINK_H +#define _NFNETLINK_H +#include + +/* nfnetlink groups: Up to 32 maximum */ +#define NF_NETLINK_CONNTRACK_NEW 0x00000001 +#define NF_NETLINK_CONNTRACK_UPDATE 0x00000002 +#define NF_NETLINK_CONNTRACK_DESTROY 0x00000004 +#define NF_NETLINK_CONNTRACK_EXP_NEW 0x00000008 +#define NF_NETLINK_CONNTRACK_EXP_UPDATE 0x00000010 +#define NF_NETLINK_CONNTRACK_EXP_DESTROY 0x00000020 + +/* Generic structure for encapsulation optional netfilter information. + * It is reminiscent of sockaddr, but with sa_family replaced + * with attribute type. + * ! This should someday be put somewhere generic as now rtnetlink and + * ! nfnetlink use the same attributes methods. - J. Schulist. + */ + +struct nfattr +{ + u_int16_t nfa_len; + u_int16_t nfa_type; +} __attribute__ ((packed)); + +/* FIXME: Shamelessly copy and pasted from rtnetlink.h, it's time + * to put this in a generic file */ + +#define NFA_ALIGNTO 4 +#define NFA_ALIGN(len) (((len) + NFA_ALIGNTO - 1) & ~(NFA_ALIGNTO - 1)) +#define NFA_OK(nfa,len) ((len) > 0 && (nfa)->nfa_len >= sizeof(struct nfattr) \ + && (nfa)->nfa_len <= (len)) +#define NFA_NEXT(nfa,attrlen) ((attrlen) -= NFA_ALIGN((nfa)->nfa_len), \ + (struct nfattr *)(((char *)(nfa)) + NFA_ALIGN((nfa)->nfa_len))) +#define NFA_LENGTH(len) (NFA_ALIGN(sizeof(struct nfattr)) + (len)) +#define NFA_SPACE(len) NFA_ALIGN(NFA_LENGTH(len)) +#define NFA_DATA(nfa) ((void *)(((char *)(nfa)) + NFA_LENGTH(0))) +#define NFA_PAYLOAD(nfa) ((int)((nfa)->nfa_len) - NFA_LENGTH(0)) +#define NFA_NEST(skb, type) \ +({ struct nfattr *__start = (struct nfattr *) (skb)->tail; \ + NFA_PUT(skb, type, 0, NULL); \ + __start; }) +#define NFA_NEST_END(skb, start) \ +({ (start)->nfa_len = ((skb)->tail - (unsigned char *) (start)); \ + (skb)->len; }) +#define NFA_NEST_CANCEL(skb, start) \ +({ if (start) \ + skb_trim(skb, (unsigned char *) (start) - (skb)->data); \ + -1; }) + +/* General form of address family dependent message. + */ +struct nfgenmsg { + u_int8_t nfgen_family; /* AF_xxx */ + u_int8_t version; /* nfnetlink version */ + u_int16_t res_id; /* resource id */ +} __attribute__ ((packed)); + +#define NFNETLINK_V0 0 + +#define NFM_NFA(n) ((struct nfattr *)(((char *)(n)) \ + + NLMSG_ALIGN(sizeof(struct nfgenmsg)))) +#define NFM_PAYLOAD(n) NLMSG_PAYLOAD(n, sizeof(struct nfgenmsg)) + +/* netfilter netlink message types are split in two pieces: + * 8 bit subsystem, 8bit operation. + */ + +#define NFNL_SUBSYS_ID(x) ((x & 0xff00) >> 8) +#define NFNL_MSG_TYPE(x) (x & 0x00ff) + +/* No enum here, otherwise __stringify() trick of MODULE_ALIAS_NFNL_SUBSYS() + * won't work anymore */ +#define NFNL_SUBSYS_NONE 0 +#define NFNL_SUBSYS_CTNETLINK 1 +#define NFNL_SUBSYS_CTNETLINK_EXP 2 +#define NFNL_SUBSYS_QUEUE 3 +#define NFNL_SUBSYS_ULOG 4 +#define NFNL_SUBSYS_COUNT 5 + +#ifdef __KERNEL__ + +#include +#include + +struct nfnl_callback +{ + int (*call)(struct sock *nl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp); + kernel_cap_t cap_required; /* capabilities required for this msg */ + u_int16_t attr_count; /* number of nfattr's */ +}; + +struct nfnetlink_subsystem +{ + const char *name; + __u8 subsys_id; /* nfnetlink subsystem ID */ + __u8 cb_count; /* number of callbacks */ + struct nfnl_callback *cb; /* callback for individual types */ +}; + +extern void __nfa_fill(struct sk_buff *skb, int attrtype, + int attrlen, const void *data); +#define NFA_PUT(skb, attrtype, attrlen, data) \ +({ if (skb_tailroom(skb) < (int)NFA_SPACE(attrlen)) goto nfattr_failure; \ + __nfa_fill(skb, attrtype, attrlen, data); }) + +extern struct semaphore nfnl_sem; + +#define nfnl_shlock() down(&nfnl_sem) +#define nfnl_shlock_nowait() down_trylock(&nfnl_sem) + +#define nfnl_shunlock() do { up(&nfnl_sem); \ + if(nfnl && nfnl->sk_receive_queue.qlen) \ + nfnl->sk_data_ready(nfnl, 0); \ + } while(0) + +extern void nfnl_lock(void); +extern void nfnl_unlock(void); + +extern int nfnetlink_subsys_register(struct nfnetlink_subsystem *n); +extern int nfnetlink_subsys_unregister(struct nfnetlink_subsystem *n); + +extern int nfattr_parse(struct nfattr *tb[], int maxattr, + struct nfattr *nfa, int len); + +#define nfattr_parse_nested(tb, max, nfa) \ + nfattr_parse((tb), (max), NFA_DATA((nfa)), NFA_PAYLOAD((nfa))) + +#define nfattr_bad_size(tb, max, cta_min) \ +({ int __i, __res = 0; \ + for (__i=0; __i + +enum nfulnl_msg_types { + NFULNL_MSG_PACKET, /* packet from kernel to userspace */ + NFULNL_MSG_CONFIG, /* connect to a particular queue */ + + NFULNL_MSG_MAX +}; + +struct nfulnl_msg_packet_hdr { + u_int16_t hw_protocol; /* hw protocol (network order) */ + u_int8_t hook; /* netfilter hook */ + u_int8_t _pad; +} __attribute__ ((packed)); + +struct nfulnl_msg_packet_hw { + u_int16_t hw_addrlen; + u_int16_t _pad; + u_int8_t hw_addr[8]; +} __attribute__ ((packed)); + +struct nfulnl_msg_packet_timestamp { + u_int64_t sec; + u_int64_t usec; +} __attribute__ ((packed)); + +#define NFULNL_PREFIXLEN 30 /* just like old log target */ + +enum nfulnl_attr_type { + NFULA_UNSPEC, + NFULA_PACKET_HDR, + NFULA_MARK, /* u_int32_t nfmark */ + NFULA_TIMESTAMP, /* nfulnl_msg_packet_timestamp */ + NFULA_IFINDEX_INDEV, /* u_int32_t ifindex */ + NFULA_IFINDEX_OUTDEV, /* u_int32_t ifindex */ + NFULA_HWADDR, /* nfulnl_msg_packet_hw */ + NFULA_PAYLOAD, /* opaque data payload */ + NFULA_PREFIX, /* string prefix */ + NFULA_UID, /* user id of socket */ + + __NFULA_MAX +}; +#define NFULA_MAX (__NFULA_MAX - 1) + +enum nfulnl_msg_config_cmds { + NFULNL_CFG_CMD_NONE, + NFULNL_CFG_CMD_BIND, + NFULNL_CFG_CMD_UNBIND, + NFULNL_CFG_CMD_PF_BIND, + NFULNL_CFG_CMD_PF_UNBIND, +}; + +struct nfulnl_msg_config_cmd { + u_int8_t command; /* nfulnl_msg_config_cmds */ +} __attribute__ ((packed)); + +struct nfulnl_msg_config_mode { + u_int32_t copy_range; + u_int8_t copy_mode; + u_int8_t _pad; +} __attribute__ ((packed)); + +enum nfulnl_attr_config { + NFULA_CFG_UNSPEC, + NFULA_CFG_CMD, /* nfulnl_msg_config_cmd */ + NFULA_CFG_MODE, /* nfulnl_msg_config_mode */ + NFULA_CFG_NLBUFSIZ, /* u_int32_t buffer size */ + NFULA_CFG_TIMEOUT, /* u_int32_t in 1/100 s */ + NFULA_CFG_QTHRESH, /* u_int32_t */ + __NFULA_CFG_MAX +}; +#define NFULA_CFG_MAX (__NFULA_CFG_MAX -1) + +#define NFULNL_COPY_NONE 0x00 +#define NFULNL_COPY_META 0x01 +#define NFULNL_COPY_PACKET 0x02 + +#endif /* _NFNETLINK_LOG_H */ diff -puN /dev/null include/linux/netfilter/nfnetlink_queue.h --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/include/linux/netfilter/nfnetlink_queue.h 2005-08-06 23:39:09.000000000 -0700 @@ -0,0 +1,86 @@ +#ifndef _NFNETLINK_QUEUE_H +#define _NFNETLINK_QUEUE_H + +#include + +enum nfqnl_msg_types { + NFQNL_MSG_PACKET, /* packet from kernel to userspace */ + NFQNL_MSG_VERDICT, /* verdict from userspace to kernel */ + NFQNL_MSG_CONFIG, /* connect to a particular queue */ + + NFQNL_MSG_MAX +}; + +struct nfqnl_msg_packet_hdr { + u_int32_t packet_id; /* unique ID of packet in queue */ + u_int16_t hw_protocol; /* hw protocol (network order) */ + u_int8_t hook; /* netfilter hook */ +} __attribute__ ((packed)); + +struct nfqnl_msg_packet_hw { + u_int16_t hw_addrlen; + u_int16_t _pad; + u_int8_t hw_addr[8]; +} __attribute__ ((packed)); + +struct nfqnl_msg_packet_timestamp { + u_int64_t sec; + u_int64_t usec; +} __attribute__ ((packed)); + +enum nfqnl_attr_type { + NFQA_UNSPEC, + NFQA_PACKET_HDR, + NFQA_VERDICT_HDR, /* nfqnl_msg_verdict_hrd */ + NFQA_MARK, /* u_int32_t nfmark */ + NFQA_TIMESTAMP, /* nfqnl_msg_packet_timestamp */ + NFQA_IFINDEX_INDEV, /* u_int32_t ifindex */ + NFQA_IFINDEX_OUTDEV, /* u_int32_t ifindex */ + NFQA_HWADDR, /* nfqnl_msg_packet_hw */ + NFQA_PAYLOAD, /* opaque data payload */ + + __NFQA_MAX +}; +#define NFQA_MAX (__NFQA_MAX - 1) + +struct nfqnl_msg_verdict_hdr { + u_int32_t verdict; + u_int32_t id; +} __attribute__ ((packed)); + + +enum nfqnl_msg_config_cmds { + NFQNL_CFG_CMD_NONE, + NFQNL_CFG_CMD_BIND, + NFQNL_CFG_CMD_UNBIND, + NFQNL_CFG_CMD_PF_BIND, + NFQNL_CFG_CMD_PF_UNBIND, +}; + +struct nfqnl_msg_config_cmd { + u_int8_t command; /* nfqnl_msg_config_cmds */ + u_int8_t _pad; + u_int16_t pf; /* AF_xxx for PF_[UN]BIND */ +} __attribute__ ((packed)); + +enum nfqnl_config_mode { + NFQNL_COPY_NONE, + NFQNL_COPY_META, + NFQNL_COPY_PACKET, +}; + +struct nfqnl_msg_config_params { + u_int32_t copy_range; + u_int8_t copy_mode; /* enum nfqnl_config_mode */ +} __attribute__ ((packed)); + + +enum nfqnl_attr_config { + NFQA_CFG_UNSPEC, + NFQA_CFG_CMD, /* nfqnl_msg_config_cmd */ + NFQA_CFG_PARAMS, /* nfqnl_msg_config_params */ + __NFQA_CFG_MAX +}; +#define NFQA_CFG_MAX (__NFQA_CFG_MAX-1) + +#endif /* _NFNETLINK_QUEUE_H */ diff -puN include/linux/net.h~git-net include/linux/net.h --- devel/include/linux/net.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/linux/net.h 2005-08-06 23:39:09.000000000 -0700 @@ -84,6 +84,7 @@ enum sock_type { SOCK_RAW = 3, SOCK_RDM = 4, SOCK_SEQPACKET = 5, + SOCK_DCCP = 6, SOCK_PACKET = 10, }; @@ -282,5 +283,8 @@ static struct proto_ops name##_ops = { #define MODULE_ALIAS_NETPROTO(proto) \ MODULE_ALIAS("net-pf-" __stringify(proto)) +#define MODULE_ALIAS_NET_PF_PROTO(pf, proto) \ + MODULE_ALIAS("net-pf-" __stringify(pf) "-proto-" __stringify(proto)) + #endif /* __KERNEL__ */ #endif /* _LINUX_NET_H */ diff -puN include/linux/netlink.h~git-net include/linux/netlink.h --- devel/include/linux/netlink.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/linux/netlink.h 2005-08-06 23:39:10.000000000 -0700 @@ -119,7 +119,7 @@ struct netlink_skb_parms #define NETLINK_CREDS(skb) (&NETLINK_CB((skb)).creds) -extern struct sock *netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len)); +extern struct sock *netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len), struct module *module); extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err); extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 pid, int nonblock); extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 pid, diff -puN include/linux/random.h~git-net include/linux/random.h --- devel/include/linux/random.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/linux/random.h 2005-08-06 23:39:10.000000000 -0700 @@ -59,6 +59,8 @@ extern __u32 secure_tcp_sequence_number( __u16 sport, __u16 dport); extern __u32 secure_tcpv6_sequence_number(__u32 *saddr, __u32 *daddr, __u16 sport, __u16 dport); +extern u64 secure_dccp_sequence_number(__u32 saddr, __u32 daddr, + __u16 sport, __u16 dport); #ifndef MODULE extern struct file_operations random_fops, urandom_fops; diff -puN include/linux/skbuff.h~git-net include/linux/skbuff.h --- devel/include/linux/skbuff.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/linux/skbuff.h 2005-08-06 23:39:10.000000000 -0700 @@ -164,7 +164,6 @@ struct skb_shared_info { * @stamp: Time we arrived * @dev: Device we arrived on/are leaving by * @input_dev: Device we arrived on - * @real_dev: The real device we are using * @h: Transport layer header * @nh: Network layer header * @mac: Link layer header @@ -190,14 +189,11 @@ struct skb_shared_info { * @end: End pointer * @destructor: Destruct function * @nfmark: Can be used for communication between hooks - * @nfcache: Cache info * @nfct: Associated connection, if any * @nfctinfo: Relationship of this skb to the connection * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c - * @private: Data which is private to the HIPPI implementation * @tc_index: Traffic control index * @tc_verd: traffic control verdict - * @tc_classid: traffic control classid */ struct sk_buff { @@ -205,12 +201,10 @@ struct sk_buff { struct sk_buff *next; struct sk_buff *prev; - struct sk_buff_head *list; struct sock *sk; struct timeval stamp; struct net_device *dev; struct net_device *input_dev; - struct net_device *real_dev; union { struct tcphdr *th; @@ -252,33 +246,27 @@ struct sk_buff { __u8 local_df:1, cloned:1, ip_summed:2, - nohdr:1; - /* 3 bits spare */ + nohdr:1, + nfctinfo:3; __u8 pkt_type; __u16 protocol; void (*destructor)(struct sk_buff *skb); #ifdef CONFIG_NETFILTER - unsigned long nfmark; - __u32 nfcache; - __u32 nfctinfo; + __u32 nfmark; struct nf_conntrack *nfct; +#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) + __u8 ipvs_property:1; +#endif #ifdef CONFIG_BRIDGE_NETFILTER struct nf_bridge_info *nf_bridge; #endif #endif /* CONFIG_NETFILTER */ -#if defined(CONFIG_HIPPI) - union { - __u32 ifield; - } private; -#endif #ifdef CONFIG_NET_SCHED - __u32 tc_index; /* traffic control index */ + __u16 tc_index; /* traffic control index */ #ifdef CONFIG_NET_CLS_ACT - __u32 tc_verd; /* traffic control verdict */ - __u32 tc_classid; /* traffic control classid */ + __u16 tc_verd; /* traffic control verdict */ #endif - #endif @@ -597,7 +585,6 @@ static inline void __skb_queue_head(stru { struct sk_buff *prev, *next; - newsk->list = list; list->qlen++; prev = (struct sk_buff *)list; next = prev->next; @@ -622,7 +609,6 @@ static inline void __skb_queue_tail(stru { struct sk_buff *prev, *next; - newsk->list = list; list->qlen++; next = (struct sk_buff *)list; prev = next->prev; @@ -655,7 +641,6 @@ static inline struct sk_buff *__skb_dequ next->prev = prev; prev->next = next; result->next = result->prev = NULL; - result->list = NULL; } return result; } @@ -664,7 +649,7 @@ static inline struct sk_buff *__skb_dequ /* * Insert a packet on a list. */ -extern void skb_insert(struct sk_buff *old, struct sk_buff *newsk); +extern void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list); static inline void __skb_insert(struct sk_buff *newsk, struct sk_buff *prev, struct sk_buff *next, struct sk_buff_head *list) @@ -672,24 +657,23 @@ static inline void __skb_insert(struct s newsk->next = next; newsk->prev = prev; next->prev = prev->next = newsk; - newsk->list = list; list->qlen++; } /* * Place a packet after a given packet in a list. */ -extern void skb_append(struct sk_buff *old, struct sk_buff *newsk); -static inline void __skb_append(struct sk_buff *old, struct sk_buff *newsk) +extern void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list); +static inline void __skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) { - __skb_insert(newsk, old, old->next, old->list); + __skb_insert(newsk, old, old->next, list); } /* * remove sk_buff from list. _Must_ be called atomically, and with * the list known.. */ -extern void skb_unlink(struct sk_buff *skb); +extern void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list); static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) { struct sk_buff *next, *prev; @@ -698,7 +682,6 @@ static inline void __skb_unlink(struct s next = skb->next; prev = skb->prev; skb->next = skb->prev = NULL; - skb->list = NULL; next->prev = prev; prev->next = next; } diff -puN include/linux/socket.h~git-net include/linux/socket.h --- devel/include/linux/socket.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/linux/socket.h 2005-08-06 23:39:10.000000000 -0700 @@ -271,6 +271,7 @@ struct ucred { #define SOL_IRDA 266 #define SOL_NETBEUI 267 #define SOL_LLC 268 +#define SOL_DCCP 269 /* IPX options */ #define IPX_TYPE 1 diff -puN include/linux/tcp.h~git-net include/linux/tcp.h --- devel/include/linux/tcp.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/linux/tcp.h 2005-08-06 23:39:10.000000000 -0700 @@ -55,24 +55,6 @@ struct tcphdr { __u16 urg_ptr; }; - -enum { - TCP_ESTABLISHED = 1, - TCP_SYN_SENT, - TCP_SYN_RECV, - TCP_FIN_WAIT1, - TCP_FIN_WAIT2, - TCP_TIME_WAIT, - TCP_CLOSE, - TCP_CLOSE_WAIT, - TCP_LAST_ACK, - TCP_LISTEN, - TCP_CLOSING, /* now a valid state */ - - TCP_MAX_STATES /* Leave at the end! */ -}; - -#define TCP_STATE_MASK 0xF #define TCP_ACTION_FIN (1 << 7) enum { @@ -195,8 +177,9 @@ struct tcp_info #include #include -#include #include +#include +#include /* This defines a selective acknowledgement block. */ struct tcp_sack_block { @@ -236,8 +219,8 @@ static inline struct tcp_request_sock *t } struct tcp_sock { - /* inet_sock has to be the first member of tcp_sock */ - struct inet_sock inet; + /* inet_connection_sock has to be the first member of tcp_sock */ + struct inet_connection_sock inet_conn; int tcp_header_len; /* Bytes of tcp header to send */ /* @@ -258,19 +241,6 @@ struct tcp_sock { __u32 snd_sml; /* Last byte of the most recently transmitted small packet */ __u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ __u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ - struct tcp_bind_bucket *bind_hash; - /* Delayed ACK control data */ - struct { - __u8 pending; /* ACK is pending */ - __u8 quick; /* Scheduled number of quick acks */ - __u8 pingpong; /* The session is interactive */ - __u8 blocked; /* Delayed ACK was blocked by socket lock*/ - __u32 ato; /* Predicted tick of soft clock */ - unsigned long timeout; /* Currently scheduled timeout */ - __u32 lrcvtime; /* timestamp of last received data packet*/ - __u16 last_seg_size; /* Size of last incoming segment */ - __u16 rcv_mss; /* MSS used for delayed ACK decisions */ - } ack; /* Data for direct copy to user */ struct { @@ -289,8 +259,8 @@ struct tcp_sock { __u16 xmit_size_goal; /* Goal for segmenting output packets */ __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */ __u8 ca_state; /* State of fast-retransmit machine */ - __u8 retransmits; /* Number of unrecovered RTO timeouts. */ + __u8 keepalive_probes; /* num of allowed keep alive probes */ __u16 advmss; /* Advertised MSS */ __u32 window_clamp; /* Maximal window to advertise */ __u32 rcv_ssthresh; /* Current window clamp */ @@ -299,8 +269,8 @@ struct tcp_sock { __u8 reordering; /* Packet reordering metric. */ __u8 frto_counter; /* Number of new acks after RTO */ - __u8 unused; - __u8 defer_accept; /* User waits for some data after accept() */ + __u8 nonagle; /* Disable Nagle algorithm? */ + /* ONE BYTE HOLE, TRY TO PACK */ /* RTT measurement */ __u32 srtt; /* smoothed round trip time << 3 */ @@ -308,19 +278,13 @@ struct tcp_sock { __u32 mdev_max; /* maximal mdev for the last rtt period */ __u32 rttvar; /* smoothed mdev_max */ __u32 rtt_seq; /* sequence number to update rttvar */ - __u32 rto; /* retransmit timeout */ __u32 packets_out; /* Packets which are "in flight" */ __u32 left_out; /* Packets which leaved network */ __u32 retrans_out; /* Retransmitted packets out */ - __u8 backoff; /* backoff */ /* * Options received (usually on last packet, some only on SYN packets). */ - __u8 nonagle; /* Disable Nagle algorithm? */ - __u8 keepalive_probes; /* num of allowed keep alive probes */ - - __u8 probes_out; /* unanswered 0 window probes */ struct tcp_options_received rx_opt; /* @@ -333,11 +297,6 @@ struct tcp_sock { __u32 snd_cwnd_used; __u32 snd_cwnd_stamp; - /* Two commonly used timers in both sender and receiver paths. */ - unsigned long timeout; - struct timer_list retransmit_timer; /* Resend (no ack) */ - struct timer_list delack_timer; /* Ack delay */ - struct sk_buff_head out_of_order_queue; /* Out of order segments go here */ struct tcp_func *af_specific; /* Operations which are AF_INET{4,6} specific */ @@ -352,7 +311,7 @@ struct tcp_sock { struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ - __u8 syn_retries; /* num of allowed syn retries */ + __u8 probes_out; /* unanswered 0 window probes */ __u8 ecn_flags; /* ECN status bits. */ __u16 prior_ssthresh; /* ssthresh saved at recovery start */ __u32 lost_out; /* Lost packets */ @@ -367,14 +326,12 @@ struct tcp_sock { int undo_retrans; /* number of undoable retransmissions. */ __u32 urg_seq; /* Seq of received urgent pointer */ __u16 urg_data; /* Saved octet of OOB data and control flags */ - __u8 pending; /* Scheduled timer event */ __u8 urg_mode; /* In urgent mode */ + /* ONE BYTE HOLE, TRY TO PACK! */ __u32 snd_up; /* Urgent pointer */ __u32 total_retrans; /* Total retransmits for entire connection */ - struct request_sock_queue accept_queue; /* FIFO of established children */ - unsigned int keepalive_time; /* time before keep alive takes place */ unsigned int keepalive_intvl; /* time interval between keep alive probes */ int linger2; @@ -406,6 +363,20 @@ static inline struct tcp_sock *tcp_sk(co return (struct tcp_sock *)sk; } +struct tcp_timewait_sock { + struct inet_timewait_sock tw_sk; + __u32 tw_rcv_nxt; + __u32 tw_snd_nxt; + __u32 tw_rcv_wnd; + __u32 tw_ts_recent; + long tw_ts_recent_stamp; +}; + +static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk) +{ + return (struct tcp_timewait_sock *)sk; +} + static inline void *tcp_ca(const struct tcp_sock *tp) { return (void *) tp->ca_priv; diff -puN include/net/act_api.h~git-net include/net/act_api.h --- devel/include/net/act_api.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/act_api.h 2005-08-06 23:39:10.000000000 -0700 @@ -63,7 +63,7 @@ struct tc_action_ops __u32 type; /* TBD to match kind */ __u32 capab; /* capabilities includes 4 bit version */ struct module *owner; - int (*act)(struct sk_buff **, struct tc_action *); + int (*act)(struct sk_buff **, struct tc_action *, struct tcf_result *); int (*get_stats)(struct sk_buff *, struct tc_action *); int (*dump)(struct sk_buff *, struct tc_action *,int , int); int (*cleanup)(struct tc_action *, int bind); diff -puN include/net/arp.h~git-net include/net/arp.h --- devel/include/net/arp.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/arp.h 2005-08-06 23:39:10.000000000 -0700 @@ -11,7 +11,7 @@ extern struct neigh_table arp_tbl; extern void arp_init(void); extern int arp_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt); + struct packet_type *pt, struct net_device *orig_dev); extern int arp_find(unsigned char *haddr, struct sk_buff *skb); extern int arp_ioctl(unsigned int cmd, void __user *arg); extern void arp_send(int type, int ptype, u32 dest_ip, diff -puN include/net/ax25.h~git-net include/net/ax25.h --- devel/include/net/ax25.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/ax25.h 2005-08-06 23:39:10.000000000 -0700 @@ -302,7 +302,7 @@ extern int ax25_protocol_is_registered( /* ax25_in.c */ extern int ax25_rx_iframe(ax25_cb *, struct sk_buff *); -extern int ax25_kiss_rcv(struct sk_buff *, struct net_device *, struct packet_type *); +extern int ax25_kiss_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); /* ax25_ip.c */ extern int ax25_encapsulate(struct sk_buff *, struct net_device *, unsigned short, void *, void *, unsigned int); diff -puN include/net/datalink.h~git-net include/net/datalink.h --- devel/include/net/datalink.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/datalink.h 2005-08-06 23:39:10.000000000 -0700 @@ -9,7 +9,7 @@ struct datalink_proto { unsigned short header_length; int (*rcvfunc)(struct sk_buff *, struct net_device *, - struct packet_type *); + struct packet_type *, struct net_device *); int (*request)(struct datalink_proto *, struct sk_buff *, unsigned char *); struct list_head node; diff -puN include/net/dn.h~git-net include/net/dn.h --- devel/include/net/dn.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/dn.h 2005-08-06 23:39:10.000000000 -0700 @@ -3,6 +3,7 @@ #include #include +#include #include typedef unsigned short dn_address; diff -puN include/net/inet_common.h~git-net include/net/inet_common.h --- devel/include/net/inet_common.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/inet_common.h 2005-08-06 23:39:10.000000000 -0700 @@ -8,6 +8,11 @@ extern struct proto_ops inet_dgram_ops; * INET4 prototypes used by INET6 */ +struct msghdr; +struct sock; +struct sockaddr; +struct socket; + extern void inet_remove_sock(struct sock *sk1); extern void inet_put_sock(unsigned short num, struct sock *sk); @@ -29,7 +34,6 @@ extern unsigned int inet_poll(struct fi extern int inet_listen(struct socket *sock, int backlog); extern void inet_sock_destruct(struct sock *sk); -extern atomic_t inet_sock_nr; extern int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); diff -puN /dev/null include/net/inet_connection_sock.h --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/include/net/inet_connection_sock.h 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,251 @@ +/* + * NET Generic infrastructure for INET connection oriented protocols. + * + * Definitions for inet_connection_sock + * + * Authors: Many people, see the TCP sources + * + * From code originally in TCP + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _INET_CONNECTION_SOCK_H +#define _INET_CONNECTION_SOCK_H + +#include +#include +#include +#include + +#define INET_CSK_DEBUG 1 + +/* Cancel timers, when they are not required. */ +#undef INET_CSK_CLEAR_TIMERS + +struct inet_bind_bucket; +struct inet_hashinfo; + +/** inet_connection_sock - INET connection oriented sock + * + * @icsk_accept_queue: FIFO of established children + * @icsk_bind_hash: Bind node + * @icsk_timeout: Timeout + * @icsk_retransmit_timer: Resend (no ack) + * @icsk_rto: Retransmit timeout + * @icsk_retransmits: Number of unrecovered [RTO] timeouts + * @icsk_pending: Scheduled timer event + * @icsk_backoff: Backoff + * @icsk_syn_retries: Number of allowed SYN (or equivalent) retries + * @icsk_ack: Delayed ACK control data + */ +struct inet_connection_sock { + /* inet_sock has to be the first member! */ + struct inet_sock icsk_inet; + struct request_sock_queue icsk_accept_queue; + struct inet_bind_bucket *icsk_bind_hash; + unsigned long icsk_timeout; + struct timer_list icsk_retransmit_timer; + struct timer_list icsk_delack_timer; + __u32 icsk_rto; + __u8 icsk_retransmits; + __u8 icsk_pending; + __u8 icsk_backoff; + __u8 icsk_syn_retries; + struct { + __u8 pending; /* ACK is pending */ + __u8 quick; /* Scheduled number of quick acks */ + __u8 pingpong; /* The session is interactive */ + __u8 blocked; /* Delayed ACK was blocked by socket lock */ + __u32 ato; /* Predicted tick of soft clock */ + unsigned long timeout; /* Currently scheduled timeout */ + __u32 lrcvtime; /* timestamp of last received data packet */ + __u16 last_seg_size; /* Size of last incoming segment */ + __u16 rcv_mss; /* MSS used for delayed ACK decisions */ + } icsk_ack; +}; + +#define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +#define ICSK_TIME_DACK 2 /* Delayed ack timer */ +#define ICSK_TIME_PROBE0 3 /* Zero window probe timer */ +#define ICSK_TIME_KEEPOPEN 4 /* Keepalive timer */ + +static inline struct inet_connection_sock *inet_csk(const struct sock *sk) +{ + return (struct inet_connection_sock *)sk; +} + +extern struct sock *inet_csk_clone(struct sock *sk, + const struct request_sock *req, + const unsigned int __nocast priority); + +enum inet_csk_ack_state_t { + ICSK_ACK_SCHED = 1, + ICSK_ACK_TIMER = 2, + ICSK_ACK_PUSHED = 4 +}; + +extern void inet_csk_init_xmit_timers(struct sock *sk, + void (*retransmit_handler)(unsigned long), + void (*delack_handler)(unsigned long), + void (*keepalive_handler)(unsigned long)); +extern void inet_csk_clear_xmit_timers(struct sock *sk); + +static inline void inet_csk_schedule_ack(struct sock *sk) +{ + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_SCHED; +} + +static inline int inet_csk_ack_scheduled(const struct sock *sk) +{ + return inet_csk(sk)->icsk_ack.pending & ICSK_ACK_SCHED; +} + +static inline void inet_csk_delack_init(struct sock *sk) +{ + memset(&inet_csk(sk)->icsk_ack, 0, sizeof(inet_csk(sk)->icsk_ack)); +} + +extern void inet_csk_delete_keepalive_timer(struct sock *sk); +extern void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long timeout); + +#ifdef INET_CSK_DEBUG +extern const char inet_csk_timer_bug_msg[]; +#endif + +static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { + icsk->icsk_pending = 0; +#ifdef INET_CSK_CLEAR_TIMERS + sk_stop_timer(sk, &icsk->icsk_retransmit_timer); +#endif + } else if (what == ICSK_TIME_DACK) { + icsk->icsk_ack.blocked = icsk->icsk_ack.pending = 0; +#ifdef INET_CSK_CLEAR_TIMERS + sk_stop_timer(sk, &icsk->icsk_delack_timer); +#endif + } +#ifdef INET_CSK_DEBUG + else { + pr_debug(inet_csk_timer_bug_msg); + } +#endif +} + +/* + * Reset the retransmission timer + */ +static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, + unsigned long when, + const unsigned long max_when) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + if (when > max_when) { +#ifdef INET_CSK_DEBUG + pr_debug("reset_xmit_timer: sk=%p %d when=0x%lx, caller=%p\n", + sk, what, when, current_text_addr()); +#endif + when = max_when; + } + + if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { + icsk->icsk_pending = what; + icsk->icsk_timeout = jiffies + when; + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); + } else if (what == ICSK_TIME_DACK) { + icsk->icsk_ack.pending |= ICSK_ACK_TIMER; + icsk->icsk_ack.timeout = jiffies + when; + sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); + } +#ifdef INET_CSK_DEBUG + else { + pr_debug(inet_csk_timer_bug_msg); + } +#endif +} + +extern struct sock *inet_csk_accept(struct sock *sk, int flags, int *err); + +extern struct request_sock *inet_csk_search_req(const struct sock *sk, + struct request_sock ***prevp, + const __u16 rport, + const __u32 raddr, + const __u32 laddr); +extern int inet_csk_get_port(struct inet_hashinfo *hashinfo, + struct sock *sk, unsigned short snum); + +extern struct dst_entry* inet_csk_route_req(struct sock *sk, + const struct request_sock *req); + +static inline void inet_csk_reqsk_queue_add(struct sock *sk, + struct request_sock *req, + struct sock *child) +{ + reqsk_queue_add(&inet_csk(sk)->icsk_accept_queue, req, sk, child); +} + +extern void inet_csk_reqsk_queue_hash_add(struct sock *sk, + struct request_sock *req, + const unsigned timeout); + +static inline void inet_csk_reqsk_queue_removed(struct sock *sk, + struct request_sock *req) +{ + if (reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req) == 0) + inet_csk_delete_keepalive_timer(sk); +} + +static inline void inet_csk_reqsk_queue_added(struct sock *sk, + const unsigned long timeout) +{ + if (reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue) == 0) + inet_csk_reset_keepalive_timer(sk, timeout); +} + +static inline int inet_csk_reqsk_queue_len(const struct sock *sk) +{ + return reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue); +} + +static inline int inet_csk_reqsk_queue_young(const struct sock *sk) +{ + return reqsk_queue_len_young(&inet_csk(sk)->icsk_accept_queue); +} + +static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) +{ + return reqsk_queue_is_full(&inet_csk(sk)->icsk_accept_queue); +} + +static inline void inet_csk_reqsk_queue_unlink(struct sock *sk, + struct request_sock *req, + struct request_sock **prev) +{ + reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req, prev); +} + +static inline void inet_csk_reqsk_queue_drop(struct sock *sk, + struct request_sock *req, + struct request_sock **prev) +{ + inet_csk_reqsk_queue_unlink(sk, req, prev); + inet_csk_reqsk_queue_removed(sk, req); + reqsk_free(req); +} + +extern void inet_csk_reqsk_queue_prune(struct sock *parent, + const unsigned long interval, + const unsigned long timeout, + const unsigned long max_rto); + +extern void inet_csk_destroy_sock(struct sock *sk); +extern int inet_csk_listen_start(struct sock *sk, const int nr_table_entries); +extern void inet_csk_listen_stop(struct sock *sk); + +#endif /* _INET_CONNECTION_SOCK_H */ diff -puN /dev/null include/net/inet_hashtables.h --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/include/net/inet_hashtables.h 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,427 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Authors: Lotsa people, from code originally in tcp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _INET_HASHTABLES_H +#define _INET_HASHTABLES_H + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +/* This is for all connections with a full identity, no wildcards. + * New scheme, half the table is for TIME_WAIT, the other half is + * for the rest. I'll experiment with dynamic table growth later. + */ +struct inet_ehash_bucket { + rwlock_t lock; + struct hlist_head chain; +} __attribute__((__aligned__(8))); + +/* There are a few simple rules, which allow for local port reuse by + * an application. In essence: + * + * 1) Sockets bound to different interfaces may share a local port. + * Failing that, goto test 2. + * 2) If all sockets have sk->sk_reuse set, and none of them are in + * TCP_LISTEN state, the port may be shared. + * Failing that, goto test 3. + * 3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local + * address, and none of them are the same, the port may be + * shared. + * Failing this, the port cannot be shared. + * + * The interesting point, is test #2. This is what an FTP server does + * all day. To optimize this case we use a specific flag bit defined + * below. As we add sockets to a bind bucket list, we perform a + * check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN)) + * As long as all sockets added to a bind bucket pass this test, + * the flag bit will be set. + * The resulting situation is that tcp_v[46]_verify_bind() can just check + * for this flag bit, if it is set and the socket trying to bind has + * sk->sk_reuse set, we don't even have to walk the owners list at all, + * we return that it is ok to bind this socket to the requested local port. + * + * Sounds like a lot of work, but it is worth it. In a more naive + * implementation (ie. current FreeBSD etc.) the entire list of ports + * must be walked for each data port opened by an ftp server. Needless + * to say, this does not scale at all. With a couple thousand FTP + * users logged onto your box, isn't it nice to know that new data + * ports are created in O(1) time? I thought so. ;-) -DaveM + */ +struct inet_bind_bucket { + unsigned short port; + signed short fastreuse; + struct hlist_node node; + struct hlist_head owners; +}; + +#define inet_bind_bucket_for_each(tb, node, head) \ + hlist_for_each_entry(tb, node, head, node) + +struct inet_bind_hashbucket { + spinlock_t lock; + struct hlist_head chain; +}; + +/* This is for listening sockets, thus all sockets which possess wildcards. */ +#define INET_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */ + +struct inet_hashinfo { + /* This is for sockets with full identity only. Sockets here will + * always be without wildcards and will have the following invariant: + * + * TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE + * + * First half of the table is for sockets not in TIME_WAIT, second half + * is for TIME_WAIT sockets only. + */ + struct inet_ehash_bucket *ehash; + + /* Ok, let's try this, I give up, we do need a local binding + * TCP hash as well as the others for fast bind/connect. + */ + struct inet_bind_hashbucket *bhash; + + int bhash_size; + int ehash_size; + + /* All sockets in TCP_LISTEN state will be in here. This is the only + * table where wildcard'd TCP sockets can exist. Hash function here + * is just local port number. + */ + struct hlist_head listening_hash[INET_LHTABLE_SIZE]; + + /* All the above members are written once at bootup and + * never written again _or_ are predominantly read-access. + * + * Now align to a new cache line as all the following members + * are often dirty. + */ + rwlock_t lhash_lock ____cacheline_aligned; + atomic_t lhash_users; + wait_queue_head_t lhash_wait; + spinlock_t portalloc_lock; + kmem_cache_t *bind_bucket_cachep; + int port_rover; +}; + +static inline int inet_ehashfn(const __u32 laddr, const __u16 lport, + const __u32 faddr, const __u16 fport, + const int ehash_size) +{ + int h = (laddr ^ lport) ^ (faddr ^ fport); + h ^= h >> 16; + h ^= h >> 8; + return h & (ehash_size - 1); +} + +static inline int inet_sk_ehashfn(const struct sock *sk, const int ehash_size) +{ + const struct inet_sock *inet = inet_sk(sk); + const __u32 laddr = inet->rcv_saddr; + const __u16 lport = inet->num; + const __u32 faddr = inet->daddr; + const __u16 fport = inet->dport; + + return inet_ehashfn(laddr, lport, faddr, fport, ehash_size); +} + +extern struct inet_bind_bucket * + inet_bind_bucket_create(kmem_cache_t *cachep, + struct inet_bind_hashbucket *head, + const unsigned short snum); +extern void inet_bind_bucket_destroy(kmem_cache_t *cachep, + struct inet_bind_bucket *tb); + +static inline int inet_bhashfn(const __u16 lport, const int bhash_size) +{ + return lport & (bhash_size - 1); +} + +extern void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, + const unsigned short snum); + +/* These can have wildcards, don't try too hard. */ +static inline int inet_lhashfn(const unsigned short num) +{ + return num & (INET_LHTABLE_SIZE - 1); +} + +static inline int inet_sk_listen_hashfn(const struct sock *sk) +{ + return inet_lhashfn(inet_sk(sk)->num); +} + +/* Caller must disable local BH processing. */ +static inline void __inet_inherit_port(struct inet_hashinfo *table, + struct sock *sk, struct sock *child) +{ + const int bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size); + struct inet_bind_hashbucket *head = &table->bhash[bhash]; + struct inet_bind_bucket *tb; + + spin_lock(&head->lock); + tb = inet_csk(sk)->icsk_bind_hash; + sk_add_bind_node(child, &tb->owners); + inet_csk(child)->icsk_bind_hash = tb; + spin_unlock(&head->lock); +} + +static inline void inet_inherit_port(struct inet_hashinfo *table, + struct sock *sk, struct sock *child) +{ + local_bh_disable(); + __inet_inherit_port(table, sk, child); + local_bh_enable(); +} + +extern void inet_put_port(struct inet_hashinfo *table, struct sock *sk); + +extern void inet_listen_wlock(struct inet_hashinfo *hashinfo); + +/* + * - We may sleep inside this lock. + * - If sleeping is not required (or called from BH), + * use plain read_(un)lock(&inet_hashinfo.lhash_lock). + */ +static inline void inet_listen_lock(struct inet_hashinfo *hashinfo) +{ + /* read_lock synchronizes to candidates to writers */ + read_lock(&hashinfo->lhash_lock); + atomic_inc(&hashinfo->lhash_users); + read_unlock(&hashinfo->lhash_lock); +} + +static inline void inet_listen_unlock(struct inet_hashinfo *hashinfo) +{ + if (atomic_dec_and_test(&hashinfo->lhash_users)) + wake_up(&hashinfo->lhash_wait); +} + +static inline void __inet_hash(struct inet_hashinfo *hashinfo, + struct sock *sk, const int listen_possible) +{ + struct hlist_head *list; + rwlock_t *lock; + + BUG_TRAP(sk_unhashed(sk)); + if (listen_possible && sk->sk_state == TCP_LISTEN) { + list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + lock = &hashinfo->lhash_lock; + inet_listen_wlock(hashinfo); + } else { + sk->sk_hashent = inet_sk_ehashfn(sk, hashinfo->ehash_size); + list = &hashinfo->ehash[sk->sk_hashent].chain; + lock = &hashinfo->ehash[sk->sk_hashent].lock; + write_lock(lock); + } + __sk_add_node(sk, list); + sock_prot_inc_use(sk->sk_prot); + write_unlock(lock); + if (listen_possible && sk->sk_state == TCP_LISTEN) + wake_up(&hashinfo->lhash_wait); +} + +static inline void inet_hash(struct inet_hashinfo *hashinfo, struct sock *sk) +{ + if (sk->sk_state != TCP_CLOSE) { + local_bh_disable(); + __inet_hash(hashinfo, sk, 1); + local_bh_enable(); + } +} + +static inline void inet_unhash(struct inet_hashinfo *hashinfo, struct sock *sk) +{ + rwlock_t *lock; + + if (sk_unhashed(sk)) + goto out; + + if (sk->sk_state == TCP_LISTEN) { + local_bh_disable(); + inet_listen_wlock(hashinfo); + lock = &hashinfo->lhash_lock; + } else { + struct inet_ehash_bucket *head = &hashinfo->ehash[sk->sk_hashent]; + lock = &head->lock; + write_lock_bh(&head->lock); + } + + if (__sk_del_node_init(sk)) + sock_prot_dec_use(sk->sk_prot); + write_unlock_bh(lock); +out: + if (sk->sk_state == TCP_LISTEN) + wake_up(&hashinfo->lhash_wait); +} + +static inline int inet_iif(const struct sk_buff *skb) +{ + return ((struct rtable *)skb->dst)->rt_iif; +} + +extern struct sock *__inet_lookup_listener(const struct hlist_head *head, + const u32 daddr, + const unsigned short hnum, + const int dif); + +/* Optimize the common listener case. */ +static inline struct sock * + inet_lookup_listener(struct inet_hashinfo *hashinfo, + const u32 daddr, + const unsigned short hnum, const int dif) +{ + struct sock *sk = NULL; + const struct hlist_head *head; + + read_lock(&hashinfo->lhash_lock); + head = &hashinfo->listening_hash[inet_lhashfn(hnum)]; + if (!hlist_empty(head)) { + const struct inet_sock *inet = inet_sk((sk = __sk_head(head))); + + if (inet->num == hnum && !sk->sk_node.next && + (!inet->rcv_saddr || inet->rcv_saddr == daddr) && + (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && + !sk->sk_bound_dev_if) + goto sherry_cache; + sk = __inet_lookup_listener(head, daddr, hnum, dif); + } + if (sk) { +sherry_cache: + sock_hold(sk); + } + read_unlock(&hashinfo->lhash_lock); + return sk; +} + +/* Socket demux engine toys. */ +#ifdef __BIG_ENDIAN +#define INET_COMBINED_PORTS(__sport, __dport) \ + (((__u32)(__sport) << 16) | (__u32)(__dport)) +#else /* __LITTLE_ENDIAN */ +#define INET_COMBINED_PORTS(__sport, __dport) \ + (((__u32)(__dport) << 16) | (__u32)(__sport)) +#endif + +#if (BITS_PER_LONG == 64) +#ifdef __BIG_ENDIAN +#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ + const __u64 __name = (((__u64)(__saddr)) << 32) | ((__u64)(__daddr)); +#else /* __LITTLE_ENDIAN */ +#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ + const __u64 __name = (((__u64)(__daddr)) << 32) | ((__u64)(__saddr)); +#endif /* __BIG_ENDIAN */ +#define INET_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\ + (((*((__u64 *)&(inet_sk(__sk)->daddr))) == (__cookie)) && \ + ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \ + (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) +#define INET_TW_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\ + (((*((__u64 *)&(inet_twsk(__sk)->tw_daddr))) == (__cookie)) && \ + ((*((__u32 *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) && \ + (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) +#else /* 32-bit arch */ +#define INET_ADDR_COOKIE(__name, __saddr, __daddr) +#define INET_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif) \ + ((inet_sk(__sk)->daddr == (__saddr)) && \ + (inet_sk(__sk)->rcv_saddr == (__daddr)) && \ + ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \ + (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) +#define INET_TW_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif) \ + ((inet_twsk(__sk)->tw_daddr == (__saddr)) && \ + (inet_twsk(__sk)->tw_rcv_saddr == (__daddr)) && \ + ((*((__u32 *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) && \ + (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) +#endif /* 64-bit arch */ + +/* + * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need + * not check it for lookups anymore, thanks Alexey. -DaveM + * + * Local BH must be disabled here. + */ +static inline struct sock * + __inet_lookup_established(struct inet_hashinfo *hashinfo, + const u32 saddr, const u16 sport, + const u32 daddr, const u16 hnum, + const int dif) +{ + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __u32 ports = INET_COMBINED_PORTS(sport, hnum); + struct sock *sk; + const struct hlist_node *node; + /* Optimize here for direct hit, only listening connections can + * have wildcards anyways. + */ + const int hash = inet_ehashfn(daddr, hnum, saddr, sport, hashinfo->ehash_size); + struct inet_ehash_bucket *head = &hashinfo->ehash[hash]; + + read_lock(&head->lock); + sk_for_each(sk, node, &head->chain) { + if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif)) + goto hit; /* You sunk my battleship! */ + } + + /* Must check for a TIME_WAIT'er before going to listener hash. */ + sk_for_each(sk, node, &(head + hashinfo->ehash_size)->chain) { + if (INET_TW_MATCH(sk, acookie, saddr, daddr, ports, dif)) + goto hit; + } + sk = NULL; +out: + read_unlock(&head->lock); + return sk; +hit: + sock_hold(sk); + goto out; +} + +static inline struct sock *__inet_lookup(struct inet_hashinfo *hashinfo, + const u32 saddr, const u16 sport, + const u32 daddr, const u16 hnum, + const int dif) +{ + struct sock *sk = __inet_lookup_established(hashinfo, saddr, sport, daddr, + hnum, dif); + return sk ? : inet_lookup_listener(hashinfo, daddr, hnum, dif); +} + +static inline struct sock *inet_lookup(struct inet_hashinfo *hashinfo, + const u32 saddr, const u16 sport, + const u32 daddr, const u16 dport, + const int dif) +{ + struct sock *sk; + + local_bh_disable(); + sk = __inet_lookup(hashinfo, saddr, sport, daddr, ntohs(dport), dif); + local_bh_enable(); + + return sk; +} +#endif /* _INET_HASHTABLES_H */ diff -puN /dev/null include/net/inet_timewait_sock.h --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/include/net/inet_timewait_sock.h 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,154 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Definitions for a generic INET TIMEWAIT sock + * + * From code originally in net/tcp.h + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _INET_TIMEWAIT_SOCK_ +#define _INET_TIMEWAIT_SOCK_ + +#include + +#include +#include +#include + +#include +#include + +#include + +#if (BITS_PER_LONG == 64) +#define INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES 8 +#else +#define INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES 4 +#endif + +struct inet_bind_bucket; +struct inet_hashinfo; + +/* + * This is a TIME_WAIT sock. It works around the memory consumption + * problems of sockets in such a state on heavily loaded servers, but + * without violating the protocol specification. + */ +struct inet_timewait_sock { + /* + * Now struct sock also uses sock_common, so please just + * don't add nothing before this first member (__tw_common) --acme + */ + struct sock_common __tw_common; +#define tw_family __tw_common.skc_family +#define tw_state __tw_common.skc_state +#define tw_reuse __tw_common.skc_reuse +#define tw_bound_dev_if __tw_common.skc_bound_dev_if +#define tw_node __tw_common.skc_node +#define tw_bind_node __tw_common.skc_bind_node +#define tw_refcnt __tw_common.skc_refcnt +#define tw_prot __tw_common.skc_prot + volatile unsigned char tw_substate; + /* 3 bits hole, try to pack */ + unsigned char tw_rcv_wscale; + /* Socket demultiplex comparisons on incoming packets. */ + /* these five are in inet_sock */ + __u16 tw_sport; + __u32 tw_daddr __attribute__((aligned(INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES))); + __u32 tw_rcv_saddr; + __u16 tw_dport; + __u16 tw_num; + /* And these are ours. */ + __u8 tw_ipv6only:1; + /* 31 bits hole, try to pack */ + int tw_hashent; + int tw_timeout; + unsigned long tw_ttd; + struct inet_bind_bucket *tw_tb; + struct hlist_node tw_death_node; +}; + +static inline void inet_twsk_add_node(struct inet_timewait_sock *tw, + struct hlist_head *list) +{ + hlist_add_head(&tw->tw_node, list); +} + +static inline void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, + struct hlist_head *list) +{ + hlist_add_head(&tw->tw_bind_node, list); +} + +static inline int inet_twsk_dead_hashed(const struct inet_timewait_sock *tw) +{ + return tw->tw_death_node.pprev != NULL; +} + +static inline void inet_twsk_dead_node_init(struct inet_timewait_sock *tw) +{ + tw->tw_death_node.pprev = NULL; +} + +static inline void __inet_twsk_del_dead_node(struct inet_timewait_sock *tw) +{ + __hlist_del(&tw->tw_death_node); + inet_twsk_dead_node_init(tw); +} + +static inline int inet_twsk_del_dead_node(struct inet_timewait_sock *tw) +{ + if (inet_twsk_dead_hashed(tw)) { + __inet_twsk_del_dead_node(tw); + return 1; + } + return 0; +} + +#define inet_twsk_for_each(tw, node, head) \ + hlist_for_each_entry(tw, node, head, tw_node) + +#define inet_twsk_for_each_inmate(tw, node, jail) \ + hlist_for_each_entry(tw, node, jail, tw_death_node) + +#define inet_twsk_for_each_inmate_safe(tw, node, safe, jail) \ + hlist_for_each_entry_safe(tw, node, safe, jail, tw_death_node) + +static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk) +{ + return (struct inet_timewait_sock *)sk; +} + +static inline u32 inet_rcv_saddr(const struct sock *sk) +{ + return likely(sk->sk_state != TCP_TIME_WAIT) ? + inet_sk(sk)->rcv_saddr : inet_twsk(sk)->tw_rcv_saddr; +} + +static inline void inet_twsk_put(struct inet_timewait_sock *tw) +{ + if (atomic_dec_and_test(&tw->tw_refcnt)) { +#ifdef SOCK_REFCNT_DEBUG + printk(KERN_DEBUG "%s timewait_sock %p released\n", + tw->tw_prot->name, tw); +#endif + kmem_cache_free(tw->tw_prot->twsk_slab, tw); + } +} + +extern struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, + const int state); + +extern void __inet_twsk_kill(struct inet_timewait_sock *tw, + struct inet_hashinfo *hashinfo); + +extern void __inet_twsk_hashdance(struct inet_timewait_sock *tw, + struct sock *sk, + struct inet_hashinfo *hashinfo); +#endif /* _INET_TIMEWAIT_SOCK_ */ diff -puN include/net/ip6_route.h~git-net include/net/ip6_route.h --- devel/include/net/ip6_route.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/ip6_route.h 2005-08-06 23:39:10.000000000 -0700 @@ -12,7 +12,6 @@ #include #include #include -#include #include #include diff -puN include/net/ip.h~git-net include/net/ip.h --- devel/include/net/ip.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/ip.h 2005-08-06 23:39:10.000000000 -0700 @@ -86,7 +86,7 @@ extern int ip_build_and_send_pkt(struct u32 saddr, u32 daddr, struct ip_options *opt); extern int ip_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt); + struct packet_type *pt, struct net_device *orig_dev); extern int ip_local_deliver(struct sk_buff *skb); extern int ip_mr_input(struct sk_buff *skb); extern int ip_output(struct sk_buff *skb); @@ -140,8 +140,6 @@ struct ip_reply_arg { void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, unsigned int len); -extern int ip_finish_output(struct sk_buff *skb); - struct ipv4_config { int log_martians; diff -puN include/net/ipv6.h~git-net include/net/ipv6.h --- devel/include/net/ipv6.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/ipv6.h 2005-08-06 23:39:10.000000000 -0700 @@ -145,7 +145,6 @@ DECLARE_SNMP_STAT(struct udp_mib, udp_st #define UDP6_INC_STATS(field) SNMP_INC_STATS(udp_stats_in6, field) #define UDP6_INC_STATS_BH(field) SNMP_INC_STATS_BH(udp_stats_in6, field) #define UDP6_INC_STATS_USER(field) SNMP_INC_STATS_USER(udp_stats_in6, field) -extern atomic_t inet6_sock_nr; int snmp6_register_dev(struct inet6_dev *idev); int snmp6_unregister_dev(struct inet6_dev *idev); @@ -346,7 +345,8 @@ static inline int ipv6_addr_any(const st extern int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt); + struct packet_type *pt, + struct net_device *orig_dev); /* * upper-layer output functions diff -puN include/net/ip_vs.h~git-net include/net/ip_vs.h --- devel/include/net/ip_vs.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/ip_vs.h 2005-08-06 23:39:10.000000000 -0700 @@ -255,7 +255,6 @@ struct ip_vs_daemon_user { #include /* for struct atomic_t */ #include /* for struct neighbour */ #include /* for struct dst_entry */ -#include #include #include diff -puN include/net/llc.h~git-net include/net/llc.h --- devel/include/net/llc.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/llc.h 2005-08-06 23:39:10.000000000 -0700 @@ -46,7 +46,8 @@ struct llc_sap { unsigned char f_bit; int (*rcv_func)(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt); + struct packet_type *pt, + struct net_device *orig_dev); struct llc_addr laddr; struct list_head node; struct { @@ -64,7 +65,7 @@ extern rwlock_t llc_sap_list_lock; extern unsigned char llc_station_mac_sa[ETH_ALEN]; extern int llc_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt); + struct packet_type *pt, struct net_device *orig_dev); extern int llc_mac_hdr_init(struct sk_buff *skb, unsigned char *sa, unsigned char *da); @@ -78,7 +79,8 @@ extern void llc_set_station_handler(void extern struct llc_sap *llc_sap_open(unsigned char lsap, int (*rcv)(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt)); + struct packet_type *pt, + struct net_device *orig_dev)); extern void llc_sap_close(struct llc_sap *sap); extern struct llc_sap *llc_sap_find(unsigned char sap_value); diff -puN include/net/p8022.h~git-net include/net/p8022.h --- devel/include/net/p8022.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/p8022.h 2005-08-06 23:39:10.000000000 -0700 @@ -4,7 +4,8 @@ extern struct datalink_proto * register_8022_client(unsigned char type, int (*func)(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt)); + struct packet_type *pt, + struct net_device *orig_dev)); extern void unregister_8022_client(struct datalink_proto *proto); #endif diff -puN include/net/pkt_cls.h~git-net include/net/pkt_cls.h --- devel/include/net/pkt_cls.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/pkt_cls.h 2005-08-06 23:39:10.000000000 -0700 @@ -352,10 +352,10 @@ tcf_change_indev(struct tcf_proto *tp, c static inline int tcf_match_indev(struct sk_buff *skb, char *indev) { - if (0 != indev[0]) { - if (NULL == skb->input_dev) + if (indev[0]) { + if (!skb->input_dev) return 0; - else if (0 != strcmp(indev, skb->input_dev->name)) + if (strcmp(indev, skb->input_dev->name)) return 0; } diff -puN include/net/psnap.h~git-net include/net/psnap.h --- devel/include/net/psnap.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/psnap.h 2005-08-06 23:39:10.000000000 -0700 @@ -1,7 +1,7 @@ #ifndef _NET_PSNAP_H #define _NET_PSNAP_H -extern struct datalink_proto *register_snap_client(unsigned char *desc, int (*rcvfunc)(struct sk_buff *, struct net_device *, struct packet_type *)); +extern struct datalink_proto *register_snap_client(unsigned char *desc, int (*rcvfunc)(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *orig_dev)); extern void unregister_snap_client(struct datalink_proto *proto); #endif diff -puN include/net/raw.h~git-net include/net/raw.h --- devel/include/net/raw.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/raw.h 2005-08-06 23:39:10.000000000 -0700 @@ -37,6 +37,6 @@ extern struct sock *__raw_v4_lookup(stru unsigned long raddr, unsigned long laddr, int dif); -extern void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash); +extern int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash); #endif /* _RAW_H */ diff -puN include/net/rawv6.h~git-net include/net/rawv6.h --- devel/include/net/rawv6.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/rawv6.h 2005-08-06 23:39:10.000000000 -0700 @@ -7,10 +7,11 @@ extern struct hlist_head raw_v6_htable[RAWV6_HTABLE_SIZE]; extern rwlock_t raw_v6_lock; -extern void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr); +extern int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr); extern struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num, - struct in6_addr *loc_addr, struct in6_addr *rmt_addr); + struct in6_addr *loc_addr, struct in6_addr *rmt_addr, + int dif); extern int rawv6_rcv(struct sock *sk, struct sk_buff *skb); diff -puN include/net/request_sock.h~git-net include/net/request_sock.h --- devel/include/net/request_sock.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/request_sock.h 2005-08-06 23:39:10.000000000 -0700 @@ -89,6 +89,7 @@ struct listen_sock { int qlen_young; int clock_hand; u32 hash_rnd; + u32 nr_table_entries; struct request_sock *syn_table[0]; }; @@ -96,6 +97,7 @@ struct listen_sock { * * @rskq_accept_head - FIFO head of established children * @rskq_accept_tail - FIFO tail of established children + * @rskq_defer_accept - User waits for some data after accept() * @syn_wait_lock - serializer * * %syn_wait_lock is necessary only to avoid proc interface having to grab the main @@ -111,6 +113,8 @@ struct request_sock_queue { struct request_sock *rskq_accept_head; struct request_sock *rskq_accept_tail; rwlock_t syn_wait_lock; + u8 rskq_defer_accept; + /* 3 bytes hole, try to pack */ struct listen_sock *listen_opt; }; @@ -129,11 +133,13 @@ static inline struct listen_sock *reqsk_ return lopt; } -static inline void reqsk_queue_destroy(struct request_sock_queue *queue) +static inline void __reqsk_queue_destroy(struct request_sock_queue *queue) { kfree(reqsk_queue_yank_listen_sk(queue)); } +extern void reqsk_queue_destroy(struct request_sock_queue *queue); + static inline struct request_sock * reqsk_queue_yank_acceptq(struct request_sock_queue *queue) { @@ -221,17 +227,17 @@ static inline int reqsk_queue_added(stru return prev_qlen; } -static inline int reqsk_queue_len(struct request_sock_queue *queue) +static inline int reqsk_queue_len(const struct request_sock_queue *queue) { return queue->listen_opt != NULL ? queue->listen_opt->qlen : 0; } -static inline int reqsk_queue_len_young(struct request_sock_queue *queue) +static inline int reqsk_queue_len_young(const struct request_sock_queue *queue) { return queue->listen_opt->qlen_young; } -static inline int reqsk_queue_is_full(struct request_sock_queue *queue) +static inline int reqsk_queue_is_full(const struct request_sock_queue *queue) { return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log; } diff -puN include/net/route.h~git-net include/net/route.h --- devel/include/net/route.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/route.h 2005-08-06 23:39:10.000000000 -0700 @@ -105,10 +105,6 @@ struct rt_cache_stat unsigned int out_hlist_search; }; -extern struct rt_cache_stat *rt_cache_stat; -#define RT_CACHE_STAT_INC(field) \ - (per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++) - extern struct ip_rt_acct *ip_rt_acct; struct in_device; diff -puN include/net/sctp/constants.h~git-net include/net/sctp/constants.h --- devel/include/net/sctp/constants.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/sctp/constants.h 2005-08-06 23:39:10.000000000 -0700 @@ -47,10 +47,10 @@ #ifndef __sctp_constants_h__ #define __sctp_constants_h__ -#include /* For TCP states used in sctp_sock_state_t */ #include #include /* For ipv6hdr. */ #include +#include /* For TCP states used in sctp_sock_state_t */ /* Value used for stream negotiation. */ enum { SCTP_MAX_STREAM = 0xffff }; diff -puN include/net/sock.h~git-net include/net/sock.h --- devel/include/net/sock.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/sock.h 2005-08-06 23:39:10.000000000 -0700 @@ -88,6 +88,7 @@ do { spin_lock_init(&((__sk)->sk_lock.sl } while(0) struct sock; +struct proto; /** * struct sock_common - minimal network layer representation of sockets @@ -98,10 +99,11 @@ struct sock; * @skc_node: main hash linkage for various protocol lookup tables * @skc_bind_node: bind hash linkage for various protocol lookup tables * @skc_refcnt: reference count + * @skc_prot: protocol handlers inside a network family * * This is the minimal network layer representation of sockets, the header - * for struct sock and struct tcp_tw_bucket. - */ + * for struct sock and struct inet_timewait_sock. + */ struct sock_common { unsigned short skc_family; volatile unsigned char skc_state; @@ -110,11 +112,12 @@ struct sock_common { struct hlist_node skc_node; struct hlist_node skc_bind_node; atomic_t skc_refcnt; + struct proto *skc_prot; }; /** * struct sock - network layer representation of sockets - * @__sk_common: shared layout with tcp_tw_bucket + * @__sk_common: shared layout with inet_timewait_sock * @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings * @sk_lock: synchronizer @@ -136,11 +139,10 @@ struct sock_common { * @sk_no_check: %SO_NO_CHECK setting, wether or not checkup packets * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) * @sk_lingertime: %SO_LINGER l_linger setting - * @sk_hashent: hash entry in several tables (e.g. tcp_ehash) + * @sk_hashent: hash entry in several tables (e.g. inet_hashinfo.ehash) * @sk_backlog: always used with the per-socket spinlock held * @sk_callback_lock: used with the callbacks in the end of this struct * @sk_error_queue: rarely used - * @sk_prot: protocol handlers inside a network family * @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt, IPV6_ADDRFORM for instance) * @sk_err: last error * @sk_err_soft: errors that don't cause failure but are the cause of a persistent failure not just 'timed out' @@ -173,7 +175,7 @@ struct sock_common { */ struct sock { /* - * Now struct tcp_tw_bucket also uses sock_common, so please just + * Now struct inet_timewait_sock also uses sock_common, so please just * don't add nothing before this first member (__sk_common) --acme */ struct sock_common __sk_common; @@ -184,6 +186,7 @@ struct sock { #define sk_node __sk_common.skc_node #define sk_bind_node __sk_common.skc_bind_node #define sk_refcnt __sk_common.skc_refcnt +#define sk_prot __sk_common.skc_prot unsigned char sk_shutdown : 2, sk_no_check : 2, sk_userlocks : 4; @@ -218,7 +221,6 @@ struct sock { struct sk_buff *tail; } sk_backlog; struct sk_buff_head sk_error_queue; - struct proto *sk_prot; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; int sk_err, @@ -253,28 +255,28 @@ struct sock { /* * Hashed lists helper routines */ -static inline struct sock *__sk_head(struct hlist_head *head) +static inline struct sock *__sk_head(const struct hlist_head *head) { return hlist_entry(head->first, struct sock, sk_node); } -static inline struct sock *sk_head(struct hlist_head *head) +static inline struct sock *sk_head(const struct hlist_head *head) { return hlist_empty(head) ? NULL : __sk_head(head); } -static inline struct sock *sk_next(struct sock *sk) +static inline struct sock *sk_next(const struct sock *sk) { return sk->sk_node.next ? hlist_entry(sk->sk_node.next, struct sock, sk_node) : NULL; } -static inline int sk_unhashed(struct sock *sk) +static inline int sk_unhashed(const struct sock *sk) { return hlist_unhashed(&sk->sk_node); } -static inline int sk_hashed(struct sock *sk) +static inline int sk_hashed(const struct sock *sk) { return sk->sk_node.pprev != NULL; } @@ -549,6 +551,10 @@ struct proto { kmem_cache_t *slab; unsigned int obj_size; + kmem_cache_t *twsk_slab; + unsigned int twsk_obj_size; + atomic_t *orphan_count; + struct request_sock_ops *rsk_prot; struct module *owner; @@ -556,7 +562,9 @@ struct proto { char name[32]; struct list_head node; - +#ifdef SOCK_REFCNT_DEBUG + atomic_t socks; +#endif struct { int inuse; u8 __pad[SMP_CACHE_BYTES - sizeof(int)]; @@ -566,6 +574,31 @@ struct proto { extern int proto_register(struct proto *prot, int alloc_slab); extern void proto_unregister(struct proto *prot); +#ifdef SOCK_REFCNT_DEBUG +static inline void sk_refcnt_debug_inc(struct sock *sk) +{ + atomic_inc(&sk->sk_prot->socks); +} + +static inline void sk_refcnt_debug_dec(struct sock *sk) +{ + atomic_dec(&sk->sk_prot->socks); + printk(KERN_DEBUG "%s socket %p released, %d are still alive\n", + sk->sk_prot->name, sk, atomic_read(&sk->sk_prot->socks)); +} + +static inline void sk_refcnt_debug_release(const struct sock *sk) +{ + if (atomic_read(&sk->sk_refcnt) != 1) + printk(KERN_DEBUG "Destruction of the %s socket %p delayed, refcnt=%d\n", + sk->sk_prot->name, sk, atomic_read(&sk->sk_refcnt)); +} +#else /* SOCK_REFCNT_DEBUG */ +#define sk_refcnt_debug_inc(sk) do { } while (0) +#define sk_refcnt_debug_dec(sk) do { } while (0) +#define sk_refcnt_debug_release(sk) do { } while (0) +#endif /* SOCK_REFCNT_DEBUG */ + /* Called with local bh disabled */ static __inline__ void sock_prot_inc_use(struct proto *prot) { @@ -577,6 +610,15 @@ static __inline__ void sock_prot_dec_use prot->stats[smp_processor_id()].inuse--; } +/* With per-bucket locks this operation is not-atomic, so that + * this version is not worse. + */ +static inline void __sk_prot_rehash(struct sock *sk) +{ + sk->sk_prot->unhash(sk); + sk->sk_prot->hash(sk); +} + /* About 10 seconds */ #define SOCK_DESTROY_TIME (10*HZ) @@ -688,6 +730,8 @@ extern struct sock *sk_alloc(int family unsigned int __nocast priority, struct proto *prot, int zero_it); extern void sk_free(struct sock *sk); +extern struct sock *sk_clone(const struct sock *sk, + const unsigned int __nocast priority); extern struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, @@ -981,6 +1025,16 @@ sk_dst_check(struct sock *sk, u32 cookie return dst; } +static inline void sk_setup_caps(struct sock *sk, struct dst_entry *dst) +{ + __sk_dst_set(sk, dst); + sk->sk_route_caps = dst->dev->features; + if (sk->sk_route_caps & NETIF_F_TSO) { + if (sock_flag(sk, SOCK_NO_LARGESEND) || dst->header_len) + sk->sk_route_caps &= ~NETIF_F_TSO; + } +} + static inline void sk_charge_skb(struct sock *sk, struct sk_buff *skb) { sk->sk_wmem_queued += skb->truesize; diff -puN include/net/tcp_ecn.h~git-net include/net/tcp_ecn.h --- devel/include/net/tcp_ecn.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/tcp_ecn.h 2005-08-06 23:39:10.000000000 -0700 @@ -88,7 +88,7 @@ static inline void TCP_ECN_check_ce(stru * it is surely retransmit. It is not in ECN RFC, * but Linux follows this rule. */ else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags))) - tcp_enter_quickack_mode(tp); + tcp_enter_quickack_mode((struct sock *)tp); } } diff -puN include/net/tcp.h~git-net include/net/tcp.h --- devel/include/net/tcp.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/tcp.h 2005-08-06 23:39:10.000000000 -0700 @@ -21,360 +21,30 @@ #define TCP_DEBUG 1 #define FASTRETRANS_DEBUG 1 -/* Cancel timers, when they are not required. */ -#undef TCP_CLEAR_TIMERS - #include #include #include #include #include #include + +#include +#include #include #include #include #include #include -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) -#include -#endif -#include - -/* This is for all connections with a full identity, no wildcards. - * New scheme, half the table is for TIME_WAIT, the other half is - * for the rest. I'll experiment with dynamic table growth later. - */ -struct tcp_ehash_bucket { - rwlock_t lock; - struct hlist_head chain; -} __attribute__((__aligned__(8))); - -/* This is for listening sockets, thus all sockets which possess wildcards. */ -#define TCP_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */ - -/* There are a few simple rules, which allow for local port reuse by - * an application. In essence: - * - * 1) Sockets bound to different interfaces may share a local port. - * Failing that, goto test 2. - * 2) If all sockets have sk->sk_reuse set, and none of them are in - * TCP_LISTEN state, the port may be shared. - * Failing that, goto test 3. - * 3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local - * address, and none of them are the same, the port may be - * shared. - * Failing this, the port cannot be shared. - * - * The interesting point, is test #2. This is what an FTP server does - * all day. To optimize this case we use a specific flag bit defined - * below. As we add sockets to a bind bucket list, we perform a - * check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN)) - * As long as all sockets added to a bind bucket pass this test, - * the flag bit will be set. - * The resulting situation is that tcp_v[46]_verify_bind() can just check - * for this flag bit, if it is set and the socket trying to bind has - * sk->sk_reuse set, we don't even have to walk the owners list at all, - * we return that it is ok to bind this socket to the requested local port. - * - * Sounds like a lot of work, but it is worth it. In a more naive - * implementation (ie. current FreeBSD etc.) the entire list of ports - * must be walked for each data port opened by an ftp server. Needless - * to say, this does not scale at all. With a couple thousand FTP - * users logged onto your box, isn't it nice to know that new data - * ports are created in O(1) time? I thought so. ;-) -DaveM - */ -struct tcp_bind_bucket { - unsigned short port; - signed short fastreuse; - struct hlist_node node; - struct hlist_head owners; -}; - -#define tb_for_each(tb, node, head) hlist_for_each_entry(tb, node, head, node) - -struct tcp_bind_hashbucket { - spinlock_t lock; - struct hlist_head chain; -}; - -static inline struct tcp_bind_bucket *__tb_head(struct tcp_bind_hashbucket *head) -{ - return hlist_entry(head->chain.first, struct tcp_bind_bucket, node); -} - -static inline struct tcp_bind_bucket *tb_head(struct tcp_bind_hashbucket *head) -{ - return hlist_empty(&head->chain) ? NULL : __tb_head(head); -} - -extern struct tcp_hashinfo { - /* This is for sockets with full identity only. Sockets here will - * always be without wildcards and will have the following invariant: - * - * TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE - * - * First half of the table is for sockets not in TIME_WAIT, second half - * is for TIME_WAIT sockets only. - */ - struct tcp_ehash_bucket *__tcp_ehash; - - /* Ok, let's try this, I give up, we do need a local binding - * TCP hash as well as the others for fast bind/connect. - */ - struct tcp_bind_hashbucket *__tcp_bhash; - - int __tcp_bhash_size; - int __tcp_ehash_size; - - /* All sockets in TCP_LISTEN state will be in here. This is the only - * table where wildcard'd TCP sockets can exist. Hash function here - * is just local port number. - */ - struct hlist_head __tcp_listening_hash[TCP_LHTABLE_SIZE]; - - /* All the above members are written once at bootup and - * never written again _or_ are predominantly read-access. - * - * Now align to a new cache line as all the following members - * are often dirty. - */ - rwlock_t __tcp_lhash_lock ____cacheline_aligned; - atomic_t __tcp_lhash_users; - wait_queue_head_t __tcp_lhash_wait; - spinlock_t __tcp_portalloc_lock; -} tcp_hashinfo; - -#define tcp_ehash (tcp_hashinfo.__tcp_ehash) -#define tcp_bhash (tcp_hashinfo.__tcp_bhash) -#define tcp_ehash_size (tcp_hashinfo.__tcp_ehash_size) -#define tcp_bhash_size (tcp_hashinfo.__tcp_bhash_size) -#define tcp_listening_hash (tcp_hashinfo.__tcp_listening_hash) -#define tcp_lhash_lock (tcp_hashinfo.__tcp_lhash_lock) -#define tcp_lhash_users (tcp_hashinfo.__tcp_lhash_users) -#define tcp_lhash_wait (tcp_hashinfo.__tcp_lhash_wait) -#define tcp_portalloc_lock (tcp_hashinfo.__tcp_portalloc_lock) - -extern kmem_cache_t *tcp_bucket_cachep; -extern struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head, - unsigned short snum); -extern void tcp_bucket_destroy(struct tcp_bind_bucket *tb); -extern void tcp_bucket_unlock(struct sock *sk); -extern int tcp_port_rover; - -/* These are AF independent. */ -static __inline__ int tcp_bhashfn(__u16 lport) -{ - return (lport & (tcp_bhash_size - 1)); -} - -extern void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, - unsigned short snum); - -#if (BITS_PER_LONG == 64) -#define TCP_ADDRCMP_ALIGN_BYTES 8 -#else -#define TCP_ADDRCMP_ALIGN_BYTES 4 -#endif - -/* This is a TIME_WAIT bucket. It works around the memory consumption - * problems of sockets in such a state on heavily loaded servers, but - * without violating the protocol specification. - */ -struct tcp_tw_bucket { - /* - * Now struct sock also uses sock_common, so please just - * don't add nothing before this first member (__tw_common) --acme - */ - struct sock_common __tw_common; -#define tw_family __tw_common.skc_family -#define tw_state __tw_common.skc_state -#define tw_reuse __tw_common.skc_reuse -#define tw_bound_dev_if __tw_common.skc_bound_dev_if -#define tw_node __tw_common.skc_node -#define tw_bind_node __tw_common.skc_bind_node -#define tw_refcnt __tw_common.skc_refcnt - volatile unsigned char tw_substate; - unsigned char tw_rcv_wscale; - __u16 tw_sport; - /* Socket demultiplex comparisons on incoming packets. */ - /* these five are in inet_sock */ - __u32 tw_daddr - __attribute__((aligned(TCP_ADDRCMP_ALIGN_BYTES))); - __u32 tw_rcv_saddr; - __u16 tw_dport; - __u16 tw_num; - /* And these are ours. */ - int tw_hashent; - int tw_timeout; - __u32 tw_rcv_nxt; - __u32 tw_snd_nxt; - __u32 tw_rcv_wnd; - __u32 tw_ts_recent; - long tw_ts_recent_stamp; - unsigned long tw_ttd; - struct tcp_bind_bucket *tw_tb; - struct hlist_node tw_death_node; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - struct in6_addr tw_v6_daddr; - struct in6_addr tw_v6_rcv_saddr; - int tw_v6_ipv6only; -#endif -}; - -static __inline__ void tw_add_node(struct tcp_tw_bucket *tw, - struct hlist_head *list) -{ - hlist_add_head(&tw->tw_node, list); -} - -static __inline__ void tw_add_bind_node(struct tcp_tw_bucket *tw, - struct hlist_head *list) -{ - hlist_add_head(&tw->tw_bind_node, list); -} - -static inline int tw_dead_hashed(struct tcp_tw_bucket *tw) -{ - return tw->tw_death_node.pprev != NULL; -} - -static __inline__ void tw_dead_node_init(struct tcp_tw_bucket *tw) -{ - tw->tw_death_node.pprev = NULL; -} - -static __inline__ void __tw_del_dead_node(struct tcp_tw_bucket *tw) -{ - __hlist_del(&tw->tw_death_node); - tw_dead_node_init(tw); -} - -static __inline__ int tw_del_dead_node(struct tcp_tw_bucket *tw) -{ - if (tw_dead_hashed(tw)) { - __tw_del_dead_node(tw); - return 1; - } - return 0; -} - -#define tw_for_each(tw, node, head) \ - hlist_for_each_entry(tw, node, head, tw_node) - -#define tw_for_each_inmate(tw, node, jail) \ - hlist_for_each_entry(tw, node, jail, tw_death_node) - -#define tw_for_each_inmate_safe(tw, node, safe, jail) \ - hlist_for_each_entry_safe(tw, node, safe, jail, tw_death_node) - -#define tcptw_sk(__sk) ((struct tcp_tw_bucket *)(__sk)) - -static inline u32 tcp_v4_rcv_saddr(const struct sock *sk) -{ - return likely(sk->sk_state != TCP_TIME_WAIT) ? - inet_sk(sk)->rcv_saddr : tcptw_sk(sk)->tw_rcv_saddr; -} - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static inline struct in6_addr *__tcp_v6_rcv_saddr(const struct sock *sk) -{ - return likely(sk->sk_state != TCP_TIME_WAIT) ? - &inet6_sk(sk)->rcv_saddr : &tcptw_sk(sk)->tw_v6_rcv_saddr; -} - -static inline struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk) -{ - return sk->sk_family == AF_INET6 ? __tcp_v6_rcv_saddr(sk) : NULL; -} - -#define tcptw_sk_ipv6only(__sk) (tcptw_sk(__sk)->tw_v6_ipv6only) +#include -static inline int tcp_v6_ipv6only(const struct sock *sk) -{ - return likely(sk->sk_state != TCP_TIME_WAIT) ? - ipv6_only_sock(sk) : tcptw_sk_ipv6only(sk); -} -#else -# define __tcp_v6_rcv_saddr(__sk) NULL -# define tcp_v6_rcv_saddr(__sk) NULL -# define tcptw_sk_ipv6only(__sk) 0 -# define tcp_v6_ipv6only(__sk) 0 -#endif - -extern kmem_cache_t *tcp_timewait_cachep; +#include -static inline void tcp_tw_put(struct tcp_tw_bucket *tw) -{ - if (atomic_dec_and_test(&tw->tw_refcnt)) { -#ifdef INET_REFCNT_DEBUG - printk(KERN_DEBUG "tw_bucket %p released\n", tw); -#endif - kmem_cache_free(tcp_timewait_cachep, tw); - } -} +extern struct inet_hashinfo tcp_hashinfo; extern atomic_t tcp_orphan_count; extern int tcp_tw_count; extern void tcp_time_wait(struct sock *sk, int state, int timeo); -extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw); - - -/* Socket demux engine toys. */ -#ifdef __BIG_ENDIAN -#define TCP_COMBINED_PORTS(__sport, __dport) \ - (((__u32)(__sport)<<16) | (__u32)(__dport)) -#else /* __LITTLE_ENDIAN */ -#define TCP_COMBINED_PORTS(__sport, __dport) \ - (((__u32)(__dport)<<16) | (__u32)(__sport)) -#endif - -#if (BITS_PER_LONG == 64) -#ifdef __BIG_ENDIAN -#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr) \ - __u64 __name = (((__u64)(__saddr))<<32)|((__u64)(__daddr)); -#else /* __LITTLE_ENDIAN */ -#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr) \ - __u64 __name = (((__u64)(__daddr))<<32)|((__u64)(__saddr)); -#endif /* __BIG_ENDIAN */ -#define TCP_IPV4_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\ - (((*((__u64 *)&(inet_sk(__sk)->daddr)))== (__cookie)) && \ - ((*((__u32 *)&(inet_sk(__sk)->dport)))== (__ports)) && \ - (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) -#define TCP_IPV4_TW_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\ - (((*((__u64 *)&(tcptw_sk(__sk)->tw_daddr))) == (__cookie)) && \ - ((*((__u32 *)&(tcptw_sk(__sk)->tw_dport))) == (__ports)) && \ - (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) -#else /* 32-bit arch */ -#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr) -#define TCP_IPV4_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\ - ((inet_sk(__sk)->daddr == (__saddr)) && \ - (inet_sk(__sk)->rcv_saddr == (__daddr)) && \ - ((*((__u32 *)&(inet_sk(__sk)->dport)))== (__ports)) && \ - (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) -#define TCP_IPV4_TW_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\ - ((tcptw_sk(__sk)->tw_daddr == (__saddr)) && \ - (tcptw_sk(__sk)->tw_rcv_saddr == (__daddr)) && \ - ((*((__u32 *)&(tcptw_sk(__sk)->tw_dport))) == (__ports)) && \ - (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) -#endif /* 64-bit arch */ - -#define TCP_IPV6_MATCH(__sk, __saddr, __daddr, __ports, __dif) \ - (((*((__u32 *)&(inet_sk(__sk)->dport)))== (__ports)) && \ - ((__sk)->sk_family == AF_INET6) && \ - ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr)) && \ - ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr)) && \ - (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) - -/* These can have wildcards, don't try too hard. */ -static __inline__ int tcp_lhashfn(unsigned short num) -{ - return num & (TCP_LHTABLE_SIZE - 1); -} - -static __inline__ int tcp_sk_listen_hashfn(struct sock *sk) -{ - return tcp_lhashfn(inet_sk(sk)->num); -} +extern void tcp_tw_deschedule(struct inet_timewait_sock *tw); #define MAX_TCP_HEADER (128 + MAX_HEADER) @@ -534,11 +204,6 @@ static __inline__ int tcp_sk_listen_hash #define TCPOLEN_SACK_BASE_ALIGNED 4 #define TCPOLEN_SACK_PERBLOCK 8 -#define TCP_TIME_RETRANS 1 /* Retransmit timer */ -#define TCP_TIME_DACK 2 /* Delayed ack timer */ -#define TCP_TIME_PROBE0 3 /* Zero window probe timer */ -#define TCP_TIME_KEEPOPEN 4 /* Keepalive timer */ - /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ #define TCP_NAGLE_CORK 2 /* Socket is corked */ @@ -585,12 +250,6 @@ extern atomic_t tcp_memory_allocated; extern atomic_t tcp_sockets_allocated; extern int tcp_memory_pressure; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -#define TCP_INET_FAMILY(fam) ((fam) == AF_INET) -#else -#define TCP_INET_FAMILY(fam) 1 -#endif - /* * Pointers to address related TCP functions * (i.e. things that depend on the address family) @@ -671,9 +330,6 @@ DECLARE_SNMP_STAT(struct tcp_mib, tcp_st #define TCP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(tcp_statistics, field, val) #define TCP_ADD_STATS_USER(field, val) SNMP_ADD_STATS_USER(tcp_statistics, field, val) -extern void tcp_put_port(struct sock *sk); -extern void tcp_inherit_port(struct sock *sk, struct sock *child); - extern void tcp_v4_err(struct sk_buff *skb, u32); extern void tcp_shutdown (struct sock *sk, int how); @@ -682,7 +338,7 @@ extern int tcp_v4_rcv(struct sk_buff * extern int tcp_v4_remember_stamp(struct sock *sk); -extern int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw); +extern int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw); extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size); @@ -704,42 +360,22 @@ extern int tcp_rcv_established(struct extern void tcp_rcv_space_adjust(struct sock *sk); -enum tcp_ack_state_t -{ - TCP_ACK_SCHED = 1, - TCP_ACK_TIMER = 2, - TCP_ACK_PUSHED= 4 -}; - -static inline void tcp_schedule_ack(struct tcp_sock *tp) -{ - tp->ack.pending |= TCP_ACK_SCHED; -} - -static inline int tcp_ack_scheduled(struct tcp_sock *tp) +static inline void tcp_dec_quickack_mode(struct sock *sk, + const unsigned int pkts) { - return tp->ack.pending&TCP_ACK_SCHED; -} - -static __inline__ void tcp_dec_quickack_mode(struct tcp_sock *tp, unsigned int pkts) -{ - if (tp->ack.quick) { - if (pkts >= tp->ack.quick) { - tp->ack.quick = 0; + struct inet_connection_sock *icsk = inet_csk(sk); + if (icsk->icsk_ack.quick) { + if (pkts >= icsk->icsk_ack.quick) { + icsk->icsk_ack.quick = 0; /* Leaving quickack mode we deflate ATO. */ - tp->ack.ato = TCP_ATO_MIN; + icsk->icsk_ack.ato = TCP_ATO_MIN; } else - tp->ack.quick -= pkts; + icsk->icsk_ack.quick -= pkts; } } -extern void tcp_enter_quickack_mode(struct tcp_sock *tp); - -static __inline__ void tcp_delack_init(struct tcp_sock *tp) -{ - memset(&tp->ack, 0, sizeof(tp->ack)); -} +extern void tcp_enter_quickack_mode(struct sock *sk); static inline void tcp_clear_options(struct tcp_options_received *rx_opt) { @@ -755,10 +391,9 @@ enum tcp_tw_status }; -extern enum tcp_tw_status tcp_timewait_state_process(struct tcp_tw_bucket *tw, +extern enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, - struct tcphdr *th, - unsigned len); + const struct tcphdr *th); extern struct sock * tcp_check_req(struct sock *sk,struct sk_buff *skb, struct request_sock *req, @@ -773,7 +408,6 @@ extern void tcp_update_metrics(struct extern void tcp_close(struct sock *sk, long timeout); -extern struct sock * tcp_accept(struct sock *sk, int flags, int *err); extern unsigned int tcp_poll(struct file * file, struct socket *sock, struct poll_table_struct *wait); extern int tcp_getsockopt(struct sock *sk, int level, @@ -789,8 +423,6 @@ extern int tcp_recvmsg(struct kiocb *i size_t len, int nonblock, int flags, int *addr_len); -extern int tcp_listen_start(struct sock *sk); - extern void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab); @@ -799,11 +431,6 @@ extern void tcp_parse_options(struct s * TCP v4 functions exported for the inet6 API */ -extern int tcp_v4_rebuild_header(struct sock *sk); - -extern int tcp_v4_build_header(struct sock *sk, - struct sk_buff *skb); - extern void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, struct sk_buff *skb); @@ -872,17 +499,14 @@ extern void tcp_cwnd_application_limited /* tcp_timer.c */ extern void tcp_init_xmit_timers(struct sock *); -extern void tcp_clear_xmit_timers(struct sock *); +static inline void tcp_clear_xmit_timers(struct sock *sk) +{ + inet_csk_clear_xmit_timers(sk); +} -extern void tcp_delete_keepalive_timer(struct sock *); -extern void tcp_reset_keepalive_timer(struct sock *, unsigned long); extern unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu); extern unsigned int tcp_current_mss(struct sock *sk, int large); -#ifdef TCP_DEBUG -extern const char tcp_timer_bug_msg[]; -#endif - /* tcp_diag.c */ extern void tcp_get_info(struct sock *, struct tcp_info *); @@ -892,72 +516,6 @@ typedef int (*sk_read_actor_t)(read_desc extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); -static inline void tcp_clear_xmit_timer(struct sock *sk, int what) -{ - struct tcp_sock *tp = tcp_sk(sk); - - switch (what) { - case TCP_TIME_RETRANS: - case TCP_TIME_PROBE0: - tp->pending = 0; - -#ifdef TCP_CLEAR_TIMERS - sk_stop_timer(sk, &tp->retransmit_timer); -#endif - break; - case TCP_TIME_DACK: - tp->ack.blocked = 0; - tp->ack.pending = 0; - -#ifdef TCP_CLEAR_TIMERS - sk_stop_timer(sk, &tp->delack_timer); -#endif - break; - default: -#ifdef TCP_DEBUG - printk(tcp_timer_bug_msg); -#endif - return; - }; - -} - -/* - * Reset the retransmission timer - */ -static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (when > TCP_RTO_MAX) { -#ifdef TCP_DEBUG - printk(KERN_DEBUG "reset_xmit_timer sk=%p %d when=0x%lx, caller=%p\n", sk, what, when, current_text_addr()); -#endif - when = TCP_RTO_MAX; - } - - switch (what) { - case TCP_TIME_RETRANS: - case TCP_TIME_PROBE0: - tp->pending = what; - tp->timeout = jiffies+when; - sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout); - break; - - case TCP_TIME_DACK: - tp->ack.pending |= TCP_ACK_TIMER; - tp->ack.timeout = jiffies+when; - sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout); - break; - - default: -#ifdef TCP_DEBUG - printk(tcp_timer_bug_msg); -#endif - return; - }; -} - /* Initialize RCV_MSS value. * RCV_MSS is an our guess about MSS used by the peer. * We haven't any direct information about the MSS. @@ -975,7 +533,7 @@ static inline void tcp_initialize_rcv_ms hint = min(hint, TCP_MIN_RCVMSS); hint = max(hint, TCP_MIN_MSS); - tp->ack.rcv_mss = hint; + inet_csk(sk)->icsk_ack.rcv_mss = hint; } static __inline__ void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) @@ -1110,7 +668,8 @@ static inline void tcp_packets_out_inc(s tp->packets_out += tcp_skb_pcount(skb); if (!orig) - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, TCP_RTO_MAX); } static inline void tcp_packets_out_dec(struct tcp_sock *tp, @@ -1277,8 +836,10 @@ static __inline__ void tcp_minshall_upda static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp) { - if (!tp->packets_out && !tp->pending) - tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, tp->rto); + const struct inet_connection_sock *icsk = inet_csk(sk); + if (!tp->packets_out && !icsk->icsk_pending) + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + icsk->icsk_rto, TCP_RTO_MAX); } static __inline__ void tcp_push_pending_frames(struct sock *sk, @@ -1297,9 +858,6 @@ static __inline__ void tcp_update_wl(str tp->snd_wl1 = seq; } -extern void tcp_destroy_sock(struct sock *sk); - - /* * Calculate(/check) TCP checksum */ @@ -1359,8 +917,10 @@ static __inline__ int tcp_prequeue(struc tp->ucopy.memory = 0; } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) { wake_up_interruptible(sk->sk_sleep); - if (!tcp_ack_scheduled(tp)) - tcp_reset_xmit_timer(sk, TCP_TIME_DACK, (3*TCP_RTO_MIN)/4); + if (!inet_csk_ack_scheduled(sk)) + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + (3 * TCP_RTO_MIN) / 4, + TCP_RTO_MAX); } return 1; } @@ -1393,9 +953,9 @@ static __inline__ void tcp_set_state(str TCP_INC_STATS(TCP_MIB_ESTABRESETS); sk->sk_prot->unhash(sk); - if (tcp_sk(sk)->bind_hash && + if (inet_csk(sk)->icsk_bind_hash && !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) - tcp_put_port(sk); + inet_put_port(&tcp_hashinfo, sk); /* fall through */ default: if (oldstate==TCP_ESTABLISHED) @@ -1422,7 +982,7 @@ static __inline__ void tcp_done(struct s if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); else - tcp_destroy_sock(sk); + inet_csk_destroy_sock(sk); } static __inline__ void tcp_sack_reset(struct tcp_options_received *rx_opt) @@ -1524,54 +1084,6 @@ static inline int tcp_full_space(const s return tcp_win_from_space(sk->sk_rcvbuf); } -static inline void tcp_acceptq_queue(struct sock *sk, struct request_sock *req, - struct sock *child) -{ - reqsk_queue_add(&tcp_sk(sk)->accept_queue, req, sk, child); -} - -static inline void -tcp_synq_removed(struct sock *sk, struct request_sock *req) -{ - if (reqsk_queue_removed(&tcp_sk(sk)->accept_queue, req) == 0) - tcp_delete_keepalive_timer(sk); -} - -static inline void tcp_synq_added(struct sock *sk) -{ - if (reqsk_queue_added(&tcp_sk(sk)->accept_queue) == 0) - tcp_reset_keepalive_timer(sk, TCP_TIMEOUT_INIT); -} - -static inline int tcp_synq_len(struct sock *sk) -{ - return reqsk_queue_len(&tcp_sk(sk)->accept_queue); -} - -static inline int tcp_synq_young(struct sock *sk) -{ - return reqsk_queue_len_young(&tcp_sk(sk)->accept_queue); -} - -static inline int tcp_synq_is_full(struct sock *sk) -{ - return reqsk_queue_is_full(&tcp_sk(sk)->accept_queue); -} - -static inline void tcp_synq_unlink(struct tcp_sock *tp, struct request_sock *req, - struct request_sock **prev) -{ - reqsk_queue_unlink(&tp->accept_queue, req, prev); -} - -static inline void tcp_synq_drop(struct sock *sk, struct request_sock *req, - struct request_sock **prev) -{ - tcp_synq_unlink(tcp_sk(sk), req, prev); - tcp_synq_removed(sk, req); - reqsk_free(req); -} - static __inline__ void tcp_openreq_init(struct request_sock *req, struct tcp_options_received *rx_opt, struct sk_buff *skb) @@ -1593,27 +1105,6 @@ static __inline__ void tcp_openreq_init( extern void tcp_enter_memory_pressure(void); -extern void tcp_listen_wlock(void); - -/* - We may sleep inside this lock. - * - If sleeping is not required (or called from BH), - * use plain read_(un)lock(&tcp_lhash_lock). - */ - -static inline void tcp_listen_lock(void) -{ - /* read_lock synchronizes to candidates to writers */ - read_lock(&tcp_lhash_lock); - atomic_inc(&tcp_lhash_users); - read_unlock(&tcp_lhash_lock); -} - -static inline void tcp_listen_unlock(void) -{ - if (atomic_dec_and_test(&tcp_lhash_users)) - wake_up(&tcp_lhash_wait); -} - static inline int keepalive_intvl_when(const struct tcp_sock *tp) { return tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl; @@ -1624,12 +1115,13 @@ static inline int keepalive_time_when(co return tp->keepalive_time ? : sysctl_tcp_keepalive_time; } -static inline int tcp_fin_time(const struct tcp_sock *tp) +static inline int tcp_fin_time(const struct sock *sk) { - int fin_timeout = tp->linger2 ? : sysctl_tcp_fin_timeout; + int fin_timeout = tcp_sk(sk)->linger2 ? : sysctl_tcp_fin_timeout; + const int rto = inet_csk(sk)->icsk_rto; - if (fin_timeout < (tp->rto<<2) - (tp->rto>>1)) - fin_timeout = (tp->rto<<2) - (tp->rto>>1); + if (fin_timeout < (rto << 2) - (rto >> 1)) + fin_timeout = (rto << 2) - (rto >> 1); return fin_timeout; } @@ -1658,15 +1150,6 @@ static inline int tcp_paws_check(const s return 1; } -static inline void tcp_v4_setup_caps(struct sock *sk, struct dst_entry *dst) -{ - sk->sk_route_caps = dst->dev->features; - if (sk->sk_route_caps & NETIF_F_TSO) { - if (sock_flag(sk, SOCK_NO_LARGESEND) || dst->header_len) - sk->sk_route_caps &= ~NETIF_F_TSO; - } -} - #define TCP_CHECK_TIMER(sk) do { } while (0) static inline int tcp_use_frto(const struct sock *sk) diff -puN /dev/null include/net/tcp_states.h --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/include/net/tcp_states.h 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,34 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Definitions for the TCP protocol sk_state field. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _LINUX_TCP_STATES_H +#define _LINUX_TCP_STATES_H + +enum { + TCP_ESTABLISHED = 1, + TCP_SYN_SENT, + TCP_SYN_RECV, + TCP_FIN_WAIT1, + TCP_FIN_WAIT2, + TCP_TIME_WAIT, + TCP_CLOSE, + TCP_CLOSE_WAIT, + TCP_LAST_ACK, + TCP_LISTEN, + TCP_CLOSING, /* Now a valid state */ + + TCP_MAX_STATES /* Leave at the end! */ +}; + +#define TCP_STATE_MASK 0xF + +#endif /* _LINUX_TCP_STATES_H */ diff -puN include/net/x25device.h~git-net include/net/x25device.h --- devel/include/net/x25device.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/x25device.h 2005-08-06 23:39:10.000000000 -0700 @@ -8,7 +8,6 @@ static inline __be16 x25_type_trans(struct sk_buff *skb, struct net_device *dev) { skb->mac.raw = skb->data; - skb->input_dev = skb->dev = dev; skb->pkt_type = PACKET_HOST; return htons(ETH_P_X25); diff -puN include/net/x25.h~git-net include/net/x25.h --- devel/include/net/x25.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/x25.h 2005-08-06 23:39:10.000000000 -0700 @@ -175,7 +175,7 @@ extern void x25_kill_by_neigh(struct x25 /* x25_dev.c */ extern void x25_send_frame(struct sk_buff *, struct x25_neigh *); -extern int x25_lapb_receive_frame(struct sk_buff *, struct net_device *, struct packet_type *); +extern int x25_lapb_receive_frame(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); extern void x25_establish_link(struct x25_neigh *); extern void x25_terminate_link(struct x25_neigh *); diff -puN include/net/xfrm.h~git-net include/net/xfrm.h --- devel/include/net/xfrm.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/include/net/xfrm.h 2005-08-06 23:39:10.000000000 -0700 @@ -818,7 +818,6 @@ extern void xfrm6_init(void); extern void xfrm6_fini(void); extern void xfrm_state_init(void); extern void xfrm4_state_init(void); -extern void xfrm4_state_fini(void); extern void xfrm6_state_init(void); extern void xfrm6_state_fini(void); diff -puN kernel/audit.c~git-net kernel/audit.c --- devel/kernel/audit.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/kernel/audit.c 2005-08-06 23:39:10.000000000 -0700 @@ -518,7 +518,8 @@ static int __init audit_init(void) { printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); - audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive); + audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive, + THIS_MODULE); if (!audit_sock) audit_panic("cannot initialize netlink socket"); diff -puN lib/kobject_uevent.c~git-net lib/kobject_uevent.c --- devel/lib/kobject_uevent.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/lib/kobject_uevent.c 2005-08-06 23:39:10.000000000 -0700 @@ -153,7 +153,8 @@ EXPORT_SYMBOL_GPL(kobject_uevent_atomic) static int __init kobject_uevent_init(void) { - uevent_sock = netlink_kernel_create(NETLINK_KOBJECT_UEVENT, NULL); + uevent_sock = netlink_kernel_create(NETLINK_KOBJECT_UEVENT, NULL, + THIS_MODULE); if (!uevent_sock) { printk(KERN_ERR diff -puN net/8021q/vlan_dev.c~git-net net/8021q/vlan_dev.c --- devel/net/8021q/vlan_dev.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/8021q/vlan_dev.c 2005-08-06 23:39:10.000000000 -0700 @@ -113,7 +113,7 @@ static inline struct sk_buff *vlan_check * */ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, - struct packet_type* ptype) + struct packet_type* ptype, struct net_device *orig_dev) { unsigned char *rawp = NULL; struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data); diff -puN net/8021q/vlan.h~git-net net/8021q/vlan.h --- devel/net/8021q/vlan.h~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/8021q/vlan.h 2005-08-06 23:39:10.000000000 -0700 @@ -51,7 +51,7 @@ struct net_device *__find_vlan_dev(struc /* found in vlan_dev.c */ int vlan_dev_rebuild_header(struct sk_buff *skb); int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, - struct packet_type* ptype); + struct packet_type *ptype, struct net_device *orig_dev); int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, void *daddr, void *saddr, unsigned len); diff -puN net/802/hippi.c~git-net net/802/hippi.c --- devel/net/802/hippi.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/802/hippi.c 2005-08-06 23:39:10.000000000 -0700 @@ -51,6 +51,7 @@ static int hippi_header(struct sk_buff * unsigned len) { struct hippi_hdr *hip = (struct hippi_hdr *)skb_push(skb, HIPPI_HLEN); + struct hippi_cb *hcb = (struct hippi_cb *) skb->cb; if (!len){ len = skb->len - HIPPI_HLEN; @@ -84,9 +85,10 @@ static int hippi_header(struct sk_buff * if (daddr) { memcpy(hip->le.dest_switch_addr, daddr + 3, 3); - memcpy(&skb->private.ifield, daddr + 2, 4); + memcpy(&hcb->ifield, daddr + 2, 4); return HIPPI_HLEN; } + hcb->ifield = 0; return -((int)HIPPI_HLEN); } diff -puN net/802/p8022.c~git-net net/802/p8022.c --- devel/net/802/p8022.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/802/p8022.c 2005-08-06 23:39:10.000000000 -0700 @@ -35,7 +35,8 @@ static int p8022_request(struct datalink struct datalink_proto *register_8022_client(unsigned char type, int (*func)(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt)) + struct packet_type *pt, + struct net_device *orig_dev)) { struct datalink_proto *proto; diff -puN net/802/psnap.c~git-net net/802/psnap.c --- devel/net/802/psnap.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/802/psnap.c 2005-08-06 23:39:10.000000000 -0700 @@ -47,7 +47,7 @@ static struct datalink_proto *find_snap_ * A SNAP packet has arrived */ static int snap_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) + struct packet_type *pt, struct net_device *orig_dev) { int rc = 1; struct datalink_proto *proto; @@ -61,7 +61,7 @@ static int snap_rcv(struct sk_buff *skb, /* Pass the frame on. */ skb->h.raw += 5; skb_pull(skb, 5); - rc = proto->rcvfunc(skb, dev, &snap_packet_type); + rc = proto->rcvfunc(skb, dev, &snap_packet_type, orig_dev); } else { skb->sk = NULL; kfree_skb(skb); @@ -118,7 +118,8 @@ module_exit(snap_exit); struct datalink_proto *register_snap_client(unsigned char *desc, int (*rcvfunc)(struct sk_buff *, struct net_device *, - struct packet_type *)) + struct packet_type *, + struct net_device *)) { struct datalink_proto *proto = NULL; diff -puN net/appletalk/aarp.c~git-net net/appletalk/aarp.c --- devel/net/appletalk/aarp.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/appletalk/aarp.c 2005-08-06 23:39:10.000000000 -0700 @@ -698,7 +698,7 @@ static void __aarp_resolved(struct aarp_ * frame. We currently only support Ethernet. */ static int aarp_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) + struct packet_type *pt, struct net_device *orig_dev) { struct elapaarp *ea = aarp_hdr(skb); int hash, ret = 0; diff -puN net/appletalk/ddp.c~git-net net/appletalk/ddp.c --- devel/net/appletalk/ddp.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/appletalk/ddp.c 2005-08-06 23:39:10.000000000 -0700 @@ -53,12 +53,12 @@ #include #include -#include #include #include /* For TIOCOUTQ/INQ */ #include #include #include +#include #include #include @@ -1390,7 +1390,7 @@ free_it: * [ie ARPHRD_ETHERTALK] */ static int atalk_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) + struct packet_type *pt, struct net_device *orig_dev) { struct ddpehdr *ddp; struct sock *sock; @@ -1482,7 +1482,7 @@ freeit: * header and append a long one. */ static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) + struct packet_type *pt, struct net_device *orig_dev) { /* Expand any short form frames */ if (skb->mac.raw[2] == 1) { @@ -1528,7 +1528,7 @@ static int ltalk_rcv(struct sk_buff *skb } skb->h.raw = skb->data; - return atalk_rcv(skb, dev, pt); + return atalk_rcv(skb, dev, pt, orig_dev); freeit: kfree_skb(skb); return 0; diff -puN net/atm/ipcommon.c~git-net net/atm/ipcommon.c --- devel/net/atm/ipcommon.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/atm/ipcommon.c 2005-08-06 23:39:10.000000000 -0700 @@ -34,7 +34,6 @@ void skb_migrate(struct sk_buff_head *from,struct sk_buff_head *to) { - struct sk_buff *skb; unsigned long flags; struct sk_buff *skb_from = (struct sk_buff *) from; struct sk_buff *skb_to = (struct sk_buff *) to; @@ -47,8 +46,6 @@ void skb_migrate(struct sk_buff_head *fr prev->next = skb_to; to->prev->next = from->next; to->prev = from->prev; - for (skb = from->next; skb != skb_to; skb = skb->next) - skb->list = to; to->qlen += from->qlen; spin_unlock(&to->lock); from->prev = skb_from; diff -puN net/ax25/af_ax25.c~git-net net/ax25/af_ax25.c --- devel/net/ax25/af_ax25.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ax25/af_ax25.c 2005-08-06 23:39:10.000000000 -0700 @@ -45,7 +45,7 @@ #include #include #include -#include +#include #include #include diff -puN net/ax25/ax25_ds_in.c~git-net net/ax25/ax25_ds_in.c --- devel/net/ax25/ax25_ds_in.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ax25/ax25_ds_in.c 2005-08-06 23:39:10.000000000 -0700 @@ -23,7 +23,7 @@ #include #include #include /* For ip_rcv */ -#include +#include #include #include #include diff -puN net/ax25/ax25_ds_timer.c~git-net net/ax25/ax25_ds_timer.c --- devel/net/ax25/ax25_ds_timer.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ax25/ax25_ds_timer.c 2005-08-06 23:39:10.000000000 -0700 @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff -puN net/ax25/ax25_in.c~git-net net/ax25/ax25_in.c --- devel/net/ax25/ax25_in.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ax25/ax25_in.c 2005-08-06 23:39:10.000000000 -0700 @@ -27,7 +27,7 @@ #include #include #include /* For ip_rcv */ -#include +#include #include /* For arp_rcv */ #include #include @@ -132,7 +132,7 @@ int ax25_rx_iframe(ax25_cb *ax25, struct skb->dev = ax25->ax25_dev->dev; skb->pkt_type = PACKET_HOST; skb->protocol = htons(ETH_P_IP); - ip_rcv(skb, skb->dev, NULL); /* Wrong ptype */ + ip_rcv(skb, skb->dev, NULL, skb->dev); /* Wrong ptype */ return 1; } #endif @@ -258,7 +258,7 @@ static int ax25_rcv(struct sk_buff *skb, skb->dev = dev; skb->pkt_type = PACKET_HOST; skb->protocol = htons(ETH_P_IP); - ip_rcv(skb, dev, ptype); /* Note ptype here is the wrong one, fix me later */ + ip_rcv(skb, dev, ptype, dev); /* Note ptype here is the wrong one, fix me later */ break; case AX25_P_ARP: @@ -268,7 +268,7 @@ static int ax25_rcv(struct sk_buff *skb, skb->dev = dev; skb->pkt_type = PACKET_HOST; skb->protocol = htons(ETH_P_ARP); - arp_rcv(skb, dev, ptype); /* Note ptype here is wrong... */ + arp_rcv(skb, dev, ptype, dev); /* Note ptype here is wrong... */ break; #endif case AX25_P_TEXT: @@ -454,7 +454,7 @@ static int ax25_rcv(struct sk_buff *skb, * Receive an AX.25 frame via a SLIP interface. */ int ax25_kiss_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *ptype) + struct packet_type *ptype, struct net_device *orig_dev) { skb->sk = NULL; /* Initially we don't know who it's for */ skb->destructor = NULL; /* Who initializes this, dammit?! */ diff -puN net/ax25/ax25_std_in.c~git-net net/ax25/ax25_std_in.c --- devel/net/ax25/ax25_std_in.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ax25/ax25_std_in.c 2005-08-06 23:39:10.000000000 -0700 @@ -30,7 +30,7 @@ #include #include #include /* For ip_rcv */ -#include +#include #include #include #include diff -puN net/ax25/ax25_std_timer.c~git-net net/ax25/ax25_std_timer.c --- devel/net/ax25/ax25_std_timer.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ax25/ax25_std_timer.c 2005-08-06 23:39:10.000000000 -0700 @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff -puN net/ax25/ax25_subr.c~git-net net/ax25/ax25_subr.c --- devel/net/ax25/ax25_subr.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ax25/ax25_subr.c 2005-08-06 23:39:10.000000000 -0700 @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include @@ -76,7 +76,7 @@ void ax25_requeue_frames(ax25_cb *ax25) if (skb_prev == NULL) skb_queue_head(&ax25->write_queue, skb); else - skb_append(skb_prev, skb); + skb_append(skb_prev, skb, &ax25->write_queue); skb_prev = skb; } } diff -puN net/bridge/netfilter/ebt_mark.c~git-net net/bridge/netfilter/ebt_mark.c --- devel/net/bridge/netfilter/ebt_mark.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/bridge/netfilter/ebt_mark.c 2005-08-06 23:39:10.000000000 -0700 @@ -23,10 +23,9 @@ static int ebt_target_mark(struct sk_buf { struct ebt_mark_t_info *info = (struct ebt_mark_t_info *)data; - if ((*pskb)->nfmark != info->mark) { + if ((*pskb)->nfmark != info->mark) (*pskb)->nfmark = info->mark; - (*pskb)->nfcache |= NFC_ALTERED; - } + return info->target; } diff -puN net/bridge/netfilter/ebt_ulog.c~git-net net/bridge/netfilter/ebt_ulog.c --- devel/net/bridge/netfilter/ebt_ulog.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/bridge/netfilter/ebt_ulog.c 2005-08-06 23:39:10.000000000 -0700 @@ -258,7 +258,7 @@ static int __init init(void) spin_lock_init(&ulog_buffers[i].lock); } - ebtulognl = netlink_kernel_create(NETLINK_NFLOG, NULL); + ebtulognl = netlink_kernel_create(NETLINK_NFLOG, NULL, THIS_MODULE); if (!ebtulognl) ret = -ENOMEM; else if ((ret = ebt_register_watcher(&ulog))) diff -puN net/core/datagram.c~git-net net/core/datagram.c --- devel/net/core/datagram.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/core/datagram.c 2005-08-06 23:39:10.000000000 -0700 @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include @@ -51,9 +50,10 @@ #include #include -#include -#include +#include +#include +#include /* * Is a socket 'connection oriented' ? diff -puN net/core/dev.c~git-net net/core/dev.c --- devel/net/core/dev.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/core/dev.c 2005-08-06 23:39:10.000000000 -0700 @@ -1058,7 +1058,7 @@ void dev_queue_xmit_nit(struct sk_buff * skb2->h.raw = skb2->nh.raw; skb2->pkt_type = PACKET_OUTGOING; - ptype->func(skb2, skb->dev, ptype); + ptype->func(skb2, skb->dev, ptype, skb->dev); } } rcu_read_unlock(); @@ -1425,14 +1425,14 @@ int netif_rx_ni(struct sk_buff *skb) EXPORT_SYMBOL(netif_rx_ni); -static __inline__ void skb_bond(struct sk_buff *skb) +static inline struct net_device *skb_bond(struct sk_buff *skb) { struct net_device *dev = skb->dev; - if (dev->master) { - skb->real_dev = skb->dev; + if (dev->master) skb->dev = dev->master; - } + + return dev; } static void net_tx_action(struct softirq_action *h) @@ -1482,10 +1482,11 @@ static void net_tx_action(struct softirq } static __inline__ int deliver_skb(struct sk_buff *skb, - struct packet_type *pt_prev) + struct packet_type *pt_prev, + struct net_device *orig_dev) { atomic_inc(&skb->users); - return pt_prev->func(skb, skb->dev, pt_prev); + return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) @@ -1496,7 +1497,8 @@ struct net_bridge_fdb_entry *(*br_fdb_ge void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent); static __inline__ int handle_bridge(struct sk_buff **pskb, - struct packet_type **pt_prev, int *ret) + struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev) { struct net_bridge_port *port; @@ -1505,14 +1507,14 @@ static __inline__ int handle_bridge(stru return 0; if (*pt_prev) { - *ret = deliver_skb(*pskb, *pt_prev); + *ret = deliver_skb(*pskb, *pt_prev, orig_dev); *pt_prev = NULL; } return br_handle_frame_hook(port, pskb); } #else -#define handle_bridge(skb, pt_prev, ret) (0) +#define handle_bridge(skb, pt_prev, ret, orig_dev) (0) #endif #ifdef CONFIG_NET_CLS_ACT @@ -1534,17 +1536,14 @@ static int ing_filter(struct sk_buff *sk __u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd); if (MAX_RED_LOOP < ttl++) { printk("Redir loop detected Dropping packet (%s->%s)\n", - skb->input_dev?skb->input_dev->name:"??",skb->dev->name); + skb->input_dev->name, skb->dev->name); return TC_ACT_SHOT; } skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl); skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS); - if (NULL == skb->input_dev) { - skb->input_dev = skb->dev; - printk("ing_filter: fixed %s out %s\n",skb->input_dev->name,skb->dev->name); - } + spin_lock(&dev->ingress_lock); if ((q = dev->qdisc_ingress) != NULL) result = q->enqueue(skb, q); @@ -1559,6 +1558,7 @@ static int ing_filter(struct sk_buff *sk int netif_receive_skb(struct sk_buff *skb) { struct packet_type *ptype, *pt_prev; + struct net_device *orig_dev; int ret = NET_RX_DROP; unsigned short type; @@ -1569,7 +1569,10 @@ int netif_receive_skb(struct sk_buff *sk if (!skb->stamp.tv_sec) net_timestamp(&skb->stamp); - skb_bond(skb); + if (!skb->input_dev) + skb->input_dev = skb->dev; + + orig_dev = skb_bond(skb); __get_cpu_var(netdev_rx_stat).total++; @@ -1590,14 +1593,14 @@ int netif_receive_skb(struct sk_buff *sk list_for_each_entry_rcu(ptype, &ptype_all, list) { if (!ptype->dev || ptype->dev == skb->dev) { if (pt_prev) - ret = deliver_skb(skb, pt_prev); + ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } } #ifdef CONFIG_NET_CLS_ACT if (pt_prev) { - ret = deliver_skb(skb, pt_prev); + ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; /* noone else should process this after*/ } else { skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); @@ -1616,7 +1619,7 @@ ncls: handle_diverter(skb); - if (handle_bridge(&skb, &pt_prev, &ret)) + if (handle_bridge(&skb, &pt_prev, &ret, orig_dev)) goto out; type = skb->protocol; @@ -1624,13 +1627,13 @@ ncls: if (ptype->type == type && (!ptype->dev || ptype->dev == skb->dev)) { if (pt_prev) - ret = deliver_skb(skb, pt_prev); + ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } } if (pt_prev) { - ret = pt_prev->func(skb, skb->dev, pt_prev); + ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } else { kfree_skb(skb); /* Jamal, now you will not able to escape explaining diff -puN net/core/netfilter.c~git-net net/core/netfilter.c --- devel/net/core/netfilter.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/core/netfilter.c 2005-08-06 23:39:10.000000000 -0700 @@ -22,12 +22,8 @@ #include #include #include -#include -#include -#include +#include #include -#include -#include /* In this code, we can be waiting indefinitely for userspace to * service a packet if a hook returns NF_QUEUE. We could keep a count @@ -58,6 +54,9 @@ static struct nf_queue_handler_t { nf_queue_outfn_t outfn; void *data; } queue_handler[NPROTO]; + +static struct nf_queue_rerouter *queue_rerouter; + static DEFINE_RWLOCK(queue_handler_lock); int nf_register_hook(struct nf_hook_ops *reg) @@ -223,7 +222,8 @@ static unsigned int nf_iterate(struct li verdict = elem->hook(hook, skb, indev, outdev, okfn); if (verdict != NF_ACCEPT) { #ifdef CONFIG_NETFILTER_DEBUG - if (unlikely(verdict > NF_MAX_VERDICT)) { + if (unlikely((verdict & NF_VERDICT_MASK) + > NF_MAX_VERDICT)) { NFDEBUG("Evil return from %p(%u).\n", elem->hook, hook); continue; @@ -241,6 +241,9 @@ int nf_register_queue_handler(int pf, nf { int ret; + if (pf >= NPROTO) + return -EINVAL; + write_lock_bh(&queue_handler_lock); if (queue_handler[pf].outfn) ret = -EBUSY; @@ -257,6 +260,9 @@ int nf_register_queue_handler(int pf, nf /* The caller must flush their queue before this */ int nf_unregister_queue_handler(int pf) { + if (pf >= NPROTO) + return -EINVAL; + write_lock_bh(&queue_handler_lock); queue_handler[pf].outfn = NULL; queue_handler[pf].data = NULL; @@ -265,16 +271,54 @@ int nf_unregister_queue_handler(int pf) return 0; } +int nf_register_queue_rerouter(int pf, struct nf_queue_rerouter *rer) +{ + if (pf >= NPROTO) + return -EINVAL; + + write_lock_bh(&queue_handler_lock); + memcpy(&queue_rerouter[pf], rer, sizeof(queue_rerouter[pf])); + write_unlock_bh(&queue_handler_lock); + + return 0; +} + +int nf_unregister_queue_rerouter(int pf) +{ + if (pf >= NPROTO) + return -EINVAL; + + write_lock_bh(&queue_handler_lock); + memset(&queue_rerouter[pf], 0, sizeof(queue_rerouter[pf])); + write_unlock_bh(&queue_handler_lock); + return 0; +} + +void nf_unregister_queue_handlers(nf_queue_outfn_t outfn) +{ + int pf; + + write_lock_bh(&queue_handler_lock); + for (pf = 0; pf < NPROTO; pf++) { + if (queue_handler[pf].outfn == outfn) { + queue_handler[pf].outfn = NULL; + queue_handler[pf].data = NULL; + } + } + write_unlock_bh(&queue_handler_lock); +} + /* * Any packet that leaves via this function must come back * through nf_reinject(). */ -static int nf_queue(struct sk_buff *skb, +static int nf_queue(struct sk_buff **skb, struct list_head *elem, int pf, unsigned int hook, struct net_device *indev, struct net_device *outdev, - int (*okfn)(struct sk_buff *)) + int (*okfn)(struct sk_buff *), + unsigned int queuenum) { int status; struct nf_info *info; @@ -287,17 +331,17 @@ static int nf_queue(struct sk_buff *skb, read_lock(&queue_handler_lock); if (!queue_handler[pf].outfn) { read_unlock(&queue_handler_lock); - kfree_skb(skb); + kfree_skb(*skb); return 1; } - info = kmalloc(sizeof(*info), GFP_ATOMIC); + info = kmalloc(sizeof(*info)+queue_rerouter[pf].rer_size, GFP_ATOMIC); if (!info) { if (net_ratelimit()) printk(KERN_ERR "OOM queueing packet %p\n", - skb); + *skb); read_unlock(&queue_handler_lock); - kfree_skb(skb); + kfree_skb(*skb); return 1; } @@ -316,15 +360,22 @@ static int nf_queue(struct sk_buff *skb, if (outdev) dev_hold(outdev); #ifdef CONFIG_BRIDGE_NETFILTER - if (skb->nf_bridge) { - physindev = skb->nf_bridge->physindev; + if ((*skb)->nf_bridge) { + physindev = (*skb)->nf_bridge->physindev; if (physindev) dev_hold(physindev); - physoutdev = skb->nf_bridge->physoutdev; + physoutdev = (*skb)->nf_bridge->physoutdev; if (physoutdev) dev_hold(physoutdev); } #endif + if (queue_rerouter[pf].save) + queue_rerouter[pf].save(*skb, info); + + status = queue_handler[pf].outfn(*skb, info, queuenum, + queue_handler[pf].data); + + if (status >= 0 && queue_rerouter[pf].reroute) + status = queue_rerouter[pf].reroute(skb, info); - status = queue_handler[pf].outfn(skb, info, queue_handler[pf].data); read_unlock(&queue_handler_lock); if (status < 0) { @@ -337,9 +388,11 @@ static int nf_queue(struct sk_buff *skb, #endif module_put(info->elem->owner); kfree(info); - kfree_skb(skb); + kfree_skb(*skb); + return 1; } + return 1; } @@ -368,9 +421,10 @@ next_hook: } else if (verdict == NF_DROP) { kfree_skb(*pskb); ret = -EPERM; - } else if (verdict == NF_QUEUE) { + } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { NFDEBUG("nf_hook: Verdict = QUEUE.\n"); - if (!nf_queue(*pskb, elem, pf, hook, indev, outdev, okfn)) + if (!nf_queue(pskb, elem, pf, hook, indev, outdev, okfn, + verdict >> NF_VERDICT_BITS)) goto next_hook; } unlock: @@ -427,14 +481,15 @@ void nf_reinject(struct sk_buff *skb, st info->okfn, INT_MIN); } - switch (verdict) { + switch (verdict & NF_VERDICT_MASK) { case NF_ACCEPT: info->okfn(skb); break; case NF_QUEUE: - if (!nf_queue(skb, elem, info->pf, info->hook, - info->indev, info->outdev, info->okfn)) + if (!nf_queue(&skb, elem, info->pf, info->hook, + info->indev, info->outdev, info->okfn, + verdict >> NF_VERDICT_BITS)) goto next_hook; break; } @@ -447,73 +502,7 @@ void nf_reinject(struct sk_buff *skb, st return; } -#ifdef CONFIG_INET -/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ -int ip_route_me_harder(struct sk_buff **pskb) -{ - struct iphdr *iph = (*pskb)->nh.iph; - struct rtable *rt; - struct flowi fl = {}; - struct dst_entry *odst; - unsigned int hh_len; - - /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause - * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook. - */ - if (inet_addr_type(iph->saddr) == RTN_LOCAL) { - fl.nl_u.ip4_u.daddr = iph->daddr; - fl.nl_u.ip4_u.saddr = iph->saddr; - fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); - fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0; -#ifdef CONFIG_IP_ROUTE_FWMARK - fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark; -#endif - fl.proto = iph->protocol; - if (ip_route_output_key(&rt, &fl) != 0) - return -1; - - /* Drop old route. */ - dst_release((*pskb)->dst); - (*pskb)->dst = &rt->u.dst; - } else { - /* non-local src, find valid iif to satisfy - * rp-filter when calling ip_route_input. */ - fl.nl_u.ip4_u.daddr = iph->saddr; - if (ip_route_output_key(&rt, &fl) != 0) - return -1; - - odst = (*pskb)->dst; - if (ip_route_input(*pskb, iph->daddr, iph->saddr, - RT_TOS(iph->tos), rt->u.dst.dev) != 0) { - dst_release(&rt->u.dst); - return -1; - } - dst_release(&rt->u.dst); - dst_release(odst); - } - - if ((*pskb)->dst->error) - return -1; - - /* Change in oif may mean change in hh_len. */ - hh_len = (*pskb)->dst->dev->hard_header_len; - if (skb_headroom(*pskb) < hh_len) { - struct sk_buff *nskb; - - nskb = skb_realloc_headroom(*pskb, hh_len); - if (!nskb) - return -1; - if ((*pskb)->sk) - skb_set_owner_w(nskb, (*pskb)->sk); - kfree_skb(*pskb); - *pskb = nskb; - } - - return 0; -} -EXPORT_SYMBOL(ip_route_me_harder); - -int skb_ip_make_writable(struct sk_buff **pskb, unsigned int writable_len) +int skb_make_writable(struct sk_buff **pskb, unsigned int writable_len) { struct sk_buff *nskb; @@ -540,19 +529,17 @@ copy_skb: *pskb = nskb; return 1; } -EXPORT_SYMBOL(skb_ip_make_writable); -#endif /*CONFIG_INET*/ +EXPORT_SYMBOL(skb_make_writable); /* Internal logging interface, which relies on the real LOG target modules */ #define NF_LOG_PREFIXLEN 128 -static nf_logfn *nf_logging[NPROTO]; /* = NULL */ -static int reported = 0; +static struct nf_logger *nf_logging[NPROTO]; /* = NULL */ static DEFINE_SPINLOCK(nf_log_lock); -int nf_log_register(int pf, nf_logfn *logfn) +int nf_log_register(int pf, struct nf_logger *logger) { int ret = -EBUSY; @@ -560,54 +547,134 @@ int nf_log_register(int pf, nf_logfn *lo * substituting pointer. */ spin_lock(&nf_log_lock); if (!nf_logging[pf]) { - rcu_assign_pointer(nf_logging[pf], logfn); + rcu_assign_pointer(nf_logging[pf], logger); ret = 0; } spin_unlock(&nf_log_lock); return ret; } -void nf_log_unregister(int pf, nf_logfn *logfn) +void nf_log_unregister_pf(int pf) { spin_lock(&nf_log_lock); - if (nf_logging[pf] == logfn) - nf_logging[pf] = NULL; + nf_logging[pf] = NULL; spin_unlock(&nf_log_lock); /* Give time to concurrent readers. */ synchronize_net(); -} +} + +void nf_log_unregister_logger(struct nf_logger *logger) +{ + int i; + + spin_lock(&nf_log_lock); + for (i = 0; i < NPROTO; i++) { + if (nf_logging[i] == logger) + nf_logging[i] = NULL; + } + spin_unlock(&nf_log_lock); + + synchronize_net(); +} void nf_log_packet(int pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, + struct nf_loginfo *loginfo, const char *fmt, ...) { va_list args; char prefix[NF_LOG_PREFIXLEN]; - nf_logfn *logfn; + struct nf_logger *logger; rcu_read_lock(); - logfn = rcu_dereference(nf_logging[pf]); - if (logfn) { + logger = rcu_dereference(nf_logging[pf]); + if (logger) { va_start(args, fmt); vsnprintf(prefix, sizeof(prefix), fmt, args); va_end(args); /* We must read logging before nf_logfn[pf] */ - logfn(hooknum, skb, in, out, prefix); - } else if (!reported) { - printk(KERN_WARNING "nf_log_packet: can\'t log yet, " - "no backend logging module loaded in!\n"); - reported++; + logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix); + } else if (net_ratelimit()) { + printk(KERN_WARNING "nf_log_packet: can\'t log since " + "no backend logging module loaded in! Please either " + "load one, or disable logging explicitly\n"); } rcu_read_unlock(); } EXPORT_SYMBOL(nf_log_register); -EXPORT_SYMBOL(nf_log_unregister); +EXPORT_SYMBOL(nf_log_unregister_pf); +EXPORT_SYMBOL(nf_log_unregister_logger); EXPORT_SYMBOL(nf_log_packet); +#ifdef CONFIG_PROC_FS +struct proc_dir_entry *proc_net_netfilter; +EXPORT_SYMBOL(proc_net_netfilter); + +static void *seq_start(struct seq_file *seq, loff_t *pos) +{ + rcu_read_lock(); + + if (*pos >= NPROTO) + return NULL; + + return pos; +} + +static void *seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + + if (*pos >= NPROTO) + return NULL; + + return pos; +} + +static void seq_stop(struct seq_file *s, void *v) +{ + rcu_read_unlock(); +} + +static int seq_show(struct seq_file *s, void *v) +{ + loff_t *pos = v; + const struct nf_logger *logger; + + logger = rcu_dereference(nf_logging[*pos]); + + if (!logger) + return seq_printf(s, "%2lld NONE\n", *pos); + + return seq_printf(s, "%2lld %s\n", *pos, logger->name); +} + +static struct seq_operations nflog_seq_ops = { + .start = seq_start, + .next = seq_next, + .stop = seq_stop, + .show = seq_show, +}; + +static int nflog_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &nflog_seq_ops); +} + +static struct file_operations nflog_file_ops = { + .owner = THIS_MODULE, + .open = nflog_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif /* PROC_FS */ + + /* This does not belong here, but locally generated errors need it if connection tracking in use: without this, connection may not be in hash table, and hence manufactured ICMP or RST packets will not be associated with it. */ @@ -626,11 +693,30 @@ void nf_ct_attach(struct sk_buff *new, s void __init netfilter_init(void) { int i, h; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *pde; +#endif + + queue_rerouter = kmalloc(NPROTO * sizeof(struct nf_queue_rerouter), + GFP_KERNEL); + if (!queue_rerouter) + panic("netfilter: cannot allocate queue rerouter array\n"); + memset(queue_rerouter, 0, NPROTO * sizeof(struct nf_queue_rerouter)); for (i = 0; i < NPROTO; i++) { for (h = 0; h < NF_MAX_HOOKS; h++) INIT_LIST_HEAD(&nf_hooks[i][h]); } + +#ifdef CONFIG_PROC_FS + proc_net_netfilter = proc_mkdir("netfilter", proc_net); + if (!proc_net_netfilter) + panic("cannot create netfilter proc entry"); + pde = create_proc_entry("nf_log", S_IRUGO, proc_net_netfilter); + if (!pde) + panic("cannot create /proc/net/netfilter/nf_log"); + pde->proc_fops = &nflog_file_ops; +#endif } EXPORT_SYMBOL(ip_ct_attach); @@ -645,4 +731,7 @@ EXPORT_SYMBOL(nf_reinject); EXPORT_SYMBOL(nf_setsockopt); EXPORT_SYMBOL(nf_unregister_hook); EXPORT_SYMBOL(nf_unregister_queue_handler); +EXPORT_SYMBOL_GPL(nf_unregister_queue_handlers); +EXPORT_SYMBOL_GPL(nf_register_queue_rerouter); +EXPORT_SYMBOL_GPL(nf_unregister_queue_rerouter); EXPORT_SYMBOL(nf_unregister_sockopt); diff -puN net/core/request_sock.c~git-net net/core/request_sock.c --- devel/net/core/request_sock.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/core/request_sock.c 2005-08-06 23:39:10.000000000 -0700 @@ -32,7 +32,6 @@ * Further increasing requires to change hash table size. */ int sysctl_max_syn_backlog = 256; -EXPORT_SYMBOL(sysctl_max_syn_backlog); int reqsk_queue_alloc(struct request_sock_queue *queue, const int nr_table_entries) @@ -53,6 +52,8 @@ int reqsk_queue_alloc(struct request_soc get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); rwlock_init(&queue->syn_wait_lock); queue->rskq_accept_head = queue->rskq_accept_head = NULL; + queue->rskq_defer_accept = 0; + lopt->nr_table_entries = nr_table_entries; write_lock_bh(&queue->syn_wait_lock); queue->listen_opt = lopt; @@ -62,3 +63,28 @@ int reqsk_queue_alloc(struct request_soc } EXPORT_SYMBOL(reqsk_queue_alloc); + +void reqsk_queue_destroy(struct request_sock_queue *queue) +{ + /* make all the listen_opt local to us */ + struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); + + if (lopt->qlen != 0) { + int i; + + for (i = 0; i < lopt->nr_table_entries; i++) { + struct request_sock *req; + + while ((req = lopt->syn_table[i]) != NULL) { + lopt->syn_table[i] = req->dl_next; + lopt->qlen--; + reqsk_free(req); + } + } + } + + BUG_TRAP(lopt->qlen == 0); + kfree(lopt); +} + +EXPORT_SYMBOL(reqsk_queue_destroy); diff -puN net/core/rtnetlink.c~git-net net/core/rtnetlink.c --- devel/net/core/rtnetlink.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/core/rtnetlink.c 2005-08-06 23:39:10.000000000 -0700 @@ -708,7 +708,7 @@ void __init rtnetlink_init(void) if (!rta_buf) panic("rtnetlink_init: cannot allocate rta_buf\n"); - rtnl = netlink_kernel_create(NETLINK_ROUTE, rtnetlink_rcv); + rtnl = netlink_kernel_create(NETLINK_ROUTE, rtnetlink_rcv, THIS_MODULE); if (rtnl == NULL) panic("rtnetlink_init: cannot initialize rtnetlink\n"); netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV); diff -puN net/core/skbuff.c~git-net net/core/skbuff.c --- devel/net/core/skbuff.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/core/skbuff.c 2005-08-06 23:39:10.000000000 -0700 @@ -281,8 +281,6 @@ void kfree_skbmem(struct sk_buff *skb) void __kfree_skb(struct sk_buff *skb) { - BUG_ON(skb->list != NULL); - dst_release(skb->dst); #ifdef CONFIG_XFRM secpath_put(skb->sp); @@ -302,7 +300,6 @@ void __kfree_skb(struct sk_buff *skb) skb->tc_index = 0; #ifdef CONFIG_NET_CLS_ACT skb->tc_verd = 0; - skb->tc_classid = 0; #endif #endif @@ -333,11 +330,9 @@ struct sk_buff *skb_clone(struct sk_buff #define C(x) n->x = skb->x n->next = n->prev = NULL; - n->list = NULL; n->sk = NULL; C(stamp); C(dev); - C(real_dev); C(h); C(nh); C(mac); @@ -361,7 +356,6 @@ struct sk_buff *skb_clone(struct sk_buff n->destructor = NULL; #ifdef CONFIG_NETFILTER C(nfmark); - C(nfcache); C(nfct); nf_conntrack_get(skb->nfct); C(nfctinfo); @@ -370,9 +364,6 @@ struct sk_buff *skb_clone(struct sk_buff nf_bridge_get(skb->nf_bridge); #endif #endif /*CONFIG_NETFILTER*/ -#if defined(CONFIG_HIPPI) - C(private); -#endif #ifdef CONFIG_NET_SCHED C(tc_index); #ifdef CONFIG_NET_CLS_ACT @@ -380,7 +371,6 @@ struct sk_buff *skb_clone(struct sk_buff n->tc_verd = CLR_TC_OK2MUNGE(n->tc_verd); n->tc_verd = CLR_TC_MUNGED(n->tc_verd); C(input_dev); - C(tc_classid); #endif #endif @@ -404,10 +394,8 @@ static void copy_skb_header(struct sk_bu */ unsigned long offset = new->data - old->data; - new->list = NULL; new->sk = NULL; new->dev = old->dev; - new->real_dev = old->real_dev; new->priority = old->priority; new->protocol = old->protocol; new->dst = dst_clone(old->dst); @@ -424,7 +412,6 @@ static void copy_skb_header(struct sk_bu new->destructor = NULL; #ifdef CONFIG_NETFILTER new->nfmark = old->nfmark; - new->nfcache = old->nfcache; new->nfct = old->nfct; nf_conntrack_get(old->nfct); new->nfctinfo = old->nfctinfo; @@ -1344,50 +1331,43 @@ void skb_queue_tail(struct sk_buff_head __skb_queue_tail(list, newsk); spin_unlock_irqrestore(&list->lock, flags); } + /** * skb_unlink - remove a buffer from a list * @skb: buffer to remove + * @list: list to use * - * Place a packet after a given packet in a list. The list locks are taken - * and this function is atomic with respect to other list locked calls + * Remove a packet from a list. The list locks are taken and this + * function is atomic with respect to other list locked calls * - * Works even without knowing the list it is sitting on, which can be - * handy at times. It also means that THE LIST MUST EXIST when you - * unlink. Thus a list must have its contents unlinked before it is - * destroyed. + * You must know what list the SKB is on. */ -void skb_unlink(struct sk_buff *skb) +void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) { - struct sk_buff_head *list = skb->list; - - if (list) { - unsigned long flags; + unsigned long flags; - spin_lock_irqsave(&list->lock, flags); - if (skb->list == list) - __skb_unlink(skb, skb->list); - spin_unlock_irqrestore(&list->lock, flags); - } + spin_lock_irqsave(&list->lock, flags); + __skb_unlink(skb, list); + spin_unlock_irqrestore(&list->lock, flags); } - /** * skb_append - append a buffer * @old: buffer to insert after * @newsk: buffer to insert + * @list: list to use * * Place a packet after a given packet in a list. The list locks are taken * and this function is atomic with respect to other list locked calls. * A buffer cannot be placed on two lists at the same time. */ - -void skb_append(struct sk_buff *old, struct sk_buff *newsk) +void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) { unsigned long flags; - spin_lock_irqsave(&old->list->lock, flags); - __skb_append(old, newsk); - spin_unlock_irqrestore(&old->list->lock, flags); + spin_lock_irqsave(&list->lock, flags); + __skb_append(old, newsk, list); + spin_unlock_irqrestore(&list->lock, flags); } @@ -1395,19 +1375,21 @@ void skb_append(struct sk_buff *old, str * skb_insert - insert a buffer * @old: buffer to insert before * @newsk: buffer to insert + * @list: list to use + * + * Place a packet before a given packet in a list. The list locks are + * taken and this function is atomic with respect to other list locked + * calls. * - * Place a packet before a given packet in a list. The list locks are taken - * and this function is atomic with respect to other list locked calls * A buffer cannot be placed on two lists at the same time. */ - -void skb_insert(struct sk_buff *old, struct sk_buff *newsk) +void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) { unsigned long flags; - spin_lock_irqsave(&old->list->lock, flags); - __skb_insert(newsk, old->prev, old, old->list); - spin_unlock_irqrestore(&old->list->lock, flags); + spin_lock_irqsave(&list->lock, flags); + __skb_insert(newsk, old->prev, old, list); + spin_unlock_irqrestore(&list->lock, flags); } #if 0 diff -puN net/core/sock.c~git-net net/core/sock.c --- devel/net/core/sock.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/core/sock.c 2005-08-06 23:39:10.000000000 -0700 @@ -260,7 +260,7 @@ int sock_setsockopt(struct socket *sock, if (val > sysctl_wmem_max) val = sysctl_wmem_max; - +set_sndbuf: sk->sk_userlocks |= SOCK_SNDBUF_LOCK; if ((val * 2) < SOCK_MIN_SNDBUF) sk->sk_sndbuf = SOCK_MIN_SNDBUF; @@ -274,6 +274,13 @@ int sock_setsockopt(struct socket *sock, sk->sk_write_space(sk); break; + case SO_SNDBUFFORCE: + if (!capable(CAP_NET_ADMIN)) { + ret = -EPERM; + break; + } + goto set_sndbuf; + case SO_RCVBUF: /* Don't error on this BSD doesn't and if you think about it this is right. Otherwise apps have to @@ -282,7 +289,7 @@ int sock_setsockopt(struct socket *sock, if (val > sysctl_rmem_max) val = sysctl_rmem_max; - +set_rcvbuf: sk->sk_userlocks |= SOCK_RCVBUF_LOCK; /* FIXME: is this lower bound the right one? */ if ((val * 2) < SOCK_MIN_RCVBUF) @@ -291,6 +298,13 @@ int sock_setsockopt(struct socket *sock, sk->sk_rcvbuf = val * 2; break; + case SO_RCVBUFFORCE: + if (!capable(CAP_NET_ADMIN)) { + ret = -EPERM; + break; + } + goto set_rcvbuf; + case SO_KEEPALIVE: #ifdef CONFIG_INET if (sk->sk_protocol == IPPROTO_TCP) @@ -686,6 +700,80 @@ void sk_free(struct sock *sk) module_put(owner); } +struct sock *sk_clone(const struct sock *sk, const unsigned int __nocast priority) +{ + struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0); + + if (newsk != NULL) { + struct sk_filter *filter; + + memcpy(newsk, sk, sk->sk_prot->obj_size); + + /* SANITY */ + sk_node_init(&newsk->sk_node); + sock_lock_init(newsk); + bh_lock_sock(newsk); + + atomic_set(&newsk->sk_rmem_alloc, 0); + atomic_set(&newsk->sk_wmem_alloc, 0); + atomic_set(&newsk->sk_omem_alloc, 0); + skb_queue_head_init(&newsk->sk_receive_queue); + skb_queue_head_init(&newsk->sk_write_queue); + + rwlock_init(&newsk->sk_dst_lock); + rwlock_init(&newsk->sk_callback_lock); + + newsk->sk_dst_cache = NULL; + newsk->sk_wmem_queued = 0; + newsk->sk_forward_alloc = 0; + newsk->sk_send_head = NULL; + newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; + newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; + + sock_reset_flag(newsk, SOCK_DONE); + skb_queue_head_init(&newsk->sk_error_queue); + + filter = newsk->sk_filter; + if (filter != NULL) + sk_filter_charge(newsk, filter); + + if (unlikely(xfrm_sk_clone_policy(newsk))) { + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + newsk->sk_destruct = NULL; + sk_free(newsk); + newsk = NULL; + goto out; + } + + newsk->sk_err = 0; + newsk->sk_priority = 0; + atomic_set(&newsk->sk_refcnt, 2); + + /* + * Increment the counter in the same struct proto as the master + * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that + * is the same as sk->sk_prot->socks, as this field was copied + * with memcpy). + * + * This _changes_ the previous behaviour, where + * tcp_create_openreq_child always was incrementing the + * equivalent to tcp_prot->socks (inet_sock_nr), so this have + * to be taken into account in all callers. -acme + */ + sk_refcnt_debug_inc(newsk); + newsk->sk_socket = NULL; + newsk->sk_sleep = NULL; + + if (newsk->sk_prot->sockets_allocated) + atomic_inc(newsk->sk_prot->sockets_allocated); + } +out: + return newsk; +} + +EXPORT_SYMBOL_GPL(sk_clone); + void __init sk_init(void) { if (num_physpages <= 4096) { @@ -1353,11 +1441,7 @@ void sk_common_release(struct sock *sk) xfrm_sk_free_policy(sk); -#ifdef INET_REFCNT_DEBUG - if (atomic_read(&sk->sk_refcnt) != 1) - printk(KERN_DEBUG "Destruction of the socket %p delayed, c=%d\n", - sk, atomic_read(&sk->sk_refcnt)); -#endif + sk_refcnt_debug_release(sk); sock_put(sk); } @@ -1368,7 +1452,8 @@ static LIST_HEAD(proto_list); int proto_register(struct proto *prot, int alloc_slab) { - char *request_sock_slab_name; + char *request_sock_slab_name = NULL; + char *timewait_sock_slab_name; int rc = -ENOBUFS; if (alloc_slab) { @@ -1399,6 +1484,23 @@ int proto_register(struct proto *prot, i goto out_free_request_sock_slab_name; } } + + if (prot->twsk_obj_size) { + static const char mask[] = "tw_sock_%s"; + + timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL); + + if (timewait_sock_slab_name == NULL) + goto out_free_request_sock_slab; + + sprintf(timewait_sock_slab_name, mask, prot->name); + prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name, + prot->twsk_obj_size, + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (prot->twsk_slab == NULL) + goto out_free_timewait_sock_slab_name; + } } write_lock(&proto_list_lock); @@ -1407,6 +1509,13 @@ int proto_register(struct proto *prot, i rc = 0; out: return rc; +out_free_timewait_sock_slab_name: + kfree(timewait_sock_slab_name); +out_free_request_sock_slab: + if (prot->rsk_prot && prot->rsk_prot->slab) { + kmem_cache_destroy(prot->rsk_prot->slab); + prot->rsk_prot->slab = NULL; + } out_free_request_sock_slab_name: kfree(request_sock_slab_name); out_free_sock_slab: @@ -1434,6 +1543,14 @@ void proto_unregister(struct proto *prot prot->rsk_prot->slab = NULL; } + if (prot->twsk_slab != NULL) { + const char *name = kmem_cache_name(prot->twsk_slab); + + kmem_cache_destroy(prot->twsk_slab); + kfree(name); + prot->twsk_slab = NULL; + } + list_del(&prot->node); write_unlock(&proto_list_lock); } diff -puN /dev/null net/dccp/ccid.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/dccp/ccid.c 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,139 @@ +/* + * net/dccp/ccid.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * CCID infrastructure + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "ccid.h" + +static struct ccid *ccids[CCID_MAX]; +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) +static atomic_t ccids_lockct = ATOMIC_INIT(0); +static DEFINE_SPINLOCK(ccids_lock); + +/* + * The strategy is: modifications ccids vector are short, do not sleep and + * veeery rare, but read access should be free of any exclusive locks. + */ +static void ccids_write_lock(void) +{ + spin_lock(&ccids_lock); + while (atomic_read(&ccids_lockct) != 0) { + spin_unlock(&ccids_lock); + yield(); + spin_lock(&ccids_lock); + } +} + +static inline void ccids_write_unlock(void) +{ + spin_unlock(&ccids_lock); +} + +static inline void ccids_read_lock(void) +{ + atomic_inc(&ccids_lockct); + spin_unlock_wait(&ccids_lock); +} + +static inline void ccids_read_unlock(void) +{ + atomic_dec(&ccids_lockct); +} + +#else +#define ccids_write_lock() do { } while(0) +#define ccids_write_unlock() do { } while(0) +#define ccids_read_lock() do { } while(0) +#define ccids_read_unlock() do { } while(0) +#endif + +int ccid_register(struct ccid *ccid) +{ + int err; + + if (ccid->ccid_init == NULL) + return -1; + + ccids_write_lock(); + err = -EEXIST; + if (ccids[ccid->ccid_id] == NULL) { + ccids[ccid->ccid_id] = ccid; + err = 0; + } + ccids_write_unlock(); + if (err == 0) + pr_info("CCID: Registered CCID %d (%s)\n", + ccid->ccid_id, ccid->ccid_name); + return err; +} + +EXPORT_SYMBOL_GPL(ccid_register); + +int ccid_unregister(struct ccid *ccid) +{ + ccids_write_lock(); + ccids[ccid->ccid_id] = NULL; + ccids_write_unlock(); + pr_info("CCID: Unregistered CCID %d (%s)\n", + ccid->ccid_id, ccid->ccid_name); + return 0; +} + +EXPORT_SYMBOL_GPL(ccid_unregister); + +struct ccid *ccid_init(unsigned char id, struct sock *sk) +{ + struct ccid *ccid; + +#ifdef CONFIG_KMOD + if (ccids[id] == NULL) + request_module("net-dccp-ccid-%d", id); +#endif + ccids_read_lock(); + + ccid = ccids[id]; + if (ccid == NULL) + goto out; + + if (!try_module_get(ccid->ccid_owner)) + goto out_err; + + if (ccid->ccid_init(sk) != 0) + goto out_module_put; +out: + ccids_read_unlock(); + return ccid; +out_module_put: + module_put(ccid->ccid_owner); +out_err: + ccid = NULL; + goto out; +} + +EXPORT_SYMBOL_GPL(ccid_init); + +void ccid_exit(struct ccid *ccid, struct sock *sk) +{ + if (ccid == NULL) + return; + + ccids_read_lock(); + + if (ccids[ccid->ccid_id] != NULL) { + if (ccid->ccid_exit != NULL) + ccid->ccid_exit(sk); + module_put(ccid->ccid_owner); + } + + ccids_read_unlock(); +} + +EXPORT_SYMBOL_GPL(ccid_exit); diff -puN /dev/null net/dccp/ccid.h --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/dccp/ccid.h 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,156 @@ +#ifndef _CCID_H +#define _CCID_H +/* + * net/dccp/ccid.h + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * CCID infrastructure + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +#define CCID_MAX 255 + +struct ccid { + unsigned char ccid_id; + const char *ccid_name; + struct module *ccid_owner; + int (*ccid_init)(struct sock *sk); + void (*ccid_exit)(struct sock *sk); + int (*ccid_hc_rx_init)(struct sock *sk); + int (*ccid_hc_tx_init)(struct sock *sk); + void (*ccid_hc_rx_exit)(struct sock *sk); + void (*ccid_hc_tx_exit)(struct sock *sk); + void (*ccid_hc_rx_packet_recv)(struct sock *sk, struct sk_buff *skb); + int (*ccid_hc_rx_parse_options)(struct sock *sk, + unsigned char option, + unsigned char len, u16 idx, + unsigned char* value); + void (*ccid_hc_rx_insert_options)(struct sock *sk, struct sk_buff *skb); + void (*ccid_hc_tx_insert_options)(struct sock *sk, struct sk_buff *skb); + void (*ccid_hc_tx_packet_recv)(struct sock *sk, struct sk_buff *skb); + int (*ccid_hc_tx_parse_options)(struct sock *sk, + unsigned char option, + unsigned char len, u16 idx, + unsigned char* value); + int (*ccid_hc_tx_send_packet)(struct sock *sk, + struct sk_buff *skb, int len, + long *delay); + void (*ccid_hc_tx_packet_sent)(struct sock *sk, int more, int len); +}; + +extern int ccid_register(struct ccid *ccid); +extern int ccid_unregister(struct ccid *ccid); + +extern struct ccid *ccid_init(unsigned char id, struct sock *sk); +extern void ccid_exit(struct ccid *ccid, struct sock *sk); + +static inline void __ccid_get(struct ccid *ccid) +{ + __module_get(ccid->ccid_owner); +} + +static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, + struct sk_buff *skb, int len, + long *delay) +{ + int rc = 0; + if (ccid->ccid_hc_tx_send_packet != NULL) + rc = ccid->ccid_hc_tx_send_packet(sk, skb, len, delay); + return rc; +} + +static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, + int more, int len) +{ + if (ccid->ccid_hc_tx_packet_sent != NULL) + ccid->ccid_hc_tx_packet_sent(sk, more, len); +} + +static inline int ccid_hc_rx_init(struct ccid *ccid, struct sock *sk) +{ + int rc = 0; + if (ccid->ccid_hc_rx_init != NULL) + rc = ccid->ccid_hc_rx_init(sk); + return rc; +} + +static inline int ccid_hc_tx_init(struct ccid *ccid, struct sock *sk) +{ + int rc = 0; + if (ccid->ccid_hc_tx_init != NULL) + rc = ccid->ccid_hc_tx_init(sk); + return rc; +} + +static inline void ccid_hc_rx_exit(struct ccid *ccid, struct sock *sk) +{ + if (ccid->ccid_hc_rx_exit != NULL) + ccid->ccid_hc_rx_exit(sk); +} + +static inline void ccid_hc_tx_exit(struct ccid *ccid, struct sock *sk) +{ + if (ccid->ccid_hc_tx_exit != NULL) + ccid->ccid_hc_tx_exit(sk); +} + +static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, + struct sk_buff *skb) +{ + if (ccid->ccid_hc_rx_packet_recv != NULL) + ccid->ccid_hc_rx_packet_recv(sk, skb); +} + +static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk, + struct sk_buff *skb) +{ + if (ccid->ccid_hc_tx_packet_recv != NULL) + ccid->ccid_hc_tx_packet_recv(sk, skb); +} + +static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, + unsigned char option, + unsigned char len, u16 idx, + unsigned char* value) +{ + int rc = 0; + if (ccid->ccid_hc_tx_parse_options != NULL) + rc = ccid->ccid_hc_tx_parse_options(sk, option, len, idx, value); + return rc; +} + +static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, + unsigned char option, + unsigned char len, u16 idx, + unsigned char* value) +{ + int rc = 0; + if (ccid->ccid_hc_rx_parse_options != NULL) + rc = ccid->ccid_hc_rx_parse_options(sk, option, len, idx, value); + return rc; +} + +static inline void ccid_hc_tx_insert_options(struct ccid *ccid, struct sock *sk, + struct sk_buff *skb) +{ + if (ccid->ccid_hc_tx_insert_options != NULL) + ccid->ccid_hc_tx_insert_options(sk, skb); +} + +static inline void ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, + struct sk_buff *skb) +{ + if (ccid->ccid_hc_rx_insert_options != NULL) + ccid->ccid_hc_rx_insert_options(sk, skb); +} +#endif /* _CCID_H */ diff -puN /dev/null net/dccp/ccids/ccid3.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/dccp/ccids/ccid3.c 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,2156 @@ +/* + * net/dccp/ccids/ccid3.c + * + * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. + * + * An implementation of the DCCP protocol + * + * This code has been developed by the University of Waikato WAND + * research group. For further information please see http://www.wand.net.nz/ + * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz + * + * This code also uses code from Lulea University, rereleased as GPL by its + * authors: + * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon + * + * Changes to meet Linux coding standards, to make it meet latest ccid3 draft + * and to make it work as a loadable module in the DCCP stack written by + * Arnaldo Carvalho de Melo . + * + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "../ccid.h" +#include "../dccp.h" +#include "ccid3.h" + +#ifdef CCID3_DEBUG +extern int ccid3_debug; + +#define ccid3_pr_debug(format, a...) \ + do { if (ccid3_debug) \ + printk(KERN_DEBUG "%s: " format, __FUNCTION__, ##a); \ + } while (0) +#else +#define ccid3_pr_debug(format, a...) +#endif + +#define TFRC_MIN_PACKET_SIZE 16 +#define TFRC_STD_PACKET_SIZE 256 +#define TFRC_MAX_PACKET_SIZE 65535 + +#define USEC_IN_SEC 1000000 + +#define TFRC_INITIAL_TIMEOUT (2 * USEC_IN_SEC) +/* two seconds as per CCID3 spec 11 */ + +#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_IN_SEC / (2 * HZ)) +/* above is in usecs - half the scheduling granularity as per RFC3448 4.6 */ + +#define TFRC_WIN_COUNT_PER_RTT 4 +#define TFRC_WIN_COUNT_LIMIT 16 + +#define TFRC_MAX_BACK_OFF_TIME 64 +/* above is in seconds */ + +#define TFRC_SMALLEST_P 40 + +#define TFRC_RECV_IVAL_F_LENGTH 8 /* length(w[]) */ + +/* Number of later packets received before one is considered lost */ +#define TFRC_RECV_NUM_LATE_LOSS 3 + +enum ccid3_options { + TFRC_OPT_LOSS_EVENT_RATE = 192, + TFRC_OPT_LOSS_INTERVALS = 193, + TFRC_OPT_RECEIVE_RATE = 194, +}; + +static int ccid3_debug; + +static kmem_cache_t *ccid3_tx_hist_slab; +static kmem_cache_t *ccid3_rx_hist_slab; +static kmem_cache_t *ccid3_loss_interval_hist_slab; + +static inline struct ccid3_tx_hist_entry *ccid3_tx_hist_entry_new(int prio) +{ + struct ccid3_tx_hist_entry *entry = kmem_cache_alloc(ccid3_tx_hist_slab, prio); + + if (entry != NULL) + entry->ccid3htx_sent = 0; + + return entry; +} + +static inline void ccid3_tx_hist_entry_delete(struct ccid3_tx_hist_entry *entry) +{ + if (entry != NULL) + kmem_cache_free(ccid3_tx_hist_slab, entry); +} + +static inline struct ccid3_rx_hist_entry *ccid3_rx_hist_entry_new(struct sock *sk, + struct sk_buff *skb, + int prio) +{ + struct ccid3_rx_hist_entry *entry = kmem_cache_alloc(ccid3_rx_hist_slab, prio); + + if (entry != NULL) { + const struct dccp_hdr *dh = dccp_hdr(skb); + + entry->ccid3hrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq; + entry->ccid3hrx_win_count = dh->dccph_ccval; + entry->ccid3hrx_type = dh->dccph_type; + entry->ccid3hrx_ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; + do_gettimeofday(&(entry->ccid3hrx_tstamp)); + } + + return entry; +} + +static inline void ccid3_rx_hist_entry_delete(struct ccid3_rx_hist_entry *entry) +{ + if (entry != NULL) + kmem_cache_free(ccid3_rx_hist_slab, entry); +} + +static void ccid3_rx_history_delete(struct list_head *hist) +{ + struct ccid3_rx_hist_entry *entry, *next; + + list_for_each_entry_safe(entry, next, hist, ccid3hrx_node) { + list_del_init(&entry->ccid3hrx_node); + kmem_cache_free(ccid3_rx_hist_slab, entry); + } +} + +static inline struct ccid3_loss_interval_hist_entry *ccid3_loss_interval_hist_entry_new(int prio) +{ + return kmem_cache_alloc(ccid3_loss_interval_hist_slab, prio); +} + +static inline void ccid3_loss_interval_hist_entry_delete(struct ccid3_loss_interval_hist_entry *entry) +{ + if (entry != NULL) + kmem_cache_free(ccid3_loss_interval_hist_slab, entry); +} + +static void ccid3_loss_interval_history_delete(struct list_head *hist) +{ + struct ccid3_loss_interval_hist_entry *entry, *next; + + list_for_each_entry_safe(entry, next, hist, ccid3lih_node) { + list_del_init(&entry->ccid3lih_node); + kmem_cache_free(ccid3_loss_interval_hist_slab, entry); + } +} + +static int ccid3_init(struct sock *sk) +{ + ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); + return 0; +} + +static void ccid3_exit(struct sock *sk) +{ + ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); +} + +/* TFRC sender states */ +enum ccid3_hc_tx_states { + TFRC_SSTATE_NO_SENT = 1, + TFRC_SSTATE_NO_FBACK, + TFRC_SSTATE_FBACK, + TFRC_SSTATE_TERM, +}; + +#ifdef CCID3_DEBUG +static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) +{ + static char *ccid3_state_names[] = { + [TFRC_SSTATE_NO_SENT] = "NO_SENT", + [TFRC_SSTATE_NO_FBACK] = "NO_FBACK", + [TFRC_SSTATE_FBACK] = "FBACK", + [TFRC_SSTATE_TERM] = "TERM", + }; + + return ccid3_state_names[state]; +} +#endif + +static inline void ccid3_hc_tx_set_state(struct sock *sk, enum ccid3_hc_tx_states state) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state; + + ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", + dccp_role(sk), sk, ccid3_tx_state_name(oldstate), ccid3_tx_state_name(state)); + WARN_ON(state == oldstate); + hctx->ccid3hctx_state = state; +} + +static void timeval_sub(struct timeval large, struct timeval small, struct timeval *result) { + + result->tv_sec = large.tv_sec-small.tv_sec; + if (large.tv_usec < small.tv_usec) { + (result->tv_sec)--; + result->tv_usec = USEC_IN_SEC+large.tv_usec-small.tv_usec; + } else + result->tv_usec = large.tv_usec-small.tv_usec; +} + +static inline void timeval_fix(struct timeval *tv) { + if (tv->tv_usec >= USEC_IN_SEC) { + tv->tv_sec++; + tv->tv_usec -= USEC_IN_SEC; + } +} + +/* returns the difference in usecs between timeval passed in and current time */ +static inline u32 now_delta(struct timeval tv) { + struct timeval now; + + do_gettimeofday(&now); + return ((now.tv_sec-tv.tv_sec)*1000000+now.tv_usec-tv.tv_usec); +} + +#define CALCX_ARRSIZE 500 + +#define CALCX_SPLIT 50000 +/* equivalent to 0.05 */ + +static const u32 calcx_lookup[CALCX_ARRSIZE][2] = { + { 37172 , 8172 }, + { 53499 , 11567 }, + { 66664 , 14180 }, + { 78298 , 16388 }, + { 89021 , 18339 }, + { 99147 , 20108 }, + { 108858 , 21738 }, + { 118273 , 23260 }, + { 127474 , 24693 }, + { 136520 , 26052 }, + { 145456 , 27348 }, + { 154316 , 28589 }, + { 163130 , 29783 }, + { 171919 , 30935 }, + { 180704 , 32049 }, + { 189502 , 33130 }, + { 198328 , 34180 }, + { 207194 , 35202 }, + { 216114 , 36198 }, + { 225097 , 37172 }, + { 234153 , 38123 }, + { 243294 , 39055 }, + { 252527 , 39968 }, + { 261861 , 40864 }, + { 271305 , 41743 }, + { 280866 , 42607 }, + { 290553 , 43457 }, + { 300372 , 44293 }, + { 310333 , 45117 }, + { 320441 , 45929 }, + { 330705 , 46729 }, + { 341131 , 47518 }, + { 351728 , 48297 }, + { 362501 , 49066 }, + { 373460 , 49826 }, + { 384609 , 50577 }, + { 395958 , 51320 }, + { 407513 , 52054 }, + { 419281 , 52780 }, + { 431270 , 53499 }, + { 443487 , 54211 }, + { 455940 , 54916 }, + { 468635 , 55614 }, + { 481581 , 56306 }, + { 494785 , 56991 }, + { 508254 , 57671 }, + { 521996 , 58345 }, + { 536019 , 59014 }, + { 550331 , 59677 }, + { 564939 , 60335 }, + { 579851 , 60988 }, + { 595075 , 61636 }, + { 610619 , 62279 }, + { 626491 , 62918 }, + { 642700 , 63553 }, + { 659253 , 64183 }, + { 676158 , 64809 }, + { 693424 , 65431 }, + { 711060 , 66050 }, + { 729073 , 66664 }, + { 747472 , 67275 }, + { 766266 , 67882 }, + { 785464 , 68486 }, + { 805073 , 69087 }, + { 825103 , 69684 }, + { 845562 , 70278 }, + { 866460 , 70868 }, + { 887805 , 71456 }, + { 909606 , 72041 }, + { 931873 , 72623 }, + { 954614 , 73202 }, + { 977839 , 73778 }, + { 1001557 , 74352 }, + { 1025777 , 74923 }, + { 1050508 , 75492 }, + { 1075761 , 76058 }, + { 1101544 , 76621 }, + { 1127867 , 77183 }, + { 1154739 , 77741 }, + { 1182172 , 78298 }, + { 1210173 , 78852 }, + { 1238753 , 79405 }, + { 1267922 , 79955 }, + { 1297689 , 80503 }, + { 1328066 , 81049 }, + { 1359060 , 81593 }, + { 1390684 , 82135 }, + { 1422947 , 82675 }, + { 1455859 , 83213 }, + { 1489430 , 83750 }, + { 1523671 , 84284 }, + { 1558593 , 84817 }, + { 1594205 , 85348 }, + { 1630518 , 85878 }, + { 1667543 , 86406 }, + { 1705290 , 86932 }, + { 1743770 , 87457 }, + { 1782994 , 87980 }, + { 1822973 , 88501 }, + { 1863717 , 89021 }, + { 1905237 , 89540 }, + { 1947545 , 90057 }, + { 1990650 , 90573 }, + { 2034566 , 91087 }, + { 2079301 , 91600 }, + { 2124869 , 92111 }, + { 2171279 , 92622 }, + { 2218543 , 93131 }, + { 2266673 , 93639 }, + { 2315680 , 94145 }, + { 2365575 , 94650 }, + { 2416371 , 95154 }, + { 2468077 , 95657 }, + { 2520707 , 96159 }, + { 2574271 , 96660 }, + { 2628782 , 97159 }, + { 2684250 , 97658 }, + { 2740689 , 98155 }, + { 2798110 , 98651 }, + { 2856524 , 99147 }, + { 2915944 , 99641 }, + { 2976382 , 100134 }, + { 3037850 , 100626 }, + { 3100360 , 101117 }, + { 3163924 , 101608 }, + { 3228554 , 102097 }, + { 3294263 , 102586 }, + { 3361063 , 103073 }, + { 3428966 , 103560 }, + { 3497984 , 104045 }, + { 3568131 , 104530 }, + { 3639419 , 105014 }, + { 3711860 , 105498 }, + { 3785467 , 105980 }, + { 3860253 , 106462 }, + { 3936229 , 106942 }, + { 4013410 , 107422 }, + { 4091808 , 107902 }, + { 4171435 , 108380 }, + { 4252306 , 108858 }, + { 4334431 , 109335 }, + { 4417825 , 109811 }, + { 4502501 , 110287 }, + { 4588472 , 110762 }, + { 4675750 , 111236 }, + { 4764349 , 111709 }, + { 4854283 , 112182 }, + { 4945564 , 112654 }, + { 5038206 , 113126 }, + { 5132223 , 113597 }, + { 5227627 , 114067 }, + { 5324432 , 114537 }, + { 5422652 , 115006 }, + { 5522299 , 115474 }, + { 5623389 , 115942 }, + { 5725934 , 116409 }, + { 5829948 , 116876 }, + { 5935446 , 117342 }, + { 6042439 , 117808 }, + { 6150943 , 118273 }, + { 6260972 , 118738 }, + { 6372538 , 119202 }, + { 6485657 , 119665 }, + { 6600342 , 120128 }, + { 6716607 , 120591 }, + { 6834467 , 121053 }, + { 6953935 , 121514 }, + { 7075025 , 121976 }, + { 7197752 , 122436 }, + { 7322131 , 122896 }, + { 7448175 , 123356 }, + { 7575898 , 123815 }, + { 7705316 , 124274 }, + { 7836442 , 124733 }, + { 7969291 , 125191 }, + { 8103877 , 125648 }, + { 8240216 , 126105 }, + { 8378321 , 126562 }, + { 8518208 , 127018 }, + { 8659890 , 127474 }, + { 8803384 , 127930 }, + { 8948702 , 128385 }, + { 9095861 , 128840 }, + { 9244875 , 129294 }, + { 9395760 , 129748 }, + { 9548529 , 130202 }, + { 9703198 , 130655 }, + { 9859782 , 131108 }, + { 10018296 , 131561 }, + { 10178755 , 132014 }, + { 10341174 , 132466 }, + { 10505569 , 132917 }, + { 10671954 , 133369 }, + { 10840345 , 133820 }, + { 11010757 , 134271 }, + { 11183206 , 134721 }, + { 11357706 , 135171 }, + { 11534274 , 135621 }, + { 11712924 , 136071 }, + { 11893673 , 136520 }, + { 12076536 , 136969 }, + { 12261527 , 137418 }, + { 12448664 , 137867 }, + { 12637961 , 138315 }, + { 12829435 , 138763 }, + { 13023101 , 139211 }, + { 13218974 , 139658 }, + { 13417071 , 140106 }, + { 13617407 , 140553 }, + { 13819999 , 140999 }, + { 14024862 , 141446 }, + { 14232012 , 141892 }, + { 14441465 , 142339 }, + { 14653238 , 142785 }, + { 14867346 , 143230 }, + { 15083805 , 143676 }, + { 15302632 , 144121 }, + { 15523842 , 144566 }, + { 15747453 , 145011 }, + { 15973479 , 145456 }, + { 16201939 , 145900 }, + { 16432847 , 146345 }, + { 16666221 , 146789 }, + { 16902076 , 147233 }, + { 17140429 , 147677 }, + { 17381297 , 148121 }, + { 17624696 , 148564 }, + { 17870643 , 149007 }, + { 18119154 , 149451 }, + { 18370247 , 149894 }, + { 18623936 , 150336 }, + { 18880241 , 150779 }, + { 19139176 , 151222 }, + { 19400759 , 151664 }, + { 19665007 , 152107 }, + { 19931936 , 152549 }, + { 20201564 , 152991 }, + { 20473907 , 153433 }, + { 20748982 , 153875 }, + { 21026807 , 154316 }, + { 21307399 , 154758 }, + { 21590773 , 155199 }, + { 21876949 , 155641 }, + { 22165941 , 156082 }, + { 22457769 , 156523 }, + { 22752449 , 156964 }, + { 23049999 , 157405 }, + { 23350435 , 157846 }, + { 23653774 , 158287 }, + { 23960036 , 158727 }, + { 24269236 , 159168 }, + { 24581392 , 159608 }, + { 24896521 , 160049 }, + { 25214642 , 160489 }, + { 25535772 , 160929 }, + { 25859927 , 161370 }, + { 26187127 , 161810 }, + { 26517388 , 162250 }, + { 26850728 , 162690 }, + { 27187165 , 163130 }, + { 27526716 , 163569 }, + { 27869400 , 164009 }, + { 28215234 , 164449 }, + { 28564236 , 164889 }, + { 28916423 , 165328 }, + { 29271815 , 165768 }, + { 29630428 , 166208 }, + { 29992281 , 166647 }, + { 30357392 , 167087 }, + { 30725779 , 167526 }, + { 31097459 , 167965 }, + { 31472452 , 168405 }, + { 31850774 , 168844 }, + { 32232445 , 169283 }, + { 32617482 , 169723 }, + { 33005904 , 170162 }, + { 33397730 , 170601 }, + { 33792976 , 171041 }, + { 34191663 , 171480 }, + { 34593807 , 171919 }, + { 34999428 , 172358 }, + { 35408544 , 172797 }, + { 35821174 , 173237 }, + { 36237335 , 173676 }, + { 36657047 , 174115 }, + { 37080329 , 174554 }, + { 37507197 , 174993 }, + { 37937673 , 175433 }, + { 38371773 , 175872 }, + { 38809517 , 176311 }, + { 39250924 , 176750 }, + { 39696012 , 177190 }, + { 40144800 , 177629 }, + { 40597308 , 178068 }, + { 41053553 , 178507 }, + { 41513554 , 178947 }, + { 41977332 , 179386 }, + { 42444904 , 179825 }, + { 42916290 , 180265 }, + { 43391509 , 180704 }, + { 43870579 , 181144 }, + { 44353520 , 181583 }, + { 44840352 , 182023 }, + { 45331092 , 182462 }, + { 45825761 , 182902 }, + { 46324378 , 183342 }, + { 46826961 , 183781 }, + { 47333531 , 184221 }, + { 47844106 , 184661 }, + { 48358706 , 185101 }, + { 48877350 , 185541 }, + { 49400058 , 185981 }, + { 49926849 , 186421 }, + { 50457743 , 186861 }, + { 50992759 , 187301 }, + { 51531916 , 187741 }, + { 52075235 , 188181 }, + { 52622735 , 188622 }, + { 53174435 , 189062 }, + { 53730355 , 189502 }, + { 54290515 , 189943 }, + { 54854935 , 190383 }, + { 55423634 , 190824 }, + { 55996633 , 191265 }, + { 56573950 , 191706 }, + { 57155606 , 192146 }, + { 57741621 , 192587 }, + { 58332014 , 193028 }, + { 58926806 , 193470 }, + { 59526017 , 193911 }, + { 60129666 , 194352 }, + { 60737774 , 194793 }, + { 61350361 , 195235 }, + { 61967446 , 195677 }, + { 62589050 , 196118 }, + { 63215194 , 196560 }, + { 63845897 , 197002 }, + { 64481179 , 197444 }, + { 65121061 , 197886 }, + { 65765563 , 198328 }, + { 66414705 , 198770 }, + { 67068508 , 199213 }, + { 67726992 , 199655 }, + { 68390177 , 200098 }, + { 69058085 , 200540 }, + { 69730735 , 200983 }, + { 70408147 , 201426 }, + { 71090343 , 201869 }, + { 71777343 , 202312 }, + { 72469168 , 202755 }, + { 73165837 , 203199 }, + { 73867373 , 203642 }, + { 74573795 , 204086 }, + { 75285124 , 204529 }, + { 76001380 , 204973 }, + { 76722586 , 205417 }, + { 77448761 , 205861 }, + { 78179926 , 206306 }, + { 78916102 , 206750 }, + { 79657310 , 207194 }, + { 80403571 , 207639 }, + { 81154906 , 208084 }, + { 81911335 , 208529 }, + { 82672880 , 208974 }, + { 83439562 , 209419 }, + { 84211402 , 209864 }, + { 84988421 , 210309 }, + { 85770640 , 210755 }, + { 86558080 , 211201 }, + { 87350762 , 211647 }, + { 88148708 , 212093 }, + { 88951938 , 212539 }, + { 89760475 , 212985 }, + { 90574339 , 213432 }, + { 91393551 , 213878 }, + { 92218133 , 214325 }, + { 93048107 , 214772 }, + { 93883493 , 215219 }, + { 94724314 , 215666 }, + { 95570590 , 216114 }, + { 96422343 , 216561 }, + { 97279594 , 217009 }, + { 98142366 , 217457 }, + { 99010679 , 217905 }, + { 99884556 , 218353 }, + { 100764018 , 218801 }, + { 101649086 , 219250 }, + { 102539782 , 219698 }, + { 103436128 , 220147 }, + { 104338146 , 220596 }, + { 105245857 , 221046 }, + { 106159284 , 221495 }, + { 107078448 , 221945 }, + { 108003370 , 222394 }, + { 108934074 , 222844 }, + { 109870580 , 223294 }, + { 110812910 , 223745 }, + { 111761087 , 224195 }, + { 112715133 , 224646 }, + { 113675069 , 225097 }, + { 114640918 , 225548 }, + { 115612702 , 225999 }, + { 116590442 , 226450 }, + { 117574162 , 226902 }, + { 118563882 , 227353 }, + { 119559626 , 227805 }, + { 120561415 , 228258 }, + { 121569272 , 228710 }, + { 122583219 , 229162 }, + { 123603278 , 229615 }, + { 124629471 , 230068 }, + { 125661822 , 230521 }, + { 126700352 , 230974 }, + { 127745083 , 231428 }, + { 128796039 , 231882 }, + { 129853241 , 232336 }, + { 130916713 , 232790 }, + { 131986475 , 233244 }, + { 133062553 , 233699 }, + { 134144966 , 234153 }, + { 135233739 , 234608 }, + { 136328894 , 235064 }, + { 137430453 , 235519 }, + { 138538440 , 235975 }, + { 139652876 , 236430 }, + { 140773786 , 236886 }, + { 141901190 , 237343 }, + { 143035113 , 237799 }, + { 144175576 , 238256 }, + { 145322604 , 238713 }, + { 146476218 , 239170 }, + { 147636442 , 239627 }, + { 148803298 , 240085 }, + { 149976809 , 240542 }, + { 151156999 , 241000 }, + { 152343890 , 241459 }, + { 153537506 , 241917 }, + { 154737869 , 242376 }, + { 155945002 , 242835 }, + { 157158929 , 243294 }, + { 158379673 , 243753 }, + { 159607257 , 244213 }, + { 160841704 , 244673 }, + { 162083037 , 245133 }, + { 163331279 , 245593 }, + { 164586455 , 246054 }, + { 165848586 , 246514 }, + { 167117696 , 246975 }, + { 168393810 , 247437 }, + { 169676949 , 247898 }, + { 170967138 , 248360 }, + { 172264399 , 248822 }, + { 173568757 , 249284 }, + { 174880235 , 249747 }, + { 176198856 , 250209 }, + { 177524643 , 250672 }, + { 178857621 , 251136 }, + { 180197813 , 251599 }, + { 181545242 , 252063 }, + { 182899933 , 252527 }, + { 184261908 , 252991 }, + { 185631191 , 253456 }, + { 187007807 , 253920 }, + { 188391778 , 254385 }, + { 189783129 , 254851 }, + { 191181884 , 255316 }, + { 192588065 , 255782 }, + { 194001698 , 256248 }, + { 195422805 , 256714 }, + { 196851411 , 257181 }, + { 198287540 , 257648 }, + { 199731215 , 258115 }, + { 201182461 , 258582 }, + { 202641302 , 259050 }, + { 204107760 , 259518 }, + { 205581862 , 259986 }, + { 207063630 , 260454 }, + { 208553088 , 260923 }, + { 210050262 , 261392 }, + { 211555174 , 261861 }, + { 213067849 , 262331 }, + { 214588312 , 262800 }, + { 216116586 , 263270 }, + { 217652696 , 263741 }, + { 219196666 , 264211 }, + { 220748520 , 264682 }, + { 222308282 , 265153 }, + { 223875978 , 265625 }, + { 225451630 , 266097 }, + { 227035265 , 266569 }, + { 228626905 , 267041 }, + { 230226576 , 267514 }, + { 231834302 , 267986 }, + { 233450107 , 268460 }, + { 235074016 , 268933 }, + { 236706054 , 269407 }, + { 238346244 , 269881 }, + { 239994613 , 270355 }, + { 241651183 , 270830 }, + { 243315981 , 271305 } +}; + +/* Calculate the send rate as per section 3.1 of RFC3448 + +Returns send rate in bytes per second + +Integer maths and lookups are used as not allowed floating point in kernel + +The function for Xcalc as per section 3.1 of RFC3448 is: + +X = s + ------------------------------------------------------------- + R*sqrt(2*b*p/3) + (t_RTO * (3*sqrt(3*b*p/8) * p * (1+32*p^2))) + +where +X is the trasmit rate in bytes/second +s is the packet size in bytes +R is the round trip time in seconds +p is the loss event rate, between 0 and 1.0, of the number of loss events + as a fraction of the number of packets transmitted +t_RTO is the TCP retransmission timeout value in seconds +b is the number of packets acknowledged by a single TCP acknowledgement + +we can assume that b = 1 and t_RTO is 4 * R. With this the equation becomes: + +X = s + ----------------------------------------------------------------------- + R * sqrt(2 * p / 3) + (12 * R * (sqrt(3 * p / 8) * p * (1 + 32 * p^2))) + + +which we can break down into: + +X = s + -------- + R * f(p) + +where f(p) = sqrt(2 * p / 3) + (12 * sqrt(3 * p / 8) * p * (1 + 32 * p * p)) + +Function parameters: +s - bytes +R - RTT in usecs +p - loss rate (decimal fraction multiplied by 1,000,000) + +Returns Xcalc in bytes per second + +DON'T alter this code unless you run test cases against it as the code +has been manipulated to stop underflow/overlow. + +*/ +static u32 ccid3_calc_x(u16 s, u32 R, u32 p) +{ + int index; + u32 f; + u64 tmp1, tmp2; + + if (p < CALCX_SPLIT) + index = (p / (CALCX_SPLIT / CALCX_ARRSIZE)) - 1; + else + index = (p / (1000000 / CALCX_ARRSIZE)) - 1; + + if (index < 0) + /* p should be 0 unless there is a bug in my code */ + index = 0; + + if (R == 0) + R = 1; /* RTT can't be zero or else divide by zero */ + + BUG_ON(index >= CALCX_ARRSIZE); + + if (p >= CALCX_SPLIT) + f = calcx_lookup[index][0]; + else + f = calcx_lookup[index][1]; + + tmp1 = ((u64)s * 100000000); + tmp2 = ((u64)R * (u64)f); + do_div(tmp2,10000); + do_div(tmp1,tmp2); + /* don't alter above math unless you test due to overflow on 32 bit */ + + return (u32)tmp1; +} + +/* Calculate new t_ipi (inter packet interval) by t_ipi = s / X_inst */ +static inline void ccid3_calc_new_t_ipi(struct ccid3_hc_tx_sock *hctx) +{ + if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) + return; + /* if no feedback spec says t_ipi is 1 second (set elsewhere and then + * doubles after every no feedback timer (separate function) */ + + if (hctx->ccid3hctx_x < 10) { + ccid3_pr_debug("ccid3_calc_new_t_ipi - ccid3hctx_x < 10\n"); + hctx->ccid3hctx_x = 10; + } + hctx->ccid3hctx_t_ipi = (hctx->ccid3hctx_s * 100000) + / (hctx->ccid3hctx_x / 10); + /* reason for above maths with 10 in there is to avoid 32 bit + * overflow for jumbo packets */ + +} + +/* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */ +static inline void ccid3_calc_new_delta(struct ccid3_hc_tx_sock *hctx) +{ + hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2, TFRC_OPSYS_HALF_TIME_GRAN); + +} + +/* + * Update X by + * If (p > 0) + * x_calc = calcX(s, R, p); + * X = max(min(X_calc, 2 * X_recv), s / t_mbi); + * Else + * If (now - tld >= R) + * X = max(min(2 * X, 2 * X_recv), s / R); + * tld = now; + */ +static void ccid3_hc_tx_update_x(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + + if (hctx->ccid3hctx_p >= TFRC_SMALLEST_P) { /* to avoid large error in calcX */ + hctx->ccid3hctx_x_calc = ccid3_calc_x(hctx->ccid3hctx_s, + hctx->ccid3hctx_rtt, + hctx->ccid3hctx_p); + hctx->ccid3hctx_x = max_t(u32, min_t(u32, hctx->ccid3hctx_x_calc, 2 * hctx->ccid3hctx_x_recv), + hctx->ccid3hctx_s / TFRC_MAX_BACK_OFF_TIME); + } else if (now_delta(hctx->ccid3hctx_t_ld) >= hctx->ccid3hctx_rtt) { + u32 rtt = hctx->ccid3hctx_rtt; + if (rtt < 10) { + rtt = 10; + } /* avoid divide by zero below */ + + hctx->ccid3hctx_x = max_t(u32, min_t(u32, 2 * hctx->ccid3hctx_x_recv, 2 * hctx->ccid3hctx_x), + (hctx->ccid3hctx_s * 100000) / (rtt / 10)); + /* Using 100000 and 10 to avoid 32 bit overflow for jumbo frames */ + do_gettimeofday(&hctx->ccid3hctx_t_ld); + } + + if (hctx->ccid3hctx_x == 0) { + ccid3_pr_debug("ccid3hctx_x = 0!\n"); + hctx->ccid3hctx_x = 1; + } +} + +static void ccid3_hc_tx_no_feedback_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + struct dccp_sock *dp = dccp_sk(sk); + unsigned long next_tmout = 0; + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + u32 rtt; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later. */ + /* XXX: set some sensible MIB */ + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, jiffies + HZ / 5); + goto out; + } + + ccid3_pr_debug("%s, sk=%p, state=%s\n", dccp_role(sk), sk, + ccid3_tx_state_name(hctx->ccid3hctx_state)); + + if (hctx->ccid3hctx_x < 10) { + ccid3_pr_debug("TFRC_SSTATE_NO_FBACK ccid3hctx_x < 10\n"); + hctx->ccid3hctx_x = 10; + } + + switch (hctx->ccid3hctx_state) { + case TFRC_SSTATE_TERM: + goto out; + case TFRC_SSTATE_NO_FBACK: + /* Halve send rate */ + hctx->ccid3hctx_x /= 2; + if (hctx->ccid3hctx_x < (hctx->ccid3hctx_s / TFRC_MAX_BACK_OFF_TIME)) + hctx->ccid3hctx_x = hctx->ccid3hctx_s / TFRC_MAX_BACK_OFF_TIME; + + ccid3_pr_debug("%s, sk=%p, state=%s, updated tx rate to %d bytes/s\n", + dccp_role(sk), sk, ccid3_tx_state_name(hctx->ccid3hctx_state), + hctx->ccid3hctx_x); + next_tmout = max_t(u32, 2 * (hctx->ccid3hctx_s * 100000) + / (hctx->ccid3hctx_x / 10), TFRC_INITIAL_TIMEOUT); + /* do above maths with 100000 and 10 to prevent overflow on 32 bit */ + /* FIXME - not sure above calculation is correct. See section 5 of CCID3 11 + * should adjust tx_t_ipi and double that to achieve it really */ + break; + case TFRC_SSTATE_FBACK: + /* Check if IDLE since last timeout and recv rate is less than 4 packets per RTT */ + rtt = hctx->ccid3hctx_rtt; + if (rtt < 10) + rtt = 10; + /* stop divide by zero below */ + if (!hctx->ccid3hctx_idle || (hctx->ccid3hctx_x_recv >= + 4 * (hctx->ccid3hctx_s * 100000) / (rtt / 10))) { + ccid3_pr_debug("%s, sk=%p, state=%s, not idle\n", dccp_role(sk), sk, + ccid3_tx_state_name(hctx->ccid3hctx_state)); + /* Halve sending rate */ + + /* If (X_calc > 2 * X_recv) + * X_recv = max(X_recv / 2, s / (2 * t_mbi)); + * Else + * X_recv = X_calc / 4; + */ + BUG_ON(hctx->ccid3hctx_p >= TFRC_SMALLEST_P && hctx->ccid3hctx_x_calc == 0); + + /* check also if p is zero -> x_calc is infinity? */ + if (hctx->ccid3hctx_p < TFRC_SMALLEST_P || + hctx->ccid3hctx_x_calc > 2 * hctx->ccid3hctx_x_recv) + hctx->ccid3hctx_x_recv = max_t(u32, hctx->ccid3hctx_x_recv / 2, + hctx->ccid3hctx_s / (2 * TFRC_MAX_BACK_OFF_TIME)); + else + hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc / 4; + + /* Update sending rate */ + ccid3_hc_tx_update_x(sk); + } + if (hctx->ccid3hctx_x == 0) { + ccid3_pr_debug("TFRC_SSTATE_FBACK ccid3hctx_x = 0!\n"); + hctx->ccid3hctx_x = 10; + } + /* Schedule no feedback timer to expire in max(4 * R, 2 * s / X) */ + next_tmout = max_t(u32, inet_csk(sk)->icsk_rto, + 2 * (hctx->ccid3hctx_s * 100000) / (hctx->ccid3hctx_x / 10)); + break; + default: + printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", + __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state); + dump_stack(); + goto out; + } + + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, + jiffies + max_t(u32, 1, usecs_to_jiffies(next_tmout))); + hctx->ccid3hctx_idle = 1; +out: + bh_unlock_sock(sk); + sock_put(sk); +} + +static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb, + int len, long *delay) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + struct ccid3_tx_hist_entry *new_packet = NULL; + struct timeval now; + int rc = -ENOTCONN; + +// ccid3_pr_debug("%s, sk=%p, skb=%p, len=%d\n", dccp_role(sk), sk, skb, len); + /* + * check if pure ACK or Terminating */ + /* XXX: We only call this function for DATA and DATAACK, on, these packets can have + * zero length, but why the comment about "pure ACK"? + */ + if (hctx == NULL || len == 0 || hctx->ccid3hctx_state == TFRC_SSTATE_TERM) + goto out; + + /* See if last packet allocated was not sent */ + if (!list_empty(&hctx->ccid3hctx_hist)) + new_packet = list_entry(hctx->ccid3hctx_hist.next, + struct ccid3_tx_hist_entry, ccid3htx_node); + + if (new_packet == NULL || new_packet->ccid3htx_sent) { + new_packet = ccid3_tx_hist_entry_new(SLAB_ATOMIC); + + rc = -ENOBUFS; + if (new_packet == NULL) { + ccid3_pr_debug("%s, sk=%p, not enough mem to add " + "to history, send refused\n", dccp_role(sk), sk); + goto out; + } + + list_add(&new_packet->ccid3htx_node, &hctx->ccid3hctx_hist); + } + + do_gettimeofday(&now); + + switch (hctx->ccid3hctx_state) { + case TFRC_SSTATE_NO_SENT: + ccid3_pr_debug("%s, sk=%p, first packet(%llu)\n", dccp_role(sk), sk, + dp->dccps_gss); + + hctx->ccid3hctx_no_feedback_timer.function = ccid3_hc_tx_no_feedback_timer; + hctx->ccid3hctx_no_feedback_timer.data = (unsigned long)sk; + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, jiffies + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)); + hctx->ccid3hctx_last_win_count = 0; + hctx->ccid3hctx_t_last_win_count = now; + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); + hctx->ccid3hctx_t_ipi = TFRC_INITIAL_TIMEOUT; + + /* Set nominal send time for initial packet */ + hctx->ccid3hctx_t_nom = now; + (hctx->ccid3hctx_t_nom).tv_usec += hctx->ccid3hctx_t_ipi; + timeval_fix(&(hctx->ccid3hctx_t_nom)); + ccid3_calc_new_delta(hctx); + rc = 0; + break; + case TFRC_SSTATE_NO_FBACK: + case TFRC_SSTATE_FBACK: + *delay = (now_delta(hctx->ccid3hctx_t_nom) - hctx->ccid3hctx_delta); + ccid3_pr_debug("send_packet delay=%ld\n",*delay); + *delay /= -1000; + /* divide by -1000 is to convert to ms and get sign right */ + rc = *delay > 0 ? -EAGAIN : 0; + break; + default: + printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", + __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state); + dump_stack(); + rc = -EINVAL; + break; + } + + /* Can we send? if so add options and add to packet history */ + if (rc == 0) + new_packet->ccid3htx_win_count = DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; +out: + return rc; +} + +static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + struct ccid3_tx_hist_entry *packet = NULL; + struct timeval now; + +// ccid3_pr_debug("%s, sk=%p, more=%d, len=%d\n", dccp_role(sk), sk, more, len); + BUG_ON(hctx == NULL); + + if (hctx->ccid3hctx_state == TFRC_SSTATE_TERM) { + ccid3_pr_debug("%s, sk=%p, while state is TFRC_SSTATE_TERM!\n", + dccp_role(sk), sk); + return; + } + + do_gettimeofday(&now); + + /* check if we have sent a data packet */ + if (len > 0) { + unsigned long quarter_rtt; + + if (list_empty(&hctx->ccid3hctx_hist)) { + printk(KERN_CRIT "%s: packet doesn't exists in history!\n", __FUNCTION__); + return; + } + packet = list_entry(hctx->ccid3hctx_hist.next, struct ccid3_tx_hist_entry, ccid3htx_node); + if (packet->ccid3htx_sent) { + printk(KERN_CRIT "%s: no unsent packet in history!\n", __FUNCTION__); + return; + } + packet->ccid3htx_tstamp = now; + packet->ccid3htx_seqno = dp->dccps_gss; + // ccid3_pr_debug("%s, sk=%p, seqno=%llu inserted!\n", dccp_role(sk), sk, packet->ccid3htx_seqno); + + /* + * Check if win_count have changed */ + /* COMPLIANCE_BEGIN + * Algorithm in "8.1. Window Counter Valuer" in draft-ietf-dccp-ccid3-11.txt + */ + quarter_rtt = now_delta(hctx->ccid3hctx_t_last_win_count) / (hctx->ccid3hctx_rtt / 4); + if (quarter_rtt > 0) { + hctx->ccid3hctx_t_last_win_count = now; + hctx->ccid3hctx_last_win_count = (hctx->ccid3hctx_last_win_count + + min_t(unsigned long, quarter_rtt, 5)) % 16; + ccid3_pr_debug("%s, sk=%p, window changed from %u to %u!\n", + dccp_role(sk), sk, + packet->ccid3htx_win_count, + hctx->ccid3hctx_last_win_count); + } + /* COMPLIANCE_END */ +#if 0 + ccid3_pr_debug("%s, sk=%p, packet sent (%llu,%u)\n", + dccp_role(sk), sk, + packet->ccid3htx_seqno, + packet->ccid3htx_win_count); +#endif + hctx->ccid3hctx_idle = 0; + packet->ccid3htx_sent = 1; + } else + ccid3_pr_debug("%s, sk=%p, seqno=%llu NOT inserted!\n", + dccp_role(sk), sk, dp->dccps_gss); + + switch (hctx->ccid3hctx_state) { + case TFRC_SSTATE_NO_SENT: + /* if first wasn't pure ack */ + if (len != 0) + printk(KERN_CRIT "%s: %s, First packet sent is noted as a data packet\n", + __FUNCTION__, dccp_role(sk)); + return; + case TFRC_SSTATE_NO_FBACK: + case TFRC_SSTATE_FBACK: + if (len > 0) { + hctx->ccid3hctx_t_nom = now; + ccid3_calc_new_t_ipi(hctx); + ccid3_calc_new_delta(hctx); + (hctx->ccid3hctx_t_nom).tv_usec += hctx->ccid3hctx_t_ipi; + timeval_fix(&(hctx->ccid3hctx_t_nom)); + } + break; + default: + printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", + __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state); + dump_stack(); + break; + } +} + +static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + struct ccid3_options_received *opt_recv; + struct ccid3_tx_hist_entry *entry, *next, *packet; + unsigned long next_tmout; + u16 t_elapsed; + u32 pinv; + u32 x_recv; + u32 r_sample; +#if 0 + ccid3_pr_debug("%s, sk=%p(%s), skb=%p(%s)\n", + dccp_role(sk), sk, dccp_state_name(sk->sk_state), + skb, dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type)); +#endif + if (hctx == NULL) + return; + + if (hctx->ccid3hctx_state == TFRC_SSTATE_TERM) { + ccid3_pr_debug("%s, sk=%p, received a packet when terminating!\n", dccp_role(sk), sk); + return; + } + + /* we are only interested in ACKs */ + if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || + DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) + return; + + opt_recv = &hctx->ccid3hctx_options_received; + + t_elapsed = dp->dccps_options_received.dccpor_elapsed_time; + x_recv = opt_recv->ccid3or_receive_rate; + pinv = opt_recv->ccid3or_loss_event_rate; + + switch (hctx->ccid3hctx_state) { + case TFRC_SSTATE_NO_SENT: + /* FIXME: what to do here? */ + return; + case TFRC_SSTATE_NO_FBACK: + case TFRC_SSTATE_FBACK: + /* Calculate new round trip sample by + * R_sample = (now - t_recvdata) - t_delay */ + /* get t_recvdata from history */ + packet = NULL; + list_for_each_entry_safe(entry, next, &hctx->ccid3hctx_hist, ccid3htx_node) + if (entry->ccid3htx_seqno == DCCP_SKB_CB(skb)->dccpd_ack_seq) { + packet = entry; + break; + } + + if (packet == NULL) { + ccid3_pr_debug("%s, sk=%p, seqno %llu(%s) does't exist in history!\n", + dccp_role(sk), sk, DCCP_SKB_CB(skb)->dccpd_ack_seq, + dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type)); + return; + } + + /* Update RTT */ + r_sample = now_delta(packet->ccid3htx_tstamp); + /* FIXME: */ + // r_sample -= usecs_to_jiffies(t_elapsed * 10); + + /* Update RTT estimate by + * If (No feedback recv) + * R = R_sample; + * Else + * R = q * R + (1 - q) * R_sample; + * + * q is a constant, RFC 3448 recomments 0.9 + */ + if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) { + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); + hctx->ccid3hctx_rtt = r_sample; + } else + hctx->ccid3hctx_rtt = (hctx->ccid3hctx_rtt * 9) / 10 + r_sample / 10; + + /* + * XXX: this is to avoid a division by zero in ccid3_hc_tx_packet_sent + * implemention of the new window count. + */ + if (hctx->ccid3hctx_rtt < 4) + hctx->ccid3hctx_rtt = 4; + + ccid3_pr_debug("%s, sk=%p, New RTT estimate=%uus, r_sample=%us\n", + dccp_role(sk), sk, + hctx->ccid3hctx_rtt, + r_sample); + + /* Update timeout interval */ + inet_csk(sk)->icsk_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, USEC_IN_SEC); + + /* Update receive rate */ + hctx->ccid3hctx_x_recv = x_recv; /* x_recv in bytes per second */ + + /* Update loss event rate */ + if (pinv == ~0 || pinv == 0) + hctx->ccid3hctx_p = 0; + else { + hctx->ccid3hctx_p = 1000000 / pinv; + + if (hctx->ccid3hctx_p < TFRC_SMALLEST_P) { + hctx->ccid3hctx_p = TFRC_SMALLEST_P; + ccid3_pr_debug("%s, sk=%p, Smallest p used!\n", dccp_role(sk), sk); + } + } + + /* unschedule no feedback timer */ + sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); + + /* Update sending rate */ + ccid3_hc_tx_update_x(sk); + + /* Update next send time */ + if (hctx->ccid3hctx_t_ipi > (hctx->ccid3hctx_t_nom).tv_usec) { + (hctx->ccid3hctx_t_nom).tv_usec += USEC_IN_SEC; + (hctx->ccid3hctx_t_nom).tv_sec--; + } + /* FIXME - if no feedback then t_ipi can go > 1 second */ + (hctx->ccid3hctx_t_nom).tv_usec -= hctx->ccid3hctx_t_ipi; + ccid3_calc_new_t_ipi(hctx); + (hctx->ccid3hctx_t_nom).tv_usec += hctx->ccid3hctx_t_ipi; + timeval_fix(&(hctx->ccid3hctx_t_nom)); + ccid3_calc_new_delta(hctx); + + /* remove all packets older than the one acked from history */ + list_for_each_entry_safe_continue(entry, next, &hctx->ccid3hctx_hist, ccid3htx_node) { + list_del_init(&entry->ccid3htx_node); + ccid3_tx_hist_entry_delete(entry); + } + if (hctx->ccid3hctx_x < 10) { + ccid3_pr_debug("ccid3_hc_tx_packet_recv hctx->ccid3hctx_x < 10\n"); + hctx->ccid3hctx_x = 10; + } + /* to prevent divide by zero below */ + + /* Schedule no feedback timer to expire in max(4 * R, 2 * s / X) */ + next_tmout = max(inet_csk(sk)->icsk_rto, + 2 * (hctx->ccid3hctx_s * 100000) / (hctx->ccid3hctx_x/10)); + /* maths with 100000 and 10 is to prevent overflow with 32 bit */ + + ccid3_pr_debug("%s, sk=%p, Scheduled no feedback timer to expire in %lu jiffies (%luus)\n", + dccp_role(sk), sk, usecs_to_jiffies(next_tmout), next_tmout); + + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, + jiffies + max_t(u32,1,usecs_to_jiffies(next_tmout))); + + /* set idle flag */ + hctx->ccid3hctx_idle = 1; + break; + default: + printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", + __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state); + dump_stack(); + break; + } +} + +static void ccid3_hc_tx_insert_options(struct sock *sk, struct sk_buff *skb) +{ + const struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + + if (hctx == NULL || !(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) + return; + + DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; +} + +static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, + unsigned char len, u16 idx, unsigned char *value) +{ + int rc = 0; + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + struct ccid3_options_received *opt_recv; + + if (hctx == NULL) + return 0; + + opt_recv = &hctx->ccid3hctx_options_received; + + if (opt_recv->ccid3or_seqno != dp->dccps_gsr) { + opt_recv->ccid3or_seqno = dp->dccps_gsr; + opt_recv->ccid3or_loss_event_rate = ~0; + opt_recv->ccid3or_loss_intervals_idx = 0; + opt_recv->ccid3or_loss_intervals_len = 0; + opt_recv->ccid3or_receive_rate = 0; + } + + switch (option) { + case TFRC_OPT_LOSS_EVENT_RATE: + if (len != 4) { + ccid3_pr_debug("%s, sk=%p, invalid len for TFRC_OPT_LOSS_EVENT_RATE\n", + dccp_role(sk), sk); + rc = -EINVAL; + } else { + opt_recv->ccid3or_loss_event_rate = ntohl(*(u32 *)value); + ccid3_pr_debug("%s, sk=%p, LOSS_EVENT_RATE=%u\n", + dccp_role(sk), sk, + opt_recv->ccid3or_loss_event_rate); + } + break; + case TFRC_OPT_LOSS_INTERVALS: + opt_recv->ccid3or_loss_intervals_idx = idx; + opt_recv->ccid3or_loss_intervals_len = len; + ccid3_pr_debug("%s, sk=%p, LOSS_INTERVALS=(%u, %u)\n", + dccp_role(sk), sk, + opt_recv->ccid3or_loss_intervals_idx, + opt_recv->ccid3or_loss_intervals_len); + break; + case TFRC_OPT_RECEIVE_RATE: + if (len != 4) { + ccid3_pr_debug("%s, sk=%p, invalid len for TFRC_OPT_RECEIVE_RATE\n", + dccp_role(sk), sk); + rc = -EINVAL; + } else { + opt_recv->ccid3or_receive_rate = ntohl(*(u32 *)value); + ccid3_pr_debug("%s, sk=%p, RECEIVE_RATE=%u\n", + dccp_role(sk), sk, + opt_recv->ccid3or_receive_rate); + } + break; + } + + return rc; +} + +static int ccid3_hc_tx_init(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx; + + ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); + + hctx = dp->dccps_hc_tx_ccid_private = kmalloc(sizeof(*hctx), gfp_any()); + if (hctx == NULL) + return -ENOMEM; + + memset(hctx, 0, sizeof(*hctx)); + + if (dp->dccps_avg_packet_size >= TFRC_MIN_PACKET_SIZE && + dp->dccps_avg_packet_size <= TFRC_MAX_PACKET_SIZE) + hctx->ccid3hctx_s = (u16)dp->dccps_avg_packet_size; + else + hctx->ccid3hctx_s = TFRC_STD_PACKET_SIZE; + + hctx->ccid3hctx_x = hctx->ccid3hctx_s; /* set transmission rate to 1 packet per second */ + hctx->ccid3hctx_rtt = 4; /* See ccid3_hc_tx_packet_sent win_count calculatation */ + inet_csk(sk)->icsk_rto = USEC_IN_SEC; + hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT; + INIT_LIST_HEAD(&hctx->ccid3hctx_hist); + init_timer(&hctx->ccid3hctx_no_feedback_timer); + + return 0; +} + +static void ccid3_hc_tx_exit(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + struct ccid3_tx_hist_entry *entry, *next; + + ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); + BUG_ON(hctx == NULL); + + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM); + sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); + + /* Empty packet history */ + list_for_each_entry_safe(entry, next, &hctx->ccid3hctx_hist, ccid3htx_node) { + list_del_init(&entry->ccid3htx_node); + ccid3_tx_hist_entry_delete(entry); + } + + kfree(dp->dccps_hc_tx_ccid_private); + dp->dccps_hc_tx_ccid_private = NULL; +} + +/* + * RX Half Connection methods + */ + +/* TFRC receiver states */ +enum ccid3_hc_rx_states { + TFRC_RSTATE_NO_DATA = 1, + TFRC_RSTATE_DATA, + TFRC_RSTATE_TERM = 127, +}; + +#ifdef CCID3_DEBUG +static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) +{ + static char *ccid3_rx_state_names[] = { + [TFRC_RSTATE_NO_DATA] = "NO_DATA", + [TFRC_RSTATE_DATA] = "DATA", + [TFRC_RSTATE_TERM] = "TERM", + }; + + return ccid3_rx_state_names[state]; +} +#endif + +static inline void ccid3_hc_rx_set_state(struct sock *sk, enum ccid3_hc_rx_states state) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state; + + ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", + dccp_role(sk), sk, ccid3_rx_state_name(oldstate), ccid3_rx_state_name(state)); + WARN_ON(state == oldstate); + hcrx->ccid3hcrx_state = state; +} + +static int ccid3_hc_rx_add_hist(struct sock *sk, struct ccid3_rx_hist_entry *packet) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + struct ccid3_rx_hist_entry *entry, *next; + u8 num_later = 0; + + if (list_empty(&hcrx->ccid3hcrx_hist)) + list_add(&packet->ccid3hrx_node, &hcrx->ccid3hcrx_hist); + else { + u64 seqno = packet->ccid3hrx_seqno; + struct ccid3_rx_hist_entry *iter = list_entry(hcrx->ccid3hcrx_hist.next, + struct ccid3_rx_hist_entry, + ccid3hrx_node); + if (after48(seqno, iter->ccid3hrx_seqno)) + list_add(&packet->ccid3hrx_node, &hcrx->ccid3hcrx_hist); + else { + if (iter->ccid3hrx_type == DCCP_PKT_DATA || + iter->ccid3hrx_type == DCCP_PKT_DATAACK) + num_later = 1; + + list_for_each_entry_continue(iter, &hcrx->ccid3hcrx_hist, ccid3hrx_node) { + if (after48(seqno, iter->ccid3hrx_seqno)) { + list_add(&packet->ccid3hrx_node, &iter->ccid3hrx_node); + goto trim_history; + } + + if (iter->ccid3hrx_type == DCCP_PKT_DATA || + iter->ccid3hrx_type == DCCP_PKT_DATAACK) + num_later++; + + if (num_later == TFRC_RECV_NUM_LATE_LOSS) { + ccid3_rx_hist_entry_delete(packet); + ccid3_pr_debug("%s, sk=%p, packet(%llu) already lost!\n", + dccp_role(sk), sk, seqno); + return 1; + } + } + + if (num_later < TFRC_RECV_NUM_LATE_LOSS) + list_add_tail(&packet->ccid3hrx_node, &hcrx->ccid3hcrx_hist); + /* FIXME: else what? should we destroy the packet like above? */ + } + } + +trim_history: + /* Trim history (remove all packets after the NUM_LATE_LOSS + 1 data packets) */ + num_later = TFRC_RECV_NUM_LATE_LOSS + 1; + + if (!list_empty(&hcrx->ccid3hcrx_loss_interval_hist)) { + list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist, ccid3hrx_node) { + if (num_later == 0) { + list_del_init(&entry->ccid3hrx_node); + ccid3_rx_hist_entry_delete(entry); + } else if (entry->ccid3hrx_type == DCCP_PKT_DATA || + entry->ccid3hrx_type == DCCP_PKT_DATAACK) + --num_later; + } + } else { + int step = 0; + u8 win_count = 0; /* Not needed, but lets shut up gcc */ + int tmp; + /* + * We have no loss interval history so we need at least one + * rtt:s of data packets to approximate rtt. + */ + list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist, ccid3hrx_node) { + if (num_later == 0) { + switch (step) { + case 0: + step = 1; + /* OK, find next data packet */ + num_later = 1; + break; + case 1: + step = 2; + /* OK, find next data packet */ + num_later = 1; + win_count = entry->ccid3hrx_win_count; + break; + case 2: + tmp = win_count - entry->ccid3hrx_win_count; + if (tmp < 0) + tmp += TFRC_WIN_COUNT_LIMIT; + if (tmp > TFRC_WIN_COUNT_PER_RTT + 1) { + /* we have found a packet older than one rtt + * remove the rest */ + step = 3; + } else /* OK, find next data packet */ + num_later = 1; + break; + case 3: + list_del_init(&entry->ccid3hrx_node); + ccid3_rx_hist_entry_delete(entry); + break; + } + } else if (entry->ccid3hrx_type == DCCP_PKT_DATA || + entry->ccid3hrx_type == DCCP_PKT_DATAACK) + --num_later; + } + } + + return 0; +} + +static void ccid3_hc_rx_send_feedback(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + struct ccid3_rx_hist_entry *entry, *packet; + + ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); + + switch (hcrx->ccid3hcrx_state) { + case TFRC_RSTATE_NO_DATA: + hcrx->ccid3hcrx_x_recv = 0; + break; + case TFRC_RSTATE_DATA: { + u32 delta = now_delta(hcrx->ccid3hcrx_tstamp_last_feedback); + + if (delta == 0) + delta = 1; /* to prevent divide by zero */ + hcrx->ccid3hcrx_x_recv = (hcrx->ccid3hcrx_bytes_recv * USEC_IN_SEC) / delta; + } + break; + default: + printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", + __FUNCTION__, dccp_role(sk), sk, hcrx->ccid3hcrx_state); + dump_stack(); + return; + } + + packet = NULL; + list_for_each_entry(entry, &hcrx->ccid3hcrx_hist, ccid3hrx_node) + if (entry->ccid3hrx_type == DCCP_PKT_DATA || + entry->ccid3hrx_type == DCCP_PKT_DATAACK) { + packet = entry; + break; + } + + if (packet == NULL) { + printk(KERN_CRIT "%s: %s, sk=%p, no data packet in history!\n", + __FUNCTION__, dccp_role(sk), sk); + dump_stack(); + return; + } + + do_gettimeofday(&(hcrx->ccid3hcrx_tstamp_last_feedback)); + hcrx->ccid3hcrx_last_counter = packet->ccid3hrx_win_count; + hcrx->ccid3hcrx_seqno_last_counter = packet->ccid3hrx_seqno; + hcrx->ccid3hcrx_bytes_recv = 0; + + /* Convert to multiples of 10us */ + hcrx->ccid3hcrx_elapsed_time = now_delta(packet->ccid3hrx_tstamp) / 10; + if (hcrx->ccid3hcrx_p == 0) + hcrx->ccid3hcrx_pinv = ~0; + else + hcrx->ccid3hcrx_pinv = 1000000 / hcrx->ccid3hcrx_p; + dccp_send_ack(sk); +} + +static void ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) +{ + const struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + + if (hcrx == NULL || !(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) + return; + + if (hcrx->ccid3hcrx_elapsed_time != 0 && !dccp_packet_without_ack(skb)) + dccp_insert_option_elapsed_time(sk, skb, hcrx->ccid3hcrx_elapsed_time); + + if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) { + const u32 x_recv = htonl(hcrx->ccid3hcrx_x_recv); + const u32 pinv = htonl(hcrx->ccid3hcrx_pinv); + + dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE, &pinv, sizeof(pinv)); + dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE, &x_recv, sizeof(x_recv)); + } + + DCCP_SKB_CB(skb)->dccpd_ccval = hcrx->ccid3hcrx_last_counter; +} + +/* Weights used to calculate loss event rate */ +/* + * These are integers as per section 8 of RFC3448. We can then divide by 4 * + * when we use it. + */ +const int ccid3_hc_rx_w[TFRC_RECV_IVAL_F_LENGTH] = { 4, 4, 4, 4, 3, 2, 1, 1, }; + +/* + * args: fvalue - function value to match + * returns: p closest to that value + * + * both fvalue and p are multiplied by 1,000,000 to use ints + */ +u32 calcx_reverse_lookup(u32 fvalue) { + int ctr = 0; + int small; + + if (fvalue < calcx_lookup[0][1]) + return 0; + if (fvalue <= calcx_lookup[CALCX_ARRSIZE-1][1]) + small = 1; + else if (fvalue > calcx_lookup[CALCX_ARRSIZE-1][0]) + return 1000000; + else + small = 0; + while (fvalue > calcx_lookup[ctr][small]) + ctr++; + if (small) + return (CALCX_SPLIT * ctr / CALCX_ARRSIZE); + else + return (1000000 * ctr / CALCX_ARRSIZE) ; +} + +/* calculate first loss interval + * + * returns estimated loss interval in usecs */ + +static u32 ccid3_hc_rx_calc_first_li(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + struct ccid3_rx_hist_entry *entry, *next, *tail = NULL; + u32 rtt, delta, x_recv, fval, p, tmp2; + struct timeval tstamp, tmp_tv; + int interval = 0; + int win_count = 0; + int step = 0; + u64 tmp1; + + list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist, ccid3hrx_node) { + if (entry->ccid3hrx_type == DCCP_PKT_DATA || + entry->ccid3hrx_type == DCCP_PKT_DATAACK) { + tail = entry; + + switch (step) { + case 0: + tstamp = entry->ccid3hrx_tstamp; + win_count = entry->ccid3hrx_win_count; + step = 1; + break; + case 1: + interval = win_count - entry->ccid3hrx_win_count; + if (interval < 0) + interval += TFRC_WIN_COUNT_LIMIT; + if (interval > 4) + goto found; + break; + } + } + } + + if (step == 0) { + printk(KERN_CRIT "%s: %s, sk=%p, packet history contains no data packets!\n", + __FUNCTION__, dccp_role(sk), sk); + return ~0; + } + + if (interval == 0) { + ccid3_pr_debug("%s, sk=%p, Could not find a win_count interval > 0. Defaulting to 1\n", + dccp_role(sk), sk); + interval = 1; + } +found: + timeval_sub(tstamp,tail->ccid3hrx_tstamp,&tmp_tv); + rtt = (tmp_tv.tv_sec * USEC_IN_SEC + tmp_tv.tv_usec) * 4 / interval; + ccid3_pr_debug("%s, sk=%p, approximated RTT to %uus\n", + dccp_role(sk), sk, rtt); + if (rtt == 0) + rtt = 1; + + delta = now_delta(hcrx->ccid3hcrx_tstamp_last_feedback); + if (delta == 0) + delta = 1; + + x_recv = (hcrx->ccid3hcrx_bytes_recv * USEC_IN_SEC) / delta; + + tmp1 = (u64)x_recv * (u64)rtt; + do_div(tmp1,10000000); + tmp2 = (u32)tmp1; + fval = (hcrx->ccid3hcrx_s * 100000) / tmp2; + /* do not alter order above or you will get overflow on 32 bit */ + p = calcx_reverse_lookup(fval); + ccid3_pr_debug("%s, sk=%p, receive rate=%u bytes/s, implied loss rate=%u\n",\ + dccp_role(sk), sk, x_recv, p); + + if (p == 0) + return ~0; + else + return 1000000 / p; +} + +static void ccid3_hc_rx_update_li(struct sock *sk, u64 seq_loss, u8 win_loss) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + struct ccid3_loss_interval_hist_entry *li_entry; + + if (seq_loss != DCCP_MAX_SEQNO + 1) { + ccid3_pr_debug("%s, sk=%p, seq_loss=%llu, win_loss=%u, packet loss detected\n", + dccp_role(sk), sk, seq_loss, win_loss); + + if (list_empty(&hcrx->ccid3hcrx_loss_interval_hist)) { + struct ccid3_loss_interval_hist_entry *li_tail = NULL; + int i; + + ccid3_pr_debug("%s, sk=%p, first loss event detected, creating history\n", dccp_role(sk), sk); + for (i = 0; i <= TFRC_RECV_IVAL_F_LENGTH; ++i) { + li_entry = ccid3_loss_interval_hist_entry_new(SLAB_ATOMIC); + if (li_entry == NULL) { + ccid3_loss_interval_history_delete(&hcrx->ccid3hcrx_loss_interval_hist); + ccid3_pr_debug("%s, sk=%p, not enough mem for creating history\n", + dccp_role(sk), sk); + return; + } + if (li_tail == NULL) + li_tail = li_entry; + list_add(&li_entry->ccid3lih_node, &hcrx->ccid3hcrx_loss_interval_hist); + } + + li_entry->ccid3lih_seqno = seq_loss; + li_entry->ccid3lih_win_count = win_loss; + + li_tail->ccid3lih_interval = ccid3_hc_rx_calc_first_li(sk); + } + } + /* FIXME: find end of interval */ +} + +static void ccid3_hc_rx_detect_loss(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + struct ccid3_rx_hist_entry *entry, *a_next, *b_next, *packet; + struct ccid3_rx_hist_entry *a_loss = NULL; + struct ccid3_rx_hist_entry *b_loss = NULL; + u64 seq_loss = DCCP_MAX_SEQNO + 1; + u8 win_loss = 0; + u8 num_later = TFRC_RECV_NUM_LATE_LOSS; + + list_for_each_entry_safe(entry, b_next, &hcrx->ccid3hcrx_hist, ccid3hrx_node) { + if (num_later == 0) { + b_loss = entry; + break; + } else if (entry->ccid3hrx_type == DCCP_PKT_DATA || + entry->ccid3hrx_type == DCCP_PKT_DATAACK) + --num_later; + } + + if (b_loss == NULL) + goto out_update_li; + + a_next = b_next; + num_later = 1; + + list_for_each_entry_safe_continue(entry, a_next, &hcrx->ccid3hcrx_hist, ccid3hrx_node) { + if (num_later == 0) { + a_loss = entry; + break; + } else if (entry->ccid3hrx_type == DCCP_PKT_DATA || + entry->ccid3hrx_type == DCCP_PKT_DATAACK) + --num_later; + } + + if (a_loss == NULL) { + if (list_empty(&hcrx->ccid3hcrx_loss_interval_hist)) { + /* no loss event have occured yet */ + ccid3_pr_debug("%s, sk=%p, TODO: find a lost data " + "packet by comparing to initial seqno\n", + dccp_role(sk), sk); + goto out_update_li; + } else { + pr_info("%s: %s, sk=%p, ERROR! Less than 4 data packets in history", + __FUNCTION__, dccp_role(sk), sk); + return; + } + } + + /* Locate a lost data packet */ + entry = packet = b_loss; + list_for_each_entry_safe_continue(entry, b_next, &hcrx->ccid3hcrx_hist, ccid3hrx_node) { + u64 delta = dccp_delta_seqno(entry->ccid3hrx_seqno, packet->ccid3hrx_seqno); + + if (delta != 0) { + if (packet->ccid3hrx_type == DCCP_PKT_DATA || + packet->ccid3hrx_type == DCCP_PKT_DATAACK) + --delta; + /* + * FIXME: check this, probably this % usage is because + * in earlier drafts the ndp count was just 8 bits + * long, but now it cam be up to 24 bits long. + */ +#if 0 + if (delta % DCCP_NDP_LIMIT != + (packet->ccid3hrx_ndp - entry->ccid3hrx_ndp) % DCCP_NDP_LIMIT) +#endif + if (delta != packet->ccid3hrx_ndp - entry->ccid3hrx_ndp) { + seq_loss = entry->ccid3hrx_seqno; + dccp_inc_seqno(&seq_loss); + } + } + packet = entry; + if (packet == a_loss) + break; + } + + if (seq_loss != DCCP_MAX_SEQNO + 1) + win_loss = a_loss->ccid3hrx_win_count; + +out_update_li: + ccid3_hc_rx_update_li(sk, seq_loss, win_loss); +} + +static u32 ccid3_hc_rx_calc_i_mean(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + struct ccid3_loss_interval_hist_entry *li_entry, *li_next; + int i = 0; + u32 i_tot; + u32 i_tot0 = 0; + u32 i_tot1 = 0; + u32 w_tot = 0; + + list_for_each_entry_safe(li_entry, li_next, &hcrx->ccid3hcrx_loss_interval_hist, ccid3lih_node) { + if (i < TFRC_RECV_IVAL_F_LENGTH) { + i_tot0 += li_entry->ccid3lih_interval * ccid3_hc_rx_w[i]; + w_tot += ccid3_hc_rx_w[i]; + } + + if (i != 0) + i_tot1 += li_entry->ccid3lih_interval * ccid3_hc_rx_w[i - 1]; + + if (++i > TFRC_RECV_IVAL_F_LENGTH) + break; + } + + if (i != TFRC_RECV_IVAL_F_LENGTH) { + pr_info("%s: %s, sk=%p, ERROR! Missing entry in interval history!\n", + __FUNCTION__, dccp_role(sk), sk); + return 0; + } + + i_tot = max(i_tot0, i_tot1); + + /* FIXME: Why do we do this? -Ian McDonald */ + if (i_tot * 4 < w_tot) + i_tot = w_tot * 4; + + return i_tot * 4 / w_tot; +} + +static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + struct ccid3_rx_hist_entry *packet; + struct timeval now; + u8 win_count; + u32 p_prev; + int ins; +#if 0 + ccid3_pr_debug("%s, sk=%p(%s), skb=%p(%s)\n", + dccp_role(sk), sk, dccp_state_name(sk->sk_state), + skb, dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type)); +#endif + if (hcrx == NULL) + return; + + BUG_ON(!(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA || + hcrx->ccid3hcrx_state == TFRC_RSTATE_DATA)); + + switch (DCCP_SKB_CB(skb)->dccpd_type) { + case DCCP_PKT_ACK: + if (hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA) + return; + case DCCP_PKT_DATAACK: + if (dp->dccps_options_received.dccpor_timestamp_echo == 0) + break; + p_prev = hcrx->ccid3hcrx_rtt; + do_gettimeofday(&now); + /* hcrx->ccid3hcrx_rtt = now - dp->dccps_options_received.dccpor_timestamp_echo - + usecs_to_jiffies(dp->dccps_options_received.dccpor_elapsed_time * 10); + FIXME - I think above code is broken - have to look at options more, will also need + to fix pr_debug below */ + if (p_prev != hcrx->ccid3hcrx_rtt) + ccid3_pr_debug("%s, sk=%p, New RTT estimate=%lu jiffies, tstamp_echo=%u, elapsed time=%u\n", + dccp_role(sk), sk, hcrx->ccid3hcrx_rtt, + dp->dccps_options_received.dccpor_timestamp_echo, + dp->dccps_options_received.dccpor_elapsed_time); + break; + case DCCP_PKT_DATA: + break; + default: + ccid3_pr_debug("%s, sk=%p, not DATA/DATAACK/ACK packet(%s)\n", + dccp_role(sk), sk, + dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type)); + return; + } + + packet = ccid3_rx_hist_entry_new(sk, skb, SLAB_ATOMIC); + if (packet == NULL) { + ccid3_pr_debug("%s, sk=%p, Not enough mem to add rx packet to history (consider it lost)!", + dccp_role(sk), sk); + return; + } + + win_count = packet->ccid3hrx_win_count; + + ins = ccid3_hc_rx_add_hist(sk, packet); + + if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK) + return; + + switch (hcrx->ccid3hcrx_state) { + case TFRC_RSTATE_NO_DATA: + ccid3_pr_debug("%s, sk=%p(%s), skb=%p, sending initial feedback\n", + dccp_role(sk), sk, dccp_state_name(sk->sk_state), skb); + ccid3_hc_rx_send_feedback(sk); + ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); + return; + case TFRC_RSTATE_DATA: + hcrx->ccid3hcrx_bytes_recv += skb->len - dccp_hdr(skb)->dccph_doff * 4; + if (ins == 0) { + do_gettimeofday(&now); + if ((now_delta(hcrx->ccid3hcrx_tstamp_last_ack)) >= hcrx->ccid3hcrx_rtt) { + hcrx->ccid3hcrx_tstamp_last_ack = now; + ccid3_hc_rx_send_feedback(sk); + } + return; + } + break; + default: + printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", + __FUNCTION__, dccp_role(sk), sk, hcrx->ccid3hcrx_state); + dump_stack(); + return; + } + + /* Dealing with packet loss */ + ccid3_pr_debug("%s, sk=%p(%s), skb=%p, data loss! Reacting...\n", + dccp_role(sk), sk, dccp_state_name(sk->sk_state), skb); + + ccid3_hc_rx_detect_loss(sk); + p_prev = hcrx->ccid3hcrx_p; + + /* Calculate loss event rate */ + if (!list_empty(&hcrx->ccid3hcrx_loss_interval_hist)) + /* Scaling up by 1000000 as fixed decimal */ + hcrx->ccid3hcrx_p = 1000000 / ccid3_hc_rx_calc_i_mean(sk); + + if (hcrx->ccid3hcrx_p > p_prev) { + ccid3_hc_rx_send_feedback(sk); + return; + } +} + +static int ccid3_hc_rx_init(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx; + + ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); + + hcrx = dp->dccps_hc_rx_ccid_private = kmalloc(sizeof(*hcrx), gfp_any()); + if (hcrx == NULL) + return -ENOMEM; + + memset(hcrx, 0, sizeof(*hcrx)); + + if (dp->dccps_avg_packet_size >= TFRC_MIN_PACKET_SIZE && + dp->dccps_avg_packet_size <= TFRC_MAX_PACKET_SIZE) + hcrx->ccid3hcrx_s = (u16)dp->dccps_avg_packet_size; + else + hcrx->ccid3hcrx_s = TFRC_STD_PACKET_SIZE; + + hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA; + INIT_LIST_HEAD(&hcrx->ccid3hcrx_hist); + INIT_LIST_HEAD(&hcrx->ccid3hcrx_loss_interval_hist); + + return 0; +} + +static void ccid3_hc_rx_exit(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + + ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); + + if (hcrx == NULL) + return; + + ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM); + + /* Empty packet history */ + ccid3_rx_history_delete(&hcrx->ccid3hcrx_hist); + + /* Empty loss interval history */ + ccid3_loss_interval_history_delete(&hcrx->ccid3hcrx_loss_interval_hist); + + kfree(dp->dccps_hc_rx_ccid_private); + dp->dccps_hc_rx_ccid_private = NULL; +} + +static struct ccid ccid3 = { + .ccid_id = 3, + .ccid_name = "ccid3", + .ccid_owner = THIS_MODULE, + .ccid_init = ccid3_init, + .ccid_exit = ccid3_exit, + .ccid_hc_tx_init = ccid3_hc_tx_init, + .ccid_hc_tx_exit = ccid3_hc_tx_exit, + .ccid_hc_tx_send_packet = ccid3_hc_tx_send_packet, + .ccid_hc_tx_packet_sent = ccid3_hc_tx_packet_sent, + .ccid_hc_tx_packet_recv = ccid3_hc_tx_packet_recv, + .ccid_hc_tx_insert_options = ccid3_hc_tx_insert_options, + .ccid_hc_tx_parse_options = ccid3_hc_tx_parse_options, + .ccid_hc_rx_init = ccid3_hc_rx_init, + .ccid_hc_rx_exit = ccid3_hc_rx_exit, + .ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options, + .ccid_hc_rx_packet_recv = ccid3_hc_rx_packet_recv, +}; + +module_param(ccid3_debug, int, 0444); +MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); + +static __init int ccid3_module_init(void) +{ + int rc = -ENOMEM; + + ccid3_tx_hist_slab = kmem_cache_create("dccp_ccid3_tx_history", + sizeof(struct ccid3_tx_hist_entry), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (ccid3_tx_hist_slab == NULL) + goto out; + + ccid3_rx_hist_slab = kmem_cache_create("dccp_ccid3_rx_history", + sizeof(struct ccid3_rx_hist_entry), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (ccid3_rx_hist_slab == NULL) + goto out_free_tx_history; + + ccid3_loss_interval_hist_slab = kmem_cache_create("dccp_ccid3_loss_interval_history", + sizeof(struct ccid3_loss_interval_hist_entry), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (ccid3_loss_interval_hist_slab == NULL) + goto out_free_rx_history; + + rc = ccid_register(&ccid3); + if (rc != 0) + goto out_free_loss_interval_history; + +out: + return rc; +out_free_loss_interval_history: + kmem_cache_destroy(ccid3_loss_interval_hist_slab); + ccid3_loss_interval_hist_slab = NULL; +out_free_rx_history: + kmem_cache_destroy(ccid3_rx_hist_slab); + ccid3_rx_hist_slab = NULL; +out_free_tx_history: + kmem_cache_destroy(ccid3_tx_hist_slab); + ccid3_tx_hist_slab = NULL; + goto out; +} +module_init(ccid3_module_init); + +static __exit void ccid3_module_exit(void) +{ + ccid_unregister(&ccid3); + + if (ccid3_tx_hist_slab != NULL) { + kmem_cache_destroy(ccid3_tx_hist_slab); + ccid3_tx_hist_slab = NULL; + } + if (ccid3_rx_hist_slab != NULL) { + kmem_cache_destroy(ccid3_rx_hist_slab); + ccid3_rx_hist_slab = NULL; + } + if (ccid3_loss_interval_hist_slab != NULL) { + kmem_cache_destroy(ccid3_loss_interval_hist_slab); + ccid3_loss_interval_hist_slab = NULL; + } +} +module_exit(ccid3_module_exit); + +MODULE_AUTHOR("Ian McDonald & Arnaldo Carvalho de Melo "); +MODULE_DESCRIPTION("DCCP TFRC CCID3 CCID"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("net-dccp-ccid-3"); diff -puN /dev/null net/dccp/ccids/ccid3.h --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/dccp/ccids/ccid3.h 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,137 @@ +/* + * net/dccp/ccids/ccid3.h + * + * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. + * + * An implementation of the DCCP protocol + * + * This code has been developed by the University of Waikato WAND + * research group. For further information please see http://www.wand.net.nz/ + * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz + * + * This code also uses code from Lulea University, rereleased as GPL by its + * authors: + * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon + * + * Changes to meet Linux coding standards, to make it meet latest ccid3 draft + * and to make it work as a loadable module in the DCCP stack written by + * Arnaldo Carvalho de Melo . + * + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#ifndef _DCCP_CCID3_H_ +#define _DCCP_CCID3_H_ + +#include +#include +#include + +struct ccid3_tx_hist_entry { + struct list_head ccid3htx_node; + u64 ccid3htx_seqno:48, + ccid3htx_win_count:8, + ccid3htx_sent:1; + struct timeval ccid3htx_tstamp; +}; + +struct ccid3_options_received { + u64 ccid3or_seqno:48, + ccid3or_loss_intervals_idx:16; + u16 ccid3or_loss_intervals_len; + u32 ccid3or_loss_event_rate; + u32 ccid3or_receive_rate; +}; + +/** struct ccid3_hc_tx_sock - CCID3 sender half connection congestion control block + * + * @ccid3hctx_state - Sender state + * @ccid3hctx_x - Current sending rate + * @ccid3hctx_x_recv - Receive rate + * @ccid3hctx_x_calc - Calculated send (?) rate + * @ccid3hctx_s - Packet size + * @ccid3hctx_rtt - Estimate of current round trip time in usecs + * @@ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000 + * @ccid3hctx_last_win_count - Last window counter sent + * @ccid3hctx_t_last_win_count - Timestamp of earliest packet with last_win_count value sent + * @ccid3hctx_no_feedback_timer - Handle to no feedback timer + * @ccid3hctx_idle - FIXME + * @ccid3hctx_t_ld - Time last doubled during slow start + * @ccid3hctx_t_nom - Nominal send time of next packet + * @ccid3hctx_t_ipi - Interpacket (send) interval + * @ccid3hctx_delta - Send timer delta + * @ccid3hctx_hist - Packet history + */ +struct ccid3_hc_tx_sock { + u32 ccid3hctx_x; + u32 ccid3hctx_x_recv; + u32 ccid3hctx_x_calc; + u16 ccid3hctx_s; + u32 ccid3hctx_rtt; + u32 ccid3hctx_p; + u8 ccid3hctx_state; + u8 ccid3hctx_last_win_count; + u8 ccid3hctx_idle; + struct timeval ccid3hctx_t_last_win_count; + struct timer_list ccid3hctx_no_feedback_timer; + struct timeval ccid3hctx_t_ld; + struct timeval ccid3hctx_t_nom; + u32 ccid3hctx_t_ipi; + u32 ccid3hctx_delta; + struct list_head ccid3hctx_hist; + struct ccid3_options_received ccid3hctx_options_received; +}; + +struct ccid3_loss_interval_hist_entry { + struct list_head ccid3lih_node; + u64 ccid3lih_seqno:48, + ccid3lih_win_count:4; + u32 ccid3lih_interval; +}; + +struct ccid3_rx_hist_entry { + struct list_head ccid3hrx_node; + u64 ccid3hrx_seqno:48, + ccid3hrx_win_count:4, + ccid3hrx_type:4; + u32 ccid3hrx_ndp; /* In fact it is from 8 to 24 bits */ + struct timeval ccid3hrx_tstamp; +}; + +struct ccid3_hc_rx_sock { + u64 ccid3hcrx_seqno_last_counter:48, + ccid3hcrx_state:8, + ccid3hcrx_last_counter:4; + unsigned long ccid3hcrx_rtt; + u32 ccid3hcrx_p; + u32 ccid3hcrx_bytes_recv; + struct timeval ccid3hcrx_tstamp_last_feedback; + struct timeval ccid3hcrx_tstamp_last_ack; + struct list_head ccid3hcrx_hist; + struct list_head ccid3hcrx_loss_interval_hist; + u16 ccid3hcrx_s; + u32 ccid3hcrx_pinv; + u32 ccid3hcrx_elapsed_time; + u32 ccid3hcrx_x_recv; +}; + +#define ccid3_hc_tx_field(s,field) (s->dccps_hc_tx_ccid_private == NULL ? 0 : \ + ((struct ccid3_hc_tx_sock *)s->dccps_hc_tx_ccid_private)->ccid3hctx_##field) + +#define ccid3_hc_rx_field(s,field) (s->dccps_hc_rx_ccid_private == NULL ? 0 : \ + ((struct ccid3_hc_rx_sock *)s->dccps_hc_rx_ccid_private)->ccid3hcrx_##field) + +#endif /* _DCCP_CCID3_H_ */ diff -puN /dev/null net/dccp/ccids/Kconfig --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/dccp/ccids/Kconfig 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,25 @@ +menu "DCCP CCIDs Configuration (EXPERIMENTAL)" + depends on IP_DCCP && EXPERIMENTAL + +config IP_DCCP_CCID3 + tristate "CCID3 (TFRC) (EXPERIMENTAL)" + depends on IP_DCCP + ---help--- + CCID 3 denotes TCP-Friendly Rate Control (TFRC), an equation-based + rate-controlled congestion control mechanism. TFRC is designed to + be reasonably fair when competing for bandwidth with TCP-like flows, + where a flow is "reasonably fair" if its sending rate is generally + within a factor of two of the sending rate of a TCP flow under the + same conditions. However, TFRC has a much lower variation of + throughput over time compared with TCP, which makes CCID 3 more + suitable than CCID 2 for applications such streaming media where a + relatively smooth sending rate is of importance. + + CCID 3 is further described in [CCID 3 PROFILE]. The TFRC + congestion control algorithms were initially described in RFC 3448. + + This text was extracted from draft-ietf-dccp-spec-11.txt. + + If in doubt, say M. + +endmenu diff -puN /dev/null net/dccp/ccids/Makefile --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/dccp/ccids/Makefile 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,3 @@ +obj-$(CONFIG_IP_DCCP_CCID3) += dccp_ccid3.o + +dccp_ccid3-y := ccid3.o diff -puN /dev/null net/dccp/dccp.h --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/dccp/dccp.h 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,423 @@ +#ifndef _DCCP_H +#define _DCCP_H +/* + * net/dccp/dccp.h + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +#define DCCP_DEBUG + +#ifdef DCCP_DEBUG +extern int dccp_debug; + +#define dccp_pr_debug(format, a...) \ + do { if (dccp_debug) \ + printk(KERN_DEBUG "%s: " format, __FUNCTION__, ##a); \ + } while (0) +#define dccp_pr_debug_cat(format, a...) do { if (dccp_debug) printk(format, ##a); } while (0) +#else +#define dccp_pr_debug(format, a...) +#define dccp_pr_debug_cat(format, a...) +#endif + +extern struct inet_hashinfo dccp_hashinfo; + +extern atomic_t dccp_orphan_count; +extern int dccp_tw_count; +extern void dccp_tw_deschedule(struct inet_timewait_sock *tw); + +extern void dccp_time_wait(struct sock *sk, int state, int timeo); + +/* FIXME: Right size this */ +#define DCCP_MAX_OPT_LEN 128 + +#define DCCP_MAX_PACKET_HDR 32 + +#define MAX_DCCP_HEADER (DCCP_MAX_PACKET_HDR + DCCP_MAX_OPT_LEN + MAX_HEADER) + +#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT + * state, about 60 seconds */ + +/* draft-ietf-dccp-spec-11.txt initial RTO value */ +#define DCCP_TIMEOUT_INIT ((unsigned)(3 * HZ)) + +/* Maximal interval between probes for local resources. */ +#define DCCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ / 2U)) + +#define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */ + +extern struct proto dccp_v4_prot; + +/* is seq1 < seq2 ? */ +static inline const int before48(const u64 seq1, const u64 seq2) +{ + return (const s64)((seq1 << 16) - (seq2 << 16)) < 0; +} + +/* is seq1 > seq2 ? */ +static inline const int after48(const u64 seq1, const u64 seq2) +{ + return (const s64)((seq2 << 16) - (seq1 << 16)) < 0; +} + +/* is seq2 <= seq1 <= seq3 ? */ +static inline const int between48(const u64 seq1, const u64 seq2, const u64 seq3) +{ + return (seq3 << 16) - (seq2 << 16) >= (seq1 << 16) - (seq2 << 16); +} + +static inline u64 max48(const u64 seq1, const u64 seq2) +{ + return after48(seq1, seq2) ? seq1 : seq2; +} + +enum { + DCCP_MIB_NUM = 0, + DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */ + DCCP_MIB_ESTABRESETS, /* EstabResets */ + DCCP_MIB_CURRESTAB, /* CurrEstab */ + DCCP_MIB_OUTSEGS, /* OutSegs */ + DCCP_MIB_OUTRSTS, + DCCP_MIB_ABORTONTIMEOUT, + DCCP_MIB_TIMEOUTS, + DCCP_MIB_ABORTFAILED, + DCCP_MIB_PASSIVEOPENS, + DCCP_MIB_ATTEMPTFAILS, + DCCP_MIB_OUTDATAGRAMS, + DCCP_MIB_INERRS, + DCCP_MIB_OPTMANDATORYERROR, + DCCP_MIB_INVALIDOPT, + __DCCP_MIB_MAX +}; + +#define DCCP_MIB_MAX __DCCP_MIB_MAX +struct dccp_mib { + unsigned long mibs[DCCP_MIB_MAX]; +} __SNMP_MIB_ALIGN__; + +DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics); +#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field) +#define DCCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(dccp_statistics, field) +#define DCCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(dccp_statistics, field) +#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field) +#define DCCP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(dccp_statistics, field, val) +#define DCCP_ADD_STATS_USER(field, val) SNMP_ADD_STATS_USER(dccp_statistics, field, val) + +extern int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb); +extern int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb); + +extern int dccp_send_response(struct sock *sk); +extern void dccp_send_ack(struct sock *sk); +extern void dccp_send_delayed_ack(struct sock *sk); +extern void dccp_send_sync(struct sock *sk, u64 seq); + +extern void dccp_init_xmit_timers(struct sock *sk); +static inline void dccp_clear_xmit_timers(struct sock *sk) +{ + inet_csk_clear_xmit_timers(sk); +} + +extern unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu); + +extern const char *dccp_packet_name(const int type); +extern const char *dccp_state_name(const int state); + +static inline void dccp_set_state(struct sock *sk, const int state) +{ + const int oldstate = sk->sk_state; + + dccp_pr_debug("%s(%p) %-10.10s -> %s\n", + dccp_role(sk), sk, + dccp_state_name(oldstate), dccp_state_name(state)); + WARN_ON(state == oldstate); + + switch (state) { + case DCCP_OPEN: + if (oldstate != DCCP_OPEN) + DCCP_INC_STATS(DCCP_MIB_CURRESTAB); + break; + + case DCCP_CLOSED: + if (oldstate == DCCP_CLOSING || oldstate == DCCP_OPEN) + DCCP_INC_STATS(DCCP_MIB_ESTABRESETS); + + sk->sk_prot->unhash(sk); + if (inet_csk(sk)->icsk_bind_hash != NULL && + !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) + inet_put_port(&dccp_hashinfo, sk); + /* fall through */ + default: + if (oldstate == DCCP_OPEN) + DCCP_DEC_STATS(DCCP_MIB_CURRESTAB); + } + + /* Change state AFTER socket is unhashed to avoid closed + * socket sitting in hash tables. + */ + sk->sk_state = state; +} + +static inline void dccp_done(struct sock *sk) +{ + dccp_set_state(sk, DCCP_CLOSED); + dccp_clear_xmit_timers(sk); + + sk->sk_shutdown = SHUTDOWN_MASK; + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + else + inet_csk_destroy_sock(sk); +} + +static inline void dccp_openreq_init(struct request_sock *req, + struct dccp_sock *dp, + struct sk_buff *skb) +{ + /* + * FIXME: fill in the other req fields from the DCCP options + * received + */ + inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; + inet_rsk(req)->acked = 0; + req->rcv_wnd = 0; +} + +extern void dccp_v4_send_check(struct sock *sk, struct dccp_hdr *dh, int len, + struct sk_buff *skb); +extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); + +extern struct sock *dccp_create_openreq_child(struct sock *sk, + const struct request_sock *req, + const struct sk_buff *skb); + +extern int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); + +extern void dccp_v4_err(struct sk_buff *skb, u32); + +extern int dccp_v4_rcv(struct sk_buff *skb); + +extern struct sock *dccp_v4_request_recv_sock(struct sock *sk, + struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst); +extern struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct request_sock **prev); + +extern int dccp_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb); +extern int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, + struct dccp_hdr *dh, unsigned len); +extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct dccp_hdr *dh, const unsigned len); + +extern void dccp_close(struct sock *sk, long timeout); +extern struct sk_buff *dccp_make_response(struct sock *sk, + struct dst_entry *dst, + struct request_sock *req); + +extern int dccp_connect(struct sock *sk); +extern int dccp_disconnect(struct sock *sk, int flags); +extern int dccp_getsockopt(struct sock *sk, int level, int optname, + char *optval, int *optlen); +extern int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg); +extern int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t size); +extern int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len, int nonblock, + int flags, int *addr_len); +extern int dccp_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen); +extern void dccp_shutdown(struct sock *sk, int how); + +extern int dccp_v4_checksum(const struct sk_buff *skb, + const u32 saddr, const u32 daddr); + +extern int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code); +extern void dccp_send_close(struct sock *sk); + +struct dccp_skb_cb { + __u8 dccpd_type; + __u8 dccpd_reset_code; + __u8 dccpd_service; + __u8 dccpd_ccval; + __u64 dccpd_seq; + __u64 dccpd_ack_seq; + int dccpd_opt_len; +}; + +#define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0])) + +static inline int dccp_non_data_packet(const struct sk_buff *skb) +{ + const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; + + return type == DCCP_PKT_ACK || + type == DCCP_PKT_CLOSE || + type == DCCP_PKT_CLOSEREQ || + type == DCCP_PKT_RESET || + type == DCCP_PKT_SYNC || + type == DCCP_PKT_SYNCACK; +} + +static inline int dccp_packet_without_ack(const struct sk_buff *skb) +{ + const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; + + return type == DCCP_PKT_DATA || type == DCCP_PKT_REQUEST; +} + +#define DCCP_MAX_SEQNO ((((u64)1) << 48) - 1) +#define DCCP_PKT_WITHOUT_ACK_SEQ (DCCP_MAX_SEQNO << 2) + +static inline void dccp_set_seqno(u64 *seqno, u64 value) +{ + if (value > DCCP_MAX_SEQNO) + value -= DCCP_MAX_SEQNO + 1; + *seqno = value; +} + +static inline u64 dccp_delta_seqno(u64 seqno1, u64 seqno2) +{ + return ((seqno2 << 16) - (seqno1 << 16)) >> 16; +} + +static inline void dccp_inc_seqno(u64 *seqno) +{ + if (++*seqno > DCCP_MAX_SEQNO) + *seqno = 0; +} + +static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss) +{ + struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh + sizeof(*dh)); + +#if defined(__LITTLE_ENDIAN_BITFIELD) + dh->dccph_seq = htonl((gss >> 32)) >> 8; +#elif defined(__BIG_ENDIAN_BITFIELD) + dh->dccph_seq = htonl((gss >> 32)); +#else +#error "Adjust your defines" +#endif + dhx->dccph_seq_low = htonl(gss & 0xffffffff); +} + +static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack, const u64 gsr) +{ +#if defined(__LITTLE_ENDIAN_BITFIELD) + dhack->dccph_ack_nr_high = htonl((gsr >> 32)) >> 8; +#elif defined(__BIG_ENDIAN_BITFIELD) + dhack->dccph_ack_nr_high = htonl((gsr >> 32)); +#else +#error "Adjust your defines" +#endif + dhack->dccph_ack_nr_low = htonl(gsr & 0xffffffff); +} + +static inline void dccp_update_gsr(struct sock *sk, u64 seq) +{ + struct dccp_sock *dp = dccp_sk(sk); + u64 tmp_gsr; + + dccp_set_seqno(&tmp_gsr, dp->dccps_gsr + 1 - (dp->dccps_options.dccpo_sequence_window / 4)); + dp->dccps_gsr = seq; + dccp_set_seqno(&dp->dccps_swl, max48(tmp_gsr, dp->dccps_isr)); + dccp_set_seqno(&dp->dccps_swh, + dp->dccps_gsr + (3 * dp->dccps_options.dccpo_sequence_window) / 4); +} + +static inline void dccp_update_gss(struct sock *sk, u64 seq) +{ + struct dccp_sock *dp = dccp_sk(sk); + u64 tmp_gss; + + dccp_set_seqno(&tmp_gss, dp->dccps_gss - dp->dccps_options.dccpo_sequence_window + 1); + dp->dccps_awl = max48(tmp_gss, dp->dccps_iss); + dp->dccps_awh = dp->dccps_gss = seq; +} + +extern void dccp_insert_options(struct sock *sk, struct sk_buff *skb); +extern void dccp_insert_option_elapsed_time(struct sock *sk, + struct sk_buff *skb, + u32 elapsed_time); +extern void dccp_insert_option(struct sock *sk, struct sk_buff *skb, + unsigned char option, + const void *value, unsigned char len); + +extern struct socket *dccp_ctl_socket; + +#define DCCP_ACKPKTS_STATE_RECEIVED 0 +#define DCCP_ACKPKTS_STATE_ECN_MARKED (1 << 6) +#define DCCP_ACKPKTS_STATE_NOT_RECEIVED (3 << 6) + +#define DCCP_ACKPKTS_STATE_MASK 0xC0 /* 11000000 */ +#define DCCP_ACKPKTS_LEN_MASK 0x3F /* 00111111 */ + +/** struct dccp_ackpkts - acknowledgeable packets + * + * This data structure is the one defined in the DCCP draft + * Appendix A. + * + * @dccpap_buf_head - circular buffer head + * @dccpap_buf_tail - circular buffer tail + * @dccpap_buf_ackno - ack # of the most recent packet acknoldgeable in the buffer (i.e. %dccpap_buf_head) + * @dccpap_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked by the buffer with State 0 + * + * Additionally, the HC-Receiver must keep some information about the + * Ack Vectors it has recently sent. For each packet sent carrying an + * Ack Vector, it remembers four variables: + * + * @dccpap_ack_seqno - the Sequence Number used for the packet (HC-Receiver seqno) + * @dccpap_ack_ptr - the value of buf_head at the time of acknowledgement. + * @dccpap_ack_ackno - the Acknowledgement Number used for the packet (HC-Sender seqno) + * @dccpap_ack_nonce - the one-bit sum of the ECN Nonces for all State 0. + * + * @dccpap_buf_len - circular buffer length + * @dccpap_buf - circular buffer of acknowledgeable packets + */ +struct dccp_ackpkts { + unsigned int dccpap_buf_head; + unsigned int dccpap_buf_tail; + u64 dccpap_buf_ackno; + u64 dccpap_ack_seqno; + u64 dccpap_ack_ackno; + unsigned int dccpap_ack_ptr; + unsigned int dccpap_buf_vector_len; + unsigned int dccpap_ack_vector_len; + unsigned int dccpap_buf_len; + unsigned long dccpap_time; + u8 dccpap_buf_nonce; + u8 dccpap_ack_nonce; + u8 dccpap_buf[0]; +}; + +extern struct dccp_ackpkts *dccp_ackpkts_alloc(unsigned int len, int priority); +extern void dccp_ackpkts_free(struct dccp_ackpkts *ap); +extern int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state); +extern void dccp_ackpkts_check_rcv_ackno(struct dccp_ackpkts *ap, + struct sock *sk, u64 ackno); + +#ifdef DCCP_DEBUG +extern void dccp_ackvector_print(const u64 ackno, + const unsigned char *vector, int len); +extern void dccp_ackpkts_print(const struct dccp_ackpkts *ap); +#else +static inline void dccp_ackvector_print(const u64 ackno, + const unsigned char *vector, + int len) { } +static inline void dccp_ackpkts_print(const struct dccp_ackpkts *ap) { } +#endif + +#endif /* _DCCP_H */ diff -puN /dev/null net/dccp/input.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/dccp/input.c 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,510 @@ +/* + * net/dccp/input.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include + +#include "ccid.h" +#include "dccp.h" + +static void dccp_fin(struct sock *sk, struct sk_buff *skb) +{ + sk->sk_shutdown |= RCV_SHUTDOWN; + sock_set_flag(sk, SOCK_DONE); + __skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4); + __skb_queue_tail(&sk->sk_receive_queue, skb); + skb_set_owner_r(skb, sk); + sk->sk_data_ready(sk, 0); +} + +static void dccp_rcv_close(struct sock *sk, struct sk_buff *skb) +{ + switch (sk->sk_state) { + case DCCP_PARTOPEN: + case DCCP_OPEN: + dccp_v4_send_reset(sk, DCCP_RESET_CODE_CLOSED); + dccp_fin(sk, skb); + dccp_set_state(sk, DCCP_CLOSED); + break; + } +} + +static void dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb) +{ + /* + * Step 7: Check for unexpected packet types + * If (S.is_server and P.type == CloseReq) + * Send Sync packet acknowledging P.seqno + * Drop packet and return + */ + if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) { + dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq); + return; + } + + switch (sk->sk_state) { + case DCCP_PARTOPEN: + case DCCP_OPEN: + dccp_set_state(sk, DCCP_CLOSING); + dccp_send_close(sk); + break; + } +} + +static inline void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); + + if (dp->dccps_options.dccpo_send_ack_vector) + dccp_ackpkts_check_rcv_ackno(dp->dccps_hc_rx_ackpkts, sk, + DCCP_SKB_CB(skb)->dccpd_ack_seq); +} + +static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); + struct dccp_sock *dp = dccp_sk(sk); + u64 lswl = dp->dccps_swl; + u64 lawl = dp->dccps_awl; + + /* + * Step 5: Prepare sequence numbers for Sync + * If P.type == Sync or P.type == SyncAck, + * If S.AWL <= P.ackno <= S.AWH and P.seqno >= S.SWL, + * / * P is valid, so update sequence number variables + * accordingly. After this update, P will pass the tests + * in Step 6. A SyncAck is generated if necessary in + * Step 15 * / + * Update S.GSR, S.SWL, S.SWH + * Otherwise, + * Drop packet and return + */ + if (dh->dccph_type == DCCP_PKT_SYNC || + dh->dccph_type == DCCP_PKT_SYNCACK) { + if (between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, dp->dccps_awl, dp->dccps_awh) && + !before48(DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_swl)) + dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq); + else + return -1; + /* + * Step 6: Check sequence numbers + * Let LSWL = S.SWL and LAWL = S.AWL + * If P.type == CloseReq or P.type == Close or P.type == Reset, + * LSWL := S.GSR + 1, LAWL := S.GAR + * If LSWL <= P.seqno <= S.SWH + * and (P.ackno does not exist or LAWL <= P.ackno <= S.AWH), + * Update S.GSR, S.SWL, S.SWH + * If P.type != Sync, + * Update S.GAR + * Otherwise, + * Send Sync packet acknowledging P.seqno + * Drop packet and return + */ + } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ || + dh->dccph_type == DCCP_PKT_CLOSE || + dh->dccph_type == DCCP_PKT_RESET) { + lswl = dp->dccps_gsr; + dccp_inc_seqno(&lswl); + lawl = dp->dccps_gar; + } + + if (between48(DCCP_SKB_CB(skb)->dccpd_seq, lswl, dp->dccps_swh) && + (DCCP_SKB_CB(skb)->dccpd_ack_seq == DCCP_PKT_WITHOUT_ACK_SEQ || + between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, lawl, dp->dccps_awh))) { + dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq); + + if (dh->dccph_type != DCCP_PKT_SYNC && + DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dp->dccps_gar = DCCP_SKB_CB(skb)->dccpd_ack_seq; + } else { + dccp_pr_debug("Step 6 failed, sending SYNC...\n"); + dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq); + return -1; + } + + return 0; +} + +int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct dccp_hdr *dh, const unsigned len) +{ + struct dccp_sock *dp = dccp_sk(sk); + + if (dccp_check_seqno(sk, skb)) + goto discard; + + if (dccp_parse_options(sk, skb)) + goto discard; + + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_event_ack_recv(sk, skb); + + /* + * FIXME: check ECN to see if we should use + * DCCP_ACKPKTS_STATE_ECN_MARKED + */ + if (dp->dccps_options.dccpo_send_ack_vector) { + struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts; + + if (dccp_ackpkts_add(dp->dccps_hc_rx_ackpkts, + DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_ACKPKTS_STATE_RECEIVED)) { + LIMIT_NETDEBUG(pr_info("DCCP: acknowledgeable packets buffer full!\n")); + ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1; + inet_csk_schedule_ack(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MIN, TCP_RTO_MAX); + goto discard; + } + + /* + * FIXME: this activation is probably wrong, have to study more + * TCP delack machinery and how it fits into DCCP draft, but + * for now it kinda "works" 8) + */ + if (!inet_csk_ack_scheduled(sk)) { + inet_csk_schedule_ack(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 5 * HZ, TCP_RTO_MAX); + } + } + + ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); + ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); + + switch (dccp_hdr(skb)->dccph_type) { + case DCCP_PKT_DATAACK: + case DCCP_PKT_DATA: + /* + * FIXME: check if sk_receive_queue is full, schedule DATA_DROPPED option + * if it is. + */ + __skb_pull(skb, dh->dccph_doff * 4); + __skb_queue_tail(&sk->sk_receive_queue, skb); + skb_set_owner_r(skb, sk); + sk->sk_data_ready(sk, 0); + return 0; + case DCCP_PKT_ACK: + goto discard; + case DCCP_PKT_RESET: + /* + * Step 9: Process Reset + * If P.type == Reset, + * Tear down connection + * S.state := TIMEWAIT + * Set TIMEWAIT timer + * Drop packet and return + */ + dccp_fin(sk, skb); + dccp_time_wait(sk, DCCP_TIME_WAIT, 0); + return 0; + case DCCP_PKT_CLOSEREQ: + dccp_rcv_closereq(sk, skb); + goto discard; + case DCCP_PKT_CLOSE: + dccp_rcv_close(sk, skb); + return 0; + case DCCP_PKT_REQUEST: + /* Step 7 + * or (S.is_server and P.type == Response) + * or (S.is_client and P.type == Request) + * or (S.state >= OPEN and P.type == Request + * and P.seqno >= S.OSR) + * or (S.state >= OPEN and P.type == Response + * and P.seqno >= S.OSR) + * or (S.state == RESPOND and P.type == Data), + * Send Sync packet acknowledging P.seqno + * Drop packet and return + */ + if (dp->dccps_role != DCCP_ROLE_LISTEN) + goto send_sync; + goto check_seq; + case DCCP_PKT_RESPONSE: + if (dp->dccps_role != DCCP_ROLE_CLIENT) + goto send_sync; +check_seq: + if (!before48(DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_osr)) { +send_sync: + dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq); + } + break; + } + + DCCP_INC_STATS_BH(DCCP_MIB_INERRS); +discard: + __kfree_skb(skb); + return 0; +} + +static int dccp_rcv_request_sent_state_process(struct sock *sk, + struct sk_buff *skb, + const struct dccp_hdr *dh, + const unsigned len) +{ + /* + * Step 4: Prepare sequence numbers in REQUEST + * If S.state == REQUEST, + * If (P.type == Response or P.type == Reset) + * and S.AWL <= P.ackno <= S.AWH, + * / * Set sequence number variables corresponding to the + * other endpoint, so P will pass the tests in Step 6 * / + * Set S.GSR, S.ISR, S.SWL, S.SWH + * / * Response processing continues in Step 10; Reset + * processing continues in Step 9 * / + */ + if (dh->dccph_type == DCCP_PKT_RESPONSE) { + const struct inet_connection_sock *icsk = inet_csk(sk); + struct dccp_sock *dp = dccp_sk(sk); + + /* Stop the REQUEST timer */ + inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); + BUG_TRAP(sk->sk_send_head != NULL); + __kfree_skb(sk->sk_send_head); + sk->sk_send_head = NULL; + + if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, dp->dccps_awl, dp->dccps_awh)) { + dccp_pr_debug("invalid ackno: S.AWL=%llu, P.ackno=%llu, S.AWH=%llu \n", + dp->dccps_awl, DCCP_SKB_CB(skb)->dccpd_ack_seq, dp->dccps_awh); + goto out_invalid_packet; + } + + dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; + dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq); + + if (ccid_hc_rx_init(dp->dccps_hc_rx_ccid, sk) != 0 || + ccid_hc_tx_init(dp->dccps_hc_tx_ccid, sk) != 0) { + ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk); + ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk); + /* FIXME: send appropriate RESET code */ + goto out_invalid_packet; + } + + dccp_sync_mss(sk, dp->dccps_pmtu_cookie); + + /* + * Step 10: Process REQUEST state (second part) + * If S.state == REQUEST, + * / * If we get here, P is a valid Response from the server (see + * Step 4), and we should move to PARTOPEN state. PARTOPEN + * means send an Ack, don't send Data packets, retransmit + * Acks periodically, and always include any Init Cookie from + * the Response * / + * S.state := PARTOPEN + * Set PARTOPEN timer + * Continue with S.state == PARTOPEN + * / * Step 12 will send the Ack completing the three-way + * handshake * / + */ + dccp_set_state(sk, DCCP_PARTOPEN); + + /* Make sure socket is routed, for correct metrics. */ + inet_sk_rebuild_header(sk); + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sk_wake_async(sk, 0, POLL_OUT); + } + + if (sk->sk_write_pending || icsk->icsk_ack.pingpong || + icsk->icsk_accept_queue.rskq_defer_accept) { + /* Save one ACK. Data will be ready after + * several ticks, if write_pending is set. + * + * It may be deleted, but with this feature tcpdumps + * look so _wonderfully_ clever, that I was not able + * to stand against the temptation 8) --ANK + */ + /* + * OK, in DCCP we can as well do a similar trick, its + * even in the draft, but there is no need for us to + * schedule an ack here, as dccp_sendmsg does this for + * us, also stated in the draft. -acme + */ + __kfree_skb(skb); + return 0; + } + dccp_send_ack(sk); + return -1; + } + +out_invalid_packet: + return 1; /* dccp_v4_do_rcv will send a reset, but... + FIXME: the reset code should be DCCP_RESET_CODE_PACKET_ERROR */ +} + +static int dccp_rcv_respond_partopen_state_process(struct sock *sk, + struct sk_buff *skb, + const struct dccp_hdr *dh, + const unsigned len) +{ + int queued = 0; + + switch (dh->dccph_type) { + case DCCP_PKT_RESET: + inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); + break; + case DCCP_PKT_DATAACK: + case DCCP_PKT_ACK: + /* + * FIXME: we should be reseting the PARTOPEN (DELACK) timer here, + * but only if we haven't used the DELACK timer for something else, + * like sending a delayed ack for a TIMESTAMP echo, etc, for now + * were not clearing it, sending an extra ACK when there is nothing + * else to do in DELACK is not a big deal after all. + */ + + /* Stop the PARTOPEN timer */ + if (sk->sk_state == DCCP_PARTOPEN) + inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); + + dccp_sk(sk)->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq; + dccp_set_state(sk, DCCP_OPEN); + + if (dh->dccph_type == DCCP_PKT_DATAACK) { + dccp_rcv_established(sk, skb, dh, len); + queued = 1; /* packet was queued (by dccp_rcv_established) */ + } + break; + } + + return queued; +} + +int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, + struct dccp_hdr *dh, unsigned len) +{ + struct dccp_sock *dp = dccp_sk(sk); + const int old_state = sk->sk_state; + int queued = 0; + + if (sk->sk_state != DCCP_LISTEN && sk->sk_state != DCCP_REQUESTING) { + if (dccp_check_seqno(sk, skb)) + goto discard; + + /* + * Step 8: Process options and mark acknowledgeable + */ + if (dccp_parse_options(sk, skb)) + goto discard; + + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_event_ack_recv(sk, skb); + + ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); + ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); + + /* + * FIXME: check ECN to see if we should use + * DCCP_ACKPKTS_STATE_ECN_MARKED + */ + if (dp->dccps_options.dccpo_send_ack_vector) { + if (dccp_ackpkts_add(dp->dccps_hc_rx_ackpkts, + DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_ACKPKTS_STATE_RECEIVED)) + goto discard; + /* + * FIXME: this activation is probably wrong, have to study more + * TCP delack machinery and how it fits into DCCP draft, but + * for now it kinda "works" 8) + */ + if (dp->dccps_hc_rx_ackpkts->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1 && + !inet_csk_ack_scheduled(sk)) { + inet_csk_schedule_ack(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MIN, TCP_RTO_MAX); + } + } + } + + /* + * Step 9: Process Reset + * If P.type == Reset, + * Tear down connection + * S.state := TIMEWAIT + * Set TIMEWAIT timer + * Drop packet and return + */ + if (dh->dccph_type == DCCP_PKT_RESET) { + /* Queue the equivalent of TCP fin so that dccp_recvmsg exits the loop */ + dccp_fin(sk, skb); + dccp_time_wait(sk, DCCP_TIME_WAIT, 0); + return 0; + /* + * Step 7: Check for unexpected packet types + * If (S.is_server and P.type == CloseReq) + * or (S.is_server and P.type == Response) + * or (S.is_client and P.type == Request) + * or (S.state == RESPOND and P.type == Data), + * Send Sync packet acknowledging P.seqno + * Drop packet and return + */ + } else if ((dp->dccps_role != DCCP_ROLE_CLIENT && + (dh->dccph_type == DCCP_PKT_RESPONSE || dh->dccph_type == DCCP_PKT_CLOSEREQ)) || + (dp->dccps_role == DCCP_ROLE_CLIENT && + dh->dccph_type == DCCP_PKT_REQUEST) || + (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) { + dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq); + goto discard; + } + + switch (sk->sk_state) { + case DCCP_CLOSED: + return 1; + + case DCCP_LISTEN: + if (dh->dccph_type == DCCP_PKT_ACK || + dh->dccph_type == DCCP_PKT_DATAACK) + return 1; + + if (dh->dccph_type == DCCP_PKT_RESET) + goto discard; + + if (dh->dccph_type == DCCP_PKT_REQUEST) { + if (dccp_v4_conn_request(sk, skb) < 0) + return 1; + + /* FIXME: do congestion control initialization */ + goto discard; + } + goto discard; + + case DCCP_REQUESTING: + /* FIXME: do congestion control initialization */ + + queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); + if (queued >= 0) + return queued; + + __kfree_skb(skb); + return 0; + + case DCCP_RESPOND: + case DCCP_PARTOPEN: + queued = dccp_rcv_respond_partopen_state_process(sk, skb, dh, len); + break; + } + + if (dh->dccph_type == DCCP_PKT_ACK || dh->dccph_type == DCCP_PKT_DATAACK) { + switch (old_state) { + case DCCP_PARTOPEN: + sk->sk_state_change(sk); + sk_wake_async(sk, 0, POLL_OUT); + break; + } + } + + if (!queued) { +discard: + __kfree_skb(skb); + } + return 0; +} diff -puN /dev/null net/dccp/ipv4.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/dccp/ipv4.c 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,1291 @@ +/* + * net/dccp/ipv4.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "ccid.h" +#include "dccp.h" + +struct inet_hashinfo __cacheline_aligned dccp_hashinfo = { + .lhash_lock = RW_LOCK_UNLOCKED, + .lhash_users = ATOMIC_INIT(0), + .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait), + .portalloc_lock = SPIN_LOCK_UNLOCKED, + .port_rover = 1024 - 1, +}; + +static int dccp_v4_get_port(struct sock *sk, const unsigned short snum) +{ + return inet_csk_get_port(&dccp_hashinfo, sk, snum); +} + +static void dccp_v4_hash(struct sock *sk) +{ + inet_hash(&dccp_hashinfo, sk); +} + +static void dccp_v4_unhash(struct sock *sk) +{ + inet_unhash(&dccp_hashinfo, sk); +} + +/* called with local bh disabled */ +static int __dccp_v4_check_established(struct sock *sk, const __u16 lport, + struct inet_timewait_sock **twp) +{ + struct inet_sock *inet = inet_sk(sk); + const u32 daddr = inet->rcv_saddr; + const u32 saddr = inet->daddr; + const int dif = sk->sk_bound_dev_if; + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); + const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, dccp_hashinfo.ehash_size); + struct inet_ehash_bucket *head = &dccp_hashinfo.ehash[hash]; + const struct sock *sk2; + const const struct hlist_node *node; + struct inet_timewait_sock *tw; + + write_lock(&head->lock); + + /* Check TIME-WAIT sockets first. */ + sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) { + tw = inet_twsk(sk2); + + if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) + goto not_unique; + } + tw = NULL; + + /* And established part... */ + sk_for_each(sk2, node, &head->chain) { + if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif)) + goto not_unique; + } + + /* Must record num and sport now. Otherwise we will see + * in hash table socket with a funny identity. */ + inet->num = lport; + inet->sport = htons(lport); + sk->sk_hashent = hash; + BUG_TRAP(sk_unhashed(sk)); + __sk_add_node(sk, &head->chain); + sock_prot_inc_use(sk->sk_prot); + write_unlock(&head->lock); + + if (twp != NULL) { + *twp = tw; + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + } else if (tw != NULL) { + /* Silly. Should hash-dance instead... */ + dccp_tw_deschedule(tw); + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + + inet_twsk_put(tw); + } + + return 0; + +not_unique: + write_unlock(&head->lock); + return -EADDRNOTAVAIL; +} + +/* + * Bind a port for a connect operation and hash it. + */ +static int dccp_v4_hash_connect(struct sock *sk) +{ + const unsigned short snum = inet_sk(sk)->num; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; + int ret; + + if (snum == 0) { + int rover; + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int remaining = (high - low) + 1; + struct hlist_node *node; + struct inet_timewait_sock *tw = NULL; + + local_bh_disable(); + + /* TODO. Actually it is not so bad idea to remove + * dccp_hashinfo.portalloc_lock before next submission to Linus. + * As soon as we touch this place at all it is time to think. + * + * Now it protects single _advisory_ variable dccp_hashinfo.port_rover, + * hence it is mostly useless. + * Code will work nicely if we just delete it, but + * I am afraid in contented case it will work not better or + * even worse: another cpu just will hit the same bucket + * and spin there. + * So some cpu salt could remove both contention and + * memory pingpong. Any ideas how to do this in a nice way? + */ + spin_lock(&dccp_hashinfo.portalloc_lock); + rover = dccp_hashinfo.port_rover; + + do { + rover++; + if ((rover < low) || (rover > high)) + rover = low; + head = &dccp_hashinfo.bhash[inet_bhashfn(rover, dccp_hashinfo.bhash_size)]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, + * because the established check is already + * unique enough. + */ + inet_bind_bucket_for_each(tb, node, &head->chain) { + if (tb->port == rover) { + BUG_TRAP(!hlist_empty(&tb->owners)); + if (tb->fastreuse >= 0) + goto next_port; + if (!__dccp_v4_check_established(sk, + rover, + &tw)) + goto ok; + goto next_port; + } + } + + tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep, head, rover); + if (tb == NULL) { + spin_unlock(&head->lock); + break; + } + tb->fastreuse = -1; + goto ok; + + next_port: + spin_unlock(&head->lock); + } while (--remaining > 0); + dccp_hashinfo.port_rover = rover; + spin_unlock(&dccp_hashinfo.portalloc_lock); + + local_bh_enable(); + + return -EADDRNOTAVAIL; + +ok: + /* All locks still held and bhs disabled */ + dccp_hashinfo.port_rover = rover; + spin_unlock(&dccp_hashinfo.portalloc_lock); + + inet_bind_hash(sk, tb, rover); + if (sk_unhashed(sk)) { + inet_sk(sk)->sport = htons(rover); + __inet_hash(&dccp_hashinfo, sk, 0); + } + spin_unlock(&head->lock); + + if (tw != NULL) { + dccp_tw_deschedule(tw); + inet_twsk_put(tw); + } + + ret = 0; + goto out; + } + + head = &dccp_hashinfo.bhash[inet_bhashfn(snum, dccp_hashinfo.bhash_size)]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) { + __inet_hash(&dccp_hashinfo, sk, 0); + spin_unlock_bh(&head->lock); + return 0; + } else { + spin_unlock(&head->lock); + /* No definite answer... Walk to established hash table */ + ret = __dccp_v4_check_established(sk, snum, NULL); +out: + local_bh_enable(); + return ret; + } +} + +static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + struct inet_sock *inet = inet_sk(sk); + struct dccp_sock *dp = dccp_sk(sk); + const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; + struct rtable *rt; + u32 daddr, nexthop; + int tmp; + int err; + + dp->dccps_role = DCCP_ROLE_CLIENT; + + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + if (usin->sin_family != AF_INET) + return -EAFNOSUPPORT; + + nexthop = daddr = usin->sin_addr.s_addr; + if (inet->opt != NULL && inet->opt->srr) { + if (daddr == 0) + return -EINVAL; + nexthop = inet->opt->faddr; + } + + tmp = ip_route_connect(&rt, nexthop, inet->saddr, + RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, + IPPROTO_DCCP, + inet->sport, usin->sin_port, sk); + if (tmp < 0) + return tmp; + + if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { + ip_rt_put(rt); + return -ENETUNREACH; + } + + if (inet->opt == NULL || !inet->opt->srr) + daddr = rt->rt_dst; + + if (inet->saddr == 0) + inet->saddr = rt->rt_src; + inet->rcv_saddr = inet->saddr; + + inet->dport = usin->sin_port; + inet->daddr = daddr; + + dp->dccps_ext_header_len = 0; + if (inet->opt != NULL) + dp->dccps_ext_header_len = inet->opt->optlen; + /* + * Socket identity is still unknown (sport may be zero). + * However we set state to DCCP_REQUESTING and not releasing socket + * lock select source port, enter ourselves into the hash tables and + * complete initialization after this. + */ + dccp_set_state(sk, DCCP_REQUESTING); + err = dccp_v4_hash_connect(sk); + if (err != 0) + goto failure; + + err = ip_route_newports(&rt, inet->sport, inet->dport, sk); + if (err != 0) + goto failure; + + /* OK, now commit destination to socket. */ + sk_setup_caps(sk, &rt->u.dst); + + dp->dccps_gar = + dp->dccps_iss = secure_dccp_sequence_number(inet->saddr, + inet->daddr, + inet->sport, + usin->sin_port); + dccp_update_gss(sk, dp->dccps_iss); + + inet->id = dp->dccps_iss ^ jiffies; + + err = dccp_connect(sk); + rt = NULL; + if (err != 0) + goto failure; +out: + return err; +failure: + /* This unhashes the socket and releases the local port, if necessary. */ + dccp_set_state(sk, DCCP_CLOSED); + ip_rt_put(rt); + sk->sk_route_caps = 0; + inet->dport = 0; + goto out; +} + +/* + * This routine does path mtu discovery as defined in RFC1191. + */ +static inline void dccp_do_pmtu_discovery(struct sock *sk, + const struct iphdr *iph, + u32 mtu) +{ + struct dst_entry *dst; + const struct inet_sock *inet = inet_sk(sk); + const struct dccp_sock *dp = dccp_sk(sk); + + /* We are not interested in DCCP_LISTEN and request_socks (RESPONSEs + * send out by Linux are always < 576bytes so they should go through + * unfragmented). + */ + if (sk->sk_state == DCCP_LISTEN) + return; + + /* We don't check in the destentry if pmtu discovery is forbidden + * on this route. We just assume that no packet_to_big packets + * are send back when pmtu discovery is not active. + * There is a small race when the user changes this flag in the + * route, but I think that's acceptable. + */ + if ((dst = __sk_dst_check(sk, 0)) == NULL) + return; + + dst->ops->update_pmtu(dst, mtu); + + /* Something is about to be wrong... Remember soft error + * for the case, if this connection will not able to recover. + */ + if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) + sk->sk_err_soft = EMSGSIZE; + + mtu = dst_mtu(dst); + + if (inet->pmtudisc != IP_PMTUDISC_DONT && + dp->dccps_pmtu_cookie > mtu) { + dccp_sync_mss(sk, mtu); + + /* + * From: draft-ietf-dccp-spec-11.txt + * + * DCCP-Sync packets are the best choice for upward probing, + * since DCCP-Sync probes do not risk application data loss. + */ + dccp_send_sync(sk, dp->dccps_gsr); + } /* else let the usual retransmit timer handle it */ +} + +static void dccp_v4_ctl_send_ack(struct sk_buff *rxskb) +{ + int err; + struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh; + const int dccp_hdr_ack_len = sizeof(struct dccp_hdr) + + sizeof(struct dccp_hdr_ext) + + sizeof(struct dccp_hdr_ack_bits); + struct sk_buff *skb; + + if (((struct rtable *)rxskb->dst)->rt_type != RTN_LOCAL) + return; + + skb = alloc_skb(MAX_DCCP_HEADER + 15, GFP_ATOMIC); + if (skb == NULL) + return; + + /* Reserve space for headers. */ + skb_reserve(skb, MAX_DCCP_HEADER); + + skb->dst = dst_clone(rxskb->dst); + + skb->h.raw = skb_push(skb, dccp_hdr_ack_len); + dh = dccp_hdr(skb); + memset(dh, 0, dccp_hdr_ack_len); + + /* Build DCCP header and checksum it. */ + dh->dccph_type = DCCP_PKT_ACK; + dh->dccph_sport = rxdh->dccph_dport; + dh->dccph_dport = rxdh->dccph_sport; + dh->dccph_doff = dccp_hdr_ack_len / 4; + dh->dccph_x = 1; + + dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq); + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), DCCP_SKB_CB(rxskb)->dccpd_seq); + + bh_lock_sock(dccp_ctl_socket->sk); + err = ip_build_and_send_pkt(skb, dccp_ctl_socket->sk, + rxskb->nh.iph->daddr, rxskb->nh.iph->saddr, NULL); + bh_unlock_sock(dccp_ctl_socket->sk); + + if (err == NET_XMIT_CN || err == 0) { + DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); + DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS); + } +} + +static void dccp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) +{ + dccp_v4_ctl_send_ack(skb); +} + +static int dccp_v4_send_response(struct sock *sk, struct request_sock *req, + struct dst_entry *dst) +{ + int err = -1; + struct sk_buff *skb; + + /* First, grab a route. */ + + if (dst == NULL && (dst = inet_csk_route_req(sk, req)) == NULL) + goto out; + + skb = dccp_make_response(sk, dst, req); + if (skb != NULL) { + const struct inet_request_sock *ireq = inet_rsk(req); + + err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, + ireq->rmt_addr, + ireq->opt); + if (err == NET_XMIT_CN) + err = 0; + } + +out: + dst_release(dst); + return err; +} + +/* + * This routine is called by the ICMP module when it gets some sort of error + * condition. If err < 0 then the socket should be closed and the error + * returned to the user. If err > 0 it's just the icmp type << 8 | icmp code. + * After adjustment header points to the first 8 bytes of the tcp header. We + * need to find the appropriate port. + * + * The locking strategy used here is very "optimistic". When someone else + * accesses the socket the ICMP is just dropped and for some paths there is no + * check at all. A more general error queue to queue errors for later handling + * is probably better. + */ +void dccp_v4_err(struct sk_buff *skb, u32 info) +{ + const struct iphdr *iph = (struct iphdr *)skb->data; + const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + (iph->ihl << 2)); + struct dccp_sock *dp; + struct inet_sock *inet; + const int type = skb->h.icmph->type; + const int code = skb->h.icmph->code; + struct sock *sk; + __u64 seq; + int err; + + if (skb->len < (iph->ihl << 2) + 8) { + ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + return; + } + + sk = inet_lookup(&dccp_hashinfo, iph->daddr, dh->dccph_dport, + iph->saddr, dh->dccph_sport, inet_iif(skb)); + if (sk == NULL) { + ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + return; + } + + if (sk->sk_state == DCCP_TIME_WAIT) { + inet_twsk_put((struct inet_timewait_sock *)sk); + return; + } + + bh_lock_sock(sk); + /* If too many ICMPs get dropped on busy + * servers this needs to be solved differently. + */ + if (sock_owned_by_user(sk)) + NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS); + + if (sk->sk_state == DCCP_CLOSED) + goto out; + + dp = dccp_sk(sk); + seq = dccp_hdr_seq(skb); + if (sk->sk_state != DCCP_LISTEN && + !between48(seq, dp->dccps_swl, dp->dccps_swh)) { + NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + + switch (type) { + case ICMP_SOURCE_QUENCH: + /* Just silently ignore these. */ + goto out; + case ICMP_PARAMETERPROB: + err = EPROTO; + break; + case ICMP_DEST_UNREACH: + if (code > NR_ICMP_UNREACH) + goto out; + + if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ + if (!sock_owned_by_user(sk)) + dccp_do_pmtu_discovery(sk, iph, info); + goto out; + } + + err = icmp_err_convert[code].errno; + break; + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + default: + goto out; + } + + switch (sk->sk_state) { + struct request_sock *req , **prev; + case DCCP_LISTEN: + if (sock_owned_by_user(sk)) + goto out; + req = inet_csk_search_req(sk, &prev, dh->dccph_dport, + iph->daddr, iph->saddr); + if (!req) + goto out; + + /* + * ICMPs are not backlogged, hence we cannot get an established + * socket here. + */ + BUG_TRAP(!req->sk); + + if (seq != dccp_rsk(req)->dreq_iss) { + NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + /* + * Still in RESPOND, just remove it silently. + * There is no good way to pass the error to the newly + * created socket, and POSIX does not want network + * errors returned from accept(). + */ + inet_csk_reqsk_queue_drop(sk, req, prev); + goto out; + + case DCCP_REQUESTING: + case DCCP_RESPOND: + if (!sock_owned_by_user(sk)) { + DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + sk->sk_err = err; + + sk->sk_error_report(sk); + + dccp_done(sk); + } else + sk->sk_err_soft = err; + goto out; + } + + /* If we've already connected we will keep trying + * until we time out, or the user gives up. + * + * rfc1122 4.2.3.9 allows to consider as hard errors + * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, + * but it is obsoleted by pmtu discovery). + * + * Note, that in modern internet, where routing is unreliable + * and in each dark corner broken firewalls sit, sending random + * errors ordered by their masters even this two messages finally lose + * their original sense (even Linux sends invalid PORT_UNREACHs) + * + * Now we are in compliance with RFCs. + * --ANK (980905) + */ + + inet = inet_sk(sk); + if (!sock_owned_by_user(sk) && inet->recverr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } else /* Only an error on timeout */ + sk->sk_err_soft = err; +out: + bh_unlock_sock(sk); + sock_put(sk); +} + +extern struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst, enum dccp_reset_codes code); + +int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code) +{ + struct sk_buff *skb; + /* + * FIXME: what if rebuild_header fails? + * Should we be doing a rebuild_header here? + */ + int err = inet_sk_rebuild_header(sk); + + if (err != 0) + return err; + + skb = dccp_make_reset(sk, sk->sk_dst_cache, code); + if (skb != NULL) { + const struct dccp_sock *dp = dccp_sk(sk); + const struct inet_sock *inet = inet_sk(sk); + + err = ip_build_and_send_pkt(skb, sk, + inet->saddr, inet->daddr, NULL); + if (err == NET_XMIT_CN) + err = 0; + + ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk); + ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk); + } + + return err; +} + +static inline u64 dccp_v4_init_sequence(const struct sock *sk, + const struct sk_buff *skb) +{ + return secure_dccp_sequence_number(skb->nh.iph->daddr, + skb->nh.iph->saddr, + dccp_hdr(skb)->dccph_dport, + dccp_hdr(skb)->dccph_sport); +} + +int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct inet_request_sock *ireq; + struct dccp_sock dp; + struct request_sock *req; + struct dccp_request_sock *dreq; + const __u32 saddr = skb->nh.iph->saddr; + const __u32 daddr = skb->nh.iph->daddr; + struct dst_entry *dst = NULL; + + /* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */ + if (((struct rtable *)skb->dst)->rt_flags & + (RTCF_BROADCAST | RTCF_MULTICAST)) + goto drop; + + /* + * TW buckets are converted to open requests without + * limitations, they conserve resources and peer is + * evidently real one. + */ + if (inet_csk_reqsk_queue_is_full(sk)) + goto drop; + + /* + * Accept backlog is full. If we have already queued enough + * of warm entries in syn queue, drop request. It is better than + * clogging syn queue with openreqs with exponentially increasing + * timeout. + */ + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) + goto drop; + + req = reqsk_alloc(sk->sk_prot->rsk_prot); + if (req == NULL) + goto drop; + + /* FIXME: process options */ + + dccp_openreq_init(req, &dp, skb); + + ireq = inet_rsk(req); + ireq->loc_addr = daddr; + ireq->rmt_addr = saddr; + /* FIXME: Merge Aristeu's option parsing code when ready */ + req->rcv_wnd = 100; /* Fake, option parsing will get the right value */ + ireq->opt = NULL; + + /* + * Step 3: Process LISTEN state + * + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie + * + * In fact we defer setting S.GSR, S.SWL, S.SWH to + * dccp_create_openreq_child. + */ + dreq = dccp_rsk(req); + dreq->dreq_isr = DCCP_SKB_CB(skb)->dccpd_seq; + dreq->dreq_iss = dccp_v4_init_sequence(sk, skb); + dreq->dreq_service = dccp_hdr_request(skb)->dccph_req_service; + + if (dccp_v4_send_response(sk, req, dst)) + goto drop_and_free; + + inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); + return 0; + +drop_and_free: + /* + * FIXME: should be reqsk_free after implementing req->rsk_ops + */ + __reqsk_free(req); +drop: + DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + return -1; +} + +/* + * The three way handshake has completed - we got a valid ACK or DATAACK - + * now create the new socket. + * + * This is the equivalent of TCP's tcp_v4_syn_recv_sock + */ +struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) +{ + struct inet_request_sock *ireq; + struct inet_sock *newinet; + struct dccp_sock *newdp; + struct sock *newsk; + + if (sk_acceptq_is_full(sk)) + goto exit_overflow; + + if (dst == NULL && (dst = inet_csk_route_req(sk, req)) == NULL) + goto exit; + + newsk = dccp_create_openreq_child(sk, req, skb); + if (newsk == NULL) + goto exit; + + sk_setup_caps(newsk, dst); + + newdp = dccp_sk(newsk); + newinet = inet_sk(newsk); + ireq = inet_rsk(req); + newinet->daddr = ireq->rmt_addr; + newinet->rcv_saddr = ireq->loc_addr; + newinet->saddr = ireq->loc_addr; + newinet->opt = ireq->opt; + ireq->opt = NULL; + newinet->mc_index = inet_iif(skb); + newinet->mc_ttl = skb->nh.iph->ttl; + newinet->id = jiffies; + + dccp_sync_mss(newsk, dst_mtu(dst)); + + __inet_hash(&dccp_hashinfo, newsk, 0); + __inet_inherit_port(&dccp_hashinfo, sk, newsk); + + return newsk; + +exit_overflow: + NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS); +exit: + NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS); + dst_release(dst); + return NULL; +} + +static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); + const struct iphdr *iph = skb->nh.iph; + struct sock *nsk; + struct request_sock **prev; + /* Find possible connection requests. */ + struct request_sock *req = inet_csk_search_req(sk, &prev, + dh->dccph_sport, + iph->saddr, iph->daddr); + if (req != NULL) + return dccp_check_req(sk, skb, req, prev); + + nsk = __inet_lookup_established(&dccp_hashinfo, + iph->saddr, dh->dccph_sport, + iph->daddr, ntohs(dh->dccph_dport), + inet_iif(skb)); + if (nsk != NULL) { + if (nsk->sk_state != DCCP_TIME_WAIT) { + bh_lock_sock(nsk); + return nsk; + } + inet_twsk_put((struct inet_timewait_sock *)nsk); + return NULL; + } + + return sk; +} + +int dccp_v4_checksum(const struct sk_buff *skb, const u32 saddr, const u32 daddr) +{ + const struct dccp_hdr* dh = dccp_hdr(skb); + int checksum_len; + u32 tmp; + + if (dh->dccph_cscov == 0) + checksum_len = skb->len; + else { + checksum_len = (dh->dccph_cscov + dh->dccph_x) * sizeof(u32); + checksum_len = checksum_len < skb->len ? checksum_len : skb->len; + } + + tmp = csum_partial((unsigned char *)dh, checksum_len, 0); + return csum_tcpudp_magic(saddr, daddr, checksum_len, IPPROTO_DCCP, tmp); +} + +static int dccp_v4_verify_checksum(struct sk_buff *skb, + const u32 saddr, const u32 daddr) +{ + struct dccp_hdr *dh = dccp_hdr(skb); + int checksum_len; + u32 tmp; + + if (dh->dccph_cscov == 0) + checksum_len = skb->len; + else { + checksum_len = (dh->dccph_cscov + dh->dccph_x) * sizeof(u32); + checksum_len = checksum_len < skb->len ? checksum_len : skb->len; + } + tmp = csum_partial((unsigned char *)dh, checksum_len, 0); + return csum_tcpudp_magic(saddr, daddr, checksum_len, IPPROTO_DCCP, tmp) == 0 ? 0 : -1; +} + +static struct dst_entry* dccp_v4_route_skb(struct sock *sk, + struct sk_buff *skb) +{ + struct rtable *rt; + struct flowi fl = { .oif = ((struct rtable *)skb->dst)->rt_iif, + .nl_u = { .ip4_u = + { .daddr = skb->nh.iph->saddr, + .saddr = skb->nh.iph->daddr, + .tos = RT_CONN_FLAGS(sk) } }, + .proto = sk->sk_protocol, + .uli_u = { .ports = + { .sport = dccp_hdr(skb)->dccph_dport, + .dport = dccp_hdr(skb)->dccph_sport } } }; + + if (ip_route_output_flow(&rt, &fl, sk, 0)) { + IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + return NULL; + } + + return &rt->u.dst; +} + +void dccp_v4_ctl_send_reset(struct sk_buff *rxskb) +{ + int err; + struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh; + const int dccp_hdr_reset_len = sizeof(struct dccp_hdr) + + sizeof(struct dccp_hdr_ext) + + sizeof(struct dccp_hdr_reset); + struct sk_buff *skb; + struct dst_entry *dst; + + /* Never send a reset in response to a reset. */ + if (rxdh->dccph_type == DCCP_PKT_RESET) + return; + + if (((struct rtable *)rxskb->dst)->rt_type != RTN_LOCAL) + return; + + dst = dccp_v4_route_skb(dccp_ctl_socket->sk, rxskb); + if (dst == NULL) + return; + + skb = alloc_skb(MAX_DCCP_HEADER + 15, GFP_ATOMIC); + if (skb == NULL) + goto out; + + /* Reserve space for headers. */ + skb_reserve(skb, MAX_DCCP_HEADER); + skb->dst = dst_clone(dst); + + skb->h.raw = skb_push(skb, dccp_hdr_reset_len); + dh = dccp_hdr(skb); + memset(dh, 0, dccp_hdr_reset_len); + + /* Build DCCP header and checksum it. */ + dh->dccph_type = DCCP_PKT_RESET; + dh->dccph_sport = rxdh->dccph_dport; + dh->dccph_dport = rxdh->dccph_sport; + dh->dccph_doff = dccp_hdr_reset_len / 4; + dh->dccph_x = 1; + dccp_hdr_reset(skb)->dccph_reset_code = DCCP_SKB_CB(rxskb)->dccpd_reset_code; + + dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq); + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), DCCP_SKB_CB(rxskb)->dccpd_seq); + + dh->dccph_checksum = dccp_v4_checksum(skb, rxskb->nh.iph->saddr, + rxskb->nh.iph->daddr); + + bh_lock_sock(dccp_ctl_socket->sk); + err = ip_build_and_send_pkt(skb, dccp_ctl_socket->sk, + rxskb->nh.iph->daddr, rxskb->nh.iph->saddr, NULL); + bh_unlock_sock(dccp_ctl_socket->sk); + + if (err == NET_XMIT_CN || err == 0) { + DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); + DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS); + } +out: + dst_release(dst); +} + +int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_hdr *dh = dccp_hdr(skb); + + if (sk->sk_state == DCCP_OPEN) { /* Fast path */ + if (dccp_rcv_established(sk, skb, dh, skb->len)) + goto reset; + return 0; + } + + /* + * Step 3: Process LISTEN state + * If S.state == LISTEN, + * If P.type == Request or P contains a valid Init Cookie option, + * * Must scan the packet's options to check for an Init + * Cookie. Only the Init Cookie is processed here, + * however; other options are processed in Step 8. This + * scan need only be performed if the endpoint uses Init + * Cookies * + * * Generate a new socket and switch to that socket * + * Set S := new socket for this port pair + * S.state = RESPOND + * Choose S.ISS (initial seqno) or set from Init Cookie + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie + * Continue with S.state == RESPOND + * * A Response packet will be generated in Step 11 * + * Otherwise, + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + * + * NOTE: the check for the packet types is done in dccp_rcv_state_process + */ + if (sk->sk_state == DCCP_LISTEN) { + struct sock *nsk = dccp_v4_hnd_req(sk, skb); + + if (nsk == NULL) + goto discard; + + if (nsk != sk) { + if (dccp_child_process(sk, nsk, skb)) + goto reset; + return 0; + } + } + + if (dccp_rcv_state_process(sk, skb, dh, skb->len)) + goto reset; + return 0; + +reset: + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; + dccp_v4_ctl_send_reset(skb); +discard: + kfree_skb(skb); + return 0; +} + +static inline int dccp_invalid_packet(struct sk_buff *skb) +{ + const struct dccp_hdr *dh; + + if (skb->pkt_type != PACKET_HOST) + return 1; + + if (!pskb_may_pull(skb, sizeof(struct dccp_hdr))) { + dccp_pr_debug("pskb_may_pull failed\n"); + return 1; + } + + dh = dccp_hdr(skb); + + /* If the packet type is not understood, drop packet and return */ + if (dh->dccph_type >= DCCP_PKT_INVALID) { + dccp_pr_debug("invalid packet type\n"); + return 1; + } + + /* + * If P.Data Offset is too small for packet type, or too large for + * packet, drop packet and return + */ + if (dh->dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) { + dccp_pr_debug("Offset(%u) too small 1\n", dh->dccph_doff); + return 1; + } + + if (!pskb_may_pull(skb, dh->dccph_doff * sizeof(u32))) { + dccp_pr_debug("P.Data Offset(%u) too small 2\n", dh->dccph_doff); + return 1; + } + + dh = dccp_hdr(skb); + + /* + * If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet + * has short sequence numbers), drop packet and return + */ + if (dh->dccph_x == 0 && + dh->dccph_type != DCCP_PKT_DATA && + dh->dccph_type != DCCP_PKT_ACK && + dh->dccph_type != DCCP_PKT_DATAACK) { + dccp_pr_debug("P.type (%s) not Data, Ack nor DataAck and P.X == 0\n", + dccp_packet_name(dh->dccph_type)); + return 1; + } + + /* If the header checksum is incorrect, drop packet and return */ + if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr, + skb->nh.iph->daddr) < 0) { + dccp_pr_debug("header checksum is incorrect\n"); + return 1; + } + + return 0; +} + +/* this is called when real data arrives */ +int dccp_v4_rcv(struct sk_buff *skb) +{ + const struct dccp_hdr *dh; + struct sock *sk; + int rc; + + /* Step 1: Check header basics: */ + + if (dccp_invalid_packet(skb)) + goto discard_it; + + dh = dccp_hdr(skb); +#if 0 + /* + * Use something like this to simulate some DATA/DATAACK loss to test + * dccp_ackpkts_add, you'll get something like this on a session that + * sends 10 DATA/DATAACK packets: + * + * dccp_ackpkts_print: 281473596467422 |0,0|3,0|0,0|3,0|0,0|3,0|0,0|3,0|0,1| + * + * 0, 0 means: DCCP_ACKPKTS_STATE_RECEIVED, RLE == just this packet + * 0, 1 means: DCCP_ACKPKTS_STATE_RECEIVED, RLE == two adjacent packets with the same state + * 3, 0 means: DCCP_ACKPKTS_STATE_NOT_RECEIVED, RLE == just this packet + * + * So... + * + * 281473596467422 was received + * 281473596467421 was not received + * 281473596467420 was received + * 281473596467419 was not received + * 281473596467418 was received + * 281473596467417 was not received + * 281473596467416 was received + * 281473596467415 was not received + * 281473596467414 was received + * 281473596467413 was received (this one was the 3way handshake RESPONSE) + * + */ + if (dh->dccph_type == DCCP_PKT_DATA || dh->dccph_type == DCCP_PKT_DATAACK) { + static int discard = 0; + + if (discard) { + discard = 0; + goto discard_it; + } + discard = 1; + } +#endif + DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb); + DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type; + + dccp_pr_debug("%8.8s " + "src=%u.%u.%u.%u@%-5d " + "dst=%u.%u.%u.%u@%-5d seq=%llu", + dccp_packet_name(dh->dccph_type), + NIPQUAD(skb->nh.iph->saddr), ntohs(dh->dccph_sport), + NIPQUAD(skb->nh.iph->daddr), ntohs(dh->dccph_dport), + DCCP_SKB_CB(skb)->dccpd_seq); + + if (dccp_packet_without_ack(skb)) { + DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ; + dccp_pr_debug_cat("\n"); + } else { + DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); + dccp_pr_debug_cat(", ack=%llu\n", DCCP_SKB_CB(skb)->dccpd_ack_seq); + } + + /* Step 2: + * Look up flow ID in table and get corresponding socket */ + sk = __inet_lookup(&dccp_hashinfo, + skb->nh.iph->saddr, dh->dccph_sport, + skb->nh.iph->daddr, ntohs(dh->dccph_dport), + inet_iif(skb)); + + /* + * Step 2: + * If no socket ... + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + if (sk == NULL) { + dccp_pr_debug("failed to look up flow ID in table and " + "get corresponding socket\n"); + goto no_dccp_socket; + } + + /* + * Step 2: + * ... or S.state == TIMEWAIT, + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + + if (sk->sk_state == DCCP_TIME_WAIT) { + dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: discard_and_relse\n"); + goto discard_and_relse; + } + + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { + dccp_pr_debug("xfrm4_policy_check failed\n"); + goto discard_and_relse; + } + + if (sk_filter(sk, skb, 0)) { + dccp_pr_debug("sk_filter failed\n"); + goto discard_and_relse; + } + + skb->dev = NULL; + + bh_lock_sock(sk); + rc = 0; + if (!sock_owned_by_user(sk)) + rc = dccp_v4_do_rcv(sk, skb); + else + sk_add_backlog(sk, skb); + bh_unlock_sock(sk); + + sock_put(sk); + return rc; + +no_dccp_socket: + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard_it; + /* + * Step 2: + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + if (dh->dccph_type != DCCP_PKT_RESET) { + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; + dccp_v4_ctl_send_reset(skb); + } + +discard_it: + /* Discard frame. */ + kfree_skb(skb); + return 0; + +discard_and_relse: + sock_put(sk); + goto discard_it; +} + +static int dccp_v4_init_sock(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + static int dccp_ctl_socket_init = 1; + + dccp_options_init(&dp->dccps_options); + + if (dp->dccps_options.dccpo_send_ack_vector) { + dp->dccps_hc_rx_ackpkts = dccp_ackpkts_alloc(DCCP_MAX_ACK_VECTOR_LEN, + GFP_KERNEL); + + if (dp->dccps_hc_rx_ackpkts == NULL) + return -ENOMEM; + } + + /* + * FIXME: We're hardcoding the CCID, and doing this at this point makes + * the listening (master) sock get CCID control blocks, which is not + * necessary, but for now, to not mess with the test userspace apps, + * lets leave it here, later the real solution is to do this in a + * setsockopt(CCIDs-I-want/accept). -acme + */ + if (likely(!dccp_ctl_socket_init)) { + dp->dccps_hc_rx_ccid = ccid_init(dp->dccps_options.dccpo_ccid, sk); + dp->dccps_hc_tx_ccid = ccid_init(dp->dccps_options.dccpo_ccid, sk); + if (dp->dccps_hc_rx_ccid == NULL || + dp->dccps_hc_tx_ccid == NULL) { + ccid_exit(dp->dccps_hc_rx_ccid, sk); + ccid_exit(dp->dccps_hc_tx_ccid, sk); + dccp_ackpkts_free(dp->dccps_hc_rx_ackpkts); + dp->dccps_hc_rx_ackpkts = NULL; + dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; + return -ENOMEM; + } + } else + dccp_ctl_socket_init = 0; + + dccp_init_xmit_timers(sk); + sk->sk_state = DCCP_CLOSED; + dp->dccps_mss_cache = 536; + dp->dccps_role = DCCP_ROLE_UNDEFINED; + + return 0; +} + +int dccp_v4_destroy_sock(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + + /* + * DCCP doesn't use sk_qrite_queue, just sk_send_head + * for retransmissions + */ + if (sk->sk_send_head != NULL) { + kfree_skb(sk->sk_send_head); + sk->sk_send_head = NULL; + } + + /* Clean up a referenced DCCP bind bucket. */ + if (inet_csk(sk)->icsk_bind_hash != NULL) + inet_put_port(&dccp_hashinfo, sk); + + dccp_ackpkts_free(dp->dccps_hc_rx_ackpkts); + dp->dccps_hc_rx_ackpkts = NULL; + ccid_exit(dp->dccps_hc_rx_ccid, sk); + ccid_exit(dp->dccps_hc_tx_ccid, sk); + dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; + + return 0; +} + +static void dccp_v4_reqsk_destructor(struct request_sock *req) +{ + kfree(inet_rsk(req)->opt); +} + +static struct request_sock_ops dccp_request_sock_ops = { + .family = PF_INET, + .obj_size = sizeof(struct dccp_request_sock), + .rtx_syn_ack = dccp_v4_send_response, + .send_ack = dccp_v4_reqsk_send_ack, + .destructor = dccp_v4_reqsk_destructor, + .send_reset = dccp_v4_ctl_send_reset, +}; + +struct proto dccp_v4_prot = { + .name = "DCCP", + .owner = THIS_MODULE, + .close = dccp_close, + .connect = dccp_v4_connect, + .disconnect = dccp_disconnect, + .ioctl = dccp_ioctl, + .init = dccp_v4_init_sock, + .setsockopt = dccp_setsockopt, + .getsockopt = dccp_getsockopt, + .sendmsg = dccp_sendmsg, + .recvmsg = dccp_recvmsg, + .backlog_rcv = dccp_v4_do_rcv, + .hash = dccp_v4_hash, + .unhash = dccp_v4_unhash, + .accept = inet_csk_accept, + .get_port = dccp_v4_get_port, + .shutdown = dccp_shutdown, + .destroy = dccp_v4_destroy_sock, + .orphan_count = &dccp_orphan_count, + .max_header = MAX_DCCP_HEADER, + .obj_size = sizeof(struct dccp_sock), + .rsk_prot = &dccp_request_sock_ops, + .twsk_obj_size = sizeof(struct inet_timewait_sock), /* FIXME! create dccp_timewait_sock */ +}; diff -puN /dev/null net/dccp/Kconfig --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/dccp/Kconfig 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,24 @@ +menu "DCCP Configuration (EXPERIMENTAL)" + depends on INET && EXPERIMENTAL + +config IP_DCCP + tristate "The DCCP Protocol (EXPERIMENTAL)" + ---help--- + Datagram Congestion Control Protocol + + From draft-ietf-dccp-spec-11 . + + The Datagram Congestion Control Protocol (DCCP) is a transport + protocol that implements bidirectional, unicast connections of + congestion-controlled, unreliable datagrams. It should be suitable + for use by applications such as streaming media, Internet telephony, + and on-line games + + To compile this protocol support as a module, choose M here: the + module will be called dccp. + + If in doubt, say N. + +source "net/dccp/ccids/Kconfig" + +endmenu diff -puN /dev/null net/dccp/Makefile --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/dccp/Makefile 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,5 @@ +obj-$(CONFIG_IP_DCCP) += dccp.o + +dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o timer.o + +obj-y += ccids/ diff -puN /dev/null net/dccp/minisocks.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/dccp/minisocks.c 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,199 @@ +/* + * net/dccp/minisocks.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "ccid.h" +#include "dccp.h" + +void dccp_time_wait(struct sock *sk, int state, int timeo) +{ + /* FIXME: Implement */ + dccp_pr_debug("Want to help? Start here\n"); + dccp_set_state(sk, state); +} + +/* This is for handling early-kills of TIME_WAIT sockets. */ +void dccp_tw_deschedule(struct inet_timewait_sock *tw) +{ + dccp_pr_debug("Want to help? Start here\n"); + __inet_twsk_kill(tw, &dccp_hashinfo); +} + +struct sock *dccp_create_openreq_child(struct sock *sk, + const struct request_sock *req, + const struct sk_buff *skb) +{ + /* + * Step 3: Process LISTEN state + * + * // Generate a new socket and switch to that socket + * Set S := new socket for this port pair + */ + struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); + + if (newsk != NULL) { + const struct dccp_request_sock *dreq = dccp_rsk(req); + struct inet_connection_sock *newicsk = inet_csk(sk); + struct dccp_sock *newdp = dccp_sk(newsk); + + newdp->dccps_hc_rx_ackpkts = NULL; + newdp->dccps_role = DCCP_ROLE_SERVER; + newicsk->icsk_rto = TCP_TIMEOUT_INIT; + + if (newdp->dccps_options.dccpo_send_ack_vector) { + newdp->dccps_hc_rx_ackpkts = dccp_ackpkts_alloc(DCCP_MAX_ACK_VECTOR_LEN, + GFP_ATOMIC); + /* + * XXX: We're using the same CCIDs set on the parent, i.e. sk_clone + * copied the master sock and left the CCID pointers for this child, + * that is why we do the __ccid_get calls. + */ + if (unlikely(newdp->dccps_hc_rx_ackpkts == NULL)) + goto out_free; + } + + if (unlikely(ccid_hc_rx_init(newdp->dccps_hc_rx_ccid, newsk) != 0 || + ccid_hc_tx_init(newdp->dccps_hc_tx_ccid, newsk) != 0)) { + dccp_ackpkts_free(newdp->dccps_hc_rx_ackpkts); + ccid_hc_rx_exit(newdp->dccps_hc_rx_ccid, newsk); + ccid_hc_tx_exit(newdp->dccps_hc_tx_ccid, newsk); +out_free: + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + newsk->sk_destruct = NULL; + sk_free(newsk); + return NULL; + } + + __ccid_get(newdp->dccps_hc_rx_ccid); + __ccid_get(newdp->dccps_hc_tx_ccid); + + /* + * Step 3: Process LISTEN state + * + * Choose S.ISS (initial seqno) or set from Init Cookie + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie + */ + + /* See dccp_v4_conn_request */ + newdp->dccps_options.dccpo_sequence_window = req->rcv_wnd; + + newdp->dccps_gar = newdp->dccps_isr = dreq->dreq_isr; + dccp_update_gsr(newsk, dreq->dreq_isr); + + newdp->dccps_iss = dreq->dreq_iss; + dccp_update_gss(newsk, dreq->dreq_iss); + + dccp_init_xmit_timers(newsk); + + DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS); + } + return newsk; +} + +/* + * Process an incoming packet for RESPOND sockets represented + * as an request_sock. + */ +struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct request_sock **prev) +{ + struct sock *child = NULL; + + /* Check for retransmitted REQUEST */ + if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) { + if (after48(DCCP_SKB_CB(skb)->dccpd_seq, dccp_rsk(req)->dreq_isr)) { + struct dccp_request_sock *dreq = dccp_rsk(req); + + dccp_pr_debug("Retransmitted REQUEST\n"); + /* Send another RESPONSE packet */ + dccp_set_seqno(&dreq->dreq_iss, dreq->dreq_iss + 1); + dccp_set_seqno(&dreq->dreq_isr, DCCP_SKB_CB(skb)->dccpd_seq); + req->rsk_ops->rtx_syn_ack(sk, req, NULL); + } + /* Network Duplicate, discard packet */ + return NULL; + } + + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; + + if (dccp_hdr(skb)->dccph_type != DCCP_PKT_ACK && + dccp_hdr(skb)->dccph_type != DCCP_PKT_DATAACK) + goto drop; + + /* Invalid ACK */ + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != dccp_rsk(req)->dreq_iss) { + dccp_pr_debug("Invalid ACK number: ack_seq=%llu, dreq_iss=%llu\n", + DCCP_SKB_CB(skb)->dccpd_ack_seq, dccp_rsk(req)->dreq_iss); + goto drop; + } + + child = dccp_v4_request_recv_sock(sk, skb, req, NULL); + if (child == NULL) + goto listen_overflow; + + /* FIXME: deal with options */ + + inet_csk_reqsk_queue_unlink(sk, req, prev); + inet_csk_reqsk_queue_removed(sk, req); + inet_csk_reqsk_queue_add(sk, req, child); +out: + return child; +listen_overflow: + dccp_pr_debug("listen_overflow!\n"); + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; +drop: + if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET) + req->rsk_ops->send_reset(skb); + + inet_csk_reqsk_queue_drop(sk, req, prev); + goto out; +} + +/* + * Queue segment on the new socket if the new socket is active, + * otherwise we just shortcircuit this and continue with + * the new socket. + */ +int dccp_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb) +{ + int ret = 0; + const int state = child->sk_state; + + if (!sock_owned_by_user(child)) { + ret = dccp_rcv_state_process(child, skb, dccp_hdr(skb), skb->len); + + /* Wakeup parent, send SIGIO */ + if (state == DCCP_RESPOND && child->sk_state != state) + parent->sk_data_ready(parent, 0); + } else { + /* Alas, it is possible again, because we do lookup + * in main socket hash table and lock on listening + * socket does not protect us more. + */ + sk_add_backlog(child, skb); + } + + bh_unlock_sock(child); + sock_put(child); + return ret; +} diff -puN /dev/null net/dccp/options.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/dccp/options.c 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,763 @@ +/* + * net/dccp/options.c + * + * An implementation of the DCCP protocol + * Aristeu Sergio Rozanski Filho + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include + +#include "ccid.h" +#include "dccp.h" + +static void dccp_ackpkts_check_rcv_ackvector(struct dccp_ackpkts *ap, + struct sock *sk, + const u64 ackno, + const unsigned char len, + const unsigned char *vector); + +/* stores the default values for new connection. may be changed with sysctl */ +static const struct dccp_options dccpo_default_values = { + .dccpo_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW, + .dccpo_ccid = DCCPF_INITIAL_CCID, + .dccpo_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR, + .dccpo_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT, +}; + +void dccp_options_init(struct dccp_options *dccpo) +{ + memcpy(dccpo, &dccpo_default_values, sizeof(*dccpo)); +} + +static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len) +{ + u32 value = 0; + + if (len > 3) + value += *bf++ << 24; + if (len > 2) + value += *bf++ << 16; + if (len > 1) + value += *bf++ << 8; + if (len > 0) + value += *bf; + + return value; +} + +int dccp_parse_options(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); +#ifdef DCCP_DEBUG + const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? "CLIENT rx opt: " : + "server rx opt: "; +#endif + const struct dccp_hdr *dh = dccp_hdr(skb); + const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; + unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); + unsigned char *opt_ptr = options; + const unsigned char *opt_end = (unsigned char *)dh + (dh->dccph_doff * 4); + struct dccp_options_received *opt_recv = &dp->dccps_options_received; + unsigned char opt, len; + unsigned char *value; + + memset(opt_recv, 0, sizeof(*opt_recv)); + + while (opt_ptr != opt_end) { + opt = *opt_ptr++; + len = 0; + value = NULL; + + /* Check if this isn't a single byte option */ + if (opt > DCCPO_MAX_RESERVED) { + if (opt_ptr == opt_end) + goto out_invalid_option; + + len = *opt_ptr++; + if (len < 3) + goto out_invalid_option; + /* + * Remove the type and len fields, leaving + * just the value size + */ + len -= 2; + value = opt_ptr; + opt_ptr += len; + + if (opt_ptr > opt_end) + goto out_invalid_option; + } + + switch (opt) { + case DCCPO_PADDING: + break; + case DCCPO_NDP_COUNT: + if (len > 3) + goto out_invalid_option; + + opt_recv->dccpor_ndp = dccp_decode_value_var(value, len); + dccp_pr_debug("%sNDP count=%d\n", debug_prefix, opt_recv->dccpor_ndp); + break; + case DCCPO_ACK_VECTOR_0: + if (len > DCCP_MAX_ACK_VECTOR_LEN) + goto out_invalid_option; + + if (pkt_type == DCCP_PKT_DATA) + continue; + + opt_recv->dccpor_ack_vector_len = len; + opt_recv->dccpor_ack_vector_idx = value - options; + + dccp_pr_debug("%sACK vector 0, len=%d, ack_ackno=%llu\n", + debug_prefix, len, DCCP_SKB_CB(skb)->dccpd_ack_seq); + dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, + value, len); + dccp_ackpkts_check_rcv_ackvector(dp->dccps_hc_rx_ackpkts, sk, + DCCP_SKB_CB(skb)->dccpd_ack_seq, + len, value); + break; + case DCCPO_TIMESTAMP: + if (len != 4) + goto out_invalid_option; + + opt_recv->dccpor_timestamp = ntohl(*(u32 *)value); + + dp->dccps_timestamp_echo = opt_recv->dccpor_timestamp; + dp->dccps_timestamp_time = jiffies; + + dccp_pr_debug("%sTIMESTAMP=%u, ackno=%llu\n", + debug_prefix, opt_recv->dccpor_timestamp, + DCCP_SKB_CB(skb)->dccpd_ack_seq); + break; + case DCCPO_TIMESTAMP_ECHO: + if (len < 4 || len > 8) + goto out_invalid_option; + + opt_recv->dccpor_timestamp_echo = ntohl(*(u32 *)value); + + dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, ackno=%llu, diff=%u\n", + debug_prefix, opt_recv->dccpor_timestamp_echo, + len + 2, DCCP_SKB_CB(skb)->dccpd_ack_seq, + tcp_time_stamp - opt_recv->dccpor_timestamp_echo); + + opt_recv->dccpor_elapsed_time = dccp_decode_value_var(value + 4, len - 4); + dccp_pr_debug("%sTIMESTAMP_ECHO ELAPSED_TIME=%d\n", debug_prefix, + opt_recv->dccpor_elapsed_time); + break; + case DCCPO_ELAPSED_TIME: + if (len > 4) + goto out_invalid_option; + + if (pkt_type == DCCP_PKT_DATA) + continue; + opt_recv->dccpor_elapsed_time = dccp_decode_value_var(value, len); + dccp_pr_debug("%sELAPSED_TIME=%d\n", debug_prefix, + opt_recv->dccpor_elapsed_time); + break; + /* + * From draft-ietf-dccp-spec-11.txt: + * + * Option numbers 128 through 191 are for options sent from the HC- + * Sender to the HC-Receiver; option numbers 192 through 255 are for + * options sent from the HC-Receiver to the HC-Sender. + */ + case 128 ... 191: { + const u16 idx = value - options; + + if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, opt, len, idx, value) != 0) + goto out_invalid_option; + } + break; + case 192 ... 255: { + const u16 idx = value - options; + + if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, opt, len, idx, value) != 0) + goto out_invalid_option; + } + break; + default: + pr_info("DCCP(%p): option %d(len=%d) not implemented, ignoring\n", + sk, opt, len); + break; + } + } + + return 0; + +out_invalid_option: + DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT); + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR; + pr_info("DCCP(%p): invalid option %d, len=%d\n", sk, opt, len); + return -1; +} + +static void dccp_encode_value_var(const u32 value, unsigned char *to, + const unsigned int len) +{ + if (len > 3) + *to++ = (value & 0xFF000000) >> 24; + if (len > 2) + *to++ = (value & 0xFF0000) >> 16; + if (len > 1) + *to++ = (value & 0xFF00) >> 8; + if (len > 0) + *to++ = (value & 0xFF); +} + +static inline int dccp_ndp_len(const int ndp) +{ + return likely(ndp <= 0xFF) ? 1 : ndp <= 0xFFFF ? 2 : 3; +} + +void dccp_insert_option(struct sock *sk, struct sk_buff *skb, + const unsigned char option, + const void *value, const unsigned char len) +{ + unsigned char *to; + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 2 > DCCP_MAX_OPT_LEN) { + LIMIT_NETDEBUG(pr_info("DCCP: packet too small to insert %d option!\n", option)); + return; + } + + DCCP_SKB_CB(skb)->dccpd_opt_len += len + 2; + + to = skb_push(skb, len + 2); + *to++ = option; + *to++ = len + 2; + + memcpy(to, value, len); +} + +EXPORT_SYMBOL_GPL(dccp_insert_option); + +static void dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); + int ndp = dp->dccps_ndp_count; + + if (dccp_non_data_packet(skb)) + ++dp->dccps_ndp_count; + else + dp->dccps_ndp_count = 0; + + if (ndp > 0) { + unsigned char *ptr; + const int ndp_len = dccp_ndp_len(ndp); + const int len = ndp_len + 2; + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) + return; + + DCCP_SKB_CB(skb)->dccpd_opt_len += len; + + ptr = skb_push(skb, len); + *ptr++ = DCCPO_NDP_COUNT; + *ptr++ = len; + dccp_encode_value_var(ndp, ptr, ndp_len); + } +} + +static inline int dccp_elapsed_time_len(const u32 elapsed_time) +{ + return elapsed_time == 0 ? 0 : + elapsed_time <= 0xFF ? 1 : + elapsed_time <= 0xFFFF ? 2 : + elapsed_time <= 0xFFFFFF ? 3 : 4; +} + +void dccp_insert_option_elapsed_time(struct sock *sk, + struct sk_buff *skb, + u32 elapsed_time) +{ +#ifdef DCCP_DEBUG + struct dccp_sock *dp = dccp_sk(sk); + const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? "CLIENT TX opt: " : + "server TX opt: "; +#endif + const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time); + const int len = 2 + elapsed_time_len; + unsigned char *to; + + /* If elapsed_time == 0... */ + if (elapsed_time_len == 2) + return; + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { + LIMIT_NETDEBUG(pr_info("DCCP: packet too small to insert elapsed time!\n")); + return; + } + + DCCP_SKB_CB(skb)->dccpd_opt_len += len; + + to = skb_push(skb, len); + *to++ = DCCPO_ELAPSED_TIME; + *to++ = len; + + dccp_encode_value_var(elapsed_time, to, elapsed_time_len); + + dccp_pr_debug("%sELAPSED_TIME=%u, len=%d, seqno=%llu\n", + debug_prefix, elapsed_time, + len, DCCP_SKB_CB(skb)->dccpd_seq); +} + +EXPORT_SYMBOL(dccp_insert_option_elapsed_time); + +static void dccp_insert_option_ack_vector(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); +#ifdef DCCP_DEBUG + const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? "CLIENT TX opt: " : + "server TX opt: "; +#endif + struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts; + int len = ap->dccpap_buf_vector_len + 2; + const u32 elapsed_time = jiffies_to_usecs(jiffies - ap->dccpap_time) / 10; + unsigned char *to, *from; + + if (elapsed_time != 0) + dccp_insert_option_elapsed_time(sk, skb, elapsed_time); + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { + LIMIT_NETDEBUG(pr_info("DCCP: packet too small to insert ACK Vector!\n")); + return; + } + + /* + * XXX: now we have just one ack vector sent record, so + * we have to wait for it to be cleared. + * + * Of course this is not acceptable, but this is just for + * basic testing now. + */ + if (ap->dccpap_ack_seqno != DCCP_MAX_SEQNO + 1) + return; + + DCCP_SKB_CB(skb)->dccpd_opt_len += len; + + to = skb_push(skb, len); + *to++ = DCCPO_ACK_VECTOR_0; + *to++ = len; + + len = ap->dccpap_buf_vector_len; + from = ap->dccpap_buf + ap->dccpap_buf_head; + + /* Check if buf_head wraps */ + if (ap->dccpap_buf_head + len > ap->dccpap_buf_len) { + const unsigned int tailsize = ap->dccpap_buf_len - ap->dccpap_buf_head; + + memcpy(to, from, tailsize); + to += tailsize; + len -= tailsize; + from = ap->dccpap_buf; + } + + memcpy(to, from, len); + /* + * From draft-ietf-dccp-spec-11.txt: + * + * For each acknowledgement it sends, the HC-Receiver will add an + * acknowledgement record. ack_seqno will equal the HC-Receiver + * sequence number it used for the ack packet; ack_ptr will equal + * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will equal + * buf_nonce. + * + * This implemention uses just one ack record for now. + */ + ap->dccpap_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq; + ap->dccpap_ack_ptr = ap->dccpap_buf_head; + ap->dccpap_ack_ackno = ap->dccpap_buf_ackno; + ap->dccpap_ack_nonce = ap->dccpap_buf_nonce; + ap->dccpap_ack_vector_len = ap->dccpap_buf_vector_len; + + dccp_pr_debug("%sACK Vector 0, len=%d, ack_seqno=%llu, ack_ackno=%llu\n", + debug_prefix, ap->dccpap_ack_vector_len, + ap->dccpap_ack_seqno, ap->dccpap_ack_ackno); +} + +static inline void dccp_insert_option_timestamp(struct sock *sk, struct sk_buff *skb) +{ + const u32 now = htonl(tcp_time_stamp); + dccp_insert_option(sk, skb, DCCPO_TIMESTAMP, &now, sizeof(now)); +} + +static void dccp_insert_option_timestamp_echo(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); +#ifdef DCCP_DEBUG + const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? "CLIENT TX opt: " : + "server TX opt: "; +#endif + u32 tstamp_echo; + const u32 elapsed_time = jiffies_to_usecs(jiffies - dp->dccps_timestamp_time) / 10; + const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time); + const int len = 6 + elapsed_time_len; + unsigned char *to; + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { + LIMIT_NETDEBUG(pr_info("DCCP: packet too small to insert timestamp echo!\n")); + return; + } + + DCCP_SKB_CB(skb)->dccpd_opt_len += len; + + to = skb_push(skb, len); + *to++ = DCCPO_TIMESTAMP_ECHO; + *to++ = len; + + tstamp_echo = htonl(dp->dccps_timestamp_echo); + memcpy(to, &tstamp_echo, 4); + to += 4; + dccp_encode_value_var(elapsed_time, to, elapsed_time_len); + + dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, seqno=%llu\n", + debug_prefix, dp->dccps_timestamp_echo, + len, DCCP_SKB_CB(skb)->dccpd_seq); + + dp->dccps_timestamp_echo = 0; + dp->dccps_timestamp_time = 0; +} + +void dccp_insert_options(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); + + DCCP_SKB_CB(skb)->dccpd_opt_len = 0; + + if (dp->dccps_options.dccpo_send_ndp_count) + dccp_insert_option_ndp(sk, skb); + + if (!dccp_packet_without_ack(skb)) { + if (dp->dccps_options.dccpo_send_ack_vector && + dp->dccps_hc_rx_ackpkts->dccpap_buf_ackno != DCCP_MAX_SEQNO + 1) + dccp_insert_option_ack_vector(sk, skb); + + dccp_insert_option_timestamp(sk, skb); + if (dp->dccps_timestamp_echo != 0) + dccp_insert_option_timestamp_echo(sk, skb); + } + + ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb); + ccid_hc_tx_insert_options(dp->dccps_hc_tx_ccid, sk, skb); + + /* XXX: insert other options when appropriate */ + + if (DCCP_SKB_CB(skb)->dccpd_opt_len != 0) { + /* The length of all options has to be a multiple of 4 */ + int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4; + + if (padding != 0) { + padding = 4 - padding; + memset(skb_push(skb, padding), 0, padding); + DCCP_SKB_CB(skb)->dccpd_opt_len += padding; + } + } +} + +struct dccp_ackpkts *dccp_ackpkts_alloc(unsigned int len, int priority) +{ + struct dccp_ackpkts *ap = kmalloc(sizeof(*ap) + len, priority); + + if (ap != NULL) { +#ifdef DCCP_DEBUG + memset(ap->dccpap_buf, 0xFF, len); +#endif + ap->dccpap_buf_len = len; + ap->dccpap_buf_head = ap->dccpap_buf_tail = ap->dccpap_buf_len - 1; + ap->dccpap_buf_ackno = ap->dccpap_ack_ackno = ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1; + ap->dccpap_buf_nonce = ap->dccpap_buf_nonce = 0; + ap->dccpap_ack_ptr = 0; + ap->dccpap_time = 0; + ap->dccpap_buf_vector_len = ap->dccpap_ack_vector_len = 0; + } + + return ap; +} + +void dccp_ackpkts_free(struct dccp_ackpkts *ap) +{ + if (ap != NULL) { +#ifdef DCCP_DEBUG + memset(ap, 0xFF, sizeof(*ap) + ap->dccpap_buf_len); +#endif + kfree(ap); + } +} + +static inline u8 dccp_ackpkts_state(const struct dccp_ackpkts *ap, + const unsigned int index) +{ + return ap->dccpap_buf[index] & DCCP_ACKPKTS_STATE_MASK; +} + +static inline u8 dccp_ackpkts_len(const struct dccp_ackpkts *ap, + const unsigned int index) +{ + return ap->dccpap_buf[index] & DCCP_ACKPKTS_LEN_MASK; +} + +/* + * If several packets are missing, the HC-Receiver may prefer to enter multiple + * bytes with run length 0, rather than a single byte with a larger run length; + * this simplifies table updates if one of the missing packets arrives. + */ +static inline int dccp_ackpkts_set_buf_head_state(struct dccp_ackpkts *ap, + const unsigned int packets, + const unsigned char state) +{ + unsigned int gap; + signed long new_head; + + if (ap->dccpap_buf_vector_len + packets > ap->dccpap_buf_len) + return -ENOBUFS; + + gap = packets - 1; + new_head = ap->dccpap_buf_head - packets; + + if (new_head < 0) { + if (gap > 0) { + memset(ap->dccpap_buf, DCCP_ACKPKTS_STATE_NOT_RECEIVED, + gap + new_head + 1); + gap = -new_head; + } + new_head += ap->dccpap_buf_len; + } + + ap->dccpap_buf_head = new_head; + + if (gap > 0) + memset(ap->dccpap_buf + ap->dccpap_buf_head + 1, + DCCP_ACKPKTS_STATE_NOT_RECEIVED, gap); + + ap->dccpap_buf[ap->dccpap_buf_head] = state; + ap->dccpap_buf_vector_len += packets; + return 0; +} + +/* + * Implements the draft-ietf-dccp-spec-11.txt Appendix A + */ +int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state) +{ + /* + * Check at the right places if the buffer is full, if it is, tell the + * caller to start dropping packets till the HC-Sender acks our ACK + * vectors, when we will free up space in dccpap_buf. + * + * We may well decide to do buffer compression, etc, but for now lets + * just drop. + * + * From Appendix A: + * + * Of course, the circular buffer may overflow, either when the HC- + * Sender is sending data at a very high rate, when the HC-Receiver's + * acknowledgements are not reaching the HC-Sender, or when the HC- + * Sender is forgetting to acknowledge those acks (so the HC-Receiver + * is unable to clean up old state). In this case, the HC-Receiver + * should either compress the buffer (by increasing run lengths when + * possible), transfer its state to a larger buffer, or, as a last + * resort, drop all received packets, without processing them + * whatsoever, until its buffer shrinks again. + */ + + /* See if this is the first ackno being inserted */ + if (ap->dccpap_buf_vector_len == 0) { + ap->dccpap_buf[ap->dccpap_buf_head] = state; + ap->dccpap_buf_vector_len = 1; + } else if (after48(ackno, ap->dccpap_buf_ackno)) { + const u64 delta = dccp_delta_seqno(ap->dccpap_buf_ackno, ackno); + + /* + * Look if the state of this packet is the same as the previous ackno + * and if so if we can bump the head len. + */ + if (delta == 1 && + dccp_ackpkts_state(ap, ap->dccpap_buf_head) == state && + dccp_ackpkts_len(ap, ap->dccpap_buf_head) < DCCP_ACKPKTS_LEN_MASK) + ap->dccpap_buf[ap->dccpap_buf_head]++; + else if (dccp_ackpkts_set_buf_head_state(ap, delta, state)) + return -ENOBUFS; + } else { + /* + * A.1.2. Old Packets + * + * When a packet with Sequence Number S arrives, and S <= buf_ackno, + * the HC-Receiver will scan the table for the byte corresponding to S. + * (Indexing structures could reduce the complexity of this scan.) + */ + u64 delta = dccp_delta_seqno(ackno, ap->dccpap_buf_ackno); + unsigned int index = ap->dccpap_buf_head; + + while (1) { + const u8 len = dccp_ackpkts_len(ap, index); + const u8 state = dccp_ackpkts_state(ap, index); + /* + * valid packets not yet in dccpap_buf have a reserved entry, with + * a len equal to 0 + */ + if (state == DCCP_ACKPKTS_STATE_NOT_RECEIVED && + len == 0 && delta == 0) { /* Found our reserved seat! */ + dccp_pr_debug("Found %llu reserved seat!\n", ackno); + ap->dccpap_buf[index] = state; + goto out; + } + /* len == 0 means one packet */ + if (delta < len + 1) + goto out_duplicate; + + delta -= len + 1; + if (++index == ap->dccpap_buf_len) + index = 0; + } + } + + ap->dccpap_buf_ackno = ackno; + ap->dccpap_time = jiffies; +out: + dccp_pr_debug(""); + dccp_ackpkts_print(ap); + return 0; + +out_duplicate: + /* Duplicate packet */ + dccp_pr_debug("Received a dup or already considered lost packet: %llu\n", ackno); + return -EILSEQ; +} + +#ifdef DCCP_DEBUG +void dccp_ackvector_print(const u64 ackno, const unsigned char *vector, int len) +{ + if (!dccp_debug) + return; + + printk("ACK vector len=%d, ackno=%llu |", len, ackno); + + while (len--) { + const u8 state = (*vector & DCCP_ACKPKTS_STATE_MASK) >> 6; + const u8 rl = (*vector & DCCP_ACKPKTS_LEN_MASK); + + printk("%d,%d|", state, rl); + ++vector; + } + + printk("\n"); +} + +void dccp_ackpkts_print(const struct dccp_ackpkts *ap) +{ + dccp_ackvector_print(ap->dccpap_buf_ackno, + ap->dccpap_buf + ap->dccpap_buf_head, + ap->dccpap_buf_vector_len); +} +#endif + +static void dccp_ackpkts_trow_away_ack_record(struct dccp_ackpkts *ap) +{ + /* + * As we're keeping track of the ack vector size + * (dccpap_buf_vector_len) and the sent ack vector size + * (dccpap_ack_vector_len) we don't need dccpap_buf_tail at all, but + * keep this code here as in the future we'll implement a vector of ack + * records, as suggested in draft-ietf-dccp-spec-11.txt Appendix A. -acme + */ +#if 0 + ap->dccpap_buf_tail = ap->dccpap_ack_ptr + 1; + if (ap->dccpap_buf_tail >= ap->dccpap_buf_len) + ap->dccpap_buf_tail -= ap->dccpap_buf_len; +#endif + ap->dccpap_buf_vector_len -= ap->dccpap_ack_vector_len; +} + +void dccp_ackpkts_check_rcv_ackno(struct dccp_ackpkts *ap, struct sock *sk, + u64 ackno) +{ + /* Check if we actually sent an ACK vector */ + if (ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1) + return; + + if (ackno == ap->dccpap_ack_seqno) { +#ifdef DCCP_DEBUG + struct dccp_sock *dp = dccp_sk(sk); + const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? "CLIENT rx ack: " : + "server rx ack: "; +#endif + dccp_pr_debug("%sACK packet 0, len=%d, ack_seqno=%llu, ack_ackno=%llu, ACKED!\n", + debug_prefix, 1, + ap->dccpap_ack_seqno, ap->dccpap_ack_ackno); + dccp_ackpkts_trow_away_ack_record(ap); + ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1; + } +} + +static void dccp_ackpkts_check_rcv_ackvector(struct dccp_ackpkts *ap, + struct sock *sk, u64 ackno, + const unsigned char len, + const unsigned char *vector) +{ + unsigned char i; + + /* Check if we actually sent an ACK vector */ + if (ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1) + return; + /* + * We're in the receiver half connection, so if the received an ACK vector + * ackno (e.g. 50) before dccpap_ack_seqno (e.g. 52), we're not interested. + * + * Extra explanation with example: + * + * if we received an ACK vector with ackno 50, it can only be acking + * 50, 49, 48, etc, not 52 (the seqno for the ACK vector we sent). + */ + // dccp_pr_debug("is %llu < %llu? ", ackno, ap->dccpap_ack_seqno); + if (before48(ackno, ap->dccpap_ack_seqno)) { + // dccp_pr_debug_cat("yes\n"); + return; + } + // dccp_pr_debug_cat("no\n"); + + i = len; + while (i--) { + const u8 rl = (*vector & DCCP_ACKPKTS_LEN_MASK); + u64 ackno_end_rl; + + dccp_set_seqno(&ackno_end_rl, ackno - rl); + + // dccp_pr_debug("is %llu <= %llu <= %llu? ", ackno_end_rl, ap->dccpap_ack_seqno, ackno); + if (between48(ap->dccpap_ack_seqno, ackno_end_rl, ackno)) { + const u8 state = (*vector & DCCP_ACKPKTS_STATE_MASK) >> 6; + // dccp_pr_debug_cat("yes\n"); + + if (state != DCCP_ACKPKTS_STATE_NOT_RECEIVED) { +#ifdef DCCP_DEBUG + struct dccp_sock *dp = dccp_sk(sk); + const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? "CLIENT rx ack: " : + "server rx ack: "; +#endif + dccp_pr_debug("%sACK vector 0, len=%d, ack_seqno=%llu, ack_ackno=%llu, ACKED!\n", + debug_prefix, len, + ap->dccpap_ack_seqno, ap->dccpap_ack_ackno); + dccp_ackpkts_trow_away_ack_record(ap); + } + /* + * If dccpap_ack_seqno was not received, no problem we'll + * send another ACK vector. + */ + ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1; + break; + } + // dccp_pr_debug_cat("no\n"); + + dccp_set_seqno(&ackno, ackno_end_rl - 1); + ++vector; + } +} diff -puN /dev/null net/dccp/output.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/dccp/output.c 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,409 @@ +/* + * net/dccp/output.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include + +#include "ccid.h" +#include "dccp.h" + +static inline void dccp_event_ack_sent(struct sock *sk) +{ + inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); +} + +/* + * All SKB's seen here are completely headerless. It is our + * job to build the DCCP header, and pass the packet down to + * IP so it can do the same plus pass the packet off to the + * device. + */ +int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) +{ + if (likely(skb != NULL)) { + const const struct inet_sock *inet = inet_sk(sk); + struct dccp_sock *dp = dccp_sk(sk); + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); + struct dccp_hdr *dh; + /* XXX For now we're using only 48 bits sequence numbers */ + const int dccp_header_size = sizeof(*dh) + + sizeof(struct dccp_hdr_ext) + + dccp_packet_hdr_len(dcb->dccpd_type); + int err, set_ack = 1; + u64 ackno = dp->dccps_gsr; + + /* + * FIXME: study DCCP_PKT_SYNC[ACK] to see what is the right thing + * to do here... + */ + dccp_inc_seqno(&dp->dccps_gss); + + dcb->dccpd_seq = dp->dccps_gss; + dccp_insert_options(sk, skb); + + switch (dcb->dccpd_type) { + case DCCP_PKT_DATA: + set_ack = 0; + break; + case DCCP_PKT_SYNC: + case DCCP_PKT_SYNCACK: + ackno = dcb->dccpd_seq; + break; + } + + skb->h.raw = skb_push(skb, dccp_header_size); + dh = dccp_hdr(skb); + /* Data packets are not cloned as they are never retransmitted */ + if (skb_cloned(skb)) + skb_set_owner_w(skb, sk); + + /* Build DCCP header and checksum it. */ + memset(dh, 0, dccp_header_size); + dh->dccph_type = dcb->dccpd_type; + dh->dccph_sport = inet->sport; + dh->dccph_dport = inet->dport; + dh->dccph_doff = (dccp_header_size + dcb->dccpd_opt_len) / 4; + dh->dccph_ccval = dcb->dccpd_ccval; + /* XXX For now we're using only 48 bits sequence numbers */ + dh->dccph_x = 1; + + dp->dccps_awh = dp->dccps_gss; + dccp_hdr_set_seq(dh, dp->dccps_gss); + if (set_ack) + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), ackno); + + switch (dcb->dccpd_type) { + case DCCP_PKT_REQUEST: + dccp_hdr_request(skb)->dccph_req_service = dcb->dccpd_service; + break; + case DCCP_PKT_RESET: + dccp_hdr_reset(skb)->dccph_reset_code = dcb->dccpd_reset_code; + break; + } + + dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr, + inet->daddr); + + if (dcb->dccpd_type == DCCP_PKT_ACK || + dcb->dccpd_type == DCCP_PKT_DATAACK) + dccp_event_ack_sent(sk); + + DCCP_INC_STATS(DCCP_MIB_OUTSEGS); + + err = ip_queue_xmit(skb, 0); + if (err <= 0) + return err; + + /* NET_XMIT_CN is special. It does not guarantee, + * that this packet is lost. It tells that device + * is about to start to drop packets or already + * drops some packets of the same priority and + * invokes us to send less aggressively. + */ + return err == NET_XMIT_CN ? 0 : err; + } + return -ENOBUFS; +} + +unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) +{ + struct dccp_sock *dp = dccp_sk(sk); + int mss_now; + + /* + * FIXME: we really should be using the af_specific thing to support IPv6. + * mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext); + */ + mss_now = pmtu - sizeof(struct iphdr) - sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext); + + /* Now subtract optional transport overhead */ + mss_now -= dp->dccps_ext_header_len; + + /* + * FIXME: this should come from the CCID infrastructure, where, say, + * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets + * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED + * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to + * make it a multiple of 4 + */ + + mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; + + /* And store cached results */ + dp->dccps_pmtu_cookie = pmtu; + dp->dccps_mss_cache = mss_now; + + return mss_now; +} + +int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb) +{ + if (inet_sk_rebuild_header(sk) != 0) + return -EHOSTUNREACH; /* Routing failure or similar. */ + + return dccp_transmit_skb(sk, (skb_cloned(skb) ? + pskb_copy(skb, GFP_ATOMIC): + skb_clone(skb, GFP_ATOMIC))); +} + +struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, + struct request_sock *req) +{ + struct dccp_hdr *dh; + const int dccp_header_size = sizeof(struct dccp_hdr) + + sizeof(struct dccp_hdr_ext) + + sizeof(struct dccp_hdr_response); + struct sk_buff *skb = sock_wmalloc(sk, MAX_HEADER + DCCP_MAX_OPT_LEN + + dccp_header_size, 1, + GFP_ATOMIC); + if (skb == NULL) + return NULL; + + /* Reserve space for headers. */ + skb_reserve(skb, MAX_HEADER + DCCP_MAX_OPT_LEN + dccp_header_size); + + skb->dst = dst_clone(dst); + skb->csum = 0; + + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; + DCCP_SKB_CB(skb)->dccpd_seq = dccp_rsk(req)->dreq_iss; + dccp_insert_options(sk, skb); + + skb->h.raw = skb_push(skb, dccp_header_size); + + dh = dccp_hdr(skb); + memset(dh, 0, dccp_header_size); + + dh->dccph_sport = inet_sk(sk)->sport; + dh->dccph_dport = inet_rsk(req)->rmt_port; + dh->dccph_doff = (dccp_header_size + DCCP_SKB_CB(skb)->dccpd_opt_len) / 4; + dh->dccph_type = DCCP_PKT_RESPONSE; + dh->dccph_x = 1; + dccp_hdr_set_seq(dh, dccp_rsk(req)->dreq_iss); + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dccp_rsk(req)->dreq_isr); + + dh->dccph_checksum = dccp_v4_checksum(skb, inet_rsk(req)->loc_addr, + inet_rsk(req)->rmt_addr); + + DCCP_INC_STATS(DCCP_MIB_OUTSEGS); + return skb; +} + +struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst, + const enum dccp_reset_codes code) + +{ + struct dccp_hdr *dh; + struct dccp_sock *dp = dccp_sk(sk); + const int dccp_header_size = sizeof(struct dccp_hdr) + + sizeof(struct dccp_hdr_ext) + + sizeof(struct dccp_hdr_reset); + struct sk_buff *skb = sock_wmalloc(sk, MAX_HEADER + DCCP_MAX_OPT_LEN + + dccp_header_size, 1, + GFP_ATOMIC); + if (skb == NULL) + return NULL; + + /* Reserve space for headers. */ + skb_reserve(skb, MAX_HEADER + DCCP_MAX_OPT_LEN + dccp_header_size); + + skb->dst = dst_clone(dst); + skb->csum = 0; + + dccp_inc_seqno(&dp->dccps_gss); + + DCCP_SKB_CB(skb)->dccpd_reset_code = code; + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESET; + DCCP_SKB_CB(skb)->dccpd_seq = dp->dccps_gss; + dccp_insert_options(sk, skb); + + skb->h.raw = skb_push(skb, dccp_header_size); + + dh = dccp_hdr(skb); + memset(dh, 0, dccp_header_size); + + dh->dccph_sport = inet_sk(sk)->sport; + dh->dccph_dport = inet_sk(sk)->dport; + dh->dccph_doff = (dccp_header_size + DCCP_SKB_CB(skb)->dccpd_opt_len) / 4; + dh->dccph_type = DCCP_PKT_RESET; + dh->dccph_x = 1; + dccp_hdr_set_seq(dh, dp->dccps_gss); + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dp->dccps_gsr); + + dccp_hdr_reset(skb)->dccph_reset_code = code; + + dh->dccph_checksum = dccp_v4_checksum(skb, inet_sk(sk)->saddr, + inet_sk(sk)->daddr); + + DCCP_INC_STATS(DCCP_MIB_OUTSEGS); + return skb; +} + +/* + * Do all connect socket setups that can be done AF independent. + */ +static inline void dccp_connect_init(struct sock *sk) +{ + struct dst_entry *dst = __sk_dst_get(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + sk->sk_err = 0; + sock_reset_flag(sk, SOCK_DONE); + + dccp_sync_mss(sk, dst_mtu(dst)); + + /* + * FIXME: set dp->{dccps_swh,dccps_swl}, with + * something like dccp_inc_seq + */ + + icsk->icsk_retransmits = 0; +} + +int dccp_connect(struct sock *sk) +{ + struct sk_buff *skb; + struct inet_connection_sock *icsk = inet_csk(sk); + + dccp_connect_init(sk); + + skb = alloc_skb(MAX_DCCP_HEADER + 15, sk->sk_allocation); + if (unlikely(skb == NULL)) + return -ENOBUFS; + + /* Reserve space for headers. */ + skb_reserve(skb, MAX_DCCP_HEADER); + + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST; + /* FIXME: set service to something meaningful, coming + * from userspace*/ + DCCP_SKB_CB(skb)->dccpd_service = 0; + skb->csum = 0; + skb_set_owner_w(skb, sk); + + BUG_TRAP(sk->sk_send_head == NULL); + sk->sk_send_head = skb; + dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)); + DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); + + /* Timer for repeating the REQUEST until an answer. */ + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); + return 0; +} + +void dccp_send_ack(struct sock *sk) +{ + /* If we have been reset, we may not send again. */ + if (sk->sk_state != DCCP_CLOSED) { + struct sk_buff *skb = alloc_skb(MAX_DCCP_HEADER, GFP_ATOMIC); + + if (skb == NULL) { + inet_csk_schedule_ack(sk); + inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX, TCP_RTO_MAX); + return; + } + + /* Reserve space for headers */ + skb_reserve(skb, MAX_DCCP_HEADER); + skb->csum = 0; + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_ACK; + skb_set_owner_w(skb, sk); + dccp_transmit_skb(sk, skb); + } +} + +EXPORT_SYMBOL_GPL(dccp_send_ack); + +void dccp_send_delayed_ack(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + /* + * FIXME: tune this timer. elapsed time fixes the skew, so no problem + * with using 2s, and active senders also piggyback the ACK into a + * DATAACK packet, so this is really for quiescent senders. + */ + unsigned long timeout = jiffies + 2 * HZ; + + /* Use new timeout only if there wasn't a older one earlier. */ + if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { + /* If delack timer was blocked or is about to expire, + * send ACK now. + * + * FIXME: check the "about to expire" part + */ + if (icsk->icsk_ack.blocked) { + dccp_send_ack(sk); + return; + } + + if (!time_before(timeout, icsk->icsk_ack.timeout)) + timeout = icsk->icsk_ack.timeout; + } + icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; + icsk->icsk_ack.timeout = timeout; + sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); +} + +void dccp_send_sync(struct sock *sk, u64 seq) +{ + /* + * We are not putting this on the write queue, so + * dccp_transmit_skb() will set the ownership to this + * sock. + */ + struct sk_buff *skb = alloc_skb(MAX_DCCP_HEADER, GFP_ATOMIC); + + if (skb == NULL) + /* FIXME: how to make sure the sync is sent? */ + return; + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, MAX_DCCP_HEADER); + skb->csum = 0; + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_SYNC; + DCCP_SKB_CB(skb)->dccpd_seq = seq; + + skb_set_owner_w(skb, sk); + dccp_transmit_skb(sk, skb); +} + +/* Send a DCCP_PKT_CLOSE/CLOSEREQ. The caller locks the socket for us. This cannot be + * allowed to fail queueing a DCCP_PKT_CLOSE/CLOSEREQ frame under any circumstances. + */ +void dccp_send_close(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct sk_buff *skb; + + /* Socket is locked, keep trying until memory is available. */ + for (;;) { + skb = alloc_skb(sk->sk_prot->max_header, GFP_KERNEL); + if (skb != NULL) + break; + yield(); + } + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, sk->sk_prot->max_header); + skb->csum = 0; + DCCP_SKB_CB(skb)->dccpd_type = dp->dccps_role == DCCP_ROLE_CLIENT ? DCCP_PKT_CLOSE : DCCP_PKT_CLOSEREQ; + + skb_set_owner_w(skb, sk); + dccp_transmit_skb(sk, skb); + + ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk); + ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk); +} diff -puN /dev/null net/dccp/proto.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/dccp/proto.c 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,818 @@ +/* + * net/dccp/proto.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "ccid.h" +#include "dccp.h" + +DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics); + +atomic_t dccp_orphan_count = ATOMIC_INIT(0); + +static struct net_protocol dccp_protocol = { + .handler = dccp_v4_rcv, + .err_handler = dccp_v4_err, +}; + +const char *dccp_packet_name(const int type) +{ + static const char *dccp_packet_names[] = { + [DCCP_PKT_REQUEST] = "REQUEST", + [DCCP_PKT_RESPONSE] = "RESPONSE", + [DCCP_PKT_DATA] = "DATA", + [DCCP_PKT_ACK] = "ACK", + [DCCP_PKT_DATAACK] = "DATAACK", + [DCCP_PKT_CLOSEREQ] = "CLOSEREQ", + [DCCP_PKT_CLOSE] = "CLOSE", + [DCCP_PKT_RESET] = "RESET", + [DCCP_PKT_SYNC] = "SYNC", + [DCCP_PKT_SYNCACK] = "SYNCACK", + }; + + if (type >= DCCP_NR_PKT_TYPES) + return "INVALID"; + else + return dccp_packet_names[type]; +} + +EXPORT_SYMBOL_GPL(dccp_packet_name); + +const char *dccp_state_name(const int state) +{ + static char *dccp_state_names[] = { + [DCCP_OPEN] = "OPEN", + [DCCP_REQUESTING] = "REQUESTING", + [DCCP_PARTOPEN] = "PARTOPEN", + [DCCP_LISTEN] = "LISTEN", + [DCCP_RESPOND] = "RESPOND", + [DCCP_CLOSING] = "CLOSING", + [DCCP_TIME_WAIT] = "TIME_WAIT", + [DCCP_CLOSED] = "CLOSED", + }; + + if (state >= DCCP_MAX_STATES) + return "INVALID STATE!"; + else + return dccp_state_names[state]; +} + +EXPORT_SYMBOL_GPL(dccp_state_name); + +static inline int dccp_listen_start(struct sock *sk) +{ + dccp_sk(sk)->dccps_role = DCCP_ROLE_LISTEN; + return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE); +} + +int dccp_disconnect(struct sock *sk, int flags) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_sock *inet = inet_sk(sk); + int err = 0; + const int old_state = sk->sk_state; + + if (old_state != DCCP_CLOSED) + dccp_set_state(sk, DCCP_CLOSED); + + /* ABORT function of RFC793 */ + if (old_state == DCCP_LISTEN) { + inet_csk_listen_stop(sk); + /* FIXME: do the active reset thing */ + } else if (old_state == DCCP_REQUESTING) + sk->sk_err = ECONNRESET; + + dccp_clear_xmit_timers(sk); + __skb_queue_purge(&sk->sk_receive_queue); + if (sk->sk_send_head != NULL) { + __kfree_skb(sk->sk_send_head); + sk->sk_send_head = NULL; + } + + inet->dport = 0; + + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); + + sk->sk_shutdown = 0; + sock_reset_flag(sk, SOCK_DONE); + + icsk->icsk_backoff = 0; + inet_csk_delack_init(sk); + __sk_dst_reset(sk); + + BUG_TRAP(!inet->num || icsk->icsk_bind_hash); + + sk->sk_error_report(sk); + return err; +} + +int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + dccp_pr_debug("entry\n"); + return -ENOIOCTLCMD; +} + +int dccp_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + dccp_pr_debug("entry\n"); + + if (level != SOL_DCCP) + return ip_setsockopt(sk, level, optname, optval, optlen); + + return -EOPNOTSUPP; +} + +int dccp_getsockopt(struct sock *sk, int level, int optname, + char *optval, int *optlen) +{ + dccp_pr_debug("entry\n"); + + if (level != SOL_DCCP) + return ip_getsockopt(sk, level, optname, optval, optlen); + + return -EOPNOTSUPP; +} + +int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len) +{ + const struct dccp_sock *dp = dccp_sk(sk); + const int flags = msg->msg_flags; + const int noblock = flags & MSG_DONTWAIT; + struct sk_buff *skb; + int rc, size; + long timeo; + + if (len > dp->dccps_mss_cache) + return -EMSGSIZE; + + lock_sock(sk); + + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + + /* + * We have to use sk_stream_wait_connect here to set sk_write_pending, + * so that the trick in dccp_rcv_request_sent_state_process. + */ + /* Wait for a connection to finish. */ + if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN | DCCPF_CLOSING)) + if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0) + goto out_err; + + size = sk->sk_prot->max_header + len; + release_sock(sk); + skb = sock_alloc_send_skb(sk, size, noblock, &rc); + lock_sock(sk); + + if (skb == NULL) + goto out_release; + + skb_reserve(skb, sk->sk_prot->max_header); + rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); + if (rc == 0) { + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); + const struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts; + long delay; + + /* + * XXX: This is just to match the Waikato tree CA interaction + * points, after the CCID3 code is stable and I have a better + * understanding of behaviour I'll change this to look more like + * TCP. + */ + while (1) { + rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, + skb, len, &delay); + if (rc == 0) + break; + if (rc != -EAGAIN) + goto out_discard; + if (delay > timeo) + goto out_discard; + release_sock(sk); + delay = schedule_timeout(delay); + lock_sock(sk); + timeo -= delay; + if (signal_pending(current)) + goto out_interrupted; + rc = -EPIPE; + if (!(sk->sk_state == DCCP_PARTOPEN || sk->sk_state == DCCP_OPEN)) + goto out_discard; + } + + if (sk->sk_state == DCCP_PARTOPEN) { + /* See 8.1.5. Handshake Completion */ + inet_csk_schedule_ack(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); + dcb->dccpd_type = DCCP_PKT_DATAACK; + /* FIXME: we really should have a dccps_ack_pending or use icsk */ + } else if (inet_csk_ack_scheduled(sk) || + (dp->dccps_options.dccpo_send_ack_vector && + ap->dccpap_buf_ackno != DCCP_MAX_SEQNO + 1 && + ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1)) + dcb->dccpd_type = DCCP_PKT_DATAACK; + else + dcb->dccpd_type = DCCP_PKT_DATA; + dccp_transmit_skb(sk, skb); + ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len); + } else { +out_discard: + kfree_skb(skb); + } +out_release: + release_sock(sk); + return rc ? : len; +out_err: + rc = sk_stream_error(sk, flags, rc); + goto out_release; +out_interrupted: + rc = sock_intr_errno(timeo); + goto out_discard; +} + +EXPORT_SYMBOL(dccp_sendmsg); + +int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int nonblock, int flags, int *addr_len) +{ + const struct dccp_hdr *dh; + int copied = 0; + unsigned long used; + int err; + int target; /* Read at least this many bytes */ + long timeo; + + lock_sock(sk); + + err = -ENOTCONN; + if (sk->sk_state == DCCP_LISTEN) + goto out; + + timeo = sock_rcvtimeo(sk, nonblock); + + /* Urgent data needs to be handled specially. */ + if (flags & MSG_OOB) + goto recv_urg; + + /* FIXME */ +#if 0 + seq = &tp->copied_seq; + if (flags & MSG_PEEK) { + peek_seq = tp->copied_seq; + seq = &peek_seq; + } +#endif + + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + + do { + struct sk_buff *skb; + u32 offset; + + /* FIXME */ +#if 0 + /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ + if (tp->urg_data && tp->urg_seq == *seq) { + if (copied) + break; + if (signal_pending(current)) { + copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; + break; + } + } +#endif + + /* Next get a buffer. */ + + skb = skb_peek(&sk->sk_receive_queue); + do { + if (!skb) + break; + + offset = 0; + dh = dccp_hdr(skb); + + if (dh->dccph_type == DCCP_PKT_DATA || + dh->dccph_type == DCCP_PKT_DATAACK) + goto found_ok_skb; + + if (dh->dccph_type == DCCP_PKT_RESET || + dh->dccph_type == DCCP_PKT_CLOSE) { + dccp_pr_debug("found fin ok!\n"); + goto found_fin_ok; + } + dccp_pr_debug("packet_type=%s\n", dccp_packet_name(dh->dccph_type)); + BUG_TRAP(flags & MSG_PEEK); + skb = skb->next; + } while (skb != (struct sk_buff *)&sk->sk_receive_queue); + + /* Well, if we have backlog, try to process it now yet. */ + if (copied >= target && !sk->sk_backlog.tail) + break; + + if (copied) { + if (sk->sk_err || + sk->sk_state == DCCP_CLOSED || + (sk->sk_shutdown & RCV_SHUTDOWN) || + !timeo || + signal_pending(current) || + (flags & MSG_PEEK)) + break; + } else { + if (sock_flag(sk, SOCK_DONE)) + break; + + if (sk->sk_err) { + copied = sock_error(sk); + break; + } + + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + + if (sk->sk_state == DCCP_CLOSED) { + if (!sock_flag(sk, SOCK_DONE)) { + /* This occurs when user tries to read + * from never connected socket. + */ + copied = -ENOTCONN; + break; + } + break; + } + + if (!timeo) { + copied = -EAGAIN; + break; + } + + if (signal_pending(current)) { + copied = sock_intr_errno(timeo); + break; + } + } + + /* FIXME: cleanup_rbuf(sk, copied); */ + + if (copied >= target) { + /* Do not sleep, just process backlog. */ + release_sock(sk); + lock_sock(sk); + } else + sk_wait_data(sk, &timeo); + + continue; + + found_ok_skb: + /* Ok so how much can we use? */ + used = skb->len - offset; + if (len < used) + used = len; + + if (!(flags & MSG_TRUNC)) { + err = skb_copy_datagram_iovec(skb, offset, + msg->msg_iov, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } + } + + copied += used; + len -= used; + + /* FIXME: tcp_rcv_space_adjust(sk); */ + +//skip_copy: + if (used + offset < skb->len) + continue; + + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); + continue; + found_fin_ok: + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); + break; + + } while (len > 0); + + /* According to UNIX98, msg_name/msg_namelen are ignored + * on connected socket. I was just happy when found this 8) --ANK + */ + + /* Clean up data we have read: This will do ACK frames. */ + /* FIXME: cleanup_rbuf(sk, copied); */ + + release_sock(sk); + return copied; + +out: + release_sock(sk); + return err; + +recv_urg: + /* FIXME: err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len); */ + goto out; +} + +static int inet_dccp_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + unsigned char old_state; + int err; + + lock_sock(sk); + + err = -EINVAL; + if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP) + goto out; + + old_state = sk->sk_state; + if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) + goto out; + + /* Really, if the socket is already in listen state + * we can only allow the backlog to be adjusted. + */ + if (old_state != DCCP_LISTEN) { + /* + * FIXME: here it probably should be sk->sk_prot->listen_start + * see tcp_listen_start + */ + err = dccp_listen_start(sk); + if (err) + goto out; + } + sk->sk_max_ack_backlog = backlog; + err = 0; + +out: + release_sock(sk); + return err; +} + +static const unsigned char dccp_new_state[] = { + /* current state: new state: action: */ + [0] = DCCP_CLOSED, + [DCCP_OPEN] = DCCP_CLOSING | DCCP_ACTION_FIN, + [DCCP_REQUESTING] = DCCP_CLOSED, + [DCCP_PARTOPEN] = DCCP_CLOSING | DCCP_ACTION_FIN, + [DCCP_LISTEN] = DCCP_CLOSED, + [DCCP_RESPOND] = DCCP_CLOSED, + [DCCP_CLOSING] = DCCP_CLOSED, + [DCCP_TIME_WAIT] = DCCP_CLOSED, + [DCCP_CLOSED] = DCCP_CLOSED, +}; + +static int dccp_close_state(struct sock *sk) +{ + const int next = dccp_new_state[sk->sk_state]; + const int ns = next & DCCP_STATE_MASK; + + if (ns != sk->sk_state) + dccp_set_state(sk, ns); + + return next & DCCP_ACTION_FIN; +} + +void dccp_close(struct sock *sk, long timeout) +{ + struct sk_buff *skb; + + lock_sock(sk); + + sk->sk_shutdown = SHUTDOWN_MASK; + + if (sk->sk_state == DCCP_LISTEN) { + dccp_set_state(sk, DCCP_CLOSED); + + /* Special case. */ + inet_csk_listen_stop(sk); + + goto adjudge_to_death; + } + + /* + * We need to flush the recv. buffs. We do this only on the + * descriptor close, not protocol-sourced closes, because the + *reader process may not have drained the data yet! + */ + /* FIXME: check for unread data */ + while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { + __kfree_skb(skb); + } + + if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { + /* Check zero linger _after_ checking for unread data. */ + sk->sk_prot->disconnect(sk, 0); + } else if (dccp_close_state(sk)) { + dccp_send_close(sk); + } + + sk_stream_wait_close(sk, timeout); + +adjudge_to_death: + release_sock(sk); + /* + * Now socket is owned by kernel and we acquire BH lock + * to finish close. No need to check for user refs. + */ + local_bh_disable(); + bh_lock_sock(sk); + BUG_TRAP(!sock_owned_by_user(sk)); + + sock_hold(sk); + sock_orphan(sk); + + if (sk->sk_state != DCCP_CLOSED) + dccp_set_state(sk, DCCP_CLOSED); + + atomic_inc(&dccp_orphan_count); + if (sk->sk_state == DCCP_CLOSED) + inet_csk_destroy_sock(sk); + + /* Otherwise, socket is reprieved until protocol close. */ + + bh_unlock_sock(sk); + local_bh_enable(); + sock_put(sk); +} + +void dccp_shutdown(struct sock *sk, int how) +{ + dccp_pr_debug("entry\n"); +} + +struct proto_ops inet_dccp_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = inet_release, + .bind = inet_bind, + .connect = inet_stream_connect, + .socketpair = sock_no_socketpair, + .accept = inet_accept, + .getname = inet_getname, + .poll = sock_no_poll, + .ioctl = inet_ioctl, + .listen = inet_dccp_listen, /* FIXME: work on inet_listen to rename it to sock_common_listen */ + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +extern struct net_proto_family inet_family_ops; + +static struct inet_protosw dccp_v4_protosw = { + .type = SOCK_DCCP, + .protocol = IPPROTO_DCCP, + .prot = &dccp_v4_prot, + .ops = &inet_dccp_ops, + .capability = -1, + .no_check = 0, + .flags = 0, +}; + +/* + * This is the global socket data structure used for responding to + * the Out-of-the-blue (OOTB) packets. A control sock will be created + * for this socket at the initialization time. + */ +struct socket *dccp_ctl_socket; + +static char dccp_ctl_socket_err_msg[] __initdata = + KERN_ERR "DCCP: Failed to create the control socket.\n"; + +static int __init dccp_ctl_sock_init(void) +{ + int rc = sock_create_kern(PF_INET, SOCK_DCCP, IPPROTO_DCCP, + &dccp_ctl_socket); + if (rc < 0) + printk(dccp_ctl_socket_err_msg); + else { + dccp_ctl_socket->sk->sk_allocation = GFP_ATOMIC; + inet_sk(dccp_ctl_socket->sk)->uc_ttl = -1; + + /* Unhash it so that IP input processing does not even + * see it, we do not wish this socket to see incoming + * packets. + */ + dccp_ctl_socket->sk->sk_prot->unhash(dccp_ctl_socket->sk); + } + + return rc; +} + +static void __exit dccp_ctl_sock_exit(void) +{ + if (dccp_ctl_socket != NULL) + sock_release(dccp_ctl_socket); +} + +static int __init init_dccp_v4_mibs(void) +{ + int rc = -ENOMEM; + + dccp_statistics[0] = alloc_percpu(struct dccp_mib); + if (dccp_statistics[0] == NULL) + goto out; + + dccp_statistics[1] = alloc_percpu(struct dccp_mib); + if (dccp_statistics[1] == NULL) + goto out_free_one; + + rc = 0; +out: + return rc; +out_free_one: + free_percpu(dccp_statistics[0]); + dccp_statistics[0] = NULL; + goto out; + +} + +static int thash_entries; +module_param(thash_entries, int, 0444); +MODULE_PARM_DESC(thash_entries, "Number of ehash buckets"); + +int dccp_debug; +module_param(dccp_debug, int, 0444); +MODULE_PARM_DESC(dccp_debug, "Enable debug messages"); + +static int __init dccp_init(void) +{ + unsigned long goal; + int ehash_order, bhash_order, i; + int rc = proto_register(&dccp_v4_prot, 1); + + if (rc) + goto out; + + dccp_hashinfo.bind_bucket_cachep = kmem_cache_create("dccp_bind_bucket", + sizeof(struct inet_bind_bucket), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!dccp_hashinfo.bind_bucket_cachep) + goto out_proto_unregister; + + /* + * Size and allocate the main established and bind bucket + * hash tables. + * + * The methodology is similar to that of the buffer cache. + */ + if (num_physpages >= (128 * 1024)) + goal = num_physpages >> (21 - PAGE_SHIFT); + else + goal = num_physpages >> (23 - PAGE_SHIFT); + + if (thash_entries) + goal = (thash_entries * sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT; + for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++) + ; + do { + dccp_hashinfo.ehash_size = (1UL << ehash_order) * PAGE_SIZE / + sizeof(struct inet_ehash_bucket); + dccp_hashinfo.ehash_size >>= 1; + while (dccp_hashinfo.ehash_size & (dccp_hashinfo.ehash_size - 1)) + dccp_hashinfo.ehash_size--; + dccp_hashinfo.ehash = (struct inet_ehash_bucket *) + __get_free_pages(GFP_ATOMIC, ehash_order); + } while (!dccp_hashinfo.ehash && --ehash_order > 0); + + if (!dccp_hashinfo.ehash) { + printk(KERN_CRIT "Failed to allocate DCCP " + "established hash table\n"); + goto out_free_bind_bucket_cachep; + } + + for (i = 0; i < (dccp_hashinfo.ehash_size << 1); i++) { + rwlock_init(&dccp_hashinfo.ehash[i].lock); + INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain); + } + + bhash_order = ehash_order; + + do { + dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE / + sizeof(struct inet_bind_hashbucket); + if ((dccp_hashinfo.bhash_size > (64 * 1024)) && bhash_order > 0) + continue; + dccp_hashinfo.bhash = (struct inet_bind_hashbucket *) + __get_free_pages(GFP_ATOMIC, bhash_order); + } while (!dccp_hashinfo.bhash && --bhash_order >= 0); + + if (!dccp_hashinfo.bhash) { + printk(KERN_CRIT "Failed to allocate DCCP bind hash table\n"); + goto out_free_dccp_ehash; + } + + for (i = 0; i < dccp_hashinfo.bhash_size; i++) { + spin_lock_init(&dccp_hashinfo.bhash[i].lock); + INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain); + } + + if (init_dccp_v4_mibs()) + goto out_free_dccp_bhash; + + rc = -EAGAIN; + if (inet_add_protocol(&dccp_protocol, IPPROTO_DCCP)) + goto out_free_dccp_v4_mibs; + + inet_register_protosw(&dccp_v4_protosw); + + rc = dccp_ctl_sock_init(); + if (rc) + goto out_unregister_protosw; +out: + return rc; +out_unregister_protosw: + inet_unregister_protosw(&dccp_v4_protosw); + inet_del_protocol(&dccp_protocol, IPPROTO_DCCP); +out_free_dccp_v4_mibs: + free_percpu(dccp_statistics[0]); + free_percpu(dccp_statistics[1]); + dccp_statistics[0] = dccp_statistics[1] = NULL; +out_free_dccp_bhash: + free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); + dccp_hashinfo.bhash = NULL; +out_free_dccp_ehash: + free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order); + dccp_hashinfo.ehash = NULL; +out_free_bind_bucket_cachep: + kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); + dccp_hashinfo.bind_bucket_cachep = NULL; +out_proto_unregister: + proto_unregister(&dccp_v4_prot); + goto out; +} + +static const char dccp_del_proto_err_msg[] __exitdata = + KERN_ERR "can't remove dccp net_protocol\n"; + +static void __exit dccp_fini(void) +{ + dccp_ctl_sock_exit(); + + inet_unregister_protosw(&dccp_v4_protosw); + + if (inet_del_protocol(&dccp_protocol, IPPROTO_DCCP) < 0) + printk(dccp_del_proto_err_msg); + + /* Free the control endpoint. */ + sock_release(dccp_ctl_socket); + + proto_unregister(&dccp_v4_prot); + + kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); +} + +module_init(dccp_init); +module_exit(dccp_fini); + +/* __stringify doesn't likes enums, so use SOCK_DCCP (6) value directly */ +MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-6"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arnaldo Carvalho de Melo "); +MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol"); diff -puN /dev/null net/dccp/timer.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/dccp/timer.c 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,245 @@ +/* + * net/dccp/timer.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include "dccp.h" + +static void dccp_write_timer(unsigned long data); +static void dccp_keepalive_timer(unsigned long data); +static void dccp_delack_timer(unsigned long data); + +void dccp_init_xmit_timers(struct sock *sk) +{ + inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, + &dccp_keepalive_timer); +} + +static void dccp_write_err(struct sock *sk) +{ + sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; + sk->sk_error_report(sk); + + dccp_v4_send_reset(sk, DCCP_RESET_CODE_ABORTED); + dccp_done(sk); + DCCP_INC_STATS_BH(DCCP_MIB_ABORTONTIMEOUT); +} + +/* A write timeout has occurred. Process the after effects. */ +static int dccp_write_timeout(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + int retry_until; + + if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) { + if (icsk->icsk_retransmits != 0) + dst_negative_advice(&sk->sk_dst_cache); + retry_until = icsk->icsk_syn_retries ? : /* FIXME! */ 3 /* FIXME! sysctl_tcp_syn_retries */; + } else { + if (icsk->icsk_retransmits >= /* FIXME! sysctl_tcp_retries1 */ 5 /* FIXME! */) { + /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black + hole detection. :-( + + It is place to make it. It is not made. I do not want + to make it. It is disguisting. It does not work in any + case. Let me to cite the same draft, which requires for + us to implement this: + + "The one security concern raised by this memo is that ICMP black holes + are often caused by over-zealous security administrators who block + all ICMP messages. It is vitally important that those who design and + deploy security systems understand the impact of strict filtering on + upper-layer protocols. The safest web site in the world is worthless + if most TCP implementations cannot transfer data from it. It would + be far nicer to have all of the black holes fixed rather than fixing + all of the TCP implementations." + + Golden words :-). + */ + + dst_negative_advice(&sk->sk_dst_cache); + } + + retry_until = /* FIXME! */ 15 /* FIXME! sysctl_tcp_retries2 */; + /* + * FIXME: see tcp_write_timout and tcp_out_of_resources + */ + } + + if (icsk->icsk_retransmits >= retry_until) { + /* Has it gone just too far? */ + dccp_write_err(sk); + return 1; + } + return 0; +} + +/* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */ +static void dccp_delack_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + struct inet_connection_sock *icsk = inet_csk(sk); + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later. */ + icsk->icsk_ack.blocked = 1; + NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED); + sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN); + goto out; + } + + if (sk->sk_state == DCCP_CLOSED || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) + goto out; + if (time_after(icsk->icsk_ack.timeout, jiffies)) { + sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); + goto out; + } + + icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; + + if (inet_csk_ack_scheduled(sk)) { + if (!icsk->icsk_ack.pingpong) { + /* Delayed ACK missed: inflate ATO. */ + icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto); + } else { + /* Delayed ACK missed: leave pingpong mode and + * deflate ATO. + */ + icsk->icsk_ack.pingpong = 0; + icsk->icsk_ack.ato = TCP_ATO_MIN; + } + dccp_send_ack(sk); + NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS); + } +out: + bh_unlock_sock(sk); + sock_put(sk); +} + +/* + * The DCCP retransmit timer. + */ +static void dccp_retransmit_timer(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + /* + * sk->sk_send_head has to have one skb with + * DCCP_SKB_CB(skb)->dccpd_type set to one of the retransmittable DCCP + * packet types (REQUEST, RESPONSE, the ACK in the 3way hanshake + * (PARTOPEN timer), etc). + */ + BUG_TRAP(sk->sk_send_head != NULL); + + /* + * More than than 4MSL (8 minutes) has passed, a RESET(aborted) was + * sent, no need to retransmit, this sock is dead. + */ + if (dccp_write_timeout(sk)) + goto out; + + /* + * We want to know the number of packets retransmitted, not the + * total number of retransmissions of clones of original packets. + */ + if (icsk->icsk_retransmits == 0) + DCCP_INC_STATS_BH(DCCP_MIB_TIMEOUTS); + + if (dccp_retransmit_skb(sk, sk->sk_send_head) < 0) { + /* + * Retransmission failed because of local congestion, + * do not backoff. + */ + if (icsk->icsk_retransmits == 0) + icsk->icsk_retransmits = 1; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + min(icsk->icsk_rto, + TCP_RESOURCE_PROBE_INTERVAL), + TCP_RTO_MAX); + goto out; + } + + icsk->icsk_backoff++; + icsk->icsk_retransmits++; + + icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); + if (icsk->icsk_retransmits > 3 /* FIXME: sysctl_dccp_retries1 */) + __sk_dst_reset(sk); +out:; +} + +static void dccp_write_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + struct inet_connection_sock *icsk = inet_csk(sk); + int event = 0; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later */ + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20)); + goto out; + } + + if (sk->sk_state == DCCP_CLOSED || !icsk->icsk_pending) + goto out; + + if (time_after(icsk->icsk_timeout, jiffies)) { + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); + goto out; + } + + event = icsk->icsk_pending; + icsk->icsk_pending = 0; + + switch (event) { + case ICSK_TIME_RETRANS: + dccp_retransmit_timer(sk); + break; + } +out: + bh_unlock_sock(sk); + sock_put(sk); +} + +/* + * Timer for listening sockets + */ +static void dccp_response_timer(struct sock *sk) +{ + inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, DCCP_TIMEOUT_INIT, DCCP_RTO_MAX); +} + +static void dccp_keepalive_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + + /* Only process if socket is not in use. */ + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later. */ + inet_csk_reset_keepalive_timer(sk, HZ / 20); + goto out; + } + + if (sk->sk_state == DCCP_LISTEN) { + dccp_response_timer(sk); + goto out; + } +out: + bh_unlock_sock(sk); + sock_put(sk); +} diff -puN net/decnet/af_decnet.c~git-net net/decnet/af_decnet.c --- devel/net/decnet/af_decnet.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/decnet/af_decnet.c 2005-08-06 23:39:10.000000000 -0700 @@ -118,7 +118,7 @@ Version 0.0.6 2.1.110 07-aug-98 E #include #include #include -#include +#include #include #include #include @@ -1763,7 +1763,7 @@ static int dn_recvmsg(struct kiocb *iocb nskb = skb->next; if (skb->len == 0) { - skb_unlink(skb); + skb_unlink(skb, queue); kfree_skb(skb); /* * N.B. Don't refer to skb or cb after this point @@ -2073,7 +2073,7 @@ static struct notifier_block dn_dev_noti .notifier_call = dn_device_event, }; -extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *); +extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); static struct packet_type dn_dix_packet_type = { .type = __constant_htons(ETH_P_DNA_RT), diff -puN net/decnet/dn_nsp_in.c~git-net net/decnet/dn_nsp_in.c --- devel/net/decnet/dn_nsp_in.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/decnet/dn_nsp_in.c 2005-08-06 23:39:10.000000000 -0700 @@ -60,7 +60,7 @@ #include #include #include -#include +#include #include #include #include diff -puN net/decnet/dn_nsp_out.c~git-net net/decnet/dn_nsp_out.c --- devel/net/decnet/dn_nsp_out.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/decnet/dn_nsp_out.c 2005-08-06 23:39:10.000000000 -0700 @@ -479,7 +479,7 @@ int dn_nsp_check_xmit_queue(struct sock xmit_count = cb2->xmit_count; segnum = cb2->segnum; /* Remove and drop ack'ed packet */ - skb_unlink(ack); + skb_unlink(ack, q); kfree_skb(ack); ack = NULL; diff -puN net/decnet/dn_route.c~git-net net/decnet/dn_route.c --- devel/net/decnet/dn_route.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/decnet/dn_route.c 2005-08-06 23:39:10.000000000 -0700 @@ -572,7 +572,7 @@ static int dn_route_ptp_hello(struct sk_ return NET_RX_SUCCESS; } -int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct dn_skb_cb *cb; unsigned char flags = 0; diff -puN net/decnet/netfilter/dn_rtmsg.c~git-net net/decnet/netfilter/dn_rtmsg.c --- devel/net/decnet/netfilter/dn_rtmsg.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/decnet/netfilter/dn_rtmsg.c 2005-08-06 23:39:10.000000000 -0700 @@ -138,7 +138,8 @@ static int __init init(void) { int rv = 0; - dnrmg = netlink_kernel_create(NETLINK_DNRTMSG, dnrmg_receive_user_sk); + dnrmg = netlink_kernel_create(NETLINK_DNRTMSG, dnrmg_receive_user_sk, + THIS_MODULE); if (dnrmg == NULL) { printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket"); return -ENOMEM; @@ -162,6 +163,7 @@ static void __exit fini(void) MODULE_DESCRIPTION("DECnet Routing Message Grabulator"); MODULE_AUTHOR("Steven Whitehouse "); MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_DNRTMSG); module_init(init); module_exit(fini); diff -puN net/econet/af_econet.c~git-net net/econet/af_econet.c --- devel/net/econet/af_econet.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/econet/af_econet.c 2005-08-06 23:39:10.000000000 -0700 @@ -869,7 +869,7 @@ static void aun_tx_ack(unsigned long seq foundit: tx_result(skb->sk, eb->cookie, result); - skb_unlink(skb); + skb_unlink(skb, &aun_queue); spin_unlock_irqrestore(&aun_queue_lock, flags); kfree_skb(skb); } @@ -947,7 +947,7 @@ static void ab_cleanup(unsigned long h) { tx_result(skb->sk, eb->cookie, ECTYPE_TRANSMIT_NOT_PRESENT); - skb_unlink(skb); + skb_unlink(skb, &aun_queue); kfree_skb(skb); } skb = newskb; @@ -1009,7 +1009,7 @@ release: * Receive an Econet frame from a device. */ -static int econet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +static int econet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct ec_framehdr *hdr; struct sock *sk; diff -puN net/ethernet/eth.c~git-net net/ethernet/eth.c --- devel/net/ethernet/eth.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ethernet/eth.c 2005-08-06 23:39:10.000000000 -0700 @@ -163,7 +163,6 @@ __be16 eth_type_trans(struct sk_buff *sk skb->mac.raw=skb->data; skb_pull(skb,ETH_HLEN); eth = eth_hdr(skb); - skb->input_dev = dev; if(*eth->h_dest&1) { diff -puN net/ipv4/af_inet.c~git-net net/ipv4/af_inet.c --- devel/net/ipv4/af_inet.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/af_inet.c 2005-08-06 23:39:10.000000000 -0700 @@ -99,6 +99,7 @@ #include #include #include +#include #include #include #include @@ -114,10 +115,6 @@ DEFINE_SNMP_STAT(struct linux_mib, net_statistics); -#ifdef INET_REFCNT_DEBUG -atomic_t inet_sock_nr; -#endif - extern void ip_mc_drop_socket(struct sock *sk); /* The inetsw table contains everything that inet_create needs to @@ -153,11 +150,7 @@ void inet_sock_destruct(struct sock *sk) if (inet->opt) kfree(inet->opt); dst_release(sk->sk_dst_cache); -#ifdef INET_REFCNT_DEBUG - atomic_dec(&inet_sock_nr); - printk(KERN_DEBUG "INET socket %p released, %d are still alive\n", - sk, atomic_read(&inet_sock_nr)); -#endif + sk_refcnt_debug_dec(sk); } /* @@ -210,7 +203,7 @@ int inet_listen(struct socket *sock, int * we can only allow the backlog to be adjusted. */ if (old_state != TCP_LISTEN) { - err = tcp_listen_start(sk); + err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE); if (err) goto out; } @@ -317,9 +310,7 @@ static int inet_create(struct socket *so inet->mc_index = 0; inet->mc_list = NULL; -#ifdef INET_REFCNT_DEBUG - atomic_inc(&inet_sock_nr); -#endif + sk_refcnt_debug_inc(sk); if (inet->num) { /* It assumes that any protocol which allows @@ -961,6 +952,119 @@ void inet_unregister_protosw(struct inet } } +/* + * Shall we try to damage output packets if routing dev changes? + */ + +int sysctl_ip_dynaddr; + +static int inet_sk_reselect_saddr(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + int err; + struct rtable *rt; + __u32 old_saddr = inet->saddr; + __u32 new_saddr; + __u32 daddr = inet->daddr; + + if (inet->opt && inet->opt->srr) + daddr = inet->opt->faddr; + + /* Query new route. */ + err = ip_route_connect(&rt, daddr, 0, + RT_CONN_FLAGS(sk), + sk->sk_bound_dev_if, + sk->sk_protocol, + inet->sport, inet->dport, sk); + if (err) + return err; + + sk_setup_caps(sk, &rt->u.dst); + + new_saddr = rt->rt_src; + + if (new_saddr == old_saddr) + return 0; + + if (sysctl_ip_dynaddr > 1) { + printk(KERN_INFO "%s(): shifting inet->" + "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n", + __FUNCTION__, + NIPQUAD(old_saddr), + NIPQUAD(new_saddr)); + } + + inet->saddr = inet->rcv_saddr = new_saddr; + + /* + * XXX The only one ugly spot where we need to + * XXX really change the sockets identity after + * XXX it has entered the hashes. -DaveM + * + * Besides that, it does not check for connection + * uniqueness. Wait for troubles. + */ + __sk_prot_rehash(sk); + return 0; +} + +int inet_sk_rebuild_header(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); + u32 daddr; + int err; + + /* Route is OK, nothing to do. */ + if (rt) + return 0; + + /* Reroute. */ + daddr = inet->daddr; + if (inet->opt && inet->opt->srr) + daddr = inet->opt->faddr; +{ + struct flowi fl = { + .oif = sk->sk_bound_dev_if, + .nl_u = { + .ip4_u = { + .daddr = daddr, + .saddr = inet->saddr, + .tos = RT_CONN_FLAGS(sk), + }, + }, + .proto = sk->sk_protocol, + .uli_u = { + .ports = { + .sport = inet->sport, + .dport = inet->dport, + }, + }, + }; + + err = ip_route_output_flow(&rt, &fl, sk, 0); +} + if (!err) + sk_setup_caps(sk, &rt->u.dst); + else { + /* Routing failed... */ + sk->sk_route_caps = 0; + /* + * Other protocols have to map its equivalent state to TCP_SYN_SENT. + * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme + */ + if (!sysctl_ip_dynaddr || + sk->sk_state != TCP_SYN_SENT || + (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || + (err = inet_sk_reselect_saddr(sk)) != 0) + sk->sk_err_soft = -err; + } + + return err; +} + +EXPORT_SYMBOL(inet_sk_rebuild_header); + #ifdef CONFIG_IP_MULTICAST static struct net_protocol igmp_protocol = { .handler = igmp_rcv, @@ -1205,7 +1309,3 @@ EXPORT_SYMBOL(inet_stream_ops); EXPORT_SYMBOL(inet_unregister_protosw); EXPORT_SYMBOL(net_statistics); EXPORT_SYMBOL(sysctl_ip_nonlocal_bind); - -#ifdef INET_REFCNT_DEBUG -EXPORT_SYMBOL(inet_sock_nr); -#endif diff -puN net/ipv4/arp.c~git-net net/ipv4/arp.c --- devel/net/ipv4/arp.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/arp.c 2005-08-06 23:39:10.000000000 -0700 @@ -700,7 +700,7 @@ void arp_send(int type, int ptype, u32 d static void parp_redo(struct sk_buff *skb) { nf_reset(skb); - arp_rcv(skb, skb->dev, NULL); + arp_rcv(skb, skb->dev, NULL, skb->dev); } /* @@ -927,7 +927,7 @@ out: * Receive an arp request from the device layer. */ -int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct arphdr *arp; diff -puN net/ipv4/datagram.c~git-net net/ipv4/datagram.c --- devel/net/ipv4/datagram.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/datagram.c 2005-08-06 23:39:10.000000000 -0700 @@ -17,8 +17,8 @@ #include #include #include -#include #include +#include int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { diff -puN net/ipv4/fib_frontend.c~git-net net/ipv4/fib_frontend.c --- devel/net/ipv4/fib_frontend.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/fib_frontend.c 2005-08-06 23:39:10.000000000 -0700 @@ -567,7 +567,7 @@ static void nl_fib_input(struct sock *sk static void nl_fib_lookup_init(void) { - netlink_kernel_create(NETLINK_FIB_LOOKUP, nl_fib_input); + netlink_kernel_create(NETLINK_FIB_LOOKUP, nl_fib_input, THIS_MODULE); } static void fib_disable_ip(struct net_device *dev, int force) @@ -662,5 +662,4 @@ void __init ip_fib_init(void) } EXPORT_SYMBOL(inet_addr_type); -EXPORT_SYMBOL(ip_dev_find); EXPORT_SYMBOL(ip_rt_ioctl); diff -puN /dev/null net/ipv4/inet_connection_sock.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/ipv4/inet_connection_sock.c 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,635 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Support for INET connection oriented protocols. + * + * Authors: See the TCP sources + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or(at your option) any later version. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#ifdef INET_CSK_DEBUG +const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; +EXPORT_SYMBOL(inet_csk_timer_bug_msg); +#endif + +/* + * This array holds the first and last local port number. + * For high-usage systems, use sysctl to change this to + * 32768-61000 + */ +int sysctl_local_port_range[2] = { 1024, 4999 }; + +static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) +{ + const u32 sk_rcv_saddr = inet_rcv_saddr(sk); + struct sock *sk2; + struct hlist_node *node; + int reuse = sk->sk_reuse; + + sk_for_each_bound(sk2, node, &tb->owners) { + if (sk != sk2 && + !inet_v6_ipv6only(sk2) && + (!sk->sk_bound_dev_if || + !sk2->sk_bound_dev_if || + sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { + if (!reuse || !sk2->sk_reuse || + sk2->sk_state == TCP_LISTEN) { + const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); + if (!sk2_rcv_saddr || !sk_rcv_saddr || + sk2_rcv_saddr == sk_rcv_saddr) + break; + } + } + } + return node != NULL; +} + +/* Obtain a reference to a local port for the given sock, + * if snum is zero it means select any available local port. + */ +int inet_csk_get_port(struct inet_hashinfo *hashinfo, + struct sock *sk, unsigned short snum) +{ + struct inet_bind_hashbucket *head; + struct hlist_node *node; + struct inet_bind_bucket *tb; + int ret; + + local_bh_disable(); + if (!snum) { + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int remaining = (high - low) + 1; + int rover; + + spin_lock(&hashinfo->portalloc_lock); + if (hashinfo->port_rover < low) + rover = low; + else + rover = hashinfo->port_rover; + do { + rover++; + if (rover > high) + rover = low; + head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; + spin_lock(&head->lock); + inet_bind_bucket_for_each(tb, node, &head->chain) + if (tb->port == rover) + goto next; + break; + next: + spin_unlock(&head->lock); + } while (--remaining > 0); + hashinfo->port_rover = rover; + spin_unlock(&hashinfo->portalloc_lock); + + /* Exhausted local port range during search? */ + ret = 1; + if (remaining <= 0) + goto fail; + + /* OK, here is the one we will use. HEAD is + * non-NULL and we hold it's mutex. + */ + snum = rover; + } else { + head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)]; + spin_lock(&head->lock); + inet_bind_bucket_for_each(tb, node, &head->chain) + if (tb->port == snum) + goto tb_found; + } + tb = NULL; + goto tb_not_found; +tb_found: + if (!hlist_empty(&tb->owners)) { + if (sk->sk_reuse > 1) + goto success; + if (tb->fastreuse > 0 && + sk->sk_reuse && sk->sk_state != TCP_LISTEN) { + goto success; + } else { + ret = 1; + if (inet_csk_bind_conflict(sk, tb)) + goto fail_unlock; + } + } +tb_not_found: + ret = 1; + if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL) + goto fail_unlock; + if (hlist_empty(&tb->owners)) { + if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) + tb->fastreuse = 1; + else + tb->fastreuse = 0; + } else if (tb->fastreuse && + (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) + tb->fastreuse = 0; +success: + if (!inet_csk(sk)->icsk_bind_hash) + inet_bind_hash(sk, tb, snum); + BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); + ret = 0; + +fail_unlock: + spin_unlock(&head->lock); +fail: + local_bh_enable(); + return ret; +} + +EXPORT_SYMBOL_GPL(inet_csk_get_port); + +/* + * Wait for an incoming connection, avoid race conditions. This must be called + * with the socket locked. + */ +static int inet_csk_wait_for_connect(struct sock *sk, long timeo) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + DEFINE_WAIT(wait); + int err; + + /* + * True wake-one mechanism for incoming connections: only + * one process gets woken up, not the 'whole herd'. + * Since we do not 'race & poll' for established sockets + * anymore, the common case will execute the loop only once. + * + * Subtle issue: "add_wait_queue_exclusive()" will be added + * after any current non-exclusive waiters, and we know that + * it will always _stay_ after any new non-exclusive waiters + * because all non-exclusive waiters are added at the + * beginning of the wait-queue. As such, it's ok to "drop" + * our exclusiveness temporarily when we get woken up without + * having to remove and re-insert us on the wait queue. + */ + for (;;) { + prepare_to_wait_exclusive(sk->sk_sleep, &wait, + TASK_INTERRUPTIBLE); + release_sock(sk); + if (reqsk_queue_empty(&icsk->icsk_accept_queue)) + timeo = schedule_timeout(timeo); + lock_sock(sk); + err = 0; + if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) + break; + err = -EINVAL; + if (sk->sk_state != TCP_LISTEN) + break; + err = sock_intr_errno(timeo); + if (signal_pending(current)) + break; + err = -EAGAIN; + if (!timeo) + break; + } + finish_wait(sk->sk_sleep, &wait); + return err; +} + +/* + * This will accept the next outstanding connection. + */ +struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct sock *newsk; + int error; + + lock_sock(sk); + + /* We need to make sure that this socket is listening, + * and that it has something pending. + */ + error = -EINVAL; + if (sk->sk_state != TCP_LISTEN) + goto out_err; + + /* Find already established connection */ + if (reqsk_queue_empty(&icsk->icsk_accept_queue)) { + long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + + /* If this is a non blocking socket don't sleep */ + error = -EAGAIN; + if (!timeo) + goto out_err; + + error = inet_csk_wait_for_connect(sk, timeo); + if (error) + goto out_err; + } + + newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk); + BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); +out: + release_sock(sk); + return newsk; +out_err: + newsk = NULL; + *err = error; + goto out; +} + +EXPORT_SYMBOL(inet_csk_accept); + +/* + * Using different timers for retransmit, delayed acks and probes + * We may wish use just one timer maintaining a list of expire jiffies + * to optimize. + */ +void inet_csk_init_xmit_timers(struct sock *sk, + void (*retransmit_handler)(unsigned long), + void (*delack_handler)(unsigned long), + void (*keepalive_handler)(unsigned long)) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + init_timer(&icsk->icsk_retransmit_timer); + init_timer(&icsk->icsk_delack_timer); + init_timer(&sk->sk_timer); + + icsk->icsk_retransmit_timer.function = retransmit_handler; + icsk->icsk_delack_timer.function = delack_handler; + sk->sk_timer.function = keepalive_handler; + + icsk->icsk_retransmit_timer.data = + icsk->icsk_delack_timer.data = + sk->sk_timer.data = (unsigned long)sk; + + icsk->icsk_pending = icsk->icsk_ack.pending = 0; +} + +EXPORT_SYMBOL(inet_csk_init_xmit_timers); + +void inet_csk_clear_xmit_timers(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0; + + sk_stop_timer(sk, &icsk->icsk_retransmit_timer); + sk_stop_timer(sk, &icsk->icsk_delack_timer); + sk_stop_timer(sk, &sk->sk_timer); +} + +EXPORT_SYMBOL(inet_csk_clear_xmit_timers); + +void inet_csk_delete_keepalive_timer(struct sock *sk) +{ + sk_stop_timer(sk, &sk->sk_timer); +} + +EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); + +void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) +{ + sk_reset_timer(sk, &sk->sk_timer, jiffies + len); +} + +EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); + +struct dst_entry* inet_csk_route_req(struct sock *sk, + const struct request_sock *req) +{ + struct rtable *rt; + const struct inet_request_sock *ireq = inet_rsk(req); + struct ip_options *opt = inet_rsk(req)->opt; + struct flowi fl = { .oif = sk->sk_bound_dev_if, + .nl_u = { .ip4_u = + { .daddr = ((opt && opt->srr) ? + opt->faddr : + ireq->rmt_addr), + .saddr = ireq->loc_addr, + .tos = RT_CONN_FLAGS(sk) } }, + .proto = sk->sk_protocol, + .uli_u = { .ports = + { .sport = inet_sk(sk)->sport, + .dport = ireq->rmt_port } } }; + + if (ip_route_output_flow(&rt, &fl, sk, 0)) { + IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + return NULL; + } + if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { + ip_rt_put(rt); + IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + return NULL; + } + return &rt->u.dst; +} + +EXPORT_SYMBOL_GPL(inet_csk_route_req); + +static inline u32 inet_synq_hash(const u32 raddr, const u16 rport, + const u32 rnd, const u16 synq_hsize) +{ + return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1); +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#define AF_INET_FAMILY(fam) ((fam) == AF_INET) +#else +#define AF_INET_FAMILY(fam) 1 +#endif + +struct request_sock *inet_csk_search_req(const struct sock *sk, + struct request_sock ***prevp, + const __u16 rport, const __u32 raddr, + const __u32 laddr) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + struct request_sock *req, **prev; + + for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd, + lopt->nr_table_entries)]; + (req = *prev) != NULL; + prev = &req->dl_next) { + const struct inet_request_sock *ireq = inet_rsk(req); + + if (ireq->rmt_port == rport && + ireq->rmt_addr == raddr && + ireq->loc_addr == laddr && + AF_INET_FAMILY(req->rsk_ops->family)) { + BUG_TRAP(!req->sk); + *prevp = prev; + break; + } + } + + return req; +} + +EXPORT_SYMBOL_GPL(inet_csk_search_req); + +void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, + const unsigned timeout) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, + lopt->hash_rnd, lopt->nr_table_entries); + + reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); + inet_csk_reqsk_queue_added(sk, timeout); +} + +/* Only thing we need from tcp.h */ +extern int sysctl_tcp_synack_retries; + +EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); + +void inet_csk_reqsk_queue_prune(struct sock *parent, + const unsigned long interval, + const unsigned long timeout, + const unsigned long max_rto) +{ + struct inet_connection_sock *icsk = inet_csk(parent); + struct request_sock_queue *queue = &icsk->icsk_accept_queue; + struct listen_sock *lopt = queue->listen_opt; + int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; + int thresh = max_retries; + unsigned long now = jiffies; + struct request_sock **reqp, *req; + int i, budget; + + if (lopt == NULL || lopt->qlen == 0) + return; + + /* Normally all the openreqs are young and become mature + * (i.e. converted to established socket) for first timeout. + * If synack was not acknowledged for 3 seconds, it means + * one of the following things: synack was lost, ack was lost, + * rtt is high or nobody planned to ack (i.e. synflood). + * When server is a bit loaded, queue is populated with old + * open requests, reducing effective size of queue. + * When server is well loaded, queue size reduces to zero + * after several minutes of work. It is not synflood, + * it is normal operation. The solution is pruning + * too old entries overriding normal timeout, when + * situation becomes dangerous. + * + * Essentially, we reserve half of room for young + * embrions; and abort old ones without pity, if old + * ones are about to clog our table. + */ + if (lopt->qlen>>(lopt->max_qlen_log-1)) { + int young = (lopt->qlen_young<<1); + + while (thresh > 2) { + if (lopt->qlen < young) + break; + thresh--; + young <<= 1; + } + } + + if (queue->rskq_defer_accept) + max_retries = queue->rskq_defer_accept; + + budget = 2 * (lopt->nr_table_entries / (timeout / interval)); + i = lopt->clock_hand; + + do { + reqp=&lopt->syn_table[i]; + while ((req = *reqp) != NULL) { + if (time_after_eq(now, req->expires)) { + if ((req->retrans < thresh || + (inet_rsk(req)->acked && req->retrans < max_retries)) + && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) { + unsigned long timeo; + + if (req->retrans++ == 0) + lopt->qlen_young--; + timeo = min((timeout << req->retrans), max_rto); + req->expires = now + timeo; + reqp = &req->dl_next; + continue; + } + + /* Drop this request */ + inet_csk_reqsk_queue_unlink(parent, req, reqp); + reqsk_queue_removed(queue, req); + reqsk_free(req); + continue; + } + reqp = &req->dl_next; + } + + i = (i + 1) & (lopt->nr_table_entries - 1); + + } while (--budget > 0); + + lopt->clock_hand = i; + + if (lopt->qlen) + inet_csk_reset_keepalive_timer(parent, interval); +} + +EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune); + +struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, + const unsigned int __nocast priority) +{ + struct sock *newsk = sk_clone(sk, priority); + + if (newsk != NULL) { + struct inet_connection_sock *newicsk = inet_csk(newsk); + + newsk->sk_state = TCP_SYN_RECV; + newicsk->icsk_bind_hash = NULL; + + inet_sk(newsk)->dport = inet_rsk(req)->rmt_port; + newsk->sk_write_space = sk_stream_write_space; + + newicsk->icsk_retransmits = 0; + newicsk->icsk_backoff = 0; + + /* Deinitialize accept_queue to trap illegal accesses. */ + memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); + } + return newsk; +} + +EXPORT_SYMBOL_GPL(inet_csk_clone); + +/* + * At this point, there should be no process reference to this + * socket, and thus no user references at all. Therefore we + * can assume the socket waitqueue is inactive and nobody will + * try to jump onto it. + */ +void inet_csk_destroy_sock(struct sock *sk) +{ + BUG_TRAP(sk->sk_state == TCP_CLOSE); + BUG_TRAP(sock_flag(sk, SOCK_DEAD)); + + /* It cannot be in hash table! */ + BUG_TRAP(sk_unhashed(sk)); + + /* If it has not 0 inet_sk(sk)->num, it must be bound */ + BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash); + + sk->sk_prot->destroy(sk); + + sk_stream_kill_queues(sk); + + xfrm_sk_free_policy(sk); + + sk_refcnt_debug_release(sk); + + atomic_dec(sk->sk_prot->orphan_count); + sock_put(sk); +} + +EXPORT_SYMBOL(inet_csk_destroy_sock); + +int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) +{ + struct inet_sock *inet = inet_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); + + if (rc != 0) + return rc; + + sk->sk_max_ack_backlog = 0; + sk->sk_ack_backlog = 0; + inet_csk_delack_init(sk); + + /* There is race window here: we announce ourselves listening, + * but this transition is still not validated by get_port(). + * It is OK, because this socket enters to hash table only + * after validation is complete. + */ + sk->sk_state = TCP_LISTEN; + if (!sk->sk_prot->get_port(sk, inet->num)) { + inet->sport = htons(inet->num); + + sk_dst_reset(sk); + sk->sk_prot->hash(sk); + + return 0; + } + + sk->sk_state = TCP_CLOSE; + __reqsk_queue_destroy(&icsk->icsk_accept_queue); + return -EADDRINUSE; +} + +EXPORT_SYMBOL_GPL(inet_csk_listen_start); + +/* + * This routine closes sockets which have been at least partially + * opened, but not yet accepted. + */ +void inet_csk_listen_stop(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct request_sock *acc_req; + struct request_sock *req; + + inet_csk_delete_keepalive_timer(sk); + + /* make all the listen_opt local to us */ + acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue); + + /* Following specs, it would be better either to send FIN + * (and enter FIN-WAIT-1, it is normal close) + * or to send active reset (abort). + * Certainly, it is pretty dangerous while synflood, but it is + * bad justification for our negligence 8) + * To be honest, we are not able to make either + * of the variants now. --ANK + */ + reqsk_queue_destroy(&icsk->icsk_accept_queue); + + while ((req = acc_req) != NULL) { + struct sock *child = req->sk; + + acc_req = req->dl_next; + + local_bh_disable(); + bh_lock_sock(child); + BUG_TRAP(!sock_owned_by_user(child)); + sock_hold(child); + + sk->sk_prot->disconnect(child, O_NONBLOCK); + + sock_orphan(child); + + atomic_inc(sk->sk_prot->orphan_count); + + inet_csk_destroy_sock(child); + + bh_unlock_sock(child); + local_bh_enable(); + sock_put(child); + + sk_acceptq_removed(sk); + __reqsk_free(req); + } + BUG_TRAP(!sk->sk_ack_backlog); +} + +EXPORT_SYMBOL_GPL(inet_csk_listen_stop); diff -puN /dev/null net/ipv4/inet_hashtables.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/ipv4/inet_hashtables.c 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,165 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Generic INET transport hashtables + * + * Authors: Lotsa people, from code originally in tcp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include +#include + +/* + * Allocate and initialize a new local port bind bucket. + * The bindhash mutex for snum's hash chain must be held here. + */ +struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep, + struct inet_bind_hashbucket *head, + const unsigned short snum) +{ + struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC); + + if (tb != NULL) { + tb->port = snum; + tb->fastreuse = 0; + INIT_HLIST_HEAD(&tb->owners); + hlist_add_head(&tb->node, &head->chain); + } + return tb; +} + +EXPORT_SYMBOL(inet_bind_bucket_create); + +/* + * Caller must hold hashbucket lock for this tb with local BH disabled + */ +void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb) +{ + if (hlist_empty(&tb->owners)) { + __hlist_del(&tb->node); + kmem_cache_free(cachep, tb); + } +} + +void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, + const unsigned short snum) +{ + inet_sk(sk)->num = snum; + sk_add_bind_node(sk, &tb->owners); + inet_csk(sk)->icsk_bind_hash = tb; +} + +EXPORT_SYMBOL(inet_bind_hash); + +/* + * Get rid of any references to a local port held by the given sock. + */ +static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) +{ + const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size); + struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; + struct inet_bind_bucket *tb; + + spin_lock(&head->lock); + tb = inet_csk(sk)->icsk_bind_hash; + __sk_del_bind_node(sk); + inet_csk(sk)->icsk_bind_hash = NULL; + inet_sk(sk)->num = 0; + inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + spin_unlock(&head->lock); +} + +void inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) +{ + local_bh_disable(); + __inet_put_port(hashinfo, sk); + local_bh_enable(); +} + +EXPORT_SYMBOL(inet_put_port); + +/* + * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP. + * Look, when several writers sleep and reader wakes them up, all but one + * immediately hit write lock and grab all the cpus. Exclusive sleep solves + * this, _but_ remember, it adds useless work on UP machines (wake up each + * exclusive lock release). It should be ifdefed really. + */ +void inet_listen_wlock(struct inet_hashinfo *hashinfo) +{ + write_lock(&hashinfo->lhash_lock); + + if (atomic_read(&hashinfo->lhash_users)) { + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait_exclusive(&hashinfo->lhash_wait, + &wait, TASK_UNINTERRUPTIBLE); + if (!atomic_read(&hashinfo->lhash_users)) + break; + write_unlock_bh(&hashinfo->lhash_lock); + schedule(); + write_lock_bh(&hashinfo->lhash_lock); + } + + finish_wait(&hashinfo->lhash_wait, &wait); + } +} + +EXPORT_SYMBOL(inet_listen_wlock); + +/* + * Don't inline this cruft. Here are some nice properties to exploit here. The + * BSD API does not allow a listening sock to specify the remote port nor the + * remote address for the connection. So always assume those are both + * wildcarded during the search since they can never be otherwise. + */ +struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 daddr, + const unsigned short hnum, const int dif) +{ + struct sock *result = NULL, *sk; + const struct hlist_node *node; + int hiscore = -1; + + sk_for_each(sk, node, head) { + const struct inet_sock *inet = inet_sk(sk); + + if (inet->num == hnum && !ipv6_only_sock(sk)) { + const __u32 rcv_saddr = inet->rcv_saddr; + int score = sk->sk_family == PF_INET ? 1 : 0; + + if (rcv_saddr) { + if (rcv_saddr != daddr) + continue; + score += 2; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + continue; + score += 2; + } + if (score == 5) + return sk; + if (score > hiscore) { + hiscore = score; + result = sk; + } + } + } + return result; +} + +EXPORT_SYMBOL_GPL(__inet_lookup_listener); diff -puN net/ipv4/inetpeer.c~git-net net/ipv4/inetpeer.c --- devel/net/ipv4/inetpeer.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/inetpeer.c 2005-08-06 23:39:10.000000000 -0700 @@ -456,5 +456,3 @@ static void peer_check_expire(unsigned l peer_total / inet_peer_threshold * HZ; add_timer(&peer_periodic_timer); } - -EXPORT_SYMBOL(inet_peer_idlock); diff -puN /dev/null net/ipv4/inet_timewait_sock.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/ipv4/inet_timewait_sock.c 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,114 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Generic TIME_WAIT sockets functions + * + * From code orinally in TCP + */ + +#include + +#include +#include + +/* Must be called with locally disabled BHs. */ +void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) +{ + struct inet_bind_hashbucket *bhead; + struct inet_bind_bucket *tb; + /* Unlink from established hashes. */ + struct inet_ehash_bucket *ehead = &hashinfo->ehash[tw->tw_hashent]; + + write_lock(&ehead->lock); + if (hlist_unhashed(&tw->tw_node)) { + write_unlock(&ehead->lock); + return; + } + __hlist_del(&tw->tw_node); + sk_node_init(&tw->tw_node); + write_unlock(&ehead->lock); + + /* Disassociate with bind bucket. */ + bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)]; + spin_lock(&bhead->lock); + tb = tw->tw_tb; + __hlist_del(&tw->tw_bind_node); + tw->tw_tb = NULL; + inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + spin_unlock(&bhead->lock); +#ifdef SOCK_REFCNT_DEBUG + if (atomic_read(&tw->tw_refcnt) != 1) { + printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n", + tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); + } +#endif + inet_twsk_put(tw); +} + +EXPORT_SYMBOL_GPL(__inet_twsk_kill); + +/* + * Enter the time wait state. This is called with locally disabled BH. + * Essentially we whip up a timewait bucket, copy the relevant info into it + * from the SK, and mess with hash chains and list linkage. + */ +void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, + struct inet_hashinfo *hashinfo) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_ehash_bucket *ehead = &hashinfo->ehash[sk->sk_hashent]; + struct inet_bind_hashbucket *bhead; + /* Step 1: Put TW into bind hash. Original socket stays there too. + Note, that any socket with inet->num != 0 MUST be bound in + binding cache, even if it is closed. + */ + bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)]; + spin_lock(&bhead->lock); + tw->tw_tb = icsk->icsk_bind_hash; + BUG_TRAP(icsk->icsk_bind_hash); + inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); + spin_unlock(&bhead->lock); + + write_lock(&ehead->lock); + + /* Step 2: Remove SK from established hash. */ + if (__sk_del_node_init(sk)) + sock_prot_dec_use(sk->sk_prot); + + /* Step 3: Hash TW into TIMEWAIT half of established hash table. */ + inet_twsk_add_node(tw, &(ehead + hashinfo->ehash_size)->chain); + atomic_inc(&tw->tw_refcnt); + + write_unlock(&ehead->lock); +} + +struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) +{ + struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, + SLAB_ATOMIC); + if (tw != NULL) { + const struct inet_sock *inet = inet_sk(sk); + + /* Give us an identity. */ + tw->tw_daddr = inet->daddr; + tw->tw_rcv_saddr = inet->rcv_saddr; + tw->tw_bound_dev_if = sk->sk_bound_dev_if; + tw->tw_num = inet->num; + tw->tw_state = TCP_TIME_WAIT; + tw->tw_substate = state; + tw->tw_sport = inet->sport; + tw->tw_dport = inet->dport; + tw->tw_family = sk->sk_family; + tw->tw_reuse = sk->sk_reuse; + tw->tw_hashent = sk->sk_hashent; + tw->tw_ipv6only = 0; + tw->tw_prot = sk->sk_prot_creator; + atomic_set(&tw->tw_refcnt, 1); + inet_twsk_dead_node_init(tw); + } + + return tw; +} diff -puN net/ipv4/ipconfig.c~git-net net/ipv4/ipconfig.c --- devel/net/ipv4/ipconfig.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/ipconfig.c 2005-08-06 23:39:10.000000000 -0700 @@ -393,7 +393,7 @@ static int __init ic_defaults(void) #ifdef IPCONFIG_RARP -static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); +static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); static struct packet_type rarp_packet_type __initdata = { .type = __constant_htons(ETH_P_RARP), @@ -414,7 +414,7 @@ static inline void ic_rarp_cleanup(void) * Process received RARP packet. */ static int __init -ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct arphdr *rarp; unsigned char *rarp_ptr; @@ -555,7 +555,7 @@ struct bootp_pkt { /* BOOTP packet form #define DHCPRELEASE 7 #define DHCPINFORM 8 -static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); +static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); static struct packet_type bootp_packet_type __initdata = { .type = __constant_htons(ETH_P_IP), @@ -823,7 +823,7 @@ static void __init ic_do_bootp_ext(u8 *e /* * Receive BOOTP reply. */ -static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct bootp_pkt *b; struct iphdr *h; diff -puN net/ipv4/ip_input.c~git-net net/ipv4/ip_input.c --- devel/net/ipv4/ip_input.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/ip_input.c 2005-08-06 23:39:10.000000000 -0700 @@ -225,8 +225,8 @@ static inline int ip_local_deliver_finis /* If there maybe a raw socket we must check - if not we * don't care less */ - if (raw_sk) - raw_v4_input(skb, skb->nh.iph, hash); + if (raw_sk && !raw_v4_input(skb, skb->nh.iph, hash)) + raw_sk = NULL; if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) { int ret; @@ -358,7 +358,7 @@ drop: /* * Main IP Receive routine. */ -int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct iphdr *iph; diff -puN net/ipv4/ip_options.c~git-net net/ipv4/ip_options.c --- devel/net/ipv4/ip_options.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/ip_options.c 2005-08-06 23:39:10.000000000 -0700 @@ -620,6 +620,3 @@ int ip_options_rcv_srr(struct sk_buff *s } return 0; } - -EXPORT_SYMBOL(ip_options_compile); -EXPORT_SYMBOL(ip_options_undo); diff -puN net/ipv4/ip_output.c~git-net net/ipv4/ip_output.c --- devel/net/ipv4/ip_output.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/ip_output.c 2005-08-06 23:39:10.000000000 -0700 @@ -69,13 +69,10 @@ #include #include #include -#include -#include #include #include #include #include -#include #include #include #include @@ -84,12 +81,8 @@ #include #include #include +#include -/* - * Shall we try to damage output packets if routing dev changes? - */ - -int sysctl_ip_dynaddr; int sysctl_ip_default_ttl = IPDEFTTL; /* Generate a checksum for an outgoing IP datagram. */ @@ -165,6 +158,8 @@ int ip_build_and_send_pkt(struct sk_buff dst_output); } +EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); + static inline int ip_finish_output2(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; @@ -205,7 +200,7 @@ static inline int ip_finish_output2(stru return -EINVAL; } -int ip_finish_output(struct sk_buff *skb) +static int ip_finish_output(struct sk_buff *skb) { struct net_device *dev = skb->dst->dev; @@ -329,8 +324,7 @@ int ip_queue_xmit(struct sk_buff *skb, i if (ip_route_output_flow(&rt, &fl, sk, 0)) goto no_route; } - __sk_dst_set(sk, &rt->u.dst); - tcp_v4_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->u.dst); } skb->dst = dst_clone(&rt->u.dst); @@ -392,7 +386,6 @@ static void ip_copy_metadata(struct sk_b #endif #ifdef CONFIG_NETFILTER to->nfmark = from->nfmark; - to->nfcache = from->nfcache; /* Connection association is same as pre-frag packet */ nf_conntrack_put(to->nfct); to->nfct = from->nfct; @@ -1329,12 +1322,7 @@ void __init ip_init(void) #endif } -EXPORT_SYMBOL(ip_finish_output); EXPORT_SYMBOL(ip_fragment); EXPORT_SYMBOL(ip_generic_getfrag); EXPORT_SYMBOL(ip_queue_xmit); EXPORT_SYMBOL(ip_send_check); - -#ifdef CONFIG_SYSCTL -EXPORT_SYMBOL(sysctl_ip_default_ttl); -#endif diff -puN net/ipv4/ip_sockglue.c~git-net net/ipv4/ip_sockglue.c --- devel/net/ipv4/ip_sockglue.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/ip_sockglue.c 2005-08-06 23:39:10.000000000 -0700 @@ -1087,7 +1087,5 @@ int ip_getsockopt(struct sock *sk, int l EXPORT_SYMBOL(ip_cmsg_recv); -#ifdef CONFIG_IP_SCTP_MODULE EXPORT_SYMBOL(ip_getsockopt); EXPORT_SYMBOL(ip_setsockopt); -#endif diff -puN net/ipv4/ipvs/ip_vs_app.c~git-net net/ipv4/ipvs/ip_vs_app.c --- devel/net/ipv4/ipvs/ip_vs_app.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/ipvs/ip_vs_app.c 2005-08-06 23:39:10.000000000 -0700 @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff -puN net/ipv4/ipvs/ip_vs_core.c~git-net net/ipv4/ipvs/ip_vs_core.c --- devel/net/ipv4/ipvs/ip_vs_core.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/ipvs/ip_vs_core.c 2005-08-06 23:39:10.000000000 -0700 @@ -22,6 +22,7 @@ * * Changes: * Paul `Rusty' Russell properly handle non-linear skbs + * Harald Welte don't use nfcache * */ @@ -529,7 +530,7 @@ static unsigned int ip_vs_post_routing(u const struct net_device *out, int (*okfn)(struct sk_buff *)) { - if (!((*pskb)->nfcache & NFC_IPVS_PROPERTY)) + if (!((*pskb)->ipvs_property)) return NF_ACCEPT; /* The packet was sent from IPVS, exit this chain */ @@ -701,7 +702,7 @@ static int ip_vs_out_icmp(struct sk_buff /* do the statistics and put it back */ ip_vs_out_stats(cp, skb); - skb->nfcache |= NFC_IPVS_PROPERTY; + skb->ipvs_property = 1; verdict = NF_ACCEPT; out: @@ -739,7 +740,7 @@ ip_vs_out(unsigned int hooknum, struct s EnterFunction(11); - if (skb->nfcache & NFC_IPVS_PROPERTY) + if (skb->ipvs_property) return NF_ACCEPT; iph = skb->nh.iph; @@ -821,7 +822,7 @@ ip_vs_out(unsigned int hooknum, struct s ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); ip_vs_conn_put(cp); - skb->nfcache |= NFC_IPVS_PROPERTY; + skb->ipvs_property = 1; LeaveFunction(11); return NF_ACCEPT; diff -puN net/ipv4/ipvs/ip_vs_xmit.c~git-net net/ipv4/ipvs/ip_vs_xmit.c --- devel/net/ipv4/ipvs/ip_vs_xmit.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/ipvs/ip_vs_xmit.c 2005-08-06 23:39:10.000000000 -0700 @@ -127,7 +127,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) #define IP_VS_XMIT(skb, rt) \ do { \ - (skb)->nfcache |= NFC_IPVS_PROPERTY; \ + (skb)->ipvs_property = 1; \ (skb)->ip_summed = CHECKSUM_NONE; \ NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ (rt)->u.dst.dev, dst_output); \ diff -puN net/ipv4/Makefile~git-net net/ipv4/Makefile --- devel/net/ipv4/Makefile~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/Makefile 2005-08-06 23:39:10.000000000 -0700 @@ -4,11 +4,12 @@ obj-y := route.o inetpeer.o protocol.o \ ip_input.o ip_fragment.o ip_forward.o ip_options.o \ - ip_output.o ip_sockglue.o \ + ip_output.o ip_sockglue.o inet_hashtables.o \ + inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o \ datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ - sysctl_net_ipv4.o fib_frontend.o fib_semantics.o + sysctl_net_ipv4.o fib_frontend.o fib_semantics.o netfilter.o obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o diff -puN net/ipv4/multipath_drr.c~git-net net/ipv4/multipath_drr.c --- devel/net/ipv4/multipath_drr.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/multipath_drr.c 2005-08-06 23:39:10.000000000 -0700 @@ -107,7 +107,7 @@ static int drr_dev_event(struct notifier return NOTIFY_DONE; } -struct notifier_block drr_dev_notifier = { +static struct notifier_block drr_dev_notifier = { .notifier_call = drr_dev_event, }; diff -puN /dev/null net/ipv4/netfilter.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter.c 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,139 @@ +/* IPv4 specific functions of netfilter core */ + +#include +#ifdef CONFIG_NETFILTER + +#include +#include +#include + +#include +#include +#include +#include +#include + +/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ +int ip_route_me_harder(struct sk_buff **pskb) +{ + struct iphdr *iph = (*pskb)->nh.iph; + struct rtable *rt; + struct flowi fl = {}; + struct dst_entry *odst; + unsigned int hh_len; + + /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause + * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook. + */ + if (inet_addr_type(iph->saddr) == RTN_LOCAL) { + fl.nl_u.ip4_u.daddr = iph->daddr; + fl.nl_u.ip4_u.saddr = iph->saddr; + fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); + fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0; +#ifdef CONFIG_IP_ROUTE_FWMARK + fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark; +#endif + fl.proto = iph->protocol; + if (ip_route_output_key(&rt, &fl) != 0) + return -1; + + /* Drop old route. */ + dst_release((*pskb)->dst); + (*pskb)->dst = &rt->u.dst; + } else { + /* non-local src, find valid iif to satisfy + * rp-filter when calling ip_route_input. */ + fl.nl_u.ip4_u.daddr = iph->saddr; + if (ip_route_output_key(&rt, &fl) != 0) + return -1; + + odst = (*pskb)->dst; + if (ip_route_input(*pskb, iph->daddr, iph->saddr, + RT_TOS(iph->tos), rt->u.dst.dev) != 0) { + dst_release(&rt->u.dst); + return -1; + } + dst_release(&rt->u.dst); + dst_release(odst); + } + + if ((*pskb)->dst->error) + return -1; + + /* Change in oif may mean change in hh_len. */ + hh_len = (*pskb)->dst->dev->hard_header_len; + if (skb_headroom(*pskb) < hh_len) { + struct sk_buff *nskb; + + nskb = skb_realloc_headroom(*pskb, hh_len); + if (!nskb) + return -1; + if ((*pskb)->sk) + skb_set_owner_w(nskb, (*pskb)->sk); + kfree_skb(*pskb); + *pskb = nskb; + } + + return 0; +} +EXPORT_SYMBOL(ip_route_me_harder); + +/* + * Extra routing may needed on local out, as the QUEUE target never + * returns control to the table. + */ + +struct ip_rt_info { + u_int32_t daddr; + u_int32_t saddr; + u_int8_t tos; +}; + +static void queue_save(const struct sk_buff *skb, struct nf_info *info) +{ + struct ip_rt_info *rt_info = nf_info_reroute(info); + + if (info->hook == NF_IP_LOCAL_OUT) { + const struct iphdr *iph = skb->nh.iph; + + rt_info->tos = iph->tos; + rt_info->daddr = iph->daddr; + rt_info->saddr = iph->saddr; + } +} + +static int queue_reroute(struct sk_buff **pskb, const struct nf_info *info) +{ + const struct ip_rt_info *rt_info = nf_info_reroute(info); + + if (info->hook == NF_IP_LOCAL_OUT) { + struct iphdr *iph = (*pskb)->nh.iph; + + if (!(iph->tos == rt_info->tos + && iph->daddr == rt_info->daddr + && iph->saddr == rt_info->saddr)) + return ip_route_me_harder(pskb); + } + return 0; +} + +static struct nf_queue_rerouter ip_reroute = { + .rer_size = sizeof(struct ip_rt_info), + .save = queue_save, + .reroute = queue_reroute, +}; + +static int init(void) +{ + return nf_register_queue_rerouter(PF_INET, &ip_reroute); +} + +static void fini(void) +{ + nf_unregister_queue_rerouter(PF_INET); +} + +module_init(init); +module_exit(fini); + +#endif /* CONFIG_NETFILTER */ diff -puN net/ipv4/netfilter/ip_conntrack_core.c~git-net net/ipv4/netfilter/ip_conntrack_core.c --- devel/net/ipv4/netfilter/ip_conntrack_core.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ip_conntrack_core.c 2005-08-06 23:39:10.000000000 -0700 @@ -37,6 +37,7 @@ #include #include #include +#include /* ip_conntrack_lock protects the main hash table, protocol/helper/expected registrations, conntrack timers*/ @@ -49,7 +50,7 @@ #include #include -#define IP_CONNTRACK_VERSION "2.1" +#define IP_CONNTRACK_VERSION "2.3" #if 0 #define DEBUGP printk @@ -76,14 +77,73 @@ unsigned int ip_ct_log_invalid; static LIST_HEAD(unconfirmed); static int ip_conntrack_vmalloc; -DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); +static unsigned int ip_conntrack_next_id = 1; +static unsigned int ip_conntrack_expect_next_id = 1; +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +struct notifier_block *ip_conntrack_chain; +struct notifier_block *ip_conntrack_expect_chain; + +DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache); + +/* deliver cached events and clear cache entry - must be called with locally + * disabled softirqs */ +static inline void +__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache) +{ + DEBUGP("ecache: delivering events for %p\n", ecache->ct); + if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events) + notifier_call_chain(&ip_conntrack_chain, ecache->events, + ecache->ct); + ecache->events = 0; + ip_conntrack_put(ecache->ct); + ecache->ct = NULL; +} + +/* Deliver all cached events for a particular conntrack. This is called + * by code prior to async packet handling or freeing the skb */ +void ip_ct_deliver_cached_events(const struct ip_conntrack *ct) +{ + struct ip_conntrack_ecache *ecache; + + local_bh_disable(); + ecache = &__get_cpu_var(ip_conntrack_ecache); + if (ecache->ct == ct) + __ip_ct_deliver_cached_events(ecache); + local_bh_enable(); +} -void -ip_conntrack_put(struct ip_conntrack *ct) +void __ip_ct_event_cache_init(struct ip_conntrack *ct) { - IP_NF_ASSERT(ct); - nf_conntrack_put(&ct->ct_general); + struct ip_conntrack_ecache *ecache; + + /* take care of delivering potentially old events */ + ecache = &__get_cpu_var(ip_conntrack_ecache); + BUG_ON(ecache->ct == ct); + if (ecache->ct) + __ip_ct_deliver_cached_events(ecache); + /* initialize for this conntrack/packet */ + ecache->ct = ct; + nf_conntrack_get(&ct->ct_general); +} + +/* flush the event cache - touches other CPU's data and must not be called while + * packets are still passing through the code */ +static void ip_ct_event_cache_flush(void) +{ + struct ip_conntrack_ecache *ecache; + int cpu; + + for_each_cpu(cpu) { + ecache = &per_cpu(ip_conntrack_ecache, cpu); + if (ecache->ct) + ip_conntrack_put(ecache->ct); + } } +#else +static inline void ip_ct_event_cache_flush(void) {} +#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */ + +DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); static int ip_conntrack_hash_rnd_initted; static unsigned int ip_conntrack_hash_rnd; @@ -144,6 +204,13 @@ static void unlink_expect(struct ip_conn list_del(&exp->list); CONNTRACK_STAT_INC(expect_delete); exp->master->expecting--; + ip_conntrack_expect_put(exp); +} + +void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp) +{ + unlink_expect(exp); + ip_conntrack_expect_put(exp); } static void expectation_timed_out(unsigned long ul_expect) @@ -156,6 +223,33 @@ static void expectation_timed_out(unsign ip_conntrack_expect_put(exp); } +struct ip_conntrack_expect * +__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple) +{ + struct ip_conntrack_expect *i; + + list_for_each_entry(i, &ip_conntrack_expect_list, list) { + if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) { + atomic_inc(&i->use); + return i; + } + } + return NULL; +} + +/* Just find a expectation corresponding to a tuple. */ +struct ip_conntrack_expect * +ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple) +{ + struct ip_conntrack_expect *i; + + read_lock_bh(&ip_conntrack_lock); + i = __ip_conntrack_expect_find(tuple); + read_unlock_bh(&ip_conntrack_lock); + + return i; +} + /* If an expectation for this connection is found, it gets delete from * global list then returned. */ static struct ip_conntrack_expect * @@ -180,7 +274,7 @@ find_expectation(const struct ip_conntra } /* delete all expectations for this conntrack */ -static void remove_expectations(struct ip_conntrack *ct) +void ip_ct_remove_expectations(struct ip_conntrack *ct) { struct ip_conntrack_expect *i, *tmp; @@ -210,7 +304,7 @@ clean_from_lists(struct ip_conntrack *ct LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); /* Destroy all pending expectations */ - remove_expectations(ct); + ip_ct_remove_expectations(ct); } static void @@ -223,10 +317,13 @@ destroy_conntrack(struct nf_conntrack *n IP_NF_ASSERT(atomic_read(&nfct->use) == 0); IP_NF_ASSERT(!timer_pending(&ct->timeout)); + ip_conntrack_event(IPCT_DESTROY, ct); + set_bit(IPS_DYING_BIT, &ct->status); + /* To make sure we don't get any weird locking issues here: * destroy_conntrack() MUST NOT be called with a write lock * to ip_conntrack_lock!!! -HW */ - proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); + proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); if (proto && proto->destroy) proto->destroy(ct); @@ -238,7 +335,7 @@ destroy_conntrack(struct nf_conntrack *n * except TFTP can create an expectation on the first packet, * before connection is in the list, so we need to clean here, * too. */ - remove_expectations(ct); + ip_ct_remove_expectations(ct); /* We overload first tuple to link into unconfirmed list. */ if (!is_confirmed(ct)) { @@ -253,8 +350,7 @@ destroy_conntrack(struct nf_conntrack *n ip_conntrack_put(ct->master); DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); - kmem_cache_free(ip_conntrack_cachep, ct); - atomic_dec(&ip_conntrack_count); + ip_conntrack_free(ct); } static void death_by_timeout(unsigned long ul_conntrack) @@ -280,7 +376,7 @@ conntrack_tuple_cmp(const struct ip_conn && ip_ct_tuple_equal(tuple, &i->tuple); } -static struct ip_conntrack_tuple_hash * +struct ip_conntrack_tuple_hash * __ip_conntrack_find(const struct ip_conntrack_tuple *tuple, const struct ip_conntrack *ignored_conntrack) { @@ -315,6 +411,29 @@ ip_conntrack_find_get(const struct ip_co return h; } +static void __ip_conntrack_hash_insert(struct ip_conntrack *ct, + unsigned int hash, + unsigned int repl_hash) +{ + ct->id = ++ip_conntrack_next_id; + list_prepend(&ip_conntrack_hash[hash], + &ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + list_prepend(&ip_conntrack_hash[repl_hash], + &ct->tuplehash[IP_CT_DIR_REPLY].list); +} + +void ip_conntrack_hash_insert(struct ip_conntrack *ct) +{ + unsigned int hash, repl_hash; + + hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + write_lock_bh(&ip_conntrack_lock); + __ip_conntrack_hash_insert(ct, hash, repl_hash); + write_unlock_bh(&ip_conntrack_lock); +} + /* Confirm a connection given skb; places it in hash table */ int __ip_conntrack_confirm(struct sk_buff **pskb) @@ -361,10 +480,7 @@ __ip_conntrack_confirm(struct sk_buff ** /* Remove from unconfirmed list */ list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); - list_prepend(&ip_conntrack_hash[hash], - &ct->tuplehash[IP_CT_DIR_ORIGINAL]); - list_prepend(&ip_conntrack_hash[repl_hash], - &ct->tuplehash[IP_CT_DIR_REPLY]); + __ip_conntrack_hash_insert(ct, hash, repl_hash); /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in weird delay cases. */ @@ -374,6 +490,16 @@ __ip_conntrack_confirm(struct sk_buff ** set_bit(IPS_CONFIRMED_BIT, &ct->status); CONNTRACK_STAT_INC(insert); write_unlock_bh(&ip_conntrack_lock); + if (ct->helper) + ip_conntrack_event_cache(IPCT_HELPER, *pskb); +#ifdef CONFIG_IP_NF_NAT_NEEDED + if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) || + test_bit(IPS_DST_NAT_DONE_BIT, &ct->status)) + ip_conntrack_event_cache(IPCT_NATINFO, *pskb); +#endif + ip_conntrack_event_cache(master_ct(ct) ? + IPCT_RELATED : IPCT_NEW, *pskb); + return NF_ACCEPT; } @@ -438,34 +564,84 @@ static inline int helper_cmp(const struc return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask); } -static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple) +static struct ip_conntrack_helper * +__ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple) { return LIST_FIND(&helpers, helper_cmp, struct ip_conntrack_helper *, tuple); } -/* Allocate a new conntrack: we return -ENOMEM if classification - failed due to stress. Otherwise it really is unclassifiable. */ -static struct ip_conntrack_tuple_hash * -init_conntrack(const struct ip_conntrack_tuple *tuple, - struct ip_conntrack_protocol *protocol, - struct sk_buff *skb) +struct ip_conntrack_helper * +ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple) +{ + struct ip_conntrack_helper *helper; + + /* need ip_conntrack_lock to assure that helper exists until + * try_module_get() is called */ + read_lock_bh(&ip_conntrack_lock); + + helper = __ip_conntrack_helper_find(tuple); + if (helper) { + /* need to increase module usage count to assure helper will + * not go away while the caller is e.g. busy putting a + * conntrack in the hash that uses the helper */ + if (!try_module_get(helper->me)) + helper = NULL; + } + + read_unlock_bh(&ip_conntrack_lock); + + return helper; +} + +void ip_conntrack_helper_put(struct ip_conntrack_helper *helper) +{ + module_put(helper->me); +} + +struct ip_conntrack_protocol * +__ip_conntrack_proto_find(u_int8_t protocol) +{ + return ip_ct_protos[protocol]; +} + +/* this is guaranteed to always return a valid protocol helper, since + * it falls back to generic_protocol */ +struct ip_conntrack_protocol * +ip_conntrack_proto_find_get(u_int8_t protocol) +{ + struct ip_conntrack_protocol *p; + + preempt_disable(); + p = __ip_conntrack_proto_find(protocol); + if (p) { + if (!try_module_get(p->me)) + p = &ip_conntrack_generic_protocol; + } + preempt_enable(); + + return p; +} + +void ip_conntrack_proto_put(struct ip_conntrack_protocol *p) +{ + module_put(p->me); +} + +struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig, + struct ip_conntrack_tuple *repl) { struct ip_conntrack *conntrack; - struct ip_conntrack_tuple repl_tuple; - size_t hash; - struct ip_conntrack_expect *exp; if (!ip_conntrack_hash_rnd_initted) { get_random_bytes(&ip_conntrack_hash_rnd, 4); ip_conntrack_hash_rnd_initted = 1; } - hash = hash_conntrack(tuple); - if (ip_conntrack_max && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) { + unsigned int hash = hash_conntrack(orig); /* Try dropping from this hash chain. */ if (!early_drop(&ip_conntrack_hash[hash])) { if (net_ratelimit()) @@ -476,31 +652,58 @@ init_conntrack(const struct ip_conntrack } } - if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { - DEBUGP("Can't invert tuple.\n"); - return NULL; - } - conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); if (!conntrack) { DEBUGP("Can't allocate conntrack.\n"); - return ERR_PTR(-ENOMEM); + return NULL; } memset(conntrack, 0, sizeof(*conntrack)); atomic_set(&conntrack->ct_general.use, 1); conntrack->ct_general.destroy = destroy_conntrack; - conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple; - conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple; - if (!protocol->new(conntrack, skb)) { - kmem_cache_free(ip_conntrack_cachep, conntrack); - return NULL; - } + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; + conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; /* Don't set timer yet: wait for confirmation */ init_timer(&conntrack->timeout); conntrack->timeout.data = (unsigned long)conntrack; conntrack->timeout.function = death_by_timeout; + atomic_inc(&ip_conntrack_count); + + return conntrack; +} + +void +ip_conntrack_free(struct ip_conntrack *conntrack) +{ + atomic_dec(&ip_conntrack_count); + kmem_cache_free(ip_conntrack_cachep, conntrack); +} + +/* Allocate a new conntrack: we return -ENOMEM if classification + * failed due to stress. Otherwise it really is unclassifiable */ +static struct ip_conntrack_tuple_hash * +init_conntrack(struct ip_conntrack_tuple *tuple, + struct ip_conntrack_protocol *protocol, + struct sk_buff *skb) +{ + struct ip_conntrack *conntrack; + struct ip_conntrack_tuple repl_tuple; + struct ip_conntrack_expect *exp; + + if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { + DEBUGP("Can't invert tuple.\n"); + return NULL; + } + + if (!(conntrack = ip_conntrack_alloc(tuple, &repl_tuple))) + return NULL; + + if (!protocol->new(conntrack, skb)) { + ip_conntrack_free(conntrack); + return NULL; + } + write_lock_bh(&ip_conntrack_lock); exp = find_expectation(tuple); @@ -521,7 +724,7 @@ init_conntrack(const struct ip_conntrack nf_conntrack_get(&conntrack->master->ct_general); CONNTRACK_STAT_INC(expect_new); } else { - conntrack->helper = ip_ct_find_helper(&repl_tuple); + conntrack->helper = __ip_conntrack_helper_find(&repl_tuple); CONNTRACK_STAT_INC(new); } @@ -529,7 +732,6 @@ init_conntrack(const struct ip_conntrack /* Overload tuple linked list to put us in unconfirmed list. */ list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); - atomic_inc(&ip_conntrack_count); write_unlock_bh(&ip_conntrack_lock); if (exp) { @@ -607,7 +809,7 @@ unsigned int ip_conntrack_in(unsigned in struct ip_conntrack *ct; enum ip_conntrack_info ctinfo; struct ip_conntrack_protocol *proto; - int set_reply; + int set_reply = 0; int ret; /* Previously seen (loopback or untracked)? Ignore. */ @@ -625,9 +827,6 @@ unsigned int ip_conntrack_in(unsigned in return NF_DROP; } - /* FIXME: Do this right please. --RR */ - (*pskb)->nfcache |= NFC_UNKNOWN; - /* Doesn't cover locally-generated broadcast, so not worth it. */ #if 0 /* Ignore broadcast: no `connection'. */ @@ -643,7 +842,7 @@ unsigned int ip_conntrack_in(unsigned in } #endif - proto = ip_ct_find_proto((*pskb)->nh.iph->protocol); + proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol); /* It may be an special packet, error, unclean... * inverse of the return code tells to the netfilter @@ -679,8 +878,8 @@ unsigned int ip_conntrack_in(unsigned in return -ret; } - if (set_reply) - set_bit(IPS_SEEN_REPLY_BIT, &ct->status); + if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) + ip_conntrack_event_cache(IPCT_STATUS, *pskb); return ret; } @@ -689,7 +888,7 @@ int invert_tuplepr(struct ip_conntrack_t const struct ip_conntrack_tuple *orig) { return ip_ct_invert_tuple(inverse, orig, - ip_ct_find_proto(orig->dst.protonum)); + __ip_conntrack_proto_find(orig->dst.protonum)); } /* Would two expected things clash? */ @@ -769,6 +968,8 @@ static void ip_conntrack_expect_insert(s exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ; add_timer(&exp->timeout); + exp->id = ++ip_conntrack_expect_next_id; + atomic_inc(&exp->use); CONNTRACK_STAT_INC(expect_create); } @@ -827,6 +1028,7 @@ int ip_conntrack_expect_related(struct i evict_oldest_expect(expect->master); ip_conntrack_expect_insert(expect); + ip_conntrack_expect_event(IPEXP_NEW, expect); ret = 0; out: write_unlock_bh(&ip_conntrack_lock); @@ -847,7 +1049,7 @@ void ip_conntrack_alter_reply(struct ip_ conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; if (!conntrack->master && conntrack->expecting == 0) - conntrack->helper = ip_ct_find_helper(newreply); + conntrack->helper = __ip_conntrack_helper_find(newreply); write_unlock_bh(&ip_conntrack_lock); } @@ -861,11 +1063,26 @@ int ip_conntrack_helper_register(struct return 0; } +struct ip_conntrack_helper * +__ip_conntrack_helper_find_byname(const char *name) +{ + struct ip_conntrack_helper *h; + + list_for_each_entry(h, &helpers, list) { + if (!strcmp(h->name, name)) + return h; + } + + return NULL; +} + static inline int unhelp(struct ip_conntrack_tuple_hash *i, const struct ip_conntrack_helper *me) { - if (tuplehash_to_ctrack(i)->helper == me) + if (tuplehash_to_ctrack(i)->helper == me) { + ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i)); tuplehash_to_ctrack(i)->helper = NULL; + } return 0; } @@ -927,12 +1144,46 @@ void ip_ct_refresh_acct(struct ip_conntr if (del_timer(&ct->timeout)) { ct->timeout.expires = jiffies + extra_jiffies; add_timer(&ct->timeout); + ip_conntrack_event_cache(IPCT_REFRESH, skb); } ct_add_counters(ct, ctinfo, skb); write_unlock_bh(&ip_conntrack_lock); } } +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) +/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be + * in ip_conntrack_core, since we don't want the protocols to autoload + * or depend on ctnetlink */ +int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb, + const struct ip_conntrack_tuple *tuple) +{ + NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t), + &tuple->src.u.tcp.port); + NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t), + &tuple->dst.u.tcp.port); + return 0; + +nfattr_failure: + return -1; +} + +int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[], + struct ip_conntrack_tuple *t) +{ + if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1]) + return -EINVAL; + + t->src.u.tcp.port = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]); + t->dst.u.tcp.port = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]); + + return 0; +} +#endif + /* Returns new sk_buff, or NULL */ struct sk_buff * ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) @@ -943,10 +1194,8 @@ ip_ct_gather_frags(struct sk_buff *skb, skb = ip_defrag(skb, user); local_bh_enable(); - if (skb) { + if (skb) ip_send_check(skb->nh.iph); - skb->nfcache |= NFC_ALTERED; - } return skb; } @@ -1096,16 +1345,14 @@ static void free_conntrack_hash(void) * ip_conntrack_htable_size)); } -/* Mishearing the voices in his head, our hero wonders how he's - supposed to kill the mall. */ -void ip_conntrack_cleanup(void) +void ip_conntrack_flush() { - ip_ct_attach = NULL; /* This makes sure all current packets have passed through netfilter framework. Roll on, two-stage module delete... */ synchronize_net(); - + + ip_ct_event_cache_flush(); i_see_dead_people: ip_ct_iterate_cleanup(kill_all, NULL); if (atomic_read(&ip_conntrack_count) != 0) { @@ -1115,7 +1362,14 @@ void ip_conntrack_cleanup(void) /* wait until all references to ip_conntrack_untracked are dropped */ while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) schedule(); +} +/* Mishearing the voices in his head, our hero wonders how he's + supposed to kill the mall. */ +void ip_conntrack_cleanup(void) +{ + ip_ct_attach = NULL; + ip_conntrack_flush(); kmem_cache_destroy(ip_conntrack_cachep); kmem_cache_destroy(ip_conntrack_expect_cachep); free_conntrack_hash(); diff -puN net/ipv4/netfilter/ip_conntrack_ftp.c~git-net net/ipv4/netfilter/ip_conntrack_ftp.c --- devel/net/ipv4/netfilter/ip_conntrack_ftp.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ip_conntrack_ftp.c 2005-08-06 23:39:10.000000000 -0700 @@ -262,7 +262,8 @@ static int find_nl_seq(u32 seq, const st } /* We don't update if it's older than what we have. */ -static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir) +static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir, + struct sk_buff *skb) { unsigned int i, oldest = NUM_SEQ_TO_REMEMBER; @@ -276,10 +277,13 @@ static void update_nl_seq(u32 nl_seq, st oldest = i; } - if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) + if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) { info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq; - else if (oldest != NUM_SEQ_TO_REMEMBER) + ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); + } else if (oldest != NUM_SEQ_TO_REMEMBER) { info->seq_aft_nl[dir][oldest] = nl_seq; + ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); + } } static int help(struct sk_buff **pskb, @@ -439,7 +443,7 @@ out_update_nl: /* Now if this ends in \n, update ftp info. Seq may have been * adjusted by NAT code. */ if (ends_in_nl) - update_nl_seq(seq, ct_ftp_info,dir); + update_nl_seq(seq, ct_ftp_info,dir, *pskb); out: spin_unlock_bh(&ip_ftp_lock); return ret; diff -puN /dev/null net/ipv4/netfilter/ip_conntrack_netlink.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ip_conntrack_netlink.c 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,1579 @@ +/* Connection tracking via netlink socket. Allows for user space + * protocol helpers and general trouble making from userspace. + * + * (C) 2001 by Jay Schulist + * (C) 2002-2005 by Harald Welte + * (C) 2003 by Patrick Mchardy + * (C) 2005 by Pablo Neira Ayuso + * + * I've reworked this stuff to use attributes instead of conntrack + * structures. 5.44 am. I need more tea. --pablo 05/07/11. + * + * Initial connection tracking via netlink development funded and + * generally made possible by Network Robots, Inc. (www.networkrobots.com) + * + * Further development of this code funded by Astaro AG (http://www.astaro.com) + * + * This software may be used and distributed according to the terms + * of the GNU General Public License, incorporated herein by reference. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); + +static char __initdata version[] = "0.90"; + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + + +static inline int +ctnetlink_dump_tuples_proto(struct sk_buff *skb, + const struct ip_conntrack_tuple *tuple) +{ + struct ip_conntrack_protocol *proto; + + NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum); + + proto = ip_conntrack_proto_find_get(tuple->dst.protonum); + if (proto && proto->tuple_to_nfattr) + return proto->tuple_to_nfattr(skb, tuple); + + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_tuples(struct sk_buff *skb, + const struct ip_conntrack_tuple *tuple) +{ + struct nfattr *nest_parms; + + nest_parms = NFA_NEST(skb, CTA_TUPLE_IP); + NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t), &tuple->src.ip); + NFA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t), &tuple->dst.ip); + NFA_NEST_END(skb, nest_parms); + + nest_parms = NFA_NEST(skb, CTA_TUPLE_PROTO); + ctnetlink_dump_tuples_proto(skb, tuple); + NFA_NEST_END(skb, nest_parms); + + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_status(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + u_int32_t status = htonl((u_int32_t) ct->status); + NFA_PUT(skb, CTA_STATUS, sizeof(status), &status); + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_timeout(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + long timeout_l = ct->timeout.expires - jiffies; + u_int32_t timeout; + + if (timeout_l < 0) + timeout = 0; + else + timeout = htonl(timeout_l / HZ); + + NFA_PUT(skb, CTA_TIMEOUT, sizeof(timeout), &timeout); + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + struct ip_conntrack_protocol *proto = ip_conntrack_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); + + struct nfattr *nest_proto; + int ret; + + if (!proto || !proto->to_nfattr) + return 0; + + nest_proto = NFA_NEST(skb, CTA_PROTOINFO); + + ret = proto->to_nfattr(skb, nest_proto, ct); + + ip_conntrack_proto_put(proto); + + NFA_NEST_END(skb, nest_proto); + + return ret; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + struct nfattr *nest_helper; + + if (!ct->helper) + return 0; + + nest_helper = NFA_NEST(skb, CTA_HELP); + NFA_PUT(skb, CTA_HELP_NAME, CTA_HELP_MAXNAMESIZE, &ct->helper->name); + + if (ct->helper->to_nfattr) + ct->helper->to_nfattr(skb, ct); + + NFA_NEST_END(skb, nest_helper); + + return 0; + +nfattr_failure: + return -1; +} + +#ifdef CONFIG_IP_NF_CT_ACCT +static inline int +ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct, + enum ip_conntrack_dir dir) +{ + enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; + struct nfattr *nest_count = NFA_NEST(skb, type); + u_int64_t tmp; + + tmp = cpu_to_be64(ct->counters[dir].packets); + NFA_PUT(skb, CTA_COUNTERS_PACKETS, sizeof(u_int64_t), &tmp); + + tmp = cpu_to_be64(ct->counters[dir].bytes); + NFA_PUT(skb, CTA_COUNTERS_BYTES, sizeof(u_int64_t), &tmp); + + NFA_NEST_END(skb, nest_count); + + return 0; + +nfattr_failure: + return -1; +} +#else +#define ctnetlink_dump_counters(a, b, c) (0) +#endif + +#ifdef CONFIG_IP_NF_CONNTRACK_MARK +static inline int +ctnetlink_dump_mark(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + u_int32_t mark = htonl(ct->mark); + + NFA_PUT(skb, CTA_MARK, sizeof(u_int32_t), &mark); + return 0; + +nfattr_failure: + return -1; +} +#else +#define ctnetlink_dump_mark(a, b) (0) +#endif + +static inline int +ctnetlink_dump_id(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + u_int32_t id = htonl(ct->id); + NFA_PUT(skb, CTA_ID, sizeof(u_int32_t), &id); + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_use(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + unsigned int use = htonl(atomic_read(&ct->ct_general.use)); + + NFA_PUT(skb, CTA_USE, sizeof(u_int32_t), &use); + return 0; + +nfattr_failure: + return -1; +} + +#define tuple(ct, dir) (&(ct)->tuplehash[dir].tuple) + +static int +ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, + int event, int nowait, + const struct ip_conntrack *ct) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct nfattr *nest_parms; + unsigned char *b; + + b = skb->tail; + + event |= NFNL_SUBSYS_CTNETLINK << 8; + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0; + nfmsg->nfgen_family = AF_INET; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG); + if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) + goto nfattr_failure; + NFA_NEST_END(skb, nest_parms); + + nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY); + if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0) + goto nfattr_failure; + NFA_NEST_END(skb, nest_parms); + + if (ctnetlink_dump_status(skb, ct) < 0 || + ctnetlink_dump_timeout(skb, ct) < 0 || + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 || + ctnetlink_dump_protoinfo(skb, ct) < 0 || + ctnetlink_dump_helpinfo(skb, ct) < 0 || + ctnetlink_dump_mark(skb, ct) < 0 || + ctnetlink_dump_id(skb, ct) < 0 || + ctnetlink_dump_use(skb, ct) < 0) + goto nfattr_failure; + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +nfattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +static int ctnetlink_conntrack_event(struct notifier_block *this, + unsigned long events, void *ptr) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct nfattr *nest_parms; + struct ip_conntrack *ct = (struct ip_conntrack *)ptr; + struct sk_buff *skb; + unsigned int type; + unsigned char *b; + unsigned int flags = 0, groups; + + /* ignore our fake conntrack entry */ + if (ct == &ip_conntrack_untracked) + return NOTIFY_DONE; + + if (events & IPCT_DESTROY) { + type = IPCTNL_MSG_CT_DELETE; + groups = NF_NETLINK_CONNTRACK_DESTROY; + goto alloc_skb; + } + if (events & (IPCT_NEW | IPCT_RELATED)) { + type = IPCTNL_MSG_CT_NEW; + flags = NLM_F_CREATE|NLM_F_EXCL; + /* dump everything */ + events = ~0UL; + groups = NF_NETLINK_CONNTRACK_NEW; + goto alloc_skb; + } + if (events & (IPCT_STATUS | + IPCT_PROTOINFO | + IPCT_HELPER | + IPCT_HELPINFO | + IPCT_NATINFO)) { + type = IPCTNL_MSG_CT_NEW; + groups = NF_NETLINK_CONNTRACK_UPDATE; + goto alloc_skb; + } + + return NOTIFY_DONE; + +alloc_skb: + /* FIXME: Check if there are any listeners before, don't hurt performance */ + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); + if (!skb) + return NOTIFY_DONE; + + b = skb->tail; + + type |= NFNL_SUBSYS_CTNETLINK << 8; + nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = flags; + nfmsg->nfgen_family = AF_INET; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG); + if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) + goto nfattr_failure; + NFA_NEST_END(skb, nest_parms); + + nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY); + if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0) + goto nfattr_failure; + NFA_NEST_END(skb, nest_parms); + + /* NAT stuff is now a status flag */ + if ((events & IPCT_STATUS || events & IPCT_NATINFO) + && ctnetlink_dump_status(skb, ct) < 0) + goto nfattr_failure; + if (events & IPCT_REFRESH + && ctnetlink_dump_timeout(skb, ct) < 0) + goto nfattr_failure; + if (events & IPCT_PROTOINFO + && ctnetlink_dump_protoinfo(skb, ct) < 0) + goto nfattr_failure; + if (events & IPCT_HELPINFO + && ctnetlink_dump_helpinfo(skb, ct) < 0) + goto nfattr_failure; + + if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0) + goto nfattr_failure; + + nlh->nlmsg_len = skb->tail - b; + nfnetlink_send(skb, 0, groups, 0); + return NOTIFY_DONE; + +nlmsg_failure: +nfattr_failure: + kfree_skb(skb); + return NOTIFY_DONE; +} +#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */ + +static int ctnetlink_done(struct netlink_callback *cb) +{ + DEBUGP("entered %s\n", __FUNCTION__); + return 0; +} + +static int +ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ip_conntrack *ct = NULL; + struct ip_conntrack_tuple_hash *h; + struct list_head *i; + u_int32_t *id = (u_int32_t *) &cb->args[1]; + + DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__, + cb->args[0], *id); + + read_lock_bh(&ip_conntrack_lock); + for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { + list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { + h = (struct ip_conntrack_tuple_hash *) i; + if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) + continue; + ct = tuplehash_to_ctrack(h); + if (ct->id <= *id) + continue; + if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_CT_NEW, + 1, ct) < 0) + goto out; + *id = ct->id; + } + } +out: + read_unlock_bh(&ip_conntrack_lock); + + DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); + + return skb->len; +} + +#ifdef CONFIG_IP_NF_CT_ACCT +static int +ctnetlink_dump_table_w(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ip_conntrack *ct = NULL; + struct ip_conntrack_tuple_hash *h; + struct list_head *i; + u_int32_t *id = (u_int32_t *) &cb->args[1]; + + DEBUGP("entered %s, last bucket=%u id=%u\n", __FUNCTION__, + cb->args[0], *id); + + write_lock_bh(&ip_conntrack_lock); + for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { + list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { + h = (struct ip_conntrack_tuple_hash *) i; + if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) + continue; + ct = tuplehash_to_ctrack(h); + if (ct->id <= *id) + continue; + if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_CT_NEW, + 1, ct) < 0) + goto out; + *id = ct->id; + + memset(&ct->counters, 0, sizeof(ct->counters)); + } + } +out: + write_unlock_bh(&ip_conntrack_lock); + + DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); + + return skb->len; +} +#endif + +static const int cta_min_ip[CTA_IP_MAX] = { + [CTA_IP_V4_SRC-1] = sizeof(u_int32_t), + [CTA_IP_V4_DST-1] = sizeof(u_int32_t), +}; + +static inline int +ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) +{ + struct nfattr *tb[CTA_IP_MAX]; + + DEBUGP("entered %s\n", __FUNCTION__); + + + if (nfattr_parse_nested(tb, CTA_IP_MAX, attr) < 0) + goto nfattr_failure; + + if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip)) + return -EINVAL; + + if (!tb[CTA_IP_V4_SRC-1]) + return -EINVAL; + tuple->src.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_SRC-1]); + + if (!tb[CTA_IP_V4_DST-1]) + return -EINVAL; + tuple->dst.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_DST-1]); + + DEBUGP("leaving\n"); + + return 0; + +nfattr_failure: + return -1; +} + +static const int cta_min_proto[CTA_PROTO_MAX] = { + [CTA_PROTO_NUM-1] = sizeof(u_int16_t), + [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t), + [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t), + [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t), + [CTA_PROTO_ICMP_CODE-1] = sizeof(u_int8_t), + [CTA_PROTO_ICMP_ID-1] = sizeof(u_int16_t), +}; + +static inline int +ctnetlink_parse_tuple_proto(struct nfattr *attr, + struct ip_conntrack_tuple *tuple) +{ + struct nfattr *tb[CTA_PROTO_MAX]; + struct ip_conntrack_protocol *proto; + int ret = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (nfattr_parse_nested(tb, CTA_PROTO_MAX, attr) < 0) + goto nfattr_failure; + + if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) + return -EINVAL; + + if (!tb[CTA_PROTO_NUM-1]) + return -EINVAL; + tuple->dst.protonum = *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]); + + proto = ip_conntrack_proto_find_get(tuple->dst.protonum); + + if (likely(proto && proto->nfattr_to_tuple)) { + ret = proto->nfattr_to_tuple(tb, tuple); + ip_conntrack_proto_put(proto); + } + + return ret; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple, + enum ctattr_tuple type) +{ + struct nfattr *tb[CTA_TUPLE_MAX]; + int err; + + DEBUGP("entered %s\n", __FUNCTION__); + + memset(tuple, 0, sizeof(*tuple)); + + if (nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]) < 0) + goto nfattr_failure; + + if (!tb[CTA_TUPLE_IP-1]) + return -EINVAL; + + err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP-1], tuple); + if (err < 0) + return err; + + if (!tb[CTA_TUPLE_PROTO-1]) + return -EINVAL; + + err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO-1], tuple); + if (err < 0) + return err; + + /* orig and expect tuples get DIR_ORIGINAL */ + if (type == CTA_TUPLE_REPLY) + tuple->dst.dir = IP_CT_DIR_REPLY; + else + tuple->dst.dir = IP_CT_DIR_ORIGINAL; + + DUMP_TUPLE(tuple); + + DEBUGP("leaving\n"); + + return 0; + +nfattr_failure: + return -1; +} + +#ifdef CONFIG_IP_NF_NAT_NEEDED +static const int cta_min_protonat[CTA_PROTONAT_MAX] = { + [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t), + [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t), +}; + +static int ctnetlink_parse_nat_proto(struct nfattr *attr, + const struct ip_conntrack *ct, + struct ip_nat_range *range) +{ + struct nfattr *tb[CTA_PROTONAT_MAX]; + struct ip_nat_protocol *npt; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr) < 0) + goto nfattr_failure; + + if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat)) + goto nfattr_failure; + + npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); + if (!npt) + return 0; + + if (!npt->nfattr_to_range) { + ip_nat_proto_put(npt); + return 0; + } + + /* nfattr_to_range returns 1 if it parsed, 0 if not, neg. on error */ + if (npt->nfattr_to_range(tb, range) > 0) + range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED; + + ip_nat_proto_put(npt); + + DEBUGP("leaving\n"); + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_parse_nat(struct nfattr *cda[], + const struct ip_conntrack *ct, struct ip_nat_range *range) +{ + struct nfattr *tb[CTA_NAT_MAX]; + int err; + + DEBUGP("entered %s\n", __FUNCTION__); + + memset(range, 0, sizeof(*range)); + + if (nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]) < 0) + goto nfattr_failure; + + if (tb[CTA_NAT_MINIP-1]) + range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]); + + if (!tb[CTA_NAT_MAXIP-1]) + range->max_ip = range->min_ip; + else + range->max_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MAXIP-1]); + + if (range->min_ip) + range->flags |= IP_NAT_RANGE_MAP_IPS; + + if (!tb[CTA_NAT_PROTO-1]) + return 0; + + err = ctnetlink_parse_nat_proto(tb[CTA_NAT_PROTO-1], ct, range); + if (err < 0) + return err; + + DEBUGP("leaving\n"); + return 0; + +nfattr_failure: + return -1; +} +#endif + +static inline int +ctnetlink_parse_help(struct nfattr *attr, char **helper_name) +{ + struct nfattr *tb[CTA_HELP_MAX]; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (nfattr_parse_nested(tb, CTA_HELP_MAX, attr) < 0) + goto nfattr_failure; + + if (!tb[CTA_HELP_NAME-1]) + return -EINVAL; + + *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]); + + return 0; + +nfattr_failure: + return -1; +} + +static int +ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct ip_conntrack_tuple_hash *h; + struct ip_conntrack_tuple tuple; + struct ip_conntrack *ct; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (cda[CTA_TUPLE_ORIG-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); + else if (cda[CTA_TUPLE_REPLY-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY); + else { + /* Flush the whole table */ + ip_conntrack_flush(); + return 0; + } + + if (err < 0) + return err; + + h = ip_conntrack_find_get(&tuple, NULL); + if (!h) { + DEBUGP("tuple not found in conntrack hash\n"); + return -ENOENT; + } + + ct = tuplehash_to_ctrack(h); + + if (cda[CTA_ID-1]) { + u_int32_t id = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_ID-1])); + if (ct->id != id) { + ip_conntrack_put(ct); + return -ENOENT; + } + } + if (del_timer(&ct->timeout)) { + ip_conntrack_put(ct); + ct->timeout.function((unsigned long)ct); + return 0; + } + ip_conntrack_put(ct); + DEBUGP("leaving\n"); + + return 0; +} + +static int +ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct ip_conntrack_tuple_hash *h; + struct ip_conntrack_tuple tuple; + struct ip_conntrack *ct; + struct sk_buff *skb2 = NULL; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct nfgenmsg *msg = NLMSG_DATA(nlh); + u32 rlen; + + if (msg->nfgen_family != AF_INET) + return -EAFNOSUPPORT; + + if (NFNL_MSG_TYPE(nlh->nlmsg_type) == + IPCTNL_MSG_CT_GET_CTRZERO) { +#ifdef CONFIG_IP_NF_CT_ACCT + if ((*errp = netlink_dump_start(ctnl, skb, nlh, + ctnetlink_dump_table_w, + ctnetlink_done)) != 0) + return -EINVAL; +#else + return -ENOTSUPP; +#endif + } else { + if ((*errp = netlink_dump_start(ctnl, skb, nlh, + ctnetlink_dump_table, + ctnetlink_done)) != 0) + return -EINVAL; + } + + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + skb_pull(skb, rlen); + return 0; + } + + if (cda[CTA_TUPLE_ORIG-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); + else if (cda[CTA_TUPLE_REPLY-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY); + else + return -EINVAL; + + if (err < 0) + return err; + + h = ip_conntrack_find_get(&tuple, NULL); + if (!h) { + DEBUGP("tuple not found in conntrack hash"); + return -ENOENT; + } + DEBUGP("tuple found\n"); + ct = tuplehash_to_ctrack(h); + + err = -ENOMEM; + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); + if (!skb2) { + ip_conntrack_put(ct); + return -ENOMEM; + } + NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid; + + err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, + IPCTNL_MSG_CT_NEW, 1, ct); + ip_conntrack_put(ct); + if (err <= 0) + goto out; + + err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + if (err < 0) + goto out; + + DEBUGP("leaving\n"); + return 0; + +out: + if (skb2) + kfree_skb(skb2); + return -1; +} + +static inline int +ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[]) +{ + unsigned long d, status = *(u_int32_t *)NFA_DATA(cda[CTA_STATUS-1]); + d = ct->status ^ status; + + if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING)) + /* unchangeable */ + return -EINVAL; + + if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) + /* SEEN_REPLY bit can only be set */ + return -EINVAL; + + + if (d & IPS_ASSURED && !(status & IPS_ASSURED)) + /* ASSURED bit can only be set */ + return -EINVAL; + + if (cda[CTA_NAT-1]) { +#ifndef CONFIG_IP_NF_NAT_NEEDED + return -EINVAL; +#else + unsigned int hooknum; + struct ip_nat_range range; + + if (ctnetlink_parse_nat(cda, ct, &range) < 0) + return -EINVAL; + + DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", + NIPQUAD(range.min_ip), NIPQUAD(range.max_ip), + htons(range.min.all), htons(range.max.all)); + + /* This is tricky but it works. ip_nat_setup_info needs the + * hook number as parameter, so let's do the correct + * conversion and run away */ + if (status & IPS_SRC_NAT_DONE) + hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */ + else if (status & IPS_DST_NAT_DONE) + hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */ + else + return -EINVAL; /* Missing NAT flags */ + + DEBUGP("NAT status: %lu\n", + status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); + + if (ip_nat_initialized(ct, hooknum)) + return -EEXIST; + ip_nat_setup_info(ct, &range, hooknum); + + DEBUGP("NAT status after setup_info: %lu\n", + ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); +#endif + } + + /* Be careful here, modifying NAT bits can screw up things, + * so don't let users modify them directly if they don't pass + * ip_nat_range. */ + ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK); + return 0; +} + + +static inline int +ctnetlink_change_helper(struct ip_conntrack *ct, struct nfattr *cda[]) +{ + struct ip_conntrack_helper *helper; + char *helpname; + int err; + + DEBUGP("entered %s\n", __FUNCTION__); + + /* don't change helper of sibling connections */ + if (ct->master) + return -EINVAL; + + err = ctnetlink_parse_help(cda[CTA_HELP-1], &helpname); + if (err < 0) + return err; + + helper = __ip_conntrack_helper_find_byname(helpname); + if (!helper) { + if (!strcmp(helpname, "")) + helper = NULL; + else + return -EINVAL; + } + + if (ct->helper) { + if (!helper) { + /* we had a helper before ... */ + ip_ct_remove_expectations(ct); + ct->helper = NULL; + } else { + /* need to zero data of old helper */ + memset(&ct->help, 0, sizeof(ct->help)); + } + } + + ct->helper = helper; + + return 0; +} + +static inline int +ctnetlink_change_timeout(struct ip_conntrack *ct, struct nfattr *cda[]) +{ + u_int32_t timeout = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1])); + + if (!del_timer(&ct->timeout)) + return -ETIME; + + ct->timeout.expires = jiffies + timeout * HZ; + add_timer(&ct->timeout); + + return 0; +} + +static int +ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[]) +{ + int err; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (cda[CTA_HELP-1]) { + err = ctnetlink_change_helper(ct, cda); + if (err < 0) + return err; + } + + if (cda[CTA_TIMEOUT-1]) { + err = ctnetlink_change_timeout(ct, cda); + if (err < 0) + return err; + } + + if (cda[CTA_STATUS-1]) { + err = ctnetlink_change_status(ct, cda); + if (err < 0) + return err; + } + + DEBUGP("all done\n"); + return 0; +} + +static int +ctnetlink_create_conntrack(struct nfattr *cda[], + struct ip_conntrack_tuple *otuple, + struct ip_conntrack_tuple *rtuple) +{ + struct ip_conntrack *ct; + int err = -EINVAL; + + DEBUGP("entered %s\n", __FUNCTION__); + + ct = ip_conntrack_alloc(otuple, rtuple); + if (ct == NULL || IS_ERR(ct)) + return -ENOMEM; + + if (!cda[CTA_TIMEOUT-1]) + goto err; + ct->timeout.expires = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1])); + + ct->timeout.expires = jiffies + ct->timeout.expires * HZ; + ct->status |= IPS_CONFIRMED; + + err = ctnetlink_change_status(ct, cda); + if (err < 0) + goto err; + + ct->helper = ip_conntrack_helper_find_get(rtuple); + + add_timer(&ct->timeout); + ip_conntrack_hash_insert(ct); + + if (ct->helper) + ip_conntrack_helper_put(ct->helper); + + DEBUGP("conntrack with id %u inserted\n", ct->id); + return 0; + +err: + ip_conntrack_free(ct); + return err; +} + +static int +ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct ip_conntrack_tuple otuple, rtuple; + struct ip_conntrack_tuple_hash *h = NULL; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (cda[CTA_TUPLE_ORIG-1]) { + err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG); + if (err < 0) + return err; + } + + if (cda[CTA_TUPLE_REPLY-1]) { + err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY); + if (err < 0) + return err; + } + + write_lock_bh(&ip_conntrack_lock); + if (cda[CTA_TUPLE_ORIG-1]) + h = __ip_conntrack_find(&otuple, NULL); + else if (cda[CTA_TUPLE_REPLY-1]) + h = __ip_conntrack_find(&rtuple, NULL); + + if (h == NULL) { + write_unlock_bh(&ip_conntrack_lock); + DEBUGP("no such conntrack, create new\n"); + err = -ENOENT; + if (nlh->nlmsg_flags & NLM_F_CREATE) + err = ctnetlink_create_conntrack(cda, &otuple, &rtuple); + return err; + } + /* implicit 'else' */ + + /* we only allow nat config for new conntracks */ + if (cda[CTA_NAT-1]) { + err = -EINVAL; + goto out_unlock; + } + + /* We manipulate the conntrack inside the global conntrack table lock, + * so there's no need to increase the refcount */ + DEBUGP("conntrack found\n"); + err = -EEXIST; + if (!(nlh->nlmsg_flags & NLM_F_EXCL)) + err = ctnetlink_change_conntrack(tuplehash_to_ctrack(h), cda); + +out_unlock: + write_unlock_bh(&ip_conntrack_lock); + return err; +} + +/*********************************************************************** + * EXPECT + ***********************************************************************/ + +static inline int +ctnetlink_exp_dump_tuple(struct sk_buff *skb, + const struct ip_conntrack_tuple *tuple, + enum ctattr_expect type) +{ + struct nfattr *nest_parms = NFA_NEST(skb, type); + + if (ctnetlink_dump_tuples(skb, tuple) < 0) + goto nfattr_failure; + + NFA_NEST_END(skb, nest_parms); + + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_exp_dump_expect(struct sk_buff *skb, + const struct ip_conntrack_expect *exp) +{ + struct ip_conntrack *master = exp->master; + u_int32_t timeout = htonl((exp->timeout.expires - jiffies) / HZ); + u_int32_t id = htonl(exp->id); + + if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0) + goto nfattr_failure; + if (ctnetlink_exp_dump_tuple(skb, &exp->mask, CTA_EXPECT_MASK) < 0) + goto nfattr_failure; + if (ctnetlink_exp_dump_tuple(skb, + &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + CTA_EXPECT_MASTER) < 0) + goto nfattr_failure; + + NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(timeout), &timeout); + NFA_PUT(skb, CTA_EXPECT_ID, sizeof(u_int32_t), &id); + + return 0; + +nfattr_failure: + return -1; +} + +static int +ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq, + int event, + int nowait, + const struct ip_conntrack_expect *exp) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned char *b; + + b = skb->tail; + + event |= NFNL_SUBSYS_CTNETLINK_EXP << 8; + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0; + nfmsg->nfgen_family = AF_INET; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (ctnetlink_exp_dump_expect(skb, exp) < 0) + goto nfattr_failure; + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +nfattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +static int ctnetlink_expect_event(struct notifier_block *this, + unsigned long events, void *ptr) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct ip_conntrack_expect *exp = (struct ip_conntrack_expect *)ptr; + struct sk_buff *skb; + unsigned int type; + unsigned char *b; + int flags = 0; + u16 proto; + + if (events & IPEXP_NEW) { + type = IPCTNL_MSG_EXP_NEW; + flags = NLM_F_CREATE|NLM_F_EXCL; + } else + return NOTIFY_DONE; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); + if (!skb) + return NOTIFY_DONE; + + b = skb->tail; + + type |= NFNL_SUBSYS_CTNETLINK << 8; + nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = flags; + nfmsg->nfgen_family = AF_INET; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (ctnetlink_exp_dump_expect(skb, exp) < 0) + goto nfattr_failure; + + nlh->nlmsg_len = skb->tail - b; + proto = exp->tuple.dst.protonum; + nfnetlink_send(skb, 0, NF_NETLINK_CONNTRACK_EXP_NEW, 0); + return NOTIFY_DONE; + +nlmsg_failure: +nfattr_failure: + kfree_skb(skb); + return NOTIFY_DONE; +} +#endif + +static int +ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ip_conntrack_expect *exp = NULL; + struct list_head *i; + u_int32_t *id = (u_int32_t *) &cb->args[0]; + + DEBUGP("entered %s, last id=%llu\n", __FUNCTION__, *id); + + read_lock_bh(&ip_conntrack_lock); + list_for_each_prev(i, &ip_conntrack_expect_list) { + exp = (struct ip_conntrack_expect *) i; + if (exp->id <= *id) + continue; + if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_EXP_NEW, + 1, exp) < 0) + goto out; + *id = exp->id; + } +out: + read_unlock_bh(&ip_conntrack_lock); + + DEBUGP("leaving, last id=%llu\n", *id); + + return skb->len; +} + +static int +ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct ip_conntrack_tuple tuple; + struct ip_conntrack_expect *exp; + struct sk_buff *skb2; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct nfgenmsg *msg = NLMSG_DATA(nlh); + u32 rlen; + + if (msg->nfgen_family != AF_INET) + return -EAFNOSUPPORT; + + if ((*errp = netlink_dump_start(ctnl, skb, nlh, + ctnetlink_exp_dump_table, + ctnetlink_done)) != 0) + return -EINVAL; + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + skb_pull(skb, rlen); + return 0; + } + + if (cda[CTA_EXPECT_MASTER-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER); + else + return -EINVAL; + + if (err < 0) + return err; + + exp = ip_conntrack_expect_find_get(&tuple); + if (!exp) + return -ENOENT; + + err = -ENOMEM; + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb2) + goto out; + NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid; + + err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid, + nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, + 1, exp); + if (err <= 0) + goto out; + + ip_conntrack_expect_put(exp); + + err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + if (err < 0) + goto free; + + return err; + +out: + ip_conntrack_expect_put(exp); +free: + if (skb2) + kfree_skb(skb2); + return err; +} + +static int +ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct ip_conntrack_expect *exp, *tmp; + struct ip_conntrack_tuple tuple; + struct ip_conntrack_helper *h; + int err; + + if (cda[CTA_EXPECT_TUPLE-1]) { + /* delete a single expect by tuple */ + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); + if (err < 0) + return err; + + /* bump usage count to 2 */ + exp = ip_conntrack_expect_find_get(&tuple); + if (!exp) + return -ENOENT; + + if (cda[CTA_EXPECT_ID-1]) { + u_int32_t id = + *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]); + if (exp->id != ntohl(id)) { + ip_conntrack_expect_put(exp); + return -ENOENT; + } + } + + /* after list removal, usage count == 1 */ + ip_conntrack_unexpect_related(exp); + /* have to put what we 'get' above. + * after this line usage count == 0 */ + ip_conntrack_expect_put(exp); + } else if (cda[CTA_EXPECT_HELP_NAME-1]) { + char *name = NFA_DATA(cda[CTA_EXPECT_HELP_NAME-1]); + + /* delete all expectations for this helper */ + write_lock_bh(&ip_conntrack_lock); + h = __ip_conntrack_helper_find_byname(name); + if (!h) { + write_unlock_bh(&ip_conntrack_lock); + return -EINVAL; + } + list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, + list) { + if (exp->master->helper == h + && del_timer(&exp->timeout)) + __ip_ct_expect_unlink_destroy(exp); + } + write_unlock(&ip_conntrack_lock); + } else { + /* This basically means we have to flush everything*/ + write_lock_bh(&ip_conntrack_lock); + list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, + list) { + if (del_timer(&exp->timeout)) + __ip_ct_expect_unlink_destroy(exp); + } + write_unlock_bh(&ip_conntrack_lock); + } + + return 0; +} +static int +ctnetlink_change_expect(struct ip_conntrack_expect *x, struct nfattr *cda[]) +{ + return -EOPNOTSUPP; +} + +static int +ctnetlink_create_expect(struct nfattr *cda[]) +{ + struct ip_conntrack_tuple tuple, mask, master_tuple; + struct ip_conntrack_tuple_hash *h = NULL; + struct ip_conntrack_expect *exp; + struct ip_conntrack *ct; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + /* caller guarantees that those three CTA_EXPECT_* exist */ + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); + if (err < 0) + return err; + err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK); + if (err < 0) + return err; + err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER); + if (err < 0) + return err; + + /* Look for master conntrack of this expectation */ + h = ip_conntrack_find_get(&master_tuple, NULL); + if (!h) + return -ENOENT; + ct = tuplehash_to_ctrack(h); + + if (!ct->helper) { + /* such conntrack hasn't got any helper, abort */ + err = -EINVAL; + goto out; + } + + exp = ip_conntrack_expect_alloc(ct); + if (!exp) { + err = -ENOMEM; + goto out; + } + + exp->expectfn = NULL; + exp->master = ct; + memcpy(&exp->tuple, &tuple, sizeof(struct ip_conntrack_tuple)); + memcpy(&exp->mask, &mask, sizeof(struct ip_conntrack_tuple)); + + err = ip_conntrack_expect_related(exp); + ip_conntrack_expect_put(exp); + +out: + ip_conntrack_put(tuplehash_to_ctrack(h)); + return err; +} + +static int +ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct ip_conntrack_tuple tuple; + struct ip_conntrack_expect *exp; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (!cda[CTA_EXPECT_TUPLE-1] + || !cda[CTA_EXPECT_MASK-1] + || !cda[CTA_EXPECT_MASTER-1]) + return -EINVAL; + + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); + if (err < 0) + return err; + + write_lock_bh(&ip_conntrack_lock); + exp = __ip_conntrack_expect_find(&tuple); + + if (!exp) { + write_unlock_bh(&ip_conntrack_lock); + err = -ENOENT; + if (nlh->nlmsg_flags & NLM_F_CREATE) + err = ctnetlink_create_expect(cda); + return err; + } + + err = -EEXIST; + if (!(nlh->nlmsg_flags & NLM_F_EXCL)) + err = ctnetlink_change_expect(exp, cda); + write_unlock_bh(&ip_conntrack_lock); + + DEBUGP("leaving\n"); + + return err; +} + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +static struct notifier_block ctnl_notifier = { + .notifier_call = ctnetlink_conntrack_event, +}; + +static struct notifier_block ctnl_notifier_exp = { + .notifier_call = ctnetlink_expect_event, +}; +#endif + +static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { + [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, + .attr_count = CTA_MAX, + .cap_required = CAP_NET_ADMIN }, + [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, + .attr_count = CTA_MAX, + .cap_required = CAP_NET_ADMIN }, + [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, + .attr_count = CTA_MAX, + .cap_required = CAP_NET_ADMIN }, + [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, + .attr_count = CTA_MAX, + .cap_required = CAP_NET_ADMIN }, +}; + +static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { + [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, + .attr_count = CTA_EXPECT_MAX, + .cap_required = CAP_NET_ADMIN }, + [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, + .attr_count = CTA_EXPECT_MAX, + .cap_required = CAP_NET_ADMIN }, + [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, + .attr_count = CTA_EXPECT_MAX, + .cap_required = CAP_NET_ADMIN }, +}; + +static struct nfnetlink_subsystem ctnl_subsys = { + .name = "conntrack", + .subsys_id = NFNL_SUBSYS_CTNETLINK, + .cb_count = IPCTNL_MSG_MAX, + .cb = ctnl_cb, +}; + +static struct nfnetlink_subsystem ctnl_exp_subsys = { + .name = "conntrack_expect", + .subsys_id = NFNL_SUBSYS_CTNETLINK_EXP, + .cb_count = IPCTNL_MSG_EXP_MAX, + .cb = ctnl_exp_cb, +}; + +static int __init ctnetlink_init(void) +{ + int ret; + + printk("ctnetlink v%s: registering with nfnetlink.\n", version); + ret = nfnetlink_subsys_register(&ctnl_subsys); + if (ret < 0) { + printk("ctnetlink_init: cannot register with nfnetlink.\n"); + goto err_out; + } + + ret = nfnetlink_subsys_register(&ctnl_exp_subsys); + if (ret < 0) { + printk("ctnetlink_init: cannot register exp with nfnetlink.\n"); + goto err_unreg_subsys; + } + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS + ret = ip_conntrack_register_notifier(&ctnl_notifier); + if (ret < 0) { + printk("ctnetlink_init: cannot register notifier.\n"); + goto err_unreg_exp_subsys; + } + + ret = ip_conntrack_expect_register_notifier(&ctnl_notifier_exp); + if (ret < 0) { + printk("ctnetlink_init: cannot expect register notifier.\n"); + goto err_unreg_notifier; + } +#endif + + return 0; + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +err_unreg_notifier: + ip_conntrack_unregister_notifier(&ctnl_notifier); +err_unreg_exp_subsys: + nfnetlink_subsys_unregister(&ctnl_exp_subsys); +#endif +err_unreg_subsys: + nfnetlink_subsys_unregister(&ctnl_subsys); +err_out: + return ret; +} + +static void __exit ctnetlink_exit(void) +{ + printk("ctnetlink: unregistering from nfnetlink.\n"); + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS + ip_conntrack_unregister_notifier(&ctnl_notifier_exp); + ip_conntrack_unregister_notifier(&ctnl_notifier); +#endif + + nfnetlink_subsys_unregister(&ctnl_exp_subsys); + nfnetlink_subsys_unregister(&ctnl_subsys); + return; +} + +module_init(ctnetlink_init); +module_exit(ctnetlink_exit); diff -puN net/ipv4/netfilter/ip_conntrack_proto_icmp.c~git-net net/ipv4/netfilter/ip_conntrack_proto_icmp.c --- devel/net/ipv4/netfilter/ip_conntrack_proto_icmp.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ip_conntrack_proto_icmp.c 2005-08-06 23:39:10.000000000 -0700 @@ -102,22 +102,24 @@ static int icmp_packet(struct ip_conntra ct->timeout.function((unsigned long)ct); } else { atomic_inc(&ct->proto.icmp.count); + ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout); } return NF_ACCEPT; } +static u_int8_t valid_new[] = { + [ICMP_ECHO] = 1, + [ICMP_TIMESTAMP] = 1, + [ICMP_INFO_REQUEST] = 1, + [ICMP_ADDRESS] = 1 +}; + /* Called when a new connection for this protocol found. */ static int icmp_new(struct ip_conntrack *conntrack, const struct sk_buff *skb) { - static u_int8_t valid_new[] - = { [ICMP_ECHO] = 1, - [ICMP_TIMESTAMP] = 1, - [ICMP_INFO_REQUEST] = 1, - [ICMP_ADDRESS] = 1 }; - if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) { /* Can't create a new ICMP `conn' with this. */ @@ -158,11 +160,12 @@ icmp_error_message(struct sk_buff *skb, return NF_ACCEPT; } - innerproto = ip_ct_find_proto(inside->ip.protocol); + innerproto = ip_conntrack_proto_find_get(inside->ip.protocol); dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4; /* Are they talking about one of our connections? */ if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) { DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol); + ip_conntrack_proto_put(innerproto); return NF_ACCEPT; } @@ -170,8 +173,10 @@ icmp_error_message(struct sk_buff *skb, been preserved inside the ICMP. */ if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { DEBUGP("icmp_error_track: Can't invert tuple\n"); + ip_conntrack_proto_put(innerproto); return NF_ACCEPT; } + ip_conntrack_proto_put(innerproto); *ctinfo = IP_CT_RELATED; @@ -212,7 +217,7 @@ icmp_error(struct sk_buff *skb, enum ip_ icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih); if (icmph == NULL) { if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_icmp: short packet "); return -NF_ACCEPT; } @@ -226,13 +231,13 @@ icmp_error(struct sk_buff *skb, enum ip_ if (!(u16)csum_fold(skb->csum)) break; if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_icmp: bad HW ICMP checksum "); return -NF_ACCEPT; case CHECKSUM_NONE: if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_icmp: bad ICMP checksum "); return -NF_ACCEPT; } @@ -249,7 +254,7 @@ checksum_skipped: */ if (icmph->type > NR_ICMP_TYPES) { if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_icmp: invalid ICMP type "); return -NF_ACCEPT; } @@ -265,6 +270,47 @@ checksum_skipped: return icmp_error_message(skb, ctinfo, hooknum); } +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) +static int icmp_tuple_to_nfattr(struct sk_buff *skb, + const struct ip_conntrack_tuple *t) +{ + NFA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t), + &t->src.u.icmp.id); + NFA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t), + &t->dst.u.icmp.type); + NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t), + &t->dst.u.icmp.code); + + if (t->dst.u.icmp.type >= sizeof(valid_new) + || !valid_new[t->dst.u.icmp.type]) + return -EINVAL; + + return 0; + +nfattr_failure: + return -1; +} + +static int icmp_nfattr_to_tuple(struct nfattr *tb[], + struct ip_conntrack_tuple *tuple) +{ + if (!tb[CTA_PROTO_ICMP_TYPE-1] + || !tb[CTA_PROTO_ICMP_CODE-1] + || !tb[CTA_PROTO_ICMP_ID-1]) + return -1; + + tuple->dst.u.icmp.type = + *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]); + tuple->dst.u.icmp.code = + *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]); + tuple->src.u.icmp.id = + *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]); + + return 0; +} +#endif + struct ip_conntrack_protocol ip_conntrack_protocol_icmp = { .proto = IPPROTO_ICMP, @@ -276,4 +322,9 @@ struct ip_conntrack_protocol ip_conntrac .packet = icmp_packet, .new = icmp_new, .error = icmp_error, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + .tuple_to_nfattr = icmp_tuple_to_nfattr, + .nfattr_to_tuple = icmp_nfattr_to_tuple, +#endif }; diff -puN net/ipv4/netfilter/ip_conntrack_proto_sctp.c~git-net net/ipv4/netfilter/ip_conntrack_proto_sctp.c --- devel/net/ipv4/netfilter/ip_conntrack_proto_sctp.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ip_conntrack_proto_sctp.c 2005-08-06 23:39:10.000000000 -0700 @@ -404,6 +404,8 @@ static int sctp_packet(struct ip_conntra } conntrack->proto.sctp.state = newconntrack; + if (oldsctpstate != newconntrack) + ip_conntrack_event_cache(IPCT_PROTOINFO, skb); write_unlock_bh(&sctp_lock); } @@ -503,7 +505,12 @@ static struct ip_conntrack_protocol ip_c .packet = sctp_packet, .new = sctp_new, .destroy = NULL, - .me = THIS_MODULE + .me = THIS_MODULE, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, + .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, +#endif }; #ifdef CONFIG_SYSCTL diff -puN net/ipv4/netfilter/ip_conntrack_proto_tcp.c~git-net net/ipv4/netfilter/ip_conntrack_proto_tcp.c --- devel/net/ipv4/netfilter/ip_conntrack_proto_tcp.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ip_conntrack_proto_tcp.c 2005-08-06 23:39:10.000000000 -0700 @@ -336,6 +336,23 @@ static int tcp_print_conntrack(struct se return seq_printf(s, "%s ", tcp_conntrack_names[state]); } +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) +static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa, + const struct ip_conntrack *ct) +{ + read_lock_bh(&tcp_lock); + NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t), + &ct->proto.tcp.state); + read_unlock_bh(&tcp_lock); + + return 0; + +nfattr_failure: + return -1; +} +#endif + static unsigned int get_conntrack_index(const struct tcphdr *tcph) { if (tcph->rst) return TCP_RST_SET; @@ -699,7 +716,7 @@ static int tcp_in_window(struct ip_ct_tc res = 1; } else { if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: %s ", before(seq, sender->td_maxend + 1) ? after(end, sender->td_end - receiver->td_maxwin - 1) ? @@ -798,7 +815,7 @@ static int tcp_error(struct sk_buff *skb sizeof(_tcph), &_tcph); if (th == NULL) { if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: short packet "); return -NF_ACCEPT; } @@ -806,7 +823,7 @@ static int tcp_error(struct sk_buff *skb /* Not whole TCP header or malformed packet */ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: truncated/malformed packet "); return -NF_ACCEPT; } @@ -823,7 +840,7 @@ static int tcp_error(struct sk_buff *skb skb->ip_summed == CHECKSUM_HW ? skb->csum : skb_checksum(skb, iph->ihl*4, tcplen, 0))) { if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: bad TCP checksum "); return -NF_ACCEPT; } @@ -832,7 +849,7 @@ static int tcp_error(struct sk_buff *skb tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR)); if (!tcp_valid_flags[tcpflags]) { if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: invalid TCP flag combination "); return -NF_ACCEPT; } @@ -880,8 +897,9 @@ static int tcp_packet(struct ip_conntrac */ write_unlock_bh(&tcp_lock); if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, - "ip_ct_tcp: killing out of sync session "); + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + NULL, "ip_ct_tcp: " + "killing out of sync session "); if (del_timer(&conntrack->timeout)) conntrack->timeout.function((unsigned long) conntrack); @@ -895,7 +913,7 @@ static int tcp_packet(struct ip_conntrac write_unlock_bh(&tcp_lock); if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: invalid packet ignored "); return NF_ACCEPT; case TCP_CONNTRACK_MAX: @@ -905,7 +923,7 @@ static int tcp_packet(struct ip_conntrac old_state); write_unlock_bh(&tcp_lock); if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: invalid state "); return -NF_ACCEPT; case TCP_CONNTRACK_SYN_SENT: @@ -926,7 +944,7 @@ static int tcp_packet(struct ip_conntrac write_unlock_bh(&tcp_lock); if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, - "ip_ct_tcp: invalid SYN"); + NULL, "ip_ct_tcp: invalid SYN"); return -NF_ACCEPT; } case TCP_CONNTRACK_CLOSE: @@ -973,6 +991,10 @@ static int tcp_packet(struct ip_conntrac ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; write_unlock_bh(&tcp_lock); + ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); + if (new_state != old_state) + ip_conntrack_event_cache(IPCT_PROTOINFO, skb); + if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { /* If only reply is a RST, we can consider ourselves not to have an established connection: this is a fairly common @@ -1096,4 +1118,10 @@ struct ip_conntrack_protocol ip_conntrac .packet = tcp_packet, .new = tcp_new, .error = tcp_error, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + .to_nfattr = tcp_to_nfattr, + .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, + .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, +#endif }; diff -puN net/ipv4/netfilter/ip_conntrack_proto_udp.c~git-net net/ipv4/netfilter/ip_conntrack_proto_udp.c --- devel/net/ipv4/netfilter/ip_conntrack_proto_udp.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ip_conntrack_proto_udp.c 2005-08-06 23:39:10.000000000 -0700 @@ -73,7 +73,8 @@ static int udp_packet(struct ip_conntrac ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout_stream); /* Also, more likely to be important, and not a probe */ - set_bit(IPS_ASSURED_BIT, &conntrack->status); + if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status)) + ip_conntrack_event_cache(IPCT_STATUS, skb); } else ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout); @@ -97,7 +98,7 @@ static int udp_error(struct sk_buff *skb hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr); if (hdr == NULL) { if (LOG_INVALID(IPPROTO_UDP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_udp: short packet "); return -NF_ACCEPT; } @@ -105,7 +106,7 @@ static int udp_error(struct sk_buff *skb /* Truncated/malformed packets */ if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { if (LOG_INVALID(IPPROTO_UDP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_udp: truncated/malformed packet "); return -NF_ACCEPT; } @@ -125,7 +126,7 @@ static int udp_error(struct sk_buff *skb skb->ip_summed == CHECKSUM_HW ? skb->csum : skb_checksum(skb, iph->ihl*4, udplen, 0))) { if (LOG_INVALID(IPPROTO_UDP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_udp: bad UDP checksum "); return -NF_ACCEPT; } @@ -144,4 +145,9 @@ struct ip_conntrack_protocol ip_conntrac .packet = udp_packet, .new = udp_new, .error = udp_error, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, + .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, +#endif }; diff -puN net/ipv4/netfilter/ip_conntrack_standalone.c~git-net net/ipv4/netfilter/ip_conntrack_standalone.c --- devel/net/ipv4/netfilter/ip_conntrack_standalone.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ip_conntrack_standalone.c 2005-08-06 23:39:10.000000000 -0700 @@ -5,7 +5,7 @@ */ /* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team + * (C) 2002-2005 Netfilter Core Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -147,8 +147,7 @@ static int ct_seq_show(struct seq_file * if (DIRECTION(hash)) return 0; - proto = ip_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.protonum); + proto = __ip_conntrack_proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); IP_NF_ASSERT(proto); if (seq_printf(s, "%-8s %u %ld ", @@ -185,7 +184,7 @@ static int ct_seq_show(struct seq_file * return -ENOSPC; #if defined(CONFIG_IP_NF_CONNTRACK_MARK) - if (seq_printf(s, "mark=%lu ", conntrack->mark)) + if (seq_printf(s, "mark=%u ", conntrack->mark)) return -ENOSPC; #endif @@ -283,7 +282,7 @@ static int exp_seq_show(struct seq_file seq_printf(s, "proto=%u ", expect->tuple.dst.protonum); print_tuple(s, &expect->tuple, - ip_ct_find_proto(expect->tuple.dst.protonum)); + __ip_conntrack_proto_find(expect->tuple.dst.protonum)); return seq_putc(s, '\n'); } @@ -889,6 +888,7 @@ static int init_or_cleanup(int init) return ret; cleanup: + synchronize_net(); #ifdef CONFIG_SYSCTL unregister_sysctl_table(ip_ct_sysctl_header); cleanup_localinops: @@ -971,6 +971,14 @@ void need_ip_conntrack(void) { } +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +EXPORT_SYMBOL_GPL(ip_conntrack_chain); +EXPORT_SYMBOL_GPL(ip_conntrack_expect_chain); +EXPORT_SYMBOL_GPL(ip_conntrack_register_notifier); +EXPORT_SYMBOL_GPL(ip_conntrack_unregister_notifier); +EXPORT_SYMBOL_GPL(__ip_ct_event_cache_init); +EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache); +#endif EXPORT_SYMBOL(ip_conntrack_protocol_register); EXPORT_SYMBOL(ip_conntrack_protocol_unregister); EXPORT_SYMBOL(ip_ct_get_tuple); @@ -982,12 +990,16 @@ EXPORT_SYMBOL(ip_conntrack_helper_regist EXPORT_SYMBOL(ip_conntrack_helper_unregister); EXPORT_SYMBOL(ip_ct_iterate_cleanup); EXPORT_SYMBOL(ip_ct_refresh_acct); -EXPORT_SYMBOL(ip_ct_protos); -EXPORT_SYMBOL(ip_ct_find_proto); + EXPORT_SYMBOL(ip_conntrack_expect_alloc); EXPORT_SYMBOL(ip_conntrack_expect_put); +EXPORT_SYMBOL_GPL(ip_conntrack_expect_find_get); EXPORT_SYMBOL(ip_conntrack_expect_related); EXPORT_SYMBOL(ip_conntrack_unexpect_related); +EXPORT_SYMBOL_GPL(ip_conntrack_expect_list); +EXPORT_SYMBOL_GPL(__ip_conntrack_expect_find); +EXPORT_SYMBOL_GPL(__ip_ct_expect_unlink_destroy); + EXPORT_SYMBOL(ip_conntrack_tuple_taken); EXPORT_SYMBOL(ip_ct_gather_frags); EXPORT_SYMBOL(ip_conntrack_htable_size); @@ -995,7 +1007,28 @@ EXPORT_SYMBOL(ip_conntrack_lock); EXPORT_SYMBOL(ip_conntrack_hash); EXPORT_SYMBOL(ip_conntrack_untracked); EXPORT_SYMBOL_GPL(ip_conntrack_find_get); -EXPORT_SYMBOL_GPL(ip_conntrack_put); #ifdef CONFIG_IP_NF_NAT_NEEDED EXPORT_SYMBOL(ip_conntrack_tcp_update); #endif + +EXPORT_SYMBOL_GPL(ip_conntrack_flush); +EXPORT_SYMBOL_GPL(__ip_conntrack_find); + +EXPORT_SYMBOL_GPL(ip_conntrack_alloc); +EXPORT_SYMBOL_GPL(ip_conntrack_free); +EXPORT_SYMBOL_GPL(ip_conntrack_hash_insert); + +EXPORT_SYMBOL_GPL(ip_ct_remove_expectations); + +EXPORT_SYMBOL_GPL(ip_conntrack_helper_find_get); +EXPORT_SYMBOL_GPL(ip_conntrack_helper_put); +EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname); + +EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get); +EXPORT_SYMBOL_GPL(ip_conntrack_proto_put); +EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find); +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) +EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr); +EXPORT_SYMBOL_GPL(ip_ct_port_nfattr_to_tuple); +#endif diff -puN net/ipv4/netfilter/ip_nat_core.c~git-net net/ipv4/netfilter/ip_nat_core.c --- devel/net/ipv4/netfilter/ip_nat_core.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ip_nat_core.c 2005-08-06 23:39:10.000000000 -0700 @@ -47,8 +47,39 @@ DEFINE_RWLOCK(ip_nat_lock); static unsigned int ip_nat_htable_size; static struct list_head *bysource; + +#define MAX_IP_NAT_PROTO 256 struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; +static inline struct ip_nat_protocol * +__ip_nat_proto_find(u_int8_t protonum) +{ + return ip_nat_protos[protonum]; +} + +struct ip_nat_protocol * +ip_nat_proto_find_get(u_int8_t protonum) +{ + struct ip_nat_protocol *p; + + /* we need to disable preemption to make sure 'p' doesn't get + * removed until we've grabbed the reference */ + preempt_disable(); + p = __ip_nat_proto_find(protonum); + if (p) { + if (!try_module_get(p->me)) + p = &ip_nat_unknown_protocol; + } + preempt_enable(); + + return p; +} + +void +ip_nat_proto_put(struct ip_nat_protocol *p) +{ + module_put(p->me); +} /* We keep an extra hash for each conntrack, for fast searching. */ static inline unsigned int @@ -103,7 +134,8 @@ static int in_range(const struct ip_conntrack_tuple *tuple, const struct ip_nat_range *range) { - struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum); + struct ip_nat_protocol *proto = + __ip_nat_proto_find(tuple->dst.protonum); /* If we are supposed to map IPs, then we must be in the range specified, otherwise let this drag us onto a new src IP. */ @@ -216,8 +248,7 @@ get_unique_tuple(struct ip_conntrack_tup struct ip_conntrack *conntrack, enum ip_nat_manip_type maniptype) { - struct ip_nat_protocol *proto - = ip_nat_find_proto(orig_tuple->dst.protonum); + struct ip_nat_protocol *proto; /* 1) If this srcip/proto/src-proto-part is currently mapped, and that same mapping gives a unique tuple within the given @@ -242,14 +273,20 @@ get_unique_tuple(struct ip_conntrack_tup /* 3) The per-protocol part of the manip is made to map into the range to make a unique tuple. */ + proto = ip_nat_proto_find_get(orig_tuple->dst.protonum); + /* Only bother mapping if it's not already in range and unique */ if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) || proto->in_range(tuple, maniptype, &range->min, &range->max)) - && !ip_nat_used_tuple(tuple, conntrack)) + && !ip_nat_used_tuple(tuple, conntrack)) { + ip_nat_proto_put(proto); return; + } /* Last change: get protocol to try to obtain unique tuple. */ proto->unique_tuple(tuple, range, maniptype, conntrack); + + ip_nat_proto_put(proto); } unsigned int @@ -320,17 +357,20 @@ manip_pkt(u_int16_t proto, enum ip_nat_manip_type maniptype) { struct iphdr *iph; + struct ip_nat_protocol *p; - (*pskb)->nfcache |= NFC_ALTERED; - if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph))) + if (!skb_make_writable(pskb, iphdroff + sizeof(*iph))) return 0; iph = (void *)(*pskb)->data + iphdroff; /* Manipulate protcol part. */ - if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff, - target, maniptype)) + p = ip_nat_proto_find_get(proto); + if (!p->manip_pkt(pskb, iphdroff, target, maniptype)) { + ip_nat_proto_put(p); return 0; + } + ip_nat_proto_put(p); iph = (void *)(*pskb)->data + iphdroff; @@ -391,7 +431,7 @@ int icmp_reply_translation(struct sk_buf struct ip_conntrack_tuple inner, target; int hdrlen = (*pskb)->nh.iph->ihl * 4; - if (!skb_ip_make_writable(pskb, hdrlen + sizeof(*inside))) + if (!skb_make_writable(pskb, hdrlen + sizeof(*inside))) return 0; inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; @@ -426,7 +466,8 @@ int icmp_reply_translation(struct sk_buf if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 + sizeof(struct icmphdr) + inside->ip.ihl*4, - &inner, ip_ct_find_proto(inside->ip.protocol))) + &inner, + __ip_conntrack_proto_find(inside->ip.protocol))) return 0; /* Change inner back to look like incoming packet. We do the @@ -496,6 +537,49 @@ void ip_nat_protocol_unregister(struct i synchronize_net(); } +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) +int +ip_nat_port_range_to_nfattr(struct sk_buff *skb, + const struct ip_nat_range *range) +{ + NFA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(u_int16_t), + &range->min.tcp.port); + NFA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(u_int16_t), + &range->max.tcp.port); + + return 0; + +nfattr_failure: + return -1; +} + +int +ip_nat_port_nfattr_to_range(struct nfattr *tb[], struct ip_nat_range *range) +{ + int ret = 0; + + /* we have to return whether we actually parsed something or not */ + + if (tb[CTA_PROTONAT_PORT_MIN-1]) { + ret = 1; + range->min.tcp.port = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MIN-1]); + } + + if (!tb[CTA_PROTONAT_PORT_MAX-1]) { + if (ret) + range->max.tcp.port = range->min.tcp.port; + } else { + ret = 1; + range->max.tcp.port = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MAX-1]); + } + + return ret; +} +#endif + int __init ip_nat_init(void) { size_t i; diff -puN net/ipv4/netfilter/ip_nat_helper.c~git-net net/ipv4/netfilter/ip_nat_helper.c --- devel/net/ipv4/netfilter/ip_nat_helper.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ip_nat_helper.c 2005-08-06 23:39:10.000000000 -0700 @@ -168,7 +168,7 @@ ip_nat_mangle_tcp_packet(struct sk_buff struct tcphdr *tcph; int datalen; - if (!skb_ip_make_writable(pskb, (*pskb)->len)) + if (!skb_make_writable(pskb, (*pskb)->len)) return 0; if (rep_len > match_len @@ -228,7 +228,7 @@ ip_nat_mangle_udp_packet(struct sk_buff match_offset + match_len) return 0; - if (!skb_ip_make_writable(pskb, (*pskb)->len)) + if (!skb_make_writable(pskb, (*pskb)->len)) return 0; if (rep_len > match_len @@ -315,7 +315,7 @@ ip_nat_sack_adjust(struct sk_buff **pskb optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr); optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4; - if (!skb_ip_make_writable(pskb, optend)) + if (!skb_make_writable(pskb, optend)) return 0; dir = CTINFO2DIR(ctinfo); @@ -363,7 +363,7 @@ ip_nat_seq_adjust(struct sk_buff **pskb, this_way = &ct->nat.info.seq[dir]; other_way = &ct->nat.info.seq[!dir]; - if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) + if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) return 0; tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; diff -puN net/ipv4/netfilter/ip_nat_proto_icmp.c~git-net net/ipv4/netfilter/ip_nat_proto_icmp.c --- devel/net/ipv4/netfilter/ip_nat_proto_icmp.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ip_nat_proto_icmp.c 2005-08-06 23:39:10.000000000 -0700 @@ -62,7 +62,7 @@ icmp_manip_pkt(struct sk_buff **pskb, struct icmphdr *hdr; unsigned int hdroff = iphdroff + iph->ihl*4; - if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) + if (!skb_make_writable(pskb, hdroff + sizeof(*hdr))) return 0; hdr = (struct icmphdr *)((*pskb)->data + hdroff); @@ -106,11 +106,18 @@ icmp_print_range(char *buffer, const str else return 0; } -struct ip_nat_protocol ip_nat_protocol_icmp -= { "ICMP", IPPROTO_ICMP, - icmp_manip_pkt, - icmp_in_range, - icmp_unique_tuple, - icmp_print, - icmp_print_range +struct ip_nat_protocol ip_nat_protocol_icmp = { + .name = "ICMP", + .protonum = IPPROTO_ICMP, + .me = THIS_MODULE, + .manip_pkt = icmp_manip_pkt, + .in_range = icmp_in_range, + .unique_tuple = icmp_unique_tuple, + .print = icmp_print, + .print_range = icmp_print_range, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + .range_to_nfattr = ip_nat_port_range_to_nfattr, + .nfattr_to_range = ip_nat_port_nfattr_to_range, +#endif }; diff -puN net/ipv4/netfilter/ip_nat_proto_tcp.c~git-net net/ipv4/netfilter/ip_nat_proto_tcp.c --- devel/net/ipv4/netfilter/ip_nat_proto_tcp.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ip_nat_proto_tcp.c 2005-08-06 23:39:10.000000000 -0700 @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -102,7 +103,7 @@ tcp_manip_pkt(struct sk_buff **pskb, if ((*pskb)->len >= hdroff + sizeof(struct tcphdr)) hdrsize = sizeof(struct tcphdr); - if (!skb_ip_make_writable(pskb, hdroff + hdrsize)) + if (!skb_make_writable(pskb, hdroff + hdrsize)) return 0; iph = (struct iphdr *)((*pskb)->data + iphdroff); @@ -169,11 +170,18 @@ tcp_print_range(char *buffer, const stru else return 0; } -struct ip_nat_protocol ip_nat_protocol_tcp -= { "TCP", IPPROTO_TCP, - tcp_manip_pkt, - tcp_in_range, - tcp_unique_tuple, - tcp_print, - tcp_print_range +struct ip_nat_protocol ip_nat_protocol_tcp = { + .name = "TCP", + .protonum = IPPROTO_TCP, + .me = THIS_MODULE, + .manip_pkt = tcp_manip_pkt, + .in_range = tcp_in_range, + .unique_tuple = tcp_unique_tuple, + .print = tcp_print, + .print_range = tcp_print_range, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + .range_to_nfattr = ip_nat_port_range_to_nfattr, + .nfattr_to_range = ip_nat_port_nfattr_to_range, +#endif }; diff -puN net/ipv4/netfilter/ip_nat_proto_udp.c~git-net net/ipv4/netfilter/ip_nat_proto_udp.c --- devel/net/ipv4/netfilter/ip_nat_proto_udp.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ip_nat_proto_udp.c 2005-08-06 23:39:10.000000000 -0700 @@ -94,7 +94,7 @@ udp_manip_pkt(struct sk_buff **pskb, u32 oldip, newip; u16 *portptr, newport; - if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) + if (!skb_make_writable(pskb, hdroff + sizeof(*hdr))) return 0; iph = (struct iphdr *)((*pskb)->data + iphdroff); @@ -156,11 +156,18 @@ udp_print_range(char *buffer, const stru else return 0; } -struct ip_nat_protocol ip_nat_protocol_udp -= { "UDP", IPPROTO_UDP, - udp_manip_pkt, - udp_in_range, - udp_unique_tuple, - udp_print, - udp_print_range +struct ip_nat_protocol ip_nat_protocol_udp = { + .name = "UDP", + .protonum = IPPROTO_UDP, + .me = THIS_MODULE, + .manip_pkt = udp_manip_pkt, + .in_range = udp_in_range, + .unique_tuple = udp_unique_tuple, + .print = udp_print, + .print_range = udp_print_range, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + .range_to_nfattr = ip_nat_port_range_to_nfattr, + .nfattr_to_range = ip_nat_port_nfattr_to_range, +#endif }; diff -puN net/ipv4/netfilter/ip_nat_proto_unknown.c~git-net net/ipv4/netfilter/ip_nat_proto_unknown.c --- devel/net/ipv4/netfilter/ip_nat_proto_unknown.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ip_nat_proto_unknown.c 2005-08-06 23:39:10.000000000 -0700 @@ -61,10 +61,11 @@ unknown_print_range(char *buffer, const } struct ip_nat_protocol ip_nat_unknown_protocol = { - "unknown", 0, - unknown_manip_pkt, - unknown_in_range, - unknown_unique_tuple, - unknown_print, - unknown_print_range + .name = "unknown", + .me = THIS_MODULE, + .manip_pkt = unknown_manip_pkt, + .in_range = unknown_in_range, + .unique_tuple = unknown_unique_tuple, + .print = unknown_print, + .print_range = unknown_print_range }; diff -puN net/ipv4/netfilter/ip_nat_snmp_basic.c~git-net net/ipv4/netfilter/ip_nat_snmp_basic.c --- devel/net/ipv4/netfilter/ip_nat_snmp_basic.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ip_nat_snmp_basic.c 2005-08-06 23:39:10.000000000 -0700 @@ -1275,7 +1275,7 @@ static int help(struct sk_buff **pskb, return NF_DROP; } - if (!skb_ip_make_writable(pskb, (*pskb)->len)) + if (!skb_make_writable(pskb, (*pskb)->len)) return NF_DROP; spin_lock_bh(&snmp_lock); diff -puN net/ipv4/netfilter/ip_nat_standalone.c~git-net net/ipv4/netfilter/ip_nat_standalone.c --- devel/net/ipv4/netfilter/ip_nat_standalone.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ip_nat_standalone.c 2005-08-06 23:39:10.000000000 -0700 @@ -73,8 +73,6 @@ ip_nat_fn(unsigned int hooknum, IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET))); - (*pskb)->nfcache |= NFC_UNKNOWN; - /* If we had a hardware checksum before, it's now invalid */ if ((*pskb)->ip_summed == CHECKSUM_HW) if (skb_checksum_help(*pskb, (out == NULL))) @@ -392,6 +390,8 @@ module_exit(fini); EXPORT_SYMBOL(ip_nat_setup_info); EXPORT_SYMBOL(ip_nat_protocol_register); EXPORT_SYMBOL(ip_nat_protocol_unregister); +EXPORT_SYMBOL_GPL(ip_nat_proto_find_get); +EXPORT_SYMBOL_GPL(ip_nat_proto_put); EXPORT_SYMBOL(ip_nat_cheat_check); EXPORT_SYMBOL(ip_nat_mangle_tcp_packet); EXPORT_SYMBOL(ip_nat_mangle_udp_packet); diff -puN net/ipv4/netfilter/ip_queue.c~git-net net/ipv4/netfilter/ip_queue.c --- devel/net/ipv4/netfilter/ip_queue.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ip_queue.c 2005-08-06 23:39:10.000000000 -0700 @@ -43,17 +43,10 @@ #define NET_IPQ_QMAX 2088 #define NET_IPQ_QMAX_NAME "ip_queue_maxlen" -struct ipq_rt_info { - __u8 tos; - __u32 daddr; - __u32 saddr; -}; - struct ipq_queue_entry { struct list_head list; struct nf_info *info; struct sk_buff *skb; - struct ipq_rt_info rt_info; }; typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long); @@ -281,7 +274,8 @@ nlmsg_failure: } static int -ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) +ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, + unsigned int queuenum, void *data) { int status = -EINVAL; struct sk_buff *nskb; @@ -299,14 +293,6 @@ ipq_enqueue_packet(struct sk_buff *skb, entry->info = info; entry->skb = skb; - if (entry->info->hook == NF_IP_LOCAL_OUT) { - struct iphdr *iph = skb->nh.iph; - - entry->rt_info.tos = iph->tos; - entry->rt_info.daddr = iph->daddr; - entry->rt_info.saddr = iph->saddr; - } - nskb = ipq_build_packet_message(entry, &status); if (nskb == NULL) goto err_out_free; @@ -382,23 +368,10 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, st } skb_put(e->skb, diff); } - if (!skb_ip_make_writable(&e->skb, v->data_len)) + if (!skb_make_writable(&e->skb, v->data_len)) return -ENOMEM; memcpy(e->skb->data, v->payload, v->data_len); - e->skb->nfcache |= NFC_ALTERED; - /* - * Extra routing may needed on local out, as the QUEUE target never - * returns control to the table. - */ - if (e->info->hook == NF_IP_LOCAL_OUT) { - struct iphdr *iph = e->skb->nh.iph; - - if (!(iph->tos == e->rt_info.tos - && iph->daddr == e->rt_info.daddr - && iph->saddr == e->rt_info.saddr)) - return ip_route_me_harder(&e->skb); - } return 0; } @@ -686,7 +659,8 @@ init_or_cleanup(int init) goto cleanup; netlink_register_notifier(&ipq_nl_notifier); - ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk); + ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk, + THIS_MODULE); if (ipqnl == NULL) { printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); goto cleanup_netlink_notifier; diff -puN net/ipv4/netfilter/ip_tables.c~git-net net/ipv4/netfilter/ip_tables.c --- devel/net/ipv4/netfilter/ip_tables.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ip_tables.c 2005-08-06 23:39:10.000000000 -0700 @@ -312,7 +312,6 @@ ipt_do_table(struct sk_buff **pskb, do { IP_NF_ASSERT(e); IP_NF_ASSERT(back); - (*pskb)->nfcache |= e->nfcache; if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) { struct ipt_entry_target *t; diff -puN net/ipv4/netfilter/ipt_CLASSIFY.c~git-net net/ipv4/netfilter/ipt_CLASSIFY.c --- devel/net/ipv4/netfilter/ipt_CLASSIFY.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ipt_CLASSIFY.c 2005-08-06 23:39:10.000000000 -0700 @@ -32,10 +32,8 @@ target(struct sk_buff **pskb, { const struct ipt_classify_target_info *clinfo = targinfo; - if((*pskb)->priority != clinfo->priority) { + if((*pskb)->priority != clinfo->priority) (*pskb)->priority = clinfo->priority; - (*pskb)->nfcache |= NFC_ALTERED; - } return IPT_CONTINUE; } diff -puN net/ipv4/netfilter/ipt_CLUSTERIP.c~git-net net/ipv4/netfilter/ipt_CLUSTERIP.c --- devel/net/ipv4/netfilter/ipt_CLUSTERIP.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ipt_CLUSTERIP.c 2005-08-06 23:39:10.000000000 -0700 @@ -367,7 +367,7 @@ target(struct sk_buff **pskb, #ifdef DEBUG_CLUSTERP DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); #endif - DEBUGP("hash=%u ct_hash=%lu ", hash, ct->mark); + DEBUGP("hash=%u ct_hash=%u ", hash, ct->mark); if (!clusterip_responsible(cipinfo->config, hash)) { DEBUGP("not responsible\n"); return NF_DROP; diff -puN net/ipv4/netfilter/ipt_connmark.c~git-net net/ipv4/netfilter/ipt_connmark.c --- devel/net/ipv4/netfilter/ipt_connmark.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ipt_connmark.c 2005-08-06 23:39:10.000000000 -0700 @@ -54,9 +54,16 @@ checkentry(const char *tablename, unsigned int matchsize, unsigned int hook_mask) { + struct ipt_connmark_info *cm = + (struct ipt_connmark_info *)matchinfo; if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info))) return 0; + if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) { + printk(KERN_WARNING "connmark: only support 32bit mark\n"); + return 0; + } + return 1; } diff -puN net/ipv4/netfilter/ipt_CONNMARK.c~git-net net/ipv4/netfilter/ipt_CONNMARK.c --- devel/net/ipv4/netfilter/ipt_CONNMARK.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ipt_CONNMARK.c 2005-08-06 23:39:10.000000000 -0700 @@ -40,9 +40,9 @@ target(struct sk_buff **pskb, void *userinfo) { const struct ipt_connmark_target_info *markinfo = targinfo; - unsigned long diff; - unsigned long nfmark; - unsigned long newmark; + u_int32_t diff; + u_int32_t nfmark; + u_int32_t newmark; enum ip_conntrack_info ctinfo; struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); @@ -61,10 +61,8 @@ target(struct sk_buff **pskb, case IPT_CONNMARK_RESTORE: nfmark = (*pskb)->nfmark; diff = (ct->mark ^ nfmark) & markinfo->mask; - if (diff != 0) { + if (diff != 0) (*pskb)->nfmark = nfmark ^ diff; - (*pskb)->nfcache |= NFC_ALTERED; - } break; } } @@ -94,6 +92,11 @@ checkentry(const char *tablename, } } + if (matchinfo->mark > 0xffffffff || matchinfo->mask > 0xffffffff) { + printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n"); + return 0; + } + return 1; } diff -puN net/ipv4/netfilter/ipt_DSCP.c~git-net net/ipv4/netfilter/ipt_DSCP.c --- devel/net/ipv4/netfilter/ipt_DSCP.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ipt_DSCP.c 2005-08-06 23:39:10.000000000 -0700 @@ -39,7 +39,7 @@ target(struct sk_buff **pskb, if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) { u_int16_t diffs[2]; - if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) + if (!skb_make_writable(pskb, sizeof(struct iphdr))) return NF_DROP; diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; @@ -51,7 +51,6 @@ target(struct sk_buff **pskb, sizeof(diffs), (*pskb)->nh.iph->check ^ 0xFFFF)); - (*pskb)->nfcache |= NFC_ALTERED; } return IPT_CONTINUE; } diff -puN net/ipv4/netfilter/ipt_ECN.c~git-net net/ipv4/netfilter/ipt_ECN.c --- devel/net/ipv4/netfilter/ipt_ECN.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ipt_ECN.c 2005-08-06 23:39:10.000000000 -0700 @@ -31,7 +31,7 @@ set_ect_ip(struct sk_buff **pskb, const != (einfo->ip_ect & IPT_ECN_IP_MASK)) { u_int16_t diffs[2]; - if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) + if (!skb_make_writable(pskb, sizeof(struct iphdr))) return 0; diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; @@ -43,7 +43,6 @@ set_ect_ip(struct sk_buff **pskb, const sizeof(diffs), (*pskb)->nh.iph->check ^0xFFFF)); - (*pskb)->nfcache |= NFC_ALTERED; } return 1; } @@ -67,7 +66,7 @@ set_ect_tcp(struct sk_buff **pskb, const || tcph->cwr == einfo->proto.tcp.cwr))) return 1; - if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) + if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) return 0; tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4; @@ -86,7 +85,6 @@ set_ect_tcp(struct sk_buff **pskb, const else if (skb_checksum_help(*pskb, inward)) return 0; - (*pskb)->nfcache |= NFC_ALTERED; return 1; } diff -puN net/ipv4/netfilter/ipt_LOG.c~git-net net/ipv4/netfilter/ipt_LOG.c --- devel/net/ipv4/netfilter/ipt_LOG.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ipt_LOG.c 2005-08-06 23:39:10.000000000 -0700 @@ -27,10 +27,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team "); MODULE_DESCRIPTION("iptables syslog logging module"); -static unsigned int nflog = 1; -module_param(nflog, int, 0400); -MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); - #if 0 #define DEBUGP printk #else @@ -41,11 +37,17 @@ MODULE_PARM_DESC(nflog, "register as int static DEFINE_SPINLOCK(log_lock); /* One level of recursion won't kill us */ -static void dump_packet(const struct ipt_log_info *info, +static void dump_packet(const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int iphoff) { struct iphdr _iph, *ih; + unsigned int logflags; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_MASK; ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); if (ih == NULL) { @@ -76,7 +78,7 @@ static void dump_packet(const struct ipt if (ntohs(ih->frag_off) & IP_OFFSET) printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); - if ((info->logflags & IPT_LOG_IPOPT) + if ((logflags & IPT_LOG_IPOPT) && ih->ihl * 4 > sizeof(struct iphdr)) { unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op; unsigned int i, optsize; @@ -119,7 +121,7 @@ static void dump_packet(const struct ipt printk("SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ - if (info->logflags & IPT_LOG_TCPSEQ) + if (logflags & IPT_LOG_TCPSEQ) printk("SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); /* Max length: 13 "WINDOW=65535 " */ @@ -146,7 +148,7 @@ static void dump_packet(const struct ipt /* Max length: 11 "URGP=65535 " */ printk("URGP=%u ", ntohs(th->urg_ptr)); - if ((info->logflags & IPT_LOG_TCPOPT) + if ((logflags & IPT_LOG_TCPOPT) && th->doff * 4 > sizeof(struct tcphdr)) { unsigned char _opt[4 * 15 - sizeof(struct tcphdr)]; unsigned char *op; @@ -328,7 +330,7 @@ static void dump_packet(const struct ipt } /* Max length: 15 "UID=4294967295 " */ - if ((info->logflags & IPT_LOG_UID) && !iphoff && skb->sk) { + if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { read_lock_bh(&skb->sk->sk_callback_lock); if (skb->sk->sk_socket && skb->sk->sk_socket->file) printk("UID=%u ", skb->sk->sk_socket->file->f_uid); @@ -349,19 +351,31 @@ static void dump_packet(const struct ipt /* maxlen = 230+ 91 + 230 + 252 = 803 */ } +struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 0, + .logflags = NF_LOG_MASK, + }, + }, +}; + static void -ipt_log_packet(unsigned int hooknum, +ipt_log_packet(unsigned int pf, + unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, - const struct ipt_log_info *loginfo, - const char *level_string, + const struct nf_loginfo *loginfo, const char *prefix) { + if (!loginfo) + loginfo = &default_loginfo; + spin_lock_bh(&log_lock); - printk(level_string); - printk("%sIN=%s OUT=%s ", - prefix == NULL ? loginfo->prefix : prefix, + printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, + prefix, in ? in->name : "", out ? out->name : ""); #ifdef CONFIG_BRIDGE_NETFILTER @@ -405,28 +419,15 @@ ipt_log_target(struct sk_buff **pskb, void *userinfo) { const struct ipt_log_info *loginfo = targinfo; - char level_string[4] = "< >"; + struct nf_loginfo li; - level_string[1] = '0' + (loginfo->level % 8); - ipt_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL); + li.type = NF_LOG_TYPE_LOG; + li.u.log.level = loginfo->level; + li.u.log.logflags = loginfo->logflags; - return IPT_CONTINUE; -} + nf_log_packet(PF_INET, hooknum, *pskb, in, out, &li, loginfo->prefix); -static void -ipt_logfn(unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const char *prefix) -{ - struct ipt_log_info loginfo = { - .level = 0, - .logflags = IPT_LOG_MASK, - .prefix = "" - }; - - ipt_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix); + return IPT_CONTINUE; } static int ipt_log_checkentry(const char *tablename, @@ -464,20 +465,29 @@ static struct ipt_target ipt_log_reg = { .me = THIS_MODULE, }; +static struct nf_logger ipt_log_logger ={ + .name = "ipt_LOG", + .logfn = &ipt_log_packet, + .me = THIS_MODULE, +}; + static int __init init(void) { if (ipt_register_target(&ipt_log_reg)) return -EINVAL; - if (nflog) - nf_log_register(PF_INET, &ipt_logfn); + if (nf_log_register(PF_INET, &ipt_log_logger) < 0) { + printk(KERN_WARNING "ipt_LOG: not logging via system console " + "since somebody else already registered for PF_INET\n"); + /* we cannot make module load fail here, since otherwise + * iptables userspace would abort */ + } return 0; } static void __exit fini(void) { - if (nflog) - nf_log_unregister(PF_INET, &ipt_logfn); + nf_log_unregister_logger(&ipt_log_logger); ipt_unregister_target(&ipt_log_reg); } diff -puN net/ipv4/netfilter/ipt_mark.c~git-net net/ipv4/netfilter/ipt_mark.c --- devel/net/ipv4/netfilter/ipt_mark.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ipt_mark.c 2005-08-06 23:39:10.000000000 -0700 @@ -37,9 +37,16 @@ checkentry(const char *tablename, unsigned int matchsize, unsigned int hook_mask) { + struct ipt_mark_info *minfo = (struct ipt_mark_info *) matchinfo; + if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info))) return 0; + if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) { + printk(KERN_WARNING "mark: only supports 32bit mark\n"); + return 0; + } + return 1; } diff -puN net/ipv4/netfilter/ipt_MARK.c~git-net net/ipv4/netfilter/ipt_MARK.c --- devel/net/ipv4/netfilter/ipt_MARK.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ipt_MARK.c 2005-08-06 23:39:10.000000000 -0700 @@ -29,10 +29,9 @@ target_v0(struct sk_buff **pskb, { const struct ipt_mark_target_info *markinfo = targinfo; - if((*pskb)->nfmark != markinfo->mark) { + if((*pskb)->nfmark != markinfo->mark) (*pskb)->nfmark = markinfo->mark; - (*pskb)->nfcache |= NFC_ALTERED; - } + return IPT_CONTINUE; } @@ -61,10 +60,9 @@ target_v1(struct sk_buff **pskb, break; } - if((*pskb)->nfmark != mark) { + if((*pskb)->nfmark != mark) (*pskb)->nfmark = mark; - (*pskb)->nfcache |= NFC_ALTERED; - } + return IPT_CONTINUE; } @@ -76,6 +74,8 @@ checkentry_v0(const char *tablename, unsigned int targinfosize, unsigned int hook_mask) { + struct ipt_mark_target_info *markinfo = targinfo; + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) { printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n", targinfosize, @@ -88,6 +88,11 @@ checkentry_v0(const char *tablename, return 0; } + if (markinfo->mark > 0xffffffff) { + printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); + return 0; + } + return 1; } @@ -120,6 +125,11 @@ checkentry_v1(const char *tablename, return 0; } + if (markinfo->mark > 0xffffffff) { + printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); + return 0; + } + return 1; } diff -puN /dev/null net/ipv4/netfilter/ipt_NFQUEUE.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ipt_NFQUEUE.c 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,70 @@ +/* iptables module for using new netfilter netlink queue + * + * (C) 2005 by Harald Welte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include + +#include +#include +#include + +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("iptables NFQUEUE target"); +MODULE_LICENSE("GPL"); + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_NFQ_info *tinfo = targinfo; + + return NF_QUEUE_NR(tinfo->queuenum); +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_NFQ_info))) { + printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_NFQ_info))); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_NFQ_reg = { + .name = "NFQUEUE", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_NFQ_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_NFQ_reg); +} + +module_init(init); +module_exit(fini); diff -puN net/ipv4/netfilter/ipt_REJECT.c~git-net net/ipv4/netfilter/ipt_REJECT.c --- devel/net/ipv4/netfilter/ipt_REJECT.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ipt_REJECT.c 2005-08-06 23:39:10.000000000 -0700 @@ -156,7 +156,6 @@ static void send_reset(struct sk_buff *o /* This packet will not be the same as the other: clear nf fields */ nf_reset(nskb); - nskb->nfcache = 0; nskb->nfmark = 0; #ifdef CONFIG_BRIDGE_NETFILTER nf_bridge_put(nskb->nf_bridge); diff -puN net/ipv4/netfilter/ipt_TCPMSS.c~git-net net/ipv4/netfilter/ipt_TCPMSS.c --- devel/net/ipv4/netfilter/ipt_TCPMSS.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ipt_TCPMSS.c 2005-08-06 23:39:10.000000000 -0700 @@ -58,7 +58,7 @@ ipt_tcpmss_target(struct sk_buff **pskb, unsigned int i; u_int8_t *opt; - if (!skb_ip_make_writable(pskb, (*pskb)->len)) + if (!skb_make_writable(pskb, (*pskb)->len)) return NF_DROP; iph = (*pskb)->nh.iph; @@ -189,7 +189,6 @@ ipt_tcpmss_target(struct sk_buff **pskb, /* We never hw checksum SYN packets. */ BUG_ON((*pskb)->ip_summed == CHECKSUM_HW); - (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED; return IPT_CONTINUE; } diff -puN net/ipv4/netfilter/ipt_TOS.c~git-net net/ipv4/netfilter/ipt_TOS.c --- devel/net/ipv4/netfilter/ipt_TOS.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ipt_TOS.c 2005-08-06 23:39:10.000000000 -0700 @@ -33,7 +33,7 @@ target(struct sk_buff **pskb, if (((*pskb)->nh.iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) { u_int16_t diffs[2]; - if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) + if (!skb_make_writable(pskb, sizeof(struct iphdr))) return NF_DROP; diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; @@ -46,7 +46,6 @@ target(struct sk_buff **pskb, sizeof(diffs), (*pskb)->nh.iph->check ^0xFFFF)); - (*pskb)->nfcache |= NFC_ALTERED; } return IPT_CONTINUE; } diff -puN net/ipv4/netfilter/ipt_ULOG.c~git-net net/ipv4/netfilter/ipt_ULOG.c --- devel/net/ipv4/netfilter/ipt_ULOG.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/ipt_ULOG.c 2005-08-06 23:39:10.000000000 -0700 @@ -62,6 +62,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte "); MODULE_DESCRIPTION("iptables userspace logging module"); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG); #define ULOG_NL_EVENT 111 /* Harald's favorite number */ #define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */ @@ -303,18 +304,27 @@ static unsigned int ipt_ulog_target(stru return IPT_CONTINUE; } -static void ipt_logfn(unsigned int hooknum, +static void ipt_logfn(unsigned int pf, + unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, + const struct nf_loginfo *li, const char *prefix) { - struct ipt_ulog_info loginfo = { - .nl_group = ULOG_DEFAULT_NLGROUP, - .copy_range = 0, - .qthreshold = ULOG_DEFAULT_QTHRESHOLD, - .prefix = "" - }; + struct ipt_ulog_info loginfo; + + if (!li || li->type != NF_LOG_TYPE_ULOG) { + loginfo.nl_group = ULOG_DEFAULT_NLGROUP; + loginfo.copy_range = 0; + loginfo.qthreshold = ULOG_DEFAULT_QTHRESHOLD; + loginfo.prefix[0] = '\0'; + } else { + loginfo.nl_group = li->u.ulog.group; + loginfo.copy_range = li->u.ulog.copy_len; + loginfo.qthreshold = li->u.ulog.qthreshold; + strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); + } ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); } @@ -354,6 +364,12 @@ static struct ipt_target ipt_ulog_reg = .me = THIS_MODULE, }; +static struct nf_logger ipt_ulog_logger = { + .name = "ipt_ULOG", + .logfn = &ipt_logfn, + .me = THIS_MODULE, +}; + static int __init init(void) { int i; @@ -372,7 +388,7 @@ static int __init init(void) ulog_buffers[i].timer.data = i; } - nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL); + nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL, THIS_MODULE); if (!nflognl) return -ENOMEM; @@ -381,7 +397,7 @@ static int __init init(void) return -EINVAL; } if (nflog) - nf_log_register(PF_INET, &ipt_logfn); + nf_log_register(PF_INET, &ipt_ulog_logger); return 0; } @@ -394,7 +410,7 @@ static void __exit fini(void) DEBUGP("ipt_ULOG: cleanup_module\n"); if (nflog) - nf_log_unregister(PF_INET, &ipt_logfn); + nf_log_unregister_logger(&ipt_ulog_logger); ipt_unregister_target(&ipt_ulog_reg); sock_release(nflognl->sk_socket); diff -puN net/ipv4/netfilter/Kconfig~git-net net/ipv4/netfilter/Kconfig --- devel/net/ipv4/netfilter/Kconfig~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/Kconfig 2005-08-06 23:39:10.000000000 -0700 @@ -40,6 +40,16 @@ config IP_NF_CONNTRACK_MARK of packets, but this mark value is kept in the conntrack session instead of the individual packets. +config IP_NF_CONNTRACK_EVENTS + bool "Connection tracking events" + depends on IP_NF_CONNTRACK + help + If this option is enabled, the connection tracking code will + provide a notifier chain that can be used by other kernel code + to get notified about changes in the connection tracking state. + + IF unsure, say `N'. + config IP_NF_CT_PROTO_SCTP tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' depends on IP_NF_CONNTRACK && EXPERIMENTAL @@ -100,11 +110,15 @@ config IP_NF_AMANDA To compile it as a module, choose M here. If unsure, say Y. config IP_NF_QUEUE - tristate "Userspace queueing via NETLINK" + tristate "IP Userspace queueing via NETLINK (OBSOLETE)" help Netfilter has the ability to queue packets to user space: the netlink device can be used to access them using this driver. + This option enables the old IPv4-only "ip_queue" implementation + which has been obsoleted by the new "nfnetlink_queue" code (see + CONFIG_NETFILTER_NETLINK_QUEUE). + To compile it as a module, choose M here. If unsure, say N. config IP_NF_IPTABLES @@ -692,5 +706,12 @@ config IP_NF_ARP_MANGLE Allows altering the ARP packet payload: source and destination hardware and network addresses. +config IP_NF_CONNTRACK_NETLINK + tristate 'Connection tracking netlink interface' + depends on IP_NF_CONNTRACK && NETFILTER_NETLINK + help + This option enables support for a netlink-based userspace interface + + endmenu diff -puN net/ipv4/netfilter/Makefile~git-net net/ipv4/netfilter/Makefile --- devel/net/ipv4/netfilter/Makefile~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/netfilter/Makefile 2005-08-06 23:39:10.000000000 -0700 @@ -9,6 +9,10 @@ iptable_nat-objs := ip_nat_standalone.o # connection tracking obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o +# conntrack netlink interface +obj-$(CONFIG_IP_NF_CONNTRACK_NETLINK) += ip_conntrack_netlink.o + + # SCTP protocol connection tracking obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o @@ -87,3 +91,4 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_m obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o +obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += ipt_NFQUEUE.o diff -puN net/ipv4/protocol.c~git-net net/ipv4/protocol.c --- devel/net/ipv4/protocol.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/protocol.c 2005-08-06 23:39:10.000000000 -0700 @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include diff -puN net/ipv4/raw.c~git-net net/ipv4/raw.c --- devel/net/ipv4/raw.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/raw.c 2005-08-06 23:39:10.000000000 -0700 @@ -59,7 +59,6 @@ #include #include #include -#include #include #include #include @@ -71,6 +70,7 @@ #include #include #include +#include #include #include #include @@ -150,10 +150,11 @@ static __inline__ int icmp_filter(struct * RFC 1122: SHOULD pass TOS value up to the transport layer. * -> It does. And not only TOS, but all IP header. */ -void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) +int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) { struct sock *sk; struct hlist_head *head; + int delivered = 0; read_lock(&raw_v4_lock); head = &raw_v4_htable[hash]; @@ -164,6 +165,7 @@ void raw_v4_input(struct sk_buff *skb, s skb->dev->ifindex); while (sk) { + delivered = 1; if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); @@ -177,6 +179,7 @@ void raw_v4_input(struct sk_buff *skb, s } out: read_unlock(&raw_v4_lock); + return delivered; } void raw_err (struct sock *sk, struct sk_buff *skb, u32 info) diff -puN net/ipv4/route.c~git-net net/ipv4/route.c --- devel/net/ipv4/route.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/route.c 2005-08-06 23:39:10.000000000 -0700 @@ -240,7 +240,9 @@ static unsigned rt_hash_mask; static int rt_hash_log; static unsigned int rt_hash_rnd; -struct rt_cache_stat *rt_cache_stat; +static struct rt_cache_stat *rt_cache_stat; +#define RT_CACHE_STAT_INC(field) \ + (per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++) static int rt_intern_hash(unsigned hash, struct rtable *rth, struct rtable **res); @@ -2600,6 +2602,8 @@ int __ip_route_output_key(struct rtable return ip_route_output_slow(rp, flp); } +EXPORT_SYMBOL_GPL(__ip_route_output_key); + int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags) { int err; @@ -2618,6 +2622,8 @@ int ip_route_output_flow(struct rtable * return 0; } +EXPORT_SYMBOL_GPL(ip_route_output_flow); + int ip_route_output_key(struct rtable **rp, struct flowi *flp) { return ip_route_output_flow(rp, flp, NULL, 0); diff -puN net/ipv4/syncookies.c~git-net net/ipv4/syncookies.c --- devel/net/ipv4/syncookies.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/syncookies.c 2005-08-06 23:39:10.000000000 -0700 @@ -180,7 +180,7 @@ static inline struct sock *get_cookie_so child = tp->af_specific->syn_recv_sock(sk, skb, req, dst); if (child) - tcp_acceptq_queue(sk, req, child); + inet_csk_reqsk_queue_add(sk, req, child); else reqsk_free(req); diff -puN net/ipv4/tcp.c~git-net net/ipv4/tcp.c --- devel/net/ipv4/tcp.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/tcp.c 2005-08-06 23:39:10.000000000 -0700 @@ -271,11 +271,10 @@ int sysctl_tcp_fin_timeout = TCP_FIN_TIM DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); -kmem_cache_t *tcp_bucket_cachep; -kmem_cache_t *tcp_timewait_cachep; - atomic_t tcp_orphan_count = ATOMIC_INIT(0); +EXPORT_SYMBOL_GPL(tcp_orphan_count); + int sysctl_tcp_mem[3]; int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; @@ -316,7 +315,7 @@ EXPORT_SYMBOL(tcp_enter_memory_pressure) static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait) { - return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0; + return !reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) ? (POLLIN | POLLRDNORM) : 0; } /* @@ -457,109 +456,6 @@ int tcp_ioctl(struct sock *sk, int cmd, return put_user(answ, (int __user *)arg); } - -int tcp_listen_start(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - struct tcp_sock *tp = tcp_sk(sk); - int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE); - - if (rc != 0) - return rc; - - sk->sk_max_ack_backlog = 0; - sk->sk_ack_backlog = 0; - tcp_delack_init(tp); - - /* There is race window here: we announce ourselves listening, - * but this transition is still not validated by get_port(). - * It is OK, because this socket enters to hash table only - * after validation is complete. - */ - sk->sk_state = TCP_LISTEN; - if (!sk->sk_prot->get_port(sk, inet->num)) { - inet->sport = htons(inet->num); - - sk_dst_reset(sk); - sk->sk_prot->hash(sk); - - return 0; - } - - sk->sk_state = TCP_CLOSE; - reqsk_queue_destroy(&tp->accept_queue); - return -EADDRINUSE; -} - -/* - * This routine closes sockets which have been at least partially - * opened, but not yet accepted. - */ - -static void tcp_listen_stop (struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct listen_sock *lopt; - struct request_sock *acc_req; - struct request_sock *req; - int i; - - tcp_delete_keepalive_timer(sk); - - /* make all the listen_opt local to us */ - lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue); - acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue); - - if (lopt->qlen) { - for (i = 0; i < TCP_SYNQ_HSIZE; i++) { - while ((req = lopt->syn_table[i]) != NULL) { - lopt->syn_table[i] = req->dl_next; - lopt->qlen--; - reqsk_free(req); - - /* Following specs, it would be better either to send FIN - * (and enter FIN-WAIT-1, it is normal close) - * or to send active reset (abort). - * Certainly, it is pretty dangerous while synflood, but it is - * bad justification for our negligence 8) - * To be honest, we are not able to make either - * of the variants now. --ANK - */ - } - } - } - BUG_TRAP(!lopt->qlen); - - kfree(lopt); - - while ((req = acc_req) != NULL) { - struct sock *child = req->sk; - - acc_req = req->dl_next; - - local_bh_disable(); - bh_lock_sock(child); - BUG_TRAP(!sock_owned_by_user(child)); - sock_hold(child); - - tcp_disconnect(child, O_NONBLOCK); - - sock_orphan(child); - - atomic_inc(&tcp_orphan_count); - - tcp_destroy_sock(child); - - bh_unlock_sock(child); - local_bh_enable(); - sock_put(child); - - sk_acceptq_removed(sk); - __reqsk_free(req); - } - BUG_TRAP(!sk->sk_ack_backlog); -} - static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) { TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; @@ -975,7 +871,7 @@ do_fault: if (!skb->len) { if (sk->sk_send_head == skb) sk->sk_send_head = NULL; - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &sk->sk_write_queue); sk_stream_free_skb(sk, skb); } @@ -1057,20 +953,21 @@ static void cleanup_rbuf(struct sock *sk BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); #endif - if (tcp_ack_scheduled(tp)) { + if (inet_csk_ack_scheduled(sk)) { + const struct inet_connection_sock *icsk = inet_csk(sk); /* Delayed ACKs frequently hit locked sockets during bulk * receive. */ - if (tp->ack.blocked || + if (icsk->icsk_ack.blocked || /* Once-per-two-segments ACK was not sent by tcp_input.c */ - tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss || + tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || /* * If this read emptied read buffer, we send ACK, if * connection is not bidirectional, user drained * receive buffer and there was a small segment * in queue. */ - (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) && - !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc))) + (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && + !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc))) time_to_ack = 1; } @@ -1572,40 +1469,6 @@ void tcp_shutdown(struct sock *sk, int h } } -/* - * At this point, there should be no process reference to this - * socket, and thus no user references at all. Therefore we - * can assume the socket waitqueue is inactive and nobody will - * try to jump onto it. - */ -void tcp_destroy_sock(struct sock *sk) -{ - BUG_TRAP(sk->sk_state == TCP_CLOSE); - BUG_TRAP(sock_flag(sk, SOCK_DEAD)); - - /* It cannot be in hash table! */ - BUG_TRAP(sk_unhashed(sk)); - - /* If it has not 0 inet_sk(sk)->num, it must be bound */ - BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash); - - sk->sk_prot->destroy(sk); - - sk_stream_kill_queues(sk); - - xfrm_sk_free_policy(sk); - -#ifdef INET_REFCNT_DEBUG - if (atomic_read(&sk->sk_refcnt) != 1) { - printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", - sk, atomic_read(&sk->sk_refcnt)); - } -#endif - - atomic_dec(&tcp_orphan_count); - sock_put(sk); -} - void tcp_close(struct sock *sk, long timeout) { struct sk_buff *skb; @@ -1618,7 +1481,7 @@ void tcp_close(struct sock *sk, long tim tcp_set_state(sk, TCP_CLOSE); /* Special case. */ - tcp_listen_stop(sk); + inet_csk_listen_stop(sk); goto adjudge_to_death; } @@ -1721,12 +1584,12 @@ adjudge_to_death: tcp_send_active_reset(sk, GFP_ATOMIC); NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER); } else { - int tmo = tcp_fin_time(tp); + const int tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { - tcp_reset_keepalive_timer(sk, tcp_fin_time(tp)); + inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk)); } else { - atomic_inc(&tcp_orphan_count); + atomic_inc(sk->sk_prot->orphan_count); tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto out; } @@ -1734,7 +1597,7 @@ adjudge_to_death: } if (sk->sk_state != TCP_CLOSE) { sk_stream_mem_reclaim(sk); - if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans || + if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans || (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { if (net_ratelimit()) @@ -1745,10 +1608,10 @@ adjudge_to_death: NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); } } - atomic_inc(&tcp_orphan_count); + atomic_inc(sk->sk_prot->orphan_count); if (sk->sk_state == TCP_CLOSE) - tcp_destroy_sock(sk); + inet_csk_destroy_sock(sk); /* Otherwise, socket is reprieved until protocol close. */ out: @@ -1769,6 +1632,7 @@ static inline int tcp_need_reset(int sta int tcp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int err = 0; int old_state = sk->sk_state; @@ -1778,7 +1642,7 @@ int tcp_disconnect(struct sock *sk, int /* ABORT function of RFC793 */ if (old_state == TCP_LISTEN) { - tcp_listen_stop(sk); + inet_csk_listen_stop(sk); } else if (tcp_need_reset(old_state) || (tp->snd_nxt != tp->write_seq && (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { @@ -1805,7 +1669,7 @@ int tcp_disconnect(struct sock *sk, int tp->srtt = 0; if ((tp->write_seq += tp->max_window + 2) == 0) tp->write_seq = 1; - tp->backoff = 0; + icsk->icsk_backoff = 0; tp->snd_cwnd = 2; tp->probes_out = 0; tp->packets_out = 0; @@ -1813,117 +1677,26 @@ int tcp_disconnect(struct sock *sk, int tp->snd_cwnd_cnt = 0; tcp_set_ca_state(tp, TCP_CA_Open); tcp_clear_retrans(tp); - tcp_delack_init(tp); + inet_csk_delack_init(sk); sk->sk_send_head = NULL; tp->rx_opt.saw_tstamp = 0; tcp_sack_reset(&tp->rx_opt); __sk_dst_reset(sk); - BUG_TRAP(!inet->num || tp->bind_hash); + BUG_TRAP(!inet->num || icsk->icsk_bind_hash); sk->sk_error_report(sk); return err; } /* - * Wait for an incoming connection, avoid race - * conditions. This must be called with the socket locked. - */ -static int wait_for_connect(struct sock *sk, long timeo) -{ - struct tcp_sock *tp = tcp_sk(sk); - DEFINE_WAIT(wait); - int err; - - /* - * True wake-one mechanism for incoming connections: only - * one process gets woken up, not the 'whole herd'. - * Since we do not 'race & poll' for established sockets - * anymore, the common case will execute the loop only once. - * - * Subtle issue: "add_wait_queue_exclusive()" will be added - * after any current non-exclusive waiters, and we know that - * it will always _stay_ after any new non-exclusive waiters - * because all non-exclusive waiters are added at the - * beginning of the wait-queue. As such, it's ok to "drop" - * our exclusiveness temporarily when we get woken up without - * having to remove and re-insert us on the wait queue. - */ - for (;;) { - prepare_to_wait_exclusive(sk->sk_sleep, &wait, - TASK_INTERRUPTIBLE); - release_sock(sk); - if (reqsk_queue_empty(&tp->accept_queue)) - timeo = schedule_timeout(timeo); - lock_sock(sk); - err = 0; - if (!reqsk_queue_empty(&tp->accept_queue)) - break; - err = -EINVAL; - if (sk->sk_state != TCP_LISTEN) - break; - err = sock_intr_errno(timeo); - if (signal_pending(current)) - break; - err = -EAGAIN; - if (!timeo) - break; - } - finish_wait(sk->sk_sleep, &wait); - return err; -} - -/* - * This will accept the next outstanding connection. - */ - -struct sock *tcp_accept(struct sock *sk, int flags, int *err) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct sock *newsk; - int error; - - lock_sock(sk); - - /* We need to make sure that this socket is listening, - * and that it has something pending. - */ - error = -EINVAL; - if (sk->sk_state != TCP_LISTEN) - goto out_err; - - /* Find already established connection */ - if (reqsk_queue_empty(&tp->accept_queue)) { - long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); - - /* If this is a non blocking socket don't sleep */ - error = -EAGAIN; - if (!timeo) - goto out_err; - - error = wait_for_connect(sk, timeo); - if (error) - goto out_err; - } - - newsk = reqsk_queue_get_child(&tp->accept_queue, sk); - BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); -out: - release_sock(sk); - return newsk; -out_err: - newsk = NULL; - *err = error; - goto out; -} - -/* * Socket option code for TCP. */ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); int val; int err = 0; @@ -2022,7 +1795,7 @@ int tcp_setsockopt(struct sock *sk, int elapsed = tp->keepalive_time - elapsed; else elapsed = 0; - tcp_reset_keepalive_timer(sk, elapsed); + inet_csk_reset_keepalive_timer(sk, elapsed); } } break; @@ -2042,7 +1815,7 @@ int tcp_setsockopt(struct sock *sk, int if (val < 1 || val > MAX_TCP_SYNCNT) err = -EINVAL; else - tp->syn_retries = val; + icsk->icsk_syn_retries = val; break; case TCP_LINGER2: @@ -2055,15 +1828,15 @@ int tcp_setsockopt(struct sock *sk, int break; case TCP_DEFER_ACCEPT: - tp->defer_accept = 0; + icsk->icsk_accept_queue.rskq_defer_accept = 0; if (val > 0) { /* Translate value in seconds to number of * retransmits */ - while (tp->defer_accept < 32 && + while (icsk->icsk_accept_queue.rskq_defer_accept < 32 && val > ((TCP_TIMEOUT_INIT / HZ) << - tp->defer_accept)) - tp->defer_accept++; - tp->defer_accept++; + icsk->icsk_accept_queue.rskq_defer_accept)) + icsk->icsk_accept_queue.rskq_defer_accept++; + icsk->icsk_accept_queue.rskq_defer_accept++; } break; @@ -2081,16 +1854,16 @@ int tcp_setsockopt(struct sock *sk, int case TCP_QUICKACK: if (!val) { - tp->ack.pingpong = 1; + icsk->icsk_ack.pingpong = 1; } else { - tp->ack.pingpong = 0; + icsk->icsk_ack.pingpong = 0; if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && - tcp_ack_scheduled(tp)) { - tp->ack.pending |= TCP_ACK_PUSHED; + inet_csk_ack_scheduled(sk)) { + icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; cleanup_rbuf(sk, 1); if (!(val & 1)) - tp->ack.pingpong = 1; + icsk->icsk_ack.pingpong = 1; } } break; @@ -2107,15 +1880,16 @@ int tcp_setsockopt(struct sock *sk, int void tcp_get_info(struct sock *sk, struct tcp_info *info) { struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; memset(info, 0, sizeof(*info)); info->tcpi_state = sk->sk_state; info->tcpi_ca_state = tp->ca_state; - info->tcpi_retransmits = tp->retransmits; + info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes = tp->probes_out; - info->tcpi_backoff = tp->backoff; + info->tcpi_backoff = icsk->icsk_backoff; if (tp->rx_opt.tstamp_ok) info->tcpi_options |= TCPI_OPT_TIMESTAMPS; @@ -2130,10 +1904,10 @@ void tcp_get_info(struct sock *sk, struc if (tp->ecn_flags&TCP_ECN_OK) info->tcpi_options |= TCPI_OPT_ECN; - info->tcpi_rto = jiffies_to_usecs(tp->rto); - info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); + info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); + info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); info->tcpi_snd_mss = tp->mss_cache; - info->tcpi_rcv_mss = tp->ack.rcv_mss; + info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; info->tcpi_unacked = tp->packets_out; info->tcpi_sacked = tp->sacked_out; @@ -2142,7 +1916,7 @@ void tcp_get_info(struct sock *sk, struc info->tcpi_fackets = tp->fackets_out; info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); - info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime); + info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); info->tcpi_pmtu = tp->pmtu_cookie; @@ -2165,6 +1939,7 @@ EXPORT_SYMBOL_GPL(tcp_get_info); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int val, len; @@ -2202,7 +1977,7 @@ int tcp_getsockopt(struct sock *sk, int val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; break; case TCP_SYNCNT: - val = tp->syn_retries ? : sysctl_tcp_syn_retries; + val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; break; case TCP_LINGER2: val = tp->linger2; @@ -2210,8 +1985,8 @@ int tcp_getsockopt(struct sock *sk, int val = (val ? : sysctl_tcp_fin_timeout) / HZ; break; case TCP_DEFER_ACCEPT: - val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) << - (tp->defer_accept - 1)); + val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 : + ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1)); break; case TCP_WINDOW_CLAMP: val = tp->window_clamp; @@ -2232,7 +2007,7 @@ int tcp_getsockopt(struct sock *sk, int return 0; } case TCP_QUICKACK: - val = !tp->ack.pingpong; + val = !icsk->icsk_ack.pingpong; break; case TCP_CONGESTION: @@ -2278,64 +2053,57 @@ void __init tcp_init(void) __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), sizeof(skb->cb)); - tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket", - sizeof(struct tcp_bind_bucket), - 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); - if (!tcp_bucket_cachep) + tcp_hashinfo.bind_bucket_cachep = + kmem_cache_create("tcp_bind_bucket", + sizeof(struct inet_bind_bucket), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!tcp_hashinfo.bind_bucket_cachep) panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); - tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket", - sizeof(struct tcp_tw_bucket), - 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); - if (!tcp_timewait_cachep) - panic("tcp_init: Cannot alloc tcp_tw_bucket cache."); - /* Size and allocate the main established and bind bucket * hash tables. * * The methodology is similar to that of the buffer cache. */ - tcp_ehash = (struct tcp_ehash_bucket *) + tcp_hashinfo.ehash = alloc_large_system_hash("TCP established", - sizeof(struct tcp_ehash_bucket), + sizeof(struct inet_ehash_bucket), thash_entries, (num_physpages >= 128 * 1024) ? (25 - PAGE_SHIFT) : (27 - PAGE_SHIFT), HASH_HIGHMEM, - &tcp_ehash_size, + &tcp_hashinfo.ehash_size, NULL, 0); - tcp_ehash_size = (1 << tcp_ehash_size) >> 1; - for (i = 0; i < (tcp_ehash_size << 1); i++) { - rwlock_init(&tcp_ehash[i].lock); - INIT_HLIST_HEAD(&tcp_ehash[i].chain); + tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1; + for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) { + rwlock_init(&tcp_hashinfo.ehash[i].lock); + INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); } - tcp_bhash = (struct tcp_bind_hashbucket *) + tcp_hashinfo.bhash = alloc_large_system_hash("TCP bind", - sizeof(struct tcp_bind_hashbucket), - tcp_ehash_size, + sizeof(struct inet_bind_hashbucket), + tcp_hashinfo.ehash_size, (num_physpages >= 128 * 1024) ? (25 - PAGE_SHIFT) : (27 - PAGE_SHIFT), HASH_HIGHMEM, - &tcp_bhash_size, + &tcp_hashinfo.bhash_size, NULL, 64 * 1024); - tcp_bhash_size = 1 << tcp_bhash_size; - for (i = 0; i < tcp_bhash_size; i++) { - spin_lock_init(&tcp_bhash[i].lock); - INIT_HLIST_HEAD(&tcp_bhash[i].chain); + tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size; + for (i = 0; i < tcp_hashinfo.bhash_size; i++) { + spin_lock_init(&tcp_hashinfo.bhash[i].lock); + INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); } /* Try to be a bit smarter and adjust defaults depending * on available memory. */ for (order = 0; ((1 << order) << PAGE_SHIFT) < - (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket)); + (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)); order++) ; if (order >= 4) { @@ -2350,7 +2118,7 @@ void __init tcp_init(void) sysctl_tcp_max_orphans >>= (3 - order); sysctl_max_syn_backlog = 128; } - tcp_port_rover = sysctl_local_port_range[0] - 1; + tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1; sysctl_tcp_mem[0] = 768 << order; sysctl_tcp_mem[1] = 1024 << order; @@ -2365,14 +2133,12 @@ void __init tcp_init(void) printk(KERN_INFO "TCP: Hash tables configured " "(established %d bind %d)\n", - tcp_ehash_size << 1, tcp_bhash_size); + tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size); tcp_register_congestion_control(&tcp_reno); } -EXPORT_SYMBOL(tcp_accept); EXPORT_SYMBOL(tcp_close); -EXPORT_SYMBOL(tcp_destroy_sock); EXPORT_SYMBOL(tcp_disconnect); EXPORT_SYMBOL(tcp_getsockopt); EXPORT_SYMBOL(tcp_ioctl); @@ -2384,4 +2150,3 @@ EXPORT_SYMBOL(tcp_sendpage); EXPORT_SYMBOL(tcp_setsockopt); EXPORT_SYMBOL(tcp_shutdown); EXPORT_SYMBOL(tcp_statistics); -EXPORT_SYMBOL(tcp_timewait_cachep); diff -puN net/ipv4/tcp_diag.c~git-net net/ipv4/tcp_diag.c --- devel/net/ipv4/tcp_diag.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/tcp_diag.c 2005-08-06 23:39:10.000000000 -0700 @@ -48,8 +48,9 @@ static struct sock *tcpnl; static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, int ext, u32 pid, u32 seq, u16 nlmsg_flags) { - struct inet_sock *inet = inet_sk(sk); + const struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcpdiagmsg *r; struct nlmsghdr *nlh; struct tcp_info *info = NULL; @@ -81,7 +82,7 @@ static int tcpdiag_fill(struct sk_buff * r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); if (r->tcpdiag_state == TCP_TIME_WAIT) { - struct tcp_tw_bucket *tw = (struct tcp_tw_bucket*)sk; + const struct inet_timewait_sock *tw = inet_twsk(sk); long tmo = tw->tw_ttd - jiffies; if (tmo < 0) tmo = 0; @@ -99,10 +100,12 @@ static int tcpdiag_fill(struct sk_buff * r->tcpdiag_inode = 0; #ifdef CONFIG_IP_TCPDIAG_IPV6 if (r->tcpdiag_family == AF_INET6) { + const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); + ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, - &tw->tw_v6_rcv_saddr); + &tcp6tw->tw_v6_rcv_saddr); ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, - &tw->tw_v6_daddr); + &tcp6tw->tw_v6_daddr); } #endif nlh->nlmsg_len = skb->tail - b; @@ -127,14 +130,14 @@ static int tcpdiag_fill(struct sk_buff * #define EXPIRES_IN_MS(tmo) ((tmo-jiffies)*1000+HZ-1)/HZ - if (tp->pending == TCP_TIME_RETRANS) { + if (icsk->icsk_pending == ICSK_TIME_RETRANS) { r->tcpdiag_timer = 1; - r->tcpdiag_retrans = tp->retransmits; - r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout); - } else if (tp->pending == TCP_TIME_PROBE0) { + r->tcpdiag_retrans = icsk->icsk_retransmits; + r->tcpdiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); + } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { r->tcpdiag_timer = 4; r->tcpdiag_retrans = tp->probes_out; - r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout); + r->tcpdiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); } else if (timer_pending(&sk->sk_timer)) { r->tcpdiag_timer = 2; r->tcpdiag_retrans = tp->probes_out; @@ -172,8 +175,6 @@ nlmsg_failure: return -1; } -extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, - int dif); #ifdef CONFIG_IP_TCPDIAG_IPV6 extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, struct in6_addr *daddr, u16 dport, @@ -195,9 +196,9 @@ static int tcpdiag_get_exact(struct sk_b struct sk_buff *rep; if (req->tcpdiag_family == AF_INET) { - sk = tcp_v4_lookup(req->id.tcpdiag_dst[0], req->id.tcpdiag_dport, - req->id.tcpdiag_src[0], req->id.tcpdiag_sport, - req->id.tcpdiag_if); + sk = inet_lookup(&tcp_hashinfo, req->id.tcpdiag_dst[0], + req->id.tcpdiag_dport, req->id.tcpdiag_src[0], + req->id.tcpdiag_sport, req->id.tcpdiag_if); } #ifdef CONFIG_IP_TCPDIAG_IPV6 else if (req->tcpdiag_family == AF_INET6) { @@ -239,7 +240,7 @@ static int tcpdiag_get_exact(struct sk_b out: if (sk) { if (sk->sk_state == TCP_TIME_WAIT) - tcp_tw_put((struct tcp_tw_bucket*)sk); + inet_twsk_put((struct inet_timewait_sock *)sk); else sock_put(sk); } @@ -497,7 +498,7 @@ static int tcpdiag_dump_reqs(struct sk_b { struct tcpdiag_entry entry; struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct listen_sock *lopt; struct rtattr *bc = NULL; struct inet_sock *inet = inet_sk(sk); @@ -513,9 +514,9 @@ static int tcpdiag_dump_reqs(struct sk_b entry.family = sk->sk_family; - read_lock_bh(&tp->accept_queue.syn_wait_lock); + read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - lopt = tp->accept_queue.listen_opt; + lopt = icsk->icsk_accept_queue.listen_opt; if (!lopt || !lopt->qlen) goto out; @@ -572,7 +573,7 @@ static int tcpdiag_dump_reqs(struct sk_b } out: - read_unlock_bh(&tp->accept_queue.syn_wait_lock); + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); return err; } @@ -589,13 +590,13 @@ static int tcpdiag_dump(struct sk_buff * if (cb->args[0] == 0) { if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV))) goto skip_listen_ht; - tcp_listen_lock(); - for (i = s_i; i < TCP_LHTABLE_SIZE; i++) { + inet_listen_lock(&tcp_hashinfo); + for (i = s_i; i < INET_LHTABLE_SIZE; i++) { struct sock *sk; struct hlist_node *node; num = 0; - sk_for_each(sk, node, &tcp_listening_hash[i]) { + sk_for_each(sk, node, &tcp_hashinfo.listening_hash[i]) { struct inet_sock *inet = inet_sk(sk); if (num < s_num) { @@ -613,7 +614,7 @@ static int tcpdiag_dump(struct sk_buff * goto syn_recv; if (tcpdiag_dump_sock(skb, sk, cb) < 0) { - tcp_listen_unlock(); + inet_listen_unlock(&tcp_hashinfo); goto done; } @@ -622,7 +623,7 @@ syn_recv: goto next_listen; if (tcpdiag_dump_reqs(skb, sk, cb) < 0) { - tcp_listen_unlock(); + inet_listen_unlock(&tcp_hashinfo); goto done; } @@ -636,7 +637,7 @@ next_listen: cb->args[3] = 0; cb->args[4] = 0; } - tcp_listen_unlock(); + inet_listen_unlock(&tcp_hashinfo); skip_listen_ht: cb->args[0] = 1; s_i = num = s_num = 0; @@ -645,8 +646,8 @@ skip_listen_ht: if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV))) return skb->len; - for (i = s_i; i < tcp_ehash_size; i++) { - struct tcp_ehash_bucket *head = &tcp_ehash[i]; + for (i = s_i; i < tcp_hashinfo.ehash_size; i++) { + struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[i]; struct sock *sk; struct hlist_node *node; @@ -678,7 +679,7 @@ next_normal: if (r->tcpdiag_states&TCPF_TIME_WAIT) { sk_for_each(sk, node, - &tcp_ehash[i + tcp_ehash_size].chain) { + &tcp_hashinfo.ehash[i + tcp_hashinfo.ehash_size].chain) { struct inet_sock *inet = inet_sk(sk); if (num < s_num) @@ -774,7 +775,8 @@ static void tcpdiag_rcv(struct sock *sk, static int __init tcpdiag_init(void) { - tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv); + tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv, + THIS_MODULE); if (tcpnl == NULL) return -ENOMEM; return 0; diff -puN net/ipv4/tcp_input.c~git-net net/ipv4/tcp_input.c --- devel/net/ipv4/tcp_input.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/tcp_input.c 2005-08-06 23:39:10.000000000 -0700 @@ -114,20 +114,21 @@ int sysctl_tcp_moderate_rcvbuf = 1; /* Adapt the MSS value used to make delayed ack decision to the * real world. */ -static inline void tcp_measure_rcv_mss(struct tcp_sock *tp, - struct sk_buff *skb) +static inline void tcp_measure_rcv_mss(struct sock *sk, + const struct sk_buff *skb) { - unsigned int len, lss; + struct inet_connection_sock *icsk = inet_csk(sk); + const unsigned int lss = icsk->icsk_ack.last_seg_size; + unsigned int len; - lss = tp->ack.last_seg_size; - tp->ack.last_seg_size = 0; + icsk->icsk_ack.last_seg_size = 0; /* skb->len may jitter because of SACKs, even if peer * sends good full-sized frames. */ len = skb->len; - if (len >= tp->ack.rcv_mss) { - tp->ack.rcv_mss = len; + if (len >= icsk->icsk_ack.rcv_mss) { + icsk->icsk_ack.rcv_mss = len; } else { /* Otherwise, we make more careful check taking into account, * that SACKs block is variable. @@ -147,41 +148,44 @@ static inline void tcp_measure_rcv_mss(s * tcp header plus fixed timestamp option length. * Resulting "len" is MSS free of SACK jitter. */ - len -= tp->tcp_header_len; - tp->ack.last_seg_size = len; + len -= tcp_sk(sk)->tcp_header_len; + icsk->icsk_ack.last_seg_size = len; if (len == lss) { - tp->ack.rcv_mss = len; + icsk->icsk_ack.rcv_mss = len; return; } } - tp->ack.pending |= TCP_ACK_PUSHED; + icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; } } -static void tcp_incr_quickack(struct tcp_sock *tp) +static void tcp_incr_quickack(struct sock *sk) { - unsigned quickacks = tp->rcv_wnd/(2*tp->ack.rcv_mss); + struct inet_connection_sock *icsk = inet_csk(sk); + unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); if (quickacks==0) quickacks=2; - if (quickacks > tp->ack.quick) - tp->ack.quick = min(quickacks, TCP_MAX_QUICKACKS); + if (quickacks > icsk->icsk_ack.quick) + icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); } -void tcp_enter_quickack_mode(struct tcp_sock *tp) +void tcp_enter_quickack_mode(struct sock *sk) { - tcp_incr_quickack(tp); - tp->ack.pingpong = 0; - tp->ack.ato = TCP_ATO_MIN; + struct inet_connection_sock *icsk = inet_csk(sk); + tcp_incr_quickack(sk); + icsk->icsk_ack.pingpong = 0; + icsk->icsk_ack.ato = TCP_ATO_MIN; } /* Send ACKs quickly, if "quick" count is not exhausted * and the session is not interactive. */ -static __inline__ int tcp_in_quickack_mode(struct tcp_sock *tp) +static inline int tcp_in_quickack_mode(const struct sock *sk) { - return (tp->ack.quick && !tp->ack.pingpong); + const struct inet_connection_sock *icsk = inet_csk(sk); + return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; } /* Buffer size and advertised window tuning. @@ -224,8 +228,8 @@ static void tcp_fixup_sndbuf(struct sock */ /* Slow part of check#2. */ -static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp, - struct sk_buff *skb) +static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, + const struct sk_buff *skb) { /* Optimize this! */ int truesize = tcp_win_from_space(skb->truesize)/2; @@ -233,7 +237,7 @@ static int __tcp_grow_window(struct sock while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) - return 2*tp->ack.rcv_mss; + return 2 * inet_csk(sk)->icsk_ack.rcv_mss; truesize >>= 1; window >>= 1; @@ -260,7 +264,7 @@ static inline void tcp_grow_window(struc if (incr) { tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); - tp->ack.quick |= 1; + inet_csk(sk)->icsk_ack.quick |= 1; } } } @@ -325,7 +329,7 @@ static void tcp_clamp_window(struct sock unsigned int app_win = tp->rcv_nxt - tp->copied_seq; int ofo_win = 0; - tp->ack.quick = 0; + inet_csk(sk)->icsk_ack.quick = 0; skb_queue_walk(&tp->out_of_order_queue, skb) { ofo_win += skb->len; @@ -346,8 +350,8 @@ static void tcp_clamp_window(struct sock app_win += ofo_win; if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf) app_win >>= 1; - if (app_win > tp->ack.rcv_mss) - app_win -= tp->ack.rcv_mss; + if (app_win > inet_csk(sk)->icsk_ack.rcv_mss) + app_win -= inet_csk(sk)->icsk_ack.rcv_mss; app_win = max(app_win, 2U*tp->advmss); if (!ofo_win) @@ -415,11 +419,12 @@ new_measure: tp->rcv_rtt_est.time = tcp_time_stamp; } -static inline void tcp_rcv_rtt_measure_ts(struct tcp_sock *tp, struct sk_buff *skb) +static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); if (tp->rx_opt.rcv_tsecr && (TCP_SKB_CB(skb)->end_seq - - TCP_SKB_CB(skb)->seq >= tp->ack.rcv_mss)) + TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0); } @@ -492,41 +497,42 @@ new_measure: */ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) { + struct inet_connection_sock *icsk = inet_csk(sk); u32 now; - tcp_schedule_ack(tp); + inet_csk_schedule_ack(sk); - tcp_measure_rcv_mss(tp, skb); + tcp_measure_rcv_mss(sk, skb); tcp_rcv_rtt_measure(tp); now = tcp_time_stamp; - if (!tp->ack.ato) { + if (!icsk->icsk_ack.ato) { /* The _first_ data packet received, initialize * delayed ACK engine. */ - tcp_incr_quickack(tp); - tp->ack.ato = TCP_ATO_MIN; + tcp_incr_quickack(sk); + icsk->icsk_ack.ato = TCP_ATO_MIN; } else { - int m = now - tp->ack.lrcvtime; + int m = now - icsk->icsk_ack.lrcvtime; if (m <= TCP_ATO_MIN/2) { /* The fastest case is the first. */ - tp->ack.ato = (tp->ack.ato>>1) + TCP_ATO_MIN/2; - } else if (m < tp->ack.ato) { - tp->ack.ato = (tp->ack.ato>>1) + m; - if (tp->ack.ato > tp->rto) - tp->ack.ato = tp->rto; - } else if (m > tp->rto) { + icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2; + } else if (m < icsk->icsk_ack.ato) { + icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m; + if (icsk->icsk_ack.ato > icsk->icsk_rto) + icsk->icsk_ack.ato = icsk->icsk_rto; + } else if (m > icsk->icsk_rto) { /* Too long gap. Apparently sender falled to * restart window, so that we send ACKs quickly. */ - tcp_incr_quickack(tp); + tcp_incr_quickack(sk); sk_stream_mem_reclaim(sk); } } - tp->ack.lrcvtime = now; + icsk->icsk_ack.lrcvtime = now; TCP_ECN_check_ce(tp, skb); @@ -611,8 +617,9 @@ static void tcp_rtt_estimator(struct tcp /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ -static inline void tcp_set_rto(struct tcp_sock *tp) +static inline void tcp_set_rto(struct sock *sk) { + const struct tcp_sock *tp = tcp_sk(sk); /* Old crap is replaced with new one. 8) * * More seriously: @@ -623,7 +630,7 @@ static inline void tcp_set_rto(struct tc * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some curcumstances. */ - tp->rto = (tp->srtt >> 3) + tp->rttvar; + inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar; /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, @@ -635,10 +642,10 @@ static inline void tcp_set_rto(struct tc /* NOTE: clamping at TCP_RTO_MIN is not required, current algo * guarantees that rto is higher. */ -static inline void tcp_bound_rto(struct tcp_sock *tp) +static inline void tcp_bound_rto(struct sock *sk) { - if (tp->rto > TCP_RTO_MAX) - tp->rto = TCP_RTO_MAX; + if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) + inet_csk(sk)->icsk_rto = TCP_RTO_MAX; } /* Save metrics learned by this TCP session. @@ -658,7 +665,7 @@ void tcp_update_metrics(struct sock *sk) if (dst && (dst->flags&DST_HOST)) { int m; - if (tp->backoff || !tp->srtt) { + if (inet_csk(sk)->icsk_backoff || !tp->srtt) { /* This session failed to estimate rtt. Why? * Probably, no packets returned in time. * Reset our results. @@ -801,9 +808,9 @@ static void tcp_init_metrics(struct sock tp->mdev = dst_metric(dst, RTAX_RTTVAR); tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); } - tcp_set_rto(tp); - tcp_bound_rto(tp); - if (tp->rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) + tcp_set_rto(sk); + tcp_bound_rto(sk); + if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) goto reset; tp->snd_cwnd = tcp_init_cwnd(tp, dst); tp->snd_cwnd_stamp = tcp_time_stamp; @@ -817,7 +824,7 @@ reset: if (!tp->rx_opt.saw_tstamp && tp->srtt) { tp->srtt = 0; tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; - tp->rto = TCP_TIMEOUT_INIT; + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; } } @@ -1118,7 +1125,7 @@ void tcp_enter_frto(struct sock *sk) if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || - (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { + (tp->ca_state == TCP_CA_Loss && !inet_csk(sk)->icsk_retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(tp); tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); tcp_ca_event(tp, CA_EVENT_FRTO); @@ -1214,7 +1221,7 @@ void tcp_enter_loss(struct sock *sk, int /* Reduce ssthresh if it has not yet been made inside this window. */ if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || - (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { + (tp->ca_state == TCP_CA_Loss && !inet_csk(sk)->icsk_retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(tp); tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); tcp_ca_event(tp, CA_EVENT_LOSS); @@ -1253,7 +1260,7 @@ void tcp_enter_loss(struct sock *sk, int TCP_ECN_queue_cwr(tp); } -static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp) +static int tcp_check_sack_reneging(struct sock *sk) { struct sk_buff *skb; @@ -1268,9 +1275,10 @@ static int tcp_check_sack_reneging(struc NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); tcp_enter_loss(sk, 1); - tp->retransmits++; + inet_csk(sk)->icsk_retransmits++; tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, TCP_RTO_MAX); return 1; } return 0; @@ -1281,15 +1289,15 @@ static inline int tcp_fackets_out(struct return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out; } -static inline int tcp_skb_timedout(struct tcp_sock *tp, struct sk_buff *skb) +static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) { - return (tcp_time_stamp - TCP_SKB_CB(skb)->when > tp->rto); + return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); } static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) { return tp->packets_out && - tcp_skb_timedout(tp, skb_peek(&sk->sk_write_queue)); + tcp_skb_timedout(sk, skb_peek(&sk->sk_write_queue)); } /* Linux NewReno/SACK/FACK/ECN state machine. @@ -1509,7 +1517,7 @@ static void tcp_update_scoreboard(struct struct sk_buff *skb; sk_stream_for_retrans_queue(skb, sk) { - if (tcp_skb_timedout(tp, skb) && + if (tcp_skb_timedout(sk, skb) && !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); @@ -1676,7 +1684,7 @@ static int tcp_try_undo_loss(struct sock tp->left_out = tp->sacked_out; tcp_undo_cwr(tp, 1); NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); - tp->retransmits = 0; + inet_csk(sk)->icsk_retransmits = 0; tp->undo_marker = 0; if (!IsReno(tp)) tcp_set_ca_state(tp, TCP_CA_Open); @@ -1750,7 +1758,7 @@ tcp_fastretrans_alert(struct sock *sk, u tp->prior_ssthresh = 0; /* B. In all the states check for reneging SACKs. */ - if (tp->sacked_out && tcp_check_sack_reneging(sk, tp)) + if (tp->sacked_out && tcp_check_sack_reneging(sk)) return; /* C. Process data loss notification, provided it is valid. */ @@ -1774,7 +1782,7 @@ tcp_fastretrans_alert(struct sock *sk, u } else if (!before(tp->snd_una, tp->high_seq)) { switch (tp->ca_state) { case TCP_CA_Loss: - tp->retransmits = 0; + inet_csk(sk)->icsk_retransmits = 0; if (tcp_try_undo_recovery(sk, tp)) return; break; @@ -1824,7 +1832,7 @@ tcp_fastretrans_alert(struct sock *sk, u break; case TCP_CA_Loss: if (flag&FLAG_DATA_ACKED) - tp->retransmits = 0; + inet_csk(sk)->icsk_retransmits = 0; if (!tcp_try_undo_loss(sk, tp)) { tcp_moderate_cwnd(tp); tcp_xmit_retransmit_queue(sk); @@ -1881,10 +1889,8 @@ tcp_fastretrans_alert(struct sock *sk, u /* Read draft-ietf-tcplw-high-performance before mucking * with this code. (Superceeds RFC1323) */ -static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag) +static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) { - __u32 seq_rtt; - /* RTTM Rule: A TSecr value received in a segment is used to * update the averaged RTT measurement only if the segment * acknowledges some new data, i.e., only if it advances the @@ -1900,14 +1906,15 @@ static void tcp_ack_saw_tstamp(struct tc * answer arrives rto becomes 120 seconds! If at least one of segments * in window is lost... Voila. --ANK (010210) */ - seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; + struct tcp_sock *tp = tcp_sk(sk); + const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; tcp_rtt_estimator(tp, seq_rtt, usrtt); - tcp_set_rto(tp); - tp->backoff = 0; - tcp_bound_rto(tp); + tcp_set_rto(sk); + inet_csk(sk)->icsk_backoff = 0; + tcp_bound_rto(sk); } -static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag) +static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag) { /* We don't have a timestamp. Can only use * packets that are not retransmitted to determine @@ -1921,20 +1928,21 @@ static void tcp_ack_no_tstamp(struct tcp if (flag & FLAG_RETRANS_DATA_ACKED) return; - tcp_rtt_estimator(tp, seq_rtt, usrtt); - tcp_set_rto(tp); - tp->backoff = 0; - tcp_bound_rto(tp); + tcp_rtt_estimator(tcp_sk(sk), seq_rtt, usrtt); + tcp_set_rto(sk); + inet_csk(sk)->icsk_backoff = 0; + tcp_bound_rto(sk); } -static inline void tcp_ack_update_rtt(struct tcp_sock *tp, - int flag, s32 seq_rtt, u32 *usrtt) +static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, + const s32 seq_rtt, u32 *usrtt) { + const struct tcp_sock *tp = tcp_sk(sk); /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) - tcp_ack_saw_tstamp(tp, usrtt, flag); + tcp_ack_saw_tstamp(sk, usrtt, flag); else if (seq_rtt >= 0) - tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag); + tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag); } static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, @@ -1951,9 +1959,9 @@ static inline void tcp_cong_avoid(struct static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) { if (!tp->packets_out) { - tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS); + inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } else { - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); } } @@ -2085,12 +2093,12 @@ static int tcp_clean_rtx_queue(struct so seq_rtt = now - scb->when; tcp_dec_pcount_approx(&tp->fackets_out, skb); tcp_packets_out_dec(tp, skb); - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &sk->sk_write_queue); sk_stream_free_skb(sk, skb); } if (acked&FLAG_ACKED) { - tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt); + tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt); tcp_ack_packets_out(sk, tp); if (tp->ca_ops->pkts_acked) @@ -2125,20 +2133,22 @@ static int tcp_clean_rtx_queue(struct so static void tcp_ack_probe(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); /* Was it a usable window open? */ if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq, tp->snd_una + tp->snd_wnd)) { - tp->backoff = 0; - tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0); + icsk->icsk_backoff = 0; + inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); /* Socket must be waked up by subsequent tcp_data_snd_check(). * This function is not for random using! */ } else { - tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, - min(tp->rto << tp->backoff, TCP_RTO_MAX)); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), + TCP_RTO_MAX); } } @@ -2157,8 +2167,8 @@ static inline int tcp_may_raise_cwnd(str /* Check that window update is acceptable. * The function assumes that snd_una<=ack<=snd_next. */ -static inline int tcp_may_update_window(struct tcp_sock *tp, u32 ack, - u32 ack_seq, u32 nwin) +static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, + const u32 ack_seq, const u32 nwin) { return (after(ack, tp->snd_una) || after(ack_seq, tp->snd_wl1) || @@ -2500,8 +2510,9 @@ static inline void tcp_replace_ts_recent * up to bandwidth of 18Gigabit/sec. 8) ] */ -static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb) +static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); struct tcphdr *th = skb->h.th; u32 seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; @@ -2516,14 +2527,15 @@ static int tcp_disordered_ack(struct tcp !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) && /* 4. ... and sits in replay window. */ - (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (tp->rto*1024)/HZ); + (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); } -static inline int tcp_paws_discard(struct tcp_sock *tp, struct sk_buff *skb) +static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb) { + const struct tcp_sock *tp = tcp_sk(sk); return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && - !tcp_disordered_ack(tp, skb)); + !tcp_disordered_ack(sk, skb)); } /* Check segment sequence number for validity. @@ -2586,7 +2598,7 @@ static void tcp_fin(struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); - tcp_schedule_ack(tp); + inet_csk_schedule_ack(sk); sk->sk_shutdown |= RCV_SHUTDOWN; sock_set_flag(sk, SOCK_DONE); @@ -2596,7 +2608,7 @@ static void tcp_fin(struct sk_buff *skb, case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); - tp->ack.pingpong = 1; + inet_csk(sk)->icsk_ack.pingpong = 1; break; case TCP_CLOSE_WAIT: @@ -2694,7 +2706,7 @@ static void tcp_send_dupack(struct sock if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); - tcp_enter_quickack_mode(tp); + tcp_enter_quickack_mode(sk); if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; @@ -2853,7 +2865,7 @@ static void tcp_ofo_queue(struct sock *s if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { SOCK_DEBUG(sk, "ofo packet was already received \n"); - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &tp->out_of_order_queue); __kfree_skb(skb); continue; } @@ -2861,7 +2873,7 @@ static void tcp_ofo_queue(struct sock *s tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &tp->out_of_order_queue); __skb_queue_tail(&sk->sk_receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; if(skb->h.th->fin) @@ -2942,7 +2954,7 @@ queue_and_out: * gap in queue is filled. */ if (skb_queue_empty(&tp->out_of_order_queue)) - tp->ack.pingpong = 0; + inet_csk(sk)->icsk_ack.pingpong = 0; } if (tp->rx_opt.num_sacks) @@ -2963,8 +2975,8 @@ queue_and_out: tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); out_of_window: - tcp_enter_quickack_mode(tp); - tcp_schedule_ack(tp); + tcp_enter_quickack_mode(sk); + inet_csk_schedule_ack(sk); drop: __kfree_skb(skb); return; @@ -2974,7 +2986,7 @@ drop: if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) goto out_of_window; - tcp_enter_quickack_mode(tp); + tcp_enter_quickack_mode(sk); if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { /* Partial packet, seq < rcv_next < end_seq */ @@ -3003,7 +3015,7 @@ drop: /* Disable header prediction. */ tp->pred_flags = 0; - tcp_schedule_ack(tp); + inet_csk_schedule_ack(sk); SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); @@ -3027,7 +3039,7 @@ drop: u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (seq == TCP_SKB_CB(skb1)->end_seq) { - __skb_append(skb1, skb); + __skb_append(skb1, skb, &tp->out_of_order_queue); if (!tp->rx_opt.num_sacks || tp->selective_acks[0].end_seq != seq) @@ -3071,7 +3083,7 @@ drop: tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq); break; } - __skb_unlink(skb1, skb1->list); + __skb_unlink(skb1, &tp->out_of_order_queue); tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); __kfree_skb(skb1); } @@ -3088,8 +3100,9 @@ add_sack: * simplifies code) */ static void -tcp_collapse(struct sock *sk, struct sk_buff *head, - struct sk_buff *tail, u32 start, u32 end) +tcp_collapse(struct sock *sk, struct sk_buff_head *list, + struct sk_buff *head, struct sk_buff *tail, + u32 start, u32 end) { struct sk_buff *skb; @@ -3099,7 +3112,7 @@ tcp_collapse(struct sock *sk, struct sk_ /* No new bits? It is possible on ofo queue. */ if (!before(start, TCP_SKB_CB(skb)->end_seq)) { struct sk_buff *next = skb->next; - __skb_unlink(skb, skb->list); + __skb_unlink(skb, list); __kfree_skb(skb); NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); skb = next; @@ -3145,7 +3158,7 @@ tcp_collapse(struct sock *sk, struct sk_ nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head); memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; - __skb_insert(nskb, skb->prev, skb, skb->list); + __skb_insert(nskb, skb->prev, skb, list); sk_stream_set_owner_r(nskb, sk); /* Copy data, releasing collapsed skbs. */ @@ -3164,7 +3177,7 @@ tcp_collapse(struct sock *sk, struct sk_ } if (!before(start, TCP_SKB_CB(skb)->end_seq)) { struct sk_buff *next = skb->next; - __skb_unlink(skb, skb->list); + __skb_unlink(skb, list); __kfree_skb(skb); NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); skb = next; @@ -3200,7 +3213,8 @@ static void tcp_collapse_ofo_queue(struc if (skb == (struct sk_buff *)&tp->out_of_order_queue || after(TCP_SKB_CB(skb)->seq, end) || before(TCP_SKB_CB(skb)->end_seq, start)) { - tcp_collapse(sk, head, skb, start, end); + tcp_collapse(sk, &tp->out_of_order_queue, + head, skb, start, end); head = skb; if (skb == (struct sk_buff *)&tp->out_of_order_queue) break; @@ -3237,7 +3251,8 @@ static int tcp_prune_queue(struct sock * tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); tcp_collapse_ofo_queue(sk); - tcp_collapse(sk, sk->sk_receive_queue.next, + tcp_collapse(sk, &sk->sk_receive_queue, + sk->sk_receive_queue.next, (struct sk_buff*)&sk->sk_receive_queue, tp->copied_seq, tp->rcv_nxt); sk_stream_mem_reclaim(sk); @@ -3370,13 +3385,13 @@ static void __tcp_ack_snd_check(struct s struct tcp_sock *tp = tcp_sk(sk); /* More than one full frame received... */ - if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss /* ... and right edge of window advances far enough. * (tcp_recvmsg() will send ACK otherwise). Or... */ && __tcp_select_window(sk) >= tp->rcv_wnd) || /* We ACK each frame or... */ - tcp_in_quickack_mode(tp) || + tcp_in_quickack_mode(sk) || /* We have out of order data. */ (ofo_possible && skb_peek(&tp->out_of_order_queue))) { @@ -3390,8 +3405,7 @@ static void __tcp_ack_snd_check(struct s static __inline__ void tcp_ack_snd_check(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); - if (!tcp_ack_scheduled(tp)) { + if (!inet_csk_ack_scheduled(sk)) { /* We sent a data segment already. */ return; } @@ -3462,7 +3476,7 @@ static void tcp_check_urg(struct sock * struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); tp->copied_seq++; if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) { - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &sk->sk_receive_queue); __kfree_skb(skb); } } @@ -3645,7 +3659,7 @@ int tcp_rcv_established(struct sock *sk, tp->rcv_nxt == tp->rcv_wup) tcp_store_ts_recent(tp); - tcp_rcv_rtt_measure_ts(tp, skb); + tcp_rcv_rtt_measure_ts(sk, skb); /* We know that such packets are checksummed * on entry. @@ -3678,7 +3692,7 @@ int tcp_rcv_established(struct sock *sk, tp->rcv_nxt == tp->rcv_wup) tcp_store_ts_recent(tp); - tcp_rcv_rtt_measure_ts(tp, skb); + tcp_rcv_rtt_measure_ts(sk, skb); __skb_pull(skb, tcp_header_len); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; @@ -3699,7 +3713,7 @@ int tcp_rcv_established(struct sock *sk, tp->rcv_nxt == tp->rcv_wup) tcp_store_ts_recent(tp); - tcp_rcv_rtt_measure_ts(tp, skb); + tcp_rcv_rtt_measure_ts(sk, skb); if ((int)skb->truesize > sk->sk_forward_alloc) goto step5; @@ -3719,7 +3733,7 @@ int tcp_rcv_established(struct sock *sk, /* Well, only one small jumplet in fast path... */ tcp_ack(sk, skb, FLAG_DATA); tcp_data_snd_check(sk, tp); - if (!tcp_ack_scheduled(tp)) + if (!inet_csk_ack_scheduled(sk)) goto no_ack; } @@ -3741,7 +3755,7 @@ slow_path: * RFC1323: H1. Apply PAWS check first. */ if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && - tcp_paws_discard(tp, skb)) { + tcp_paws_discard(sk, skb)) { if (!th->rst) { NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); tcp_send_dupack(sk, skb); @@ -3788,7 +3802,7 @@ step5: if(th->ack) tcp_ack(sk, skb, FLAG_SLOWPATH); - tcp_rcv_rtt_measure_ts(tp, skb); + tcp_rcv_rtt_measure_ts(sk, skb); /* Process urgent data. */ tcp_urg(sk, skb, th); @@ -3817,6 +3831,7 @@ static int tcp_rcv_synsent_state_process tcp_parse_options(skb, &tp->rx_opt, 0); if (th->ack) { + struct inet_connection_sock *icsk; /* rfc793: * "If the state is SYN-SENT then * first check the ACK bit @@ -3930,7 +3945,7 @@ static int tcp_rcv_synsent_state_process tcp_init_buffer_space(sk); if (sock_flag(sk, SOCK_KEEPOPEN)) - tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); + inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); if (!tp->rx_opt.snd_wscale) __tcp_fast_path_on(tp, tp->snd_wnd); @@ -3942,7 +3957,11 @@ static int tcp_rcv_synsent_state_process sk_wake_async(sk, 0, POLL_OUT); } - if (sk->sk_write_pending || tp->defer_accept || tp->ack.pingpong) { + icsk = inet_csk(sk); + + if (sk->sk_write_pending || + icsk->icsk_accept_queue.rskq_defer_accept || + icsk->icsk_ack.pingpong) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. * @@ -3950,12 +3969,13 @@ static int tcp_rcv_synsent_state_process * look so _wonderfully_ clever, that I was not able * to stand against the temptation 8) --ANK */ - tcp_schedule_ack(tp); - tp->ack.lrcvtime = tcp_time_stamp; - tp->ack.ato = TCP_ATO_MIN; - tcp_incr_quickack(tp); - tcp_enter_quickack_mode(tp); - tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); + inet_csk_schedule_ack(sk); + icsk->icsk_ack.lrcvtime = tcp_time_stamp; + icsk->icsk_ack.ato = TCP_ATO_MIN; + tcp_incr_quickack(sk); + tcp_enter_quickack_mode(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MAX, TCP_RTO_MAX); discard: __kfree_skb(skb); @@ -4111,7 +4131,7 @@ int tcp_rcv_state_process(struct sock *s } if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && - tcp_paws_discard(tp, skb)) { + tcp_paws_discard(sk, skb)) { if (!th->rst) { NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); tcp_send_dupack(sk, skb); @@ -4180,7 +4200,7 @@ int tcp_rcv_state_process(struct sock *s */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !tp->srtt) - tcp_ack_saw_tstamp(tp, 0, 0); + tcp_ack_saw_tstamp(sk, 0, 0); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; @@ -4227,9 +4247,9 @@ int tcp_rcv_state_process(struct sock *s return 1; } - tmo = tcp_fin_time(tp); + tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { - tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); } else if (th->fin || sock_owned_by_user(sk)) { /* Bad case. We could lose such FIN otherwise. * It is not a big problem, but it looks confusing @@ -4237,7 +4257,7 @@ int tcp_rcv_state_process(struct sock *s * if it spins in bh_lock_sock(), but it is really * marginal case. */ - tcp_reset_keepalive_timer(sk, tmo); + inet_csk_reset_keepalive_timer(sk, tmo); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto discard; diff -puN net/ipv4/tcp_ipv4.c~git-net net/ipv4/tcp_ipv4.c --- devel/net/ipv4/tcp_ipv4.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/tcp_ipv4.c 2005-08-06 23:39:10.000000000 -0700 @@ -64,6 +64,7 @@ #include #include +#include #include #include #include @@ -88,458 +89,29 @@ static struct socket *tcp_socket; void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, struct sk_buff *skb); -struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = { - .__tcp_lhash_lock = RW_LOCK_UNLOCKED, - .__tcp_lhash_users = ATOMIC_INIT(0), - .__tcp_lhash_wait - = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait), - .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED +struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { + .lhash_lock = RW_LOCK_UNLOCKED, + .lhash_users = ATOMIC_INIT(0), + .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait), + .portalloc_lock = SPIN_LOCK_UNLOCKED, + .port_rover = 1024 - 1, }; -/* - * This array holds the first and last local port number. - * For high-usage systems, use sysctl to change this to - * 32768-61000 - */ -int sysctl_local_port_range[2] = { 1024, 4999 }; -int tcp_port_rover = 1024 - 1; - -static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport, - __u32 faddr, __u16 fport) -{ - int h = (laddr ^ lport) ^ (faddr ^ fport); - h ^= h >> 16; - h ^= h >> 8; - return h & (tcp_ehash_size - 1); -} - -static __inline__ int tcp_sk_hashfn(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - __u32 laddr = inet->rcv_saddr; - __u16 lport = inet->num; - __u32 faddr = inet->daddr; - __u16 fport = inet->dport; - - return tcp_hashfn(laddr, lport, faddr, fport); -} - -/* Allocate and initialize a new TCP local port bind bucket. - * The bindhash mutex for snum's hash chain must be held here. - */ -struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head, - unsigned short snum) -{ - struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep, - SLAB_ATOMIC); - if (tb) { - tb->port = snum; - tb->fastreuse = 0; - INIT_HLIST_HEAD(&tb->owners); - hlist_add_head(&tb->node, &head->chain); - } - return tb; -} - -/* Caller must hold hashbucket lock for this tb with local BH disabled */ -void tcp_bucket_destroy(struct tcp_bind_bucket *tb) -{ - if (hlist_empty(&tb->owners)) { - __hlist_del(&tb->node); - kmem_cache_free(tcp_bucket_cachep, tb); - } -} - -/* Caller must disable local BH processing. */ -static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child) -{ - struct tcp_bind_hashbucket *head = - &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)]; - struct tcp_bind_bucket *tb; - - spin_lock(&head->lock); - tb = tcp_sk(sk)->bind_hash; - sk_add_bind_node(child, &tb->owners); - tcp_sk(child)->bind_hash = tb; - spin_unlock(&head->lock); -} - -inline void tcp_inherit_port(struct sock *sk, struct sock *child) -{ - local_bh_disable(); - __tcp_inherit_port(sk, child); - local_bh_enable(); -} - -void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, - unsigned short snum) -{ - inet_sk(sk)->num = snum; - sk_add_bind_node(sk, &tb->owners); - tcp_sk(sk)->bind_hash = tb; -} - -static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb) -{ - const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk); - struct sock *sk2; - struct hlist_node *node; - int reuse = sk->sk_reuse; - - sk_for_each_bound(sk2, node, &tb->owners) { - if (sk != sk2 && - !tcp_v6_ipv6only(sk2) && - (!sk->sk_bound_dev_if || - !sk2->sk_bound_dev_if || - sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { - if (!reuse || !sk2->sk_reuse || - sk2->sk_state == TCP_LISTEN) { - const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2); - if (!sk2_rcv_saddr || !sk_rcv_saddr || - sk2_rcv_saddr == sk_rcv_saddr) - break; - } - } - } - return node != NULL; -} - -/* Obtain a reference to a local port for the given sock, - * if snum is zero it means select any available local port. - */ static int tcp_v4_get_port(struct sock *sk, unsigned short snum) { - struct tcp_bind_hashbucket *head; - struct hlist_node *node; - struct tcp_bind_bucket *tb; - int ret; - - local_bh_disable(); - if (!snum) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int remaining = (high - low) + 1; - int rover; - - spin_lock(&tcp_portalloc_lock); - if (tcp_port_rover < low) - rover = low; - else - rover = tcp_port_rover; - do { - rover++; - if (rover > high) - rover = low; - head = &tcp_bhash[tcp_bhashfn(rover)]; - spin_lock(&head->lock); - tb_for_each(tb, node, &head->chain) - if (tb->port == rover) - goto next; - break; - next: - spin_unlock(&head->lock); - } while (--remaining > 0); - tcp_port_rover = rover; - spin_unlock(&tcp_portalloc_lock); - - /* Exhausted local port range during search? */ - ret = 1; - if (remaining <= 0) - goto fail; - - /* OK, here is the one we will use. HEAD is - * non-NULL and we hold it's mutex. - */ - snum = rover; - } else { - head = &tcp_bhash[tcp_bhashfn(snum)]; - spin_lock(&head->lock); - tb_for_each(tb, node, &head->chain) - if (tb->port == snum) - goto tb_found; - } - tb = NULL; - goto tb_not_found; -tb_found: - if (!hlist_empty(&tb->owners)) { - if (sk->sk_reuse > 1) - goto success; - if (tb->fastreuse > 0 && - sk->sk_reuse && sk->sk_state != TCP_LISTEN) { - goto success; - } else { - ret = 1; - if (tcp_bind_conflict(sk, tb)) - goto fail_unlock; - } - } -tb_not_found: - ret = 1; - if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL) - goto fail_unlock; - if (hlist_empty(&tb->owners)) { - if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) - tb->fastreuse = 1; - else - tb->fastreuse = 0; - } else if (tb->fastreuse && - (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) - tb->fastreuse = 0; -success: - if (!tcp_sk(sk)->bind_hash) - tcp_bind_hash(sk, tb, snum); - BUG_TRAP(tcp_sk(sk)->bind_hash == tb); - ret = 0; - -fail_unlock: - spin_unlock(&head->lock); -fail: - local_bh_enable(); - return ret; -} - -/* Get rid of any references to a local port held by the - * given sock. - */ -static void __tcp_put_port(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)]; - struct tcp_bind_bucket *tb; - - spin_lock(&head->lock); - tb = tcp_sk(sk)->bind_hash; - __sk_del_bind_node(sk); - tcp_sk(sk)->bind_hash = NULL; - inet->num = 0; - tcp_bucket_destroy(tb); - spin_unlock(&head->lock); -} - -void tcp_put_port(struct sock *sk) -{ - local_bh_disable(); - __tcp_put_port(sk); - local_bh_enable(); -} - -/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP. - * Look, when several writers sleep and reader wakes them up, all but one - * immediately hit write lock and grab all the cpus. Exclusive sleep solves - * this, _but_ remember, it adds useless work on UP machines (wake up each - * exclusive lock release). It should be ifdefed really. - */ - -void tcp_listen_wlock(void) -{ - write_lock(&tcp_lhash_lock); - - if (atomic_read(&tcp_lhash_users)) { - DEFINE_WAIT(wait); - - for (;;) { - prepare_to_wait_exclusive(&tcp_lhash_wait, - &wait, TASK_UNINTERRUPTIBLE); - if (!atomic_read(&tcp_lhash_users)) - break; - write_unlock_bh(&tcp_lhash_lock); - schedule(); - write_lock_bh(&tcp_lhash_lock); - } - - finish_wait(&tcp_lhash_wait, &wait); - } -} - -static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible) -{ - struct hlist_head *list; - rwlock_t *lock; - - BUG_TRAP(sk_unhashed(sk)); - if (listen_possible && sk->sk_state == TCP_LISTEN) { - list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; - lock = &tcp_lhash_lock; - tcp_listen_wlock(); - } else { - list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain; - lock = &tcp_ehash[sk->sk_hashent].lock; - write_lock(lock); - } - __sk_add_node(sk, list); - sock_prot_inc_use(sk->sk_prot); - write_unlock(lock); - if (listen_possible && sk->sk_state == TCP_LISTEN) - wake_up(&tcp_lhash_wait); + return inet_csk_get_port(&tcp_hashinfo, sk, snum); } static void tcp_v4_hash(struct sock *sk) { - if (sk->sk_state != TCP_CLOSE) { - local_bh_disable(); - __tcp_v4_hash(sk, 1); - local_bh_enable(); - } + inet_hash(&tcp_hashinfo, sk); } void tcp_unhash(struct sock *sk) { - rwlock_t *lock; - - if (sk_unhashed(sk)) - goto ende; - - if (sk->sk_state == TCP_LISTEN) { - local_bh_disable(); - tcp_listen_wlock(); - lock = &tcp_lhash_lock; - } else { - struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent]; - lock = &head->lock; - write_lock_bh(&head->lock); - } - - if (__sk_del_node_init(sk)) - sock_prot_dec_use(sk->sk_prot); - write_unlock_bh(lock); - - ende: - if (sk->sk_state == TCP_LISTEN) - wake_up(&tcp_lhash_wait); -} - -/* Don't inline this cruft. Here are some nice properties to - * exploit here. The BSD API does not allow a listening TCP - * to specify the remote port nor the remote address for the - * connection. So always assume those are both wildcarded - * during the search since they can never be otherwise. - */ -static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr, - unsigned short hnum, int dif) -{ - struct sock *result = NULL, *sk; - struct hlist_node *node; - int score, hiscore; - - hiscore=-1; - sk_for_each(sk, node, head) { - struct inet_sock *inet = inet_sk(sk); - - if (inet->num == hnum && !ipv6_only_sock(sk)) { - __u32 rcv_saddr = inet->rcv_saddr; - - score = (sk->sk_family == PF_INET ? 1 : 0); - if (rcv_saddr) { - if (rcv_saddr != daddr) - continue; - score+=2; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - continue; - score+=2; - } - if (score == 5) - return sk; - if (score > hiscore) { - hiscore = score; - result = sk; - } - } - } - return result; -} - -/* Optimize the common listener case. */ -static inline struct sock *tcp_v4_lookup_listener(u32 daddr, - unsigned short hnum, int dif) -{ - struct sock *sk = NULL; - struct hlist_head *head; - - read_lock(&tcp_lhash_lock); - head = &tcp_listening_hash[tcp_lhashfn(hnum)]; - if (!hlist_empty(head)) { - struct inet_sock *inet = inet_sk((sk = __sk_head(head))); - - if (inet->num == hnum && !sk->sk_node.next && - (!inet->rcv_saddr || inet->rcv_saddr == daddr) && - (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && - !sk->sk_bound_dev_if) - goto sherry_cache; - sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif); - } - if (sk) { -sherry_cache: - sock_hold(sk); - } - read_unlock(&tcp_lhash_lock); - return sk; -} - -/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so - * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM - * - * Local BH must be disabled here. - */ - -static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport, - u32 daddr, u16 hnum, - int dif) -{ - struct tcp_ehash_bucket *head; - TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) - __u32 ports = TCP_COMBINED_PORTS(sport, hnum); - struct sock *sk; - struct hlist_node *node; - /* Optimize here for direct hit, only listening connections can - * have wildcards anyways. - */ - int hash = tcp_hashfn(daddr, hnum, saddr, sport); - head = &tcp_ehash[hash]; - read_lock(&head->lock); - sk_for_each(sk, node, &head->chain) { - if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) - goto hit; /* You sunk my battleship! */ - } - - /* Must check for a TIME_WAIT'er before going to listener hash. */ - sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) { - if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif)) - goto hit; - } - sk = NULL; -out: - read_unlock(&head->lock); - return sk; -hit: - sock_hold(sk); - goto out; -} - -static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, - u32 daddr, u16 hnum, int dif) -{ - struct sock *sk = __tcp_v4_lookup_established(saddr, sport, - daddr, hnum, dif); - - return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif); -} - -inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, - u16 dport, int dif) -{ - struct sock *sk; - - local_bh_disable(); - sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif); - local_bh_enable(); - - return sk; + inet_unhash(&tcp_hashinfo, sk); } -EXPORT_SYMBOL_GPL(tcp_v4_lookup); - static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) { return secure_tcp_sequence_number(skb->nh.iph->daddr, @@ -550,27 +122,28 @@ static inline __u32 tcp_v4_init_sequence /* called with local bh disabled */ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, - struct tcp_tw_bucket **twp) + struct inet_timewait_sock **twp) { struct inet_sock *inet = inet_sk(sk); u32 daddr = inet->rcv_saddr; u32 saddr = inet->daddr; int dif = sk->sk_bound_dev_if; - TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) - __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport); - int hash = tcp_hashfn(daddr, lport, saddr, inet->dport); - struct tcp_ehash_bucket *head = &tcp_ehash[hash]; + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); + const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size); + struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash]; struct sock *sk2; - struct hlist_node *node; - struct tcp_tw_bucket *tw; + const struct hlist_node *node; + struct inet_timewait_sock *tw; write_lock(&head->lock); /* Check TIME-WAIT sockets first. */ - sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) { - tw = (struct tcp_tw_bucket *)sk2; + sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { + tw = inet_twsk(sk2); - if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) { + if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) { + const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2); struct tcp_sock *tp = tcp_sk(sk); /* With PAWS, it is safe from the viewpoint @@ -587,15 +160,15 @@ static int __tcp_v4_check_established(st fall back to VJ's scheme and use initial timestamp retrieved from peer table. */ - if (tw->tw_ts_recent_stamp && + if (tcptw->tw_ts_recent_stamp && (!twp || (sysctl_tcp_tw_reuse && xtime.tv_sec - - tw->tw_ts_recent_stamp > 1))) { - if ((tp->write_seq = - tw->tw_snd_nxt + 65535 + 2) == 0) + tcptw->tw_ts_recent_stamp > 1))) { + tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; + if (tp->write_seq == 0) tp->write_seq = 1; - tp->rx_opt.ts_recent = tw->tw_ts_recent; - tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; + tp->rx_opt.ts_recent = tcptw->tw_ts_recent; + tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; sock_hold(sk2); goto unique; } else @@ -606,7 +179,7 @@ static int __tcp_v4_check_established(st /* And established part... */ sk_for_each(sk2, node, &head->chain) { - if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) + if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif)) goto not_unique; } @@ -629,7 +202,7 @@ unique: tcp_tw_deschedule(tw); NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - tcp_tw_put(tw); + inet_twsk_put(tw); } return 0; @@ -652,9 +225,9 @@ static inline u32 connect_port_offset(co */ static inline int tcp_v4_hash_connect(struct sock *sk) { - unsigned short snum = inet_sk(sk)->num; - struct tcp_bind_hashbucket *head; - struct tcp_bind_bucket *tb; + const unsigned short snum = inet_sk(sk)->num; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; int ret; if (!snum) { @@ -666,19 +239,19 @@ static inline int tcp_v4_hash_connect(st static u32 hint; u32 offset = hint + connect_port_offset(sk); struct hlist_node *node; - struct tcp_tw_bucket *tw = NULL; + struct inet_timewait_sock *tw = NULL; local_bh_disable(); for (i = 1; i <= range; i++) { port = low + (i + offset) % range; - head = &tcp_bhash[tcp_bhashfn(port)]; + head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)]; spin_lock(&head->lock); /* Does not bother with rcv_saddr checks, * because the established check is already * unique enough. */ - tb_for_each(tb, node, &head->chain) { + inet_bind_bucket_for_each(tb, node, &head->chain) { if (tb->port == port) { BUG_TRAP(!hlist_empty(&tb->owners)); if (tb->fastreuse >= 0) @@ -691,7 +264,7 @@ static inline int tcp_v4_hash_connect(st } } - tb = tcp_bucket_create(head, port); + tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port); if (!tb) { spin_unlock(&head->lock); break; @@ -710,27 +283,27 @@ ok: hint += i; /* Head lock still held and bh's disabled */ - tcp_bind_hash(sk, tb, port); + inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->sport = htons(port); - __tcp_v4_hash(sk, 0); + __inet_hash(&tcp_hashinfo, sk, 0); } spin_unlock(&head->lock); if (tw) { tcp_tw_deschedule(tw); - tcp_tw_put(tw); + inet_twsk_put(tw); } ret = 0; goto out; } - head = &tcp_bhash[tcp_bhashfn(snum)]; - tb = tcp_sk(sk)->bind_hash; + head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; + tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - __tcp_v4_hash(sk, 0); + __inet_hash(&tcp_hashinfo, sk, 0); spin_unlock_bh(&head->lock); return 0; } else { @@ -832,8 +405,7 @@ int tcp_v4_connect(struct sock *sk, stru goto failure; /* OK, now commit destination to socket. */ - __sk_dst_set(sk, &rt->u.dst); - tcp_v4_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->u.dst); if (!tp->write_seq) tp->write_seq = secure_tcp_sequence_number(inet->saddr, @@ -859,53 +431,6 @@ failure: return err; } -static __inline__ int tcp_v4_iif(struct sk_buff *skb) -{ - return ((struct rtable *)skb->dst)->rt_iif; -} - -static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd) -{ - return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1)); -} - -static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp, - struct request_sock ***prevp, - __u16 rport, - __u32 raddr, __u32 laddr) -{ - struct listen_sock *lopt = tp->accept_queue.listen_opt; - struct request_sock *req, **prev; - - for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)]; - (req = *prev) != NULL; - prev = &req->dl_next) { - const struct inet_request_sock *ireq = inet_rsk(req); - - if (ireq->rmt_port == rport && - ireq->rmt_addr == raddr && - ireq->loc_addr == laddr && - TCP_INET_FAMILY(req->rsk_ops->family)) { - BUG_TRAP(!req->sk); - *prevp = prev; - break; - } - } - - return req; -} - -static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct listen_sock *lopt = tp->accept_queue.listen_opt; - u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd); - - reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT); - tcp_synq_added(sk); -} - - /* * This routine does path mtu discovery as defined in RFC1191. */ @@ -988,14 +513,14 @@ void tcp_v4_err(struct sk_buff *skb, u32 return; } - sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, - th->source, tcp_v4_iif(skb)); + sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr, + th->source, inet_iif(skb)); if (!sk) { ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); return; } if (sk->sk_state == TCP_TIME_WAIT) { - tcp_tw_put((struct tcp_tw_bucket *)sk); + inet_twsk_put((struct inet_timewait_sock *)sk); return; } @@ -1049,8 +574,8 @@ void tcp_v4_err(struct sk_buff *skb, u32 if (sock_owned_by_user(sk)) goto out; - req = tcp_v4_search_req(tp, &prev, th->dest, - iph->daddr, iph->saddr); + req = inet_csk_search_req(sk, &prev, th->dest, + iph->daddr, iph->saddr); if (!req) goto out; @@ -1070,7 +595,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 * created socket, and POSIX does not want network * errors returned from accept(). */ - tcp_synq_drop(sk, req, prev); + inet_csk_reqsk_queue_drop(sk, req, prev); goto out; case TCP_SYN_SENT: @@ -1240,12 +765,13 @@ static void tcp_v4_send_ack(struct sk_bu static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) { - struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; + struct inet_timewait_sock *tw = inet_twsk(sk); + const struct tcp_timewait_sock *tcptw = tcp_twsk(sk); - tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt, - tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent); + tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent); - tcp_tw_put(tw); + inet_twsk_put(tw); } static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) @@ -1254,36 +780,6 @@ static void tcp_v4_reqsk_send_ack(struct req->ts_recent); } -static struct dst_entry* tcp_v4_route_req(struct sock *sk, - struct request_sock *req) -{ - struct rtable *rt; - const struct inet_request_sock *ireq = inet_rsk(req); - struct ip_options *opt = inet_rsk(req)->opt; - struct flowi fl = { .oif = sk->sk_bound_dev_if, - .nl_u = { .ip4_u = - { .daddr = ((opt && opt->srr) ? - opt->faddr : - ireq->rmt_addr), - .saddr = ireq->loc_addr, - .tos = RT_CONN_FLAGS(sk) } }, - .proto = IPPROTO_TCP, - .uli_u = { .ports = - { .sport = inet_sk(sk)->sport, - .dport = ireq->rmt_port } } }; - - if (ip_route_output_flow(&rt, &fl, sk, 0)) { - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); - return NULL; - } - if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { - ip_rt_put(rt); - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); - return NULL; - } - return &rt->u.dst; -} - /* * Send a SYN-ACK after having received an ACK. * This still operates on a request_sock only, not on a big @@ -1297,7 +793,7 @@ static int tcp_v4_send_synack(struct soc struct sk_buff * skb; /* First, grab a route. */ - if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL) + if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) goto out; skb = tcp_make_synack(sk, dst, req); @@ -1399,7 +895,7 @@ int tcp_v4_conn_request(struct sock *sk, * limitations, they conserve resources and peer is * evidently real one. */ - if (tcp_synq_is_full(sk) && !isn) { + if (inet_csk_reqsk_queue_is_full(sk) && !isn) { #ifdef CONFIG_SYN_COOKIES if (sysctl_tcp_syncookies) { want_cookie = 1; @@ -1413,7 +909,7 @@ int tcp_v4_conn_request(struct sock *sk, * clogging syn queue with openreqs with exponentially increasing * timeout. */ - if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; req = reqsk_alloc(&tcp_request_sock_ops); @@ -1470,7 +966,7 @@ int tcp_v4_conn_request(struct sock *sk, */ if (tmp_opt.saw_tstamp && sysctl_tcp_tw_recycle && - (dst = tcp_v4_route_req(sk, req)) != NULL && + (dst = inet_csk_route_req(sk, req)) != NULL && (peer = rt_get_peer((struct rtable *)dst)) != NULL && peer->v4daddr == saddr) { if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL && @@ -1483,7 +979,7 @@ int tcp_v4_conn_request(struct sock *sk, } /* Kill the following clause, if you dislike this way. */ else if (!sysctl_tcp_syncookies && - (sysctl_max_syn_backlog - tcp_synq_len(sk) < + (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < (sysctl_max_syn_backlog >> 2)) && (!peer || !peer->tcp_ts_stamp) && (!dst || !dst_metric(dst, RTAX_RTT))) { @@ -1514,7 +1010,7 @@ int tcp_v4_conn_request(struct sock *sk, if (want_cookie) { reqsk_free(req); } else { - tcp_v4_synq_add(sk, req); + inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); } return 0; @@ -1542,15 +1038,14 @@ struct sock *tcp_v4_syn_recv_sock(struct if (sk_acceptq_is_full(sk)) goto exit_overflow; - if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL) + if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) goto exit; newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) goto exit; - newsk->sk_dst_cache = dst; - tcp_v4_setup_caps(newsk, dst); + sk_setup_caps(newsk, dst); newtp = tcp_sk(newsk); newinet = inet_sk(newsk); @@ -1560,7 +1055,7 @@ struct sock *tcp_v4_syn_recv_sock(struct newinet->saddr = ireq->loc_addr; newinet->opt = ireq->opt; ireq->opt = NULL; - newinet->mc_index = tcp_v4_iif(skb); + newinet->mc_index = inet_iif(skb); newinet->mc_ttl = skb->nh.iph->ttl; newtp->ext_header_len = 0; if (newinet->opt) @@ -1571,8 +1066,8 @@ struct sock *tcp_v4_syn_recv_sock(struct newtp->advmss = dst_metric(dst, RTAX_ADVMSS); tcp_initialize_rcv_mss(newsk); - __tcp_v4_hash(newsk, 0); - __tcp_inherit_port(sk, newsk); + __inet_hash(&tcp_hashinfo, newsk, 0); + __inet_inherit_port(&tcp_hashinfo, sk, newsk); return newsk; @@ -1588,27 +1083,24 @@ static struct sock *tcp_v4_hnd_req(struc { struct tcphdr *th = skb->h.th; struct iphdr *iph = skb->nh.iph; - struct tcp_sock *tp = tcp_sk(sk); struct sock *nsk; struct request_sock **prev; /* Find possible connection requests. */ - struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source, - iph->saddr, iph->daddr); + struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, + iph->saddr, iph->daddr); if (req) return tcp_check_req(sk, skb, req, prev); - nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr, - th->source, - skb->nh.iph->daddr, - ntohs(th->dest), - tcp_v4_iif(skb)); + nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr, + th->source, skb->nh.iph->daddr, + ntohs(th->dest), inet_iif(skb)); if (nsk) { if (nsk->sk_state != TCP_TIME_WAIT) { bh_lock_sock(nsk); return nsk; } - tcp_tw_put((struct tcp_tw_bucket *)nsk); + inet_twsk_put((struct inet_timewait_sock *)nsk); return NULL; } @@ -1744,9 +1236,9 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->flags = skb->nh.iph->tos; TCP_SKB_CB(skb)->sacked = 0; - sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source, - skb->nh.iph->daddr, ntohs(th->dest), - tcp_v4_iif(skb)); + sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source, + skb->nh.iph->daddr, ntohs(th->dest), + inet_iif(skb)); if (!sk) goto no_tcp_socket; @@ -1798,24 +1290,25 @@ discard_and_relse: do_time_wait: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - tcp_tw_put((struct tcp_tw_bucket *) sk); + inet_twsk_put((struct inet_timewait_sock *) sk); goto discard_it; } if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { TCP_INC_STATS_BH(TCP_MIB_INERRS); - tcp_tw_put((struct tcp_tw_bucket *) sk); + inet_twsk_put((struct inet_timewait_sock *) sk); goto discard_it; } - switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk, - skb, th, skb->len)) { + switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk, + skb, th)) { case TCP_TW_SYN: { - struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, - ntohs(th->dest), - tcp_v4_iif(skb)); + struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo, + skb->nh.iph->daddr, + ntohs(th->dest), + inet_iif(skb)); if (sk2) { - tcp_tw_deschedule((struct tcp_tw_bucket *)sk); - tcp_tw_put((struct tcp_tw_bucket *)sk); + tcp_tw_deschedule((struct inet_timewait_sock *)sk); + inet_twsk_put((struct inet_timewait_sock *)sk); sk = sk2; goto process; } @@ -1831,112 +1324,6 @@ do_time_wait: goto discard_it; } -/* With per-bucket locks this operation is not-atomic, so that - * this version is not worse. - */ -static void __tcp_v4_rehash(struct sock *sk) -{ - sk->sk_prot->unhash(sk); - sk->sk_prot->hash(sk); -} - -static int tcp_v4_reselect_saddr(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - int err; - struct rtable *rt; - __u32 old_saddr = inet->saddr; - __u32 new_saddr; - __u32 daddr = inet->daddr; - - if (inet->opt && inet->opt->srr) - daddr = inet->opt->faddr; - - /* Query new route. */ - err = ip_route_connect(&rt, daddr, 0, - RT_CONN_FLAGS(sk), - sk->sk_bound_dev_if, - IPPROTO_TCP, - inet->sport, inet->dport, sk); - if (err) - return err; - - __sk_dst_set(sk, &rt->u.dst); - tcp_v4_setup_caps(sk, &rt->u.dst); - - new_saddr = rt->rt_src; - - if (new_saddr == old_saddr) - return 0; - - if (sysctl_ip_dynaddr > 1) { - printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->" - "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n", - NIPQUAD(old_saddr), - NIPQUAD(new_saddr)); - } - - inet->saddr = new_saddr; - inet->rcv_saddr = new_saddr; - - /* XXX The only one ugly spot where we need to - * XXX really change the sockets identity after - * XXX it has entered the hashes. -DaveM - * - * Besides that, it does not check for connection - * uniqueness. Wait for troubles. - */ - __tcp_v4_rehash(sk); - return 0; -} - -int tcp_v4_rebuild_header(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); - u32 daddr; - int err; - - /* Route is OK, nothing to do. */ - if (rt) - return 0; - - /* Reroute. */ - daddr = inet->daddr; - if (inet->opt && inet->opt->srr) - daddr = inet->opt->faddr; - - { - struct flowi fl = { .oif = sk->sk_bound_dev_if, - .nl_u = { .ip4_u = - { .daddr = daddr, - .saddr = inet->saddr, - .tos = RT_CONN_FLAGS(sk) } }, - .proto = IPPROTO_TCP, - .uli_u = { .ports = - { .sport = inet->sport, - .dport = inet->dport } } }; - - err = ip_route_output_flow(&rt, &fl, sk, 0); - } - if (!err) { - __sk_dst_set(sk, &rt->u.dst); - tcp_v4_setup_caps(sk, &rt->u.dst); - return 0; - } - - /* Routing failed... */ - sk->sk_route_caps = 0; - - if (!sysctl_ip_dynaddr || - sk->sk_state != TCP_SYN_SENT || - (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || - (err = tcp_v4_reselect_saddr(sk)) != 0) - sk->sk_err_soft = -err; - - return err; -} - static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) { struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; @@ -1985,18 +1372,18 @@ int tcp_v4_remember_stamp(struct sock *s return 0; } -int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw) +int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) { - struct inet_peer *peer = NULL; - - peer = inet_getpeer(tw->tw_daddr, 1); + struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1); if (peer) { - if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 || + const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); + + if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && - peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) { - peer->tcp_ts_stamp = tw->tw_ts_recent_stamp; - peer->tcp_ts = tw->tw_ts_recent; + peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) { + peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp; + peer->tcp_ts = tcptw->tw_ts_recent; } inet_putpeer(peer); return 1; @@ -2008,7 +1395,7 @@ int tcp_v4_tw_remember_stamp(struct tcp_ struct tcp_func ipv4_specific = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, - .rebuild_header = tcp_v4_rebuild_header, + .rebuild_header = inet_sk_rebuild_header, .conn_request = tcp_v4_conn_request, .syn_recv_sock = tcp_v4_syn_recv_sock, .remember_stamp = tcp_v4_remember_stamp, @@ -2030,7 +1417,7 @@ static int tcp_v4_init_sock(struct sock tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); - tp->rto = TCP_TIMEOUT_INIT; + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev = TCP_TIMEOUT_INIT; /* So many TCP implementations out there (incorrectly) count the @@ -2083,8 +1470,8 @@ int tcp_v4_destroy_sock(struct sock *sk) __skb_queue_purge(&tp->ucopy.prequeue); /* Clean up a referenced TCP bind bucket. */ - if (tp->bind_hash) - tcp_put_port(sk); + if (inet_csk(sk)->icsk_bind_hash) + inet_put_port(&tcp_hashinfo, sk); /* * If sendmsg cached page exists, toss it. @@ -2104,13 +1491,13 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); #ifdef CONFIG_PROC_FS /* Proc filesystem TCP sock list dumping. */ -static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head) +static inline struct inet_timewait_sock *tw_head(struct hlist_head *head) { return hlist_empty(head) ? NULL : - list_entry(head->first, struct tcp_tw_bucket, tw_node); + list_entry(head->first, struct inet_timewait_sock, tw_node); } -static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw) +static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) { return tw->tw_node.next ? hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; @@ -2118,14 +1505,14 @@ static inline struct tcp_tw_bucket *tw_n static void *listening_get_next(struct seq_file *seq, void *cur) { - struct tcp_sock *tp; + struct inet_connection_sock *icsk; struct hlist_node *node; struct sock *sk = cur; struct tcp_iter_state* st = seq->private; if (!sk) { st->bucket = 0; - sk = sk_head(&tcp_listening_hash[0]); + sk = sk_head(&tcp_hashinfo.listening_hash[0]); goto get_sk; } @@ -2134,7 +1521,7 @@ static void *listening_get_next(struct s if (st->state == TCP_SEQ_STATE_OPENREQ) { struct request_sock *req = cur; - tp = tcp_sk(st->syn_wait_sk); + icsk = inet_csk(st->syn_wait_sk); req = req->dl_next; while (1) { while (req) { @@ -2147,17 +1534,17 @@ static void *listening_get_next(struct s if (++st->sbucket >= TCP_SYNQ_HSIZE) break; get_req: - req = tp->accept_queue.listen_opt->syn_table[st->sbucket]; + req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; } sk = sk_next(st->syn_wait_sk); st->state = TCP_SEQ_STATE_LISTENING; - read_unlock_bh(&tp->accept_queue.syn_wait_lock); + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } else { - tp = tcp_sk(sk); - read_lock_bh(&tp->accept_queue.syn_wait_lock); - if (reqsk_queue_len(&tp->accept_queue)) + icsk = inet_csk(sk); + read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + if (reqsk_queue_len(&icsk->icsk_accept_queue)) goto start_req; - read_unlock_bh(&tp->accept_queue.syn_wait_lock); + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); sk = sk_next(sk); } get_sk: @@ -2166,9 +1553,9 @@ get_sk: cur = sk; goto out; } - tp = tcp_sk(sk); - read_lock_bh(&tp->accept_queue.syn_wait_lock); - if (reqsk_queue_len(&tp->accept_queue)) { + icsk = inet_csk(sk); + read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + if (reqsk_queue_len(&icsk->icsk_accept_queue)) { start_req: st->uid = sock_i_uid(sk); st->syn_wait_sk = sk; @@ -2176,10 +1563,10 @@ start_req: st->sbucket = 0; goto get_req; } - read_unlock_bh(&tp->accept_queue.syn_wait_lock); + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } - if (++st->bucket < TCP_LHTABLE_SIZE) { - sk = sk_head(&tcp_listening_hash[st->bucket]); + if (++st->bucket < INET_LHTABLE_SIZE) { + sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]); goto get_sk; } cur = NULL; @@ -2203,16 +1590,16 @@ static void *established_get_first(struc struct tcp_iter_state* st = seq->private; void *rc = NULL; - for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) { + for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { struct sock *sk; struct hlist_node *node; - struct tcp_tw_bucket *tw; + struct inet_timewait_sock *tw; /* We can reschedule _before_ having picked the target: */ cond_resched_softirq(); - read_lock(&tcp_ehash[st->bucket].lock); - sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) { + read_lock(&tcp_hashinfo.ehash[st->bucket].lock); + sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { if (sk->sk_family != st->family) { continue; } @@ -2220,15 +1607,15 @@ static void *established_get_first(struc goto out; } st->state = TCP_SEQ_STATE_TIME_WAIT; - tw_for_each(tw, node, - &tcp_ehash[st->bucket + tcp_ehash_size].chain) { + inet_twsk_for_each(tw, node, + &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) { if (tw->tw_family != st->family) { continue; } rc = tw; goto out; } - read_unlock(&tcp_ehash[st->bucket].lock); + read_unlock(&tcp_hashinfo.ehash[st->bucket].lock); st->state = TCP_SEQ_STATE_ESTABLISHED; } out: @@ -2238,7 +1625,7 @@ out: static void *established_get_next(struct seq_file *seq, void *cur) { struct sock *sk = cur; - struct tcp_tw_bucket *tw; + struct inet_timewait_sock *tw; struct hlist_node *node; struct tcp_iter_state* st = seq->private; @@ -2255,15 +1642,15 @@ get_tw: cur = tw; goto out; } - read_unlock(&tcp_ehash[st->bucket].lock); + read_unlock(&tcp_hashinfo.ehash[st->bucket].lock); st->state = TCP_SEQ_STATE_ESTABLISHED; /* We can reschedule between buckets: */ cond_resched_softirq(); - if (++st->bucket < tcp_ehash_size) { - read_lock(&tcp_ehash[st->bucket].lock); - sk = sk_head(&tcp_ehash[st->bucket].chain); + if (++st->bucket < tcp_hashinfo.ehash_size) { + read_lock(&tcp_hashinfo.ehash[st->bucket].lock); + sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); } else { cur = NULL; goto out; @@ -2277,7 +1664,7 @@ get_tw: } st->state = TCP_SEQ_STATE_TIME_WAIT; - tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain); + tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain); goto get_tw; found: cur = sk; @@ -2301,12 +1688,12 @@ static void *tcp_get_idx(struct seq_file void *rc; struct tcp_iter_state* st = seq->private; - tcp_listen_lock(); + inet_listen_lock(&tcp_hashinfo); st->state = TCP_SEQ_STATE_LISTENING; rc = listening_get_idx(seq, &pos); if (!rc) { - tcp_listen_unlock(); + inet_listen_unlock(&tcp_hashinfo); local_bh_disable(); st->state = TCP_SEQ_STATE_ESTABLISHED; rc = established_get_idx(seq, pos); @@ -2339,7 +1726,7 @@ static void *tcp_seq_next(struct seq_fil case TCP_SEQ_STATE_LISTENING: rc = listening_get_next(seq, v); if (!rc) { - tcp_listen_unlock(); + inet_listen_unlock(&tcp_hashinfo); local_bh_disable(); st->state = TCP_SEQ_STATE_ESTABLISHED; rc = established_get_first(seq); @@ -2362,17 +1749,17 @@ static void tcp_seq_stop(struct seq_file switch (st->state) { case TCP_SEQ_STATE_OPENREQ: if (v) { - struct tcp_sock *tp = tcp_sk(st->syn_wait_sk); - read_unlock_bh(&tp->accept_queue.syn_wait_lock); + struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk); + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) - tcp_listen_unlock(); + inet_listen_unlock(&tcp_hashinfo); break; case TCP_SEQ_STATE_TIME_WAIT: case TCP_SEQ_STATE_ESTABLISHED: if (v) - read_unlock(&tcp_ehash[st->bucket].lock); + read_unlock(&tcp_hashinfo.ehash[st->bucket].lock); local_bh_enable(); break; } @@ -2469,18 +1856,19 @@ static void get_tcp4_sock(struct sock *s int timer_active; unsigned long timer_expires; struct tcp_sock *tp = tcp_sk(sp); + const struct inet_connection_sock *icsk = inet_csk(sp); struct inet_sock *inet = inet_sk(sp); unsigned int dest = inet->daddr; unsigned int src = inet->rcv_saddr; __u16 destp = ntohs(inet->dport); __u16 srcp = ntohs(inet->sport); - if (tp->pending == TCP_TIME_RETRANS) { + if (icsk->icsk_pending == ICSK_TIME_RETRANS) { timer_active = 1; - timer_expires = tp->timeout; - } else if (tp->pending == TCP_TIME_PROBE0) { + timer_expires = icsk->icsk_timeout; + } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = tp->timeout; + timer_expires = icsk->icsk_timeout; } else if (timer_pending(&sp->sk_timer)) { timer_active = 2; timer_expires = sp->sk_timer.expires; @@ -2495,17 +1883,19 @@ static void get_tcp4_sock(struct sock *s tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq, timer_active, jiffies_to_clock_t(timer_expires - jiffies), - tp->retransmits, + icsk->icsk_retransmits, sock_i_uid(sp), tp->probes_out, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, - tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong, + icsk->icsk_rto, + icsk->icsk_ack.ato, + (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, tp->snd_cwnd, tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh); } -static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i) +static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i) { unsigned int dest, src; __u16 destp, srcp; @@ -2585,7 +1975,7 @@ struct proto tcp_prot = { .close = tcp_close, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, - .accept = tcp_accept, + .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v4_init_sock, .destroy = tcp_v4_destroy_sock, @@ -2600,6 +1990,7 @@ struct proto tcp_prot = { .get_port = tcp_v4_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, .sockets_allocated = &tcp_sockets_allocated, + .orphan_count = &tcp_orphan_count, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, .sysctl_mem = sysctl_tcp_mem, @@ -2607,6 +1998,7 @@ struct proto tcp_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), + .twsk_obj_size = sizeof(struct tcp_timewait_sock), .rsk_prot = &tcp_request_sock_ops, }; @@ -2628,19 +2020,13 @@ void __init tcp_v4_init(struct net_proto } EXPORT_SYMBOL(ipv4_specific); -EXPORT_SYMBOL(tcp_bind_hash); -EXPORT_SYMBOL(tcp_bucket_create); +EXPORT_SYMBOL(inet_bind_bucket_create); EXPORT_SYMBOL(tcp_hashinfo); -EXPORT_SYMBOL(tcp_inherit_port); -EXPORT_SYMBOL(tcp_listen_wlock); -EXPORT_SYMBOL(tcp_port_rover); EXPORT_SYMBOL(tcp_prot); -EXPORT_SYMBOL(tcp_put_port); EXPORT_SYMBOL(tcp_unhash); EXPORT_SYMBOL(tcp_v4_conn_request); EXPORT_SYMBOL(tcp_v4_connect); EXPORT_SYMBOL(tcp_v4_do_rcv); -EXPORT_SYMBOL(tcp_v4_rebuild_header); EXPORT_SYMBOL(tcp_v4_remember_stamp); EXPORT_SYMBOL(tcp_v4_send_check); EXPORT_SYMBOL(tcp_v4_syn_recv_sock); diff -puN net/ipv4/tcp_minisocks.c~git-net net/ipv4/tcp_minisocks.c --- devel/net/ipv4/tcp_minisocks.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/tcp_minisocks.c 2005-08-06 23:39:10.000000000 -0700 @@ -41,7 +41,7 @@ int sysctl_tcp_max_tw_buckets = NR_FILE* int sysctl_tcp_syncookies = SYNC_INIT; int sysctl_tcp_abort_on_overflow; -static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo); +static void tcp_tw_schedule(struct inet_timewait_sock *tw, int timeo); static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { @@ -56,43 +56,6 @@ static __inline__ int tcp_in_window(u32 int tcp_tw_count; - -/* Must be called with locally disabled BHs. */ -static void tcp_timewait_kill(struct tcp_tw_bucket *tw) -{ - struct tcp_ehash_bucket *ehead; - struct tcp_bind_hashbucket *bhead; - struct tcp_bind_bucket *tb; - - /* Unlink from established hashes. */ - ehead = &tcp_ehash[tw->tw_hashent]; - write_lock(&ehead->lock); - if (hlist_unhashed(&tw->tw_node)) { - write_unlock(&ehead->lock); - return; - } - __hlist_del(&tw->tw_node); - sk_node_init(&tw->tw_node); - write_unlock(&ehead->lock); - - /* Disassociate with bind bucket. */ - bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)]; - spin_lock(&bhead->lock); - tb = tw->tw_tb; - __hlist_del(&tw->tw_bind_node); - tw->tw_tb = NULL; - tcp_bucket_destroy(tb); - spin_unlock(&bhead->lock); - -#ifdef INET_REFCNT_DEBUG - if (atomic_read(&tw->tw_refcnt) != 1) { - printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw, - atomic_read(&tw->tw_refcnt)); - } -#endif - tcp_tw_put(tw); -} - /* * * Main purpose of TIME-WAIT state is to close connection gracefully, * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN @@ -122,19 +85,20 @@ static void tcp_timewait_kill(struct tcp * to avoid misread sequence numbers, states etc. --ANK */ enum tcp_tw_status -tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, - struct tcphdr *th, unsigned len) +tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, + const struct tcphdr *th) { + struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); struct tcp_options_received tmp_opt; int paws_reject = 0; tmp_opt.saw_tstamp = 0; - if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) { + if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { tcp_parse_options(skb, &tmp_opt, 0); if (tmp_opt.saw_tstamp) { - tmp_opt.ts_recent = tw->tw_ts_recent; - tmp_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; + tmp_opt.ts_recent = tcptw->tw_ts_recent; + tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; paws_reject = tcp_paws_check(&tmp_opt, th->rst); } } @@ -145,20 +109,20 @@ tcp_timewait_state_process(struct tcp_tw /* Out of window, send ACK */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tw->tw_rcv_nxt, - tw->tw_rcv_nxt + tw->tw_rcv_wnd)) + tcptw->tw_rcv_nxt, + tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd)) return TCP_TW_ACK; if (th->rst) goto kill; - if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt)) + if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt)) goto kill_with_rst; /* Dup ACK? */ - if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) || + if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) || TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { - tcp_tw_put(tw); + inet_twsk_put(tw); return TCP_TW_SUCCESS; } @@ -166,19 +130,19 @@ tcp_timewait_state_process(struct tcp_tw * reset. */ if (!th->fin || - TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) { + TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { kill_with_rst: tcp_tw_deschedule(tw); - tcp_tw_put(tw); + inet_twsk_put(tw); return TCP_TW_RST; } /* FIN arrived, enter true time-wait state. */ - tw->tw_substate = TCP_TIME_WAIT; - tw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tw->tw_substate = TCP_TIME_WAIT; + tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (tmp_opt.saw_tstamp) { - tw->tw_ts_recent_stamp = xtime.tv_sec; - tw->tw_ts_recent = tmp_opt.rcv_tsval; + tcptw->tw_ts_recent_stamp = xtime.tv_sec; + tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } /* I am shamed, but failed to make it more elegant. @@ -187,7 +151,7 @@ kill_with_rst: * do not undertsnad recycling in any case, it not * a big problem in practice. --ANK */ if (tw->tw_family == AF_INET && - sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp && + sysctl_tcp_tw_recycle && tcptw->tw_ts_recent_stamp && tcp_v4_tw_remember_stamp(tw)) tcp_tw_schedule(tw, tw->tw_timeout); else @@ -213,7 +177,7 @@ kill_with_rst: */ if (!paws_reject && - (TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt && + (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt && (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { /* In window segment, it may be only reset or bare ack. */ @@ -225,18 +189,18 @@ kill_with_rst: if (sysctl_tcp_rfc1337 == 0) { kill: tcp_tw_deschedule(tw); - tcp_tw_put(tw); + inet_twsk_put(tw); return TCP_TW_SUCCESS; } } tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); if (tmp_opt.saw_tstamp) { - tw->tw_ts_recent = tmp_opt.rcv_tsval; - tw->tw_ts_recent_stamp = xtime.tv_sec; + tcptw->tw_ts_recent = tmp_opt.rcv_tsval; + tcptw->tw_ts_recent_stamp = xtime.tv_sec; } - tcp_tw_put(tw); + inet_twsk_put(tw); return TCP_TW_SUCCESS; } @@ -258,9 +222,10 @@ kill: */ if (th->syn && !th->rst && !th->ack && !paws_reject && - (after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) || - (tmp_opt.saw_tstamp && (s32)(tw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { - u32 isn = tw->tw_snd_nxt + 65535 + 2; + (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) || + (tmp_opt.saw_tstamp && + (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { + u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; TCP_SKB_CB(skb)->when = isn; @@ -285,100 +250,49 @@ kill: */ return TCP_TW_ACK; } - tcp_tw_put(tw); + inet_twsk_put(tw); return TCP_TW_SUCCESS; } -/* Enter the time wait state. This is called with locally disabled BH. - * Essentially we whip up a timewait bucket, copy the - * relevant info into it from the SK, and mess with hash chains - * and list linkage. - */ -static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) -{ - struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent]; - struct tcp_bind_hashbucket *bhead; - - /* Step 1: Put TW into bind hash. Original socket stays there too. - Note, that any socket with inet_sk(sk)->num != 0 MUST be bound in - binding cache, even if it is closed. - */ - bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)]; - spin_lock(&bhead->lock); - tw->tw_tb = tcp_sk(sk)->bind_hash; - BUG_TRAP(tcp_sk(sk)->bind_hash); - tw_add_bind_node(tw, &tw->tw_tb->owners); - spin_unlock(&bhead->lock); - - write_lock(&ehead->lock); - - /* Step 2: Remove SK from established hash. */ - if (__sk_del_node_init(sk)) - sock_prot_dec_use(sk->sk_prot); - - /* Step 3: Hash TW into TIMEWAIT half of established hash table. */ - tw_add_node(tw, &(ehead + tcp_ehash_size)->chain); - atomic_inc(&tw->tw_refcnt); - - write_unlock(&ehead->lock); -} - /* * Move a socket to time-wait or dead fin-wait-2 state. */ void tcp_time_wait(struct sock *sk, int state, int timeo) { - struct tcp_tw_bucket *tw = NULL; - struct tcp_sock *tp = tcp_sk(sk); + struct inet_timewait_sock *tw = NULL; + const struct tcp_sock *tp = tcp_sk(sk); int recycle_ok = 0; if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp) recycle_ok = tp->af_specific->remember_stamp(sk); if (tcp_tw_count < sysctl_tcp_max_tw_buckets) - tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); + tw = inet_twsk_alloc(sk, state); - if(tw != NULL) { - struct inet_sock *inet = inet_sk(sk); - int rto = (tp->rto<<2) - (tp->rto>>1); - - /* Give us an identity. */ - tw->tw_daddr = inet->daddr; - tw->tw_rcv_saddr = inet->rcv_saddr; - tw->tw_bound_dev_if = sk->sk_bound_dev_if; - tw->tw_num = inet->num; - tw->tw_state = TCP_TIME_WAIT; - tw->tw_substate = state; - tw->tw_sport = inet->sport; - tw->tw_dport = inet->dport; - tw->tw_family = sk->sk_family; - tw->tw_reuse = sk->sk_reuse; - tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; - atomic_set(&tw->tw_refcnt, 1); + if (tw != NULL) { + struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); + const struct inet_connection_sock *icsk = inet_csk(sk); + const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); - tw->tw_hashent = sk->sk_hashent; - tw->tw_rcv_nxt = tp->rcv_nxt; - tw->tw_snd_nxt = tp->snd_nxt; - tw->tw_rcv_wnd = tcp_receive_window(tp); - tw->tw_ts_recent = tp->rx_opt.ts_recent; - tw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; - tw_dead_node_init(tw); + tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; + tcptw->tw_rcv_nxt = tp->rcv_nxt; + tcptw->tw_snd_nxt = tp->snd_nxt; + tcptw->tw_rcv_wnd = tcp_receive_window(tp); + tcptw->tw_ts_recent = tp->rx_opt.ts_recent; + tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); + struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); - ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr); - ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr); - tw->tw_v6_ipv6only = np->ipv6only; - } else { - memset(&tw->tw_v6_daddr, 0, sizeof(tw->tw_v6_daddr)); - memset(&tw->tw_v6_rcv_saddr, 0, sizeof(tw->tw_v6_rcv_saddr)); - tw->tw_v6_ipv6only = 0; + ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr); + ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr); + tw->tw_ipv6only = np->ipv6only; } #endif /* Linkage updates. */ - __tcp_tw_hashdance(sk, tw); + __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) @@ -393,7 +307,7 @@ void tcp_time_wait(struct sock *sk, int } tcp_tw_schedule(tw, timeo); - tcp_tw_put(tw); + inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than @@ -428,7 +342,7 @@ static u32 twkill_thread_slots; /* Returns non-zero if quota exceeded. */ static int tcp_do_twkill_work(int slot, unsigned int quota) { - struct tcp_tw_bucket *tw; + struct inet_timewait_sock *tw; struct hlist_node *node; unsigned int killed; int ret; @@ -442,11 +356,11 @@ static int tcp_do_twkill_work(int slot, killed = 0; ret = 0; rescan: - tw_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) { - __tw_del_dead_node(tw); + inet_twsk_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) { + __inet_twsk_del_dead_node(tw); spin_unlock(&tw_death_lock); - tcp_timewait_kill(tw); - tcp_tw_put(tw); + __inet_twsk_kill(tw, &tcp_hashinfo); + inet_twsk_put(tw); killed++; spin_lock(&tw_death_lock); if (killed > quota) { @@ -532,16 +446,16 @@ static void twkill_work(void *dummy) */ /* This is for handling early-kills of TIME_WAIT sockets. */ -void tcp_tw_deschedule(struct tcp_tw_bucket *tw) +void tcp_tw_deschedule(struct inet_timewait_sock *tw) { spin_lock(&tw_death_lock); - if (tw_del_dead_node(tw)) { - tcp_tw_put(tw); + if (inet_twsk_del_dead_node(tw)) { + inet_twsk_put(tw); if (--tcp_tw_count == 0) del_timer(&tcp_tw_timer); } spin_unlock(&tw_death_lock); - tcp_timewait_kill(tw); + __inet_twsk_kill(tw, &tcp_hashinfo); } /* Short-time timewait calendar */ @@ -553,7 +467,7 @@ static struct timer_list tcp_twcal_timer TIMER_INITIALIZER(tcp_twcal_tick, 0, 0); static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS]; -static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo) +static void tcp_tw_schedule(struct inet_timewait_sock *tw, const int timeo) { struct hlist_head *list; int slot; @@ -587,7 +501,7 @@ static void tcp_tw_schedule(struct tcp_t spin_lock(&tw_death_lock); /* Unlink it, if it was scheduled */ - if (tw_del_dead_node(tw)) + if (inet_twsk_del_dead_node(tw)) tcp_tw_count--; else atomic_inc(&tw->tw_refcnt); @@ -645,13 +559,13 @@ void tcp_twcal_tick(unsigned long dummy) for (n=0; nsk_prot -acme */ - struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0); + struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); - if(newsk != NULL) { - struct inet_request_sock *ireq = inet_rsk(req); + if (newsk != NULL) { + const struct inet_request_sock *ireq = inet_rsk(req); struct tcp_request_sock *treq = tcp_rsk(req); + struct inet_connection_sock *newicsk = inet_csk(sk); struct tcp_sock *newtp; - struct sk_filter *filter; - - memcpy(newsk, sk, sizeof(struct tcp_sock)); - newsk->sk_state = TCP_SYN_RECV; - - /* SANITY */ - sk_node_init(&newsk->sk_node); - tcp_sk(newsk)->bind_hash = NULL; - - /* Clone the TCP header template */ - inet_sk(newsk)->dport = ireq->rmt_port; - - sock_lock_init(newsk); - bh_lock_sock(newsk); - - rwlock_init(&newsk->sk_dst_lock); - atomic_set(&newsk->sk_rmem_alloc, 0); - skb_queue_head_init(&newsk->sk_receive_queue); - atomic_set(&newsk->sk_wmem_alloc, 0); - skb_queue_head_init(&newsk->sk_write_queue); - atomic_set(&newsk->sk_omem_alloc, 0); - newsk->sk_wmem_queued = 0; - newsk->sk_forward_alloc = 0; - - sock_reset_flag(newsk, SOCK_DONE); - newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; - newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; - newsk->sk_send_head = NULL; - rwlock_init(&newsk->sk_callback_lock); - skb_queue_head_init(&newsk->sk_error_queue); - newsk->sk_write_space = sk_stream_write_space; - - if ((filter = newsk->sk_filter) != NULL) - sk_filter_charge(newsk, filter); - - if (unlikely(xfrm_sk_clone_policy(newsk))) { - /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - newsk->sk_destruct = NULL; - sk_free(newsk); - return NULL; - } /* Now setup tcp_sock */ newtp = tcp_sk(newsk); newtp->pred_flags = 0; newtp->rcv_nxt = treq->rcv_isn + 1; - newtp->snd_nxt = treq->snt_isn + 1; - newtp->snd_una = treq->snt_isn + 1; - newtp->snd_sml = treq->snt_isn + 1; + newtp->snd_nxt = newtp->snd_una = newtp->snd_sml = treq->snt_isn + 1; tcp_prequeue_init(newtp); tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn); - newtp->retransmits = 0; - newtp->backoff = 0; newtp->srtt = 0; newtp->mdev = TCP_TIMEOUT_INIT; - newtp->rto = TCP_TIMEOUT_INIT; + newicsk->icsk_rto = TCP_TIMEOUT_INIT; newtp->packets_out = 0; newtp->left_out = 0; @@ -792,23 +658,10 @@ struct sock *tcp_create_openreq_child(st newtp->probes_out = 0; newtp->rx_opt.num_sacks = 0; newtp->urg_data = 0; - /* Deinitialize accept_queue to trap illegal accesses. */ - memset(&newtp->accept_queue, 0, sizeof(newtp->accept_queue)); - - /* Back to base struct sock members. */ - newsk->sk_err = 0; - newsk->sk_priority = 0; - atomic_set(&newsk->sk_refcnt, 2); -#ifdef INET_REFCNT_DEBUG - atomic_inc(&inet_sock_nr); -#endif - atomic_inc(&tcp_sockets_allocated); if (sock_flag(newsk, SOCK_KEEPOPEN)) - tcp_reset_keepalive_timer(newsk, - keepalive_time_when(newtp)); - newsk->sk_socket = NULL; - newsk->sk_sleep = NULL; + inet_csk_reset_keepalive_timer(newsk, + keepalive_time_when(newtp)); newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { @@ -838,7 +691,7 @@ struct sock *tcp_create_openreq_child(st newtp->tcp_header_len = sizeof(struct tcphdr); } if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len) - newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len; + newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; TCP_ECN_openreq_child(newtp, req); if (newtp->ecn_flags&TCP_ECN_OK) @@ -934,9 +787,10 @@ struct sock *tcp_check_req(struct sock * does sequence test, SYN is truncated, and thus we consider it a bare ACK. - If tp->defer_accept, we silently drop this bare ACK. Otherwise, - we create an established connection. Both ends (listening sockets) - accept the new incoming connection and try to talk to each other. 8-) + If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this + bare ACK. Otherwise, we create an established connection. Both + ends (listening sockets) accept the new incoming connection and try + to talk to each other. 8-) Note: This case is both harmless, and rare. Possibility is about the same as us discovering intelligent life on another plant tomorrow. @@ -1003,7 +857,8 @@ struct sock *tcp_check_req(struct sock * return NULL; /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ - if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { + if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && + TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { inet_rsk(req)->acked = 1; return NULL; } @@ -1018,10 +873,10 @@ struct sock *tcp_check_req(struct sock * if (child == NULL) goto listen_overflow; - tcp_synq_unlink(tp, req, prev); - tcp_synq_removed(sk, req); + inet_csk_reqsk_queue_unlink(sk, req, prev); + inet_csk_reqsk_queue_removed(sk, req); - tcp_acceptq_queue(sk, req, child); + inet_csk_reqsk_queue_add(sk, req, child); return child; listen_overflow: @@ -1035,7 +890,7 @@ struct sock *tcp_check_req(struct sock * if (!(flg & TCP_FLAG_RST)) req->rsk_ops->send_reset(skb); - tcp_synq_drop(sk, req, prev); + inet_csk_reqsk_queue_drop(sk, req, prev); return NULL; } diff -puN net/ipv4/tcp_output.c~git-net net/ipv4/tcp_output.c --- devel/net/ipv4/tcp_output.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/tcp_output.c 2005-08-06 23:39:10.000000000 -0700 @@ -105,8 +105,9 @@ static __u16 tcp_advertise_mss(struct so /* RFC2861. Reset CWND after idle period longer RTO to "restart window". * This is the first part of cwnd validation mechanism. */ -static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) +static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) { + struct tcp_sock *tp = tcp_sk(sk); s32 delta = tcp_time_stamp - tp->lsndtime; u32 restart_cwnd = tcp_init_cwnd(tp, dst); u32 cwnd = tp->snd_cwnd; @@ -116,7 +117,7 @@ static void tcp_cwnd_restart(struct tcp_ tp->snd_ssthresh = tcp_current_ssthresh(tp); restart_cwnd = min(restart_cwnd, cwnd); - while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd) + while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd) cwnd >>= 1; tp->snd_cwnd = max(cwnd, restart_cwnd); tp->snd_cwnd_stamp = tcp_time_stamp; @@ -126,26 +127,25 @@ static void tcp_cwnd_restart(struct tcp_ static inline void tcp_event_data_sent(struct tcp_sock *tp, struct sk_buff *skb, struct sock *sk) { - u32 now = tcp_time_stamp; + struct inet_connection_sock *icsk = inet_csk(sk); + const u32 now = tcp_time_stamp; - if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto) - tcp_cwnd_restart(tp, __sk_dst_get(sk)); + if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto) + tcp_cwnd_restart(sk, __sk_dst_get(sk)); tp->lsndtime = now; /* If it is a reply for ato after last received * packet, enter pingpong mode. */ - if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato) - tp->ack.pingpong = 1; + if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) + icsk->icsk_ack.pingpong = 1; } static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) { - struct tcp_sock *tp = tcp_sk(sk); - - tcp_dec_quickack_mode(tp, pkts); - tcp_clear_xmit_timer(sk, TCP_TIME_DACK); + tcp_dec_quickack_mode(sk, pkts); + inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } /* Determine a window scaling and initial window to offer. @@ -505,7 +505,7 @@ static int tcp_fragment(struct sock *sk, /* Link BUFF into the send queue. */ skb_header_release(buff); - __skb_append(skb, buff); + __skb_append(skb, buff, &sk->sk_write_queue); return 0; } @@ -696,7 +696,7 @@ static inline void tcp_cwnd_validate(str if (tp->packets_out > tp->snd_cwnd_used) tp->snd_cwnd_used = tp->packets_out; - if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto) + if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto) tcp_cwnd_application_limited(sk); } } @@ -892,7 +892,7 @@ static int tso_fragment(struct sock *sk, /* Link BUFF into the send queue. */ skb_header_release(buff); - __skb_append(skb, buff); + __skb_append(skb, buff, &sk->sk_write_queue); return 0; } @@ -1150,6 +1150,7 @@ void tcp_push_one(struct sock *sk, unsig */ u32 __tcp_select_window(struct sock *sk) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); /* MSS for the peer's data. Previous verions used mss_clamp * here. I don't know if the value based on our guesses @@ -1157,7 +1158,7 @@ u32 __tcp_select_window(struct sock *sk) * but may be worse for the performance because of rcv_mss * fluctuations. --SAW 1998/11/1 */ - int mss = tp->ack.rcv_mss; + int mss = icsk->icsk_ack.rcv_mss; int free_space = tcp_space(sk); int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); int window; @@ -1166,7 +1167,7 @@ u32 __tcp_select_window(struct sock *sk) mss = full_space; if (free_space < full_space/2) { - tp->ack.quick = 0; + icsk->icsk_ack.quick = 0; if (tcp_memory_pressure) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss); @@ -1241,7 +1242,7 @@ static void tcp_retrans_try_collapse(str tcp_skb_pcount(next_skb) != 1); /* Ok. We will be able to collapse the packet. */ - __skb_unlink(next_skb, next_skb->list); + __skb_unlink(next_skb, &sk->sk_write_queue); memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); @@ -1488,7 +1489,9 @@ void tcp_xmit_retransmit_queue(struct so if (skb == skb_peek(&sk->sk_write_queue)) - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, + TCP_RTO_MAX); } packet_cnt -= tcp_skb_pcount(skb); @@ -1541,7 +1544,9 @@ void tcp_xmit_retransmit_queue(struct so break; if (skb == skb_peek(&sk->sk_write_queue)) - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, + TCP_RTO_MAX); NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS); } @@ -1777,8 +1782,8 @@ static inline void tcp_connect_init(stru tp->rcv_wup = 0; tp->copied_seq = 0; - tp->rto = TCP_TIMEOUT_INIT; - tp->retransmits = 0; + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; + inet_csk(sk)->icsk_retransmits = 0; tcp_clear_retrans(tp); } @@ -1821,7 +1826,8 @@ int tcp_connect(struct sock *sk) TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, TCP_RTO_MAX); return 0; } @@ -1831,20 +1837,21 @@ int tcp_connect(struct sock *sk) */ void tcp_send_delayed_ack(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); - int ato = tp->ack.ato; + struct inet_connection_sock *icsk = inet_csk(sk); + int ato = icsk->icsk_ack.ato; unsigned long timeout; if (ato > TCP_DELACK_MIN) { + const struct tcp_sock *tp = tcp_sk(sk); int max_ato = HZ/2; - if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED)) + if (icsk->icsk_ack.pingpong || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) max_ato = TCP_DELACK_MAX; /* Slow path, intersegment interval is "high". */ /* If some rtt estimate is known, use it to bound delayed ack. - * Do not use tp->rto here, use results of rtt measurements + * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements * directly. */ if (tp->srtt) { @@ -1861,21 +1868,22 @@ void tcp_send_delayed_ack(struct sock *s timeout = jiffies + ato; /* Use new timeout only if there wasn't a older one earlier. */ - if (tp->ack.pending&TCP_ACK_TIMER) { + if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { /* If delack timer was blocked or is about to expire, * send ACK now. */ - if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) { + if (icsk->icsk_ack.blocked || + time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { tcp_send_ack(sk); return; } - if (!time_before(timeout, tp->ack.timeout)) - timeout = tp->ack.timeout; + if (!time_before(timeout, icsk->icsk_ack.timeout)) + timeout = icsk->icsk_ack.timeout; } - tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER; - tp->ack.timeout = timeout; - sk_reset_timer(sk, &tp->delack_timer, timeout); + icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; + icsk->icsk_ack.timeout = timeout; + sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); } /* This routine sends an ack and also updates the window. */ @@ -1892,9 +1900,10 @@ void tcp_send_ack(struct sock *sk) */ buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (buff == NULL) { - tcp_schedule_ack(tp); - tp->ack.ato = TCP_ATO_MIN; - tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); + inet_csk_schedule_ack(sk); + inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MAX, TCP_RTO_MAX); return; } @@ -2008,6 +2017,7 @@ int tcp_write_wakeup(struct sock *sk) */ void tcp_send_probe0(struct sock *sk) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int err; @@ -2016,16 +2026,17 @@ void tcp_send_probe0(struct sock *sk) if (tp->packets_out || !sk->sk_send_head) { /* Cancel probe timer, if it is not required. */ tp->probes_out = 0; - tp->backoff = 0; + icsk->icsk_backoff = 0; return; } if (err <= 0) { - if (tp->backoff < sysctl_tcp_retries2) - tp->backoff++; + if (icsk->icsk_backoff < sysctl_tcp_retries2) + icsk->icsk_backoff++; tp->probes_out++; - tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, - min(tp->rto << tp->backoff, TCP_RTO_MAX)); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), + TCP_RTO_MAX); } else { /* If packet was not sent due to local congestion, * do not backoff and do not remember probes_out. @@ -2035,8 +2046,10 @@ void tcp_send_probe0(struct sock *sk) */ if (!tp->probes_out) tp->probes_out=1; - tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, - min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL)); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + min(icsk->icsk_rto << icsk->icsk_backoff, + TCP_RESOURCE_PROBE_INTERVAL), + TCP_RTO_MAX); } } diff -puN net/ipv4/tcp_timer.c~git-net net/ipv4/tcp_timer.c --- devel/net/ipv4/tcp_timer.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/tcp_timer.c 2005-08-06 23:39:10.000000000 -0700 @@ -36,49 +36,13 @@ static void tcp_write_timer(unsigned lon static void tcp_delack_timer(unsigned long); static void tcp_keepalive_timer (unsigned long data); -#ifdef TCP_DEBUG -const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n"; -EXPORT_SYMBOL(tcp_timer_bug_msg); -#endif - -/* - * Using different timers for retransmit, delayed acks and probes - * We may wish use just one timer maintaining a list of expire jiffies - * to optimize. - */ - void tcp_init_xmit_timers(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); - - init_timer(&tp->retransmit_timer); - tp->retransmit_timer.function=&tcp_write_timer; - tp->retransmit_timer.data = (unsigned long) sk; - tp->pending = 0; - - init_timer(&tp->delack_timer); - tp->delack_timer.function=&tcp_delack_timer; - tp->delack_timer.data = (unsigned long) sk; - tp->ack.pending = 0; - - init_timer(&sk->sk_timer); - sk->sk_timer.function = &tcp_keepalive_timer; - sk->sk_timer.data = (unsigned long)sk; + inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, + &tcp_keepalive_timer); } -void tcp_clear_xmit_timers(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->pending = 0; - sk_stop_timer(sk, &tp->retransmit_timer); - - tp->ack.pending = 0; - tp->ack.blocked = 0; - sk_stop_timer(sk, &tp->delack_timer); - - sk_stop_timer(sk, &sk->sk_timer); -} +EXPORT_SYMBOL(tcp_init_xmit_timers); static void tcp_write_err(struct sock *sk) { @@ -155,15 +119,15 @@ static int tcp_orphan_retries(struct soc /* A write timeout has occurred. Process the after effects. */ static int tcp_write_timeout(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); int retry_until; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { - if (tp->retransmits) + if (icsk->icsk_retransmits) dst_negative_advice(&sk->sk_dst_cache); - retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries; + retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; } else { - if (tp->retransmits >= sysctl_tcp_retries1) { + if (icsk->icsk_retransmits >= sysctl_tcp_retries1) { /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black hole detection. :-( @@ -189,16 +153,16 @@ static int tcp_write_timeout(struct sock retry_until = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - int alive = (tp->rto < TCP_RTO_MAX); + const int alive = (icsk->icsk_rto < TCP_RTO_MAX); retry_until = tcp_orphan_retries(sk, alive); - if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until)) + if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until)) return 1; } } - if (tp->retransmits >= retry_until) { + if (icsk->icsk_retransmits >= retry_until) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; @@ -210,26 +174,27 @@ static void tcp_delack_timer(unsigned lo { struct sock *sk = (struct sock*)data; struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. */ - tp->ack.blocked = 1; + icsk->icsk_ack.blocked = 1; NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED); - sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN); + sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN); goto out_unlock; } sk_stream_mem_reclaim(sk); - if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER)) + if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) goto out; - if (time_after(tp->ack.timeout, jiffies)) { - sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout); + if (time_after(icsk->icsk_ack.timeout, jiffies)) { + sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); goto out; } - tp->ack.pending &= ~TCP_ACK_TIMER; + icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; if (!skb_queue_empty(&tp->ucopy.prequeue)) { struct sk_buff *skb; @@ -242,16 +207,16 @@ static void tcp_delack_timer(unsigned lo tp->ucopy.memory = 0; } - if (tcp_ack_scheduled(tp)) { - if (!tp->ack.pingpong) { + if (inet_csk_ack_scheduled(sk)) { + if (!icsk->icsk_ack.pingpong) { /* Delayed ACK missed: inflate ATO. */ - tp->ack.ato = min(tp->ack.ato << 1, tp->rto); + icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto); } else { /* Delayed ACK missed: leave pingpong mode and * deflate ATO. */ - tp->ack.pingpong = 0; - tp->ack.ato = TCP_ATO_MIN; + icsk->icsk_ack.pingpong = 0; + icsk->icsk_ack.ato = TCP_ATO_MIN; } tcp_send_ack(sk); NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS); @@ -294,7 +259,8 @@ static void tcp_probe_timer(struct sock max_probes = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - int alive = ((tp->rto<backoff) < TCP_RTO_MAX); + const struct inet_connection_sock *icsk = inet_csk(sk); + const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX); max_probes = tcp_orphan_retries(sk, alive); @@ -317,6 +283,7 @@ static void tcp_probe_timer(struct sock static void tcp_retransmit_timer(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); if (!tp->packets_out) goto out; @@ -351,7 +318,7 @@ static void tcp_retransmit_timer(struct if (tcp_write_timeout(sk)) goto out; - if (tp->retransmits == 0) { + if (icsk->icsk_retransmits == 0) { if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) { if (tp->rx_opt.sack_ok) { if (tp->ca_state == TCP_CA_Recovery) @@ -381,10 +348,11 @@ static void tcp_retransmit_timer(struct /* Retransmission failed because of local congestion, * do not backoff. */ - if (!tp->retransmits) - tp->retransmits=1; - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, - min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL)); + if (!icsk->icsk_retransmits) + icsk->icsk_retransmits = 1; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), + TCP_RTO_MAX); goto out; } @@ -403,13 +371,13 @@ static void tcp_retransmit_timer(struct * implemented ftp to mars will work nicely. We will have to fix * the 120 second clamps though! */ - tp->backoff++; - tp->retransmits++; + icsk->icsk_backoff++; + icsk->icsk_retransmits++; out_reset_timer: - tp->rto = min(tp->rto << 1, TCP_RTO_MAX); - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); - if (tp->retransmits > sysctl_tcp_retries1) + icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); + if (icsk->icsk_retransmits > sysctl_tcp_retries1) __sk_dst_reset(sk); out:; @@ -418,32 +386,32 @@ out:; static void tcp_write_timer(unsigned long data) { struct sock *sk = (struct sock*)data; - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); int event; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later */ - sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20)); + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20)); goto out_unlock; } - if (sk->sk_state == TCP_CLOSE || !tp->pending) + if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) goto out; - if (time_after(tp->timeout, jiffies)) { - sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout); + if (time_after(icsk->icsk_timeout, jiffies)) { + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); goto out; } - event = tp->pending; - tp->pending = 0; + event = icsk->icsk_pending; + icsk->icsk_pending = 0; switch (event) { - case TCP_TIME_RETRANS: + case ICSK_TIME_RETRANS: tcp_retransmit_timer(sk); break; - case TCP_TIME_PROBE0: + case ICSK_TIME_PROBE0: tcp_probe_timer(sk); break; } @@ -462,96 +430,8 @@ out_unlock: static void tcp_synack_timer(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); - struct listen_sock *lopt = tp->accept_queue.listen_opt; - int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries; - int thresh = max_retries; - unsigned long now = jiffies; - struct request_sock **reqp, *req; - int i, budget; - - if (lopt == NULL || lopt->qlen == 0) - return; - - /* Normally all the openreqs are young and become mature - * (i.e. converted to established socket) for first timeout. - * If synack was not acknowledged for 3 seconds, it means - * one of the following things: synack was lost, ack was lost, - * rtt is high or nobody planned to ack (i.e. synflood). - * When server is a bit loaded, queue is populated with old - * open requests, reducing effective size of queue. - * When server is well loaded, queue size reduces to zero - * after several minutes of work. It is not synflood, - * it is normal operation. The solution is pruning - * too old entries overriding normal timeout, when - * situation becomes dangerous. - * - * Essentially, we reserve half of room for young - * embrions; and abort old ones without pity, if old - * ones are about to clog our table. - */ - if (lopt->qlen>>(lopt->max_qlen_log-1)) { - int young = (lopt->qlen_young<<1); - - while (thresh > 2) { - if (lopt->qlen < young) - break; - thresh--; - young <<= 1; - } - } - - if (tp->defer_accept) - max_retries = tp->defer_accept; - - budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL)); - i = lopt->clock_hand; - - do { - reqp=&lopt->syn_table[i]; - while ((req = *reqp) != NULL) { - if (time_after_eq(now, req->expires)) { - if ((req->retrans < thresh || - (inet_rsk(req)->acked && req->retrans < max_retries)) - && !req->rsk_ops->rtx_syn_ack(sk, req, NULL)) { - unsigned long timeo; - - if (req->retrans++ == 0) - lopt->qlen_young--; - timeo = min((TCP_TIMEOUT_INIT << req->retrans), - TCP_RTO_MAX); - req->expires = now + timeo; - reqp = &req->dl_next; - continue; - } - - /* Drop this request */ - tcp_synq_unlink(tp, req, reqp); - reqsk_queue_removed(&tp->accept_queue, req); - reqsk_free(req); - continue; - } - reqp = &req->dl_next; - } - - i = (i+1)&(TCP_SYNQ_HSIZE-1); - - } while (--budget > 0); - - lopt->clock_hand = i; - - if (lopt->qlen) - tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL); -} - -void tcp_delete_keepalive_timer (struct sock *sk) -{ - sk_stop_timer(sk, &sk->sk_timer); -} - -void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len) -{ - sk_reset_timer(sk, &sk->sk_timer, jiffies + len); + inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, + TCP_TIMEOUT_INIT, TCP_RTO_MAX); } void tcp_set_keepalive(struct sock *sk, int val) @@ -560,9 +440,9 @@ void tcp_set_keepalive(struct sock *sk, return; if (val && !sock_flag(sk, SOCK_KEEPOPEN)) - tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); + inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); else if (!val) - tcp_delete_keepalive_timer(sk); + inet_csk_delete_keepalive_timer(sk); } @@ -576,7 +456,7 @@ static void tcp_keepalive_timer (unsigne bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. */ - tcp_reset_keepalive_timer (sk, HZ/20); + inet_csk_reset_keepalive_timer (sk, HZ/20); goto out; } @@ -587,7 +467,7 @@ static void tcp_keepalive_timer (unsigne if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { if (tp->linger2 >= 0) { - int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN; + const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; if (tmo > 0) { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); @@ -634,7 +514,7 @@ static void tcp_keepalive_timer (unsigne sk_stream_mem_reclaim(sk); resched: - tcp_reset_keepalive_timer (sk, elapsed); + inet_csk_reset_keepalive_timer (sk, elapsed); goto out; death: @@ -644,8 +524,3 @@ out: bh_unlock_sock(sk); sock_put(sk); } - -EXPORT_SYMBOL(tcp_clear_xmit_timers); -EXPORT_SYMBOL(tcp_delete_keepalive_timer); -EXPORT_SYMBOL(tcp_init_xmit_timers); -EXPORT_SYMBOL(tcp_reset_keepalive_timer); diff -puN net/ipv4/udp.c~git-net net/ipv4/udp.c --- devel/net/ipv4/udp.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/udp.c 2005-08-06 23:39:10.000000000 -0700 @@ -95,7 +95,8 @@ #include #include #include -#include +#include +#include #include #include #include diff -puN net/ipv4/xfrm4_state.c~git-net net/ipv4/xfrm4_state.c --- devel/net/ipv4/xfrm4_state.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv4/xfrm4_state.c 2005-08-06 23:39:10.000000000 -0700 @@ -128,8 +128,10 @@ void __init xfrm4_state_init(void) xfrm_state_register_afinfo(&xfrm4_state_afinfo); } +#if 0 void __exit xfrm4_state_fini(void) { xfrm_state_unregister_afinfo(&xfrm4_state_afinfo); } +#endif /* 0 */ diff -puN net/ipv6/addrconf.c~git-net net/ipv6/addrconf.c --- devel/net/ipv6/addrconf.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv6/addrconf.c 2005-08-06 23:39:10.000000000 -0700 @@ -1041,9 +1041,9 @@ int ipv6_rcv_saddr_equal(const struct so const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr; const struct in6_addr *sk2_rcv_saddr6 = tcp_v6_rcv_saddr(sk2); u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr; - u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2); + u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); int sk_ipv6only = ipv6_only_sock(sk); - int sk2_ipv6only = tcp_v6_ipv6only(sk2); + int sk2_ipv6only = inet_v6_ipv6only(sk2); int addr_type = ipv6_addr_type(sk_rcv_saddr6); int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; diff -puN net/ipv6/af_inet6.c~git-net net/ipv6/af_inet6.c --- devel/net/ipv6/af_inet6.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv6/af_inet6.c 2005-08-06 23:39:10.000000000 -0700 @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -85,26 +86,12 @@ extern void if6_proc_exit(void); int sysctl_ipv6_bindv6only; -#ifdef INET_REFCNT_DEBUG -atomic_t inet6_sock_nr; -EXPORT_SYMBOL(inet6_sock_nr); -#endif - /* The inetsw table contains everything that inet_create needs to * build a new socket. */ static struct list_head inetsw6[SOCK_MAX]; static DEFINE_SPINLOCK(inetsw6_lock); -static void inet6_sock_destruct(struct sock *sk) -{ - inet_sock_destruct(sk); - -#ifdef INET_REFCNT_DEBUG - atomic_dec(&inet6_sock_nr); -#endif -} - static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) { const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo); @@ -185,7 +172,7 @@ static int inet6_create(struct socket *s inet->hdrincl = 1; } - sk->sk_destruct = inet6_sock_destruct; + sk->sk_destruct = inet_sock_destruct; sk->sk_family = PF_INET6; sk->sk_protocol = protocol; @@ -212,12 +199,17 @@ static int inet6_create(struct socket *s inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; + /* + * Increment only the relevant sk_prot->socks debug field, this changes + * the previous behaviour of incrementing both the equivalent to + * answer->prot->socks (inet6_sock_nr) and inet_sock_nr. + * + * This allows better debug granularity as we'll know exactly how many + * UDPv6, TCPv6, etc socks were allocated, not the sum of all IPv6 + * transport protocol socks. -acme + */ + sk_refcnt_debug_inc(sk); - -#ifdef INET_REFCNT_DEBUG - atomic_inc(&inet6_sock_nr); - atomic_inc(&inet_sock_nr); -#endif if (inet->num) { /* It assumes that any protocol which allows * the user to assign a number at socket @@ -757,6 +749,9 @@ static int __init inet6_init(void) err = igmp6_init(&inet6_family_ops); if (err) goto igmp_fail; + err = ipv6_netfilter_init(); + if (err) + goto netfilter_fail; /* Create /proc/foo6 entries. */ #ifdef CONFIG_PROC_FS err = -ENOMEM; @@ -813,6 +808,8 @@ proc_tcp6_fail: raw6_proc_exit(); proc_raw6_fail: #endif + ipv6_netfilter_fini(); +netfilter_fail: igmp6_cleanup(); igmp_fail: ndisc_cleanup(); @@ -852,6 +849,7 @@ static void __exit inet6_exit(void) ip6_route_cleanup(); ipv6_packet_cleanup(); igmp6_cleanup(); + ipv6_netfilter_fini(); ndisc_cleanup(); icmpv6_cleanup(); #ifdef CONFIG_SYSCTL diff -puN net/ipv6/datagram.c~git-net net/ipv6/datagram.c --- devel/net/ipv6/datagram.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv6/datagram.c 2005-08-06 23:39:10.000000000 -0700 @@ -29,6 +29,7 @@ #include #include #include +#include #include #include diff -puN net/ipv6/icmp.c~git-net net/ipv6/icmp.c --- devel/net/ipv6/icmp.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv6/icmp.c 2005-08-06 23:39:10.000000000 -0700 @@ -551,7 +551,8 @@ static void icmpv6_notify(struct sk_buff read_lock(&raw_v6_lock); if ((sk = sk_head(&raw_v6_htable[hash])) != NULL) { - while((sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr))) { + while((sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr, + skb->dev->ifindex))) { rawv6_err(sk, skb, NULL, type, code, inner_offset, info); sk = sk_next(sk); } diff -puN net/ipv6/ip6_input.c~git-net net/ipv6/ip6_input.c --- devel/net/ipv6/ip6_input.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv6/ip6_input.c 2005-08-06 23:39:10.000000000 -0700 @@ -56,7 +56,7 @@ static inline int ip6_rcv_finish( struct return dst_input(skb); } -int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct ipv6hdr *hdr; u32 pkt_len; @@ -166,8 +166,8 @@ resubmit: nexthdr = skb->nh.raw[nhoff]; raw_sk = sk_head(&raw_v6_htable[nexthdr & (MAX_INET_PROTOS - 1)]); - if (raw_sk) - ipv6_raw_deliver(skb, nexthdr); + if (raw_sk && !ipv6_raw_deliver(skb, nexthdr)) + raw_sk = NULL; hash = nexthdr & (MAX_INET_PROTOS - 1); if ((ipprot = rcu_dereference(inet6_protos[hash])) != NULL) { @@ -198,12 +198,13 @@ resubmit: if (!raw_sk) { if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { IP6_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS); - icmpv6_param_prob(skb, ICMPV6_UNK_NEXTHDR, nhoff); + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_UNK_NEXTHDR, nhoff, + skb->dev); } - } else { + } else IP6_INC_STATS_BH(IPSTATS_MIB_INDELIVERS); - kfree_skb(skb); - } + kfree_skb(skb); } rcu_read_unlock(); return 0; diff -puN net/ipv6/ip6_output.c~git-net net/ipv6/ip6_output.c --- devel/net/ipv6/ip6_output.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv6/ip6_output.c 2005-08-06 23:39:10.000000000 -0700 @@ -153,51 +153,6 @@ int ip6_output(struct sk_buff *skb) return ip6_output2(skb); } -#ifdef CONFIG_NETFILTER -int ip6_route_me_harder(struct sk_buff *skb) -{ - struct ipv6hdr *iph = skb->nh.ipv6h; - struct dst_entry *dst; - struct flowi fl = { - .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0, - .nl_u = - { .ip6_u = - { .daddr = iph->daddr, - .saddr = iph->saddr, } }, - .proto = iph->nexthdr, - }; - - dst = ip6_route_output(skb->sk, &fl); - - if (dst->error) { - IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES); - LIMIT_NETDEBUG( - printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n")); - dst_release(dst); - return -EINVAL; - } - - /* Drop old route. */ - dst_release(skb->dst); - - skb->dst = dst; - return 0; -} -#endif - -static inline int ip6_maybe_reroute(struct sk_buff *skb) -{ -#ifdef CONFIG_NETFILTER - if (skb->nfcache & NFC_ALTERED){ - if (ip6_route_me_harder(skb) != 0){ - kfree_skb(skb); - return -EINVAL; - } - } -#endif /* CONFIG_NETFILTER */ - return dst_output(skb); -} - /* * xmit an sk_buff (used by TCP) */ @@ -266,7 +221,8 @@ int ip6_xmit(struct sock *sk, struct sk_ mtu = dst_mtu(dst); if ((skb->len <= mtu) || ipfragok) { IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); - return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute); + return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, + dst_output); } if (net_ratelimit()) @@ -321,7 +277,9 @@ static int ip6_call_ra_chain(struct sk_b read_lock(&ip6_ra_lock); for (ra = ip6_ra_chain; ra; ra = ra->next) { struct sock *sk = ra->sk; - if (sk && ra->sel == sel) { + if (sk && ra->sel == sel && + (!sk->sk_bound_dev_if || + sk->sk_bound_dev_if == skb->dev->ifindex)) { if (last) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) diff -puN net/ipv6/ipv6_sockglue.c~git-net net/ipv6/ipv6_sockglue.c --- devel/net/ipv6/ipv6_sockglue.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv6/ipv6_sockglue.c 2005-08-06 23:39:10.000000000 -0700 @@ -163,6 +163,13 @@ int ipv6_setsockopt(struct sock *sk, int fl6_free_socklist(sk); ipv6_sock_mc_close(sk); + /* + * Sock is moving from IPv6 to IPv4 (sk_prot), so + * remove it from the refcnt debug socks count in the + * original family... + */ + sk_refcnt_debug_dec(sk); + if (sk->sk_protocol == IPPROTO_TCP) { struct tcp_sock *tp = tcp_sk(sk); @@ -192,9 +199,11 @@ int ipv6_setsockopt(struct sock *sk, int kfree_skb(pktopt); sk->sk_destruct = inet_sock_destruct; -#ifdef INET_REFCNT_DEBUG - atomic_dec(&inet6_sock_nr); -#endif + /* + * ... and add it to the refcnt debug socks count + * in the new family. -acme + */ + sk_refcnt_debug_inc(sk); module_put(THIS_MODULE); retv = 0; break; diff -puN net/ipv6/ipv6_syms.c~git-net net/ipv6/ipv6_syms.c --- devel/net/ipv6/ipv6_syms.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv6/ipv6_syms.c 2005-08-06 23:39:10.000000000 -0700 @@ -15,9 +15,6 @@ EXPORT_SYMBOL(ndisc_mc_map); EXPORT_SYMBOL(register_inet6addr_notifier); EXPORT_SYMBOL(unregister_inet6addr_notifier); EXPORT_SYMBOL(ip6_route_output); -#ifdef CONFIG_NETFILTER -EXPORT_SYMBOL(ip6_route_me_harder); -#endif EXPORT_SYMBOL(addrconf_lock); EXPORT_SYMBOL(ipv6_setsockopt); EXPORT_SYMBOL(ipv6_getsockopt); diff -puN net/ipv6/Makefile~git-net net/ipv6/Makefile --- devel/net/ipv6/Makefile~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv6/Makefile 2005-08-06 23:39:10.000000000 -0700 @@ -8,7 +8,7 @@ ipv6-objs := af_inet6.o anycast.o ip6_ou route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \ protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \ - ip6_flowlabel.o ipv6_syms.o + ip6_flowlabel.o ipv6_syms.o netfilter.o ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ xfrm6_output.o diff -puN /dev/null net/ipv6/netfilter.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/ipv6/netfilter.c 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,105 @@ +#include +#include + +#ifdef CONFIG_NETFILTER + +#include +#include +#include +#include +#include +#include +#include + +int ip6_route_me_harder(struct sk_buff *skb) +{ + struct ipv6hdr *iph = skb->nh.ipv6h; + struct dst_entry *dst; + struct flowi fl = { + .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0, + .nl_u = + { .ip6_u = + { .daddr = iph->daddr, + .saddr = iph->saddr, } }, + .proto = iph->nexthdr, + }; + + dst = ip6_route_output(skb->sk, &fl); + + if (dst->error) { + IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES); + LIMIT_NETDEBUG( + printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n")); + dst_release(dst); + return -EINVAL; + } + + /* Drop old route. */ + dst_release(skb->dst); + + skb->dst = dst; + return 0; +} +EXPORT_SYMBOL(ip6_route_me_harder); + +/* + * Extra routing may needed on local out, as the QUEUE target never + * returns control to the table. + */ + +struct ip6_rt_info { + struct in6_addr daddr; + struct in6_addr saddr; +}; + +static void save(const struct sk_buff *skb, struct nf_info *info) +{ + struct ip6_rt_info *rt_info = nf_info_reroute(info); + + if (info->hook == NF_IP6_LOCAL_OUT) { + struct ipv6hdr *iph = skb->nh.ipv6h; + + rt_info->daddr = iph->daddr; + rt_info->saddr = iph->saddr; + } +} + +static int reroute(struct sk_buff **pskb, const struct nf_info *info) +{ + struct ip6_rt_info *rt_info = nf_info_reroute(info); + + if (info->hook == NF_IP6_LOCAL_OUT) { + struct ipv6hdr *iph = (*pskb)->nh.ipv6h; + if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) || + !ipv6_addr_equal(&iph->saddr, &rt_info->saddr)) + return ip6_route_me_harder(*pskb); + } + return 0; +} + +static struct nf_queue_rerouter ip6_reroute = { + .rer_size = sizeof(struct ip6_rt_info), + .save = &save, + .reroute = &reroute, +}; + +int __init ipv6_netfilter_init(void) +{ + return nf_register_queue_rerouter(PF_INET6, &ip6_reroute); +} + +void ipv6_netfilter_fini(void) +{ + nf_unregister_queue_rerouter(PF_INET6); +} + +#else /* CONFIG_NETFILTER */ +int __init ipv6_netfilter_init(void) +{ + return 0; +} + +void ipv6_netfilter_fini(void) +{ +} +#endif /* CONFIG_NETFILTER */ diff -puN net/ipv6/netfilter/ip6_queue.c~git-net net/ipv6/netfilter/ip6_queue.c --- devel/net/ipv6/netfilter/ip6_queue.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv6/netfilter/ip6_queue.c 2005-08-06 23:39:10.000000000 -0700 @@ -47,16 +47,10 @@ #define NET_IPQ_QMAX 2088 #define NET_IPQ_QMAX_NAME "ip6_queue_maxlen" -struct ipq_rt_info { - struct in6_addr daddr; - struct in6_addr saddr; -}; - struct ipq_queue_entry { struct list_head list; struct nf_info *info; struct sk_buff *skb; - struct ipq_rt_info rt_info; }; typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long); @@ -278,7 +272,8 @@ nlmsg_failure: } static int -ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) +ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, + unsigned int queuenum, void *data) { int status = -EINVAL; struct sk_buff *nskb; @@ -296,13 +291,6 @@ ipq_enqueue_packet(struct sk_buff *skb, entry->info = info; entry->skb = skb; - if (entry->info->hook == NF_IP_LOCAL_OUT) { - struct ipv6hdr *iph = skb->nh.ipv6h; - - entry->rt_info.daddr = iph->daddr; - entry->rt_info.saddr = iph->saddr; - } - nskb = ipq_build_packet_message(entry, &status); if (nskb == NULL) goto err_out_free; @@ -378,22 +366,10 @@ ipq_mangle_ipv6(ipq_verdict_msg_t *v, st } skb_put(e->skb, diff); } - if (!skb_ip_make_writable(&e->skb, v->data_len)) + if (!skb_make_writable(&e->skb, v->data_len)) return -ENOMEM; memcpy(e->skb->data, v->payload, v->data_len); - e->skb->nfcache |= NFC_ALTERED; - /* - * Extra routing may needed on local out, as the QUEUE target never - * returns control to the table. - * Not a nice way to cmp, but works - */ - if (e->info->hook == NF_IP_LOCAL_OUT) { - struct ipv6hdr *iph = e->skb->nh.ipv6h; - if (!ipv6_addr_equal(&iph->daddr, &e->rt_info.daddr) || - !ipv6_addr_equal(&iph->saddr, &e->rt_info.saddr)) - return ip6_route_me_harder(e->skb); - } return 0; } @@ -679,7 +655,7 @@ init_or_cleanup(int init) goto cleanup; netlink_register_notifier(&ipq_nl_notifier); - ipqnl = netlink_kernel_create(NETLINK_IP6_FW, ipq_rcv_sk); + ipqnl = netlink_kernel_create(NETLINK_IP6_FW, ipq_rcv_sk, THIS_MODULE); if (ipqnl == NULL) { printk(KERN_ERR "ip6_queue: failed to create netlink socket\n"); goto cleanup_netlink_notifier; diff -puN net/ipv6/netfilter/ip6_tables.c~git-net net/ipv6/netfilter/ip6_tables.c --- devel/net/ipv6/netfilter/ip6_tables.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv6/netfilter/ip6_tables.c 2005-08-06 23:39:10.000000000 -0700 @@ -401,7 +401,6 @@ ip6t_do_table(struct sk_buff **pskb, do { IP_NF_ASSERT(e); IP_NF_ASSERT(back); - (*pskb)->nfcache |= e->nfcache; if (ip6_packet_match(*pskb, indev, outdev, &e->ipv6, &protoff, &offset)) { struct ip6t_entry_target *t; diff -puN net/ipv6/netfilter/ip6t_LOG.c~git-net net/ipv6/netfilter/ip6t_LOG.c --- devel/net/ipv6/netfilter/ip6t_LOG.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv6/netfilter/ip6t_LOG.c 2005-08-06 23:39:10.000000000 -0700 @@ -26,10 +26,6 @@ MODULE_AUTHOR("Jan Rekorajski #include @@ -44,7 +40,7 @@ struct in_device; static DEFINE_SPINLOCK(log_lock); /* One level of recursion won't kill us */ -static void dump_packet(const struct ip6t_log_info *info, +static void dump_packet(const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int ip6hoff, int recurse) { @@ -53,6 +49,12 @@ static void dump_packet(const struct ip6 struct ipv6hdr _ip6h, *ih; unsigned int ptr; unsigned int hdrlen = 0; + unsigned int logflags; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_MASK; ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); if (ih == NULL) { @@ -84,7 +86,7 @@ static void dump_packet(const struct ip6 } /* Max length: 48 "OPT (...) " */ - if (info->logflags & IP6T_LOG_IPOPT) + if (logflags & IP6T_LOG_IPOPT) printk("OPT ( "); switch (currenthdr) { @@ -119,7 +121,7 @@ static void dump_packet(const struct ip6 case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: if (fragment) { - if (info->logflags & IP6T_LOG_IPOPT) + if (logflags & IP6T_LOG_IPOPT) printk(")"); return; } @@ -127,7 +129,7 @@ static void dump_packet(const struct ip6 break; /* Max Length */ case IPPROTO_AH: - if (info->logflags & IP6T_LOG_IPOPT) { + if (logflags & IP6T_LOG_IPOPT) { struct ip_auth_hdr _ahdr, *ah; /* Max length: 3 "AH " */ @@ -158,7 +160,7 @@ static void dump_packet(const struct ip6 hdrlen = (hp->hdrlen+2)<<2; break; case IPPROTO_ESP: - if (info->logflags & IP6T_LOG_IPOPT) { + if (logflags & IP6T_LOG_IPOPT) { struct ip_esp_hdr _esph, *eh; /* Max length: 4 "ESP " */ @@ -190,7 +192,7 @@ static void dump_packet(const struct ip6 printk("Unknown Ext Hdr %u", currenthdr); return; } - if (info->logflags & IP6T_LOG_IPOPT) + if (logflags & IP6T_LOG_IPOPT) printk(") "); currenthdr = hp->nexthdr; @@ -218,7 +220,7 @@ static void dump_packet(const struct ip6 printk("SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ - if (info->logflags & IP6T_LOG_TCPSEQ) + if (logflags & IP6T_LOG_TCPSEQ) printk("SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); /* Max length: 13 "WINDOW=65535 " */ @@ -245,7 +247,7 @@ static void dump_packet(const struct ip6 /* Max length: 11 "URGP=65535 " */ printk("URGP=%u ", ntohs(th->urg_ptr)); - if ((info->logflags & IP6T_LOG_TCPOPT) + if ((logflags & IP6T_LOG_TCPOPT) && th->doff * 4 > sizeof(struct tcphdr)) { u_int8_t _opt[60 - sizeof(struct tcphdr)], *op; unsigned int i; @@ -349,7 +351,7 @@ static void dump_packet(const struct ip6 } /* Max length: 15 "UID=4294967295 " */ - if ((info->logflags & IP6T_LOG_UID) && recurse && skb->sk) { + if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) { read_lock_bh(&skb->sk->sk_callback_lock); if (skb->sk->sk_socket && skb->sk->sk_socket->file) printk("UID=%u ", skb->sk->sk_socket->file->f_uid); @@ -357,19 +359,31 @@ static void dump_packet(const struct ip6 } } +static struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 0, + .logflags = NF_LOG_MASK, + }, + }, +}; + static void -ip6t_log_packet(unsigned int hooknum, +ip6t_log_packet(unsigned int pf, + unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, - const struct ip6t_log_info *loginfo, - const char *level_string, + const struct nf_loginfo *loginfo, const char *prefix) { + if (!loginfo) + loginfo = &default_loginfo; + spin_lock_bh(&log_lock); - printk(level_string); - printk("%sIN=%s OUT=%s ", - prefix == NULL ? loginfo->prefix : prefix, + printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, + prefix, in ? in->name : "", out ? out->name : ""); if (in && !out) { @@ -416,29 +430,17 @@ ip6t_log_target(struct sk_buff **pskb, void *userinfo) { const struct ip6t_log_info *loginfo = targinfo; - char level_string[4] = "< >"; + struct nf_loginfo li; + + li.type = NF_LOG_TYPE_LOG; + li.u.log.level = loginfo->level; + li.u.log.logflags = loginfo->logflags; - level_string[1] = '0' + (loginfo->level % 8); - ip6t_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL); + nf_log_packet(PF_INET6, hooknum, *pskb, in, out, &li, loginfo->prefix); return IP6T_CONTINUE; } -static void -ip6t_logfn(unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const char *prefix) -{ - struct ip6t_log_info loginfo = { - .level = 0, - .logflags = IP6T_LOG_MASK, - .prefix = "" - }; - - ip6t_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix); -} static int ip6t_log_checkentry(const char *tablename, const struct ip6t_entry *e, @@ -475,20 +477,29 @@ static struct ip6t_target ip6t_log_reg = .me = THIS_MODULE, }; +static struct nf_logger ip6t_logger = { + .name = "ip6t_LOG", + .logfn = &ip6t_log_packet, + .me = THIS_MODULE, +}; + static int __init init(void) { if (ip6t_register_target(&ip6t_log_reg)) return -EINVAL; - if (nflog) - nf_log_register(PF_INET6, &ip6t_logfn); + if (nf_log_register(PF_INET6, &ip6t_logger) < 0) { + printk(KERN_WARNING "ip6t_LOG: not logging via system console " + "since somebody else already registered for PF_INET6\n"); + /* we cannot make module load fail here, since otherwise + * ip6tables userspace would abort */ + } return 0; } static void __exit fini(void) { - if (nflog) - nf_log_unregister(PF_INET6, &ip6t_logfn); + nf_log_unregister_logger(&ip6t_logger); ip6t_unregister_target(&ip6t_log_reg); } diff -puN net/ipv6/netfilter/ip6t_MARK.c~git-net net/ipv6/netfilter/ip6t_MARK.c --- devel/net/ipv6/netfilter/ip6t_MARK.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv6/netfilter/ip6t_MARK.c 2005-08-06 23:39:10.000000000 -0700 @@ -28,10 +28,9 @@ target(struct sk_buff **pskb, { const struct ip6t_mark_target_info *markinfo = targinfo; - if((*pskb)->nfmark != markinfo->mark) { + if((*pskb)->nfmark != markinfo->mark) (*pskb)->nfmark = markinfo->mark; - (*pskb)->nfcache |= NFC_ALTERED; - } + return IP6T_CONTINUE; } diff -puN /dev/null net/ipv6/netfilter/ip6t_NFQUEUE.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/ipv6/netfilter/ip6t_NFQUEUE.c 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,70 @@ +/* ip6tables module for using new netfilter netlink queue + * + * (C) 2005 by Harald Welte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include + +#include +#include +#include + +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("ip6tables NFQUEUE target"); +MODULE_LICENSE("GPL"); + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_NFQ_info *tinfo = targinfo; + + return NF_QUEUE_NR(tinfo->queuenum); +} + +static int +checkentry(const char *tablename, + const struct ip6t_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + if (targinfosize != IP6T_ALIGN(sizeof(struct ipt_NFQ_info))) { + printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n", + targinfosize, + IP6T_ALIGN(sizeof(struct ipt_NFQ_info))); + return 0; + } + + return 1; +} + +static struct ip6t_target ipt_NFQ_reg = { + .name = "NFQUEUE", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_target(&ipt_NFQ_reg); +} + +static void __exit fini(void) +{ + ip6t_unregister_target(&ipt_NFQ_reg); +} + +module_init(init); +module_exit(fini); diff -puN net/ipv6/netfilter/Kconfig~git-net net/ipv6/netfilter/Kconfig --- devel/net/ipv6/netfilter/Kconfig~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv6/netfilter/Kconfig 2005-08-06 23:39:10.000000000 -0700 @@ -10,13 +10,16 @@ menu "IPv6: Netfilter Configuration (EXP # dep_tristate ' FTP protocol support' CONFIG_IP6_NF_FTP $CONFIG_IP6_NF_CONNTRACK #fi config IP6_NF_QUEUE - tristate "Userspace queueing via NETLINK" + tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)" ---help--- This option adds a queue handler to the kernel for IPv6 - packets which lets us to receive the filtered packets - with QUEUE target using libiptc as we can do with - the IPv4 now. + packets which enables users to receive the filtered packets + with QUEUE target using libipq. + + THis option enables the old IPv6-only "ip6_queue" implementation + which has been obsoleted by the new "nfnetlink_queue" code (see + CONFIG_NETFILTER_NETLINK_QUEUE). (C) Fernando Anton 2001 IPv64 Project - Work based in IPv64 draft by Arturo Azcorra. diff -puN net/ipv6/netfilter/Makefile~git-net net/ipv6/netfilter/Makefile --- devel/net/ipv6/netfilter/Makefile~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv6/netfilter/Makefile 2005-08-06 23:39:10.000000000 -0700 @@ -24,3 +24,4 @@ obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue. obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o +obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += ip6t_NFQUEUE.o diff -puN net/ipv6/raw.c~git-net net/ipv6/raw.c --- devel/net/ipv6/raw.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv6/raw.c 2005-08-06 23:39:10.000000000 -0700 @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -81,7 +82,8 @@ static void raw_v6_unhash(struct sock *s /* Grumble... icmp and ip_input want to get at this... */ struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num, - struct in6_addr *loc_addr, struct in6_addr *rmt_addr) + struct in6_addr *loc_addr, struct in6_addr *rmt_addr, + int dif) { struct hlist_node *node; int is_multicast = ipv6_addr_is_multicast(loc_addr); @@ -94,6 +96,9 @@ struct sock *__raw_v6_lookup(struct sock !ipv6_addr_equal(&np->daddr, rmt_addr)) continue; + if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) + continue; + if (!ipv6_addr_any(&np->rcv_saddr)) { if (ipv6_addr_equal(&np->rcv_saddr, loc_addr)) goto found; @@ -137,11 +142,12 @@ static __inline__ int icmpv6_filter(stru * * Caller owns SKB so we must make clones. */ -void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) +int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) { struct in6_addr *saddr; struct in6_addr *daddr; struct sock *sk; + int delivered = 0; __u8 hash; saddr = &skb->nh.ipv6h->saddr; @@ -160,9 +166,10 @@ void ipv6_raw_deliver(struct sk_buff *sk if (sk == NULL) goto out; - sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr); + sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr, skb->dev->ifindex); while (sk) { + delivered = 1; if (nexthdr != IPPROTO_ICMPV6 || !icmpv6_filter(sk, skb)) { struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); @@ -170,10 +177,12 @@ void ipv6_raw_deliver(struct sk_buff *sk if (clone) rawv6_rcv(sk, clone); } - sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr); + sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr, + skb->dev->ifindex); } out: read_unlock(&raw_v6_lock); + return delivered; } /* This cleans up af_inet6 a bit. -DaveM */ diff -puN net/ipv6/tcp_ipv6.c~git-net net/ipv6/tcp_ipv6.c --- devel/net/ipv6/tcp_ipv6.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv6/tcp_ipv6.c 2005-08-06 23:39:10.000000000 -0700 @@ -84,7 +84,7 @@ static __inline__ int tcp_v6_hashfn(stru hashent ^= (laddr->s6_addr32[3] ^ faddr->s6_addr32[3]); hashent ^= hashent>>16; hashent ^= hashent>>8; - return (hashent & (tcp_ehash_size - 1)); + return (hashent & (tcp_hashinfo.ehash_size - 1)); } static __inline__ int tcp_v6_sk_hashfn(struct sock *sk) @@ -98,11 +98,11 @@ static __inline__ int tcp_v6_sk_hashfn(s return tcp_v6_hashfn(laddr, lport, faddr, fport); } -static inline int tcp_v6_bind_conflict(struct sock *sk, - struct tcp_bind_bucket *tb) +static inline int tcp_v6_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb) { - struct sock *sk2; - struct hlist_node *node; + const struct sock *sk2; + const struct hlist_node *node; /* We must walk the whole port owner list in this case. -DaveM */ sk_for_each_bound(sk2, node, &tb->owners) { @@ -126,8 +126,8 @@ static inline int tcp_v6_bind_conflict(s */ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) { - struct tcp_bind_hashbucket *head; - struct tcp_bind_bucket *tb; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; struct hlist_node *node; int ret; @@ -138,25 +138,25 @@ static int tcp_v6_get_port(struct sock * int remaining = (high - low) + 1; int rover; - spin_lock(&tcp_portalloc_lock); - if (tcp_port_rover < low) + spin_lock(&tcp_hashinfo.portalloc_lock); + if (tcp_hashinfo.port_rover < low) rover = low; else - rover = tcp_port_rover; + rover = tcp_hashinfo.port_rover; do { rover++; if (rover > high) rover = low; - head = &tcp_bhash[tcp_bhashfn(rover)]; + head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)]; spin_lock(&head->lock); - tb_for_each(tb, node, &head->chain) + inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->port == rover) goto next; break; next: spin_unlock(&head->lock); } while (--remaining > 0); - tcp_port_rover = rover; - spin_unlock(&tcp_portalloc_lock); + tcp_hashinfo.port_rover = rover; + spin_unlock(&tcp_hashinfo.portalloc_lock); /* Exhausted local port range during search? */ ret = 1; @@ -166,9 +166,9 @@ static int tcp_v6_get_port(struct sock * /* OK, here is the one we will use. */ snum = rover; } else { - head = &tcp_bhash[tcp_bhashfn(snum)]; + head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; spin_lock(&head->lock); - tb_for_each(tb, node, &head->chain) + inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->port == snum) goto tb_found; } @@ -187,8 +187,11 @@ tb_found: } tb_not_found: ret = 1; - if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL) - goto fail_unlock; + if (tb == NULL) { + tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum); + if (tb == NULL) + goto fail_unlock; + } if (hlist_empty(&tb->owners)) { if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) tb->fastreuse = 1; @@ -199,9 +202,9 @@ tb_not_found: tb->fastreuse = 0; success: - if (!tcp_sk(sk)->bind_hash) - tcp_bind_hash(sk, tb, snum); - BUG_TRAP(tcp_sk(sk)->bind_hash == tb); + if (!inet_csk(sk)->icsk_bind_hash) + inet_bind_hash(sk, tb, snum); + BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); ret = 0; fail_unlock: @@ -219,13 +222,13 @@ static __inline__ void __tcp_v6_hash(str BUG_TRAP(sk_unhashed(sk)); if (sk->sk_state == TCP_LISTEN) { - list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; - lock = &tcp_lhash_lock; - tcp_listen_wlock(); + list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)]; + lock = &tcp_hashinfo.lhash_lock; + inet_listen_wlock(&tcp_hashinfo); } else { sk->sk_hashent = tcp_v6_sk_hashfn(sk); - list = &tcp_ehash[sk->sk_hashent].chain; - lock = &tcp_ehash[sk->sk_hashent].lock; + list = &tcp_hashinfo.ehash[sk->sk_hashent].chain; + lock = &tcp_hashinfo.ehash[sk->sk_hashent].lock; write_lock(lock); } @@ -258,8 +261,8 @@ static struct sock *tcp_v6_lookup_listen int score, hiscore; hiscore=0; - read_lock(&tcp_lhash_lock); - sk_for_each(sk, node, &tcp_listening_hash[tcp_lhashfn(hnum)]) { + read_lock(&tcp_hashinfo.lhash_lock); + sk_for_each(sk, node, &tcp_hashinfo.listening_hash[inet_lhashfn(hnum)]) { if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -286,7 +289,7 @@ static struct sock *tcp_v6_lookup_listen } if (result) sock_hold(result); - read_unlock(&tcp_lhash_lock); + read_unlock(&tcp_hashinfo.lhash_lock); return result; } @@ -300,33 +303,32 @@ static inline struct sock *__tcp_v6_look struct in6_addr *daddr, u16 hnum, int dif) { - struct tcp_ehash_bucket *head; struct sock *sk; - struct hlist_node *node; - __u32 ports = TCP_COMBINED_PORTS(sport, hnum); - int hash; - + const struct hlist_node *node; + const __u32 ports = INET_COMBINED_PORTS(sport, hnum); /* Optimize here for direct hit, only listening connections can * have wildcards anyways. */ - hash = tcp_v6_hashfn(daddr, hnum, saddr, sport); - head = &tcp_ehash[hash]; + const int hash = tcp_v6_hashfn(daddr, hnum, saddr, sport); + struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash]; + read_lock(&head->lock); sk_for_each(sk, node, &head->chain) { /* For IPV6 do the cheaper port and family tests first. */ - if(TCP_IPV6_MATCH(sk, saddr, daddr, ports, dif)) + if (INET6_MATCH(sk, saddr, daddr, ports, dif)) goto hit; /* You sunk my battleship! */ } /* Must check for a TIME_WAIT'er before going to listener hash. */ - sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) { - /* FIXME: acme: check this... */ - struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; + sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain) { + const struct inet_timewait_sock *tw = inet_twsk(sk); if(*((__u32 *)&(tw->tw_dport)) == ports && sk->sk_family == PF_INET6) { - if(ipv6_addr_equal(&tw->tw_v6_daddr, saddr) && - ipv6_addr_equal(&tw->tw_v6_rcv_saddr, daddr) && - (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif)) + const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); + + if (ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) && + ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) && + (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif)) goto hit; } } @@ -374,7 +376,7 @@ EXPORT_SYMBOL_GPL(tcp_v6_lookup); * Open request hash tables. */ -static u32 tcp_v6_synq_hash(struct in6_addr *raddr, u16 rport, u32 rnd) +static u32 tcp_v6_synq_hash(const struct in6_addr *raddr, const u16 rport, const u32 rnd) { u32 a, b, c; @@ -394,14 +396,15 @@ static u32 tcp_v6_synq_hash(struct in6_a return c & (TCP_SYNQ_HSIZE - 1); } -static struct request_sock *tcp_v6_search_req(struct tcp_sock *tp, +static struct request_sock *tcp_v6_search_req(const struct sock *sk, struct request_sock ***prevp, __u16 rport, struct in6_addr *raddr, struct in6_addr *laddr, int iif) { - struct listen_sock *lopt = tp->accept_queue.listen_opt; + const struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; struct request_sock *req, **prev; for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)]; @@ -447,43 +450,46 @@ static __u32 tcp_v6_init_sequence(struct } static int __tcp_v6_check_established(struct sock *sk, __u16 lport, - struct tcp_tw_bucket **twp) + struct inet_timewait_sock **twp) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *daddr = &np->rcv_saddr; struct in6_addr *saddr = &np->daddr; int dif = sk->sk_bound_dev_if; - u32 ports = TCP_COMBINED_PORTS(inet->dport, lport); - int hash = tcp_v6_hashfn(daddr, inet->num, saddr, inet->dport); - struct tcp_ehash_bucket *head = &tcp_ehash[hash]; + const u32 ports = INET_COMBINED_PORTS(inet->dport, lport); + const int hash = tcp_v6_hashfn(daddr, inet->num, saddr, inet->dport); + struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash]; struct sock *sk2; - struct hlist_node *node; - struct tcp_tw_bucket *tw; + const struct hlist_node *node; + struct inet_timewait_sock *tw; write_lock(&head->lock); /* Check TIME-WAIT sockets first. */ - sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) { - tw = (struct tcp_tw_bucket*)sk2; + sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { + const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk2); + + tw = inet_twsk(sk2); if(*((__u32 *)&(tw->tw_dport)) == ports && sk2->sk_family == PF_INET6 && - ipv6_addr_equal(&tw->tw_v6_daddr, saddr) && - ipv6_addr_equal(&tw->tw_v6_rcv_saddr, daddr) && + ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) && + ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) && sk2->sk_bound_dev_if == sk->sk_bound_dev_if) { + const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2); struct tcp_sock *tp = tcp_sk(sk); - if (tw->tw_ts_recent_stamp && - (!twp || (sysctl_tcp_tw_reuse && - xtime.tv_sec - - tw->tw_ts_recent_stamp > 1))) { + if (tcptw->tw_ts_recent_stamp && + (!twp || + (sysctl_tcp_tw_reuse && + xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) { /* See comment in tcp_ipv4.c */ - tp->write_seq = tw->tw_snd_nxt + 65535 + 2; + tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; if (!tp->write_seq) tp->write_seq = 1; - tp->rx_opt.ts_recent = tw->tw_ts_recent; - tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; + tp->rx_opt.ts_recent = tcptw->tw_ts_recent; + tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; sock_hold(sk2); goto unique; } else @@ -494,7 +500,7 @@ static int __tcp_v6_check_established(st /* And established part... */ sk_for_each(sk2, node, &head->chain) { - if(TCP_IPV6_MATCH(sk2, saddr, daddr, ports, dif)) + if (INET6_MATCH(sk2, saddr, daddr, ports, dif)) goto not_unique; } @@ -513,7 +519,7 @@ unique: tcp_tw_deschedule(tw); NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - tcp_tw_put(tw); + inet_twsk_put(tw); } return 0; @@ -535,8 +541,8 @@ static inline u32 tcpv6_port_offset(cons static int tcp_v6_hash_connect(struct sock *sk) { unsigned short snum = inet_sk(sk)->num; - struct tcp_bind_hashbucket *head; - struct tcp_bind_bucket *tb; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; int ret; if (!snum) { @@ -548,19 +554,19 @@ static int tcp_v6_hash_connect(struct so static u32 hint; u32 offset = hint + tcpv6_port_offset(sk); struct hlist_node *node; - struct tcp_tw_bucket *tw = NULL; + struct inet_timewait_sock *tw = NULL; local_bh_disable(); for (i = 1; i <= range; i++) { port = low + (i + offset) % range; - head = &tcp_bhash[tcp_bhashfn(port)]; + head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)]; spin_lock(&head->lock); /* Does not bother with rcv_saddr checks, * because the established check is already * unique enough. */ - tb_for_each(tb, node, &head->chain) { + inet_bind_bucket_for_each(tb, node, &head->chain) { if (tb->port == port) { BUG_TRAP(!hlist_empty(&tb->owners)); if (tb->fastreuse >= 0) @@ -573,7 +579,7 @@ static int tcp_v6_hash_connect(struct so } } - tb = tcp_bucket_create(head, port); + tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port); if (!tb) { spin_unlock(&head->lock); break; @@ -592,7 +598,7 @@ ok: hint += i; /* Head lock still held and bh's disabled */ - tcp_bind_hash(sk, tb, port); + inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->sport = htons(port); __tcp_v6_hash(sk); @@ -601,15 +607,15 @@ ok: if (tw) { tcp_tw_deschedule(tw); - tcp_tw_put(tw); + inet_twsk_put(tw); } ret = 0; goto out; } - head = &tcp_bhash[tcp_bhashfn(snum)]; - tb = tcp_sk(sk)->bind_hash; + head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; + tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { @@ -837,7 +843,7 @@ static void tcp_v6_err(struct sk_buff *s } if (sk->sk_state == TCP_TIME_WAIT) { - tcp_tw_put((struct tcp_tw_bucket*)sk); + inet_twsk_put((struct inet_timewait_sock *)sk); return; } @@ -915,7 +921,7 @@ static void tcp_v6_err(struct sk_buff *s if (sock_owned_by_user(sk)) goto out; - req = tcp_v6_search_req(tp, &prev, th->dest, &hdr->daddr, + req = tcp_v6_search_req(sk, &prev, th->dest, &hdr->daddr, &hdr->saddr, tcp_v6_iif(skb)); if (!req) goto out; @@ -930,7 +936,7 @@ static void tcp_v6_err(struct sk_buff *s goto out; } - tcp_synq_drop(sk, req, prev); + inet_csk_reqsk_queue_drop(sk, req, prev); goto out; case TCP_SYN_SENT: @@ -1215,12 +1221,14 @@ static void tcp_v6_send_ack(struct sk_bu static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) { - struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; + struct inet_timewait_sock *tw = inet_twsk(sk); + const struct tcp_timewait_sock *tcptw = tcp_twsk(sk); - tcp_v6_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt, - tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent); + tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, + tcptw->tw_ts_recent); - tcp_tw_put(tw); + inet_twsk_put(tw); } static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) @@ -1233,11 +1241,10 @@ static struct sock *tcp_v6_hnd_req(struc { struct request_sock *req, **prev; struct tcphdr *th = skb->h.th; - struct tcp_sock *tp = tcp_sk(sk); struct sock *nsk; /* Find possible connection requests. */ - req = tcp_v6_search_req(tp, &prev, th->source, &skb->nh.ipv6h->saddr, + req = tcp_v6_search_req(sk, &prev, th->source, &skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr, tcp_v6_iif(skb)); if (req) return tcp_check_req(sk, skb, req, prev); @@ -1253,7 +1260,7 @@ static struct sock *tcp_v6_hnd_req(struc bh_lock_sock(nsk); return nsk; } - tcp_tw_put((struct tcp_tw_bucket*)nsk); + inet_twsk_put((struct inet_timewait_sock *)nsk); return NULL; } @@ -1266,12 +1273,12 @@ static struct sock *tcp_v6_hnd_req(struc static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req) { - struct tcp_sock *tp = tcp_sk(sk); - struct listen_sock *lopt = tp->accept_queue.listen_opt; - u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd); + struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + const u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd); - reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT); - tcp_synq_added(sk); + reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT); + inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT); } @@ -1296,13 +1303,13 @@ static int tcp_v6_conn_request(struct so /* * There are no SYN attacks on IPv6, yet... */ - if (tcp_synq_is_full(sk) && !isn) { + if (inet_csk_reqsk_queue_is_full(sk) && !isn) { if (net_ratelimit()) printk(KERN_INFO "TCPv6: dropping request, synflood is possible\n"); goto drop; } - if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; req = reqsk_alloc(&tcp6_request_sock_ops); @@ -1402,12 +1409,11 @@ static struct sock * tcp_v6_syn_recv_soc newnp->mcast_oif = tcp_v6_iif(skb); newnp->mcast_hops = skb->nh.ipv6h->hop_limit; - /* Charge newly allocated IPv6 socket. Though it is mapped, - * it is IPv6 yet. + /* + * No need to charge this sock to the relevant IPv6 refcnt debug socks count + * here, tcp_create_openreq_child now does this for us, see the comment in + * that function for the gory details. -acme */ -#ifdef INET_REFCNT_DEBUG - atomic_inc(&inet6_sock_nr); -#endif /* It is tricky place. Until this moment IPv4 tcp worked with IPv6 af_tcp.af_specific. @@ -1462,10 +1468,11 @@ static struct sock * tcp_v6_syn_recv_soc if (newsk == NULL) goto out; - /* Charge newly allocated IPv6 socket */ -#ifdef INET_REFCNT_DEBUG - atomic_inc(&inet6_sock_nr); -#endif + /* + * No need to charge this sock to the relevant IPv6 refcnt debug socks + * count here, tcp_create_openreq_child now does this for us, see the + * comment in that function for the gory details. -acme + */ ip6_dst_store(newsk, dst, NULL); newsk->sk_route_caps = dst->dev->features & @@ -1531,7 +1538,7 @@ static struct sock * tcp_v6_syn_recv_soc newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6; __tcp_v6_hash(newsk); - tcp_inherit_port(sk, newsk); + inet_inherit_port(&tcp_hashinfo, sk, newsk); return newsk; @@ -1790,26 +1797,26 @@ discard_and_relse: do_time_wait: if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { - tcp_tw_put((struct tcp_tw_bucket *) sk); + inet_twsk_put((struct inet_timewait_sock *)sk); goto discard_it; } if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { TCP_INC_STATS_BH(TCP_MIB_INERRS); - tcp_tw_put((struct tcp_tw_bucket *) sk); + inet_twsk_put((struct inet_timewait_sock *)sk); goto discard_it; } - switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk, - skb, th, skb->len)) { + switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk, + skb, th)) { case TCP_TW_SYN: { struct sock *sk2; sk2 = tcp_v6_lookup_listener(&skb->nh.ipv6h->daddr, ntohs(th->dest), tcp_v6_iif(skb)); if (sk2 != NULL) { - tcp_tw_deschedule((struct tcp_tw_bucket *)sk); - tcp_tw_put((struct tcp_tw_bucket *)sk); + tcp_tw_deschedule((struct inet_timewait_sock *)sk); + inet_twsk_put((struct inet_timewait_sock *)sk); sk = sk2; goto process; } @@ -1978,7 +1985,7 @@ static struct tcp_func ipv6_specific = { static struct tcp_func ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, - .rebuild_header = tcp_v4_rebuild_header, + .rebuild_header = inet_sk_rebuild_header, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .remember_stamp = tcp_v4_remember_stamp, @@ -2003,7 +2010,7 @@ static int tcp_v6_init_sock(struct sock tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); - tp->rto = TCP_TIMEOUT_INIT; + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev = TCP_TIMEOUT_INIT; /* So many TCP implementations out there (incorrectly) count the @@ -2086,18 +2093,20 @@ static void get_tcp6_sock(struct seq_fil unsigned long timer_expires; struct inet_sock *inet = inet_sk(sp); struct tcp_sock *tp = tcp_sk(sp); + const struct inet_connection_sock *icsk = inet_csk(sp); struct ipv6_pinfo *np = inet6_sk(sp); dest = &np->daddr; src = &np->rcv_saddr; destp = ntohs(inet->dport); srcp = ntohs(inet->sport); - if (tp->pending == TCP_TIME_RETRANS) { + + if (icsk->icsk_pending == ICSK_TIME_RETRANS) { timer_active = 1; - timer_expires = tp->timeout; - } else if (tp->pending == TCP_TIME_PROBE0) { + timer_expires = icsk->icsk_timeout; + } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = tp->timeout; + timer_expires = icsk->icsk_timeout; } else if (timer_pending(&sp->sk_timer)) { timer_active = 2; timer_expires = sp->sk_timer.expires; @@ -2118,28 +2127,31 @@ static void get_tcp6_sock(struct seq_fil tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq, timer_active, jiffies_to_clock_t(timer_expires - jiffies), - tp->retransmits, + icsk->icsk_retransmits, sock_i_uid(sp), tp->probes_out, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, - tp->rto, tp->ack.ato, (tp->ack.quick<<1)|tp->ack.pingpong, + icsk->icsk_rto, + icsk->icsk_ack.ato, + (icsk->icsk_ack.quick << 1 ) | icsk->icsk_ack.pingpong, tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh ); } static void get_timewait6_sock(struct seq_file *seq, - struct tcp_tw_bucket *tw, int i) + struct inet_timewait_sock *tw, int i) { struct in6_addr *dest, *src; __u16 destp, srcp; + struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); int ttd = tw->tw_ttd - jiffies; if (ttd < 0) ttd = 0; - dest = &tw->tw_v6_daddr; - src = &tw->tw_v6_rcv_saddr; + dest = &tcp6tw->tw_v6_daddr; + src = &tcp6tw->tw_v6_rcv_saddr; destp = ntohs(tw->tw_dport); srcp = ntohs(tw->tw_sport); @@ -2214,7 +2226,7 @@ struct proto tcpv6_prot = { .close = tcp_close, .connect = tcp_v6_connect, .disconnect = tcp_disconnect, - .accept = tcp_accept, + .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v6_init_sock, .destroy = tcp_v6_destroy_sock, @@ -2231,11 +2243,13 @@ struct proto tcpv6_prot = { .sockets_allocated = &tcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, + .orphan_count = &tcp_orphan_count, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem = sysctl_tcp_wmem, .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), + .twsk_obj_size = sizeof(struct tcp6_timewait_sock), .rsk_prot = &tcp6_request_sock_ops, }; diff -puN net/ipv6/udp.c~git-net net/ipv6/udp.c --- devel/net/ipv6/udp.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipv6/udp.c 2005-08-06 23:39:10.000000000 -0700 @@ -51,6 +51,7 @@ #include #include #include +#include #include #include diff -puN net/ipx/af_ipx.c~git-net net/ipx/af_ipx.c --- devel/net/ipx/af_ipx.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipx/af_ipx.c 2005-08-06 23:39:10.000000000 -0700 @@ -44,7 +44,6 @@ #include #include #include -#include #include #include @@ -52,6 +51,7 @@ #include #include #include +#include #include @@ -1627,7 +1627,7 @@ out: return rc; } -static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { /* NULL here for pt means the packet was looped back */ struct ipx_interface *intrfc; diff -puN net/ipx/ipx_proc.c~git-net net/ipx/ipx_proc.c --- devel/net/ipx/ipx_proc.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/ipx/ipx_proc.c 2005-08-06 23:39:10.000000000 -0700 @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include static __inline__ struct ipx_interface *ipx_get_interface_idx(loff_t pos) diff -puN net/irda/af_irda.c~git-net net/irda/af_irda.c --- devel/net/irda/af_irda.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/irda/af_irda.c 2005-08-06 23:39:10.000000000 -0700 @@ -56,7 +56,7 @@ #include #include -#include +#include #include diff -puN net/irda/irlap_frame.c~git-net net/irda/irlap_frame.c --- devel/net/irda/irlap_frame.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/irda/irlap_frame.c 2005-08-06 23:39:10.000000000 -0700 @@ -988,9 +988,6 @@ void irlap_resend_rejected_frames(struct IRDA_DEBUG(0, "%s(), unable to copy\n", __FUNCTION__); return; } - /* Unlink tx_skb from list */ - tx_skb->next = tx_skb->prev = NULL; - tx_skb->list = NULL; /* Clear old Nr field + poll bit */ tx_skb->data[1] &= 0x0f; @@ -1063,9 +1060,6 @@ void irlap_resend_rejected_frame(struct IRDA_DEBUG(0, "%s(), unable to copy\n", __FUNCTION__); return; } - /* Unlink tx_skb from list */ - tx_skb->next = tx_skb->prev = NULL; - tx_skb->list = NULL; /* Clear old Nr field + poll bit */ tx_skb->data[1] &= 0x0f; @@ -1309,7 +1303,7 @@ static void irlap_recv_test_frame(struct * Jean II */ int irlap_driver_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *ptype) + struct packet_type *ptype, struct net_device *orig_dev) { struct irlap_info info; struct irlap_cb *self; diff -puN net/irda/irmod.c~git-net net/irda/irmod.c --- devel/net/irda/irmod.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/irda/irmod.c 2005-08-06 23:39:10.000000000 -0700 @@ -54,7 +54,7 @@ extern int irsock_init(void); extern void irsock_cleanup(void); /* irlap_frame.c */ extern int irlap_driver_rcv(struct sk_buff *, struct net_device *, - struct packet_type *); + struct packet_type *, struct net_device *); /* * Module parameters diff -puN net/Kconfig~git-net net/Kconfig --- devel/net/Kconfig~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/Kconfig 2005-08-06 23:39:10.000000000 -0700 @@ -147,6 +147,7 @@ source "net/bridge/netfilter/Kconfig" endif +source "net/dccp/Kconfig" source "net/sctp/Kconfig" source "net/atm/Kconfig" source "net/bridge/Kconfig" @@ -205,6 +206,8 @@ config NET_PKTGEN To compile this code as a module, choose M here: the module will be called pktgen. +source "net/netfilter/Kconfig" + endmenu endmenu diff -puN net/lapb/lapb_subr.c~git-net net/lapb/lapb_subr.c --- devel/net/lapb/lapb_subr.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/lapb/lapb_subr.c 2005-08-06 23:39:10.000000000 -0700 @@ -78,7 +78,7 @@ void lapb_requeue_frames(struct lapb_cb if (!skb_prev) skb_queue_head(&lapb->write_queue, skb); else - skb_append(skb_prev, skb); + skb_append(skb_prev, skb, &lapb->write_queue); skb_prev = skb; } } diff -puN net/llc/af_llc.c~git-net net/llc/af_llc.c --- devel/net/llc/af_llc.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/llc/af_llc.c 2005-08-06 23:39:10.000000000 -0700 @@ -23,13 +23,13 @@ #include #include #include -#include #include #include #include #include #include #include +#include /* remember: uninitialized global data is zeroed because its in .bss */ static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START; @@ -714,7 +714,7 @@ static int llc_ui_recvmsg(struct kiocb * if (uaddr) memcpy(uaddr, llc_ui_skb_cb(skb), sizeof(*uaddr)); msg->msg_namelen = sizeof(*uaddr); - if (!skb->list) { + if (!skb->next) { dgram_free: kfree_skb(skb); } diff -puN net/llc/llc_conn.c~git-net net/llc/llc_conn.c --- devel/net/llc/llc_conn.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/llc/llc_conn.c 2005-08-06 23:39:10.000000000 -0700 @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include @@ -71,7 +71,11 @@ int llc_conn_state_process(struct sock * if (!ev->ind_prim && !ev->cfm_prim) { /* indicate or confirm not required */ - if (!skb->list) + /* XXX this is not very pretty, perhaps we should store + * XXX indicate/confirm-needed state in the llc_conn_state_ev + * XXX control block of the SKB instead? -DaveM + */ + if (!skb->next) goto out_kfree_skb; goto out_skb_put; } diff -puN net/llc/llc_core.c~git-net net/llc/llc_core.c --- devel/net/llc/llc_core.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/llc/llc_core.c 2005-08-06 23:39:10.000000000 -0700 @@ -103,7 +103,8 @@ out: struct llc_sap *llc_sap_open(unsigned char lsap, int (*func)(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt)) + struct packet_type *pt, + struct net_device *orig_dev)) { struct llc_sap *sap = llc_sap_find(lsap); diff -puN net/llc/llc_if.c~git-net net/llc/llc_if.c --- devel/net/llc/llc_if.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/llc/llc_if.c 2005-08-06 23:39:10.000000000 -0700 @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -25,6 +24,7 @@ #include #include #include +#include u8 llc_mac_null_var[IFHWADDRLEN]; diff -puN net/llc/llc_input.c~git-net net/llc/llc_input.c --- devel/net/llc/llc_input.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/llc/llc_input.c 2005-08-06 23:39:10.000000000 -0700 @@ -132,7 +132,7 @@ static inline int llc_fixup_skb(struct s * data now), it queues this frame in the connection's backlog. */ int llc_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) + struct packet_type *pt, struct net_device *orig_dev) { struct llc_sap *sap; struct llc_pdu_sn *pdu; @@ -165,7 +165,7 @@ int llc_rcv(struct sk_buff *skb, struct * LLC functionality */ if (sap->rcv_func) { - sap->rcv_func(skb, dev, pt); + sap->rcv_func(skb, dev, pt, orig_dev); goto out; } dest = llc_pdu_type(skb); diff -puN net/llc/llc_sap.c~git-net net/llc/llc_sap.c --- devel/net/llc/llc_sap.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/llc/llc_sap.c 2005-08-06 23:39:10.000000000 -0700 @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include /** diff -puN net/Makefile~git-net net/Makefile --- devel/net/Makefile~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/Makefile 2005-08-06 23:39:10.000000000 -0700 @@ -16,6 +16,7 @@ obj-$(CONFIG_NET) += $(tmp-y) obj-$(CONFIG_LLC) += llc/ obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ obj-$(CONFIG_INET) += ipv4/ +obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX) += unix/ ifneq ($(CONFIG_IPV6),) @@ -41,6 +42,7 @@ obj-$(CONFIG_ATM) += atm/ obj-$(CONFIG_DECNET) += decnet/ obj-$(CONFIG_ECONET) += econet/ obj-$(CONFIG_VLAN_8021Q) += 8021q/ +obj-$(CONFIG_IP_DCCP) += dccp/ obj-$(CONFIG_IP_SCTP) += sctp/ ifeq ($(CONFIG_NET),y) diff -puN /dev/null net/netfilter/Kconfig --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/netfilter/Kconfig 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,24 @@ +config NETFILTER_NETLINK + tristate "Netfilter netlink interface" + help + If this option is enabled, the kernel will include support + for the new netfilter netlink interface. + +config NETFILTER_NETLINK_QUEUE + tristate "Netfilter NFQUEUE over NFNETLINK interface" + depends on NETFILTER_NETLINK + help + If this option isenabled, the kernel will include support + for queueing packets via NFNETLINK. + +config NETFILTER_NETLINK_LOG + tristate "Netfilter LOG over NFNETLINK interface" + depends on NETFILTER_NETLINK + help + If this option is enabled, the kernel will include support + for logging packets via NFNETLINK. + + This obsoletes the existing ipt_ULOG and ebg_ulog mechanisms, + and is also scheduled to replace the old syslog-based ipt_LOG + and ip6t_LOG modules. + diff -puN /dev/null net/netfilter/Makefile --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/netfilter/Makefile 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,3 @@ +obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o +obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o +obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o diff -puN /dev/null net/netfilter/nfnetlink.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/netfilter/nfnetlink.c 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,376 @@ +/* Netfilter messages via netlink socket. Allows for user space + * protocol helpers and general trouble making from userspace. + * + * (C) 2001 by Jay Schulist , + * (C) 2002-2005 by Harald Welte + * (C) 2005 by Pablo Neira Ayuso + * + * Initial netfilter messages via netlink development funded and + * generally made possible by Network Robots, Inc. (www.networkrobots.com) + * + * Further development of this code funded by Astaro AG (http://www.astaro.com) + * + * This software may be used and distributed according to the terms + * of the GNU General Public License, incorporated herein by reference. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte "); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER); + +static char __initdata nfversion[] = "0.30"; + +#if 0 +#define DEBUGP(format, args...) \ + printk(KERN_DEBUG "%s(%d):%s(): " format, __FILE__, \ + __LINE__, __FUNCTION__, ## args) +#else +#define DEBUGP(format, args...) +#endif + +static struct sock *nfnl = NULL; +static struct nfnetlink_subsystem *subsys_table[NFNL_SUBSYS_COUNT]; +DECLARE_MUTEX(nfnl_sem); + +void nfnl_lock(void) +{ + nfnl_shlock(); +} + +void nfnl_unlock(void) +{ + nfnl_shunlock(); +} + +int nfnetlink_subsys_register(struct nfnetlink_subsystem *n) +{ + DEBUGP("registering subsystem ID %u\n", n->subsys_id); + + nfnl_lock(); + if (subsys_table[n->subsys_id]) { + nfnl_unlock(); + return -EBUSY; + } + subsys_table[n->subsys_id] = n; + nfnl_unlock(); + + return 0; +} + +int nfnetlink_subsys_unregister(struct nfnetlink_subsystem *n) +{ + DEBUGP("unregistering subsystem ID %u\n", n->subsys_id); + + nfnl_lock(); + subsys_table[n->subsys_id] = NULL; + nfnl_unlock(); + + return 0; +} + +static inline struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t type) +{ + u_int8_t subsys_id = NFNL_SUBSYS_ID(type); + + if (subsys_id >= NFNL_SUBSYS_COUNT + || subsys_table[subsys_id] == NULL) + return NULL; + + return subsys_table[subsys_id]; +} + +static inline struct nfnl_callback * +nfnetlink_find_client(u_int16_t type, struct nfnetlink_subsystem *ss) +{ + u_int8_t cb_id = NFNL_MSG_TYPE(type); + + if (cb_id >= ss->cb_count) { + DEBUGP("msgtype %u >= %u, returning\n", type, ss->cb_count); + return NULL; + } + + return &ss->cb[cb_id]; +} + +void __nfa_fill(struct sk_buff *skb, int attrtype, int attrlen, + const void *data) +{ + struct nfattr *nfa; + int size = NFA_LENGTH(attrlen); + + nfa = (struct nfattr *)skb_put(skb, NFA_ALIGN(size)); + nfa->nfa_type = attrtype; + nfa->nfa_len = size; + memcpy(NFA_DATA(nfa), data, attrlen); + memset(NFA_DATA(nfa) + attrlen, 0, NFA_ALIGN(size) - size); +} + +int nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len) +{ + memset(tb, 0, sizeof(struct nfattr *) * maxattr); + + while (NFA_OK(nfa, len)) { + unsigned flavor = nfa->nfa_type; + if (flavor && flavor <= maxattr) + tb[flavor-1] = nfa; + nfa = NFA_NEXT(nfa, len); + } + + return 0; +} + +/** + * nfnetlink_check_attributes - check and parse nfnetlink attributes + * + * subsys: nfnl subsystem for which this message is to be parsed + * nlmsghdr: netlink message to be checked/parsed + * cda: array of pointers, needs to be at least subsys->attr_count big + * + */ +static int +nfnetlink_check_attributes(struct nfnetlink_subsystem *subsys, + struct nlmsghdr *nlh, struct nfattr *cda[]) +{ + int min_len; + u_int16_t attr_count; + u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); + + if (unlikely(cb_id >= subsys->cb_count)) { + DEBUGP("msgtype %u >= %u, returning\n", + cb_id, subsys->cb_count); + return -EINVAL; + } + + min_len = NLMSG_ALIGN(sizeof(struct nfgenmsg)); + if (unlikely(nlh->nlmsg_len < min_len)) + return -EINVAL; + + attr_count = subsys->cb[cb_id].attr_count; + memset(cda, 0, sizeof(struct nfattr *) * attr_count); + + /* check attribute lengths. */ + if (likely(nlh->nlmsg_len > min_len)) { + struct nfattr *attr = NFM_NFA(NLMSG_DATA(nlh)); + int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); + + while (NFA_OK(attr, attrlen)) { + unsigned flavor = attr->nfa_type; + if (flavor) { + if (flavor > attr_count) + return -EINVAL; + cda[flavor - 1] = attr; + } + attr = NFA_NEXT(attr, attrlen); + } + } + + /* implicit: if nlmsg_len == min_len, we return 0, and an empty + * (zeroed) cda[] array. The message is valid, but empty. */ + + return 0; +} + +int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo) +{ + int allocation = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL; + int err = 0; + + NETLINK_CB(skb).dst_groups = group; + if (echo) + atomic_inc(&skb->users); + netlink_broadcast(nfnl, skb, pid, group, allocation); + if (echo) + err = netlink_unicast(nfnl, skb, pid, MSG_DONTWAIT); + + return err; +} + +int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags) +{ + return netlink_unicast(nfnl, skb, pid, flags); +} + +/* Process one complete nfnetlink message. */ +static inline int nfnetlink_rcv_msg(struct sk_buff *skb, + struct nlmsghdr *nlh, int *errp) +{ + struct nfnl_callback *nc; + struct nfnetlink_subsystem *ss; + int type, err = 0; + + DEBUGP("entered; subsys=%u, msgtype=%u\n", + NFNL_SUBSYS_ID(nlh->nlmsg_type), + NFNL_MSG_TYPE(nlh->nlmsg_type)); + + /* Only requests are handled by kernel now. */ + if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) { + DEBUGP("received non-request message\n"); + return 0; + } + + /* All the messages must at least contain nfgenmsg */ + if (nlh->nlmsg_len < + NLMSG_LENGTH(NLMSG_ALIGN(sizeof(struct nfgenmsg)))) { + DEBUGP("received message was too short\n"); + return 0; + } + + type = nlh->nlmsg_type; + ss = nfnetlink_get_subsys(type); + if (!ss) { +#ifdef CONFIG_KMOD + /* don't call nfnl_shunlock, since it would reenter + * with further packet processing */ + up(&nfnl_sem); + request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type)); + nfnl_shlock(); + ss = nfnetlink_get_subsys(type); + if (!ss) +#endif + goto err_inval; + } + + nc = nfnetlink_find_client(type, ss); + if (!nc) { + DEBUGP("unable to find client for type %d\n", type); + goto err_inval; + } + + if (nc->cap_required && + !cap_raised(NETLINK_CB(skb).eff_cap, nc->cap_required)) { + DEBUGP("permission denied for type %d\n", type); + *errp = -EPERM; + return -1; + } + + { + u_int16_t attr_count = + ss->cb[NFNL_MSG_TYPE(nlh->nlmsg_type)].attr_count; + struct nfattr *cda[attr_count]; + + memset(cda, 0, sizeof(struct nfattr *) * attr_count); + + err = nfnetlink_check_attributes(ss, nlh, cda); + if (err < 0) + goto err_inval; + + DEBUGP("calling handler\n"); + err = nc->call(nfnl, skb, nlh, cda, errp); + *errp = err; + return err; + } + +err_inval: + DEBUGP("returning -EINVAL\n"); + *errp = -EINVAL; + return -1; +} + +/* Process one packet of messages. */ +static inline int nfnetlink_rcv_skb(struct sk_buff *skb) +{ + int err; + struct nlmsghdr *nlh; + + while (skb->len >= NLMSG_SPACE(0)) { + u32 rlen; + + nlh = (struct nlmsghdr *)skb->data; + if (nlh->nlmsg_len < sizeof(struct nlmsghdr) + || skb->len < nlh->nlmsg_len) + return 0; + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + if (nfnetlink_rcv_msg(skb, nlh, &err)) { + if (!err) + return -1; + netlink_ack(skb, nlh, err); + } else + if (nlh->nlmsg_flags & NLM_F_ACK) + netlink_ack(skb, nlh, 0); + skb_pull(skb, rlen); + } + + return 0; +} + +static void nfnetlink_rcv(struct sock *sk, int len) +{ + do { + struct sk_buff *skb; + + if (nfnl_shlock_nowait()) + return; + + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + if (nfnetlink_rcv_skb(skb)) { + if (skb->len) + skb_queue_head(&sk->sk_receive_queue, + skb); + else + kfree_skb(skb); + break; + } + kfree_skb(skb); + } + + /* don't call nfnl_shunlock, since it would reenter + * with further packet processing */ + up(&nfnl_sem); + } while(nfnl && nfnl->sk_receive_queue.qlen); +} + +void __exit nfnetlink_exit(void) +{ + printk("Removing netfilter NETLINK layer.\n"); + sock_release(nfnl->sk_socket); + return; +} + +int __init nfnetlink_init(void) +{ + printk("Netfilter messages via NETLINK v%s.\n", nfversion); + + nfnl = netlink_kernel_create(NETLINK_NETFILTER, nfnetlink_rcv, + THIS_MODULE); + if (!nfnl) { + printk(KERN_ERR "cannot initialize nfnetlink!\n"); + return -1; + } + + return 0; +} + +module_init(nfnetlink_init); +module_exit(nfnetlink_exit); + +EXPORT_SYMBOL_GPL(nfnetlink_subsys_register); +EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister); +EXPORT_SYMBOL_GPL(nfnetlink_send); +EXPORT_SYMBOL_GPL(nfnetlink_unicast); +EXPORT_SYMBOL_GPL(nfattr_parse); +EXPORT_SYMBOL_GPL(__nfa_fill); diff -puN /dev/null net/netfilter/nfnetlink_log.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/netfilter/nfnetlink_log.c 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,996 @@ +/* + * This is a module which is used for logging packets to userspace via + * nfetlink. + * + * (C) 2005 by Harald Welte + * + * Based on the old ipv4-only ipt_ULOG.c: + * (C) 2000-2004 by Harald Welte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define NFULNL_NLBUFSIZ_DEFAULT 4096 +#define NFULNL_TIMEOUT_DEFAULT 100 /* every second */ +#define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */ + +#define PRINTR(x, args...) do { if (net_ratelimit()) \ + printk(x, ## args); } while (0); + +#if 0 +#define UDEBUG(x, args ...) printk(KERN_DEBUG "%s(%d):%s(): " x, \ + __FILE__, __LINE__, __FUNCTION__, \ + ## args) +#else +#define UDEBUG(x, ...) +#endif + +struct nfulnl_instance { + struct hlist_node hlist; /* global list of instances */ + spinlock_t lock; + atomic_t use; /* use count */ + + unsigned int qlen; /* number of nlmsgs in skb */ + struct sk_buff *skb; /* pre-allocatd skb */ + struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */ + struct timer_list timer; + int peer_pid; /* PID of the peer process */ + + /* configurable parameters */ + unsigned int flushtimeout; /* timeout until queue flush */ + unsigned int nlbufsiz; /* netlink buffer allocation size */ + unsigned int qthreshold; /* threshold of the queue */ + u_int32_t copy_range; + u_int16_t group_num; /* number of this queue */ + u_int8_t copy_mode; +}; + +static DEFINE_RWLOCK(instances_lock); + +#define INSTANCE_BUCKETS 16 +static struct hlist_head instance_table[INSTANCE_BUCKETS]; +static unsigned int hash_init; + +static inline u_int8_t instance_hashfn(u_int16_t group_num) +{ + return ((group_num & 0xff) % INSTANCE_BUCKETS); +} + +static struct nfulnl_instance * +__instance_lookup(u_int16_t group_num) +{ + struct hlist_head *head; + struct hlist_node *pos; + struct nfulnl_instance *inst; + + UDEBUG("entering (group_num=%u)\n", group_num); + + head = &instance_table[instance_hashfn(group_num)]; + hlist_for_each_entry(inst, pos, head, hlist) { + if (inst->group_num == group_num) + return inst; + } + return NULL; +} + +static inline void +instance_get(struct nfulnl_instance *inst) +{ + atomic_inc(&inst->use); +} + +static struct nfulnl_instance * +instance_lookup_get(u_int16_t group_num) +{ + struct nfulnl_instance *inst; + + read_lock_bh(&instances_lock); + inst = __instance_lookup(group_num); + if (inst) + instance_get(inst); + read_unlock_bh(&instances_lock); + + return inst; +} + +static void +instance_put(struct nfulnl_instance *inst) +{ + if (inst && atomic_dec_and_test(&inst->use)) { + UDEBUG("kfree(inst=%p)\n", inst); + kfree(inst); + } +} + +static void nfulnl_timer(unsigned long data); + +static struct nfulnl_instance * +instance_create(u_int16_t group_num, int pid) +{ + struct nfulnl_instance *inst; + + UDEBUG("entering (group_num=%u, pid=%d)\n", group_num, + pid); + + write_lock_bh(&instances_lock); + if (__instance_lookup(group_num)) { + inst = NULL; + UDEBUG("aborting, instance already exists\n"); + goto out_unlock; + } + + inst = kmalloc(sizeof(*inst), GFP_ATOMIC); + if (!inst) + goto out_unlock; + + memset(inst, 0, sizeof(*inst)); + INIT_HLIST_NODE(&inst->hlist); + inst->lock = SPIN_LOCK_UNLOCKED; + /* needs to be two, since we _put() after creation */ + atomic_set(&inst->use, 2); + + init_timer(&inst->timer); + inst->timer.function = nfulnl_timer; + inst->timer.data = (unsigned long)inst; + /* don't start timer yet. (re)start it with every packet */ + + inst->peer_pid = pid; + inst->group_num = group_num; + + inst->qthreshold = NFULNL_QTHRESH_DEFAULT; + inst->flushtimeout = NFULNL_TIMEOUT_DEFAULT; + inst->nlbufsiz = NFULNL_NLBUFSIZ_DEFAULT; + inst->copy_mode = NFULNL_COPY_PACKET; + inst->copy_range = 0xffff; + + if (!try_module_get(THIS_MODULE)) + goto out_free; + + hlist_add_head(&inst->hlist, + &instance_table[instance_hashfn(group_num)]); + + UDEBUG("newly added node: %p, next=%p\n", &inst->hlist, + inst->hlist.next); + + write_unlock_bh(&instances_lock); + + return inst; + +out_free: + instance_put(inst); +out_unlock: + write_unlock_bh(&instances_lock); + return NULL; +} + +static int __nfulnl_send(struct nfulnl_instance *inst); + +static void +_instance_destroy2(struct nfulnl_instance *inst, int lock) +{ + /* first pull it out of the global list */ + if (lock) + write_lock_bh(&instances_lock); + + UDEBUG("removing instance %p (queuenum=%u) from hash\n", + inst, inst->group_num); + + hlist_del(&inst->hlist); + + if (lock) + write_unlock_bh(&instances_lock); + + /* then flush all pending packets from skb */ + + spin_lock_bh(&inst->lock); + if (inst->skb) { + if (inst->qlen) + __nfulnl_send(inst); + if (inst->skb) { + kfree_skb(inst->skb); + inst->skb = NULL; + } + } + spin_unlock_bh(&inst->lock); + + /* and finally put the refcount */ + instance_put(inst); + + module_put(THIS_MODULE); +} + +static inline void +__instance_destroy(struct nfulnl_instance *inst) +{ + _instance_destroy2(inst, 0); +} + +static inline void +instance_destroy(struct nfulnl_instance *inst) +{ + _instance_destroy2(inst, 1); +} + +static int +nfulnl_set_mode(struct nfulnl_instance *inst, u_int8_t mode, + unsigned int range) +{ + int status = 0; + + spin_lock_bh(&inst->lock); + + switch (mode) { + case NFULNL_COPY_NONE: + case NFULNL_COPY_META: + inst->copy_mode = mode; + inst->copy_range = 0; + break; + + case NFULNL_COPY_PACKET: + inst->copy_mode = mode; + /* we're using struct nfattr which has 16bit nfa_len */ + if (range > 0xffff) + inst->copy_range = 0xffff; + else + inst->copy_range = range; + break; + + default: + status = -EINVAL; + break; + } + + spin_unlock_bh(&inst->lock); + + return status; +} + +static int +nfulnl_set_nlbufsiz(struct nfulnl_instance *inst, u_int32_t nlbufsiz) +{ + int status; + + spin_lock_bh(&inst->lock); + if (nlbufsiz < NFULNL_NLBUFSIZ_DEFAULT) + status = -ERANGE; + else if (nlbufsiz > 131072) + status = -ERANGE; + else { + inst->nlbufsiz = nlbufsiz; + status = 0; + } + spin_unlock_bh(&inst->lock); + + return status; +} + +static int +nfulnl_set_timeout(struct nfulnl_instance *inst, u_int32_t timeout) +{ + spin_lock_bh(&inst->lock); + inst->flushtimeout = timeout; + spin_unlock_bh(&inst->lock); + + return 0; +} + +static int +nfulnl_set_qthresh(struct nfulnl_instance *inst, u_int32_t qthresh) +{ + spin_lock_bh(&inst->lock); + inst->qthreshold = qthresh; + spin_unlock_bh(&inst->lock); + + return 0; +} + +static struct sk_buff *nfulnl_alloc_skb(unsigned int inst_size, + unsigned int pkt_size) +{ + struct sk_buff *skb; + + UDEBUG("entered (%u, %u)\n", inst_size, pkt_size); + + /* alloc skb which should be big enough for a whole multipart + * message. WARNING: has to be <= 128k due to slab restrictions */ + + skb = alloc_skb(inst_size, GFP_ATOMIC); + if (!skb) { + PRINTR("nfnetlink_log: can't alloc whole buffer (%u bytes)\n", + inst_size); + + /* try to allocate only as much as we need for current + * packet */ + + skb = alloc_skb(pkt_size, GFP_ATOMIC); + if (!skb) + PRINTR("nfnetlink_log: can't even alloc %u bytes\n", + pkt_size); + } + + return skb; +} + +static int +__nfulnl_send(struct nfulnl_instance *inst) +{ + int status; + + if (timer_pending(&inst->timer)) + del_timer(&inst->timer); + + if (inst->qlen > 1) + inst->lastnlh->nlmsg_type = NLMSG_DONE; + + status = nfnetlink_unicast(inst->skb, inst->peer_pid, MSG_DONTWAIT); + if (status < 0) { + UDEBUG("netlink_unicast() failed\n"); + /* FIXME: statistics */ + } + + inst->qlen = 0; + inst->skb = NULL; + inst->lastnlh = NULL; + + return status; +} + +static void nfulnl_timer(unsigned long data) +{ + struct nfulnl_instance *inst = (struct nfulnl_instance *)data; + + UDEBUG("timer function called, flushing buffer\n"); + + spin_lock_bh(&inst->lock); + __nfulnl_send(inst); + instance_put(inst); + spin_unlock_bh(&inst->lock); +} + +static inline int +__build_packet_message(struct nfulnl_instance *inst, + const struct sk_buff *skb, + unsigned int data_len, + unsigned int pf, + unsigned int hooknum, + const struct net_device *indev, + const struct net_device *outdev, + const struct nf_loginfo *li, + const char *prefix) +{ + unsigned char *old_tail; + struct nfulnl_msg_packet_hdr pmsg; + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + u_int32_t tmp_uint; + + UDEBUG("entered\n"); + + old_tail = inst->skb->tail; + nlh = NLMSG_PUT(inst->skb, 0, 0, + NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET, + sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + nfmsg->nfgen_family = pf; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(inst->group_num); + + pmsg.hw_protocol = htons(skb->protocol); + pmsg.hook = hooknum; + + NFA_PUT(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg); + + if (prefix) { + int slen = strlen(prefix); + if (slen > NFULNL_PREFIXLEN) + slen = NFULNL_PREFIXLEN; + NFA_PUT(inst->skb, NFULA_PREFIX, slen, prefix); + } + + if (indev) { + tmp_uint = htonl(indev->ifindex); + NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV, sizeof(tmp_uint), + &tmp_uint); + } + + if (outdev) { + tmp_uint = htonl(outdev->ifindex); + NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV, sizeof(tmp_uint), + &tmp_uint); + } + + if (skb->nfmark) { + tmp_uint = htonl(skb->nfmark); + NFA_PUT(inst->skb, NFULA_MARK, sizeof(tmp_uint), &tmp_uint); + } + + if (indev && skb->dev && skb->dev->hard_header_parse) { + struct nfulnl_msg_packet_hw phw; + + phw.hw_addrlen = + skb->dev->hard_header_parse((struct sk_buff *)skb, + phw.hw_addr); + phw.hw_addrlen = htons(phw.hw_addrlen); + NFA_PUT(inst->skb, NFULA_HWADDR, sizeof(phw), &phw); + } + + if (skb->stamp.tv_sec) { + struct nfulnl_msg_packet_timestamp ts; + + ts.sec = cpu_to_be64(skb->stamp.tv_sec); + ts.usec = cpu_to_be64(skb->stamp.tv_usec); + + NFA_PUT(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts); + } + + /* UID */ + if (skb->sk) { + read_lock_bh(&skb->sk->sk_callback_lock); + if (skb->sk->sk_socket && skb->sk->sk_socket->file) { + u_int32_t uid = htonl(skb->sk->sk_socket->file->f_uid); + /* need to unlock here since NFA_PUT may goto */ + read_unlock_bh(&skb->sk->sk_callback_lock); + NFA_PUT(inst->skb, NFULA_UID, sizeof(uid), &uid); + } else + read_unlock_bh(&skb->sk->sk_callback_lock); + } + + if (data_len) { + struct nfattr *nfa; + int size = NFA_LENGTH(data_len); + + if (skb_tailroom(inst->skb) < (int)NFA_SPACE(data_len)) { + printk(KERN_WARNING "nfnetlink_log: no tailroom!\n"); + goto nlmsg_failure; + } + + nfa = (struct nfattr *)skb_put(inst->skb, NFA_ALIGN(size)); + nfa->nfa_type = NFULA_PAYLOAD; + nfa->nfa_len = size; + + if (skb_copy_bits(skb, 0, NFA_DATA(nfa), data_len)) + BUG(); + } + + nlh->nlmsg_len = inst->skb->tail - old_tail; + return 0; + +nlmsg_failure: + UDEBUG("nlmsg_failure\n"); +nfattr_failure: + PRINTR(KERN_ERR "nfnetlink_log: error creating log nlmsg\n"); + return -1; +} + +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) + +static struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_ULOG, + .u = { + .ulog = { + .copy_len = 0xffff, + .group = 0, + .qthreshold = 1, + }, + }, +}; + +/* log handler for internal netfilter logging api */ +static void +nfulnl_log_packet(unsigned int pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *li_user, + const char *prefix) +{ + unsigned int size, data_len; + struct nfulnl_instance *inst; + const struct nf_loginfo *li; + unsigned int qthreshold; + unsigned int nlbufsiz; + + if (li_user && li_user->type == NF_LOG_TYPE_ULOG) + li = li_user; + else + li = &default_loginfo; + + inst = instance_lookup_get(li->u.ulog.group); + if (!inst) + inst = instance_lookup_get(0); + if (!inst) { + PRINTR("nfnetlink_log: trying to log packet, " + "but no instance for group %u\n", li->u.ulog.group); + return; + } + + /* all macros expand to constant values at compile time */ + /* FIXME: do we want to make the size calculation conditional based on + * what is actually present? way more branches and checks, but more + * memory efficient... */ + size = NLMSG_SPACE(sizeof(struct nfgenmsg)) + + NFA_SPACE(sizeof(struct nfulnl_msg_packet_hdr)) + + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */ + + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */ + + NFA_SPACE(sizeof(u_int32_t)) /* mark */ + + NFA_SPACE(sizeof(u_int32_t)) /* uid */ + + NFA_SPACE(NFULNL_PREFIXLEN) /* prefix */ + + NFA_SPACE(sizeof(struct nfulnl_msg_packet_hw)) + + NFA_SPACE(sizeof(struct nfulnl_msg_packet_timestamp)); + + UDEBUG("initial size=%u\n", size); + + spin_lock_bh(&inst->lock); + + qthreshold = inst->qthreshold; + /* per-rule qthreshold overrides per-instance */ + if (qthreshold > li->u.ulog.qthreshold) + qthreshold = li->u.ulog.qthreshold; + + switch (inst->copy_mode) { + case NFULNL_COPY_META: + case NFULNL_COPY_NONE: + data_len = 0; + break; + + case NFULNL_COPY_PACKET: + if (inst->copy_range == 0 + || inst->copy_range > skb->len) + data_len = skb->len; + else + data_len = inst->copy_range; + + size += NFA_SPACE(data_len); + UDEBUG("copy_packet, therefore size now %u\n", size); + break; + + default: + spin_unlock_bh(&inst->lock); + instance_put(inst); + return; + } + + if (size > inst->nlbufsiz) + nlbufsiz = size; + else + nlbufsiz = inst->nlbufsiz; + + if (!inst->skb) { + if (!(inst->skb = nfulnl_alloc_skb(nlbufsiz, size))) { + UDEBUG("error in nfulnl_alloc_skb(%u, %u)\n", + inst->nlbufsiz, size); + goto alloc_failure; + } + } else if (inst->qlen >= qthreshold || + size > skb_tailroom(inst->skb)) { + /* either the queue len is too high or we don't have + * enough room in the skb left. flush to userspace. */ + UDEBUG("flushing old skb\n"); + + __nfulnl_send(inst); + + if (!(inst->skb = nfulnl_alloc_skb(nlbufsiz, size))) { + UDEBUG("error in nfulnl_alloc_skb(%u, %u)\n", + inst->nlbufsiz, size); + goto alloc_failure; + } + } + + UDEBUG("qlen %d, qthreshold %d\n", inst->qlen, qthreshold); + inst->qlen++; + + __build_packet_message(inst, skb, data_len, pf, + hooknum, in, out, li, prefix); + + /* timer_pending always called within inst->lock, so there + * is no chance of a race here */ + if (!timer_pending(&inst->timer)) { + instance_get(inst); + inst->timer.expires = jiffies + (inst->flushtimeout*HZ/100); + add_timer(&inst->timer); + } + spin_unlock_bh(&inst->lock); + + return; + +alloc_failure: + spin_unlock_bh(&inst->lock); + instance_put(inst); + UDEBUG("error allocating skb\n"); + /* FIXME: statistics */ +} + +static int +nfulnl_rcv_nl_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netlink_notify *n = ptr; + + if (event == NETLINK_URELEASE && + n->protocol == NETLINK_NETFILTER && n->pid) { + int i; + + /* destroy all instances for this pid */ + write_lock_bh(&instances_lock); + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct hlist_node *tmp, *t2; + struct nfulnl_instance *inst; + struct hlist_head *head = &instance_table[i]; + + hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { + UDEBUG("node = %p\n", inst); + if (n->pid == inst->peer_pid) + __instance_destroy(inst); + } + } + write_unlock_bh(&instances_lock); + } + return NOTIFY_DONE; +} + +static struct notifier_block nfulnl_rtnl_notifier = { + .notifier_call = nfulnl_rcv_nl_event, +}; + +static int +nfulnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) +{ + return -ENOTSUPP; +} + +static struct nf_logger nfulnl_logger = { + .name = "nfnetlink_log", + .logfn = &nfulnl_log_packet, + .me = THIS_MODULE, +}; + +static const int nfula_min[NFULA_MAX] = { + [NFULA_PACKET_HDR-1] = sizeof(struct nfulnl_msg_packet_hdr), + [NFULA_MARK-1] = sizeof(u_int32_t), + [NFULA_TIMESTAMP-1] = sizeof(struct nfulnl_msg_packet_timestamp), + [NFULA_IFINDEX_INDEV-1] = sizeof(u_int32_t), + [NFULA_IFINDEX_OUTDEV-1]= sizeof(u_int32_t), + [NFULA_HWADDR-1] = sizeof(struct nfulnl_msg_packet_hw), + [NFULA_PAYLOAD-1] = 0, + [NFULA_PREFIX-1] = 0, + [NFULA_UID-1] = sizeof(u_int32_t), +}; + +static const int nfula_cfg_min[NFULA_CFG_MAX] = { + [NFULA_CFG_CMD-1] = sizeof(struct nfulnl_msg_config_cmd), + [NFULA_CFG_MODE-1] = sizeof(struct nfulnl_msg_config_mode), + [NFULA_CFG_TIMEOUT-1] = sizeof(u_int32_t), + [NFULA_CFG_QTHRESH-1] = sizeof(u_int32_t), + [NFULA_CFG_NLBUFSIZ-1] = sizeof(u_int32_t), +}; + +static int +nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *nfula[], int *errp) +{ + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int16_t group_num = ntohs(nfmsg->res_id); + struct nfulnl_instance *inst; + int ret = 0; + + UDEBUG("entering for msg %u\n", NFNL_MSG_TYPE(nlh->nlmsg_type)); + + if (nfattr_bad_size(nfula, NFULA_CFG_MAX, nfula_cfg_min)) { + UDEBUG("bad attribute size\n"); + return -EINVAL; + } + + inst = instance_lookup_get(group_num); + if (nfula[NFULA_CFG_CMD-1]) { + u_int8_t pf = nfmsg->nfgen_family; + struct nfulnl_msg_config_cmd *cmd; + cmd = NFA_DATA(nfula[NFULA_CFG_CMD-1]); + UDEBUG("found CFG_CMD for\n"); + + switch (cmd->command) { + case NFULNL_CFG_CMD_BIND: + if (inst) { + ret = -EBUSY; + goto out_put; + } + + inst = instance_create(group_num, + NETLINK_CB(skb).pid); + if (!inst) { + ret = -EINVAL; + goto out_put; + } + break; + case NFULNL_CFG_CMD_UNBIND: + if (!inst) { + ret = -ENODEV; + goto out_put; + } + + if (inst->peer_pid != NETLINK_CB(skb).pid) { + ret = -EPERM; + goto out_put; + } + + instance_destroy(inst); + break; + case NFULNL_CFG_CMD_PF_BIND: + UDEBUG("registering log handler for pf=%u\n", pf); + ret = nf_log_register(pf, &nfulnl_logger); + break; + case NFULNL_CFG_CMD_PF_UNBIND: + UDEBUG("unregistering log handler for pf=%u\n", pf); + /* This is a bug and a feature. We cannot unregister + * other handlers, like nfnetlink_inst can */ + nf_log_unregister_pf(pf); + break; + default: + ret = -EINVAL; + break; + } + } else { + if (!inst) { + UDEBUG("no config command, and no instance for " + "group=%u pid=%u =>ENOENT\n", + group_num, NETLINK_CB(skb).pid); + ret = -ENOENT; + goto out_put; + } + + if (inst->peer_pid != NETLINK_CB(skb).pid) { + UDEBUG("no config command, and wrong pid\n"); + ret = -EPERM; + goto out_put; + } + } + + if (nfula[NFULA_CFG_MODE-1]) { + struct nfulnl_msg_config_mode *params; + params = NFA_DATA(nfula[NFULA_CFG_MODE-1]); + + nfulnl_set_mode(inst, params->copy_mode, + ntohs(params->copy_range)); + } + + if (nfula[NFULA_CFG_TIMEOUT-1]) { + u_int32_t timeout = + *(u_int32_t *)NFA_DATA(nfula[NFULA_CFG_TIMEOUT-1]); + + nfulnl_set_timeout(inst, ntohl(timeout)); + } + + if (nfula[NFULA_CFG_NLBUFSIZ-1]) { + u_int32_t nlbufsiz = + *(u_int32_t *)NFA_DATA(nfula[NFULA_CFG_NLBUFSIZ-1]); + + nfulnl_set_nlbufsiz(inst, ntohl(nlbufsiz)); + } + + if (nfula[NFULA_CFG_QTHRESH-1]) { + u_int32_t qthresh = + *(u_int16_t *)NFA_DATA(nfula[NFULA_CFG_QTHRESH-1]); + + nfulnl_set_qthresh(inst, ntohl(qthresh)); + } + +out_put: + instance_put(inst); + return ret; +} + +static struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = { + [NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp, + .attr_count = NFULA_MAX, + .cap_required = CAP_NET_ADMIN, }, + [NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config, + .attr_count = NFULA_CFG_MAX, + .cap_required = CAP_NET_ADMIN }, +}; + +static struct nfnetlink_subsystem nfulnl_subsys = { + .name = "log", + .subsys_id = NFNL_SUBSYS_ULOG, + .cb_count = NFULNL_MSG_MAX, + .cb = nfulnl_cb, +}; + +#ifdef CONFIG_PROC_FS +struct iter_state { + unsigned int bucket; +}; + +static struct hlist_node *get_first(struct seq_file *seq) +{ + struct iter_state *st = seq->private; + + if (!st) + return NULL; + + for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { + if (!hlist_empty(&instance_table[st->bucket])) + return instance_table[st->bucket].first; + } + return NULL; +} + +static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) +{ + struct iter_state *st = seq->private; + + h = h->next; + while (!h) { + if (++st->bucket >= INSTANCE_BUCKETS) + return NULL; + + h = instance_table[st->bucket].first; + } + return h; +} + +static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) +{ + struct hlist_node *head; + head = get_first(seq); + + if (head) + while (pos && (head = get_next(seq, head))) + pos--; + return pos ? NULL : head; +} + +static void *seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock_bh(&instances_lock); + return get_idx(seq, *pos); +} + +static void *seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + return get_next(s, v); +} + +static void seq_stop(struct seq_file *s, void *v) +{ + read_unlock_bh(&instances_lock); +} + +static int seq_show(struct seq_file *s, void *v) +{ + const struct nfulnl_instance *inst = v; + + return seq_printf(s, "%5d %6d %5d %1d %5d %6d %2d\n", + inst->group_num, + inst->peer_pid, inst->qlen, + inst->copy_mode, inst->copy_range, + inst->flushtimeout, atomic_read(&inst->use)); +} + +static struct seq_operations nful_seq_ops = { + .start = seq_start, + .next = seq_next, + .stop = seq_stop, + .show = seq_show, +}; + +static int nful_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct iter_state *is; + int ret; + + is = kmalloc(sizeof(*is), GFP_KERNEL); + if (!is) + return -ENOMEM; + memset(is, 0, sizeof(*is)); + ret = seq_open(file, &nful_seq_ops); + if (ret < 0) + goto out_free; + seq = file->private_data; + seq->private = is; + return ret; +out_free: + kfree(is); + return ret; +} + +static struct file_operations nful_file_ops = { + .owner = THIS_MODULE, + .open = nful_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif /* PROC_FS */ + +static int +init_or_cleanup(int init) +{ + int i, status = -ENOMEM; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc_nful; +#endif + + if (!init) + goto cleanup; + + for (i = 0; i < INSTANCE_BUCKETS; i++) + INIT_HLIST_HEAD(&instance_table[i]); + + /* it's not really all that important to have a random value, so + * we can do this from the init function, even if there hasn't + * been that much entropy yet */ + get_random_bytes(&hash_init, sizeof(hash_init)); + + netlink_register_notifier(&nfulnl_rtnl_notifier); + status = nfnetlink_subsys_register(&nfulnl_subsys); + if (status < 0) { + printk(KERN_ERR "log: failed to create netlink socket\n"); + goto cleanup_netlink_notifier; + } + +#ifdef CONFIG_PROC_FS + proc_nful = create_proc_entry("nfnetlink_log", 0440, + proc_net_netfilter); + if (!proc_nful) + goto cleanup_subsys; + proc_nful->proc_fops = &nful_file_ops; +#endif + + return status; + +cleanup: + nf_log_unregister_logger(&nfulnl_logger); +#ifdef CONFIG_PROC_FS + remove_proc_entry("nfnetlink_log", proc_net_netfilter); +cleanup_subsys: +#endif + nfnetlink_subsys_unregister(&nfulnl_subsys); +cleanup_netlink_notifier: + netlink_unregister_notifier(&nfulnl_rtnl_notifier); + return status; +} + +static int __init init(void) +{ + + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +MODULE_DESCRIPTION("netfilter userspace logging"); +MODULE_AUTHOR("Harald Welte "); +MODULE_LICENSE("GPL"); + +module_init(init); +module_exit(fini); diff -puN /dev/null net/netfilter/nfnetlink_queue.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ devel-akpm/net/netfilter/nfnetlink_queue.c 2005-08-06 23:39:10.000000000 -0700 @@ -0,0 +1,1072 @@ +/* + * This is a module which is used for queueing packets and communicating with + * userspace via nfetlink. + * + * (C) 2005 by Harald Welte + * + * Based on the old ipv4-only ip_queue.c: + * (C) 2000-2002 James Morris + * (C) 2003-2005 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define NFQNL_QMAX_DEFAULT 1024 + +#if 0 +#define QDEBUG(x, args ...) printk(KERN_DEBUG "%s(%d):%s(): " x, \ + __FILE__, __LINE__, __FUNCTION__, \ + ## args) +#else +#define QDEBUG(x, ...) +#endif + +struct nfqnl_queue_entry { + struct list_head list; + struct nf_info *info; + struct sk_buff *skb; + unsigned int id; +}; + +struct nfqnl_instance { + struct hlist_node hlist; /* global list of queues */ + atomic_t use; + + int peer_pid; + unsigned int queue_maxlen; + unsigned int copy_range; + unsigned int queue_total; + unsigned int queue_dropped; + unsigned int queue_user_dropped; + + atomic_t id_sequence; /* 'sequence' of pkt ids */ + + u_int16_t queue_num; /* number of this queue */ + u_int8_t copy_mode; + + spinlock_t lock; + + struct list_head queue_list; /* packets in queue */ +}; + +typedef int (*nfqnl_cmpfn)(struct nfqnl_queue_entry *, unsigned long); + +static DEFINE_RWLOCK(instances_lock); + +u_int64_t htonll(u_int64_t in) +{ + u_int64_t out; + int i; + + for (i = 0; i < sizeof(u_int64_t); i++) + ((u_int8_t *)&out)[sizeof(u_int64_t)-1] = ((u_int8_t *)&in)[i]; + + return out; +} + +#define INSTANCE_BUCKETS 16 +static struct hlist_head instance_table[INSTANCE_BUCKETS]; + +static inline u_int8_t instance_hashfn(u_int16_t queue_num) +{ + return ((queue_num >> 8) | queue_num) % INSTANCE_BUCKETS; +} + +static struct nfqnl_instance * +__instance_lookup(u_int16_t queue_num) +{ + struct hlist_head *head; + struct hlist_node *pos; + struct nfqnl_instance *inst; + + head = &instance_table[instance_hashfn(queue_num)]; + hlist_for_each_entry(inst, pos, head, hlist) { + if (inst->queue_num == queue_num) + return inst; + } + return NULL; +} + +static struct nfqnl_instance * +instance_lookup_get(u_int16_t queue_num) +{ + struct nfqnl_instance *inst; + + read_lock_bh(&instances_lock); + inst = __instance_lookup(queue_num); + if (inst) + atomic_inc(&inst->use); + read_unlock_bh(&instances_lock); + + return inst; +} + +static void +instance_put(struct nfqnl_instance *inst) +{ + if (inst && atomic_dec_and_test(&inst->use)) { + QDEBUG("kfree(inst=%p)\n", inst); + kfree(inst); + } +} + +static struct nfqnl_instance * +instance_create(u_int16_t queue_num, int pid) +{ + struct nfqnl_instance *inst; + + QDEBUG("entering for queue_num=%u, pid=%d\n", queue_num, pid); + + write_lock_bh(&instances_lock); + if (__instance_lookup(queue_num)) { + inst = NULL; + QDEBUG("aborting, instance already exists\n"); + goto out_unlock; + } + + inst = kmalloc(sizeof(*inst), GFP_ATOMIC); + if (!inst) + goto out_unlock; + + memset(inst, 0, sizeof(*inst)); + inst->queue_num = queue_num; + inst->peer_pid = pid; + inst->queue_maxlen = NFQNL_QMAX_DEFAULT; + inst->copy_range = 0xfffff; + inst->copy_mode = NFQNL_COPY_NONE; + atomic_set(&inst->id_sequence, 0); + /* needs to be two, since we _put() after creation */ + atomic_set(&inst->use, 2); + inst->lock = SPIN_LOCK_UNLOCKED; + INIT_LIST_HEAD(&inst->queue_list); + + if (!try_module_get(THIS_MODULE)) + goto out_free; + + hlist_add_head(&inst->hlist, + &instance_table[instance_hashfn(queue_num)]); + + write_unlock_bh(&instances_lock); + + QDEBUG("successfully created new instance\n"); + + return inst; + +out_free: + kfree(inst); +out_unlock: + write_unlock_bh(&instances_lock); + return NULL; +} + +static void nfqnl_flush(struct nfqnl_instance *queue, int verdict); + +static void +_instance_destroy2(struct nfqnl_instance *inst, int lock) +{ + /* first pull it out of the global list */ + if (lock) + write_lock_bh(&instances_lock); + + QDEBUG("removing instance %p (queuenum=%u) from hash\n", + inst, inst->queue_num); + hlist_del(&inst->hlist); + + if (lock) + write_unlock_bh(&instances_lock); + + /* then flush all pending skbs from the queue */ + nfqnl_flush(inst, NF_DROP); + + /* and finally put the refcount */ + instance_put(inst); + + module_put(THIS_MODULE); +} + +static inline void +__instance_destroy(struct nfqnl_instance *inst) +{ + _instance_destroy2(inst, 0); +} + +static inline void +instance_destroy(struct nfqnl_instance *inst) +{ + _instance_destroy2(inst, 1); +} + + + +static void +issue_verdict(struct nfqnl_queue_entry *entry, int verdict) +{ + QDEBUG("entering for entry %p, verdict %u\n", entry, verdict); + + /* TCP input path (and probably other bits) assume to be called + * from softirq context, not from syscall, like issue_verdict is + * called. TCP input path deadlocks with locks taken from timer + * softirq, e.g. We therefore emulate this by local_bh_disable() */ + + local_bh_disable(); + nf_reinject(entry->skb, entry->info, verdict); + local_bh_enable(); + + kfree(entry); +} + +static inline void +__enqueue_entry(struct nfqnl_instance *queue, + struct nfqnl_queue_entry *entry) +{ + list_add(&entry->list, &queue->queue_list); + queue->queue_total++; +} + +/* + * Find and return a queued entry matched by cmpfn, or return the last + * entry if cmpfn is NULL. + */ +static inline struct nfqnl_queue_entry * +__find_entry(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, + unsigned long data) +{ + struct list_head *p; + + list_for_each_prev(p, &queue->queue_list) { + struct nfqnl_queue_entry *entry = (struct nfqnl_queue_entry *)p; + + if (!cmpfn || cmpfn(entry, data)) + return entry; + } + return NULL; +} + +static inline void +__dequeue_entry(struct nfqnl_instance *q, struct nfqnl_queue_entry *entry) +{ + list_del(&entry->list); + q->queue_total--; +} + +static inline struct nfqnl_queue_entry * +__find_dequeue_entry(struct nfqnl_instance *queue, + nfqnl_cmpfn cmpfn, unsigned long data) +{ + struct nfqnl_queue_entry *entry; + + entry = __find_entry(queue, cmpfn, data); + if (entry == NULL) + return NULL; + + __dequeue_entry(queue, entry); + return entry; +} + + +static inline void +__nfqnl_flush(struct nfqnl_instance *queue, int verdict) +{ + struct nfqnl_queue_entry *entry; + + while ((entry = __find_dequeue_entry(queue, NULL, 0))) + issue_verdict(entry, verdict); +} + +static inline int +__nfqnl_set_mode(struct nfqnl_instance *queue, + unsigned char mode, unsigned int range) +{ + int status = 0; + + switch (mode) { + case NFQNL_COPY_NONE: + case NFQNL_COPY_META: + queue->copy_mode = mode; + queue->copy_range = 0; + break; + + case NFQNL_COPY_PACKET: + queue->copy_mode = mode; + /* we're using struct nfattr which has 16bit nfa_len */ + if (range > 0xffff) + queue->copy_range = 0xffff; + else + queue->copy_range = range; + break; + + default: + status = -EINVAL; + + } + return status; +} + +static struct nfqnl_queue_entry * +find_dequeue_entry(struct nfqnl_instance *queue, + nfqnl_cmpfn cmpfn, unsigned long data) +{ + struct nfqnl_queue_entry *entry; + + spin_lock_bh(&queue->lock); + entry = __find_dequeue_entry(queue, cmpfn, data); + spin_unlock_bh(&queue->lock); + + return entry; +} + +static void +nfqnl_flush(struct nfqnl_instance *queue, int verdict) +{ + spin_lock_bh(&queue->lock); + __nfqnl_flush(queue, verdict); + spin_unlock_bh(&queue->lock); +} + +static struct sk_buff * +nfqnl_build_packet_message(struct nfqnl_instance *queue, + struct nfqnl_queue_entry *entry, int *errp) +{ + unsigned char *old_tail; + size_t size; + size_t data_len = 0; + struct sk_buff *skb; + struct nfqnl_msg_packet_hdr pmsg; + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned int tmp_uint; + + QDEBUG("entered\n"); + + /* all macros expand to constant values at compile time */ + size = NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_hdr)) + + NLMSG_SPACE(sizeof(u_int32_t)) /* ifindex */ + + NLMSG_SPACE(sizeof(u_int32_t)) /* ifindex */ + + NLMSG_SPACE(sizeof(u_int32_t)) /* mark */ + + NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_hw)) + + NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_timestamp)); + + spin_lock_bh(&queue->lock); + + switch (queue->copy_mode) { + case NFQNL_COPY_META: + case NFQNL_COPY_NONE: + data_len = 0; + break; + + case NFQNL_COPY_PACKET: + if (queue->copy_range == 0 + || queue->copy_range > entry->skb->len) + data_len = entry->skb->len; + else + data_len = queue->copy_range; + + size += NLMSG_SPACE(data_len); + break; + + default: + *errp = -EINVAL; + spin_unlock_bh(&queue->lock); + return NULL; + } + + spin_unlock_bh(&queue->lock); + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + goto nlmsg_failure; + + old_tail= skb->tail; + nlh = NLMSG_PUT(skb, 0, 0, + NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, + sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + nfmsg->nfgen_family = entry->info->pf; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(queue->queue_num); + + pmsg.packet_id = htonl(entry->id); + pmsg.hw_protocol = htons(entry->skb->protocol); + pmsg.hook = entry->info->hook; + + NFA_PUT(skb, NFQA_PACKET_HDR, sizeof(pmsg), &pmsg); + + if (entry->info->indev) { + tmp_uint = htonl(entry->info->indev->ifindex); + NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), &tmp_uint); + } + + if (entry->info->outdev) { + tmp_uint = htonl(entry->info->outdev->ifindex); + NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), &tmp_uint); + } + + if (entry->skb->nfmark) { + tmp_uint = htonl(entry->skb->nfmark); + NFA_PUT(skb, NFQA_MARK, sizeof(u_int32_t), &tmp_uint); + } + + if (entry->info->indev && entry->skb->dev + && entry->skb->dev->hard_header_parse) { + struct nfqnl_msg_packet_hw phw; + + phw.hw_addrlen = + entry->skb->dev->hard_header_parse(entry->skb, + phw.hw_addr); + phw.hw_addrlen = htons(phw.hw_addrlen); + NFA_PUT(skb, NFQA_HWADDR, sizeof(phw), &phw); + } + + if (entry->skb->stamp.tv_sec) { + struct nfqnl_msg_packet_timestamp ts; + + ts.sec = htonll(entry->skb->stamp.tv_sec); + ts.usec = htonll(entry->skb->stamp.tv_usec); + + NFA_PUT(skb, NFQA_TIMESTAMP, sizeof(ts), &ts); + } + + if (data_len) { + struct nfattr *nfa; + int size = NFA_LENGTH(data_len); + + if (skb_tailroom(skb) < (int)NFA_SPACE(data_len)) { + printk(KERN_WARNING "nf_queue: no tailroom!\n"); + goto nlmsg_failure; + } + + nfa = (struct nfattr *)skb_put(skb, NFA_ALIGN(size)); + nfa->nfa_type = NFQA_PAYLOAD; + nfa->nfa_len = size; + + if (skb_copy_bits(entry->skb, 0, NFA_DATA(nfa), data_len)) + BUG(); + } + + nlh->nlmsg_len = skb->tail - old_tail; + return skb; + +nlmsg_failure: +nfattr_failure: + if (skb) + kfree_skb(skb); + *errp = -EINVAL; + if (net_ratelimit()) + printk(KERN_ERR "nf_queue: error creating packet message\n"); + return NULL; +} + +static int +nfqnl_enqueue_packet(struct sk_buff *skb, struct nf_info *info, + unsigned int queuenum, void *data) +{ + int status = -EINVAL; + struct sk_buff *nskb; + struct nfqnl_instance *queue; + struct nfqnl_queue_entry *entry; + + QDEBUG("entered\n"); + + queue = instance_lookup_get(queuenum); + if (!queue) { + QDEBUG("no queue instance matching\n"); + return -EINVAL; + } + + if (queue->copy_mode == NFQNL_COPY_NONE) { + QDEBUG("mode COPY_NONE, aborting\n"); + status = -EAGAIN; + goto err_out_put; + } + + entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + if (entry == NULL) { + if (net_ratelimit()) + printk(KERN_ERR + "nf_queue: OOM in nfqnl_enqueue_packet()\n"); + status = -ENOMEM; + goto err_out_put; + } + + entry->info = info; + entry->skb = skb; + entry->id = atomic_inc_return(&queue->id_sequence); + + nskb = nfqnl_build_packet_message(queue, entry, &status); + if (nskb == NULL) + goto err_out_free; + + spin_lock_bh(&queue->lock); + + if (!queue->peer_pid) + goto err_out_free_nskb; + + if (queue->queue_total >= queue->queue_maxlen) { + queue->queue_dropped++; + status = -ENOSPC; + if (net_ratelimit()) + printk(KERN_WARNING "ip_queue: full at %d entries, " + "dropping packets(s). Dropped: %d\n", + queue->queue_total, queue->queue_dropped); + goto err_out_free_nskb; + } + + /* nfnetlink_unicast will either free the nskb or add it to a socket */ + status = nfnetlink_unicast(nskb, queue->peer_pid, MSG_DONTWAIT); + if (status < 0) { + queue->queue_user_dropped++; + goto err_out_unlock; + } + + __enqueue_entry(queue, entry); + + spin_unlock_bh(&queue->lock); + instance_put(queue); + return status; + +err_out_free_nskb: + kfree_skb(nskb); + +err_out_unlock: + spin_unlock_bh(&queue->lock); + +err_out_free: + kfree(entry); +err_out_put: + instance_put(queue); + return status; +} + +static int +nfqnl_mangle(void *data, int data_len, struct nfqnl_queue_entry *e) +{ + int diff; + + diff = data_len - e->skb->len; + if (diff < 0) + skb_trim(e->skb, data_len); + else if (diff > 0) { + if (data_len > 0xFFFF) + return -EINVAL; + if (diff > skb_tailroom(e->skb)) { + struct sk_buff *newskb; + + newskb = skb_copy_expand(e->skb, + skb_headroom(e->skb), + diff, + GFP_ATOMIC); + if (newskb == NULL) { + printk(KERN_WARNING "ip_queue: OOM " + "in mangle, dropping packet\n"); + return -ENOMEM; + } + if (e->skb->sk) + skb_set_owner_w(newskb, e->skb->sk); + kfree_skb(e->skb); + e->skb = newskb; + } + skb_put(e->skb, diff); + } + if (!skb_make_writable(&e->skb, data_len)) + return -ENOMEM; + memcpy(e->skb->data, data, data_len); + + return 0; +} + +static inline int +id_cmp(struct nfqnl_queue_entry *e, unsigned long id) +{ + return (id == e->id); +} + +static int +nfqnl_set_mode(struct nfqnl_instance *queue, + unsigned char mode, unsigned int range) +{ + int status; + + spin_lock_bh(&queue->lock); + status = __nfqnl_set_mode(queue, mode, range); + spin_unlock_bh(&queue->lock); + + return status; +} + +static int +dev_cmp(struct nfqnl_queue_entry *entry, unsigned long ifindex) +{ + if (entry->info->indev) + if (entry->info->indev->ifindex == ifindex) + return 1; + + if (entry->info->outdev) + if (entry->info->outdev->ifindex == ifindex) + return 1; + + return 0; +} + +/* drop all packets with either indev or outdev == ifindex from all queue + * instances */ +static void +nfqnl_dev_drop(int ifindex) +{ + int i; + + QDEBUG("entering for ifindex %u\n", ifindex); + + /* this only looks like we have to hold the readlock for a way too long + * time, issue_verdict(), nf_reinject(), ... - but we always only + * issue NF_DROP, which is processed directly in nf_reinject() */ + read_lock_bh(&instances_lock); + + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct hlist_node *tmp; + struct nfqnl_instance *inst; + struct hlist_head *head = &instance_table[i]; + + hlist_for_each_entry(inst, tmp, head, hlist) { + struct nfqnl_queue_entry *entry; + while ((entry = find_dequeue_entry(inst, dev_cmp, + ifindex)) != NULL) + issue_verdict(entry, NF_DROP); + } + } + + read_unlock_bh(&instances_lock); +} + +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) + +static int +nfqnl_rcv_dev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + /* Drop any packets associated with the downed device */ + if (event == NETDEV_DOWN) + nfqnl_dev_drop(dev->ifindex); + return NOTIFY_DONE; +} + +static struct notifier_block nfqnl_dev_notifier = { + .notifier_call = nfqnl_rcv_dev_event, +}; + +static int +nfqnl_rcv_nl_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netlink_notify *n = ptr; + + if (event == NETLINK_URELEASE && + n->protocol == NETLINK_NETFILTER && n->pid) { + int i; + + /* destroy all instances for this pid */ + write_lock_bh(&instances_lock); + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct hlist_node *tmp, *t2; + struct nfqnl_instance *inst; + struct hlist_head *head = &instance_table[i]; + + hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { + if (n->pid == inst->peer_pid) + __instance_destroy(inst); + } + } + write_unlock_bh(&instances_lock); + } + return NOTIFY_DONE; +} + +static struct notifier_block nfqnl_rtnl_notifier = { + .notifier_call = nfqnl_rcv_nl_event, +}; + +static const int nfqa_verdict_min[NFQA_MAX] = { + [NFQA_VERDICT_HDR-1] = sizeof(struct nfqnl_msg_verdict_hdr), + [NFQA_MARK-1] = sizeof(u_int32_t), + [NFQA_PAYLOAD-1] = 0, +}; + +static int +nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) +{ + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int16_t queue_num = ntohs(nfmsg->res_id); + + struct nfqnl_msg_verdict_hdr *vhdr; + struct nfqnl_instance *queue; + unsigned int verdict; + struct nfqnl_queue_entry *entry; + int err; + + if (nfattr_bad_size(nfqa, NFQA_MAX, nfqa_verdict_min)) { + QDEBUG("bad attribute size\n"); + return -EINVAL; + } + + queue = instance_lookup_get(queue_num); + if (!queue) + return -ENODEV; + + if (queue->peer_pid != NETLINK_CB(skb).pid) { + err = -EPERM; + goto err_out_put; + } + + if (!nfqa[NFQA_VERDICT_HDR-1]) { + err = -EINVAL; + goto err_out_put; + } + + vhdr = NFA_DATA(nfqa[NFQA_VERDICT_HDR-1]); + verdict = ntohl(vhdr->verdict); + + if ((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT) { + err = -EINVAL; + goto err_out_put; + } + + entry = find_dequeue_entry(queue, id_cmp, ntohl(vhdr->id)); + if (entry == NULL) { + err = -ENOENT; + goto err_out_put; + } + + if (nfqa[NFQA_PAYLOAD-1]) { + if (nfqnl_mangle(NFA_DATA(nfqa[NFQA_PAYLOAD-1]), + NFA_PAYLOAD(nfqa[NFQA_PAYLOAD-1]), entry) < 0) + verdict = NF_DROP; + } + + if (nfqa[NFQA_MARK-1]) + skb->nfmark = ntohl(*(u_int32_t *)NFA_DATA(nfqa[NFQA_MARK-1])); + + issue_verdict(entry, verdict); + instance_put(queue); + return 0; + +err_out_put: + instance_put(queue); + return err; +} + +static int +nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) +{ + return -ENOTSUPP; +} + +static const int nfqa_cfg_min[NFQA_CFG_MAX] = { + [NFQA_CFG_CMD-1] = sizeof(struct nfqnl_msg_config_cmd), + [NFQA_CFG_PARAMS-1] = sizeof(struct nfqnl_msg_config_params), +}; + +static int +nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) +{ + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int16_t queue_num = ntohs(nfmsg->res_id); + struct nfqnl_instance *queue; + int ret = 0; + + QDEBUG("entering for msg %u\n", NFNL_MSG_TYPE(nlh->nlmsg_type)); + + if (nfattr_bad_size(nfqa, NFQA_CFG_MAX, nfqa_cfg_min)) { + QDEBUG("bad attribute size\n"); + return -EINVAL; + } + + queue = instance_lookup_get(queue_num); + if (nfqa[NFQA_CFG_CMD-1]) { + struct nfqnl_msg_config_cmd *cmd; + cmd = NFA_DATA(nfqa[NFQA_CFG_CMD-1]); + QDEBUG("found CFG_CMD\n"); + + switch (cmd->command) { + case NFQNL_CFG_CMD_BIND: + if (queue) + return -EBUSY; + + queue = instance_create(queue_num, NETLINK_CB(skb).pid); + if (!queue) + return -EINVAL; + break; + case NFQNL_CFG_CMD_UNBIND: + if (!queue) + return -ENODEV; + + if (queue->peer_pid != NETLINK_CB(skb).pid) { + ret = -EPERM; + goto out_put; + } + + instance_destroy(queue); + break; + case NFQNL_CFG_CMD_PF_BIND: + QDEBUG("registering queue handler for pf=%u\n", + ntohs(cmd->pf)); + ret = nf_register_queue_handler(ntohs(cmd->pf), + nfqnl_enqueue_packet, + NULL); + + break; + case NFQNL_CFG_CMD_PF_UNBIND: + QDEBUG("unregistering queue handler for pf=%u\n", + ntohs(cmd->pf)); + /* This is a bug and a feature. We can unregister + * other handlers(!) */ + ret = nf_unregister_queue_handler(ntohs(cmd->pf)); + break; + default: + ret = -EINVAL; + break; + } + } else { + if (!queue) { + QDEBUG("no config command, and no instance ENOENT\n"); + ret = -ENOENT; + goto out_put; + } + + if (queue->peer_pid != NETLINK_CB(skb).pid) { + QDEBUG("no config command, and wrong pid\n"); + ret = -EPERM; + goto out_put; + } + } + + if (nfqa[NFQA_CFG_PARAMS-1]) { + struct nfqnl_msg_config_params *params; + params = NFA_DATA(nfqa[NFQA_CFG_PARAMS-1]); + + nfqnl_set_mode(queue, params->copy_mode, + ntohl(params->copy_range)); + } + +out_put: + instance_put(queue); + return ret; +} + +static struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { + [NFQNL_MSG_PACKET] = { .call = nfqnl_recv_unsupp, + .attr_count = NFQA_MAX, + .cap_required = CAP_NET_ADMIN }, + [NFQNL_MSG_VERDICT] = { .call = nfqnl_recv_verdict, + .attr_count = NFQA_MAX, + .cap_required = CAP_NET_ADMIN }, + [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, + .attr_count = NFQA_CFG_MAX, + .cap_required = CAP_NET_ADMIN }, +}; + +static struct nfnetlink_subsystem nfqnl_subsys = { + .name = "nf_queue", + .subsys_id = NFNL_SUBSYS_QUEUE, + .cb_count = NFQNL_MSG_MAX, + .cb = nfqnl_cb, +}; + +#ifdef CONFIG_PROC_FS +struct iter_state { + unsigned int bucket; +}; + +static struct hlist_node *get_first(struct seq_file *seq) +{ + struct iter_state *st = seq->private; + + if (!st) + return NULL; + + for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { + if (!hlist_empty(&instance_table[st->bucket])) + return instance_table[st->bucket].first; + } + return NULL; +} + +static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) +{ + struct iter_state *st = seq->private; + + h = h->next; + while (!h) { + if (++st->bucket >= INSTANCE_BUCKETS) + return NULL; + + h = instance_table[st->bucket].first; + } + return h; +} + +static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) +{ + struct hlist_node *head; + head = get_first(seq); + + if (head) + while (pos && (head = get_next(seq, head))) + pos--; + return pos ? NULL : head; +} + +static void *seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock_bh(&instances_lock); + return get_idx(seq, *pos); +} + +static void *seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + return get_next(s, v); +} + +static void seq_stop(struct seq_file *s, void *v) +{ + read_unlock_bh(&instances_lock); +} + +static int seq_show(struct seq_file *s, void *v) +{ + const struct nfqnl_instance *inst = v; + + return seq_printf(s, "%5d %6d %5d %1d %5d %5d %5d %8d %2d\n", + inst->queue_num, + inst->peer_pid, inst->queue_total, + inst->copy_mode, inst->copy_range, + inst->queue_dropped, inst->queue_user_dropped, + atomic_read(&inst->id_sequence), + atomic_read(&inst->use)); +} + +static struct seq_operations nfqnl_seq_ops = { + .start = seq_start, + .next = seq_next, + .stop = seq_stop, + .show = seq_show, +}; + +static int nfqnl_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct iter_state *is; + int ret; + + is = kmalloc(sizeof(*is), GFP_KERNEL); + if (!is) + return -ENOMEM; + memset(is, 0, sizeof(*is)); + ret = seq_open(file, &nfqnl_seq_ops); + if (ret < 0) + goto out_free; + seq = file->private_data; + seq->private = is; + return ret; +out_free: + kfree(is); + return ret; +} + +static struct file_operations nfqnl_file_ops = { + .owner = THIS_MODULE, + .open = nfqnl_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif /* PROC_FS */ + +static int +init_or_cleanup(int init) +{ + int i, status = -ENOMEM; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc_nfqueue; +#endif + + if (!init) + goto cleanup; + + for (i = 0; i < INSTANCE_BUCKETS; i++) + INIT_HLIST_HEAD(&instance_table[i]); + + netlink_register_notifier(&nfqnl_rtnl_notifier); + status = nfnetlink_subsys_register(&nfqnl_subsys); + if (status < 0) { + printk(KERN_ERR "nf_queue: failed to create netlink socket\n"); + goto cleanup_netlink_notifier; + } + +#ifdef CONFIG_PROC_FS + proc_nfqueue = create_proc_entry("nfnetlink_queue", 0440, + proc_net_netfilter); + if (!proc_nfqueue) + goto cleanup_subsys; + proc_nfqueue->proc_fops = &nfqnl_file_ops; +#endif + + register_netdevice_notifier(&nfqnl_dev_notifier); + + return status; + +cleanup: + nf_unregister_queue_handlers(nfqnl_enqueue_packet); + unregister_netdevice_notifier(&nfqnl_dev_notifier); +#ifdef CONFIG_PROC_FS + remove_proc_entry("nfnetlink_queue", proc_net_netfilter); +cleanup_subsys: +#endif + nfnetlink_subsys_unregister(&nfqnl_subsys); +cleanup_netlink_notifier: + netlink_unregister_notifier(&nfqnl_rtnl_notifier); + return status; +} + +static int __init init(void) +{ + + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +MODULE_DESCRIPTION("netfilter packet queue handler"); +MODULE_AUTHOR("Harald Welte "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE); + +module_init(init); +module_exit(fini); diff -puN net/netlink/af_netlink.c~git-net net/netlink/af_netlink.c --- devel/net/netlink/af_netlink.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/netlink/af_netlink.c 2005-08-06 23:39:10.000000000 -0700 @@ -13,7 +13,12 @@ * added netlink_proto_exit * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo * use nlk_sk, as sk->protinfo is on a diet 8) - * + * Fri Jul 22 19:51:12 MEST 2005 Harald Welte + * - inc module use count of module that owns + * the kernel socket in case userspace opens + * socket of same protocol + * - remove all module support, since netlink is + * mandatory if CONFIG_NET=y these days */ #include @@ -92,6 +97,7 @@ struct netlink_table { struct nl_pid_hash hash; struct hlist_head mc_list; unsigned int nl_nonroot; + struct proto_ops *p_ops; }; static struct netlink_table *nl_table; @@ -341,7 +347,21 @@ static int netlink_create(struct socket if (protocol<0 || protocol >= MAX_LINKS) return -EPROTONOSUPPORT; - sock->ops = &netlink_ops; + netlink_table_grab(); + if (!nl_table[protocol].hash.entries) { +#ifdef CONFIG_KMOD + /* We do 'best effort'. If we find a matching module, + * it is loaded. If not, we don't return an error to + * allow pure userspace<->userspace communication. -HW + */ + netlink_table_ungrab(); + request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol); + netlink_table_grab(); +#endif + } + netlink_table_ungrab(); + + sock->ops = nl_table[protocol].p_ops; sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1); if (!sk) @@ -394,6 +414,22 @@ static int netlink_release(struct socket }; notifier_call_chain(&netlink_chain, NETLINK_URELEASE, &n); } + + /* When this is a kernel socket, we need to remove the owner pointer, + * since we don't know whether the module will be dying at any given + * point - HW + */ + if (!nlk->pid) { + struct proto_ops *p_tmp; + + netlink_table_grab(); + p_tmp = nl_table[sk->sk_protocol].p_ops; + if (p_tmp != &netlink_ops) { + nl_table[sk->sk_protocol].p_ops = &netlink_ops; + kfree(p_tmp); + } + netlink_table_ungrab(); + } sock_put(sk); return 0; @@ -1023,8 +1059,9 @@ static void netlink_data_ready(struct so */ struct sock * -netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len)) +netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len), struct module *module) { + struct proto_ops *p_ops; struct socket *sock; struct sock *sk; @@ -1034,22 +1071,63 @@ netlink_kernel_create(int unit, void (*i if (unit<0 || unit>=MAX_LINKS) return NULL; + /* Do a quick check, to make us not go down to netlink_insert() + * if protocol already has kernel socket. + */ + sk = netlink_lookup(unit, 0); + if (unlikely(sk)) { + sock_put(sk); + return NULL; + } + if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; + sk = NULL; + if (module) { + /* Every registering protocol implemented in a module needs + * it's own p_ops, since the socket code cannot deal with + * module refcounting otherwise. -HW + */ + p_ops = kmalloc(sizeof(*p_ops), GFP_KERNEL); + if (!p_ops) + goto out_sock_release; + + memcpy(p_ops, &netlink_ops, sizeof(*p_ops)); + p_ops->owner = module; + } else + p_ops = &netlink_ops; + + netlink_table_grab(); + nl_table[unit].p_ops = p_ops; + netlink_table_ungrab(); + if (netlink_create(sock, unit) < 0) { - sock_release(sock); - return NULL; + sk = NULL; + goto out_kfree_p_ops; } + sk = sock->sk; sk->sk_data_ready = netlink_data_ready; if (input) nlk_sk(sk)->data_ready = input; if (netlink_insert(sk, 0)) { - sock_release(sock); - return NULL; + sk = NULL; + goto out_kfree_p_ops; + } + + return sk; + +out_kfree_p_ops: + netlink_table_grab(); + if (nl_table[unit].p_ops != &netlink_ops) { + kfree(nl_table[unit].p_ops); + nl_table[unit].p_ops = &netlink_ops; } + netlink_table_ungrab(); +out_sock_release: + sock_release(sock); return sk; } @@ -1413,6 +1491,8 @@ enomem: for (i = 0; i < MAX_LINKS; i++) { struct nl_pid_hash *hash = &nl_table[i].hash; + nl_table[i].p_ops = &netlink_ops; + hash->table = nl_pid_hash_alloc(1 * sizeof(*hash->table)); if (!hash->table) { while (i-- > 0) @@ -1438,21 +1518,7 @@ out: return err; } -static void __exit netlink_proto_exit(void) -{ - sock_unregister(PF_NETLINK); - proc_net_remove("netlink"); - kfree(nl_table); - nl_table = NULL; - proto_unregister(&netlink_proto); -} - core_initcall(netlink_proto_init); -module_exit(netlink_proto_exit); - -MODULE_LICENSE("GPL"); - -MODULE_ALIAS_NETPROTO(PF_NETLINK); EXPORT_SYMBOL(netlink_ack); EXPORT_SYMBOL(netlink_broadcast); diff -puN net/netrom/af_netrom.c~git-net net/netrom/af_netrom.c --- devel/net/netrom/af_netrom.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/netrom/af_netrom.c 2005-08-06 23:39:10.000000000 -0700 @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include #include diff -puN net/netrom/nr_dev.c~git-net net/netrom/nr_dev.c --- devel/net/netrom/nr_dev.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/netrom/nr_dev.c 2005-08-06 23:39:10.000000000 -0700 @@ -64,7 +64,7 @@ int nr_rx_ip(struct sk_buff *skb, struct skb->nh.raw = skb->data; skb->pkt_type = PACKET_HOST; - ip_rcv(skb, skb->dev, NULL); + ip_rcv(skb, skb->dev, NULL, skb->dev); return 1; } diff -puN net/netrom/nr_in.c~git-net net/netrom/nr_in.c --- devel/net/netrom/nr_in.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/netrom/nr_in.c 2005-08-06 23:39:10.000000000 -0700 @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include /* For ip_rcv */ #include #include diff -puN net/netrom/nr_subr.c~git-net net/netrom/nr_subr.c --- devel/net/netrom/nr_subr.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/netrom/nr_subr.c 2005-08-06 23:39:10.000000000 -0700 @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include @@ -77,7 +77,7 @@ void nr_requeue_frames(struct sock *sk) if (skb_prev == NULL) skb_queue_head(&sk->sk_write_queue, skb); else - skb_append(skb_prev, skb); + skb_append(skb_prev, skb, &sk->sk_write_queue); skb_prev = skb; } } diff -puN net/netrom/nr_timer.c~git-net net/netrom/nr_timer.c --- devel/net/netrom/nr_timer.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/netrom/nr_timer.c 2005-08-06 23:39:10.000000000 -0700 @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff -puN net/packet/af_packet.c~git-net net/packet/af_packet.c --- devel/net/packet/af_packet.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/packet/af_packet.c 2005-08-06 23:39:10.000000000 -0700 @@ -241,7 +241,7 @@ static struct proto_ops packet_ops; #ifdef CONFIG_SOCK_PACKET static struct proto_ops packet_ops_spkt; -static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct sock *sk; struct sockaddr_pkt *spkt; @@ -441,7 +441,7 @@ static inline unsigned run_filter(struct we will not harm anyone. */ -static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct sock *sk; struct sockaddr_ll *sll; @@ -546,7 +546,7 @@ drop: } #ifdef CONFIG_PACKET_MMAP -static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct sock *sk; struct packet_sock *po; diff -puN net/rose/af_rose.c~git-net net/rose/af_rose.c --- devel/net/rose/af_rose.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/rose/af_rose.c 2005-08-06 23:39:10.000000000 -0700 @@ -41,7 +41,7 @@ #include #include #include -#include +#include #include #include diff -puN net/rose/rose_in.c~git-net net/rose/rose_in.c --- devel/net/rose/rose_in.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/rose/rose_in.c 2005-08-06 23:39:10.000000000 -0700 @@ -27,7 +27,7 @@ #include #include #include /* For ip_rcv */ -#include +#include #include #include #include diff -puN net/rose/rose_route.c~git-net net/rose/rose_route.c --- devel/net/rose/rose_route.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/rose/rose_route.c 2005-08-06 23:39:10.000000000 -0700 @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff -puN net/rose/rose_subr.c~git-net net/rose/rose_subr.c --- devel/net/rose/rose_subr.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/rose/rose_subr.c 2005-08-06 23:39:10.000000000 -0700 @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include @@ -74,7 +74,7 @@ void rose_requeue_frames(struct sock *sk if (skb_prev == NULL) skb_queue_head(&sk->sk_write_queue, skb); else - skb_append(skb_prev, skb); + skb_append(skb_prev, skb, &sk->sk_write_queue); skb_prev = skb; } } diff -puN net/rose/rose_timer.c~git-net net/rose/rose_timer.c --- devel/net/rose/rose_timer.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/rose/rose_timer.c 2005-08-06 23:39:10.000000000 -0700 @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff -puN net/sched/act_api.c~git-net net/sched/act_api.c --- devel/net/sched/act_api.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/sched/act_api.c 2005-08-06 23:39:10.000000000 -0700 @@ -165,7 +165,7 @@ int tcf_action_exec(struct sk_buff *skb, while ((a = act) != NULL) { repeat: if (a->ops && a->ops->act) { - ret = a->ops->act(&skb, a); + ret = a->ops->act(&skb, a, res); if (TC_MUNGED & skb->tc_verd) { /* copied already, allow trampling */ skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); @@ -179,11 +179,6 @@ repeat: act = a->next; } exec_done: - if (skb->tc_classid > 0) { - res->classid = skb->tc_classid; - res->class = 0; - skb->tc_classid = 0; - } return ret; } diff -puN net/sched/gact.c~git-net net/sched/gact.c --- devel/net/sched/gact.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/sched/gact.c 2005-08-06 23:39:10.000000000 -0700 @@ -135,7 +135,7 @@ tcf_gact_cleanup(struct tc_action *a, in } static int -tcf_gact(struct sk_buff **pskb, struct tc_action *a) +tcf_gact(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) { struct tcf_gact *p = PRIV(a, gact); struct sk_buff *skb = *pskb; diff -puN net/sched/ipt.c~git-net net/sched/ipt.c --- devel/net/sched/ipt.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/sched/ipt.c 2005-08-06 23:39:10.000000000 -0700 @@ -201,7 +201,7 @@ tcf_ipt_cleanup(struct tc_action *a, int } static int -tcf_ipt(struct sk_buff **pskb, struct tc_action *a) +tcf_ipt(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) { int ret = 0, result = 0; struct tcf_ipt *p = PRIV(a, ipt); diff -puN net/sched/mirred.c~git-net net/sched/mirred.c --- devel/net/sched/mirred.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/sched/mirred.c 2005-08-06 23:39:10.000000000 -0700 @@ -158,7 +158,7 @@ tcf_mirred_cleanup(struct tc_action *a, } static int -tcf_mirred(struct sk_buff **pskb, struct tc_action *a) +tcf_mirred(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) { struct tcf_mirred *p = PRIV(a, mirred); struct net_device *dev; diff -puN net/sched/pedit.c~git-net net/sched/pedit.c --- devel/net/sched/pedit.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/sched/pedit.c 2005-08-06 23:39:10.000000000 -0700 @@ -130,7 +130,7 @@ tcf_pedit_cleanup(struct tc_action *a, i } static int -tcf_pedit(struct sk_buff **pskb, struct tc_action *a) +tcf_pedit(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) { struct tcf_pedit *p = PRIV(a, pedit); struct sk_buff *skb = *pskb; diff -puN net/sched/police.c~git-net net/sched/police.c --- devel/net/sched/police.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/sched/police.c 2005-08-06 23:39:10.000000000 -0700 @@ -284,7 +284,8 @@ static int tcf_act_police_cleanup(struct return 0; } -static int tcf_act_police(struct sk_buff **pskb, struct tc_action *a) +static int tcf_act_police(struct sk_buff **pskb, struct tc_action *a, + struct tcf_result *res) { psched_time_t now; struct sk_buff *skb = *pskb; diff -puN net/sched/simple.c~git-net net/sched/simple.c --- devel/net/sched/simple.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/sched/simple.c 2005-08-06 23:39:10.000000000 -0700 @@ -44,7 +44,7 @@ static DEFINE_RWLOCK(simp_lock); #include #include -static int tcf_simp(struct sk_buff **pskb, struct tc_action *a) +static int tcf_simp(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) { struct sk_buff *skb = *pskb; struct tcf_defact *p = PRIV(a, defact); diff -puN net/sctp/ipv6.c~git-net net/sctp/ipv6.c --- devel/net/sctp/ipv6.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/sctp/ipv6.c 2005-08-06 23:39:10.000000000 -0700 @@ -66,8 +66,8 @@ #include #include -#include #include +#include #include #include #include @@ -641,10 +641,7 @@ static struct sock *sctp_v6_create_accep else newinet->pmtudisc = IP_PMTUDISC_WANT; -#ifdef INET_REFCNT_DEBUG - atomic_inc(&inet6_sock_nr); - atomic_inc(&inet_sock_nr); -#endif + sk_refcnt_debug_inc(newsk); if (newsk->sk_prot->init(newsk)) { sk_common_release(newsk); diff -puN net/sctp/protocol.c~git-net net/sctp/protocol.c --- devel/net/sctp/protocol.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/sctp/protocol.c 2005-08-06 23:39:10.000000000 -0700 @@ -593,9 +593,7 @@ static struct sock *sctp_v4_create_accep newinet->mc_index = 0; newinet->mc_list = NULL; -#ifdef INET_REFCNT_DEBUG - atomic_inc(&inet_sock_nr); -#endif + sk_refcnt_debug_inc(newsk); if (newsk->sk_prot->init(newsk)) { sk_common_release(newsk); diff -puN net/sctp/socket.c~git-net net/sctp/socket.c --- devel/net/sctp/socket.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/sctp/socket.c 2005-08-06 23:39:10.000000000 -0700 @@ -4892,7 +4892,7 @@ static void sctp_sock_migrate(struct soc sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) { event = sctp_skb2event(skb); if (event->asoc == assoc) { - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &oldsk->sk_receive_queue); __skb_queue_tail(&newsk->sk_receive_queue, skb); } } @@ -4921,7 +4921,7 @@ static void sctp_sock_migrate(struct soc sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) { event = sctp_skb2event(skb); if (event->asoc == assoc) { - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &oldsp->pd_lobby); __skb_queue_tail(queue, skb); } } diff -puN net/sctp/ulpqueue.c~git-net net/sctp/ulpqueue.c --- devel/net/sctp/ulpqueue.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/sctp/ulpqueue.c 2005-08-06 23:39:10.000000000 -0700 @@ -50,9 +50,9 @@ /* Forward declarations for internal helpers. */ static struct sctp_ulpevent * sctp_ulpq_reasm(struct sctp_ulpq *ulpq, - struct sctp_ulpevent *); + struct sctp_ulpevent *); static struct sctp_ulpevent * sctp_ulpq_order(struct sctp_ulpq *, - struct sctp_ulpevent *); + struct sctp_ulpevent *); /* 1st Level Abstractions */ @@ -125,7 +125,9 @@ int sctp_ulpq_tail_data(struct sctp_ulpq event = sctp_ulpq_order(ulpq, event); } - /* Send event to the ULP. */ + /* Send event to the ULP. 'event' is the sctp_ulpevent for + * very first SKB on the 'temp' list. + */ if (event) sctp_ulpq_tail_event(ulpq, event); @@ -158,14 +160,18 @@ static int sctp_ulpq_clear_pd(struct sct return sctp_clear_pd(ulpq->asoc->base.sk); } - - +/* If the SKB of 'event' is on a list, it is the first such member + * of that list. + */ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sock *sk = ulpq->asoc->base.sk; - struct sk_buff_head *queue; + struct sk_buff_head *queue, *skb_list; + struct sk_buff *skb = sctp_event2skb(event); int clear_pd = 0; + skb_list = (struct sk_buff_head *) skb->prev; + /* If the socket is just going to throw this away, do not * even try to deliver it. */ @@ -197,10 +203,10 @@ int sctp_ulpq_tail_event(struct sctp_ulp /* If we are harvesting multiple skbs they will be * collected on a list. */ - if (sctp_event2skb(event)->list) - sctp_skb_list_tail(sctp_event2skb(event)->list, queue); + if (skb_list) + sctp_skb_list_tail(skb_list, queue); else - __skb_queue_tail(queue, sctp_event2skb(event)); + __skb_queue_tail(queue, skb); /* Did we just complete partial delivery and need to get * rolling again? Move pending data to the receive @@ -214,10 +220,11 @@ int sctp_ulpq_tail_event(struct sctp_ulp return 1; out_free: - if (sctp_event2skb(event)->list) - sctp_queue_purge_ulpevents(sctp_event2skb(event)->list); + if (skb_list) + sctp_queue_purge_ulpevents(skb_list); else sctp_ulpevent_free(event); + return 0; } @@ -269,7 +276,7 @@ static inline void sctp_ulpq_store_reasm * payload was fragmented on the way and ip had to reassemble them. * We add the rest of skb's to the first skb's fraglist. */ -static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff *f_frag, struct sk_buff *l_frag) +static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff_head *queue, struct sk_buff *f_frag, struct sk_buff *l_frag) { struct sk_buff *pos; struct sctp_ulpevent *event; @@ -294,7 +301,7 @@ static struct sctp_ulpevent *sctp_make_r skb_shinfo(f_frag)->frag_list = pos; /* Remove the first fragment from the reassembly queue. */ - __skb_unlink(f_frag, f_frag->list); + __skb_unlink(f_frag, queue); while (pos) { pnext = pos->next; @@ -304,7 +311,7 @@ static struct sctp_ulpevent *sctp_make_r f_frag->data_len += pos->len; /* Remove the fragment from the reassembly queue. */ - __skb_unlink(pos, pos->list); + __skb_unlink(pos, queue); /* Break if we have reached the last fragment. */ if (pos == l_frag) @@ -375,7 +382,7 @@ static inline struct sctp_ulpevent *sctp done: return retval; found: - retval = sctp_make_reassembled_event(first_frag, pos); + retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; goto done; @@ -435,7 +442,7 @@ static inline struct sctp_ulpevent *sctp * further. */ done: - retval = sctp_make_reassembled_event(first_frag, last_frag); + retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, last_frag); if (retval && is_last) retval->msg_flags |= MSG_EOR; @@ -527,7 +534,7 @@ static inline struct sctp_ulpevent *sctp * further. */ done: - retval = sctp_make_reassembled_event(first_frag, last_frag); + retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, last_frag); return retval; } @@ -537,6 +544,7 @@ done: static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { + struct sk_buff_head *event_list; struct sk_buff *pos, *tmp; struct sctp_ulpevent *cevent; struct sctp_stream *in; @@ -547,6 +555,8 @@ static inline void sctp_ulpq_retrieve_or ssn = event->ssn; in = &ulpq->asoc->ssnmap->in; + event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev; + /* We are holding the chunks by stream, by SSN. */ sctp_skb_for_each(pos, &ulpq->lobby, tmp) { cevent = (struct sctp_ulpevent *) pos->cb; @@ -567,10 +577,10 @@ static inline void sctp_ulpq_retrieve_or /* Found it, so mark in the ssnmap. */ sctp_ssn_next(in, sid); - __skb_unlink(pos, pos->list); + __skb_unlink(pos, &ulpq->lobby); /* Attach all gathered skbs to the event. */ - __skb_queue_tail(sctp_event2skb(event)->list, pos); + __skb_queue_tail(event_list, pos); } } @@ -626,7 +636,7 @@ static inline void sctp_ulpq_store_order } static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq, - struct sctp_ulpevent *event) + struct sctp_ulpevent *event) { __u16 sid, ssn; struct sctp_stream *in; @@ -667,7 +677,7 @@ static inline void sctp_ulpq_reap_ordere { struct sk_buff *pos, *tmp; struct sctp_ulpevent *cevent; - struct sctp_ulpevent *event = NULL; + struct sctp_ulpevent *event; struct sctp_stream *in; struct sk_buff_head temp; __u16 csid, cssn; @@ -675,6 +685,8 @@ static inline void sctp_ulpq_reap_ordere in = &ulpq->asoc->ssnmap->in; /* We are holding the chunks by stream, by SSN. */ + skb_queue_head_init(&temp); + event = NULL; sctp_skb_for_each(pos, &ulpq->lobby, tmp) { cevent = (struct sctp_ulpevent *) pos->cb; csid = cevent->stream; @@ -686,19 +698,20 @@ static inline void sctp_ulpq_reap_ordere /* Found it, so mark in the ssnmap. */ sctp_ssn_next(in, csid); - __skb_unlink(pos, pos->list); + __skb_unlink(pos, &ulpq->lobby); if (!event) { /* Create a temporary list to collect chunks on. */ event = sctp_skb2event(pos); - skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); } else { /* Attach all gathered skbs to the event. */ - __skb_queue_tail(sctp_event2skb(event)->list, pos); + __skb_queue_tail(&temp, pos); } } - /* Send event to the ULP. */ + /* Send event to the ULP. 'event' is the sctp_ulpevent for + * very first SKB on the 'temp' list. + */ if (event) sctp_ulpq_tail_event(ulpq, event); } diff -puN net/sunrpc/svcsock.c~git-net net/sunrpc/svcsock.c --- devel/net/sunrpc/svcsock.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/sunrpc/svcsock.c 2005-08-06 23:39:10.000000000 -0700 @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -34,7 +33,7 @@ #include #include #include -#include +#include #include #include diff -puN net/unix/af_unix.c~git-net net/unix/af_unix.c --- devel/net/unix/af_unix.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/unix/af_unix.c 2005-08-06 23:39:10.000000000 -0700 @@ -105,7 +105,7 @@ #include #include #include -#include +#include #include #include #include diff -puN net/unix/garbage.c~git-net net/unix/garbage.c --- devel/net/unix/garbage.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/unix/garbage.c 2005-08-06 23:39:10.000000000 -0700 @@ -76,11 +76,11 @@ #include #include #include -#include #include #include #include +#include /* Internal data structures and random procedures: */ @@ -286,16 +286,16 @@ void unix_gc(void) skb = skb_peek(&s->sk_receive_queue); while (skb && skb != (struct sk_buff *)&s->sk_receive_queue) { - nextsk=skb->next; + nextsk = skb->next; /* * Do we have file descriptors ? */ - if(UNIXCB(skb).fp) - { - __skb_unlink(skb, skb->list); - __skb_queue_tail(&hitlist,skb); + if (UNIXCB(skb).fp) { + __skb_unlink(skb, + &s->sk_receive_queue); + __skb_queue_tail(&hitlist, skb); } - skb=nextsk; + skb = nextsk; } spin_unlock(&s->sk_receive_queue.lock); } diff -puN net/wanrouter/af_wanpipe.c~git-net net/wanrouter/af_wanpipe.c --- devel/net/wanrouter/af_wanpipe.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/wanrouter/af_wanpipe.c 2005-08-06 23:39:10.000000000 -0700 @@ -57,7 +57,7 @@ #include #include #include -#include +#include #include #include diff -puN net/x25/af_x25.c~git-net net/x25/af_x25.c --- devel/net/x25/af_x25.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/x25/af_x25.c 2005-08-06 23:39:10.000000000 -0700 @@ -47,7 +47,7 @@ #include #include #include -#include +#include #include #include #include /* For TIOCINQ/OUTQ */ diff -puN net/x25/x25_dev.c~git-net net/x25/x25_dev.c --- devel/net/x25/x25_dev.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/x25/x25_dev.c 2005-08-06 23:39:10.000000000 -0700 @@ -81,7 +81,7 @@ static int x25_receive_data(struct sk_bu } int x25_lapb_receive_frame(struct sk_buff *skb, struct net_device *dev, - struct packet_type *ptype) + struct packet_type *ptype, struct net_device *orig_dev) { struct sk_buff *nskb; struct x25_neigh *nb; diff -puN net/x25/x25_in.c~git-net net/x25/x25_in.c --- devel/net/x25/x25_in.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/x25/x25_in.c 2005-08-06 23:39:10.000000000 -0700 @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include static int x25_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more) diff -puN net/x25/x25_subr.c~git-net net/x25/x25_subr.c --- devel/net/x25/x25_subr.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/x25/x25_subr.c 2005-08-06 23:39:10.000000000 -0700 @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include /* @@ -80,7 +80,7 @@ void x25_requeue_frames(struct sock *sk) if (!skb_prev) skb_queue_head(&sk->sk_write_queue, skb); else - skb_append(skb_prev, skb); + skb_append(skb_prev, skb, &sk->sk_write_queue); skb_prev = skb; } } diff -puN net/x25/x25_timer.c~git-net net/x25/x25_timer.c --- devel/net/x25/x25_timer.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/x25/x25_timer.c 2005-08-06 23:39:10.000000000 -0700 @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include static void x25_heartbeat_expiry(unsigned long); diff -puN net/xfrm/xfrm_user.c~git-net net/xfrm/xfrm_user.c --- devel/net/xfrm/xfrm_user.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/net/xfrm/xfrm_user.c 2005-08-06 23:39:10.000000000 -0700 @@ -1519,7 +1519,8 @@ static int __init xfrm_user_init(void) { printk(KERN_INFO "Initializing IPsec netlink socket\n"); - xfrm_nl = netlink_kernel_create(NETLINK_XFRM, xfrm_netlink_rcv); + xfrm_nl = netlink_kernel_create(NETLINK_XFRM, xfrm_netlink_rcv, + THIS_MODULE); if (xfrm_nl == NULL) return -ENOMEM; @@ -1537,3 +1538,4 @@ static void __exit xfrm_user_exit(void) module_init(xfrm_user_init); module_exit(xfrm_user_exit); MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_XFRM); diff -puN security/selinux/netlink.c~git-net security/selinux/netlink.c --- devel/security/selinux/netlink.c~git-net 2005-08-06 23:39:09.000000000 -0700 +++ devel-akpm/security/selinux/netlink.c 2005-08-06 23:39:10.000000000 -0700 @@ -103,7 +103,7 @@ void selnl_notify_policyload(u32 seqno) static int __init selnl_init(void) { - selnl = netlink_kernel_create(NETLINK_SELINUX, NULL); + selnl = netlink_kernel_create(NETLINK_SELINUX, NULL, THIS_MODULE); if (selnl == NULL) panic("SELinux: Cannot create netlink socket."); netlink_set_nonroot(NETLINK_SELINUX, NL_NONROOT_RECV); _