Import 2.1.120pre1 (tag: 2.1.120pre1)
author: Linus Torvalds <torvalds@linuxfoundation.org>
Fri, 23 Nov 2007 20:16:29 +0000 (23 15:16 -0500)
committer: Linus Torvalds <torvalds@linuxfoundation.org>
Fri, 23 Nov 2007 20:16:29 +0000 (23 15:16 -0500)
111 files changed:
CREDITS
Makefile
drivers/net/3c59x.c
drivers/net/bmac.c
drivers/net/mace.c
drivers/net/plip.c
drivers/net/sunlance.c
drivers/scsi/README.in2000
drivers/scsi/in2000.c
drivers/scsi/in2000.h
fs/exec.c
include/linux/if.h
include/linux/if_packet.h
include/linux/in6.h
include/linux/ipv6.h
include/linux/ipv6_route.h
include/linux/netdevice.h
include/linux/netlink.h
include/linux/notifier.h
include/linux/proc_fs.h
include/linux/rtnetlink.h
include/linux/skbuff.h
include/linux/socket.h
include/linux/sockios.h
include/net/dst.h
include/net/flow.h
include/net/ip6_fib.h
include/net/ip6_route.h
include/net/ip_fib.h
include/net/ipv6.h
include/net/ndisc.h
include/net/pkt_sched.h
include/net/protocol.h
include/net/rawv6.h
include/net/route.h
include/net/snmp.h
include/net/sock.h
include/net/tcp.h
include/net/transp_v6.h
init/main.c
net/Config.in
net/appletalk/ddp.c
net/ax25/af_ax25.c
net/core/datagram.c
net/core/dev.c
net/core/iovec.c
net/core/neighbour.c
net/core/rtnetlink.c
net/core/scm.c
net/core/skbuff.c
net/core/sock.c
net/ipv4/af_inet.c
net/ipv4/arp.c
net/ipv4/devinet.c
net/ipv4/fib_frontend.c
net/ipv4/fib_hash.c
net/ipv4/fib_rules.c
net/ipv4/fib_semantics.c
net/ipv4/icmp.c
net/ipv4/igmp.c
net/ipv4/ip_forward.c
net/ipv4/ip_fragment.c
net/ipv4/ip_fw.c
net/ipv4/ip_gre.c
net/ipv4/ip_input.c
net/ipv4/ip_nat_dumb.c
net/ipv4/ip_options.c
net/ipv4/ip_output.c
net/ipv4/ip_sockglue.c
net/ipv4/ipip.c
net/ipv4/ipmr.c
net/ipv4/proc.c
net/ipv4/raw.c
net/ipv4/route.c
net/ipv4/tcp.c
net/ipv4/tcp_input.c
net/ipv4/tcp_ipv4.c
net/ipv4/tcp_output.c
net/ipv4/udp.c
net/ipv6/addrconf.c
net/ipv6/af_inet6.c
net/ipv6/datagram.c
net/ipv6/exthdrs.c
net/ipv6/icmp.c
net/ipv6/ip6_fib.c
net/ipv6/ip6_fw.c
net/ipv6/ip6_input.c
net/ipv6/ip6_output.c
net/ipv6/ipv6_sockglue.c
net/ipv6/mcast.c
net/ipv6/ndisc.c
net/ipv6/proc.c
net/ipv6/raw.c
net/ipv6/reassembly.c
net/ipv6/route.c
net/ipv6/sit.c
net/ipv6/tcp_ipv6.c
net/ipv6/udp.c
net/ipx/af_ipx.c
net/ipx/af_spx.c
net/netlink/af_netlink.c
net/netlink/netlink_dev.c
net/netrom/af_netrom.c
net/netsyms.c
net/rose/af_rose.c
net/sched/cls_api.c
net/sched/cls_rsvp.h
net/sched/sch_api.c
net/socket.c
net/unix/af_unix.c
net/x25/af_x25.c

index 77d7ef2..088c8d8 100644 (file)
--- a/CREDITS
+++ b/CREDITS
@@ -1233,12+1233,12 @@ S: 7546 JA  Enschede
 S: Netherlands
 
 N: David S. Miller
-E: davem@caip.rutgers.edu
-D: Sparc hacker
-D: New Linux-Activists maintainer
+E: davem@dm.cobaltmicro.com
+D: Sparc and blue box hacker
+D: Vger Linux mailing list co-maintainer
 D: Linux Emacs elf/qmagic support + other libc/gcc things
 D: Yee bore de yee bore! ;-)
-S: 111 Alta Tierra Court
+S: 331 Santa Rosa Drive
 S: Los Gatos, California 95032
 S: USA
 
index d722bf9..6c03fd5 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,6+1,6 @@
 VERSION = 2
 PATCHLEVEL = 1
-SUBLEVEL = 119
+SUBLEVEL = 120
 
 ARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ -e s/arm.*/arm/ -e s/sa110/arm/)
 
index 8892298..e1b2bdb 100644 (file)
--- a/drivers/net/3c59x.c
+++ b/drivers/net/3c59x.c
 */
 
 static char *version =
-"3c59x.c:v0.99E 5/12/98 Donald Becker http://cesdis.gsfc.nasa.gov/linux/drivers/vortex.html\n";
+"3c59x.c:v0.99F 8/7/98 Donald Becker http://cesdis.gsfc.nasa.gov/linux/drivers/vortex.html\n";
 
 /* "Knobs" that adjust features and parameters. */
 /* Set the copy breakpoint for the copy-only-tiny-frames scheme.
    Setting to > 1512 effectively disables this feature. */
-static const rx_copybreak = 200;
+static const int rx_copybreak = 200;
 /* Allow setting MTU to a larger size, bypassing the normal ethernet setup. */
-static const mtu = 1500;
+static const int mtu = 1500;
 /* Maximum events (Rx packets, etc.) to handle at each interrupt. */
 static int max_interrupt_work = 20;
 
@@ -37,9+37,6 @@ static int vortex_debug = 1;
    debugging. */
 static int rx_nocopy = 0, rx_copy = 0, queued_packet = 0, rx_csumhits;
 
-/* Enable the automatic media selection code -- usually set. */
-#define AUTOMEDIA 1
-
 /* Allow the use of fragment bus master transfers instead of only
    programmed-I/O for Vortex cards.  Full-bus-master transfers are always
    enabled by default on Boomerang cards.  If VORTEX_BUS_MASTER is defined,
@@ -77,7+74,9 @@ static int rx_nocopy = 0, rx_copy = 0, queued_packet = 0, rx_csumhits;
 #include <linux/malloc.h>
 #include <linux/interrupt.h>
 #include <linux/pci.h>
+#if LINUX_VERSION_CODE < 0x20155  ||  defined(CARDBUS)
 #include <linux/bios32.h>
+#endif
 #include <linux/timer.h>
 #include <asm/irq.h>                   /* For NR_IRQS only. */
 #include <asm/bitops.h>
@@ -105,11+104,6 @@ static int rx_nocopy = 0, rx_copy = 0, queued_packet = 0, rx_csumhits;
 #define RUN_AT(x) (jiffies + (x))
 #define DEV_ALLOC_SKB(len) dev_alloc_skb(len)
 #endif
-#if LINUX_VERSION_CODE < 0x20159
-#define DEV_FREE_SKB(skb) dev_kfree_skb (skb, FREE_WRITE);
-#else  /* Grrr, unneeded incompatible change. */
-#define DEV_FREE_SKB(skb) dev_kfree_skb(skb);
-#endif
 
 #ifdef SA_SHIRQ
 #define FREE_IRQ(irqnum, dev) free_irq(irqnum, dev)
@@ -128,9+122,27 @@ static int rx_nocopy = 0, rx_copy = 0, queued_packet = 0, rx_csumhits;
 #define udelay(microsec)       do { int _i = 4*microsec; while (--_i > 0) { __SLOW_DOWN_IO; }} while (0)
 #endif
 
+#if LINUX_VERSION_CODE <= 0x20139
+#define        net_device_stats enet_statistics
+#define NETSTATS_VER2
+#endif
 #if LINUX_VERSION_CODE < 0x20138
 #define test_and_set_bit(val, addr) set_bit(val, addr)
+#define le32_to_cpu(val) (val)
+#define cpu_to_le32(val) (val)
 #endif
+#if LINUX_VERSION_CODE < 0x20155
+#define PCI_SUPPORT_VER1
+#else
+#define PCI_SUPPORT_VER2
+#endif
+#if LINUX_VERSION_CODE < 0x20159
+#define DEV_FREE_SKB(skb) dev_kfree_skb (skb, FREE_WRITE);
+#else  /* Grrr, unneeded incompatible change. */
+#define DEV_FREE_SKB(skb) dev_kfree_skb(skb);
+#endif
+
+
 #if defined(MODULE) && (LINUX_VERSION_CODE >= 0x20115)
 MODULE_AUTHOR("Donald Becker <becker@cesdis.gsfc.nasa.gov>");
 MODULE_DESCRIPTION("3Com 3c590/3c900 series Vortex/Boomerang driver");
@@ -141,7+153,7 @@ MODULE_PARM(rx_copybreak, "i");
 MODULE_PARM(max_interrupt_work, "i");
 MODULE_PARM(compaq_ioaddr, "i");
 MODULE_PARM(compaq_irq, "i");
-MODULE_PARM(compaq_prod_id, "i");
+MODULE_PARM(compaq_device_id, "i");
 #endif
 
 /* Operational parameter that usually are not changed. */
@@ -166,7+178,7 @@ static char mii_preamble_required = 0;
 /* Caution!  These entries must be consistent. */
 static const int product_ids[] = {
        0x5900, 0x5920, 0x5970, 0x5950, 0x5951, 0x5952, 0x9000, 0x9001,
-       0x9050, 0x9051, 0x9055, 0x5057, 0 };
+       0x9050, 0x9051, 0x9055, 0x5057, 0x5175, 0 };
 static const char *product_names[] = {
        "3c590 Vortex 10Mbps",
        "3c592 EISA 10mbps Demon/Vortex",
@@ -180,6+192,7 @@ static const char *product_names[] = {
        "3c905 Boomerang 100baseT4",
        "3c905B Cyclone 100baseTx",
        "3c575",                                                /* Cardbus Boomerang */
+       "3CCFE575",                                             /* Cardbus ?Cyclone? */
 };
 
 /*
@@ -192,17+205,16 @@ XL, 3Com's PCI to 10/100baseT adapters.  It also works with the 10Mbs
 versions of the FastEtherLink cards.  The supported product IDs are
   3c590, 3c592, 3c595, 3c597, 3c900, 3c905
 
-The ISA 3c515 is supported with a seperate driver, 3c515.c, included with
-the kernel source or available from
+The related ISA 3c515 is supported with a separate driver, 3c515.c, included
+with the kernel source or available from
     cesdis.gsfc.nasa.gov:/pub/linux/drivers/3c515.html
 
 II. Board-specific settings
 
 PCI bus devices are configured by the system at boot time, so no jumpers
 need to be set on the board.  The system BIOS should be set to assign the
-PCI INTA signal to an otherwise unused system IRQ line.  While it's
-physically possible to shared PCI interrupt lines, the 1.2.0 kernel doesn't
-support it.
+PCI INTA signal to an otherwise unused system IRQ line.  Note: The 1.2.*
+kernels did not support PCI interrupt sharing.
 
 III. Driver operation
 
@@ -210,10+222,10 @@ The 3c59x series use an interface that's very similar to the previous 3c5x9
 series.  The primary interface is two programmed-I/O FIFOs, with an
 alternate single-contiguous-region bus-master transfer (see next).
 
-The 3c900 "Boomerang" series uses a full-bus-master interface with seperate
+The 3c900 "Boomerang" series uses a full-bus-master interface with separate
 lists of transmit and receive descriptors, similar to the AMD LANCE/PCnet,
 DEC Tulip and Intel Speedo3.  The first chip version retains a compatible
-programmed-I/O interface that will be removed in the 'B' and subsequent
+programmed-I/O interface that has been removed in 'B' and subsequent board
 revisions.
 
 One extension that is advertised in a very large font is that the adapters
@@ -231,7+243,7 @@ packets may be reordered and receive buffer groups are associated with a
 single frame.
 
 With full-bus-master support, this driver uses a "RX_COPYBREAK" scheme.
-Tather than a fixed intermediate receive buffer, this scheme allocates
+Rather than a fixed intermediate receive buffer, this scheme allocates
 full-sized skbuffs as receive buffers.  The value RX_COPYBREAK is used as
 the copying breakpoint: it is chosen to trade-off the memory wasted by
 passing the full-sized skbuff to the queue layer for all frames vs. the
@@ -404,7+416,7 @@ struct vortex_private {
        struct sk_buff* tx_skbuff[TX_RING_SIZE];
        unsigned int cur_rx, cur_tx;            /* The next free ring entry */
        unsigned int dirty_rx, dirty_tx;        /* The ring entries to be free()ed. */
-       struct enet_statistics stats;
+       struct net_device_stats stats;
        struct sk_buff *tx_skb;         /* Packet being eaten by bus master ctrl.  */
 
        /* PCI configuration space information. */
@@ -412,17+424,16 @@ struct vortex_private {
        u16 pci_device_id;
 
        /* The remainder are related to chip state, mostly media selection. */
-       int in_interrupt;
+       unsigned long in_interrupt;
        struct timer_list timer;        /* Media selection timer. */
        int options;                            /* User-settable misc. driver options. */
-       unsigned int
-         media_override:3,                     /* Passed-in media type. */
-         default_media:3,                              /* Read from the EEPROM/Wn3_Config. */
-         full_duplex:1, autoselect:1,
-         bus_master:1,                         /* Vortex can only do a fragment bus-m. */
-         full_bus_master_tx:1, full_bus_master_rx:2, /* Boomerang  */
-         hw_csums:1,                           /* Has hardware checksums. */
-      tx_full:1;
+       unsigned int media_override:3,                  /* Passed-in media type. */
+               default_media:4,                                /* Read from the EEPROM/Wn3_Config. */
+               full_duplex:1, force_fd:1, autoselect:1,
+               bus_master:1,                           /* Vortex can only do a fragment bus-m. */
+               full_bus_master_tx:1, full_bus_master_rx:2, /* Boomerang  */
+               hw_csums:1,                             /* Has hardware checksums. */
+               tx_full:1;
        u16 status_enable;
        u16 available_media;                            /* From Wn3_Options. */
        u16 capabilities, info1, info2;         /* Various, from EEPROM. */
@@ -459,15+470,15 @@ static struct media_table {
 };
 
 static int vortex_scan(struct device *dev);
-static struct device *vortex_found_device(struct device *dev, int ioaddr,
+static struct device *vortex_found_device(struct device *dev, long ioaddr,
                                                                                  int irq, int device_id,
                                                                                  int options, int card_idx);
 static int vortex_probe1(struct device *dev);
 static int vortex_open(struct device *dev);
-static void mdio_sync(int ioaddr, int bits);
-static int mdio_read(int ioaddr, int phy_id, int location);
+static void mdio_sync(long ioaddr, int bits);
+static int mdio_read(long ioaddr, int phy_id, int location);
 #ifdef HAVE_PRIVATE_IOCTL
-static void mdio_write(int ioaddr, int phy_id, int location, int value);
+static void mdio_write(long ioaddr, int phy_id, int location, int value);
 #endif
 static void vortex_timer(unsigned long arg);
 static int vortex_start_xmit(struct sk_buff *skb, struct device *dev);
@@ -476,8+487,8 @@ static int vortex_rx(struct device *dev);
 static int boomerang_rx(struct device *dev);
 static void vortex_interrupt IRQ(int irq, void *dev_id, struct pt_regs *regs);
 static int vortex_close(struct device *dev);
-static void update_stats(int addr, struct device *dev);
-static struct enet_statistics *vortex_get_stats(struct device *dev);
+static void update_stats(long ioaddr, struct device *dev);
+static struct net_device_stats *vortex_get_stats(struct device *dev);
 static void set_rx_mode(struct device *dev);
 #ifdef HAVE_PRIVATE_IOCTL
 static int vortex_ioctl(struct device *dev, struct ifreq *rq, int cmd);
@@ -660,6+671,24 @@ static int vortex_scan(struct device *dev)
                        if (vendor != TCOM_VENDOR_ID)
                                continue;
 
+                       /* Power-up the card. */
+                       pcibios_read_config_word(pci_bus, pci_device_fn,
+                                                                                0xe0, &pci_command);
+                       if (pci_command & 0x3) {
+                               /* Save the ioaddr and IRQ info! */
+                               printk(KERN_INFO "  A 3Com network adapter is powered down!"
+                                          "  Setting the power state %4.4x->%4.4x.\n",
+                                          pci_command, pci_command & ~3);
+                               pcibios_write_config_word(pci_bus, pci_device_fn,
+                                                                                 0xe0, pci_command & ~3);
+                               printk(KERN_INFO "  Setting the IRQ to %d, IOADDR to %#lx.\n",
+                                          irq, ioaddr);
+                               pcibios_write_config_byte(pci_bus, pci_device_fn,
+                                                                                PCI_INTERRUPT_LINE, irq);
+                               pcibios_write_config_dword(pci_bus, pci_device_fn,
+                                                                                 PCI_BASE_ADDRESS_0, ioaddr);
+                       }
+
                        if (ioaddr == 0) {
                                printk(KERN_WARNING "  A 3Com network adapter has been found, "
                                           "however it has not been assigned an I/O address.\n"
@@ -715,7+744,7 @@ static int vortex_scan(struct device *dev)
 
        /* Now check all slots of the EISA bus. */
        if (EISA_bus) {
-               static int ioaddr = 0x1000;
+               static long ioaddr = 0x1000;
                for ( ; ioaddr < 0x9000; ioaddr += 0x1000) {
                        int device_id;
                        if (check_region(ioaddr, VORTEX_TOTAL_SIZE))
@@ -753,7+782,7 @@ static int vortex_scan(struct device *dev)
 }
 
 static struct device *
-vortex_found_device(struct device *dev, int ioaddr, int irq,
+vortex_found_device(struct device *dev, long ioaddr, int irq,
                                        int device_id, int option, int card_idx)
 {
        struct vortex_private *vp;
@@ -839,6+868,7 @@ vortex_found_device(struct device *dev, int ioaddr, int irq,
                vp->full_duplex = 0;
                vp->bus_master = 0;
        }
+       vp->force_fd = vp->full_duplex;
 
        vortex_probe1(dev);
 #endif /* MODULE */
@@ -847,13+877,13 @@ vortex_found_device(struct device *dev, int ioaddr, int irq,
 
 static int vortex_probe1(struct device *dev)
 {
-       int ioaddr = dev->base_addr;
+       long ioaddr = dev->base_addr;
        struct vortex_private *vp = (struct vortex_private *)dev->priv;
        u16 *ether_addr = (u16 *)dev->dev_addr;
        unsigned int eeprom[0x40], checksum = 0;                /* EEPROM contents */
        int i;
 
-       printk(KERN_INFO "%s: 3Com %s at %#3x,",
+       printk(KERN_INFO "%s: 3Com %s at %#3lx,",
                   dev->name, vp->product_name, ioaddr);
 
        /* Read the station address from the EEPROM. */
@@ -888,11+918,15 @@ static int vortex_probe1(struct device *dev)
                ether_addr[i] = htons(eeprom[i + 10]);
        for (i = 0; i < 6; i++)
                printk("%c%2.2x", i ? ':' : ' ', dev->dev_addr[i]);
+#ifdef __sparc__
+       printk(", IRQ %s\n", __irq_itoa(dev->irq));
+#else
        printk(", IRQ %d\n", dev->irq);
        /* Tell them about an invalid IRQ. */
        if (vortex_debug && (dev->irq <= 0 || dev->irq >= NR_IRQS))
                printk(KERN_WARNING " *** Warning: IRQ %d is unlikely to work! ***\n",
                           dev->irq);
+#endif
 
        /* Extract our information from the EEPROM data. */
        vp->info1 = eeprom[13];
@@ -918,7+952,7 @@ static int vortex_probe1(struct device *dev)
                           config.u.ram_width ? "word" : "byte",
                           ram_split[config.u.ram_split],
                           config.u.autoselect ? "autoselect/" : "",
-                          config.u.xcvr ? "NWay Autonegotiation" :
+                          config.u.xcvr > XCVR_ExtMII ? "<invalid transceiver>" :
                           media_tbl[config.u.xcvr].name);
                vp->default_media = config.u.xcvr;
                vp->autoselect = config.u.autoselect;
@@ -931,7+965,7 @@ static int vortex_probe1(struct device *dev)
        } else
                dev->if_port = vp->default_media;
 
-       if (dev->if_port == XCVR_MII) {
+       if (dev->if_port == XCVR_MII || dev->if_port == XCVR_NWAY) {
                int phy, phy_idx = 0;
                EL3WINDOW(4);
                for (phy = 0; phy < 32 && phy_idx < sizeof(vp->phys); phy++) {
@@ -991,7+1025,7 @@ static int vortex_probe1(struct device *dev)
 static int
 vortex_open(struct device *dev)
 {
-       int ioaddr = dev->base_addr;
+       long ioaddr = dev->base_addr;
        struct vortex_private *vp = (struct vortex_private *)dev->priv;
        union wn3_config config;
        int i;
@@ -1011,6+1045,8 @@ vortex_open(struct device *dev)
                dev->if_port = XCVR_100baseTx;
                while (! (vp->available_media & media_tbl[dev->if_port].mask))
                        dev->if_port = media_tbl[dev->if_port].next;
+               if (vp->phys[0])
+                       dev->if_port = XCVR_NWAY;
 
                if (vortex_debug > 1)
                        printk(KERN_DEBUG "%s: Initial media type %s.\n",
@@ -1019,15+1055,16 @@ vortex_open(struct device *dev)
                init_timer(&vp->timer);
                vp->timer.expires = RUN_AT(media_tbl[dev->if_port].wait);
                vp->timer.data = (unsigned long)dev;
-               vp->timer.function = &vortex_timer;    /* timer handler */
+               vp->timer.function = &vortex_timer;             /* timer handler */
                add_timer(&vp->timer);
        } else
                dev->if_port = vp->default_media;
 
+       vp->full_duplex = vp->force_fd;
        config.u.xcvr = dev->if_port;
        outl(config.i, ioaddr + Wn3_Config);
 
-       if (dev->if_port == XCVR_MII) {
+       if (dev->if_port == XCVR_MII || dev->if_port == XCVR_NWAY) {
                int mii_reg1, mii_reg5;
                EL3WINDOW(4);
                /* Read BMSR (reg1) only to clear old status. */
@@ -1127,9+1164,9 @@ vortex_open(struct device *dev)
                        printk(KERN_DEBUG "%s:  Filling in the Rx ring.\n", dev->name);
                for (i = 0; i < RX_RING_SIZE; i++) {
                        struct sk_buff *skb;
-                       vp->rx_ring[i].next = virt_to_bus(&vp->rx_ring[i+1]);
+                       vp->rx_ring[i].next = cpu_to_le32(virt_to_bus(&vp->rx_ring[i+1]));
                        vp->rx_ring[i].status = 0;      /* Clear complete bit. */
-                       vp->rx_ring[i].length = PKT_BUF_SZ | LAST_FRAG;
+                       vp->rx_ring[i].length = cpu_to_le32(PKT_BUF_SZ | LAST_FRAG);
                        skb = DEV_ALLOC_SKB(PKT_BUF_SZ);
                        vp->rx_skbuff[i] = skb;
                        if (skb == NULL)
@@ -1137,12+1174,13 @@ vortex_open(struct device *dev)
                        skb->dev = dev;                 /* Mark as being used by this device. */
 #if LINUX_VERSION_CODE >= 0x10300
                        skb_reserve(skb, 2);    /* Align IP on 16 byte boundaries */
-                       vp->rx_ring[i].addr = virt_to_bus(skb->tail);
+                       vp->rx_ring[i].addr = cpu_to_le32(virt_to_bus(skb->tail));
 #else
                        vp->rx_ring[i].addr = virt_to_bus(skb->data);
 #endif
                }
-               vp->rx_ring[i-1].next = virt_to_bus(&vp->rx_ring[0]); /* Wrap the ring. */
+               /* Wrap the ring. */
+               vp->rx_ring[i-1].next = cpu_to_le32(virt_to_bus(&vp->rx_ring[0]));
                outl(virt_to_bus(&vp->rx_ring[0]), ioaddr + UpListPtr);
        }
        if (vp->full_bus_master_tx) {           /* Boomerang bus master Tx. */
@@ -1186,24+1224,25 @@ vortex_open(struct device *dev)
 
 static void vortex_timer(unsigned long data)
 {
-#ifdef AUTOMEDIA
        struct device *dev = (struct device *)data;
        struct vortex_private *vp = (struct vortex_private *)dev->priv;
-       int ioaddr = dev->base_addr;
+       long ioaddr = dev->base_addr;
        unsigned long flags;
+       int next_tick = 0;
        int ok = 0;
+       int media_status, old_window;
 
        if (vortex_debug > 1)
                printk(KERN_DEBUG "%s: Media selection timer tick happened, %s.\n",
                           dev->name, media_tbl[dev->if_port].name);
 
-       save_flags(flags);      cli(); {
-         int old_window = inw(ioaddr + EL3_CMD) >> 13;
-         int media_status;
-         EL3WINDOW(4);
-         media_status = inw(ioaddr + Wn4_Media);
-         switch (dev->if_port) {
-         case XCVR_10baseT:  case XCVR_100baseTx:  case XCVR_100baseFx:
+       save_flags(flags);
+       cli();
+       old_window = inw(ioaddr + EL3_CMD) >> 13;
+       EL3WINDOW(4);
+       media_status = inw(ioaddr + Wn4_Media);
+       switch (dev->if_port) {
+       case XCVR_10baseT:  case XCVR_100baseTx:  case XCVR_100baseFx:
                if (media_status & Media_LnkBeat) {
                  ok = 1;
                  if (vortex_debug > 1)
@@ -1212,18+1251,27 @@ static void vortex_timer(unsigned long data)
                } else if (vortex_debug > 1)
                  printk(KERN_DEBUG "%s: Media %s is has no link beat, %x.\n",
                                   dev->name, media_tbl[dev->if_port].name, media_status);
-
                break;
-         case XCVR_MII:
-                 {
-                         int mii_reg1 = mdio_read(ioaddr, vp->phys[0], 1);
+         case XCVR_MII: case XCVR_NWAY:
+                 if (mdio_read(ioaddr, vp->phys[0], 1) & 0x0004) {
                          int mii_reg5 = mdio_read(ioaddr, vp->phys[0], 5);
-                         if (vortex_debug > 1)
-                                 printk(KERN_DEBUG "%s: MII #%d status register is %4.4x, "
-                                                "link partner capability %4.4x.\n",
-                                                dev->name, vp->phys[0], mii_reg1, mii_reg5);
-                         if (mii_reg1 & 0x0004)
-                                 ok = 1;
+                         ok = 1;
+                         if (! vp->force_fd  &&  mii_reg5 != 0xffff) {
+                                 int duplex = (mii_reg5&0x0100) ||
+                                         (mii_reg5 & 0x01C0) == 0x0040;
+                                 if (vp->full_duplex != duplex) {
+                                         vp->full_duplex = duplex;
+                                         printk(KERN_INFO "%s: Setting %s-duplex based on MII "
+                                                        "#%d link partner capability of %4.4x.\n",
+                                                        dev->name, vp->full_duplex ? "full" : "half",
+                                                        vp->phys[0], mii_reg5);
+                                         /* Set the full-duplex bit. */
+                                         outb((vp->full_duplex ? 0x20 : 0) |
+                                                  (dev->mtu > 1500 ? 0x40 : 0),
+                                                  ioaddr + Wn3_MAC_Ctrl);
+                                 }
+                                 next_tick = 60*HZ;
+                         }
                          break;
                  }
          default:                                      /* Other media types handled by Tx timeouts. */
@@ -1231,8+1279,8 @@ static void vortex_timer(unsigned long data)
                  printk(KERN_DEBUG "%s: Media %s is has no indication, %x.\n",
                                 dev->name, media_tbl[dev->if_port].name, media_status);
                ok = 1;
-         }
-         if ( ! ok) {
+       }
+       if ( ! ok) {
                union wn3_config config;
 
                do {
@@ -1249,8+1297,7 @@ static void vortex_timer(unsigned long data)
                        printk(KERN_DEBUG "%s: Media selection failed, now trying "
                                   "%s port.\n",
                                   dev->name, media_tbl[dev->if_port].name);
-                 vp->timer.expires = RUN_AT(media_tbl[dev->if_port].wait);
-                 add_timer(&vp->timer);
+                 next_tick = media_tbl[dev->if_port].wait;     /* Relative delay; RUN_AT() is applied when the timer is re-armed. */
                }
                outw((media_status & ~(Media_10TP|Media_SQE)) |
                         media_tbl[dev->if_port].media_bits, ioaddr + Wn4_Media);
@@ -1262,21+1309,25 @@ static void vortex_timer(unsigned long data)
 
                outw(dev->if_port == XCVR_10base2 ? StartCoax : StopCoax,
                         ioaddr + EL3_CMD);
-         }
-         EL3WINDOW(old_window);
-       }   restore_flags(flags);
+       }
+       EL3WINDOW(old_window);
+       restore_flags(flags);
+
        if (vortex_debug > 1)
          printk(KERN_DEBUG "%s: Media selection timer finished, %s.\n",
                         dev->name, media_tbl[dev->if_port].name);
 
-#endif /* AUTOMEDIA*/
+       if (next_tick) {
+               vp->timer.expires = RUN_AT(next_tick);
+               add_timer(&vp->timer);
+       }
        return;
 }
 
 static void vortex_tx_timeout(struct device *dev)
 {
        struct vortex_private *vp = (struct vortex_private *)dev->priv;
-       int ioaddr = dev->base_addr;
+       long ioaddr = dev->base_addr;
        int j;
 
        printk(KERN_ERR "%s: transmit timed out, tx_status %2.2x status %4.4x.\n",
@@ -1309,8+1360,8 @@ static void vortex_tx_timeout(struct device *dev)
                for (i = 0; i < TX_RING_SIZE; i++) {
                        printk(KERN_DEBUG "  %d: @%p  length %8.8x status %8.8x\n", i,
                                   &vp->tx_ring[i],
-                                  vp->tx_ring[i].length,
-                                  vp->tx_ring[i].status);
+                                  le32_to_cpu(vp->tx_ring[i].length),
+                                  le32_to_cpu(vp->tx_ring[i].status));
                }
        }
 #endif
@@ -1340,14+1391,14 @@ static void vortex_tx_timeout(struct device *dev)
 }
 
 /*
- * Handle uncommon interrupt sources.  This is a seperate routine to minimize
+ * Handle uncommon interrupt sources.  This is a separate routine to minimize
  * the cache impact.
  */
 static void
 vortex_error(struct device *dev, int status)
 {
        struct vortex_private *vp = (struct vortex_private *)dev->priv;
-       int ioaddr = dev->base_addr;
+       long ioaddr = dev->base_addr;
        int do_tx_reset = 0;
        int i;
 
@@ -1434,7+1485,7 @@ static int
 vortex_start_xmit(struct sk_buff *skb, struct device *dev)
 {
        struct vortex_private *vp = (struct vortex_private *)dev->priv;
-       int ioaddr = dev->base_addr;
+       long ioaddr = dev->base_addr;
 
        if (test_and_set_bit(0, (void*)&dev->tbusy) != 0) {
                if (jiffies - dev->trans_start >= TX_TIMEOUT)
@@ -1506,7+1557,7 @@ static int
 boomerang_start_xmit(struct sk_buff *skb, struct device *dev)
 {
        struct vortex_private *vp = (struct vortex_private *)dev->priv;
-       int ioaddr = dev->base_addr;
+       long ioaddr = dev->base_addr;
 
        if (test_and_set_bit(0, (void*)&dev->tbusy) != 0) {
                if (jiffies - dev->trans_start >= TX_TIMEOUT)
@@ -1528,13+1579,13 @@ boomerang_start_xmit(struct sk_buff *skb, struct device *dev)
                                printk(KERN_WARNING "%s: Tx Ring full, refusing to send buffer.\n",
                                           dev->name);
                        return 1;
-               } 
+               }
                /* end change 06/25/97 M. Sievers */    
                vp->tx_skbuff[entry] = skb;
                vp->tx_ring[entry].next = 0;
-               vp->tx_ring[entry].addr = virt_to_bus(skb->data);
-               vp->tx_ring[entry].length = skb->len | LAST_FRAG;
-               vp->tx_ring[entry].status = skb->len | TxIntrUploaded;
+               vp->tx_ring[entry].addr = cpu_to_le32(virt_to_bus(skb->data));
+               vp->tx_ring[entry].length = cpu_to_le32(skb->len | LAST_FRAG);
+               vp->tx_ring[entry].status = cpu_to_le32(skb->len | TxIntrUploaded);
 
                save_flags(flags);
                cli();
@@ -1543,7+1594,7 @@ boomerang_start_xmit(struct sk_buff *skb, struct device *dev)
                for (i = 600; i >= 0 ; i--)
                        if ( (inw(ioaddr + EL3_STATUS) & CmdInProgress) == 0)
                                break;
-               prev_entry->next = virt_to_bus(&vp->tx_ring[entry]);
+               prev_entry->next = cpu_to_le32(virt_to_bus(&vp->tx_ring[entry]));
                if (inl(ioaddr + DownListPtr) == 0) {
                        outl(virt_to_bus(&vp->tx_ring[entry]), ioaddr + DownListPtr);
                        queued_packet++;
@@ -1555,7+1606,7 @@ boomerang_start_xmit(struct sk_buff *skb, struct device *dev)
                if (vp->cur_tx - vp->dirty_tx > TX_RING_SIZE - 1)
                        vp->tx_full = 1;
                else {                                  /* Clear previous interrupt enable. */
-                       prev_entry->status &= ~TxIntrUploaded;
+                       prev_entry->status &= cpu_to_le32(~TxIntrUploaded);
                        clear_bit(0, (void*)&dev->tbusy);
                }
                dev->trans_start = jiffies;
@@ -1573,8+1624,8 @@ static void vortex_interrupt IRQ(int irq, void *dev_id, struct pt_regs *regs)
        struct device *dev = (struct device *)(irq2dev_map[irq]);
 #endif
        struct vortex_private *vp;
-       int ioaddr, status;
-       int latency;
+       long ioaddr;
+       int latency, status;
        int work_done = max_interrupt_work;
 
        vp = (struct vortex_private *)dev->priv;
@@ -1586,7+1637,6 @@ static void vortex_interrupt IRQ(int irq, void *dev_id, struct pt_regs *regs)
        dev->interrupt = 1;
        ioaddr = dev->base_addr;
        latency = inb(ioaddr + Timer);
-
        status = inw(ioaddr + EL3_STATUS);
 
        if (vortex_debug > 4)
@@ -1681,7+1731,7 @@ static int
 vortex_rx(struct device *dev)
 {
        struct vortex_private *vp = (struct vortex_private *)dev->priv;
-       int ioaddr = dev->base_addr;
+       long ioaddr = dev->base_addr;
        int i;
        short rx_status;
 
@@ -1751,7+1801,7 @@ boomerang_rx(struct device *dev)
 {
        struct vortex_private *vp = (struct vortex_private *)dev->priv;
        int entry = vp->cur_rx % RX_RING_SIZE;
-       int ioaddr = dev->base_addr;
+       long ioaddr = dev->base_addr;
        int rx_status;
        int rx_work_limit = vp->dirty_rx + RX_RING_SIZE - vp->cur_rx;
 
@@ -1759,8+1809,7 @@ boomerang_rx(struct device *dev)
                printk(KERN_DEBUG "  In boomerang_rx(), status %4.4x, rx_status "
                           "%4.4x.\n",
                           inw(ioaddr+EL3_STATUS), inw(ioaddr+RxStatus));
-       while ((--rx_work_limit >= 0) &&
-                       ((rx_status = vp->rx_ring[entry].status) & RxDComplete)) {
+       while ((rx_status = le32_to_cpu(vp->rx_ring[entry].status)) & RxDComplete) {
                if (rx_status & RxDError) { /* Error, update stats. */
                        unsigned char rx_error = rx_status >> 16;
                        if (vortex_debug > 2)
@@ -1789,21+1838,18 @@ boomerang_rx(struct device *dev)
                                skb_reserve(skb, 2);    /* Align IP on 16 byte boundaries */
                                /* 'skb_put()' points to the start of sk_buff data area. */
                                memcpy(skb_put(skb, pkt_len),
-                                          bus_to_virt(vp->rx_ring[entry].addr),
+                                          bus_to_virt(le32_to_cpu(vp->rx_ring[entry].addr)),
                                           pkt_len);
 #else
-                               memcpy(skb->data, bus_to_virt(vp->rx_ring[entry].addr), pkt_len);
+                               memcpy(skb->data, bus_to_virt(vp->rx_ring[entry].addr),
+                                          pkt_len);
                                skb->len = pkt_len;
 #endif
                                rx_copy++;
-                       } else{
+                       } else {
                                void *temp;
                                /* Pass up the skbuff already on the Rx ring. */
                                skb = vp->rx_skbuff[entry];
-                               if (skb == NULL) {
-                                       printk(KERN_WARNING "%s: in boomerang_rx -- attempt to use NULL skb caught\n", dev->name);
-                                       break;
-                               }
                                vp->rx_skbuff[entry] = NULL;
 #if LINUX_VERSION_CODE >= 0x10300
                                temp = skb_put(skb, pkt_len);
@@ -1811,10+1857,11 @@ boomerang_rx(struct device *dev)
                                temp = skb->data;
 #endif
                                /* Remove this checking code for final release. */
-                               if (bus_to_virt(vp->rx_ring[entry].addr) != temp)
+                               if (bus_to_virt(le32_to_cpu(vp->rx_ring[entry].addr)) != temp)
                                        printk(KERN_ERR "%s: Warning -- the skbuff addresses do not match"
                                                   " in boomerang_rx: %p vs. %p.\n", dev->name,
-                                                  bus_to_virt(vp->rx_ring[entry].addr), temp);
+                                                  bus_to_virt(le32_to_cpu(vp->rx_ring[entry].addr)),
+                                                  temp);
                                rx_nocopy++;
                        }
 #if LINUX_VERSION_CODE > 0x10300
@@ -1836,6+1883,8 @@ boomerang_rx(struct device *dev)
                        vp->stats.rx_packets++;
                }
                entry = (++vp->cur_rx) % RX_RING_SIZE;
+               if (--rx_work_limit < 0)
+                       break;
        }
        /* Refill the Rx ring buffers. */
        for (; vp->dirty_rx < vp->cur_rx; vp->dirty_rx++) {
@@ -1843,14+1892,12 @@ boomerang_rx(struct device *dev)
                entry = vp->dirty_rx % RX_RING_SIZE;
                if (vp->rx_skbuff[entry] == NULL) {
                        skb = DEV_ALLOC_SKB(PKT_BUF_SZ);
-                       if (skb == NULL) {
-                               printk(KERN_DEBUG "%s: in boomerang_rx -- could not allocate skbuff\n", dev->name);
+                       if (skb == NULL)
                                break;                  /* Bad news!  */
-                       }
                        skb->dev = dev;                 /* Mark as being used by this device. */
 #if LINUX_VERSION_CODE > 0x10300
                        skb_reserve(skb, 2);    /* Align IP on 16 byte boundaries */
-                       vp->rx_ring[entry].addr = virt_to_bus(skb->tail);
+                       vp->rx_ring[entry].addr = cpu_to_le32(virt_to_bus(skb->tail));
 #else
                        vp->rx_ring[entry].addr = virt_to_bus(skb->data);
 #endif
@@ -1859,12+1906,6 @@ boomerang_rx(struct device *dev)
                vp->rx_ring[entry].status = 0;  /* Clear complete bit. */
                outw(UpUnstall, ioaddr + EL3_CMD);
        }
-
-       if (vp->dirty_rx >= RX_RING_SIZE ) {
-               vp->cur_rx -= RX_RING_SIZE;
-               vp->dirty_rx -= RX_RING_SIZE;
-       }
-
        return 0;
 }
 
@@ -1872,7+1913,7 @@ static int
 vortex_close(struct device *dev)
 {
        struct vortex_private *vp = (struct vortex_private *)dev->priv;
-       int ioaddr = dev->base_addr;
+       long ioaddr = dev->base_addr;
        int i;
 
        dev->start = 0;
@@ -1934,8+1975,7 @@ vortex_close(struct device *dev)
        return 0;
 }
 
-static struct enet_statistics *
-vortex_get_stats(struct device *dev)
+static struct net_device_stats *vortex_get_stats(struct device *dev)
 {
        struct vortex_private *vp = (struct vortex_private *)dev->priv;
        unsigned long flags;
@@ -1956,7+1996,7 @@ vortex_get_stats(struct device *dev)
        table.  This is done by checking that the ASM (!) code generated uses
        atomic updates with '+='.
        */
-static void update_stats(int ioaddr, struct device *dev)
+static void update_stats(long ioaddr, struct device *dev)
 {
        struct vortex_private *vp = (struct vortex_private *)dev->priv;
 
@@ -1991,7+2031,7 @@ static void update_stats(int ioaddr, struct device *dev)
 static int vortex_ioctl(struct device *dev, struct ifreq *rq, int cmd)
 {
        struct vortex_private *vp = (struct vortex_private *)dev->priv;
-       int ioaddr = dev->base_addr;
+       long ioaddr = dev->base_addr;
        u16 *data = (u16 *)&rq->ifr_data;
        int phy = vp->phys[0] & 0x1f;
 
@@ -2000,7+2040,7 @@ static int vortex_ioctl(struct device *dev, struct ifreq *rq, int cmd)
                           dev->name, rq->ifr_ifrn.ifrn_name, cmd,
                           data[0], data[1], data[2], data[3]);
 
-    switch(cmd) {
+       switch(cmd) {
        case SIOCDEVPRIVATE:            /* Get the address of the PHY in use. */
                data[0] = phy;
        case SIOCDEVPRIVATE+1:          /* Read the specified MII register. */
@@ -2025,7+2065,7 @@ static int vortex_ioctl(struct device *dev, struct ifreq *rq, int cmd)
 static void
 set_rx_mode(struct device *dev)
 {
-       int ioaddr = dev->base_addr;
+       long ioaddr = dev->base_addr;
        int new_mode;
 
        if (dev->flags & IFF_PROMISC) {
@@ -2068,11+2108,11 @@ set_multicast_list(struct device *dev, int num_addrs, void *addrs)
 
 /* Generate the preamble required for initial synchronization and
    a few older transceivers. */
-static void mdio_sync(int ioaddr, int bits)
+static void mdio_sync(long ioaddr, int bits)
 {
-       int mdio_addr = ioaddr + Wn4_PhysicalMgmt;
+       long mdio_addr = ioaddr + Wn4_PhysicalMgmt;
 
-       /* Establish sync by sending at least 32 logic ones. */ 
+       /* Establish sync by sending at least 32 logic ones. */
        while (-- bits >= 0) {
                outw(MDIO_DATA_WRITE1, mdio_addr);
                mdio_delay();
@@ -2081,12+2121,12 @@ static void mdio_sync(int ioaddr, int bits)
        }
 }
 
-static int mdio_read(int ioaddr, int phy_id, int location)
+static int mdio_read(long ioaddr, int phy_id, int location)
 {
        int i;
        int read_cmd = (0xf6 << 10) | (phy_id << 5) | location;
        unsigned int retval = 0;
-       int mdio_addr = ioaddr + Wn4_PhysicalMgmt;
+       long mdio_addr = ioaddr + Wn4_PhysicalMgmt;
 
        if (mii_preamble_required)
                mdio_sync(ioaddr, 32);
@@ -2110,10+2150,10 @@ static int mdio_read(int ioaddr, int phy_id, int location)
        return retval>>1 & 0xffff;
 }
 
-static void mdio_write(int ioaddr, int phy_id, int location, int value)
+static void mdio_write(long ioaddr, int phy_id, int location, int value)
 {
        int write_cmd = 0x50020000 | (phy_id << 23) | (location << 18) | value;
-       int mdio_addr = ioaddr + Wn4_PhysicalMgmt;
+       long mdio_addr = ioaddr + Wn4_PhysicalMgmt;
        int i;
 
        if (mii_preamble_required)
@@ -2166,7+2206,7 @@ cleanup_module(void)
  * Local variables:
  *  compile-command: "gcc -DMODULE -D__KERNEL__ -Wall -Wstrict-prototypes -O6 -c 3c59x.c `[ -f /usr/include/linux/modversions.h ] && echo -DMODVERSIONS`"
  *  SMP-compile-command: "gcc -D__SMP__ -DMODULE -D__KERNEL__ -Wall -Wstrict-prototypes -O6 -c 3c59x.c"
- *  compile-command-alt1: "gcc -DCARDBUS -DMODULE -D__KERNEL__ -Wall -Wstrict-prototypes -O6 -c 3c59x.c -o 3c59x_cb.o"
+ *  cardbus-compile-command: "gcc -DCARDBUS -DMODULE -D__KERNEL__ -Wall -Wstrict-prototypes -O6 -c 3c59x.c -o 3c575_cb.o -I/usr/src/pcmcia-cs-3.0.5/include/"
  *  c-indent-level: 4
  *  c-basic-offset: 4
  *  tab-width: 4
index 536df54..aef8103 100644 (file)
  */
 #define ENET_CRCPOLY 0x04c11db7
 
+/* switch to use multicast code lifted from sunhme driver */
+#define SUNHME_MULTICAST
+
 /* a bunch of constants for the "Heathrow" interrupt controller.
    These really should be in an include file somewhere */
 #define IoBaseHeathrow ((unsigned *)0xf3000000)
 #define XXDEBUG(args)
 
 struct bmac_data {
-/*     volatile struct bmac *bmac; */
+       /* volatile struct bmac *bmac; */
        struct sk_buff_head *queue;
        volatile struct dbdma_regs *tx_dma;
        int tx_dma_intr;
@@ -82,7+85,7 @@ typedef struct bmac_reg_entry {
        unsigned short reg_offset;
 } bmac_reg_entry_t;
 
-#define N_REG_ENTRIES 30
+#define N_REG_ENTRIES 31
 
 bmac_reg_entry_t reg_entries[N_REG_ENTRIES] = {
        {"MEMADD", MEMADD},
@@ -98,6+101,7 @@ bmac_reg_entry_t reg_entries[N_REG_ENTRIES] = {
        {"PAPAT", PAPAT},
        {"TXSFD", TXSFD},
        {"JAM", JAM},
+       {"TXCFG", TXCFG},
        {"TXMAX", TXMAX},
        {"TXMIN", TXMIN},
        {"PAREG", PAREG},
@@ -133,8+137,8 @@ static unsigned char dummy_buf[RX_BUFLEN];
  * buffers on a 16 byte boundary.
  */
 #define PRIV_BYTES     (sizeof(struct bmac_data) \
-                        + (N_RX_RING + N_TX_RING + 4) * sizeof(struct dbdma_cmd) \
-                        + sizeof(struct sk_buff_head))
+       + (N_RX_RING + N_TX_RING + 4) * sizeof(struct dbdma_cmd) \
+       + sizeof(struct sk_buff_head))
 
 static unsigned char bitrev(unsigned char b);
 static int bmac_open(struct device *dev);
@@ -263,6+267,7 @@ bmac_reset_chip(struct device *dev)
        udelay(50000);
        
        out_le32(heathrowFCR, fcrValue);
+       udelay(50000);
 }
 
 static void
@@ -273,7+278,7 @@ bmac_init_registers(struct device *dev)
        unsigned short *pWord16;
        int i;
 
-/* XXDEBUG(("bmac: enter init_registers\n")); */
+       /* XXDEBUG(("bmac: enter init_registers\n")); */
 
        bmwrite(dev, TXRST, TxResetBit);
 
@@ -468,6+473,7 @@ bmac_init_tx_ring(struct bmac_data *bp)
        if (!bp->tx_allocated) {
                /* zero out tx cmds, alloc space for double buffering */
                addr = (char *)kmalloc(ETHERMTU * N_TX_RING, GFP_DMA);
+               if (addr == NULL) return 0;
                for (i = 0; i < N_TX_RING; i++, addr += ETHERMTU) bp->tx_double[i] = addr;
                bp->tx_allocated = 1;
        }
@@ -500,6+506,7 @@ bmac_init_rx_ring(struct bmac_data *bp)
        if (!bp->rx_allocated) {
                for (i = 0; i < N_RX_RING; i++) {
                        bp->rx_bufs[i] = dev_alloc_skb(RX_BUFLEN+2);
+                       if (bp->rx_bufs[i] == NULL) return 0;
                        skb_reserve(bp->rx_bufs[i], 2);
                }
                bp->rx_allocated = 1;
@@ -531,8+538,8 @@ static int bmac_transmit_packet(struct sk_buff *skb, struct device *dev)
        int i;
 
        /* see if there's a free slot in the tx ring */
-/* XXDEBUG(("bmac_xmit_start: empty=%d fill=%d\n", */
-/*          bp->tx_empty, bp->tx_fill)); */
+       /* XXDEBUG(("bmac_xmit_start: empty=%d fill=%d\n", */
+       /*           bp->tx_empty, bp->tx_fill)); */
        i = bp->tx_fill + 1;
        if (i >= N_TX_RING) i = 0;
        if (i == bp->tx_empty) {
@@ -637,8+644,8 @@ static void bmac_txdma_intr(int irq, void *dev_id, struct pt_regs *regs)
                XXDEBUG(("bmac_txdma_intr\n"));
        }
 
-/*     del_timer(&bp->tx_timeout); */
-/*     bp->timeout_active = 0; */
+       /*     del_timer(&bp->tx_timeout); */
+       /*     bp->timeout_active = 0; */
 
        while (1) {
                cp = &bp->tx_cmds[bp->tx_empty];
@@ -655,8+662,8 @@ static void bmac_txdma_intr(int irq, void *dev_id, struct pt_regs *regs)
                bp->tx_bufs[bp->tx_empty] = NULL;
                bp->tx_fullup = 0;
                dev->tbusy = 0;
-/* XXDEBUG(("bmac_intr: cleared tbusy, empty=%d fill=%d\n", */
-/*              i, bp->tx_fill)); */
+               /* XXDEBUG(("bmac_intr: cleared tbusy, empty=%d fill=%d\n", */
+               /*               i, bp->tx_fill)); */
                mark_bh(NET_BH);
                if (++bp->tx_empty >= N_TX_RING) bp->tx_empty = 0;
                if (bp->tx_empty == bp->tx_fill) break;
@@ -678,7+685,7 @@ static struct net_device_stats *bmac_stats(struct device *dev)
        return &p->stats;
 }
 
-#if 0
+#ifndef SUNHME_MULTICAST
 /* Real fast bit-reversal algorithm, 6-bit values */
 static int reverse6[64] = {
        0x0,0x20,0x10,0x30,0x8,0x28,0x18,0x38,
@@ -743,98+750,98 @@ bmac_addhash(struct bmac_data *bp, unsigned char *addr)
        unsigned int     crc;
        unsigned short   mask;
 
-       if (!(*addr 
-             crc = bmac_crc((unsigned short *)addr) & 0x3f; /* Big-endian alert! */
-             crc = reverse6[crc];      /* Hyperfast bit-reversing algorithm */
-             if (bp->hash_use_count[crc]++) return; /* This bit is already set */
-             mask = crc % 16;
-             mask = (unsigned char)1 << mask;
-             bp->hash_use_count[crc/16] |= mask;
-             }
-
-           static void
-           bmac_removehash(struct bmac_data *bp, unsigned char *addr)
-           {   
-                   unsigned int crc;
-                   unsigned char mask;
-
-                   /* Now, delete the address from the filter copy, as indicated */
-                   crc = bmac_crc((unsigned short *)addr) & 0x3f; /* Big-endian alert! */
-                   crc = reverse6[crc];        /* Hyperfast bit-reversing algorithm */
-                   if (bp->hash_use_count[crc] == 0) return; /* That bit wasn't in use! */
-                   if (--bp->hash_use_count[crc]) return; /* That bit is still in use */
-                   mask = crc % 16;
-                   mask = ((unsigned char)1 << mask) ^ 0xffff; /* To turn off bit */
-                   bp->hash_table_mask[crc/16] &= mask;
-           }
+       if (!(*addr)) return;
+       crc = bmac_crc((unsigned short *)addr) & 0x3f; /* Big-endian alert! */
+       crc = reverse6[crc];    /* Hyperfast bit-reversing algorithm */
+       if (bp->hash_use_count[crc]++) return; /* This bit is already set */
+       mask = crc % 16;
+       mask = (unsigned char)1 << mask;
+       bp->hash_use_count[crc/16] |= mask;
+}
+
+static void
+bmac_removehash(struct bmac_data *bp, unsigned char *addr)
+{      
+       unsigned int crc;
+       unsigned short mask;    /* 16 bits wide so the complement of any bit 0..15 survives */
+
+       /* Drop one use of addr's hash bit; clear the bit in the filter copy once unused */
+       crc = bmac_crc((unsigned short *)addr) & 0x3f; /* Big-endian alert! */
+       crc = reverse6[crc];    /* Hyperfast bit-reversing algorithm */
+       if (bp->hash_use_count[crc] == 0) return; /* That bit wasn't in use! */
+       if (--bp->hash_use_count[crc]) return; /* That bit is still in use */
+       mask = crc % 16;        /* bit position inside the 16-bit word crc/16 */
+       mask = ((unsigned short)1 << mask) ^ 0xffff; /* To turn off bit */
+       bp->hash_table_mask[crc/16] &= mask;
+}
 
 /*
  * Sync the adapter with the software copy of the multicast mask
  *  (logical address filter).
  */
 
-                   static void
-                   bmac_rx_off(struct device *dev)
-                   {
-                           unsigned short rx_cfg;
-
-                           rx_cfg = bmread(dev, RXCFG);
-                           rx_cfg &= ~RxMACEnable;
-                           bmwrite(dev, RXCFG, rx_cfg);
-                           do {
-                                   rx_cfg = bmread(dev, RXCFG);
-                           }  while (rx_cfg & RxMACEnable);
-                   }
-
-                           unsigned short
-                           bmac_rx_on(struct device *dev, int hash_enable, int promisc_enable)
-                           {
-                                   unsigned short rx_cfg;
-
-                                   rx_cfg = bmread(dev, RXCFG);
-                                   rx_cfg |= RxMACEnable;
-                                   if (hash_enable) rx_cfg |= RxHashFilterEnable;
-                                   else rx_cfg &= ~RxHashFilterEnable;
-                                   if (promisc_enable) rx_cfg |= RxPromiscEnable;
-                                   else rx_cfg &= ~RxPromiscEnable;
-                                   bmwrite(dev, RXRST, RxResetValue);
-                                   bmwrite(dev, RXFIFOCSR, 0); /* first disable rxFIFO */
-                                   bmwrite(dev, RXFIFOCSR, RxFIFOEnable ); 
-                                   bmwrite(dev, RXCFG, rx_cfg );
-                                   return rx_cfg;
-                           }
-
-                                   static void
-                                   bmac_update_hash_table_mask(struct device *dev, struct bmac_data *bp)
-                                   {
-                                           bmwrite(dev, BHASH3, bp->hash_table_mask[0]); /* bits 15 - 0 */
-                                           bmwrite(dev, BHASH2, bp->hash_table_mask[1]); /* bits 31 - 16 */
-                                           bmwrite(dev, BHASH1, bp->hash_table_mask[2]); /* bits 47 - 32 */
-                                           bmwrite(dev, BHASH0, bp->hash_table_mask[3]); /* bits 63 - 48 */
-                                   }
+static void
+bmac_rx_off(struct device *dev)        /* clear RxMACEnable in RXCFG and wait for it to read back clear */
+{
+       unsigned short rx_cfg;
+
+       rx_cfg = bmread(dev, RXCFG);
+       rx_cfg &= ~RxMACEnable;         /* leave every other RXCFG bit untouched */
+       bmwrite(dev, RXCFG, rx_cfg);
+       do {    /* NOTE(review): polls with no timeout -- spins for as long as the bit reads set */
+               rx_cfg = bmread(dev, RXCFG);
+       }  while (rx_cfg & RxMACEnable);
+}
+
+unsigned short
+bmac_rx_on(struct device *dev, int hash_enable, int promisc_enable)    /* returns the RXCFG value written */
+{
+       unsigned short rx_cfg;
+
+       rx_cfg = bmread(dev, RXCFG);    /* start from the current register contents */
+       rx_cfg |= RxMACEnable;
+       if (hash_enable) rx_cfg |= RxHashFilterEnable;  /* non-zero: set the hash filter bit */
+       else rx_cfg &= ~RxHashFilterEnable;
+       if (promisc_enable) rx_cfg |= RxPromiscEnable;  /* non-zero: set the promiscuous bit */
+       else rx_cfg &= ~RxPromiscEnable;
+       bmwrite(dev, RXRST, RxResetValue);      /* RXRST is written before the FIFO and RXCFG writes */
+       bmwrite(dev, RXFIFOCSR, 0);     /* first disable rxFIFO */
+       bmwrite(dev, RXFIFOCSR, RxFIFOEnable ); /* then write RxFIFOEnable to it */
+       bmwrite(dev, RXCFG, rx_cfg );   /* the new configuration is written last */
+       return rx_cfg;
+}
+
+static void
+bmac_update_hash_table_mask(struct device *dev, struct bmac_data *bp)  /* copy bp->hash_table_mask[0..3] to the BHASH registers */
+{
+       bmwrite(dev, BHASH3, bp->hash_table_mask[0]); /* bits 15 - 0 */        /* note: index 0 goes to BHASH3 */
+       bmwrite(dev, BHASH2, bp->hash_table_mask[1]); /* bits 31 - 16 */
+       bmwrite(dev, BHASH1, bp->hash_table_mask[2]); /* bits 47 - 32 */
+       bmwrite(dev, BHASH0, bp->hash_table_mask[3]); /* bits 63 - 48 */       /* index 3 goes to BHASH0 */
+}
 
 #if 0
-                                           static void
-                                           bmac_add_multi(struct device *dev,
-                                                          struct bmac_data *bp, unsigned char *addr)
-                                           {
-/* XXDEBUG(("bmac: enter bmac_add_multi\n")); */
-                                                   bmac_addhash(bp, addr);
-                                                   bmac_rx_off(dev);
-                                                   bmac_update_hash_table_mask(dev, bp);
-                                                   bmac_rx_on(dev, 1, (dev->flags & IFF_PROMISC)? 1 : 0);
-/* XXDEBUG(("bmac: exit bmac_add_multi\n")); */
-                                           }
-
-                                                   static void
-                                                   bmac_remove_multi(struct device *dev,
-                                                                     struct bmac_data *bp, unsigned char *addr)
-                                                   {
-                                                           bmac_removehash(bp, addr);
-                                                           bmac_rx_off(dev);
-                                                           bmac_update_hash_table_mask(dev, bp);
-                                                           bmac_rx_on(dev, 1, (dev->flags & IFF_PROMISC)? 1 : 0);
-                                                   }
+static void
+bmac_add_multi(struct device *dev,
+              struct bmac_data *bp, unsigned char *addr)       /* hash addr into bp, then reload the chip filter */
+{
+       /* XXDEBUG(("bmac: enter bmac_add_multi\n")); */
+       bmac_addhash(bp, addr);                 /* update the software copy first */
+       bmac_rx_off(dev);                       /* RxMACEnable is cleared before the BHASH writes */
+       bmac_update_hash_table_mask(dev, bp);
+       bmac_rx_on(dev, 1, (dev->flags & IFF_PROMISC)? 1 : 0); /* hash filter on; promisc follows IFF_PROMISC */
+       /* XXDEBUG(("bmac: exit bmac_add_multi\n")); */
+}
+
+static void
+bmac_remove_multi(struct device *dev,
+                 struct bmac_data *bp, unsigned char *addr)    /* unhash addr from bp, then reload the chip filter */
+{
+       bmac_removehash(bp, addr);              /* update the software copy first */
+       bmac_rx_off(dev);                       /* RxMACEnable is cleared before the BHASH writes */
+       bmac_update_hash_table_mask(dev, bp);
+       bmac_rx_on(dev, 1, (dev->flags & IFF_PROMISC)? 1 : 0); /* hash filter on; promisc follows IFF_PROMISC */
+}
 #endif
 
 /* Set or clear the multicast filter for this adaptor.
@@ -843,134+850,138 @@ bmac_addhash(struct bmac_data *bp, unsigned char *addr)
     num_addrs > 0      Multicast mode, receive normal and MC packets, and do
                        best-effort filtering.
  */
-                                                           static void bmac_set_multicast(struct device *dev)
-                                                           {
-                                                                   struct dev_mc_list *dmi;
-                                                                   struct bmac_data *bp = (struct bmac_data *) dev->priv;
-                                                                   int num_addrs = dev->mc_count;
-                                                                   unsigned short rx_cfg;
-                                                                   int i;
-
-                                                                   XXDEBUG(("bmac: enter bmac_set_multicast, n_addrs=%d\n", num_addrs));
-
-                                                                   if((dev->flags & IFF_ALLMULTI) || (dev->mc_count > 64)) {
-                                                                           for (i=0; i<4; i++) bp->hash_table_mask[i] = 0xffff;
-                                                                           bmac_update_hash_table_mask(dev, bp);
-                                                                           rx_cfg = bmac_rx_on(dev, 1, 0);
-                                                                           XXDEBUG(("bmac: all multi, rx_cfg=%#08x\n"));
-                                                                   } else if if ((dev->flags & IFF_PROMISC) || (num_addrs < 0)) {
-                                                                           rx_cfg = bmread(dev, RXCFG);
-                                                                           rx_cfg |= RxPromiscEnable;
-                                                                           bmwrite(dev, RXCFG, rx_cfg);
-                                                                           rx_cfg = bmac_rx_on(dev, 0, 1);
-                                                                           XXDEBUG(("bmac: promisc mode enabled, rx_cfg=%#08x\n", rx_cfg));
-                                                                   } else {
-                                                                           for (i=0; i<4; i++) bp->hash_table_mask[i] = 0;
-                                                                           for (i=0; i<64; i++) bp->hash_use_count[i] = 0;
-                                                                           if (num_addrs == 0) {
-                                                                                   rx_cfg = bmac_rx_on(dev, 0, 0);
-                                                                                   XXDEBUG(("bmac: multi disabled, rx_cfg=%#08x\n", rx_cfg));
-                                                                           } else {
-                                                                                   for (dmi=dev->mc_list; dmi!=NULL; dmi=dmi->next)
-                                                                                           bmac_addhash(bp, dmi->dmi_addr);
-                                                                                   bmac_update_hash_table_mask(dev, bp);
-                                                                                   rx_cfg = bmac_rx_on(dev, 1, 0);
-                                                                                   XXDEBUG(("bmac: multi enabled, rx_cfg=%#08x\n", rx_cfg));
-                                                                           }
-                                                                   }
-/* XXDEBUG(("bmac: exit bmac_set_multicast\n")); */
-                                                           }
-#endif
+static void bmac_set_multicast(struct device *dev)
+{      /* Program the receive filter from dev->flags and dev->mc_list (hash-count variant). */
+       struct dev_mc_list *dmi;
+       struct bmac_data *bp = (struct bmac_data *) dev->priv;
+       int num_addrs = dev->mc_count;
+       unsigned short rx_cfg;
+       int i;
+
+       XXDEBUG(("bmac: enter bmac_set_multicast, n_addrs=%d\n", num_addrs));
+
+       if((dev->flags & IFF_ALLMULTI) || (dev->mc_count > 64)) {
+               for (i=0; i<4; i++) bp->hash_table_mask[i] = 0xffff;   /* every hash bit set */
+               bmac_update_hash_table_mask(dev, bp);
+               rx_cfg = bmac_rx_on(dev, 1, 0);
+               XXDEBUG(("bmac: all multi, rx_cfg=%#08x\n", rx_cfg));  /* argument was missing for %#08x */
+       } else if ((dev->flags & IFF_PROMISC) || (num_addrs < 0)) {
+               rx_cfg = bmread(dev, RXCFG);
+               rx_cfg |= RxPromiscEnable;
+               bmwrite(dev, RXCFG, rx_cfg);
+               rx_cfg = bmac_rx_on(dev, 0, 1); /* hash filter off, promiscuous on */
+               XXDEBUG(("bmac: promisc mode enabled, rx_cfg=%#08x\n", rx_cfg));
+       } else {
+               for (i=0; i<4; i++) bp->hash_table_mask[i] = 0;        /* rebuild the filter copy from scratch */
+               for (i=0; i<64; i++) bp->hash_use_count[i] = 0;
+               if (num_addrs == 0) {
+                       rx_cfg = bmac_rx_on(dev, 0, 0); /* no multicast list: hash filter and promisc both off */
+                       XXDEBUG(("bmac: multi disabled, rx_cfg=%#08x\n", rx_cfg));
+               } else {
+                       for (dmi=dev->mc_list; dmi!=NULL; dmi=dmi->next)
+                               bmac_addhash(bp, dmi->dmi_addr);
+                       bmac_update_hash_table_mask(dev, bp);
+                       rx_cfg = bmac_rx_on(dev, 1, 0); /* hash filter on, promisc off */
+                       XXDEBUG(("bmac: multi enabled, rx_cfg=%#08x\n", rx_cfg));
+               }
+       }
+       /* XXDEBUG(("bmac: exit bmac_set_multicast\n")); */
+}
+#else /* ifdef SUNHME_MULTICAST */
 
 /* The version of set_multicast below was lifted from sunhme.c */
 
 #define CRC_POLYNOMIAL_BE 0x04c11db7UL  /* Ethernet CRC, big endian */
 #define CRC_POLYNOMIAL_LE 0xedb88320UL  /* Ethernet CRC, little endian */
 
-                                                                   static void bmac_set_multicast(struct device *dev)
-                                                                   {
-                                                                           struct dev_mc_list *dmi = dev->mc_list;
-                                                                           char *addrs;
-                                                                           int i, j, bit, byte;
-                                                                           unsigned short rx_cfg;
-                                                                           u32 crc, poly = CRC_POLYNOMIAL_LE;
+static void bmac_set_multicast(struct device *dev)
+{
+       struct dev_mc_list *dmi = dev->mc_list;
+       char *addrs;
+       int i, j, bit, byte;
+       unsigned short rx_cfg;
+       u32 crc, poly = CRC_POLYNOMIAL_LE;
     
-                                                                           /* Let the transmits drain. */
-/*     while(dev->tbusy) schedule(); */
+       /* Let the transmits drain. */
+       /*     while(dev->tbusy) schedule(); */
     
-                                                                           /* Lock out others. */
-/*     set_bit(0, (void *) &dev->tbusy); */
+       /* Lock out others. */
+       /*     set_bit(0, (void *) &dev->tbusy); */
     
-                                                                           if((dev->flags & IFF_ALLMULTI) || (dev->mc_count > 64)) {
-                                                                                   bmwrite(dev, BHASH0, 0xffff);
-                                                                                   bmwrite(dev, BHASH1, 0xffff);
-                                                                                   bmwrite(dev, BHASH2, 0xffff);
-                                                                                   bmwrite(dev, BHASH3, 0xffff);
-                                                                           } else if(dev->flags & IFF_PROMISC) {
-                                                                                   rx_cfg = bmread(dev, RXCFG);
-                                                                                   rx_cfg |= RxPromiscEnable;
-                                                                                   bmwrite(dev, RXCFG, rx_cfg);
-                                                                           } else {
-                                                                                   u16 hash_table[4];
+       if((dev->flags & IFF_ALLMULTI) || (dev->mc_count > 64)) {
+               bmwrite(dev, BHASH0, 0xffff);
+               bmwrite(dev, BHASH1, 0xffff);
+               bmwrite(dev, BHASH2, 0xffff);
+               bmwrite(dev, BHASH3, 0xffff);
+       } else if(dev->flags & IFF_PROMISC) {
+               rx_cfg = bmread(dev, RXCFG);
+               rx_cfg |= RxPromiscEnable;
+               bmwrite(dev, RXCFG, rx_cfg);
+       } else {
+               u16 hash_table[4];
        
-                                                                                   for(i = 0; i < 4; i++) hash_table[i] = 0;
+               rx_cfg = bmread(dev, RXCFG);
+               rx_cfg &= ~RxPromiscEnable;
+               bmwrite(dev, RXCFG, rx_cfg);
+
+               for(i = 0; i < 4; i++) hash_table[i] = 0;
        
-                                                                                   for(i = 0; i < dev->mc_count; i++) {
-                                                                                           addrs = dmi->dmi_addr;
-                                                                                           dmi = dmi->next;
+               for(i = 0; i < dev->mc_count; i++) {
+                       addrs = dmi->dmi_addr;
+                       dmi = dmi->next;
            
-                                                                                           if(!(*addrs & 1))
-                                                                                                   continue;
+                       if(!(*addrs & 1))
+                               continue;
            
-                                                                                           crc = 0xffffffffU;
-                                                                                           for(byte = 0; byte < 6; byte++) {
-                                                                                                   for(bit = *addrs++, j = 0; j < 8; j++, bit >>= 1) {
-                                                                                                           int test;
+                       crc = 0xffffffffU;
+                       for(byte = 0; byte < 6; byte++) {
+                               for(bit = *addrs++, j = 0; j < 8; j++, bit >>= 1) {
+                                       int test;
                    
-                                                                                                           test = ((bit ^ crc) & 0x01);
-                                                                                                           crc >>= 1;
-                                                                                                           if(test)
-                                                                                                                   crc = crc ^ poly;
-                                                                                                   }
-                                                                                           }
-                                                                                           crc >>= 26;
-                                                                                           hash_table[crc >> 4] |= 1 << (crc & 0xf);
-                                                                                   }
-                                                                                   bmwrite(dev, BHASH0, hash_table[0]);
-                                                                                   bmwrite(dev, BHASH1, hash_table[1]);
-                                                                                   bmwrite(dev, BHASH2, hash_table[2]);
-                                                                                   bmwrite(dev, BHASH3, hash_table[3]);
-                                                                           }
+                                       test = ((bit ^ crc) & 0x01);
+                                       crc >>= 1;
+                                       if(test)
+                                               crc = crc ^ poly;
+                               }
+                       }
+                       crc >>= 26;
+                       hash_table[crc >> 4] |= 1 << (crc & 0xf);
+               }
+               bmwrite(dev, BHASH0, hash_table[0]);
+               bmwrite(dev, BHASH1, hash_table[1]);
+               bmwrite(dev, BHASH2, hash_table[2]);
+               bmwrite(dev, BHASH3, hash_table[3]);
+       }
     
-                                                                           /* Let us get going again. */
-/*     dev->tbusy = 0; */
-                                                                   }
-
-
-                                                                           static int miscintcount = 0;
-
-                                                                           static void bmac_misc_intr(int irq, void *dev_id, struct pt_regs *regs)
-                                                                           {
-                                                                                   struct device *dev = (struct device *) dev_id;
-                                                                                   struct bmac_data *bp = (struct bmac_data *)dev->priv;
-                                                                                   unsigned int status = bmread(dev, STATUS);
-                                                                                   if (miscintcount++ < 10) {
-                                                                                           XXDEBUG(("bmac_misc_intr\n"));
-                                                                                   }
-/* XXDEBUG(("bmac_misc_intr, status=%#08x\n", status)); */
-/*     bmac_txdma_intr_inner(irq, dev_id, regs); */
-/*   if (status & FrameReceived) bp->stats.rx_dropped++; */
-                                                                                   if (status & RxErrorMask) bp->stats.rx_errors++;
-                                                                                   if (status & RxCRCCntExp) bp->stats.rx_crc_errors++;
-                                                                                   if (status & RxLenCntExp) bp->stats.rx_length_errors++;
-                                                                                   if (status & RxOverFlow) bp->stats.rx_over_errors++;
-                                                                                   if (status & RxAlignCntExp) bp->stats.rx_frame_errors++;
-
-/*   if (status & FrameSent) bp->stats.tx_dropped++; */
-                                                                                   if (status & TxErrorMask) bp->stats.tx_errors++;
-                                                                                   if (status & TxUnderrun) bp->stats.tx_fifo_errors++;
-                                                                                   if (status & TxNormalCollExp) bp->stats.collisions++;
-                                                                           }
+       /* Let us get going again. */
+       /*     dev->tbusy = 0; */
+}
+#endif /* SUNHME_MULTICAST */
+
+static int miscintcount = 0;	/* counts calls to bmac_misc_intr; only used to limit the debug trace below */
+
+static void bmac_misc_intr(int irq, void *dev_id, struct pt_regs *regs)	/* interrupt handler: reads STATUS once and bumps bp->stats counters for the bits that are set */
+{
+       struct device *dev = (struct device *) dev_id;	/* dev_id carries the struct device for this interface */
+       struct bmac_data *bp = (struct bmac_data *)dev->priv;
+       unsigned int status = bmread(dev, STATUS);	/* single snapshot; every test below uses this value */
+       if (miscintcount++ < 10) {	/* trace only the first ten invocations */
+               XXDEBUG(("bmac_misc_intr\n"));
+       }
+       /* XXDEBUG(("bmac_misc_intr, status=%#08x\n", status)); */
+       /*     bmac_txdma_intr_inner(irq, dev_id, regs); */
+       /*   if (status & FrameReceived) bp->stats.rx_dropped++; */
+       if (status & RxErrorMask) bp->stats.rx_errors++;	/* receive side: each counter goes up by at most one per interrupt */
+       if (status & RxCRCCntExp) bp->stats.rx_crc_errors++;
+       if (status & RxLenCntExp) bp->stats.rx_length_errors++;
+       if (status & RxOverFlow) bp->stats.rx_over_errors++;
+       if (status & RxAlignCntExp) bp->stats.rx_frame_errors++;
+
+       /*   if (status & FrameSent) bp->stats.tx_dropped++; */
+       if (status & TxErrorMask) bp->stats.tx_errors++;	/* transmit side, same one-increment-per-interrupt rule */
+       if (status & TxUnderrun) bp->stats.tx_fifo_errors++;
+       if (status & TxNormalCollExp) bp->stats.collisions++;
+}
 
 /*
  * Procedure for reading EEPROM 
@@ -988,464+999,467 @@ bmac_addhash(struct bmac_data *bp, unsigned char *addr)
 #define SROMAddressBits                6
 #define EnetAddressOffset      20
 
-                                                                                   static unsigned char
-                                                                                   bmac_clock_out_bit(struct device *dev)
-                                                                                   {
-                                                                                           unsigned short         data;
-                                                                                           unsigned short         val;
+static unsigned char
+bmac_clock_out_bit(struct device *dev)
+{
+       unsigned short         data;
+       unsigned short         val;
 
-                                                                                           bmwrite(dev, SROMCSR, ChipSelect | Clk);
-                                                                                           udelay(DelayValue);
+       bmwrite(dev, SROMCSR, ChipSelect | Clk);
+       udelay(DelayValue);
     
-                                                                                           data = bmread(dev, SROMCSR);
-                                                                                           udelay(DelayValue);
-                                                                                           val = (data >> SD0ShiftCount) & 1;
+       data = bmread(dev, SROMCSR);
+       udelay(DelayValue);
+       val = (data >> SD0ShiftCount) & 1;
 
-                                                                                           bmwrite(dev, SROMCSR, ChipSelect);
-                                                                                           udelay(DelayValue);
+       bmwrite(dev, SROMCSR, ChipSelect);
+       udelay(DelayValue);
     
-                                                                                           return val;
-                                                                                   }
+       return val;
+}
 
-                                                                                           static void
-                                                                                           bmac_clock_in_bit(struct device *dev, unsigned int val)
-                                                                                           {
-                                                                                                   unsigned short              data;    
+static void
+bmac_clock_in_bit(struct device *dev, unsigned int val)
+{
+       unsigned short          data;    
 
-                                                                                                   if (val != 0 && val != 1) return;
+       if (val != 0 && val != 1) return;
     
-                                                                                                   data = (val << SDIShiftCount);
-                                                                                                   bmwrite(dev, SROMCSR, data | ChipSelect  );
-                                                                                                   udelay(DelayValue);
+       data = (val << SDIShiftCount);
+       bmwrite(dev, SROMCSR, data | ChipSelect  );
+       udelay(DelayValue);
     
-                                                                                                   bmwrite(dev, SROMCSR, data | ChipSelect | Clk );
-                                                                                                   udelay(DelayValue);
-
-                                                                                                   bmwrite(dev, SROMCSR, data | ChipSelect);
-                                                                                                   udelay(DelayValue);
-                                                                                           }
-
-                                                                                                   static void
-                                                                                                   reset_and_select_srom(struct device *dev)
-                                                                                                   {
-                                                                                                           /* first reset */
-                                                                                                           bmwrite(dev, SROMCSR, 0);
-                                                                                                           udelay(DelayValue);
+       bmwrite(dev, SROMCSR, data | ChipSelect | Clk );
+       udelay(DelayValue);
+
+       bmwrite(dev, SROMCSR, data | ChipSelect);
+       udelay(DelayValue);
+}
+
+static void
+reset_and_select_srom(struct device *dev)
+{
+       /* first reset */
+       bmwrite(dev, SROMCSR, 0);
+       udelay(DelayValue);
     
-                                                                                                           /* send it the read command (110) */
-                                                                                                           bmac_clock_in_bit(dev, 1);
-                                                                                                           bmac_clock_in_bit(dev, 1);
-                                                                                                           bmac_clock_in_bit(dev, 0);
-                                                                                                   }
-
-                                                                                                           static unsigned short
-                                                                                                           read_srom(struct device *dev, unsigned int addr, unsigned int addr_len)
-                                                                                                           {
-                                                                                                                   unsigned short data, val;
-                                                                                                                   int i;
+       /* send it the read command (110) */
+       bmac_clock_in_bit(dev, 1);
+       bmac_clock_in_bit(dev, 1);
+       bmac_clock_in_bit(dev, 0);
+}
+
+static unsigned short
+read_srom(struct device *dev, unsigned int addr, unsigned int addr_len)
+{
+       unsigned short data, val;
+       int i;
     
-                                                                                                                   /* send out the address we want to read from */
-                                                                                                                   for (i = 0; i < addr_len; i++)      {
-                                                                                                                           val = addr >> (addr_len-i-1);
-                                                                                                                           bmac_clock_in_bit(dev, val & 1);
-                                                                                                                   }
+       /* send out the address we want to read from */
+       for (i = 0; i < addr_len; i++)  {
+               val = addr >> (addr_len-i-1);
+               bmac_clock_in_bit(dev, val & 1);
+       }
     
-                                                                                                                   /* Now read in the 16-bit data */
-                                                                                                                   data = 0;
-                                                                                                                   for (i = 0; i < 16; i++)    {
-                                                                                                                           val = bmac_clock_out_bit(dev);
-                                                                                                                           data <<= 1;
-                                                                                                                           data |= val;
-                                                                                                                   }
-                                                                                                                   bmwrite(dev, SROMCSR, 0);
+       /* Now read in the 16-bit data */
+       data = 0;
+       for (i = 0; i < 16; i++)        {
+               val = bmac_clock_out_bit(dev);
+               data <<= 1;
+               data |= val;
+       }
+       bmwrite(dev, SROMCSR, 0);
     
-                                                                                                                   return data;
-                                                                                                           }
+       return data;
+}
 
 /*
  * It looks like Cogent and SMC use different methods for calculating
  * checksums. What a pain.. 
  */
 
-                                                                                                                   static int
-                                                                                                                   bmac_verify_checksum(struct device *dev)
-                                                                                                                   {
-                                                                                                                           unsigned short data, storedCS;
+static int
+bmac_verify_checksum(struct device *dev)
+{
+       unsigned short data, storedCS;
     
-                                                                                                                           reset_and_select_srom(dev);
-                                                                                                                           data = read_srom(dev, 3, SROMAddressBits);
-                                                                                                                           storedCS = ((data >> 8) & 0x0ff) | ((data << 8) & 0xff00);
+       reset_and_select_srom(dev);
+       data = read_srom(dev, 3, SROMAddressBits);
+       storedCS = ((data >> 8) & 0x0ff) | ((data << 8) & 0xff00);
     
-                                                                                                                           return 0;
-                                                                                                                   }
-
-
-                                                                                                                           static void
-                                                                                                                           bmac_get_station_address(struct device *dev, unsigned char *ea)
-                                                                                                                           {
-                                                                                                                                   int i;
-                                                                                                                                   unsigned short data;
-
-                                                                                                                                   for (i = 0; i < 6; i++)     
-                                                                                                                                   {
-                                                                                                                                           reset_and_select_srom(dev);
-                                                                                                                                           data = read_srom(dev, i + EnetAddressOffset/2, SROMAddressBits);
-                                                                                                                                           ea[2*i]   = bitrev(data & 0x0ff);
-                                                                                                                                           ea[2*i+1] = bitrev((data >> 8) & 0x0ff);
-                                                                                                                                   }
-                                                                                                                           }
-
-                                                                                                                                   static int bmac_reset_and_enable(struct device *dev, int enable)
-                                                                                                                                   {
-                                                                                                                                           struct bmac_data *bp = dev->priv;
-                                                                                                                                           unsigned long flags;
-
-                                                                                                                                           save_flags(flags); cli();
-                                                                                                                                           bp->reset_and_enabled = 0;
-                                                                                                                                           bmac_reset_chip(dev);
-                                                                                                                                           if (enable) {
-                                                                                                                                                   if (!bmac_init_tx_ring(bp) || !bmac_init_rx_ring(bp)) return 0;
-                                                                                                                                                   if (!bmac_init_chip(dev)) return 0;
-                                                                                                                                                   bmac_start_chip(dev);
-                                                                                                                                                   bmwrite(dev, INTDISABLE, EnableNormal);
-                                                                                                                                                   bp->reset_and_enabled = 1;
-/*     { */
-/*         unsigned char random_packet[100]; */
-/*         unsigned int i; */
-/*         struct sk_buff *skb = dev_alloc_skb(RX_BUFLEN+2); */
-/*         unsigned char *data = skb_put(skb, sizeof(random_packet)); */
-/* XXDEBUG(("transmitting random packet\n")); */
-/*         for (i = 0; i < sizeof(random_packet); i++) data[i] = i; */
-/*         bmac_transmit_packet(skb, dev); */
-/* XXDEBUG(("done transmitting random packet\n")); */
-/*     } */
-                                                                                                                                           }
-                                                                                                                                           restore_flags(flags);
-                                                                                                                                           return 1;
-                                                                                                                                   }
-
-                                                                                                                                           int
-                                                                                                                                           bmac_probe(struct device *dev)
-                                                                                                                                           {
-                                                                                                                                                   int j, rev;
-                                                                                                                                                   struct bmac_data *bp;
-                                                                                                                                                   struct device_node *bmacs;
-                                                                                                                                                   unsigned char *addr;
-
-                                                                                                                                                   bmacs = find_devices("bmac");
-                                                                                                                                                   if (bmacs == NULL) return ENODEV;
-
-                                                                                                                                                   bmac_devs = dev; /* KLUDGE!! */
-
-                                                                                                                                                   if (bmacs->n_addrs != 3 || bmacs->n_intrs != 3) {
-                                                                                                                                                           printk(KERN_ERR "can't use BMAC %s: expect 3 addrs and 3 intrs\n",
-                                                                                                                                                                  bmacs->full_name);
-                                                                                                                                                           return EINVAL;
-                                                                                                                                                   }
+       return 0;
+}
+
+
+/*
+ * Read the station (Ethernet) address out of the SROM into ea[0..5].
+ *
+ * Each read_srom() call returns one 16-bit word, i.e. two address bytes,
+ * so three words cover the whole 6-byte address.  The loop used to run
+ * six times and therefore stored twelve bytes, writing past the end of a
+ * 6-byte destination buffer.
+ *
+ * dev: interface whose SROM is read.
+ * ea:  destination buffer; must have room for 6 bytes.
+ */
+static void
+bmac_get_station_address(struct device *dev, unsigned char *ea)
+{
+       int i;
+       unsigned short data;
+
+       for (i = 0; i < 3; i++)
+               {
+                       /* A reset + read command is issued before every word. */
+                       reset_and_select_srom(dev);
+                       /* EnetAddressOffset/2 is the starting word index (presumably the offset is in bytes -- confirm). */
+                       data = read_srom(dev, i + EnetAddressOffset/2, SROMAddressBits);
+                       /* Low byte first, then high byte, each bit-reversed. */
+                       ea[2*i]   = bitrev(data & 0x0ff);
+                       ea[2*i+1] = bitrev((data >> 8) & 0x0ff);
+               }
+}
+
+/*
+ * Reset the chip and, when 'enable' is non-zero, set up the tx and rx
+ * rings, initialise and start the chip, and mark it as reset-and-enabled.
+ * The whole sequence runs with interrupts disabled.
+ *
+ * Returns 1 on success (or when only a reset was requested) and 0 if ring
+ * or chip initialisation fails.  The saved interrupt flags are restored on
+ * every exit path; the failure paths used to return with interrupts still
+ * disabled.
+ */
+static int bmac_reset_and_enable(struct device *dev, int enable)
+{
+       struct bmac_data *bp = dev->priv;
+       unsigned long flags;
+       int ok = 0;
+
+       save_flags(flags); cli();
+       bp->reset_and_enabled = 0;
+       bmac_reset_chip(dev);
+       if (enable) {
+               if (!bmac_init_tx_ring(bp) || !bmac_init_rx_ring(bp)) goto out;
+               if (!bmac_init_chip(dev)) goto out;
+               bmac_start_chip(dev);
+               bmwrite(dev, INTDISABLE, EnableNormal);
+               bp->reset_and_enabled = 1;
+               /*      { */
+               /*          unsigned char random_packet[100]; */
+               /*          unsigned int i; */
+               /*          struct sk_buff *skb = dev_alloc_skb(RX_BUFLEN+2); */
+               /*          unsigned char *data = skb_put(skb, sizeof(random_packet)); */
+               /* XXDEBUG(("transmitting random packet\n")); */
+               /*          for (i = 0; i < sizeof(random_packet); i++) data[i] = i; */
+               /*          bmac_transmit_packet(skb, dev); */
+               /* XXDEBUG(("done transmitting random packet\n")); */
+               /*      } */
+       }
+       ok = 1;
+out:
+       /* Always undo the cli() above, including on the failure paths. */
+       restore_flags(flags);
+       return ok;
+}
+
+int
+bmac_probe(struct device *dev)
+{
+       int j, rev;
+       struct bmac_data *bp;
+       struct device_node *bmacs;
+       unsigned char *addr;
+       static struct device_node *all_bmacs = NULL, *next_bmac;
+
+       if (all_bmacs == NULL)
+               all_bmacs = next_bmac = find_devices("bmac");
+       bmacs = next_bmac;
+       if (bmacs == NULL) return -ENODEV;
+       next_bmac = bmacs->next;
+
+       bmac_devs = dev; /* KLUDGE!! */
+
+       if (bmacs->n_addrs != 3 || bmacs->n_intrs != 3) {
+               printk(KERN_ERR "can't use BMAC %s: expect 3 addrs and 3 intrs\n",
+                      bmacs->full_name);
+               return -EINVAL;
+       }
     
-                                                                                                                                                   if (dev == NULL) {
-                                                                                                                                                           dev = init_etherdev(NULL, PRIV_BYTES);
-                                                                                                                                                           bmac_devs = dev;  /*KLUDGE!!*/
-                                                                                                                                                   } else {
-                                                                                                                                                           /* XXX this doesn't look right (but it's never used :-) */
-                                                                                                                                                           dev->priv = kmalloc(PRIV_BYTES, GFP_KERNEL);
-                                                                                                                                                           if (dev->priv == 0) return -ENOMEM;
-                                                                                                                                                   }
+       if (dev == NULL) {
+               dev = init_etherdev(NULL, PRIV_BYTES);
+               bmac_devs = dev;  /*KLUDGE!!*/
+       } else {
+               /* XXX this doesn't look right (but it's never used :-) */
+               dev->priv = kmalloc(PRIV_BYTES, GFP_KERNEL);
+               if (dev->priv == 0) return -ENOMEM;
+       }
     
-                                                                                                                                                   dev->base_addr = bmacs->addrs[0].address;
-                                                                                                                                                   dev->irq = bmacs->intrs[0].line;
+       dev->base_addr = bmacs->addrs[0].address;
+       dev->irq = bmacs->intrs[0].line;
     
-                                                                                                                                                   bmwrite(dev, INTDISABLE, DisableAll);
-
-                                                                                                                                                   if (request_irq(dev->irq, bmac_misc_intr, 0, "BMAC-misc", dev)) {
-                                                                                                                                                           printk(KERN_ERR "BMAC: can't get irq %d\n", dev->irq);
-                                                                                                                                                           return -EAGAIN;
-                                                                                                                                                   }
-                                                                                                                                                   if (request_irq(bmacs->intrs[1].line, bmac_txdma_intr, 0, "BMAC-txdma",
-                                                                                                                                                                   dev)) {
-                                                                                                                                                           printk(KERN_ERR "BMAC: can't get irq %d\n", bmacs->intrs[1].line);
-                                                                                                                                                           return -EAGAIN;
-                                                                                                                                                   }
-                                                                                                                                                   if (request_irq(bmacs->intrs[2].line, bmac_rxdma_intr, 0, "BMAC-rxdma",
-                                                                                                                                                                   dev)) {
-                                                                                                                                                           printk(KERN_ERR "BMAC: can't get irq %d\n", bmacs->intrs[2].line);
-                                                                                                                                                           return -EAGAIN;
-                                                                                                                                                   }
+       bmwrite(dev, INTDISABLE, DisableAll);
     
-                                                                                                                                                   addr = get_property(bmacs, "mac-address", NULL);
-                                                                                                                                                   if (addr == NULL) {
-                                                                                                                                                           addr = get_property(bmacs, "local-mac-address", NULL);
-                                                                                                                                                           if (addr == NULL) {
-                                                                                                                                                                   printk(KERN_ERR "Can't get mac-address for BMAC at %lx\n",
-                                                                                                                                                                          dev->base_addr);
-                                                                                                                                                                   return -EAGAIN;
-                                                                                                                                                           }
-                                                                                                                                                   }
+       addr = get_property(bmacs, "mac-address", NULL);
+       if (addr == NULL) {
+               addr = get_property(bmacs, "local-mac-address", NULL);
+               if (addr == NULL) {
+                       printk(KERN_ERR "Can't get mac-address for BMAC at %lx\n",
+                              dev->base_addr);
+                       return -EAGAIN;
+               }
+       }
     
-                                                                                                                                                   printk(KERN_INFO "%s: BMAC at", dev->name);
-                                                                                                                                                   rev = addr[0] == 0 && addr[1] == 0xA0;
-                                                                                                                                                   for (j = 0; j < 6; ++j) {
-                                                                                                                                                           dev->dev_addr[j] = rev? bitrev(addr[j]): addr[j];
-                                                                                                                                                           printk("%c%.2x", (j? ':': ' '), dev->dev_addr[j]);
-                                                                                                                                                   }
-                                                                                                                                                   XXDEBUG((", base_addr=%#0lx", dev->base_addr));
-                                                                                                                                                   printk("\n");
+       printk(KERN_INFO "%s: BMAC at", dev->name);
+       rev = addr[0] == 0 && addr[1] == 0xA0;
+       for (j = 0; j < 6; ++j) {
+               dev->dev_addr[j] = rev? bitrev(addr[j]): addr[j];
+               printk("%c%.2x", (j? ':': ' '), dev->dev_addr[j]);
+       }
+       XXDEBUG((", base_addr=%#0lx", dev->base_addr));
+       printk("\n");
     
-                                                                                                                                                   dev->open = bmac_open;
-                                                                                                                                                   dev->stop = bmac_close;
-                                                                                                                                                   dev->hard_start_xmit = bmac_output;
-                                                                                                                                                   dev->get_stats = bmac_stats;
-                                                                                                                                                   dev->set_multicast_list = bmac_set_multicast;
-                                                                                                                                                   dev->set_mac_address = bmac_set_address;
-
-                                                                                                                                                   bmac_get_station_address(dev, addr);
-                                                                                                                                                   if (bmac_verify_checksum(dev) != 0) return EINVAL;
+       dev->open = bmac_open;
+       dev->stop = bmac_close;
+       dev->hard_start_xmit = bmac_output;
+       dev->get_stats = bmac_stats;
+       dev->set_multicast_list = bmac_set_multicast;
+       dev->set_mac_address = bmac_set_address;
+
+       bmac_get_station_address(dev, addr);
+       if (bmac_verify_checksum(dev) != 0) return -EINVAL;
     
-                                                                                                                                                   ether_setup(dev);
+       ether_setup(dev);
     
-                                                                                                                                                   bp = (struct bmac_data *) dev->priv;
-                                                                                                                                                   memset(bp, 0, sizeof(struct bmac_data));
-                                                                                                                                                   bp->tx_dma = (volatile struct dbdma_regs *) bmacs->addrs[1].address;
-                                                                                                                                                   bp->tx_dma_intr = bmacs->intrs[1].line;
-                                                                                                                                                   bp->rx_dma = (volatile struct dbdma_regs *) bmacs->addrs[2].address;
-                                                                                                                                                   bp->rx_dma_intr = bmacs->intrs[2].line;
+       bp = (struct bmac_data *) dev->priv;
+       memset(bp, 0, sizeof(struct bmac_data));
+       bp->tx_dma = (volatile struct dbdma_regs *) bmacs->addrs[1].address;
+       bp->tx_dma_intr = bmacs->intrs[1].line;
+       bp->rx_dma = (volatile struct dbdma_regs *) bmacs->addrs[2].address;
+       bp->rx_dma_intr = bmacs->intrs[2].line;
     
-                                                                                                                                                   bp->tx_cmds = (volatile struct dbdma_cmd *) DBDMA_ALIGN(bp + 1);
-                                                                                                                                                   bp->rx_cmds = bp->tx_cmds + N_TX_RING + 1;
+       bp->tx_cmds = (volatile struct dbdma_cmd *) DBDMA_ALIGN(bp + 1);
+       bp->rx_cmds = bp->tx_cmds + N_TX_RING + 1;
 
-                                                                                                                                                   bp->queue = (struct sk_buff_head *)(bp->rx_cmds + N_RX_RING + 1);
-                                                                                                                                                   skb_queue_head_init(bp->queue);
+       bp->queue = (struct sk_buff_head *)(bp->rx_cmds + N_RX_RING + 1);
+       skb_queue_head_init(bp->queue);
     
-                                                                                                                                                   memset(&bp->stats, 0, sizeof(bp->stats));
-                                                                                                                                                   memset((char *) bp->tx_cmds, 0,
-                                                                                                                                                          (N_TX_RING + N_RX_RING + 2) * sizeof(struct dbdma_cmd));
-/*     init_timer(&bp->tx_timeout); */
-/*     bp->timeout_active = 0; */
+       memset(&bp->stats, 0, sizeof(bp->stats));
+       memset((char *) bp->tx_cmds, 0,
+              (N_TX_RING + N_RX_RING + 2) * sizeof(struct dbdma_cmd));
+       /*     init_timer(&bp->tx_timeout); */
+       /*     bp->timeout_active = 0; */
+
+       if (request_irq(dev->irq, bmac_misc_intr, 0, "BMAC-misc", dev)) {
+               printk(KERN_ERR "BMAC: can't get irq %d\n", dev->irq);
+               return -EAGAIN;
+       }
+       if (request_irq(bmacs->intrs[1].line, bmac_txdma_intr, 0, "BMAC-txdma",
+                       dev)) {
+               printk(KERN_ERR "BMAC: can't get irq %d\n", bmacs->intrs[1].line);
+               return -EAGAIN;
+       }
+       if (request_irq(bmacs->intrs[2].line, bmac_rxdma_intr, 0, "BMAC-rxdma",
+                       dev)) {
+               printk(KERN_ERR "BMAC: can't get irq %d\n", bmacs->intrs[2].line);
+               return -EAGAIN;
+       }
+    
+       if (!bmac_reset_and_enable(dev, 0)) return -ENOMEM;
     
-                                                                                                                                                   if (!bmac_reset_and_enable(dev, 0)) return EINVAL;
-
 #ifdef CONFIG_PROC_FS
-                                                                                                                                                   proc_net_register(&(struct proc_dir_entry) {
-                                                                                                                                                           PROC_NET_BMAC, 4, "bmac",
-                                                                                                                                                                   S_IFREG | S_IRUGO, 1, 0, 0,
-                                                                                                                                                                   0, &proc_net_inode_operations,
-                                                                                                                                                                   bmac_proc_info
-                                                                                                                                                                   });
+       proc_net_register(&(struct proc_dir_entry) {
+               PROC_NET_BMAC, 4, "bmac",
+                       S_IFREG | S_IRUGO, 1, 0, 0,
+                       0, &proc_net_inode_operations,
+                       bmac_proc_info
+                       });
 #endif
 
-                                                                                                                                                   return 0;
-                                                                                                                                           }
-
-                                                                                                                                                   static int bmac_open(struct device *dev)
-                                                                                                                                                   {
-/* XXDEBUG(("bmac: enter open\n")); */
-                                                                                                                                                           /* reset the chip */
-                                                                                                                                                           bmac_reset_and_enable(dev, 1);
-
-                                                                                                                                                           dev->flags |= IFF_UP | IFF_RUNNING;
-
-                                                                                                                                                           return 0;
-                                                                                                                                                   }
-
-                                                                                                                                                           static int bmac_close(struct device *dev)
-                                                                                                                                                           {
-                                                                                                                                                                   struct bmac_data *bp = (struct bmac_data *) dev->priv;
-                                                                                                                                                                   volatile struct dbdma_regs *rd = bp->rx_dma;
-                                                                                                                                                                   volatile struct dbdma_regs *td = bp->tx_dma;
-                                                                                                                                                                   unsigned short config;
-                                                                                                                                                                   int i;
-
-                                                                                                                                                                   dev->flags &= ~(IFF_UP | IFF_RUNNING);
-
-                                                                                                                                                                   /* disable rx and tx */
-                                                                                                                                                                   config = bmread(dev, RXCFG);
-                                                                                                                                                                   bmwrite(dev, RXCFG, (config & ~RxMACEnable));
-
-                                                                                                                                                                   config = bmread(dev, TXCFG);
-                                                                                                                                                                   bmwrite(dev, TXCFG, (config & ~TxMACEnable));
-
-                                                                                                                                                                   bmwrite(dev, INTDISABLE, DisableAll); /* disable all intrs */
-
-                                                                                                                                                                   /* disable rx and tx dma */
-                                                                                                                                                                   st_le32(&rd->control, DBDMA_CLEAR(RUN|PAUSE|FLUSH|WAKE));   /* clear run bit */
-                                                                                                                                                                   st_le32(&td->control, DBDMA_CLEAR(RUN|PAUSE|FLUSH|WAKE));   /* clear run bit */
-
-                                                                                                                                                                   /* free some skb's */
-                                                                                                                                                                   XXDEBUG(("bmac: free rx bufs\n"));
-                                                                                                                                                                   for (i=0; i<N_RX_RING; i++) {
-                                                                                                                                                                           if (bp->rx_bufs[i] != NULL) {
-                                                                                                                                                                                   dev_kfree_skb(bp->rx_bufs[i]);
-                                                                                                                                                                                   bp->rx_bufs[i] = NULL;
-                                                                                                                                                                           }
-                                                                                                                                                                   }
-                                                                                                                                                                   bp->rx_allocated = 0;
-                                                                                                                                                                   XXDEBUG(("bmac: free doubles\n"));/*MEMORY LEAK BELOW!!! FIX!!! */
-                                                                                                                                                                   if (bp->tx_double[0] != NULL) kfree(bp->tx_double[0]);
-                                                                                                                                                                   XXDEBUG(("bmac: free tx bufs\n"));
-                                                                                                                                                                   for (i = 0; i<N_TX_RING; i++) {
-                                                                                                                                                                           if (bp->tx_bufs[i] != NULL) {
-                                                                                                                                                                                   dev_kfree_skb(bp->tx_bufs[i]);
-                                                                                                                                                                                   bp->tx_bufs[i] = NULL;
-                                                                                                                                                                           }
-                                                                                                                                                                   }
-                                                                                                                                                                   bp->tx_allocated = 0;
-                                                                                                                                                                   bp->reset_and_enabled = 0;
-                                                                                                                                                                   XXDEBUG(("bmac: all bufs freed\n"));
-
-                                                                                                                                                                   return 0;
-                                                                                                                                                           }
-
-                                                                                                                                                                   static void
-                                                                                                                                                                   bmac_start(struct device *dev)
-                                                                                                                                                                   {
-                                                                                                                                                                           struct bmac_data *bp = dev->priv;
-                                                                                                                                                                           int i;
-                                                                                                                                                                           struct sk_buff *skb;
-                                                                                                                                                                           unsigned long flags;
+       return 0;
+}
+
+static int bmac_open(struct device *dev)       /* open handler (the probe code installs it as dev->open): re-initialise the chip and flag the interface up */
+{
+       /* XXDEBUG(("bmac: enter open\n")); */
+       /* reset the chip */
+       bmac_reset_and_enable(dev, 1);  /* NOTE(review): result is discarded here, while the probe path treats a zero result as failure -- confirm this is intended */
+
+       dev->flags |= IFF_UP | IFF_RUNNING;     /* mark the interface as up and running */
+
+       return 0;       /* success is reported unconditionally */
+}
+
+static int bmac_close(struct device *dev)      /* stop handler (the probe code installs it as dev->stop): disable MAC, interrupts and DMA, then release ring buffers; always returns 0 */
+{
+       struct bmac_data *bp = (struct bmac_data *) dev->priv; /* driver-private state hung off the device */
+       volatile struct dbdma_regs *rd = bp->rx_dma;    /* receive DBDMA register block */
+       volatile struct dbdma_regs *td = bp->tx_dma;    /* transmit DBDMA register block */
+       unsigned short config;  /* scratch copy of RXCFG / TXCFG for read-modify-write */
+       int i;
+
+       dev->flags &= ~(IFF_UP | IFF_RUNNING);  /* clear the up/running flags before the hardware is shut down */
+
+       /* disable rx and tx */
+       config = bmread(dev, RXCFG);
+       bmwrite(dev, RXCFG, (config & ~RxMACEnable));   /* write back with only RxMACEnable cleared */
+
+       config = bmread(dev, TXCFG);
+       bmwrite(dev, TXCFG, (config & ~TxMACEnable));   /* write back with only TxMACEnable cleared */
+
+       bmwrite(dev, INTDISABLE, DisableAll); /* disable all intrs */
+
+       /* disable rx and tx dma */
+       st_le32(&rd->control, DBDMA_CLEAR(RUN|PAUSE|FLUSH|WAKE));       /* clear run bit */
+       st_le32(&td->control, DBDMA_CLEAR(RUN|PAUSE|FLUSH|WAKE));       /* clear run bit */
+
+       /* free some skb's */
+       XXDEBUG(("bmac: free rx bufs\n"));
+       for (i=0; i<N_RX_RING; i++) {   /* release every receive buffer still held in the ring */
+               if (bp->rx_bufs[i] != NULL) {
+                       dev_kfree_skb(bp->rx_bufs[i]);
+                       bp->rx_bufs[i] = NULL;  /* slot is now empty */
+               }
+       }
+       bp->rx_allocated = 0;
+       XXDEBUG(("bmac: free doubles\n"));/*MEMORY LEAK BELOW!!! FIX!!! */
+       if (bp->tx_double[0] != NULL) kfree(bp->tx_double[0]);  /* NOTE(review): only element [0] is freed and the pointer is not reset to NULL -- check against the allocation site, which is not visible here */
+       XXDEBUG(("bmac: free tx bufs\n"));
+       for (i = 0; i<N_TX_RING; i++) { /* release every transmit buffer still held in the ring */
+               if (bp->tx_bufs[i] != NULL) {
+                       dev_kfree_skb(bp->tx_bufs[i]);
+                       bp->tx_bufs[i] = NULL;  /* slot is now empty */
+               }
+       }
+       bp->tx_allocated = 0;
+       bp->reset_and_enabled = 0;      /* presumably tells later code the chip must be re-initialised -- TODO confirm where this flag is read */
+       XXDEBUG(("bmac: all bufs freed\n"));
+
+       return 0;       /* success is reported unconditionally */
+}
+
+static void
+bmac_start(struct device *dev)
+{
+       struct bmac_data *bp = dev->priv;
+       int i;
+       struct sk_buff *skb;
+       unsigned long flags;
     
-                                                                                                                                                                           save_flags(flags); cli();
-                                                                                                                                                                           while (1) {
-                                                                                                                                                                                   i = bp->tx_fill + 1;
-                                                                                                                                                                                   if (i >= N_TX_RING) i = 0;
-                                                                                                                                                                                   if (i == bp->tx_empty) break;
-                                                                                                                                                                                   skb = skb_dequeue(bp->queue);
-                                                                                                                                                                                   if (skb == NULL) break;
-                                                                                                                                                                                   bmac_transmit_packet(skb, dev);
-                                                                                                                                                                           }
-                                                                                                                                                                           restore_flags(flags);
-                                                                                                                                                                   }
-
-                                                                                                                                                                           static int
-                                                                                                                                                                           bmac_output(struct sk_buff *skb, struct device *dev)
-                                                                                                                                                                           {
-                                                                                                                                                                                   struct bmac_data *bp = dev->priv;
-                                                                                                                                                                                   skb_queue_tail(bp->queue, skb);
-                                                                                                                                                                                   bmac_start(dev);
-                                                                                                                                                                                   return 0;
-                                                                                                                                                                           }
-
-                                                                                                                                                                                   static void bmac_tx_timeout(unsigned long data)
-                                                                                                                                                                                   {
-                                                                                                                                                                                           struct device *dev = (struct device *) data;
-                                                                                                                                                                                           struct bmac_data *bp = (struct bmac_data *) dev->priv;
-                                                                                                                                                                                           volatile struct dbdma_regs *td = bp->tx_dma;
-                                                                                                                                                                                           volatile struct dbdma_regs *rd = bp->rx_dma;
-                                                                                                                                                                                           volatile struct dbdma_cmd *cp;
-                                                                                                                                                                                           unsigned long flags;
-                                                                                                                                                                                           unsigned short config, oldConfig;
-                                                                                                                                                                                           int i;
-
-                                                                                                                                                                                           XXDEBUG(("bmac: tx_timeout called\n"));
-                                                                                                                                                                                           save_flags(flags); cli();
-                                                                                                                                                                                           bp->timeout_active = 0;
-
-                                                                                                                                                                                           /* update various counters */
-/*     bmac_handle_misc_intrs(bp, 0); */
-
-                                                                                                                                                                                           cp = &bp->tx_cmds[bp->tx_empty];
-/*     XXDEBUG((KERN_DEBUG "bmac: tx dmastat=%x %x runt=%d pr=%x fs=%x fc=%x\n", */
+       save_flags(flags); cli();
+       while (1) {
+               i = bp->tx_fill + 1;
+               if (i >= N_TX_RING) i = 0;
+               if (i == bp->tx_empty) break;
+               skb = skb_dequeue(bp->queue);
+               if (skb == NULL) break;
+               bmac_transmit_packet(skb, dev);
+       }
+       restore_flags(flags);
+}
+
+static int
+bmac_output(struct sk_buff *skb, struct device *dev)
+{
+       struct bmac_data *bp = dev->priv;
+       skb_queue_tail(bp->queue, skb);
+       bmac_start(dev);
+       return 0;
+}
+
+static void bmac_tx_timeout(unsigned long data)
+{
+       struct device *dev = (struct device *) data;
+       struct bmac_data *bp = (struct bmac_data *) dev->priv;
+       volatile struct dbdma_regs *td = bp->tx_dma;
+       volatile struct dbdma_regs *rd = bp->rx_dma;
+       volatile struct dbdma_cmd *cp;
+       unsigned long flags;
+       unsigned short config, oldConfig;
+       int i;
+
+       XXDEBUG(("bmac: tx_timeout called\n"));
+       save_flags(flags); cli();
+       bp->timeout_active = 0;
+
+       /* update various counters */
+/*             bmac_handle_misc_intrs(bp, 0); */
+
+       cp = &bp->tx_cmds[bp->tx_empty];
+/*     XXDEBUG((KERN_DEBUG "bmac: tx dmastat=%x %x runt=%d pr=%x fs=%x fc=%x\n", */
 /*        ld_le32(&td->status), ld_le16(&cp->xfer_status), bp->tx_bad_runt, */
 /*        mb->pr, mb->xmtfs, mb->fifofc)); */
 
-                                                                                                                                                                                           /* turn off both tx and rx and reset the chip */
-                                                                                                                                                                                           config = bmread(dev, RXCFG);
-                                                                                                                                                                                           bmwrite(dev, RXCFG, (config & ~RxMACEnable));
-                                                                                                                                                                                           config = bmread(dev, TXCFG);
-                                                                                                                                                                                           bmwrite(dev, TXCFG, (config & ~TxMACEnable));
-                                                                                                                                                                                           out_le32(&td->control, DBDMA_CLEAR(RUN|PAUSE|FLUSH|WAKE|ACTIVE|DEAD));
-                                                                                                                                                                                           printk(KERN_ERR "bmac: transmit timeout - resetting\n");
-                                                                                                                                                                                           bmac_reset_chip(dev);
-
-                                                                                                                                                                                           /* restart rx dma */
-                                                                                                                                                                                           cp = bus_to_virt(ld_le32(&rd->cmdptr));
-                                                                                                                                                                                           out_le32(&rd->control, DBDMA_CLEAR(RUN|PAUSE|FLUSH|WAKE|ACTIVE|DEAD));
-                                                                                                                                                                                           out_le16(&cp->xfer_status, 0);
-                                                                                                                                                                                           out_le32(&rd->cmdptr, virt_to_bus(cp));
-                                                                                                                                                                                           out_le32(&rd->control, DBDMA_SET(RUN|WAKE));
-
-                                                                                                                                                                                           /* fix up the transmit side */
-                                                                                                                                                                                           XXDEBUG((KERN_DEBUG "bmac: tx empty=%d fill=%d fullup=%d\n",
-                                                                                                                                                                                                    bp->tx_empty, bp->tx_fill, bp->tx_fullup));
-                                                                                                                                                                                           i = bp->tx_empty;
-                                                                                                                                                                                           ++bp->stats.tx_errors;
-                                                                                                                                                                                           if (i != bp->tx_fill) {
-                                                                                                                                                                                                   dev_kfree_skb(bp->tx_bufs[i]);
-                                                                                                                                                                                                   bp->tx_bufs[i] = NULL;
-                                                                                                                                                                                                   if (++i >= N_TX_RING) i = 0;
-                                                                                                                                                                                                   bp->tx_empty = i;
-                                                                                                                                                                                           }
-                                                                                                                                                                                           bp->tx_fullup = 0;
-                                                                                                                                                                                           dev->tbusy = 0;
-                                                                                                                                                                                           mark_bh(NET_BH);
-                                                                                                                                                                                           XXDEBUG((KERN_DEBUG "bmac: clearing tbusy\n"));
-                                                                                                                                                                                           if (i != bp->tx_fill) {
-                                                                                                                                                                                                   cp = &bp->tx_cmds[i];
-                                                                                                                                                                                                   out_le16(&cp->xfer_status, 0);
-                                                                                                                                                                                                   out_le16(&cp->command, OUTPUT_LAST);
-                                                                                                                                                                                                   out_le32(&td->cmdptr, virt_to_bus(cp));
-                                                                                                                                                                                                   out_le32(&td->control, DBDMA_SET(RUN));
-/*     bmac_set_timeout(dev); */
-                                                                                                                                                                                                   XXDEBUG((KERN_DEBUG "bmac: starting %d\n", i));
-                                                                                                                                                                                           }
-
-                                                                                                                                                                                           /* turn it back on */
-                                                                                                                                                                                           oldConfig = bmread(dev, RXCFG);             
-                                                                                                                                                                                           bmwrite(dev, RXCFG, oldConfig | RxMACEnable ); 
-                                                                                                                                                                                           oldConfig = bmread(dev, TXCFG);             
-                                                                                                                                                                                           bmwrite(dev, TXCFG, oldConfig | TxMACEnable );  
-
-                                                                                                                                                                                           restore_flags(flags);
-                                                                                                                                                                                   }
+       /* turn off both tx and rx and reset the chip */
+       config = bmread(dev, RXCFG);
+       bmwrite(dev, RXCFG, (config & ~RxMACEnable));
+       config = bmread(dev, TXCFG);
+       bmwrite(dev, TXCFG, (config & ~TxMACEnable));
+       out_le32(&td->control, DBDMA_CLEAR(RUN|PAUSE|FLUSH|WAKE|ACTIVE|DEAD));
+       printk(KERN_ERR "bmac: transmit timeout - resetting\n");
+       bmac_reset_chip(dev);
+
+       /* restart rx dma */
+       cp = bus_to_virt(ld_le32(&rd->cmdptr));
+       out_le32(&rd->control, DBDMA_CLEAR(RUN|PAUSE|FLUSH|WAKE|ACTIVE|DEAD));
+       out_le16(&cp->xfer_status, 0);
+       out_le32(&rd->cmdptr, virt_to_bus(cp));
+       out_le32(&rd->control, DBDMA_SET(RUN|WAKE));
+
+       /* fix up the transmit side */
+       XXDEBUG((KERN_DEBUG "bmac: tx empty=%d fill=%d fullup=%d\n",
+                bp->tx_empty, bp->tx_fill, bp->tx_fullup));
+       i = bp->tx_empty;
+       ++bp->stats.tx_errors;
+       if (i != bp->tx_fill) {
+               dev_kfree_skb(bp->tx_bufs[i]);
+               bp->tx_bufs[i] = NULL;
+               if (++i >= N_TX_RING) i = 0;
+               bp->tx_empty = i;
+       }
+       bp->tx_fullup = 0;
+       dev->tbusy = 0;
+       mark_bh(NET_BH);
+       XXDEBUG((KERN_DEBUG "bmac: clearing tbusy\n"));
+       if (i != bp->tx_fill) {
+               cp = &bp->tx_cmds[i];
+               out_le16(&cp->xfer_status, 0);
+               out_le16(&cp->command, OUTPUT_LAST);
+               out_le32(&td->cmdptr, virt_to_bus(cp));
+               out_le32(&td->control, DBDMA_SET(RUN));
+               /*      bmac_set_timeout(dev); */
+               XXDEBUG((KERN_DEBUG "bmac: starting %d\n", i));
+       }
+
+       /* turn it back on */
+       oldConfig = bmread(dev, RXCFG);         
+       bmwrite(dev, RXCFG, oldConfig | RxMACEnable ); 
+       oldConfig = bmread(dev, TXCFG);         
+       bmwrite(dev, TXCFG, oldConfig | TxMACEnable );  
+
+       restore_flags(flags);
+}
 
 #if 0
-                                                                                                                                                                                           static void dump_dbdma(volatile struct dbdma_cmd *cp,int count)
-                                                                                                                                                                                           {
-                                                                                                                                                                                                   int i,*ip;
+static void dump_dbdma(volatile struct dbdma_cmd *cp,int count)
+{
+       int i,*ip;
        
-                                                                                                                                                                                                   for (i=0;i< count;i++)
-                                                                                                                                                                                                   {
-                                                                                                                                                                                                           ip = (int*)(cp+i);
+       for (i=0;i< count;i++) {
+               ip = (int*)(cp+i);
        
-                                                                                                                                                                                                           printk("dbdma req 0x%x addr 0x%x baddr 0x%x xfer/res 0x%x\n",
-                                                                                                                                                                                                                  ld_le32(ip+0),
-                                                                                                                                                                                                                  ld_le32(ip+1),
-                                                                                                                                                                                                                  ld_le32(ip+2),
-                                                                                                                                                                                                                  ld_le32(ip+3));
-                                                                                                                                                                                                   }
-
-                                                                                                                                                                                           }
+               printk("dbdma req 0x%x addr 0x%x baddr 0x%x xfer/res 0x%x\n",
+                      ld_le32(ip+0),
+                      ld_le32(ip+1),
+                      ld_le32(ip+2),
+                      ld_le32(ip+3));
+       }
+
+}
 #endif
 
-                                                                                                                                                                                                   static int
-                                                                                                                                                                                                   bmac_proc_info ( char *buffer, char **start, off_t offset, int length, int dummy)
-                                                                                                                                                                                                   {
-                                                                                                                                                                                                           int len = 0;
-                                                                                                                                                                                                           off_t pos   = 0;
-                                                                                                                                                                                                           off_t begin = 0;
-                                                                                                                                                                                                           int i;
+static int
+bmac_proc_info(char *buffer, char **start, off_t offset, int length, int dummy)
+{
+       int len = 0;
+       off_t pos   = 0;
+       off_t begin = 0;
+       int i;
 
-                                                                                                                                                                                                           if (bmac_devs == NULL) return (-ENOSYS);
+       if (bmac_devs == NULL) return (-ENOSYS);
 
-                                                                                                                                                                                                           len += sprintf(buffer, "BMAC counters & registers\n");
+       len += sprintf(buffer, "BMAC counters & registers\n");
 
-                                                                                                                                                                                                           for (i = 0; i<N_REG_ENTRIES; i++) {
-                                                                                                                                                                                                                   len += sprintf(buffer + len, "%s: %#08x\n",
-                                                                                                                                                                                                                                  reg_entries[i].name,
-                                                                                                                                                                                                                                  bmread(bmac_devs, reg_entries[i].reg_offset));
-                                                                                                                                                                                                                   pos = begin + len;
+       for (i = 0; i<N_REG_ENTRIES; i++) {
+               len += sprintf(buffer + len, "%s: %#08x\n",
+                              reg_entries[i].name,
+                              bmread(bmac_devs, reg_entries[i].reg_offset));
+               pos = begin + len;
     
-                                                                                                                                                                                                                   if (pos < offset) {
-                                                                                                                                                                                                                           len = 0;
-                                                                                                                                                                                                                           begin = pos;
-                                                                                                                                                                                                                   }
+               if (pos < offset) {
+                       len = 0;
+                       begin = pos;
+               }
     
-                                                                                                                                                                                                                   if (pos > offset+length) break;
-                                                                                                                                                                                                           }
+               if (pos > offset+length) break;
+       }
   
-                                                                                                                                                                                                           *start = buffer + (offset - begin);
-                                                                                                                                                                                                           len -= (offset - begin);
+       *start = buffer + (offset - begin);
+       len -= (offset - begin);
   
-                                                                                                                                                                                                           if (len > length) len = length;
+       if (len > length) len = length;
   
-                                                                                                                                                                                                           return len;
-                                                                                                                                                                                                   }
+       return len;
+}
index 1c7f3b9..392354f 100644 (file)
@@ -91,85+91,90 @@ bitrev(int b)
 int
 mace_probe(struct device *dev)
 {
-    int j, rev;
-    struct mace_data *mp;
-    struct device_node *maces;
-    unsigned char *addr;
-
-    maces = find_devices("mace");
-    if (maces == 0)
-       return ENODEV;
-
-    do {
-       if (maces->n_addrs != 3 || maces->n_intrs != 3) {
-           printk(KERN_ERR "can't use MACE %s: expect 3 addrs and 3 intrs\n",
-                  maces->full_name);
-           continue;
+       int j, rev;
+       struct mace_data *mp;
+       struct device_node *mace;
+       unsigned char *addr;
+       static int maces_found = 0;
+       static struct device_node *next_mace;
+
+       if (!maces_found) {
+               next_mace = find_devices("mace");
+               maces_found = 1;
+       }
+       mace = next_mace;
+       if (mace == 0)
+               return -ENODEV;
+       next_mace = mace->next;
+
+       if (mace->n_addrs != 3 || mace->n_intrs != 3) {
+               printk(KERN_ERR "can't use MACE %s: expect 3 addrs and 3 intrs\n",
+                      mace->full_name);
+               return -ENODEV;
        }
 
        if (dev == NULL)
-           dev = init_etherdev(0, PRIV_BYTES);
+               dev = init_etherdev(0, PRIV_BYTES);
        else {
-           /* XXX this doesn't look right (but it's never used :-) */
-           dev->priv = kmalloc(PRIV_BYTES, GFP_KERNEL);
-           if (dev->priv == 0)
-               return -ENOMEM;
+               dev->priv = kmalloc(PRIV_BYTES, GFP_KERNEL);
+               if (dev->priv == 0)
+                       return -ENOMEM;
        }
 
        mp = (struct mace_data *) dev->priv;
-       dev->base_addr = maces->addrs[0].address;
+       dev->base_addr = mace->addrs[0].address;
        mp->mace = (volatile struct mace *)
-               ioremap(maces->addrs[0].address, 0x1000);
-       dev->irq = maces->intrs[0].line;
+               ioremap(mace->addrs[0].address, 0x1000);
+       dev->irq = mace->intrs[0].line;
 
        if (request_irq(dev->irq, mace_interrupt, 0, "MACE", dev)) {
-           printk(KERN_ERR "MACE: can't get irq %d\n", dev->irq);
-           return -EAGAIN;
+               printk(KERN_ERR "MACE: can't get irq %d\n", dev->irq);
+               return -EAGAIN;
        }
-       if (request_irq(maces->intrs[1].line, mace_txdma_intr, 0, "MACE-txdma",
+       if (request_irq(mace->intrs[1].line, mace_txdma_intr, 0, "MACE-txdma",
                        dev)) {
-           printk(KERN_ERR "MACE: can't get irq %d\n", maces->intrs[1].line);
-           return -EAGAIN;
+               printk(KERN_ERR "MACE: can't get irq %d\n", mace->intrs[1].line);
+               return -EAGAIN;
        }
-       if (request_irq(maces->intrs[2].line, mace_rxdma_intr, 0, "MACE-rxdma",
+       if (request_irq(mace->intrs[2].line, mace_rxdma_intr, 0, "MACE-rxdma",
                        dev)) {
-           printk(KERN_ERR "MACE: can't get irq %d\n", maces->intrs[2].line);
-           return -EAGAIN;
+               printk(KERN_ERR "MACE: can't get irq %d\n", mace->intrs[2].line);
+               return -EAGAIN;
        }
 
-       addr = get_property(maces, "mac-address", NULL);
+       addr = get_property(mace, "mac-address", NULL);
        if (addr == NULL) {
-           addr = get_property(maces, "local-mac-address", NULL);
-           if (addr == NULL) {
-               printk(KERN_ERR "Can't get mac-address for MACE at %lx\n",
-                      dev->base_addr);
-               return -EAGAIN;
-           }
+               addr = get_property(mace, "local-mac-address", NULL);
+               if (addr == NULL) {
+                       printk(KERN_ERR "Can't get mac-address for MACE at %lx\n",
+                              dev->base_addr);
+                       return -EAGAIN;
+               }
        }
 
        printk(KERN_INFO "%s: MACE at", dev->name);
        rev = addr[0] == 0 && addr[1] == 0xA0;
        for (j = 0; j < 6; ++j) {
-           dev->dev_addr[j] = rev? bitrev(addr[j]): addr[j];
-           printk("%c%.2x", (j? ':': ' '), dev->dev_addr[j]);
+               dev->dev_addr[j] = rev? bitrev(addr[j]): addr[j];
+               printk("%c%.2x", (j? ':': ' '), dev->dev_addr[j]);
        }
        printk("\n");
 
        mp = (struct mace_data *) dev->priv;
        mp->maccc = ENXMT | ENRCV;
        mp->tx_dma = (volatile struct dbdma_regs *)
-               ioremap(maces->addrs[1].address, 0x1000);
-       mp->tx_dma_intr = maces->intrs[1].line;
+               ioremap(mace->addrs[1].address, 0x1000);
+       mp->tx_dma_intr = mace->intrs[1].line;
        mp->rx_dma = (volatile struct dbdma_regs *)
-               ioremap(maces->addrs[2].address, 0x1000);
-       mp->rx_dma_intr = maces->intrs[2].line;
+               ioremap(mace->addrs[2].address, 0x1000);
+       mp->rx_dma_intr = mace->intrs[2].line;
 
        mp->tx_cmds = (volatile struct dbdma_cmd *) DBDMA_ALIGN(mp + 1);
        mp->rx_cmds = mp->tx_cmds + NCMDS_TX * N_TX_RING + 1;
 
        memset(&mp->stats, 0, sizeof(mp->stats));
        memset((char *) mp->tx_cmds, 0,
-             (NCMDS_TX*N_TX_RING + N_RX_RING + 2) * sizeof(struct dbdma_cmd));
+              (NCMDS_TX*N_TX_RING + N_RX_RING + 2) * sizeof(struct dbdma_cmd));
        init_timer(&mp->tx_timeout);
        mp->timeout_active = 0;
 
@@ -182,9+187,7 @@ mace_probe(struct device *dev)
 
        ether_setup(dev);
 
-    } while ((maces = maces->next) != 0);
-
-    return 0;
+       return 0;
 }
 
 static void mace_reset(struct device *dev)
index 6aea631..c27a185 100644 (file)
@@ -244,8+244,8 @@ plip_init_dev(struct device *dev, struct parport *pb))
                                         plip_wakeup, plip_interrupt, 
                                         PARPORT_DEV_LURK, dev);
 
-       printk(version);
-       printk("%s: Parallel port at %#3lx, using IRQ %d\n", dev->name,
+       printk(KERN_INFO "%s", version);
+       printk(KERN_INFO "%s: Parallel port at %#3lx, using IRQ %d\n", dev->name,
               dev->base_addr, dev->irq);
 
        /* Fill in the generic fields of the device structure. */
@@ -537,7+537,7 @@ plip_receive_packet(struct device *dev, struct net_local *nl,
                /* Malloc up new buffer. */
                rcv->skb = dev_alloc_skb(rcv->length.h);
                if (rcv->skb == NULL) {
-                       printk(KERN_WARNING "%s: Memory squeeze.\n", dev->name);
+                       printk(KERN_ERR "%s: Memory squeeze.\n", dev->name);
                        return ERROR;
                }
                skb_put(rcv->skb,rcv->length.h);
@@ -662,7+662,7 @@ plip_send_packet(struct device *dev, struct net_local *nl,
        unsigned int cx;
 
        if (snd->skb == NULL || (lbuf = snd->skb->data) == NULL) {
-               printk(KERN_ERR "%s: send skb lost\n", dev->name);
+               printk(KERN_DEBUG "%s: send skb lost\n", dev->name);
                snd->state = PLIP_PK_DONE;
                snd->skb = NULL;
                return ERROR;
@@ -817,7+817,7 @@ plip_interrupt(int irq, void *dev_id, struct pt_regs * regs)
        unsigned char c0;
 
        if (dev == NULL) {
-               printk(KERN_ERR "plip_interrupt: irq %d for unknown device.\n", irq);
+               printk(KERN_DEBUG "plip_interrupt: irq %d for unknown device.\n", irq);
                return;
        }
 
@@ -861,7+861,7 @@ plip_interrupt(int irq, void *dev_id, struct pt_regs * regs)
 
        case PLIP_CN_ERROR:
                spin_unlock_irq(&nl->lock);
-               printk(KERN_WARNING "%s: receive interrupt in error state\n", dev->name);
+               printk(KERN_ERR "%s: receive interrupt in error state\n", dev->name);
                break;
        }
 }
@@ -1083,7+1083,8 @@ plip_get_stats(struct device *dev)
        return r;
 }
 
-static int plip_config(struct device *dev, struct ifmap *map)
+static int
+plip_config(struct device *dev, struct ifmap *map)
 {
        struct net_local *nl = (struct net_local *) dev->priv;
        struct pardevice *pardev = nl->pardev;
@@ -1091,8+1092,8 @@ static int plip_config(struct device *dev, struct ifmap *map)
        if (dev->flags & IFF_UP)
                return -EBUSY;
 
-       printk(KERN_INFO "plip: Warning, changing irq with ifconfig will be obsoleted.\n");
-       printk(KERN_INFO "plip: Next time, please set with /proc/parport/*/irq instead.\n");
+       printk(KERN_WARNING "plip: Warning, changing irq with ifconfig will be obsoleted.\n");
+       printk(KERN_WARNING "plip: Next time, please set with /proc/parport/*/irq instead.\n");
 
        if (map->irq != (unsigned char)-1) {
                pardev->port->irq = dev->irq = map->irq;
@@ -1177,7+1178,7 @@ void plip_setup(char *str, int *ints)
                        /* disable driver on "parport=" or "parport=0" */
                        parport[0] = -2;
                } else {
-                       printk(KERN_WARNING "warning: 'plip=0x%x' ignored\n", 
+                       printk(KERN_WARNING "warning: 'plip=0x%x' ignored\n", 
                               ints[1]);
                }
        }
index b0bfeb8..6901c71 100644 (file)
@@ -1,4+1,4 @@
-/* $Id: sunlance.c,v 1.79 1998/06/04 09:54:58 jj Exp $
+/* $Id: sunlance.c,v 1.81 1998/08/10 09:08:23 jj Exp $
  * lance.c: Linux/Sparc/Lance driver
  *
  *     Written 1995, 1996 by Miguel de Icaza
@@ -1135,9+1135,10 @@ __initfunc(int sparc_lance_probe (struct device *dev))
                return ENODEV;
        called++;
 
-       if (idprom->id_machtype == (SM_SUN4|SM_4_330)) {
+       if ((idprom->id_machtype == (SM_SUN4|SM_4_330)) ||
+           (idprom->id_machtype == (SM_SUN4|SM_4_470))) {
                memset (&sdev, 0, sizeof(sdev));
-               sdev.reg_addrs[0].phys_addr = SUN4_300_ETH_PHYSADDR;
+               sdev.reg_addrs[0].phys_addr = sun4_eth_physaddr;
                sdev.irqs[0] = 6;
                return sparc_lance_init(dev, &sdev, 0, 0);
        }
index 8c48f0f..80f1040 100644 (file)
@@ -1,4+1,19 @@
 
+UPDATE NEWS: version 1.33 - 26 Aug 98
+
+   Interrupt management in this driver has become, over
+   time, increasingly odd and difficult to explain - this
+   has been mostly due to my own mental inadequacies. In
+   recent kernels, it has failed to function at all when
+   compiled for SMP. I've fixed that problem, and after
+   taking a fresh look at interrupts in general, greatly
+   reduced the number of places where they're fiddled
+   with. Done some heavy testing and it looks very good.
+   The driver now makes use of the __initfunc() and
+   __initdata macros to save about 4k of kernel memory.
+   Once again, the same code works for both 2.0.xx and
+   2.1.xx kernels.
+
 UPDATE NEWS: version 1.32 - 28 Mar 98
 
    Removed the check for legal IN2000 hardware versions:
index adc7ccf..04ecc15 100644 (file)
 
 #include <linux/blk.h>
 #include <linux/stat.h>
-#include <asm/spinlock.h>
 
 #include "scsi.h"
 #include "sd.h"
 #include "hosts.h"
 
-
-#define IN2000_VERSION    "1.32"
-#define IN2000_DATE       "28/March/1998"
-
-/*
- * Note - the following defines have been moved to 'in2000.h':
- *
- *    PROC_INTERFACE
- *    PROC_STATISTICS
- *    SYNC_DEBUG
- *    DEBUGGING_ON
- *    DEBUG_DEFAULTS
- *    FAST_READ_IO
- *    FAST_WRITE_IO
- *
- */
-
+#define IN2000_VERSION    "1.33"
+#define IN2000_DATE       "26/August/1998"
 
 #include "in2000.h"
 
 
-
 /*
  * 'setup_strings' is a single string used to pass operating parameters and
  * settings from the kernel/module command-line to the driver. 'setup_args[]'
@@ -404,6+387,10 @@ DB(DB_QUEUE_COMMAND,printk("Q-%d-%02x-%ld(",cmd->target,cmd->cmnd[0],cmd->pid))
 
    cmd->SCp.Status = ILLEGAL_STATUS_BYTE;
 
+/* We need to disable interrupts before messing with the input
+ * queue and calling in2000_execute().
+ */
+
    save_flags(flags);
    cli();
 
@@ -443,20+430,19 @@ DB(DB_QUEUE_COMMAND,printk(")Q-%ld ",cmd->pid))
  * already connected, we give up immediately. Otherwise, look through
  * the input_Q, using the first command we find that's intended
  * for a currently non-busy target/lun.
+ * Note that this function is always called with interrupts already
+ * disabled (either from in2000_queuecommand() or in2000_intr()).
  */
 static void in2000_execute (struct Scsi_Host *instance)
 {
 struct IN2000_hostdata *hostdata;
 Scsi_Cmnd *cmd, *prev;
-unsigned long flags;
 int i;
 unsigned short *sp;
 unsigned short f;
 unsigned short flushbuf[16];
 
 
-   save_flags(flags);
-   cli();
    hostdata = (struct IN2000_hostdata *)instance->hostdata;
 
 DB(DB_EXECUTE,printk("EX("))
@@ -465,7+451,6 @@ DB(DB_EXECUTE,printk("EX("))
 
 DB(DB_EXECUTE,printk(")EX-0 "))
 
-      restore_flags(flags);
       return;
       }
 
@@ -489,7+474,6 @@ DB(DB_EXECUTE,printk(")EX-0 "))
 
 DB(DB_EXECUTE,printk(")EX-1 "))
 
-      restore_flags(flags);
       return;
       }
 
@@ -717,7+701,6 @@ no:
       
 DB(DB_EXECUTE,printk("%s%ld)EX-2 ",(cmd->SCp.phase)?"d:":"",cmd->pid))
 
-   restore_flags(flags);
 }
 
 
@@ -842,15+825,10 @@ int i;
 }
 
 
-/* It appears that the Linux interrupt dispatcher calls this
- * function in a non-reentrant fashion. What that means to us
- * is that we can use an SA_INTERRUPT type of interrupt (which
- * is faster), and do an sti() right away to let timer, serial,
- * etc. ints happen.
- *
- * WHOA! Wait a minute, pardner! Does this hold when more than
- * one card has been detected?? I doubt it. Maybe better
- * re-think the multiple card capability....
+/* We need to use spin_lock_irqsave() & spin_unlock_irqrestore() in this
+ * function in order to work in an SMP environment. (I'd be surprised
+ * if the driver is ever used by anyone on a real multi-CPU motherboard,
+ * but it _does_ need to be able to compile and run in an SMP kernel.)
  */
 
 static void in2000_intr (int irqnum, void * dev_id, struct pt_regs *ptregs)
@@ -863,6+841,7 @@ int i,j;
 unsigned long length;
 unsigned short *sp;
 unsigned short f;
+unsigned long flags;
 
    for (instance = instance_list; instance; instance = instance->next) {
       if (instance->irq == irqnum)
@@ -874,6+853,10 @@ unsigned short f;
       }
    hostdata = (struct IN2000_hostdata *)instance->hostdata;
 
+/* Get the spin_lock and disable further ints, for SMP */
+
+   CLISPIN_LOCK(flags);
+
 #ifdef PROC_STATISTICS
    hostdata->int_cnt++;
 #endif
@@ -1008,6+991,9 @@ DB(DB_FIFO,printk("{W:%02x} ",read1_io(IO_FIFO_COUNT)))
             }
 
       write1_io(0, IO_LED_OFF);
+
+/* release the SMP spin_lock and restore irq state */
+      CLISPIN_UNLOCK(flags);
       return;
       }
 
@@ -1023,6+1009,9 @@ DB(DB_FIFO,printk("{W:%02x} ",read1_io(IO_FIFO_COUNT)))
    if (!cmd && (sr != CSR_RESEL_AM && sr != CSR_TIMEOUT && sr != CSR_SELECT)) {
       printk("\nNR:wd-intr-1\n");
       write1_io(0, IO_LED_OFF);
+
+/* release the SMP spin_lock and restore irq state */
+      CLISPIN_UNLOCK(flags);
       return;
       }
 
@@ -1084,13+1073,10 @@ DB(DB_TRANSFER,printk("(%p,%d)",cmd->SCp.ptr,cmd->SCp.this_residual))
 /* Respond to the specific WD3393 interrupt - there are quite a few! */
 
    switch (sr) {
-      unsigned long flags;
 
       case CSR_TIMEOUT:
 DB(DB_INTR,printk("TIMEOUT"))
 
-        save_flags(flags);
-         cli();
          if (hostdata->state == S_RUNNING_LEVEL2)
             hostdata->connected = NULL;
          else {
@@ -1108,7+1094,6 @@ CHECK_NULL(cmd,"csr_timeout")
  * are commands waiting to be executed.
  */
 
-         restore_flags(flags);
          in2000_execute(instance);
          break;
 
@@ -1116,7+1101,6 @@ CHECK_NULL(cmd,"csr_timeout")
 /* Note: this interrupt should not occur in a LEVEL2 command */
 
       case CSR_SELECT:
-         cli();
 DB(DB_INTR,printk("SELECT"))
          hostdata->connected = cmd = (Scsi_Cmnd *)hostdata->selecting;
 CHECK_NULL(cmd,"csr_select")
@@ -1206,7+1190,6 @@ DB(DB_INTR,printk("%02x",cmd->SCp.Status))
       case CSR_SRV_REQ  |PHS_MESS_IN:
 DB(DB_INTR,printk("MSG_IN="))
 
-         cli();
          msg = read_1_byte(hostdata);
          sr = read_3393(hostdata,WD_SCSI_STATUS);  /* clear interrupt */
 
@@ -1355,8+1338,6 @@ printk("sync_xfer=%02x",hostdata->sync_xfer[cmd->target]);
 /* Note: this interrupt will occur only after a LEVEL2 command */
 
       case CSR_SEL_XFER_DONE:
-         save_flags(flags);
-         cli();
 
 /* Make sure that reselection is enabled at this point - it may
  * have been turned off for the command that just completed.
@@ -1383,7+1364,6 @@ DB(DB_INTR,printk(":%d.%d",cmd->SCp.Status,lun))
  * there are commands waiting to be executed.
  */
 
-            restore_flags(flags);
             in2000_execute(instance);
             }
          else {
@@ -1442,8+1422,6 @@ DB(DB_INTR,printk("%02x",hostdata->outgoing_msg[0]))
  * so we treat it as a normal command-complete-disconnect.
  */
 
-        save_flags(flags);
-         cli();
 
 /* Make sure that reselection is enabled at this point - it may
  * have been turned off for the command that just completed.
@@ -1453,6+1431,9 @@ DB(DB_INTR,printk("%02x",hostdata->outgoing_msg[0]))
          if (cmd == NULL) {
             printk(" - Already disconnected! ");
             hostdata->state = S_UNCONNECTED;
+
+/* release the SMP spin_lock and restore irq state */
+            CLISPIN_UNLOCK(flags);
             return;
             }
 DB(DB_INTR,printk("UNEXP_DISC-%ld",cmd->pid))
@@ -1469,14+1450,11 @@ DB(DB_INTR,printk("UNEXP_DISC-%ld",cmd->pid))
  * there are commands waiting to be executed.
  */
 
-         restore_flags(flags);
          in2000_execute(instance);
          break;
 
 
       case CSR_DISC:
-         save_flags(flags);
-         cli();
 
 /* Make sure that reselection is enabled at this point - it may
  * have been turned off for the command that just completed.
@@ -1521,7+1499,6 @@ DB(DB_INTR,printk(":%d",cmd->SCp.Status))
  * there are commands waiting to be executed.
  */
 
-         restore_flags(flags);
          in2000_execute(instance);
          break;
 
@@ -1529,8+1506,6 @@ DB(DB_INTR,printk(":%d",cmd->SCp.Status))
       case CSR_RESEL_AM:
 DB(DB_INTR,printk("RESEL"))
 
-         cli();
-
    /* First we have to make sure this reselection didn't */
    /* happen during Arbitration/Selection of some other device. */
    /* If yes, put losing command back on top of input_Q. */
@@ -1633,16+1608,12 @@ DB(DB_INTR,printk("-%ld",cmd->pid))
 
 DB(DB_INTR,printk("} "))
 
+/* release the SMP spin_lock and restore irq state */
+   CLISPIN_UNLOCK(flags);
+
 }
 
-static void do_in2000_intr(int irq, void *dev_id, struct pt_regs *regs)
-{
-   unsigned long flags;
 
-   spin_lock_irqsave(&io_request_lock, flags);
-   in2000_intr(irq, dev_id, regs);
-   spin_unlock_irqrestore(&io_request_lock, flags);
-}
 
 #define RESET_CARD         0
 #define RESET_CARD_AND_BUS 1
@@ -1827,7+1798,6 @@ unsigned long timeout;
       cmd->result = DID_ABORT << 16;
       cmd->scsi_done(cmd);
 
-/*      sti();*/
       in2000_execute (instance);
 
       restore_flags(flags);
@@ -1858,7+1828,6 @@ unsigned long timeout;
  * broke.
  */
 
-/*   sti();*/
    in2000_execute (instance);
 
    restore_flags(flags);
@@ -1876,7+1845,7 @@ static char setup_buffer[SETUP_BUFFER_SIZE];
 static char setup_used[MAX_SETUP_ARGS];
 static int done_setup = 0;
 
-void in2000_setup (char *str, int *ints)
+in2000__INITFUNC( void in2000_setup (char *str, int *ints) )
 {
 int i;
 char *p1,*p2;
@@ -1908,7+1877,7 @@ char *p1,*p2;
 /* check_setup_args() returns index if key found, 0 if not
  */
 
-static int check_setup_args(char *key, int *flags, int *val, char *buf)
+in2000__INITFUNC( static int check_setup_args(char *key, int *flags, int *val, char *buf) )
 {
 int x;
 char *cp;
@@ -1940,21+1909,21 @@ char *cp;
  * special macros declared in 'asm/io.h'. We use readb() and readl()
  * when reading from the card's BIOS area in in2000_detect().
  */
-static const unsigned int *bios_tab[] = {
+static const unsigned int *bios_tab[] in2000__INITDATA = {
    (unsigned int *)0xc8000,
    (unsigned int *)0xd0000,
    (unsigned int *)0xd8000,
    0
    };
 
-static const unsigned short base_tab[] = {
+static const unsigned short base_tab[] in2000__INITDATA = {
    0x220,
    0x200,
    0x110,
    0x100,
    };
 
-static const int int_tab[] = {
+static const int int_tab[] in2000__INITDATA = {
    15,
    14,
    11,
@@ -1962,7+1931,7 @@ static const int int_tab[] = {
    };
 
 
-int in2000_detect(Scsi_Host_Template * tpnt)
+in2000__INITFUNC( int in2000_detect(Scsi_Host_Template * tpnt) )
 {
 struct Scsi_Host *instance;
 struct IN2000_hostdata *hostdata;
@@ -2068,7+2037,7 @@ char buf[32];
       write1_io(0,IO_FIFO_READ);             /* start fifo out in read mode */
       write1_io(0,IO_INTR_MASK);    /* allow all ints */
       x = int_tab[(switches & (SW_INT0 | SW_INT1)) >> SW_INT_SHIFT];
-      if (request_irq(x, do_in2000_intr, SA_INTERRUPT, "in2000", NULL)) {
+      if (request_irq(x, in2000_intr, SA_INTERRUPT, "in2000", NULL)) {
          printk("in2000_detect: Unable to allocate IRQ.\n");
          detect_count--;
          continue;
index 9e011a7..2a6ad29 100644 (file)
@@ -2,7+2,7 @@
  *    in2000.h -  Linux device driver definitions for the
  *                Always IN2000 ISA SCSI card.
  *
- *    IMPORTANT: This file is for version 1.32 - 28/Mar/1998
+ *    IMPORTANT: This file is for version 1.33 - 26/Aug/1998
  *
  * Copyright (c) 1996 John Shifflett, GeoLog Consulting
  *    john@geolog.com
@@ -377,10+377,29 @@ struct IN2000_hostdata {
 #define PR_STOP      1<<7
 
 
-int in2000_detect(Scsi_Host_Template *);
+#include <linux/version.h>
+
+#if LINUX_VERSION_CODE < 0x020100   /* 2.0.xx */
+# define in2000__INITFUNC(function) function
+# define in2000__INIT
+# define in2000__INITDATA
+# define CLISPIN_LOCK(flags)   do { save_flags(flags); cli(); } while(0)
+# define CLISPIN_UNLOCK(flags) restore_flags(flags)
+#else                               /* 2.1.xxx */
+# include <linux/init.h>
+# include <asm/spinlock.h>
+# define in2000__INITFUNC(function) __initfunc(function)
+# define in2000__INIT __init
+# define in2000__INITDATA __initdata
+# define CLISPIN_LOCK(flags)   spin_lock_irqsave(&io_request_lock, flags)
+# define CLISPIN_UNLOCK(flags) spin_unlock_irqrestore(&io_request_lock, flags)
+#endif
+
+
+int in2000_detect(Scsi_Host_Template *) in2000__INIT;
 int in2000_queuecommand(Scsi_Cmnd *, void (*done)(Scsi_Cmnd *));
 int in2000_abort(Scsi_Cmnd *);
-void in2000_setup(char *, int *);
+void in2000_setup(char *, int *) in2000__INIT;
 int in2000_proc_info(char *, char **, off_t, int, int, int);
 struct proc_dir_entry proc_scsi_in2000;
 int in2000_biosparam(struct scsi_disk *, kdev_t, int *);
@@ -392,6+411,33 @@ int in2000_reset(Scsi_Cmnd *, unsigned int);
 #define IN2000_CPL      2
 #define IN2000_HOST_ID  7
 
+#if LINUX_VERSION_CODE < 0x020100   /* 2.0.xx */
+
+#define IN2000 {  NULL,                /* link pointer for modules */ \
+                  NULL,                /* usage_count for modules */ \
+                  &proc_scsi_in2000,   /* pointer to /proc/scsi directory entry */ \
+                  in2000_proc_info,    /* pointer to proc info function */ \
+                  "Always IN2000",     /* device name */ \
+                  in2000_detect,       /* returns number of in2000's found */ \
+                  NULL,                /* optional unload function for modules */ \
+                  NULL,                /* optional misc info function */ \
+                  NULL,                /* send scsi command, wait for completion */ \
+                  in2000_queuecommand, /* queue scsi command, don't wait */ \
+                  in2000_abort,        /* abort current command */ \
+                  in2000_reset,        /* reset scsi bus */ \
+                  NULL,                /* slave_attach - unused */ \
+                  in2000_biosparam,    /* figures out BIOS parameters for lilo, etc */ \
+                  IN2000_CAN_Q,        /* max commands we can queue up */ \
+                  IN2000_HOST_ID,      /* host-adapter scsi id */ \
+                  IN2000_SG,           /* scatter-gather table size */ \
+                  IN2000_CPL,          /* commands per lun */ \
+                  0,                   /* board counter */ \
+                  0,                   /* unchecked dma */ \
+                  DISABLE_CLUSTERING \
+               }
+
+#else       /* 2.1.xxx */
+
 #define IN2000 {  proc_dir:        &proc_scsi_in2000,   /* pointer to /proc/scsi directory entry */ \
                   proc_info:       in2000_proc_info,    /* pointer to proc info function */ \
                   name:            "Always IN2000",     /* device name */ \
@@ -408,5+454,7 @@ int in2000_reset(Scsi_Cmnd *, unsigned int);
                   use_new_eh_code: 0                    /* new error code - not using it yet */ \
                 }
 
+#endif
+
 
 #endif /* IN2000_H */
index 8633f0d..c0366ec 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -569,6+569,15 @@ flush_failed:
        return retval;
 }
 
+/*
+ * We mustn't allow tracing of suid binaries, unless
+ * the tracer has the capability to trace anything..
+ */
+static inline int must_not_trace_exec(struct task_struct * p)
+{
+       return (p->flags & PF_PTRACED) && !cap_raised(p->p_pptr->cap_effective, CAP_SYS_PTRACE);
+}
+
 /* 
  * Fill the binprm structure from the inode. 
  * Check permissions, then read the first 512 bytes
@@ -657,15+666,12 @@ int prepare_binprm(struct linux_binprm *bprm)
                }
        }
 
-
-
-
        if (id_change || cap_raised) {
                /* We can't suid-execute if we're sharing parts of the executable */
                /* or if we're being traced (or if suid execs are not allowed)    */
                /* (current->mm->count > 1 is ok, as we'll get a new mm anyway)   */
                if (IS_NOSUID(inode)
-                   || (current->flags & PF_PTRACED)
+                   || must_not_trace_exec(current)
                    || (atomic_read(&current->fs->count) > 1)
                    || (atomic_read(&current->sig->count) > 1)
                    || (atomic_read(&current->files->count) > 1)) {
index 87b6692..a5fdf3a 100644 (file)
@@ -114,6+114,7 @@ struct ifreq
                int     ifru_mtu;
                struct  ifmap ifru_map;
                char    ifru_slave[IFNAMSIZ];   /* Just fits the size */
+               char    ifru_newname[IFNAMSIZ];
                __kernel_caddr_t        ifru_data;
        } ifr_ifru;
 };
@@ -133,6+134,7 @@ struct ifreq
 #define ifr_ifindex    ifr_ifru.ifru_ivalue    /* interface index      */
 #define ifr_bandwidth  ifr_ifru.ifru_ivalue    /* link bandwidth       */
 #define ifr_qlen       ifr_ifru.ifru_ivalue    /* Queue length         */
+#define ifr_newname    ifr_ifru.ifru_newname   /* New name             */
 
 /*
  * Structure used in SIOCGIFCONF request.
index 3bdeca3..ad5655c 100644 (file)
@@ -25,9+25,10 @@ struct sockaddr_ll
 #define PACKET_BROADCAST       1               /* To all               */
 #define PACKET_MULTICAST       2               /* To group             */
 #define PACKET_OTHERHOST       3               /* To someone else      */
-#define PACKET_OUTGOING                4               /* Originated by us     */
-#define PACKET_LOOPBACK                5
-#define PACKET_FASTROUTE       6
+#define PACKET_OUTGOING                4               /* Outgoing of any type */
+/* These ones are invisible by user level */
+#define PACKET_LOOPBACK                5               /* MC/BRD frame looped back */
+#define PACKET_FASTROUTE       6               /* Fastrouted frame     */
 
 /* Packet socket options */
 
index 37f0e06..b639184 100644 (file)
@@ -35,8+35,15 @@ struct in6_addr
                __u16           u6_addr16[8];
                __u32           u6_addr32[4];
 #if (~0UL) > 0xffffffff
+#ifndef __RELAX_IN6_ADDR_ALIGNMENT
+               /* Alas, protocols do not respect 64bit alignment.
+                  rsvp/pim/... are broken. However, it is good
+                  idea to force correct alignment always, when
+                  it is possible.
+                */
                __u64           u6_addr64[2];
 #endif
+#endif
        } in6_u;
 #define s6_addr                        in6_u.u6_addr8
 #define s6_addr16              in6_u.u6_addr16
@@ -101,19+108,34 @@ struct ipv6_mreq {
 #define IPPROTO_DSTOPTS                60      /* IPv6 destination options     */
 
 /*
+ *     IPv6 TLV options.
+ */
+#define IPV6_TLV_PAD0          0
+#define IPV6_TLV_PADN          1
+#define IPV6_TLV_ROUTERALERT   20
+#define IPV6_TLV_JUMBO         194
+
+/*
  *     IPV6 socket options
  */
 
 #define IPV6_ADDRFORM          1
 #define IPV6_PKTINFO           2
-#define IPV6_RXHOPOPTS         3 /* obsolete name */
-#define IPV6_RXDSTOPTS         4 /* obsolete name */
-#define IPV6_HOPOPTS           IPV6_RXHOPOPTS  /* new name */
-#define IPV6_DSTOPTS           IPV6_RXDSTOPTS  /* new name */
-#define IPV6_RXSRCRT           5
+#define IPV6_HOPOPTS           3
+#define IPV6_DSTOPTS           4
+#define IPV6_RTHDR             5
 #define IPV6_PKTOPTIONS                6
 #define IPV6_CHECKSUM          7
 #define IPV6_HOPLIMIT          8
+#define IPV6_NEXTHOP           9
+#define IPV6_AUTHHDR           10
+
+#if 0
+/* Aliases for obsolete names */
+#define IPV6_RXHOPOPTS         IPV6_HOPOPTS
+#define IPV6_RXDSTOPTS         IPV6_DSTOPTS
+#define IPV6_RXSRCRT           IPV6_RTHDR
+#endif
 
 /*
  *     Alternative names
index 3913524..84564ba 100644 (file)
@@ -4,6+4,9 @@
 #include <linux/in6.h>
 #include <asm/byteorder.h>
 
+/* The latest drafts declared increase in minimal mtu up to 1280. */
+
+#define IPV6_MIN_MTU   1280
 
 /*
  *     Advanced API
@@ -58,8+61,6 @@ struct ipv6_opt_hdr {
 #define ipv6_optlen(p)  (((p)->hdrlen+1) << 3)
 #endif
 
-
-
 /*
  *     routing header type 0 (used in cmsghdr struct)
  */
@@ -72,10+73,11 @@ struct rt0_hdr {
 #define rt0_type               rt_hdr.type;
 };
 
-#ifdef __KERNEL__
-
 /*
  *     IPv6 fixed header
+ *
+ *     BEWARE, it is incorrect. The first 4 bits of flow_lbl
+ *     are glued to priority now, forming "class".
  */
 
 struct ipv6hdr {
@@ -87,7+89,7 @@ struct ipv6hdr {
                                priority:4;
 #else
 #error "Please fix <asm/byteorder.h>"
-#endif                                         
+#endif
        __u8                    flow_lbl[3];
 
        __u16                   payload_len;
@@ -98,28+100,24 @@ struct ipv6hdr {
        struct  in6_addr        daddr;
 };
 
-/*
- *     The length of this struct cannot be greater than the length of
- *     the proto_priv field in a sk_buff which is currently
- *     defined to be 16 bytes.
- *     Pointers take upto 8 bytes (sizeof(void *) is 8 on the alpha).
- */
-struct ipv6_options 
-{
-       /* length of extension headers   */
-
-       __u16                   opt_flen;       /* after fragment hdr */
-       __u16                   opt_nflen;      /* before fragment hdr */
+#ifdef __KERNEL__
 
-       /* 
-        * protocol options 
-        * usually carried in IPv6 extension headers
-        */
+/* 
+   This structure contains results of exthdrs parsing
+   as offsets from skb->nh.
+ */
 
-       struct ipv6_rt_hdr              *srcrt; /* Routing Header */
+struct inet6_skb_parm
+{
+       int                     iif;
+       __u16                   ra;
+       __u16                   hop;
+       __u16                   auth;
+       __u16                   dst0;
+       __u16                   srcrt;
+       __u16                   dst1;
 };
 
-
 #endif
 
 #endif
index 7fc11c7..a4861d0 100644 (file)
@@ -26,7+26,6 @@ enum
 #define RTF_ALLONLINK  0x00020000      /* fallback, no routers on link */
 #define RTF_ADDRCONF   0x00040000      /* addrconf route - RA          */
 
-#define RTF_LINKRT     0x00100000      /* link specific - device match */
 #define RTF_NONEXTHOP  0x00200000      /* route with no nexthop        */
 #define RTF_EXPIRES    0x00400000
 
index c42e43d..41f6405 100644 (file)
@@ -232,8+232,7 @@ struct device
        
        unsigned short          flags;  /* interface flags (a la BSD)   */
        unsigned short          gflags;
-       unsigned short          metric; /* routing metric (not used)    */
-       unsigned short          mtu;    /* interface MTU value          */
+       unsigned                mtu;    /* interface MTU value          */
        unsigned short          type;   /* interface hardware type      */
        unsigned short          hard_header_len;        /* hardware hdr length  */
        void                    *priv;  /* pointer to private data      */
index bd4f9dd..59075b0 100644 (file)
@@ -16,8+16,8 @@ struct sockaddr_nl
 {
        sa_family_t     nl_family;      /* AF_NETLINK   */
        unsigned short  nl_pad;         /* zero         */
-       __kernel_pid_t  nl_pid;         /* process pid  */
-       unsigned        nl_groups;      /* multicast groups mask */
+       __u32           nl_pid;         /* process pid  */
+               __u32           nl_groups;      /* multicast groups mask */
 };
 
 struct nlmsghdr
@@ -26,7+26,7 @@ struct nlmsghdr
        __u16           nlmsg_type;     /* Message content */
        __u16           nlmsg_flags;    /* Additional flags */
        __u32           nlmsg_seq;      /* Sequence number */
-       __kernel_pid_t  nlmsg_pid;      /* Sending process PID */
+       __u32           nlmsg_pid;      /* Sending process PID */
 };
 
 /* Flags values */
@@ -64,7+64,7 @@ struct nlmsghdr
 #define NLMSG_DATA(nlh)  ((void*)(((char*)nlh) + NLMSG_LENGTH(0)))
 #define NLMSG_NEXT(nlh,len)     ((len) -= NLMSG_ALIGN((nlh)->nlmsg_len), \
                                  (struct nlmsghdr*)(((char*)(nlh)) + NLMSG_ALIGN((nlh)->nlmsg_len)))
-#define NLMSG_OK(nlh,len) ((nlh)->nlmsg_len >= sizeof(struct nlmsghdr) && \
+#define NLMSG_OK(nlh,len) ((len) > 0 && (nlh)->nlmsg_len >= sizeof(struct nlmsghdr) && \
                           (nlh)->nlmsg_len <= (len))
 #define NLMSG_PAYLOAD(nlh,len) ((nlh)->nlmsg_len - NLMSG_SPACE((len)))
 
@@ -86,10+86,11 @@ struct nlmsgerr
 struct netlink_skb_parms
 {
        struct ucred            creds;          /* Skb credentials      */
-       pid_t                   pid;
-       unsigned                groups;
-       pid_t                   dst_pid;
-       unsigned                dst_groups;
+       __u32                   pid;
+       __u32                   groups;
+       __u32                   dst_pid;
+       __u32                   dst_groups;
+       kernel_cap_t            eff_cap;
 };
 
 #define NETLINK_CB(skb)                (*(struct netlink_skb_parms*)&((skb)->cb))
@@ -102,10+103,10 @@ extern int netlink_post(int unit, struct sk_buff *skb);
 extern int init_netlink(void);
 extern struct sock *netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len));
 extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err);
-extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, pid_t pid, int nonblock);
-extern void netlink_broadcast(struct sock *ssk, struct sk_buff *skb, pid_t pid,
-                             unsigned group, int allocation);
-extern void netlink_set_err(struct sock *ssk, pid_t pid, unsigned group, int code);
+extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 pid, int nonblock);
+extern void netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 pid,
+                             __u32 group, int allocation);
+extern void netlink_set_err(struct sock *ssk, __u32 pid, __u32 group, int code);
 
 /*
  *     skb should fit one page. This choice is good for headerless malloc.
@@ -125,28+126,8 @@ struct netlink_callback
        long            args[4];
 };
 
-#if 0
-
-void* nlmsg_broadcast(struct sock*, unsigned long type, int len, unsigned groups);
-struct skb_buff *nlmsg_alloc(unsigned long type, int len,
-                            unsigned long seq, unsigned long pid, int allocation);
-void __nlmsg_transmit(struct sock*, int allocation);
-
-extern __inline__ void nlmsg_release(struct sk_buff *skb)
-{
-       atomic_dec(skb->users);
-}
-
-extern __inline__ void nlmsg_transmit(struct sk_buff *sk, int allocation)
-{
-       if (sk->write_queue.qlen)
-               __nlmsg_transmit(sk, allocation);
-}
-
-#endif
-
 extern __inline__ struct nlmsghdr *
-__nlmsg_put(struct sk_buff *skb, pid_t pid, u32 seq, int type, int len)
+__nlmsg_put(struct sk_buff *skb, u32 pid, u32 seq, int type, int len)
 {
        struct nlmsghdr *nlh;
        int size = NLMSG_LENGTH(len);
index a984230..42facea 100644 (file)
@@ -98,6+98,7 @@ extern __inline__ int notifier_call_chain(struct notifier_block **n, unsigned lo
 #define NETDEV_CHANGEMTU       0x0007
 #define NETDEV_CHANGEADDR      0x0008
 #define NETDEV_GOING_DOWN      0x0009
+#define NETDEV_CHANGENAME      0x000A
 
 #define SYS_DOWN       0x0001  /* Notify of system down */
 #define SYS_RESTART    SYS_DOWN
index aefb1bf..1e912ae 100644 (file)
@@ -119,7+119,7 @@ enum net_directory_inos {
        PROC_NET_AX25_BPQETHER,
        PROC_NET_IP_MASQ_APP,
        PROC_NET_RT6,
-       PROC_NET_RT6_TREE,
+       PROC_NET_SNMP6,
        PROC_NET_RT6_STATS,
        PROC_NET_NDISC,
        PROC_NET_STRIP_STATUS,
index 4c8adc5..be60739 100644 (file)
@@ -5,7+5,6 @@
 #include <linux/netlink.h>
 
 #define RTNL_DEBUG 1
-/* #define CONFIG_RTNL_OLD_IFINFO 1 */
 
 
 /****
@@ -66,14+65,14 @@ struct rtattr
 
 #define RTA_ALIGNTO    4
 #define RTA_ALIGN(len) ( ((len)+RTA_ALIGNTO-1) & ~(RTA_ALIGNTO-1) )
-#define RTA_OK(rta,len) ((rta)->rta_len >= sizeof(struct rtattr) && \
+#define RTA_OK(rta,len) ((len) > 0 && (rta)->rta_len >= sizeof(struct rtattr) && \
                         (rta)->rta_len <= (len))
 #define RTA_NEXT(rta,attrlen)  ((attrlen) -= RTA_ALIGN((rta)->rta_len), \
                                 (struct rtattr*)(((char*)(rta)) + RTA_ALIGN((rta)->rta_len)))
 #define RTA_LENGTH(len)        (RTA_ALIGN(sizeof(struct rtattr)) + (len))
 #define RTA_SPACE(len) RTA_ALIGN(RTA_LENGTH(len))
 #define RTA_DATA(rta)   ((void*)(((char*)(rta)) + RTA_LENGTH(0)))
-#define RTA_PAYLOAD(rta) ((rta)->rta_len - RTA_LENGTH(0))
+#define RTA_PAYLOAD(rta) ((int)((rta)->rta_len) - RTA_LENGTH(0))
 
 
 
@@ -91,18+90,9 @@ struct rtmsg
 
        unsigned char           rtm_table;      /* Routing table id */
        unsigned char           rtm_protocol;   /* Routing protocol; see below  */
-#ifdef CONFIG_RTNL_OLD_IFINFO
-       unsigned char           rtm_nhs;        /* Number of nexthops */
-#else
        unsigned char           rtm_scope;      /* See below */ 
-#endif
        unsigned char           rtm_type;       /* See below    */
 
-#ifdef CONFIG_RTNL_OLD_IFINFO
-       unsigned short          rtm_optlen;     /* Byte length of rtm_opt */
-       unsigned char           rtm_scope;      /* See below */ 
-       unsigned char           rtm_whatsit;    /* Unused byte */
-#endif
        unsigned                rtm_flags;
 };
 
@@ -176,9+166,6 @@ enum rt_scope_t
 #define RTM_F_NOTIFY           0x100   /* Notify user of route change  */
 #define RTM_F_CLONED           0x200   /* This route is cloned         */
 #define RTM_F_EQUALIZE         0x400   /* Multipath equalizer: NI      */
-#ifdef CONFIG_RTNL_OLD_IFINFO
-#define RTM_F_NOPMTUDISC       0x800   /* Do not make PMTU discovery   */
-#endif
 
 /* Reserved table identifiers */
 
@@ -206,17+193,10 @@ enum rtattr_type_t
        RTA_GATEWAY,
        RTA_PRIORITY,
        RTA_PREFSRC,
-#ifndef CONFIG_RTNL_OLD_IFINFO
        RTA_METRICS,
        RTA_MULTIPATH,
        RTA_PROTOINFO,
        RTA_FLOW,
-#else
-       RTA_WINDOW,
-       RTA_RTT,
-       RTA_MTU,
-       RTA_IFNAME,
-#endif
        RTA_CACHEINFO
 };
 
@@ -253,18+233,12 @@ struct rtnexthop
 #define RTNH_ALIGNTO   4
 #define RTNH_ALIGN(len) ( ((len)+RTNH_ALIGNTO-1) & ~(RTNH_ALIGNTO-1) )
 #define RTNH_OK(rtnh,len) ((rtnh)->rtnh_len >= sizeof(struct rtnexthop) && \
-                          (rtnh)->rtnh_len <= (len))
+                          ((int)(rtnh)->rtnh_len) <= (len))
 #define RTNH_NEXT(rtnh)        ((struct rtnexthop*)(((char*)(rtnh)) + RTNH_ALIGN((rtnh)->rtnh_len)))
 #define RTNH_LENGTH(len) (RTNH_ALIGN(sizeof(struct rtnexthop)) + (len))
 #define RTNH_SPACE(len)        RTNH_ALIGN(RTNH_LENGTH(len))
 #define RTNH_DATA(rtnh)   ((struct rtattr*)(((char*)(rtnh)) + RTNH_LENGTH(0)))
 
-#ifdef CONFIG_RTNL_OLD_IFINFO
-#define RTM_RTNH(r) ((struct rtnexthop*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct rtmsg)) \
-                                          + NLMSG_ALIGN((r)->rtm_optlen)))
-#define RTM_NHLEN(nlh,r) ((nlh)->nlmsg_len - NLMSG_SPACE(sizeof(struct rtmsg)) - NLMSG_ALIGN((r)->rtm_optlen))
-#endif
-
 /* RTM_CACHEINFO */
 
 struct rta_cacheinfo
@@ -424,35+398,6 @@ struct rtgenmsg
  * on network protocol.
  */
 
-#ifdef CONFIG_RTNL_OLD_IFINFO
-struct ifinfomsg
-{
-       unsigned char   ifi_family;             /* Dummy        */
-       unsigned char   ifi_addrlen;            /* Length of HW address */
-       unsigned short  ifi_pad__;
-       int             ifi_index;              /* Link index   */
-       int             ifi_link;               /* Physical device */
-       char            ifi_name[IFNAMSIZ];
-       struct sockaddr ifi_address;            /* HW address   */
-       struct sockaddr ifi_broadcast;          /* HW broadcast */
-       unsigned        ifi_flags;              /* IFF_* flags  */
-       int             ifi_mtu;                /* Link mtu     */
-       char            ifi_qdiscname[IFNAMSIZ];/* Id of packet scheduler */
-       int             ifi_qdisc;              /* Packet scheduler handle */
-};
-
-enum
-{
-       IFLA_UNSPEC,
-       IFLA_ADDRESS,
-       IFLA_BROADCAST,
-       IFLA_IFNAME,
-       IFLA_QDISC,
-       IFLA_STATS
-};
-
-#else
-
 struct ifinfomsg
 {
        unsigned char   ifi_family;
@@ -475,8+420,6 @@ enum
        IFLA_STATS
 };
 
-#endif
-
 
 #define IFLA_MAX IFLA_STATS
 
@@ -588,7+531,7 @@ struct rtnetlink_link
 
 extern struct rtnetlink_link * rtnetlink_links[NPROTO];
 extern int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb);
-extern int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo);
+extern int rtnetlink_send(struct sk_buff *skb, u32 pid, u32 group, int echo);
 
 extern void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data);
 
index ee6c799..c242c09 100644 (file)
@@ -537,6+537,19 @@ extern __inline__ struct sk_buff *dev_alloc_skb(unsigned int length)
        return skb;
 }
 
+extern __inline__ struct sk_buff *
+skb_cow(struct sk_buff *skb, unsigned int headroom)  /* returns skb itself, or a replacement from skb_realloc_headroom() */
+{
+       headroom = (headroom+15)&~15;  /* round the request up to a multiple of 16 */
+
+       if ((unsigned)skb_headroom(skb) < headroom || skb_cloned(skb)) {  /* too little headroom, or skb_cloned() reports true */
+               struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom);
+               kfree_skb(skb);  /* the original is released on this path regardless of skb2 */
+               skb = skb2;  /* NOTE(review): presumably NULL if skb_realloc_headroom() fails -- callers should check; confirm */
+       }
+       return skb;
+}
+
 extern struct sk_buff *                skb_recv_datagram(struct sock *sk,unsigned flags,int noblock, int *err);
 extern unsigned int            datagram_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait);
 extern int                     skb_copy_datagram(struct sk_buff *from, int offset, char *to,int size);
index b29fd43..c386ae4 100644 (file)
@@ -88,20+88,27 @@ struct cmsghdr {
 
 /*
  *     Get the next cmsg header
+ *
+ *     PLEASE, do not touch this function. If you think, that it is
+ *     incorrect, grep kernel sources and think about consequences
+ *     before trying to improve it.
+ *
+ *     Now it always returns valid, not truncated ancillary object
+ *     HEADER. But caller still MUST check, that cmsg->cmsg_len is
+ *     inside range, given by msg->msg_controllen before using
+ *     ancillary object DATA.                          --ANK (980731)
  */
  
 __KINLINE struct cmsghdr * __cmsg_nxthdr(void *__ctl, __kernel_size_t __size,
                                               struct cmsghdr *__cmsg)
 {
-       unsigned char * __ptr;
+       struct cmsghdr * __ptr;  /* candidate address of the next header */
 
-       if (__cmsg->cmsg_len < sizeof(struct cmsghdr))
-               return NULL;
-       __ptr = ((unsigned char *) __cmsg) +  CMSG_ALIGN(__cmsg->cmsg_len);
-       if (__ptr >= (unsigned char *) __ctl + __size)
+       __ptr = (struct cmsghdr*)(((unsigned char *) __cmsg) +  CMSG_ALIGN(__cmsg->cmsg_len));  /* step over the current object, padded by CMSG_ALIGN */
+       if ((unsigned long)((char*)(__ptr+1) - (char *) __ctl) > __size)  /* end of the next HEADER (__ptr+1) must lie within __size bytes of __ctl */
                return NULL;
 
-       return (struct cmsghdr *) __ptr;
+       return __ptr;  /* only the header is known to fit; __ptr->cmsg_len is not checked here */
 }
 
 __KINLINE struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr *__cmsg)
index 4f41001..995e43e 100644 (file)
 #define SIOCSIFMEM     0x8920          /* set memory address (BSD)     */
 #define SIOCGIFMTU     0x8921          /* get MTU size                 */
 #define SIOCSIFMTU     0x8922          /* set MTU size                 */
+#define SIOCSIFNAME    0x8923          /* set interface name */
 #define        SIOCSIFHWADDR   0x8924          /* set hardware address         */
 #define SIOCGIFENCAP   0x8925          /* get/set encapsulations       */
 #define SIOCSIFENCAP   0x8926          
index a73a2d0..50b3373 100644 (file)
@@ -35,7+35,6 @@ struct dst_entry
        atomic_t                use;            /* client references    */
        struct device           *dev;
        int                     obsolete;
-       __u32                   priority;
        unsigned long           lastuse;
        unsigned                mxlock;
        unsigned                window;
index 7759e50..45d232f 100644 (file)
@@ -21,6+21,10 @@ struct flowi {
                        struct in6_addr *       saddr;
                } ip6_u;
        } nl_u;
+#define fl6_dst                nl_u.ip6_u.daddr
+#define fl6_src                nl_u.ip6_u.saddr
+#define fl4_dst                nl_u.ip4_u.daddr
+#define fl4_src                nl_u.ip4_u.saddr
 
        int     oif;
 
index 5bd90dd..905876d 100644 (file)
 
 struct rt6_info;
 
-struct fib6_node {
+struct fib6_node
+{
        struct fib6_node        *parent;
        struct fib6_node        *left;
        struct fib6_node        *right;
@@ -43,12+44,14 @@ struct fib6_node {
  *
  */
 
-struct rt6key {
+struct rt6key
+{
        struct in6_addr addr;
        int             plen;
 };
 
-struct rt6_info {
+struct rt6_info
+{
        union {
                struct dst_entry        dst;
                struct rt6_info         *next;
@@ -56,21+59,16 @@ struct rt6_info {
 
 #define rt6i_dev                       u.dst.dev
 #define rt6i_nexthop                   u.dst.neighbour
-#define rt6i_use                       u.dst.use
-#define rt6i_ref                       u.dst.refcnt
-
-#define rt6i_tstamp                    u.dst.lastuse
 
        struct fib6_node                *rt6i_node;
 
        struct in6_addr                 rt6i_gateway;
        
-       int                             rt6i_keylen;
-
        u32                             rt6i_flags;
        u32                             rt6i_metric;
        u8                              rt6i_hoplimit;
        unsigned long                   rt6i_expires;
+       atomic_t                        rt6i_ref;
 
        union {
                struct flow_rule        *rt6iu_flowr;
@@ -84,6+82,33 @@ struct rt6_info {
        struct rt6key                   rt6i_src;
 };
 
+struct fib6_walker_t
+{
+       struct fib6_walker_t *prev, *next;  /* list links maintained by fib6_walker_link()/fib6_walker_unlink() */
+       struct fib6_node *root, *node;  /* NOTE(review): presumably the tree root and the current position -- confirm in ip6_fib.c */
+       struct rt6_info *leaf;
+       unsigned char state;
+       unsigned char prune;
+       int (*func)(struct fib6_walker_t *);  /* callback; it is handed the walker itself */
+       void *args;  /* opaque pointer; presumably consumed by func -- confirm */
+};
+
+extern struct fib6_walker_t fib6_walker_list;
+
+extern __inline__ void fib6_walker_link(struct fib6_walker_t *w)  /* insert w directly after the fib6_walker_list head */
+{
+       w->next = fib6_walker_list.next;  /* old first element becomes w's successor */
+       w->prev = &fib6_walker_list;
+       w->next->prev = w;  /* dereferences head.next: assumes the list is never NULL-terminated (circular) -- confirm initialisation */
+       w->prev->next = w;  /* no lock is taken inside this function */
+}
+
+extern __inline__ void fib6_walker_unlink(struct fib6_walker_t *w)  /* remove w from the list it is linked on */
+{
+       w->next->prev = w->prev;  /* neighbours are joined to each other */
+       w->prev->next = w->next;
+       w->prev = w->next = w;  /* leave w self-linked rather than pointing at stale neighbours */
+}
 
 struct rt6_statistics {
        __u32           fib_nodes;
@@ -97,8+122,6 @@ struct rt6_statistics {
 #define RTN_ROOT       0x0002          /* tree root node               */
 #define RTN_RTINFO     0x0004          /* node with valid routing info */
 
-#define RTN_TAG                0x0100
-
 /*
  *     priority levels (or metrics)
  *
@@ -128,11+151,16 @@ extern struct fib6_node           *fib6_lookup(struct fib6_node *root,
                                             struct in6_addr *daddr,
                                             struct in6_addr *saddr);
 
-#define RT6_FILTER_RTNODES     1
+struct fib6_node               *fib6_locate(struct fib6_node *root,
+                                            struct in6_addr *daddr, int dst_len,
+                                            struct in6_addr *saddr, int src_len);
+
+extern void                    fib6_clean_tree(struct fib6_node *root,
+                                               int (*func)(struct rt6_info *, void *arg),
+                                               int prune, void *arg);
 
-extern void                    fib6_walk_tree(struct fib6_node *root,
-                                              f_pnode func, void *arg,
-                                              int filter);
+extern int                     fib6_walk(struct fib6_walker_t *w);
+extern int                     fib6_walk_continue(struct fib6_walker_t *w);
 
 extern int                     fib6_add(struct fib6_node *root,
                                         struct rt6_info *rt);
index 8ca62a7..9311cc3 100644 (file)
 #include <net/flow.h>
 #include <net/ip6_fib.h>
 
-/*
- *     Structure for assync processing of operations on the routing
- *     table
- */
-
-struct rt6_req {
-       int                     operation;
-       struct rt6_info         *ptr;
-
-       struct rt6_req          *next;
-       struct rt6_req          *prev;
-
-#define RT_OPER_ADD            1
-#define RT_OPER_DEL            2
-};
-
-
 struct pol_chain {
        int                     type;
        int                     priority;
@@ -53,8+36,7 @@ extern void                   ip6_route_cleanup(void);
 
 extern int                     ipv6_route_ioctl(unsigned int cmd, void *arg);
 
-extern struct rt6_info *       ip6_route_add(struct in6_rtmsg *rtmsg,
-                                             int *err);
+extern int                     ip6_route_add(struct in6_rtmsg *rtmsg);
 extern int                     ip6_del_rt(struct rt6_info *);
 
 extern int                     ip6_rt_addr_add(struct in6_addr *addr,
@@ -85,15+67,15 @@ extern struct rt6_info *    rt6_add_dflt_router(struct in6_addr *gwaddr,
 
 extern void                    rt6_purge_dflt_routers(int lst_resort);
 
-extern struct rt6_info *       rt6_redirect(struct in6_addr *dest,
+extern void                    rt6_redirect(struct in6_addr *dest,
                                             struct in6_addr *saddr,
-                                            struct in6_addr *target,
-                                            struct device *dev,
+                                            struct neighbour *neigh,
                                             int on_link);
 
-extern void                    rt6_pmtu_discovery(struct in6_addr *addr,
+extern void                    rt6_pmtu_discovery(struct in6_addr *daddr,
+                                                  struct in6_addr *saddr,
                                                   struct device *dev,
-                                                  int pmtu);
+                                                  u32 pmtu);
 
 struct nlmsghdr;
 struct netlink_callback;
@@ -103,22+85,25 @@ extern int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *a
 extern int inet6_rtm_getroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg);
 
 extern void rt6_ifdown(struct device *dev);
+extern void rt6_mtu_change(struct device *dev, unsigned mtu);
 
 /*
  *     Store a destination cache entry in a socket
  *     For UDP/RAW sockets this is done on udp_connect.
  */
 
-extern __inline__ void ip6_dst_store(struct sock *sk, struct dst_entry *dst)
+extern __inline__ void ip6_dst_store(struct sock *sk, struct dst_entry *dst,
+                                    struct in6_addr *daddr)  /* daddr is recorded in np->daddr_cache below */
 {
        struct ipv6_pinfo *np;
        struct rt6_info *rt;
-               
+
        np = &sk->net_pinfo.af_inet6;
        dst_release(xchg(&sk->dst_cache,dst));
-       
+
        rt = (struct rt6_info *) dst;
-       
+
+       np->daddr_cache = daddr;  /* the pointer is stored, the address is not copied */
        np->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
 }
 
index f96fa61..b79e4d0 100644 (file)
@@ -212,7+212,7 @@ extern int          fib_semantic_match(int type, struct fib_info *,
 extern struct fib_info *fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
                                         const struct nlmsghdr *, int *err);
 extern int fib_nh_match(struct rtmsg *r, struct nlmsghdr *, struct kern_rta *rta, struct fib_info *fi);
-extern int fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event,
+extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
                         u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
                         struct fib_info *fi);
 extern int fib_sync_down(u32 local, struct device *dev, int force);
index acf37b3..03f30b6 100644 (file)
@@ -4,7+4,7 @@
  *     Authors:
  *     Pedro Roque             <roque@di.fc.ul.pt>
  *
- *     $Id: ipv6.h,v 1.12 1998/07/15 05:05:02 davem Exp $
+ *     $Id: ipv6.h,v 1.13 1998/08/26 12:02:11 davem Exp $
  *
  *     This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
@@ -86,53+86,44 @@ struct frag_hdr {
 
 #include <net/sock.h>
 
-extern struct ipv6_mib ipv6_statistics;
+extern struct ipv6_mib         ipv6_statistics;
+extern struct icmpv6_mib       icmpv6_statistics;
+extern struct udp_mib          udp_stats_in6;
 
-struct ipv6_frag {
-       __u16                   offset;
-       __u16                   len;
-       struct sk_buff          *skb;
-
-       struct frag_hdr         *fhdr;
-
-       struct ipv6_frag        *next;
+struct ip6_ra_chain
+{
+       struct ip6_ra_chain     *next;
+       struct sock             *sk;
+       int                     sel;
+       void                    (*destructor)(struct sock *);
 };
 
+extern struct ip6_ra_chain     *ip6_ra_chain;
+
 /*
- *     Equivalent of ipv4 struct ipq
+   This structure is prepared by protocol, when parsing
+   ancillary data and passed to IPv6.
  */
 
-struct frag_queue {
+struct ipv6_txoptions
+{
+       /* Length of this structure */
+       int                     tot_len;
 
-       struct frag_queue       *next;
-       struct frag_queue       *prev;
+       /* length of extension headers   */
 
-       __u32                   id;             /* fragment id          */
-       struct in6_addr         saddr;
-       struct in6_addr         daddr;
-       struct timer_list       timer;          /* expire timer         */
-       struct ipv6_frag        *fragments;
-       struct device           *dev;
-       __u8                    last_in;        /* has last segment arrived? */
-       __u8                    nexthdr;
-       __u8                    *nhptr;
-};
+       __u16                   opt_flen;       /* after fragment hdr */
+       __u16                   opt_nflen;      /* before fragment hdr */
 
-struct ipv6_tlvtype
-{
-       u8 type;
-       u8 len;
-};
+       struct ipv6_opt_hdr     *hopopt;
+       struct ipv6_opt_hdr     *dst0opt;
+       struct ipv6_rt_hdr      *srcrt; /* Routing Header */
+       struct ipv6_opt_hdr     *auth;
+       struct ipv6_opt_hdr     *dst1opt;
 
-struct ip6_ra_chain
-{
-       struct ip6_ra_chain     *next;
-       struct sock             *sk;
-       int                     sel;
-       void                    (*destructor)(struct sock *);
+       /* Option buffer, as read by IPV6_PKTOPTIONS, starts here. */
 };
 
-extern struct ip6_ra_chain     *ip6_ra_chain;
 
 extern int                     ip6_ra_control(struct sock *sk, int sel,
                                               void (*destructor)(struct sock *));
@@ -140,18+131,13 @@ extern int                        ip6_ra_control(struct sock *sk, int sel,
 
 extern int                     ip6_call_ra_chain(struct sk_buff *skb, int sel);
 
-extern int                     ip6_dstopt_unknown(struct sk_buff *skb,
-                                                  struct ipv6_tlvtype *hdr);
+extern u8 *                    ipv6_reassembly(struct sk_buff **skb, u8 *nhptr);
 
-extern int                     ipv6_routing_header(struct sk_buff **skb, 
-                                                   struct device *dev,
-                                                   __u8 *nhptr, 
-                                                   struct ipv6_options *opt);
+extern u8 *                    ipv6_parse_hopopts(struct sk_buff *skb, u8 *nhptr);
 
-extern int                     ipv6_reassembly(struct sk_buff **skb, 
-                                               struct device *dev, 
-                                               __u8 *nhptr,
-                                               struct ipv6_options *opt);
+extern u8 *                    ipv6_parse_exthdrs(struct sk_buff **skb, u8 *nhptr);
+
+extern struct ipv6_txoptions *  ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt);
 
 #define IPV6_FRAG_TIMEOUT      (60*HZ)         /* 60 seconds */
 
@@ -226,7+212,7 @@ extern int                  ipv6_rcv(struct sk_buff *skb,
 extern int                     ip6_xmit(struct sock *sk,
                                         struct sk_buff *skb,
                                         struct flowi *fl,
-                                        struct ipv6_options *opt);
+                                        struct ipv6_txoptions *opt);
 
 extern int                     ip6_nd_hdr(struct sock *sk,
                                           struct sk_buff *skb,
@@ -240,7+226,7 @@ extern int                  ip6_build_xmit(struct sock *sk,
                                               const void *data,
                                               struct flowi *fl,
                                               unsigned length,
-                                              struct ipv6_options *opt,
+                                              struct ipv6_txoptions *opt,
                                               int hlimit, int flags);
 
 /*
@@ -256,28+242,27 @@ extern int                        ip6_mc_input(struct sk_buff *skb);
  *     Extension header (options) processing
  */
 
-extern int                     ipv6opt_bld_rthdr(struct sk_buff *skb,
-                                                 struct ipv6_options *opt,
-                                                 struct in6_addr *addr,
-                                                 int proto);
-
-extern int                     ipv6opt_srcrt_co(struct sockaddr_in6 *sin6, 
-                                                int len, 
-                                                struct ipv6_options *opt);
-
-extern int                     ipv6opt_srcrt_cl(struct sockaddr_in6 *sin6, 
-                                                int num_addrs, 
-                                                struct ipv6_options *opt);
-
-extern int                     ipv6opt_srt_tosin(struct ipv6_options *opt,
-                                                 struct sockaddr_in6 *sin6,
-                                                 int len);
-
-extern void                    ipv6opt_free(struct ipv6_options *opt);
-
-extern struct ipv6_opt_hdr *   ipv6_skip_exthdr(struct ipv6_opt_hdr *hdr, 
+extern u8 *                    ipv6_build_nfrag_opts(struct sk_buff *skb,
+                                                     u8 *prev_hdr,
+                                                     struct ipv6_txoptions *opt,
+                                                     struct in6_addr *daddr,
+                                                     u32 jumbolen);
+extern u8 *                    ipv6_build_frag_opts(struct sk_buff *skb,
+                                                    u8 *prev_hdr,
+                                                    struct ipv6_txoptions *opt);
+extern void                    ipv6_push_nfrag_opts(struct sk_buff *skb,
+                                                    struct ipv6_txoptions *opt,
+                                                    u8 *proto,
+                                                    struct in6_addr **daddr_p);
+extern void                    ipv6_push_frag_opts(struct sk_buff *skb,
+                                                   struct ipv6_txoptions *opt,
+                                                   u8 *proto);
+
+extern u8 *                    ipv6_skip_exthdr(struct ipv6_opt_hdr *hdr, 
                                                 u8 *nexthdrp, int len);
 
+extern struct ipv6_txoptions * ipv6_invert_rthdr(struct sock *sk,
+                                                 struct ipv6_rt_hdr *hdr);
 
 
 /*
index 8ca3713..7a51f36 100644 (file)
@@ -60,12+60,7 @@ extern int                   ndisc_init(struct net_proto_family *ops);
 
 extern void                    ndisc_cleanup(void);
 
-extern int                     ndisc_rcv(struct sk_buff *skb,
-                                         struct device *dev,
-                                         struct in6_addr *saddr,
-                                         struct in6_addr *daddr,
-                                         struct ipv6_options *opt,
-                                         unsigned short len);
+extern int                     ndisc_rcv(struct sk_buff *skb, unsigned long len);
 
 extern void                    ndisc_send_ns(struct device *dev,
                                              struct neighbour *neigh,
index de7c769..142f7b3 100644 (file)
@@ -147,12+147,12 @@ extern psched_time_t      psched_time_base;
 
 #if PSCHED_CLOCK_SOURCE == PSCHED_JIFFIES
 
-#define PSCHED_WATCHER
+#define PSCHED_WATCHER unsigned long
 
-extern unsigned long psched_time_mark;
+extern PSCHED_WATCHER psched_time_mark;
 
 #if HZ == 100
-#define PSCHED_JSCALE 7
+#define PSCHED_JSCALE 13
 #elif HZ == 1024
 #define PSCHED_JSCALE 10
 #else
@@ -179,9+179,9 @@ extern int psched_clock_scale;
 
 #elif defined (__alpha__)
 
-#define PSCHED_WATCHER
+#define PSCHED_WATCHER u32
 
-extern u32 psched_time_mark;
+extern PSCHED_WATCHER psched_time_mark;
 
 #define PSCHED_GET_TIME(stamp) \
 ({ u32 __res; \
index 63d562a..f6e947b 100644 (file)
@@ -48,17+48,13 @@ struct inet_protocol
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 struct inet6_protocol 
 {
-       int     (*handler)(struct sk_buff *skb, struct device *dev,
-                       struct in6_addr *saddr,
-                       struct in6_addr *daddr,
-                       struct ipv6_options *opt, 
-                       unsigned short len,
-                       int redo, struct inet6_protocol *protocol);
+       int     (*handler)(struct sk_buff *skb,
+                       unsigned long len);
 
-       void    (*err_handler)(struct sk_buff *skb, int type, int code, unsigned char *buff,
-                       __u32 info, struct in6_addr *saddr,
-                       struct in6_addr *daddr,
-                       struct inet6_protocol *protocol);
+       void    (*err_handler)(struct sk_buff *skb, struct ipv6hdr *hdr,
+                              struct inet6_skb_parm *opt,
+                              int type, int code, unsigned char *buff,
+                              __u32 info);
        struct inet6_protocol *next;
        unsigned char   protocol;
        unsigned char   copy:1;
index 3637371..d54572d 100644 (file)
@@ -10,19+10,17 @@ extern struct sock *raw_v6_htable[RAWV6_HTABLE_SIZE];
 extern struct sock *raw_v6_lookup(struct sock *sk, unsigned short num,
                                  struct in6_addr *loc_addr, struct in6_addr *rmt_addr);
 
-extern int                     rawv6_rcv(struct sk_buff *skb, 
-                                         struct device *dev,
-                                         struct in6_addr *saddr, 
-                                         struct in6_addr *daddr,
-                                         struct ipv6_options *opt, 
-                                         unsigned short len);
+extern int                     rawv6_rcv(struct sock *sk,
+                                         struct sk_buff *skb, 
+                                         unsigned long len);
 
 
 extern void                    rawv6_err(struct sock *sk,
+                                         struct sk_buff *skb,
+                                         struct ipv6hdr *hdr,
+                                         struct inet6_skb_parm *opt,
                                          int type, int code, 
-                                         unsigned char *buff,
-                                         struct in6_addr *saddr,
-                                         struct in6_addr *daddr);
+                                         unsigned char *buff, u32 info);
 
 #endif
 
index 624fd23..bd76d03 100644 (file)
 #include <net/dst.h>
 #include <linux/in_route.h>
 #include <linux/rtnetlink.h>
+#include <linux/route.h>
+
+#ifndef __KERNEL__
+#warning This file is not supposed to be used outside of kernel.
+#endif
 
 #define RT_HASH_DIVISOR                256
 
  */
 #define RT_CACHE_BUBBLE_THRESHOLD      (5*HZ)
 
-#include <linux/route.h>
 
 #define RTO_ONLINK     0x01
 #define RTO_TPROXY     0x80000000
@@ -87,7+91,8 @@ struct rtable
 #endif
 };
 
-#ifdef __KERNEL__
+extern struct rtable   *rt_hash_table[RT_HASH_DIVISOR];
+
 extern void            ip_rt_init(void);
 extern void            ip_rt_redirect(u32 old_gw, u32 dst, u32 new_gw,
                                       u32 src, u8 tos, struct device *dev);
@@ -131,7+136,4 @@ extern __inline__ int ip_route_connect(struct rtable **rp, u32 dst, u32 src, u32
        return ip_route_output(rp, dst, src, tos, oif);
 }
 
-#endif
-
-
 #endif /* _ROUTE_H */
index eeeeb6a..e38826d 100644 (file)
@@ -52,11+52,14 @@ struct ipv6_mib
 {
        unsigned long   Ip6InReceives;
        unsigned long   Ip6InHdrErrors;
+       unsigned long   Ip6InTooBigErrors;
+       unsigned long   Ip6InNoRoutes;
        unsigned long   Ip6InAddrErrors;
-       unsigned long   Ip6ForwDatagrams;
        unsigned long   Ip6InUnknownProtos;
+       unsigned long   Ip6InTruncatedPkts;
        unsigned long   Ip6InDiscards;
        unsigned long   Ip6InDelivers;
+       unsigned long   Ip6OutForwDatagrams;
        unsigned long   Ip6OutRequests;
        unsigned long   Ip6OutDiscards;
        unsigned long   Ip6OutNoRoutes;
@@ -67,6+70,8 @@ struct ipv6_mib
        unsigned long   Ip6FragOKs;
        unsigned long   Ip6FragFails;
        unsigned long   Ip6FragCreates;
+       unsigned long   Ip6InMcastPkts;
+       unsigned long   Ip6OutMcastPkts;
 };
  
 struct icmp_mib
@@ -98,6+103,43 @@ struct icmp_mib
        unsigned long   IcmpOutAddrMasks;
        unsigned long   IcmpOutAddrMaskReps;
 };
+
+struct icmpv6_mib  /* ICMPv6 counters: In* fields first, then Out* fields */
+{
+       unsigned long   Icmp6InMsgs;
+       unsigned long   Icmp6InErrors;
+
+       unsigned long   Icmp6InDestUnreachs;
+       unsigned long   Icmp6InPktTooBigs;
+       unsigned long   Icmp6InTimeExcds;
+       unsigned long   Icmp6InParmProblems;
+
+       unsigned long   Icmp6InEchos;
+       unsigned long   Icmp6InEchoReplies;
+       unsigned long   Icmp6InGroupMembQueries;
+       unsigned long   Icmp6InGroupMembResponses;
+       unsigned long   Icmp6InGroupMembReductions;
+       unsigned long   Icmp6InRouterSolicits;
+       unsigned long   Icmp6InRouterAdvertisements;
+       unsigned long   Icmp6InNeighborSolicits;
+       unsigned long   Icmp6InNeighborAdvertisements;
+       unsigned long   Icmp6InRedirects;
+
+       unsigned long   Icmp6OutMsgs;  /* start of the Out* group; there is no Icmp6OutErrors field */
+
+       unsigned long   Icmp6OutDestUnreachs;
+       unsigned long   Icmp6OutPktTooBigs;
+       unsigned long   Icmp6OutTimeExcds;
+       unsigned long   Icmp6OutParmProblems;
+
+       unsigned long   Icmp6OutEchoReplies;  /* note: no Icmp6OutEchos counterpart to Icmp6InEchos is declared */
+       unsigned long   Icmp6OutRouterSolicits;
+       unsigned long   Icmp6OutNeighborSolicits;
+       unsigned long   Icmp6OutNeighborAdvertisements;
+       unsigned long   Icmp6OutRedirects;
+       unsigned long   Icmp6OutGroupMembResponses;
+       unsigned long   Icmp6OutGroupMembReductions;
+};
  
 struct tcp_mib
 {
@@ -131,6+173,9 @@ struct linux_mib
        unsigned long   SyncookiesRecv;
        unsigned long   SyncookiesFailed;
        unsigned long   EmbryonicRsts;
+       unsigned long   PruneCalled; 
+       unsigned long   RcvPruned;
+       unsigned long   OfoPruned;
 };
        
 #endif
index d8389f6..ad27511 100644 (file)
 
 #include <asm/atomic.h>
 
+#define MIN_WRITE_SPACE        2048
+
 /* The AF_UNIX specific socket options */
 struct unix_opt {
        int                     family;
@@ -134,6+136,7 @@ struct ipv6_pinfo {
        struct in6_addr         saddr;
        struct in6_addr         rcv_saddr;
        struct in6_addr         daddr;
+       struct in6_addr         *daddr_cache;
 
        __u32                   flow_lbl;
        int                     hop_limit;
@@ -141,21+144,28 @@ struct ipv6_pinfo {
        int                     mcast_oif;
        __u8                    priority;
 
-
-       /* sockopt flags */
-
-       __u8                    recvsrcrt:1,
-                               rxinfo:1,
+       /* pktoption flags */
+       union {
+               struct {
+                       __u8    srcrt:2,
+                               rxinfo:1,
                                rxhlim:1,
                                hopopts:1,
                                dstopts:1,
-                               mc_loop:1,
-                                unused:2;
+                                authhdr:1,
+                                unused:1;
+               } bits;
+               __u8            all;
+       } rxopt;
+
+       /* sockopt flags */
+       __u8                    mc_loop:1;
 
        struct ipv6_mc_socklist *ipv6_mc_list;
        __u32                   dst_cookie;
 
-       struct ipv6_options     *opt;
+       struct ipv6_txoptions   *opt;
+       struct sk_buff          *pktoptions;
 };
 
 struct raw6_opt {
@@ -207,6+217,10 @@ struct tcp_opt {
        __u32   snd_wl2;        /* Ack sequence for update              */
        __u32   snd_wnd;        /* The window we expect to receive      */
        __u32   max_window;
+       __u32   pmtu_cookie;    /* Last pmtu seen by socket             */
+       __u16   mss_cache;      /* Cached effective mss, not including SACKS */
+       __u16   mss_clamp;      /* Maximal mss, negotiated at connection setup */
+       __u16   ext_header_len; /* Dave, do you allow mw to use this hole? 8) --ANK */
        __u8    pending;        /* pending events                       */
        __u8    retransmits;
        __u32   last_ack_sent;  /* last ack we sent                     */
@@ -226,6+240,7 @@ struct tcp_opt {
        __u32   snd_ssthresh;   /* Slow start size threshold            */
        __u8    dup_acks;       /* Consequetive duplicate acks seen from other end */
        __u8    delayed_acks;
+       __u16   user_mss;       /* mss requested by user in ioctl */
 
        /* Two commonly used timers in both sender and receiver paths. */
        struct timer_list       retransmit_timer;       /* Resend (no ack)      */
@@ -252,7+267,6 @@ struct tcp_opt {
                wscale_ok,      /* Wscale seen on SYN packet            */
                sack_ok;        /* SACK seen on SYN packet              */
        char    saw_tstamp;     /* Saw TIMESTAMP on last packet         */
-        __u16  in_mss;         /* MSS option received from sender      */
         __u8   snd_wscale;     /* Window scaling received from sender  */
         __u8   rcv_wscale;     /* Window scaling to send to receiver   */
         __u32  rcv_tsval;      /* Time stamp value                     */
@@ -270,6+284,9 @@ struct tcp_opt {
        __u32   urg_seq;
        __u32   urg_data;
 
+       __u32   last_seg_size;  /* Size of last incoming segment */
+       __u32   rcv_mss;        /* MSS used for delayed ACK decisions */ 
+
        struct open_request     *syn_wait_queue;
        struct open_request     **syn_wait_last;
 
@@ -390,12+407,6 @@ struct sock {
 
        struct proto            *prot;
 
-       /* mss is min(mtu, max_window)
-        * XXX Fix this, mtu only used in one TCP place and that is it -DaveM
-        */
-       unsigned short          mtu;       /* mss negotiated in the syn's */
-       unsigned short          mss;       /* current eff. mss - can change */
-       unsigned short          user_mss;  /* mss requested by user in ioctl */
        unsigned short          shutdown;
 
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
@@ -868,6+879,26 @@ extern __inline__ int sock_error(struct sock *sk)
        return -err;
 }
 
+extern __inline__ unsigned long sock_wspace(struct sock *sk)
+{
+       int amt = 0;    /* stays 0 when SEND_SHUTDOWN is set */
+
+       if (!(sk->shutdown & SEND_SHUTDOWN)) {
+               amt = sk->sndbuf - atomic_read(&sk->wmem_alloc);        /* sndbuf minus current wmem_alloc */
+               if (amt < 0)    /* clamp: never return a negative amount */
+                       amt = 0;
+       }
+       return amt;
+}
+
+/*
+ *     Default write policy as shown to user space via poll/select/SIGIO.
+ *     The kernel internally doesn't use the MIN_WRITE_SPACE threshold.
+ */
+extern __inline__ int sock_writeable(struct sock *sk) 
+{
+       return sock_wspace(sk) >= MIN_WRITE_SPACE;      /* nonzero once sock_wspace() reaches the threshold */
+}
 
 /* 
  *     Declarations from timer.c 
index 225d40a..3f305aa 100644 (file)
@@ -78,6+78,7 @@ struct tcp_bind_bucket {
        unsigned short          flags;
 #define TCPB_FLAG_LOCKED       0x0001
 #define TCPB_FLAG_FASTREUSE    0x0002
+#define TCPB_FLAG_GOODSOCKNUM  0x0004
 
        struct tcp_bind_bucket  *next;
        struct sock             *owners;
@@ -230,11+231,8 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk)
        return tcp_lhashfn(sk->num);
 }
 
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-#define NETHDR_SIZE    sizeof(struct ipv6hdr)
-#else
-#define NETHDR_SIZE    sizeof(struct iphdr) + 40
-#endif
+/* Note that this is larger than the IPv6 header */
+#define NETHDR_SIZE    (sizeof(struct iphdr) + 40)
 
 /*
  * 40 is maximal IP options size
@@ -257,7+255,6 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk)
 #define MIN_WINDOW     2048
 #define MAX_ACK_BACKLOG        2
 #define MAX_DELAY_ACK  2
-#define MIN_WRITE_SPACE        2048
 #define TCP_WINDOW_DIFF        2048
 
 /* urg_data states */
@@ -354,7+351,7 @@ struct tcp_v4_open_req {
 struct tcp_v6_open_req {
        struct in6_addr         loc_addr;
        struct in6_addr         rmt_addr;
-       struct ipv6_options     *opt;
+       struct sk_buff          *pktopts;
        int                     iif;
 };
 #endif
@@ -400,6+397,13 @@ extern kmem_cache_t *tcp_openreq_cachep;
 /*
  *     Pointers to address related TCP functions
  *     (i.e. things that depend on the address family)
+ *
+ *     BUGGG_FUTURE: the whole idea behind this struct is wrong.
+ *     It mixes the socket frontend with transport functions.
+ *     With port sharing between IPv6/v4 it gives only one advantage:
+ *     only poor IPv6 needs to permanently recheck that it
+ *     is still IPv6 8)8) It must be cleaned up as soon as possible.
+ *                                             --ANK (980802)
  */
 
 struct tcp_func {
@@ -414,7+418,7 @@ struct tcp_func {
 
        int                     (*conn_request)         (struct sock *sk,
                                                         struct sk_buff *skb,
-                                                        void *opt, __u32 isn);
+                                                        __u32 isn);
 
        struct sock *           (*syn_recv_sock)        (struct sock *sk,
                                                         struct sk_buff *skb,
@@ -424,6+428,10 @@ struct tcp_func {
        struct sock *           (*get_sock)             (struct sk_buff *skb,
                                                         struct tcphdr *th);
 
+       __u16                   net_header_len;
+
+
+
        int                     (*setsockopt)           (struct sock *sk, 
                                                         int level, 
                                                         int optname, 
@@ -490,22+498,24 @@ extern int                        tcp_ioctl(struct sock *sk,
 extern int                     tcp_rcv_state_process(struct sock *sk, 
                                                      struct sk_buff *skb,
                                                      struct tcphdr *th,
-                                                     void *opt, __u16 len);
+                                                     unsigned len);
 
 extern int                     tcp_rcv_established(struct sock *sk, 
                                                    struct sk_buff *skb,
                                                    struct tcphdr *th, 
-                                                   __u16 len);
+                                                   unsigned len);
 
 extern int                     tcp_timewait_state_process(struct tcp_tw_bucket *tw,
                                                           struct sk_buff *skb,
                                                           struct tcphdr *th,
-                                                          void *opt, __u16 len);
+                                                          unsigned len);
 
 extern void                    tcp_close(struct sock *sk, 
                                          unsigned long timeout);
 extern struct sock *           tcp_accept(struct sock *sk, int flags);
 extern unsigned int            tcp_poll(struct file * file, struct socket *sock, struct poll_table_struct *wait);
+extern void                    tcp_write_space(struct sock *sk); 
+
 extern int                     tcp_getsockopt(struct sock *sk, int level, 
                                               int optname, char *optval, 
                                               int *optlen);
@@ -536,12+546,11 @@ extern void                       tcp_v4_send_check(struct sock *sk,
 
 extern int                     tcp_v4_conn_request(struct sock *sk,
                                                    struct sk_buff *skb,
-                                                   void *ptr, __u32 isn);
+                                                   __u32 isn);
 
 extern struct sock *           tcp_create_openreq_child(struct sock *sk,
                                                         struct open_request *req,
-                                                        struct sk_buff *skb,
-                                                        int mss);
+                                                        struct sk_buff *skb);
 
 extern struct sock *           tcp_v4_syn_recv_sock(struct sock *sk,
                                                     struct sk_buff *skb,
@@ -628,30+637,25 @@ struct tcp_sl_timer {
 
 extern struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX];
  
+extern int tcp_sync_mss(struct sock *sk, u32 pmtu);
+
 /* Compute the current effective MSS, taking SACKs and IP options,
  * and even PMTU discovery events into account.
  */
+
 static __inline__ unsigned int tcp_current_mss(struct sock *sk)
 {
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        struct dst_entry *dst = sk->dst_cache;
-       unsigned int mss_now = sk->mss
+       int mss_now = tp->mss_cache
 
-       if(dst && (sk->mtu < dst->pmtu)) {
-               unsigned int mss_distance = (sk->mtu - sk->mss);
-
-               /* PMTU discovery event has occurred. */
-               sk->mtu = dst->pmtu;
-               mss_now = sk->mss = sk->mtu - mss_distance;
-       }
+       if (dst && dst->pmtu != tp->pmtu_cookie)
+               mss_now = tcp_sync_mss(sk, dst->pmtu);
 
        if(tp->sack_ok && tp->num_sacks)
                mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
                            (tp->num_sacks * TCPOLEN_SACK_PERBLOCK));
-       if(sk->opt)
-               mss_now -= sk->opt->optlen;
-
-       return mss_now; 
+       return mss_now > 8 ? mss_now : 8;
 }
 
 /* Compute the actual receive window we are currently advertising.
@@ -715,7+719,12 @@ extern __inline__ int tcp_raise_window(struct sock *sk)
  * skbuff.h:skbuff->cb[xxx] size appropriately.
  */
 struct tcp_skb_cb {
-       struct inet_skb_parm    header; /* For incoming frames          */
+       union {
+               struct inet_skb_parm    h4;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+               struct inet6_skb_parm   h6;
+#endif
+       } header;       /* For incoming frames          */
        __u32           seq;            /* Starting sequence number     */
        __u32           end_seq;        /* SEQ + FIN + SYN + datalen    */
        unsigned long   when;           /* used to compute rtt's        */
@@ -787,7+796,7 @@ static __inline__ int tcp_snd_test(struct sock *sk, struct sk_buff *skb)
         *
         *      Don't use the nagle rule for urgent data.
         */
-       if (!sk->nonagle && skb->len < (sk->mss >> 1) && tp->packets_out &&
+       if (!sk->nonagle && skb->len < (tp->mss_cache >> 1) && tp->packets_out &&
            !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG))
                nagle_check = 0;
 
@@ -913,8+922,6 @@ extern __inline__ void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sa
         * SACKs don't matter, we never delay an ACK when we
         * have any of those going out.
         */
-       if(ts)
-               mss += TCPOLEN_TSTAMP_ALIGNED;
        *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
        if (ts) {
                if(sack)
index 45bdcc5..b697d7c 100644 (file)
@@ -28,7+28,7 @@ extern int                    datagram_recv_ctl(struct sock *sk,
 extern int                     datagram_send_ctl(struct msghdr *msg,
                                                  int *oif,
                                                  struct in6_addr **src_addr,
-                                                 struct ipv6_options *opt,
+                                                 struct ipv6_txoptions *opt,
                                                  int *hlimit);
 
 #define                LOOPBACK4_IPV6          __constant_htonl(0x7f000006)
@@ -38,6+38,8 @@ extern int                    datagram_send_ctl(struct msghdr *msg,
  */
 extern struct tcp_func ipv4_specific;
 
+extern int inet6_destroy_sock(struct sock *sk);
+
 #endif
 
 #endif
index 4b37834..d09a192 100644 (file)
@@ -1129,7+1129,6 @@ asmlinkage void __init start_kernel(void)
        mtrr_init ();
 #endif
 
-       sock_init();
 #ifdef CONFIG_SYSCTL
        sysctl_init();
 #endif
@@ -1200,6+1199,9 @@ static int init(void * unused)
        int real_root_mountflags;
 #endif
 
+       /* Networking initialization needs a process context */ 
+       sock_init();
+
        /* Launch bdflush from here, instead of the old syscall way. */
        kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
        /* Start the background pageout daemon. */
index f1ed3f7..b645703 100644 (file)
@@ -17,7+17,9 @@ bool 'TCP/IP networking' CONFIG_INET
 if [ "$CONFIG_INET" = "y" ]; then
   source net/ipv4/Config.in
   if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
-    tristate 'The IPv6 protocol (EXPERIMENTAL)' CONFIG_IPV6
+#   Sorry, but IPv6 as module is still invalid.
+#   tristate 'The IPv6 protocol (EXPERIMENTAL)' CONFIG_IPV6
+    bool 'The IPv6 protocol (EXPERIMENTAL)' CONFIG_IPV6
     if [ "$CONFIG_IPV6" != "n" ]; then
            source net/ipv6/Config.in
     fi
index 331f3eb..bcfe9e4 100644 (file)
@@ -1017,7+1017,6 @@ static int atalk_create(struct socket *sock, int protocol)
 
        sk->destruct = NULL;
        /* Checksums on by default */
-       sk->mtu = DDP_MAXSZ;
        sk->zapped = 1;
 
        return (0);
index 71999a4..cd84989 100644 (file)
@@ -849,7+849,6 @@ int ax25_create(struct socket *sock, int protocol)
        sk->destruct = ax25_free_sock;
        sock->ops    = &ax25_proto_ops;
        sk->protocol = protocol;
-       sk->mtu      = AX25_MTU;        /* 256 */
 
        ax25->sk          = sk;
        sk->protinfo.ax25 = ax25;
@@ -892,7+891,6 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev)
        sk->sndbuf   = osk->sndbuf;
        sk->debug    = osk->debug;
        sk->state    = TCP_ESTABLISHED;
-       sk->mtu      = osk->mtu;
        sk->sleep    = osk->sleep;
        sk->zapped   = osk->zapped;
 
index 186ccf8..f064370 100644 (file)
 
 static inline void wait_for_packet(struct sock * sk)
 {
-       unsigned long flags;
+       struct wait_queue wait = { current, NULL };
+
+       add_wait_queue(sk->sleep, &wait);
+       current->state = TASK_INTERRUPTIBLE;
 
-       release_sock(sk);
-       save_flags(flags);
-       cli();
        if (skb_peek(&sk->receive_queue) == NULL)
-               interruptible_sleep_on(sk->sleep);
-       restore_flags(flags);
-       lock_sock(sk);
+               schedule();
+
+       current->state = TASK_RUNNING;
+       remove_wait_queue(sk->sleep, &wait);
 }
 
 /*
@@ -84,6+85,14 @@ static inline int connection_based(struct sock *sk)
  *     This function will lock the socket if a skb is returned, so the caller
  *     needs to unlock the socket in that case (usually by calling skb_free_datagram)
  *
+ *     * As of today it no longer locks the socket. This function is
+ *     * free of race conditions. This measure should/can significantly
+ *     * improve datagram socket latencies at high loads,
+ *     * when data copying to user space takes lots of time.
+ *     * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
+ *     *  8) Great win.)
+ *     *                                           --ANK (980729)
+ *
  *     The order of the tests when we find no data waiting are specified
  *     quite explicitly by POSIX 1003.1g, don't change them without having
  *     the standard around please.
@@ -94,7+103,6 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock,
        int error;
        struct sk_buff *skb;
 
-       lock_sock(sk);
 restart:
        while(skb_queue_empty(&sk->receive_queue))      /* No data */
        {
@@ -129,13+137,24 @@ restart:
           will suddenly eat the receive_queue */
        if (flags & MSG_PEEK)
        {
-               unsigned long flags;
-               save_flags(flags);
-               cli();
+               unsigned long cpu_flags;
+
+               /* It is the only POTENTIAL race condition
+                  in this function. skb may be stolen by
+                  another receiver after peek, but before
+                  incrementing use count, provided kernel
+                  is reentrant (it is not) or this function
+                  is called from interrupts.
+
+                  Protect it with global skb spinlock,
+                  though for now even this is overkill.
+                                               --ANK (980728)
+                */
+               spin_lock_irqsave(&skb_queue_lock, cpu_flags);
                skb = skb_peek(&sk->receive_queue);
                if(skb!=NULL)
                        atomic_inc(&skb->users);
-               restore_flags(flags);
+               spin_unlock_irqrestore(&skb_queue_lock, cpu_flags);
        } else
                skb = skb_dequeue(&sk->receive_queue);
 
@@ -144,7+163,6 @@ restart:
        return skb;
 
 no_packet:
-       release_sock(sk);
        *err = error;
        return NULL;
 }
@@ -152,7+170,6 @@ no_packet:
 void skb_free_datagram(struct sock * sk, struct sk_buff *skb)
 {
        kfree_skb(skb);
-       release_sock(sk);
 }
 
 /*
@@ -184,6+201,10 @@ int skb_copy_datagram_iovec(struct sk_buff *skb, int offset, struct iovec *to,
  *     Datagram poll: Again totally generic. This also handles
  *     sequenced packet sockets providing the socket receive queue
  *     is only ever holding data ready to receive.
+ *
+ *     Note: when you _don't_ use this routine for this protocol,
+ *     and you use a different write policy from sock_writeable()
+ *     then please supply your own write_space callback.
  */
 
 unsigned int datagram_poll(struct file * file, struct socket *sock, poll_table *wait)
@@ -199,7+220,7 @@ unsigned int datagram_poll(struct file * file, struct socket *sock, poll_table *
                mask |= POLLERR;
        if (sk->shutdown & RCV_SHUTDOWN)
                mask |= POLLHUP;
-
+       
        /* readable? */
        if (!skb_queue_empty(&sk->receive_queue))
                mask |= POLLIN | POLLRDNORM;
@@ -214,15+235,8 @@ unsigned int datagram_poll(struct file * file, struct socket *sock, poll_table *
        }
 
        /* writable? */
-       if (!(sk->shutdown & SEND_SHUTDOWN)) {
-               if (sk->prot) {
-                       if (sock_wspace(sk) >= MIN_WRITE_SPACE)
-                               mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
-               } else {
-                       if (sk->sndbuf - atomic_read(&sk->wmem_alloc) >= MIN_WRITE_SPACE)
-                               mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
-               }
-       }
+       if (sock_writeable(sk))
+               mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
 
        return mask;
 }
index eafeefa..768f2ad 100644 (file)
  *             Alan Cox <gw4pts@gw4pts.ampr.org>
  *             David Hinds <dhinds@allegro.stanford.edu>
  *             Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ *             Adam Sulmicki <adam@cfar.umd.edu>
  *
  *     Changes:
  *             Alan Cox        :       device private ioctl copies fields back.
  *             Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  *         Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  *             Cyrus Durgin    :       Cleaned for KMOD
- *
+ *             Adam Sulmicki   :       Bug Fix : Network Device Unload
+ *                                     A network device unload needs to purge
+ *                                     the backlog queue.
+ *     Paul Rusty Russell      :       SIOCSIFNAME
  */
 
 #include <asm/uaccess.h>
@@ -154,6+158,8 @@ int netdev_fastroute_obstacles;
 struct net_fastroute_stats dev_fastroute_stat;
 #endif
 
+static void dev_clear_backlog(struct device *dev);
+
 
 /******************************************************************************************
 
@@ -171,6+177,16 @@ int netdev_nit=0;
  *     Add a protocol ID to the list. Now that the input handler is
  *     smarter we can dispense with all the messy stuff that used to be
  *     here.
+ *
+ *     BEWARE!!! Protocol handlers that mangle input packets
+ *     MUST BE last in the hash buckets, and checking of protocol handlers
+ *     MUST start from the promiscuous ptype_all chain in net_bh.
+ *     It is true now, do not change it.
+ *     Explanation follows: if a protocol handler that mangles packets
+ *     is first on the list, it is not able to sense that the packet
+ *     is cloned and should be copied-on-write, so it will
+ *     change it and subsequent readers will get a broken packet.
+ *                                                     --ANK (980803)
  */
  
 void dev_add_pack(struct packet_type *pt)
@@ -448,7+464,8 @@ int dev_close(struct device *dev)
        /*
         *      Device is now down.
         */
-        
+       dev_clear_backlog(dev);
+
        dev->flags&=~(IFF_UP|IFF_RUNNING);
 #ifdef CONFIG_NET_FASTROUTE
        dev_clear_fastroute(dev);
@@ -457,7+474,6 @@ int dev_close(struct device *dev)
        /*
         *      Tell people we are going down
         */
-        
        notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
 
        return(0);
@@ -685,6+701,45 @@ static void netdev_wakeup(void)
 }
 #endif
 
+static void dev_clear_backlog(struct device *dev)
+{
+       struct sk_buff *prev, *curr;
+
+       /*
+        *  Unlink and free every skb on the backlog queue whose ->dev is dev.
+        *  Now let us clear the backlog queue. -AS
+        *
+        *  We are competing here both with netif_rx() and net_bh().
+        *  We don't want either of those to mess with skb ptrs
+        *  while we work on them, thus cli()/sti().
+        *
+        *  It looks better to use the net_bh trick, at least
+        *  to be sure that we keep interrupt latency really low. --ANK (980727)
+        */ 
+
+       if (backlog.qlen) {
+               start_bh_atomic();
+               curr = backlog.next;
+               while ( curr != (struct sk_buff *)(&backlog) ) {
+                       unsigned long flags;
+                       curr=curr->next;        /* advance first; the node examined below is curr->prev */
+                       if ( curr->prev->dev == dev ) {
+                               prev = curr->prev;
+                               spin_lock_irqsave(&skb_queue_lock, flags);      /* lock held only around the unlink */
+                               __skb_unlink(prev, &backlog);
+                               spin_unlock_irqrestore(&skb_queue_lock, flags);
+                               kfree_skb(prev);
+                       }
+               }
+               end_bh_atomic();
+#ifdef CONFIG_NET_HW_FLOWCONTROL
+               if (netdev_dropping)
+                       netdev_wakeup();
+#else
+               netdev_dropping = 0;
+#endif
+       }
+}
 
 /*
  *     Receive a packet from a device driver and queue it for the upper
@@ -1320,7+1375,7 @@ int dev_change_flags(struct device *dev, unsigned flags)
         */
 
        dev->flags = (flags & (IFF_DEBUG|IFF_NOTRAILERS|IFF_RUNNING|IFF_NOARP|
-                              IFF_SLAVE|IFF_MASTER|
+                              IFF_NODYNARP|IFF_SLAVE|IFF_MASTER|
                               IFF_MULTICAST|IFF_PORTSEL|IFF_AUTOMEDIA)) |
                                       (dev->flags & (IFF_UP|IFF_VOLATILE|IFF_PROMISC|IFF_ALLMULTI));
 
@@ -1391,12+1446,11 @@ static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
                        return dev_change_flags(dev, ifr->ifr_flags);
                
                case SIOCGIFMETRIC:     /* Get the metric on the interface (currently unused) */
-                       ifr->ifr_metric = dev->metric;
+                       ifr->ifr_metric = 0;
                        return 0;
                        
                case SIOCSIFMETRIC:     /* Set the metric on the interface (currently unused) */
-                       dev->metric = ifr->ifr_metric;
-                       return 0;
+                       return -EOPNOTSUPP;
        
                case SIOCGIFMTU:        /* Get the MTU of a device */
                        ifr->ifr_mtu = dev->mtu;
@@ -1419,10+1473,8 @@ static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
                                dev->mtu = ifr->ifr_mtu;
                                err = 0;
                        }
-                       if (!err && dev->flags&IFF_UP) {
-                               printk(KERN_DEBUG "SIFMTU %s(%s)\n", dev->name, current->comm);
+                       if (!err && dev->flags&IFF_UP)
                                notifier_call_chain(&netdev_chain, NETDEV_CHANGEMTU, dev);
-                       }
                        return err;
 
                case SIOCGIFHWADDR:
@@ -1484,11+1536,22 @@ static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
                        return 0;
 
                case SIOCSIFTXQLEN:
-                       if(ifr->ifr_qlen<2 || ifr->ifr_qlen>1024)
+                       /* Why <2? 0 and 1 are valid values. --ANK (980807) */
+                       if(/*ifr->ifr_qlen<2 ||*/ ifr->ifr_qlen>1024)
                                return -EINVAL;
                        dev->tx_queue_len = ifr->ifr_qlen;
                        return 0;
 
+               case SIOCSIFNAME:
+                       if (dev->flags&IFF_UP)
+                               return -EBUSY;
+                       if (dev_get(ifr->ifr_newname))
+                               return -EEXIST;
+                       memcpy(dev->name, ifr->ifr_newname, IFNAMSIZ);
+                       dev->name[IFNAMSIZ-1] = 0;
+                       notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
+                       return 0;
+
                /*
                 *      Unknown or private ioctl
                 */
@@ -1597,6+1660,7 @@ int dev_ioctl(unsigned int cmd, void *arg)
                case SIOCDELMULTI:
                case SIOCSIFHWBROADCAST:
                case SIOCSIFTXQLEN:
+               case SIOCSIFNAME:
                        if (!capable(CAP_NET_ADMIN))
                                return -EPERM;
                        dev_load(ifr.ifr_name);
@@ -1668,6+1732,17 @@ int register_netdevice(struct device *dev)
        struct device *d, **dp;
 
        if (dev_boot_phase) {
+               /* This is NOT a bug, but I am not sure that all the
+                  devices initialized before the netdev module is started
+                  are sane.
+
+                  Now they are chained to the device boot list
+                  and probed later. If a module is initialized
+                  before netdev, but assumes that dev->init
+                  is really called by register_netdev(), it will fail.
+
+                  So this message should be printed for a while.
+                */
                printk(KERN_INFO "early initialization of device %s is deferred\n", dev->name);
 
                /* Check for existence, and append to tail of chain */
index 67f7a6f..b8960ec 100644 (file)
@@ -215,7+215,7 @@ int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov,
                        partial_cnt = 0;
                }
 
-               if (len - copy > 0)
+               if (len > copy)
                {
                        partial_cnt = copy % 4;
                        if (partial_cnt)
index a8d7260..ead3b77 100644 (file)
@@ -9,6+9,9 @@
  *      modify it under the terms of the GNU General Public License
  *      as published by the Free Software Foundation; either version
  *      2 of the License, or (at your option) any later version.
+ *
+ *     Fixes:
+ *     Vitaly E. Lavrov        releasing NULL neighbor in neigh_add.
  */
 
 #include <linux/config.h>
@@ -1033,7+1036,8 @@ int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
                                           ndm->ndm_state,
                                           nlh->nlmsg_flags&NLM_F_REPLACE, 0);
                }
-               neigh_release(n);
+               if (n)
+                       neigh_release(n);
                end_bh_atomic();
                return err;
        }
@@ -1043,7+1047,7 @@ int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 
 
 static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n,
-                           pid_t pid, u32 seq, int event)
+                          u32 pid, u32 seq, int event)
 {
        unsigned long now = jiffies;
        struct ndmsg *ndm;
index cd8030c..e1fe887 100644 (file)
  *             as published by the Free Software Foundation; either version
  *             2 of the License, or (at your option) any later version.
  *
+ *     Fixes:
+ *     Vitaly E. Lavrov                RTA_OK arithmetics was wrong.
  */
 
 #include <linux/config.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
+#include <linux/capability.h>
 #include <linux/skbuff.h>
 #include <linux/init.h>
 
@@ -135,47+138,8 @@ int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo)
        return err;
 }
 
-#ifdef CONFIG_RTNL_OLD_IFINFO
 static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct device *dev,
-                                int type, pid_t pid, u32 seq)
-{
-       struct ifinfomsg *r;
-       struct nlmsghdr  *nlh;
-       unsigned char    *b = skb->tail;
-
-       nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*r));
-       if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
-       r = NLMSG_DATA(nlh);
-       r->ifi_addrlen = dev->addr_len;
-       r->ifi_address.sa_family = dev->type;
-       memcpy(&r->ifi_address.sa_data, dev->dev_addr, dev->addr_len);
-       r->ifi_broadcast.sa_family = dev->type;
-       memcpy(&r->ifi_broadcast.sa_data, dev->broadcast, dev->addr_len);
-       r->ifi_flags = dev->flags;
-       r->ifi_mtu = dev->mtu;
-       r->ifi_index = dev->ifindex;
-       r->ifi_link = dev->iflink;
-       strncpy(r->ifi_name, dev->name, IFNAMSIZ-1);
-       r->ifi_qdiscname[0] = 0;
-       r->ifi_qdisc = dev->qdisc_sleeping->handle;
-       if (dev->qdisc_sleeping)
-               strcpy(r->ifi_qdiscname, dev->qdisc_sleeping->ops->id);
-       if (dev->get_stats) {
-               struct net_device_stats *stats = dev->get_stats(dev);
-               if (stats)
-                       RTA_PUT(skb, IFLA_STATS, sizeof(*stats), stats);
-       }
-       nlh->nlmsg_len = skb->tail - b;
-       return skb->len;
-
-nlmsg_failure:
-rtattr_failure:
-       skb_trim(skb, b - skb->data);
-       return -1;
-}
-#else
-static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct device *dev,
-                                int type, pid_t pid, u32 seq)
+                                int type, u32 pid, u32 seq)
 {
        struct ifinfomsg *r;
        struct nlmsghdr  *nlh;
@@ -218,7+182,6 @@ rtattr_failure:
        skb_trim(skb, b - skb->data);
        return -1;
 }
-#endif
 
 int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 {
@@ -266,12+229,7 @@ int rtnetlink_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 void rtmsg_ifinfo(int type, struct device *dev)
 {
        struct sk_buff *skb;
-#ifdef CONFIG_RTNL_OLD_IFINFO
-       int size = NLMSG_SPACE(sizeof(struct ifinfomsg)+
-                              RTA_LENGTH(sizeof(struct net_device_stats)));
-#else
        int size = NLMSG_GOODSIZE;
-#endif
 
        skb = alloc_skb(size, GFP_KERNEL);
        if (!skb)
@@ -287,7+245,7 @@ void rtmsg_ifinfo(int type, struct device *dev)
 
 static int rtnetlink_done(struct netlink_callback *cb)
 {
-       if (NETLINK_CREDS(cb->skb)->uid == 0 && cb->nlh->nlmsg_flags&NLM_F_ATOMIC)
+       if (cap_raised(NETLINK_CB(cb->skb).eff_cap, CAP_NET_ADMIN) && cb->nlh->nlmsg_flags&NLM_F_ATOMIC)
                rtnl_shunlock();
        return 0;
 }
@@ -342,13+300,13 @@ rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp)
        sz_idx = type>>2;
        kind = type&3;
 
-       if (kind != 2 && NETLINK_CREDS(skb)->uid) {
+       if (kind != 2 && !cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) {
                *errp = -EPERM;
                return -1;
        }
 
        if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
-               int rlen;
+               u32 rlen;
 
                if (link->dumpit == NULL)
                        link = &(rtnetlink_links[PF_UNSPEC][type]);
@@ -357,12+315,13 @@ rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp)
                        goto err_inval;
 
                /* Super-user locks all the tables to get atomic snapshot */
-               if (NETLINK_CREDS(skb)->uid == 0 && nlh->nlmsg_flags&NLM_F_ATOMIC)
+               if (cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)
+                   && nlh->nlmsg_flags&NLM_F_ATOMIC)
                        atomic_inc(&rtnl_rlockct);
                if ((*errp = netlink_dump_start(rtnl, skb, nlh,
                                                link->dumpit,
                                                rtnetlink_done)) != 0) {
-                       if (NETLINK_CREDS(skb)->uid == 0 && nlh->nlmsg_flags&NLM_F_ATOMIC)
+                       if (cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN) && nlh->nlmsg_flags&NLM_F_ATOMIC)
                                atomic_dec(&rtnl_rlockct);
                        return -1;
                }
@@ -431,7+390,7 @@ extern __inline__ int rtnetlink_rcv_skb(struct sk_buff *skb)
        struct nlmsghdr * nlh;
 
        while (skb->len >= NLMSG_SPACE(0)) {
-               int rlen;
+               u32 rlen;
 
                nlh = (struct nlmsghdr *)skb->data;
                if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
index 3e4469f..e16c4a4 100644 (file)
@@ -138,11+138,15 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
 
        for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg))
        {
+               err = -EINVAL;
+
+               if ((unsigned long)(((char*)cmsg - (char*)msg->msg_control)
+                                   + cmsg->cmsg_len) > msg->msg_controllen)
+                       goto error;
+
                if (cmsg->cmsg_level != SOL_SOCKET)
                        continue;
 
-               err = -EINVAL;
-
                switch (cmsg->cmsg_type)
                {
                case SCM_RIGHTS:
index c218233..fb13b5e 100644 (file)
@@ -4,6+4,8 @@
  *     Authors:        Alan Cox <iiitac@pyr.swan.ac.uk>
  *                     Florian La Roche <rzsfl@rz.uni-sb.de>
  *
+ *     Version:        $Id: skbuff.c,v 1.53 1998/08/19 13:32:44 freitag Exp $
+ *
  *     Fixes:  
  *             Alan Cox        :       Fixed the worst of the load balancer bugs.
  *             Dave Platt      :       Interrupt stacking fix.
@@ -96,14+98,14 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
 
 void show_net_buffers(void)
 {
-       printk(KERN_INFO "Networking buffers in use          : %u\n",
+       printk("Networking buffers in use          : %u\n",
               atomic_read(&net_skbcount));
-       printk(KERN_INFO "Total network buffer allocations   : %u\n",
+       printk("Total network buffer allocations   : %u\n",
               atomic_read(&net_allocs));
-       printk(KERN_INFO "Total failed network buffer allocs : %u\n",
+       printk("Total failed network buffer allocs : %u\n",
               atomic_read(&net_fails));
 #ifdef CONFIG_INET
-       printk(KERN_INFO "IP fragment buffer size            : %u\n",
+       printk("IP fragment buffer size            : %u\n",
               atomic_read(&ip_frag_mem));
 #endif 
 }
@@ -365,7+367,7 @@ void skb_add_mtu(int mtu)
 }
 #endif
 
-__initfunc(void skb_init(void))
+void __init skb_init(void)
 {
        skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
                                              sizeof(struct sk_buff),
index 07d1254..e9e293e 100644 (file)
@@ -7,7+7,7 @@
  *             handler for protocols to use and generic option handler.
  *
  *
- * Version:    @(#)sock.c      1.0.17  06/02/93
+ * Version:    $Id: sock.c,v 1.70 1998/08/26 12:03:07 davem Exp $
  *
  * Authors:    Ross Biro, <bir7@leland.Stanford.Edu>
  *             Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  *              Chris Evans     :       Call suser() check last on F_SETOWN
  *             Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  *             Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
+ *             Andi Kleen      :       Fix write_space callback
  *
  * To Fix:
  *
@@ -445,6+446,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
                case SO_RCVLOWAT:
                case SO_SNDLOWAT:
                        v.val=1;
+                       break; 
 
                case SO_PASSCRED:
                        v.val = sock->passcred;
@@ -615,19+617,6 @@ unsigned long sock_rspace(struct sock *sk)
 }
 
 
-/* FIXME: this is also insane. See above comment */
-unsigned long sock_wspace(struct sock *sk)
-{
-       int amt = 0;
-
-       if (sk != NULL && !(sk->shutdown & SEND_SHUTDOWN)) {
-               amt = sk->sndbuf - atomic_read(&sk->wmem_alloc);
-               if (amt < 0) 
-                       amt = 0;
-       }
-       return amt;
-}
-
 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
    I think, these locks should be removed for datagram sockets.
  */
@@ -746,17+735,15 @@ void __release_sock(struct sock *sk)
  
 void sklist_remove_socket(struct sock **list, struct sock *sk)
 {
-       unsigned long flags;
        struct sock *s;
 
-       save_flags(flags);
-       cli();
+       start_bh_atomic();
 
        s= *list;
        if(s==sk)
        {
                *list = s->next;
-               restore_flags(flags);
+               end_bh_atomic();
                return;
        }
        while(s && s->next)
@@ -764,22+751,19 @@ void sklist_remove_socket(struct sock **list, struct sock *sk)
                if(s->next==sk)
                {
                        s->next=sk->next;
-                       restore_flags(flags);
-                       return;
+                       break;
                }
                s=s->next;
        }
-       restore_flags(flags);
+       end_bh_atomic();
 }
 
 void sklist_insert_socket(struct sock **list, struct sock *sk)
 {
-       unsigned long flags;
-       save_flags(flags);
-       cli();
+       start_bh_atomic();
        sk->next= *list;
        *list=sk;
-       restore_flags(flags);
+       end_bh_atomic();
 }
 
 /*
@@ -914,6+898,10 @@ int sock_no_getsockopt(struct socket *sock, int level, int optname,
        return -EOPNOTSUPP;
 }
 
+/* 
+ * Note: if you add something that sleeps here then change sock_fcntl()
+ *       to do proper fd locking.
+ */
 int sock_no_fcntl(struct socket *sock, unsigned int cmd, unsigned long arg)
 {
        struct sock *sk = sock->sk;
@@ -971,12+959,15 @@ void sock_def_callback2(struct sock *sk, int len)
        }
 }
 
-void sock_def_callback3(struct sock *sk)
+void sock_def_write_space(struct sock *sk)
 {
        if(!sk->dead)
        {
                wake_up_interruptible(sk->sleep);
-               sock_wake_async(sk->socket, 2);
+
+               /* Should agree with poll, otherwise some programs break */ 
+               if (sock_writeable(sk))
+                       sock_wake_async(sk->socket, 2);
        }
 }
 
@@ -1011,7+1002,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 
        sk->state_change        =       sock_def_callback1;
        sk->data_ready          =       sock_def_callback2;
-       sk->write_space         =       sock_def_callback3;
+       sk->write_space         =       sock_def_write_space;
        sk->error_report        =       sock_def_callback1;
        sk->destruct            =       sock_def_destruct;
 
index 18c31f5..8282333 100644 (file)
@@ -5,7+5,7 @@
  *
  *             PF_INET protocol family socket handler.
  *
- * Version:    $Id: af_inet.c,v 1.74 1998/05/08 21:06:24 davem Exp $
+ * Version:    $Id: af_inet.c,v 1.75 1998/08/26 12:03:15 davem Exp $
  *
  * Authors:    Ross Biro, <bir7@leland.Stanford.Edu>
  *             Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -177,6+177,8 @@ static __inline__ void kill_sk_now(struct sock *sk)
        if(sk->opt)
                kfree(sk->opt);
        dst_release(sk->dst_cache);
+       if (atomic_read(&sk->omem_alloc))
+               printk(KERN_DEBUG "kill_sk_now: optmem leakage (%d bytes) detected.\n", atomic_read(&sk->omem_alloc));
        sk_free(sk);
 }
 
@@ -576,6+578,24 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
        return(0);
 }
 
+static void inet_wait_for_connect(struct sock *sk)
+{
+       struct wait_queue wait = { current, NULL };
+
+       add_wait_queue(sk->sleep, &wait);
+       current->state = TASK_INTERRUPTIBLE;
+       while (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) {
+               if (signal_pending(current))
+                       break;
+               if (sk->err)
+                       break;
+               schedule();
+               current->state = TASK_INTERRUPTIBLE;
+       }
+       current->state = TASK_RUNNING;
+       remove_wait_queue(sk->sleep, &wait);
+}
+
 /*
  *     Connect to a remote host. There is regrettably still a little
  *     TCP 'magic' in here.
@@ -623,6+643,13 @@ int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr,
        if (sk->state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) 
                return (-EINPROGRESS);
 
+#if 1
+       if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) {
+               inet_wait_for_connect(sk);
+               if (signal_pending(current))
+                       return -ERESTARTSYS;
+       }
+#else
        cli();
        while(sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) {
                interruptible_sleep_on(sk->sleep);
@@ -639,6+666,7 @@ int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr,
                }
        }
        sti();
+#endif
 
        sock->state = SS_CONNECTED;
        if ((sk->state != TCP_ESTABLISHED) && sk->err) {
@@ -876,7+904,6 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
                case FIOGETOWN:
                case SIOCGPGRP:
                        return put_user(sk->proc, (int *)arg);
-                       return(0);                      
                case SIOCGSTAMP:
                        if(sk->stamp.tv_sec==0)
                                return -ENOENT;
index e6e272b..1ce6902 100644 (file)
@@ -1,6+1,6 @@
 /* linux/net/inet/arp.c
  *
- * Version:    $Id: arp.c,v 1.67 1998/06/19 13:22:31 davem Exp $
+ * Version:    $Id: arp.c,v 1.70 1998/08/26 12:03:18 davem Exp $
  *
  * Copyright (C) 1994 by Florian  La Roche
  *
@@ -760,7+760,7 @@ int arp_req_set(struct arpreq *r, struct device * dev)
                r->arp_flags |= ATF_COM;
        if (dev == NULL) {
                struct rtable * rt;
-               if ((err = ip_route_output(&rt, ip, 0, 1, 0)) != 0)
+               if ((err = ip_route_output(&rt, ip, 0, RTO_ONLINK, 0)) != 0)
                        return err;
                dev = rt->u.dst.dev;
                ip_rt_put(rt);
@@ -843,11+843,21 @@ int arp_req_delete(struct arpreq *r, struct device * dev)
                return -EINVAL;
        }
 
+       if (dev == NULL) {
+               struct rtable * rt;
+               if ((err = ip_route_output(&rt, ip, 0, RTO_ONLINK, 0)) != 0)
+                       return err;
+               dev = rt->u.dst.dev;
+               ip_rt_put(rt);
+               if (!dev)
+                       return -EINVAL;
+       }
        err = -ENXIO;
        start_bh_atomic();
        neigh = __neigh_lookup(&arp_tbl, &ip, dev, 0);
        if (neigh) {
-               err = neigh_update(neigh, NULL, NUD_FAILED, 1, 0);
+               if (neigh->nud_state&~NUD_NOARP)
+                       err = neigh_update(neigh, NULL, NUD_FAILED, 1, 0);
                neigh_release(neigh);
        }
        end_bh_atomic();
@@ -867,7+877,7 @@ int arp_ioctl(unsigned int cmd, void *arg)
        switch(cmd) {
                case SIOCDARP:
                case SIOCSARP:
-                       if (!suser())
+                       if (!capable(CAP_NET_ADMIN))
                                return -EPERM;
                case SIOCGARP:
                        err = copy_from_user(&r, arg, sizeof(struct arpreq));
@@ -899,10+909,8 @@ int arp_ioctl(unsigned int cmd, void *arg)
                err = -EINVAL;
                if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type)
                        goto out;
-       } else if (cmd != SIOCSARP) {
-               /* dev has not been set ... */
-               printk(KERN_ERR "arp_ioctl: invalid, null device\n");
-               err = -EINVAL;
+       } else if (cmd == SIOCGARP) {
+               err = -ENODEV;
                goto out;
        }
 
@@ -911,7+919,6 @@ int arp_ioctl(unsigned int cmd, void *arg)
                err = arp_req_delete(&r, dev);
                break;
        case SIOCSARP:
-               /* This checks for dev == NULL */
                err = arp_req_set(&r, dev);
                break;
        case SIOCGARP:
index 1829333..ac7c044 100644 (file)
@@ -1,7+1,7 @@
 /*
  *     NET3    IP device support routines.
  *
- *     Version: $Id: devinet.c,v 1.22 1998/05/08 21:06:26 davem Exp $
+ *     Version: $Id: devinet.c,v 1.23 1998/08/26 12:03:21 davem Exp $
  *
  *             This program is free software; you can redistribute it and/or
  *             modify it under the terms of the GNU General Public License
@@ -533,8+533,6 @@ int devinet_ioctl(unsigned int cmd, void *arg)
                                inet_del_ifa(in_dev, ifap, 0);
                                ifa->ifa_broadcast = 0;
                                ifa->ifa_anycast = 0;
-                               ifa->ifa_prefixlen = 32;
-                               ifa->ifa_mask = inet_make_mask(32);
                        }
 
                        ifa->ifa_address =
@@ -545,6+543,9 @@ int devinet_ioctl(unsigned int cmd, void *arg)
                                ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen);
                                if ((dev->flags&IFF_BROADCAST) && ifa->ifa_prefixlen < 31)
                                        ifa->ifa_broadcast = ifa->ifa_address|~ifa->ifa_mask;
+                       } else {
+                               ifa->ifa_prefixlen = 32;
+                               ifa->ifa_mask = inet_make_mask(32);
                        }
                        ret = inet_set_ifa(dev, ifa);
                        break;
@@ -702,6+703,16 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, void
        case NETDEV_UNREGISTER:
                inetdev_destroy(in_dev);
                break;
+       case NETDEV_CHANGENAME:
+               if (in_dev->ifa_list) {
+                       struct in_ifaddr *ifa;
+                       for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
+                               memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+                       /* Do not notify about label change, this event is
+                          not interesting to applications using netlink.
+                        */
+               }
+               break;
        }
 
        return NOTIFY_DONE;
@@ -716,7+727,7 @@ struct notifier_block ip_netdev_notifier={
 #ifdef CONFIG_RTNETLINK
 
 static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
-                           pid_t pid, u32 seq, int event)
+                           u32 pid, u32 seq, int event)
 {
        struct ifaddrmsg *ifm;
        struct nlmsghdr  *nlh;
@@ -729,7+740,7 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
        ifm->ifa_flags = ifa->ifa_flags|IFA_F_PERMANENT;
        ifm->ifa_scope = ifa->ifa_scope;
        ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
-       if (ifa->ifa_prefixlen)
+       if (ifa->ifa_address)
                RTA_PUT(skb, IFA_ADDRESS, 4, &ifa->ifa_address);
        if (ifa->ifa_local)
                RTA_PUT(skb, IFA_LOCAL, 4, &ifa->ifa_local);
index d9a1502..013a4ba 100644 (file)
@@ -5,7+5,7 @@
  *
  *             IPv4 Forwarding Information Base: FIB frontend.
  *
- * Version:    $Id: fib_frontend.c,v 1.11 1998/06/11 03:15:40 davem Exp $
+ * Version:    $Id: fib_frontend.c,v 1.12 1998/08/26 12:03:24 davem Exp $
  *
  * Authors:    Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  *
@@ -300,10+300,8 @@ static int inet_check_attr(struct rtmsg *r, struct rtattr **rta)
                if (attr) {
                        if (RTA_PAYLOAD(attr) < 4)
                                return -EINVAL;
-#ifndef        CONFIG_RTNL_OLD_IFINFO
                        if (i != RTA_MULTIPATH && i != RTA_METRICS)
-#endif
-                       rta[i-1] = (struct rtattr*)RTA_DATA(attr);
+                               rta[i-1] = (struct rtattr*)RTA_DATA(attr);
                }
        }
        return 0;
@@ -527,6+525,14 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
 #undef BRD1_OK
 }
 
+static void fib_disable_ip(struct device *dev, int force)
+{
+       if (fib_sync_down(0, dev, force))
+               fib_flush();
+       rt_cache_flush(0);
+       arp_ifdown(dev);
+}
+
 static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
 {
        struct in_ifaddr *ifa = (struct in_ifaddr*)ptr;
@@ -537,8+543,15 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
                rt_cache_flush(-1);
                break;
        case NETDEV_DOWN:
-               fib_del_ifaddr(ifa);
-               rt_cache_flush(-1);
+               if (ifa->ifa_dev && ifa->ifa_dev->ifa_list == NULL) {
+                       /* Last address was deleted from this interface.
+                          Disable IP.
+                        */
+                       fib_disable_ip(ifa->ifa_dev->dev, 1);
+               } else {
+                       fib_del_ifaddr(ifa);
+                       rt_cache_flush(-1);
+               }
                break;
        }
        return NOTIFY_DONE;
@@ -563,18+576,10 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
                rt_cache_flush(-1);
                break;
        case NETDEV_DOWN:
-               if (fib_sync_down(0, dev, 0))
-                       fib_flush();
-               rt_cache_flush(0);
-               arp_ifdown(dev);
+               fib_disable_ip(dev, 0);
                break;
        case NETDEV_UNREGISTER:
-               if (in_dev->ifa_list)
-                       printk("About to crash!\n");
-               if (fib_sync_down(0, dev, 1))
-                       fib_flush();
-               rt_cache_flush(0);
-               arp_ifdown(dev);
+               fib_disable_ip(dev, 1);
                break;
        case NETDEV_CHANGEMTU:
        case NETDEV_CHANGE:
index 3e13671..618d247 100644 (file)
@@ -5,7+5,7 @@
  *
  *             IPv4 FIB: lookup engine and maintenance routines.
  *
- * Version:    $Id: fib_hash.c,v 1.4 1998/07/15 05:05:08 davem Exp $
+ * Version:    $Id: fib_hash.c,v 1.5 1998/08/26 12:03:27 davem Exp $
  *
  * Authors:    Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  *
@@ -713,7+713,7 @@ static void rtmsg_fib(int event, struct fib_node* f, int z, int tb_id,
                      struct nlmsghdr *n, struct netlink_skb_parms *req)
 {
        struct sk_buff *skb;
-       pid_t pid = req ? req->pid : 0;
+       u32 pid = req ? req->pid : 0;
        int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
 
        skb = alloc_skb(size, GFP_KERNEL);
index 592ff5f..2302f53 100644 (file)
@@ -5,7+5,7 @@
  *
  *             IPv4 Forwarding Information Base: policy rules.
  *
- * Version:    $Id: fib_rules.c,v 1.5 1998/04/28 06:21:57 davem Exp $
+ * Version:    $Id: fib_rules.c,v 1.6 1998/08/26 12:03:30 davem Exp $
  *
  * Authors:    Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  *
 
 #define FRprintk(a...)
 
-#ifndef CONFIG_RTNL_OLD_IFINFO
-#define RTA_IFNAME RTA_IIF
-#endif
-
 struct fib_rule
 {
        struct fib_rule *r_next;
@@ -91,7+87,7 @@ int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
                    rtm->rtm_tos == r->r_tos &&
                    (!rtm->rtm_type || rtm->rtm_type == r->r_action) &&
                    (!rta[RTA_PRIORITY-1] || memcmp(RTA_DATA(rta[RTA_PRIORITY-1]), &r->r_preference, 4) == 0) &&
-                   (!rta[RTA_IFNAME-1] || strcmp(RTA_DATA(rta[RTA_IFNAME-1]), r->r_ifname) == 0) &&
+                   (!rta[RTA_IIF-1] || strcmp(RTA_DATA(rta[RTA_IIF-1]), r->r_ifname) == 0) &&
                    (!rtm->rtm_table || (r && rtm->rtm_table == r->r_table))) {
                        *rp = r->r_next;
                        if (r != &default_rule && r != &main_rule && r != &local_rule)
@@ -126,7+122,7 @@ int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
            (rtm->rtm_tos & ~IPTOS_TOS_MASK))
                return -EINVAL;
 
-       if (rta[RTA_IFNAME-1] && RTA_PAYLOAD(rta[RTA_IFNAME-1]) > IFNAMSIZ)
+       if (rta[RTA_IIF-1] && RTA_PAYLOAD(rta[RTA_IIF-1]) > IFNAMSIZ)
                return -EINVAL;
 
        table_id = rtm->rtm_table;
@@ -159,9+155,9 @@ int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
        if (rta[RTA_PRIORITY-1])
                memcpy(&new_r->r_preference, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
        new_r->r_table = table_id;
-       if (rta[RTA_IFNAME-1]) {
+       if (rta[RTA_IIF-1]) {
                struct device *dev;
-               memcpy(new_r->r_ifname, RTA_DATA(rta[RTA_IFNAME-1]), IFNAMSIZ);
+               memcpy(new_r->r_ifname, RTA_DATA(rta[RTA_IIF-1]), IFNAMSIZ);
                new_r->r_ifname[IFNAMSIZ-1] = 0;
                new_r->r_ifindex = -1;
                dev = dev_get(new_r->r_ifname);
@@ -339,10+335,6 @@ extern __inline__ int inet_fill_rule(struct sk_buff *skb,
        rtm->rtm_table = r->r_table;
        rtm->rtm_protocol = 0;
        rtm->rtm_scope = 0;
-#ifdef CONFIG_RTNL_OLD_IFINFO
-       rtm->rtm_nhs = 0;
-       rtm->rtm_optlen = 0;
-#endif
        rtm->rtm_type = r->r_action;
        rtm->rtm_flags = r->r_flags;
 
@@ -351,7+343,7 @@ extern __inline__ int inet_fill_rule(struct sk_buff *skb,
        if (r->r_src_len)
                RTA_PUT(skb, RTA_SRC, 4, &r->r_src);
        if (r->r_ifname[0])
-               RTA_PUT(skb, RTA_IFNAME, IFNAMSIZ, &r->r_ifname);
+               RTA_PUT(skb, RTA_IIF, IFNAMSIZ, &r->r_ifname);
        if (r->r_preference)
                RTA_PUT(skb, RTA_PRIORITY, 4, &r->r_preference);
        if (r->r_srcmap)
index 5537016..36c801e 100644 (file)
@@ -5,7+5,7 @@
  *
  *             IPv4 Forwarding Information Base: semantics.
  *
- * Version:    $Id: fib_semantics.c,v 1.9 1998/06/11 03:15:41 davem Exp $
+ * Version:    $Id: fib_semantics.c,v 1.10 1998/08/26 12:03:32 davem Exp $
  *
  * Authors:    Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  *
@@ -181,7+181,6 @@ static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type)
        return 0;
 }
 
-#ifndef CONFIG_RTNL_OLD_IFINFO
 static int
 fib_count_nexthops(struct rtattr *rta)
 {
@@ -189,7+188,7 @@ fib_count_nexthops(struct rtattr *rta)
        struct rtnexthop *nhp = RTA_DATA(rta);
        int nhlen = RTA_PAYLOAD(rta);
 
-       while (nhlen >= sizeof(struct rtnexthop)) {
+       while (nhlen >= (int)sizeof(struct rtnexthop)) {
                if ((nhlen -= nhp->rtnh_len) < 0)
                        return 0;
                nhs++;
@@ -197,21+196,12 @@ fib_count_nexthops(struct rtattr *rta)
        };
        return nhs;
 }
-#endif
 
-#ifdef CONFIG_RTNL_OLD_IFINFO
-static int
-fib_get_nhs(struct fib_info *fi, const struct nlmsghdr *nlh, const struct rtmsg *r)
-{
-       struct rtnexthop *nhp = RTM_RTNH(r);
-       int nhlen = RTM_NHLEN(nlh, r);
-#else
 static int
 fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r)
 {
        struct rtnexthop *nhp = RTA_DATA(rta);
        int nhlen = RTA_PAYLOAD(rta);
-#endif
 
        change_nexthops(fi) {
                int attrlen = nhlen - sizeof(struct rtnexthop);
@@ -249,18+239,10 @@ int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta,
        }
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-#ifdef CONFIG_RTNL_OLD_IFINFO
-       if (r->rtm_nhs == 0)
-               return 0;
-
-       nhp = RTM_RTNH(r);
-       nhlen = RTM_NHLEN(nlh, r);
-#else
        if (rta->rta_mp == NULL)
                return 0;
        nhp = RTA_DATA(rta->rta_mp);
        nhlen = RTA_PAYLOAD(rta->rta_mp);
-#endif
        
        for_nexthops(fi) {
                int attrlen = nhlen - sizeof(struct rtnexthop);
@@ -397,11+379,7 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
        struct fib_info *fi = NULL;
        struct fib_info *ofi;
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-#ifdef CONFIG_RTNL_OLD_IFINFO
-       int nhs = r->rtm_nhs ? : 1;
-#else
        int nhs = 1;
-#endif
 #else
        const int nhs = 1;
 #endif
@@ -411,14+389,12 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
                goto err_inval;
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-#ifndef CONFIG_RTNL_OLD_IFINFO
        if (rta->rta_mp) {
                nhs = fib_count_nexthops(rta->rta_mp);
                if (nhs == 0)
                        goto err_inval;
        }
 #endif
-#endif
 
        fi = kmalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
        err = -ENOBUFS;
@@ -429,14+405,6 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
        fi->fib_protocol = r->rtm_protocol;
        fi->fib_nhs = nhs;
        fi->fib_flags = r->rtm_flags;
-#ifdef CONFIG_RTNL_OLD_IFINFO
-       if (rta->rta_mtu)
-               fi->fib_mtu = *rta->rta_mtu;
-       if (rta->rta_rtt)
-               fi->fib_rtt = *rta->rta_rtt;
-       if (rta->rta_window)
-               fi->fib_window = *rta->rta_window;
-#else
        if (rta->rta_mx) {
                int attrlen = RTA_PAYLOAD(rta->rta_mx);
                struct rtattr *attr = RTA_DATA(rta->rta_mx);
@@ -451,21+419,12 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
                        attr = RTA_NEXT(attr, attrlen);
                }
        }
-#endif
        if (rta->rta_prefsrc)
                memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4);
 
-#ifndef CONFIG_RTNL_OLD_IFINFO
        if (rta->rta_mp) {
-#else
-       if (r->rtm_nhs) {
-#endif
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-#ifdef CONFIG_RTNL_OLD_IFINFO
-               if ((err = fib_get_nhs(fi, nlh, r)) != 0)
-#else
                if ((err = fib_get_nhs(fi, rta->rta_mp, r)) != 0)
-#endif
                        goto failure;
                if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif)
                        goto err_inval;
@@ -504,11+463,7 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
 #endif
 
        if (fib_props[r->rtm_type].error) {
-#ifndef CONFIG_RTNL_OLD_IFINFO
                if (rta->rta_gw || rta->rta_oif || rta->rta_mp)
-#else
-               if (rta->rta_gw || rta->rta_oif || r->rtm_nhs)
-#endif
                        goto err_inval;
                goto link_it;
        }
@@ -637,16+592,13 @@ u32 __fib_res_prefsrc(struct fib_result *res)
 #ifdef CONFIG_RTNETLINK
 
 int
-fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event,
+fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
              u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
              struct fib_info *fi)
 {
        struct rtmsg *rtm;
        struct nlmsghdr  *nlh;
        unsigned char    *b = skb->tail;
-#ifdef CONFIG_RTNL_OLD_IFINFO
-       unsigned char    *o;
-#endif
 
        nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm));
        rtm = NLMSG_DATA(nlh);
@@ -658,22+610,9 @@ fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event,
        rtm->rtm_type = type;
        rtm->rtm_flags = fi->fib_flags;
        rtm->rtm_scope = scope;
-#ifdef CONFIG_RTNL_OLD_IFINFO
-       rtm->rtm_nhs = 0;
-
-       o = skb->tail;
-#endif
        if (rtm->rtm_dst_len)
                RTA_PUT(skb, RTA_DST, 4, dst);
        rtm->rtm_protocol = fi->fib_protocol;
-#ifdef CONFIG_RTNL_OLD_IFINFO
-       if (fi->fib_mtu)
-               RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &fi->fib_mtu);
-       if (fi->fib_window)
-               RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &fi->fib_window);
-       if (fi->fib_rtt)
-               RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &fi->fib_rtt);
-#else
 #ifdef CONFIG_NET_CLS_ROUTE
        if (fi->fib_nh[0].nh_tclassid)
                RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid);
@@ -688,7+627,6 @@ fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event,
                }
                mx->rta_len = skb->tail - (u8*)mx;
        }
-#endif
        if (fi->fib_prefsrc)
                RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc);
        if (fi->fib_nhs == 1) {
@@ -697,18+635,14 @@ fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event,
                if (fi->fib_nh->nh_oif)
                        RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif);
        }
-#ifdef CONFIG_RTNL_OLD_IFINFO
-       rtm->rtm_optlen = skb->tail - o;
-#endif
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (fi->fib_nhs > 1) {
                struct rtnexthop *nhp;
-#ifndef CONFIG_RTNL_OLD_IFINFO
                struct rtattr *mp_head;
                if (skb_tailroom(skb) <= RTA_SPACE(0))
                        goto rtattr_failure;
                mp_head = (struct rtattr*)skb_put(skb, RTA_SPACE(0));
-#endif
+
                for_nexthops(fi) {
                        if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
                                goto rtattr_failure;
@@ -719,14+653,9 @@ fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event,
                        if (nh->nh_gw)
                                RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw);
                        nhp->rtnh_len = skb->tail - (unsigned char*)nhp;
-#ifdef CONFIG_RTNL_OLD_IFINFO
-                       rtm->rtm_nhs++;
-#endif
                } endfor_nexthops(fi);
-#ifndef CONFIG_RTNL_OLD_IFINFO
                mp_head->rta_type = RTA_MULTIPATH;
                mp_head->rta_len = skb->tail - (u8*)mp_head;
-#endif
        }
 #endif
        nlh->nlmsg_len = skb->tail - b;
@@ -848,24+777,6 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
        if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL)
                return -EINVAL;
 
-#ifdef CONFIG_RTNL_OLD_IFINFO
-       /* Ugly conversion from rtentry types to unsigned */
-
-       if (r->rt_flags&RTF_IRTT) {
-               rta->rta_rtt = (unsigned*)&r->rt_pad3;
-               *rta->rta_rtt = r->rt_irtt;
-       }
-       if (r->rt_flags&RTF_WINDOW) {
-               rta->rta_window = (unsigned*)&r->rt_window;
-               if (sizeof(*rta->rta_window) != sizeof(r->rt_window))
-                       *rta->rta_window = r->rt_window;
-       }
-       if (r->rt_flags&RTF_MTU) {
-               rta->rta_mtu = (unsigned*)&r->rt_mtu;
-               if (sizeof(*rta->rta_mtu) != sizeof(r->rt_mtu))
-                       *rta->rta_mtu = r->rt_mtu;
-       }
-#else
        if (r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) {
                struct rtattr *rec;
                struct rtattr *mx = kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL);
@@ -896,7+807,6 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
                        *(u32*)RTA_DATA(rec) = r->rt_irtt;
                }
        }
-#endif
        return 0;
 }
 
index 4e94733..9cc7c73 100644 (file)
@@ -3,7+3,7 @@
  *     
  *             Alan Cox, <alan@cymru.net>
  *
- *     Version: $Id: icmp.c,v 1.44 1998/06/16 04:38:27 davem Exp $
+ *     Version: $Id: icmp.c,v 1.45 1998/08/26 12:03:35 davem Exp $
  *
  *     This program is free software; you can redistribute it and/or
  *     modify it under the terms of the GNU General Public License
  *                                     into the dest entry and use a token
  *                                     bucket filter (thanks to ANK). Make
  *                                     the rates sysctl configurable.
+ *             Yu Tianli       :       Fixed two ugly bugs in icmp_send
+ *                                     - IP option length was accounted wrongly
+ *                                     - ICMP header length was not accounted at all.
  *
  * RFC1122 (Host Requirements -- Comm. Layer) Status:
  * (boy, are there a lot of rules for ICMP)
@@ -363,7+366,7 @@ int xrlim_allow(struct dst_entry *dst, int timeout)
 
        now = jiffies;
        dst->rate_tokens += now - dst->rate_last;
-       if (dst->rate_tokens > 6*timeout)
+       if (dst->rate_tokens > XRLIM_BURST_FACTOR*timeout)
                dst->rate_tokens = XRLIM_BURST_FACTOR*timeout;
        if (dst->rate_tokens >= timeout) {
                dst->rate_tokens -= timeout;
@@ -537,7+540,17 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info)
        /*
         *      Construct source address and options.
         */
-       
+
+#ifdef CONFIG_IP_ROUTE_NAT     
+       /*
+        *      Restore original addresses if packet has been translated.
+        */
+       if (rt->rt_flags&RTCF_NAT && IPCB(skb_in)->flags&IPSKB_TRANSLATED) {
+               iph->daddr = rt->key.dst;
+               iph->saddr = rt->key.src;
+       }
+#endif
+
        saddr = iph->daddr;
        if (!(rt->rt_flags & RTCF_LOCAL))
                saddr = 0;
@@ -587,8+600,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info)
        room = rt->u.dst.pmtu;
        if (room > 576)
                room = 576;
-       room -= sizeof(struct iphdr) - icmp_param.replyopts.optlen;
-       
+       room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen;
+       room -= sizeof(struct icmphdr);
+
        icmp_param.data_len=(iph->ihl<<2)+skb_in->len;
        if (icmp_param.data_len > room)
                icmp_param.data_len = room;
index 74757ad..af49104 100644 (file)
@@ -8,7+8,7 @@
  *     the older version didn't come out right using gcc 2.5.8, the newer one
  *     seems to fall out with gcc 2.6.2.
  *
- *     Version: $Id: igmp.c,v 1.26 1998/03/08 05:56:19 davem Exp $
+ *     Version: $Id: igmp.c,v 1.27 1998/08/26 12:03:39 davem Exp $
  *
  *     Authors:
  *             Alan Cox <Alan.Cox@linux.org>
@@ -563,7+563,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
                goto done;
        }
 
-       iml = (struct ip_mc_socklist *)kmalloc(sizeof(*iml), GFP_KERNEL);
+       iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
 
        err = -EADDRINUSE;
        for (i=sk->ip_mc_list; i; i=i->next) {
@@ -590,7+590,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
 done:
        rtnl_shunlock();
        if (iml)
-               kfree(iml);
+               sock_kfree_s(sk, iml, sizeof(*iml));
        return err;
 }
 
@@ -613,7+613,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
                        in_dev = inetdev_by_index(iml->multi.imr_ifindex);
                        if (in_dev)
                                ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr);
-                       kfree_s(iml, sizeof(*iml));
+                       sock_kfree_s(sk, iml, sizeof(*iml));
                        return 0;
                }
        }
@@ -633,7+633,7 @@ void ip_mc_drop_socket(struct sock *sk)
                sk->ip_mc_list = iml->next;
                if ((in_dev = inetdev_by_index(iml->multi.imr_ifindex)) != NULL)
                        ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
-               kfree_s(iml, sizeof(*iml));
+               sock_kfree_s(sk, iml, sizeof(*iml));
        }
 }
 
index e136a16..8cd0d59 100644 (file)
@@ -5,7+5,7 @@
  *
  *             The IP forwarding functionality.
  *             
- * Version:    $Id: ip_forward.c,v 1.40 1998/03/08 05:56:20 davem Exp $
+ * Version:    $Id: ip_forward.c,v 1.41 1998/08/26 12:03:42 davem Exp $
  *
  * Authors:    see ip.c
  *
@@ -79,10+79,8 @@ int ip_forward(struct sk_buff *skb)
        int fw_res = 0;
 #endif
 
-       if (IPCB(skb)->opt.router_alert) {
-               if (ip_call_ra_chain(skb))
-                       return 0;
-       }
+       if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
+               return 0;
 
        if (skb->pkt_type != PACKET_HOST)
                goto drop;
@@ -110,7+108,7 @@ int ip_forward(struct sk_buff *skb)
                 goto local_pkt;
 #endif
 
-       if (ip_decrease_ttl(iph) <= 0)
+       if (iph->ttl <= 1)
                 goto too_many_hops;
 
        if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
@@ -121,22+119,30 @@ int ip_forward(struct sk_buff *skb)
         *      after asking the firewall permission to do so.
         */
 
-       skb->priority = rt->u.dst.priority;
+       skb->priority = rt_tos2priority(iph->tos);
        dev2 = rt->u.dst.dev;
-       mtu = dev2->mtu;
+       mtu = rt->u.dst.pmtu;
 
 #ifdef CONFIG_NET_SECURITY
        call_fw_firewall(PF_SECURITY, dev2, NULL, &mtu, NULL);
 #endif 
        
        /*
-        *      In IP you never have to forward a frame on the interface that it 
-        *      arrived upon. We now generate an ICMP HOST REDIRECT giving the route
+        *      We now generate an ICMP HOST REDIRECT giving the route
         *      we calculated.
         */
        if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr)
                ip_rt_send_redirect(skb);
-       
+
+       /* We are about to mangle the packet. Copy it! */
+       if ((skb = skb_cow(skb, dev2->hard_header_len)) == NULL)
+               return -1;
+       iph = skb->nh.iph;
+       opt = &(IPCB(skb)->opt);
+
+       /* Decrease the TTL only after skb_cow() has succeeded */
+       ip_decrease_ttl(iph);
+
        /*
         * We now may allocate a new buffer, and copy the datagram into it.
         * If the indicated interface is up and running, kick it.
@@ -147,14+153,6 @@ int ip_forward(struct sk_buff *skb)
 
 #ifdef CONFIG_IP_ROUTE_NAT
        if (rt->rt_flags & RTCF_NAT) {
-               if (skb_headroom(skb) < dev2->hard_header_len || skb_cloned(skb)) {
-                       struct sk_buff *skb2;
-                       skb2 = skb_realloc_headroom(skb, (dev2->hard_header_len + 15)&~15);
-                       kfree_skb(skb);
-                       if (skb2 == NULL)
-                               return -1;
-                       skb = skb2;
-               }
                if (ip_do_nat(skb)) {
                        kfree_skb(skb);
                        return -1;
@@ -243,18+241,6 @@ skip_call_fw_firewall:
        }
 #endif
 
-       if (skb_headroom(skb) < dev2->hard_header_len || skb_cloned(skb)) {
-               struct sk_buff *skb2;
-               skb2 = skb_realloc_headroom(skb, (dev2->hard_header_len + 15)&~15);
-               kfree_skb(skb);
-
-               if (skb2 == NULL) {
-                       NETDEBUG(printk(KERN_ERR "\nIP: No memory available for IP forward\n"));
-                       return -1;
-               }
-               skb = skb2;
-               iph = skb2->nh.iph;
-       }
 
 #ifdef CONFIG_FIREWALL
        if ((fw_res = call_out_firewall(PF_INET, dev2, iph, NULL,&skb)) < FW_ACCEPT) {
index 9641aaa..8a0e40f 100644 (file)
@@ -5,7+5,7 @@
  *
  *             The IP fragmentation functionality.
  *             
- * Version:    $Id: ip_fragment.c,v 1.38 1998/06/16 04:38:29 davem Exp $
+ * Version:    $Id: ip_fragment.c,v 1.39 1998/08/26 10:35:26 davem Exp $
  *
  * Authors:    Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
  *             Alan Cox <Alan.Cox@linux.org>
index 57e7761..b8ffe59 100644 (file)
@@ -1512,14+1512,14 @@ static int dump_rule(char *buffer,
                    "%9s "                      /* Chain name */
                    "%08lX/%08lX->%08lX/%08lX " /* Source & Destination IPs */
                    "%.16s "                    /* Interface */
-                   "%hX %hX "                  /* fw_flg and fw_invflg fields */
-                   "%hu "                      /* Protocol */
+                   "%X %X "                    /* fw_flg and fw_invflg fields */
+                   "%u "                       /* Protocol */
                    "%-9u %-9u %-9u %-9u "      /* Packet & byte counters */
-                   "%hu-%hu %hu-%hu "          /* Source & Dest port ranges */
+                   "%u-%u %u-%u "              /* Source & Dest port ranges */
                    "A%02X X%02X "              /* TOS and and xor masks */
                    "%08X "                     /* Redirection port */
                    "%u "                       /* fw_mark field */
-                   "%hu "                      /* output size */
+                   "%u "                       /* output size */
                    "%9s\n",                    /* Target */
                    chainlabel,
                    ntohl(rule->ipfw.fw_src.s_addr),
index 04fde61..6a2e4ec 100644 (file)
@@ -684,7+684,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct device *dev)
        else if (skb->protocol == __constant_htons(ETH_P_IPV6)) {
                struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
 
-               if (rt6 && mtu < rt6->u.dst.pmtu && mtu >= 576) {
+               if (rt6 && mtu < rt6->u.dst.pmtu && mtu >= IPV6_MIN_MTU) {
                        if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) ||
                            rt6->rt6i_dst.plen == 128) {
                                rt6->rt6i_flags |= RTF_MODIFIED;
@@ -692,7+692,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct device *dev)
                        }
                }
 
-               if (mtu >= 576 && mtu < skb->len - tunnel->hlen + gre_hlen) {
+               if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
                        ip_rt_put(rt);
                        goto tx_error;
@@ -722,6+722,8 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct device *dev)
                        tunnel->recursion--;
                        return 0;
                }
+               if (skb->sk)
+                       skb_set_owner_w(new_skb, skb->sk);
                dev_kfree_skb(skb);
                skb = new_skb;
        }
index f56a903..e06ad82 100644 (file)
@@ -5,7+5,7 @@
  *
  *             The Internet Protocol (IP) module.
  *
- * Version:    $Id: ip_input.c,v 1.31 1998/05/17 02:19:15 freitag Exp $
+ * Version:    $Id: ip_input.c,v 1.33 1998/08/26 12:03:47 davem Exp $
  *
  * Authors:    Ross Biro, <bir7@leland.Stanford.Edu>
  *             Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  *             2 of the License, or (at your option) any later version.
  */
 
-#include <asm/uaccess.h>
 #include <asm/system.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
 #include <linux/string.h>
 #include <linux/errno.h>
 #include <linux/config.h>
 
+#include <linux/net.h>
 #include <linux/socket.h>
 #include <linux/sockios.h>
 #include <linux/in.h>
 #include <linux/inet.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
-#include <linux/proc_fs.h>
-#include <linux/stat.h>
 
 #include <net/snmp.h>
 #include <net/ip.h>
 #include <net/protocol.h>
 #include <net/route.h>
-#include <net/tcp.h>
-#include <net/udp.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
 #include <net/arp.h>
 #include <net/icmp.h>
 #include <net/raw.h>
 #include <net/checksum.h>
-#include <linux/igmp.h>
 #include <linux/ip_fw.h>
 #ifdef CONFIG_IP_MASQUERADE
 #include <net/ip_masq.h>
 #include <linux/firewall.h>
 #include <linux/mroute.h>
 #include <linux/netlink.h>
-#include <linux/ipsec.h>
 
 /*
  *     SNMP management statistics
@@ -199,6+191,9 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
        return 0;
 }
 
+/*
+ *     Process Router Alert IP option
+ */ 
 int ip_call_ra_chain(struct sk_buff *skb)
 {
        struct ip_ra_chain *ra;
@@ -229,6+224,9 @@ int ip_call_ra_chain(struct sk_buff *skb)
        return 0;
 }
 
+/*
+ *     Deliver IP Packets to the higher protocol layers.
+ */ 
 int ip_local_deliver(struct sk_buff *skb)
 {
        struct iphdr *iph = skb->nh.iph;
@@ -282,9+280,11 @@ int ip_local_deliver(struct sk_buff *skb)
         skb->h.raw = skb->nh.raw + iph->ihl*4;
 
        /*
-        *      Deliver to raw sockets. This is fun as to avoid copies we want to make no surplus copies.
+        *      Deliver to raw sockets. This is fun, as we want to avoid making
+        *      any surplus copies.
         *
         *      RFC 1122: SHOULD pass TOS value up to the transport layer.
+        *      -> It does. And not only TOS, but the whole IP header.
         */
  
        /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
@@ -309,10+309,7 @@ int ip_local_deliver(struct sk_buff *skb)
                                        skb1 = skb_clone(skb, GFP_ATOMIC);
                                        if(skb1)
                                        {
-                                               if(ipsec_sk_policy(raw_sk,skb1))        
-                                                       raw_rcv(raw_sk, skb1);
-                                               else
-                                                       kfree_skb(skb1);
+                                               raw_rcv(raw_sk, skb1);
                                        }
                                }
                                raw_sk = sknext;
@@ -372,10+369,8 @@ int ip_local_deliver(struct sk_buff *skb)
 
        if(raw_sk!=NULL)        /* Shift to last raw user */
        {
-               if(ipsec_sk_policy(raw_sk, skb))
-                       raw_rcv(raw_sk, skb);
-               else
-                       kfree_skb(skb);
+               raw_rcv(raw_sk, skb);
+
        }
        else if (!flag)         /* Free and report errors */
        {
@@ -386,15+381,16 @@ int ip_local_deliver(struct sk_buff *skb)
        return(0);
 }
 
+/*
+ *     Main IP Receive routine.
+ */ 
 int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
 {
        struct iphdr *iph = skb->nh.iph;
-       struct ip_options * opt = NULL;
-       int err;
 
        /*
-        * When interface is in promisc. mode, drop all the crap
-        * that it receives, do not truing to analyse it.
+        *      When the interface is in promisc. mode, drop all the crap
+        *      that it receives, do not try to analyse it.
         */
        if (skb->pkt_type == PACKET_OTHERHOST)
                goto drop;
@@ -412,24+408,32 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
         *      4.      Doesn't have a bogus length
         */
 
-       if (skb->len<sizeof(struct iphdr) || iph->ihl<5 || iph->version != 4
-#ifndef CONFIG_IP_ROUTER
-           || ip_fast_csum((unsigned char *)iph, iph->ihl) !=0
-#endif
-               || skb->len < ntohs(iph->tot_len))
-               goto inhdr_error;
+       if (skb->len < sizeof(struct iphdr))
+               goto inhdr_error; 
+       if (iph->ihl < 5 || iph->version != 4 || ip_fast_csum((u8 *)iph, iph->ihl) != 0)
+               goto inhdr_error; 
+
+       {
+       __u32 len = ntohs(iph->tot_len); 
+       if (skb->len < len)
+               goto inhdr_error; 
 
        /*
         *      Our transport medium may have padded the buffer out. Now we know it
         *      is IP we can trim to the true length of the frame.
         *      Note this now means skb->len holds ntohs(iph->tot_len).
         */
-       __skb_trim(skb, ntohs(iph->tot_len));
 
+       __skb_trim(skb, len);
+       }
+       
+       /*
+        *      Initialise the virtual path cache for the packet. It describes
+        *      how the packet travels inside Linux networking.
+        */ 
        if (skb->dst == NULL) {
-               err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev);
-               if (err)
-                       goto drop;
+               if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))
+                       goto drop; 
 #ifdef CONFIG_CPU_IS_SLOW
                if (net_cpu_congestion > 10 && !(iph->tos&IPTOS_RELIABILITY) &&
                    IPTOS_PREC(iph->tos) < IPTOS_PREC_INTERNETCONTROL) {
@@ -449,6+453,21 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
 #endif
 
        if (iph->ihl > 5) {
+               struct ip_options *opt;
+
+               /* This looks like overkill, because not all
+                  IP options require packet mangling.
+                  But it is the easiest approach for now, especially
+                  since the combination of IP options and a running
+                  sniffer is an extremely rare condition.
+                                                     --ANK (980813)
+               */
+                  
+               skb = skb_cow(skb, skb_headroom(skb));
+               if (skb == NULL)
+                       return 0;
+               iph = skb->nh.iph;
+
                skb->ip_summed = 0;
                if (ip_options_compile(NULL, skb))
                        goto inhdr_error;
@@ -458,8+477,8 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
                        struct in_device *in_dev = dev->ip_ptr;
                        if (in_dev && !IN_DEV_SOURCE_ROUTE(in_dev)) {
                                if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
-                                       printk(KERN_INFO "source route option %08lx -> %08lx\n",
-                                              ntohl(iph->saddr), ntohl(iph->daddr));
+                                       printk(KERN_INFO "source route option %d.%d.%d.%d -> %d.%d.%d.%d\n",
+                                              NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
                                goto drop;
                        }
                        if (ip_options_rcv_srr(skb))
index 07a7afc..4d9d7e6 100644 (file)
@@ -5,7+5,7 @@
  *
  *             Dumb Network Address Translation.
  *
- * Version:    $Id: ip_nat_dumb.c,v 1.3 1998/03/15 03:31:44 davem Exp $
+ * Version:    $Id: ip_nat_dumb.c,v 1.4 1998/08/26 12:03:49 davem Exp $
  *
  * Authors:    Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  *
  * Fixes:
  *             Rani Assaf      :       A zero checksum is a special case
  *                                     only in UDP
+ *             Rani Assaf      :       Added ICMP messages rewriting
+ *
  *
  * NOTE:       It is just working model of real NAT.
  */
 #include <linux/udp.h>
 #include <linux/firewall.h>
 #include <linux/ip_fw.h>
-#ifdef CONFIG_IP_MASQUERADE
-#include <net/ip_masq.h>
-#endif
 #include <net/checksum.h>
 #include <linux/route.h>
 #include <net/route.h>
@@ -68,20+67,48 @@ ip_do_nat(struct sk_buff *skb)
 
                switch(iph->protocol) {
                case IPPROTO_TCP:
-                       cksum  = (u16*)&((struct tcphdr*)(((char*)iph) + iph->ihl*4))->check;
+                       cksum  = (u16*)&((struct tcphdr*)(((char*)iph) + (iph->ihl<<2)))->check;
+                       if ((u8*)(cksum+1) > skb->tail)
+                               goto truncated;
                        check  = csum_tcpudp_magic(iph->saddr, iph->daddr, 0, 0, ~(*cksum));
                        *cksum = csum_tcpudp_magic(~osaddr, ~odaddr, 0, 0, ~check);
                        break;
                case IPPROTO_UDP:
-                       cksum  = (u16*)&((struct udphdr*)(((char*)iph) + iph->ihl*4))->check;
+                       cksum  = (u16*)&((struct udphdr*)(((char*)iph) + (iph->ihl<<2)))->check;
+                       if ((u8*)(cksum+1) > skb->tail)
+                               goto truncated;
                        if ((check = *cksum) != 0) {
                                check = csum_tcpudp_magic(iph->saddr, iph->daddr, 0, 0, ~check);
                                check = csum_tcpudp_magic(~osaddr, ~odaddr, 0, 0, ~check);
                                *cksum = check ? : 0xFFFF;
                        }
+                       break;
+               case IPPROTO_ICMP:
+               {
+                       struct icmphdr *icmph = (struct icmphdr*)((char*)iph + (iph->ihl<<2));
+                       struct   iphdr *ciph;
+
+                       if ((icmph->type != ICMP_DEST_UNREACH) &&
+                           (icmph->type != ICMP_TIME_EXCEEDED) &&
+                           (icmph->type != ICMP_PARAMETERPROB)) break;
+
+                       ciph = (struct iphdr *) (icmph + 1);
+
+                       if ((u8*)(ciph+1) > skb->tail)
+                               goto truncated;
+
+                       if (rt->rt_flags&RTCF_DNAT && ciph->saddr == odaddr)
+                               ciph->saddr = iph->daddr;
+                       if (rt->rt_flags&RTCF_SNAT && ciph->daddr == osaddr)
+                               ciph->daddr = iph->saddr;
+                       break;
+               }
                default:
                        break;
                }
        }
        return 0;
+
+truncated:
+       return -EINVAL;
 }
index 3e3674e..153c7a3 100644 (file)
@@ -5,7+5,7 @@
  *
  *             The options processing module for ip.c
  *
- * Version:    $Id: ip_options.c,v 1.13 1998/02/12 07:43:12 davem Exp $
+ * Version:    $Id: ip_options.c,v 1.14 1998/08/26 12:03:51 davem Exp $
  *
  * Authors:    A.N.Kuznetsov
  *             
@@ -451,7+451,7 @@ eol:
 
 error:
        if (skb) {
-               icmp_send(skb, ICMP_PARAMETERPROB, 0, pp_ptr-iph);
+               icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24));
                kfree_skb(skb);
        }
        return -EINVAL;
@@ -579,7+579,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
        if (rt->rt_type == RTN_UNICAST) {
                if (!opt->is_strictroute)
                        return 0;
-               icmp_send(skb, ICMP_PARAMETERPROB, 0, 16);
+               icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl(16<<24));
                return -EINVAL;
        }
        if (rt->rt_type != RTN_LOCAL)
@@ -587,7+587,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
 
        for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) {
                if (srrptr + 3 > srrspace) {
-                       icmp_send(skb, ICMP_PARAMETERPROB, 0, opt->srr+2);
+                       icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24));
                        return -EINVAL;
                }
                memcpy(&nexthop, &optptr[srrptr-1], 4);
index 0527c1b..9250051 100644 (file)
@@ -5,7+5,7 @@
  *
  *             The Internet Protocol (IP) output module.
  *
- * Version:    $Id: ip_output.c,v 1.59 1998/07/15 05:05:15 davem Exp $
+ * Version:    $Id: ip_output.c,v 1.61 1998/08/26 12:03:54 davem Exp $
  *
  * Authors:    Ross Biro, <bir7@leland.Stanford.Edu>
  *             Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -171,14+171,7 @@ int ip_mc_output(struct sk_buff *skb)
         */
 
        if (rt->rt_flags&RTCF_MULTICAST && (!sk || sk->ip_mc_loop)) {
-#ifndef CONFIG_IP_MROUTE
-#if 1
-               /* It should never occur. Delete it eventually. --ANK */
-               if (!(rt->rt_flags&RTCF_LOCAL) || (dev->flags&IFF_LOOPBACK))
-                       printk(KERN_DEBUG "ip_mc_output (mc): it should never occur\n");
-               else
-#endif
-#else
+#ifdef CONFIG_IP_MROUTE
                /* Small optimization: do not loopback not local frames,
                   which returned after forwarding; they will be  dropped
                   by ip_mr_input in any case.
@@ -199,15+192,8 @@ int ip_mc_output(struct sk_buff *skb)
                }
        }
 
-       if (rt->rt_flags&RTCF_BROADCAST) {
-#if 1
-               /* It should never occur. Delete it eventually. --ANK */
-               if (!(rt->rt_flags&RTCF_LOCAL) || (dev->flags&IFF_LOOPBACK))
-                       printk(KERN_DEBUG "ip_mc_output (brd): it should never occur!\n");
-               else
-#endif
+       if (rt->rt_flags&RTCF_BROADCAST)
                dev_loopback_xmit(skb);
-       }
 
        return ip_finish_output(skb);
 }
@@ -281,8+267,6 @@ void ip_queue_xmit(struct sk_buff *skb)
        iph->ihl      = 5;
        iph->tos      = sk->ip_tos;
        iph->frag_off = 0;
-       if(sk->ip_pmtudisc == IP_PMTUDISC_WANT && !(rt->u.dst.mxlock & (1 << RTAX_MTU)))
-               iph->frag_off |= __constant_htons(IP_DF);
        iph->ttl      = sk->ip_ttl;
        iph->daddr    = rt->rt_dst;
        iph->saddr    = rt->rt_src;
@@ -316,6+300,8 @@ void ip_queue_xmit(struct sk_buff *skb)
                kfree_skb(skb);
                if (skb2 == NULL)
                        return;
+               if (sk)
+                       skb_set_owner_w(skb, sk);
                skb = skb2;
                iph = skb->nh.iph;
        }
@@ -326,6+312,9 @@ void ip_queue_xmit(struct sk_buff *skb)
        if (tot_len > rt->u.dst.pmtu)
                goto fragment;
 
+       if (sk->ip_pmtudisc == IP_PMTUDISC_WANT && !(rt->u.dst.mxlock & (1 << RTAX_MTU)))
+               iph->frag_off |= __constant_htons(IP_DF);
+
        /* Add an IP checksum. */
        ip_send_check(iph);
 
@@ -334,7+323,15 @@ void ip_queue_xmit(struct sk_buff *skb)
        return;
 
 fragment:
-       if ((iph->frag_off & htons(IP_DF)) != 0) {
+       if (sk->ip_pmtudisc == IP_PMTUDISC_WANT &&
+           !(rt->u.dst.mxlock & (1 << RTAX_MTU)) &&
+           tot_len > (iph->ihl<<2) + sizeof(struct tcphdr)+16) {
+               /* Reject the packet ONLY if TCP could have fragmented
+                  it itself, had it been careful enough.
+                  The test is not precise (e.g. it does not take SACKs
+                  into account). Actually, TCP should do it. --ANK (980801)
+                */
+               iph->frag_off |= __constant_htons(IP_DF);
                printk(KERN_DEBUG "sending pkt_too_big to self\n");
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
                          htonl(rt->u.dst.pmtu));
@@ -701,7+698,6 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
        unsigned int mtu, hlen, left, len; 
        int offset;
        int not_last_frag;
-       u16 dont_fragment;
        struct rtable *rt = (struct rtable*)skb->dst;
 
        dev = rt->u.dst.dev;
@@ -726,10+722,14 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
         *      The protocol doesn't seem to say what to do in the case that the
         *      frame + options doesn't fit the mtu. As it used to fall down dead
         *      in this case we were fortunate it didn't happen
+        *
+        *      It is impossible, because mtu>=68. --ANK (980801)
         */
 
+#ifdef CONFIG_NET_PARANOIA
        if (mtu<8) 
                goto fail;
+#endif
 
        /*
         *      Fragment the datagram.
@@ -739,14+739,6 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
        not_last_frag = iph->frag_off & htons(IP_MF);
 
        /*
-        *      Nice moment: if DF is set and we are here,
-        *      it means that packet should be fragmented and
-        *      DF is set on fragments. If it works,
-        *      path MTU discovery can be done by ONE segment(!). --ANK
-        */
-       dont_fragment = iph->frag_off & htons(IP_DF);
-
-       /*
         *      Keep copying data until we run out.
         */
 
@@ -805,7+797,7 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
                 *      Fill in the new header fields.
                 */
                iph = skb2->nh.iph;
-               iph->frag_off = htons((offset >> 3))|dont_fragment;
+               iph->frag_off = htons((offset >> 3));
 
                /* ANK: dirty, but effective trick. Upgrade options only if
                 * the segment to be fragmented was THE FIRST (otherwise,
@@ -858,11+850,6 @@ static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset,
        int len; 
        int hdrflag = 1; 
 
-#if 0
-       printk("ip_reply_glue_bits: offset=%u,flen=%u iov[0].l=%u,iov[1].len=%u\n",
-              offset,fraglen,dp->iov[0].iov_len,dp->iov[1].iov_len);
-#endif
-
        iov = &dp->iov[0]; 
        if (offset >= iov->iov_len) { 
                offset -= iov->iov_len;
@@ -871,12+858,6 @@ static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset,
        }
        len = iov->iov_len - offset;
        if (fraglen > len) { /* overlapping. */ 
-#if 1
-               if (iov > &dp->iov[0]) {
-                       printk("frag too long! (o=%u,fl=%u)\n",offset,fraglen);
-                       return -1;
-               }
-#endif
                dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, len,
                                             dp->csum);
                offset = 0;
index 8f712c8..3d8f4fa 100644 (file)
@@ -5,7+5,7 @@
  *
  *             The IP to API glue.
  *             
- * Version:    $Id: ip_sockglue.c,v 1.36 1998/07/15 05:05:06 davem Exp $
+ * Version:    $Id: ip_sockglue.c,v 1.37 1998/08/26 12:03:57 davem Exp $
  *
  * Authors:    see ip.c
  *
 #include <net/sock.h>
 #include <net/ip.h>
 #include <net/icmp.h>
+#include <net/tcp.h>
 #include <linux/tcp.h>
 #include <linux/udp.h>
 #include <linux/igmp.h>
 #include <linux/route.h>
 #include <linux/mroute.h>
 #include <net/route.h>
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#include <net/transp_v6.h>
+#endif
 
 #include <asm/uaccess.h>
 
@@ -140,6+144,10 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc)
        struct cmsghdr *cmsg;
 
        for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+               if ((unsigned long)(((char*)cmsg - (char*)msg->msg_control)
+                                   + cmsg->cmsg_len) > msg->msg_controllen) {
+                       return -EINVAL;
+               }
                if (cmsg->cmsg_level != SOL_IP)
                        continue;
                switch (cmsg->cmsg_type) {
@@ -255,22+263,30 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt
                case IP_OPTIONS:
                {
                        struct ip_options * opt = NULL;
-                       struct ip_options * old_opt;
                        if (optlen > 40 || optlen < 0)
                                return -EINVAL;
                        err = ip_options_get(&opt, optval, optlen, 1);
                        if (err)
                                return err;
-                       /*
-                        * ANK: I'm afraid that receive handler may change
-                        * options from under us.
-                        */
-                       cli();
-                       old_opt = sk->opt;
-                       sk->opt = opt;
-                       sti();
-                       if (old_opt)
-                               kfree_s(old_opt, sizeof(struct ip_options) + old_opt->optlen);
+                       start_bh_atomic();
+                       if (sk->type == SOCK_STREAM) {
+                               struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+                               if (sk->family == PF_INET ||
+                                   ((tcp_connected(sk->state) || sk->state == TCP_SYN_SENT)
+                                    && sk->daddr != LOOPBACK4_IPV6)) {
+#endif
+                                       if (opt)
+                                               tp->ext_header_len = opt->optlen;
+                                       tcp_sync_mss(sk, tp->pmtu_cookie);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+                               }
+#endif
+                       }
+                       opt = xchg(&sk->opt, opt);
+                       end_bh_atomic();
+                       if (opt)
+                               kfree_s(opt, sizeof(struct ip_options) + opt->optlen);
                        return 0;
                }
                case IP_PKTINFO:
@@ -497,11+513,11 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *op
                        {
                                unsigned char optbuf[sizeof(struct ip_options)+40];
                                struct ip_options * opt = (struct ip_options*)optbuf;
-                               cli();
+                               start_bh_atomic();
                                opt->optlen = 0;
                                if (sk->opt)
                                        memcpy(optbuf, sk->opt, sizeof(struct ip_options)+sk->opt->optlen);
-                               sti();
+                               end_bh_atomic();
                                if (opt->optlen == 0) 
                                        return put_user(0, optlen);
 
@@ -511,7+527,7 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *op
                                if(put_user(len, optlen))
                                        return -EFAULT;
                                if(copy_to_user(optval, opt->__data, len))
-                                           return -EFAULT;
+                                       return -EFAULT;
                                return 0;
                        }
                case IP_PKTINFO:
index d0b3b5f..778ac15 100644 (file)
@@ -1,7+1,7 @@
 /*
  *     Linux NET3:     IP/IP protocol decoder. 
  *
- *     Version: $Id: ipip.c,v 1.22 1998/03/08 05:56:27 davem Exp $
+ *     Version: $Id: ipip.c,v 1.23 1998/08/26 12:04:00 davem Exp $
  *
  *     Authors:
  *             Sam Lantinga (slouken@cs.ucdavis.edu)  02/01/95
@@ -551,6+551,8 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct device *dev)
                        tunnel->recursion--;
                        return 0;
                }
+               if (skb->sk)
+                       skb_set_owner_w(new_skb, skb->sk);
                dev_kfree_skb(skb);
                skb = new_skb;
        }
index 29fd4b3..49cd6da 100644 (file)
@@ -9,7+9,7 @@
  *     as published by the Free Software Foundation; either version
  *     2 of the License, or (at your option) any later version.
  *
- *     Version: $Id: ipmr.c,v 1.35 1998/05/13 06:23:24 davem Exp $
+ *     Version: $Id: ipmr.c,v 1.36 1998/08/26 12:04:03 davem Exp $
  *
  *     Fixes:
  *     Michael Chastain        :       Incorrect size of copying.
 #include <net/raw.h>
 #include <linux/notifier.h>
 #include <linux/if_arp.h>
+#include <linux/ip_fw.h>
+#include <linux/firewall.h>
 #include <net/ipip.h>
 #include <net/checksum.h>
 
@@ -1044,7+1046,12 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c,
 
        dev = rt->u.dst.dev;
 
-       if (skb->len+encap > dev->mtu && (ntohs(iph->frag_off) & IP_DF)) {
+       if (skb->len+encap > rt->u.dst.pmtu /* && (ntohs(iph->frag_off) & IP_DF) */) {
+               /* Do not fragment multicasts. Alas, IPv4 does not
+                  allow to send ICMP, so that packets will disappear
+                  to blackhole.
+                */
+
                ip_statistics.IpFragFails++;
                ip_rt_put(rt);
                return;
@@ -1052,11+1059,6 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c,
 
        encap += dev->hard_header_len;
 
-       if (skb->len+encap > 65534) {
-               ip_rt_put(rt);
-               return;
-       }
-
        if (skb_headroom(skb) < encap || skb_cloned(skb) || !last)
                skb2 = skb_realloc_headroom(skb, (encap + 15)&~15);
        else if (atomic_read(&skb->users) != 1)
@@ -1076,18+1078,37 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c,
 
        dst_release(skb2->dst);
        skb2->dst = &rt->u.dst;
-
        iph = skb2->nh.iph;
        ip_decrease_ttl(iph);
 
+#ifdef CONFIG_FIREWALL
+       if (call_fw_firewall(PF_INET, vif->dev, skb2->nh.iph, NULL, &skb2) < FW_ACCEPT) {
+               kfree_skb(skb2);
+               return;
+       }
+       if (call_out_firewall(PF_INET, vif->dev, skb2->nh.iph, NULL, &skb2) < FW_ACCEPT) {
+               kfree_skb(skb2);
+               return;
+       }
+#endif
        if (vif->flags & VIFF_TUNNEL) {
                ip_encap(skb2, vif->local, vif->remote);
+#ifdef CONFIG_FIREWALL
+               /* Double output firewalling on tunnels: one is on tunnel
+                  another one is on real device.
+                */
+               if (call_out_firewall(PF_INET, dev, skb2->nh.iph, NULL, &skb2) < FW_ACCEPT) {
+                       kfree_skb(skb2);
+                       return;
+               }
+#endif
                ((struct ip_tunnel *)vif->dev->priv)->stat.tx_packets++;
                ((struct ip_tunnel *)vif->dev->priv)->stat.tx_bytes+=skb2->len;
        }
 
        IPCB(skb2)->flags |= IPSKB_FORWARDED;
 
+
        /*
         * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
         * not only before forwarding, but after forwarding on all output
@@ -1351,21+1372,12 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
        struct rtnexthop *nhp;
        struct device *dev = vif_table[c->mfc_parent].dev;
        u8 *b = skb->tail;
-
-#ifdef CONFIG_RTNL_OLD_IFINFO
-       if (dev) {
-               u8 *o = skb->tail;
-               RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
-               rtm->rtm_optlen += skb->tail - o;
-       }
-#else
        struct rtattr *mp_head;
 
        if (dev)
                RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
 
        mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
-#endif
 
        for (ct = c->mfc_minvif; ct < c->mfc_maxvif; ct++) {
                if (c->mfc_ttls[ct] < 255) {
@@ -1376,15+1388,10 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
                        nhp->rtnh_hops = c->mfc_ttls[ct];
                        nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
                        nhp->rtnh_len = sizeof(*nhp);
-#ifdef CONFIG_RTNL_OLD_IFINFO
-                       rtm->rtm_nhs++;
-#endif
                }
        }
-#ifndef CONFIG_RTNL_OLD_IFINFO
        mp_head->rta_type = RTA_MULTIPATH;
        mp_head->rta_len = skb->tail - (u8*)mp_head;
-#endif
        rtm->rtm_type = RTN_MULTICAST;
        return 1;
 
index b6e0624..6f06f43 100644 (file)
@@ -7,7+7,7 @@
  *             PROC file system.  It is mainly used for debugging and
  *             statistics.
  *
- * Version:    $Id: proc.c,v 1.30 1998/04/16 16:29:05 freitag Exp $
+ * Version:    $Id: proc.c,v 1.31 1998/07/29 20:09:25 freitag Exp $
  *
  * Authors:    Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  *             Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de>
@@ -357,12+357,15 @@ int netstat_get_info(char *buffer, char **start, off_t offset, int length, int d
 
        len = sprintf(buffer,
                      "TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed"
-                     "EmbryonicRsts\n"
-                     "TcpExt: %lu %lu %lu %lu\n",
+                     " EmbryonicRsts PruneCalled RcvPruned OfoPruned\n"
+                     "TcpExt: %lu %lu %lu %lu %lu %lu %lu\n",
                      net_statistics.SyncookiesSent,
                      net_statistics.SyncookiesRecv,
                      net_statistics.SyncookiesFailed,
-                     net_statistics.EmbryonicRsts);
+                     net_statistics.EmbryonicRsts,
+                     net_statistics.PruneCalled,
+                     net_statistics.RcvPruned,
+                     net_statistics.OfoPruned);
 
        if (offset >= len)
        {
index 272f922..e10ddc0 100644 (file)
@@ -5,7+5,7 @@
  *
  *             RAW - implementation of IP "raw" sockets.
  *
- * Version:    $Id: raw.c,v 1.36 1998/05/08 21:06:29 davem Exp $
+ * Version:    $Id: raw.c,v 1.37 1998/08/26 12:04:07 davem Exp $
  *
  * Authors:    Ross Biro, <bir7@leland.Stanford.Edu>
  *             Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -152,7+152,7 @@ void raw_err (struct sock *sk, struct sk_buff *skb)
        int type = skb->h.icmph->type;
        int code = skb->h.icmph->code;
 
-       if (sk->ip_recverr && !atomic_read(&sk->sock_readers)) {
+       if (sk->ip_recverr) {
                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                if (skb2 && sock_queue_err_skb(sk, skb2))
                        kfree_skb(skb);
@@ -194,10+194,6 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
        
        skb->h.raw = skb->nh.raw;
 
-       if (atomic_read(&sk->sock_readers)) {
-               __skb_queue_tail(&sk->back_log, skb);
-               return 0;
-       }
        raw_rcv_skb(sk, skb);
        return 0;
 }
@@ -379,10+375,33 @@ done:
 
 static void raw_close(struct sock *sk, unsigned long timeout)
 {
+       /* Observation: when raw_close is called, processes have
+          no access to socket anymore. But net still has.
+          Step one, detach it from networking:
+
+          A. Remove from hash tables.
+        */
        sk->state = TCP_CLOSE;
+       raw_v4_unhash(sk);
+        /*
+          B. Raw sockets may have direct kernel references. Kill them.
+        */
        ip_ra_control(sk, 0, NULL);
+
+       /* At this point the socket cannot receive new packets anymore */
+
+
+       /* But we still have packets pending on receive
+          queue and probably, our own packets waiting in device queues.
+          sock_destroy will drain receive queue, but transmitted
+          packets will delay socket destruction.
+          Set sk->dead=1 in order to prevent wakeups when these
+          packets are freed.
+        */
        sk->dead=1;
        destroy_sock(sk);
+
+       /* That's all. No races here. */
 }
 
 /* This gets rid of all the nasties in af_inet. -DaveM */
@@ -474,14+493,8 @@ done:
 static int raw_init(struct sock *sk)
 {
        struct raw_opt *tp = &(sk->tp_pinfo.tp_raw4);
-       if (sk->num == IPPROTO_ICMP) {
+       if (sk->num == IPPROTO_ICMP)
                memset(&tp->filter, 0, sizeof(tp->filter));
-
-               /* By default block ECHO and TIMESTAMP requests */
-
-               set_bit(ICMP_ECHO, &tp->filter);
-               set_bit(ICMP_TIMESTAMP, &tp->filter);
-       }
        return 0;
 }
 
index e10f65c..5788342 100644 (file)
@@ -5,7+5,7 @@
  *
  *             ROUTE - implementation of the IP router.
  *
- * Version:    $Id: route.c,v 1.54 1998/07/15 05:05:22 davem Exp $
+ * Version:    $Id: route.c,v 1.57 1998/08/26 12:04:09 davem Exp $
  *
  * Authors:    Ross Biro, <bir7@leland.Stanford.Edu>
  *             Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  *                                     route.c and rewritten from scratch.
  *             Andi Kleen      :       Load-limit warning messages.
  *     Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
+ *     Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  *
  *             This program is free software; you can redistribute it and/or
  *             modify it under the terms of the GNU General Public License
 #include <linux/sysctl.h>
 #endif
 
+#define IP_MAX_MTU     0xFFF0
+
 #define RT_GC_TIMEOUT (300*HZ)
 
 int ip_rt_min_delay = 2*HZ;
@@ -166,7+169,7 @@ __u8 ip_tos2prio[16] = {
  * Route cache.
  */
 
-static struct rtable   *rt_hash_table[RT_HASH_DIVISOR];
+struct rtable  *rt_hash_table[RT_HASH_DIVISOR];
 
 static struct rtable * rt_intern_hash(unsigned hash, struct rtable * rth);
 
@@ -246,6+249,13 @@ static __inline__ void rt_free(struct rtable *rt)
        dst_free(&rt->u.dst);
 }
 
+static __inline__ int rt_fast_clean(struct rtable *rth)
+{
+       /* Kill broadcast/multicast entries very aggressively, if they
+          collide in hash table with more useful entries */
+       return ((rth->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST))
+               && rth->key.iif && rth->u.rt_next);
+}
 
 static void rt_check_expire(unsigned long dummy)
 {
@@ -255,43+265,30 @@ static void rt_check_expire(unsigned long dummy)
        unsigned long now = jiffies;
 
        for (i=0; i<RT_HASH_DIVISOR/5; i++) {
+               unsigned tmo = ip_rt_gc_timeout;
+
                rover = (rover + 1) & (RT_HASH_DIVISOR-1);
                rthp = &rt_hash_table[rover];
 
                while ((rth = *rthp) != NULL) {
-                       struct rtable * rth_next = rth->u.rt_next;
-
                        /*
                         * Cleanup aged off entries.
                         */
 
                        if (!atomic_read(&rth->u.dst.use) &&
-                           (now - rth->u.dst.lastuse > ip_rt_gc_timeout)) {
-                               *rthp = rth_next;
-#if RT_CACHE_DEBUG >= 2
-                               printk("rt_check_expire clean %02x@%08x\n", rover, rth->rt_dst);
-#endif
+                           (now - rth->u.dst.lastuse > tmo
+                            || rt_fast_clean(rth))) {
+                               *rthp = rth->u.rt_next;
                                rt_free(rth);
                                continue;
                        }
 
-                       if (!rth_next)
-                               break;
-
-                       if ( (long)(rth_next->u.dst.lastuse - rth->u.dst.lastuse) > RT_CACHE_BUBBLE_THRESHOLD ||
-                           ((long)(rth->u.dst.lastuse - rth_next->u.dst.lastuse) < 0 &&
-                            atomic_read(&rth->u.dst.refcnt) < atomic_read(&rth_next->u.dst.refcnt))) {
-#if RT_CACHE_DEBUG >= 2
-                               printk("rt_check_expire bubbled %02x@%08x<->%08x\n", rover, rth->rt_dst, rth_next->rt_dst);
-#endif
-                               *rthp = rth_next;
-                               rth->u.rt_next = rth_next->u.rt_next;
-                               rth_next->u.rt_next = rth;
-                               rthp = &rth_next->u.rt_next;
-                               continue;
-                       }
+                       tmo >>= 1;
                        rthp = &rth->u.rt_next;
                }
+
+               if ((jiffies - now) > 0)
+                       break;
        }
        rt_periodic_timer.expires = now + ip_rt_gc_interval;
        add_timer(&rt_periodic_timer);
@@ -305,21+302,14 @@ static void rt_run_flush(unsigned long dummy)
        rt_deadline = 0;
 
        for (i=0; i<RT_HASH_DIVISOR; i++) {
-               int nr=0;
-
                if ((rth = xchg(&rt_hash_table[i], NULL)) == NULL)
                        continue;
 
                for (; rth; rth=next) {
                        next = rth->u.rt_next;
-                       nr++;
                        rth->u.rt_next = NULL;
                        rt_free(rth);
                }
-#if RT_CACHE_DEBUG >= 2
-               if (nr > 0)
-                       printk("rt_cache_flush: %d@%02x\n", nr, i);
-#endif
        }
 }
   
@@ -384,17+374,23 @@ static int rt_garbage_collect(void)
        expire++;
 
        for (i=0; i<RT_HASH_DIVISOR; i++) {
+               unsigned tmo;
                if (!rt_hash_table[i])
                        continue;
+               tmo = expire;
                for (rthp=&rt_hash_table[i]; (rth=*rthp); rthp=&rth->u.rt_next) {
                        if (atomic_read(&rth->u.dst.use) ||
-                           now - rth->u.dst.lastuse < expire)
+                           (now - rth->u.dst.lastuse < tmo && !rt_fast_clean(rth))) {
+                               tmo >>= 1;
                                continue;
+                       }
                        *rthp = rth->u.rt_next;
                        rth->u.rt_next = NULL;
                        rt_free(rth);
                        break;
                }
+               if ((jiffies-now)>0)
+                       break;
        }
 
        last_gc = now;
@@ -412,8+408,6 @@ static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt)
        struct rtable   *rth, **rthp;
        unsigned long   now = jiffies;
 
-       rt->u.dst.priority = rt_tos2priority(rt->key.tos);
-
        start_bh_atomic();
 
        rthp = &rt_hash_table[hash];
@@ -793,19+787,17 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res)
        if (fi) {
                if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
                        rt->rt_gateway = FIB_RES_GW(*res);
-#ifndef CONFIG_RTNL_OLD_IFINFO
                rt->u.dst.mxlock = fi->fib_metrics[RTAX_LOCK-1];
                rt->u.dst.pmtu = fi->fib_mtu;
                if (fi->fib_mtu == 0) {
                        rt->u.dst.pmtu = rt->u.dst.dev->mtu;
+                       if (rt->u.dst.pmtu > IP_MAX_MTU)
+                               rt->u.dst.pmtu = IP_MAX_MTU;
                        if (rt->u.dst.mxlock&(1<<RTAX_MTU) &&
                            rt->rt_gateway != rt->rt_dst &&
                            rt->u.dst.pmtu > 576)
                                rt->u.dst.pmtu = 576;
                }
-#else
-               rt->u.dst.pmtu  = fi->fib_mtu ? : rt->u.dst.dev->mtu;
-#endif
                rt->u.dst.window= fi->fib_window ? : 0;
                rt->u.dst.rtt   = fi->fib_rtt ? : TCP_TIMEOUT_INIT;
 #ifdef CONFIG_NET_CLS_ROUTE
@@ -813,6+805,8 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res)
 #endif
        } else {
                rt->u.dst.pmtu  = rt->u.dst.dev->mtu;
+               if (rt->u.dst.pmtu > IP_MAX_MTU)
+                       rt->u.dst.pmtu = IP_MAX_MTU;
                rt->u.dst.window= 0;
                rt->u.dst.rtt   = TCP_TIMEOUT_INIT;
        }
@@ -930,7+924,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
        if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
                goto martian_source;
 
-       if (daddr == 0xFFFFFFFF)
+       if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
                goto brd_input;
 
        /* Accept zero addresses only to limited broadcast;
@@ -991,6+985,11 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
                fib_select_multipath(&key, &res);
 #endif
        out_dev = FIB_RES_DEV(res)->ip_ptr;
+       if (out_dev == NULL) {
+               if (net_ratelimit())
+                       printk(KERN_CRIT "Bug in ip_route_input_slow(). Please, report\n");
+               return -EINVAL;
+       }
 
        err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst);
        if (err < 0)
@@ -1312,15+1311,14 @@ int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int
                           tables are looked up with only one purpose:
                           to catch if destination is gatewayed, rather than
                           direct. Moreover, if MSG_DONTROUTE is set,
-                          we send packet, no matter of routing tables
-                          of ifaddr state. --ANK
+                          we send packet, ignoring both routing tables
+                          and ifaddr state. --ANK
 
 
                           We could make it even if oif is unknown,
                           likely IPv6, but we do not.
                         */
 
-                       printk(KERN_DEBUG "Dest not on link. Forcing...\n");
                        if (key.src == 0)
                                key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
                        goto make_route;
@@ -1475,7+1473,7 @@ int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
 
 #ifdef CONFIG_RTNETLINK
 
-static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int nowait)
+static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait)
 {
        struct rtable *rt = (struct rtable*)skb->dst;
        struct rtmsg *r;
@@ -1485,11+1483,7 @@ static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int
 #ifdef CONFIG_IP_MROUTE
        struct rtattr *eptr;
 #endif
-#ifdef CONFIG_RTNL_OLD_IFINFO
-       unsigned char    *o;
-#else
        struct rtattr *mx;
-#endif
 
        nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
        r = NLMSG_DATA(nlh);
@@ -1503,11+1497,6 @@ static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int
        r->rtm_scope = RT_SCOPE_UNIVERSE;
        r->rtm_protocol = RTPROT_UNSPEC;
        r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED;
-#ifdef CONFIG_RTNL_OLD_IFINFO
-       r->rtm_nhs = 0;
-
-       o = skb->tail;
-#endif
        RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
        if (rt->key.src) {
                r->rtm_src_len = 32;
@@ -1521,11+1510,6 @@ static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int
                RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
        if (rt->rt_dst != rt->rt_gateway)
                RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
-#ifdef CONFIG_RTNL_OLD_IFINFO
-       RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &rt->u.dst.pmtu);
-       RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &rt->u.dst.window);
-       RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &rt->u.dst.rtt);
-#else
        mx = (struct rtattr*)skb->tail;
        RTA_PUT(skb, RTA_METRICS, 0, NULL);
        if (rt->u.dst.mxlock)
@@ -1539,7+1523,6 @@ static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int
        mx->rta_len = skb->tail - (u8*)mx;
        if (mx->rta_len == RTA_LENGTH(0))
                skb_trim(skb, (u8*)mx - skb->data);
-#endif
        ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
        ci.rta_used = atomic_read(&rt->u.dst.refcnt);
        ci.rta_clntref = atomic_read(&rt->u.dst.use);
@@ -1549,9+1532,6 @@ static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int
        eptr = (struct rtattr*)skb->tail;
 #endif
        RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
-#ifdef CONFIG_RTNL_OLD_IFINFO
-       r->rtm_optlen = skb->tail - o;
-#endif
        if (rt->key.iif) {
 #ifdef CONFIG_IP_MROUTE
                u32 dst = rt->rt_dst;
@@ -1573,9+1553,6 @@ static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int
 #endif
                {
                        RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
-#ifdef CONFIG_RTNL_OLD_IFINFO
-                       r->rtm_optlen = skb->tail - o;
-#endif
                }
        }
 
index c62dd19..4eeecac 100644 (file)
@@ -5,7+5,7 @@
  *
  *             Implementation of the Transmission Control Protocol(TCP).
  *
- * Version:    $Id: tcp.c,v 1.116 1998/07/26 03:06:54 davem Exp $
+ * Version:    $Id: tcp.c,v 1.119 1998/08/26 12:04:14 davem Exp $
  *
  * Authors:    Ross Biro, <bir7@leland.Stanford.Edu>
  *             Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  *                                     tcp_do_sendmsg to avoid burstiness.
  *             Eric Schenk     :       Fix fast close down bug with
  *                                     shutdown() followed by close().
+ *             Andi Kleen :    Make poll agree with SIGIO
  *                                     
  *             This program is free software; you can redistribute it and/or
  *             modify it under the terms of the GNU General Public License
  *
  * ICMP messages (4.2.3.9)
  *   MUST act on ICMP errors. (does)
- *   MUST slow transmission upon receipt of a Source Quench. (does)
+ *   MUST slow transmission upon receipt of a Source Quench. (doesn't anymore 
+ *   because that is deprecated now by the IETF, can be turned on)
  *   MUST NOT abort connection upon receipt of soft Destination
  *     Unreachables (0, 1, 5), Time Exceededs and Parameter
  *     Problems. (doesn't)
  *   SHOULD report soft Destination Unreachables etc. to the
- *     application. (does, but may drop them in the ICMP error handler
- *     during an accept())
+ *     application. (does, except during SYN_RECV and may drop messages
+ *     in some rare cases before accept() - ICMP is unreliable)        
  *   SHOULD abort connection upon receipt of hard Destination Unreachable
  *     messages (2, 3, 4). (does, but see above)
  *
  *   MUST reject as an error OPEN for invalid remote IP address. (does)
  *   MUST ignore SYN with invalid source address. (does)
  *   MUST silently discard incoming SYN for broadcast/multicast
- *     address. (I'm not sure if it does. Someone should check this.)
+ *     address. (does)
  *
  * Asynchronous Reports (4.2.4.1)
  * MUST provide mechanism for reporting soft errors to application
@@ -537,6+539,21 @@ static unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
 }
 
 /*
+ *     Compute minimal free write space needed to queue new packets. 
+ */
+static inline int tcp_min_write_space(struct sock *sk, struct tcp_opt *tp)
+{
+       int space;
+#if 1 /* This needs benchmarking and real world tests */
+       space = max(tp->mss_cache + 128, MIN_WRITE_SPACE);
+#else /* 2.0 way */
+       /* More than half of the socket queue free? */
+       space = atomic_read(&sk->wmem_alloc) / 2;
+#endif
+       return space;
+}
+
+/*
  *     Wait for a TCP event.
  *
  *     Note that we don't need to lock the socket, as the upper poll layers
@@ -557,9+574,7 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
        if (sk->err)
                mask = POLLERR;
        /* Connected? */
-       if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
-               int space;
-
+       if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE)) {
                if (sk->shutdown & RCV_SHUTDOWN)
                        mask |= POLLHUP;
                
@@ -569,23+584,31 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
                     sk->urginline || !tp->urg_data))
                        mask |= POLLIN | POLLRDNORM;
 
-#if 1 /* This needs benchmarking and real world tests */
-               space = (sk->dst_cache ? sk->dst_cache->pmtu : sk->mss) + 128;
-               if (space < 2048) /* XXX */
-                       space = 2048;
-#else /* 2.0 way */
-               /* More than half of the socket queue free? */
-               space = atomic_read(&sk->wmem_alloc) / 2;
-#endif
                /* Always wake the user up when an error occurred */
-               if (sock_wspace(sk) >= space || sk->err)
+               if (sock_wspace(sk) >= tcp_min_write_space(sk, tp) || sk->err)
                        mask |= POLLOUT | POLLWRNORM;
                if (tp->urg_data & URG_VALID)
-                       mask |= POLLPRI;
+                       mask |= POLLPRI;
        }
        return mask;
 }
 
+/*
+ *     Socket write_space callback.
+ *     This (or rather the sock_wake_async) should agree with poll. 
+ */
+void tcp_write_space(struct sock *sk)
+{
+       if (sk->dead)
+               return; 
+
+       wake_up_interruptible(sk->sleep);
+       if (sock_wspace(sk) >=
+           tcp_min_write_space(sk, &(sk->tp_pinfo.af_tcp)))
+               sock_wake_async(sk->socket, 2);
+}
+
+
 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 {
        int answ;
@@ -1025,7+1048,7 @@ static void cleanup_rbuf(struct sock *sk, int copied)
                 * which don't advertize a larger window.
                 */
                if((copied >= rcv_window_now) &&
-                  ((rcv_window_now + sk->mss) <= tp->window_clamp))
+                  ((rcv_window_now + tp->mss_cache) <= tp->window_clamp))
                        tcp_read_wakeup(sk);
        }
 }
@@ -1543,16+1566,18 @@ struct sock *tcp_accept(struct sock *sk, int flags)
 
        tcp_synq_unlink(tp, req, prev);
        newsk = req->sk;
+       req->class->destructor(req);
        tcp_openreq_free(req);
        sk->ack_backlog--; 
 
-       /* FIXME: need to check here if newsk has already
-        * an soft_err or err set.
-        * We have two options here then: reply (this behaviour matches
-        * Solaris) or return the error to the application (old Linux)
-        */
+       /*
+        * This does not pass any already set errors on the new socket
+        * to the user, but they will be returned on the first socket operation
+        * after the accept.
+        */ 
+
        error = 0;
- out:
+out:
        release_sock(sk);
        sk->err = error;
        return newsk;
@@ -1586,7+1611,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
  */
                        if(val<1||val>MAX_WINDOW)
                                return -EINVAL;
-                       sk->user_mss=val;
+                       tp->user_mss=val;
                        return 0;
                case TCP_NODELAY:
                        sk->nonagle=(val==0)?0:1;
@@ -1614,7+1639,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
 
        switch(optname) {
                case TCP_MAXSEG:
-                       val=sk->user_mss;
+                       val=tp->user_mss;
                        break;
                case TCP_NODELAY:
                        val=sk->nonagle;
@@ -1640,7+1665,7 @@ void tcp_set_keepalive(struct sock *sk, int val)
 
 extern void __skb_cb_too_small_for_tcp(int, int);
 
-__initfunc(void tcp_init(void))
+void __init tcp_init(void)
 {
        struct sk_buff *skb = NULL;
 
index a4ad2dc..118fa42 100644 (file)
@@ -5,7+5,7 @@
  *
  *             Implementation of the Transmission Control Protocol(TCP).
  *
- * Version:    $Id: tcp_input.c,v 1.121 1998/07/15 04:39:12 davem Exp $
+ * Version:    $Id: tcp_input.c,v 1.127 1998/08/26 12:04:20 davem Exp $
  *
  * Authors:    Ross Biro, <bir7@leland.Stanford.Edu>
  *             Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  *             Andi Kleen:             Make sure we never ack data there is not
  *                                     enough room for. Also make this condition
  *                                     a fatal error if it might still happen.
+ *             Andi Kleen:             Add tcp_measure_rcv_mss to make 
+ *                                     connections with MSS<min(MTU,ann. MSS)
+ *                                     work without delayed acks. 
  */
 
 #include <linux/config.h>
@@ -214,7+217,7 @@ extern __inline__ void tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp
 
 #define PAWS_24DAYS    (HZ * 60 * 60 * 24 * 24)
 
-extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, __u16 len)
+extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, unsigned len)
 {
        /* ts_recent must be younger than 24 days */
        return (((jiffies - tp->ts_recent_stamp) >= PAWS_24DAYS) ||
@@ -346,9+349,11 @@ void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, i
                                switch(opcode) {
                                case TCPOPT_MSS:
                                        if(opsize==TCPOLEN_MSS && th->syn) {
-                                               tp->in_mss = ntohs(*(__u16 *)ptr);
-                                               if (tp->in_mss == 0)
-                                                       tp->in_mss = 536;
+                                               u16 in_mss = ntohs(*(__u16 *)ptr);
+                                               if (in_mss == 0)
+                                                       in_mss = 536;
+                                               if (tp->mss_clamp > in_mss)
+                                                       tp->mss_clamp = in_mss;
                                        }
                                        break;
                                case TCPOPT_WINDOW:
@@ -863,7+868,7 @@ void tcp_timewait_kill(struct tcp_tw_bucket *tw)
  * reconnects and SYN/RST bits being set in the TCP header.
  */
 int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
-                              struct tcphdr *th, void *opt, __u16 len)
+                              struct tcphdr *th, unsigned len)
 {
        /*      RFC 1122:
         *      "When a connection is [...] on TIME-WAIT state [...]
@@ -893,7+898,7 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
                        return 0;
                skb_set_owner_r(skb, sk);
                af_specific = sk->tp_pinfo.af_tcp.af_specific;
-               if(af_specific->conn_request(sk, skb, opt, isn) < 0)
+               if(af_specific->conn_request(sk, skb, isn) < 0)
                        return 1; /* Toss a reset back. */
                return 0; /* Discard the frame. */
        }
@@ -1309,7+1314,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
                        tp->delayed_acks++;
 
                        /* Tiny-grams with PSH set make us ACK quickly. */
-                       if(skb->h.th->psh && (skb->len < (sk->mss >> 1)))
+                       if(skb->h.th->psh && (skb->len < (tp->mss_cache >> 1)))
                                tp->ato = HZ/50;
                }
                /* This may have eaten into a SACK block. */
@@ -1429,7+1434,6 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
                }
        }
 
-       /* We no longer have anyone receiving data on this connection. */
        tcp_data_queue(sk, skb);
 
        if (before(tp->rcv_nxt, tp->copied_seq)) {
@@ -1464,6+1468,26 @@ static void tcp_data_snd_check(struct sock *sk)
        }
 }
 
+/* 
+ * Estimate the peer's segment size from the lengths of received skbs.
+ * The result, tp->rcv_mss, is the unit the delayed ack check counts in.
+ */ 
+static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb)
+{
+       struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+       unsigned int len = skb->len, lss; 
+
+       if (len > tp->rcv_mss)  /* grow to the largest length seen so far */
+               tp->rcv_mss = len; 
+       lss = tp->last_seg_size;  /* sample left by the previous measurement, or 0 */
+       tp->last_seg_size = 0; 
+       if (len >= 536) {       /* shorter segments are never kept as samples */
+               if (len == lss)  /* same length twice in a row: adopt it, even if smaller */
+                       tp->rcv_mss = len; 
+               tp->last_seg_size = len; 
+       }
+}
+
 /*
  * Check if sending an ack is needed.
  */
@@ -1486,7+1510,7 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk)
         */
 
            /* Two full frames received or... */
-       if (((tp->rcv_nxt - tp->rcv_wup) >= sk->mss * MAX_DELAY_ACK) ||
+       if (((tp->rcv_nxt - tp->rcv_wup) >= tp->rcv_mss * MAX_DELAY_ACK) ||
            /* We will update the window "significantly" or... */
            tcp_raise_window(sk) ||
            /* We entered "quick ACK" mode or... */
@@ -1595,11+1619,14 @@ static int prune_queue(struct sock *sk)
 
        SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
 
+       net_statistics.PruneCalled++; 
+
        /* First Clean the out_of_order queue. */
        /* Start with the end because there are probably the least
         * useful packets (crossing fingers).
         */
        while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue))) { 
+               net_statistics.OfoPruned += skb->len; 
                kfree_skb(skb);
                if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf)
                        return 0;
@@ -1620,6+1647,9 @@ static int prune_queue(struct sock *sk)
                                   tp->last_ack_sent);
                        return -1;
                }
+
+               net_statistics.RcvPruned += skb->len; 
+
                __skb_unlink(skb, skb->list);
                tp->rcv_nxt = TCP_SKB_CB(skb)->seq;
                SOCK_DEBUG(sk, "prune_queue: removing %x-%x (c=%x)\n",
@@ -1633,7+1663,7 @@ static int prune_queue(struct sock *sk)
 }
 
 int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
-                       struct tcphdr *th, __u16 len)
+                       struct tcphdr *th, unsigned len)
 {
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        int queued = 0;
@@ -1704,7+1734,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                                goto discard;
                        }
                        
-                       skb_pull(skb,th->doff*4);
+                       __skb_pull(skb,th->doff*4);
+
+                       tcp_measure_rcv_mss(sk, skb); 
 
                        /* DO NOT notify forward progress here.
                         * It saves dozen of CPU instructions in fast path. --ANK
@@ -1719,7+1751,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                        tcp_delack_estimator(tp);
 
                        /* Tiny-grams with PSH set make us ACK quickly. */
-                       if(th->psh && (skb->len < (sk->mss >> 1)))
+                       if(th->psh && (skb->len < (tp->mss_cache >> 1)))
                                tp->ato = HZ/50;
 
                        tp->delayed_acks++;
@@ -1767,6+1799,25 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
        /* step 7: process the segment text */
        queued = tcp_data(skb, sk, len);
 
+       /* This must be after tcp_data() does the skb_pull() to
+        * remove the header size from skb->len.
+        *
+        * Dave!!! Phrase above (and all about rcv_mss) has 
+        * nothing to do with reality. rcv_mss must measure TOTAL
+        * size, including sacks, IP options etc. Hence, measure_rcv_mss
+        * must occur before pulling etc, otherwise it will flap
+        * like hell. Even putting it before tcp_data is wrong,
+        * it should use skb->tail - skb->nh.raw instead.
+        *                                      --ANK (980805)
+        * 
+        * BTW I broke it. Now all TCP options are handled equally
+        * in mss_clamp calculations (i.e. ignored, rfc1122),
+        * and mss_cache does include all of them (i.e. tstamps)
+        * except for sacks, to calculate effective mss faster.
+        *                                      --ANK (980805)
+        */
+       tcp_measure_rcv_mss(sk, skb); 
+
        /* Be careful, tcp_data() may have put this into TIME_WAIT. */
        if(sk->state != TCP_CLOSE) {
                tcp_data_snd_check(sk);
@@ -1853,7+1904,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
  */
        
 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
-                         struct tcphdr *th, void *opt, __u16 len)
+                         struct tcphdr *th, unsigned len)
 {
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        int queued = 0;
@@ -1868,7+1919,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                        return 1;
                
                if(th->syn) {
-                       if(tp->af_specific->conn_request(sk, skb, opt, 0) < 0)
+                       if(tp->af_specific->conn_request(sk, skb, 0) < 0)
                                return 1;
 
                        /* Now we have several options: In theory there is 
@@ -1961,28+2012,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                        /* Can't be earlier, doff would be wrong. */
                        tcp_send_ack(sk);
 
-                       /* Check for the case where we tried to advertise
-                        * a window including timestamp options, but did not
-                        * end up using them for this connection.
-                        */
-                       if((tp->tstamp_ok == 0) && sysctl_tcp_timestamps)
-                               sk->mss += TCPOLEN_TSTAMP_ALIGNED;
-                       
-                       /* Now limit it if the other end negotiated a smaller
-                        * value.
-                        */
-                       if (tp->in_mss) {
-                               int real_mss = tp->in_mss;
-
-                               /* We store MSS locally with the timestamp bytes
-                                * subtracted, TCP's advertise it with them
-                                * included.  Account for this fact.
-                                */
-                               if(tp->tstamp_ok)
-                                       real_mss -= TCPOLEN_TSTAMP_ALIGNED;
-                               sk->mss = min(sk->mss, real_mss);
-                       }
-
                        sk->dport = th->source;
                        tp->copied_seq = tp->rcv_nxt;
 
@@ -1990,9+2019,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                                sk->state_change(sk);
                                sock_wake_async(sk->socket, 0);
                        }
-
-                       /* Drop through step 6 */
-                       goto step6;
                } else {
                        if(th->syn && !th->rst) {
                                /* The previous version of the code
@@ -2017,11+2043,20 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                                tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
                                
                                tcp_send_synack(sk);
-                               goto discard;
-                       }               
-
+                       } else
+                               break; 
                }
-               break;
+
+               /* tp->tcp_header_len and tp->mss_clamp
+                  probably changed, synchronize mss.
+                  */
+               tcp_sync_mss(sk, tp->pmtu_cookie);
+               tp->rcv_mss = tp->mss_cache;
+
+               if (sk->state == TCP_SYN_RECV)
+                       goto discard;
+               
+               goto step6; 
        }
 
        /*   Parse the tcp_options present on this header.
@@ -2167,6+2202,11 @@ step6:
                
        case TCP_ESTABLISHED: 
                queued = tcp_data(skb, sk, len);
+
+               /* This must be after tcp_data() does the skb_pull() to
+                * remove the header size from skb->len.
+                */
+               tcp_measure_rcv_mss(sk, skb); 
                break;
        }
 
index 0255948..bf3fb24 100644 (file)
@@ -5,7+5,7 @@
  *
  *             Implementation of the Transmission Control Protocol(TCP).
  *
- * Version:    $Id: tcp_ipv4.c,v 1.150 1998/07/28 17:45:07 freitag Exp $
+ * Version:    $Id: tcp_ipv4.c,v 1.157 1998/08/28 00:27:47 davem Exp $
  *
  *             IPv4 specific functions
  *
  *             Andi Kleen:             various fixes.
  *     Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  *     Andi Kleen              :       Fix new listen.
+ *     Andi Kleen              :       Fix accept error reporting.
  */
 
 #include <linux/config.h>
@@ -140,7+141,8 @@ void tcp_bucket_unlock(struct sock *sk)
                if(tb->port == snum) {
                        if(tb->owners == NULL &&
                           (tb->flags & TCPB_FLAG_LOCKED)) {
-                               tb->flags &= ~TCPB_FLAG_LOCKED;
+                               tb->flags &= ~(TCPB_FLAG_LOCKED |
+                                              TCPB_FLAG_FASTREUSE);
                                tcp_inc_slow_timer(TCP_SLT_BUCKETGC);
                        }
                        break;
@@ -208,7+210,7 @@ static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum)
 
                        /* We must walk the whole port owner list in this case. -DaveM */
                        for(sk2 = tb->owners; sk2; sk2 = sk2->bind_next) {
-                               if(sk->bound_dev_if == sk2->bound_dev_if) {
+                               if (sk->bound_dev_if == sk2->bound_dev_if) {
                                        if(!sk_reuse || !sk2->reuse || sk2->state == TCP_LISTEN) {
                                                if(!sk2->rcv_saddr              ||
                                                   !sk->rcv_saddr               ||
@@ -223,16+225,33 @@ static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum)
        }
        if(result == 0) {
                if(tb == NULL) {
-                       if(tcp_bucket_create(snum) == NULL)
+                       if((tb = tcp_bucket_create(snum)) == NULL)
                                result = 1;
+                       else if (sk->reuse && sk->state != TCP_LISTEN)
+                               tb->flags |= TCPB_FLAG_FASTREUSE;
                } else {
                        /* It could be pending garbage collection, this
                         * kills the race and prevents it from disappearing
                         * out from under us by the time we use it.  -DaveM
                         */
-                       if(tb->owners == NULL && !(tb->flags & TCPB_FLAG_LOCKED)) {
-                               tb->flags = TCPB_FLAG_LOCKED;
-                               tcp_dec_slow_timer(TCP_SLT_BUCKETGC);
+                       if(tb->owners == NULL) {
+                               if (!(tb->flags & TCPB_FLAG_LOCKED)) {
+                                       tb->flags = (TCPB_FLAG_LOCKED |
+                                                    ((sk->reuse &&
+                                                      sk->state != TCP_LISTEN) ?
+                                                     TCPB_FLAG_FASTREUSE : 0));
+                                       tcp_dec_slow_timer(TCP_SLT_BUCKETGC);
+                               } else if (!(tb->flags & TCPB_FLAG_GOODSOCKNUM)) {
+                                       /* Someone is in between the bind
+                                        * and the actual connect or listen.
+                                        * See if it was a legitimate reuse
+                                        * and we are as well, else punt.
+                                        */
+                                       if (sk->reuse == 0 ||
+                                           !(tb->flags & TCPB_FLAG_FASTREUSE))
+                                               result = 1;
+                               } else
+                                       tb->flags &= ~TCPB_FLAG_GOODSOCKNUM;
                        }
                }
        }
@@ -264,8+283,11 @@ unsigned short tcp_good_socknum(void)
        next:
        } while(--remaining > 0);
        tcp_port_rover = rover;
-       if((remaining <= 0) || (tcp_bucket_create(rover) == NULL))
+       tb = NULL;
+       if((remaining <= 0) || ((tb = tcp_bucket_create(rover)) == NULL))
                rover = 0;
+       if (tb != NULL)
+               tb->flags |= TCPB_FLAG_GOODSOCKNUM;
        SOCKHASH_UNLOCK();
 
        return rover;
@@ -543,8+565,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
        struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
        struct sk_buff *buff;
        struct rtable *rt;
+       u32 daddr, nexthop;
        int tmp;
-       int mss;
 
        if (sk->state != TCP_CLOSE) 
                return(-EISCONN);
@@ -564,7+586,14 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
                        printk(KERN_DEBUG "%s forgot to set AF_INET in " __FUNCTION__ "\n", current->comm);
        }
 
-       tmp = ip_route_connect(&rt, usin->sin_addr.s_addr, sk->saddr,
+       nexthop = daddr = usin->sin_addr.s_addr;
+       if (sk->opt && sk->opt->srr) {
+               if (daddr == 0)
+                       return -EINVAL;
+               nexthop = sk->opt->faddr;
+       }
+
+       tmp = ip_route_connect(&rt, nexthop, sk->saddr,
                               RT_TOS(sk->ip_tos)|sk->localroute, sk->bound_dev_if);
        if (tmp < 0)
                return tmp;
@@ -592,6+621,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
         */
        sk->dport = usin->sin_port;
        sk->daddr = rt->rt_dst;
+       if (sk->opt && sk->opt->srr)
+               sk->daddr = daddr;
        if (!sk->saddr)
                sk->saddr = rt->rt_src;
        sk->rcv_saddr = sk->saddr;
@@ -601,22+632,28 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
                return -EADDRNOTAVAIL;
        }
 
-       sk->mtu = rt->u.dst.pmtu;
-       if ((sk->ip_pmtudisc == IP_PMTUDISC_DONT ||
-            (sk->ip_pmtudisc == IP_PMTUDISC_WANT &&
-             (rt->u.dst.mxlock&(1<<RTAX_MTU)))) &&
-           rt->u.dst.pmtu > 576 && rt->rt_dst != rt->rt_gateway)
-               sk->mtu = 576;
+       tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
+                                                  sk->sport, usin->sin_port);
 
-       if (sk->mtu < 64)
-               sk->mtu = 64;   /* Sanity limit */
+       tp->ext_header_len = 0;
+       if (sk->opt)
+               tp->ext_header_len = sk->opt->optlen;
 
-       mss = sk->mtu - sizeof(struct iphdr);
+       /* Reset mss clamp */
+       tp->mss_clamp = ~0;
 
-       tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
-                                                  sk->sport, usin->sin_port);
+       if ((sk->ip_pmtudisc == IP_PMTUDISC_DONT ||
+            (sk->ip_pmtudisc == IP_PMTUDISC_WANT &&
+             (rt->u.dst.mxlock&(1<<RTAX_MTU)))) &&
+           rt->u.dst.pmtu > 576 && rt->rt_dst != rt->rt_gateway) {
+               /* Clamp mss to the larger of 536 and user_mss.
+                  Presumably the user asked to override the tiny
+                  segment size in the gatewayed case.
+                */
+               tp->mss_clamp = max(tp->user_mss, 536);
+       }
 
-       tcp_connect(sk, buff, mss);
+       tcp_connect(sk, buff, rt->u.dst.pmtu);
        return 0;
 }
 
@@ -694,7+731,6 @@ static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
  */
 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip)
 {
-       int new_mtu; 
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 
        /* Don't interested in TCP_LISTEN and open_requests (SYN-ACKs
@@ -711,21+747,19 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip)
         * route, but I think that's acceptable.
         */
        if (sk->ip_pmtudisc != IP_PMTUDISC_DONT && sk->dst_cache) {
-               new_mtu = sk->dst_cache->pmtu - 
-                       (ip->ihl<<2) - tp->tcp_header_len; 
-               if (new_mtu < sk->mss && new_mtu > 0) {
-                       sk->mss = new_mtu;
+               if (tp->pmtu_cookie > sk->dst_cache->pmtu &&
+                   !atomic_read(&sk->sock_readers)) {
+                       lock_sock(sk); 
+                       tcp_sync_mss(sk, sk->dst_cache->pmtu);
+
                        /* Resend the TCP packet because it's  
                         * clear that the old packet has been
                         * dropped. This is the new "fast" path mtu
                         * discovery.
                         */
-                       if (!atomic_read(&sk->sock_readers)) {
-                               lock_sock(sk); 
-                               tcp_simple_retransmit(sk);
-                               release_sock(sk);
-                       } /* else let the usual retransmit timer handle it */
-               }
+                       tcp_simple_retransmit(sk);
+                       release_sock(sk);
+               } /* else let the usual retransmit timer handle it */
        }
 }
 
@@ -821,8+855,15 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
                        return;
                }
 
+               /* The final ACK of the handshake should be already 
+                * handled in the new socket context, not here.
+                * Strictly speaking - an ICMP error for the final
+                * ACK should set the opening flag, but that is too
+                * complicated right now. 
+                */ 
                if (!th->syn && !th->ack)
                        return;
+
                req = tcp_v4_search_req(tp, iph, th, &prev); 
                if (!req)
                        return;
@@ -833,17+874,33 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
                                       seq, req->snt_isn);
                        return;
                }
-               if (req->sk) {  /* not yet accept()ed */
-                       sk = req->sk; /* report error in accept */
+               if (req->sk) {  
+                       /* 
+                        * Already in ESTABLISHED and a big socket is created,
+                        * set error code there.
+                        * The error will _not_ be reported in the accept(),
+                        * but only with the next operation on the socket after
+                        * accept. 
+                        */
+                       sk = req->sk;
                } else {
+                       /* 
+                        * Still in SYN_RECV, just remove it silently.
+                        * There is no good way to pass the error to the newly
+                        * created socket, and POSIX does not want network
+                        * errors returned from accept(). 
+                        */ 
                        tp->syn_backlog--;
                        tcp_synq_unlink(tp, req, prev);
                        req->class->destructor(req);
                        tcp_openreq_free(req);
+                       return; 
                }
-               /* FALL THOUGH */
+               break;
        case TCP_SYN_SENT:
        case TCP_SYN_RECV: 
+               if (!th->syn)
+                       return; 
                opening = 1; 
                break;
        }
@@ -855,10+912,13 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
                        tcp_statistics.TcpAttemptFails++;
                        if (sk->state != TCP_LISTEN)
                                tcp_set_state(sk,TCP_CLOSE);
+                       mb(); 
                        sk->error_report(sk);           /* Wake people up to see the error (see connect in sock.c) */
                }
-       } else  /* Only an error on timeout */
+       } else  { /* Only an error on timeout; braces keep mb() inside the else */
                sk->err_soft = icmp_err_convert[code].errno;
+               mb(); 
+       }
 }
 
 /* This routine computes an IPv4 TCP checksum. */
@@ -916,7+976,7 @@ static void tcp_v4_send_reset(struct sk_buff *skb)
                                      IPPROTO_TCP,
                                      0); 
        arg.n_iov = 1;
-       arg.csumoffset = offsetof(struct tcphdr, check) / sizeof(u16);
+       arg.csumoffset = offsetof(struct tcphdr, check) / 2; /* offset counted in 16-bit words */
 
        ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
 
@@ -950,6+1010,11 @@ int tcp_chkaddr(struct sk_buff *skb)
 }
 #endif
 
+/*
+ *     Send a SYN-ACK after having received a SYN. 
+ *     This still operates on an open_request only, not on a big
+ *     socket.
+ */ 
 static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
 {
        struct rtable *rt;
@@ -974,7+1039,7 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
                return;
        }
 
-       mss = (rt->u.dst.pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
+       mss = rt->u.dst.pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
 
        skb = tcp_make_synack(sk, &rt->u.dst, req, mss);
        if (skb) {
@@ -994,6+1059,9 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
        ip_rt_put(rt);
 }
 
+/*
+ *     IPv4 open_request destructor.
+ */ 
 static void tcp_v4_or_free(struct open_request *req)
 {
        if(!req->sk && req->af.v4_req.opt)
@@ -1016,9+1084,9 @@ static inline void syn_flood_warning(struct sk_buff *skb)
  * Save and compile IPv4 options into the open_request if needed. 
  */
 static inline struct ip_options * 
-tcp_v4_save_options(struct sock *sk, struct sk_buff *skb, 
-                   struct ip_options *opt)
+tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
 {
+       struct ip_options *opt = &(IPCB(skb)->opt);
        struct ip_options *dopt = NULL; 
 
        if (opt && opt->optlen) {
@@ -1052,8+1120,7 @@ struct or_calltable or_ipv4 = {
 #define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */
 #define BACKLOGMAX(sk) sysctl_max_syn_backlog
 
-int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, 
-                                               __u32 isn)
+int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, __u32 isn)
 {
        struct tcp_opt tp;
        struct open_request *req;
@@ -1070,6+1137,11 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
        if (sk->dead) 
                goto dead; 
 
+       /* Never answer SYNs sent to broadcast or multicast */
+       if (((struct rtable *)skb->dst)->rt_flags & 
+           (RTCF_BROADCAST|RTCF_MULTICAST))
+               goto drop; 
+
        /* XXX: Check against a global syn pool counter. */
        if (BACKLOG(sk) > BACKLOGMAX(sk)) {
 #ifdef CONFIG_SYN_COOKIES
@@ -1094,13+1166,18 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
 
        req->rcv_isn = TCP_SKB_CB(skb)->seq;
        tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
-       tp.in_mss = 536;
+
+       tp.mss_clamp = 65535;
        tcp_parse_options(NULL, th, &tp, want_cookie);
-       req->mss = tp.in_mss;
-       if (tp.saw_tstamp) {
-               req->mss -= TCPOLEN_TSTAMP_ALIGNED;
+       if (tp.mss_clamp == 65535)      /* no smaller MSS was parsed from the SYN: use 576 minus headers */
+               tp.mss_clamp = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
+
+       if (sk->tp_pinfo.af_tcp.user_mss && sk->tp_pinfo.af_tcp.user_mss < tp.mss_clamp)
+               tp.mss_clamp = sk->tp_pinfo.af_tcp.user_mss;
+       req->mss = tp.mss_clamp;
+
+       if (tp.saw_tstamp)
                req->ts_recent = tp.rcv_tsval;
-       }
        req->tstamp_ok = tp.tstamp_ok;
        req->sack_ok = tp.sack_ok;
        req->snd_wscale = tp.snd_wscale;
@@ -1120,7+1197,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
 
        req->snt_isn = isn;
 
-       req->af.v4_req.opt = tcp_v4_save_options(sk, skb, ptr);
+       req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
 
        req->class = &or_ipv4;
        req->retrans = 0;
@@ -1139,7+1216,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
                tcp_synq_queue(&sk->tp_pinfo.af_tcp, req);
        }
 
-       sk->data_ready(sk, 0);
        return 0;
 
 dead:
@@ -1160,8+1236,7 @@ drop:
  *
  * This function wants to be moved to a common for IPv[46] file. --ANK
  */
-struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb,
-                                     int snd_mss)
+struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
 {
        struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0);
 
@@ -1180,6+1255,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
                skb_queue_head_init(&newsk->receive_queue);
                atomic_set(&newsk->wmem_alloc, 0);
                skb_queue_head_init(&newsk->write_queue);
+               atomic_set(&newsk->omem_alloc, 0);
 
                newsk->done = 0;
                newsk->proc = 0;
@@ -1231,7+1307,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
                newtp->copied_seq = req->rcv_isn + 1;
 
                newtp->saw_tstamp = 0;
-               newtp->in_mss = 536;
+               newtp->mss_clamp = req->mss;
 
                init_timer(&newtp->probe_timer);
                newtp->probe_timer.function = &tcp_probe_timer;
@@ -1242,12+1318,14 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
                newtp->urg_data = 0;
                tcp_synq_init(newtp);
                newtp->syn_backlog = 0;
+               if (skb->len >= 536)
+                       newtp->last_seg_size = skb->len; 
 
                /* Back to base struct sock members. */
                newsk->err = 0;
                newsk->ack_backlog = 0;
                newsk->max_ack_backlog = SOMAXCONN;
-               newsk->priority = 1;
+               newsk->priority = 0;
 
                /* IP layer stuff */
                newsk->timeout = 0;
@@ -1276,14+1354,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
                } else {
                        newtp->tcp_header_len = sizeof(struct tcphdr);
                }
-
-               snd_mss -= newtp->tcp_header_len;
-
-               if (sk->user_mss)
-                       snd_mss = min(snd_mss, sk->user_mss);
-
-               newsk->mss = min(req->mss, snd_mss);
-
        }
        return newsk;
 }
@@ -1299,8+1369,6 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
        struct ip_options *opt = req->af.v4_req.opt;
        struct tcp_opt *newtp;
        struct sock *newsk;
-       int snd_mss;
-       int mtu;
 
        if (sk->ack_backlog > sk->max_ack_backlog)
                goto exit; /* head drop */
@@ -1324,12+1392,7 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
                goto exit;
 #endif
 
-       mtu = dst->pmtu;
-       if (mtu < 68) /* XXX: we should turn pmtu disc off when this happens. */
-               mtu = 68;
-       snd_mss = mtu - sizeof(struct iphdr);
-
-       newsk = tcp_create_openreq_child(sk, req, skb, snd_mss);
+       newsk = tcp_create_openreq_child(sk, req, skb);
        if (!newsk) 
                goto exit;
 
@@ -1347,15+1410,22 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
        newsk->sport = req->lcl_port;
 #endif
        newsk->opt = req->af.v4_req.opt;
-       newsk->mtu = mtu;
-
-       if (newsk->rcvbuf < (3 * newsk->mtu))
-               newsk->rcvbuf = min ((3 * newsk->mtu), sysctl_rmem_max);
-       if (newsk->sndbuf < (3 * newsk->mtu))
-               newsk->sndbuf = min ((3 * newsk->mtu), sysctl_wmem_max);
+       newtp->ext_header_len = 0;
+       if (newsk->opt)
+               newtp->ext_header_len = newsk->opt->optlen;
+
+       tcp_sync_mss(newsk, dst->pmtu);
+       newtp->rcv_mss = newtp->mss_clamp;
+
+       /* It would be better to use newtp->mss_clamp here */
+       if (newsk->rcvbuf < (3 * newtp->pmtu_cookie))
+               newsk->rcvbuf = min ((3 * newtp->pmtu_cookie), sysctl_rmem_max);
+       if (newsk->sndbuf < (3 * newtp->pmtu_cookie))
+               newsk->sndbuf = min ((3 * newtp->pmtu_cookie), sysctl_wmem_max);
  
        tcp_v4_hash(newsk);
        add_to_prot_sklist(newsk);
+       sk->data_ready(sk, 0); /* Deliver SIGIO */ 
 
        return newsk;
 
@@ -1373,8+1443,8 @@ static void tcp_v4_rst_req(struct sock *sk, struct sk_buff *skb)
        if (!req)
                return;
        /* Sequence number check required by RFC793 */
-       if (before(TCP_SKB_CB(skb)->seq, req->snt_isn) ||
-           after(TCP_SKB_CB(skb)->seq, req->snt_isn+1))
+       if (before(TCP_SKB_CB(skb)->seq, req->rcv_isn) ||
+           after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
                return;
        tcp_synq_unlink(tp, req, prev);
        (req->sk ? sk->ack_backlog : tp->syn_backlog)--;
@@ -1461,7+1531,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
                sk = nsk;
        }
        
-       if (tcp_rcv_state_process(sk, skb, skb->h.th, &(IPCB(skb)->opt), skb->len))
+       if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
                goto reset;
        release_sock(sk); 
        return 0;
@@ -1559,7+1629,7 @@ discard_it:
 
 do_time_wait:
        if(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
-                                     skb, th, &(IPCB(skb)->opt), skb->len))
+                                     skb, th, skb->len))
                goto no_tcp_socket;
        goto discard_it;
 }
@@ -1665,6+1735,8 @@ struct tcp_func ipv4_specific = {
        tcp_v4_conn_request,
        tcp_v4_syn_recv_sock,
        tcp_v4_get_sock,
+       sizeof(struct iphdr),
+
        ip_setsockopt,
        ip_getsockopt,
        v4_addr2sockaddr,
@@ -1683,7+1755,7 @@ static int tcp_v4_init_sock(struct sock *sk)
 
        tp->rto  = TCP_TIMEOUT_INIT;            /*TCP_WRITE_TIME*/
        tp->mdev = TCP_TIMEOUT_INIT;
-       tp->in_mss = 536;
+       tp->mss_clamp = ~0;
       
        /* See draft-stevens-tcpca-spec-01 for discussion of the
         * initialization of these values.
@@ -1691,11+1763,11 @@ static int tcp_v4_init_sock(struct sock *sk)
        tp->snd_cwnd = (1 << TCP_CWND_SHIFT);
        tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
 
-       sk->priority = 1;
        sk->state = TCP_CLOSE;
        sk->max_ack_backlog = SOMAXCONN;
-       sk->mtu = 576;
-       sk->mss = 536;
+       tp->rcv_mss = 536; 
+
+       sk->write_space = tcp_write_space; 
 
        /* Init SYN queue. */
        tcp_synq_init(tp);
index 8453534..03696cb 100644 (file)
@@ -5,7+5,7 @@
  *
  *             Implementation of the Transmission Control Protocol(TCP).
  *
- * Version:    $Id: tcp_output.c,v 1.92 1998/06/19 13:22:44 davem Exp $
+ * Version:    $Id: tcp_output.c,v 1.93 1998/08/26 12:04:32 davem Exp $
  *
  * Authors:    Ross Biro, <bir7@leland.Stanford.Edu>
  *             Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -117,7+117,7 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
                         * is never scaled.
                         */
                        th->window      = htons(tp->rcv_wnd);
-                       tcp_syn_build_options((__u32 *)(th + 1), sk->mss,
+                       tcp_syn_build_options((__u32 *)(th + 1), tp->mss_clamp,
                                              sysctl_tcp_timestamps,
                                              sysctl_tcp_sack,
                                              sysctl_tcp_window_scaling,
@@ -227,6+227,65 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
        return 0;
 }
 
+/* Synchronize snd mss to current pmtu/exthdr set; returns new tp->mss_cache.
+
+   tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT account
+   for TCP options, but includes only bare TCP header.
+
+   tp->mss_clamp is mss negotiated at connection setup.
+   It is minimum of user_mss and mss received with SYN.
+   It also does not include TCP options.
+
+   tp->pmtu_cookie is last pmtu, seen by this function.
+
+   tp->mss_cache is current effective sending mss. It is evaluated
+   from current pmtu, clamped by tp->mss_clamp from above, reduced
+   by all tcp options except for SACKs and by tp->ext_header_len,
+   and finally clamped by 8 from below.
+
+   NOTE1. rfc1122 clearly states that advertised MSS
+   DOES NOT include either tcp or ip options.
+
+   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
+   this function.                      --ANK (980731)
+ */
+
+int tcp_sync_mss(struct sock *sk, u32 pmtu)
+{
+       struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+       int mss_now;
+
+       /* Calculate base mss without TCP options:
+          It is MMS_S - sizeof(tcphdr) of rfc1122
+       */
+       mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
+
+       /* Clamp it from above (mss_clamp does not include tcp options) */
+       if (mss_now > tp->mss_clamp)
+               mss_now = tp->mss_clamp;
+
+       /* Now subtract TCP options size, not including SACKs */
+       mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+
+       /* Now subtract extension header overhead (e.g. IP options length) */
+       mss_now -= tp->ext_header_len;
+
+       /* If we got too small (or even negative) value,
+          clamp it by 8 from below. Why 8 ?
+          Well, it could be 1 with the same success,
+          but if IP accepted segment of length 1,
+          it would love 8 even more 8)         --ANK (980731)
+        */
+       if (mss_now < 8)
+               mss_now = 8;
+
+       /* And store cached results: the pmtu we saw and the mss derived from it */
+       tp->pmtu_cookie = pmtu;
+       tp->mss_cache = mss_now;
+       return mss_now;
+}
+
+
 /* This routine writes packets to the network.  It advances the
  * send_head.  This happens as incoming acks open up the remote
  * window for us.
@@ -334,7+393,7 @@ void tcp_write_xmit(struct sock *sk)
 u32 __tcp_select_window(struct sock *sk, u32 cur_win)
 {
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
-       unsigned int mss = sk->mss;
+       unsigned int mss = tp->mss_cache;
        int free_space;
        u32 window;
 
@@ -624,7+683,7 @@ void tcp_send_fin(struct sock *sk)
                 */
                if(tp->send_head == skb &&
                   !sk->nonagle &&
-                  skb->len < (sk->mss >> 1) &&
+                  skb->len < (tp->mss_cache >> 1) &&
                   tp->packets_out &&
                   !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) {
                        update_send_head(sk);
@@ -738,20+797,15 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 
        skb->dst = dst_clone(dst);
 
-       if (sk->user_mss)
-               mss = min(mss, sk->user_mss);
-       if (req->tstamp_ok)
-               mss -= TCPOLEN_TSTAMP_ALIGNED;
-
        /* Don't offer more than they did.
         * This way we don't have to memorize who said what.
         * FIXME: maybe this should be changed for better performance
         * with syncookies.
         */
        req->mss = min(mss, req->mss);
-       if (req->mss < 1) {
-               printk(KERN_DEBUG "initial req->mss below 1\n");
-               req->mss = 1;
+       if (req->mss < 8) {
+               printk(KERN_DEBUG "initial req->mss below 8\n");
+               req->mss = 8;
        }
 
        tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
@@ -796,7+850,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
        return skb;
 }
 
-void tcp_connect(struct sock *sk, struct sk_buff *buff, int mss)
+void tcp_connect(struct sock *sk, struct sk_buff *buff, int mtu)
 {
        struct dst_entry *dst = sk->dst_cache;
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
@@ -804,9+858,6 @@ void tcp_connect(struct sock *sk, struct sk_buff *buff, int mss)
        /* Reserve space for headers. */
        skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
 
-       if (sk->priority == 0)
-               sk->priority = dst->priority;
-
        tp->snd_wnd = 0;
        tp->snd_wl1 = 0;
        tp->snd_wl2 = tp->write_seq;
@@ -821,17+872,25 @@ void tcp_connect(struct sock *sk, struct sk_buff *buff, int mss)
        tp->tcp_header_len = sizeof(struct tcphdr) +
                (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
 
-       mss -= tp->tcp_header_len;
-
-       if (sk->user_mss)
-               mss = min(mss, sk->user_mss);
-
-       if (mss < 1) {
-               printk(KERN_DEBUG "initial sk->mss below 1\n");
-               mss = 1;        /* Sanity limit */
-       }
-
-       sk->mss = mss;
+       /* If user gave his TCP_MAXSEG, record it to clamp */
+       if (tp->user_mss)
+               tp->mss_clamp = tp->user_mss;
+       tcp_sync_mss(sk, mtu);
+
+       /* Now an unpleasant action: if the initial pmtu is too low,
+          set a lower clamp. I am not sure that it is good.
+          To be more exact, I do not think that clamping at a value which
+          is apparently transient and may improve in future is a good idea.
+          It would be better to wait until the peer returns its MSS
+          (probably 65535 too) and for now advertise something like 65535
+          or at least the first hop device mtu. Is it clear what I mean?
+          We should tell the peer what maximal mss we expect to RECEIVE;
+          it has nothing to do with pmtu.
+          I am afraid someone will be confused by such a huge value.
+                                                          --ANK (980731)
+        */
+       if (tp->mss_cache + tp->tcp_header_len - sizeof(struct tcphdr) < tp->mss_clamp )
+               tp->mss_clamp = tp->mss_cache + tp->tcp_header_len - sizeof(struct tcphdr);
 
        TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
        TCP_SKB_CB(buff)->sacked = 0;
@@ -842,7+901,7 @@ void tcp_connect(struct sock *sk, struct sk_buff *buff, int mss)
        tp->snd_nxt = TCP_SKB_CB(buff)->end_seq;
 
        tp->window_clamp = dst->window;
-       tcp_select_initial_window(sock_rspace(sk)/2,sk->mss,
+       tcp_select_initial_window(sock_rspace(sk)/2,tp->mss_clamp,
                &tp->rcv_wnd,
                &tp->window_clamp,
                sysctl_tcp_window_scaling,
index ea6419f..b06a61e 100644 (file)
@@ -5,7+5,7 @@
  *
  *             The User Datagram Protocol (UDP).
  *
- * Version:    $Id: udp.c,v 1.57 1998/05/14 06:32:44 davem Exp $
+ * Version:    $Id: udp.c,v 1.59 1998/08/27 16:54:55 davem Exp $
  *
  * Authors:    Ross Biro, <bir7@leland.Stanford.Edu>
  *             Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  *     Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  *             Melvin Smith    :       Check msg_name not msg_namelen in sendto(),
  *                                     return ENOTCONN for unconnected sockets (POSIX)
+ *             Janos Farkas    :       don't deliver multi/broadcasts to a different
+ *                                     bound-to-device socket
  *
  *
  *             This program is free software; you can redistribute it and/or
@@ -447,7+449,8 @@ static inline struct sock *udp_v4_mcast_next(struct sock *sk,
                                             unsigned short num,
                                             unsigned long raddr,
                                             unsigned short rnum,
-                                            unsigned long laddr)
+                                            unsigned long laddr,
+                                            int dif)
 {
        struct sock *s = sk;
        unsigned short hnum = ntohs(num);
@@ -455,8+458,9 @@ static inline struct sock *udp_v4_mcast_next(struct sock *sk,
                if ((s->num != hnum)                                    ||
                    (s->dead && (s->state == TCP_CLOSE))                ||
                    (s->daddr && s->daddr!=raddr)                       ||
-                   (s->dport != rnum && s->dport != 0) ||
-                   (s->rcv_saddr  && s->rcv_saddr != laddr))
+                   (s->dport != rnum && s->dport != 0)                 ||
+                   (s->rcv_saddr  && s->rcv_saddr != laddr)            ||
+                   (s->bound_dev_if && s->bound_dev_if != dif))
                        continue;
                break;
        }
@@ -619,7+623,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
        struct ipcm_cookie ipc;
        struct udpfakehdr ufh;
        struct rtable *rt = NULL;
-       int free = 0, localroute = 0;
+       int free = 0;
+       int connected = 0;
        u32 daddr;
        u8  tos;
        int err;
@@ -683,18+688,21 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
                   It was main reason why I removed it from 2.1.
                   The second reason was that idle sockets held
                   a lot of stray destinations.         --ANK
+
+                  Look: route depends on ALL the options,
+                  checking its validity is exactly on cycle
+                  of ip_route_output(). We save only start_bh_atomic()
+                  in SMP case. On UP we save nothing. --ANK
                 */
        } else {
                if (sk->state != TCP_ESTABLISHED)
                        return -ENOTCONN;
                ufh.daddr = sk->daddr;
                ufh.uh.dest = sk->dport;
-
-               /*
-                  BUGGG Khm... And who will validate it? Fixing it fastly...
-                                                                       --ANK
+               /* Open fast path for connected socket.
+                  Route will not be used, if at least one option is set.
                 */
-               rt = (struct rtable *)dst_check(&sk->dst_cache, 0);
+               connected = 1;
        }
 #ifdef CONFIG_IP_TRANSPARENT_PROXY
        if (msg->msg_flags&MSG_PROXY) {
@@ -710,6+718,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
                ufh.uh.source = from->sin_port;
                if (ipc.addr == 0)
                        ipc.addr = sk->saddr;
+               connected = 0;
        } else
 #endif
        {
@@ -725,6+734,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
                        return err;
                if (ipc.opt)
                        free = 1;
+               connected = 0;
        }
        if (!ipc.opt)
                ipc.opt = sk->opt;
@@ -736,12+746,13 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
                if (!daddr)
                        return -EINVAL;
                daddr = ipc.opt->faddr;
+               connected = 0;
        }
        tos = RT_TOS(sk->ip_tos);
        if (sk->localroute || (msg->msg_flags&MSG_DONTROUTE) || 
            (ipc.opt && ipc.opt->is_strictroute)) {
                tos |= RTO_ONLINK;
-               rt = NULL; /* sorry */
+               connected = 0;
        }
 
        if (MULTICAST(daddr)) {
@@ -749,8+760,12 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
                        ipc.oif = sk->ip_mc_index;
                if (!ufh.saddr)
                        ufh.saddr = sk->ip_mc_addr;
+               connected = 0;
        }
 
+       if (connected)
+               rt = (struct rtable*)dst_clone(sk->dst_cache);
+
        if (rt == NULL) {
                err = ip_route_output(&rt, daddr, ufh.saddr,
 #ifdef CONFIG_IP_TRANSPARENT_PROXY
@@ -759,7+774,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
                         tos, ipc.oif);
                if (err) 
                        goto out;
-               localroute = 1;
 
                err = -EACCES;
                if (rt->rt_flags&RTCF_BROADCAST && !sk->broadcast) 
@@ -780,14+794,11 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
        /* (MAY) and it defaults to on (MUST).  Almost makes up for the */
        /* violation above. -- MS */
 
-       lock_sock(sk);
        err = ip_build_xmit(sk,sk->no_check ? udp_getfrag_nosum : udp_getfrag,
                            &ufh, ulen, &ipc, rt, msg->msg_flags);
-       release_sock(sk);
 
 out:
-       if (localroute)
-               ip_rt_put(rt);
+       ip_rt_put(rt);
        if (free)
                kfree(ipc.opt);
        if (!err) {
@@ -822,7+833,9 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
                        if (sk->state == TCP_LISTEN)
                                return(-EINVAL);
                        amount = 0;
-                       /* N.B. Is this interrupt safe?? */
+                       /* N.B. Is this interrupt safe??
+                          -> Yes. Interrupts do not remove skbs. --ANK (980725)
+                        */
                        skb = skb_peek(&sk->receive_queue);
                        if (skb != NULL) {
                                /*
@@ -841,6+854,43 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
        return(0);
 }
 
+#ifdef CONFIG_FILTER
+#undef CONFIG_UDP_DELAY_CSUM
+#endif
+
+#ifdef CONFIG_UDP_DELAY_CSUM
+
+/* Please read the comments in net/checksum.h and asm/checksum.h.
+
+   I commented out csum_partial_copy_to_user there because it did not
+   call verify_area. Now I even wonder how clever I was at that time 8)8)
+   Had I not done it, I would have stepped into this hole again.   --ANK
+ */
+
+#ifndef _HAVE_ARCH_COPY_AND_CSUM_TO_USER
+#ifdef __i386__
+static __inline__ /* i386 fallback: copy to a verify_area-checked user buffer while checksumming */
+unsigned int csum_and_copy_to_user (const char *src, char *dst,
+                                   int len, int sum, int *err_ptr)
+{
+       int *src_err_ptr=NULL; /* NOTE(review): presumably "no fault reporting for src" -- confirm in asm/checksum.h */
+
+       if (verify_area(VERIFY_WRITE, dst, len) == 0) /* dst range passed the write check */
+               return csum_partial_copy_generic(src, dst, len, sum, src_err_ptr, err_ptr);
+
+       if (len) /* check failed: flag the fault unless nothing was to be copied */
+               *err_ptr = -EFAULT;
+
+       return sum; /* nothing copied here: incoming sum is handed back unchanged */
+}
+#elif defined(__sparc__)
+#define csum_and_copy_to_user csum_partial_copy_to_user
+#else
+#undef CONFIG_UDP_DELAY_CSUM
+#endif
+#endif
+#endif
+
 
 /*
  *     This should be easy, if there is something there we
@@ -848,7+898,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
  */
 
 int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len,
-            int noblock, int flags, int *addr_len)
+               int noblock, int flags, int *addr_len)
 {
        struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
        struct sk_buff *skb;
@@ -880,18+930,47 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len,
                goto out;
   
        copied = skb->len - sizeof(struct udphdr);
-       if (copied > len)
-       {
+       if (copied > len) {
                copied = len;
                msg->msg_flags |= MSG_TRUNC;
        }
 
-       /*
-        *      FIXME : should use udp header size info value 
-        */
-        
+#ifndef CONFIG_UDP_DELAY_CSUM
        err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
                                        copied);
+#else
+       if (sk->no_check || skb->ip_summed==CHECKSUM_UNNECESSARY) {
+               err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
+                                             copied);
+       } else if (copied > msg->msg_iov[0].iov_len || (msg->msg_flags&MSG_TRUNC)) {
+               if (csum_fold(csum_partial(skb->h.raw, ntohs(skb->h.uh->len), skb->csum))) {
+                       udp_statistics.UdpInErrors++;
+
+                       /* Error for blocking case is chosen to masquerade
+                          as some normal condition.
+                        */
+                       err = (msg->msg_flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH;
+                       goto out_free;
+               }
+               err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
+                                             copied);
+       } else {
+               unsigned int csum = csum_partial(skb->h.raw, sizeof(struct udphdr), skb->csum);
+
+               err = 0;
+               csum = csum_and_copy_to_user((char*)&skb->h.uh[1], msg->msg_iov[0].iov_base, copied, csum, &err);
+               if (err)
+                       goto out_free;
+               if (csum_fold(csum)) {
+                       udp_statistics.UdpInErrors++;
+                       /* Error for blocking case is chosen to masquerade
+                          as some normal condition.
+                        */
+                       err = (msg->msg_flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH;
+                       goto out_free;
+               }
+       }
+#endif
        if (err)
                goto out_free;
        sk->stamp=skb->stamp;
@@ -986,13+1065,10 @@ int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 
 static void udp_close(struct sock *sk, unsigned long timeout)
 {
-       lock_sock(sk);
+       /* For an explanation see raw_close in ipv4/raw.c */
        sk->state = TCP_CLOSE;
-       if(uh_cache_sk == sk)
-               uh_cache_sk = NULL;
-       sk->dead = 1;
-       release_sock(sk);
        udp_v4_unhash(sk);
+       sk->dead = 1;
        destroy_sock(sk);
 }
 
@@ -1026,10+1102,6 @@ static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
 
 static inline void udp_deliver(struct sock *sk, struct sk_buff *skb)
 {
-       if (atomic_read(&sk->sock_readers)) {
-               __skb_queue_tail(&sk->back_log, skb);
-               return;
-       }
        udp_queue_rcv_skb(sk, skb);
 }
 
@@ -1043,9+1115,11 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh,
                                 u32 saddr, u32 daddr)
 {
        struct sock *sk;
+       int dif;
 
        sk = udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)];
-       sk = udp_v4_mcast_next(sk, uh->dest, saddr, uh->source, daddr);
+       dif = skb->dev->ifindex;
+       sk = udp_v4_mcast_next(sk, uh->dest, saddr, uh->source, daddr, dif);
        if (sk) {
                struct sock *sknext = NULL;
 
@@ -1053,7+1127,7 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh,
                        struct sk_buff *skb1 = skb;
 
                        sknext = udp_v4_mcast_next(sk->next, uh->dest, saddr,
-                                                  uh->source, daddr);
+                                                  uh->source, daddr, dif);
                        if(sknext)
                                skb1 = skb_clone(skb, GFP_ATOMIC);
 
@@ -1113,7+1187,8 @@ int udp_rcv(struct sk_buff *skb, unsigned short len)
         */
         
        uh = skb->h.uh;
-       
+       __skb_pull(skb, skb->h.raw - skb->data);
+
        ip_statistics.IpInDelivers++;
 
        /*
@@ -1121,18+1196,20 @@ int udp_rcv(struct sk_buff *skb, unsigned short len)
         */
         
        ulen = ntohs(uh->len);
-       
-       if (ulen > len || len < sizeof(*uh) || ulen < sizeof(*uh)) {
+
+       if (ulen > len || ulen < sizeof(*uh)) {
                NETDEBUG(printk(KERN_DEBUG "UDP: short packet: %d/%d\n", ulen, len));
                udp_statistics.UdpInErrors++;
                kfree_skb(skb);
                return(0);
        }
+       skb_trim(skb, ulen);
 
+#ifndef CONFIG_UDP_DELAY_CSUM
        if (uh->check &&
-           (((skb->ip_summed==CHECKSUM_HW)&&udp_check(uh,len,saddr,daddr,skb->csum)) ||
+           (((skb->ip_summed==CHECKSUM_HW)&&udp_check(uh,ulen,saddr,daddr,skb->csum)) ||
             ((skb->ip_summed==CHECKSUM_NONE) &&
-             (udp_check(uh,len,saddr,daddr, csum_partial((char*)uh, len, 0)))))) {
+             (udp_check(uh,ulen,saddr,daddr, csum_partial((char*)uh, ulen, 0)))))) {
                /* <mea@utu.fi> wants to know, who sent it, to
                   go and stomp on the garbage sender... */
 
@@ -1147,18+1224,29 @@ int udp_rcv(struct sk_buff *skb, unsigned short len)
                kfree_skb(skb);
                return(0);
        }
-
-
-       len = ulen;
-
-       /*
-        *      FIXME:
-        *      Trimming things wrongly. We must adjust the base/end to allow
-        *      for the headers we keep!
-        *               --ANK 
-        */
-       skb_trim(skb,len);
-
+#else
+       if (uh->check==0)
+               skb->ip_summed = CHECKSUM_UNNECESSARY;
+       else if (skb->ip_summed==CHECKSUM_HW) {
+               if (udp_check(uh,ulen,saddr,daddr,skb->csum)) {
+                       /* <mea@utu.fi> wants to know, who sent it, to
+                          go and stomp on the garbage sender... */
+
+                       /* RFC1122: OK.  Discards the bad packet silently (as far as */
+                       /* the network is concerned, anyway) as per 4.1.3.4 (MUST). */
+
+                       NETDEBUG(printk(KERN_DEBUG "UDP: bad checksum. From %08lX:%d to %08lX:%d ulen %d\n",
+                                       ntohl(saddr),ntohs(uh->source),
+                                       ntohl(daddr),ntohs(uh->dest),
+                                       ulen));
+                       udp_statistics.UdpInErrors++;
+                       kfree_skb(skb);
+                       return(0);
+               }
+               skb->ip_summed = CHECKSUM_UNNECESSARY;
+       } else if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+               skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
+#endif
 
        if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
                return udp_v4_mcast_deliver(skb, uh, saddr, daddr);
@@ -1173,6+1261,24 @@ int udp_rcv(struct sk_buff *skb, unsigned short len)
        sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex);
        
        if (sk == NULL) {
+#ifdef CONFIG_UDP_DELAY_CSUM
+               if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
+                   csum_fold(csum_partial((char*)uh, ulen, skb->csum))) {
+                       /* <mea@utu.fi> wants to know, who sent it, to
+                          go and stomp on the garbage sender... */
+
+                       /* RFC1122: OK.  Discards the bad packet silently (as far as */
+                       /* the network is concerned, anyway) as per 4.1.3.4 (MUST). */
+
+                       NETDEBUG(printk(KERN_DEBUG "UDP: bad checksum. From %08lX:%d to %08lX:%d ulen %d\n",
+                                       ntohl(saddr),ntohs(uh->source),
+                                       ntohl(daddr),ntohs(uh->dest),
+                                       ulen));
+                       udp_statistics.UdpInErrors++;
+                       kfree_skb(skb);
+                       return(0);
+               }
+#endif
                udp_statistics.UdpNoPorts++;
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 
index 3298070..a61be48 100644 (file)
@@ -5,7+5,7 @@
  *     Authors:
  *     Pedro Roque             <roque@di.fc.ul.pt>     
  *
- *     $Id: addrconf.c,v 1.43 1998/07/15 05:05:32 davem Exp $
+ *     $Id: addrconf.c,v 1.45 1998/08/26 12:04:41 davem Exp $
  *
  *     This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
 #endif
+#include <linux/delay.h>
 
 #include <linux/proc_fs.h>
 #include <net/sock.h>
 #include <linux/rtnetlink.h>
 
 #include <asm/uaccess.h>
-#include <asm/delay.h>
 
 /* Set to 3 to get tracing... */
 #define ACONF_DEBUG 2
@@ -100,7+100,7 @@ struct ipv6_devconf ipv6_devconf =
 {
        0,                              /* forwarding           */
        IPV6_DEFAULT_HOPLIMIT,          /* hop limit            */
-       576,                            /* mtu                  */
+       IPV6_MIN_MTU,                   /* mtu                  */
        1,                              /* accept RAs           */
        1,                              /* accept redirects     */
        1,                              /* autoconfiguration    */
@@ -114,7+114,7 @@ static struct ipv6_devconf ipv6_devconf_dflt =
 {
        0,                              /* forwarding           */
        IPV6_DEFAULT_HOPLIMIT,          /* hop limit            */
-       576,                            /* mtu                  */
+       IPV6_MIN_MTU,                   /* mtu                  */
        1,                              /* accept RAs           */
        1,                              /* accept redirects     */
        1,                              /* autoconfiguration    */
@@ -185,7+185,7 @@ static struct inet6_dev * ipv6_add_dev(struct device *dev)
        struct inet6_dev *ndev, **bptr, *iter;
        int hash;
 
-       if (dev->mtu < 576)
+       if (dev->mtu < IPV6_MIN_MTU)
                return NULL;
 
        ndev = kmalloc(sizeof(struct inet6_dev), gfp_any());
@@ -548,7+548,6 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct device *dev,
                      unsigned long expires, unsigned flags)
 {
        struct in6_rtmsg rtmsg;
-       int err;
 
        memset(&rtmsg, 0, sizeof(rtmsg));
        memcpy(&rtmsg.rtmsg_dst, pfx, sizeof(struct in6_addr));
@@ -566,7+565,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct device *dev,
        if (dev->type == ARPHRD_SIT && (dev->flags&IFF_POINTOPOINT))
                rtmsg.rtmsg_flags |= RTF_NONEXTHOP;
 
-       ip6_route_add(&rtmsg, &err);
+       ip6_route_add(&rtmsg);
 }
 
 /* Create "default" multicast route to the interface */
@@ -574,7+573,6 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct device *dev,
 static void addrconf_add_mroute(struct device *dev)
 {
        struct in6_rtmsg rtmsg;
-       int err;
 
        memset(&rtmsg, 0, sizeof(rtmsg));
        ipv6_addr_set(&rtmsg.rtmsg_dst,
@@ -584,13+582,12 @@ static void addrconf_add_mroute(struct device *dev)
        rtmsg.rtmsg_ifindex = dev->ifindex;
        rtmsg.rtmsg_flags = RTF_UP|RTF_ADDRCONF;
        rtmsg.rtmsg_type = RTMSG_NEWROUTE;
-       ip6_route_add(&rtmsg, &err);
+       ip6_route_add(&rtmsg);
 }
 
 static void sit_route_add(struct device *dev)
 {
        struct in6_rtmsg rtmsg;
-       int err;
 
        memset(&rtmsg, 0, sizeof(rtmsg));
 
@@ -602,7+599,7 @@ static void sit_route_add(struct device *dev)
        rtmsg.rtmsg_flags       = RTF_UP|RTF_NONEXTHOP;
        rtmsg.rtmsg_ifindex     = dev->ifindex;
 
-       ip6_route_add(&rtmsg, &err);
+       ip6_route_add(&rtmsg);
 }
 
 static void addrconf_add_lroute(struct device *dev)
@@ -690,13+687,12 @@ void addrconf_prefix_rcv(struct device *dev, u8 *opt, int len)
        else
                rt_expires = jiffies + valid_lft * HZ;
 
-       rt = rt6_lookup(&pinfo->prefix, NULL, dev->ifindex, RTF_LINKRT);
+       rt = rt6_lookup(&pinfo->prefix, NULL, dev->ifindex, 1);
 
        if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) {
                if (rt->rt6i_flags&RTF_EXPIRES) {
                        if (pinfo->onlink == 0 || valid_lft == 0) {
                                ip6_del_rt(rt);
-                               rt = NULL;
                        } else {
                                rt->rt6i_expires = rt_expires;
                        }
@@ -705,6+701,8 @@ void addrconf_prefix_rcv(struct device *dev, u8 *opt, int len)
                addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len,
                                      dev, rt_expires, RTF_ADDRCONF|RTF_EXPIRES);
        }
+       if (rt)
+               dst_release(&rt->u.dst);
 
        /* Try to figure out our local address for this prefix */
 
@@ -1118,11+1116,17 @@ int addrconf_notify(struct notifier_block *this, unsigned long event,
                break;
 
        case NETDEV_CHANGEMTU:
-               /* BUGGG... Should scan FIB to change pmtu on routes. --ANK */
-               if (dev->mtu >= 576)
+               if (dev->mtu >= IPV6_MIN_MTU) {
+                       struct inet6_dev *idev;
+
+                       if ((idev = ipv6_find_idev(dev)) == NULL)
+                               break;
+                       idev->cnf.mtu6 = dev->mtu;
+                       rt6_mtu_change(dev, dev->mtu);
                        break;
+               }
 
+               /* MTU fell below IPV6_MIN_MTU. Stop IPv6 on this interface. */
+               /* MTU falled under IPV6_MIN_MTU. Stop IPv6 on this interface. */
 
        case NETDEV_DOWN:
        case NETDEV_UNREGISTER:
@@ -1240,7+1244,6 @@ static void addrconf_rs_timer(unsigned long data)
                add_timer(&ifp->timer);
        } else {
                struct in6_rtmsg rtmsg;
-               int err;
 
                printk(KERN_DEBUG "%s: no IPv6 routers present\n",
                       ifp->idev->dev->name);
@@ -1253,7+1256,7 @@ static void addrconf_rs_timer(unsigned long data)
 
                rtmsg.rtmsg_ifindex = ifp->idev->dev->ifindex;
 
-               ip6_route_add(&rtmsg, &err);
+               ip6_route_add(&rtmsg);
        }
 }
 
@@ -1501,7+1504,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 }
 
 static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
-                           pid_t pid, u32 seq, int event)
+                            u32 pid, u32 seq, int event)
 {
        struct ifaddrmsg *ifm;
        struct nlmsghdr  *nlh;
@@ -1659,8+1662,11 @@ int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
 
                addrconf_forward_change(idev);
 
-               if (*valp)
+               if (*valp) {
+                       start_bh_atomic();
                        rt6_purge_dflt_routers(0);
+                       end_bh_atomic();
+               }
        }
 
         return ret;
index 051f9a2..a9ee649 100644 (file)
@@ -7,7+7,7 @@
  *
  *     Adapted from linux/net/ipv4/af_inet.c
  *
- *     $Id: af_inet6.c,v 1.36 1998/06/10 07:29:25 davem Exp $
+ *     $Id: af_inet6.c,v 1.37 1998/08/26 12:04:45 davem Exp $
  *
  *     This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
@@ -64,6+64,7 @@ extern int raw6_get_info(char *, char **, off_t, int, int);
 extern int tcp6_get_info(char *, char **, off_t, int, int);
 extern int udp6_get_info(char *, char **, off_t, int, int);
 extern int afinet6_get_info(char *, char **, off_t, int, int);
+extern int afinet6_get_snmp(char *, char **, off_t, int, int);
 #endif
 
 #ifdef CONFIG_SYSCTL
@@ -243,10+244,49 @@ static int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 
 static int inet6_release(struct socket *sock, struct socket *peer)
 {
+       struct sock *sk = sock->sk;
+
+       if (sk == NULL)
+               return -EINVAL;
+
+       /* Free mc lists */
+       ipv6_sock_mc_close(sk);
+
+       /* Huh! MOD_DEC_USE_COUNT was here :-(
+          It is impossible by two reasons: socket destroy
+          may be delayed and inet_release may sleep and
+          return to nowhere then. It should be moved to
+          inet6_destroy_sock(), but we have no explicit constructor :-(
+                                           --ANK (980802)
+        */
        MOD_DEC_USE_COUNT;
        return inet_release(sock, peer);
 }
 
+/*
+ *     Release the per-socket IPv6 state: the cached destination entry,
+ *     the saved rx packet options and the tx options. Each pointer is
+ *     detached with xchg() before the object it referred to is released.
+ *     Always returns 0.
+ */
+int inet6_destroy_sock(struct sock *sk)
+{
+       struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
+       struct sk_buff *rx_opts;
+       struct ipv6_txoptions *tx_opts;
+
+       /* Release destination entry */
+       dst_release(xchg(&sk->dst_cache, NULL));
+
+       /* Release rx options */
+       rx_opts = xchg(&np->pktoptions, NULL);
+       if (rx_opts != NULL)
+               kfree_skb(rx_opts);
+
+       /* Free tx options */
+       tx_opts = xchg(&np->opt, NULL);
+       if (tx_opts != NULL)
+               sock_kfree_s(sk, tx_opts, tx_opts->tot_len);
+
+       return 0;
+}
+
 /*
  *     This does both peername and sockname.
  */
@@ -412,6+452,12 @@ static struct proc_dir_entry proc_net_sockstat6 = {
        0, &proc_net_inode_operations,
        afinet6_get_info
 };
+/*
+ * proc entry named "snmp6" whose handler is afinet6_get_snmp().
+ * It is registered with proc_net_register() in inet6_proto_init() and
+ * removed again in cleanup_module().
+ * NOTE(review): the 5 presumably is the length of the name - confirm
+ * against struct proc_dir_entry.
+ */
+static struct proc_dir_entry proc_net_snmp6 = {
+       PROC_NET_SNMP6, 5, "snmp6",
+       S_IFREG | S_IRUGO, 1, 0, 0,
+       0, &proc_net_inode_operations,
+       afinet6_get_snmp
+};
 #endif /* CONFIG_PROC_FS */
 
 #ifdef MODULE
@@ -445,7+491,7 @@ __initfunc(void inet6_proto_init(struct net_proto *pro))
 
        printk(KERN_INFO "IPv6 v0.2 for NET3.037\n");
 
-       if (sizeof(struct ipv6_options) > sizeof(dummy_skb->cb))
+       if (sizeof(struct inet6_skb_parm) > sizeof(dummy_skb->cb))
        {
                printk(KERN_CRIT "inet6_proto_init: size fault\n");
 #ifdef MODULE
@@ -490,6+536,7 @@ __initfunc(void inet6_proto_init(struct net_proto *pro))
        proc_net_register(&proc_net_tcp6);
        proc_net_register(&proc_net_udp6);
        proc_net_register(&proc_net_sockstat6);
+       proc_net_register(&proc_net_snmp6);
 #endif
 
        /* Now the userspace is allowed to create INET6 sockets. */
@@ -526,6+573,7 @@ void cleanup_module(void)
        proc_net_unregister(proc_net_tcp6.low_ino);
        proc_net_unregister(proc_net_udp6.low_ino);
        proc_net_unregister(proc_net_sockstat6.low_ino);
+       proc_net_unregister(proc_net_snmp6.low_ino);
 #endif
        /* Cleanup code parts. */
        sit_cleanup();
index b87f31b..51960bd 100644 (file)
@@ -5,7+5,7 @@
  *     Authors:
  *     Pedro Roque             <roque@di.fc.ul.pt>     
  *
- *     $Id: datagram.c,v 1.14 1998/03/20 09:12:15 davem Exp $
+ *     $Id: datagram.c,v 1.15 1998/08/26 12:04:47 davem Exp $
  *
  *     This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
 int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb)
 {
        struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
-       struct ipv6_options *opt = (struct ipv6_options *) skb->cb;
-       
-       if (np->rxinfo) {
+       struct inet6_skb_parm *opt = (struct inet6_skb_parm *) skb->cb;
+
+       if (np->rxopt.bits.rxinfo) {
                struct in6_pktinfo src_info;
 
-               src_info.ipi6_ifindex = skb->dev->ifindex;
+               src_info.ipi6_ifindex = opt->iif;
                ipv6_addr_copy(&src_info.ipi6_addr, &skb->nh.ipv6h->daddr);
                put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info);
        }
 
-       if (np->rxhlim) {
+       if (np->rxopt.bits.rxhlim) {
                int hlim = skb->nh.ipv6h->hop_limit;
                put_cmsg(msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim);
        }
 
-       if (opt->srcrt) {
-               int hdrlen = sizeof(struct rt0_hdr) + (opt->srcrt->hdrlen << 3);
-
-               put_cmsg(msg, SOL_IPV6, IPV6_RXSRCRT, hdrlen, opt->srcrt);
+       if (np->rxopt.bits.hopopts && opt->hop) {
+               u8 *ptr = skb->nh.raw + opt->hop;
+               put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr);
+       }
+       if (np->rxopt.bits.dstopts && opt->dst0) {
+               u8 *ptr = skb->nh.raw + opt->dst0;
+               put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, (ptr[1]+1)<<3, ptr);
+       }
+       if (np->rxopt.bits.srcrt && opt->srcrt) {
+               struct ipv6_rt_hdr *rthdr = (struct ipv6_rt_hdr *)(skb->nh.raw + opt->srcrt);
+               put_cmsg(msg, SOL_IPV6, IPV6_RTHDR, (rthdr->hdrlen+1) << 3, rthdr);
+       }
+       if (np->rxopt.bits.authhdr && opt->auth) {
+               u8 *ptr = skb->nh.raw + opt->auth;
+               put_cmsg(msg, SOL_IPV6, IPV6_AUTHHDR, (ptr[1]+1)<<2, ptr);
+       }
+       if (np->rxopt.bits.dstopts && opt->dst1) {
+               u8 *ptr = skb->nh.raw + opt->dst1;
+               put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, (ptr[1]+1)<<3, ptr);
        }
        return 0;
 }
 
 int datagram_send_ctl(struct msghdr *msg, int *oif,
-                     struct in6_addr **src_addr, struct ipv6_options *opt, 
+                     struct in6_addr **src_addr, struct ipv6_txoptions *opt,
                      int *hlimit)
 {
        struct in6_pktinfo *src_info;
        struct cmsghdr *cmsg;
        struct ipv6_rt_hdr *rthdr;
+       struct ipv6_opt_hdr *hdr;
        int len;
        int err = 0;
 
        for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+
+               if ((unsigned long)(((char*)cmsg - (char*)msg->msg_control)
+                                   + cmsg->cmsg_len) > msg->msg_controllen) {
+                       err = -EINVAL;
+                       goto exit_f;
+               }
+
                if (cmsg->cmsg_level != SOL_IPV6) {
-                       printk(KERN_DEBUG "invalid cmsg_level %d\n", cmsg->cmsg_level);
+                       if (net_ratelimit())
+                               printk(KERN_DEBUG "invalid cmsg_level %d\n", cmsg->cmsg_level);
                        continue;
                }
 
                switch (cmsg->cmsg_type) {
                case IPV6_PKTINFO:
-                       if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in6_pktinfo))) {
+                       if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) {
                                err = -EINVAL;
                                goto exit_f;
                        }
@@ -100,14+124,77 @@ int datagram_send_ctl(struct msghdr *msg, int *oif,
                        }
 
                        break;
-                       
-               case IPV6_RXSRCRT:
+
+               case IPV6_HOPOPTS:
+                        if (opt->hopopt || cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) {
+                               err = -EINVAL;
+                               goto exit_f;
+                       }
+
+                       hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg);
+                       len = ((hdr->hdrlen + 1) << 3);
+                       if (cmsg->cmsg_len < CMSG_LEN(len)) {
+                               err = -EINVAL;
+                               goto exit_f;
+                       }
+                       if (!capable(CAP_NET_RAW)) {
+                               err = -EPERM;
+                               goto exit_f;
+                       }
+                       opt->opt_nflen += len;
+                       opt->hopopt = hdr;
+                       break;
+
+               case IPV6_DSTOPTS:
+                        if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) {
+                               err = -EINVAL;
+                               goto exit_f;
+                       }
+
+                       hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg);
+                       len = ((hdr->hdrlen + 1) << 3);
+                       if (cmsg->cmsg_len < CMSG_LEN(len)) {
+                               err = -EINVAL;
+                               goto exit_f;
+                       }
+                       if (!capable(CAP_NET_RAW)) {
+                               err = -EPERM;
+                               goto exit_f;
+                       }
+                       if (opt->dst1opt) {
+                               err = -EINVAL;
+                               goto exit_f;
+                       }
+                       opt->opt_flen += len;
+                       opt->dst1opt = hdr;
+                       break;
+
+               case IPV6_AUTHHDR:
+                        if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) {
+                               err = -EINVAL;
+                               goto exit_f;
+                       }
+
+                       hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg);
+                       len = ((hdr->hdrlen + 2) << 2);
+                       if (cmsg->cmsg_len < CMSG_LEN(len)) {
+                               err = -EINVAL;
+                               goto exit_f;
+                       }
+                       if (len & ~7) {
+                               err = -EINVAL;
+                               goto exit_f;
+                       }
+                       opt->opt_flen += len;
+                       opt->auth = hdr;
+                       break;
+
+               case IPV6_RTHDR:
                         if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_rt_hdr))) {
                                err = -EINVAL;
                                goto exit_f;
                        }
 
-                       len = cmsg->cmsg_len - sizeof(struct cmsghdr);
                        rthdr = (struct ipv6_rt_hdr *)CMSG_DATA(cmsg);
 
                        /*
@@ -118,7+205,9 @@ int datagram_send_ctl(struct msghdr *msg, int *oif,
                                goto exit_f;
                        }
 
-                       if (((rthdr->hdrlen + 1) << 3) < len) {
+                       len = ((rthdr->hdrlen + 1) << 3);
+
+                        if (cmsg->cmsg_len < CMSG_LEN(len)) {
                                err = -EINVAL;
                                goto exit_f;
                        }
@@ -128,12+217,21 @@ int datagram_send_ctl(struct msghdr *msg, int *oif,
                                err = -EINVAL;
                                goto exit_f;
                        }
-                       
-                       opt->opt_nflen += ((rthdr->hdrlen + 1) << 3);
+
+                       opt->opt_nflen += len;
                        opt->srcrt = rthdr;
 
+                       if (opt->dst1opt) {
+                               int dsthdrlen = ((opt->dst1opt->hdrlen+1)<<3);
+
+                               opt->opt_nflen += dsthdrlen;
+                               opt->dst0opt = opt->dst1opt;
+                               opt->dst1opt = NULL;
+                               opt->opt_flen -= dsthdrlen;
+                       }
+
                        break;
-                       
+
                case IPV6_HOPLIMIT:
                        if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) {
                                err = -EINVAL;
index 0b82687..89d5893 100644 (file)
@@ -5,8+5,9 @@
  *     Authors:
  *     Pedro Roque             <roque@di.fc.ul.pt>
  *     Andi Kleen              <ak@muc.de>
+ *     Alexey Kuznetsov        <kuznet@ms2.inr.ac.ru>
  *
- *     $Id: exthdrs.c,v 1.6 1998/04/30 16:24:20 freitag Exp $
+ *     $Id: exthdrs.c,v 1.7 1998/08/26 12:04:49 davem Exp $
  *
  *     This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
 
 #include <asm/uaccess.h>
 
-#define swap(a,b) do { typeof (a) tmp; tmp = (a); (a) = (b); (b) = (tmp); } while(0)
+/*
+ *     Parsing inbound headers.
+ *
+ *     Parsing function "func" returns pointer to the place,
+ *     where next nexthdr value is stored or NULL, if parsing
+ *     failed. It should also update skb->h.
+ */
+
+struct hdrtype_proc
+{
+       /* nexthdr value this entry handles; a negative value ends a table */
+       int     type;
+       /* handler: receives the skb pointer and the place holding the
+          current nexthdr value; returns the place holding the next one,
+          or NULL to stop parsing */
+       u8*     (*func) (struct sk_buff **, u8 *ptr);
+};
 
 /*
- *     inbound
+ *     Parsing tlv encoded headers.
+ *
+ *     Parsing function "func" returns 1, if parsing succeed
+ *     and 0, if it failed.
+ *     It MUST NOT touch skb->h.
  */
-#if 0
-int ipv6_routing_header(struct sk_buff **skb_ptr, struct device *dev,
-                       __u8 *nhptr, struct ipv6_options *opt)
+
+struct tlvtype_proc
+{
+       /* option type this entry handles; a negative value ends a table */
+       int     type;
+       /* handler: ptr points at the type byte of the option;
+          returns 1 to continue parsing, 0 to stop */
+       int     (*func) (struct sk_buff *, __u8 *ptr);
+};
+
+/*********************
+  Generic functions
+ *********************/
+
+/*
+ * An unknown option is detected, decide what to do.
+ *
+ * The two high-order bits of the option type select the action:
+ *     0 - ignore the option (returns 1, parsing goes on)
+ *     1 - drop the packet
+ *     2 - send ICMP parameter problem and stop
+ *     3 - as 2, unless the destination address is multicast, in which
+ *         case the packet is only dropped
+ *
+ * Returns 0 for actions 1-3; the skb has then been freed here or
+ * passed to icmpv6_param_prob().
+ */
+int ip6_tlvopt_unknown(struct sk_buff *skb, u8 *opt)
+{
+       int action = opt[0] >> 6;
+
+       if (action == 0)
+               return 1;
+
+       /* The multicast test is actually redundant: icmp_send will
+          recheck it in any case. */
+       if (action == 2 ||
+           (action == 3 && !ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr))) {
+               icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, opt);
+               return 0;
+       }
+
+       kfree_skb(skb);
+       return 0;
+}
+
+/*
+ * Parse tlv encoded option header (hop-by-hop or destination).
+ *
+ * procs - table of known option types, ended by an entry with type < 0
+ * skb   - packet; skb->h.raw points at the option header
+ * nhptr - not used here
+ *
+ * Returns 1 if all options were accepted. Returns 0 otherwise; the skb
+ * has then been freed here or disposed of by the handler that rejected
+ * the option, so the caller must not touch it.
+ */
+static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff *skb,
+                        __u8 *nhptr)
+{
+       struct tlvtype_proc *curr;
+       u8 *ptr = skb->h.raw;
+       int len = ((ptr[1]+1)<<3) - 2;
+
+       ptr += 2;
+
+       if (skb->tail - (ptr + len) < 0)
+               goto bad;
+
+       while (len > 0) {
+               int optlen;
+
+               if (ptr[0] == IPV6_TLV_PAD0) {
+                       /* Pad1 is a single byte without length octet */
+                       optlen = 1;
+               } else {
+                       /* Do not read a length octet, and do not hand a
+                          handler an option, that lies beyond the end
+                          of this header.
+                        */
+                       if (len < 2)
+                               goto bad;
+                       optlen = ptr[1]+2;
+                       if (optlen > len)
+                               goto bad;
+
+                       if (ptr[0] != IPV6_TLV_PADN) {
+                               /* Other TLV code so scan list */
+                               for (curr=procs; curr->type >= 0; curr++) {
+                                       if (curr->type == ptr[0])
+                                               break;
+                               }
+                               if (curr->type >= 0) {
+                                       if (curr->func(skb, ptr) == 0)
+                                               return 0;
+                               } else {
+                                       if (ip6_tlvopt_unknown(skb, ptr) == 0)
+                                               return 0;
+                               }
+                       }
+               }
+               ptr += optlen;
+               len -= optlen;
+       }
+       /* Every option was checked to fit, so len is exactly 0 here */
+       return 1;
+
+bad:
+       kfree_skb(skb);
+       return 0;
+}
+
+/*****************************
+  Destination options header.
+ *****************************/
+
+/* Option table passed to ip6_parse_tlv() by ipv6_dest_opt();
+   the entry with type -1 ends the table. */
+struct tlvtype_proc tlvprocdestopt_lst[] = {
+       /* No destination options are defined now */
+       {-1,                    NULL}
+};
+
+/*
+ * Handler for a destination options header at skb->h.raw.
+ *
+ * Records the offset of the header (relative to skb->nh.raw) in the skb
+ * control block, then parses its options. On success skb->h.raw is moved
+ * past the header and a pointer to its nexthdr field is returned.
+ * Returns NULL when ip6_parse_tlv() rejected the header.
+ */
+static u8 *ipv6_dest_opt(struct sk_buff **skb_ptr, u8 *nhptr)
+{
+       struct sk_buff *skb = *skb_ptr;
+       struct ipv6_destopt_hdr *dsthdr = (struct ipv6_destopt_hdr *) skb->h.raw;
+       struct inet6_skb_parm *parm = (struct inet6_skb_parm *) skb->cb;
+
+       parm->dst1 = (u8 *) dsthdr - skb->nh.raw;
+
+       if (!ip6_parse_tlv(tlvprocdestopt_lst, skb, nhptr))
+               return NULL;
+
+       skb->h.raw += (dsthdr->hdrlen + 1) << 3;
+       return &dsthdr->nexthdr;
+}
+
+/********************************
+  NONE header. No data in packet.
+ ********************************/
+
+/* Handler for NEXTHDR_NONE: frees the packet and stops parsing. */
+static u8 *ipv6_nodata(struct sk_buff **skb_ptr, u8 *nhptr)
+{
+       struct sk_buff *skb = *skb_ptr;
+
+       kfree_skb(skb);
+       return NULL;
+}
+
+/********************************
+  Routing header.
+ ********************************/
+
+static u8* ipv6_routing_header(struct sk_buff **skb_ptr, u8 *nhptr)
 {
        struct sk_buff *skb = *skb_ptr;
+       struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb;
        struct in6_addr *addr;
        struct in6_addr daddr;
-       int addr_type = 0;
-       int strict = 0;
-       __u32 bit_map;
-       int pos;
+       int addr_type;
        int n, i;
 
        struct ipv6_rt_hdr *hdr = (struct ipv6_rt_hdr *) skb->h.raw;
        struct rt0_hdr *rthdr;
 
-       if (hdr->segments_left == 0) {
-               struct ipv6_options *opt;
-
-               opt = (struct ipv6_options *) skb->cb;
-               opt->srcrt = hdr;
+       if (((hdr->hdrlen+1)<<3) > skb->tail - skb->h.raw) {
+               ipv6_statistics.Ip6InHdrErrors++;
+               kfree_skb(skb);
+               return NULL;
+       }
 
+looped_back:
+       if (hdr->segments_left == 0) {
+               opt->srcrt = (u8*)hdr - skb->nh.raw;
                skb->h.raw += (hdr->hdrlen + 1) << 3;
-               return hdr->nexthdr;            
+               opt->dst0 = opt->dst1;
+               opt->dst1 = 0;
+               return &hdr->nexthdr;           
        }
 
-       if (hdr->type != IPV6_SRCRT_TYPE_0 || hdr->hdrlen & 0x01 ||
-           hdr->hdrlen > 46) {
-                /* 
-                *      Discard 
-                */
-               
-               pos = (__u8 *) hdr - (__u8 *) skb->nh.ipv6h + 2;
+       if (hdr->type != IPV6_SRCRT_TYPE_0 || hdr->hdrlen & 0x01) {
+               u8 *pos = (u8*) hdr;
 
-               if (hdr->type)
+               if (hdr->type != IPV6_SRCRT_TYPE_0)
                        pos += 2;
                else
                        pos += 1;
 
-               icmpv6_send(skb, ICMPV6_PARAMETER_PROB, 0, pos, dev);
-               kfree_skb(skb);
-               return 0;       
+               icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, pos);
+               return NULL;    
        }
-
+       
        /*
         *      This is the routing header forwarding algorithm from
         *      RFC 1883, page 17.
@@ -94,13+232,21 @@ int ipv6_routing_header(struct sk_buff **skb_ptr, struct device *dev,
        n = hdr->hdrlen >> 1;
 
        if (hdr->segments_left > n) {
-               pos = (__u8 *) hdr - (__u8 *) skb->nh.ipv6h + 2;
-
-               pos += 3;
+               icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, &hdr->segments_left);
+               return NULL;
+       }
 
-               icmpv6_send(skb, ICMPV6_PARAMETER_PROB, 0, pos, dev);
+       /* We are about to mangle packet header. Be careful!
+          Do not damage packets queued somewhere.
+        */
+       if (skb_cloned(skb)) {
+               struct sk_buff *skb2 = skb_copy(skb, GFP_ATOMIC);
                kfree_skb(skb);
-               return 0;
+               if (skb2 == NULL)
+                       return NULL;
+               *skb_ptr = skb = skb2;
+               opt = (struct inet6_skb_parm *)skb2->cb;
+               hdr = (struct ipv6_rt_hdr *) skb2->h.raw;
        }
 
        i = n - --hdr->segments_left;
@@ -113,58+259,429 @@ int ipv6_routing_header(struct sk_buff **skb_ptr, struct device *dev,
 
        if (addr_type == IPV6_ADDR_MULTICAST) {
                kfree_skb(skb);
-               return 0;
+               return NULL;
        }
 
        ipv6_addr_copy(&daddr, addr);
        ipv6_addr_copy(addr, &skb->nh.ipv6h->daddr);
        ipv6_addr_copy(&skb->nh.ipv6h->daddr, &daddr);
 
-       /*
-        *      Check Strick Source Route
+       dst_release(xchg(&skb->dst, NULL));
+       ip6_route_input(skb);
+       if (skb->dst->error) {
+               skb->dst->input(skb);
+               return NULL;
+       }
+       if (skb->dst->dev->flags&IFF_LOOPBACK) {
+               if (skb->nh.ipv6h->hop_limit <= 1) {
+                       icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
+                                   0, skb->dev);
+                       kfree_skb(skb);
+                       return NULL;
+               }
+               skb->nh.ipv6h->hop_limit--;
+               goto looped_back;
+       }
+
+       skb->dst->input(skb);
+       return NULL;
+}
+
+/*
+   This function inverts received rthdr.
+   NOTE: specs allow to make it automatically only if
+   packet authenticated.
+
+   I will not discuss it here (though, I am really pissed off at
+   this stupid requirement making rthdr idea useless)
+
+   Actually, it creates severe problems for us.
+   Embryonic requests have no associated sockets,
+   so the user has no control over them: he can
+   neither set reply options nor even learn that
+   someone tried to connect without success. :-(
+
+   For now we need to test the engine, so that I created
+   temporary (or permanent) backdoor.
+   If listening socket set IPV6_RTHDR to 2, then we invert header.
+                                                   --ANK (980729)
+ */
+
+struct ipv6_txoptions *
+ipv6_invert_rthdr(struct sock *sk, struct ipv6_rt_hdr *hdr)
+{
+       /* Received rthdr:
+
+          [ H1 -> H2 -> ... H_prev ]  daddr=ME
+
+          Inverted result:
+          [ H_prev -> ... -> H1 ] daddr =sender
+
+          Note that the IP output engine will rewrite this rthdr
+          by rotating it left by one address.
         */
 
-       bit_map = ntohl(rthdr->bitmap);
+       int n, i;
+       struct rt0_hdr *rthdr = (struct rt0_hdr*)hdr;
+       struct rt0_hdr *irthdr;
+       struct ipv6_txoptions *opt;
+       int hdrlen = ipv6_optlen(hdr);
+
+       if (hdr->segments_left ||
+           hdr->type != IPV6_SRCRT_TYPE_0 ||
+           hdr->hdrlen & 0x01)
+               return NULL;
 
-       if ((bit_map & (1 << i)) == IPV6_SRCRT_STRICT)
-               strict = 1;
+       n = hdr->hdrlen >> 1;
+       opt = sock_kmalloc(sk, sizeof(*opt) + hdrlen, GFP_ATOMIC);
+       if (opt == NULL)
+               return NULL;
+       memset(opt, 0, sizeof(*opt));
+       opt->tot_len = sizeof(*opt) + hdrlen;
+       opt->srcrt = (void*)(opt+1);
+       opt->opt_nflen = hdrlen;
+
+       memcpy(opt->srcrt, hdr, sizeof(*hdr));
+       irthdr = (struct rt0_hdr*)opt->srcrt;
+       /* Obsolete field, MBZ, when originated by us */
+       irthdr->bitmap = 0;
+       opt->srcrt->segments_left = n;
+       for (i=0; i<n; i++)
+               memcpy(irthdr->addr+i, rthdr->addr+(n-1-i), 16);
+       return opt;
+}
 
-       ipv6_forward(skb, dev, (strict ? IP6_FW_STRICT : 0) | IP6_FW_SRCRT);
+/********************************
+  AUTH header.
+ ********************************/
 
+/*
+   rfc1826 said, that if a host does not implement AUTH header
+   it MAY ignore it. We use this hole 8)
+
+   Actually, now we can implement OSPFv6 without kernel IPsec.
+   Authentication for poors may be done in user space with the same success.
+
+   Yes, it means, that we allow application to send/receive
+   raw authentication header. Apparently, we suppose, that it knows
+   what it does and calculates authentication data correctly.
+   Certainly, it is possible only for udp and raw sockets, but not for tcp.
+
+   BTW I beg pardon, it is not good place for flames, but
+   I cannot be silent 8) It is very sad, but fools prevail 8)
+   AUTH header has 4byte granular length, what kills all the idea
+   behind AUTOMATIC 64bit alignment of IPv6. Now we will loose
+   cpu ticks, checking that sender did not something stupid
+   and opt->hdrlen is even. Shit!              --ANK (980730)
+ */
+
+/*
+ * Handler for an AUTH header at skb->h.raw. The header is not
+ * interpreted, only recorded and skipped.
+ *
+ * On success the offset of the header (relative to skb->nh.raw) is
+ * stored in the skb control block, skb->h.raw is moved past the header
+ * and a pointer to its nexthdr field is returned.
+ *
+ * Returns NULL if the header runs past the end of the packet data.
+ * The skb is freed in that case, as ipv6_nodata() and ipv6_dest_opt()
+ * do when they return NULL; otherwise the packet would be leaked.
+ *
+ * NOTE(review): len is not checked to be a multiple of 8 here - confirm
+ * whether that check is wanted on input.
+ */
+static u8 *ipv6_auth_hdr(struct sk_buff **skb_ptr, u8 *nhptr)
+{
+       struct sk_buff *skb=*skb_ptr;
+       struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb;
+       struct ipv6_opt_hdr *hdr = (struct ipv6_opt_hdr *)skb->h.raw;
+       /* AUTH length is counted in 4 byte units, not in 8 byte ones */
+       int len = (hdr->hdrlen+2)<<2;
+
+       if (skb->h.raw + len > skb->tail) {
+               kfree_skb(skb);
+               return NULL;
+       }
+       opt->auth = (u8*)hdr - skb->nh.raw;
+       skb->h.raw += len;
+       return &hdr->nexthdr;
+}
+
+/* This list MUST NOT contain entry for NEXTHDR_HOP.
+   It is parsed immediately after packet received
+   and if it occurs somewhere in another place we must
+   generate error.
+ */
+
+struct hdrtype_proc hdrproc_lst[] = {
+       {NEXTHDR_FRAGMENT,      ipv6_reassembly},
+       {NEXTHDR_ROUTING,       ipv6_routing_header},
+       {NEXTHDR_DEST,          ipv6_dest_opt},
+       {NEXTHDR_NONE,          ipv6_nodata},
+       {NEXTHDR_AUTH,          ipv6_auth_hdr},
+   /*
+       {NEXTHDR_ESP,           ipv6_esp_hdr},
+    */
+       /* End marker: ipv6_parse_exthdrs() stops at a negative type */
+       {-1,                    NULL}
+};
+
+/*
+ * Run the handlers from hdrproc_lst over the chain of extension headers.
+ *
+ * nhptr points to the place where the type of the first header is stored.
+ * Returns the place holding the first type that has no handler in
+ * hdrproc_lst, or NULL as soon as a handler returns NULL.
+ */
+u8 *ipv6_parse_exthdrs(struct sk_buff **skb_in, u8 *nhptr)
+{
+       u8 nexthdr = *nhptr;
+
+       for (;;) {
+               struct hdrtype_proc *hdrt = hdrproc_lst;
+
+               while (hdrt->type >= 0 && hdrt->type != nexthdr)
+                       hdrt++;
+
+               /* Reached the end marker: nothing more to parse here */
+               if (hdrt->type < 0)
+                       return nhptr;
+
+               nhptr = hdrt->func(skb_in, nhptr);
+               if (nhptr == NULL)
+                       return NULL;
+
+               nexthdr = *nhptr;
+       }
+}
+
+
+/**********************************
+  Hop-by-hop options.
+ **********************************/
+
+/* Router Alert as of draft-ietf-ipngwg-ipv6router-alert-04 */
+
+static int ipv6_hop_ra(struct sk_buff *skb, u8 *ptr)
+{
+       if (ptr[1] == 2) {
+               ((struct inet6_skb_parm*)skb->cb)->ra = ptr - skb->nh.raw;
+               return 1;
+       }
+       if (net_ratelimit())
+               printk(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n", ptr[1]);
+       kfree_skb(skb);
        return 0;
 }
 
+/*
+ * Jumbo payload option. ptr points at the type byte of the option.
+ *
+ * The option must have length 4 and sit at an offset of 4n+2 from
+ * skb->nh.raw. The announced length must be at least 0x10000, the
+ * payload_len field of the IPv6 header must be zero, and the announced
+ * length must not exceed the data present after the IPv6 header.
+ * On success the skb is trimmed to the announced length and 1 is
+ * returned. Otherwise 0 is returned after the skb was freed or passed
+ * to icmpv6_param_prob().
+ */
+static int ipv6_hop_jumbo(struct sk_buff *skb, u8 *ptr)
+{
+       u32 pkt_len;
+       int malformed = (ptr[1] != 4) || (((ptr-skb->nh.raw)&3) != 2);
+
+       if (malformed) {
+               if (net_ratelimit())
+                       printk(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", ptr[1]);
+               kfree_skb(skb);
+               return 0;
+       }
+
+       pkt_len = ntohl(*(u32*)(ptr+2));
+
+       /* Too small to need a jumbo option: complain about the value */
+       if (pkt_len < 0x10000) {
+               icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ptr+2);
+               return 0;
+       }
+
+       /* Jumbo option together with nonzero payload_len: complain
+          about the option itself */
+       if (skb->nh.ipv6h->payload_len) {
+               icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ptr);
+               return 0;
+       }
+
+       if (pkt_len > skb->len - sizeof(struct ipv6hdr)) {
+               ipv6_statistics.Ip6InTruncatedPkts++;
+               kfree_skb(skb);
+               return 0;
+       }
+
+       skb_trim(skb, pkt_len + sizeof(struct ipv6hdr));
+       return 1;
+}
+
+/* Option table passed to ip6_parse_tlv() by ipv6_parse_hopopts();
+   the entry with type -1 ends the table. */
+struct tlvtype_proc tlvprochopopt_lst[] = {
+       {IPV6_TLV_ROUTERALERT,  ipv6_hop_ra},
+       {IPV6_TLV_JUMBO,        ipv6_hop_jumbo},
+       {-1,                    NULL}
+};
+
+/*
+ * Parse the hop-by-hop options header.
+ *
+ * Stores sizeof(struct ipv6hdr) as the offset of the header in the skb
+ * control block and runs ip6_parse_tlv() with the hop-by-hop option
+ * table. Returns nhptr advanced by the header length computed from
+ * nhptr[1], or NULL if parsing failed.
+ */
+u8 * ipv6_parse_hopopts(struct sk_buff *skb, u8 *nhptr)
+{
+       struct inet6_skb_parm *parm = (struct inet6_skb_parm *) skb->cb;
+
+       parm->hop = sizeof(struct ipv6hdr);
+
+       if (!ip6_parse_tlv(tlvprochopopt_lst, skb, nhptr))
+               return NULL;
+
+       return nhptr + ((nhptr[1] + 1) << 3);
+}
 
 /*
- *     outbound
+ *     Creating outbound headers.
+ *
+ *     "build" functions work when skb is filled from head to tail (datagram)
+ *     "push"  functions work when headers are added from tail to head (tcp)
+ *
+ *     In both cases we assume, that caller reserved enough room
+ *     for headers.
  */
 
-int ipv6opt_bld_rthdr(struct sk_buff *skb, struct ipv6_options *opt,
-                     struct in6_addr *addr)                  
+u8 *ipv6_build_rthdr(struct sk_buff *skb, u8 *prev_hdr,
+                    struct ipv6_rt_hdr *opt, struct in6_addr *addr)
 {
        struct rt0_hdr *phdr, *ihdr;
        int hops;
 
-       ihdr = (struct rt0_hdr *) opt->srcrt;
+       ihdr = (struct rt0_hdr *) opt;
        
        phdr = (struct rt0_hdr *) skb_put(skb, (ihdr->rt_hdr.hdrlen + 1) << 3);
-       memcpy(phdr, ihdr, sizeof(struct ipv6_rt_hdr));
+       memcpy(phdr, ihdr, sizeof(struct rt0_hdr));
 
        hops = ihdr->rt_hdr.hdrlen >> 1;
-       
+
        if (hops > 1)
                memcpy(phdr->addr, ihdr->addr + 1,
                       (hops - 1) * sizeof(struct in6_addr));
 
        ipv6_addr_copy(phdr->addr + (hops - 1), addr);
+
+       phdr->rt_hdr.nexthdr = *prev_hdr;
+       *prev_hdr = NEXTHDR_ROUTING;
+       return &phdr->rt_hdr.nexthdr;
+}
+
+/*
+ * Append a copy of extension header "opt" to the tail of skb and link
+ * it into the header chain: the new header inherits the type stored at
+ * *prev_hdr, and *prev_hdr is set to "type". Returns a pointer to the
+ * nexthdr field of the new header.
+ */
+static u8 *ipv6_build_exthdr(struct sk_buff *skb, u8 *prev_hdr, u8 type, struct ipv6_opt_hdr *opt)
+{
+       int optlen = ipv6_optlen(opt);
+       struct ipv6_opt_hdr *newhdr;
+
+       newhdr = (struct ipv6_opt_hdr *) skb_put(skb, optlen);
+       memcpy(newhdr, opt, optlen);
+
+       newhdr->nexthdr = *prev_hdr;
+       *prev_hdr = type;
+
+       return &newhdr->nexthdr;
+}
+
+/*
+ * Append a copy of AUTH header "opt" to the tail of skb and link it
+ * into the header chain, like ipv6_build_exthdr() does. The length is
+ * (hdrlen+2)*4 bytes. Returns a pointer to the nexthdr field of the
+ * new header.
+ */
+static u8 *ipv6_build_authhdr(struct sk_buff *skb, u8 *prev_hdr, struct ipv6_opt_hdr *opt)
+{
+       int authlen = (opt->hdrlen+2)<<2;
+       struct ipv6_opt_hdr *newhdr;
+
+       newhdr = (struct ipv6_opt_hdr *) skb_put(skb, authlen);
+       memcpy(newhdr, opt, authlen);
+
+       newhdr->nexthdr = *prev_hdr;
+       *prev_hdr = NEXTHDR_AUTH;
+
+       return &newhdr->nexthdr;
+}
+
+
+/*
+ * Append to skb, in this order: the hop-by-hop options from opt, a
+ * jumbo payload option if jumbolen is nonzero, the dst0opt destination
+ * options and the routing header. opt may be NULL.
+ *
+ * prev_hdr points to the place where the type of the next header has
+ * to be written; the updated place is returned.
+ */
+u8 *ipv6_build_nfrag_opts(struct sk_buff *skb, u8 *prev_hdr, struct ipv6_txoptions *opt,
+                         struct in6_addr *daddr, u32 jumbolen)
+{
+       /* NOTE(review): taken before anything is appended; the hdrlen++
+          below assumes the hop-by-hop header built here starts at
+          skb->data - confirm with the callers. */
+       struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb->data;
+
+       if (opt && opt->hopopt)
+               prev_hdr = ipv6_build_exthdr(skb, prev_hdr, NEXTHDR_HOP, opt->hopopt);
+
+       if (jumbolen) {
+               /* 8 more bytes: one length unit of a hop-by-hop header */
+               u8 *jumboopt = (u8 *)skb_put(skb, 8);
+
+               if (opt && opt->hopopt) {
+                       /* Extend the existing header: 2 bytes of PadN
+                          followed by the 6 byte jumbo option */
+                       *jumboopt++ = IPV6_TLV_PADN;
+                       *jumboopt++ = 0;
+                       h->hdrlen++;
+               } else {
+                       /* No hop-by-hop header yet: the 8 bytes become
+                          a new one, linked into the header chain */
+                       h = (struct ipv6_opt_hdr *)jumboopt;
+                       h->nexthdr = *prev_hdr;
+                       h->hdrlen = 0;
+                       jumboopt += 2;
+                       *prev_hdr = NEXTHDR_HOP;
+                       prev_hdr = &h->nexthdr;
+               }
+               jumboopt[0] = IPV6_TLV_JUMBO;
+               jumboopt[1] = 4;
+               *(u32*)(jumboopt+2) = htonl(jumbolen);
+       }
+       if (opt) {
+               if (opt->dst0opt)
+                       prev_hdr = ipv6_build_exthdr(skb, prev_hdr, NEXTHDR_DEST, opt->dst0opt);
+               if (opt->srcrt)
+                       prev_hdr = ipv6_build_rthdr(skb, prev_hdr, opt->srcrt, daddr);
+       }
+       return prev_hdr;
+}
+
+/*
+ * Append to skb the AUTH header and then the dst1opt destination
+ * options, whichever of them are set in opt. Unlike
+ * ipv6_build_nfrag_opts(), opt must not be NULL here.
+ * Returns the updated place for the type of the next header.
+ */
+u8 *ipv6_build_frag_opts(struct sk_buff *skb, u8 *prev_hdr, struct ipv6_txoptions *opt)
+{
+       if (opt->auth)
+               prev_hdr = ipv6_build_authhdr(skb, prev_hdr, opt->auth);
+       if (opt->dst1opt)
+               prev_hdr = ipv6_build_exthdr(skb, prev_hdr, NEXTHDR_DEST, opt->dst1opt);
+       return prev_hdr;
+}
+
+static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto,
+                           struct ipv6_rt_hdr *opt,
+                           struct in6_addr **addr_p)
+{
+       struct rt0_hdr *phdr, *ihdr;
+       int hops;
+
+       ihdr = (struct rt0_hdr *) opt;
        
-       phdr->rt_hdr.nexthdr = proto; 
-       return NEXTHDR_ROUTING;
+       phdr = (struct rt0_hdr *) skb_push(skb, (ihdr->rt_hdr.hdrlen + 1) << 3);
+       memcpy(phdr, ihdr, sizeof(struct rt0_hdr));
+
+       hops = ihdr->rt_hdr.hdrlen >> 1;
+
+       if (hops > 1)
+               memcpy(phdr->addr, ihdr->addr + 1,
+                      (hops - 1) * sizeof(struct in6_addr));
+
+       ipv6_addr_copy(phdr->addr + (hops - 1), *addr_p);
+       *addr_p = ihdr->addr;
+
+       phdr->rt_hdr.nexthdr = *proto;
+       *proto = NEXTHDR_ROUTING;
+}
+
+static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv6_opt_hdr *opt)
+{
+       struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb_push(skb, ipv6_optlen(opt));
+
+       memcpy(h, opt, ipv6_optlen(opt));
+       h->nexthdr = *proto;
+       *proto = type;
 }
-#endif
+
+/*
+ * Put a copy of AUTH header "opt" in front of the data in skb.
+ * The new header inherits the type stored at *proto, and *proto is
+ * set to NEXTHDR_AUTH. The length is (hdrlen+2)*4 bytes.
+ */
+static void ipv6_push_authhdr(struct sk_buff *skb, u8 *proto, struct ipv6_opt_hdr *opt)
+{
+       int authlen = (opt->hdrlen+2)<<2;
+       struct ipv6_opt_hdr *newhdr;
+
+       newhdr = (struct ipv6_opt_hdr *) skb_push(skb, authlen);
+       memcpy(newhdr, opt, authlen);
+
+       newhdr->nexthdr = *proto;
+       *proto = NEXTHDR_AUTH;
+}
+
+/*
+ * Push the routing header, the dst0opt destination options and the
+ * hop-by-hop options in front of the data in skb, whichever of them
+ * are set in opt. Headers are added from tail to head, so they are
+ * pushed in the reverse of the order they get in the packet.
+ * *proto is updated to the type of the outermost header pushed;
+ * ipv6_push_rthdr() may also replace *daddr.
+ */
+void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
+                         u8 *proto,
+                         struct in6_addr **daddr)
+{
+       if (opt->srcrt)
+               ipv6_push_rthdr(skb, proto, opt->srcrt, daddr);
+       if (opt->dst0opt)
+               ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst0opt);
+       if (opt->hopopt)
+               ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt);
+}
+
+/*
+ * Push the dst1opt destination options and then the AUTH header in
+ * front of the data in skb, whichever of them are set in opt. Headers
+ * are added from tail to head, so AUTH ends up before the destination
+ * options. *proto is updated to the type of the outermost header pushed.
+ */
+void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto)
+{
+       if (opt->dst1opt)
+               ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt);
+       if (opt->auth)
+               ipv6_push_authhdr(skb, proto, opt->auth);
+}
+
+/*
+ * Duplicate a tx options block of opt->tot_len bytes, charging the
+ * memory to sk. Every header pointer that is set in the copy is moved
+ * by the distance between the two blocks, so that it refers to the
+ * copy and not to the original. Returns NULL if the allocation failed.
+ */
+struct ipv6_txoptions *
+ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt)
+{
+       struct ipv6_txoptions *copy;
+       long delta;
+
+       copy = sock_kmalloc(sk, opt->tot_len, GFP_ATOMIC);
+       if (copy == NULL)
+               return NULL;
+
+       memcpy(copy, opt, opt->tot_len);
+       delta = (char *) copy - (char *) opt;
+
+       if (copy->hopopt)
+               copy->hopopt = (void *) ((char *) copy->hopopt + delta);
+       if (copy->dst0opt)
+               copy->dst0opt = (void *) ((char *) copy->dst0opt + delta);
+       if (copy->dst1opt)
+               copy->dst1opt = (void *) ((char *) copy->dst1opt + delta);
+       if (copy->auth)
+               copy->auth = (void *) ((char *) copy->auth + delta);
+       if (copy->srcrt)
+               copy->srcrt = (void *) ((char *) copy->srcrt + delta);
+
+       return copy;
+}
+
 
 /* 
- * find out if nexthdr is an extension header or a protocol
+ * find out if nexthdr is a well-known extension header or a protocol
  */
 
 static __inline__ int ipv6_ext_hdr(u8 nexthdr)
@@ -175,11+692,9 @@ static __inline__ int ipv6_ext_hdr(u8 nexthdr)
        return ( (nexthdr == NEXTHDR_HOP)       ||
                 (nexthdr == NEXTHDR_ROUTING)   ||
                 (nexthdr == NEXTHDR_FRAGMENT)  ||
-                (nexthdr == NEXTHDR_ESP)       ||
                 (nexthdr == NEXTHDR_AUTH)      ||
                 (nexthdr == NEXTHDR_NONE)      ||
                 (nexthdr == NEXTHDR_DEST) );
-                
 }
 
 /*
@@ -200,34+715,57 @@ static __inline__ int ipv6_ext_hdr(u8 nexthdr)
  * 
  * But I see no other way to do this. This might need to be reexamined
  * when Linux implements ESP (and maybe AUTH) headers.
+ * --AK
+ *
+ * This function parses (probably truncated) exthdr set "hdr"
+ * of length "len". "nexthdrp" initially points to some place,
+ * where type of the first header can be found.
+ *
+ * It skips all well-known exthdrs, and returns pointer to the start
+ * of unparsable area i.e. the first header with unknown type.
+ * If it is not NULL *nexthdr is updated by type/protocol of this header.
+ *
+ * NOTES: - if packet terminated with NEXTHDR_NONE it returns NULL.
+ *        - it may return pointer pointing beyond end of packet,
+ *         if the last recognized header is truncated in the middle.
+ *        - if packet is truncated, so that all parsed headers are skipped,
+ *         it returns NULL.
+ *       - First fragment header is skipped, not-first ones
+ *         are considered as unparsable.
+ *       - ESP is unparsable for now and considered like
+ *         normal payload protocol.
+ *       - Note also special handling of AUTH header. Thanks to IPsec wizards.
+ *
+ * --ANK (980726)
  */
-struct ipv6_opt_hdr *ipv6_skip_exthdr(struct ipv6_opt_hdr *hdr, 
-                                     u8 *nexthdrp, int len)
+
+u8 *ipv6_skip_exthdr(struct ipv6_opt_hdr *hdr, u8 *nexthdrp, int len)
 {
        u8 nexthdr = *nexthdrp;
 
        while (ipv6_ext_hdr(nexthdr)) {
                int hdrlen; 
-               
-               if (nexthdr == NEXTHDR_NONE)
+
+               /*
+                * Compare as signed: "len" goes negative once a truncated
+                * header has been stepped over, and the walk must stop
+                * there. Against a bare sizeof() the comparison would be
+                * unsigned, never fire, and let us parse past the packet.
+                */
+               if (len < (int)sizeof(struct ipv6_opt_hdr))
                        return NULL;
-               if (len < sizeof(struct ipv6_opt_hdr)) /* be anal today */
+               if (nexthdr == NEXTHDR_NONE)
                        return NULL;
-
-               hdrlen = ipv6_optlen(hdr); 
-               if (len < hdrlen)
-                       return NULL; 
+               if (nexthdr == NEXTHDR_FRAGMENT) {
+                       struct frag_hdr *fhdr = (struct frag_hdr *) hdr;
+                       /* non-zero fragment offset: not the first fragment,
+                        * leave it to the caller as unparsable.
+                        * NOTE(review): only sizeof(struct ipv6_opt_hdr)
+                        * bytes were verified before frag_off is read.
+                        */
+                       if (ntohs(fhdr->frag_off) & ~0x7)
+                               break;
+                       hdrlen = 8;
+               } else if (nexthdr == NEXTHDR_AUTH)
+                       hdrlen = (hdr->hdrlen+2)<<2;
+               else
+                       hdrlen = ipv6_optlen(hdr);
 
                nexthdr = hdr->nexthdr;
                hdr = (struct ipv6_opt_hdr *) ((u8*)hdr + hdrlen);
                len -= hdrlen;
        }
 
-       /* Hack.. Do the same for AUTH headers? */
-       if (nexthdr == NEXTHDR_ESP) 
-               return NULL; 
-
        *nexthdrp = nexthdr;
-       return hdr;
+       return (u8*)hdr;
 }
 
index c3b6f7b..d43d1f9 100644 (file)
@@ -5,7+5,7 @@
  *     Authors:
  *     Pedro Roque             <roque@di.fc.ul.pt>
  *
- *     $Id: icmp.c,v 1.18 1998/05/07 15:42:59 davem Exp $
+ *     $Id: icmp.c,v 1.19 1998/08/26 12:04:52 davem Exp $
  *
  *     Based on net/ipv4/icmp.c
  *
 #include <asm/uaccess.h>
 #include <asm/system.h>
 
+struct icmpv6_mib icmpv6_statistics;
+
 /*
  *     ICMP socket for flow control.
  */
 
 struct socket *icmpv6_socket;
 
-int icmpv6_rcv(struct sk_buff *skb, struct device *dev,
-              struct in6_addr *saddr, struct in6_addr *daddr,
-              struct ipv6_options *opt, unsigned short len,
-              int redo, struct inet6_protocol *protocol);
+int icmpv6_rcv(struct sk_buff *skb, unsigned long len);
 
 static struct inet6_protocol icmpv6_protocol = 
 {
@@ -80,8+79,6 @@ static struct inet6_protocol icmpv6_protocol =
        "ICMPv6"                /* name                 */
 };
 
-
-
 struct icmpv6_msg {
        struct icmp6hdr         icmph;
        __u8                    *data;
@@ -105,8+102,11 @@ static int icmpv6_getfrag(const void *data, struct in6_addr *saddr,
 
        /* 
         *      in theory offset must be 0 since we never send more 
-        *      than 576 bytes on an error or more than the path mtu
+        *      than IPV6_MIN_MTU bytes on an error or more than the path mtu
         *      on an echo reply. (those are the rules on RFC 1883)
+        *
+        *      Luckily, this statement is obsolete after
+        *      draft-ietf-ipngwg-icmp-v2-00           --ANK (980730)
         */
 
        if (offset) {
@@ -143,13+143,36 @@ void icmpv6_param_prob(struct sk_buff *skb, int code, void *pos)
        kfree_skb(skb);
 }
 
-static inline int is_icmp(struct ipv6hdr *hdr, int len)
+/*
+ * Decide whether we must NOT answer this packet with an ICMPv6 error.
+ * "hdr" is the offending packet's IPv6 header and "len" the number of
+ * bytes available starting at "hdr". Returns non-zero when no error
+ * may be sent.
+ *
+ * We do not reply if:
+ *     - the packet is shorter than an IPv6 header;
+ *     - it is itself an ICMPv6 error message;
+ *     - it is known to be ICMPv6 but is truncated before the ICMPv6
+ *       type byte (i.e. in the middle of some exthdr), so the type
+ *       cannot be read;
+ *     - it is not the first fragment. BTW the IPv6 specs say nothing
+ *       about this case, but it is clear that our reply would be
+ *       useless to the sender.
+ *
+ *     --ANK (980726)
+ */
+
+static int is_ineligible(struct ipv6hdr *hdr, int len)
 {
-       __u8 nexthdr = hdr->nexthdr; 
+       u8 *ptr;
+       __u8 nexthdr = hdr->nexthdr;
+
+       if (len < (int)sizeof(*hdr))
+               return 1;
 
-       if (!ipv6_skip_exthdr((struct ipv6_opt_hdr *)(hdr+1), &nexthdr, len))
-               return 0; 
-       return nexthdr == IPPROTO_ICMP; 
+       ptr = ipv6_skip_exthdr((struct ipv6_opt_hdr *)(hdr+1), &nexthdr, len - sizeof(*hdr));
+       if (!ptr)
+               return 0;
+       if (nexthdr == IPPROTO_ICMPV6) {
+               struct icmp6hdr *ihdr = (struct icmp6hdr *)ptr;
+
+               /*
+                * ptr may point at or beyond the end of the data; the type
+                * byte is readable only while ptr - hdr < len, hence ">=".
+                * Types without bit 0x80 are error messages.
+                */
+               return (ptr - (u8*)hdr) >= len || !(ihdr->icmp6_type & 0x80);
+       }
+       /* ipv6_skip_exthdr() stops on a fragment header only for
+        * non-first fragments.
+        */
+       return nexthdr == NEXTHDR_FRAGMENT;
 }
 
 int sysctl_icmpv6_time = 1*HZ; 
@@ -160,31+183,37 @@ int sysctl_icmpv6_time = 1*HZ;
+/*
+ *     Rate limiter for outgoing ICMPv6 messages. Returns non-zero if a
+ *     message of "type" may be sent for flow "fl".
+ *
+ *     Informational types (bit 0x80 set) and ICMPV6_PKT_TOOBIG always
+ *     pass. Everything else needs an output route: if the lookup reports
+ *     dst->error, Ip6OutNoRoutes is counted and 0 is returned; otherwise
+ *     the result of xrlim_allow() is returned.
+ */
 static inline int icmpv6_xrlim_allow(struct sock *sk, int type,
                                     struct flowi *fl)
 {
-#if 0
-       struct dst_entry *dst; 
-       int allow = 0;
-#endif
+       struct dst_entry *dst;
+       int res = 0;
+
        /* Informational messages are not limited. */
        if (type & 0x80)
-               return 1; 
+               return 1;
 
-#if 0 /* not yet, first fix routing COW */
+       /* Do not limit pmtu discovery, it would break it. */
+       if (type == ICMPV6_PKT_TOOBIG)
+               return 1;
 
        /* 
         * Look up the output route.
         * XXX: perhaps the expire for routing entries cloned by
         * this lookup should be more aggressive (not longer than timeout).
         */
-       dst = ip6_route_output(sk, fl, 1);
-       if (dst->error) 
+       dst = ip6_route_output(sk, fl);
+       if (dst->error)
                ipv6_statistics.Ip6OutNoRoutes++;
-       else 
-               allow = xrlim_allow(dst, sysctl_icmpv6_time);
+       else {
+               struct rt6_info *rt = (struct rt6_info *)dst;
+               int tmo = sysctl_icmpv6_time;
+
+               /* Give more bandwidth to wider prefixes. */
+               /* (128 - plen)>>5 counts the whole 32-bit words by which
+                * the prefix is shorter than 128 bits; the timeout is
+                * halved once per such word.
+                */
+               if (rt->rt6i_dst.plen < 128)
+                       tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
+
+               res = xrlim_allow(dst, tmo);
+       }
        dst_release(dst);
-       return allow;
-#else
-       return 1;
-#endif
+       return res;
 }
 
 /*
@@ -196,7+225,7 @@ static inline int icmpv6_xrlim_allow(struct sock *sk, int type,
 
 static __inline__ int opt_unrec(struct sk_buff *skb, __u32 offset)
 {
-       char *buff = skb->nh.raw;
+       u8 *buff = skb->nh.raw;
 
+       /* non-zero when the two top bits of the byte at nh.raw + offset
+        * are binary 10
+        */
        return ( ( *(buff + offset) & 0xC0 ) == 0x80 );
 }
@@ -215,7+244,6 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
        struct icmpv6_msg msg;
        struct flowi fl;
        int addr_type = 0;
-       int optlen;
        int len;
 
        /*
@@ -237,7+265,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
        
        addr_type = ipv6_addr_type(&hdr->daddr);
 
-       if (ipv6_chk_addr(&hdr->daddr, NULL, 0))
+       if (ipv6_chk_addr(&hdr->daddr, skb->dev, 0))
                saddr = &hdr->daddr;
 
        /*
@@ -275,8+303,9 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
        /* 
         *      Never answer to a ICMP packet.
         */
-       if (is_icmp(hdr, (u8*)skb->tail - (u8*)hdr)) {
-               printk(KERN_DEBUG "icmpv6_send: no reply to icmp\n"); 
+       if (is_ineligible(hdr, (u8*)skb->tail - (u8*)hdr)) {
+               if (net_ratelimit())
+                       printk(KERN_DEBUG "icmpv6_send: no reply to icmp error/fragment\n"); 
                return;
        }
 
@@ -303,34+332,22 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
        msg.data = skb->nh.raw;
        msg.csum = 0;
        msg.daddr = &hdr->saddr;
-        /*
-       if (skb->opt)
-               optlen = skb->opt->optlen;
-       else
-       */
-
-       optlen = 0;
 
-       len = min(skb->tail - ((unsigned char *) hdr), 
-                 576 - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr)
-                 - optlen);
+       len = min((skb->tail - ((unsigned char *) hdr)) + sizeof(struct icmp6hdr), 
+                 IPV6_MIN_MTU - sizeof(struct icmp6hdr));
 
        if (len < 0) {
                printk(KERN_DEBUG "icmp: len problem\n");
                return;
        }
 
-       len += sizeof(struct icmp6hdr);
-
        msg.len = len;
 
        ip6_build_xmit(sk, icmpv6_getfrag, &msg, &fl, len, NULL, -1,
                       MSG_DONTWAIT);
-
-       /* Oops! We must purge cached dst, otherwise
-          all the following ICMP messages will go there :) --ANK
-        */
-       dst_release(xchg(&sk->dst_cache, NULL));
+       if (type >= ICMPV6_DEST_UNREACH && type <= ICMPV6_PARAMPROB)
+               (&icmpv6_statistics.Icmp6OutDestUnreachs)[type-1]++;
+       icmpv6_statistics.Icmp6OutMsgs++;
 }
 
 static void icmpv6_echo_reply(struct sk_buff *skb)
@@ -374,38+391,41 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
 
        ip6_build_xmit(sk, icmpv6_getfrag, &msg, &fl, len, NULL, -1,
                       MSG_DONTWAIT);
-
-       /* Oops! We must purge cached dst, otherwise
-          all the following ICMP messages will go there :) --ANK
-        */
-       dst_release(xchg(&sk->dst_cache, NULL));
+       icmpv6_statistics.Icmp6OutEchoReplies++;
+       icmpv6_statistics.Icmp6OutMsgs++;
 }
 
 static void icmpv6_notify(struct sk_buff *skb,
-                         int type, int code, unsigned char *buff, int len,
-                         struct in6_addr *saddr, struct in6_addr *daddr, 
-                         struct inet6_protocol *protocol)
+                         int type, int code, unsigned char *buff, int len)
 {
+       struct in6_addr *saddr = &skb->nh.ipv6h->saddr;
+       struct in6_addr *daddr = &skb->nh.ipv6h->daddr;
        struct ipv6hdr *hdr = (struct ipv6hdr *) buff;
        struct inet6_protocol *ipprot;
        struct sock *sk;
-       struct ipv6_opt_hdr *pb;
+       u8 *pb;
        __u32 info = 0;
        int hash;
        u8 nexthdr;
 
        nexthdr = hdr->nexthdr;
 
-       pb = (struct ipv6_opt_hdr *) (hdr + 1);
        len -= sizeof(struct ipv6hdr);
        if (len < 0)
                return;
 
        /* now skip over extension headers */
-       pb = ipv6_skip_exthdr(pb, &nexthdr, len);
+       pb = ipv6_skip_exthdr((struct ipv6_opt_hdr *) (hdr + 1), &nexthdr, len);
        if (!pb)
                return;
 
+       /* BUGGG_FUTURE: we should try to parse exthdrs in this packet.
+          Without this we will not be able, e.g., to do source-routed
+          pmtu discovery.
+          Corresponding argument (opt) to notifiers is already added.
+          --ANK (980726)
+        */
+
        hash = nexthdr & (MAX_INET_PROTOS - 1);
 
        for (ipprot = (struct inet6_protocol *) inet6_protos[hash]; 
@@ -414,9+434,8 @@ static void icmpv6_notify(struct sk_buff *skb,
                if (ipprot->protocol != nexthdr)
                        continue;
 
-               if (ipprot->err_handler) 
-                       ipprot->err_handler(skb, type, code, (u8*)pb, info,
-                                           saddr, daddr, ipprot);
+               if (ipprot->err_handler)
+                       ipprot->err_handler(skb, hdr, NULL, type, code, pb, info);
                return;
        }
 
@@ -428,7+447,7 @@ static void icmpv6_notify(struct sk_buff *skb,
                return;
 
        while((sk = raw_v6_lookup(sk, nexthdr, daddr, saddr))) {
-               rawv6_err(sk, type, code, (char*)pb, saddr, daddr);
+               rawv6_err(sk, skb, hdr, NULL, type, code, pb, info);
                sk = sk->next;
        }
 }
@@ -437,14+456,17 @@ static void icmpv6_notify(struct sk_buff *skb,
  *     Handle icmp messages
  */
 
-int icmpv6_rcv(struct sk_buff *skb, struct device *dev,
-              struct in6_addr *saddr, struct in6_addr *daddr,
-              struct ipv6_options *opt, unsigned short len,
-              int redo, struct inet6_protocol *protocol)
+int icmpv6_rcv(struct sk_buff *skb, unsigned long len)
 {
+       struct device *dev = skb->dev;
+       struct in6_addr *saddr = &skb->nh.ipv6h->saddr;
+       struct in6_addr *daddr = &skb->nh.ipv6h->daddr;
        struct ipv6hdr *orig_hdr;
        struct icmp6hdr *hdr = (struct icmp6hdr *) skb->h.raw;
        int ulen;
+       int type;
+
+       icmpv6_statistics.Icmp6InMsgs++;
 
        /* Perform checksum. */
        switch (skb->ip_summed) {       
@@ -480,8+502,15 @@ int icmpv6_rcv(struct sk_buff *skb, struct device *dev,
         *      length of original packet carried in skb
         */
        ulen = skb->tail - (unsigned char *) (hdr + 1);
-       
-       switch (hdr->icmp6_type) {
+
+       type = hdr->icmp6_type;
+
+       if (type >= ICMPV6_DEST_UNREACH && type <= ICMPV6_PARAMPROB)
+               (&icmpv6_statistics.Icmp6InDestUnreachs)[type-ICMPV6_DEST_UNREACH]++;
+       else if (type >= ICMPV6_ECHO_REQUEST && type <= NDISC_REDIRECT)
+               (&icmpv6_statistics.Icmp6InEchos)[type-ICMPV6_ECHO_REQUEST]++;
+
+       switch (type) {
 
        case ICMPV6_ECHO_REQUEST:
                icmpv6_echo_reply(skb);
@@ -492,9+521,14 @@ int icmpv6_rcv(struct sk_buff *skb, struct device *dev,
                break;
 
        case ICMPV6_PKT_TOOBIG:
+               /* BUGGG_FUTURE: if packet contains rthdr, we cannot update
+                  standard destination cache. Seems, only "advanced"
+                  destination cache will allow to solve this problem
+                  --ANK (980726)
+                */
                orig_hdr = (struct ipv6hdr *) (hdr + 1);
                if (ulen >= sizeof(struct ipv6hdr))
-                       rt6_pmtu_discovery(&orig_hdr->daddr, dev,
+                       rt6_pmtu_discovery(&orig_hdr->daddr, &orig_hdr->saddr, dev,
                                           ntohl(hdr->icmp6_mtu));
 
                /*
@@ -504,10+538,8 @@ int icmpv6_rcv(struct sk_buff *skb, struct device *dev,
        case ICMPV6_DEST_UNREACH:
        case ICMPV6_TIME_EXCEED:
        case ICMPV6_PARAMPROB:
-
-               icmpv6_notify(skb, hdr->icmp6_type, hdr->icmp6_code,
-                             (char *) (hdr + 1), ulen,
-                             saddr, daddr, protocol);
+               icmpv6_notify(skb, type, hdr->icmp6_code,
+                             (char *) (hdr + 1), ulen);
                break;
 
        case NDISC_ROUTER_SOLICITATION:
@@ -515,7+547,7 @@ int icmpv6_rcv(struct sk_buff *skb, struct device *dev,
        case NDISC_NEIGHBOUR_SOLICITATION:
        case NDISC_NEIGHBOUR_ADVERTISEMENT:
        case NDISC_REDIRECT:
-               ndisc_rcv(skb, dev, saddr, daddr, opt, len);            
+               ndisc_rcv(skb, len);
                break;
 
        case ICMPV6_MGM_QUERY:
@@ -530,23+562,26 @@ int icmpv6_rcv(struct sk_buff *skb, struct device *dev,
                break;
 
        default:
-               printk(KERN_DEBUG "icmpv6: msg of unkown type\n");
+               if (net_ratelimit())
+                       printk(KERN_DEBUG "icmpv6: msg of unkown type\n");
                
                /* informational */
-               if (hdr->icmp6_type & 0x80)
-                       goto discard_it;
+               if (type & 0x80)
+                       break;
 
                /* 
                 * error of unkown type. 
                 * must pass to upper level 
                 */
 
-               icmpv6_notify(skb, hdr->icmp6_type, hdr->icmp6_code,
-                             (char *) (hdr + 1), ulen,
-                             saddr, daddr, protocol);  
+               icmpv6_notify(skb, type, hdr->icmp6_code,
+                             (char *) (hdr + 1), ulen);
        };
+       kfree_skb(skb);
+       return 0;
 
 discard_it:
+       icmpv6_statistics.Icmp6InErrors++;
        kfree_skb(skb);
        return 0;
 }
@@ -597,7+632,7 @@ static struct icmp6_err {
 } tab_unreach[] = {
        { ENETUNREACH,  0},     /* NOROUTE              */
        { EACCES,       1},     /* ADM_PROHIBITED       */
-       { EOPNOTSUPP,   1},     /* NOT_NEIGHBOUR        */
+       { 0,            0},     /* Was NOT_NEIGHBOUR, now reserved */
        { EHOSTUNREACH, 0},     /* ADDR_UNREACH         */
        { ECONNREFUSED, 1},     /* PORT_UNREACH         */
 };
index e7e12e3..bad3a13 100644 (file)
@@ -5,7+5,7 @@
  *     Authors:
  *     Pedro Roque             <roque@di.fc.ul.pt>     
  *
- *     $Id: ip6_fib.c,v 1.14 1998/05/07 15:43:03 davem Exp $
+ *     $Id: ip6_fib.c,v 1.15 1998/08/26 12:04:55 davem Exp $
  *
  *     This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
 #include <net/ip6_fib.h>
 #include <net/ip6_route.h>
 
-#define RT_DEBUG 2
+#define RT6_DEBUG 2
+#undef CONFIG_IPV6_SUBTREES
+
+#if RT6_DEBUG >= 1
+#define BUG_TRAP(x) ({ if (!(x)) { printk("Assertion (" #x ") failed at " __FILE__ "(%d):" __FUNCTION__ "\n", __LINE__); } })
+#else
+#define BUG_TRAP(x) do { ; } while (0)
+#endif
+
+#if RT6_DEBUG >= 3
+#define RT6_TRACE(x...) printk(KERN_DEBUG x)
+#else
+#define RT6_TRACE(x...) do { ; } while (0)
+#endif
 
 struct rt6_statistics  rt6_stats;
 
+/*
+ *     States of a tree walker; FWS_INIT (defined below) names the start
+ *     state. FWS_S exists only when CONFIG_IPV6_SUBTREES is defined.
+ *     NOTE(review): the letters presumably stand for Subtree, Left,
+ *     Right, Current and Up -- confirm against the walker code.
+ */
+enum fib_walk_state_t
+{
+#ifdef CONFIG_IPV6_SUBTREES
+       FWS_S,
+#endif
+       FWS_L,
+       FWS_R,
+       FWS_C,
+       FWS_U
+};
+
+/*
+ *     A tree walker (embedded as the first member) bundled with a
+ *     per-route callback and the opaque argument handed to it.
+ *     NOTE(review): the meaning of func's return value is not visible
+ *     here -- see the code that invokes it.
+ */
+struct fib6_cleaner_t
+{
+       struct fib6_walker_t w;
+       int (*func)(struct rt6_info *, void *arg);
+       void *arg;
+};
+
+#ifdef CONFIG_IPV6_SUBTREES
+#define FWS_INIT FWS_S
+#define SUBTREE(fn) ((fn)->subtree)
+#else
+#define FWS_INIT FWS_L
+#define SUBTREE(fn) NULL
+#endif
+
+static void fib6_prune_clones(struct fib6_node *fn, struct rt6_info *rt);
+static void fib6_repair_tree(struct fib6_node *fn);
+
 /*
  *     A routing update causes an increase of the serial number on the
  *     afected subtree. This allows for cached routes to be asynchronously
@@ -48,10+90,24 @@ static __u32        rt_sernum       = 0;
 static struct timer_list ip6_fib_timer = {
        NULL, NULL,
        0,
-       0,
+       ~0UL,
        fib6_run_gc
 };
 
+static struct fib6_walker_t fib6_walker_list = {
+       &fib6_walker_list, &fib6_walker_list, 
+};
+
+#define FOR_WALKERS(w) for ((w)=fib6_walker_list.next; (w) != &fib6_walker_list; (w)=(w)->next)
+
+/*
+ *     Hand out the next route serial number. rt_sernum is pre-incremented
+ *     and, should it wrap to 0, incremented once more, so 0 is never
+ *     returned.
+ */
+static __inline__ u32 fib6_new_sernum(void)
+{
+       u32 n = ++rt_sernum;
+       if (n == 0)
+               n = ++rt_sernum;
+       return n;
+}
+
 /*
  *     Auxiliary address test functions for the radix tree.
  *
@@ -70,7+126,7 @@ static __inline__ int addr_match(void *token1, void *token2, int prefixlen)
        int pdw;
        int pbi;
 
-       pdw = prefixlen >> 0x05;  /* num of whole __u32 in prefix */
+       pdw = prefixlen >> 5;     /* num of whole __u32 in prefix */
        pbi = prefixlen &  0x1f;  /* num of bits in incomplete u32 in prefix */
 
        if (pdw)
@@ -78,15+134,11 @@ static __inline__ int addr_match(void *token1, void *token2, int prefixlen)
                        return 0;
 
        if (pbi) {
-               __u32 w1, w2;
                __u32 mask;
 
-               w1 = a1[pdw];
-               w2 = a2[pdw];
-
-               mask = htonl((0xffffffff) << (0x20 - pbi));
+               mask = htonl((0xffffffff) << (32 - pbi));
 
-               if ((w1 ^ w2) & mask)
+               if ((a1[pdw] ^ a2[pdw]) & mask)
                        return 0;
        }
 
@@ -99,24+151,11 @@ static __inline__ int addr_match(void *token1, void *token2, int prefixlen)
 
 static __inline__ int addr_bit_set(void *token, int fn_bit)
 {
-       int dw;
-       __u32 b1;
-       __u32 mask;
-       int bit = fn_bit;
        __u32 *addr = token;
 
-       dw = bit >> 0x05;
-
-       b1 = addr[dw];
-       
-       bit = ~bit;
-       bit &= 0x1f;
-       mask = htonl(1 << bit);
-       return (b1 & mask);
+       /*
+        * Word fn_bit>>5 holds the bit; (~fn_bit)&0x1F equals
+        * 31 - (fn_bit & 31), so bit 0 is the most significant bit of a
+        * word. htonl() converts the mask to network byte order
+        * (presumably the order the address words are stored in).
+        * The result is non-zero, not necessarily 1, when the bit is set.
+        */
+       return htonl(1 << ((~fn_bit)&0x1F)) & addr[fn_bit>>5];
 }
 
-
-
 /*
  *     find the first different bit between two addresses
  *     length of address must be a multiple of 32bits
@@ -131,42+170,47 @@ static __inline__ int addr_diff(void *token1, void *token2, int addrlen)
        addrlen >>= 2;
 
        for (i = 0; i < addrlen; i++) {
-               __u32 b1, b2;
                __u32 xb;
 
-               b1 = a1[i];
-               b2 = a2[i];
-
-               xb = b1 ^ b2;
+               xb = a1[i] ^ a2[i];
 
                if (xb) {
-                       int res = 0;
-                       int j=31;
+                       int j = 31;
 
                        xb = ntohl(xb);
 
-                       while (test_bit(j, &xb) == 0) {
-                               res++;
+                       while (test_bit(j, &xb) == 0)
                                j--;
-                       }
 
-                       return (i * 32 + res);
+                       return (i * 32 + 31 - j);
                }
        }
 
        /*
         *      we should *never* get to this point since that 
         *      would mean the addrs are equal
+        *
+        *      However, we do get to it 8) And exactly when
+        *      the addresses are equal 8)
+        *
+        *      ip route add 1111::/128 via ...
+        *      ip route add 1111::/64 via ...
+        *      and we are here.
+        *
+        *      Ideally, this function should stop comparison
+        *      at prefix length. It does not, but it is still OK,
+        *      if returned value is greater than prefix length.
+        *                                      --ANK (980803)
         */
 
-       return -1;
+       return addrlen<<5;
 }
 
 static __inline__ struct fib6_node * node_alloc(void)
 {
        struct fib6_node *fn;
 
-       if ((fn = kmalloc(sizeof(struct fib6_node), GFP_ATOMIC))) {
+       if ((fn = kmalloc(sizeof(struct fib6_node), GFP_ATOMIC)) != NULL) {
                memset(fn, 0, sizeof(struct fib6_node));
                rt6_stats.fib_nodes++;
        }
@@ -180,13+224,10 @@ static __inline__ void node_free(struct fib6_node * fn)
        kfree(fn);
 }
 
-extern __inline__ void rt6_release(struct rt6_info *rt)
+/*
+ *     Drop one reference on "rt" (counted in rt6i_ref); the route is
+ *     handed to dst_free() when the count reaches zero.
+ */
+static __inline__ void rt6_release(struct rt6_info *rt)
 {
-       struct dst_entry *dst = (struct dst_entry *) rt;
-       if (atomic_dec_and_test(&dst->refcnt)) {
-               rt->rt6i_node = NULL;
-               dst_free(dst);
-       }
+       if (atomic_dec_and_test(&rt->rt6i_ref))
+               dst_free(&rt->u.dst);
 }
 
 
@@ -200,18+241,16 @@ extern __inline__ void rt6_release(struct rt6_info *rt)
 
 static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr,
                                     int addrlen, int plen,
-                                    unsigned long offset,
-                                    struct rt6_info *rt)
-                                    
+                                    int offset)
 {
-       struct fib6_node *fn;
+       struct fib6_node *fn, *in, *ln;
        struct fib6_node *pn = NULL;
-       struct fib6_node *in;
-       struct fib6_node *ln;
        struct rt6key *key;
-       __u32   bit;
-       __u32   dir = 0;
-       __u32   sernum = ++rt_sernum;
+       int     bit;
+               int     dir = 0;
+       __u32   sernum = fib6_new_sernum();
+
+       RT6_TRACE("fib6_add_1\n");
 
        /* insert node in tree */
 
@@ -220,146+259,143 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr,
        if (plen == 0)
                return fn;
 
-       for (;;) {
-               if (fn == NULL) {
-                       ln = node_alloc();
-
-                       if (ln == NULL)
-                               return NULL;
-                       ln->fn_bit = plen;
-                       
-                       ln->parent = pn;
-                       ln->fn_sernum = sernum;
-                       rt->rt6i_node = ln;
-
-                       if (dir)
-                               pn->right = ln;
-                       else
-                               pn->left  = ln;
-
-                       return ln;
-               }
-
+       do {
                key = (struct rt6key *)((u8 *)fn->leaf + offset);
 
                /*
                 *      Prefix match
                 */
-               if (addr_match(&key->addr, addr, fn->fn_bit)) {
+               if (plen < fn->fn_bit ||
+                   !addr_match(&key->addr, addr, fn->fn_bit))
+                       goto insert_above;
                
-                       /*
-                        *      Exact match ?
-                        */
+               /*
+                *      Exact match ?
+                */
                         
-                       if (plen == fn->fn_bit) {
-                               /* clean up an intermediate node */
-                               if ((fn->fn_flags & RTN_RTINFO) == 0) {
-                                       rt6_release(fn->leaf);
-                                       fn->leaf = NULL;
-                               }
+               if (plen == fn->fn_bit) {
+                       /* clean up an intermediate node */
+                       if ((fn->fn_flags & RTN_RTINFO) == 0) {
+                               rt6_release(fn->leaf);
+                               fn->leaf = NULL;
+                       }
                        
-                               fn->fn_sernum = sernum;
+                       fn->fn_sernum = sernum;
                                
-                               return fn;
-                       }
-
-                       /*
-                        *      We have more bits to go
-                        */
-                        
-                       if (plen > fn->fn_bit) {
-                               /* Walk down on tree. */
-                               fn->fn_sernum = sernum;
-                               dir = addr_bit_set(addr, fn->fn_bit);
-                               pn = fn;
-                               fn = dir ? fn->right: fn->left;
-
-                               /*
-                                *      Round we go. Note if fn has become
-                                *      NULL then dir is set and fn is handled
-                                *      top of loop.
-                                */
-                               continue;
-                       }
+                       return fn;
                }
 
                /*
-                * split since we don't have a common prefix anymore or 
-                * we have a less significant route.
-                * we've to insert an intermediate node on the list
-                * this new node will point to the one we need to create
-                * and the current
+                *      We have more bits to go
                 */
+                        
+               /* Try to walk down on tree. */
+               fn->fn_sernum = sernum;
+               dir = addr_bit_set(addr, fn->fn_bit);
+               pn = fn;
+               fn = dir ? fn->right: fn->left;
+       } while (fn);
 
-               pn = fn->parent;
+       /*
+        *      We walked to the bottom of the tree.
+        *      Create new leaf node without children.
+        */
 
-               /* find 1st bit in difference between the 2 addrs */
-               bit = addr_diff(addr, &key->addr, addrlen);
+       ln = node_alloc();
 
+       if (ln == NULL)
+               return NULL;
+       ln->fn_bit = plen;
+                       
+       ln->parent = pn;
+       ln->fn_sernum = sernum;
 
-               /* 
-                *              (intermediate)  
-                *                /        \
-                *      (new leaf node)    (old node)
-                */
-               if (plen > bit) {
-                       in = node_alloc();
-               
-                       if (in == NULL)
-                               return NULL;
-
-                       /* 
-                        * new intermediate node. 
-                        * RTN_RTINFO will
-                        * be off since that an address that chooses one of
-                        * the branches would not match less specific routes
-                        * int the other branch
-                        */
+       if (dir)
+               pn->right = ln;
+       else
+               pn->left  = ln;
+
+       return ln;
 
-                       in->fn_bit = bit;
 
-                       in->parent = pn;
-                       in->leaf = rt;
+insert_above:
+       /*
+        * split since we don't have a common prefix anymore or 
+        * we have a less significant route.
+        * we've to insert an intermediate node on the list
+        * this new node will point to the one we need to create
+        * and the current
+        */
+
+       pn = fn->parent;
 
-                       in->fn_sernum = sernum;
-                       atomic_inc(&rt->rt6i_ref);
+       /* find 1st bit in difference between the 2 addrs.
 
-                       /* leaf node */
-                       ln = node_alloc();
+          See comment in addr_diff: bit may be an invalid value,
+          but if it is >= plen, the value is ignored in any case.
+        */
+       
+       bit = addr_diff(addr, &key->addr, addrlen);
 
-                       if (ln == NULL) {
+       /* 
+        *              (intermediate)[in]      
+        *                /        \
+        *      (new leaf node)[ln] (old node)[fn]
+        */
+       if (plen > bit) {
+               in = node_alloc();
+               ln = node_alloc();
+               
+               if (in == NULL || ln == NULL) {
+                       if (in)
                                node_free(in);
-                               return NULL;
-                       }
+                       if (ln)
+                               node_free(ln);
+                       return NULL;
+               }
+
+               /* 
+                * new intermediate node. 
+                * RTN_RTINFO will
+                * be off since that an address that chooses one of
+                * the branches would not match less specific routes
+                * in the other branch
+                */
 
-                       /* update parent pointer */
-                       if (dir)
-                               pn->right = in;
-                       else
-                               pn->left  = in;
+               in->fn_bit = bit;
 
-                       ln->fn_bit = plen;
+               in->parent = pn;
+               in->leaf = fn->leaf;
+               atomic_inc(&in->leaf->rt6i_ref);
 
-                       ln->parent = in;
-                       fn->parent = in;
+               in->fn_sernum = sernum;
 
-                       ln->fn_sernum = sernum;
+               /* update parent pointer */
+               if (dir)
+                       pn->right = in;
+               else
+                       pn->left  = in;
 
-                       if (addr_bit_set(addr, bit)) {
-                               in->right = ln;
-                               in->left  = fn;
-                       } else {
-                               in->left  = ln;
-                               in->right = fn;
-                       }
+               ln->fn_bit = plen;
+
+               ln->parent = in;
+               fn->parent = in;
+
+               ln->fn_sernum = sernum;
 
-                       return ln;
+               if (addr_bit_set(addr, bit)) {
+                       in->right = ln;
+                       in->left  = fn;
+               } else {
+                       in->left  = ln;
+                       in->right = fn;
                }
+       } else { /* plen <= bit */
 
                /* 
-                *              (new leaf node)
+                *              (new leaf node)[ln]
                 *                /        \
-                *           (old node)    NULL
+                *           (old node)[fn] NULL
                 */
 
                ln = node_alloc();
@@ -377,7+413,6 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr,
                        pn->right = ln;
                else
                        pn->left  = ln;
-               
 
                if (addr_bit_set(&key->addr, plen))
                        ln->right = fn;
@@ -385,11+420,8 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr,
                        ln->left  = fn;
 
                fn->parent = ln;
-
-               return ln;
        }
-
-       return NULL;
+       return ln;
 }
 
 /*
@@ -401,7+433,6 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt)
        struct rt6_info *iter = NULL;
        struct rt6_info **ins;
 
-       rt->rt6i_node = fn;
        ins = &fn->leaf;
 
        for (iter = fn->leaf; iter; iter=iter->u.next) {
@@ -423,7+454,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt)
                                iter->rt6i_expires = rt->rt6i_expires;
                                if (!(rt->rt6i_flags&RTF_EXPIRES)) {
                                        iter->rt6i_flags &= ~RTF_EXPIRES;
-                                       iter->rt6i_expires = rt->rt6i_expires;
+                                       iter->rt6i_expires = 0;
                                }
                                return -EEXIST;
                        }
@@ -439,8+470,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt)
         *      insert node
         */
 
-       *ins = rt;
        rt->u.next = iter;
+       *ins = rt;
+       rt->rt6i_node = fn;
        atomic_inc(&rt->rt6i_ref);
 #ifdef CONFIG_RTNETLINK
        inet6_rt_notify(RTM_NEWROUTE, rt);
@@ -457,8+489,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt)
 
 static __inline__ void fib6_start_gc(struct rt6_info *rt)
 {
-       if ((ip6_fib_timer.expires == 0) &&
-           (rt->rt6i_flags & (RTF_ADDRCONF | RTF_CACHE))) {
+       if (ip6_fib_timer.expires == 0 &&
+           (rt->rt6i_flags & (RTF_EXPIRES|RTF_CACHE))) {
                del_timer(&ip6_fib_timer);
                ip6_fib_timer.expires = jiffies + ip6_rt_gc_interval;
                add_timer(&ip6_fib_timer);
@@ -475,67+507,110 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt)
 {
+       /*
+        *      Link rt into the tree under root.  The node for the
+        *      destination prefix comes from fib6_add_1(); with
+        *      CONFIG_IPV6_SUBTREES and a non-zero source prefix length
+        *      the route goes into a node of fn's source subtree, which
+        *      is created on first use.  Returns 0 or a negative errno;
+        *      every failure path passes rt to dst_free().
+        */
        struct fib6_node *fn;
        int err = -ENOMEM;
-       unsigned long offset;
-       
-       offset = (u8*) &rt->rt6i_dst - (u8*) rt;
+
        fn = fib6_add_1(root, &rt->rt6i_dst.addr, sizeof(struct in6_addr),
-                       rt->rt6i_dst.plen, offset, rt);
+                       rt->rt6i_dst.plen, (u8*) &rt->rt6i_dst - (u8*) rt);
 
-       if (fn == NULL) {
-#if RT_DEBUG >= 2
-               printk(KERN_DEBUG "fib6_add: fn == NULL\n");
-#endif
-               goto out;
-       }
+       if (fn == NULL) {
+               /* Free rt here as well, as every other failure path does */
+               dst_free(&rt->u.dst);
+               return -ENOMEM;
+       }
 
+#ifdef CONFIG_IPV6_SUBTREES
        if (rt->rt6i_src.plen) {
                struct fib6_node *sn;
 
-#if RT_DEBUG >= 2
-               printk(KERN_DEBUG "fib6_add: src.len > 0\n");
-#endif
-
                if (fn->subtree == NULL) {
                        struct fib6_node *sfn;
 
-                       if (fn->leaf == NULL) {
-                               fn->leaf = rt;
-                               atomic_inc(&rt->rt6i_ref);
-                       }
+                       /*
+                        * Create subtree.
+                        *
+                        *              fn[main tree]
+                        *              |
+                        *              sfn[subtree root]
+                        *                 \
+                        *                  sn[new leaf node]
+                        */
 
+                       /* Create subtree root node */
                        sfn = node_alloc();
-
                        if (sfn == NULL)
-                               goto out;
+                               goto st_failure;
 
-                       sfn->parent = fn;
                        sfn->leaf = &ip6_null_entry;
+                       atomic_inc(&ip6_null_entry.rt6i_ref);
                        sfn->fn_flags = RTN_ROOT;
-                       sfn->fn_sernum = ++rt_sernum;
+                       sfn->fn_sernum = fib6_new_sernum();
 
-                       fn->subtree = sfn;
-               }
+                       /* Now add the first leaf node to new subtree */
 
-               offset = (u8*) &rt->rt6i_src - (u8*) rt;
+                       sn = fib6_add_1(sfn, &rt->rt6i_src.addr,
+                                       sizeof(struct in6_addr), rt->rt6i_src.plen,
+                                       (u8*) &rt->rt6i_src - (u8*) rt);
 
-               sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr,
-                               sizeof(struct in6_addr), rt->rt6i_src.plen,
-                               offset, rt);
+                       if (sn == NULL) {
+                               /* If that failed, discard the just allocated
+                                  root, and then (in st_failure) the stale node
+                                  in the main tree.
+                                */
+                               node_free(sfn);
+                               goto st_failure;
+                       }
 
-               if (sn == NULL)
-                       goto out;
+                       /* Now link new subtree to main tree */
+                       sfn->parent = fn;
+                       fn->subtree = sfn;
+                       if (fn->leaf == NULL) {
+                               fn->leaf = rt;
+                               atomic_inc(&rt->rt6i_ref);
+                       }
+               } else {
+                       sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr,
+                                       sizeof(struct in6_addr), rt->rt6i_src.plen,
+                                       (u8*) &rt->rt6i_src - (u8*) rt);
+
+                       if (sn == NULL)
+                               goto st_failure;
+               }
 
                fn = sn;
        }
+#endif
 
        err = fib6_add_rt2node(fn, rt);
 
-       if (err == 0)
+       if (err == 0) {
                fib6_start_gc(rt);
-out:
+               if (!(rt->rt6i_flags&RTF_CACHE))
+                       fib6_prune_clones(fn, rt);
+       }
+
        if (err)
                dst_free(&rt->u.dst);
        return err;
+
+#ifdef CONFIG_IPV6_SUBTREES
+       /* Subtree creation failed, so the main tree node may be an
+          orphan. If it carries no routes (no RTN_RTINFO) and is not
+          a root (no RTN_ROOT), remove it. Both flags are tested with
+          one parenthesized mask: '&' binds tighter than '|'.
+        */
+st_failure:
+       if (fn && !(fn->fn_flags&(RTN_RTINFO|RTN_ROOT)))
+               fib6_repair_tree(fn);
+       dst_free(&rt->u.dst);
+       return err;
+#endif
 }
 
 
 /*
@@ -544,7+606,7 @@ out:
  */
 
 struct lookup_args {
-       unsigned long   offset;         /* key offset on rt6_info       */
+       int             offset;         /* key offset on rt6_info       */
        struct in6_addr *addr;          /* search key                   */
 };
 
@@ -576,6+638,7 @@ static struct fib6_node * fib6_lookup_1(struct fib6_node *root,
        }
 
        while ((fn->fn_flags & RTN_ROOT) == 0) {
+#ifdef CONFIG_IPV6_SUBTREES
                if (fn->subtree) {
                        struct fib6_node *st;
                        struct lookup_args *narg;
@@ -591,6+654,7 @@ static struct fib6_node * fib6_lookup_1(struct fib6_node *root,
                                }
                        }
                }
+#endif
 
                if (fn->fn_flags & RTN_RTINFO) {
                        struct rt6key *key;
@@ -618,8+682,10 @@ struct fib6_node * fib6_lookup(struct fib6_node *root, struct in6_addr *daddr,
        args[0].offset = (u8*) &rt->rt6i_dst - (u8*) rt;
        args[0].addr = daddr;
 
+#ifdef CONFIG_IPV6_SUBTREES
        args[1].offset = (u8*) &rt->rt6i_src - (u8*) rt;
        args[1].addr = saddr;
+#endif
 
        fn = fib6_lookup_1(root, args);
 
@@ -630,12+696,79 @@ struct fib6_node * fib6_lookup(struct fib6_node *root, struct in6_addr *daddr,
 }
 
 /*
+ *     Get node with specified destination prefix (and source prefix,
+ *     if subtrees are used)
+ *
+ *     fib6_locate_1 walks down from root looking for the node whose
+ *     fn_bit equals plen.  At each node the key is read from fn->leaf
+ *     at byte offset `offset`; the walk gives up (returns NULL) as soon
+ *     as plen is shorter than the node's fn_bit or addr does not match
+ *     the key over fn_bit bits.  Otherwise it descends right or left
+ *     according to bit fn_bit of addr.
+ */
+
+
+static struct fib6_node * fib6_locate_1(struct fib6_node *root,
+                                       struct in6_addr *addr,
+                                       int plen, int offset)
+{
+       struct fib6_node *fn;
+
+       for (fn = root; fn ; ) {
+               struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset);
+
+               /*
+                *      Prefix match: stop if this node is already
+                *      more specific than the requested prefix, or
+                *      if addr differs from the node's key
+                */
+               if (plen < fn->fn_bit ||
+                   !addr_match(&key->addr, addr, fn->fn_bit))
+                       return NULL;
+
+               if (plen == fn->fn_bit)
+                       return fn;
+
+               /*
+                *      We have more bits to go
+                */
+               if (addr_bit_set(addr, fn->fn_bit))
+                       fn = fn->right;
+               else
+                       fn = fn->left;
+       }
+       return NULL;
+}
+
+/*
+ *     Find the node holding routes for exactly daddr/dst_len and,
+ *     when CONFIG_IPV6_SUBTREES is set and src_len is non-zero,
+ *     saddr/src_len inside that node's subtree.
+ *
+ *     Returns the node only if it has RTN_RTINFO set, otherwise NULL.
+ */
+struct fib6_node * fib6_locate(struct fib6_node *root,
+                              struct in6_addr *daddr, int dst_len,
+                              struct in6_addr *saddr, int src_len)
+{
+       /* rt stays NULL: it is only used to compute member offsets */
+       struct rt6_info *rt = NULL;
+       struct fib6_node *fn;
+
+       fn = fib6_locate_1(root, daddr, dst_len,
+                          (u8*) &rt->rt6i_dst - (u8*) rt);
+
+#ifdef CONFIG_IPV6_SUBTREES
+       if (src_len) {
+               BUG_TRAP(saddr!=NULL);
+               /* Step into the subtree only when the destination
+                  node was found; a NULL fn must not be dereferenced.
+                */
+               if (fn)
+                       fn = fn->subtree;
+               if (fn)
+                       fn = fib6_locate_1(fn, saddr, src_len,
+                                          (u8*) &rt->rt6i_src - (u8*) rt);
+       }
+#endif
+
+       if (fn && fn->fn_flags&RTN_RTINFO)
+               return fn;
+
+       return NULL;
+}
+
+
+/*
  *     Deletion
  *
  */
 
 static struct rt6_info * fib6_find_prefix(struct fib6_node *fn)
 {
+       if (fn->fn_flags&RTN_ROOT)
+               return &ip6_null_entry;
+
        while(fn) {
                if(fn->left)
                        return fn->left->leaf;
@@ -643,7+776,7 @@ static struct rt6_info * fib6_find_prefix(struct fib6_node *fn)
                if(fn->right)
                        return fn->right->leaf;
 
-               fn = fn->subtree;
+               fn = SUBTREE(fn);
        }
        return NULL;
 }
@@ -653,428+786,414 @@ static struct rt6_info * fib6_find_prefix(struct fib6_node *fn)
  *     is the node we want to try and remove.
  */
 
-static void fib6_del_2(struct fib6_node *fn)
+static void fib6_repair_tree(struct fib6_node *fn)
 {
-       struct rt6_info *rt;
-
-       fn->fn_flags &= ~RTN_RTINFO;
-       rt6_stats.fib_route_nodes--;
+       int children;
+       int nstate;
+       struct fib6_node *child, *pn;
+       struct fib6_walker_t *w;
+       int iter = 0;
 
-       /*
-        *      Can't delete a root node
-        */
-        
-       if (fn->fn_flags & RTN_TL_ROOT)
-               return;
+       for (;;) {
+               RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
+               iter++;
 
-       do {
-               struct fib6_node *pn, *child;
-               int children = 0;
+               BUG_TRAP(!(fn->fn_flags&RTN_RTINFO));
+               BUG_TRAP(!(fn->fn_flags&RTN_TL_ROOT));
+               BUG_TRAP(fn->leaf==NULL);
 
+               children = 0;
                child = NULL;
+               if (fn->right) child = fn->right, children |= 1;
+               if (fn->left) child = fn->left, children |= 2;
 
-               /*
-                *      We have a child to left
-                */
-                
-               if (fn->left) {
-                       children++;
-                       child = fn->left;
-               }
-
-               /*
-                *      To right
-                */
-                
-               if (fn->right) {
-                       children++;
-                       child = fn->right;
-               }
-
-               /*
-                *      We can't tidy a case of two children.
-                */
-               if (children > 1) {
-                       if (fn->leaf == NULL)
-                               goto split_repair;
-                       break;
+               if (children == 3 || SUBTREE(fn) 
+#ifdef CONFIG_IPV6_SUBTREES
+                   /* Subtree root (i.e. fn) may have one child */
+                   || (children && fn->fn_flags&RTN_ROOT)
+#endif
+                   ) {
+                       fn->leaf = fib6_find_prefix(fn);
+#if RT6_DEBUG >= 2
+                       if (fn->leaf==NULL) {
+                               BUG_TRAP(fn->leaf);
+                               fn->leaf = &ip6_null_entry;
+                       }
+#endif
+                       atomic_inc(&fn->leaf->rt6i_ref);
+                       return;
                }
 
-               if (fn->fn_flags & RTN_RTINFO)
-                       break;
-
-               /*
-                *      The node we plan to tidy has an stree. Talk about
-                *      making life hard.
-                */
-                
-               if (fn->subtree)
-                       goto stree_node;
-
-               /*
-                *      Up we go
-                */
-                
                pn = fn->parent;
-
-               /*
-                *      Not a ROOT - we can tidy
-                */
-                
-               if ((fn->fn_flags & RTN_ROOT) == 0) {
-                       /*
-                        *      Make our child our parents child
-                        */
-                       if (pn->left == fn)
-                               pn->left = child;
-                       else
-                               pn->right = child;
-
-                       /*
-                        *      Reparent the child
-                        */
+#ifdef CONFIG_IPV6_SUBTREES
+               if (SUBTREE(pn) == fn) {
+                       BUG_TRAP(fn->fn_flags&RTN_ROOT);
+                       SUBTREE(pn) = NULL;
+                       nstate = FWS_L;
+               } else {
+                       BUG_TRAP(!(fn->fn_flags&RTN_ROOT));
+#endif
+                       if (pn->right == fn) pn->right = child;
+                       else if (pn->left == fn) pn->left = child;
+#if RT6_DEBUG >= 2
+                       else BUG_TRAP(0);
+#endif
                        if (child)
                                child->parent = pn;
+                       nstate = FWS_R;
+#ifdef CONFIG_IPV6_SUBTREES
+               }
+#endif
 
-                       /*
-                        *      Discard leaf entries
-                        */
-                       if (fn->leaf)
-                               rt6_release(fn->leaf);
-               } else {
-                       if (children)
-                               break;
-                       /*
-                        *      No children so no subtree
-                        */
-
-                       pn->subtree = NULL;
+               FOR_WALKERS(w) {
+                       if (child == NULL) {
+                               if (w->root == fn) {
+                                       w->root = w->node = NULL;
+                                       RT6_TRACE("W %p adjusted by delroot 1\n", w);
+                               } else if (w->node == fn) {
+                                       RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate);
+                                       w->node = pn;
+                                       w->state = nstate;
+                               }
+                       } else {
+                               if (w->root == fn) {
+                                       w->root = child;
+                                       RT6_TRACE("W %p adjusted by delroot 2\n", w);
+                               }
+                               if (w->node == fn) {
+                                       w->node = child;
+                                       if (children&2) {
+                                               RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
+                                               w->state = w->state>=FWS_R ? FWS_U : FWS_INIT;
+                                       } else {
+                                               RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
+                                               w->state = w->state>=FWS_C ? FWS_U : FWS_INIT;
+                                       }
+                               }
+                       }
                }
 
-               /*
-                *      We are discarding 
-                */
                node_free(fn);
-               
-               /*
-                *      Our merge of entries might propogate further
-                *      up the tree, so move up a level and retry.
-                */
-                
-               fn = pn;
-
-       } while (!(fn->fn_flags & RTN_TL_ROOT));
-
-       return;
-
-stree_node:
-
-       rt6_release(fn->leaf);
-
-split_repair:
-       rt = fib6_find_prefix(fn);
-
-       if (rt == NULL)
-               panic("fib6_del_2: inconsistent tree\n");
+               if (pn->fn_flags&RTN_RTINFO || SUBTREE(pn))
+                       return;
 
-       atomic_inc(&rt->rt6i_ref);
-       fn->leaf = rt;
+               rt6_release(pn->leaf);
+               pn->leaf = NULL;
+               fn = pn;
+       }
 }
 
-/*
- *     Remove our entry in the tree. This throws away the route entry
- *     from the list of entries attached to this fib node. It doesn't
- *     expunge from the tree.
- */
-
-static struct fib6_node * fib6_del_1(struct rt6_info *rt)
+static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp)
 {
-       struct fib6_node *fn;
-       
-       fn = rt->rt6i_node;
+       struct fib6_walker_t *w;
+       struct rt6_info *rt = *rtp;
+
+       RT6_TRACE("fib6_del_route\n");
+
+       if (!(rt->rt6i_flags&RTF_CACHE))
+               fib6_prune_clones(fn, rt);
+
+       /* Unlink it */
+       *rtp = rt->u.next;
+       rt->rt6i_node = NULL;
+       rt6_stats.fib_rt_entries--;
+
+       /* Adjust walkers */
+       FOR_WALKERS(w) {
+               if (w->state == FWS_C && w->leaf == rt) {
+                       RT6_TRACE("walker %p adjusted by delroute\n", w);
+                       w->leaf = rt->u.next;
+                       if (w->leaf == NULL)
+                               w->state = FWS_U;
+               }
+       }
 
-       /* We need a fib node! */
-       if (fn) {
-               struct rt6_info **back;
-               struct rt6_info *lf;
+       rt->u.next = NULL;
 
-               back = &fn->leaf;
-               
-               /*
-                *      Walk the leaf entries looking for ourself
-                */
-                
-               for(lf = fn->leaf; lf; lf=lf->u.next) {
-                       if (rt == lf) {
-                               /*
-                                *      Delete this entry.
-                                */
-
-                               *back = lf->u.next;
-#ifdef CONFIG_RTNETLINK
-                               inet6_rt_notify(RTM_DELROUTE, lf);
-#endif                 
-                               rt6_release(lf);
-                               rt6_stats.fib_rt_entries--;
-                               return fn;
-                       }
-                       back = &lf->u.next;
-               }
+       /* If it was last route, expunge its radix tree node */
+       if (fn->leaf == NULL) {
+               fn->fn_flags &= ~RTN_RTINFO;
+               rt6_stats.fib_route_nodes--;
+               fib6_repair_tree(fn);
        }
 
-       return NULL;
+#ifdef CONFIG_RTNETLINK
+       inet6_rt_notify(RTM_DELROUTE, rt);
+#endif
+       rt6_release(rt);
 }
 
 int fib6_del(struct rt6_info *rt)
 {
-       struct fib6_node *fn;
-
-       fn = fib6_del_1(rt);
+       struct fib6_node *fn = rt->rt6i_node;
+       struct rt6_info **rtp;
 
-       if (fn == NULL)
+#if RT6_DEBUG >= 2
+       if (rt->u.dst.obsolete>0) {
+               BUG_TRAP(rt->u.dst.obsolete>0);
+               return -EFAULT;
+       }
+#endif
+       if (fn == NULL || rt == &ip6_null_entry)
                return -ENOENT;
 
-       if (fn->leaf == NULL)
-               fib6_del_2(fn);
+       BUG_TRAP(fn->fn_flags&RTN_RTINFO);
 
-       return 0;
+       /*
+        *      Walk the leaf entries looking for ourself
+        */
+
+       for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.next) {
+               if (*rtp == rt) {
+                       fib6_del_route(fn, rtp);
+                       return 0;
+               }
+       }
+       return -ENOENT;
 }
 
 /*
- *     Tree transversal function
+ *     Tree traversal function.
  *
- *     Wau... It is NOT REENTERABLE!!!!!!! It is cathastrophe. --ANK
+ *     Certainly, it is not interrupt safe.
+ *     However, it is internally reentrant wrt itself and fib6_add/fib6_del.
+ *     It means, that we can modify tree during walking
+ *     and use this function for garbage collection, clone pruning,
+ *     cleaning tree when a device goes down etc. etc. 
+ *
+ *     It guarantees that every node will be traversed,
+ *     and that it will be traversed only once.
+ *
+ *     Callback function w->func may return:
+ *     0 -> continue walking.
+ *     positive value -> walking is suspended (used by tree dumps,
+ *     and probably by gc, if it will be split to several slices)
+ *     negative value -> terminate walking.
+ *
+ *     The function itself returns:
+ *     0   -> walk is complete.
+ *     >0  -> walk is incomplete (i.e. suspended)
+ *     <0  -> walk is terminated by an error.
  */
 
-int fib6_walk_count;
-
-void fib6_walk_tree(struct fib6_node *root, f_pnode func, void *arg,
-                   int filter)
+int fib6_walk_continue(struct fib6_walker_t *w)
 {
-       struct fib6_node *fn;
+       struct fib6_node *fn, *pn;
 
-       fn = root;
+       for (;;) {
+               fn = w->node;
+               if (fn == NULL)
+                       return 0;
 
-       fib6_walk_count++;
-       
-       do {
-               if (!(fn->fn_flags & RTN_TAG)) {
-                       fn->fn_flags |= RTN_TAG;
-                       
+               if (w->prune && fn != w->root &&
+                   fn->fn_flags&RTN_RTINFO && w->state < FWS_C) {
+                       w->state = FWS_C;
+                       w->leaf = fn->leaf;
+               }
+               switch (w->state) {
+#ifdef CONFIG_IPV6_SUBTREES
+               case FWS_S:
+                       if (SUBTREE(fn)) {
+                               w->node = SUBTREE(fn);
+                               continue;
+                       }
+                       w->state = FWS_L;
+#endif 
+               case FWS_L:
                        if (fn->left) {
-                               fn = fn->left;
+                               w->node = fn->left;
+                               w->state = FWS_INIT;
                                continue;
                        }
-               }
-
-               fn->fn_flags &= ~RTN_TAG;
-
-               if (fn->right) {
-                       fn = fn->right;
-                       continue;
-               }
-               
-               do {
-                       struct fib6_node *node;
-                       
-                       if (fn->fn_flags & RTN_ROOT)
-                               break;
-                       node = fn;
-                       fn = fn->parent;
-                       
-                       if (!(node->fn_flags & RTN_TAG)) {
-                               if (node->subtree) {
-                                       fib6_walk_tree(node->subtree, func,
-                                                      arg, filter);
-                               }
-
-                               if (!filter ||
-                                   (node->fn_flags & RTN_RTINFO))
-                                       (*func)(node, arg);
+                       w->state = FWS_R;
+               case FWS_R:
+                       if (fn->right) {
+                               w->node = fn->right;
+                               w->state = FWS_INIT;
+                               continue;
                        }
-                       
-               } while (!(fn->fn_flags & RTN_TAG));
-
-       } while (!(fn->fn_flags & RTN_ROOT) || (fn->fn_flags & RTN_TAG));
-
-       fib6_walk_count--;
+                       w->state = FWS_C;
+                       w->leaf = fn->leaf;
+               case FWS_C:
+                       if (w->leaf && fn->fn_flags&RTN_RTINFO) {
+                               int err = w->func(w);
+                               if (err)
+                                       return err;
+                               continue;
+                       }
+                       w->state = FWS_U;
+               case FWS_U:
+                       if (fn == w->root)
+                               return 0;
+                       pn = fn->parent;
+                       w->node = pn;
+#ifdef CONFIG_IPV6_SUBTREES
+                       if (SUBTREE(pn) == fn) {
+                               BUG_TRAP(fn->fn_flags&RTN_ROOT);
+                               w->state = FWS_L;
+                               continue;
+                       }
+#endif
+                       if (pn->left == fn) {
+                               w->state = FWS_R;
+                               continue;
+                       }
+                       if (pn->right == fn) {
+                               w->state = FWS_C;
+                               w->leaf = w->node->leaf;
+                               continue;
+                       }
+#if RT6_DEBUG >= 2
+                       BUG_TRAP(0);
+#endif
+               }
+       }
 }
 
-/*
- *     Garbage collection
- */
-
-static int fib6_gc_node(struct fib6_node *fn, int timeout)
+int fib6_walk(struct fib6_walker_t *w)
 {
-       struct rt6_info *rt, **back;
-       int more = 0;
-       unsigned long now = jiffies;
-
-       back = &fn->leaf;
-
-       for (rt = fn->leaf; rt;) {
-               if ((rt->rt6i_flags & RTF_CACHE) && atomic_read(&rt->rt6i_use) == 0) {
-                       if ((long)(now - rt->rt6i_tstamp) >= timeout) {
-                               struct rt6_info *old;
-
-                               old = rt;
+       int res;
 
-                               rt = rt->u.next;
+       w->state = FWS_INIT;
+       w->node = w->root;
 
-                               *back = rt;
+       fib6_walker_link(w);
+       res = fib6_walk_continue(w);
+       if (res <= 0)
+               fib6_walker_unlink(w);
+       return res;
+}
 
-                               old->rt6i_node = NULL;
-#ifdef CONFIG_RTNETLINK
-                               inet6_rt_notify(RTM_DELROUTE, old);
+static int fib6_clean_node(struct fib6_walker_t *w)
+{
+       int res;
+       struct rt6_info *rt;
+       struct fib6_cleaner_t *c = (struct fib6_cleaner_t*)w;
+
+       for (rt = w->leaf; rt; rt = rt->u.next) {
+               res = c->func(rt, c->arg);
+               if (res < 0) {
+                       w->leaf = rt;
+                       res = fib6_del(rt);
+                       if (res) {
+#if RT6_DEBUG >= 2
+                               printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res);
 #endif
-                               old->u.dst.obsolete = 1;
-                               rt6_release(old);
-                               rt6_stats.fib_rt_entries--;
                                continue;
                        }
-                       more++;
+                       return 0;
                }
+               BUG_TRAP(res==0);
+       }
+       w->leaf = rt;
+       return 0;
+}
 
-               /*
-                *      check addrconf expiration here.
-                *
-                *      BUGGGG Crossing fingers and ...
-                *      Seems, radix tree walking is absolutely broken,
-                *      but we will try in any case --ANK
-                */
-               if ((rt->rt6i_flags&RTF_EXPIRES) && rt->rt6i_expires
-                   && (long)(now - rt->rt6i_expires) > 0) {
-                       struct rt6_info *old;
+/*
+ *     Convenient frontend to tree walker.
+ *     
+ *     func is called on each route.
+ *             It may return -1 -> delete this route.
+ *                           0  -> continue walking
+ *
+ *     prune==1 -> only immediate children of node (certainly,
+ *     ignoring pure split nodes) will be scanned.
+ */
 
-                       old = rt;
-                       rt = rt->u.next;
+void fib6_clean_tree(struct fib6_node *root,
+                    int (*func)(struct rt6_info *, void *arg),
+                    int prune, void *arg)
+{
+       struct fib6_cleaner_t c;
 
-                       *back = rt;
+       c.w.root = root;
+       c.w.func = fib6_clean_node;
+       c.w.prune = prune;
+       c.func = func;
+       c.arg = arg;
 
-                       old->rt6i_node = NULL;
-#ifdef CONFIG_RTNETLINK
-                       inet6_rt_notify(RTM_DELROUTE, old);
-#endif
-                       old->u.dst.obsolete = 1;
-                       rt6_release(old);
-                       rt6_stats.fib_rt_entries--;
-                       continue;
-               }
-               back = &rt->u.next;
-               rt = rt->u.next;
+       start_bh_atomic();
+       fib6_walk(&c.w);
+       end_bh_atomic();
+}
+
+static int fib6_prune_clone(struct rt6_info *rt, void *arg)
+{
+       if (rt->rt6i_flags & RTF_CACHE) {
+               RT6_TRACE("pruning clone %p\n", rt);
+               return -1;
        }
 
-       return more;
+       return 0;
 }
 
-struct fib6_gc_args {
-       unsigned long   timeout;
-       int             more;
-};
+static void fib6_prune_clones(struct fib6_node *fn, struct rt6_info *rt)
+{
+       fib6_clean_tree(fn, fib6_prune_clone, 1, rt);
+}
+
+/*
+ *     Garbage collection
+ */
 
-static void fib6_garbage_collect(struct fib6_node *fn, void *p_arg)
+static struct fib6_gc_args
 {
-       struct fib6_gc_args * args = (struct fib6_gc_args *) p_arg;
+       int                     timeout;
+       int                     more;
+} gc_args;
 
-       if (fn->fn_flags & RTN_RTINFO) {
-               int more;
+static int fib6_age(struct rt6_info *rt, void *arg)
+{
+       unsigned long now = jiffies;
 
-               more = fib6_gc_node(fn, args->timeout);
+       /* Age clones. Note, that clones are aged out
+          only if they are not in use now.
+        */
 
-               if (fn->leaf) {
-                       args->more += more;
-                       return;
+       if (rt->rt6i_flags & RTF_CACHE) {
+               if (atomic_read(&rt->u.dst.use) == 0 &&
+                   (long)(now - rt->u.dst.lastuse) >= gc_args.timeout) {
+                       RT6_TRACE("aging clone %p\n", rt);
+                       return -1;
                }
-
-               rt6_stats.fib_route_nodes--;
-               fn->fn_flags &= ~RTN_RTINFO;
+               gc_args.more++;
+               return 0;
        }
 
        /*
-        *      tree nodes (with no routing information)
+        *      check addrconf expiration here.
+        *      They are expired even if they are in use.
         */
 
-       if (!fn->subtree && !(fn->fn_flags & RTN_TL_ROOT)) {
-               int children = 0;
-               struct fib6_node *chld = NULL;
-
-               if (fn->left) {
-                       children++;
-                       chld = fn->left;
-               }
-                       
-               if (fn->right) {
-                       children++;
-                       chld = fn->right;
-               }
-               
-               if ((fn->fn_flags & RTN_ROOT)) {
-                       if (children == 0) {
-                               struct fib6_node *pn;
-
-                               pn = fn->parent;
-                               pn->subtree = NULL;
-
-                               node_free(fn);
-                       }
-                       return;
-               }
-
-               if (children <= 1) {
-                       struct fib6_node *pn = fn->parent;
-                       
-                       if (pn->left == fn)
-                               pn->left = chld;
-                       else
-                               pn->right = chld;
-                       
-                       if (chld)
-                               chld->parent = pn;
-                       
-                       if (fn->leaf)
-                               rt6_release(fn->leaf);
-
-                       node_free(fn);
-
-                       return;
+       if (rt->rt6i_flags&RTF_EXPIRES && rt->rt6i_expires) {
+               if ((long)(now - rt->rt6i_expires) > 0) {
+                       RT6_TRACE("expiring %p\n", rt);
+                       return -1;
                }
+               gc_args.more++;
+               return 0;
        }
 
-       if (fn->leaf == NULL) {
-               struct rt6_info *nrt;
-               
-               nrt = fib6_find_prefix(fn);
-
-               if (nrt == NULL)
-                       panic("fib6: inconsistent tree\n");
-
-               atomic_inc(&nrt->rt6i_ref);
-               fn->leaf = nrt;
-       }
+       return 0;
 }
 
 void fib6_run_gc(unsigned long dummy)
 {
-       struct fib6_gc_args arg = {
-               ip6_rt_gc_timeout,
-               0
-       };
+       if (dummy != ~0UL)
+               gc_args.timeout = (int)dummy;
+       else
+               gc_args.timeout = ip6_rt_gc_interval;
 
-       del_timer(&ip6_fib_timer);
+       gc_args.more = 0;
 
-       if (dummy)
-               arg.timeout = dummy;
+       fib6_clean_tree(&ip6_routing_table, fib6_age, 0, NULL);
 
-       if (fib6_walk_count == 0)
-               fib6_walk_tree(&ip6_routing_table, fib6_garbage_collect, &arg, 0);
-       else
-               arg.more = 1;
+       del_timer(&ip6_fib_timer);
 
-       if (arg.more) {
+       ip6_fib_timer.expires = 0;
+       if (gc_args.more) {
                ip6_fib_timer.expires = jiffies + ip6_rt_gc_interval;
                add_timer(&ip6_fib_timer);
-       } else {
-               ip6_fib_timer.expires = 0;
        }
 }
 
@@ -1084,3+1203,5 @@ void fib6_gc_cleanup(void)
        del_timer(&ip6_fib_timer);
 }
 #endif
+
+
index 3c3a0cf..c19a561 100644 (file)
@@ -5,7+5,7 @@
  *     Authors:
  *     Pedro Roque             <roque@di.fc.ul.pt>     
  *
- *     $Id: ip6_fw.c,v 1.9 1998/02/12 07:43:42 davem Exp $
+ *     $Id: ip6_fw.c,v 1.10 1998/08/26 12:04:57 davem Exp $
  *
  *     This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
@@ -300,14+300,19 @@ int ip6_fw_msg_add(struct ip6_fw_msg *msg)
        rl->info.uli_u.data = msg->u.data;
 
        rtmsg.rtmsg_flags = RTF_NONEXTHOP|RTF_POLICY;
-       rt = ip6_route_add(&rtmsg, &err);
+       err = ip6_route_add(&rtmsg);
 
-       /* BUGGGG! rt can point to nowhere. */
-       if (rt == NULL) {
+       if (err) {
                ip6_fwrule_free(rl);
-               return -ENOMEM;
+               return err;
        }
 
+       /* The rest will not work for now. --ABK (989725) */
+
+#ifndef notdef
+       ip6_fwrule_free(rl);
+       return -EPERM;
+#else
        rt->u.dst.error = -EPERM;
 
        if (msg->policy == IP6_FW_ACCEPT) {
@@ -327,6+332,7 @@ int ip6_fw_msg_add(struct ip6_fw_msg *msg)
        rt->rt6i_flowr = flow_clone((struct flow_rule *)rl);
 
        return 0;
+#endif
 }
 
 static int ip6_fw_msgrcv(int unit, struct sk_buff *skb)
index 6ab4d2c..6d7359a 100644 (file)
@@ -6,7+6,7 @@
  *     Pedro Roque             <roque@di.fc.ul.pt>
  *     Ian P. Morris           <I.P.Morris@soton.ac.uk>
  *
- *     $Id: ip6_input.c,v 1.10 1998/07/15 05:05:34 davem Exp $
+ *     $Id: ip6_input.c,v 1.11 1998/08/26 12:04:59 davem Exp $
  *
  *     Based in linux/net/ipv4/ip_input.c
  *
 #include <net/ip6_route.h>
 #include <net/addrconf.h>
 
-static int ipv6_dest_opt(struct sk_buff **skb_ptr, struct device *dev,
-                        __u8 *nhptr, struct ipv6_options *opt);
-
-struct hdrtype_proc {
-       u8      type;
-       int     (*func) (struct sk_buff **, struct device *dev, __u8 *ptr,
-                        struct ipv6_options *opt);
-} hdrproc_lst[] = {
-
-  /*
-       TODO
-
-       {NEXTHDR_HOP,           ipv6_hop_by_hop}
-       {NEXTHDR_ROUTING,       ipv6_routing_header},
-   */
-       {NEXTHDR_FRAGMENT,      ipv6_reassembly},
-  
-       {NEXTHDR_DEST,          ipv6_dest_opt},
-   /*  
-       {NEXTHDR_AUTH,          ipv6_auth_hdr},
-       {NEXTHDR_ESP,           ipv6_esp_hdr},
-    */
-       {NEXTHDR_MAX,           NULL}
-};
-
-/* New header structures */
-
-
-struct tlvtype_proc {
-       u8      type;
-       int     (*func) (struct sk_buff *, struct device *dev, __u8 *ptr,
-                        struct ipv6_options *opt);
-       /*
-        *      these functions do NOT update skb->h.raw
-        */
-
-} tlvprocdestopt_lst[] = {
-       {255,                   NULL}
-};
-
-int ip6_dstopt_unknown(struct sk_buff *skb, struct ipv6_tlvtype *hdr)
-{
-       struct in6_addr *daddr;
-       int pos;
-
-       /*
-        *      unkown destination option type
-        */
-       
-       pos = (__u8 *) hdr - (__u8 *) skb->nh.raw;
-       
-       /* I think this is correct please check - IPM */
-
-       switch ((hdr->type & 0xC0) >> 6) {
-       case 0: /* ignore */
-               skb->h.raw += hdr->len+2;
-               return 1;
-               
-       case 1: /* drop packet */
-               break;
-
-       case 2: /* send ICMP PARM PROB regardless and drop packet */
-               icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_UNK_OPTION,
-                           pos, skb->dev);
-               break;
-               
-       case 3: /* Send ICMP if not a multicast address and drop packet */
-               daddr = &skb->nh.ipv6h->daddr;
-               if (!(ipv6_addr_type(daddr) & IPV6_ADDR_MULTICAST))
-                       icmpv6_send(skb, ICMPV6_PARAMPROB,
-                                   ICMPV6_UNK_OPTION, pos, skb->dev);
-       };
-       
-       kfree_skb(skb);
-       return 0;
-}
-
-static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff *skb,
-                        struct device *dev, __u8 *nhptr,
-                        struct ipv6_options *opt, void *lastopt)
-{
-       struct ipv6_tlvtype *hdr;
-       struct tlvtype_proc *curr;
-
-       while ((hdr=(struct ipv6_tlvtype *)skb->h.raw) != lastopt) {
-               switch (hdr->type) {
-               case 0: /* TLV encoded Pad1 */
-                       skb->h.raw++;
-                       break;
-
-               case 1: /* TLV encoded PadN */
-                       skb->h.raw += hdr->len+2;
-                       break;
-
-               default: /* Other TLV code so scan list */
-                       for (curr=procs; curr->type != 255; curr++) {
-                               if (curr->type == (hdr->type)) {
-                                       curr->func(skb, dev, nhptr, opt);
-                                       skb->h.raw += hdr->len+2;
-                                       break;
-                               }
-                       }
-                       if (curr->type==255) {
-                               if (ip6_dstopt_unknown(skb, hdr) == 0)
-                                       return 0;
-                       }
-                       break;
-               }
-       }
-       return 1;
-}
-
-static int ipv6_dest_opt(struct sk_buff **skb_ptr, struct device *dev,
-                        __u8 *nhptr, struct ipv6_options *opt)
-{
-       struct sk_buff *skb=*skb_ptr;
-       struct ipv6_destopt_hdr *hdr = (struct ipv6_destopt_hdr *) skb->h.raw;
-       int res = 0;
-       void *lastopt=skb->h.raw+hdr->hdrlen+sizeof(struct ipv6_destopt_hdr);
-
-       skb->h.raw += sizeof(struct ipv6_destopt_hdr);
-       if (ip6_parse_tlv(tlvprocdestopt_lst, skb, dev, nhptr, opt, lastopt))
-               res = hdr->nexthdr;
-       skb->h.raw+=hdr->hdrlen;
-
-       return res;
-}
-
 
 int ipv6_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
 {
        struct ipv6hdr *hdr;
-       int pkt_len;
+       u32             pkt_len;
 
-       if (skb->pkt_type == PACKET_OTHERHOST) {
-               kfree_skb(skb);
-               return 0;
-       }
+       if (skb->pkt_type == PACKET_OTHERHOST)
+               goto drop;
+
+       ipv6_statistics.Ip6InReceives++;
+
+       /* Store the incoming device index. Once the packet has
+          been queued, we can no longer refer to skb->dev.
+        */
+       ((struct inet6_skb_parm *)skb->cb)->iif = dev->ifindex;
 
        hdr = skb->nh.ipv6h;
 
@@ -183,16+60,31 @@ int ipv6_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
 
        pkt_len = ntohs(hdr->payload_len);
 
-       if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
-               goto err;
+       /* pkt_len may be zero if Jumbo payload option is present */
+       if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
+               if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
+                       goto truncated;
+               skb_trim(skb, pkt_len + sizeof(struct ipv6hdr));
+       }
 
-       skb_trim(skb, pkt_len + sizeof(struct ipv6hdr));
+       if (hdr->nexthdr == NEXTHDR_HOP) {
+               skb->h.raw = (u8*)(hdr+1);
+               if (!ipv6_parse_hopopts(skb, &hdr->nexthdr)) {
+                       ipv6_statistics.Ip6InHdrErrors++;
+                       return 0;
+               }
+       }
 
-       ip6_route_input(skb);
-       
-       return 0;
+       if (skb->dst == NULL)
+               ip6_route_input(skb);
+
+       return skb->dst->input(skb);
+
+truncated:
+       ipv6_statistics.Ip6InTruncatedPkts++;
 err:
        ipv6_statistics.Ip6InHdrErrors++;
+drop:
        kfree_skb(skb);
        return 0;
 }
@@ -217,8+109,7 @@ static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb)
  *     without calling rawv6.c)
  */
 static struct sock * ipv6_raw_deliver(struct sk_buff *skb,
-                                     struct ipv6_options *opt,
-                                     int nexthdr, int len)
+                                     int nexthdr, unsigned long len)
 {
        struct in6_addr *saddr;
        struct in6_addr *daddr;
@@ -253,8+144,8 @@ static struct sock * ipv6_raw_deliver(struct sk_buff *skb,
                                continue;
 
                        buff = skb_clone(skb, GFP_ATOMIC);
-                       buff->sk = sk2;
-                       rawv6_rcv(buff, skb->dev, saddr, daddr, opt, len);
+                       if (buff)
+                               rawv6_rcv(sk2, buff, len);
                }
        }
 
@@ -270,10+161,8 @@ static struct sock * ipv6_raw_deliver(struct sk_buff *skb,
 
 int ip6_input(struct sk_buff *skb)
 {
-       struct ipv6_options *opt = (struct ipv6_options *) skb->cb;
        struct ipv6hdr *hdr = skb->nh.ipv6h;
        struct inet6_protocol *ipprot;
-       struct hdrtype_proc *hdrt;
        struct sock *raw_sk;
        __u8 *nhptr;
        int nexthdr;
@@ -281,7+170,7 @@ int ip6_input(struct sk_buff *skb)
        u8 hash;
        int len;
        
-       skb->h.raw += sizeof(struct ipv6hdr);
+       skb->h.raw = skb->nh.raw + sizeof(struct ipv6hdr);
 
        /*
         *      Parse extension headers
@@ -290,64+179,55 @@ int ip6_input(struct sk_buff *skb)
        nexthdr = hdr->nexthdr;
        nhptr = &hdr->nexthdr;
 
-       /*
-        *      check for extension headers
-        */
-
-st_loop:
+       /* Skip  hop-by-hop options, they are already parsed. */
+       if (nexthdr == NEXTHDR_HOP) {
+               nhptr = (u8*)(hdr+1);
+               nexthdr = *nhptr;
+               skb->h.raw += (nhptr[1]+1)<<3;
+       }
 
-       for (hdrt=hdrproc_lst; hdrt->type != NEXTHDR_MAX; hdrt++) {
-               if (hdrt->type == nexthdr) {
-                       if ((nexthdr = hdrt->func(&skb, skb->dev, nhptr, opt))) {
-                               nhptr = skb->h.raw;
-                               hdr = skb->nh.ipv6h;
-                               goto st_loop;
-                       }
+       /* This check is an optimization: when the next header is
+          already TCP or UDP there is nothing left to parse, so do
+          not scan for extension headers that are almost never present.
+        */
+       if (nexthdr != IPPROTO_TCP && nexthdr != IPPROTO_UDP) {
+               nhptr = ipv6_parse_exthdrs(&skb, nhptr);
+               if (nhptr == NULL)
                        return 0;
-               }
+               nexthdr = *nhptr;
+               hdr = skb->nh.ipv6h;
        }
-
        len = skb->tail - skb->h.raw;
 
-       raw_sk = ipv6_raw_deliver(skb, opt, nexthdr, len);
+       raw_sk = ipv6_raw_deliver(skb, nexthdr, len);
 
        hash = nexthdr & (MAX_INET_PROTOS - 1);
        for (ipprot = (struct inet6_protocol *) inet6_protos[hash]; 
             ipprot != NULL; 
             ipprot = (struct inet6_protocol *) ipprot->next) {
                struct sk_buff *buff = skb;
-               
+
                if (ipprot->protocol != nexthdr)
                        continue;
-               
+
                if (ipprot->copy || raw_sk)
                        buff = skb_clone(skb, GFP_ATOMIC);
-               
-               
-               ipprot->handler(buff, skb->dev, &hdr->saddr, &hdr->daddr,
-                               opt, len, 0, ipprot);
+
+               ipprot->handler(buff, len);
                found = 1;
        }
-       
+
        if (raw_sk) {
-               skb->sk = raw_sk;
-               rawv6_rcv(skb, skb->dev, &hdr->saddr, &hdr->daddr, opt, len);
+               rawv6_rcv(raw_sk, skb, len);
                found = 1;
        }
-       
+
        /*
         *      not found: send ICMP parameter problem back
         */
-       
        if (!found) {
-               unsigned long offset;
-#if IP6_DEBUG >= 2
-               printk(KERN_DEBUG "proto not found %d\n", nexthdr);
-#endif
-               offset = nhptr - (u8*) hdr;
-               icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_UNK_NEXTHDR,
-                           offset, skb->dev);
-               kfree_skb(skb);
+               ipv6_statistics.Ip6InUnknownProtos++;
+               icmpv6_param_prob(skb, ICMPV6_UNK_NEXTHDR, nhptr);
        }
 
        return 0;
@@ -359,6+239,8 @@ int ip6_mc_input(struct sk_buff *skb)
        int deliver = 0;
        int discard = 1;
 
+       ipv6_statistics.Ip6InMcastPkts++;
+
        hdr = skb->nh.ipv6h;
        if (ipv6_chk_mcast_addr(skb->dev, &hdr->daddr))
                deliver = 1;
index aa13c20..0555c1a 100644 (file)
@@ -5,7+5,7 @@
  *     Authors:
  *     Pedro Roque             <roque@di.fc.ul.pt>     
  *
- *     $Id: ip6_output.c,v 1.13 1998/07/15 05:05:38 davem Exp $
+ *     $Id: ip6_output.c,v 1.14 1998/08/26 12:05:01 davem Exp $
  *
  *     Based on linux/net/ipv4/ip_output.c
  *
  *      modify it under the terms of the GNU General Public License
  *      as published by the Free Software Foundation; either version
  *      2 of the License, or (at your option) any later version.
+ *
+ *     Changes:
+ *     A.N.Kuznetsov   :       arithmetic in fragmentation.
+ *                             extension headers are implemented.
+ *                             route changes now work.
+ *                             ip6_forward does not confuse sniffers.
+ *                             etc.
+ *                             
  */
 
 #include <linux/errno.h>
 #include <net/ip6_route.h>
 #include <net/addrconf.h>
 #include <net/rawv6.h>
+#include <net/icmp.h>
 
 static u32     ipv6_fragmentation_id = 1;
 
@@ -59,6+68,8 @@ int ip6_output(struct sk_buff *skb)
                                return 0;
                        }
                }
+
+               ipv6_statistics.Ip6OutMcastPkts++;
        }
 
        if (hh) {
@@ -85,17+96,40 @@ int ip6_output(struct sk_buff *skb)
  */
 
 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
-            struct ipv6_options *opt)
+            struct ipv6_txoptions *opt)
 {
        struct ipv6_pinfo * np = sk ? &sk->net_pinfo.af_inet6 : NULL;
+       struct in6_addr *first_hop = fl->nl_u.ip6_u.daddr;
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr;
-       int seg_len;
+       u8  proto = fl->proto;
+       int seg_len = skb->len;
        int hlimit;
 
-       /* Do something with IPv6 options headers here. */
+       if (opt) {
+               int head_room;
 
-       seg_len = skb->len;
+               /* Extension headers may take lots of space (~8K for now),
+                  so MAX_HEADER of headroom is not enough: size it exactly.
+                */
+               head_room = opt->opt_nflen + opt->opt_flen;
+               seg_len += head_room;
+               head_room += sizeof(struct ipv6hdr) + ((dst->dev->hard_header_len + 15)&~15);
+
+               if (skb_headroom(skb) < head_room) {
+                       struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
+                       kfree_skb(skb); /* old skb is an sk_buff, not a kmalloc block */
+                       skb = skb2;
+                       if (skb == NULL)
+                               return -ENOBUFS;
+                       if (sk)
+                               skb_set_owner_w(skb, sk);
+               }
+               if (opt->opt_flen)
+                       ipv6_push_frag_opts(skb, opt, &proto);
+               if (opt->opt_nflen)
+                       ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
+       }
 
        hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
 
@@ -117,16+151,22 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
                hlimit = ((struct rt6_info*)dst)->rt6i_hoplimit;
 
        hdr->payload_len = htons(seg_len);
-       hdr->nexthdr = fl->proto;
+       hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;
 
        ipv6_addr_copy(&hdr->saddr, fl->nl_u.ip6_u.saddr);
-       ipv6_addr_copy(&hdr->daddr, fl->nl_u.ip6_u.daddr);
+       ipv6_addr_copy(&hdr->daddr, first_hop);
 
-       ipv6_statistics.Ip6OutRequests++;
-       dst->output(skb);
+       if (skb->len <= dst->pmtu) {
+               ipv6_statistics.Ip6OutRequests++;
+               dst->output(skb);
+               return 0;
+       }
 
-       return 0;
+       printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
+       icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst->pmtu, skb->dev);
+       kfree_skb(skb);
+       return -EMSGSIZE;
 }
 
 /*
@@ -166,8+206,8 @@ int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct device *dev,
        return 0;
 }
 
-static void ip6_bld_1(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
-                     int hlimit, unsigned short pktlength)
+static struct ipv6hdr * ip6_bld_1(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
+                                 int hlimit, unsigned pktlength)
 {
        struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
        struct ipv6hdr *hdr;
@@ -177,43+217,56 @@ static void ip6_bld_1(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
        
        hdr->version = 6;
        hdr->priority = np->priority;
-       
        memcpy(hdr->flow_lbl, &np->flow_lbl, 3);
        
        hdr->payload_len = htons(pktlength - sizeof(struct ipv6hdr));
-
-       /*
-        *      FIXME: hop limit has default UNI/MCAST and
-        *      msgctl settings
-        */
        hdr->hop_limit = hlimit;
+       hdr->nexthdr = fl->proto;
 
        ipv6_addr_copy(&hdr->saddr, fl->nl_u.ip6_u.saddr);
-       ipv6_addr_copy(&hdr->daddr, fl->nl_u.ip6_u.daddr);      
+       ipv6_addr_copy(&hdr->daddr, fl->nl_u.ip6_u.daddr);
+       return hdr;
+}
+
+static __inline__ u8 * ipv6_build_fraghdr(struct sk_buff *skb, u8* prev_hdr, unsigned offset)
+{
+       struct frag_hdr *fhdr;
+
+       fhdr = (struct frag_hdr *) skb_put(skb, sizeof(struct frag_hdr));
+
+       fhdr->nexthdr  = *prev_hdr;
+       *prev_hdr = NEXTHDR_FRAGMENT;
+       prev_hdr = &fhdr->nexthdr;
+
+       fhdr->reserved = 0;
+       fhdr->frag_off = htons(offset);
+       fhdr->identification = ipv6_fragmentation_id++;
+       return &fhdr->nexthdr;
 }
 
 static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag,
                         const void *data, struct dst_entry *dst,
-                        struct flowi *fl, struct ipv6_options *opt,
-                        int hlimit, int flags, unsigned length)
+                        struct flowi *fl, struct ipv6_txoptions *opt,
+                        struct in6_addr *final_dst,
+                        int hlimit, int flags, unsigned length, int mtu)
 {
-       struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
        struct ipv6hdr *hdr;
        struct sk_buff *last_skb;
-       struct frag_hdr *fhdr;
+       u8 *prev_hdr;
        int unfrag_len;
-       int payl_len;
        int frag_len;
        int last_len;
        int nfrags;
        int fhdr_dist;
+       int frag_off;
+       int data_off;
        int err;
 
        /*
         *      Fragmentation
         *
         *      Extension header order:
-        *      Hop-by-hop -> Routing -> Fragment -> rest (...)
+        *      Hop-by-hop -> Dest0 -> Routing -> Fragment -> Auth -> Dest1 -> rest (...)
         *      
         *      We must build the non-fragmented part that
         *      will be in every packet... this also means
@@ -222,11+275,11 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag,
         */
 
        unfrag_len = sizeof(struct ipv6hdr) + sizeof(struct frag_hdr);
-       payl_len = length;
+       last_len = length;
 
        if (opt) {
                unfrag_len += opt->opt_nflen;
-               payl_len += opt->opt_flen;
+               last_len += opt->opt_flen;
        }
 
        /*
@@ -235,9+288,13 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag,
         *      "integer multiple of 8 octects".
         */
 
-       frag_len = (dst->pmtu - unfrag_len) & ~0x7;
+       frag_len = (mtu - unfrag_len) & ~0x7;
 
-       nfrags = payl_len / frag_len;
+       /* Unfragmentable part exceeds mtu. */
+       if (frag_len <= 0)
+               return -EMSGSIZE;
+
+       nfrags = last_len / frag_len;
 
        /*
         *      We must send from end to start because of 
@@ -250,13+307,25 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag,
         *      might be a good idea.
         */
 
-       last_len = payl_len - (nfrags * frag_len);
+       frag_off = nfrags * frag_len;
+       last_len -= frag_off;
 
        if (last_len == 0) {
                last_len = frag_len;
+               frag_off -= frag_len;
                nfrags--;
        }
-               
+       data_off = frag_off;
+
+       /* Implementation limitation: for now we assume that
+          all the exthdrs will fit into the first fragment.
+        */
+       if (opt) {
+               if (frag_len < opt->opt_flen)
+                       return -EMSGSIZE;
+               data_off = frag_off - opt->opt_flen;
+       }
+
        last_skb = sock_alloc_send_skb(sk, unfrag_len + frag_len +
                                       dst->dev->hard_header_len + 15,
                                       0, flags & MSG_DONTWAIT, &err);
@@ -267,41+336,17 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag,
        last_skb->dst = dst_clone(dst);
 
        skb_reserve(last_skb, (dst->dev->hard_header_len + 15) & ~15);
-       
-       hdr = (struct ipv6hdr *) skb_put(last_skb, sizeof(struct ipv6hdr));
-       last_skb->nh.ipv6h = hdr;
 
-       hdr->version = 6;
-       hdr->priority = np->priority;
-       
-       memcpy(hdr->flow_lbl, &np->flow_lbl, 3);
-       hdr->payload_len = htons(unfrag_len + frag_len - sizeof(struct ipv6hdr));
+       hdr = ip6_bld_1(sk, last_skb, fl, hlimit, frag_len+unfrag_len);
+       prev_hdr = &hdr->nexthdr;
 
-       hdr->hop_limit = hlimit;
+       if (opt && opt->opt_nflen)
+               prev_hdr = ipv6_build_nfrag_opts(last_skb, prev_hdr, opt, final_dst, 0);
 
-       hdr->nexthdr = NEXTHDR_FRAGMENT;
+       prev_hdr = ipv6_build_fraghdr(last_skb, prev_hdr, frag_off);
+       fhdr_dist = prev_hdr - last_skb->data;
 
-       ipv6_addr_copy(&hdr->saddr, fl->nl_u.ip6_u.saddr);
-       ipv6_addr_copy(&hdr->daddr, fl->nl_u.ip6_u.daddr);
-
-#if 0
-       if (opt && opt->srcrt) {
-               hdr->nexthdr = ipv6opt_bld_rthdr(last_skb, opt, daddr,
-                                                NEXTHDR_FRAGMENT);
-       }
-#endif
-
-       fhdr = (struct frag_hdr *) skb_put(last_skb, sizeof(struct frag_hdr));
-       memset(fhdr, 0, sizeof(struct frag_hdr));
-
-       fhdr->nexthdr  = fl->proto;             
-       fhdr->frag_off = ntohs(nfrags * frag_len);
-       fhdr->identification = ipv6_fragmentation_id++;
-
-       fhdr_dist = (unsigned char *) fhdr - last_skb->data;
-
-       err = getfrag(data, &hdr->saddr, last_skb->tail, nfrags * frag_len,
-                     last_len);
+       err = getfrag(data, &hdr->saddr, last_skb->tail, data_off, last_len);
 
        if (!err) {
                while (nfrags--) {
@@ -309,58+354,60 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag,
                        
                        struct frag_hdr *fhdr2;
                                
-#if 0
-                       printk(KERN_DEBUG "sending frag %d\n", nfrags);
-#endif
                        skb = skb_copy(last_skb, sk->allocation);
 
-                       if (skb == NULL)
+                       if (skb == NULL) {
+                               ipv6_statistics.Ip6FragFails++;
+                               kfree_skb(last_skb);
                                return -ENOMEM;
+                       }
                        
+                       frag_off -= frag_len;
+                       data_off -= frag_len;
+
                        fhdr2 = (struct frag_hdr *) (skb->data + fhdr_dist);
 
                        /* more flag on */
-                       fhdr2->frag_off = ntohs(nfrags * frag_len + 1);
+                       fhdr2->frag_off = htons(frag_off | 1);
 
-                       /*
-                        *      FIXME:
-                        *      if (nfrags == 0)
-                        *      put rest of headers
-                        */
+                       /* Write fragmentable exthdrs to the first chunk */
+                       if (nfrags == 0 && opt && opt->opt_flen) {
+                               ipv6_build_frag_opts(skb, &fhdr2->nexthdr, opt);
+                               frag_len -= opt->opt_flen;
+                               data_off = 0;
+                       }
 
                        err = getfrag(data, &hdr->saddr,skb_put(skb, frag_len),
-                                     nfrags * frag_len, frag_len);
+                                     data_off, frag_len);
 
                        if (err) {
                                kfree_skb(skb);
                                break;
                        }
 
+                       ipv6_statistics.Ip6FragCreates++;
                        ipv6_statistics.Ip6OutRequests++;
                        dst->output(skb);
                }
        }
 
        if (err) {
+               ipv6_statistics.Ip6FragFails++;
                kfree_skb(last_skb);
                return -EFAULT;
        }
 
-#if 0
-       printk(KERN_DEBUG "sending last frag \n");
-#endif
-
-       hdr->payload_len = htons(unfrag_len + last_len - 
-                                sizeof(struct ipv6hdr));
+       hdr->payload_len = htons(unfrag_len + last_len - sizeof(struct ipv6hdr));
 
        /*
         *      update last_skb to reflect the getfrag we did
         *      on start.
         */
-       
-       last_skb->tail += last_len;
-       last_skb->len += last_len;
 
+       skb_put(last_skb, last_len);
+
+       ipv6_statistics.Ip6FragCreates++;
+       ipv6_statistics.Ip6FragOKs++;
        ipv6_statistics.Ip6OutRequests++;
        dst->output(last_skb);
 
@@ -369,42+416,71 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag,
 
 int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data,
                   struct flowi *fl, unsigned length,
-                  struct ipv6_options *opt, int hlimit, int flags)
+                  struct ipv6_txoptions *opt, int hlimit, int flags)
 {
        struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
        struct in6_addr *final_dst = NULL;
        struct dst_entry *dst;
-       int pktlength;
        int err = 0;
-       
+       unsigned int pktlength, jumbolen, mtu;
+
        if (opt && opt->srcrt) {
                struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
                final_dst = fl->nl_u.ip6_u.daddr;
                fl->nl_u.ip6_u.daddr = rt0->addr;
        }
 
-       dst = NULL;
-
        if (!fl->oif && ipv6_addr_is_multicast(fl->nl_u.ip6_u.daddr))
                fl->oif = np->mcast_oif;
-       
-       if (sk->dst_cache)
+
+       dst = NULL;
+       if (sk->dst_cache) {
                dst = dst_check(&sk->dst_cache, np->dst_cookie);
+               if (dst) {
+                       struct rt6_info *rt = (struct rt6_info*)dst_clone(dst);
+
+                       /* Yes, checking route validity in not connected
+                          case is not very simple. Take into account,
+                          that we do not support routing by source, TOS,
+                          and MSG_DONTROUTE            --ANK (980726)
+
+                          1. If route was host route, check that
+                             cached destination is current.
+                             If it is network route, we still may
+                             check its validity using saved pointer
+                             to the last used address: daddr_cache.
+                             We do not want to save whole address now,
+                             (because main consumer of this service
+                              is tcp, which has not this problem),
+                             so that the last trick works only on connected
+                             sockets.
+                          2. oif also should be the same.
+                        */
+                       if (((rt->rt6i_dst.plen != 128 ||
+                             ipv6_addr_cmp(fl->fl6_dst, &rt->rt6i_dst.addr))
+                            && (np->daddr_cache == NULL ||
+                                ipv6_addr_cmp(fl->fl6_dst, np->daddr_cache)))
+                           || (fl->oif && fl->oif != dst->dev->ifindex)) {
+                               dst_release(dst);
+                               dst = NULL;
+                       }
+               }
+       }
 
        if (dst == NULL)
                dst = ip6_route_output(sk, fl);
 
        if (dst->error) {
                ipv6_statistics.Ip6OutNoRoutes++;
-               err = -ENETUNREACH;
-               goto out;
+               dst_release(dst);
+               return -ENETUNREACH;
        }
 
        if (fl->nl_u.ip6_u.saddr == NULL) {
                struct inet6_ifaddr *ifa;
                
                ifa = ipv6_get_saddr(dst, fl->nl_u.ip6_u.daddr);
-               
+
                if (ifa == NULL) {
 #if IP6_DEBUG >= 2
                        printk(KERN_DEBUG "ip6_build_xmit: "
@@ -415,7+491,6 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data,
                }
                fl->nl_u.ip6_u.saddr = &ifa->addr;
        }
-       
        pktlength = length;
 
        if (hlimit < 0) {
@@ -427,29+502,38 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data,
                        hlimit = ((struct rt6_info*)dst)->rt6i_hoplimit;
        }
 
+       jumbolen = 0;
+
        if (!sk->ip_hdrincl) {
                pktlength += sizeof(struct ipv6hdr);
                if (opt)
                        pktlength += opt->opt_flen + opt->opt_nflen;
 
-               /* Due to conservative check made by caller,
-                  pktlength cannot overflow here.
-
-                  When (and if) jumbo option will be implemented
-                  we could try soemething sort of:
+               if (pktlength > 0xFFFF + sizeof(struct ipv6hdr)) {
+                       /* Jumbo datagram.
+                          It is assumed, that in the case of sk->ip_hdrincl
+                          jumbo option is supplied by user.
+                        */
+                       pktlength += 8;
+                       jumbolen = pktlength - sizeof(struct ipv6hdr);
+               }
+       }
 
-                  if (pktlength < length) return -EMSGSIZE;
+       mtu = dst->pmtu;
 
-               */
-       }
+       /* Critical arithmetic overflow check.
+          FIXME: may gcc optimize it out? --ANK (980726)
+        */
+       if (pktlength < length)
+               return -EMSGSIZE;
 
-       if (pktlength <= dst->pmtu) {
+       if (pktlength <= mtu) {
                struct sk_buff *skb;
                struct ipv6hdr *hdr;
-               struct device *dev;
+               struct device *dev = dst->dev;
 
                skb = sock_alloc_send_skb(sk, pktlength + 15 +
-                                         dst->dev->hard_header_len, 0,
+                                         dev->hard_header_len, 0,
                                          flags & MSG_DONTWAIT, &err);
 
                if (skb == NULL) {
@@ -457,7+541,6 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data,
                        goto out;
                }
 
-               dev = dst->dev;
                skb->dst = dst_clone(dst);
 
                skb_reserve(skb, (dev->hard_header_len + 15) & ~15);
@@ -466,23+549,22 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data,
                skb->nh.ipv6h = hdr;
 
                if (!sk->ip_hdrincl) {
-                       ip6_bld_1(sk, skb, fl, hlimit, pktlength);
-#if 0
-                       if (opt && opt->srcrt) {
-                               hdr->nexthdr = ipv6opt_bld_rthdr(skb, opt,
-                                                                final_dst,
-                                                                fl->proto);
+                       ip6_bld_1(sk, skb, fl, hlimit,
+                                 jumbolen ? sizeof(struct ipv6hdr) : pktlength);
+
+                       if (opt || jumbolen) {
+                               u8 *prev_hdr = &hdr->nexthdr;
+                               prev_hdr = ipv6_build_nfrag_opts(skb, prev_hdr, opt, final_dst, jumbolen);
+                               if (opt && opt->opt_flen)
+                                       ipv6_build_frag_opts(skb, prev_hdr, opt);
                        }
-                       else
-#endif
-                               hdr->nexthdr = fl->proto;
                }
 
                skb_put(skb, length);
                err = getfrag(data, &hdr->saddr,
                              ((char *) hdr) + (pktlength - length),
                              0, length);
-               
+
                if (!err) {
                        ipv6_statistics.Ip6OutRequests++;
                        dst->output(skb);
@@ -491,32+573,18 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data,
                        kfree_skb(skb);
                }
        } else {
-               if (sk->ip_hdrincl)
+               if (sk->ip_hdrincl || jumbolen)
                        return -EMSGSIZE;
 
-               /* pktlength includes IPv6 header, not included
-                  in IPv6 payload length.
-                  FIXME are non-fragmentable options included
-                  in packet after defragmentation? If not, we
-                  should subtract opt_nflen also. --ANK
-                */
-               if (pktlength > 0xFFFF + sizeof(struct ipv6hdr))
-                       return -EMSGSIZE;
-
-               err = ip6_frag_xmit(sk, getfrag, data, dst, fl, opt, hlimit,
-                                   flags, length);
+               err = ip6_frag_xmit(sk, getfrag, data, dst, fl, opt, final_dst, hlimit,
+                                   flags, length, mtu);
        }
-       
+
        /*
         *      cleanup
         */
-  out:
-       
-       if (sk->dst_cache)
-               ip6_dst_store(sk, dst);
-       else
-               dst_release(dst);
-
+out:
+       ip6_dst_store(sk, dst, fl->nl_u.ip6_u.daddr == &np->daddr ? &np->daddr : NULL);
        return err;
 }
 
@@ -530,20+598,15 @@ int ip6_call_ra_chain(struct sk_buff *skb, int sel)
                if (sk && ra->sel == sel) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
-                               if (skb2) {
-                                       skb2->sk = last;
-                                       rawv6_rcv(skb2, skb2->dev, &skb2->nh.ipv6h->saddr,
-                                                 &skb2->nh.ipv6h->daddr, NULL, skb2->len);
-                               }
+                               if (skb2)
+                                       rawv6_rcv(last, skb2, skb2->len);
                        }
                        last = sk;
                }
        }
 
        if (last) {
-               skb->sk = last;
-               rawv6_rcv(skb, skb->dev, &skb->nh.ipv6h->saddr,
-                         &skb->nh.ipv6h->daddr, NULL, skb->len);
+               rawv6_rcv(last, skb, skb->len);
                return 1;
        }
        return 0;
@@ -553,24+616,16 @@ int ip6_forward(struct sk_buff *skb)
 {
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr = skb->nh.ipv6h;
-       int size;
+       struct inet6_skb_parm *opt =(struct inet6_skb_parm*)skb->cb;
        
-       if (ipv6_devconf.forwarding == 0)
+       if (ipv6_devconf.forwarding == 0 && opt->srcrt == 0)
                goto drop;
 
        /*
-        *      check hop-by-hop options present
-        */
-       /*
-        *      Note, that NEXTHDR_HOP header must be checked
-        *      always at the most beginning of ipv6_rcv.
-        *      The result should be saved somewhere, but
-        *      we do not it for now. Alas. Let's do it here. --ANK
-        *
-        *      Second note: we DO NOT make any processing on
+        *      We DO NOT make any processing on
         *      RA packets, pushing them to user level AS IS
-        *      without ane WARRANTY that application will able
-        *      to interpret them. The reson is that we
+        *      without any WARRANTY that application will be able
+        *      to interpret them. The reason is that we
         *      cannot make anything clever here.
         *
         *      We are not end-node, so that if packet contains
@@ -579,42+634,9 @@ int ip6_forward(struct sk_buff *skb)
         *      cannot be fragmented, because there is no warranty
         *      that different fragments will go along one path. --ANK
         */
-       if (hdr->nexthdr == NEXTHDR_HOP) {
-               int ra_value = -1;
-               u8 *ptr = (u8*)(skb->nh.ipv6h+1);
-               int len = (ptr[1]+1)<<3;
-
-               if (len + sizeof(struct ipv6hdr) > skb->len)
-                       goto drop;
-
-               ptr += 2;
-               len -= 2;
-               while (len > 0) {
-                       u8 *opt;
-                       int optlen;
-
-                       if (ptr[0] == 0) {
-                               len--;
-                               ptr++;
-                               continue;
-                       }
-                       opt = ptr;
-                       optlen = ptr[1]+1;
-
-                       len -= optlen;
-                       ptr += optlen;
-                       if (len < 0)
-                               goto drop;
-
-                       if (opt[0] == 20) {
-                               /* Router Alert as of draft-ietf-ipngwg-ipv6router-alert-04 */
-                               if (optlen < 4)
-                                       goto drop;
-                               ra_value = opt[2] + (opt[3]<<8);
-                       } else if (!ip6_dstopt_unknown(skb, (struct ipv6_tlvtype*)opt))
-                               goto drop;
-               }
-               if (ra_value>=0 && ip6_call_ra_chain(skb, ra_value))
+       if (opt->ra) {
+               u8 *ptr = skb->nh.raw + opt->ra;
+               if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
                        return 0;
        }
 
@@ -622,6+644,8 @@ int ip6_forward(struct sk_buff *skb)
         *      check and decrement ttl
         */
        if (hdr->hop_limit <= 1) {
+               /* Force OUTPUT device used as source address */
+               skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
                            0, skb->dev);
 
@@ -629,9+653,10 @@ int ip6_forward(struct sk_buff *skb)
                return -ETIMEDOUT;
        }
 
-       hdr->hop_limit--;
-
-       if (skb->dev == dst->dev && dst->neighbour) {
+       /* IPv6 specs say nothing about it, but it is clear that we cannot
+          send redirects to source routed frames.
+        */
+       if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
                struct in6_addr *target = NULL;
                struct rt6_info *rt;
                struct neighbour *n = dst->neighbour;
@@ -647,30+672,40 @@ int ip6_forward(struct sk_buff *skb)
                else
                        target = &hdr->daddr;
 
-               ndisc_send_redirect(skb, dst->neighbour, target);
+               /* Limit redirects both by destination (here)
+                  and by source (inside ndisc_send_redirect)
+                */
+               if (xrlim_allow(dst, 1*HZ))
+                       ndisc_send_redirect(skb, n, target);
+       } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
+                                               |IPV6_ADDR_LINKLOCAL)) {
+               /* This check is security critical. */
+               goto drop;
        }
-       
-       size = sizeof(struct ipv6hdr) + ntohs(hdr->payload_len);
 
-       if (size > dst->pmtu) {
+       if (skb->len > dst->pmtu) {
+               /* Again, force OUTPUT device used as source address */
+               skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst->pmtu, skb->dev);
+               ipv6_statistics.Ip6InTooBigErrors++;
                kfree_skb(skb);
                return -EMSGSIZE;
        }
 
-       if (skb_headroom(skb) < dst->dev->hard_header_len || skb_cloned(skb)) {
-               struct sk_buff *skb2;
-               skb2 = skb_realloc_headroom(skb, (dst->dev->hard_header_len + 15)&~15);
-               kfree_skb(skb);
-               skb = skb2;
-       }
+       if ((skb = skb_cow(skb, dst->dev->hard_header_len)) == NULL)
+               return 0;
 
-       ipv6_statistics.Ip6ForwDatagrams++;
-       dst->output(skb);
+       hdr = skb->nh.ipv6h;
 
-       return 0;
+       /* Mangling hops number delayed to point after skb COW */
+       hdr->hop_limit--;
+
+       ipv6_statistics.Ip6OutForwDatagrams++;
+       return dst->output(skb);
 
 drop:
+       ipv6_statistics.Ip6InAddrErrors++;
        kfree_skb(skb);
        return -EINVAL;
 }
index b31c07c..a246b99 100644 (file)
@@ -7,7+7,7 @@
  *
  *     Based on linux/net/ipv4/ip_sockglue.c
  *
- *     $Id: ipv6_sockglue.c,v 1.22 1998/07/15 05:05:39 davem Exp $
+ *     $Id: ipv6_sockglue.c,v 1.23 1998/08/26 12:05:04 davem Exp $
  *
  *     This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
@@ -110,7+110,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval,
                    int optlen)
 {
        struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
-       int val, err;
+       int val, valbool;
        int retv = -ENOPROTOOPT;
 
        if(level==SOL_IP && sk->type != SOCK_RAW)
@@ -119,19+119,20 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval,
        if(level!=SOL_IPV6)
                goto out;
 
-       if (optval == NULL) {
+       if (optval == NULL)
                val=0;
-       } else {
-               err = get_user(val, (int *) optval);
-               if(err)
-                       return err;
-       }
-       
+       else if (get_user(val, (int *) optval))
+               return -EFAULT;
+
+       valbool = (val!=0);
 
        switch (optname) {
 
        case IPV6_ADDRFORM:
                if (val == PF_INET) {
+                       struct ipv6_txoptions *opt;
+                       struct sk_buff *pktopt;
+
                        if (sk->protocol != IPPROTO_UDP &&
                            sk->protocol != IPPROTO_TCP)
                                goto out;
@@ -140,7+141,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval,
                                retv = ENOTCONN;
                                goto out;
                        }
-                       
+
                        if (!(ipv6_addr_type(&np->daddr) & IPV6_ADDR_MAPPED)) {
                                retv = -EADDRNOTAVAIL;
                                goto out;
@@ -153,10+154,17 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval,
                                tp->af_specific = &ipv4_specific;
                                sk->socket->ops = &inet_stream_ops;
                                sk->family = PF_INET;
+                               tcp_sync_mss(sk, tp->pmtu_cookie);
                        } else {
                                sk->prot = &udp_prot;
                                sk->socket->ops = &inet_dgram_ops;
                        }
+                       opt = xchg(&np->opt, NULL);
+                       if (opt)
+                               sock_kfree_s(sk, opt, opt->tot_len);
+                       pktopt = xchg(&np->pktoptions, NULL);
+                       if (pktopt)
+                               kfree_skb(pktopt);
                        retv = 0;
                } else {
                        retv = -EINVAL;
@@ -164,15+172,85 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval,
                break;
 
        case IPV6_PKTINFO:
-               np->rxinfo = val;
+               np->rxopt.bits.rxinfo = valbool;
                retv = 0;
                break;
 
        case IPV6_HOPLIMIT:
-               np->rxhlim = val;
+               np->rxopt.bits.rxhlim = valbool;
+               retv = 0;
+               break;
+
+       case IPV6_RTHDR:
+               retv = -EINVAL;
+               if (val >= 0 && val <= 2) {
+                       np->rxopt.bits.srcrt = val;
+                       retv = 0;
+               }
+               break;
+
+       case IPV6_HOPOPTS:
+               np->rxopt.bits.hopopts = valbool;
+               retv = 0;
+               break;
+
+       case IPV6_AUTHHDR:
+               np->rxopt.bits.authhdr = valbool;
                retv = 0;
                break;
 
+       case IPV6_DSTOPTS:
+               np->rxopt.bits.dstopts = valbool;
+               retv = 0;
+               break;
+
+       case IPV6_PKTOPTIONS:
+       {
+               struct ipv6_txoptions *opt = NULL;
+               struct msghdr msg;
+               int junk;
+               struct in6_addr *saddr;
+
+               if (optlen == 0)
+                       goto update;
+
+               opt = sock_kmalloc(sk, sizeof(*opt) + optlen, GFP_KERNEL);
+               retv = -ENOBUFS;
+               if (opt == NULL)
+                       break;
+
+               memset(opt, 0, sizeof(*opt));
+               opt->tot_len = sizeof(*opt) + optlen;
+               retv = -EFAULT;
+               if (copy_from_user(opt+1, optval, optlen))
+                       goto done;
+
+               msg.msg_controllen = optlen;
+               msg.msg_control = (void*)(opt+1);
+
+               retv = datagram_send_ctl(&msg, &junk, &saddr, opt, &junk);
+               if (retv)
+                       goto done;
+update:
+               retv = 0;
+               start_bh_atomic();
+               if (opt && sk->type == SOCK_STREAM) {
+                       struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+                       if ((tcp_connected(sk->state) || sk->state == TCP_SYN_SENT)
+                           && sk->daddr != LOOPBACK4_IPV6) {
+                               tp->ext_header_len = opt->opt_flen + opt->opt_nflen;
+                               tcp_sync_mss(sk, tp->pmtu_cookie);
+                       }
+               }
+               opt = xchg(&np->opt, opt);
+               dst_release(xchg(&sk->dst_cache, NULL));
+               end_bh_atomic();
+
+done:
+               if (opt)
+                       sock_kfree_s(sk, opt, opt->tot_len);
+               break;
+       }
        case IPV6_UNICAST_HOPS:
                if (val > 255 || val < -1)
                        retv = -EINVAL;
@@ -190,10+268,9 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval,
                        retv = 0;
                }
                break;
-               break;
 
        case IPV6_MULTICAST_LOOP:
-               np->mc_loop = (val != 0);
+               np->mc_loop = valbool;
                retv = 0;
                break;
 
@@ -229,12+306,10 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval,
        case IPV6_DROP_MEMBERSHIP:
        {
                struct ipv6_mreq mreq;
-               int err;
 
-               err = copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq));
-               if(err)
+               if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq)))
                        return -EFAULT;
-               
+
                if (optname == IPV6_ADD_MEMBERSHIP)
                        retv = ipv6_sock_mc_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr);
                else
@@ -253,10+328,44 @@ out:
 int ipv6_getsockopt(struct sock *sk, int level, int optname, char *optval, 
                    int *optlen)
 {
+       struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
+       int len;
+
        if(level==SOL_IP && sk->type != SOCK_RAW)
                return udp_prot.getsockopt(sk, level, optname, optval, optlen);
        if(level!=SOL_IPV6)
                return -ENOPROTOOPT;
+       if (get_user(len, optlen))
+               return -EFAULT;
+       switch (optname) {
+       case IPV6_PKTOPTIONS:
+       {
+               struct msghdr msg;
+               struct sk_buff *skb;
+
+               start_bh_atomic();
+               skb = np->pktoptions;
+               if (skb)
+                       atomic_inc(&skb->users);
+               end_bh_atomic();
+
+               if (skb) {
+                       int err;
+
+                       msg.msg_control = optval;
+                       msg.msg_controllen = len;
+                       msg.msg_flags = 0;
+                       err = datagram_recv_ctl(sk, &msg, skb);
+                       kfree_skb(skb);
+                       if (err)
+                               return err;
+                       len -= msg.msg_controllen;
+               } else
+                       len = 0;
+               return put_user(len, optlen);
+       }
+       default:
+       }
        return -EINVAL;
 }
 
index c50f37f..8895048 100644 (file)
@@ -5,7+5,7 @@
  *     Authors:
  *     Pedro Roque             <roque@di.fc.ul.pt>     
  *
- *     $Id: mcast.c,v 1.16 1998/05/07 15:43:10 davem Exp $
+ *     $Id: mcast.c,v 1.17 1998/08/26 12:05:06 davem Exp $
  *
  *     Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c 
  *
@@ -79,7+79,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr)
        if (!(ipv6_addr_type(addr) & IPV6_ADDR_MULTICAST))
                return -EINVAL;
 
-       mc_lst = kmalloc(sizeof(struct ipv6_mc_socklist), GFP_KERNEL);
+       mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL);
 
        if (mc_lst == NULL)
                return -ENOMEM;
@@ -91,13+91,15 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr)
        if (ifindex == 0) {
                struct rt6_info *rt;
                rt = rt6_lookup(addr, NULL, 0, 0);
-               if (rt)
+               if (rt) {
                        dev = rt->rt6i_dev;
+                       dst_release(&rt->u.dst);
+               }
        } else
                dev = dev_get_by_index(ifindex);
 
        if (dev == NULL) {
-               kfree(mc_lst);
+               sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
                return -ENODEV;
        }
 
@@ -108,7+110,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr)
        err = ipv6_dev_mc_inc(dev, addr);
 
        if (err) {
-               kfree(mc_lst);
+               sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
                return err;
        }
 
@@ -133,7+135,7 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, struct in6_addr *addr)
                        *lnk = mc_lst->next;
                        if ((dev = dev_get_by_index(ifindex)) != NULL)
                                ipv6_dev_mc_dec(dev, &mc_lst->addr);
-                       kfree(mc_lst);
+                       sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
                        return 0;
                }
        }
@@ -153,7+155,7 @@ void ipv6_sock_mc_close(struct sock *sk)
                        ipv6_dev_mc_dec(dev, &mc_lst->addr);
 
                np->ipv6_mc_list = mc_lst->next;
-               kfree(mc_lst);
+               sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
        }
 }
 
@@ -308,11+310,19 @@ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime)
 {
        unsigned long delay = resptime;
 
+       /* Do not start timer for addresses with link/host scope */
+       if (ipv6_addr_type(&ma->mca_addr)&(IPV6_ADDR_LINKLOCAL|IPV6_ADDR_LOOPBACK))
+               return;
+
        if (del_timer(&ma->mca_timer))
                delay = ma->mca_timer.expires - jiffies;
 
-       if (delay >= resptime)
-               delay = net_random() % resptime;
+       if (delay >= resptime) {
+               if (resptime)
+                       delay = net_random() % resptime;
+               else
+                       delay = 1;
+       }
 
        ma->mca_flags |= MAF_TIMER_RUNNING;
        ma->mca_timer.expires = jiffies + delay;
@@ -325,10+335,16 @@ int igmp6_event_query(struct sk_buff *skb, struct icmp6hdr *hdr, int len)
        struct in6_addr *addrp;
        unsigned long resptime;
 
-       if (len < sizeof(struct icmp6hdr) + sizeof(struct ipv6hdr))
+       if (len < sizeof(struct icmp6hdr) + sizeof(struct in6_addr))
                return -EINVAL;
 
-       resptime = hdr->icmp6_maxdelay;
+       /* Drop queries whose source address is not link-local */
+       if (!(ipv6_addr_type(&skb->nh.ipv6h->saddr)&IPV6_ADDR_LINKLOCAL))
+               return -EINVAL;
+
+       resptime = ntohs(hdr->icmp6_maxdelay);
+       /* Translate milliseconds to jiffies */
+       resptime = (resptime<<10)/(1024000/HZ);
 
        addrp = (struct in6_addr *) (hdr + 1);
 
@@ -365,7+381,15 @@ int igmp6_event_report(struct sk_buff *skb, struct icmp6hdr *hdr, int len)
        struct device *dev;
        int hash;
 
-       if (len < sizeof(struct icmp6hdr) + sizeof(struct ipv6hdr))
+       /* Our own report looped back. Ignore it. */
+       if (skb->pkt_type == PACKET_LOOPBACK)
+               return 0;
+
+       if (len < sizeof(struct icmp6hdr) + sizeof(struct in6_addr))
+               return -EINVAL;
+
+       /* Drop reports whose source address is not link-local */
+       if (!(ipv6_addr_type(&skb->nh.ipv6h->saddr)&IPV6_ADDR_LINKLOCAL))
                return -EINVAL;
 
        addrp = (struct in6_addr *) (hdr + 1);
@@ -399,14+423,25 @@ void igmp6_send(struct in6_addr *addr, struct device *dev, int type)
         struct sk_buff *skb;
         struct icmp6hdr *hdr;
        struct inet6_ifaddr *ifp;
-       struct in6_addr *addrp; 
-       int err, len, plen;
+       struct in6_addr *snd_addr;
+       struct in6_addr *addrp;
+       struct in6_addr all_routers;
+       int err, len, payload_len, full_len;
+       u8 ra[8] = { IPPROTO_ICMPV6, 0,
+                    IPV6_TLV_ROUTERALERT, 0, 0, 0,
+                    IPV6_TLV_PADN, 0 };
+
+       snd_addr = addr;
+       if (type == ICMPV6_MGM_REDUCTION) {
+               snd_addr = &all_routers;
+               ipv6_addr_all_routers(&all_routers);
+       }
 
        len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr);
+       payload_len = len + sizeof(ra);
+       full_len = sizeof(struct ipv6hdr) + payload_len;
 
-       plen = sizeof(struct ipv6hdr) + len;
-
-       skb = sock_alloc_send_skb(sk, dev->hard_header_len + plen + 15, 0, 0, &err);
+       skb = sock_alloc_send_skb(sk, dev->hard_header_len + full_len + 15, 0, 0, &err);
 
        if (skb == NULL)
                return;
@@ -414,8+449,8 @@ void igmp6_send(struct in6_addr *addr, struct device *dev, int type)
        skb_reserve(skb, (dev->hard_header_len + 15) & ~15);
        if (dev->hard_header) {
                unsigned char ha[MAX_ADDR_LEN];
-               ndisc_mc_map(addr, ha, dev, 1);
-               dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, plen);
+               ndisc_mc_map(snd_addr, ha, dev, 1);
+               dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, full_len);
        }
 
        ifp = ipv6_get_lladdr(dev);
@@ -428,11+463,9 @@ void igmp6_send(struct in6_addr *addr, struct device *dev, int type)
                return;
        }
 
-       ip6_nd_hdr(sk, skb, dev, &ifp->addr, addr, IPPROTO_ICMPV6, len);
+       ip6_nd_hdr(sk, skb, dev, &ifp->addr, snd_addr, NEXTHDR_HOP, payload_len);
 
-       /*
-        *      need hop-by-hop router alert option.
-        */
+       memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra));
 
        hdr = (struct icmp6hdr *) skb_put(skb, sizeof(struct icmp6hdr));
        memset(hdr, 0, sizeof(struct icmp6hdr));
@@ -441,11+474,16 @@ void igmp6_send(struct in6_addr *addr, struct device *dev, int type)
        addrp = (struct in6_addr *) skb_put(skb, sizeof(struct in6_addr));
        ipv6_addr_copy(addrp, addr);
 
-       hdr->icmp6_cksum = csum_ipv6_magic(&ifp->addr, addr, len,
+       hdr->icmp6_cksum = csum_ipv6_magic(&ifp->addr, snd_addr, len,
                                           IPPROTO_ICMPV6,
                                           csum_partial((__u8 *) hdr, len, 0));
 
        dev_queue_xmit(skb);
+       if (type == ICMPV6_MGM_REDUCTION)
+               icmpv6_statistics.Icmp6OutGroupMembReductions++;
+       else
+               icmpv6_statistics.Icmp6OutGroupMembResponses++;
+       icmpv6_statistics.Icmp6OutMsgs++;
 }
 
 static void igmp6_join_group(struct ifmcaddr6 *ma)
@@ -455,7+493,7 @@ static void igmp6_join_group(struct ifmcaddr6 *ma)
 
        addr_type = ipv6_addr_type(&ma->mca_addr);
 
-       if ((addr_type & IPV6_ADDR_LINKLOCAL))
+       if ((addr_type & (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_LOOPBACK)))
                return;
 
        igmp6_send(&ma->mca_addr, ma->dev, ICMPV6_MGM_REPORT);
index 26e42a1..b6c855a 100644 (file)
 #include <net/ndisc.h>
 #include <net/ip6_route.h>
 #include <net/addrconf.h>
-
-
+#include <net/icmp.h>
 
 #include <net/checksum.h>
 #include <linux/proc_fs.h>
@@ -350,6+349,9 @@ void ndisc_send_na(struct device *dev, struct neighbour *neigh,
                                                              len, 0));
 
        dev_queue_xmit(skb);
+
+       icmpv6_statistics.Icmp6OutNeighborAdvertisements++;
+       icmpv6_statistics.Icmp6OutMsgs++;
 }        
 
 void ndisc_send_ns(struct device *dev, struct neighbour *neigh,
@@ -410,6+412,9 @@ void ndisc_send_ns(struct device *dev, struct neighbour *neigh,
                                                              len, 0));
        /* send it! */
        dev_queue_xmit(skb);
+
+       icmpv6_statistics.Icmp6OutNeighborSolicits++;
+       icmpv6_statistics.Icmp6OutMsgs++;
 }
 
 void ndisc_send_rs(struct device *dev, struct in6_addr *saddr,
@@ -458,6+463,9 @@ void ndisc_send_rs(struct device *dev, struct in6_addr *saddr,
 
        /* send it! */
        dev_queue_xmit(skb);
+
+       icmpv6_statistics.Icmp6OutRouterSolicits++;
+       icmpv6_statistics.Icmp6OutMsgs++;
 }
                   
 
@@ -575,6+583,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 
        if (rt && lifetime == 0) {
                ip6_del_rt(rt);
+               dst_release(&rt->u.dst);
                rt = NULL;
        }
 
@@ -582,11+591,6 @@ static void ndisc_router_discovery(struct sk_buff *skb)
                ND_PRINTK2("ndisc_rdisc: adding default router\n");
 
                rt = rt6_add_dflt_router(&skb->nh.ipv6h->saddr, skb->dev);
-
-#if 1
-               /* BUGGGGG! Previous routine can return invalid pointer. */
-               rt = rt6_get_dflt_router(&skb->nh.ipv6h->saddr, skb->dev);
-#endif
                if (rt == NULL) {
                        ND_PRINTK1("route_add failed\n");
                        return;
@@ -595,6+599,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
                neigh = rt->rt6i_nexthop;
                if (neigh == NULL) {
                        ND_PRINTK1("nd: add default router: null neighbour\n");
+                       dst_release(&rt->u.dst);
                        return;
                }
                neigh->flags |= NTF_ROUTER;
@@ -658,7+663,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
                                
                                mtu = htonl(*(__u32 *)(opt+4));
 
-                               if (mtu < 576 || mtu > skb->dev->mtu) {
+                               if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) {
                                        ND_PRINTK0("NDISC: router "
                                                   "announcement with mtu = %d\n",
                                                   mtu);
@@ -671,10+676,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
                                        if (rt)
                                                rt->u.dst.pmtu = mtu;
 
-                                       /* BUGGG... Scan routing tables and
-                                          adjust mtu on routes going
-                                          via this device
-                                        */
+                                       rt6_mtu_change(skb->dev, mtu);
                                }
                        }
                         break;
@@ -689,6+691,8 @@ static void ndisc_router_discovery(struct sk_buff *skb)
                 optlen -= len;
                 opt += len;
         }
+       if (rt)
+               dst_release(&rt->u.dst);
 }
 
 static void ndisc_redirect_rcv(struct sk_buff *skb)
@@ -698,7+702,6 @@ static void ndisc_redirect_rcv(struct sk_buff *skb)
        struct in6_addr *dest;
        struct in6_addr *target;        /* new first hop to destination */
        struct neighbour *neigh;
-       struct rt6_info *rt;
        int on_link = 0;
        int optlen;
 
@@ -740,20+743,21 @@ static void ndisc_redirect_rcv(struct sk_buff *skb)
        if (!in6_dev || in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
                return;
 
-       /* passed validation tests
+       /* passed validation tests */
 
-          NOTE We should not install redirect if sender did not supply
-          ll address on link, which requires it. It would break, if
-          we have non-transitive address resolution protocol.
-          Fix it later. --ANK
+       /*
+          We install redirect only if nexthop state is valid.
         */
-       rt = rt6_redirect(dest, &skb->nh.ipv6h->saddr, target, skb->dev, on_link);
-
-       if (rt == NULL)
-               return;
 
-       neigh = rt->rt6i_nexthop;
-       ndisc_update(neigh, (u8*)(dest + 1), optlen, ND_OPT_TARGET_LL_ADDR);
+       neigh = __neigh_lookup(&nd_tbl, target, skb->dev, 1);
+       if (neigh) {
+               ndisc_update(neigh, (u8*)(dest + 1), optlen, ND_OPT_TARGET_LL_ADDR);
+               if (neigh->nud_state&NUD_VALID)
+                       rt6_redirect(dest, &skb->nh.ipv6h->saddr, neigh, on_link);
+               else
+                       __neigh_event_send(neigh, NULL);
+               neigh_release(neigh);
+       }
 }
 
 void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
@@ -773,17+777,21 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
        int hlen;
 
        dev = skb->dev;
-       rt = rt6_lookup(&skb->nh.ipv6h->saddr, NULL, dev->ifindex, 0);
+       rt = rt6_lookup(&skb->nh.ipv6h->saddr, NULL, dev->ifindex, 1);
 
-       if (rt == NULL || rt->u.dst.error) {
-               ND_PRINTK1("ndisc_send_redirect: hostunreach\n");
+       if (rt == NULL)
                return;
-       }
 
        if (rt->rt6i_flags & RTF_GATEWAY) {
                ND_PRINTK1("ndisc_send_redirect: not a neighbour\n");
+               dst_release(&rt->u.dst);
                return;
        }
+       if (!xrlim_allow(&rt->u.dst, 1*HZ)) {
+               dst_release(&rt->u.dst);
+               return;
+       }
+       dst_release(&rt->u.dst);
 
        if (dev->addr_len) {
                if (neigh->nud_state&NUD_VALID) {
@@ -797,7+805,7 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
                }
        }
 
-       rd_len = min(536 - len, ntohs(skb->nh.ipv6h->payload_len) + 8);
+       rd_len = min(IPV6_MIN_MTU-sizeof(struct ipv6hdr)-len, ntohs(skb->nh.ipv6h->payload_len) + 8);
        rd_len &= ~0x7;
        len += rd_len;
 
@@ -814,14+822,14 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
                ND_PRINTK1("ndisc_send_redirect: alloc_skb failed\n");
                return;
        }
-       
+
        hlen = 0;
 
        if (ndisc_build_ll_hdr(buff, dev, &skb->nh.ipv6h->saddr, NULL, len) == 0) {
                kfree_skb(buff);
                return;
        }
-       
+
        ip6_nd_hdr(sk, buff, dev, &ifp->addr, &skb->nh.ipv6h->saddr,
                   IPPROTO_ICMPV6, len);
 
@@ -838,9+846,9 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
        ipv6_addr_copy(addrp, target);
        addrp++;
        ipv6_addr_copy(addrp, &skb->nh.ipv6h->daddr);
-       
+
        opt = (u8*) (addrp + 1);
-               
+
        /*
         *      include target_address option
         */
@@ -858,12+866,15 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
        opt += 6;
 
        memcpy(opt, &skb->nh.ipv6h, rd_len - 8);
-       
+
        icmph->icmp6_cksum = csum_ipv6_magic(&ifp->addr, &skb->nh.ipv6h->saddr,
                                             len, IPPROTO_ICMPV6,
                                             csum_partial((u8 *) icmph, len, 0));
 
        dev_queue_xmit(buff);
+
+       icmpv6_statistics.Icmp6OutRedirects++;
+       icmpv6_statistics.Icmp6OutMsgs++;
 }
 
 static __inline__ struct neighbour *
@@ -894,15+905,15 @@ static __inline__ int ndisc_recv_na(struct neighbour *neigh, struct sk_buff *skb
 
 static void pndisc_redo(struct sk_buff *skb)
 {
-       ndisc_rcv(skb, skb->dev, &skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr,
-                 NULL, skb->len);
+       ndisc_rcv(skb, skb->len);
        kfree_skb(skb);
 }
 
-int ndisc_rcv(struct sk_buff *skb, struct device *dev,
-             struct in6_addr *saddr, struct in6_addr *daddr,
-             struct ipv6_options *opt, unsigned short len)
+int ndisc_rcv(struct sk_buff *skb, unsigned long len)
 {
+       struct device *dev = skb->dev;
+       struct in6_addr *saddr = &skb->nh.ipv6h->saddr;
+       struct in6_addr *daddr = &skb->nh.ipv6h->daddr;
        struct nd_msg *msg = (struct nd_msg *) skb->h.raw;
        struct neighbour *neigh;
        struct inet6_ifaddr *ifp;
@@ -977,7+988,7 @@ int ndisc_rcv(struct sk_buff *skb, struct device *dev,
 
                                        if (neigh) {
                                                ndisc_send_na(dev, neigh, saddr, &msg->target,
-                                                             1, 0, inc, inc);
+                                                             0, 0, inc, inc);
                                                neigh_release(neigh);
                                        }
                                } else {
@@ -1023,13+1034,14 @@ int ndisc_rcv(struct sk_buff *skb, struct device *dev,
                                        /*
                                         *      Change: router to host
                                         */
-#if 0                                  
                                        struct rt6_info *rt;
-                                       rt = ndisc_get_dflt_router(skb->dev,
-                                                                  saddr);
-                                       if (rt)
-                                               ndisc_del_dflt_router(rt);
-#endif
+                                       rt = rt6_get_dflt_router(saddr, skb->dev);
+                                       if (rt) {
+                                               /* It is safe only because
+                                                  we are in BH */
+                                               dst_release(&rt->u.dst);
+                                               ip6_del_rt(rt);
+                                       }
                                }
                        } else {
                                if (msg->icmph.icmp6_router)
index 9b24b49..31f6a2f 100644 (file)
@@ -7,7+7,7 @@
  *             PROC file system.  This is very similar to the IPv4 version,
  *             except it reports the sockets in the INET6 address family.
  *
- * Version:    $Id: proc.c,v 1.8 1998/04/13 17:06:03 davem Exp $
+ * Version:    $Id: proc.c,v 1.9 1998/08/26 12:05:11 davem Exp $
  *
  * Authors:    David S. Miller (davem@caip.rutgers.edu)
  *
 #include <linux/socket.h>
 #include <linux/net.h>
 #include <linux/in6.h>
+#include <linux/stddef.h>
 #include <net/sock.h>
 #include <net/tcp.h>
 #include <net/transp_v6.h>
+#include <net/ipv6.h>
 
 /* This is the main implementation workhorse of all these routines. */
 static int get__netinfo6(struct proto *pro, char *buffer, int format, char **start,
@@ -176,3+178,105 @@ int afinet6_get_info(char *buffer, char **start, off_t offset, int length, int d
                len = length;
        return len;
 }
+
+
+struct snmp6_item
+{
+       char *name;
+       unsigned long *ptr;
+} snmp6_list[] = {
+/* ipv6 mib according to draft-ietf-ipngwg-ipv6-mib-04 */
+#define SNMP6_GEN(x) { #x , &ipv6_statistics.x }
+       SNMP6_GEN(Ip6InReceives),
+       SNMP6_GEN(Ip6InHdrErrors),
+       SNMP6_GEN(Ip6InTooBigErrors),
+       SNMP6_GEN(Ip6InNoRoutes),
+       SNMP6_GEN(Ip6InAddrErrors),
+       SNMP6_GEN(Ip6InUnknownProtos),
+       SNMP6_GEN(Ip6InTruncatedPkts),
+       SNMP6_GEN(Ip6InDiscards),
+       SNMP6_GEN(Ip6InDelivers),
+       SNMP6_GEN(Ip6OutForwDatagrams),
+       SNMP6_GEN(Ip6OutRequests),
+       SNMP6_GEN(Ip6OutDiscards),
+       SNMP6_GEN(Ip6OutNoRoutes),
+       SNMP6_GEN(Ip6ReasmTimeout),
+       SNMP6_GEN(Ip6ReasmReqds),
+       SNMP6_GEN(Ip6ReasmOKs),
+       SNMP6_GEN(Ip6ReasmFails),
+       SNMP6_GEN(Ip6FragOKs),
+       SNMP6_GEN(Ip6FragFails),
+       SNMP6_GEN(Ip6FragCreates),
+       SNMP6_GEN(Ip6InMcastPkts),
+       SNMP6_GEN(Ip6OutMcastPkts),
+#undef SNMP6_GEN
+/* icmpv6 mib according to draft-ietf-ipngwg-ipv6-icmp-mib-02
+
+   Exceptions:  {In|Out}AdminProhibs are removed, because I see
+                no good reason to account for them separately
+               from other dest.unreachs.
+               OutErrs is zero identically.
+               OutEchos too.
+               OutRouterAdvertisements too.
+               OutGroupMembQueries too.
+ */
+#define SNMP6_GEN(x) { #x , &icmpv6_statistics.x }
+       SNMP6_GEN(Icmp6InMsgs),
+       SNMP6_GEN(Icmp6InErrors),
+       SNMP6_GEN(Icmp6InDestUnreachs),
+       SNMP6_GEN(Icmp6InPktTooBigs),
+       SNMP6_GEN(Icmp6InTimeExcds),
+       SNMP6_GEN(Icmp6InParmProblems),
+       SNMP6_GEN(Icmp6InEchos),
+       SNMP6_GEN(Icmp6InEchoReplies),
+       SNMP6_GEN(Icmp6InGroupMembQueries),
+       SNMP6_GEN(Icmp6InGroupMembResponses),
+       SNMP6_GEN(Icmp6InGroupMembReductions),
+       SNMP6_GEN(Icmp6InRouterSolicits),
+       SNMP6_GEN(Icmp6InRouterAdvertisements),
+       SNMP6_GEN(Icmp6InNeighborSolicits),
+       SNMP6_GEN(Icmp6InNeighborAdvertisements),
+       SNMP6_GEN(Icmp6InRedirects),
+       SNMP6_GEN(Icmp6OutMsgs),
+       SNMP6_GEN(Icmp6OutDestUnreachs),
+       SNMP6_GEN(Icmp6OutPktTooBigs),
+       SNMP6_GEN(Icmp6OutTimeExcds),
+       SNMP6_GEN(Icmp6OutParmProblems),
+       SNMP6_GEN(Icmp6OutEchoReplies),
+       SNMP6_GEN(Icmp6OutRouterSolicits),
+       SNMP6_GEN(Icmp6OutNeighborSolicits),
+       SNMP6_GEN(Icmp6OutNeighborAdvertisements),
+       SNMP6_GEN(Icmp6OutRedirects),
+       SNMP6_GEN(Icmp6OutGroupMembResponses),
+       SNMP6_GEN(Icmp6OutGroupMembReductions),
+#undef SNMP6_GEN
+#define SNMP6_GEN(x) { "Udp6" #x , &udp_stats_in6.Udp##x }
+       SNMP6_GEN(InDatagrams),
+       SNMP6_GEN(NoPorts),
+       SNMP6_GEN(InErrors),
+       SNMP6_GEN(OutDatagrams)
+#undef SNMP6_GEN
+};
+
+
+int afinet6_get_snmp(char *buffer, char **start, off_t offset, int length,
+                    int dummy)
+{
+       int len = 0;
+       int i;
+
+       for (i=0; i<sizeof(snmp6_list)/sizeof(snmp6_list[0]); i++)
+               len += sprintf(buffer+len, "%-32s\t%ld\n", snmp6_list[i].name,
+                              *(snmp6_list[i].ptr));
+
+       len -= offset;
+
+       if (len > length)
+               len = length;
+       if(len < 0)
+               len = 0;
+
+       *start = buffer + offset;
+
+       return len;
+}
index e8e8348..76339ff 100644 (file)
@@ -7,7+7,7 @@
  *
  *     Adapted from linux/net/ipv4/raw.c
  *
- *     $Id: raw.c,v 1.20 1998/07/15 05:05:41 davem Exp $
+ *     $Id: raw.c,v 1.21 1998/08/26 12:05:13 davem Exp $
  *
  *     This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
@@ -156,9+156,8 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 
        /* Check if the address belongs to the host. */
        if (addr_type == IPV6_ADDR_MAPPED) {
-               v4addr = addr->sin6_addr.s6_addr32[3];
-               if (inet_addr_type(v4addr) != RTN_LOCAL)
-                       return(-EADDRNOTAVAIL);
+               /* Raw sockets are IPv6 only */
+               return(-EADDRNOTAVAIL);
        } else {
                if (addr_type != IPV6_ADDR_ANY) {
                        /* ipv4 addr of the socket is invalid.  Only the
@@ -182,10+181,11 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
        return 0;
 }
 
-void rawv6_err(struct sock *sk, int type, int code, unsigned char *buff,
-              struct in6_addr *saddr, struct in6_addr *daddr)
+void rawv6_err(struct sock *sk, struct sk_buff *skb, struct ipv6hdr *hdr,
+              struct inet6_skb_parm *opt,
+              int type, int code, unsigned char *buff, u32 info)
 {
-       if (sk == NULL) 
+       if (sk == NULL)
                return;
 }
 
@@ -193,12+193,12 @@ static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb)
 {
        /* Charge it to the socket. */
        if (sock_queue_rcv_skb(sk,skb)<0) {
-               /* ip_statistics.IpInDiscards++; */
+               ipv6_statistics.Ip6InDiscards++;
                kfree_skb(skb);
                return 0;
        }
 
-       /* ip_statistics.IpInDelivers++; */
+       ipv6_statistics.Ip6InDelivers++;
        return 0;
 }
 
@@ -209,22+209,11 @@ static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb)
  *     maybe we could have the network decide uppon a hint if it 
  *     should call raw_rcv for demultiplexing
  */
-int rawv6_rcv(struct sk_buff *skb, struct device *dev,
-             struct in6_addr *saddr, struct in6_addr *daddr,
-             struct ipv6_options *opt, unsigned short len)
+int rawv6_rcv(struct sock *sk, struct sk_buff *skb, unsigned long len)
 {
-       struct sock *sk;
-
-       sk = skb->sk;
-
        if (sk->ip_hdrincl)
                skb->h.raw = skb->nh.raw;
 
-       if (atomic_read(&sk->sock_readers)) {
-               __skb_queue_tail(&sk->back_log, skb);
-               return 0;
-       }
-
        rawv6_rcv_skb(sk, skb);
        return 0;
 }
@@ -255,8+244,12 @@ int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, int len,
        if (!skb)
                goto out;
        
-       copied = min(len, skb->tail - skb->h.raw);
-       
+       copied = skb->tail - skb->h.raw;
+       if (copied > len) {
+               copied = len;
+               msg->msg_flags |= MSG_TRUNC;
+       }
+
        err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
        sk->stamp=skb->stamp;
        if (err)
@@ -269,7+262,7 @@ int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, int len,
                       sizeof(struct in6_addr));
        }
 
-       if (msg->msg_controllen)
+       if (sk->net_pinfo.af_inet6.rxopt.all)
                datagram_recv_ctl(sk, msg, skb);
        err = copied;
 
@@ -332,11+325,9 @@ static int rawv6_frag_cksum(const void *data, struct in6_addr *addr,
                        csum = (__u16 *) (buff + opt->offset);
                        *csum = hdr->cksum;
                } else {
-                       /* 
-                        *  FIXME 
-                        *  signal an error to user via sk->err
-                        */
-                       printk(KERN_DEBUG "icmp: cksum offset too big\n");
+                       if (net_ratelimit())
+                               printk(KERN_DEBUG "icmp: cksum offset too big\n");
+                       return -EINVAL;
                }
        }       
        return 0; 
@@ -345,10+336,10 @@ static int rawv6_frag_cksum(const void *data, struct in6_addr *addr,
 
 static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len)
 {
-       struct ipv6_options opt_space;
+       struct ipv6_txoptions opt_space;
        struct sockaddr_in6 * sin6 = (struct sockaddr_in6 *) msg->msg_name;
        struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
-       struct ipv6_options *opt = NULL;
+       struct ipv6_txoptions *opt = NULL;
        struct in6_addr *saddr = NULL;
        struct flowi fl;
        int addr_len = msg->msg_namelen;
@@ -360,11+351,8 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len)
 
        /* Rough check on arithmetic overflow,
           better check is made in ip6_build_xmit
-
-          When jumbo header will be implemeted we will remove it
-          at all (len will be size_t)
         */
-       if (len < 0 || len > 0xFFFF)
+       if (len < 0)
                return -EMSGSIZE;
 
        /* Mirror BSD error message compatibility */
@@ -394,14+382,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len)
                        return(-EINVAL);
 
                daddr = &sin6->sin6_addr;
-               
-               /* BUGGGG If route is not cloned, this check always
-                  fails, hence dst_cache only slows down tramsmission --ANK
-                */
-               if (sk->dst_cache && ipv6_addr_cmp(daddr, &np->daddr)) {
-                       dst_release(sk->dst_cache);
-                       sk->dst_cache = NULL;
-               }               
        } else {
                if (sk->state != TCP_ESTABLISHED) 
                        return(-EINVAL);
@@ -422,12+402,14 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len)
 
        if (msg->msg_controllen) {
                opt = &opt_space;
-               memset(opt, 0, sizeof(struct ipv6_options));
+               memset(opt, 0, sizeof(struct ipv6_txoptions));
 
                err = datagram_send_ctl(msg, &fl.oif, &saddr, opt, &hlimit);
                if (err < 0)
                        return err;
        }
+       if (opt == NULL || !(opt->opt_nflen|opt->opt_flen))
+               opt = np->opt;
 
        raw_opt = &sk->tp_pinfo.tp_raw;
 
@@ -594,8+576,9 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname,
 
 static void rawv6_close(struct sock *sk, unsigned long timeout)
 {
+       /* For an explanation, see raw_close in ipv4/raw.c */
        sk->state = TCP_CLOSE;
-       ipv6_sock_mc_close(sk);
+       raw_v6_unhash(sk);
        if (sk->num == IPPROTO_RAW)
                ip6_ra_control(sk, -1, NULL);
        sk->dead = 1;
@@ -619,7+602,7 @@ struct proto rawv6_prot = {
        datagram_poll,                  /* poll */
        NULL,                           /* ioctl */
        rawv6_init_sk,                  /* init */
-       NULL,                           /* destroy */
+       inet6_destroy_sock,             /* destroy */
        NULL,                           /* shutdown */
        rawv6_setsockopt,               /* setsockopt */
        rawv6_getsockopt,               /* getsockopt */
index e78cf97..e455b05 100644 (file)
@@ -5,7+5,7 @@
  *     Authors:
  *     Pedro Roque             <roque@di.fc.ul.pt>     
  *
- *     $Id: reassembly.c,v 1.10 1998/04/30 16:24:32 freitag Exp $
+ *     $Id: reassembly.c,v 1.11 1998/08/26 12:05:16 davem Exp $
  *
  *     Based on: net/ipv4/ip_fragment.c
  *
 #include <net/ndisc.h>
 #include <net/addrconf.h>
 
+int sysctl_ip6frag_high_thresh = 256*1024;
+int sysctl_ip6frag_low_thresh = 192*1024;
+int sysctl_ip6frag_time = IPV6_FRAG_TIMEOUT;
+
+atomic_t ip6_frag_mem = ATOMIC_INIT(0);
+
+struct ipv6_frag {
+       __u16                   offset;
+       __u16                   len;
+       struct sk_buff          *skb;
+
+       struct frag_hdr         *fhdr;
+
+       struct ipv6_frag        *next;
+};
+
+/*
+ *     Equivalent of ipv4 struct ipq
+ */
+
+struct frag_queue {
+
+       struct frag_queue       *next;
+       struct frag_queue       *prev;
+
+       __u32                   id;             /* fragment id          */
+       struct in6_addr         saddr;
+       struct in6_addr         daddr;
+       struct timer_list       timer;          /* expire timer         */
+       struct ipv6_frag        *fragments;
+       struct device           *dev;
+       int                     iif;
+       __u8                    last_in;        /* has first/last segment arrived? */
+#define FIRST_IN               2
+#define LAST_IN                        1
+       __u8                    nexthdr;
+       __u16                   nhoffset;
+};
 
 static struct frag_queue ipv6_frag_queue = {
        &ipv6_frag_queue, &ipv6_frag_queue,
        0, {{{0}}}, {{{0}}},
        {0}, NULL, NULL,
-       0, 0, NULL
+       0, 0, 0, 0
 };
 
+/* Memory Tracking Functions. */
+extern __inline__ void frag_kfree_skb(struct sk_buff *skb)
+{
+       atomic_sub(skb->truesize, &ip6_frag_mem);
+       kfree_skb(skb);
+}
+
+extern __inline__ void frag_kfree_s(void *ptr, int len)
+{
+       atomic_sub(len, &ip6_frag_mem);
+       kfree(ptr);
+}
+extern __inline__ void *frag_kmalloc(int size, int pri)
+{
+       void *vp = kmalloc(size, pri);
+
+       if(!vp)
+               return NULL;
+       atomic_add(size, &ip6_frag_mem);
+       return vp;
+}
+
+
 static void                    create_frag_entry(struct sk_buff *skb, 
-                                                 struct device *dev,
                                                  __u8 *nhptr,
                                                  struct frag_hdr *fhdr);
-static int                     reasm_frag_1(struct frag_queue *fq, 
-                                            struct sk_buff **skb_in);
+static u8 *                    reasm_frag(struct frag_queue *fq, 
+                                          struct sk_buff **skb_in);
 
 static void                    reasm_queue(struct frag_queue *fq, 
                                            struct sk_buff *skb, 
-                                           struct frag_hdr *fhdr);
+                                           struct frag_hdr *fhdr,
+                                           u8 *nhptr);
 
-static int reasm_frag(struct frag_queue *fq, struct sk_buff **skb, 
-                     __u8 *nhptr,
-                     struct frag_hdr *fhdr)
-{
-       __u32   expires = jiffies + IPV6_FRAG_TIMEOUT;
-       int nh;
-
-       if (del_timer(&fq->timer))
-               expires = fq->timer.expires;
+static void                    fq_free(struct frag_queue *fq);
 
-       /*
-        *      We queue the packet even if it's the last.
-        *      It's a trade off. This allows the reassembly 
-        *      code to be simpler (=faster) and of the
-        *      steps we do for queueing the only unnecessary 
-        *      one it's the kmalloc for a struct ipv6_frag.
-        *      Feel free to try other alternatives...
-        */
-       if ((fhdr->frag_off & __constant_htons(0x0001)) == 0) {
-               fq->last_in = 1;
-               fq->nhptr = nhptr;
-       }
-       reasm_queue(fq, *skb, fhdr);
+static void frag_prune(void)
+{
+       struct frag_queue *fq;
 
-       if (fq->last_in) {
-               if ((nh = reasm_frag_1(fq, skb)))
-                       return nh;
+       while ((fq = ipv6_frag_queue.next) != &ipv6_frag_queue) {
+               ipv6_statistics.Ip6ReasmFails++;
+               fq_free(fq);
+               if (atomic_read(&ip6_frag_mem) <= sysctl_ip6frag_low_thresh)
+                       return;
        }
-
-       fq->timer.expires = expires;
-       add_timer(&fq->timer);
-       
-       return 0;
+       if (atomic_read(&ip6_frag_mem))
+               printk(KERN_DEBUG "IPv6 frag_prune: memleak\n");
+       atomic_set(&ip6_frag_mem, 0);
 }
 
-int ipv6_reassembly(struct sk_buff **skbp, struct device *dev, __u8 *nhptr,
-                   struct ipv6_options *opt)
+
+u8* ipv6_reassembly(struct sk_buff **skbp, __u8 *nhptr)
 {
        struct sk_buff *skb = *skbp; 
        struct frag_hdr *fhdr = (struct frag_hdr *) (skb->h.raw);
        struct frag_queue *fq;
        struct ipv6hdr *hdr;
 
+       hdr = skb->nh.ipv6h;
+
+       ipv6_statistics.Ip6ReasmReqds++;
+
+       /* Jumbo payload inhibits frag. header */
+       if (hdr->payload_len==0) {
+               icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb->h.raw);
+               return NULL;
+       }
        if ((u8 *)(fhdr+1) > skb->tail) {
                icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb->h.raw);
-               return 0;
+               return NULL;
        }
-       hdr = skb->nh.ipv6h;
+       if (atomic_read(&ip6_frag_mem) > sysctl_ip6frag_high_thresh)
+               frag_prune();
+
        for (fq = ipv6_frag_queue.next; fq != &ipv6_frag_queue; fq = fq->next) {
                if (fq->id == fhdr->identification && 
                    !ipv6_addr_cmp(&hdr->saddr, &fq->saddr) &&
-                   !ipv6_addr_cmp(&hdr->daddr, &fq->daddr))
-                       return reasm_frag(fq, skbp, nhptr,fhdr);
+                   !ipv6_addr_cmp(&hdr->daddr, &fq->daddr)) {
+
+                       reasm_queue(fq, skb, fhdr, nhptr);
+
+                       if (fq->last_in == (FIRST_IN|LAST_IN))
+                               return reasm_frag(fq, skbp);
+
+                       return NULL;
+               }
        }
-       
-       create_frag_entry(skb, dev, nhptr, fhdr);
 
-       return 0;
+       create_frag_entry(skb, nhptr, fhdr);
+
+       return NULL;
 }
 
 
@@ -125,11+187,13 @@ static void fq_free(struct frag_queue *fq)
 {
        struct ipv6_frag *fp, *back;
 
-       for(fp = fq->fragments; fp; ) {
-               kfree_skb(fp->skb);             
+       del_timer(&fq->timer);
+
+       for (fp = fq->fragments; fp; ) {
+               frag_kfree_skb(fp->skb);
                back = fp;
                fp=fp->next;
-               kfree(back);
+               frag_kfree_s(back, sizeof(*back));
        }
 
        fq->prev->next = fq->next;
@@ -137,7+201,7 @@ static void fq_free(struct frag_queue *fq)
 
        fq->prev = fq->next = NULL;
        
-       kfree(fq);
+       frag_kfree_s(fq, sizeof(*fq));
 }
 
 static void frag_expire(unsigned long data)
@@ -147,33+211,50 @@ static void frag_expire(unsigned long data)
 
        fq = (struct frag_queue *) data;
 
-       del_timer(&fq->timer);
-
        frag = fq->fragments;
 
+       ipv6_statistics.Ip6ReasmTimeout++;
+       ipv6_statistics.Ip6ReasmFails++;
+
        if (frag == NULL) {
                printk(KERN_DEBUG "invalid fragment queue\n");
                return;
        }
 
-       icmpv6_send(frag->skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0,
-                   frag->skb->dev);
+       /* Send error only if the first segment arrived.
+          (fixed --ANK (980728))
+        */
+       if (fq->last_in&FIRST_IN) {
+               struct device *dev = dev_get_by_index(fq->iif);
+
+               /*
+                  But use as source the device on which the LAST ARRIVED
+                  segment was received. And do not use the fq->dev
+                  pointer directly; the device might have already disappeared.
+                */
+               if (dev) {
+                       frag->skb->dev = dev;
+                       icmpv6_send(frag->skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0,
+                                   dev);
+               }
+       }
        
        fq_free(fq);
 }
 
 
-static void create_frag_entry(struct sk_buff *skb, struct device *dev, 
+static void create_frag_entry(struct sk_buff *skb,
                              __u8 *nhptr,
                              struct frag_hdr *fhdr)
 {
        struct frag_queue *fq;
        struct ipv6hdr *hdr; 
 
-       fq = (struct frag_queue *) kmalloc(sizeof(struct frag_queue), 
-                                          GFP_ATOMIC);
+       fq = (struct frag_queue *) frag_kmalloc(sizeof(struct frag_queue), 
+                                               GFP_ATOMIC);
 
        if (fq == NULL) {
+               ipv6_statistics.Ip6ReasmFails++;
                kfree_skb(skb);
                return;
        }
@@ -186,38+267,41 @@ static void create_frag_entry(struct sk_buff *skb, struct device *dev,
        ipv6_addr_copy(&fq->saddr, &hdr->saddr);
        ipv6_addr_copy(&fq->daddr, &hdr->daddr);
 
-       fq->dev = dev;
-
        /* init_timer has been done by the memset */
        fq->timer.function = frag_expire;
        fq->timer.data = (long) fq;
-       fq->timer.expires = jiffies + IPV6_FRAG_TIMEOUT;
+       fq->timer.expires = jiffies + sysctl_ip6frag_time;
 
-       fq->nexthdr = fhdr->nexthdr;
+       reasm_queue(fq, skb, fhdr, nhptr);
 
+       if (fq->fragments) {
+               fq->prev = ipv6_frag_queue.prev;
+               fq->next = &ipv6_frag_queue;
+               fq->prev->next = fq;
+               ipv6_frag_queue.prev = fq;
 
-       if ((fhdr->frag_off & __constant_htons(0x0001)) == 0) {
-               fq->last_in = 1;
-               fq->nhptr = nhptr;
-       }
-       reasm_queue(fq, skb, fhdr);
-
-       fq->prev = ipv6_frag_queue.prev;
-       fq->next = &ipv6_frag_queue;
-       fq->prev->next = fq;
-       ipv6_frag_queue.prev = fq;
-       
-       add_timer(&fq->timer);
+               add_timer(&fq->timer);
+       } else
+               frag_kfree_s(fq, sizeof(*fq));
 }
 
 
+/*
+ *     We queue the packet even if it's the last.
+ *     It's a trade off. This allows the reassembly 
+ *     code to be simpler (=faster) and of the
+ *     steps we do for queueing the only unnecessary 
+ *     one it's the kmalloc for a struct ipv6_frag.
+ *     Feel free to try other alternatives...
+ */
+
 static void reasm_queue(struct frag_queue *fq, struct sk_buff *skb, 
-                                    struct frag_hdr *fhdr)
+                                    struct frag_hdr *fhdr, u8 *nhptr)
 {
        struct ipv6_frag *nfp, *fp, **bptr;
 
-       nfp = (struct ipv6_frag *) kmalloc(sizeof(struct ipv6_frag), 
-                                          GFP_ATOMIC);
+       nfp = (struct ipv6_frag *) frag_kmalloc(sizeof(struct ipv6_frag), 
+                                               GFP_ATOMIC);
 
        if (nfp == NULL) {              
                kfree_skb(skb);
@@ -228,24+312,40 @@ static void reasm_queue(struct frag_queue *fq, struct sk_buff *skb,
        nfp->len = (ntohs(skb->nh.ipv6h->payload_len) -
                    ((u8 *) (fhdr + 1) - (u8 *) (skb->nh.ipv6h + 1)));
 
-       if ((u32)nfp->offset + (u32)nfp->len > 65536) {
+       if ((u32)nfp->offset + (u32)nfp->len >= 65536) {
                icmpv6_param_prob(skb,ICMPV6_HDR_FIELD, (u8*)&fhdr->frag_off); 
                goto err;
        }
+       if (fhdr->frag_off & __constant_htons(0x0001)) {
+               /* Check if the fragment is rounded to 8 bytes.
+                * Required by the RFC.
+                * ... and would break our defragmentation algorithm 8)
+                */
+               if (nfp->len & 0x7) {
+                       printk(KERN_DEBUG "fragment not rounded to 8bytes\n");
+
+                       /*
+                          It is not in specs, but I see no reasons
+                          to send an error in this case. --ANK
+                        */
+                       if (nfp->offset == 0)
+                               icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, 
+                                                 &skb->nh.ipv6h->payload_len);
+                       goto err;
+               }
+       }
 
        nfp->skb  = skb;
        nfp->fhdr = fhdr;
-
        nfp->next = NULL;
 
        bptr = &fq->fragments;
-       
+
        for (fp = fq->fragments; fp; fp=fp->next) {
                if (nfp->offset <= fp->offset)
                        break;
                bptr = &fp->next;
        }
-       
        if (fp && fp->offset == nfp->offset) {
                if (nfp->len != fp->len) {
                        printk(KERN_DEBUG "reasm_queue: dup with wrong len\n");
@@ -254,29+354,40 @@ static void reasm_queue(struct frag_queue *fq, struct sk_buff *skb,
                /* duplicate. discard it. */
                goto err;
        }
-       
-       *bptr = nfp;
-       nfp->next = fp;
 
-#ifdef STRICT_RFC
-       if (fhdr->frag_off & __constant_htons(0x0001)) {
-               /* Check if the fragment is rounded to 8 bytes.
-                * Required by the RFC.
-                */
-               if (nfp->len & 0x7) {
-                       printk(KERN_DEBUG "fragment not rounded to 8bytes\n");
+       atomic_add(skb->truesize, &ip6_frag_mem);
 
-                       icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, 
-                                         &skb->nh.ipv6h->payload_len);
-                       goto err;
-               }
+       /* All the checks are done, fragment is accepted.
+          Only now we are allowed to update reassembly data!
+          (fixed --ANK (980728))
+        */
+
+       /* iif is always set to that of the last arrived segment */
+       fq->dev = skb->dev;
+       fq->iif = skb->dev->ifindex;
+
+       /* Last fragment */
+       if ((fhdr->frag_off & __constant_htons(0x0001)) == 0)
+               fq->last_in |= LAST_IN;
+
+       /* First fragment.
+          nexthdr and nhptr are taken from the first fragment.
+          Moreover, nexthdr is UNDEFINED for all the fragments but the
+          first one.
+          (fixed --ANK (980728))
+        */
+       if (nfp->offset == 0) {
+               fq->nexthdr = fhdr->nexthdr;
+               fq->last_in |= FIRST_IN;
+               fq->nhoffset = nhptr - skb->nh.raw;
        }
-#endif 
 
+       *bptr = nfp;
+       nfp->next = fp;
        return;
 
 err:
-       kfree(nfp);
+       frag_kfree_s(nfp, sizeof(*nfp));
        kfree_skb(skb);
 }
 
@@ -284,20+395,21 @@ err:
  *     check if this fragment completes the packet
  *     returns true on success
  */
-static int reasm_frag_1(struct frag_queue *fq, struct sk_buff **skb_in)
+static u8* reasm_frag(struct frag_queue *fq, struct sk_buff **skb_in)
 {
        struct ipv6_frag *fp;
+       struct ipv6_frag *head = fq->fragments;
        struct ipv6_frag *tail = NULL;
        struct sk_buff *skb;
        __u32  offset = 0;
        __u32  payload_len;
        __u16  unfrag_len;
        __u16  copy;
-       int    nh;
+       u8     *nhptr;
 
-       for(fp = fq->fragments; fp; fp=fp->next) {
+       for(fp = head; fp; fp=fp->next) {
                if (offset != fp->offset)
-                       return 0;
+                       return NULL;
 
                offset += fp->len;
                tail = fp;
@@ -309,31+421,42 @@ static int reasm_frag_1(struct frag_queue *fq, struct sk_buff **skb_in)
         * this means we have all fragments.
         */
 
-       unfrag_len = (u8 *) (tail->fhdr) - (u8 *) (tail->skb->nh.ipv6h + 1);
+       /* Unfragmented part is taken from the first segment.
+          (fixed --ANK (980728))
+        */
+       unfrag_len = (u8 *) (head->fhdr) - (u8 *) (head->skb->nh.ipv6h + 1);
 
        payload_len = (unfrag_len + tail->offset + 
                       (tail->skb->tail - (__u8 *) (tail->fhdr + 1)));
 
-#if 0
-       printk(KERN_DEBUG "reasm: payload len = %d\n", payload_len);
-#endif
+       if (payload_len > 65535) {
+               if (net_ratelimit())
+                       printk(KERN_DEBUG "reasm_frag: payload len = %d\n", payload_len);
+               ipv6_statistics.Ip6ReasmFails++;
+               fq_free(fq);
+               return NULL;
+       }
 
        if ((skb = dev_alloc_skb(sizeof(struct ipv6hdr) + payload_len))==NULL) {
-               printk(KERN_DEBUG "reasm_frag: no memory for reassembly\n");
+               if (net_ratelimit())
+                       printk(KERN_DEBUG "reasm_frag: no memory for reassembly\n");
+               ipv6_statistics.Ip6ReasmFails++;
                fq_free(fq);
-               return 1;
+               return NULL;
        }
 
        copy = unfrag_len + sizeof(struct ipv6hdr);
 
        skb->nh.ipv6h = (struct ipv6hdr *) skb->data;
-
        skb->dev = fq->dev;
+       skb->protocol = __constant_htons(ETH_P_IPV6);
+       skb->pkt_type = head->skb->pkt_type;
+       memcpy(skb->cb, head->skb->cb, sizeof(skb->cb));
+       skb->dst = dst_clone(head->skb->dst);
 
-       nh = fq->nexthdr;
-
-       *(fq->nhptr) = nh;
-       memcpy(skb_put(skb, copy), tail->skb->nh.ipv6h, copy);
+       memcpy(skb_put(skb, copy), head->skb->nh.ipv6h, copy);
+       nhptr = skb->nh.raw + fq->nhoffset;
+       *nhptr = fq->nexthdr;
 
        skb->h.raw = skb->tail;
 
@@ -351,18+474,19 @@ static int reasm_frag_1(struct frag_queue *fq, struct sk_buff **skb_in)
                struct ipv6_frag *back;
 
                memcpy(skb_put(skb, fp->len), (__u8*)(fp->fhdr + 1), fp->len);
-               kfree_skb(fp->skb);
+               frag_kfree_skb(fp->skb);
                back = fp;
                fp=fp->next;
-               kfree(back);
+               frag_kfree_s(back, sizeof(*back));
        }
-       
+
+       del_timer(&fq->timer);
        fq->prev->next = fq->next;
        fq->next->prev = fq->prev;
-
        fq->prev = fq->next = NULL;
-       
-       kfree(fq);
 
-       return nh;
+       frag_kfree_s(fq, sizeof(*fq));
+
+       ipv6_statistics.Ip6ReasmOKs++;
+       return nhptr;
 }
index 9d159fe..8d1f596 100644 (file)
@@ -5,7+5,7 @@
  *     Authors:
  *     Pedro Roque             <roque@di.fc.ul.pt>     
  *
- *     $Id: route.c,v 1.32 1998/07/25 23:28:52 davem Exp $
+ *     $Id: route.c,v 1.33 1998/08/26 12:05:18 davem Exp $
  *
  *     This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
 
 #if RT6_DEBUG >= 3
 #define RDBG(x) printk x
+#define RT6_TRACE(x...) printk(KERN_DEBUG x)
 #else
 #define RDBG(x)
+#define RT6_TRACE(x...) do { ; } while (0)
 #endif
 
+#if RT6_DEBUG >= 1
+#define BUG_TRAP(x) ({ if (!(x)) { printk("Assertion (" #x ") failed at " __FILE__ "(%d):" __FUNCTION__ "\n", __LINE__); } })
+#else
+#define BUG_TRAP(x) do { ; } while (0)
+#endif
+
+
 int ip6_rt_max_size = 4096;
 int ip6_rt_gc_min_interval = 5*HZ;
 int ip6_rt_gc_timeout = 60*HZ;
@@ -87,16+96,16 @@ struct dst_ops ip6_dst_ops = {
 };
 
 struct rt6_info ip6_null_entry = {
-       {{NULL, ATOMIC_INIT(1), ATOMIC_INIT(1), NULL,
-         -1, 0, 0, 0, 0, 0, 0, 0, 0,
+       {{NULL, ATOMIC_INIT(1), ATOMIC_INIT(1), &loopback_dev,
+         -1, 0, 0, 0, 0, 0, 0, 0,
          -ENETUNREACH, NULL, NULL,
          ip6_pkt_discard, ip6_pkt_discard,
 #ifdef CONFIG_NET_CLS_ROUTE
          0,
 #endif
          &ip6_dst_ops}},
-       NULL, {{{0}}}, 256, RTF_REJECT|RTF_NONEXTHOP, ~0U,
-       255, 0, {NULL}, {{{{0}}}, 0}, {{{{0}}}, 0}
+       NULL, {{{0}}}, RTF_REJECT|RTF_NONEXTHOP, ~0U,
+       255, 0, ATOMIC_INIT(1), {NULL}, {{{{0}}}, 0}, {{{{0}}}, 0}
 };
 
 struct fib6_node ip6_routing_table = {
@@ -123,89+132,6 @@ static struct rt6_info     *rt6_flow_lookup(struct rt6_info *rt,
 #define ip6_rt_policy (0)
 #endif
 
-static atomic_t        rt6_tbl_lock    = ATOMIC_INIT(0);
-static int     rt6_bh_mask     = 0;
-
-#define RT_BH_REQUEST          1
-#define RT_BH_GC               2
-
-static void __rt6_run_bh(void);
-
-/*
- *     request queue operations
- *     FIFO queue/dequeue
- */
-
-static struct rt6_req request_queue = {
-       0, NULL, &request_queue, &request_queue
-};
-
-static __inline__ void rtreq_queue(struct rt6_req * req)
-{
-       unsigned long flags;
-       struct rt6_req *next = &request_queue;
-
-       save_flags(flags);
-       cli();
-
-       req->prev = next->prev;
-       req->prev->next = req;
-       next->prev = req;
-       req->next = next;
-       restore_flags(flags);
-}
-
-static __inline__ struct rt6_req * rtreq_dequeue(void)
-{
-       struct rt6_req *next = &request_queue;
-       struct rt6_req *head;
-
-       head = next->next;
-
-       if (head == next)
-               return NULL;
-
-       head->next->prev = head->prev;
-       next->next = head->next;
-
-       head->next = NULL;
-       head->prev = NULL;
-
-       return head;
-}
-
-void rtreq_add(struct rt6_info *rt, int operation)
-{
-       struct rt6_req *rtreq;
-
-       rtreq = kmalloc(sizeof(struct rt6_req), GFP_ATOMIC);
-       
-       if (rtreq == NULL)
-               return;
-
-       memset(rtreq, 0, sizeof(struct rt6_req));
-
-       rtreq->operation = operation;
-       rtreq->ptr = rt;
-       rtreq_queue(rtreq);
-
-       rt6_bh_mask |= RT_BH_REQUEST;
-}
-
-static __inline__ void rt6_lock(void)
-{
-       atomic_inc(&rt6_tbl_lock);
-}
-
-static __inline__ void rt6_unlock(void)
-{
-       if (atomic_dec_and_test(&rt6_tbl_lock) && rt6_bh_mask) {
-               start_bh_atomic();
-               __rt6_run_bh();
-               end_bh_atomic();
-       }
-}
-
 /*
  *     Route lookup
  */
@@ -219,23+145,19 @@ static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
 
        if (oif) {
                for (sprt = rt; sprt; sprt = sprt->u.next) {
-                       if (sprt->rt6i_dev) {
-                               if (sprt->rt6i_dev->ifindex == oif)
-                                       return sprt;
-                               if (sprt->rt6i_dev->flags&IFF_LOOPBACK)
-                                       local = sprt;
-                       }
+                       struct device *dev = sprt->rt6i_dev;
+                       if (dev->ifindex == oif)
+                               return sprt;
+                       if (dev->flags&IFF_LOOPBACK)
+                               local = sprt;
                }
 
                if (local)
                        return local;
 
-               if (strict) {
-                       RDBG(("nomatch & STRICT --> ip6_null_entry\n"));
+               if (strict)
                        return &ip6_null_entry;
-               }
        }
-       RDBG(("!dev or (no match and !strict) --> rt(%p)\n", rt));
        return rt;
 }
 
@@ -282,7+204,7 @@ static struct rt6_info *rt6_best_dflt(struct rt6_info *rt, int oif)
                                break;
                        };
 
-                       if (oif && sprt->rt6i_dev && sprt->rt6i_dev->ifindex == oif) {
+                       if (oif && sprt->rt6i_dev->ifindex == oif) {
                                m += 2;
                        }
 
@@ -319,21+241,40 @@ out:
 }
 
 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
-                           int oif, int flags)
+                           int oif, int strict)
 {
        struct fib6_node *fn;
        struct rt6_info *rt;
 
-       rt6_lock();
+       start_bh_atomic();
        fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
-       rt = rt6_device_match(fn->leaf, oif, flags&RTF_LINKRT);
-       rt6_unlock();
-       return rt;
+       rt = rt6_device_match(fn->leaf, oif, strict);
+       atomic_inc(&rt->u.dst.use);
+       atomic_inc(&rt->u.dst.refcnt);
+       end_bh_atomic();
+
+       rt->u.dst.lastuse = jiffies;
+       if (rt->u.dst.error == 0)
+               return rt;
+       dst_release(&rt->u.dst);
+       return NULL;
+}
+
+static int rt6_ins(struct rt6_info *rt)
+{
+       int err;
+
+       start_bh_atomic();
+       err = fib6_add(&ip6_routing_table, rt);
+       end_bh_atomic();
+
+       return err;
 }
 
 static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr,
                                struct in6_addr *saddr)
 {
+       int err;
        struct rt6_info *rt;
 
        /*
@@ -351,18+292,24 @@ static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr,
                rt->rt6i_dst.plen = 128;
                rt->rt6i_flags |= RTF_CACHE;
 
-               if (rt->rt6i_src.plen) {
+#ifdef CONFIG_IPV6_SUBTREES
+               if (rt->rt6i_src.plen && saddr) {
                        ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
                        rt->rt6i_src.plen = 128;
                }
+#endif
 
                rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
 
-               rtreq_add(rt, RT_OPER_ADD);
-       } else {
-               rt = &ip6_null_entry;
+               dst_clone(&rt->u.dst);
+               err = rt6_ins(rt);
+               if (err == 0)
+                       return rt;
+               rt->u.dst.error = err;
+               return rt;
        }
-       return rt;
+       dst_clone(&ip6_null_entry.u.dst);
+       return &ip6_null_entry;
 }
 
 #ifdef CONFIG_RT6_POLICY
@@ -397,24+344,38 @@ static __inline__ struct rt6_info *rt6_flow_lookup_out(struct rt6_info *rt,
 
 #endif
 
+#define BACKTRACK() \
+if (rt == &ip6_null_entry && strict) { \
+       while ((fn = fn->parent) != NULL) { \
+               if (fn->fn_flags & RTN_ROOT) { \
+                       dst_clone(&rt->u.dst); \
+                       goto out; \
+               } \
+               if (fn->fn_flags & RTN_RTINFO) \
+                       goto restart; \
+       } \
+}
+
+
 void ip6_route_input(struct sk_buff *skb)
 {
        struct fib6_node *fn;
        struct rt6_info *rt;
-       struct dst_entry *dst;
+       int strict;
+
+       strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL);
 
-       RDBG(("ip6_route_input(%p) from %p\n", skb, __builtin_return_address(0)));
-       if ((dst = skb->dst) != NULL)
-               goto looped_back;
-       rt6_lock();
        fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
                         &skb->nh.ipv6h->saddr);
 
+restart:
        rt = fn->leaf;
 
        if ((rt->rt6i_flags & RTF_CACHE)) {
                if (ip6_rt_policy == 0) {
-                       rt = rt6_device_match(rt, skb->dev->ifindex, 0);
+                       rt = rt6_device_match(rt, skb->dev->ifindex, strict);
+                       BACKTRACK();
+                       dst_clone(&rt->u.dst);
                        goto out;
                }
 
@@ -425,6+386,7 @@ void ip6_route_input(struct sk_buff *skb)
                        for (sprt = rt; sprt; sprt = sprt->u.next) {
                                if (rt6_flow_match_in(sprt, skb)) {
                                        rt = sprt;
+                                       dst_clone(&rt->u.dst);
                                        goto out;
                                }
                        }
@@ -433,38+395,38 @@ void ip6_route_input(struct sk_buff *skb)
        }
 
        rt = rt6_device_match(rt, skb->dev->ifindex, 0);
+       BACKTRACK();
 
        if (ip6_rt_policy == 0) {
-               if (!rt->rt6i_nexthop && rt->rt6i_dev &&
-                   ((rt->rt6i_flags & RTF_NONEXTHOP) == 0)) {
+               if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
                        rt = rt6_cow(rt, &skb->nh.ipv6h->daddr,
                                     &skb->nh.ipv6h->saddr);
+                       goto out;
                }
+               dst_clone(&rt->u.dst);
        } else {
 #ifdef CONFIG_RT6_POLICY
                rt = rt6_flow_lookup_in(rt, skb);
+#else
+               /* NEVER REACHED */
 #endif
        }
 
 out:
-       dst = dst_clone((struct dst_entry *) rt);
-       rt6_unlock();
-
-       skb->dst = dst;
-looped_back:
-       dst->input(skb);
+       rt->u.dst.lastuse = jiffies;
+       atomic_inc(&rt->u.dst.refcnt);
+       skb->dst = (struct dst_entry *) rt;
 }
 
 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
 {
        struct fib6_node *fn;
        struct rt6_info *rt;
-       struct dst_entry *dst;
        int strict;
 
        strict = ipv6_addr_type(fl->nl_u.ip6_u.daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL);
 
-       rt6_lock();
+       start_bh_atomic();
        fn = fib6_lookup(&ip6_routing_table, fl->nl_u.ip6_u.daddr,
                         fl->nl_u.ip6_u.saddr);
 
@@ -472,25+434,10 @@ restart:
        rt = fn->leaf;
 
        if ((rt->rt6i_flags & RTF_CACHE)) {
-               RDBG(("RTF_CACHE "));
                if (ip6_rt_policy == 0) {
                        rt = rt6_device_match(rt, fl->oif, strict);
-
-                       /* BUGGGG! It is capital bug, that was hidden
-                          by not-cloning multicast routes. However,
-                          the same problem was with link-local addresses.
-                          Fix is the following if-statement,
-                          but it will not properly handle Pedro's subtrees --ANK
-                        */
-                       if (rt == &ip6_null_entry && strict) {
-                               while ((fn = fn->parent) != NULL) {
-                                       if (fn->fn_flags & RTN_ROOT)
-                                               goto out;
-                                       if (fn->fn_flags & RTN_RTINFO)
-                                               goto restart;
-                               }
-                       }
-                       RDBG(("devmatch(%p) ", rt));
+                       BACKTRACK();
+                       dst_clone(&rt->u.dst);
                        goto out;
                }
 
@@ -501,68+448,46 @@ restart:
                        for (sprt = rt; sprt; sprt = sprt->u.next) {
                                if (rt6_flow_match_out(sprt, sk)) {
                                        rt = sprt;
+                                       dst_clone(&rt->u.dst);
                                        goto out;
                                }
                        }
                }
 #endif
        }
-       RDBG(("!RTF_CACHE "));
        if (rt->rt6i_flags & RTF_DEFAULT) {
-               RDBG(("RTF_DEFAULT "));
-               if (rt->rt6i_metric >= IP6_RT_PRIO_ADDRCONF) {
+               if (rt->rt6i_metric >= IP6_RT_PRIO_ADDRCONF)
                        rt = rt6_best_dflt(rt, fl->oif);
-                       RDBG(("best_dflt(%p) ", rt));
-               }
        } else {
                rt = rt6_device_match(rt, fl->oif, strict);
-               RDBG(("!RTF_DEFAULT devmatch(%p) ", rt));
+               BACKTRACK();
        }
 
        if (ip6_rt_policy == 0) {
-               if (!rt->rt6i_nexthop && rt->rt6i_dev &&
-                   ((rt->rt6i_flags & RTF_NONEXTHOP) == 0)) {
+               if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
                        rt = rt6_cow(rt, fl->nl_u.ip6_u.daddr,
                                     fl->nl_u.ip6_u.saddr);
-                       RDBG(("(!nhop&&rt6i_dev&&!RTF_NONEXTHOP) cow(%p) ", rt));
+                       goto out;
                }
+               dst_clone(&rt->u.dst);
        } else {
 #ifdef CONFIG_RT6_POLICY
                rt = rt6_flow_lookup_out(rt, sk, fl);
+#else
+               /* NEVER REACHED */
 #endif
        }
 
 out:
-       dst = dst_clone((struct dst_entry *) rt);
-       rt6_unlock();
-       RDBG(("dclone/ret(%p)\n", dst));
-       return dst;
-}
-
-
-static void rt6_ins(struct rt6_info *rt)
-{
-       start_bh_atomic();
-       if (atomic_read(&rt6_tbl_lock) == 1)
-               fib6_add(&ip6_routing_table, rt);
-       else
-               rtreq_add(rt, RT_OPER_ADD);
+       rt->u.dst.lastuse = jiffies;
+       atomic_inc(&rt->u.dst.refcnt);
        end_bh_atomic();
+       return &rt->u.dst;
 }
 
+
 /*
  *     Destination cache support functions
- *
- *     BUGGG! This function is absolutely wrong.
- *     First of all it is never called. (look at include/net/dst.h)
- *     Second, even when it is called rt->rt6i_node == NULL
- *       ** partially fixed: now dst->obsolete = -1 for IPv6 not cache routes.
- *     Third, even we fixed previous bugs,
- *     it will not work because sernum is incorrectly checked/updated and
- *     it does not handle change of the parent of cloned route.
- *     Purging stray clones is not easy task, it would require
- *     massive remake of ip6_fib.c. Alas...
- *                                                     --ANK
  */
 
 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
@@ -646,7+571,7 @@ static int ipv6_get_mtu(struct device *dev)
        if (idev)
                return idev->cnf.mtu6;
        else
-               return 576;
+               return IPV6_MIN_MTU;
 }
 
 static int ipv6_get_hoplimit(struct device *dev)
@@ -664,72+589,68 @@ static int ipv6_get_hoplimit(struct device *dev)
  *
  */
 
-struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err)
+int ip6_route_add(struct in6_rtmsg *rtmsg)
 {
+       int err;
        struct rt6_info *rt;
        struct device *dev = NULL;
        int addr_type;
-       
-       if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128) {
-               *err = -EINVAL;
-               return NULL;
-       }
+
+       if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
+               return -EINVAL;
+#ifndef CONFIG_IPV6_SUBTREES
+       if (rtmsg->rtmsg_src_len)
+               return -EINVAL;
+#endif
        if (rtmsg->rtmsg_metric == 0)
                rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
 
-       *err = 0;
-       
        rt = dst_alloc(sizeof(struct rt6_info), &ip6_dst_ops);
 
-       if (rt == NULL) {
-               RDBG(("dalloc fails, "));
-               *err = -ENOMEM;
-               return NULL;
-       }
+       if (rt == NULL)
+               return -ENOMEM;
 
        rt->u.dst.obsolete = -1;
        rt->rt6i_expires = rtmsg->rtmsg_info;
 
        addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
 
-       if (addr_type & IPV6_ADDR_MULTICAST) {
-               RDBG(("MCAST, "));
+       if (addr_type & IPV6_ADDR_MULTICAST)
                rt->u.dst.input = ip6_mc_input;
-       } else {
-               RDBG(("!MCAST "));
+       else
                rt->u.dst.input = ip6_forward;
-       }
 
        rt->u.dst.output = ip6_output;
 
        if (rtmsg->rtmsg_ifindex) {
                dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
-               if (dev == NULL) {
-                       *err = -ENODEV;
+               err = -ENODEV;
+               if (dev == NULL)
                        goto out;
-               }
        }
 
        ipv6_addr_copy(&rt->rt6i_dst.addr, &rtmsg->rtmsg_dst);
        rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
        ipv6_wash_prefix(&rt->rt6i_dst.addr, rt->rt6i_dst.plen);
 
+#ifdef CONFIG_IPV6_SUBTREES
        ipv6_addr_copy(&rt->rt6i_src.addr, &rtmsg->rtmsg_src);
        rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
        ipv6_wash_prefix(&rt->rt6i_src.addr, rt->rt6i_src.plen);
+#endif
+
+       rt->rt6i_metric = rtmsg->rtmsg_metric;
 
        /* We cannot add true routes via loopback here,
           they would result in kernel looping; promote them to reject routes
         */
        if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
            (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
-               dev = dev_get("lo");
+               dev = &loopback_dev;
                rt->u.dst.output = ip6_pkt_discard;
                rt->u.dst.input = ip6_pkt_discard;
                rt->u.dst.error = -ENETUNREACH;
                rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
-               rt->rt6i_metric = rtmsg->rtmsg_metric;
-               rt->rt6i_dev = dev;
                goto install_route;
        }
 
@@ -746,50+667,44 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err)
 
                        /* IPv6 strictly inhibits using not link-local
                           addresses as nexthop address.
+                          Otherwise, the router will not be able to send redirects.
                           It is very good, but in some (rare!) curcumstances
-                          (SIT, NBMA NOARP links) it is handy to allow
-                          some exceptions.
+                          (SIT, PtP, NBMA NOARP links) it is handy to allow
+                          some exceptions. --ANK
                         */
-                       if (!(gwa_type&IPV6_ADDR_UNICAST)) {
-                               *err = -EINVAL;
+                       err = -EINVAL;
+                       if (!(gwa_type&IPV6_ADDR_UNICAST))
                                goto out;
-                       }
 
-                       grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, RTF_LINKRT);
+                       grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
 
-                       if (grt == NULL || (grt->rt6i_flags&RTF_GATEWAY)) {
-                               *err = -EHOSTUNREACH;
+                       err = -EHOSTUNREACH;
+                       if (grt == NULL)
                                goto out;
-                       }
+                       if (!(grt->rt6i_flags&RTF_GATEWAY))
+                               err = 0;
                        dev = grt->rt6i_dev;
+                       dst_release(&grt->u.dst);
+
+                       if (err)
+                               goto out;
                }
-               if (dev == NULL || (dev->flags&IFF_LOOPBACK)) {
-                       *err = -EINVAL;
+               err = -EINVAL;
+               if (dev == NULL || (dev->flags&IFF_LOOPBACK))
                        goto out;
-               }
        }
 
-       if (dev == NULL) {
-               RDBG(("!dev, "));
-               *err = -ENODEV;
+       err = -ENODEV;
+       if (dev == NULL)
                goto out;
-       }
 
        if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
                rt->rt6i_nexthop = ndisc_get_neigh(dev, &rt->rt6i_gateway);
-               if (rt->rt6i_nexthop == NULL) {
-                       RDBG(("!nxthop, "));
-                       *err = -ENOMEM;
+               err = -ENOMEM;
+               if (rt->rt6i_nexthop == NULL)
                        goto out;
-               }
-               RDBG(("nxthop, "));
        }
 
-       rt->rt6i_metric = rtmsg->rtmsg_metric;
-
-       rt->rt6i_dev = dev;
-       rt->u.dst.pmtu = ipv6_get_mtu(dev);
-       rt->u.dst.rtt = TCP_TIMEOUT_INIT;
        if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr))
                rt->rt6i_hoplimit = IPV6_DEFAULT_MCASTHOPS;
        else
@@ -797,153+712,59 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err)
        rt->rt6i_flags = rtmsg->rtmsg_flags;
 
 install_route:
-       RDBG(("rt6ins(%p) ", rt));
-
-       rt6_lock();
-       rt6_ins(rt);
-       rt6_unlock();
-
-       /* BUGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG!
-
-          If rt6_ins will fail (and it occurs regularly f.e. if route
-          already existed), the route will be freed -> Finita.
-          Crash. No recovery. NO FIX. Unfortunately, it is not the only
-          place will it is fatal. It is sad, I believed this
-          code is a bit more accurate :-(
-
-          Really, the problem can be solved in two ways:
-
-          * As I did in old 2.0 IPv4: to increase use count and force
-            user to destroy stray route. It requires some care,
-            well, much more care.
-          * Second and the best: to get rid of this damn backlogging
-            system. I wonder why Pedro so liked it. It was the most
-            unhappy day when I invented it (well, by a strange reason
-            I believed that it is very clever :-)),
-            and when I managed to clean IPv4 of this crap,
-            it was really great win.
-            BTW I forgot how 2.0 route/arp works :-) :-)
-                                                                      --ANK
-        */
+       rt->u.dst.pmtu = ipv6_get_mtu(dev);
+       rt->u.dst.rtt = TCP_TIMEOUT_INIT;
+       rt->rt6i_dev = dev;
+       return rt6_ins(rt);
 
 out:
-       if (*err) {
-               RDBG(("dfree(%p) ", rt));
-               dst_free((struct dst_entry *) rt);
-               rt = NULL;
-       }
-       RDBG(("ret(%p)\n", rt));
-#if 0
-       return rt;
-#else
-       /* BUGGG! For now always return NULL. (see above)
-
-          Really, it was used only in two places, and one of them
-          (rt6_add_dflt_router) is repaired, ip6_fw is not essential
-          at all. --ANK
-        */
-       return NULL;
-#endif
+       dst_free((struct dst_entry *) rt);
+       return err;
 }
 
 int ip6_del_rt(struct rt6_info *rt)
 {
-       rt6_lock();
+       int err;
 
        start_bh_atomic();
-
-       /* I'd add here couple of cli()
-          cli(); cli(); cli();
-
-          Now it is really LOCKED. :-) :-) --ANK
-        */
-
        rt6_dflt_pointer = NULL;
-
-       if (atomic_read(&rt6_tbl_lock) == 1)
-               fib6_del(rt);
-       else
-               rtreq_add(rt, RT_OPER_DEL);
+       err = fib6_del(rt);
        end_bh_atomic();
-       rt6_unlock();
-       return 0;
+
+       return err;
 }
 
 int ip6_route_del(struct in6_rtmsg *rtmsg)
 {
        struct fib6_node *fn;
        struct rt6_info *rt;
+       int err = -ESRCH;
 
-       rt6_lock();
-       fn = fib6_lookup(&ip6_routing_table, &rtmsg->rtmsg_dst, &rtmsg->rtmsg_src);
-       rt = fn->leaf;
-
-       /*
-        *      Blow it away
-        *
-        *      BUGGGG It will not help with Pedro's subtrees.
-        *      We urgently need fib6_locate_node function, and
-        *      it is not the only place where rt6_lookup is used
-        *      for wrong purpose.
-        *                                                      --ANK
-        */
-restart:
-       if (rt && rt->rt6i_src.plen == rtmsg->rtmsg_src_len) {
-               if (rt->rt6i_dst.plen > rtmsg->rtmsg_dst_len) {
-                       struct fib6_node *fn = rt->rt6i_node;
-                       while ((fn = fn->parent) != NULL) {
-                               if (fn->fn_flags & RTN_ROOT)
-                                       break;
-                               if (fn->fn_flags & RTN_RTINFO) {
-                                       rt = fn->leaf;
-                                       goto restart;
-                               }
-                       }
-               }
+       start_bh_atomic();
 
-               if (rt->rt6i_dst.plen == rtmsg->rtmsg_dst_len) {
-                       for ( ; rt; rt = rt->u.next) {
-                               if (rtmsg->rtmsg_ifindex &&
-                                   (rt->rt6i_dev == NULL ||
-                                    rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
-                                       continue;
-                               if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
-                                    ipv6_addr_cmp(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
-                                       continue;
-                               if (rtmsg->rtmsg_metric &&
-                                   rtmsg->rtmsg_metric != rt->rt6i_metric)
-                                       continue;
-                               ip6_del_rt(rt);
-                               rt6_unlock();
-                               return 0;
-                       }
+       fn = fib6_locate(&ip6_routing_table,
+                        &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
+                        &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
+       
+       if (fn) {
+               for (rt = fn->leaf; rt; rt = rt->u.next) {
+                       if (rtmsg->rtmsg_ifindex &&
+                           (rt->rt6i_dev == NULL ||
+                            rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
+                               continue;
+                       if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
+                           ipv6_addr_cmp(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
+                               continue;
+                       if (rtmsg->rtmsg_metric &&
+                           rtmsg->rtmsg_metric != rt->rt6i_metric)
+                               continue;
+                       err = ip6_del_rt(rt);
+                       break;
                }
        }
-       rt6_unlock();
-
-       return -ESRCH;
-}
-
-
-/*
- *     bottom handler, runs with atomic_bh protection
- */
-void __rt6_run_bh(void)
-{
-       struct rt6_req *rtreq;
+       end_bh_atomic();
 
-       while ((rtreq = rtreq_dequeue())) {
-               switch (rtreq->operation) {
-               case RT_OPER_ADD:
-                       fib6_add(&ip6_routing_table, rtreq->ptr);
-                       break;
-               case RT_OPER_DEL:
-                       fib6_del(rtreq->ptr);
-                       break;
-               };
-               kfree(rtreq);
-       }
-       rt6_bh_mask = 0;
+       return err;
 }
 
 #ifdef CONFIG_IPV6_NETLINK
@@ -971,10+792,10 @@ static int rt6_msgrcv(int unit, struct sk_buff *skb)
 
                switch (rtmsg->rtmsg_type) {
                case RTMSG_NEWROUTE:
-                       ip6_route_add(rtmsg, &err);
+                       err = ip6_route_add(rtmsg);
                        break;
                case RTMSG_DELROUTE:
-                       ip6_route_del(rtmsg);
+                       err = ip6_route_del(rtmsg);
                        break;
                default:
                        count = -EINVAL;
@@ -1047,17+868,19 @@ void rt6_sndmsg(int type, struct in6_addr *dst, struct in6_addr *src,
 /*
  *     Handle redirects
  */
-struct rt6_info *rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
-                             struct in6_addr *target, struct device *dev,
-                             int on_link)
+void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
+                 struct neighbour *neigh, int on_link)
 {
        struct rt6_info *rt, *nrt;
 
        /* Locate old route to this destination. */
-       rt = rt6_lookup(dest, NULL, dev->ifindex, 0);
+       rt = rt6_lookup(dest, NULL, neigh->dev->ifindex, 1);
 
-       if (rt == NULL || rt->u.dst.error)
-               return NULL;
+       if (rt == NULL)
+               return;
+
+       if (neigh->dev != rt->rt6i_dev)
+               goto out;
 
        /* Redirect received -> path was valid.
           Look, redirects are sent only in response to data packets,
@@ -1066,12+889,18 @@ struct rt6_info *rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
        dst_confirm(&rt->u.dst);
 
        /* Duplicate redirect: silently ignore. */
-       if (ipv6_addr_cmp(target, &rt->rt6i_gateway) == 0)
-               return NULL;
+       if (neigh == rt->u.dst.neighbour)
+               goto out;
 
-       /* Current route is on-link; redirect is always invalid. */
+       /* Current route is on-link; redirect is always invalid.
+          
+          It seems the previous statement is not quite true: the target
+          could be a node that merely looks on-link to us (e.g. proxy ndisc),
+          and the router serving it might then decide that we should
+          know the truth 8)8) --ANK (980726).
+        */
        if (!(rt->rt6i_flags&RTF_GATEWAY))
-               return NULL;
+               goto out;
 
 #if !defined(CONFIG_IPV6_EUI64) || defined(CONFIG_IPV6_NO_PB)
        /*
@@ -1089,16+918,21 @@ struct rt6_info *rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
 
        if (ipv6_addr_cmp(saddr, &rt->rt6i_gateway)) {
                if (rt->rt6i_flags & RTF_DEFAULT) {
-                       rt = ip6_routing_table.leaf;
+                       struct rt6_info *rt1;
 
-                       for (; rt; rt = rt->u.next) {
-                               if (!ipv6_addr_cmp(saddr, &rt->rt6i_gateway))
+                       for (rt1 = ip6_routing_table.leaf; rt1; rt1 = rt1->u.next) {
+                               if (!ipv6_addr_cmp(saddr, &rt1->rt6i_gateway)) {
+                                       dst_clone(&rt1->u.dst);
+                                       dst_release(&rt->u.dst);
+                                       rt = rt1;
                                        goto source_ok;
+                               }
                        }
                }
-               printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
+               if (net_ratelimit())
+                       printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
                               "for redirect target\n");
-               return NULL;
+               goto out;
        }
 
 source_ok:
@@ -1107,36+941,11 @@ source_ok:
        /*
         *      We have finally decided to accept it.
         */
-       if (rt->rt6i_dst.plen == 128) {
-               /* BUGGGG! Very bad bug. Fast path code does not protect
-                * itself of changing nexthop on the fly, it was supposed
-                * that crucial parameters (dev, nexthop, hh) ARE VOLATILE.
-                *                                                   --ANK
-                * Not fixed!! I plugged it to avoid random crashes
-                * (they are very unlikely, but I do not want to shrug
-                *  every time when redirect arrives)
-                * but the plug must be removed. --ANK
-                */
-
-#if 0
-               /*
-                *      Already a host route.
-                *
-                */
-               if (rt->rt6i_nexthop)
-                       neigh_release(rt->rt6i_nexthop);
-               rt->rt6i_flags |= RTF_MODIFIED | RTF_CACHE;
-               if (on_link)
-                       rt->rt6i_flags &= ~RTF_GATEWAY;
-               ipv6_addr_copy(&rt->rt6i_gateway, target);
-               rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, target);
-               return rt;
-#else
-               return NULL;
-#endif
-       }
 
        nrt = ip6_rt_copy(rt);
+       if (nrt == NULL)
+               goto out;
+
        nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
        if (on_link)
                nrt->rt6i_flags &= ~RTF_GATEWAY;
@@ -1144,19+953,24 @@ source_ok:
        ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
        nrt->rt6i_dst.plen = 128;
 
-       ipv6_addr_copy(&nrt->rt6i_gateway, target);
-       nrt->rt6i_nexthop = ndisc_get_neigh(nrt->rt6i_dev, target);
-       nrt->rt6i_dev = dev;
-       nrt->u.dst.pmtu = ipv6_get_mtu(dev);
-       if (!ipv6_addr_is_multicast(&nrt->rt6i_dst.addr))
-               nrt->rt6i_hoplimit = ipv6_get_hoplimit(dev);
+       ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
+       nrt->rt6i_nexthop = neigh_clone(neigh);
+       /* Reset pmtu, it may be better */
+       nrt->u.dst.pmtu = ipv6_get_mtu(neigh->dev);
+       nrt->rt6i_hoplimit = ipv6_get_hoplimit(neigh->dev);
+
+       if (rt6_ins(nrt))
+               goto out;
 
-       rt6_lock();
-       rt6_ins(nrt);
-       rt6_unlock();
+       /* Sic! rt6_redirect is called by bh, so that it is allowed */
+       dst_release(&rt->u.dst);
+       if (rt->rt6i_flags&RTF_CACHE)
+               ip6_del_rt(rt);
+       return;
 
-       /* BUGGGGGGG! nrt can point to nowhere. */
-       return nrt;
+out:
+        dst_release(&rt->u.dst);
+       return;
 }
 
 /*
@@ -1164,29+978,25 @@ source_ok:
  *     i.e. Path MTU discovery
  */
 
-void rt6_pmtu_discovery(struct in6_addr *addr, struct device *dev, int pmtu)
+void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
+                       struct device *dev, u32 pmtu)
 {
        struct rt6_info *rt, *nrt;
 
-       if (pmtu < 576 || pmtu > 65536) {
-#if RT6_DEBUG >= 1
-               printk(KERN_DEBUG "rt6_pmtu_discovery: invalid MTU value %d\n",
-                      pmtu);
-#endif
+       if (pmtu < IPV6_MIN_MTU) {
+               if (net_ratelimit())
+                       printk(KERN_DEBUG "rt6_pmtu_discovery: invalid MTU value %d\n",
+                              pmtu);
                return;
        }
 
-       rt = rt6_lookup(addr, NULL, dev->ifindex, 0);
+       rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
 
-       if (rt == NULL || rt->u.dst.error) {
-#if RT6_DEBUG >= 2
-               printk(KERN_DEBUG "rt6_pmtu_discovery: no route to host\n");
-#endif
+       if (rt == NULL)
                return;
-       }
 
        if (pmtu >= rt->u.dst.pmtu)
-               return;
+               goto out;
 
        /* New mtu received -> path was valid.
           They are sent only in response to data packets,
@@ -1194,39+1004,42 @@ void rt6_pmtu_discovery(struct in6_addr *addr, struct device *dev, int pmtu)
         */
        dst_confirm(&rt->u.dst);
 
-       /* It is wrong, but I plugged the hole here.
-          On-link routes are cloned differently,
-          look at rt6_redirect --ANK
+       /* Host route. If it is static, it would be better
+          not to override it but to add a new one, so that
+          when the cache entry expires the old pmtu
+          returns automatically.
         */
-       if (!(rt->rt6i_flags&RTF_GATEWAY))
-               return;
-
        if (rt->rt6i_dst.plen == 128) {
                /*
                 *      host route
                 */
                rt->u.dst.pmtu = pmtu;
                rt->rt6i_flags |= RTF_MODIFIED;
-
-               return;
+               goto out;
        }
 
-       nrt = ip6_rt_copy(rt);
-       ipv6_addr_copy(&nrt->rt6i_dst.addr, addr);
-       nrt->rt6i_dst.plen = 128;
-
-       nrt->rt6i_flags |= (RTF_DYNAMIC | RTF_CACHE);
-
-       /* It was missing. :-) :-)
-          I wonder, kernel was deemed to crash after pkt_too_big
-          and nobody noticed it. Hey, guys, do someone really
-          use it? --ANK
+       /* Network route.
+          Two cases are possible:
+          1. It is a connected route. Action: COW
+          2. It is a gatewayed route or NONEXTHOP route. Action: clone it.
         */
-       nrt->rt6i_nexthop = neigh_clone(rt->rt6i_nexthop);
+       if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
+               nrt = rt6_cow(rt, daddr, saddr);
+               nrt->rt6i_flags |= RTF_DYNAMIC;
+               dst_release(&nrt->u.dst);
+       } else {
+               nrt = ip6_rt_copy(rt);
+               if (nrt == NULL)
+                       goto out;
+               ipv6_addr_copy(&nrt->rt6i_dst.addr, daddr);
+               nrt->rt6i_dst.plen = 128;
+               nrt->rt6i_nexthop = neigh_clone(rt->rt6i_nexthop);
+               nrt->rt6i_flags |= (RTF_DYNAMIC | RTF_CACHE);
+               rt6_ins(nrt);
+       }
 
-       rt6_lock();
-       rt6_ins(rt);
-       rt6_unlock();
+out:
+       dst_release(&rt->u.dst);
 }
 
 /*
@@ -1247,16+1060,19 @@ static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
                rt->u.dst.rtt = ort->u.dst.rtt;
                rt->u.dst.window = ort->u.dst.window;
                rt->u.dst.mxlock = ort->u.dst.mxlock;
+               rt->u.dst.dev = ort->u.dst.dev;
+               rt->u.dst.lastuse = jiffies;
                rt->rt6i_hoplimit = ort->rt6i_hoplimit;
-               rt->rt6i_dev = ort->rt6i_dev;
+               rt->rt6i_expires = ort->rt6i_expires;
 
                ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
-               rt->rt6i_keylen = ort->rt6i_keylen;
                rt->rt6i_flags = ort->rt6i_flags;
                rt->rt6i_metric = ort->rt6i_metric;
 
                memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
+#ifdef CONFIG_IPV6_SUBTREES
                memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
+#endif
        }
        return rt;
 }
@@ -1266,31+1082,17 @@ struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct device *dev)
        struct rt6_info *rt;
        struct fib6_node *fn;
 
-       RDBG(("rt6_get_dflt_router(%p,%p)[%p]", addr, dev,
-             __builtin_return_address(0)));
-#if RT6_DEBUG >= 3
-       {
-               int i;
-
-               RDBG(("addr["));
-               for(i = 0; i < 8; i++) {
-                       RDBG(("%04x%c", addr->s6_addr16[i],
-                             i == 7 ? ']' : ':'));
-               }
-       }
-#endif
-       RDBG(("\n"));
-       rt6_lock();
-
        fn = &ip6_routing_table;
 
+       start_bh_atomic();
        for (rt = fn->leaf; rt; rt=rt->u.next) {
                if (dev == rt->rt6i_dev &&
                    ipv6_addr_cmp(&rt->rt6i_gateway, addr) == 0)
                        break;
        }
-
-       rt6_unlock();
+       if (rt)
+               dst_clone(&rt->u.dst);
+       end_bh_atomic();
        return rt;
 }
 
@@ -1298,24+1100,6 @@ struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
                                     struct device *dev)
 {
        struct in6_rtmsg rtmsg;
-       struct rt6_info *rt;
-       int err;
-
-       RDBG(("rt6_add_dflt_router(%p,%p)[%p] ", gwaddr, dev,
-             __builtin_return_address(0)));
-#if RT6_DEBUG >= 3
-       {
-               struct in6_addr *addr = gwaddr;
-               int i;
-
-               RDBG(("gwaddr["));
-               for(i = 0; i < 8; i++) {
-                       RDBG(("%04x%c", addr->s6_addr16[i],
-                             i == 7 ? ']' : ':'));
-               }
-       }
-#endif
-       RDBG(("\n"));
 
        memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
        rtmsg.rtmsg_type = RTMSG_NEWROUTE;
@@ -1325,48+1109,28 @@ struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
 
        rtmsg.rtmsg_ifindex = dev->ifindex;
 
-       rt = ip6_route_add(&rtmsg, &err);
-
-       /* BUGGGGGGGGGGGGGGGGGGGG!
-          rt can be not NULL, but point to heavens.
-        */
-
-       if (err) {
-               printk(KERN_DEBUG "rt6_add_dflt: ip6_route_add error %d\n",
-                      err);
-       }
-       return rt;
+       ip6_route_add(&rtmsg);
+       return rt6_get_dflt_router(gwaddr, dev);
 }
 
 void rt6_purge_dflt_routers(int last_resort)
 {
        struct rt6_info *rt;
-       struct fib6_node *fn;
        u32 flags;
 
-       RDBG(("rt6_purge_dflt_routers(%d)[%p]\n", last_resort,
-             __builtin_return_address(0)));
-       fn = &ip6_routing_table;
-
-       rt6_dflt_pointer = NULL;
-
        if (last_resort)
                flags = RTF_ALLONLINK;
        else
                flags = RTF_DEFAULT | RTF_ADDRCONF;     
 
-       for (rt = fn->leaf; rt; ) {
-               if ((rt->rt6i_flags & flags)) {
-                       struct rt6_info *drt;
-#if RT6_DEBUG >= 2
-                       printk(KERN_DEBUG "rt6_purge_dflt: deleting entry\n");
-#endif
-                       drt = rt;
-                       rt = rt->u.next;
-                       ip6_del_rt(drt);
-                       continue;
+restart:
+       rt6_dflt_pointer = NULL;
+
+       for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
+               if (rt->rt6i_flags & flags) {
+                       ip6_del_rt(rt);
+                       goto restart;
                }
-               rt = rt->u.next;
        }
 }
 
@@ -1389,7+1153,7 @@ int ipv6_route_ioctl(unsigned int cmd, void *arg)
                rtnl_lock();
                switch (cmd) {
                case SIOCADDRT:
-                       ip6_route_add(&rtmsg, &err);
+                       err = ip6_route_add(&rtmsg);
                        break;
                case SIOCDELRT:
                        err = ip6_route_del(&rtmsg);
@@ -1414,7+1178,7 @@ int ipv6_route_ioctl(unsigned int cmd, void *arg)
  */
 
 int ip6_pkt_discard(struct sk_buff *skb)
-{      
+{
        ipv6_statistics.Ip6OutNoRoutes++;
        icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
        kfree_skb(skb);
@@ -1429,21+1193,6 @@ int ip6_rt_addr_add(struct in6_addr *addr, struct device *dev)
 {
        struct rt6_info *rt;
 
-       RDBG(("ip6_rt_addr_add(%p,%p)[%p]\n", addr, dev,
-             __builtin_return_address(0)));
-#if RT6_DEBUG >= 3
-       {
-               int i;
-
-               RDBG(("addr["));
-               for(i = 0; i < 8; i++) {
-                       RDBG(("%04x%c", addr->s6_addr16[i],
-                             i == 7 ? ']' : ':'));
-               }
-       }
-#endif
-       RDBG(("\n"));
-
        rt = dst_alloc(sizeof(struct rt6_info), &ip6_dst_ops);
        if (rt == NULL)
                return -ENOMEM;
@@ -1465,10+1214,7 @@ int ip6_rt_addr_add(struct in6_addr *addr, struct device *dev)
 
        ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
        rt->rt6i_dst.plen = 128;
-
-       rt6_lock();
        rt6_ins(rt);
-       rt6_unlock();
 
        return 0;
 }
@@ -1480,12+1226,16 @@ int ip6_rt_addr_add(struct in6_addr *addr, struct device *dev)
 int ip6_rt_addr_del(struct in6_addr *addr, struct device *dev)
 {
        struct rt6_info *rt;
+       int err = -ENOENT;
 
-       rt = rt6_lookup(addr, NULL, loopback_dev.ifindex, RTF_LINKRT);
-       if (rt && rt->rt6i_dst.plen == 128)
-               return ip6_del_rt(rt);
+       rt = rt6_lookup(addr, NULL, loopback_dev.ifindex, 1);
+       if (rt) {
+               if (rt->rt6i_dst.plen == 128)
+                       err= ip6_del_rt(rt);
+               dst_release(&rt->u.dst);
+       }
 
-       return 0;
+       return err;
 }
 
 #ifdef CONFIG_RT6_POLICY
@@ -1587,75+1337,65 @@ static struct rt6_info *rt6_flow_lookup(struct rt6_info *rt,
        }
 
 error:
+       dst_clone(&ip6_null_entry.u.dst);
        return &ip6_null_entry;
 
 found:
-
        if (nrt == NULL)
                goto error;
 
        nrt->rt6i_flags |= RTF_CACHE;
-       /* BUGGGG! nrt can point to nowhere! */
-       rt6_ins(nrt);
-
+       dst_clone(&nrt->u.dst);
+       err = rt6_ins(nrt);
+       if (err)
+               nrt->u.dst.error = err;
        return nrt;
 }
 #endif
 
-/* 
- * Nope, I am not idiot. I see that it is the ugliest of ugly routines.
- * Anyone is advertised to write better one. --ANK
- */
+static int fib6_ifdown(struct rt6_info *rt, void *arg)
+{
+       if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
+           rt != &ip6_null_entry) {
+               RT6_TRACE("deleted by ifdown %p\n", rt);
+               return -1;
+       }
+       return 0;
+}
 
-struct rt6_ifdown_arg {
+void rt6_ifdown(struct device *dev)
+{
+       fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
+}
+
+struct rt6_mtu_change_arg
+{
        struct device *dev;
-       struct rt6_info *rt;
+       unsigned mtu;
 };
 
-
-static void rt6_ifdown_node(struct fib6_node *fn, void *p_arg)
+static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
 {
-       struct rt6_info *rt;
-       struct rt6_ifdown_arg *arg = (struct rt6_ifdown_arg *) p_arg;
-
-       if (arg->rt != NULL)
-               return;
-
-       for (rt = fn->leaf; rt; rt = rt->u.next) {
-               if (rt->rt6i_dev == arg->dev || arg->dev == NULL) {
-                       arg->rt = rt;
-                       return;
-               }
-       }
+       struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
+
+       /* In IPv6 pmtu discovery is not optional,
+          so that RTAX_MTU lock cannot disable it.
+          We still use this lock to block changes
+          caused by addrconf/ndisc.
+          */
+       if (rt->rt6i_dev == arg->dev &&
+           !(rt->u.dst.mxlock&(1<<RTAX_MTU)))
+               rt->u.dst.pmtu = arg->mtu;
+       return 0;
 }
 
-void rt6_ifdown(struct device *dev)
+void rt6_mtu_change(struct device *dev, unsigned mtu)
 {
-       int count = 0;
-       struct rt6_ifdown_arg arg;
-       struct rt6_info *rt;
+       struct rt6_mtu_change_arg arg;
 
-       do {
-               arg.dev = dev;
-               arg.rt = NULL;
-               fib6_walk_tree(&ip6_routing_table, rt6_ifdown_node, &arg,
-                              RT6_FILTER_RTNODES);
-               if (arg.rt != NULL)
-                       ip6_del_rt(arg.rt);
-               count++;
-       } while (arg.rt != NULL);
-
-       /* And default routes ... */
-
-       for (rt = ip6_routing_table.leaf; rt; ) {
-               if (rt != &ip6_null_entry && (rt->rt6i_dev == dev || dev == NULL)) {
-                       struct rt6_info *deleting = rt;
-                       rt = rt->u.next;
-                       ip6_del_rt(deleting);
-                       continue;
-               }
-               rt = rt->u.next;
-       }
+       arg.dev = dev;
+       arg.mtu = mtu;
+       fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
 }
 
 #ifdef CONFIG_RTNETLINK
@@ -1714,37+1454,28 @@ int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 {
        struct rtmsg *r = NLMSG_DATA(nlh);
        struct in6_rtmsg rtmsg;
-       int err = 0;
 
        if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
                return -EINVAL;
-       ip6_route_add(&rtmsg, &err);
-       return err;
+       return ip6_route_add(&rtmsg);
 }
 
 struct rt6_rtnl_dump_arg
 {
        struct sk_buff *skb;
        struct netlink_callback *cb;
-       int skip;
-       int count;
-       int stop;
 };
 
 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
                         struct in6_addr *dst,
                         struct in6_addr *src,
                         int iif,
-                        int type, pid_t pid, u32 seq)
+                        int type, u32 pid, u32 seq)
 {
        struct rtmsg *rtm;
        struct nlmsghdr  *nlh;
        unsigned char    *b = skb->tail;
-#ifdef CONFIG_RTNL_OLD_IFINFO
-       unsigned char    *o;
-#else
        struct rtattr *mx;
-#endif
        struct rta_cacheinfo ci;
 
        nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*rtm));
@@ -1762,9+1493,6 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
                rtm->rtm_type = RTN_UNICAST;
        rtm->rtm_flags = 0;
        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
-#ifdef CONFIG_RTNL_OLD_IFINFO
-       rtm->rtm_nhs = 0;
-#endif
        rtm->rtm_protocol = RTPROT_BOOT;
        if (rt->rt6i_flags&RTF_DYNAMIC)
                rtm->rtm_protocol = RTPROT_REDIRECT;
@@ -1776,19+1504,18 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
        if (rt->rt6i_flags&RTF_CACHE)
                rtm->rtm_flags |= RTM_F_CLONED;
 
-#ifdef CONFIG_RTNL_OLD_IFINFO
-       o = skb->tail;
-#endif
        if (dst) {
                RTA_PUT(skb, RTA_DST, 16, dst);
                rtm->rtm_dst_len = 128;
        } else if (rtm->rtm_dst_len)
                RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
+#ifdef CONFIG_IPV6_SUBTREES
        if (src) {
                RTA_PUT(skb, RTA_SRC, 16, src);
                rtm->rtm_src_len = 128;
        } else if (rtm->rtm_src_len)
                RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
+#endif
        if (iif)
                RTA_PUT(skb, RTA_IIF, 4, &iif);
        else if (dst) {
@@ -1796,14+1523,6 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
                if (ifp)
                        RTA_PUT(skb, RTA_PREFSRC, 16, &ifp->addr);
        }
-#ifdef CONFIG_RTNL_OLD_IFINFO
-       if (rt->u.dst.pmtu)
-               RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &rt->u.dst.pmtu);
-       if (rt->u.dst.window)
-               RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &rt->u.dst.window);
-       if (rt->u.dst.rtt)
-               RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &rt->u.dst.rtt);
-#else
        mx = (struct rtattr*)skb->tail;
        RTA_PUT(skb, RTA_METRICS, 0, NULL);
        if (rt->u.dst.mxlock)
@@ -1817,7+1536,6 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
        mx->rta_len = skb->tail - (u8*)mx;
        if (mx->rta_len == RTA_LENGTH(0))
                skb_trim(skb, (u8*)mx - skb->data);
-#endif
        if (rt->u.dst.neighbour)
                RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
        if (rt->u.dst.dev)
@@ -1828,13+1546,10 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
                ci.rta_expires = rt->rt6i_expires - jiffies;
        else
                ci.rta_expires = 0;
-       ci.rta_used = 0;
+       ci.rta_used = atomic_read(&rt->u.dst.refcnt);
        ci.rta_clntref = atomic_read(&rt->u.dst.use);
        ci.rta_error = rt->u.dst.error;
        RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
-#ifdef CONFIG_RTNL_OLD_IFINFO
-       rtm->rtm_optlen = skb->tail - o;
-#endif
        nlh->nlmsg_len = skb->tail - b;
        return skb->len;
 
@@ -1844,45+1559,98 @@ rtattr_failure:
        return -1;
 }
 
-static void rt6_dump_node(struct fib6_node *fn, void *p_arg)
+static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
 {
-       struct rt6_info *rt;
        struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
 
-       if (arg->stop)
-               return;
+       return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
+                            NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq);
+}
 
-       for (rt = fn->leaf; rt; rt = rt->u.next) {
-               if (arg->count < arg->skip) {
-                       arg->count++;
-                       continue;
-               }
-               if (rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
-                                 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq) <= 0) {
-                       arg->stop = 1;
-                       break;
+static int fib6_dump_node(struct fib6_walker_t *w)
+{
+       int res;
+       struct rt6_info *rt;
+
+       for (rt = w->leaf; rt; rt = rt->u.next) {
+               res = rt6_dump_route(rt, w->args);
+               if (res < 0) {
+                       /* Frame is full, suspend walking */
+                       w->leaf = rt;
+                       return 1;
                }
-               arg->count++;
+               BUG_TRAP(res!=0);
        }
+       w->leaf = NULL;
+       return 0;
 }
 
+static int fib6_dump_done(struct netlink_callback *cb)
+{
+       struct fib6_walker_t *w = (void*)cb->args[0];
+
+       if (w) {
+               cb->args[0] = 0;
+               start_bh_atomic();
+               fib6_walker_unlink(w);
+               end_bh_atomic();
+               kfree(w);
+       }
+       if (cb->args[1]) {
+               cb->done = (void*)cb->args[1];
+               cb->args[1] = 0;
+       }
+       return cb->done(cb);
+}
 
 int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 {
        struct rt6_rtnl_dump_arg arg;
+       struct fib6_walker_t *w;
+       int res;
 
        arg.skb = skb;
        arg.cb = cb;
-       arg.skip = cb->args[0];
-       arg.count = 0;
-       arg.stop = 0;
-       start_bh_atomic();
-       fib6_walk_tree(&ip6_routing_table, rt6_dump_node, &arg, RT6_FILTER_RTNODES);
-       if (arg.stop == 0)
-               rt6_dump_node(&ip6_routing_table, &arg);
-       end_bh_atomic();
-       cb->args[0] = arg.count;
-       return skb->len;
+
+       w = (void*)cb->args[0];
+       if (w == NULL) {
+               /* New dump:
+                * 
+                * 1. hook callback destructor.
+                */
+               cb->args[1] = (long)cb->done;
+               cb->done = fib6_dump_done;
+
+               /*
+                * 2. allocate and initialize walker.
+                */
+               w = kmalloc(sizeof(*w), GFP_KERNEL);
+               if (w == NULL)
+                       return -ENOMEM;
+               RT6_TRACE("dump<%p", w);
+               memset(w, 0, sizeof(*w));
+               w->root = &ip6_routing_table;
+               w->func = fib6_dump_node;
+               w->args = &arg;
+               cb->args[0] = (long)w;
+               start_bh_atomic();
+               res = fib6_walk(w);
+               end_bh_atomic();
+       } else {
+               w->args = &arg;
+               start_bh_atomic();
+               res = fib6_walk_continue(w);
+               end_bh_atomic();
+       }
+#if RT6_DEBUG >= 3
+       if (res <= 0 && skb->len == 0)
+               RT6_TRACE("%p>dump end\n", w);
+#endif
+       /* res < 0 is an error. (really, impossible)
+          res == 0 means that dump is complete, but skb still can contain data.
+          res > 0 means that dump is not complete, but frame is full.
+        */
+       return res < 0 ? res : skb->len;
 }
 
 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
@@ -1974,10+1742,10 @@ void inet6_rt_notify(int event, struct rt6_info *rt)
 
 #ifdef CONFIG_PROC_FS
 
-
 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
 
-struct rt6_proc_arg {
+struct rt6_proc_arg
+{
        char *buffer;
        int offset;
        int length;
@@ -1985,109+1753,18 @@ struct rt6_proc_arg {
        int len;
 };
 
-static void rt6_info_node(struct fib6_node *fn, void *p_arg)
+static int rt6_info_route(struct rt6_info *rt, void *p_arg)
 {
-       struct rt6_info *rt;
        struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
-
-       for (rt = fn->leaf; rt; rt = rt->u.next) {
-               int i;
-
-               if (arg->skip < arg->offset / RT6_INFO_LEN) {
-                       arg->skip++;
-                       continue;
-               }
-
-               if (arg->len >= arg->length)
-                       return;
-               
-               for (i=0; i<16; i++) {
-                       sprintf(arg->buffer + arg->len, "%02x",
-                               rt->rt6i_dst.addr.s6_addr[i]);
-                       arg->len += 2;
-               }
-               arg->len += sprintf(arg->buffer + arg->len, " %02x ",
-                                   rt->rt6i_dst.plen);
-
-               for (i=0; i<16; i++) {
-                       sprintf(arg->buffer + arg->len, "%02x",
-                               rt->rt6i_src.addr.s6_addr[i]);
-                       arg->len += 2;
-               }
-               arg->len += sprintf(arg->buffer + arg->len, " %02x ",
-                                   rt->rt6i_src.plen);
-               
-               if (rt->rt6i_nexthop) {
-                       for (i=0; i<16; i++) {
-                               sprintf(arg->buffer + arg->len, "%02x",
-                                       rt->rt6i_nexthop->primary_key[i]);
-                               arg->len += 2;
-                       }
-               } else {
-                       sprintf(arg->buffer + arg->len,
-                               "00000000000000000000000000000000");
-                       arg->len += 32;
-               }
-               arg->len += sprintf(arg->buffer + arg->len,
-                                   " %08x %08x %08x %08x %8s\n",
-                                   rt->rt6i_metric, atomic_read(&rt->rt6i_use),
-                                   atomic_read(&rt->rt6i_ref), rt->rt6i_flags, 
-                                   rt->rt6i_dev ? rt->rt6i_dev->name : "");
-       }
-}
-
-static int rt6_proc_info(char *buffer, char **start, off_t offset, int length,
-                        int dummy)
-{
-       struct rt6_proc_arg arg;
-       arg.buffer = buffer;
-       arg.offset = offset;
-       arg.length = length;
-       arg.skip = 0;
-       arg.len = 0;
-
-       fib6_walk_tree(&ip6_routing_table, rt6_info_node, &arg,
-                      RT6_FILTER_RTNODES);
-
-       rt6_info_node(&ip6_routing_table, &arg);
-
-       *start = buffer;
-       if (offset)
-               *start += offset % RT6_INFO_LEN;
-
-       arg.len -= offset % RT6_INFO_LEN;
-
-       if(arg.len > length)
-               arg.len = length;
-       if(arg.len < 0)
-               arg.len = 0;
-
-       return arg.len;
-}
-
-#define PTR_SZ (sizeof(void *) * 2)
-#define FI_LINE_SZ (2 * (PTR_SZ) + 7 + 32 + 4 + 32 + 4)
-
-static void rt6_tree_node(struct fib6_node *fn, void *p_arg)
-{
-       struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
-       struct rt6_info *rt;
-       char f;
        int i;
 
-       rt = fn->leaf;
-
-       if (arg->skip < arg->offset / FI_LINE_SZ) {
+       if (arg->skip < arg->offset / RT6_INFO_LEN) {
                arg->skip++;
-               return;
+               return 0;
        }
 
-       if (arg->len + FI_LINE_SZ >= arg->length)
-               return;
-
-       f = (fn->fn_flags & RTN_RTINFO) ? 'r' : 'n';
-       arg->len += sprintf(arg->buffer + arg->len, "%p %p %02x %c ",
-                           fn, fn->parent, fn->fn_bit, f);
+       if (arg->len >= arg->length)
+               return 0;
 
        for (i=0; i<16; i++) {
                sprintf(arg->buffer + arg->len, "%02x",
@@ -2096,18+1773,41 @@ static void rt6_tree_node(struct fib6_node *fn, void *p_arg)
        }
        arg->len += sprintf(arg->buffer + arg->len, " %02x ",
                            rt->rt6i_dst.plen);
-       
+
+#ifdef CONFIG_IPV6_SUBTREES
        for (i=0; i<16; i++) {
                sprintf(arg->buffer + arg->len, "%02x",
                        rt->rt6i_src.addr.s6_addr[i]);
                arg->len += 2;
        }
-       arg->len += sprintf(arg->buffer + arg->len, " %02x\n",
+       arg->len += sprintf(arg->buffer + arg->len, " %02x ",
                            rt->rt6i_src.plen);
+#else
+       sprintf(arg->buffer + arg->len,
+               "00000000000000000000000000000000 00 ");
+       arg->len += 36;
+#endif
 
+       if (rt->rt6i_nexthop) {
+               for (i=0; i<16; i++) {
+                       sprintf(arg->buffer + arg->len, "%02x",
+                               rt->rt6i_nexthop->primary_key[i]);
+                       arg->len += 2;
+               }
+       } else {
+               sprintf(arg->buffer + arg->len,
+                       "00000000000000000000000000000000");
+               arg->len += 32;
+       }
+       arg->len += sprintf(arg->buffer + arg->len,
+                           " %08x %08x %08x %08x %8s\n",
+                           rt->rt6i_metric, atomic_read(&rt->u.dst.use),
+                           atomic_read(&rt->u.dst.refcnt), rt->rt6i_flags, 
+                           rt->rt6i_dev ? rt->rt6i_dev->name : "");
+       return 0;
 }
 
-static int rt6_proc_tree(char *buffer, char **start, off_t offset, int length,
+static int rt6_proc_info(char *buffer, char **start, off_t offset, int length,
                         int dummy)
 {
        struct rt6_proc_arg arg;
@@ -2117,7+1817,7 @@ static int rt6_proc_tree(char *buffer, char **start, off_t offset, int length,
        arg.skip = 0;
        arg.len = 0;
 
-       fib6_walk_tree(&ip6_routing_table, rt6_tree_node, &arg, 0);
+       fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
 
        *start = buffer;
        if (offset)
@@ -2125,15+1825,14 @@ static int rt6_proc_tree(char *buffer, char **start, off_t offset, int length,
 
        arg.len -= offset % RT6_INFO_LEN;
 
-       if(arg.len > length)
+       if (arg.len > length)
                arg.len = length;
-       if(arg.len < 0)
+       if (arg.len < 0)
                arg.len = 0;
 
        return arg.len;
 }
 
-
 extern struct rt6_statistics rt6_stats;
 
 static int rt6_proc_stats(char *buffer, char **start, off_t offset, int length,
@@ -2141,10+1840,11 @@ static int rt6_proc_stats(char *buffer, char **start, off_t offset, int length,
 {
        int len;
 
-       len = sprintf(buffer, "%04x %04x %04x %04x %04x\n",
+       len = sprintf(buffer, "%04x %04x %04x %04x %04x %04x\n",
                      rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
                      rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
-                     rt6_stats.fib_rt_cache);
+                     rt6_stats.fib_rt_cache,
+                     atomic_read(&ip6_dst_ops.entries));
 
        len -= offset;
 
@@ -2164,12+1864,6 @@ static struct proc_dir_entry proc_rt6_info = {
        0, &proc_net_inode_operations,
        rt6_proc_info
 };
-static struct proc_dir_entry proc_rt6_tree = {
-       PROC_NET_RT6_TREE, 7, "ip6_fib",
-       S_IFREG | S_IRUGO, 1, 0, 0,
-       0, &proc_net_inode_operations,
-       rt6_proc_tree
-};
 static struct proc_dir_entry proc_rt6_stats = {
        PROC_NET_RT6_STATS, 9, "rt6_stats",
        S_IFREG | S_IRUGO, 1, 0, 0,
@@ -2230,7+1924,6 @@ __initfunc(void ip6_route_init(void))
 {
 #ifdef         CONFIG_PROC_FS
        proc_net_register(&proc_rt6_info);
-       proc_net_register(&proc_rt6_tree);
        proc_net_register(&proc_rt6_stats);
 #endif
 #ifdef CONFIG_IPV6_NETLINK
@@ -2243,7+1936,6 @@ void ip6_route_cleanup(void)
 {
 #ifdef CONFIG_PROC_FS
        proc_net_unregister(PROC_NET_RT6);
-       proc_net_unregister(PROC_NET_RT6_TREE);
        proc_net_unregister(PROC_NET_RT6_STATS);
 #endif
 #ifdef CONFIG_IPV6_NETLINK
index 577b85d..0d6efd5 100644 (file)
@@ -6,7+6,7 @@
  *     Pedro Roque             <roque@di.fc.ul.pt>     
  *     Alexey Kuznetsov        <kuznet@ms2.inr.ac.ru>
  *
- *     $Id: sit.c,v 1.27 1998/03/08 05:56:57 davem Exp $
+ *     $Id: sit.c,v 1.28 1998/08/26 12:05:22 davem Exp $
  *
  *     This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
@@ -434,7+434,7 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct device *dev)
                ip_rt_put(rt);
                goto tx_error;
        }
-       if (mtu >= 576) {
+       if (mtu >= IPV6_MIN_MTU) {
                if (skb->dst && mtu < skb->dst->pmtu) {
                        struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
                        if (mtu < rt6->u.dst.pmtu) {
@@ -475,6+475,8 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct device *dev)
                        tunnel->recursion--;
                        return 0;
                }
+               if (skb->sk)
+                       skb_set_owner_w(new_skb, skb->sk);
                dev_kfree_skb(skb);
                skb = new_skb;
        }
@@ -491,7+493,7 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct device *dev)
        iph                     =       skb->nh.iph;
        iph->version            =       4;
        iph->ihl                =       sizeof(struct iphdr)>>2;
-       if (mtu > 576)
+       if (mtu > IPV6_MIN_MTU)
                iph->frag_off   =       __constant_htons(IP_DF);
        else
                iph->frag_off   =       0;
@@ -608,7+610,7 @@ static struct net_device_stats *ipip6_tunnel_get_stats(struct device *dev)
 
 static int ipip6_tunnel_change_mtu(struct device *dev, int new_mtu)
 {
-       if (new_mtu < 576 || new_mtu > 0xFFF8 - sizeof(struct iphdr))
+       if (new_mtu < IPV6_MIN_MTU || new_mtu > 0xFFF8 - sizeof(struct iphdr))
                return -EINVAL;
        dev->mtu = new_mtu;
        return 0;
@@ -662,8+664,8 @@ static int ipip6_tunnel_init(struct device *dev)
        if (tdev) {
                dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
                dev->mtu = tdev->mtu - sizeof(struct iphdr);
-               if (dev->mtu < 576)
-                       dev->mtu = 576;
+               if (dev->mtu < IPV6_MIN_MTU)
+                       dev->mtu = IPV6_MIN_MTU;
        }
        dev->iflink = tunnel->parms.link;
 
index c4cd197..c997999 100644 (file)
@@ -5,7+5,7 @@
  *     Authors:
  *     Pedro Roque             <roque@di.fc.ul.pt>     
  *
- *     $Id: tcp_ipv6.c,v 1.82 1998/06/11 03:15:52 davem Exp $
+ *     $Id: tcp_ipv6.c,v 1.89 1998/08/28 00:27:54 davem Exp $
  *
  *     Based on: 
  *     linux/net/ipv4/tcp.c
@@ -123,16+123,33 @@ static int tcp_v6_verify_bind(struct sock *sk, unsigned short snum)
        }
        if(result == 0) {
                if(tb == NULL) {
-                       if(tcp_bucket_create(snum) == NULL)
+                       if((tb = tcp_bucket_create(snum)) == NULL)
                                result = 1;
+                       else if (sk->reuse && sk->state != TCP_LISTEN)
+                               tb->flags |= TCPB_FLAG_FASTREUSE;
                } else {
                        /* It could be pending garbage collection, this
                         * kills the race and prevents it from disappearing
                         * out from under us by the time we use it.  -DaveM
                         */
-                       if(tb->owners == NULL && !(tb->flags & TCPB_FLAG_LOCKED)) {
-                               tb->flags = TCPB_FLAG_LOCKED;
-                               tcp_dec_slow_timer(TCP_SLT_BUCKETGC);
+                       if(tb->owners == NULL) {
+                               if (!(tb->flags & TCPB_FLAG_LOCKED)) {
+                                       tb->flags = (TCPB_FLAG_LOCKED |
+                                                    ((sk->reuse &&
+                                                      sk->state != TCP_LISTEN) ?
+                                                     TCPB_FLAG_FASTREUSE : 0));
+                                       tcp_dec_slow_timer(TCP_SLT_BUCKETGC);
+                               } else if (!(tb->flags & TCPB_FLAG_GOODSOCKNUM)) {
+                                       /* Someone is in between the bind
+                                        * and the actual connect or listen.
+                                        * See if it was a legitimate reuse
+                                        * and we are as well, else punt.
+                                        */
+                                       if (sk->reuse == 0 ||
+                                           !(tb->flags & TCPB_FLAG_FASTREUSE))
+                                               result = 1;
+                               } else
+                                       tb->flags &= ~TCPB_FLAG_GOODSOCKNUM;
                        }
                }
        }
@@ -358,7+375,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
        struct dst_entry *dst;
        struct sk_buff *buff;
        int addr_type;
-       int mss;
 
        if (sk->state != TCP_CLOSE) 
                return(-EISCONN);
@@ -403,6+419,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
         */
 
        if (addr_type == IPV6_ADDR_MAPPED) {
+               u32 exthdrlen = tp->ext_header_len;
                struct sockaddr_in sin;
                int err;
 
@@ -418,10+435,10 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
                err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
 
                if (err) {
+                       tp->ext_header_len = exthdrlen;
                        sk->tp_pinfo.af_tcp.af_specific = &ipv6_specific;
                        sk->backlog_rcv = tcp_v6_do_rcv;
                } else {
-                       /* Yuup... And it is not the only place... --ANK */
                        ipv6_addr_set(&np->saddr, 0, 0, __constant_htonl(0x0000FFFF),
                                      sk->saddr);
                        ipv6_addr_set(&np->rcv_saddr, 0, 0, __constant_htonl(0x0000FFFF),
@@ -441,18+458,18 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
        fl.uli_u.ports.dport = usin->sin6_port;
        fl.uli_u.ports.sport = sk->sport;
 
+       if (np->opt && np->opt->srcrt) {
+               struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
+               fl.nl_u.ip6_u.daddr = rt0->addr;
+       }
+
        dst = ip6_route_output(sk, &fl);
-       
+
        if (dst->error) {
                dst_release(dst);
                return dst->error;
        }
 
-       if (dst->pmtu < 576) {
-               dst_release(dst);
-               return -EINVAL;
-       }
-
        if (fl.oif == 0 && addr_type&IPV6_ADDR_LINKLOCAL) {
                /* Ough! This guy tries to connect to link local
                 * address and did not specify interface.
@@ -462,11+479,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
                sk->bound_dev_if = dst->dev->ifindex;
        }
 
-       ip6_dst_store(sk, dst);
+       ip6_dst_store(sk, dst, NULL);
 
        if (saddr == NULL) {
                ifa = ipv6_get_saddr(dst, &np->daddr);
-       
+
                if (ifa == NULL)
                        return -ENETUNREACH;
                
@@ -477,6+494,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
                ipv6_addr_copy(&np->saddr, saddr);
        }
 
+       tp->ext_header_len = 0;
+       if (np->opt)
+               tp->ext_header_len = np->opt->opt_flen+np->opt->opt_nflen;
+       /* Reset mss clamp */
+       tp->mss_clamp = ~0;
+
        buff = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
                            0, GFP_KERNEL);
 
@@ -498,15+521,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
                                                   np->daddr.s6_addr32[3],
                                                   sk->sport, sk->dport);
 
-       sk->mtu = dst->pmtu;
-       mss = sk->mtu - sizeof(struct ipv6hdr);
-#if 0
-       if (np->opt) {
-               /* Adjust mss */
-       }
-#endif
-
-       tcp_connect(sk, buff, mss);
+       tcp_connect(sk, buff, dst->pmtu);
 
        return 0;
 }
@@ -555,10+570,12 @@ out:
        return retval;
 }
 
-void tcp_v6_err(struct sk_buff *skb, int type, int code, unsigned char *header, __u32 info,
-               struct in6_addr *saddr, struct in6_addr *daddr,
-               struct inet6_protocol *protocol)
+void tcp_v6_err(struct sk_buff *skb, struct ipv6hdr *hdr,
+               struct inet6_skb_parm *opt,
+               int type, int code, unsigned char *header, __u32 info)
 {
+       struct in6_addr *saddr = &hdr->saddr;
+       struct in6_addr *daddr = &hdr->daddr;
        struct tcphdr *th = (struct tcphdr *)header;
        struct ipv6_pinfo *np;
        struct sock *sk;
@@ -567,7+584,8 @@ void tcp_v6_err(struct sk_buff *skb, int type, int code, unsigned char *header,
        struct tcp_opt *tp; 
        __u32 seq; 
 
-       /* XXX: length check for tcphdr missing here */
+       if (header + 8 > skb->tail)
+               return;
 
        sk = tcp_v6_lookup(daddr, th->dest, saddr, th->source, skb->dev->ifindex);
 
@@ -588,15+606,20 @@ void tcp_v6_err(struct sk_buff *skb, int type, int code, unsigned char *header,
 
        np = &sk->net_pinfo.af_inet6;
        if (type == ICMPV6_PKT_TOOBIG && sk->state != TCP_LISTEN) {
+               struct dst_entry *dst = NULL;
                /* icmp should have updated the destination cache entry */
 
                if (sk->dst_cache)
-                       dst_check(&sk->dst_cache, np->dst_cookie);
+                       dst = dst_check(&sk->dst_cache, np->dst_cookie);
 
-               if (sk->dst_cache == NULL) {
+               if (dst == NULL) {
                        struct flowi fl;
                        struct dst_entry *dst;
-                       
+
+                       /* BUGGG_FUTURE: Again, it is not clear how
+                          to handle rthdr case. Ignore this complexity
+                          for now.
+                        */
                        fl.proto = IPPROTO_TCP;
                        fl.nl_u.ip6_u.daddr = &np->daddr;
                        fl.nl_u.ip6_u.saddr = &np->saddr;
@@ -605,23+628,19 @@ void tcp_v6_err(struct sk_buff *skb, int type, int code, unsigned char *header,
                        fl.uli_u.ports.sport = sk->sport;
 
                        dst = ip6_route_output(sk, &fl);
+               } else
+                       dst = dst_clone(dst);
 
-                       ip6_dst_store(sk, dst);
-               }
-
-               if (sk->dst_cache->error) {
-                       sk->err_soft = sk->dst_cache->error;
-               } else {
-                       /* FIXME: Reset sk->mss, taking into account TCP option
-                        *        bytes for timestamps. -DaveM
-                        */
-                       sk->mtu = sk->dst_cache->pmtu;
-               }
-               if (atomic_read(&sk->sock_readers)) { /* remove later */
-                       printk(KERN_DEBUG "tcp_v6_err: pmtu disc: socket locked.\n");
-                       return;
-               }
-               tcp_simple_retransmit(sk);
+               if (dst->error) {
+                       sk->err_soft = dst->error;
+               } else if (tp->pmtu_cookie > dst->pmtu
+                          && !atomic_read(&sk->sock_readers)) {
+                       lock_sock(sk); 
+                       tcp_sync_mss(sk, dst->pmtu);
+                       tcp_simple_retransmit(sk);
+                       release_sock(sk);
+               } /* else let the usual retransmit timer handle it */
+               dst_release(dst);
                return;
        }
 
@@ -680,6+699,7 @@ static void tcp_v6_send_synack(struct sock *sk, struct open_request *req)
 {
        struct sk_buff * skb;
        struct dst_entry *dst;
+       struct ipv6_txoptions *opt = NULL;
        struct flowi fl;
        int mss;
 
@@ -690,19+710,26 @@ static void tcp_v6_send_synack(struct sock *sk, struct open_request *req)
        fl.uli_u.ports.dport = req->rmt_port;
        fl.uli_u.ports.sport = sk->sport;
 
-       dst = ip6_route_output(sk, &fl);
-       if (dst->error) {
-               dst_release(dst);
-               return;
+       opt = sk->net_pinfo.af_inet6.opt;
+       if (opt == NULL &&
+           sk->net_pinfo.af_inet6.rxopt.bits.srcrt == 2 &&
+           req->af.v6_req.pktopts) {
+               struct sk_buff *pktopts = req->af.v6_req.pktopts;
+               struct inet6_skb_parm *rxopt = (struct inet6_skb_parm *)pktopts->cb;
+               if (rxopt->srcrt)
+                       opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(pktopts->nh.raw + rxopt->srcrt));
        }
 
-       mss = dst->pmtu - sizeof(struct ipv6hdr) - sizeof(struct tcphdr);
-#if 0
-       /* Subtract option length... */
-       if (opt) {
-               mss -= opt->optlen;
+       if (opt && opt->srcrt) {
+               struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
+               fl.nl_u.ip6_u.daddr = rt0->addr;
        }
-#endif
+
+       dst = ip6_route_output(sk, &fl);
+       if (dst->error)
+               goto done;
+
+       mss = dst->pmtu - sizeof(struct ipv6hdr) - sizeof(struct tcphdr);
 
        skb = tcp_make_synack(sk, dst, req, mss);
        if (skb) {
@@ -712,13+739,22 @@ static void tcp_v6_send_synack(struct sock *sk, struct open_request *req)
                                         &req->af.v6_req.loc_addr, &req->af.v6_req.rmt_addr,
                                         csum_partial((char *)th, skb->len, skb->csum));
 
-               ip6_xmit(sk, skb, &fl, req->af.v6_req.opt);
+               fl.nl_u.ip6_u.daddr = &req->af.v6_req.rmt_addr;
+               ip6_xmit(sk, skb, &fl, opt);
        }
+
+done:
        dst_release(dst);
+        if (opt && opt != sk->net_pinfo.af_inet6.opt)
+               sock_kfree_s(sk, opt, opt->tot_len);
 }
 
 static void tcp_v6_or_free(struct open_request *req)
 {
+       if (req->af.v6_req.pktopts) {   /* Release hook for an IPv6 open request:
+                                        * the request may hold the SYN skb in
+                                        * pktopts (tcp_v6_conn_request stores it
+                                        * there after atomic_inc(&skb->users)).
+                                        */
+               kfree_skb(req->af.v6_req.pktopts);      /* presumably drops that extra reference -- confirm kfree_skb semantics */
+               req->af.v6_req.pktopts = NULL;  /* do not leave a stale skb pointer in the request */
+       }
 }
 
 static struct or_calltable or_ipv6 = {
@@ -727,14+763,27 @@ static struct or_calltable or_ipv6 = {
        tcp_v6_send_reset
 };
 
+static int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb)
+{      /*
+        * Decide whether skb carries IPv6 packet options that this socket
+        * asked to receive.
+        *
+        * Returns 1 if at least one field recorded in the skb's control
+        * block (hop, srcrt, dst0/dst1) is non-zero and the matching
+        * receive-option bit on sk (hopopts, srcrt, dstopts) is set;
+        * returns 0 otherwise.
+        */
+       struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb; /* skb->cb reinterpreted as the IPv6 per-packet parameters */
+
+       if (sk->net_pinfo.af_inet6.rxopt.all) { /* gate: nothing can match when rxopt.all is zero
+                                                * (presumably "all" overlays the individual
+                                                * bits -- confirm against the rxopt definition)
+                                                */
+               if ((opt->hop && sk->net_pinfo.af_inet6.rxopt.bits.hopopts) ||
+                   (opt->srcrt && sk->net_pinfo.af_inet6.rxopt.bits.srcrt) || /* any non-zero srcrt setting counts here; other code in this file tests srcrt == 2 specifically */
+                   ((opt->dst1 || opt->dst0) && sk->net_pinfo.af_inet6.rxopt.bits.dstopts)) /* either dst0 or dst1 being non-zero is enough for dstopts */
+                       return 1;
+       }
+       return 0;
+}
+
+
 #define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */
 #define BACKLOGMAX(sk) sysctl_max_syn_backlog
 
 /* FIXME: this is substantially similar to the ipv4 code.
  * Can some kind of merge be done? -- erics
  */
-static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
-                                                          __u32 isn)
+static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, __u32 isn)
 {
        struct tcp_opt tp;
        struct open_request *req;
@@ -747,7+796,11 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
        }
 
        if (skb->protocol == __constant_htons(ETH_P_IP))
-               return tcp_v4_conn_request(sk, skb, ptr, isn);
+               return tcp_v4_conn_request(sk, skb, isn);
+
+       /* FIXME: do the same check for anycast */
+       if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr))
+               goto drop; 
 
        if (isn == 0) 
                isn = tcp_v6_init_sequence(sk,skb);
@@ -756,8+809,9 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
         *      There are no SYN attacks on IPv6, yet...        
         */
        if (BACKLOG(sk) >= BACKLOGMAX(sk)) {
-               printk(KERN_DEBUG "droping syn ack:%d max:%d\n",
-                      BACKLOG(sk), BACKLOGMAX(sk));
+               (void)(net_ratelimit() && 
+                      printk(KERN_INFO "droping syn ack:%d max:%d\n",
+                              BACKLOG(sk), BACKLOGMAX(sk)));
                goto drop;              
        }
 
@@ -773,13+827,16 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
        req->rcv_isn = TCP_SKB_CB(skb)->seq;
        req->snt_isn = isn;
        tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
-       tp.in_mss = 536;
+       tp.mss_clamp = 65535;
        tcp_parse_options(NULL, skb->h.th, &tp, 0);
-        req->mss = tp.in_mss;
-       if (tp.saw_tstamp) {
-               req->mss -= TCPOLEN_TSTAMP_ALIGNED;
+       if (tp.mss_clamp == 65535)
+               tp.mss_clamp = 576 - sizeof(struct ipv6hdr) - sizeof(struct iphdr);
+       if (sk->tp_pinfo.af_tcp.user_mss && sk->tp_pinfo.af_tcp.user_mss < tp.mss_clamp)
+               tp.mss_clamp = sk->tp_pinfo.af_tcp.user_mss;
+
+        req->mss = tp.mss_clamp;
+       if (tp.saw_tstamp)
                 req->ts_recent = tp.rcv_tsval;
-       }
         req->tstamp_ok = tp.tstamp_ok;
        req->sack_ok = tp.sack_ok;
         req->snd_wscale = tp.snd_wscale;
@@ -787,7+844,11 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
        req->rmt_port = skb->h.th->source;
        ipv6_addr_copy(&req->af.v6_req.rmt_addr, &skb->nh.ipv6h->saddr);
        ipv6_addr_copy(&req->af.v6_req.loc_addr, &skb->nh.ipv6h->daddr);
-       req->af.v6_req.opt = NULL;      /* FIXME: options */
+       req->af.v6_req.pktopts = NULL;
+       if (ipv6_opt_accepted(sk, skb)) {
+               atomic_inc(&skb->users);
+               req->af.v6_req.pktopts = skb;
+       }
        req->af.v6_req.iif = sk->bound_dev_if;
 
        /* So that link locals have meaning */
@@ -804,8+865,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
        tcp_inc_slow_timer(TCP_SLT_SYNACK);
        tcp_synq_queue(&sk->tp_pinfo.af_tcp, req);      
 
-       sk->data_ready(sk, 0);
-
        return 0;
 
 drop:
@@ -832,8+891,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
        struct flowi fl;
        struct tcp_opt *newtp;
        struct sock *newsk;
-       int mss;
-      
+       struct ipv6_txoptions *opt;
+
        if (skb->protocol == __constant_htons(ETH_P_IP)) {
                /*
                 *      v6 mapped
@@ -856,21+915,37 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
                newsk->tp_pinfo.af_tcp.af_specific = &ipv6_mapped;
                newsk->backlog_rcv = tcp_v4_do_rcv;
+               newsk->net_pinfo.af_inet6.pktoptions = NULL;
+               newsk->net_pinfo.af_inet6.opt = NULL;
+
+               /* It is tricky place. Until this moment IPv4 tcp
+                  worked with IPv6 af_tcp.af_specific.
+                  Sync it now.
+                */
+               tcp_sync_mss(newsk, newsk->tp_pinfo.af_tcp.pmtu_cookie);
 
                return newsk;
        }
 
+       opt = sk->net_pinfo.af_inet6.opt;
 
        if (sk->ack_backlog > sk->max_ack_backlog)
-               return NULL; 
+               goto out;
+
+       if (sk->net_pinfo.af_inet6.rxopt.bits.srcrt == 2 &&
+           opt == NULL && req->af.v6_req.pktopts) {
+               struct inet6_skb_parm *rxopt = (struct inet6_skb_parm *)req->af.v6_req.pktopts->cb;
+               if (rxopt->srcrt)
+                       opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(req->af.v6_req.pktopts->nh.raw+rxopt->srcrt));
+       }
 
        if (dst == NULL) {
-               /*
-                *      options / mss / route cache
-                */
-           
                fl.proto = IPPROTO_TCP;
                fl.nl_u.ip6_u.daddr = &req->af.v6_req.rmt_addr;
+               if (opt && opt->srcrt) {
+                       struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
+                       fl.nl_u.ip6_u.daddr = rt0->addr;
+               }
                fl.nl_u.ip6_u.saddr = &req->af.v6_req.loc_addr;
                fl.oif = sk->bound_dev_if;
                fl.uli_u.ports.dport = req->rmt_port;
@@ -879,22+954,17 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
                dst = ip6_route_output(sk, &fl);
        }
 
-       if (dst->error || dst->pmtu < 576)
+       if (dst->error)
                goto out;
-       
+
        sk->tp_pinfo.af_tcp.syn_backlog--;
        sk->ack_backlog++;
 
-       mss = dst->pmtu - sizeof(struct ipv6hdr);
-#if 0
-       /* Adjust mss by option size */
-#endif
-
-       newsk = tcp_create_openreq_child(sk, req, skb, mss);
+       newsk = tcp_create_openreq_child(sk, req, skb);
        if (newsk == NULL)
                goto out;
 
-       ip6_dst_store(newsk, dst);
+       ip6_dst_store(newsk, dst, NULL);
 
        newtp = &(newsk->tp_pinfo.af_tcp);
 
@@ -903,18+973,55 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
        ipv6_addr_copy(&np->saddr, &req->af.v6_req.loc_addr);
        ipv6_addr_copy(&np->rcv_saddr, &req->af.v6_req.loc_addr);
        newsk->bound_dev_if = req->af.v6_req.iif;
-       newsk->mtu = dst->pmtu;
+
+       /* Now IPv6 options... 
+
+          First: no IPv4 options.
+        */
        newsk->opt = NULL;
 
+       /* Clone RX bits */
+       np->rxopt.all = sk->net_pinfo.af_inet6.rxopt.all;
+
+       /* Clone pktoptions received with SYN */
+       np->pktoptions = req->af.v6_req.pktopts;
+       if (np->pktoptions)
+               atomic_inc(&np->pktoptions->users);
+       np->opt = NULL;
+
+       /* Clone native IPv6 options from listening socket (if any)
+
+          Yes, keeping reference count would be much more clever,
+          but we make one more one thing there: reattach optmem
+          to newsk.
+        */
+       if (opt) {
+               np->opt = ipv6_dup_options(newsk, opt);
+               if (opt != sk->net_pinfo.af_inet6.opt)
+                       sock_kfree_s(sk, opt, opt->tot_len);
+       }
+
+       newtp->ext_header_len = 0;
+       if (np->opt)
+               newtp->ext_header_len = np->opt->opt_nflen + np->opt->opt_flen;
+
+       tcp_sync_mss(newsk, dst->pmtu);
+       newtp->rcv_mss = newtp->mss_clamp;
+
        newsk->daddr    = LOOPBACK4_IPV6;
        newsk->saddr    = LOOPBACK4_IPV6;
        newsk->rcv_saddr= LOOPBACK4_IPV6;
 
        newsk->prot->hash(newsk);
        add_to_prot_sklist(newsk);
+
+       sk->data_ready(sk, 0); /* Deliver SIGIO */ 
+
        return newsk;
 
 out:
+       if (opt && opt != sk->net_pinfo.af_inet6.opt)
+               sock_kfree_s(sk, opt, opt->tot_len);
        dst_release(dst);
        return NULL;
 }
@@ -1020,8+1127,8 @@ static void tcp_v6_rst_req(struct sock *sk, struct sk_buff *skb)
        if (!req)
                return;
        /* Sequence number check required by RFC793 */
-       if (before(TCP_SKB_CB(skb)->seq, req->snt_isn) ||
-           after(TCP_SKB_CB(skb)->seq, req->snt_isn+1))
+       if (before(TCP_SKB_CB(skb)->seq, req->rcv_isn) ||
+           after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
                return;
        if(req->sk)
                sk->ack_backlog--;
@@ -1055,7+1162,7 @@ static inline struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
                }
 #if 0 /*def CONFIG_SYN_COOKIES */
                 else {
-                       sk = cookie_v6_check(sk, skb, (struct ipv6_options *) skb->cb);
+                       sk = cookie_v6_check(sk, skb);
                 }
 #endif
        }
@@ -1064,6+1171,8 @@ static inline struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
 
 static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 {
+       int users = 0;
+
        /* Imagine: socket is IPv6. IPv4 packet arrives,
           goes to IPv4 receive handler and backlogged.
           From backlog it always goes here. Kerboom...
@@ -1080,6+1189,8 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
         *      is currently called with bh processing disabled.
         */
 
+       ipv6_statistics.Ip6InDelivers++;
+
        /* XXX We need to think more about socket locking
         * XXX wrt. backlog queues, __release_sock(), etc.  -DaveM
         */
@@ -1092,9+1203,29 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
         */
        skb_set_owner_r(skb, sk);
 
+       /* Do Stevens' IPV6_PKTOPTIONS.
+
+          Yes, guys, it is the only place in our code, where we
+          may make it not affecting IPv4.
+          The rest of code is protocol independent,
+          and I do not like idea to uglify IPv4.
+
+          Actually, all the idea behind IPV6_PKTOPTIONS
+          looks not very well thought. For now we latch
+          options, received in the last packet, enqueued
+          by tcp. Feel free to propose better solution.
+                                              --ANK (980728)
+        */
+       if (sk->net_pinfo.af_inet6.rxopt.all) {
+               users = atomic_read(&skb->users);
+               atomic_inc(&skb->users);
+       }
+
        if (sk->state == TCP_ESTABLISHED) { /* Fast path */
                if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
                        goto reset;
+               if (users)
+                       goto ipv6_pktoptions;
                release_sock(sk);
                return 0;
        }
@@ -1110,26+1241,60 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
                sk = nsk;
        }
 
-       if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->cb, skb->len))
+       if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
                goto reset;
+       if (users)
+               goto ipv6_pktoptions;
        release_sock(sk);
        return 0;
 
 reset:
        tcp_v6_send_reset(skb);
 discard:
+       if (users)
+               kfree_skb(skb);
        kfree_skb(skb);
        release_sock(sk);  
        return 0;
+
+ipv6_pktoptions:
+       /* Do you ask, what is it?
+
+          1. skb was enqueued by tcp.
+          2. skb is added to tail of read queue, rather than out of order.
+          3. socket is not in passive state.
+          4. Finally, it really contains options, which user wants to receive.
+        */
+       if (atomic_read(&skb->users) > users &&
+           TCP_SKB_CB(skb)->end_seq == sk->tp_pinfo.af_tcp.rcv_nxt &&
+           !((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))) {
+               if (ipv6_opt_accepted(sk, skb)) {
+                       struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+                       kfree_skb(skb);
+                       skb = NULL;
+                       if (skb2) {
+                               skb_set_owner_r(skb2, sk);
+                               skb = xchg(&sk->net_pinfo.af_inet6.pktoptions, skb2);
+                       }
+               } else {
+                       kfree_skb(skb);
+                       skb = xchg(&sk->net_pinfo.af_inet6.pktoptions, NULL);
+               }
+       }
+
+       if (skb)
+               kfree_skb(skb);
+       release_sock(sk);
+       return 0;
 }
 
-int tcp_v6_rcv(struct sk_buff *skb, struct device *dev,
-              struct in6_addr *saddr, struct in6_addr *daddr,
-              struct ipv6_options *opt, unsigned short len,
-              int redo, struct inet6_protocol *protocol)
+int tcp_v6_rcv(struct sk_buff *skb, unsigned long len)
 {
        struct tcphdr *th;      
        struct sock *sk;
+       struct device *dev = skb->dev;
+       struct in6_addr *saddr = &skb->nh.ipv6h->saddr;
+       struct in6_addr *daddr = &skb->nh.ipv6h->daddr;
 
        th = skb->h.th;
 
@@ -1198,7+1363,7 @@ discard_it:
 
 do_time_wait:
        if(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
-                                     skb, th, &(IPCB(skb)->opt), skb->len))
+                                     skb, th, skb->len))
                goto no_tcp_socket;
        goto discard_it;
 }
@@ -1221,6+1386,12 @@ static int tcp_v6_rebuild_header(struct sock *sk)
                fl.uli_u.ports.dport = sk->dport;
                fl.uli_u.ports.sport = sk->sport;
 
+               if (np->opt && np->opt->srcrt) {
+                       struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
+                       fl.nl_u.ip6_u.daddr = rt0->addr;
+               }
+
+
                dst = ip6_route_output(sk, &fl);
 
                if (dst->error) {
@@ -1228,7+1399,7 @@ static int tcp_v6_rebuild_header(struct sock *sk)
                        return dst->error;
                }
 
-               ip6_dst_store(sk, dst);
+               ip6_dst_store(sk, dst, NULL);
        }
 
        return dst->error;
@@ -1258,6+1429,11 @@ static void tcp_v6_xmit(struct sk_buff *skb)
        fl.uli_u.ports.sport = sk->sport;
        fl.uli_u.ports.dport = sk->dport;
 
+       if (np->opt && np->opt->srcrt) {
+               struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
+               fl.nl_u.ip6_u.daddr = rt0->addr;
+       }
+
        if (sk->dst_cache)
                dst = dst_check(&sk->dst_cache, np->dst_cookie);
 
@@ -1270,11+1446,14 @@ static void tcp_v6_xmit(struct sk_buff *skb)
                        return;
                }
 
-               ip6_dst_store(sk, dst);
+               ip6_dst_store(sk, dst, NULL);
        }
 
        skb->dst = dst_clone(dst);
 
+       /* Restore final destination back after routing done */
+       fl.nl_u.ip6_u.daddr = &np->daddr;
+
        ip6_xmit(sk, skb, &fl, np->opt);
 }
 
@@ -1295,6+1474,8 @@ static struct tcp_func ipv6_specific = {
        tcp_v6_conn_request,
        tcp_v6_syn_recv_sock,
        tcp_v6_get_sock,
+       sizeof(struct ipv6hdr),
+
        ipv6_setsockopt,
        ipv6_getsockopt,
        v6_addr2sockaddr,
@@ -1312,6+1493,8 @@ static struct tcp_func ipv6_mapped = {
        tcp_v6_conn_request,
        tcp_v6_syn_recv_sock,
        tcp_v6_get_sock,
+       sizeof(struct iphdr),
+
        ipv6_setsockopt,
        ipv6_getsockopt,
        v6_addr2sockaddr,
@@ -1330,7+1513,7 @@ static int tcp_v6_init_sock(struct sock *sk)
 
        tp->rto  = TCP_TIMEOUT_INIT;            /*TCP_WRITE_TIME*/
        tp->mdev = TCP_TIMEOUT_INIT;
-       tp->in_mss = 536;
+       tp->mss_clamp = ~0;
 
        /* See draft-stevens-tcpca-spec-01 for discussion of the
         * initialization of these values.
@@ -1338,17+1521,17 @@ static int tcp_v6_init_sock(struct sock *sk)
        tp->snd_cwnd = (1 << TCP_CWND_SHIFT);
        tp->snd_ssthresh = 0x7fffffff;
 
-       sk->priority = 1;
        sk->state = TCP_CLOSE;
        sk->max_ack_backlog = SOMAXCONN;
-       sk->mtu = 576;
-       sk->mss = 536;
+       tp->rcv_mss = 536; 
 
        /* Init SYN queue. */
        tcp_synq_init(tp);
 
        sk->tp_pinfo.af_tcp.af_specific = &ipv6_specific;
 
+       sk->write_space = tcp_write_space;
+
        return 0;
 }
 
@@ -1376,12+1559,6 @@ static int tcp_v6_destroy_sock(struct sock *sk)
        while((skb = __skb_dequeue(&tp->out_of_order_queue)) != NULL)
                kfree_skb(skb);
 
-       /*
-        *      Release destination entry
-        */
-
-       dst_release(xchg(&sk->dst_cache,NULL));
-
        /* Clean up a locked TCP bind bucket, this only happens if a
         * port is allocated for a socket, but it never fully connects.
         * In which case we will find num to be non-zero and daddr to
@@ -1390,7+1567,7 @@ static int tcp_v6_destroy_sock(struct sock *sk)
        if(ipv6_addr_any(&(sk->net_pinfo.af_inet6.daddr)) && sk->num != 0)
                tcp_bucket_unlock(sk);
 
-       return 0;
+       return inet6_destroy_sock(sk);
 }
 
 struct proto tcpv6_prot = {
index 971e966..90662cb 100644 (file)
@@ -7,7+7,7 @@
  *
  *     Based on linux/ipv4/udp.c
  *
- *     $Id: udp.c,v 1.31 1998/07/15 05:05:45 davem Exp $
+ *     $Id: udp.c,v 1.33 1998/08/27 16:55:20 davem Exp $
  *
  *     This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
@@ -59,6+59,14 @@ static int udp_v6_verify_bind(struct sock *sk, unsigned short snum)
                if((sk2->num == snum) && (sk2 != sk)) {
                        unsigned char state = sk2->state;
                        int sk2_reuse = sk2->reuse;
+
+                       /* Two sockets can be bound to the same port if they're
+                        * bound to different interfaces.
+                        */
+
+                       if(sk2->bound_dev_if != sk->bound_dev_if)
+                               continue;
+
                        if(addr_type == IPV6_ADDR_ANY || (!sk2->rcv_saddr)) {
                                if((!sk2_reuse)                 ||
                                   (!sk_reuse)                  ||
@@ -139,7+147,7 @@ static void udp_v6_rehash(struct sock *sk)
 }
 
 static struct sock *udp_v6_lookup(struct in6_addr *saddr, u16 sport,
-                                 struct in6_addr *daddr, u16 dport)
+                                 struct in6_addr *daddr, u16 dport, int dif)
 {
        struct sock *sk, *result = NULL;
        unsigned short hnum = ntohs(dport);
@@ -166,7+174,12 @@ static struct sock *udp_v6_lookup(struct in6_addr *saddr, u16 sport,
                                        continue;
                                score++;
                        }
-                       if(score == 3) {
+                       if(sk->bound_dev_if) {
+                               if(sk->bound_dev_if != dif)
+                                       continue;
+                               score++;
+                       }
+                       if(score == 4) {
                                result = sk;
                                break;
                        } else if(score > badness) {
@@ -257,20+270,25 @@ ipv4_connected:
         */
 
        fl.proto = IPPROTO_UDP;
-       fl.nl_u.ip6_u.daddr = daddr;
+       fl.nl_u.ip6_u.daddr = &np->daddr;
        fl.nl_u.ip6_u.saddr = NULL;
        fl.oif = sk->bound_dev_if;
        fl.uli_u.ports.dport = sk->dport;
        fl.uli_u.ports.sport = sk->sport;
 
+       if (np->opt && np->opt->srcrt) {
+               struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
+               fl.nl_u.ip6_u.daddr = rt0->addr;
+       }
+
        dst = ip6_route_output(sk, &fl);
-       
+
        if (dst->error) {
                dst_release(dst);
                return dst->error;
        }
 
-       ip6_dst_store(sk, dst);
+       ip6_dst_store(sk, dst, fl.nl_u.ip6_u.daddr);
 
        /* get the source address used in the appropriate device */
 
@@ -291,15+309,50 @@ ipv4_connected:
 
 static void udpv6_close(struct sock *sk, unsigned long timeout)
 {
-       lock_sock(sk);
+       /* See for explanation: raw_close in ipv4/raw.c */
        sk->state = TCP_CLOSE;
-       ipv6_sock_mc_close(sk);
        udp_v6_unhash(sk);
        sk->dead = 1;
-       release_sock(sk);
        destroy_sock(sk);
 }
 
+#ifdef CONFIG_FILTER
+#undef CONFIG_UDP_DELAY_CSUM
+#endif
+
+#ifdef CONFIG_UDP_DELAY_CSUM
+
+/* Please read the comments in net/checksum.h and asm/checksum.h.
+
+   I commented out csum_partial_copy_to_user there because it did not
+   call verify_area. Now I even wonder at how clever I was that time 8)8)
+   If I had not done it, I would have stepped into this hole again.   --ANK
+ */
+
+#ifndef _HAVE_ARCH_COPY_AND_CSUM_TO_USER
+#if defined(__i386__)
+static __inline__
+unsigned int csum_and_copy_to_user (const char *src, char *dst,
+                                   int len, int sum, int *err_ptr)
+{
+       int *src_err_ptr=NULL;
+
+       if (verify_area(VERIFY_WRITE, dst, len) == 0)
+               return csum_partial_copy_generic(src, dst, len, sum, src_err_ptr, err_ptr);
+
+       if (len)
+               *err_ptr = -EFAULT;
+
+       return sum;
+}
+#elif defined(__sparc__)
+#define csum_and_copy_to_user csum_partial_copy_to_user
+#else
+#undef CONFIG_UDP_DELAY_CSUM
+#endif
+#endif
+#endif
+
 /*
  *     This should be easy, if there is something there we
  *     return it, otherwise we block.
@@ -322,12+375,12 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len,
         *      From here the generic datagram does a lot of the work. Come
         *      the finished NET3, it will do _ALL_ the work!
         */
-               
+
        skb = skb_recv_datagram(sk, flags, noblock, &err);
        if (!skb)
                goto out;
   
-       copied = ntohs(((struct udphdr *)skb->h.raw)->len) - sizeof(struct udphdr);
+       copied = skb->len - sizeof(struct udphdr);
        if (copied > len) {
                copied = len;
                msg->msg_flags |= MSG_TRUNC;
@@ -337,8+390,41 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len,
         *      FIXME : should use udp header size info value 
         */
         
+#ifndef CONFIG_UDP_DELAY_CSUM
        err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), 
                                      msg->msg_iov, copied);
+#else
+       if (sk->no_check || skb->ip_summed==CHECKSUM_UNNECESSARY) {
+               err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
+                                             copied);
+       } else if (copied > msg->msg_iov[0].iov_len || (msg->msg_flags&MSG_TRUNC)) {
+               if (csum_fold(csum_partial(skb->h.raw, ntohs(skb->h.uh->len), skb->csum))) {
+                       /* Error for blocking case is chosen to masquerade
+                          as some normal condition.
+                        */
+                       err = (msg->msg_flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH;
+                       udp_stats_in6.UdpInErrors++;
+                       goto out_free;
+               }
+               err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
+                                             copied);
+       } else {
+               unsigned int csum = csum_partial(skb->h.raw, sizeof(struct udphdr), skb->csum);
+
+               err = 0;
+               csum = csum_and_copy_to_user((char*)&skb->h.uh[1], msg->msg_iov[0].iov_base, copied, csum, &err);
+               if (err)
+                       goto out_free;
+               if (csum_fold(csum)) {
+                       /* Error for blocking case is chosen to masquerade
+                          as some normal condition.
+                        */
+                       err = (msg->msg_flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH;
+                       udp_stats_in6.UdpInErrors++;
+                       goto out_free;
+               }
+       }
+#endif
        if (err)
                goto out_free;
        
@@ -361,7+447,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len,
                        memcpy(&sin6->sin6_addr, &skb->nh.ipv6h->saddr,
                               sizeof(struct in6_addr));
 
-                       if (msg->msg_controllen)
+                       if (sk->net_pinfo.af_inet6.rxopt.all)
                                datagram_recv_ctl(sk, msg, skb);
                }
        }
@@ -373,20+459,27 @@ out:
        return err;
 }
 
-void udpv6_err(struct sk_buff *skb, int type, int code, unsigned char *buff, __u32 info,
-              struct in6_addr *saddr, struct in6_addr *daddr,
-              struct inet6_protocol *protocol)
+void udpv6_err(struct sk_buff *skb, struct ipv6hdr *hdr,
+              struct inet6_skb_parm *opt,
+              int type, int code, unsigned char *buff, __u32 info)
 {
+       struct device *dev = skb->dev;
+       struct in6_addr *saddr = &hdr->saddr;
+       struct in6_addr *daddr = &hdr->daddr;
        struct sock *sk;
        struct udphdr *uh;
        int err;
-       
+
+       if (buff + sizeof(struct udphdr) > skb->tail)
+               return;
+
        uh = (struct udphdr *) buff;
 
-       sk = udp_v6_lookup(daddr, uh->dest, saddr, uh->source);
+       sk = udp_v6_lookup(daddr, uh->dest, saddr, uh->source, dev->ifindex);
    
        if (sk == NULL) {
-               printk(KERN_DEBUG "icmp for unknown sock\n");
+               if (net_ratelimit())
+                       printk(KERN_DEBUG "icmp for unknown sock\n");
                return;
        }
 
@@ -407,11+500,10 @@ static inline int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
        if (sock_queue_rcv_skb(sk,skb)<0) {
                udp_stats_in6.UdpInErrors++;
                ipv6_statistics.Ip6InDiscards++;
-               ipv6_statistics.Ip6InDelivers--;
-               skb->sk = NULL;
                kfree_skb(skb);
                return 0;
        }
+       ipv6_statistics.Ip6InDelivers++;
        udp_stats_in6.UdpInDatagrams++;
        return 0;
 }
@@ -430,7+522,8 @@ static __inline__ int inet6_mc_check(struct sock *sk, struct in6_addr *addr)
 
 static struct sock *udp_v6_mcast_next(struct sock *sk,
                                      u16 loc_port, struct in6_addr *loc_addr,
-                                     u16 rmt_port, struct in6_addr *rmt_addr)
+                                     u16 rmt_port, struct in6_addr *rmt_addr,
+                                     int dif)
 {
        struct sock *s = sk;
        unsigned short num = ntohs(loc_port);
@@ -446,6+539,9 @@ static struct sock *udp_v6_mcast_next(struct sock *sk,
                           ipv6_addr_cmp(&np->daddr, rmt_addr))
                                continue;
 
+                       if (s->bound_dev_if && s->bound_dev_if != dif)
+                               continue;
+
                        if(!ipv6_addr_any(&np->rcv_saddr)) {
                                if(ipv6_addr_cmp(&np->rcv_saddr, loc_addr) == 0)
                                        return s;
@@ -468,16+564,18 @@ static void udpv6_mcast_deliver(struct udphdr *uh,
 {
        struct sock *sk, *sk2;
        struct sk_buff *buff;
+       int dif;
 
        sk = udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)];
-       sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr);
+       dif = skb->dev->ifindex;
+       sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
        if (!sk)
                goto free_skb;
 
        buff = NULL;
        sk2 = sk;
        while((sk2 = udp_v6_mcast_next(sk2->next, uh->dest, saddr,
-                                                 uh->source, daddr))) {
+                                                 uh->source, daddr, dif))) {
                if (!buff) {
                        buff = skb_clone(skb, GFP_ATOMIC);
                        if (!buff)
@@ -486,59+584,70 @@ static void udpv6_mcast_deliver(struct udphdr *uh,
                if (sock_queue_rcv_skb(sk2, buff) >= 0)
                        buff = NULL;
        }
-       if (buff) {
-               buff->sk = NULL;
+       if (buff)
                kfree_skb(buff);
-       }
        if (sock_queue_rcv_skb(sk, skb) < 0) {
-       free_skb:
-               skb->sk = NULL;
+free_skb:
                kfree_skb(skb);
        }
 }
 
-int udpv6_rcv(struct sk_buff *skb, struct device *dev,
-             struct in6_addr *saddr, struct in6_addr *daddr,
-             struct ipv6_options *opt, unsigned short len,
-             int redo, struct inet6_protocol *protocol)
+int udpv6_rcv(struct sk_buff *skb, unsigned long len)
 {
        struct sock *sk;
        struct udphdr *uh;
-       int ulen;
-
-       /*
-        *      check if the address is ours...
-        *      I believe that this is being done in IP layer
-        */
+       struct device *dev = skb->dev;
+       struct in6_addr *saddr = &skb->nh.ipv6h->saddr;
+       struct in6_addr *daddr = &skb->nh.ipv6h->daddr;
+       u32 ulen;
 
-       uh = (struct udphdr *) skb->h.uh;
-       
-       ipv6_statistics.Ip6InDelivers++;
+       uh = skb->h.uh;
+       __skb_pull(skb, skb->h.raw - skb->data);
 
        ulen = ntohs(uh->len);
-       
+
+       /* Check for jumbo payload */
+       if (ulen == 0 && skb->nh.ipv6h->payload_len == 0)
+               ulen = len;
+
        if (ulen > len || len < sizeof(*uh)) {
-               printk(KERN_DEBUG "UDP: short packet: %d/%d\n", ulen, len);
+               if (net_ratelimit())
+                       printk(KERN_DEBUG "UDP: short packet: %d/%ld\n", ulen, len);
                udp_stats_in6.UdpInErrors++;
                kfree_skb(skb);
                return(0);
        }
 
        if (uh->check == 0) {
-               printk(KERN_DEBUG "IPv6: udp checksum is 0\n");
+               /* IPv6 draft-v2 section 8.1 says that we SHOULD log
+                  this error. Well, it is reasonable.
+                */
+               if (net_ratelimit())
+                       printk(KERN_INFO "IPv6: udp checksum is 0\n");
                goto discard;
        }
 
+       skb_trim(skb, ulen);
+
+#ifndef CONFIG_UDP_DELAY_CSUM
        switch (skb->ip_summed) {
        case CHECKSUM_NONE:
-               skb->csum = csum_partial((char*)uh, len, 0);
+               skb->csum = csum_partial((char*)uh, ulen, 0);
        case CHECKSUM_HW:
-               if (csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, skb->csum)) {
+               if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) {
                        printk(KERN_DEBUG "IPv6: udp checksum error\n");
                        goto discard;
                }
        };
-       
+#else
+       if (skb->ip_summed==CHECKSUM_HW) {
+               if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum))
+                       goto discard;
+               skb->ip_summed = CHECKSUM_UNNECESSARY;
+       } else if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+               skb->csum = ~csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, 0);
+#endif
+
        len = ulen;
 
        /* 
@@ -555,10+664,16 @@ int udpv6_rcv(struct sk_buff *skb, struct device *dev,
         * check socket cache ... must talk to Alan about his plans
         * for sock caches... i'll skip this for now.
         */
-
-       sk = udp_v6_lookup(saddr, uh->source, daddr, uh->dest);
-
+       
+       sk = udp_v6_lookup(saddr, uh->source, daddr, uh->dest, dev->ifindex);
+       
        if (sk == NULL) {
+#ifdef CONFIG_UDP_DELAY_CSUM
+               if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
+                   csum_fold(csum_partial((char*)uh, len, skb->csum)))
+                       goto discard;
+#endif
+               
                udp_stats_in6.UdpNoPorts++;
 
                icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, dev);
@@ -566,16+681,13 @@ int udpv6_rcv(struct sk_buff *skb, struct device *dev,
                kfree_skb(skb);
                return(0);
        }
-
+       
        /* deliver */
-
-       if (atomic_read(&sk->sock_readers))
-               __skb_queue_tail(&sk->back_log, skb);
-       else
-               udpv6_queue_rcv_skb(sk, skb);
+       
+       udpv6_queue_rcv_skb(sk, skb);
        
        return(0);
-
+       
 discard:
        udp_stats_in6.UdpInErrors++;
        kfree_skb(skb);
@@ -618,7+730,7 @@ static int udpv6_getfrag(const void *data, struct in6_addr *addr,
        }
 
        if (csum_partial_copy_fromiovecend(dst, udh->iov, offset,
-                                                    clen, &udh->wcheck))
+                                          clen, &udh->wcheck))
                return -EFAULT;
 
        if (final) {
@@ -649,11+761,11 @@ static int udpv6_getfrag(const void *data, struct in6_addr *addr,
 
 static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen)
 {
-       struct ipv6_options opt_space;
+       struct ipv6_txoptions opt_space;
        struct udpv6fakehdr udh;
        struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
        struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) msg->msg_name;
-       struct ipv6_options *opt = NULL;
+       struct ipv6_txoptions *opt = NULL;
        struct flowi fl;
        int addr_len = msg->msg_namelen;
        struct in6_addr *daddr;
@@ -661,22+773,18 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen)
        int len = ulen + sizeof(struct udphdr);
        int addr_type;
        int hlimit = -1;
-
+       
        int err;
        
        /* Rough check on arithmetic overflow,
           better check is made in ip6_build_xmit
-
-          When jumbo header will be implemeted we will change it
-          to something sort of (len will be size_t)
-          ulen > SIZE_T_MAX - sizeof(struct udphdr)
-        */
-       if (ulen < 0 || ulen > 0xFFFF - sizeof(struct udphdr))
+          */
+       if (ulen < 0 || ulen > INT_MAX - sizeof(struct udphdr))
                return -EMSGSIZE;
-
+       
        if (msg->msg_flags & ~(MSG_DONTROUTE|MSG_DONTWAIT))
                return(-EINVAL);
-
+       
        if (sin6) {
                if (sin6->sin6_family == AF_INET)
                        return udp_sendmsg(sk, msg, ulen);
@@ -692,14+800,6 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen)
               
                udh.uh.dest = sin6->sin6_port;
                daddr = &sin6->sin6_addr;
-
-               /* BUGGGG! If route is not cloned, this check always
-                  fails, hence dst_cache only slows down transmission --ANK
-                */
-               if (sk->dst_cache && ipv6_addr_cmp(daddr, &np->daddr)) {
-                       dst_release(sk->dst_cache);
-                       sk->dst_cache = NULL;
-               }
        } else {
                if (sk->state != TCP_ESTABLISHED)
                        return(-ENOTCONN);
@@ -707,9+807,9 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen)
                udh.uh.dest = sk->dport;
                daddr = &sk->net_pinfo.af_inet6.daddr;
        }
-
+       
        addr_type = ipv6_addr_type(daddr);
-
+       
        if (addr_type == IPV6_ADDR_MAPPED) {
                struct sockaddr_in sin;
                
@@ -720,24+820,25 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen)
 
                return udp_sendmsg(sk, msg, ulen);
        }
-
+       
        udh.daddr = NULL;
        fl.oif = sk->bound_dev_if;
        
        if (msg->msg_controllen) {
                opt = &opt_space;
-               memset(opt, 0, sizeof(struct ipv6_options));
+               memset(opt, 0, sizeof(struct ipv6_txoptions));
 
                err = datagram_send_ctl(msg, &fl.oif, &saddr, opt, &hlimit);
                if (err < 0)
                        return err;
-               
-               if (opt->srcrt)
-                       udh.daddr = daddr;
        }
-       
+       if (opt == NULL || !(opt->opt_nflen|opt->opt_flen))
+               opt = np->opt;
+       if (opt && opt->srcrt)
+               udh.daddr = daddr;
+
        udh.uh.source = sk->sport;
-       udh.uh.len = htons(len);
+       udh.uh.len = len < 0x1000 ? htons(len) : 0;
        udh.uh.check = 0;
        udh.iov = msg->msg_iov;
        udh.wcheck = 0;
@@ -783,7+884,7 @@ struct proto udpv6_prot = {
        datagram_poll,                  /* poll */
        udp_ioctl,                      /* ioctl */
        NULL,                           /* init */
-       NULL,                           /* destroy */
+       inet6_destroy_sock,             /* destroy */
        NULL,                           /* shutdown */
        ipv6_setsockopt,                /* setsockopt */
        ipv6_getsockopt,                /* getsockopt */
index 85aaaa7..0db8e06 100644 (file)
@@ -1749,7+1749,6 @@ static int ipx_create(struct socket *sock, int protocol)
 
        sock_init_data(sock, sk);
        sk->destruct    = NULL;
-       sk->mtu         = IPX_MTU;
        sk->no_check    = 1;            /* Checksum off by default */
 
        MOD_INC_USE_COUNT;
index fb2cffe..5b95a7e 100644 (file)
@@ -103,7+103,6 @@ static int spx_create(struct socket *sock, int protocol)
        spx_sock_init(sk);
        sk->data_ready  = spx_rcv;
        sk->destruct    = NULL;
-        sk->mtu        = IPX_MTU;
         sk->no_check   = 1;
 
         MOD_INC_USE_COUNT;
index c57d793..de10481 100644 (file)
@@ -98,7+98,7 @@ static __inline__ int netlink_locked(struct sock *sk)
        return atomic_read(&sk->protinfo.af_netlink.locks);
 }
 
-static __inline__ struct sock *netlink_lookup(int protocol, pid_t pid)
+static __inline__ struct sock *netlink_lookup(int protocol, u32 pid)
 {
        struct sock *sk;
 
@@ -116,10+116,8 @@ extern struct proto_ops netlink_ops;
 
 static void netlink_insert(struct sock *sk)
 {
-       cli();
        sk->next = nl_table[sk->protocol];
        nl_table[sk->protocol] = sk;
-       sti();
 }
 
 static void netlink_remove(struct sock *sk)
@@ -154,26+152,10 @@ static int netlink_create(struct socket *sock, int protocol)
        sock_init_data(sock,sk);
        sk->destruct = NULL;
        
-       sk->mtu=4096;
        sk->protocol=protocol;
        return 0;
 }
 
-static void netlink_destroy_timer(unsigned long data)
-{
-       struct sock *sk=(struct sock *)data;
-
-       if (!netlink_locked(sk) && !atomic_read(&sk->wmem_alloc)
-           && !atomic_read(&sk->rmem_alloc)) {
-               sk_free(sk);
-               return;
-       }
-       
-       sk->timer.expires=jiffies+10*HZ;
-       add_timer(&sk->timer);
-       printk(KERN_DEBUG "netlink sk destroy delayed\n");
-}
-
 static int netlink_release(struct socket *sock, struct socket *peer)
 {
        struct sock *sk = sock->sk;
@@ -223,11+205,7 @@ static int netlink_release(struct socket *sock, struct socket *peer)
        }
 
        if (atomic_read(&sk->rmem_alloc) || atomic_read(&sk->wmem_alloc)) {
-               sk->timer.data=(unsigned long)sk;
-               sk->timer.expires=jiffies+HZ;
-               sk->timer.function=netlink_destroy_timer;
-               add_timer(&sk->timer);
-               printk(KERN_DEBUG "impossible 333\n");
+               printk(KERN_DEBUG "netlink_release: impossible event. Please, report.\n");
                return 0;
        }
 
@@ -270,7+248,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len
                return -EINVAL;
 
        /* Only superuser is allowed to listen multicasts */
-       if (nladdr->nl_groups && !suser())
+       if (nladdr->nl_groups && !capable(CAP_NET_ADMIN))
                return -EPERM;
 
        if (sk->protinfo.af_netlink.pid) {
@@ -315,7+293,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr,
                return -EINVAL;
 
        /* Only superuser is allowed to send multicasts */
-       if (!suser() && nladdr->nl_groups)
+       if (nladdr->nl_groups && !capable(CAP_NET_ADMIN))
                return -EPERM;
 
        sk->protinfo.af_netlink.dst_pid = nladdr->nl_pid;
@@ -344,11+322,12 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, int *addr
        return 0;
 }
 
-int netlink_unicast(struct sock *ssk, struct sk_buff *skb, pid_t pid, int nonblock)
+int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock)
 {
        struct sock *sk;
        int len = skb->len;
        int protocol = ssk->protocol;
+       struct wait_queue wait = { current, NULL };
 
 retry:
        for (sk = nl_table[protocol]; sk; sk = sk->next) {
@@ -366,17+345,23 @@ retry:
                }
 #endif
 
-               cli();
+               if (!nonblock) {
+                       add_wait_queue(sk->sleep, &wait);
+                       current->state = TASK_INTERRUPTIBLE;
+               }
+
                if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) {
                        if (nonblock) {
-                               sti();
                                netlink_unlock(sk);
                                kfree_skb(skb);
                                return -EAGAIN;
                        }
-                       interruptible_sleep_on(sk->sleep);
+
+                       schedule();
+
+                       current->state = TASK_RUNNING;
+                       remove_wait_queue(sk->sleep, &wait);
                        netlink_unlock(sk);
-                       sti();
 
                        if (signal_pending(current)) {
                                kfree_skb(skb);
@@ -384,8+369,12 @@ retry:
                        }
                        goto retry;
                }
-               sti();
-Nprintk("unicast_deliver %d\n", skb->len);
+
+               if (!nonblock) {
+                       current->state = TASK_RUNNING;
+                       remove_wait_queue(sk->sleep, &wait);
+               }
+
                skb_orphan(skb);
                skb_set_owner_r(skb, sk);
                skb_queue_tail(&sk->receive_queue, skb);
@@ -417,8+406,8 @@ Nprintk("broadcast_deliver %d\n", skb->len);
        return -1;
 }
 
-void netlink_broadcast(struct sock *ssk, struct sk_buff *skb, pid_t pid,
-                      unsigned group, int allocation)
+void netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
+                      u32 group, int allocation)
 {
        struct sock *sk;
        struct sk_buff *skb2 = NULL;
@@ -472,7+461,7 @@ void netlink_broadcast(struct sock *ssk, struct sk_buff *skb, pid_t pid,
        kfree_skb(skb);
 }
 
-void netlink_set_err(struct sock *ssk, pid_t pid, unsigned group, int code)
+void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code)
 {
        struct sock *sk;
        int protocol = ssk->protocol;
@@ -496,34+485,28 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, int len,
 {
        struct sock *sk = sock->sk;
        struct sockaddr_nl *addr=msg->msg_name;
-       pid_t dst_pid;
-       unsigned dst_groups;
+       u32 dst_pid;
+       u32 dst_groups;
        struct sk_buff *skb;
-       int err;
 
        if (msg->msg_flags&MSG_OOB)
                return -EOPNOTSUPP;
 
-       if (msg->msg_flags&~MSG_DONTWAIT) {
-               printk("1 %08x\n", msg->msg_flags);
+       if (msg->msg_flags&~(MSG_DONTWAIT|MSG_NOSIGNAL|MSG_ERRQUEUE))
                return -EINVAL;
-       }
 
        if (msg->msg_namelen) {
-               if (addr->nl_family != AF_NETLINK) {
-                       printk("2 %08x\n", addr->nl_family);
+               if (addr->nl_family != AF_NETLINK)
                        return -EINVAL;
-               }
                dst_pid = addr->nl_pid;
                dst_groups = addr->nl_groups;
-               if (dst_groups && !suser())
+               if (dst_groups && !capable(CAP_NET_ADMIN))
                        return -EPERM;
        } else {
                dst_pid = sk->protinfo.af_netlink.dst_pid;
                dst_groups = sk->protinfo.af_netlink.dst_groups;
        }
 
-
        if (!sk->protinfo.af_netlink.pid)
                netlink_autobind(sock);
 
@@ -536,17+519,24 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, int len,
        NETLINK_CB(skb).dst_pid = dst_pid;
        NETLINK_CB(skb).dst_groups = dst_groups;
        memcpy(NETLINK_CREDS(skb), &scm->creds, sizeof(struct ucred));
-       memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
+
+       /* What can I do? Netlink is asynchronous, so that
+          we will have to save current capabilities to
+          check them, when this message will be delivered
+          to corresponding kernel module.   --ANK (980802)
+        */
+       NETLINK_CB(skb).eff_cap = current->cap_effective;
+
+       if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len)) {
+               kfree_skb(skb);
+               return -EFAULT;
+       }
 
        if (dst_groups) {
                atomic_inc(&skb->users);
                netlink_broadcast(sk, skb, dst_pid, dst_groups, GFP_KERNEL);
        }
-       err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT);
-       if (err < 0) {
-               printk("3\n");
-       }
-       return err;
+       return netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT);
 }
 
 static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, int len,
@@ -594,7+584,7 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, int len,
        if (sk->protinfo.af_netlink.cb
            && atomic_read(&sk->rmem_alloc) <= sk->rcvbuf/2)
                netlink_dump(sk);
-       return err ? err : copied;
+       return err ? : copied;
 }
 
 /*
@@ -651,11+641,11 @@ static int netlink_dump(struct sock *sk)
        skb = sock_rmalloc(sk, NLMSG_GOODSIZE, 0, GFP_KERNEL);
        if (!skb)
                return -ENOBUFS;
-       
+
        cb = sk->protinfo.af_netlink.cb;
 
        len = cb->dump(skb, cb);
-       
+
        if (len > 0) {
                skb_queue_tail(&sk->receive_queue, skb);
                sk->data_ready(sk, len);
@@ -667,7+657,7 @@ static int netlink_dump(struct sock *sk)
        memcpy(NLMSG_DATA(nlh), &len, sizeof(len));
        skb_queue_tail(&sk->receive_queue, skb);
        sk->data_ready(sk, skb->len);
-       
+
        cb->done(cb);
        sk->protinfo.af_netlink.cb = NULL;
        netlink_destroy_callback(cb);
@@ -769,167+759,6 @@ int netlink_post(int unit, struct sk_buff *skb)
 
 #endif
 
-#if 0
-
-/* What a pity... It was good code, but at the moment it
-   results in unnecessary complications.
- */
-
-/*
- *     "High" level netlink interface. (ANK)
- *     
- *     Features:
- *             - standard message format.
- *             - pseudo-reliable delivery. Messages can be still lost, but
- *               user level will know that they were lost and can
- *               recover (f.e. gated could reread FIB and device list)
- *             - messages are batched.
- */
-
-/*
- *     Try to deliver queued messages.
- */
-
-static void nlmsg_delayed_flush(struct sock *sk)
-{
-       nlmsg_flush(sk, GFP_ATOMIC);
-}
-
-static void nlmsg_flush(struct sock *sk, int allocation)
-{
-       struct sk_buff *skb;
-       unsigned long flags;
-
-       save_flags(flags);
-       cli();
-       while ((skb=skb_dequeue(&sk->write_queue)) != NULL) {
-               if (skb->users != 1) {
-                       skb_queue_head(&sk->write_queue, skb);
-                       break;
-               }
-               restore_flags(flags);
-               netlink_broadcast(sk, skb, 0, NETLINK_CB(skb).dst_groups, allocation);
-               cli();
-       }
-       start_bh_atomic();
-       restore_flags(flags);
-       if (skb) {
-               if (sk->timer.function)
-                       del_timer(&sk->timer)
-               sk->timer.expires = jiffies + (sk->protinfo.af_netlink.delay ? : HZ/2);
-               sk->timer.function = (void (*)(unsigned long))nlmsg_delayed_flush;
-               sk->timer.data = (unsigned long)sk;
-               add_timer(&sk->timer);
-       }
-       end_bh_atomic();
-}
-
-/*
- *     Allocate room for new message. If it is impossible, return NULL.
- */
-
-void *nlmsg_broadcast(struct sock *sk, struct sk_buff **skbp,
-                     unsigned long type, int len,
-                     unsigned groups, int allocation)
-{
-       struct nlmsghdr *nlh;
-       struct sk_buff *skb;
-       int     rlen;
-       unsigned long flags;
-
-       rlen = NLMSG_SPACE(len);
-
-       save_flags(flags);
-       cli();
-       skb = sk->write_queue.tail;
-       if (skb == sk->write_queue.head)
-               skb = NULL;
-       if (skb == NULL || skb_tailroom(skb) < rlen || NETLINK_CB(skb).dst_groups != groups) {
-               restore_flags(flags);
-
-               if (skb)
-                       nlmsg_flush(sk, allocation);
-
-               skb = sock_wmalloc(rlen > NLMSG_GOODSIZE ? rlen : NLMSG_GOODSIZE,
-                                  sk, 0, allocation);
-
-               if (skb==NULL) {
-                       printk (KERN_WARNING "nlmsg at unit %d overrunned\n", sk->protocol);
-                       return NULL;
-               }
-
-               NETLINK_CB(skb).dst_groups = groups;
-               cli();
-               skb_queue_tail(&sk->write_queue, skb);
-       }
-       atomic_inc(&skb->users);
-       restore_flags(flags);
-
-       nlh = (struct nlmsghdr*)skb_put(skb, rlen);
-       nlh->nlmsg_type = type;
-       nlh->nlmsg_len = NLMSG_LENGTH(len);
-       nlh->nlmsg_seq = 0;
-       nlh->nlmsg_pid = 0;
-       *skbp = skb;
-       return nlh->nlmsg_data;
-}
-
-struct sk_buff* nlmsg_alloc(unsigned long type, int len,
-                           unsigned long seq, unsigned long pid, int allocation)
-{
-       struct nlmsghdr *nlh;
-       struct sk_buff *skb;
-       int             rlen;
-
-       rlen = NLMSG_SPACE(len);
-
-       skb = alloc_skb(rlen, allocation);
-       if (skb==NULL)
-               return NULL;
-
-       nlh = (struct nlmsghdr*)skb_put(skb, rlen);
-       nlh->nlmsg_type = type;
-       nlh->nlmsg_len = NLMSG_LENGTH(len);
-       nlh->nlmsg_seq = seq;
-       nlh->nlmsg_pid = pid;
-       return skb;
-}
-
-void nlmsg_release(struct sk_buff *skb)
-{
-       atomic_dec(skb->users);
-}
-
-
-/*
- *     Kick message queue.
- *     Two modes:
- *             - synchronous (delay==0). Messages are delivered immediately.
- *             - delayed. Do not deliver, but start delivery timer.
- */
-
-void __nlmsg_transmit(struct sock *sk, int allocation)
-{
-       start_bh_atomic();
-       if (!sk->protinfo.af_netlink.delay) {
-               if (sk->timer.function) {
-                       del_timer(&sk->timer);
-                       sk->timer.function = NULL;
-               }
-               end_bh_atomic();
-               nlmsg_flush(sk, allocation);
-               return;
-       }
-       if (!sk->timer.function) {
-               sk->timer.expires = jiffies + sk->protinfo.af_netlink.delay;
-               sk->timer.function = (void (*)(unsigned long))nlmsg_delayed_flush;
-               sk->timer.data = (unsigned long)sk;
-               add_timer(&sk->timer);
-       }
-       end_bh_atomic();
-}
-
-#endif
 
 #ifdef CONFIG_PROC_FS
 static int netlink_read_proc(char *buffer, char **start, off_t offset,
index 288cfd9..b127137 100644 (file)
@@ -144,6+144,7 @@ static int netlink_open(struct inode * inode, struct file * file)
 
 out:
        open_map &= ~(1<<minor);
+       MOD_DEC_USE_COUNT;
        return err;
 }
 
index 84451d0..66b49db 100644 (file)
@@ -475,7+475,6 @@ static int nr_create(struct socket *sock, int protocol)
 
        sock->ops    = &nr_proto_ops;
        sk->protocol = protocol;
-       sk->mtu      = NETROM_MTU;      /* 236 */
 
        skb_queue_head_init(&nr->ack_queue);
        skb_queue_head_init(&nr->reseq_queue);
@@ -522,7+521,6 @@ static struct sock *nr_make_new(struct sock *osk)
        sk->sndbuf   = osk->sndbuf;
        sk->debug    = osk->debug;
        sk->state    = TCP_ESTABLISHED;
-       sk->mtu      = osk->mtu;
        sk->sleep    = osk->sleep;
        sk->zapped   = osk->zapped;
 
index d9767a0..f987d94 100644 (file)
@@ -264,6+264,7 @@ EXPORT_SYMBOL(tcp_close);
 EXPORT_SYMBOL(tcp_accept);
 EXPORT_SYMBOL(tcp_write_wakeup);
 EXPORT_SYMBOL(tcp_read_wakeup);
+EXPORT_SYMBOL(tcp_write_space);
 EXPORT_SYMBOL(tcp_poll);
 EXPORT_SYMBOL(tcp_ioctl);
 EXPORT_SYMBOL(tcp_shutdown);
index 1c17b36..8a681b8 100644 (file)
@@ -552,7+552,6 @@ static int rose_create(struct socket *sock, int protocol)
 
        sock->ops    = &rose_proto_ops;
        sk->protocol = protocol;
-       sk->mtu      = ROSE_MTU;        /* 253 */
 
        init_timer(&rose->timer);
        init_timer(&rose->idletimer);
@@ -593,7+592,6 @@ static struct sock *rose_make_new(struct sock *osk)
        sk->sndbuf   = osk->sndbuf;
        sk->debug    = osk->debug;
        sk->state    = TCP_ESTABLISHED;
-       sk->mtu      = osk->mtu;
        sk->sleep    = osk->sleep;
        sk->zapped   = osk->zapped;
 
index 0bf7a92..081896d 100644 (file)
@@ -291,7+291,7 @@ static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n,
                          struct tcf_proto *tp, unsigned long fh, int event)
 {
        struct sk_buff *skb;
-       pid_t pid = oskb ? NETLINK_CB(oskb).pid : 0;
+       u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
 
        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb)
index 8f50013..4168f54 100644 (file)
@@ -537,7+537,7 @@ insert:
        if (s == NULL)
                goto errout;
        memset(s, 0, sizeof(*s));
-       memcpy(s->dst, dst, sizeof(*dst));
+       memcpy(s->dst, dst, sizeof(s->dst));
        s->dpi = pinfo->dpi;
        s->protocol = pinfo->protocol;
        s->tunnelid = pinfo->tunnelid;
@@ -590,7+590,6 @@ static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)
 static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
                     struct sk_buff *skb, struct tcmsg *t)
 {
-       struct rsvp_head *head = tp->root;
        struct rsvp_filter *f = (struct rsvp_filter*)fh;
        struct rsvp_session *s;
        unsigned char    *b = skb->tail;
index a684cde..f2fb9e3 100644 (file)
@@ -7,6+7,10 @@
  *             2 of the License, or (at your option) any later version.
  *
  * Authors:    Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Fixes:
+ *
+ * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
  */
 
 #include <linux/config.h>
@@ -506,7+510,7 @@ process_existing:
 }
 
 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q,
-                        pid_t pid, u32 seq, unsigned flags, int event)
+                        u32 pid, u32 seq, unsigned flags, int event)
 {
        struct tcmsg *tcm;
        struct nlmsghdr  *nlh;
@@ -538,7+542,7 @@ static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
                         struct Qdisc *old, struct Qdisc *new)
 {
        struct sk_buff *skb;
-       pid_t pid = oskb ? NETLINK_CB(oskb).pid : 0;
+       u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
 
        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb)
@@ -715,7+719,7 @@ out:
 
 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
                          unsigned long cl,
-                         pid_t pid, u32 seq, unsigned flags, int event)
+                         u32 pid, u32 seq, unsigned flags, int event)
 {
        struct tcmsg *tcm;
        struct nlmsghdr  *nlh;
@@ -745,7+749,7 @@ static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
                          struct Qdisc *q, unsigned long cl, int event)
 {
        struct sk_buff *skb;
-       pid_t pid = oskb ? NETLINK_CB(oskb).pid : 0;
+       u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
 
        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb)
@@ -850,7+854,7 @@ int psched_clock_scale;
 #endif
 
 #ifdef PSCHED_WATCHER
-u32 psched_time_mark;
+PSCHED_WATCHER psched_time_mark;
 
 static void psched_tick(unsigned long);
 
@@ -864,10+868,10 @@ static void psched_tick(unsigned long dummy)
        PSCHED_GET_TIME(dummy_stamp);
        psched_timer.expires = jiffies + 4*HZ;
 #else
-       unsigned long jiffies = now;
+       unsigned long now = jiffies;
        psched_time_base = ((u64)now)<<PSCHED_JSCALE;
        psched_time_mark = now;
-       psched_timer.expires = jiffies + 60*60*HZ;
+       psched_timer.expires = now + 60*60*HZ;
 #endif
        add_timer(&psched_timer);
 }
@@ -883,7+887,7 @@ __initfunc(int psched_calibrate_clock(void))
        unsigned long stop;
 
 #if CPU == 586 || CPU == 686
-       if (!(boot_cpu_data.x86_capability & X86_FEATURE_TSC)
+       if (!(boot_cpu_data.x86_capability & X86_FEATURE_TSC))
                return -1;
 #endif
 
@@ -895,7+899,7 @@ __initfunc(int psched_calibrate_clock(void))
        PSCHED_GET_TIME(stamp);
        do_gettimeofday(&tv);
        while (jiffies < stop)
-               boundary();
+               barrier();
        PSCHED_GET_TIME(stamp1);
        do_gettimeofday(&tv1);
        end_bh_atomic();
@@ -910,7+914,7 @@ __initfunc(int psched_calibrate_clock(void))
        while ((delay>>=1) != 0)
                psched_clock_scale++;
        psched_us_per_tick = 1<<psched_clock_scale;
-       psched_clock_per_hz = (delay*(1000000/HZ))>>psched_clock_scale;
+       psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
        return 0;
 }
 #endif
index 4ddb606..118a727 100644 (file)
@@ -497,7+497,6 @@ static int sock_fasync(int fd, struct file *filp, int on)
 {
        struct fasync_struct *fa, *fna=NULL, **prev;
        struct socket *sock;
-       unsigned long flags;
        
        if (on)
        {
@@ -509,9+508,8 @@ static int sock_fasync(int fd, struct file *filp, int on)
        sock = socki_lookup(filp->f_dentry->d_inode);
        
        prev=&(sock->fasync_list);
-       
-       save_flags(flags);
-       cli();
+
+       lock_sock(sock->sk); 
        
        for (fa=*prev; fa!=NULL; prev=&fa->fa_next,fa=*prev)
                if (fa->fa_file==filp)
@@ -523,7+521,7 @@ static int sock_fasync(int fd, struct file *filp, int on)
                {
                        fa->fa_fd=fd;
                        kfree_s(fna,sizeof(struct fasync_struct));
-                       restore_flags(flags);
+                       release_sock(sock->sk); 
                        return 0;
                }
                fna->fa_file=filp;
@@ -540,7+538,8 @@ static int sock_fasync(int fd, struct file *filp, int on)
                        kfree_s(fa,sizeof(struct fasync_struct));
                }
        }
-       restore_flags(flags);
+
+       release_sock(sock->sk); 
        return 0;
 }
 
@@ -1305,7+1304,8 @@ out:
 /*
  *     Perform a file control on a socket file descriptor.
  *
- *     FIXME: does this need an fd lock ?
+ *     Doesn't acquire an fd lock, because no network fcntl
+ *     function currently sleeps.
  */
 
 int sock_fcntl(struct file *filp, unsigned int cmd, unsigned long arg)
index 528e2a3..8e0110b 100644 (file)
@@ -8,6+8,8 @@
  *             as published by the Free Software Foundation; either version
  *             2 of the License, or (at your option) any later version.
  *
+ * Version:    $Id: af_unix.c,v 1.68 1998/08/26 13:18:35 davem Exp $
+ *
  * Fixes:
  *             Linus Torvalds  :       Assorted bug cures.
  *             Niibe Yutaka    :       async I/O support.
  *             Andreas Schwab  :       Replace inode by dentry for proper
  *                                     reference counting
  *             Kirk Petersen   :       Made this a module
+ *         Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
+ *                                     Lots of bug fixes.
+ *          Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
+ *                                     by above two patches.
  *
  * Known differences from reference BSD that was tested:
  *
@@ -102,6+108,7 @@ unix_socket *unix_socket_table[UNIX_HASH_SIZE+1];
 #define UNIX_ABSTRACT(sk)      ((sk)->protinfo.af_unix.addr->hash!=UNIX_HASH_SIZE)
 
 static void unix_destroy_socket(unix_socket *sk);
+static void unix_stream_write_space(struct sock *sk);
 
 extern __inline__ unsigned unix_hash_fold(unsigned hash)
 {
@@ -120,7+127,7 @@ extern __inline__ int unix_our_peer(unix_socket *sk, unix_socket *osk)
 
 extern __inline__ int unix_may_send(unix_socket *sk, unix_socket *osk)
 {
-       return (sk->type==osk->type);
+       return (unix_peer(osk) == NULL || unix_our_peer(sk, osk));
 }
 
 extern __inline__ void unix_lock(unix_socket *sk)
@@ -128,9+135,9 @@ extern __inline__ void unix_lock(unix_socket *sk)
        atomic_inc(&sk->sock_readers);
 }
 
-extern __inline__ int unix_unlock(unix_socket *sk)
+extern __inline__ void unix_unlock(unix_socket *sk)
 {
-       return atomic_dec_and_test(&sk->sock_readers);
+       atomic_dec(&sk->sock_readers);
 }
 
 extern __inline__ int unix_locked(unix_socket *sk)
@@ -257,7+264,6 @@ static void unix_destroy_timer(unsigned long data)
        if(!unix_locked(sk) && atomic_read(&sk->wmem_alloc) == 0)
        {
                sk_free(sk);
-               unix_remove_socket(sk);
        
                /* socket destroyed, decrement count                  */
                MOD_DEC_USE_COUNT;
@@ -291,9+297,6 @@ static int unix_release_sock (unix_socket *sk)
 
        skpair=unix_peer(sk);
 
-       /* Try to flush out this socket. Throw out buffers at least */
-       unix_destroy_socket(sk);
-
        if (skpair!=NULL)
        {
                if (sk->type==SOCK_STREAM && unix_our_peer(sk, skpair))
@@ -304,6+307,9 @@ static int unix_release_sock (unix_socket *sk)
                unix_unlock(skpair); /* It may now die */
        }
 
+       /* Try to flush out this socket. Throw out buffers at least */
+       unix_destroy_socket(sk);
+
        /*
         * Fixme: BSD difference: In BSD all sockets connected to use get
         *        ECONNRESET and we die on the spot. In Linux we behave
@@ -311,6+317,8 @@ static int unix_release_sock (unix_socket *sk)
         *        dereference.
         *
         * Can't we simply set sock->err?
+        *
+        *        What is the above comment talking about? --ANK(980817)
         */
 
        unix_gc();              /* Garbage collect fds */       
@@ -321,13+329,12 @@ static void unix_destroy_socket(unix_socket *sk)
 {
        struct sk_buff *skb;
 
+       unix_remove_socket(sk);
+
        while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
        {
                if(sk->state==TCP_LISTEN)
-               {
-                       unix_unlock(sk);
                        unix_release_sock(skb->sk);
-               }
                /* passed fds are erased in the kfree_skb hook        */
                kfree_skb(skb);
        }
@@ -338,10+345,9 @@ static void unix_destroy_socket(unix_socket *sk)
                sk->protinfo.af_unix.dentry=NULL;
        }
        
-       if(unix_unlock(sk) && atomic_read(&sk->wmem_alloc) == 0)
+       if(!unix_locked(sk) && atomic_read(&sk->wmem_alloc) == 0)
        {
                sk_free(sk);
-               unix_remove_socket(sk);
        
                /* socket destroyed, decrement count                  */
                MOD_DEC_USE_COUNT;
@@ -366,8+372,6 @@ static int unix_listen(struct socket *sock, int backlog)
        if (!sk->protinfo.af_unix.addr)
                return -EINVAL;                 /* No listens on an unbound socket */
        sk->max_ack_backlog=backlog;
-       if (sk->ack_backlog < backlog)
-               sk->state_change(sk);
        sk->state=TCP_LISTEN;
        sock->flags |= SO_ACCEPTCON;
        /* set credentials so connect can copy them */
@@ -380,61+384,60 @@ static int unix_listen(struct socket *sock, int backlog)
 extern struct proto_ops unix_stream_ops;
 extern struct proto_ops unix_dgram_ops;
 
-static int unix_create1(struct socket *sock, struct sock **skp, int protocol)
+static struct sock * unix_create1(struct socket *sock, int stream)
 {
        struct sock *sk;
 
-       if (protocol && protocol != PF_UNIX)
-               return -EPROTONOSUPPORT;
-
-       if (sock)
-       {
-               sock->state = SS_UNCONNECTED;
-
-               switch (sock->type)
-               {
-               case SOCK_STREAM:
-                       sock->ops = &unix_stream_ops;
-                       break;
-               /*
-                *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
-                *      nothing uses it.
-                */
-               case SOCK_RAW:
-                       sock->type=SOCK_DGRAM;
-               case SOCK_DGRAM:
-                       sock->ops = &unix_dgram_ops;
-                       break;
-               default:
-                       return -ESOCKTNOSUPPORT;
-               }
-       }
+       MOD_INC_USE_COUNT;
        sk = sk_alloc(PF_UNIX, GFP_KERNEL, 1);
-       if (!sk)
-               return -ENOMEM;
+       if (!sk) {
+               MOD_DEC_USE_COUNT;
+               return NULL;
+       }
 
        sock_init_data(sock,sk);
 
+       if (stream)
+               sk->write_space = unix_stream_write_space; 
+
        sk->destruct = unix_destruct_addr;
        sk->protinfo.af_unix.family=PF_UNIX;
        sk->protinfo.af_unix.dentry=NULL;
-       atomic_set(&sk->sock_readers, 1);       /* Us */
        sk->protinfo.af_unix.readsem=MUTEX;     /* single task reading lock */
-       sk->mtu=4096;
        sk->protinfo.af_unix.list=&unix_sockets_unbound;
        unix_insert_socket(sk);
-       if (skp)
-               *skp =sk;
-       
-       /* socket created, increment count */
-       MOD_INC_USE_COUNT;
 
-       return 0;
+       return sk;
 }
 
 static int unix_create(struct socket *sock, int protocol)
 {
-       return unix_create1(sock, NULL, protocol);
+       int stream = 0;
+
+       if (protocol && protocol != PF_UNIX)
+               return -EPROTONOSUPPORT;
+
+       sock->state = SS_UNCONNECTED;
+
+       switch (sock->type) {
+       case SOCK_STREAM:
+               sock->ops = &unix_stream_ops;
+               stream = 1;
+               break;
+               /*
+                *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
+                *      nothing uses it.
+                */
+       case SOCK_RAW:
+               sock->type=SOCK_DGRAM;
+       case SOCK_DGRAM:
+               sock->ops = &unix_dgram_ops;
+               break;
+       default:
+               return -ESOCKTNOSUPPORT;
+       }
+
+       return unix_create1(sock, stream) ? 0 : -ENOMEM;
 }
 
 static int unix_release(struct socket *sock, struct socket *peer)
@@ -665,6+668,22 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
        if (addr_len < 0)
                return addr_len;
 
+       /* First of all, allocate resources.
+          If we did this after the state checks,
+          we would have to recheck everything anyway.
+        */
+
+       /*  Find listening sock */
+       other=unix_find_other(sunaddr, addr_len, sk->type, hash, &err);
+
+       /* create new sock for complete connection */
+       newsk = unix_create1(NULL, 1);
+
+       /* Allocate skb for sending to listening sock */
+       skb = NULL;
+       if (newsk)
+               skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
+
        switch (sock->state) 
        {
                case SS_UNCONNECTED:
@@ -672,37+691,25 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                        break;
                case SS_CONNECTED:
                        /* Socket is already connected */
-                       return -EISCONN;
+                       err = -EISCONN;
+                       goto out;
                default:
-                       return(-EINVAL);
+                       err = -EINVAL;
+                       goto out;
        }
 
-       /*
-        *      Now ready to connect
-        */
-        
-       sk->state=TCP_CLOSE;
-       
-       /*  Find listening sock */
-       other=unix_find_other(sunaddr, addr_len, sk->type, hash, &err);
-       if(other==NULL)
+       err = -EINVAL;
+       if (sk->state != TCP_CLOSE)
                goto out;
 
-       /* create new sock for complete connection */
-       err = unix_create1(NULL, &newsk, PF_UNIX);
-       if (newsk == NULL)
+       /* Check that listener is in valid state. */
+       err = -ECONNREFUSED;
+       if (other == NULL || other->dead || other->state != TCP_LISTEN)
                goto out;
 
-       /* Allocate skb for sending to listening sock */
-       skb=sock_alloc_send_skb(newsk, 0, 0, flags&O_NONBLOCK, &err);
-       if(skb==NULL)
-               /*
-                * if it gives EAGAIN we should give back
-                * EINPROGRESS. But this should not happen since the
-                * socket should have some writespace left (it did not
-                * allocate any memory until now)
-                */
-               goto out_release;
+       err = -ENOMEM;
+       if (newsk == NULL || skb == NULL)
+               goto out;
 
        UNIXCB(skb).attr = MSG_SYN;
 
@@ -715,7+722,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
        sk->state=TCP_ESTABLISHED;
        /* Set credentials */
        sk->peercred = other->peercred;
-       
+
        /* set up newly created sock */
        unix_peer(newsk)=sk;
        unix_lock(newsk);
@@ -738,12+745,16 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
        other->ack_backlog++;
        skb_queue_tail(&other->receive_queue,skb);
        other->data_ready(other,0);             /* Wake up !          */
-
+       unix_unlock(other);
        return 0;
 
-out_release:
-       unix_destroy_socket(newsk);
 out:
+       if (skb)
+               kfree_skb(skb);
+       if (newsk)
+               unix_destroy_socket(newsk);
+       if (other)
+               unix_unlock(other);
        return err;
 }
 
@@ -803,13+814,14 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
                        kfree_skb(skb);
                        continue;
                }
-               break;
+               tsk = skb->sk;
+               sk->ack_backlog--;
+               kfree_skb(skb);
+               if (!tsk->dead) 
+                       break;
+               unix_release_sock(tsk);
        }
 
-       tsk=skb->sk;
-       sk->ack_backlog--;
-       unix_unlock(sk);        /* No longer locked to master      */
-       kfree_skb(skb);
 
        /* attach accepted sock to socket */
        newsock->state=SS_CONNECTED;
@@ -1015,8+1027,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len,
                size=len-sent;
 
                /* Keep two messages in the pipe so it schedules better */
-               if (size > (sk->sndbuf - sizeof(struct sk_buff)) / 2)
-                       size = (sk->sndbuf - sizeof(struct sk_buff)) / 2;
+               if (size > sk->sndbuf/2 - 16)
+                       size = sk->sndbuf/2 - 16;
 
                /*
                 *      Keep to page sized kmalloc()'s as various people
@@ -1024,8+1036,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len,
                 *      much.
                 */
 
-               if (size > 3500)
-                       limit = 3500;   /* Fall back to a page if we can't grab a big buffer this instant */
+               if (size > 4096-16)
+                       limit = 4096-16; /* Fall back to a page if we can't grab a big buffer this instant */
                else
                        limit = 0;      /* Otherwise just grab and wait */
 
@@ -1056,8+1068,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len,
                if (scm->fp)
                        unix_attach_fds(scm, skb);
 
-               /* N.B. this could fail with -EFAULT */
-               memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size);
+               if (memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) {
+                       kfree_skb(skb);
+                       if (sent)
+                               goto out;
+                       return -EFAULT;
+               }
 
                other=unix_peer(sk);
 
@@ -1247,8+1263,12 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, int size
                }
 
                chunk = min(skb->len, size);
-               /* N.B. This could fail with -EFAULT */
-               memcpy_toiovec(msg->msg_iov, skb->data, chunk);
+               if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
+                       skb_queue_head(&sk->receive_queue, skb);
+                       if (copied == 0)
+                               copied = -EFAULT;
+                       break;
+               }
                copied += chunk;
                size -= chunk;
 
@@ -1299,28+1319,20 @@ static int unix_shutdown(struct socket *sock, int mode)
        struct sock *sk = sock->sk;
        unix_socket *other=unix_peer(sk);
        
-       mode++;
+       mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);
 
-       if (mode&SEND_SHUTDOWN)
-       {
-               sk->shutdown|=SEND_SHUTDOWN;
+       if (mode) {
+               sk->shutdown |= mode;
                sk->state_change(sk);
-               if(other && sk->type == SOCK_STREAM && other->state != TCP_LISTEN)
-               {
-                       if (unix_our_peer(sk, other))
-                               other->shutdown|=RCV_SHUTDOWN;
-                       other->state_change(other);
-               }
-       }
-       other=unix_peer(sk);
-       if(mode&RCV_SHUTDOWN)
-       {
-               sk->shutdown|=RCV_SHUTDOWN;
-               sk->state_change(sk);
-               if(other && sk->type != SOCK_DGRAM && other->state != TCP_LISTEN)
-               {
-                       if (unix_our_peer(sk, other))
-                               other->shutdown|=SEND_SHUTDOWN;
+               if (other && sk->type == SOCK_STREAM &&
+                   unix_our_peer(sk, other)) {
+                       int peer_mode = 0;      /* our shutdown bits, mirrored for the peer */
+
+                       if (mode&RCV_SHUTDOWN)
+                               peer_mode |= SEND_SHUTDOWN;
+                       if (mode&SEND_SHUTDOWN)
+                               peer_mode |= RCV_SHUTDOWN;
+                       other->shutdown |= peer_mode;   /* not `mode': directions are swapped */
                        other->state_change(other);
                }
        }
@@ -1388,12+1400,21 @@ static unsigned int unix_poll(struct file * file, struct socket *sock, poll_tabl
         * we set writable also when the other side has shut down the
         * connection. This prevents stuck sockets.
         */
-       if (sk->sndbuf - atomic_read(&sk->wmem_alloc) >= MIN_WRITE_SPACE)
+       if (sk->sndbuf - (int)atomic_read(&sk->wmem_alloc) >= MIN_WRITE_SPACE)
                        mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
 
        return mask;
 }
 
+static void unix_stream_write_space(struct sock *sk)
+{
+       if (sk->dead)  
+               return;
+       wake_up_interruptible(sk->sleep);
+       if (sk->sndbuf - (int)atomic_read(&sk->wmem_alloc) >= MIN_WRITE_SPACE)
+               sock_wake_async(sk->socket, 2);
+}
+
 #ifdef CONFIG_PROC_FS
 static int unix_read_proc(char *buffer, char **start, off_t offset,
                          int length, int *eof, void *data)
@@ -1433,7+1454,7 @@ static int unix_read_proc(char *buffer, char **start, off_t offset,
                }
                buffer[len++]='\n';
                
-               pos+=len;
+               pos = begin + len;
                if(pos<offset)
                {
                        len=0;
index 60a3581..e7f894e 100644 (file)
@@ -468,7+468,6 @@ static int x25_create(struct socket *sock, int protocol)
 
        sock->ops    = &x25_proto_ops;
        sk->protocol = protocol;
-       sk->mtu      = X25_DEFAULT_PACKET_SIZE; /* X25_PS128 */
 
        x25->t21   = sysctl_x25_call_request_timeout;
        x25->t22   = sysctl_x25_reset_request_timeout;
@@ -507,7+506,6 @@ static struct sock *x25_make_new(struct sock *osk)
        sk->sndbuf      = osk->sndbuf;
        sk->debug       = osk->debug;
        sk->state       = TCP_ESTABLISHED;
-       sk->mtu         = osk->mtu;
        sk->sleep       = osk->sleep;
        sk->zapped      = osk->zapped;
 
close