Import 2.2.4pre6
author Linus Torvalds <torvalds@linuxfoundation.org>
Fri, 23 Nov 2007 20:18:25 +0000 (15:18 -0500)
committer Linus Torvalds <torvalds@linuxfoundation.org>
Fri, 23 Nov 2007 20:18:25 +0000 (15:18 -0500)
121 files changed:
Documentation/Configure.help
Documentation/cdrom/cdrom-standard.tex
Documentation/filesystems/affs.txt
arch/i386/kernel/bios32.c
arch/sparc/defconfig
arch/sparc/kernel/process.c
arch/sparc/kernel/sparc_ksyms.c
arch/sparc/kernel/sys_sparc.c
arch/sparc/kernel/systbls.S
arch/sparc/lib/Makefile
arch/sparc/lib/lshrdi3.S [new file with mode: 0644]
arch/sparc/mm/init.c
arch/sparc/mm/srmmu.c
arch/sparc64/defconfig
arch/sparc64/kernel/binfmt_aout32.c
arch/sparc64/kernel/cpu.c
arch/sparc64/kernel/process.c
arch/sparc64/kernel/systbls.S
drivers/block/genhd.c
drivers/cdrom/cdrom.c
drivers/net/3c527.c [new file with mode: 0644]
drivers/net/3c527.h [new file with mode: 0644]
drivers/net/Config.in
drivers/net/Makefile
drivers/net/a2065.c
drivers/net/sunlance.c
drivers/sbus/audio/audio.c
drivers/sound/es1370.c
drivers/sound/midi_synth.c
fs/affs/Changes
fs/sysv/ialloc.c
fs/sysv/inode.c
fs/sysv/namei.c
include/asm-sparc/checksum.h
include/asm-sparc/floppy.h
include/asm-sparc/mmu_context.h
include/asm-sparc64/floppy.h
include/linux/cdrom.h
include/linux/in.h
include/linux/netdevice.h
include/linux/pkt_cls.h
include/linux/rtnetlink.h
include/linux/sysctl.h
include/linux/sysv_fs.h
include/net/addrconf.h
include/net/dst.h
include/net/ip.h
include/net/ip6_fib.h
include/net/ip_fib.h
include/net/ipv6.h
include/net/neighbour.h
include/net/pkt_cls.h
include/net/pkt_sched.h
include/net/route.h
include/net/sock.h
kernel/signal.c
kernel/softirq.c
net/core/dev.c
net/core/dev_mcast.c
net/core/filter.c
net/core/neighbour.c
net/core/rtnetlink.c
net/core/sock.c
net/ipv4/Config.in
net/ipv4/af_inet.c
net/ipv4/arp.c
net/ipv4/devinet.c
net/ipv4/fib_frontend.c
net/ipv4/fib_hash.c
net/ipv4/fib_rules.c
net/ipv4/fib_semantics.c
net/ipv4/icmp.c
net/ipv4/igmp.c
net/ipv4/ip_forward.c
net/ipv4/ip_fragment.c
net/ipv4/ip_gre.c
net/ipv4/ip_input.c
net/ipv4/ip_nat_dumb.c
net/ipv4/ip_options.c
net/ipv4/ip_output.c
net/ipv4/ip_sockglue.c
net/ipv4/ipip.c
net/ipv4/ipmr.c
net/ipv4/route.c
net/ipv4/tcp_ipv4.c
net/ipv4/udp.c
net/ipv6/addrconf.c
net/ipv6/icmp.c
net/ipv6/ip6_fib.c
net/ipv6/ip6_output.c
net/ipv6/ipv6_sockglue.c
net/ipv6/mcast.c
net/ipv6/ndisc.c
net/ipv6/route.c
net/ipv6/sit.c
net/ipv6/tcp_ipv6.c
net/ipv6/udp.c
net/netlink/af_netlink.c
net/netsyms.c
net/packet/af_packet.c
net/sched/Config.in
net/sched/Makefile
net/sched/cls_api.c
net/sched/cls_fw.c
net/sched/cls_route.c
net/sched/cls_rsvp.h
net/sched/cls_u32.c
net/sched/estimator.c
net/sched/police.c
net/sched/sch_api.c
net/sched/sch_cbq.c
net/sched/sch_csz.c
net/sched/sch_fifo.c
net/sched/sch_generic.c
net/sched/sch_prio.c
net/sched/sch_red.c
net/sched/sch_sfq.c
net/sched/sch_tbf.c
net/sched/sch_teql.c
net/unix/af_unix.c
net/unix/sysctl_net_unix.c

index 84d7ae9..f61efe7 100644
@@ -2251,6 +2251,12 @@ CONFIG_IP_ROUTE_TOS
   you say Y here, you will be able to specify different routes for
   packets with different TOS values.
 
+IP: use FWMARK value as routing key
+CONFIG_IP_ROUTE_FWMARK
+  If you say Y here, you will be able to specify different routes for
+  packets with different FWMARK ("firewalling mark") values
+  (see ipchains(8), "-m" argument).
+
 IP: verbose route monitoring
 CONFIG_IP_ROUTE_VERBOSE
   If you say Y here, which is recommended, then the kernel will print
@@ -5103,7 +5109,7 @@ CONFIG_NET_FASTROUTE
     *** networking options: especially CONFIG*FIREWALL.      ***
 
   However, it will work with all options in CONFIG_IP_ADVANCED_ROUTER
-  section (except for CONFIG_IP_ROUTE_TOS). At the moment, few devices
+  section (except for CONFIG_IP_ROUTE_TOS&FWMARK). At the moment, few devices
   support fast switching (tulip is one of them, modified 8390 can be
   found at ftp://ftp.inr.ac.ru/ip-routing/fastroute-8390.tar.gz). 
 
index 624ff37..1bf6434 100644
 \author{David van Leeuwen\\{\normalsize\tt david@ElseWare.cistron.nl}
 \\{\footnotesize updated by Erik Andersen {\tt(andersee@debian.org)}}
 \\{\footnotesize updated by Jens Axboe {\tt(axboe@image.dk)}}}
-\date{11 January 1999}
+\date{12 March 1999}
 
 \maketitle
 
@@ -549,7 +549,9 @@ non-supported $ioctl$s are: {\it CDROMREADMODE1, CDROMREADMODE2,
   CDROMREADAUDIO, CDROMREADRAW, CDROMREADCOOKED, CDROMSEEK,
   CDROMPLAY\-BLK and CDROM\-READALL}.
 
+
 \subsection{\cdrom\ capabilities}
+\label{capability}
 
 Instead of just implementing some $ioctl$ calls, the interface in
 \cdromc\ supplies the possibility to indicate the {\em capabilities\/}
@@ -944,6 +946,13 @@ the current flags.
 \item[CDROM_CHANGER_NSLOTS] Returns the number of slots in a
   juke-box. 
 \item[CDROMRESET] Reset the drive. 
+\item[CDROM_GET_CAPABILITY] Returns the $capability$ flags for the
+  drive. Refer to section \ref{capability} for more information on
+  these flags.
+\item[CDROM_LOCKDOOR] Locks the door of the drive. $arg == \rm0$
+  unlocks the door, any other value locks it.
+\item[CDROM_DEBUG] Turns on debugging info. Only root is allowed
+  to do this. Same semantics as CDROM_LOCKDOOR.
 \end{description}
 
 \subsubsection{Device dependent $ioctl$s}
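
The ioctls documented above are reachable from user space with plain
ioctl(2). A minimal sketch, assuming a /dev/cdrom node and the
CDROM_*/CDC_* constants that this change adds to <linux/cdrom.h>;
error handling is abbreviated:

/* Sketch only: exercises CDROM_GET_CAPABILITY and CDROM_LOCKDOOR
 * as described in the text above. */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/cdrom.h>

int main(void)
{
        int fd = open("/dev/cdrom", O_RDONLY | O_NONBLOCK);
        int caps;

        if (fd < 0)
                return 1;
        caps = ioctl(fd, CDROM_GET_CAPABILITY, 0); /* capability flags */
        if (caps >= 0 && (caps & CDC_LOCK))
                ioctl(fd, CDROM_LOCKDOOR, 1);      /* arg != 0 locks */
        ioctl(fd, CDROM_LOCKDOOR, 0);              /* arg == 0 unlocks */
        close(fd);
        return 0;
}
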
index f63a6e3..543b02a 100644
@@ -151,6 +151,28 @@ Command line:
 /etc/fstab entry:
     /dev/sdb5  /amiga/Workbench    affs    noauto,user,exec,verbose 0 0
 
+IMPORTANT NOTE
+==============
+
+If you boot Windows 95 (don't know about 3.x, 98 and NT) while you
+have an Amiga harddisk connected to your PC, it will overwrite
+the bytes 0x00dc..0x00df of block 0 with garbage, thus invalidating
+the Rigid Disk Block. Sheer luck has it that this is an unused
+area of the RDB, so only the checksum doesn't match anymore.
+Linux will ignore this garbage and recognize the RDB anyway, but
+before you connect that drive to your Amiga again, you must
+restore or repair your RDB. So please do make a backup copy of it
+before booting Windows!
+
+If the damage is already done, the following should fix the RDB
+(where <disk> is the device name).
+DO AT YOUR OWN RISK:
+
+  dd if=/dev/<disk> of=rdb.tmp count=1
+  cp rdb.tmp rdb.fixed
+  dd if=/dev/zero of=rdb.fixed bs=1 seek=220 count=4
+  dd if=rdb.fixed of=/dev/<disk>
+
 Bugs, Restrictions, Caveats
 ===========================
 
@@ -185,9 +207,8 @@ system crashes while an affs partition is mounted. There's currently
 no way to fix a garbled filesystem without an Amiga (disk validator)
 or manually (who would do this?). Maybe later.
 
-A fsck.affs and mkfs.affs will probably be available in the future.
-If you mount them on system startup, you may want to tell fsck
-that the fs should not be checked (place a '0' in the sixth field
+If you mount affs partitions on system startup, you may want to tell
+fsck that the fs should not be checked (place a '0' in the sixth field
 of /etc/fstab).
 
 It's not possible to read floppy disks with a normal PC or workstation
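
The dd sequence in the note above can also be written as a few lines
of C. This is a sketch under the same assumptions (block 0 of <disk>
holds the RDB; offset 220 decimal is the trashed 0x00dc..0x00df
range), and the same DO AT YOUR OWN RISK warning applies:

/* Zero the four RDB bytes that Windows trashes; the C equivalent
 * of "dd if=/dev/zero of=... bs=1 seek=220 count=4" above. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        static const char zeros[4];     /* four zero bytes */
        int fd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s /dev/<disk>\n", argv[0]);
                return 1;
        }
        fd = open(argv[1], O_WRONLY);
        if (fd < 0 || pwrite(fd, zeros, sizeof(zeros), 220) != 4) {
                perror(argv[1]);
                return 1;
        }
        return close(fd) != 0;
}
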
index af67d50..a7018a7 100644
@@ -997,15 +997,15 @@ static void __init pcibios_fixup_peer_bridges(void)
                            l != 0x0000 && l != 0xffff) {
 #ifdef CONFIG_PCI_BIOS
                                if (pci_bios_present) {
-                                       int succ, idx = 0;
+                                       int err, idx = 0;
                                        u8 bios_bus, bios_dfn;
                                        u16 d;
                                        pcibios_read_config_word(n, i, PCI_DEVICE_ID, &d);
                                        DBG("BIOS test for %02x:%02x (%04x:%04x)\n", n, i, l, d);
-                                       while ((succ = pci_bios_find_device(l, d, idx, &bios_bus, &bios_dfn)) &&
+                                       while (!(err = pci_bios_find_device(l, d, idx, &bios_bus, &bios_dfn)) &&
                                               (bios_bus != n || bios_dfn != i))
                                                idx++;
-                                       if (!succ)
+                                       if (err)
                                                break;
                                }
 #endif
index bcb89ab..b4b94fd 100644
@@ -210,6 +210,10 @@ CONFIG_HAPPYMEAL=m
 CONFIG_SUNBMAC=m
 CONFIG_SUNQE=m
 CONFIG_MYRI_SBUS=m
+
+#
+# Unix98 PTY support
+#
 CONFIG_UNIX98_PTYS=y
 CONFIG_UNIX98_PTY_COUNT=256
 
index 573153c..306e96b 100644
@@ -1,4 +1,4 @@
-/*  $Id: process.c,v 1.131 1999/01/19 07:54:33 davem Exp $
+/*  $Id: process.c,v 1.132 1999/03/22 02:12:13 davem Exp $
  *  linux/arch/sparc/kernel/process.c
  *
  *  Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
@@ -442,10 +442,17 @@ clone_stackframe(struct sparc_stackf *dst, struct sparc_stackf *src)
        size = ((unsigned long)src->fp) - ((unsigned long)src);
        sp = (struct sparc_stackf *)(((unsigned long)dst) - size); 
 
+       /* do_fork() grabs the parent semaphore, we must release it
+        * temporarily so we can build the child clone stack frame
+        * without deadlocking.
+        */
+       up(&current->mm->mmap_sem);
        if (copy_to_user(sp, src, size))
-               return 0;
-       if (put_user(dst, &sp->fp))
-               return 0;
+               sp = (struct sparc_stackf *) 0;
+       else if (put_user(dst, &sp->fp))
+               sp = (struct sparc_stackf *) 0;
+       down(&current->mm->mmap_sem);
+
        return sp;
 }
 
index b199418..b043d64 100644
@@ -1,4 +1,4 @@
-/* $Id: sparc_ksyms.c,v 1.76 1999/01/29 02:06:54 davem Exp $
+/* $Id: sparc_ksyms.c,v 1.77 1999/03/21 06:37:43 davem Exp $
  * arch/sparc/kernel/ksyms.c: Sparc specific ksyms support.
  *
  * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
@@ -66,6 +66,7 @@ extern char saved_command_line[];
 
 extern void bcopy (const char *, char *, int);
 extern int __ashrdi3(int, int);
+extern int __lshrdi3(int, int);
 
 extern void dump_thread(struct pt_regs *, struct user *);
 
@@ -271,6 +272,7 @@ EXPORT_SYMBOL_NOVERS(memcpy);
 EXPORT_SYMBOL_NOVERS(memset);
 EXPORT_SYMBOL_NOVERS(memmove);
 EXPORT_SYMBOL_NOVERS(__ashrdi3);
+EXPORT_SYMBOL_NOVERS(__lshrdi3);
 
 EXPORT_SYMBOL_DOT(rem);
 EXPORT_SYMBOL_DOT(urem);
index 341faf4..ab1515f 100644
@@ -1,4 +1,4 @@
-/* $Id: sys_sparc.c,v 1.50 1999/01/07 19:06:57 jj Exp $
+/* $Id: sys_sparc.c,v 1.51 1999/03/20 22:02:00 davem Exp $
  * linux/arch/sparc/kernel/sys_sparc.c
  *
  * This file contains various random system calls that
@@ -191,6 +191,7 @@ asmlinkage unsigned long sys_mmap(unsigned long addr, unsigned long len,
                        goto out;
        }
        retval = -ENOMEM;
+       len = PAGE_ALIGN(len);
        if(!(flags & MAP_FIXED) && !addr) {
                addr = get_unmapped_area(addr, len);
                if(!addr)
@@ -204,6 +205,7 @@ asmlinkage unsigned long sys_mmap(unsigned long addr, unsigned long len,
 
        if(ARCH_SUN4C_SUN4) {
                if(((addr >= 0x20000000) && (addr < 0xe0000000))) {
+                       /* VM hole */
                        retval = current->mm->brk;
                        goto out_putf;
                }
index 28fd1fc..588bcfc 100644
@@ -1,4 +1,4 @@
-/* $Id: systbls.S,v 1.81 1999/03/12 13:30:15 jj Exp $
+/* $Id: systbls.S,v 1.82 1999/03/20 22:01:59 davem Exp $
  * systbls.S: System call entry point tables for OS compatibility.
  *            The native Linux system call table lives here also.
  *
@@ -97,7 +97,7 @@ sunos_sys_table:
        .long sunos_nosys, sys_symlink, sys_readlink
        .long sys_execve, sys_umask, sys_chroot
        .long sys_newfstat, sunos_nosys, sys_getpagesize
-       .long sys_msync, sys_fork, sunos_nosys
+       .long sys_msync, sys_vfork, sunos_nosys
        .long sunos_nosys, sunos_sbrk, sunos_sstk
        .long sunos_mmap, sunos_vadvise, sys_munmap
        .long sys_mprotect, sunos_madvise, sys_vhangup
index 9ca26bb..d5b4754 100644
-# $Id: Makefile,v 1.27 1999/01/02 16:45:45 davem Exp $
+# $Id: Makefile,v 1.28 1999/03/21 06:37:44 davem Exp $
 # Makefile for Sparc library files..
 #
 
 OBJS  = mul.o rem.o sdiv.o udiv.o umul.o urem.o ashrdi3.o memcpy.o memset.o \
         strlen.o checksum.o blockops.o memscan.o memcmp.o strncmp.o \
        strncpy_from_user.o divdi3.o udivdi3.o strlen_user.o \
-       copy_user.o locks.o atomic.o bitops.o debuglocks.o
+       copy_user.o locks.o atomic.o bitops.o debuglocks.o lshrdi3.o
 
 ifdef CONFIG_SMP
 OBJS += irqlock.o
@@ -89,6 +89,9 @@ urem.o: urem.S
 ashrdi3.o: ashrdi3.S
        $(CC) -D__ASSEMBLY__ -c -o ashrdi3.o ashrdi3.S
 
+lshrdi3.o: lshrdi3.S
+       $(CC) -D__ASSEMBLY__ -c -o lshrdi3.o lshrdi3.S
+
 dep:
 
 include $(TOPDIR)/Rules.make
diff --git a/arch/sparc/lib/lshrdi3.S b/arch/sparc/lib/lshrdi3.S
new file mode 100644
index 0000000..f5300a1
--- /dev/null
@@ -0,0 +1,29 @@
+/* $Id: lshrdi3.S,v 1.1 1999/03/21 06:37:45 davem Exp $ */
+
+#include <asm/cprefix.h>
+
+       .globl  C_LABEL(__lshrdi3)
+C_LABEL(__lshrdi3):
+       cmp     %o2, 0
+       be      3f
+        mov    0x20, %g2
+
+       sub     %g2, %o2, %g2
+       cmp     %g2, 0
+       bg      1f
+        srl    %o0, %o2, %o4
+
+       clr     %o4
+       neg     %g2
+       b       2f
+        srl    %o0, %g2, %o5
+1:
+       sll  %o0, %g2, %g3
+       srl  %o1, %o2, %g2
+       or  %g2, %g3, %o5
+2:
+       mov  %o4, %o0
+       mov  %o5, %o1
+3:
+       retl 
+        nop 
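
For readers not fluent in SPARC assembly: the routine implements the
usual libgcc __lshrdi3 semantics, a 64-bit logical right shift built
from 32-bit shifts on a high/low register pair (%o0 high, %o1 low,
%o2 the shift count). A C sketch of the same three cases; the
function name below is made up for illustration:

/* Mirrors the assembly above: count == 0 returns the value as-is,
 * count < 32 spills bits from the high word into the low word,
 * count >= 32 moves the (shifted) high word into the low word.
 * Counts above 63 are undefined, as in the assembly. */
unsigned long long lshrdi3_sketch(unsigned long long v, unsigned int count)
{
        unsigned int hi = (unsigned int)(v >> 32);
        unsigned int lo = (unsigned int)v;

        if (count == 0)                 /* the "be 3f" case */
                return v;
        if (count < 32) {               /* the "bg 1f" case */
                lo = (lo >> count) | (hi << (32 - count));
                hi >>= count;
        } else {                        /* the clr/neg path */
                lo = hi >> (count - 32);
                hi = 0;
        }
        return ((unsigned long long)hi << 32) | lo;
}
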
index da7f3c0..ae57866 100644
@@ -1,4 +1,4 @@
-/*  $Id: init.c,v 1.62 1999/01/07 14:13:00 jj Exp $
+/*  $Id: init.c,v 1.63 1999/03/20 22:02:01 davem Exp $
  *  linux/arch/sparc/mm/init.c
  *
  *  Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
@@ -330,11 +330,18 @@ __initfunc(void mem_init(unsigned long start_mem, unsigned long end_mem))
               initpages << (PAGE_SHIFT-10),
               (unsigned long)PAGE_OFFSET, end_mem);
 
-       freepages.min = nr_free_pages >> 7;
-       if(freepages.min < 16)
-               freepages.min = 16;
-       freepages.low = freepages.min + (freepages.min >> 1);
-       freepages.high = freepages.min + freepages.min;
+       /* NOTE NOTE NOTE NOTE
+        * Please keep track of things and make sure this
+        * always matches the code in mm/page_alloc.c -DaveM
+        */
+       i = nr_free_pages >> 7;
+       if (i < 48)
+               i = 48;
+       if (i > 256)
+               i = 256;
+       freepages.min = i;
+       freepages.low = i << 1;
+       freepages.high = freepages.low + i;
 }
 
 void free_initmem (void)
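
The new sizing above is a clamp of nr_free_pages >> 7 into [48, 256],
with low and high as fixed multiples of min. A stand-alone sketch
with an assumed example value:

/* Stand-alone rendering of the freepages sizing above; 16384 free
 * 4K pages (64MB) is an assumed example, not a value from the patch. */
#include <stdio.h>

int main(void)
{
        unsigned long nr_free_pages = 16384, i;

        i = nr_free_pages >> 7;         /* 128 for the example */
        if (i < 48)
                i = 48;
        if (i > 256)
                i = 256;
        /* min=128, low=256, high=384 for the example value */
        printf("min=%lu low=%lu high=%lu\n", i, i << 1, (i << 1) + i);
        return 0;
}
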
index 991e806..bd87b02 100644
@@ -1,4 +1,4 @@
-/* $Id: srmmu.c,v 1.183 1999/03/16 11:36:16 davem Exp $
+/* $Id: srmmu.c,v 1.184 1999/03/20 22:02:03 davem Exp $
  * srmmu.c:  SRMMU specific routines for memory management.
  *
  * Copyright (C) 1995 David S. Miller  (davem@caip.rutgers.edu)
@@ -465,7 +465,8 @@ static inline pte_t *srmmu_s_pte_offset(pmd_t * dir, unsigned long address)
 /* This must update the context table entry for this process. */
 static void srmmu_update_rootmmu_dir(struct task_struct *tsk, pgd_t *pgdp) 
 {
-       if(tsk->mm->context != NO_CONTEXT) {
+       if(tsk->mm->context != NO_CONTEXT &&
+          tsk->mm->pgd != pgdp) {
                flush_cache_mm(tsk->mm);
                ctxd_set(&srmmu_context_table[tsk->mm->context], pgdp);
                flush_tlb_mm(tsk->mm);
@@ -816,13 +817,19 @@ static inline void free_context(int context)
 
 static void srmmu_switch_to_context(struct task_struct *tsk)
 {
+       int set = 0;
+
        if(tsk->mm->context == NO_CONTEXT) {
                alloc_context(tsk->mm);
                flush_cache_mm(tsk->mm);
                ctxd_set(&srmmu_context_table[tsk->mm->context], tsk->mm->pgd);
                flush_tlb_mm(tsk->mm);
-       }
-       srmmu_set_context(tsk->mm->context);
+               set = 1;
+       } else if(tsk->mm != current->mm)
+               set = 1;
+
+       if(set != 0)
+               srmmu_set_context(tsk->mm->context);
 }
 
 static void srmmu_init_new_context(struct mm_struct *mm)
@@ -1335,7 +1342,8 @@ static void hypersparc_update_rootmmu_dir(struct task_struct *tsk, pgd_t *pgdp)
        if(pgdp != swapper_pg_dir)
                hypersparc_flush_page_to_ram(page);
 
-       if(tsk->mm->context != NO_CONTEXT) {
+       if(tsk->mm->context != NO_CONTEXT &&
+          tsk->mm->pgd != pgdp) {
                flush_cache_mm(tsk->mm);
                ctxd_set(&srmmu_context_table[tsk->mm->context], pgdp);
                flush_tlb_mm(tsk->mm);
@@ -1344,8 +1352,10 @@ static void hypersparc_update_rootmmu_dir(struct task_struct *tsk, pgd_t *pgdp)
 
 static void viking_update_rootmmu_dir(struct task_struct *tsk, pgd_t *pgdp) 
 {
-       viking_flush_page((unsigned long)pgdp);
-       if(tsk->mm->context != NO_CONTEXT) {
+       if(pgdp != swapper_pg_dir)
+               viking_flush_page((unsigned long)pgdp);
+       if(tsk->mm->context != NO_CONTEXT &&
+          tsk->mm->pgd != pgdp) {
                flush_cache_mm(tsk->mm);
                ctxd_set(&srmmu_context_table[tsk->mm->context], pgdp);
                flush_tlb_mm(tsk->mm);
@@ -1358,6 +1368,9 @@ static void cypress_update_rootmmu_dir(struct task_struct *tsk, pgd_t *pgdp)
        unsigned long page = ((unsigned long) pgdp) & PAGE_MASK;
        unsigned long line;
 
+       if(pgdp == swapper_pg_dir)
+               goto skip_flush;
+
        a = 0x20; b = 0x40; c = 0x60; d = 0x80; e = 0xa0; f = 0xc0; g = 0xe0;
        page &= PAGE_MASK;
        line = (page + PAGE_SIZE) - 0x100;
@@ -1378,8 +1391,9 @@ static void cypress_update_rootmmu_dir(struct task_struct *tsk, pgd_t *pgdp)
                                     "r" (a), "r" (b), "r" (c), "r" (d),
                                     "r" (e), "r" (f), "r" (g));
        } while(line != page);
-
-       if(tsk->mm->context != NO_CONTEXT) {
+skip_flush:
+       if(tsk->mm->context != NO_CONTEXT &&
+          tsk->mm->pgd != pgdp) {
                flush_cache_mm(tsk->mm);
                ctxd_set(&srmmu_context_table[tsk->mm->context], pgdp);
                flush_tlb_mm(tsk->mm);
@@ -1388,6 +1402,8 @@ static void cypress_update_rootmmu_dir(struct task_struct *tsk, pgd_t *pgdp)
 
 static void hypersparc_switch_to_context(struct task_struct *tsk)
 {
+       int set = 0;
+
        if(tsk->mm->context == NO_CONTEXT) {
                ctxd_t *ctxp;
 
@@ -1395,9 +1411,14 @@ static void hypersparc_switch_to_context(struct task_struct *tsk)
                ctxp = &srmmu_context_table[tsk->mm->context];
                srmmu_set_entry((pte_t *)ctxp, __pte((SRMMU_ET_PTD | (srmmu_v2p((unsigned long) tsk->mm->pgd) >> 4))));
                hypersparc_flush_page_to_ram((unsigned long)ctxp);
+               set = 1;
+       } else if(tsk->mm != current->mm)
+               set = 1;
+
+       if(set != 0) {
+               hyper_flush_whole_icache();
+               srmmu_set_context(tsk->mm->context);
        }
-       hyper_flush_whole_icache();
-       srmmu_set_context(tsk->mm->context);
 }
 
 static void hypersparc_init_new_context(struct mm_struct *mm)
@@ -1410,9 +1431,10 @@ static void hypersparc_init_new_context(struct mm_struct *mm)
        srmmu_set_entry((pte_t *)ctxp, __pte((SRMMU_ET_PTD | (srmmu_v2p((unsigned long) mm->pgd) >> 4))));
        hypersparc_flush_page_to_ram((unsigned long)ctxp);
 
-       hyper_flush_whole_icache();
-       if(mm == current->mm)
+       if(mm == current->mm) {
+               hyper_flush_whole_icache();
                srmmu_set_context(mm->context);
+       }
 }
 
 static unsigned long mempool;
@@ -2022,6 +2044,11 @@ static void srmmu_update_mmu_cache(struct vm_area_struct * vma, unsigned long ad
 static void srmmu_destroy_context(struct mm_struct *mm)
 {
        if(mm->context != NO_CONTEXT && atomic_read(&mm->count) == 1) {
+               /* XXX This could be drastically improved.
+                * XXX We are only called from __exit_mm and it just did
+                * XXX cache/tlb mm flush and right after this will (re-)
+                * XXX SET_PAGE_DIR to swapper_pg_dir.  -DaveM
+                */
                flush_cache_mm(mm);
                ctxd_set(&srmmu_context_table[mm->context], swapper_pg_dir);
                flush_tlb_mm(mm);
@@ -2680,15 +2707,8 @@ __initfunc(static void init_viking(void))
 
        /* Ahhh, the viking.  SRMMU VLSI abortion number two... */
        if(mreg & VIKING_MMODE) {
-               unsigned long bpreg;
-
                srmmu_name = "TI Viking";
                viking_mxcc_present = 0;
-
-               bpreg = viking_get_bpreg();
-               bpreg &= ~(VIKING_ACTION_MIX);
-               viking_set_bpreg(bpreg);
-
                msi_set_sync();
 
                BTFIXUPSET_CALL(set_pte, srmmu_set_pte_nocache_viking, BTFIXUPCALL_NORM);
index 6c259f2..58a39c4 100644
@@ -247,6 +247,10 @@ CONFIG_SUNQE=m
 CONFIG_MYRI_SBUS=m
 CONFIG_DE4X5=m
 CONFIG_VORTEX=m
+
+#
+# Unix 98 PTY support
+#
 CONFIG_UNIX98_PTYS=y
 CONFIG_UNIX98_PTY_COUNT=256
 
index dc898e3..9849c23 100644
@@ -9,7 +9,6 @@
 
 #include <linux/module.h>
 
-#include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/errno.h>
 #include <linux/signal.h>
 #include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/file.h>
 #include <linux/stat.h>
 #include <linux/fcntl.h>
-#include <linux/file.h>
 #include <linux/ptrace.h>
 #include <linux/user.h>
 #include <linux/malloc.h>
@@ -58,14 +58,13 @@ static void set_brk(unsigned long start, unsigned long end)
  * macros to write out all the necessary info.
  */
 #define DUMP_WRITE(addr,nr) \
-while (file.f_op->write(&file,(char *)(addr),(nr),&file.f_pos) != (nr)) \
-       goto close_coredump
+while (file->f_op->write(file,(char *)(addr),(nr),&file->f_pos) != (nr)) goto close_coredump
 
 #define DUMP_SEEK(offset) \
-if (file.f_op->llseek) { \
-       if (file.f_op->llseek(&file,(offset),0) != (offset)) \
+if (file->f_op->llseek) { \
+       if (file->f_op->llseek(file,(offset),0) != (offset)) \
                goto close_coredump; \
-} else file.f_pos = (offset)
+} else file->f_pos = (offset)
 
 /*
  * Routine writes a core dump image in the current directory.
@@ -82,7 +81,7 @@ do_aout32_core_dump(long signr, struct pt_regs * regs)
 {
        struct dentry * dentry = NULL;
        struct inode * inode = NULL;
-       struct file file;
+       struct file * file;
        mm_segment_t fs;
        int has_dumped = 0;
        char corefile[6+sizeof(current->comm)];
@@ -106,29 +105,16 @@ do_aout32_core_dump(long signr, struct pt_regs * regs)
 #else
        corefile[4] = '\0';
 #endif
-       dentry = open_namei(corefile,O_CREAT | 2 | O_TRUNC | O_NOFOLLOW, 0600);
-       if (IS_ERR(dentry)) {
-               dentry = NULL;
+       file = filp_open(corefile,O_CREAT | 2 | O_TRUNC | O_NOFOLLOW, 0600);
+       if (IS_ERR(file))
                goto end_coredump;
-       }
+       dentry = file->f_dentry;
        inode = dentry->d_inode;
        if (!S_ISREG(inode->i_mode))
-               goto end_coredump;
+               goto close_coredump;
        if (!inode->i_op || !inode->i_op->default_file_ops)
-               goto end_coredump;
-       if (get_write_access(inode))
-               goto end_coredump;
-       file.f_mode = 3;
-       file.f_flags = 0;
-       file.f_count = 1;
-       file.f_dentry = dentry;
-       file.f_pos = 0;
-       file.f_reada = 0;
-       file.f_op = inode->i_op->default_file_ops;
-       if (file.f_op->open)
-               if (file.f_op->open(inode,&file))
-                       goto done_coredump;
-       if (!file.f_op->write)
+               goto close_coredump;
+       if (!file->f_op->write)
                goto close_coredump;
        has_dumped = 1;
        current->flags |= PF_DUMPCORE;
@@ -175,13 +161,9 @@ do_aout32_core_dump(long signr, struct pt_regs * regs)
        set_fs(KERNEL_DS);
        DUMP_WRITE(current,sizeof(*current));
 close_coredump:
-       if (file.f_op->release)
-               file.f_op->release(inode,&file);
-done_coredump:
-       put_write_access(inode);
+       close_fp(file, NULL);
 end_coredump:
        set_fs(fs);
-       dput(dentry);
        return has_dumped;
 }
 
@@ -269,7 +251,6 @@ static inline int do_load_aout32_binary(struct linux_binprm * bprm,
                return -ENOEXEC;
        }
 
-       current->personality = PER_LINUX;
        fd_offset = N_TXTOFF(ex);
 
        /* Check initial limits. This avoids letting people circumvent
@@ -288,6 +269,8 @@ static inline int do_load_aout32_binary(struct linux_binprm * bprm,
                return retval;
 
        /* OK, This is the point of no return */
+       current->personality = PER_LINUX;
+
        current->mm->end_code = ex.a_text +
                (current->mm->start_code = N_TXTADDR(ex));
        current->mm->end_data = ex.a_data +
@@ -297,8 +280,7 @@ static inline int do_load_aout32_binary(struct linux_binprm * bprm,
 
        current->mm->rss = 0;
        current->mm->mmap = NULL;
-       current->suid = current->euid = current->fsuid = bprm->e_uid;
-       current->sgid = current->egid = current->fsgid = bprm->e_gid;
+       compute_creds(bprm);
        current->flags &= ~PF_FORKNOEXEC;
        if (N_MAGIC(ex) == NMAGIC) {
                /* Fuck me plenty... */
@@ -404,48 +386,44 @@ static inline int
 do_load_aout32_library(int fd)
 {
         struct file * file;
-       struct exec ex;
-       struct dentry * dentry;
        struct inode * inode;
-       unsigned int len;
-       unsigned int bss;
-       unsigned int start_addr;
+       unsigned long bss, start_addr, len;
        unsigned long error;
+       int retval;
+       loff_t offset = 0;
+       struct exec ex;
 
-       file = fcheck(fd);
-
-       if (!file || !file->f_op)
-               return -EACCES;
-
-       dentry = file->f_dentry;
-       inode = dentry->d_inode;
-
-       /* Seek into the file */
-       if (file->f_op->llseek) {
-               if ((error = file->f_op->llseek(file, 0, 0)) != 0)
-                       return -ENOEXEC;
-       } else
-               file->f_pos = 0;
+       retval = -EACCES;
+       file = fget(fd);
+       if (!file)
+               goto out;
+       if (!file->f_op)
+               goto out_putf;
+       inode = file->f_dentry->d_inode;
 
+       retval = -ENOEXEC;
+       /* N.B. Save current fs? */
        set_fs(KERNEL_DS);
-       error = file->f_op->read(file, (char *) &ex, sizeof(ex), &file->f_pos);
+       error = file->f_op->read(file, (char *) &ex, sizeof(ex), &offset);
        set_fs(USER_DS);
        if (error != sizeof(ex))
-               return -ENOEXEC;
+               goto out_putf;
 
        /* We come in here for the regular a.out style of shared libraries */
        if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) ||
            N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) ||
            inode->i_size < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
-               return -ENOEXEC;
+               goto out_putf;
        }
+
        if (N_MAGIC(ex) == ZMAGIC && N_TXTOFF(ex) &&
            (N_TXTOFF(ex) < inode->i_sb->s_blocksize)) {
                printk("N_TXTOFF < BLOCK_SIZE. Please convert library\n");
-               return -ENOEXEC;
+               goto out_putf;
        }
 
-       if (N_FLAGS(ex)) return -ENOEXEC;
+       if (N_FLAGS(ex))
+               goto out_putf;
 
        /* For  QMAGIC, the starting address is 0x20 into the page.  We mask
           this off to get the starting address for the page */
@@ -457,18 +435,26 @@ do_load_aout32_library(int fd)
                        PROT_READ | PROT_WRITE | PROT_EXEC,
                        MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE,
                        N_TXTOFF(ex));
+       retval = error;
        if (error != start_addr)
-               return error;
+               goto out_putf;
+
        len = PAGE_ALIGN(ex.a_text + ex.a_data);
        bss = ex.a_text + ex.a_data + ex.a_bss;
        if (bss > len) {
-               error = do_mmap(NULL, start_addr + len, bss-len,
-                               PROT_READ|PROT_WRITE|PROT_EXEC,
-                               MAP_PRIVATE|MAP_FIXED, 0);
+               error = do_mmap(NULL, start_addr + len, bss - len,
+                               PROT_READ | PROT_WRITE | PROT_EXEC,
+                               MAP_PRIVATE | MAP_FIXED, 0);
+               retval = error;
                if (error != start_addr + len)
-                       return error;
+                       goto out_putf;
        }
-       return 0;
+       retval = 0;
+
+out_putf:
+       fput(file);
+out:
+       return retval;
 }
 
 static int
index d8b1df2..a6d751e 100644
@@ -88,7 +88,7 @@ __initfunc(void cpu_probe(void))
        if(i==NSPARCCHIPS) {
                printk("DEBUG: manuf = 0x%x   impl = 0x%x\n", manuf, 
                            impl);
-               sparc_cpu_type[cpuid] = "Unknow CPU";
+               sparc_cpu_type[cpuid] = "Unknown CPU";
        }
 
        for(i = 0; i<NSPARCFPU; i++) {
index 19d9367..c255ad5 100644
@@ -1,4 +1,4 @@
-/*  $Id: process.c,v 1.89 1999/01/19 07:54:39 davem Exp $
+/*  $Id: process.c,v 1.90 1999/03/22 02:12:16 davem Exp $
  *  arch/sparc64/kernel/process.c
  *
  *  Copyright (C) 1995, 1996 David S. Miller (davem@caip.rutgers.edu)
@@ -456,6 +456,11 @@ static unsigned long clone_stackframe(unsigned long csp, unsigned long psp)
 {
        unsigned long fp, distance, rval;
 
+       /* do_fork() grabs the parent semaphore, we must release it
+        * temporarily so we can build the child clone stack frame
+        * without deadlocking.
+        */
+       up(&current->mm->mmap_sem);
        if(!(current->tss.flags & SPARC_FLAG_32BIT)) {
                csp += STACK_BIAS;
                psp += STACK_BIAS;
@@ -472,17 +477,20 @@
        distance = fp - psp;
        rval = (csp - distance);
        if(copy_in_user(rval, psp, distance))
-               return 0;
-       if(current->tss.flags & SPARC_FLAG_32BIT) {
+               rval = 0;
+       else if(current->tss.flags & SPARC_FLAG_32BIT) {
                if(put_user(((u32)csp), &(((struct reg_window32 *)rval)->ins[6])))
-                       return 0;
-               return rval;
+                       rval = 0;
        } else {
                if(put_user(((u64)csp - STACK_BIAS),
                            &(((struct reg_window *)rval)->ins[6])))
-                       return 0;
-               return rval - STACK_BIAS;
+                       rval = 0;
+               else
+                       rval = rval - STACK_BIAS;
        }
+       down(&current->mm->mmap_sem);
+
+       return rval;
 }
 
 /* Standard stuff. */
index ef91e1f..607aa36 100644
@@ -1,4 +1,4 @@
-/* $Id: systbls.S,v 1.51 1999/03/12 13:30:24 jj Exp $
+/* $Id: systbls.S,v 1.52 1999/03/20 22:02:05 davem Exp $
  * systbls.S: System call entry point tables for OS compatibility.
  *            The native Linux system call table lives here also.
  *
@@ -156,7 +156,7 @@ sunos_sys_table:
        .word sunos_nosys, sys_symlink, sys_readlink
        .word sys32_execve, sys_umask, sys_chroot
        .word sys32_newfstat, sunos_nosys, sys_getpagesize
-       .word sys_msync, sys_fork, sunos_nosys
+       .word sys_msync, sys_vfork, sunos_nosys
        .word sunos_nosys, sunos_sbrk, sunos_sstk
        .word sunos_mmap, sunos_vadvise, sys_munmap
        .word sys_mprotect, sunos_madvise, sys_vhangup
index 89424cd..8b66c76 100644
@@ -867,19 +867,20 @@ amiga_partition(struct gendisk *hd, kdev_t dev, unsigned long first_sector)
        int                      nr_sects;
        int                      blk;
        int                      part, res;
+       int                      old_blocksize;
+       int                      blocksize;
 
-       /*
-        *      Don't bother touching M/O 2K media.
-        */
-        
-       if (get_ptable_blocksize(dev) != 1024)
-               return 0;
-               
-       set_blocksize(dev,512);
+       old_blocksize = get_ptable_blocksize(dev);
+       if (hardsect_size[MAJOR(dev)] != NULL)
+               blocksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
+       else
+               blocksize = 512;
+
+       set_blocksize(dev,blocksize);
        res = 0;
 
        for (blk = 0; blk < RDB_ALLOCATION_LIMIT; blk++) {
-               if(!(bh = bread(dev,blk,512))) {
+               if(!(bh = bread(dev,blk,blocksize))) {
                        printk("Dev %s: unable to read RDB block %d\n",
                               kdevname(dev),blk);
                        goto rdb_done;
@@ -887,16 +888,25 @@ amiga_partition(struct gendisk *hd, kdev_t dev, unsigned long first_sector)
                if (*(u32 *)bh->b_data == htonl(IDNAME_RIGIDDISK)) {
                        rdb = (struct RigidDiskBlock *)bh->b_data;
                        if (checksum_block((u32 *)bh->b_data,htonl(rdb->rdb_SummedLongs) & 0x7F)) {
-                               printk("Dev %s: RDB in block %d has bad checksum\n",
-                                      kdevname(dev),blk);
-                               brelse(bh);
-                               continue;
+                               /* Try again with 0xdc..0xdf zeroed, Windows might have
+                                * trashed it.
+                                */
+                               *(u32 *)(&bh->b_data[0xdc]) = 0;
+                               if (checksum_block((u32 *)bh->b_data,
+                                               htonl(rdb->rdb_SummedLongs) & 0x7F)) {
+                                       brelse(bh);
+                                       printk("Dev %s: RDB in block %d has bad checksum\n",
+                                              kdevname(dev),blk);
+                                       continue;
+                               }
+                               printk("Warning: Trashed word at 0xd0 in block %d "
+                                       "ignored in checksum calculation\n",kdevname(dev),blk);
                        }
                        printk(" RDSK");
                        blk = htonl(rdb->rdb_PartitionList);
                        brelse(bh);
                        for (part = 1; blk > 0 && part <= 16; part++) {
-                               if (!(bh = bread(dev,blk, 512))) {
+                               if (!(bh = bread(dev,blk,blocksize))) {
                                        printk("Dev %s: unable to read partition block %d\n",
                                                       kdevname(dev),blk);
                                        goto rdb_done;
@@ -929,11 +939,7 @@ amiga_partition(struct gendisk *hd, kdev_t dev, unsigned long first_sector)
        }
 
 rdb_done:
-       /*
-        *      FIXME: should restore the original size. Then we could clean
-        *      up the M/O skip. Amiga people ?
-        */
-       set_blocksize(dev,BLOCK_SIZE);
+       set_blocksize(dev,old_blocksize);
        return res;
 }
 #endif /* CONFIG_AMIGA_PARTITION */
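
The retry added above relies on the Rigid Disk Block checksum rule:
the sum of the block's first rdb_SummedLongs big-endian 32-bit words
must come out to zero, so zeroing the trashed word can make an
otherwise intact block sum correctly again. A sketch of that check;
the helper is illustrative and assumed to match what checksum_block()
in genhd.c does:

/* An RDB is valid when its longwords sum to zero (big-endian). */
#include <stdint.h>
#include <arpa/inet.h>          /* ntohl() */

static uint32_t rdb_checksum(const uint32_t *block, unsigned int longs)
{
        uint32_t sum = 0;

        while (longs--)
                sum += ntohl(*block++);
        return sum;             /* nonzero means the check fails */
}
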
index 877001d..c4b0b5a 100644
   -- Added CDROM_DEBUG ioctl. Enable debug messages on-the-fly.
   -- Added CDROM_GET_CAPABILITY ioctl. This relieves userspace programs
   from parsing /proc/sys/dev/cdrom/info.
+  
+  2.54 Mar 15, 1999 - Jens Axboe <axboe@image.dk>
+  -- Check capability mask from low level driver when counting tracks as
+  per suggestion from Corey J. Scotts <cstotts@blue.weeg.uiowa.edu>.
 
 -------------------------------------------------------------------------*/
 
-#define REVISION "Revision: 2.53"
-#define VERSION "Id: cdrom.c 2.53 1999/02/22"
+#define REVISION "Revision: 2.54"
+#define VERSION "Id: cdrom.c 2.54 1999/03/15"
 
 /* I use an error-log mask to give fine grain control over the type of
    messages dumped to the system logs.  The available masks include: */
@@ -601,14 +605,17 @@ void cdrom_count_tracks(struct cdrom_device_info *cdi, tracktype* tracks)
        tracks->xa=0;
        tracks->error=0;
        cdinfo(CD_COUNT_TRACKS, "entering cdrom_count_tracks\n"); 
-        if (!(cdi->ops->capability & CDC_PLAY_AUDIO)) { 
+        if (!(cdi->ops->capability & ~cdi->mask & CDC_PLAY_AUDIO)) { 
                 tracks->error=CDS_NO_INFO;
                 return;
         }        
        /* Grab the TOC header so we can see how many tracks there are */
-       ret=cdi->ops->audio_ioctl(cdi, CDROMREADTOCHDR, &header);
+       ret = cdi->ops->audio_ioctl(cdi, CDROMREADTOCHDR, &header);
        if (ret) {
-               tracks->error=(ret == -ENOMEDIUM) ? CDS_NO_DISC : CDS_NO_INFO;
+               if (ret == -ENOMEDIUM)
+                       tracks->error = CDS_NO_DISC;
+               else
+                       tracks->error = CDS_NO_INFO;
                return;
        }       
        /* check what type of tracks are on this disc */
@@ -729,7 +736,7 @@ int cdrom_ioctl(struct inode *ip, struct file *fp,
                cdinfo(CD_DO_IOCTL, "entering CDROMEJECT\n"); 
                if (!(cdo->capability & ~cdi->mask & CDC_OPEN_TRAY))
                        return -ENOSYS;
-               if (cdi->use_count != 1)
+               if (cdi->use_count != 1 || keeplocked)
                        return -EBUSY;
                if (cdo->capability & ~cdi->mask & CDC_LOCK)
                        if ((ret=cdo->lock_door(cdi, 0)))
@@ -748,6 +755,8 @@ int cdrom_ioctl(struct inode *ip, struct file *fp,
                cdinfo(CD_DO_IOCTL, "entering CDROMEJECT_SW\n"); 
                if (!(cdo->capability & ~cdi->mask & CDC_OPEN_TRAY))
                        return -ENOSYS;
+               if (keeplocked)
+                       return -EBUSY;
                cdi->options &= ~(CDO_AUTO_CLOSE | CDO_AUTO_EJECT);
                if (arg)
                        cdi->options |= CDO_AUTO_CLOSE | CDO_AUTO_EJECT;
@@ -778,6 +787,8 @@ int cdrom_ioctl(struct inode *ip, struct file *fp,
                        if (!(cdo->capability & ~cdi->mask & CDC_LOCK))
                                return -ENOSYS;
                        break;
+               case 0:
+                       return cdi->options;
                /* default is basically CDO_[AUTO_CLOSE|AUTO_EJECT] */
                default:
                        if (!(cdo->capability & ~cdi->mask & arg))
@@ -814,31 +825,30 @@ int cdrom_ioctl(struct inode *ip, struct file *fp,
                if (!(cdo->capability & ~cdi->mask & CDC_RESET))
                        return -ENOSYS;
                return cdo->reset(cdi);
-       }
+               }
 
        case CDROM_LOCKDOOR: {
                cdinfo(CD_DO_IOCTL, "%socking door.\n",arg?"L":"Unl");
-               if (cdo->capability & ~cdi->mask & CDC_LOCK) {
+               if (!(cdo->capability & ~cdi->mask & CDC_LOCK)) {
+                       return -EDRIVE_CANT_DO_THIS;
+               } else {
                        keeplocked = arg ? 1 : 0;
                        return cdo->lock_door(cdi, arg);
-               } else
-                       return -EDRIVE_CANT_DO_THIS;
-       }
+               }
+               }
 
        case CDROM_DEBUG: {
                if (!capable(CAP_SYS_ADMIN))
                        return -EACCES;
                cdinfo(CD_DO_IOCTL, "%sabling debug.\n",arg?"En":"Dis");
                debug = arg ? 1 : 0;
-               return 0;
-       }
+               return debug;
+               }
 
        case CDROM_GET_CAPABILITY: {
                cdinfo(CD_DO_IOCTL, "entering CDROM_GET_CAPABILITY\n");
                return cdo->capability;
-       }
-
-
+               }
 
 /* The following function is implemented, although very few audio
  * discs give Universal Product Code information, which should just be
diff --git a/drivers/net/3c527.c b/drivers/net/3c527.c
new file mode 100644
index 0000000..ca19959
--- /dev/null
@@ -0,0 +1,1152 @@
+/* 3c527.c: 3Com Etherlink/MC32 driver for Linux
+ *
+ *     (c) Copyright 1998 Red Hat Software Inc
+ *     Written by Alan Cox.
+ *
+ *     Based on skeleton.c written 1993-94 by Donald Becker and ne2.c
+ *     (for the MCA stuff) written by Wim Dumon.
+ *
+ *     Thanks to 3Com for making this possible by providing me with the
+ *     documentation.
+ *
+ *     This software may be used and distributed according to the terms
+ *     of the GNU Public License, incorporated herein by reference.
+ *
+ */
+
+static const char *version =
+       "3c527.c:v0.04 1999/03/16 Alan Cox (alan@redhat.com)\n";
+
+/*
+ *     Things you need
+ *     o       The databook.
+ *
+ *     Traps for the unwary
+ *
+ *     The diagram (Figure 1-1) and the POS summary disagree with the
+ *     "Interrupt Level" section in the manual.
+ *
+ *     The documentation in places seems to miss things. In actual fact
+ *     I've always eventually found everything is documented, it just
+ *     requires careful study.
+ */
+
+#include <linux/module.h>
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/interrupt.h>
+#include <linux/ptrace.h>
+#include <linux/mca.h>
+#include <linux/ioport.h>
+#include <linux/in.h>
+#include <linux/malloc.h>
+#include <linux/string.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <asm/io.h>
+#include <asm/dma.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+
+#include "3c527.h"
+
+/*
+ * The name of the card. Is used for messages and in the requests for
+ * io regions, irqs and dma channels
+ */
+static const char* cardname = "3c527";
+
+/* use 0 for production, 1 for verification, >2 for debug */
+#ifndef NET_DEBUG
+#define NET_DEBUG 2
+#endif
+static unsigned int mc32_debug = NET_DEBUG;
+
+/* The number of low I/O ports used by the ethercard. */
+#define NETCARD_IO_EXTENT      8
+
+
+struct mc32_mailbox
+{
+       u16     mbox __attribute((packed));
+       u16     data[1] __attribute((packed));
+};
+
+/* Information that needs to be kept for each board. */
+
+#define TX_RING_MAX    16      /* Typically the card supports 37 */
+#define RX_RING_MAX    32      /*        "     "       "         */
+
+struct mc32_local 
+{
+       struct net_device_stats net_stats;
+       int slot;
+       volatile struct mc32_mailbox *rx_box;
+       volatile struct mc32_mailbox *tx_box;
+       volatile struct mc32_mailbox *exec_box;
+       volatile u16 *stats;
+       u16 tx_chain;
+       u16 rx_chain;
+       u16 tx_len;
+       u16 rx_len;
+       u32 base;
+       u16 rx_halted;
+       u16 tx_halted;
+       u16 exec_pending;
+       u16 mc_reload_wait;     /* a multicast load request is pending */
+       atomic_t tx_count;              /* buffers left */
+       struct wait_queue *event;
+       struct sk_buff *tx_skb[TX_RING_MAX];    /* Transmit ring */
+       u16 tx_skb_top;
+       u16 tx_skb_end;
+       struct sk_buff *rx_skb[RX_RING_MAX];    /* Receive ring */
+       void *rx_ptr[RX_RING_MAX];              /* Data pointers */
+};
+
+/* The station (ethernet) address prefix, used for a sanity check. */
+#define SA_ADDR0 0x02
+#define SA_ADDR1 0x60
+#define SA_ADDR2 0xAC
+
+struct mca_adapters_t {
+       unsigned int    id;
+       char            *name;
+};
+
+const struct mca_adapters_t mc32_adapters[] = {
+       { 0x0041, "3COM EtherLink MC/32" },
+       { 0x8EF5, "IBM High Performance Lan Adapter" },
+       { 0x0000, NULL }
+};
+
+
+/* Index to functions, as function prototypes. */
+
+extern int mc32_probe(struct device *dev);
+
+static int     mc32_probe1(struct device *dev, int ioaddr);
+static int     mc32_open(struct device *dev);
+static int     mc32_send_packet(struct sk_buff *skb, struct device *dev);
+static void    mc32_interrupt(int irq, void *dev_id, struct pt_regs *regs);
+static int     mc32_close(struct device *dev);
+static struct  net_device_stats *mc32_get_stats(struct device *dev);
+static void    mc32_set_multicast_list(struct device *dev);
+
+/*
+ * Check for a network adaptor of this type, and return '0' iff one exists.
+ * If dev->base_addr == 0, probe all likely locations.
+ * If dev->base_addr == 1, always return failure.
+ * If dev->base_addr == 2, allocate space for the device and return success
+ * (detachable devices only).
+ */
+
+__initfunc(int mc32_probe(struct device *dev))
+{
+       static int current_mca_slot = -1;
+       int i;
+       int adapter_found = 0;
+
+       /* Do not check any supplied i/o locations. 
+          POS registers usually don't fail :) */
+
+       /* MCA cards have POS registers.  
+          Autodetecting MCA cards is extremely simple. 
+          Just search for the card. */
+
+       for(i = 0; (mc32_adapters[i].name != NULL) && !adapter_found; i++) {
+               current_mca_slot = 
+                       mca_find_unused_adapter(mc32_adapters[i].id, 0);
+
+               if((current_mca_slot != MCA_NOTFOUND) && !adapter_found) {
+                       if(!mc32_probe1(dev, current_mca_slot))
+                       {
+                               mca_set_adapter_name(current_mca_slot, 
+                                               mc32_adapters[i].name);
+                               mca_mark_as_used(current_mca_slot);
+                               return 0;
+                       }
+                       
+               }
+       }
+       return -ENODEV;
+}
+
+/*
+ * This is the real probe routine. Linux has a history of friendly device
+ * probes on the ISA bus. A good device probes avoids doing writes, and
+ * verifies that the correct device exists and functions.
+ */
+__initfunc(static int mc32_probe1(struct device *dev, int slot))
+{
+       static unsigned version_printed = 0;
+       int i;
+       u8 POS;
+       u32 base;
+       struct mc32_local *lp;
+       static u16 mca_io_bases[]={
+               0x7280,0x7290,
+               0x7680,0x7690,
+               0x7A80,0x7A90,
+               0x7E80,0x7E90
+       };
+       static u32 mca_mem_bases[]={
+               0x00C0000,
+               0x00C4000,
+               0x00C8000,
+               0x00CC000,
+               0x00D0000,
+               0x00D4000,
+               0x00D8000,
+               0x00DC000
+       };
+       static char *failures[]={
+               "Processor instruction",
+               "Processor data bus",
+               "Processor data bus",
+               "Processor data bus",
+               "Adapter bus",
+               "ROM checksum",
+               "Base RAM",
+               "Extended RAM",
+               "82586 internal loopback",
+               "82586 initialisation failure",
+               "Adapter list configuration error"
+       };
+       
+       /* Time to play MCA games */
+
+       if (mc32_debug  &&  version_printed++ == 0)
+               printk(KERN_DEBUG "%s", version);
+
+       printk(KERN_INFO "%s: %s found in slot %d:", dev->name, cardname, slot);
+
+       POS = mca_read_stored_pos(slot, 2);
+       
+       if(!(POS&1))
+       {
+               printk(" disabled.\n");
+               return -ENODEV;
+       }
+
+       /* Allocate a new 'dev' if needed. */
+       if (dev == NULL) {
+               /*
+                * Don't allocate the private data here, it is done later
+                * This makes it easier to free the memory when this driver
+                * is used as a module.
+                */
+               dev = init_etherdev(0, 0);
+               if (dev == NULL)
+                       return -ENOMEM;
+       }
+
+       /* Fill in the 'dev' fields. */
+       dev->base_addr = mca_io_bases[(POS>>1)&7];
+       dev->mem_start = mca_mem_bases[(POS>>4)&7];
+       
+       POS = mca_read_stored_pos(slot, 4);
+       if(!(POS&1))
+       {
+               printk("memory window disabled.\n");
+               return -ENODEV;
+       }
+
+       POS = mca_read_stored_pos(slot, 5);
+       
+       i=(POS>>4)&3;
+       if(i==3)
+       {
+               printk("invalid memory window.\n");
+               return -ENODEV;
+       }
+       
+       i*=16384;
+       i+=16384;
+       
+       dev->mem_end=dev->mem_start + i;
+       
+       dev->irq = ((POS>>2)&3)+9;
+       
+       printk("io 0x%3lX irq %d mem 0x%lX (%dK)\n",
+               dev->base_addr, dev->irq, dev->mem_start, i/1024);
+       
+       
+       /* We ought to set the cache line size here.. */
+       
+       
+       /*
+        *      Go PROM browsing
+        */
+        
+       printk("%s: Address ", dev->name);
+        
+       /* Retrieve and print the ethernet address. */
+       for (i = 0; i < 6; i++)
+       {
+               mca_write_pos(slot, 6, i+12);
+               mca_write_pos(slot, 7, 0);
+       
+               printk(" %2.2x", dev->dev_addr[i] = mca_read_pos(slot,3));
+       }
+
+       mca_write_pos(slot, 6, 0);
+       mca_write_pos(slot, 7, 0);
+
+       POS = mca_read_stored_pos(slot, 4);
+       
+       if(POS&2)
+               printk(" : BNC port selected.\n");
+       else 
+               printk(" : AUI port selected.\n");
+               
+       POS=inb(dev->base_addr+HOST_CTRL);
+       POS|=HOST_CTRL_ATTN|HOST_CTRL_RESET;
+       POS&=~HOST_CTRL_INTE;
+       outb(POS, dev->base_addr+HOST_CTRL);
+       /* Reset adapter */
+       udelay(100);
+       /* Reset off */
+       POS&=~(HOST_CTRL_ATTN|HOST_CTRL_RESET);
+       outb(POS, dev->base_addr+HOST_CTRL);
+       
+       udelay(300);
+       
+       /*
+        *      Grab the IRQ
+        */
+
+       if(request_irq(dev->irq, &mc32_interrupt, 0, cardname, dev))
+       {
+               printk("%s: unable to get IRQ %d.\n",
+                                  dev->name, dev->irq);
+               return -EAGAIN;
+       }
+
+       /* Initialize the device structure. */
+       if (dev->priv == NULL) {
+               dev->priv = kmalloc(sizeof(struct mc32_local), GFP_KERNEL);
+               if (dev->priv == NULL)
+               {
+                       free_irq(dev->irq, dev);
+                       return -ENOMEM;
+               }
+       }
+
+       memset(dev->priv, 0, sizeof(struct mc32_local));
+       lp = (struct mc32_local *)dev->priv;
+       lp->slot = slot;
+
+       i=0;
+
+       base = inb(dev->base_addr);
+       
+       while(base==0xFF)
+       {
+               i++;
+               if(i==1000)
+               {
+                       printk("%s: failed to boot adapter.\n", dev->name);
+                       free_irq(dev->irq, dev);
+                       return -ENODEV;
+               }
+               udelay(1000);
+               if(inb(dev->base_addr+2)&(1<<5))
+                       base = inb(dev->base_addr);
+       }
+
+       if(base>0)
+       {
+               if(base < 0x0C)
+                       printk("%s: %s%s.\n", dev->name, failures[base-1],
+                               base<0x0A?" test failure":"");
+               else
+                       printk("%s: unknown failure %d.\n", dev->name, base);
+               free_irq(dev->irq, dev);
+               return -ENODEV;
+       }
+       
+       base=0;
+       for(i=0;i<4;i++)
+       {
+               int n=0;
+       
+               while(!(inb(dev->base_addr+2)&(1<<5)))
+               {
+                       n++;
+                       udelay(50);
+                       if(n>100)
+                       {
+                               printk(KERN_ERR "%s: mailbox read fail (%d).\n", dev->name, i);
+                               free_irq(dev->irq, dev);
+                               return -ENODEV;
+                       }
+               }
+
+               base|=(inb(dev->base_addr)<<(8*i));
+       }
+       
+       lp->exec_box=bus_to_virt(dev->mem_start+base);
+       
+       base=lp->exec_box->data[1]<<16|lp->exec_box->data[0];
+       
+       lp->base = dev->mem_start+base;
+       
+       lp->rx_box=bus_to_virt(lp->base + lp->exec_box->data[2]);
+       lp->tx_box=bus_to_virt(lp->base + lp->exec_box->data[3]);
+       
+       lp->stats = bus_to_virt(lp->base + lp->exec_box->data[5]);
+
+       /*
+        *      Descriptor chains (card relative)
+        */
+        
+       lp->tx_chain            = lp->exec_box->data[8];
+       lp->rx_chain            = lp->exec_box->data[10];
+       lp->tx_len              = lp->exec_box->data[9];
+       lp->rx_len              = lp->exec_box->data[11];
+       
+       printk("%s: %d RX buffers, %d TX buffers. Base of 0x%08X.\n",
+               dev->name, lp->rx_len, lp->tx_len, lp->base);
+       
+       dev->open               = mc32_open;
+       dev->stop               = mc32_close;
+       dev->hard_start_xmit    = mc32_send_packet;
+       dev->get_stats          = mc32_get_stats;
+       dev->set_multicast_list = mc32_set_multicast_list;
+       
+       lp->rx_halted           = 1;
+       lp->tx_halted           = 1;
+
+       /* Fill in the fields of the device structure with ethernet values. */
+       ether_setup(dev);
+       return 0;
+}
+
+
+/*
+ *     Polled command stuff 
+ */
+static void mc32_ring_poll(struct device *dev)
+{
+       int ioaddr = dev->base_addr;
+       while(!(inb(ioaddr+HOST_STATUS)&HOST_STATUS_CRR));
+}
+
+
+/*
+ *     Send exec commands
+ */
+static int mc32_command(struct device *dev, u16 cmd, void *data, int len)
+{
+       struct mc32_local *lp = (struct mc32_local *)dev->priv;
+       int ioaddr = dev->base_addr;
+       unsigned long flags;
+       
+       while(lp->exec_pending)
+               sleep_on(&lp->event);
+               
+       lp->exec_pending=1;
+       lp->exec_box->mbox=0;
+       lp->exec_box->mbox=cmd;
+       memcpy((void *)lp->exec_box->data, data, len);
+       barrier();      /* the memcpy forgot the volatile so be sure */
+
+       /* Send the command */
+       while(!(inb(ioaddr+HOST_STATUS)&HOST_STATUS_CRR));
+       outb(1<<6, ioaddr+HOST_CMD);    
+       
+       save_flags(flags);
+       cli();
+       while(lp->exec_pending!=2)
+               sleep_on(&lp->event);
+       lp->exec_pending=0;
+       restore_flags(flags);
+       
+       /*
+        *      A multicast set got blocked - do it now
+        */
+        
+       if(lp->mc_reload_wait)
+               mc32_set_multicast_list(dev);
+
+       if(lp->exec_box->data[0]&(1<<13))
+               return -1;
+       return 0;
+}
+
+/*
+ *     RX abort
+ */
+static void mc32_rx_abort(struct device *dev)
+{
+       struct mc32_local *lp = (struct mc32_local *)dev->priv;
+       int ioaddr = dev->base_addr;
+
+       while(!(inb(ioaddr+HOST_STATUS)&HOST_STATUS_CRR));
+       
+       lp->rx_box->mbox=0;
+       outb(3<<3, ioaddr+HOST_CMD);    /* Suspend reception */
+}
+
+/*
+ *     RX enable
+ */
+static void mc32_rx_begin(struct device *dev)
+{
+       struct mc32_local *lp = (struct mc32_local *)dev->priv;
+       int ioaddr = dev->base_addr;
+       
+       while(!(inb(ioaddr+HOST_STATUS)&HOST_STATUS_CRR));
+       
+       lp->rx_box->mbox=0;
+       outb(1<<3, ioaddr+HOST_CMD);    /* GO */
+       mc32_ring_poll(dev);    
+       
+       lp->rx_halted=0;
+}
+
+static void mc32_tx_abort(struct device *dev)
+{
+       struct mc32_local *lp = (struct mc32_local *)dev->priv;
+       int ioaddr = dev->base_addr;
+       
+       while(!(inb(ioaddr+HOST_STATUS)&HOST_STATUS_CRR));
+       
+       lp->tx_box->mbox=0;
+       outb(3, ioaddr+HOST_CMD);       /* Suspend */
+       
+       /* Ring empty */
+       
+       atomic_set(&lp->tx_count, lp->tx_len);
+       
+       /* Flush */
+       if(lp->tx_skb_top!=lp->tx_skb_end)
+       {
+               int i;
+               if(lp->tx_skb_top<=lp->tx_skb_end)
+               {
+                       for(i=lp->tx_skb_top;i<lp->tx_skb_end;i++)
+                       {
+                               dev_kfree_skb(lp->tx_skb[i]);
+                               lp->tx_skb[i]=NULL;
+                       }
+               }
+               else
+               {
+                       for(i=lp->tx_skb_end;i<TX_RING_MAX;i++)
+                       {
+                               dev_kfree_skb(lp->tx_skb[i]);
+                               lp->tx_skb[i]=NULL;
+                       }
+                       for(i=0;i<lp->tx_skb_top;i++)
+                       {
+                               dev_kfree_skb(lp->tx_skb[i]);
+                               lp->tx_skb[i]=NULL;
+                       }
+               }
+       }
+       lp->tx_skb_top=lp->tx_skb_end=0;
+}
+
+/*
+ *     TX enable
+ */
+static void mc32_tx_begin(struct device *dev)
+{
+       struct mc32_local *lp = (struct mc32_local *)dev->priv;
+       int ioaddr = dev->base_addr;
+       
+       while(!(inb(ioaddr+HOST_STATUS)&HOST_STATUS_CRR));
+       
+       lp->tx_box->mbox=0;
+#if 0  
+       outb(5, ioaddr+HOST_CMD);       /* GO */
+       printk("TX=>5\n");
+       mc32_ring_poll(dev);    
+       if(lp->tx_box->mbox&(1<<13))
+               printk("TX begin error!\n");
+#endif         
+       lp->tx_halted=0;
+}
+
+       
+/*
+ *     Load the rx ring
+ */
+static int mc32_load_rx_ring(struct device *dev)
+{
+       struct mc32_local *lp = (struct mc32_local *)dev->priv;
+       int i;
+       u16 base;
+       volatile struct skb_header *p;
+       
+       base = lp->rx_box->data[0];
+       
+       /* Fix me - should use card size - also fix flush ! */ 
+
+       for(i=0;i<RX_RING_MAX;i++)
+       {
+               lp->rx_skb[i]=alloc_skb(1532, GFP_KERNEL);
+               if(lp->rx_skb[i]==NULL)
+               {
+                       while(--i>=0)
+                               kfree_skb(lp->rx_skb[i]);
+                       return -ENOBUFS;
+               }
+               lp->rx_ptr[i]=lp->rx_skb[i]->data+18;
+               
+               p=bus_to_virt(lp->base+base);
+               p->control=0;
+               p->data = virt_to_bus(lp->rx_ptr[i]);
+               p->status=0;
+               p->length = 1532;
+               base = p->next;
+       }
+       p->control = (1<<6);
+       lp->rx_box->mbox = 0;
+       return 0;
+}      
+
+static void mc32_flush_rx_ring(struct mc32_local *lp)
+{
+       int i;
+       for(i=0;i<RX_RING_MAX;i++)
+               kfree_skb(lp->rx_skb[i]);
+}
+
+static void mc32_flush_tx_ring(struct mc32_local *lp)
+{
+       int i;
+       
+       if(lp->tx_skb_top <= lp->tx_skb_end)
+       {
+               for(i=lp->tx_skb_top;i<lp->tx_skb_end;i++)
+                       dev_kfree_skb(lp->tx_skb[i]);
+       }
+       else
+       {
+               for(i=0;i<lp->tx_skb_end;i++)
+                       dev_kfree_skb(lp->tx_skb[i]);
+               for(i=lp->tx_skb_top;i<TX_RING_MAX;i++)
+                       dev_kfree_skb(lp->tx_skb[i]);
+       }
+}
+       
+/*
+ * Open/initialize the board. This is called (in the current kernel)
+ * sometime after booting when the 'ifconfig' program is run.
+ */
+
+static int mc32_open(struct device *dev)
+{
+       int ioaddr = dev->base_addr;
+       u16 zero_word=0;
+       u16 one=1;
+       u8 regs;
+       
+       dev->tbusy = 0;
+       dev->interrupt = 0;
+       dev->start = 1;
+
+       /*
+        *      Interrupts enabled
+        */
+
+       regs=inb(ioaddr+HOST_CTRL);
+       regs|=HOST_CTRL_INTE;
+       outb(regs, ioaddr+HOST_CTRL);
+       
+
+       /*
+        *      Send the indications on command
+        */
+
+       mc32_command(dev, 4, &one, 2);
+
+               
+       /*
+        *      Send the command sequence "abort, resume" for RX and TX.
+        *      The abort cleans up the buffer chains if needed.
+        */
+
+       mc32_rx_abort(dev);
+       mc32_tx_abort(dev);
+       
+       /* Set Network Address */
+       mc32_command(dev, 1, dev->dev_addr, 6);
+       
+       /* Set the filters */
+       mc32_set_multicast_list(dev);
+       
+       /* Issue the 82586 workaround command - this is for "busy lans",
+          but basically means for all lans nowadays - it has a performance
+          cost but is best left enabled */
+          
+       mc32_command(dev, 0x0D, &zero_word, 2); /* 82586 bug workaround on */
+       
+       /* Load the ring we just initialised */
+       
+       if(mc32_load_rx_ring(dev))
+       {
+               mc32_close(dev);
+               return -ENOBUFS;
+       }
+       
+       /* And the resume command goes last */
+       
+       mc32_rx_begin(dev);
+       mc32_tx_begin(dev);
+       
+       MOD_INC_USE_COUNT;
+
+       return 0;
+}
+
+static int mc32_send_packet(struct sk_buff *skb, struct device *dev)
+{
+       struct mc32_local *lp = (struct mc32_local *)dev->priv;
+
+       if (dev->tbusy) {
+               /*
+                * If we get here, some higher level has decided we are broken.
+                * There should really be a "kick me" function call instead.
+                */
+               int tickssofar = jiffies - dev->trans_start;
+               if (tickssofar < 5)
+                       return 1;
+               printk(KERN_WARNING "%s: transmit timed out?\n", dev->name);
+               /* Try to restart the adaptor. */
+               dev->tbusy=0;
+               dev->trans_start = jiffies;
+       }
+
+       /*
+        * Block a timer-based transmit from overlapping. This could better be
+        * done with atomic_swap(1, dev->tbusy), but set_bit() works as well.
+        */
+       if (test_and_set_bit(0, (void*)&dev->tbusy) != 0)
+       {
+               printk(KERN_WARNING "%s: Transmitter access conflict.\n", dev->name);
+               dev_kfree_skb(skb);
+       }
+       else 
+       {
+               unsigned long flags;
+               
+               u16 tx_head;
+               volatile struct skb_header *p, *np;
+
+               save_flags(flags);
+               cli();
+               
+               if(atomic_read(&lp->tx_count)==0)
+               {
+                       dev->tbusy=1;
+                       restore_flags(flags);
+                       return 1;
+               }
+
+               tx_head = lp->tx_box->data[0];
+               atomic_dec(&lp->tx_count);
+
+               /* We will need this to flush the buffer out */
+               
+               lp->tx_skb[lp->tx_skb_end] = skb;
+               lp->tx_skb_end++;
+               lp->tx_skb_end&=(TX_RING_MAX-1);
+               
+               /* P is the last sending/sent buffer as a pointer */
+               p=(struct skb_header *)bus_to_virt(lp->base+tx_head);
+               
+               /* NP is the buffer we will be loading */
+               np=(struct skb_header *)bus_to_virt(lp->base+p->next);
+               
+               np->control     |= (1<<6);      /* EOL */
+               wmb();
+               
+               np->length      = skb->len;
+               np->data        = virt_to_bus(skb->data);
+               np->status      = 0;
+               np->control     = (1<<7)|(1<<6);        /* EOP EOL */
+               wmb();
+               
+               p->status       = 0;
+               p->control      &= ~(1<<6);
+               
+               dev->tbusy      = 0;                    /* Keep feeding me */           
+               
+               lp->tx_box->mbox=0;
+               restore_flags(flags);
+       }
+       return 0;
+}
+
+static void mc32_update_stats(struct device *dev)
+{
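+       /* Intentionally empty for now: the counters are updated inline in the rx/irq paths */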
+}
+
+
+static void mc32_rx_ring(struct device *dev)
+{
+       struct mc32_local *lp=dev->priv;
+       int ioaddr = dev->base_addr;
+       int x=0;
+       volatile struct skb_header *p;
+       u16 base;
+       u16 top;
+       
+       top = base = lp->rx_box->data[0];
+       do
+       {
+               p=(struct skb_header *)bus_to_virt(base+lp->base);
+               if(!(p->status & (1<<7)))
+                       break;
+               if(p->status & (1<<6))
+               {
+                       u16 length = p->length;
+                       struct sk_buff *skb=dev_alloc_skb(length+2);
+                       if(skb!=NULL)
+                       {
+                               skb_reserve(skb,2);
+                               /*printk("Frame at %p\n", bus_to_virt(p->data)); */
+                               memcpy(skb_put(skb, length),
+                                       bus_to_virt(p->data), length);
+                               skb->protocol=eth_type_trans(skb,dev);
+                               skb->dev=dev;
+                               lp->net_stats.rx_packets++;
+                               lp->net_stats.rx_bytes+=skb->len;
+                               netif_rx(skb);
+                       }
+                       else
+                               lp->net_stats.rx_dropped++;
+               }
+               else
+               {
+                       lp->net_stats.rx_errors++;
+                       switch(p->status&0x0F)
+                       {
+                               case 1:
+                                       lp->net_stats.rx_crc_errors++;break;
+                               case 2:
+                                       lp->net_stats.rx_fifo_errors++;break;
+                               case 3:
+                                       lp->net_stats.rx_frame_errors++;break;
+                               case 4:
+                                       lp->net_stats.rx_missed_errors++;break;
+                               case 5:
+                                       lp->net_stats.rx_length_errors++;break;
+                       }
+               }
+               p->length = 1532;
+               p->control &= ~(1<<6);
+               p->status = 0;
+               base = p->next;
+       }
+       while(x++<48);
+       
+       /* 
+        *      This is curious. It seems the receive stop and receive continue
+        *      commands race against each other, even though we poll for 
+        *      command ready to be issued. The delay is hackish but is a workaround
+        *      while I investigate in depth
+        */
+       
+       while(!(inb(ioaddr+HOST_STATUS)&HOST_STATUS_CRR));
+       lp->rx_box->mbox=0;
+       lp->rx_box->data[0] = top;
+       outb(1<<3, ioaddr+HOST_CMD);    
+}
+
+
+/*
+ * The typical workload of the driver:
+ *   Handle the network interface interrupts.
+ */
+static void mc32_interrupt(int irq, void *dev_id, struct pt_regs * regs)
+{
+       struct device *dev = dev_id;
+       struct mc32_local *lp;
+       int ioaddr, status, boguscount = 0;
+       int rx_event = 0;
+       
+       if (dev == NULL) {
+               printk(KERN_WARNING "%s: irq %d for unknown device.\n", cardname, irq);
+               return;
+       }
+       dev->interrupt = 1;
+
+       ioaddr = dev->base_addr;
+       lp = (struct mc32_local *)dev->priv;
+
+       /* See what's cooking */
+       
+       while((inb(ioaddr+2)&(1<<5)) && boguscount++<2000)
+       {
+               status=inb(ioaddr+HOST_CMD);
+
+#ifdef DEBUG_IRQ               
+               printk("Status TX%d RX%d EX%d OV%d\n",
+                       (status&7), (status>>3)&7, (status>>6)&1,
+                       (status>>7)&1);
+#endif
+                       
+               switch(status&7)
+               {
+                       case 0:
+                               break;
+                       case 6: /* TX fail */
+                               lp->net_stats.tx_errors++;
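+                               /* fall through: a failed transmit still completes the buffer */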
+                       case 2: /* TX ok */
+                               lp->net_stats.tx_packets++;
+                               /* Packets are sent in order - this is
+                                  basically a FIFO queue of buffers matching
+                                  the card ring */
+                               lp->net_stats.tx_bytes+=lp->tx_skb[lp->tx_skb_top]->len;
+                               dev_kfree_skb(lp->tx_skb[lp->tx_skb_top]);
+                               lp->tx_skb[lp->tx_skb_top]=NULL;
+                               lp->tx_skb_top++;
+                               lp->tx_skb_top&=(TX_RING_MAX-1);
+                               atomic_inc(&lp->tx_count);
+                               dev->tbusy=0;
+                               mark_bh(NET_BH);
+                               break;
+                       case 3: /* Halt */
+                       case 4: /* Abort */
+                               lp->tx_halted=1;
+                               wake_up(&lp->event);
+                               break;
+                       case 5:
+                               lp->tx_halted=0;
+                               wake_up(&lp->event);
+                               break;
+                       default:
+                               printk("%s: strange tx ack %d\n", 
+                                       dev->name, status&7);
+               }
+               status>>=3;
+               switch(status&7)
+               {
+                       case 0:
+                               break;
+                       case 2: /* RX */
+                               rx_event=1;
+                               break;
+                       case 3:
+                       case 4:
+                               lp->rx_halted=1;
+                               wake_up(&lp->event);
+                               break;
+                       case 5:
+                               lp->rx_halted=0;
+                               wake_up(&lp->event);
+                               break;
+                       case 6:
+                               /* Out of RX buffers stat */
+                               /* Must restart */
+                               lp->net_stats.rx_dropped++;
+                               rx_event = 1;   /* To restart */
+                               break;
+                       default:
+                               printk("%s: strange rx ack %d\n", 
+                                       dev->name, status&7);
+                       
+               }
+               status>>=3;
+               if(status&1)
+               {
+                       /* exec_pending states: 0=idle, 1=command issued, 2=reply received */
+                       lp->exec_pending=2;
+                       wake_up(&lp->event);
+               }
+               if(status&2)
+               {
+                       /*
+                        *      Update the stats as soon as
+                        *      we have it flagged and can 
+                        *      send an immediate reply (CRR set)
+                        */
+                        
+                       if(inb(ioaddr+HOST_STATUS)&HOST_STATUS_CRR)
+                       {
+                               mc32_update_stats(dev);
+                               outb(0, ioaddr+HOST_CMD);
+                       }
+               }
+       }
+       
+       /*
+        *      Process and restart the receive ring.
+        */
+        
+       if(rx_event)
+               mc32_rx_ring(dev);
+       dev->interrupt = 0;
+       return;
+}
+
+
+/* The inverse routine to mc32_open(). */
+
+static int mc32_close(struct device *dev)
+{
+       struct mc32_local *lp = (struct mc32_local *)dev->priv;
+       int ioaddr = dev->base_addr;
+       u8 regs;
+       u16 one=1;
+
+       /*
+        *      Send the indications on command (handy debug check)
+        */
+
+       mc32_command(dev, 4, &one, 2);
+
+       /* Abort RX and Abort TX */
+       
+       mc32_rx_abort(dev);     
+       mc32_tx_abort(dev);
+       
+       /* Catch any waiting commands */
+       
+       while(lp->exec_pending==1)
+               sleep_on(&lp->event);
+               
+       /* Ok the card is now stopping */       
+       
+       regs=inb(ioaddr+HOST_CTRL);
+       regs&=~HOST_CTRL_INTE;
+       outb(regs, ioaddr+HOST_CTRL);
+
+       mc32_flush_rx_ring(lp);
+       mc32_flush_tx_ring(lp);
+       
+       dev->tbusy = 1;
+       dev->start = 0;
+
+       /* Update the statistics here. */
+
+       MOD_DEC_USE_COUNT;
+
+       return 0;
+
+}
+
+/*
+ * Get the current statistics.
+ * This may be called with the card open or closed.
+ */
+
+static struct net_device_stats *mc32_get_stats(struct device *dev)
+{
+       struct mc32_local *lp = (struct mc32_local *)dev->priv;
+       return &lp->net_stats;
+}
+
+/*
+ * Set or clear the multicast filter for this adaptor.
+ * num_addrs == -1     Promiscuous mode, receive all packets
+ * num_addrs == 0      Normal mode, clear multicast list
+ * num_addrs > 0       Multicast mode, receive normal and MC packets,
+ *                     and do best-effort filtering.
+ */
+static void mc32_set_multicast_list(struct device *dev)
+{
+       u16 filt;
+       if (dev->flags&IFF_PROMISC)
+       {
+               /* Enable promiscuous mode */
+               filt = 1;
+               mc32_command(dev, 0, &filt, 2);
+       }
+       else if((dev->flags&IFF_ALLMULTI) || dev->mc_count > 10)
+       {
+               dev->flags|=IFF_PROMISC;
+               filt = 1;
+               mc32_command(dev, 0, &filt, 2);
+       }
+       else if(dev->mc_count)
+       {
+               unsigned char block[62];
+               unsigned char *bp;
+               struct dev_mc_list *dmc=dev->mc_list;
+               
+               int i;
+               
+               filt = 0;
+               block[1]=0;
+               block[0]=dev->mc_count;
+               bp=block+2;
+               
+               for(i=0;i<dev->mc_count;i++)
+               {
+                       memcpy(bp, dmc->dmi_addr, 6);
+                       bp+=6;
+                       dmc=dmc->next;
+               }
+               mc32_command(dev, 2, block, 2+6*dev->mc_count);
+               mc32_command(dev, 0, &filt, 2);
+       }
+       else 
+       {
+               filt = 0;
+               mc32_command(dev, 0, &filt, 2);
+       }
+}
+
+#ifdef MODULE
+
+static char devicename[9] = { 0, };
+static struct device this_device = {
+       devicename, /* will be inserted by linux/drivers/net/mc32_init.c */
+       0, 0, 0, 0,
+       0, 0,  /* I/O address, IRQ */
+       0, 0, 0, NULL, mc32_probe };
+
+int init_module(void)
+{
+       int result;
+
+       if ((result = register_netdev(&this_device)) != 0)
+               return result;
+
+       return 0;
+}
+
+void cleanup_module(void)
+{
+       int slot;
+       
+       /* No need to check MOD_IN_USE, as sys_delete_module() checks. */
+       unregister_netdev(&this_device);
+
+       /*
+        * If we don't do this, we can't re-insmod it later.
+        * Release irq/dma here, when you have jumpered versions and
+        * allocate them in mc32_probe1().
+        */
+        
+       if (this_device.priv)
+       {
+               struct mc32_local *lp=this_device.priv;
+               slot = lp->slot;
+               mca_mark_as_unused(slot);
+               mca_set_adapter_name(slot, NULL);
+               kfree_s(this_device.priv, sizeof(struct mc32_local));
+       }
+       free_irq(this_device.irq, &this_device);
+}
+
+#endif /* MODULE */
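The transmit path above publishes a new buffer to the card by juggling the
End-Of-List bit. A minimal sketch of that invariant follows; the function and
parameter names are illustrative, not part of the driver, and the CONTROL_*
constants come from 3c527.h below:

        /* The ring always ends at a descriptor with EOL set; a new tail is
         * made visible only after it is fully written, by clearing EOL on
         * the old tail.
         */
        static void ring_append_sketch(volatile struct skb_header *old_tail,
                                       volatile struct skb_header *new_tail,
                                       u32 buf_bus, u16 len)
        {
                new_tail->control |= CONTROL_EL;        /* terminate list at new tail */
                wmb();
                new_tail->length  = len;
                new_tail->data    = buf_bus;
                new_tail->status  = 0;
                new_tail->control = CONTROL_EOP | CONTROL_EL;
                wmb();                                  /* descriptor complete before publish */
                old_tail->control &= ~CONTROL_EL;       /* card may now advance past old tail */
        }
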
diff --git a/drivers/net/3c527.h b/drivers/net/3c527.h
new file mode 100644 (file)
index 0000000..dfe2738
--- /dev/null
@@ -0,0 +1,40 @@
+/*
+ *     3COM "EtherLink MC/32" Descriptions
+ */
+
+/*
+ *     Registers
+ */
+  
+#define HOST_CMD               0
+
+#define HOST_STATUS            2
+#define                HOST_STATUS_CRR (1<<6)
+#define                HOST_STATUS_CWR (1<<5)
+
+#define HOST_CTRL              6
+#define                HOST_CTRL_ATTN  (1<<7)
+#define        HOST_CTRL_RESET (1<<6)
+#define        HOST_CTRL_INTE  (1<<2)
+
+#define HOST_RAMPAGE           8
+
+struct skb_header
+{
+       u8      status __attribute((packed));
+       u8      control __attribute((packed));
+       u16     next __attribute((packed));     /* Do not change! */
+       u16     length __attribute((packed));
+       u32     data __attribute((packed));
+};
+
+#define STATUS_MASK    0x0F
+#define COMPLETED      0x80
+#define COMPLETED_OK   0x40
+#define BUFFER_BUSY    0x20
+
+#define CONTROL_EOP    0x80    /* End Of Packet */
+#define CONTROL_EL     0x40    /* End of List */
+
+
+#define MCA_MC32_ID    0x0041  /* Our MCA ident */
\ No newline at end of file
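
The skb_header structure above is parsed by the card from shared memory, so it
must be exactly 10 bytes with no padding - hence the packed attribute on every
member. A compile-time layout check (illustrative, not part of the header)
could use the negative-array-size trick:

        /* Fails to compile if padding creeps into struct skb_header. */
        static char skb_header_size_check[sizeof(struct skb_header) == 10 ? 1 : -1];
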
index 3fab7ad..f0dd2ae 100644 (file)
@@ -53,6+53,7 @@ if [ "$CONFIG_NET_ETHERNET" = "y" ]; then
       tristate '3c507 support' CONFIG_EL16
       if [ "$CONFIG_MCA" = "y" ]; then
         tristate '3c523 support' CONFIG_ELMC
+        tristate '3c527 support' CONFIG_ELMC_II
       fi
     fi
     tristate '3c509/3c579 support' CONFIG_EL3
index c708e4d..bf12c3d 100644 (file)
@@ -486,6+486,14 @@ else
   endif
 endif
 
+ifeq ($(CONFIG_ELMC_II),y)
+L_OBJS += 3c527.o
+else
+  ifeq ($(CONFIG_ELMC_II),m)
+  M_OBJS += 3c527.o
+  endif
+endif
+
 ifeq ($(CONFIG_EL3),y)
 L_OBJS += 3c509.o
 else
index 41d2579..037fc82 100644 (file)
@@ -135,6+135,7 @@ struct lance_private {
        struct Linux_SBus_DMA *ledma; /* if set this points to ledma and arch=4m */
        int burst_sizes;              /* ledma SBus burst sizes */
 #endif
+       struct timer_list         multicast_timer;
 };
 
 #define TX_BUFFS_AVAIL ((lp->tx_old<=lp->tx_new)?\
@@ -527,6+528,7 @@ static int lance_close (struct device *dev)
 
        dev->start = 0;
        dev->tbusy = 1;
+       del_timer(&lp->multicast_timer);
 
        /* Stop the card */
        ll->rap = LE_CSR0;
@@ -706,12+708,20 @@ static void lance_set_multicast (struct device *dev)
        volatile struct lance_init_block *ib = lp->init_block;
        volatile struct lance_regs *ll = lp->ll;
 
-       while (dev->tbusy)
-               schedule();
+       if (!dev->start)
+               return;
+
+       if (dev->tbusy) {
+               mod_timer(&lp->multicast_timer, jiffies + 2);
+               return;
+       }
        set_bit (0, (void *) &dev->tbusy);
 
-       while (lp->tx_old != lp->tx_new)
-               schedule();
+       if (lp->tx_old != lp->tx_new) {
+               mod_timer(&lp->multicast_timer, jiffies + 4);
+               dev->tbusy = 0;
+               return;
+       }
 
        ll->rap = LE_CSR0;
        ll->rdp = LE_C0_STOP;
@@ -726,6+736,7 @@ static void lance_set_multicast (struct device *dev)
        load_csrs (lp);
        init_restart_lance (lp);
        dev->tbusy = 0;
+       mark_bh(NET_BH);
 }
 
 
@@ -795,6+806,11 @@ __initfunc(int a2065_probe(struct device *dev))
                        dev->dma = 0;
 
                        ether_setup(dev);
+                       init_timer(&priv->multicast_timer);
+                       priv->multicast_timer.data = (unsigned long) dev;
+                       priv->multicast_timer.function =
+                               (void (*)(unsigned long)) &lance_set_multicast;
+
                        zorro_config_board(key, 0);
                        return(0);
                }
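
The a2065 change above replaces a busy-wait on dev->tbusy with a retry timer:
lance_set_multicast() no longer spins in schedule(), it re-arms
multicast_timer and returns. A minimal sketch of the pattern, with
illustrative names:

        static void deferred_setup_sketch(struct device *dev,
                                          struct timer_list *retry)
        {
                if (dev->tbusy) {
                        /* transmitter busy: retry in ~2 ticks instead of spinning */
                        mod_timer(retry, jiffies + 2);
                        return;
                }
                /* device quiescent here; safe to reprogram the chip */
        }
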
index 9b57ad6..3485a2a 100644 (file)
@@ -1,4+1,4 @@
-/* $Id: sunlance.c,v 1.84 1999/03/11 12:30:22 anton Exp $
+/* $Id: sunlance.c,v 1.85 1999/03/21 05:22:05 davem Exp $
  * lance.c: Linux/Sparc/Lance driver
  *
  *     Written 1995, 1996 by Miguel de Icaza
@@ -331,8+331,6 @@ static void lance_init_ring (struct device *dev)
        lp->rx_new = lp->tx_new = 0;
        lp->rx_old = lp->tx_old = 0;
 
-       ib->mode = 0;
-
        /* Copy the ethernet address to the lance init block
         * Note that on the sparc you need to swap the ethernet address.
         * Note also we want the CPU ptr of the init_block here.
@@ -389,10+387,6 @@ static void lance_init_ring (struct device *dev)
        ib->tx_ptr = leptr;
        if (ZERO)
                printk ("TX ptr: %8.8x\n", leptr);
-
-       /* Clear the multicast filter */
-       ib->filter [0] = 0;
-       ib->filter [1] = 0;
 }
 
 static int init_restart_lance (struct lance_private *lp)
@@ -673,6+667,7 @@ static int lance_open (struct device *dev)
 {
        struct lance_private *lp = (struct lance_private *)dev->priv;
        volatile struct lance_regs *ll = lp->ll;
+       volatile struct lance_init_block *ib = lp->init_block;
        int status = 0;
 
        last_dev = dev;
@@ -691,6+686,16 @@ static int lance_open (struct device *dev)
        if (lp->ledma)
                lp->ledma->regs->dma_test = ((__u32) lp->init_block_dvma) & 0xff000000;
 
+       /* Set the mode and clear the multicast filter only at device open,
+          so that lance_init_ring(), which may be called on any error,
+          will not forget the multicast filters.
+
+          BTW this is a common bug in all lance drivers! --ANK
+        */
+       ib->mode = 0;
+       ib->filter [0] = 0;
+       ib->filter [1] = 0;
+
        lance_init_ring (dev);
        load_csrs (lp);
 
@@ -747,6+752,7 @@ static int lance_close (struct device *dev)
 
        dev->start = 0;
        dev->tbusy = 1;
+       del_timer(&lp->multicast_timer);
 
        /* Stop the card */
        ll->rap = LE_CSR0;
@@ -916,14+922,31 @@ static void lance_set_multicast (struct device *dev)
        volatile struct lance_init_block *ib = lp->init_block;
        volatile struct lance_regs *ll = lp->ll;
 
+       if (!dev->start)
+               return;
+
        if (dev->tbusy) {
                mod_timer(&lp->multicast_timer, jiffies + 2);
                return;
        }
+       /* This CANNOT be correct. The chip is running
+          and dev->tbusy may change at any moment.
+          It is useless to set it.
+
+          Generally, the usage of dev->tbusy in this driver
+          is completely wrong.
+
+          I protected calls to this function
+          with start_bh_atomic, so that set_multicast_list
+          and hard_start_xmit are now serialized by the top level. --ANK
+
+          The same is true of a2065.
+        */
        set_bit (0, (void *) &dev->tbusy);
 
        if (lp->tx_old != lp->tx_new) {
                mod_timer(&lp->multicast_timer, jiffies + 4);
+               dev->tbusy = 0;
                return;
        }
 
@@ -940,6+963,7 @@ static void lance_set_multicast (struct device *dev)
        load_csrs (lp);
        init_restart_lance (lp);
        dev->tbusy = 0;
+       mark_bh(NET_BH);
 }
 
 __initfunc(static int 
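
As ANK's comment above notes, calls to dev->set_multicast_list() are now
serialized by the top level with start_bh_atomic(). A sketch of that calling
convention (the wrapper name is illustrative):

        static void mc_upload_sketch(struct device *dev)
        {
                start_bh_atomic();      /* exclude net_bh-driven transmits */
                if (dev->set_multicast_list)
                        dev->set_multicast_list(dev);
                end_bh_atomic();
        }
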
index d80d600..2f90628 100644 (file)
@@ -636,6+636,28 @@ static int sparcaudio_mixer_ioctl(struct inode * inode, struct file * file,
 
   k = arg;
 
+  if(cmd == SOUND_MIXER_INFO) {
+          audio_device_t tmp;
+          mixer_info info;
+          int retval = -EINVAL;
+
+          if(drv->ops->sunaudio_getdev) {
+                  drv->ops->sunaudio_getdev(drv, &tmp);
+                  memset(&info, 0, sizeof(info));
+                  strncpy(info.id, tmp.name, sizeof(info.id));
+                  strncpy(info.name, "Sparc Audio", sizeof(info.name));
+
+                  /* XXX do this right... */
+                  info.modify_counter = 0;
+
+                  if(copy_to_user((char *)arg, &info, sizeof(info)))
+                          retval = -EFAULT;
+                  else
+                          retval = 0;
+          }
+          return retval;
+  }
+
   switch (cmd) {
   case SOUND_MIXER_WRITE_RECLEV:
   case SOUND_MIXER_WRITE_MIC:
index dd73886..1705d7b 100644 (file)
 /* --------------------------------------------------------------------- */
 
 #undef OSS_DOCUMENTED_MIXER_SEMANTICS
+#define DBG(x) {}
+/*#define DBG(x) {x}*/
 
 /* --------------------------------------------------------------------- */
 
@@ -1019,7+1021,7 @@ static int drain_dac1(struct es1370_state *s, int nonblock)
                tmo = (count * HZ) / dac1_samplerate[(s->ctrl & CTRL_WTSRSEL) >> CTRL_SH_WTSRSEL];
                tmo >>= sample_shift[(s->sctrl & SCTRL_P1FMT) >> SCTRL_SH_P1FMT];
                if (!schedule_timeout(tmo ? : 1) && tmo)
-                       printk(KERN_DEBUG "es1370: dma timed out??\n");
+                       DBG(printk(KERN_DEBUG "es1370: dma timed out??\n");)
         }
         remove_wait_queue(&s->dma_dac1.wait, &wait);
         current->state = TASK_RUNNING;
@@ -1054,7+1056,7 @@ static int drain_dac2(struct es1370_state *s, int nonblock)
                tmo = (count * HZ) / DAC2_DIVTOSR((s->ctrl & CTRL_PCLKDIV) >> CTRL_SH_PCLKDIV);
                tmo >>= sample_shift[(s->sctrl & SCTRL_P2FMT) >> SCTRL_SH_P2FMT];
                if (!schedule_timeout(tmo ? : 1) && tmo)
-                       printk(KERN_DEBUG "es1370: dma timed out??\n");
+                       DBG(printk(KERN_DEBUG "es1370: dma timed out??\n");)
         }
         remove_wait_queue(&s->dma_dac2.wait, &wait);
         current->state = TASK_RUNNING;
@@ -2189,7+2191,7 @@ static int es1370_midi_release(struct inode *inode, struct file *file)
                        }
                        tmo = (count * HZ) / 3100;
                        if (!schedule_timeout(tmo ? : 1) && tmo)
-                               printk(KERN_DEBUG "es1370: midi timed out??\n");
+                               DBG(printk(KERN_DEBUG "es1370: midi timed out??\n");)
                }
                remove_wait_queue(&s->midi.owait, &wait);
                current->state = TASK_RUNNING;
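
The es1370 change wraps the noisy timeout printks in a DBG() macro that
expands to an empty block by default; swapping the two #define lines
re-enables them. An illustrative call site (the function name is assumed):

        static void timeout_report_sketch(void)
        {
                /* compiles away entirely with the default DBG(x) {} */
                DBG(printk(KERN_DEBUG "es1370: dma timed out??\n");)
        }
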
index 0cbd80a..03d7d28 100644 (file)
@@ -84,7+84,7 @@ do_midi_msg(int synthno, unsigned char *msg, int mlen)
 
          case 0xE0:
                  STORE(SEQ_BENDER(synthno, msg[0] & 0x0f,
-                             (msg[1] % 0x7f) | ((msg[2] & 0x7f) << 7)));
+                             (msg[1] & 0x7f) | ((msg[2] & 0x7f) << 7)));
                  break;
 
          default:
index 0432bdd..7a7faed 100644 (file)
@@ -28,6+28,15 @@ Known bugs:
 
 Please direct bug reports to: hjw@zvw.de
 
+Version 3.10
+------------
+
+- Changed partition checker to allow devices
+  with physical blocks != 512 bytes.
+
+- The partition checker now also ignores the
+  word at 0xd0 that Windows likes to write to.
+
 Version 3.9
 -----------
 
index 8718b8b..2826286 100644 (file)
@@ -86,6+86,7 @@ void sysv_free_inode(struct inode * inode)
                return;
        }
        raw_inode = (struct sysv_inode *) bh->b_data + ((ino-1) & sb->sv_inodes_per_block_1);
+       clear_inode(inode);
        lock_super(sb);
        if (*sb->sv_sb_fic_count < sb->sv_fic_size)
                *sv_sb_fic_inode(sb,(*sb->sv_sb_fic_count)++) = ino;
@@ -97,7+98,6 @@ void sysv_free_inode(struct inode * inode)
        mark_buffer_dirty(bh, 1);
        unlock_super(sb);
        brelse(bh);
-       clear_inode(inode);
 }
 
 struct inode * sysv_new_inode(const struct inode * dir)
index d5af681..f585609 100644 (file)
@@ -55,10+55,8 @@ void sysv_print_inode(struct inode * inode)
 }
 #endif
 
-void sysv_put_inode(struct inode *inode)
+static void sysv_delete_inode(struct inode *inode)
 {
-       if (inode->i_nlink)
-               return;
        inode->i_size = 0;
        sysv_truncate(inode);
        sysv_free_inode(inode);
@@ -68,8+66,8 @@ void sysv_put_inode(struct inode *inode)
 static struct super_operations sysv_sops = {
        sysv_read_inode,
        sysv_write_inode,
-       sysv_put_inode,
-       NULL,                   /* delete_inode */
+       NULL,                   /* nothing special on put_inode() */
+       sysv_delete_inode,
        sysv_notify_change,
        sysv_put_super,
        sysv_write_super,
index 1ae3366..b7d2d5e 100644 (file)
@@ -576,6+576,7 @@ int sysv_link(struct dentry * old_dentry, struct inode * dir,
        oldinode->i_nlink++;
        oldinode->i_ctime = CURRENT_TIME;
        mark_inode_dirty(oldinode);
+       oldinode->i_count++;
         d_instantiate(dentry, oldinode);
        return 0;
 }
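
The one-line sysv_link() fix above adds the in-core reference that the new
dentry consumes: i_nlink counts on-disk directory entries, i_count counts
in-core users, and d_instantiate() hands the caller's inode reference to the
dentry. A sketch of the pairing (illustrative function name):

        static int link_refcount_sketch(struct inode *oldinode,
                                        struct dentry *dentry)
        {
                oldinode->i_nlink++;    /* one more on-disk directory entry */
                mark_inode_dirty(oldinode);
                oldinode->i_count++;    /* one more in-core user: the new dentry */
                d_instantiate(dentry, oldinode);
                return 0;
        }
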
index 90bb197..e552cfc 100644 (file)
@@ -1,4+1,4 @@
-/* $Id: checksum.h,v 1.28 1998/04/17 02:37:25 davem Exp $ */
+/* $Id: checksum.h,v 1.29 1999/03/21 05:22:07 davem Exp $ */
 #ifndef __SPARC_CHECKSUM_H
 #define __SPARC_CHECKSUM_H
 
@@ -117,7+117,10 @@ csum_partial_copy_to_user(const char *src, char *dst, int len,
                return ret;
        }
 }
-  
+
+#define HAVE_CSUM_COPY_USER
+#define csum_and_copy_to_user csum_partial_copy_to_user
+
 /* ihl is always 5 or greater, almost always is 5, and iph is word aligned
  * the majority of the time.
  */
index 43d3c7c..e20b151 100644 (file)
@@ -99,6+99,7 @@ static int FDC2=-1;
 /* Routines unique to each controller type on a Sun. */
 static unsigned char sun_82072_fd_inb(int port)
 {
+       udelay(5);
        switch(port & 7) {
        default:
                printk("floppy: Asked to read unknown port %d\n", port);
@@ -115,6+116,7 @@ static unsigned char sun_82072_fd_inb(int port)
 
 static void sun_82072_fd_outb(unsigned char value, int port)
 {
+       udelay(5);
        switch(port & 7) {
        default:
                printk("floppy: Asked to write to unknown port %d\n", port);
@@ -150,6+152,7 @@ static void sun_82072_fd_outb(unsigned char value, int port)
 
 static unsigned char sun_82077_fd_inb(int port)
 {
+       udelay(5);
        switch(port & 7) {
        default:
                printk("floppy: Asked to read unknown port %d\n", port);
@@ -167,6+170,7 @@ static unsigned char sun_82077_fd_inb(int port)
 
 static void sun_82077_fd_outb(unsigned char value, int port)
 {
+       udelay(5);
        switch(port & 7) {
        default:
                printk("floppy: Asked to write to unknown port %d\n", port);
index d73b71f..c833a2a 100644 (file)
@@ -22,11+22,9 @@ BTFIXUPDEF_CALL(void, destroy_context, struct mm_struct *)
 
 #define destroy_context(mm) BTFIXUP_CALL(destroy_context)(mm)
 
-/*
- * After we have set current->mm to a new value, this activates
+/* After we have set current->mm to a new value, this activates
  * the context for the new mm so we see the new mappings.
- * XXX this presumably needs a sensible implementation - paulus.
  */
-#define activate_context(tsk)  do { } while(0)
+#define activate_context(tsk)  switch_to_context(tsk)
 
 #endif /* !(__SPARC_MMU_CONTEXT_H) */
index a907a0c..0d3f8b5 100644 (file)
@@ -1,4+1,4 @@
-/* $Id: floppy.h,v 1.17 1998/12/02 12:42:23 davem Exp $
+/* $Id: floppy.h,v 1.18 1999/03/21 10:51:38 davem Exp $
  * asm-sparc64/floppy.h: Sparc specific parts of the Floppy driver.
  *
  * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
@@ -114,6+114,7 @@ static int FDC2 =           -1;
 
 static unsigned char sun_82077_fd_inb(unsigned long port)
 {
+       udelay(5);
        switch(port & 7) {
        default:
                printk("floppy: Asked to read unknown port %lx\n", port);
@@ -131,6+132,7 @@ static unsigned char sun_82077_fd_inb(unsigned long port)
 
 static void sun_82077_fd_outb(unsigned char value, unsigned long port)
 {
+       udelay(5);
        switch(port & 7) {
        default:
                printk("floppy: Asked to write to unknown port %lx\n", port);
index 3b7ac70..a8c028f 100644 (file)
@@ -343,14+343,15 @@ struct cdrom_blk
 #define CDC_IOCTLS              0x400   /* driver has non-standard ioctls */
 #define CDC_DRIVE_STATUS        0x800   /* driver implements drive status */
 
-/* drive status possibilities used with the uniform CD-ROM driver */
-#define CDS_NO_INFO            0               /* if not implemented */
+/* drive status possibilities returned by CDROM_DRIVE_STATUS ioctl */
+#define CDS_NO_INFO            0       /* if not implemented */
 #define CDS_NO_DISC            1
 #define CDS_TRAY_OPEN          2
 #define CDS_DRIVE_NOT_READY    3
 #define CDS_DISC_OK            4
 
-/* disc status possibilities, other than CDS_NO_DISC and CDS_NO_INFO */
+/* return values for the CDROM_DISC_STATUS ioctl */
+/* can also return CDS_NO_[INFO|DISC], from above */
 #define CDS_AUDIO              100
 #define CDS_DATA_1             101
 #define CDS_DATA_2             102
index 0dc2231..8d6c8b5 100644 (file)
@@ -80,7+80,6 @@ struct in_addr {
 /* These need to appear somewhere around here */
 #define IP_DEFAULT_MULTICAST_TTL        1
 #define IP_DEFAULT_MULTICAST_LOOP       1
-#define IP_MAX_MEMBERSHIPS              20
 
 /* Request struct for multicast socket ops */
 
index 41f6405..18e7fde 100644 (file)
 #define MAX_HEADER (LL_MAX_HEADER + 48)
 #endif
 
-struct neighbour;
-struct neigh_parms;
-struct sk_buff;
-
-/*
- *     We tag multicasts with these structures.
- */
-struct dev_mc_list
-{      
-       struct dev_mc_list      *next;
-       __u8                    dmi_addr[MAX_ADDR_LEN];
-       unsigned char           dmi_addrlen;
-       int                     dmi_users;
-       int                     dmi_gusers;
-};
-
-struct hh_cache
-{
-       struct hh_cache *hh_next;       /* Next entry                        */
-       atomic_t        hh_refcnt;      /* number of users                   */
-       unsigned short  hh_type;        /* protocol identifier, f.e ETH_P_IP */
-       int             (*hh_output)(struct sk_buff *skb);
-       /* cached hardware header; allow for machine alignment needs.        */
-       unsigned long   hh_data[16/sizeof(unsigned long)];
-};
-
 /*
  *     Network device statistics. Akin to the 2.0 ether stats but
  *     with byte counters.
@@ -157,6+130,35 @@ extern const char *if_port_text[];
 
 #include <linux/skbuff.h>
 
+struct neighbour;
+struct neigh_parms;
+struct sk_buff;
+
+/*
+ *     We tag multicasts with these structures.
+ */
+struct dev_mc_list
+{      
+       struct dev_mc_list      *next;
+       __u8                    dmi_addr[MAX_ADDR_LEN];
+       unsigned char           dmi_addrlen;
+       int                     dmi_users;
+       int                     dmi_gusers;
+};
+
+struct hh_cache
+{
+       struct hh_cache *hh_next;       /* Next entry                        */
+       atomic_t        hh_refcnt;      /* number of users                   */
+       unsigned short  hh_type;        /* protocol identifier, f.e ETH_P_IP */
+       int             (*hh_output)(struct sk_buff *skb);
+       rwlock_t        hh_lock;
+       /* cached hardware header; allow for machine alignment needs.        */
+       unsigned long   hh_data[16/sizeof(unsigned long)];
+};
+
+
 /*
  *     The DEVICE structure.
  *     Actually, this whole structure is a big mistake.  It mixes I/O
@@ -432,6+434,7 @@ extern int          dev_mc_add(struct device *dev, void *addr, int alen, int newonly);
 extern void            dev_mc_discard(struct device *dev);
 extern void            dev_set_promiscuity(struct device *dev, int inc);
 extern void            dev_set_allmulti(struct device *dev, int inc);
+extern void            netdev_state_change(struct device *dev);
 /* Load a device via the kmod */
 extern void            dev_load(const char *name);
 extern void            dev_mcast_init(void);
index 0c38cac..36935ed 100644 (file)
@@ -23,9+23,12 @@ enum
        TCA_POLICE_TBF,
        TCA_POLICE_RATE,
        TCA_POLICE_PEAKRATE,
+       TCA_POLICE_AVRATE,
+       TCA_POLICE_RESULT
+#define TCA_POLICE_RESULT TCA_POLICE_RESULT
 };
 
-#define TCA_POLICE_MAX TCA_POLICE_PEAKRATE
+#define TCA_POLICE_MAX TCA_POLICE_RESULT
 
 /* U32 filters */
 
@@ -114,4+117,30 @@ struct tc_rsvp_pinfo
        __u8    tunnelhdr;
 };
 
+/* ROUTE filter */
+
+enum
+{
+       TCA_ROUTE4_UNSPEC,
+       TCA_ROUTE4_CLASSID,
+       TCA_ROUTE4_TO,
+       TCA_ROUTE4_FROM,
+       TCA_ROUTE4_IIF,
+       TCA_ROUTE4_POLICE,
+};
+
+#define TCA_ROUTE4_MAX TCA_ROUTE4_POLICE
+
+
+/* FW filter */
+
+enum
+{
+       TCA_FW_UNSPEC,
+       TCA_FW_CLASSID,
+       TCA_FW_POLICE,
+};
+
+#define TCA_FW_MAX TCA_FW_POLICE
+
 #endif
index be60739..b339f65 100644 (file)
@@ -139,6+139,7 @@ enum
 #define RTPROT_RA      9       /* RDISC/ND router advertisments */
 #define RTPROT_MRT     10      /* Merit MRT */
 #define RTPROT_ZEBRA   11      /* Zebra */
+#define RTPROT_BIRD    12      /* BIRD */
 
 /* rtm_scope
 
index 413864d..9b615ac 100644 (file)
@@ -161,7+161,8 @@ enum
 enum
 {
        NET_UNIX_DESTROY_DELAY=1,
-       NET_UNIX_DELETE_DELAY=2
+       NET_UNIX_DELETE_DELAY=2,
+       NET_UNIX_MAX_DGRAM_QLEN=3,
 };
 
 /* /proc/sys/net/ipv4 */
@@ -225,7+226,8 @@ enum {
        NET_IPV4_ROUTE_REDIRECT_SILENCE=11,
        NET_IPV4_ROUTE_ERROR_COST=12,
        NET_IPV4_ROUTE_ERROR_BURST=13,
-       NET_IPV4_ROUTE_GC_ELASTICITY=14
+       NET_IPV4_ROUTE_GC_ELASTICITY=14,
+       NET_IPV4_ROUTE_MTU_EXPIRES=15
 };
 
 enum
@@ -265,7+267,8 @@ enum {
        NET_IPV6_ROUTE_GC_MIN_INTERVAL=4,
        NET_IPV6_ROUTE_GC_TIMEOUT=5,
        NET_IPV6_ROUTE_GC_INTERVAL=6,
-       NET_IPV6_ROUTE_GC_ELASTICITY=7
+       NET_IPV6_ROUTE_GC_ELASTICITY=7,
+       NET_IPV6_ROUTE_MTU_EXPIRES=8
 };
 
 enum {
index 547e5c2..7c80168 100644 (file)
@@ -398,7+398,6 @@ extern void sysv_write_super(struct super_block *);
 extern void sysv_read_inode(struct inode *);
 extern int sysv_notify_change(struct dentry *, struct iattr *);
 extern void sysv_write_inode(struct inode *);
-extern void sysv_put_inode(struct inode *);
 extern int sysv_statfs(struct super_block *, struct statfs *, int);
 extern int sysv_sync_inode(struct inode *);
 extern int sysv_sync_file(struct file *, struct dentry *);
index 44fb44d..d711d0d 100644 (file)
@@ -53,8+53,9 @@ extern int                    addrconf_set_dstaddr(void *arg);
 
 extern struct inet6_ifaddr *   ipv6_chk_addr(struct in6_addr *addr,
                                              struct device *dev, int nd);
-extern struct inet6_ifaddr *   ipv6_get_saddr(struct dst_entry *dst, 
-                                              struct in6_addr *daddr);
+extern int                     ipv6_get_saddr(struct dst_entry *dst, 
+                                              struct in6_addr *daddr,
+                                              struct in6_addr *saddr);
 extern struct inet6_ifaddr *   ipv6_get_lladdr(struct device *dev);
 
 /*
index 50b3373..896a517 100644 (file)
@@ -36,9+36,10 @@ struct dst_entry
        struct device           *dev;
        int                     obsolete;
        unsigned long           lastuse;
+       unsigned long           expires;
        unsigned                mxlock;
-       unsigned                window;
        unsigned                pmtu;
+       unsigned                window;
        unsigned                rtt;
        unsigned long           rate_last;      /* rate limiting for ICMP */
        unsigned long           rate_tokens;
@@ -98,6+99,19 @@ void dst_release(struct dst_entry * dst)
                atomic_dec(&dst->use);
 }
 
+/* The following primitive should be used if and only if the
+   destination entry has just been removed from a location
+   accessed directly by hard irq.
+ */
+extern __inline__
+void dst_release_irqwait(struct dst_entry * dst)
+{
+       if (dst) {
+               synchronize_irq();
+               atomic_dec(&dst->use);
+       }
+}
+
 extern __inline__
 struct dst_entry * dst_check(struct dst_entry ** dst_p, u32 cookie)
 {
@@ -152,6+166,17 @@ extern __inline__ void dst_link_failure(struct sk_buff *skb)
        if (dst && dst->ops && dst->ops->link_failure)
                dst->ops->link_failure(skb);
 }
+
+extern __inline__ void dst_set_expires(struct dst_entry *dst, int timeout)
+{
+       unsigned long expires = jiffies + timeout;
+
+       if (expires == 0)
+               expires = 1;
+
+       if (dst->expires == 0 || (long)(dst->expires - expires) > 0)
+               dst->expires = expires;
+}
 #endif
 
 #endif /* _NET_DST_H */
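
dst_set_expires() above keeps the earliest deadline using the signed-difference
idiom, which stays correct even when jiffies wraps around. A minimal sketch of
the idiom (illustrative name):

        static inline int deadline_after_sketch(unsigned long a, unsigned long b)
        {
                /* true if a is later than b, modulo wraparound */
                return (long)(a - b) > 0;
        }
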
index 5268603..5e80fb4 100644 (file)
@@ -147,13+147,14 @@ extern __inline__ int ip_finish_output(struct sk_buff *skb)
        skb->protocol = __constant_htons(ETH_P_IP);
 
        if (hh) {
+               read_lock_irq(&hh->hh_lock);
                memcpy(skb->data - 16, hh->hh_data, 16);
+               read_unlock_irq(&hh->hh_lock);
                skb_push(skb, dev->hard_header_len);
                return hh->hh_output(skb);
        } else if (dst->neighbour)
                return dst->neighbour->output(skb);
 
-       printk(KERN_DEBUG "khm\n");
        kfree_skb(skb);
        return -EINVAL;
 }
index 905876d..efd652f 100644 (file)
@@ -59,6+59,7 @@ struct rt6_info
 
 #define rt6i_dev                       u.dst.dev
 #define rt6i_nexthop                   u.dst.neighbour
+#define rt6i_expires                   u.dst.expires
 
        struct fib6_node                *rt6i_node;
 
@@ -67,7+68,6 @@ struct rt6_info
        u32                             rt6i_flags;
        u32                             rt6i_metric;
        u8                              rt6i_hoplimit;
-       unsigned long                   rt6i_expires;
        atomic_t                        rt6i_ref;
 
        union {
index 5c8d6bf..6e1fd86 100644 (file)
@@ -127,6+127,8 @@ struct fib_table
        int             (*tb_flush)(struct fib_table *table);
        int             (*tb_get_info)(struct fib_table *table, char *buf,
                                       int first, int count);
+       void            (*tb_select_default)(struct fib_table *table,
+                                            const struct rt_key *key, struct fib_result *res);
 
        unsigned char   tb_data[0];
 };
@@ -156,6+158,12 @@ extern __inline__ int fib_lookup(const struct rt_key *key, struct fib_result *re
        return 0;
 }
 
+extern __inline__ void fib_select_default(const struct rt_key *key, struct fib_result *res)
+{
+       if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
+               main_table->tb_select_default(main_table, key, res);
+}
+
 #else /* CONFIG_IP_MULTIPLE_TABLES */
 #define local_table (fib_tables[RT_TABLE_LOCAL])
 #define main_table (fib_tables[RT_TABLE_MAIN])
@@ -179,6+187,9 @@ extern __inline__ struct fib_table *fib_new_table(int id)
 
        return fib_tables[id] ? : __fib_new_table(id);
 }
+
+extern void fib_select_default(const struct rt_key *key, struct fib_result *res);
+
 #endif /* CONFIG_IP_MULTIPLE_TABLES */
 
 /* Exported by fib_frontend.c */
@@ -189,7+200,7 @@ extern int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *ar
 extern int inet_rtm_getroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg);
 extern int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb);
 extern int fib_validate_source(u32 src, u32 dst, u8 tos, int oif,
-                              struct device *dev, u32 *spec_dst);
+                              struct device *dev, u32 *spec_dst, u32 *itag);
 extern void fib_select_multipath(const struct rt_key *key, struct fib_result *res);
 
 /* Exported by fib_semantics.c */
@@ -227,4+238,20 @@ extern u32 fib_rules_policy(u32 saddr, struct fib_result *res, unsigned *flags);
 extern void fib_rules_init(void);
 #endif
 
+extern __inline__ void fib_combine_itag(u32 *itag, struct fib_result *res)
+{
+#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+       u32 rtag;
+#endif
+       *itag = FIB_RES_NH(*res).nh_tclassid<<16;
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+       rtag = fib_rules_tclass(res);
+       if (*itag == 0)
+               *itag = (rtag<<16);
+       *itag |= (rtag>>16);
+#endif
+#endif
+}
+
 #endif  _NET_FIB_H
index 118eec2..bef91b1 100644 (file)
@@ -4,7+4,7 @@
  *     Authors:
  *     Pedro Roque             <roque@di.fc.ul.pt>
  *
- *     $Id: ipv6.h,v 1.14 1998/10/03 09:36:45 davem Exp $
+ *     $Id: ipv6.h,v 1.15 1999/03/21 05:22:16 davem Exp $
  *
  *     This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
@@ -186,14+186,6 @@ extern __inline__ int ipv6_addr_any(struct in6_addr *a)
                 a->s6_addr32[2] | a->s6_addr32[3] ) == 0); 
 }
 
-extern __inline__ int gfp_any(void)
-{
-       int pri = GFP_KERNEL;
-       if (in_interrupt())
-               pri = GFP_ATOMIC;
-       return pri;
-}
-
 /*
  *     Prototypes exported by ipv6
  */
index 76dd6f4..5c5d90b 100644 (file)
@@ -226,7+226,6 @@ extern __inline__ void neigh_confirm(struct neighbour *neigh)
                neigh->confirmed = jiffies;
 }
 
-
 extern __inline__ struct neighbour *
 neigh_lookup(struct neigh_table *tbl, const void *pkey, struct device *dev)
 {
@@ -258,6+257,7 @@ extern __inline__ int neigh_event_send(struct neighbour *neigh, struct sk_buff *
 extern __inline__ void neigh_table_lock(struct neigh_table *tbl)
 {
        atomic_inc(&tbl->lock);
+       synchronize_bh();
 }
 
 extern __inline__ void neigh_table_unlock(struct neigh_table *tbl)
index 0d3c25e..a906fc9 100644 (file)
@@ -50,7+50,7 @@ struct tcf_proto_ops
 
        unsigned long           (*get)(struct tcf_proto*, u32 handle);
        void                    (*put)(struct tcf_proto*, unsigned long);
-       int                     (*change)(struct tcf_proto*, u32 handle, struct rtattr **, unsigned long *);
+       int                     (*change)(struct tcf_proto*, unsigned long, u32 handle, struct rtattr **, unsigned long *);
        int                     (*delete)(struct tcf_proto*, unsigned long);
        void                    (*walk)(struct tcf_proto*, struct tcf_walker *arg);
 
@@ -77,6+77,14 @@ extern __inline__ int tc_classify(struct sk_buff *skb, struct tcf_proto *tp, str
        return -1;
 }
 
+extern __inline__ unsigned long cls_set_class(unsigned long *clp, unsigned long cl)
+{
+       net_serialize_enter();
+       cl = xchg(clp, cl);
+       net_serialize_leave();
+       return cl;
+}
+
 extern int register_tcf_proto_ops(struct tcf_proto_ops *ops);
 extern int unregister_tcf_proto_ops(struct tcf_proto_ops *ops);
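
cls_set_class() above swaps a classifier binding atomically while bh
processing is excluded. A sketch of the intended usage when rebinding a
filter result; the field and helper names are assumed from the 2.2 classifier
code, not guaranteed by this header:

        static void rebind_sketch(struct tcf_proto *tp, struct tcf_result *res,
                                  unsigned long new_cl)
        {
                unsigned long old = cls_set_class(&res->class, new_cl);
                if (old)
                        tp->q->ops->cl_ops->unbind_tcf(tp->q, old);
        }
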
 
index 142f7b3..ba23d7a 100644 (file)
@@ -5,7+5,7 @@
 #define PSCHED_JIFFIES                 2
 #define PSCHED_CPU             3
 
-#define PSCHED_CLOCK_SOURCE    PSCHED_GETTIMEOFDAY
+#define PSCHED_CLOCK_SOURCE    PSCHED_JIFFIES
 
 #include <linux/pkt_sched.h>
 #include <net/pkt_cls.h>
@@ -25,6+25,7 @@ struct Qdisc_class_ops
 {
        /* Child qdisc manipulation */
        int                     (*graft)(struct Qdisc *, unsigned long cl, struct Qdisc *, struct Qdisc **);
+       struct Qdisc *          (*leaf)(struct Qdisc *, unsigned long cl);
 
        /* Class manipulation routines */
        unsigned long           (*get)(struct Qdisc *, u32 classid);
@@ -35,7+36,7 @@ struct Qdisc_class_ops
 
        /* Filter manipulation */
        struct tcf_proto **     (*tcf_chain)(struct Qdisc *, unsigned long);
-       unsigned long           (*bind_tcf)(struct Qdisc *, u32 classid);
+       unsigned long           (*bind_tcf)(struct Qdisc *, unsigned long, u32 classid);
        void                    (*unbind_tcf)(struct Qdisc *, unsigned long);
 
        /* rtnetlink specific */
@@ -57,6+58,7 @@ struct Qdisc_ops
        int                     (*init)(struct Qdisc *, struct rtattr *arg);
        void                    (*reset)(struct Qdisc *);
        void                    (*destroy)(struct Qdisc *);
+       int                     (*change)(struct Qdisc *, struct rtattr *arg);
 
        int                     (*dump)(struct Qdisc *, struct sk_buff *);
 };
@@ -74,13+76,12 @@ struct Qdisc
        int                     (*enqueue)(struct sk_buff *skb, struct Qdisc *dev);
        struct sk_buff *        (*dequeue)(struct Qdisc *dev);
        unsigned                flags;
-#define TCQ_F_DEFAULT  1
-#define TCQ_F_BUILTIN  2
+#define TCQ_F_BUILTIN  1
+#define TCQ_F_THROTTLED        2
        struct Qdisc_ops        *ops;
        struct Qdisc            *next;
        u32                     handle;
-       u32                     classid;
-       struct Qdisc            *parent;
+       atomic_t                refcnt;
        struct sk_buff_head     q;
        struct device           *dev;
 
@@ -89,6+90,11 @@ struct Qdisc
        unsigned long           tx_last;
        int                     (*reshape_fail)(struct sk_buff *skb, struct Qdisc *q);
 
+       /* This field is deprecated, but it is still used by CBQ
+        * and will live until a better solution is invented.
+        */
+       struct Qdisc            *__parent;
+
        char                    data[0];
 };
 
@@ -129,6+135,15 @@ struct qdisc_rate_table
    which have fast and precise clock source, but it is too expensive.
  */
 
+/* General note about the internal clock.
+
+   Any clock source returns time intervals measured in units
+   close to 1usec. With source PSCHED_GETTIMEOFDAY it is precisely
+   microseconds; otherwise it is something close but different,
+   chosen to minimize arithmetic cost. The ratio of usec to
+   internal units, as a numerator/denominator pair, may be read
+   from /proc/net/psched.
+ */
+
 
 #if PSCHED_CLOCK_SOURCE == PSCHED_GETTIMEOFDAY
 
@@ -138,8+153,12 @@ typedef long               psched_tdiff_t;
 #define PSCHED_GET_TIME(stamp) do_gettimeofday(&(stamp))
 #define PSCHED_US2JIFFIE(usecs) (((usecs)+(1000000/HZ-1))/(1000000/HZ))
 
+#define PSCHED_EXPORTLIST EXPORT_SYMBOL(psched_tod_diff);
+
 #else /* PSCHED_CLOCK_SOURCE != PSCHED_GETTIMEOFDAY */
 
+#define PSCHED_EXPORTLIST PSCHED_EXPORTLIST_1 PSCHED_EXPORTLIST_2
+
 typedef u64    psched_time_t;
 typedef long   psched_tdiff_t;
 
@@ -147,10+166,6 @@ extern psched_time_t       psched_time_base;
 
 #if PSCHED_CLOCK_SOURCE == PSCHED_JIFFIES
 
-#define PSCHED_WATCHER unsigned long
-
-extern PSCHED_WATCHER psched_time_mark;
-
 #if HZ == 100
 #define PSCHED_JSCALE 13
 #elif HZ == 1024
@@ -159,22+174,45 @@ extern PSCHED_WATCHER psched_time_mark;
 #define PSCHED_JSCALE 0
 #endif
 
+#define PSCHED_EXPORTLIST_2
+
+#if ~0UL == 0xFFFFFFFF
+
+#define PSCHED_WATCHER unsigned long
+
+extern PSCHED_WATCHER psched_time_mark;
+
 #define PSCHED_GET_TIME(stamp) ((stamp) = psched_time_base + (((unsigned long)(jiffies-psched_time_mark))<<PSCHED_JSCALE))
-#define PSCHED_US2JIFFIE(delay) ((delay)>>PSCHED_JSCALE)
+
+#define PSCHED_EXPORTLIST_1 EXPORT_SYMBOL(psched_time_base); \
+                            EXPORT_SYMBOL(psched_time_mark);
+
+#else
+
+#define PSCHED_GET_TIME(stamp) ((stamp) = (jiffies<<PSCHED_JSCALE))
+
+#define PSCHED_EXPORTLIST_1 
+
+#endif
+
+#define PSCHED_US2JIFFIE(delay) (((delay)+(1<<PSCHED_JSCALE)-1)>>PSCHED_JSCALE)
 
 #elif PSCHED_CLOCK_SOURCE == PSCHED_CPU
 
 extern psched_tdiff_t psched_clock_per_hz;
 extern int psched_clock_scale;
 
+#define PSCHED_EXPORTLIST_2 EXPORT_SYMBOL(psched_clock_per_hz); \
+                            EXPORT_SYMBOL(psched_clock_scale);
+
 #define PSCHED_US2JIFFIE(delay) (((delay)+psched_clock_per_hz-1)/psched_clock_per_hz)
 
 #if CPU == 586 || CPU == 686
 
 #define PSCHED_GET_TIME(stamp) \
-({ u32 hi, lo; \
-   __asm__ __volatile__ (".byte 0x0f,0x31" :"=a" (lo), "=d" (hi)); \
-   (stamp) = ((((u64)hi)<<32) + lo)>>psched_clock_scale; \
+({ u64 __cur; \
+   __asm__ __volatile__ (".byte 0x0f,0x31" :"=A" (__cur)); \
+   (stamp) = __cur>>psched_clock_scale; \
 })
 
 #elif defined (__alpha__)
@@ -191,6+229,9 @@ extern PSCHED_WATCHER psched_time_mark;
    (stamp) = (psched_time_base + __res)>>psched_clock_scale; \
 })
 
+#define PSCHED_EXPORTLIST_1 EXPORT_SYMBOL(psched_time_base); \
+                            EXPORT_SYMBOL(psched_time_mark);
+
 #else
 
 #error PSCHED_CLOCK_SOURCE=PSCHED_CPU is not supported on this arch.
@@ -219,13+260,15 @@ extern PSCHED_WATCHER psched_time_mark;
           __delta; \
 })
 
+extern int psched_tod_diff(int delta_sec, int bound);
+
 #define PSCHED_TDIFF_SAFE(tv1, tv2, bound, guard) \
 ({ \
           int __delta_sec = (tv1).tv_sec - (tv2).tv_sec; \
           int __delta = (tv1).tv_usec - (tv2).tv_usec; \
           switch (__delta_sec) { \
           default: \
-                  __delta = (bound); guard; break; \
+                  __delta = psched_tod_diff(__delta_sec, bound); guard; break; \
           case 2: \
                   __delta += 1000000; \
           case 1: \
@@ -290,6+333,8 @@ struct tcf_police
        u32             index;
 
        int             action;
+       int             result;
+       u32             ewma_rate;
        u32             burst;
        u32             mtu;
 
@@ -298,10+343,12 @@ struct tcf_police
        psched_time_t   t_c;
        struct qdisc_rate_table *R_tab;
        struct qdisc_rate_table *P_tab;
+
+       struct tc_stats stats;
 };
 
 extern void tcf_police_destroy(struct tcf_police *p);
-extern struct tcf_police * tcf_police_locate(struct rtattr *rta);
+extern struct tcf_police * tcf_police_locate(struct rtattr *rta, struct rtattr *est);
 extern int tcf_police_dump(struct sk_buff *skb, struct tcf_police *p);
 extern int tcf_police(struct sk_buff *skb, struct tcf_police *p);
 
@@ -327,7+374,6 @@ void dev_deactivate(struct device *dev);
 void qdisc_reset(struct Qdisc *qdisc);
 void qdisc_destroy(struct Qdisc *qdisc);
 struct Qdisc * qdisc_create_dflt(struct device *dev, struct Qdisc_ops *ops);
-struct Qdisc * dev_set_scheduler(struct device *dev, struct Qdisc *qdisc);
 int qdisc_new_estimator(struct tc_stats *stats, struct rtattr *opt);
 void qdisc_kill_estimator(struct tc_stats *stats);
 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab);
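
The PSCHED_GET_TIME change for 586/686 above replaces the two separate 32-bit
outputs with the "=A" constraint, which on ia32 binds a 64-bit value to the
edx:eax register pair, exactly where rdtsc (opcode 0x0f,0x31) leaves its
result. A standalone sketch for an ia32 target (illustrative name):

        static inline unsigned long long rdtsc_sketch(void)
        {
                unsigned long long t;
                __asm__ __volatile__ (".byte 0x0f,0x31" : "=A" (t));
                return t;
        }
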
index bd76d03..e5b0344 100644 (file)
@@ -57,6+57,9 @@ struct rt_key
        __u32                   src;
        int                     iif;
        int                     oif;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+       __u32                   fwmark;
+#endif
        __u8                    tos;
        __u8                    scope;
 };
@@ -93,6+96,16 @@ struct rtable
 
 extern struct rtable   *rt_hash_table[RT_HASH_DIVISOR];
 
+struct ip_rt_acct
+{
+       __u32   o_bytes;
+       __u32   o_packets;
+       __u32   i_bytes;
+       __u32   i_packets;
+};
+
+extern struct ip_rt_acct ip_rt_acct[256];
+
 extern void            ip_rt_init(void);
 extern void            ip_rt_redirect(u32 old_gw, u32 dst, u32 new_gw,
                                       u32 src, u8 tos, struct device *dev);
index e46f2bc..834ca8a 100644 (file)
@@ -918,6+918,18 @@ extern void net_delete_timer (struct sock *);
 extern void net_reset_timer (struct sock *, int, unsigned long);
 extern void net_timer (unsigned long);
 
+extern __inline__ int gfp_any(void)
+{
+       return in_interrupt() ? GFP_ATOMIC : GFP_KERNEL;
+}
+
+#ifdef __SMP__
+#define net_serialize_enter()  start_bh_atomic()
+#define net_serialize_leave()  end_bh_atomic()
+#else
+#define net_serialize_enter()  barrier()
+#define net_serialize_leave()  barrier()
+#endif
 
 /* 
  *     Enable debug/info messages 
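
Taken together, the additions above support a recurring pattern in this
patch: gfp_any() picks GFP_ATOMIC when the caller may be in interrupt
context, and net_serialize_enter()/net_serialize_leave() bracket a
single-pointer list unlink so that bottom halves never observe a
half-updated chain (on SMP they map to start/end_bh_atomic(); on UP a
compiler barrier suffices). A minimal sketch of the unlink side, with a
hypothetical struct foo:

	struct foo {
		struct foo *next;
	};

	static void foo_unlink(struct foo **head, struct foo *victim)
	{
		struct foo **fp;

		for (fp = head; *fp; fp = &(*fp)->next) {
			if (*fp == victim) {
				net_serialize_enter();	/* bh-atomic on SMP */
				*fp = victim->next;
				net_serialize_leave();
				break;
			}
		}
	}
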
index 1136628..65f12e6 100644 (file)
@@ -265,7+265,6 @@ printk("SIG queue (%s:%d): %d ", t->comm, t->pid, sig);
            && ((sig != SIGCONT) || (current->session != t->session))
            && (current->euid ^ t->suid) && (current->euid ^ t->uid)
            && (current->uid ^ t->suid) && (current->uid ^ t->uid)
-           && (cap_issubset(t->cap_permitted, current->cap_permitted))
            && !capable(CAP_SYS_ADMIN))
                goto out_nolock;
 
index 3708ca2..d184c94 100644 (file)
@@ -62,9+62,9 @@ asmlinkage void do_bottom_half(void)
                if (hardirq_trylock(cpu)) {
                        __sti();
                        run_bottom_halves();
+                       __cli();
                        hardirq_endlock(cpu);
                }
-               __cli();
                softirq_endlock(cpu);
        }
 }
index 9517843..53e5352 100644 (file)
@@ -232,7+232,9 @@ void dev_remove_pack(struct packet_type *pt)
        {
                if(pt==(*pt1))
                {
+                       net_serialize_enter();
                        *pt1=pt->next;
+                       net_serialize_leave();
 #ifdef CONFIG_NET_FASTROUTE
                        if (pt->data)
                                netdev_fastroute_obstacles--;
@@ -328,6+330,12 @@ struct device *dev_alloc(const char *name, int *err)
        return dev;
 }
 
+void netdev_state_change(struct device *dev)
+{
+       if (dev->flags&IFF_UP)
+               notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
+}
+
 
 /*
  *     Find and possibly load an interface.
@@ -422,7+430,7 @@ static __inline__ void dev_do_clear_fastroute(struct device *dev)
                int i;
 
                for (i=0; i<=NETDEV_FASTROUTE_HMASK; i++)
-                       dst_release(xchg(dev->fastpath+i, NULL));
+                       dst_release_irqwait(xchg(dev->fastpath+i, NULL));
        }
 }
 
@@ -895,22+903,6 @@ void net_bh(void)
 #endif
 
                /*
-                *      Fetch the packet protocol ID. 
-                */
-               
-               type = skb->protocol;
-
-               
-#ifdef CONFIG_BRIDGE
-               /*
-                *      If we are bridging then pass the frame up to the
-                *      bridging code (if this protocol is to be bridged).
-                *      If it is bridged then move on
-                */
-               handle_bridge(skb, type); 
-#endif
-               
-               /*
                 *      Bump the pointer to the next structure.
                 * 
                 *      On entry to the protocol layer. skb->data and
@@ -927,11+919,26 @@ void net_bh(void)
                }
 
                /*
+                *      Fetch the packet protocol ID. 
+                */
+
+               type = skb->protocol;
+
+#ifdef CONFIG_BRIDGE
+               /*
+                *      If we are bridging then pass the frame up to the
+                *      bridging code (if this protocol is to be bridged).
+                *      If it is bridged then move on
+                */
+               handle_bridge(skb, type); 
+#endif
+
+               /*
                 *      We got a packet ID.  Now loop over the "known protocols"
                 *      list. There are two lists. The ptype_all list of taps (normally empty)
                 *      and the main protocol list which is hashed perfectly for normal protocols.
                 */
-               
+
                pt_prev = NULL;
                for (ptype = ptype_all; ptype!=NULL; ptype=ptype->next)
                {
@@ -1536,8+1543,7 @@ static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
                        return 0;
 
                case SIOCSIFTXQLEN:
-                       /* Why <2? 0 and 1 are valid values. --ANK (980807) */
-                       if(/*ifr->ifr_qlen<2 ||*/ ifr->ifr_qlen>1024)
+                       if(ifr->ifr_qlen<0)
                                return -EINVAL;
                        dev->tx_queue_len = ifr->ifr_qlen;
                        return 0;
@@ -1817,8+1823,11 @@ int unregister_netdevice(struct device *dev)
        /* And unlink it from device chain. */
        for (dp = &dev_base; (d=*dp) != NULL; dp=&d->next) {
                if (d == dev) {
+                       net_serialize_enter();
                        *dp = d->next;
+                       net_serialize_leave();
                        d->next = NULL;
+
                        if (dev->destructor)
                                dev->destructor(dev);
                        return 0;
@@ -1977,7+1986,9 @@ __initfunc(int net_dev_init(void))
                        /*
                         *      It failed to come up. Unhook it.
                         */
+                       net_serialize_enter();
                        *dp = dev->next;
+                       net_serialize_leave();
                } 
                else
                {
index a724497..bce3f4a 100644 (file)
  *     protocols without doing damage to the protocols when it deletes the
  *     entries. It also helps IP as it tracks overlapping maps.
  *
- *     BUGGGG! IPv6 calls dev_mac_add/delete from BH, it means
- *     that all the functions in this file are racy. [NOT FIXED] --ANK
+ *     Device mc lists are changed by bh at least if IPv6 is enabled,
+ *     so they must be bh protected.
  */
 
 /*
  *     Update the multicast list into the physical NIC controller.
@@ -77,11+76,13 @@ void dev_mc_upload(struct device *dev)
        /*
         *      Devices with no set multicast don't get set 
         */
-        
+
        if(dev->set_multicast_list==NULL)
                return;
-               
+
+       start_bh_atomic();
        dev->set_multicast_list(dev);
+       end_bh_atomic();
 }
   
 /*
@@ -90,8+91,10 @@ void dev_mc_upload(struct device *dev)
  
 int dev_mc_delete(struct device *dev, void *addr, int alen, int glbl)
 {
+       int err = 0;
        struct dev_mc_list *dmi, **dmip;
 
+       start_bh_atomic();
        for (dmip=&dev->mc_list; (dmi=*dmip)!=NULL; dmip=&dmi->next) {
                /*
                 *      Find the entry we want to delete. The device could
@@ -102,10+105,10 @@ int dev_mc_delete(struct device *dev, void *addr, int alen, int glbl)
                                int old_glbl = dmi->dmi_gusers;
                                dmi->dmi_gusers = 0;
                                if (old_glbl == 0)
-                                       return -ENOENT;
+                                       break;
                        }
                        if(--dmi->dmi_users)
-                               return 0;
+                               goto done;
 
                        /*
                         *      Last user. So delete the entry.
@@ -117,11+120,15 @@ int dev_mc_delete(struct device *dev, void *addr, int alen, int glbl)
                         *      We have altered the list, so the card
                         *      loaded filter is now wrong. Fix it
                         */
+                       end_bh_atomic();
                        dev_mc_upload(dev);
                        return 0;
                }
        }
-       return -ENOENT;
+       err = -ENOENT;
+done:
+       end_bh_atomic();
+       return err;
 }
 
 /*
@@ -130,30+137,27 @@ int dev_mc_delete(struct device *dev, void *addr, int alen, int glbl)
  
 int dev_mc_add(struct device *dev, void *addr, int alen, int glbl)
 {
-       struct dev_mc_list *dmi;
+       int err = 0;
+       struct dev_mc_list *dmi, *dmi1;
+
+       dmi1 = (struct dev_mc_list *)kmalloc(sizeof(*dmi), gfp_any());
 
+       start_bh_atomic();
        for(dmi=dev->mc_list; dmi!=NULL; dmi=dmi->next) {
                if (memcmp(dmi->dmi_addr,addr,dmi->dmi_addrlen)==0 && dmi->dmi_addrlen==alen) {
                        if (glbl) {
                                int old_glbl = dmi->dmi_gusers;
                                dmi->dmi_gusers = 1;
                                if (old_glbl)
-                                       return 0;
+                                       goto done;
                        }
                        dmi->dmi_users++;
-                       return 0;
+                       goto done;
                }
        }
 
-       /* GFP_ATOMIC!! It is used by IPv6 from interrupt,
-          when new address arrives.
-
-          Particularly, it means that this part of code is weirdly
-          racy, and needs numerous *_bh_atomic --ANK
-        */
-       dmi=(struct dev_mc_list *)kmalloc(sizeof(*dmi), GFP_ATOMIC);
-       if (dmi==NULL)
-               return -ENOBUFS;
+       if ((dmi=dmi1)==NULL) {
+               end_bh_atomic();
+               return -ENOMEM;
+       }
        memcpy(dmi->dmi_addr, addr, alen);
        dmi->dmi_addrlen=alen;
        dmi->next=dev->mc_list;
@@ -161,8+165,15 @@ int dev_mc_add(struct device *dev, void *addr, int alen, int glbl)
        dmi->dmi_gusers=glbl ? 1 : 0;
        dev->mc_list=dmi;
        dev->mc_count++;
+       end_bh_atomic();
        dev_mc_upload(dev);
        return 0;
+
+done:
+       end_bh_atomic();
+       if (dmi1)
+               kfree(dmi1);
+       return err;
 }
 
 /*
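
The reworked dev_mc_add() above is the allocation half of the same idea: the
node is allocated with gfp_any() before start_bh_atomic(), so the atomic
section itself never sleeps, and the spare is freed on the paths where the
entry already existed. A condensed restatement of the pattern, with a
hypothetical struct bar:

	struct bar {
		struct bar *next;
		int key;
	};

	static int bar_add(struct bar **head, int key)
	{
		struct bar *b, *spare;

		spare = kmalloc(sizeof(*spare), gfp_any());	/* may fail */

		start_bh_atomic();
		for (b = *head; b; b = b->next) {
			if (b->key == key) {
				end_bh_atomic();
				if (spare)
					kfree(spare);	/* spare not needed */
				return 0;
			}
		}
		if ((b = spare) == NULL) {
			end_bh_atomic();
			return -ENOMEM;
		}
		b->key = key;
		b->next = *head;
		*head = b;
		end_bh_atomic();
		return 0;
	}
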
@@ -171,6+182,7 @@ int dev_mc_add(struct device *dev, void *addr, int alen, int glbl)
 
 void dev_mc_discard(struct device *dev)
 {
+       start_bh_atomic();
        while (dev->mc_list!=NULL) {
                struct dev_mc_list *tmp=dev->mc_list;
                dev->mc_list=tmp->next;
@@ -179,6+191,7 @@ void dev_mc_discard(struct device *dev)
                kfree_s(tmp,sizeof(*tmp));
        }
        dev->mc_count=0;
+       end_bh_atomic();
 }
 
 #ifdef CONFIG_PROC_FS
@@ -189,7+202,9 @@ static int dev_mc_read_proc(char *buffer, char **start, off_t offset,
        struct dev_mc_list *m;
        int len=0;
        struct device *dev;
-       
+
+       start_bh_atomic();
+
        for (dev = dev_base; dev; dev = dev->next) {
                for (m = dev->mc_list; m; m = m->next) {
                        int i;
@@ -214,10+229,13 @@ static int dev_mc_read_proc(char *buffer, char **start, off_t offset,
        *eof = 1;
 
 done:
+       end_bh_atomic();
        *start=buffer+(offset-begin);
        len-=(offset-begin);
        if(len>length)
                len=length;
+       if(len<0)
+               len=0;
        return len;
 }
 #endif
index 7636a1e..112bbca 100644 (file)
@@ -441,8+441,9 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 
        if ((err = sk_chk_filter(fp->insns, fp->len))==0) {
                struct sk_filter *old_fp = sk->filter;
+               net_serialize_enter();
                sk->filter = fp;
-               wmb();
+               net_serialize_leave();
                fp = old_fp;
        }
 
index 637322f..7b00cd9 100644 (file)
 #endif
 #include <net/neighbour.h>
 #include <net/dst.h>
+#include <net/sock.h>
 #include <linux/rtnetlink.h>
 
+/*
+   NOTE. The most unpleasant question is serialization of
+   accesses to resolved addresses. The problem is that addresses
+   are modified by bh, but they are referenced from normal
+   kernel threads. Until today no locking was done.
+   My reasoning was that a corrupted address token will be copied
+   to a packet with cosmologically small probability
+   (it is even difficult to estimate such a small number)
+   and it is very silly to waste cycles in the fast path to lock them.
+
+   But now I have changed my mind, though not because the previous
+   statement is wrong. Actually, neigh->ha MAY BE not an opaque byte
+   array, but a reference to some private data. In that case even a
+   negligible corruption probability becomes a bug.
+
+   - hh cache is protected by rwlock. It assumes that
+     hh cache update procedure is short and fast, and that
+     read_lock is cheaper than start_bh_atomic().
+   - ha tokens, saved in neighbour entries, are protected
+     by bh_atomic().
+   - no protection is made in /proc reading. It is OK, because
+     /proc is broken by design in any case, and
+     corrupted output is normal behaviour there.
+
+     --ANK (981025)
+ */
+
 #define NEIGH_DEBUG 1
 
 #define NEIGH_PRINTK(x...) printk(x)
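
The read side of the hh_lock rule described in the note above is not part of
this hunk. A sketch of what a fast-path reader looks like under these rules,
assuming the 2.2-era hh_cache layout with a 16-byte prebuilt link header in
hh_data:

	static inline void example_copy_hh(struct hh_cache *hh,
					   struct sk_buff *skb)
	{
		/* read_lock is the cheap side; the update path below
		   takes write_lock_irq() around header_cache_update. */
		read_lock_irq(&hh->hh_lock);
		memcpy(skb->data - 16, hh->hh_data, 16);
		read_unlock_irq(&hh->hh_lock);
	}
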
@@ -48,6+76,7 @@ static void neigh_timer_handler(unsigned long arg);
 #ifdef CONFIG_ARPD
 static void neigh_app_notify(struct neighbour *n);
 #endif
+static int pneigh_ifdown(struct neigh_table *tbl, struct device *dev);
 
 static int neigh_glbl_allocs;
 static struct neigh_table *neigh_tables;
@@ -83,8+112,20 @@ static int neigh_forced_gc(struct neigh_table *tbl)
 
                np = &tbl->hash_buckets[i];
                while ((n = *np) != NULL) {
+                       /* A neighbour record may be discarded if:
+                          - nobody refers to it.
+                          - it is not permanent.
+                          - (NEW and probably wrong)
+                            INCOMPLETE entries are kept at least for
+                            n->parms->retrans_time, otherwise we could
+                            flood the network with resolution requests.
+                            It is not clear which is worse: table
+                            overflow or flooding.
+                        */
                        if (atomic_read(&n->refcnt) == 0 &&
-                           !(n->nud_state&NUD_PERMANENT)) {
+                           !(n->nud_state&NUD_PERMANENT) &&
+                           (n->nud_state != NUD_INCOMPLETE ||
+                            jiffies - n->used > n->parms->retrans_time)) {
                                *np = n->next;
                                n->tbl = NULL;
                                tbl->entries--;
@@ -149,6+190,7 @@ int neigh_ifdown(struct neigh_table *tbl, struct device *dev)
 
        del_timer(&tbl->proxy_timer);
        skb_queue_purge(&tbl->proxy_queue);
+       pneigh_ifdown(tbl, dev);
        end_bh_atomic();
        return 0;
 }
@@ -295,7+337,9 @@ int pneigh_delete(struct neigh_table *tbl, const void *pkey, struct device *dev)
 
        for (np = &tbl->phash_buckets[hash_val]; (n=*np) != NULL; np = &n->next) {
                if (memcmp(n->key, pkey, key_len) == 0 && n->dev == dev) {
+                       net_serialize_enter();
                        *np = n->next;
+                       net_serialize_leave();
                        if (tbl->pdestructor)
                                tbl->pdestructor(n);
                        kfree(n);
@@ -305,6+349,30 @@ int pneigh_delete(struct neigh_table *tbl, const void *pkey, struct device *dev)
        return -ENOENT;
 }
 
+static int pneigh_ifdown(struct neigh_table *tbl, struct device *dev)
+{
+       struct pneigh_entry *n, **np;
+       u32 h;
+
+       for (h=0; h<=PNEIGH_HASHMASK; h++) {
+               np = &tbl->phash_buckets[h];
+               while ((n=*np) != NULL) {
+                       if (n->dev == dev || dev == NULL) {
+                               net_serialize_enter();
+                               *np = n->next;
+                               net_serialize_leave();
+                               if (tbl->pdestructor)
+                                       tbl->pdestructor(n);
+                               kfree(n);
+                               continue;
+                       }
+                       np = &n->next;
+               }
+       }
+       return -ENOENT;
+}
+
+
 /*
  *     neighbour must already be out of the table;
  *
@@ -516,11+584,11 @@ static void neigh_timer_handler(unsigned long arg)
                return;
        }
 
-       neigh->probes++;
        neigh->timer.expires = now + neigh->parms->retrans_time;
        add_timer(&neigh->timer);
 
        neigh->ops->solicit(neigh, skb_peek(&neigh->arp_queue));
+       neigh->probes++;
 }
 
 int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
@@ -542,6+610,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
                                add_timer(&neigh->timer);
 
                                neigh->ops->solicit(neigh, skb);
+                               neigh->probes++;
                        } else {
                                neigh->nud_state = NUD_FAILED;
                                if (skb)
@@ -581,8+650,11 @@ static __inline__ void neigh_update_hhs(struct neighbour *neigh)
                neigh->dev->header_cache_update;
 
        if (update) {
-               for (hh=neigh->hh; hh; hh=hh->hh_next)
+               for (hh=neigh->hh; hh; hh=hh->hh_next) {
+                       write_lock_irq(&hh->hh_lock);
                        update(hh, neigh->dev, neigh->ha);
+                       write_unlock_irq(&hh->hh_lock);
+               }
        }
 }
 
@@ -655,7+727,7 @@ int neigh_update(struct neighbour *neigh, u8 *lladdr, u8 new, int override, int
                del_timer(&neigh->timer);
        neigh->nud_state = new;
        if (lladdr != neigh->ha) {
-               memcpy(neigh->ha, lladdr, dev->addr_len);
+               memcpy(&neigh->ha, lladdr, dev->addr_len);
                neigh_update_hhs(neigh);
                neigh->confirmed = jiffies - (neigh->parms->base_reachable_time<<1);
 #ifdef CONFIG_ARPD
@@ -764,14+836,20 @@ int neigh_resolve_output(struct sk_buff *skb)
        __skb_pull(skb, skb->nh.raw - skb->data);
 
        if (neigh_event_send(neigh, skb) == 0) {
+               int err;
                struct device *dev = neigh->dev;
-               if (dev->hard_header_cache) {
+               if (dev->hard_header_cache && dst->hh == NULL) {
                        start_bh_atomic();
                        if (dst->hh == NULL)
                                neigh_hh_init(neigh, dst, dst->ops->protocol);
+                       err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len);
+                       end_bh_atomic();
+               } else {
+                       start_bh_atomic();
+                       err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len);
                        end_bh_atomic();
                }
-               if (dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len) >= 0)
+               if (err >= 0)
                        return neigh->ops->queue_xmit(skb);
                kfree_skb(skb);
                return -EINVAL;
@@ -788,13+866,17 @@ discard:
 
 int neigh_connected_output(struct sk_buff *skb)
 {
+       int err;
        struct dst_entry *dst = skb->dst;
        struct neighbour *neigh = dst->neighbour;
        struct device *dev = neigh->dev;
 
        __skb_pull(skb, skb->nh.raw - skb->data);
 
-       if (dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len) >= 0)
+       start_bh_atomic();
+       err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len);
+       end_bh_atomic();
+       if (err >= 0)
                return neigh->ops->queue_xmit(skb);
        kfree_skb(skb);
        return -EINVAL;
@@ -868,7+950,6 @@ struct neigh_parms *neigh_parms_alloc(struct device *dev, struct neigh_table *tb
                        }
                }
                p->next = tbl->parms.next;
-               /* ATOMIC_SET */
                tbl->parms.next = p;
        }
        return p;
@@ -882,8+963,9 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
                return;
        for (p = &tbl->parms.next; *p; p = &(*p)->next) {
                if (*p == parms) {
-                       /* ATOMIC_SET */
+                       net_serialize_enter();
                        *p = parms->next;
+                       net_serialize_leave();
 #ifdef CONFIG_SYSCTL
                        neigh_sysctl_unregister(parms);
 #endif
@@ -926,14+1008,15 @@ int neigh_table_clear(struct neigh_table *tbl)
        del_timer(&tbl->gc_timer);
        del_timer(&tbl->proxy_timer);
        skb_queue_purge(&tbl->proxy_queue);
-       if (tbl->entries)
-               neigh_ifdown(tbl, NULL);
+       neigh_ifdown(tbl, NULL);
        end_bh_atomic();
        if (tbl->entries)
                printk(KERN_CRIT "neighbour leakage\n");
        for (tp = &neigh_tables; *tp; tp = &(*tp)->next) {
                if (*tp == tbl) {
+                       net_serialize_enter();
                        *tp = tbl->next;
+                       net_serialize_leave();
                        break;
                }
        }
@@ -976,7+1059,7 @@ int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
                        return -EINVAL;
 
                start_bh_atomic();
-               n = neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev);
+               n = __neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 0);
                if (n) {
                        err = neigh_update(n, NULL, NUD_FAILED, 1, 0);
                        neigh_release(n);
@@ -1020,7+1103,7 @@ int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
                    nda[NDA_LLADDR-1]->rta_len != RTA_LENGTH(dev->addr_len))
                        return -EINVAL;
                start_bh_atomic();
-               n = neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev);
+               n = __neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 0);
                if (n) {
                        if (nlh->nlmsg_flags&NLM_F_EXCL)
                                err = -EEXIST;
@@ -1091,7+1174,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct
        for (h=0; h <= NEIGH_HASHMASK; h++) {
                if (h < s_h) continue;
                if (h > s_h)
-                       memset(&cb->args[2], 0, sizeof(cb->args) - 2*sizeof(int));
+                       memset(&cb->args[2], 0, sizeof(cb->args) - 2*sizeof(cb->args[0]));
                start_bh_atomic();
                for (n = tbl->hash_buckets[h], idx = 0; n;
                     n = n->next, idx++) {
@@ -1125,7+1208,7 @@ int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
                if (family && tbl->family != family)
                        continue;
                if (t > s_t)
-                       memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int));
+                       memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
                if (neigh_dump_table(tbl, skb, cb) < 0) 
                        break;
        }
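
Both memset fixes in this file matter on 64-bit targets, where the elements
of cb->args are longs: subtracting sizeof(int) leaves the length four bytes
too large, so the memset runs past the end of the array. A small userspace
illustration of the arithmetic (array size assumed):

	#include <stdio.h>

	int main(void)
	{
		long args[6];	/* stand-in for netlink_callback's args[] */

		/* Clearing from &args[1], the tail really holds 5 longs. */
		printf("bytes available: %zu\n", sizeof(args) - sizeof(args[0]));
		printf("old, buggy len : %zu\n", sizeof(args) - sizeof(int));
		return 0;
	}
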
index e1fe887..ed27c8e 100644 (file)
 
 #include <linux/inet.h>
 #include <linux/netdevice.h>
-#include <net/pkt_sched.h>
 #include <net/ip.h>
 #include <net/protocol.h>
 #include <net/arp.h>
 #include <net/tcp.h>
 #include <net/udp.h>
 #include <net/sock.h>
+#include <net/pkt_sched.h>
 
 atomic_t rtnl_rlockct;
 struct wait_queue *rtnl_wait;
index fd9c6f9..2cfcbad 100644 (file)
@@ -7,7+7,7 @@
  *             handler for protocols to use and generic option handler.
  *
  *
- * Version:    $Id: sock.c,v 1.76 1999/02/23 08:12:29 davem Exp $
+ * Version:    $Id: sock.c,v 1.77 1999/03/21 05:22:26 davem Exp $
  *
  * Authors:    Ross Biro, <bir7@leland.Stanford.Edu>
  *             Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -251,12+251,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
                        break;
 
                case SO_PRIORITY:
-                       if (val >= 0 && val <= 7) 
-                       {
-                               if(val==7 && !capable(CAP_NET_ADMIN))
-                                       return -EPERM;
+                       if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN)) 
                                sk->priority = val;
-                       }                       
+                       else
+                               return(-EPERM);
                        break;
 
                case SO_LINGER:
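
From userspace, the relaxed check above means priorities 0 through 6 can now
be set without privilege, while values outside that range still require
CAP_NET_ADMIN. A minimal illustration:

	#include <stdio.h>
	#include <sys/socket.h>

	/* Returns 0 on success. With the change above, 0..6 needs no
	   capability; other values fail with EPERM for ordinary users. */
	int set_prio(int fd, int prio)
	{
		if (setsockopt(fd, SOL_SOCKET, SO_PRIORITY,
			       &prio, sizeof(prio)) < 0) {
			perror("SO_PRIORITY");
			return -1;
		}
		return 0;
	}
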
@@ -348,8+346,9 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 
                                filter = sk->filter;
 
+                               net_serialize_enter();
                                sk->filter = NULL;
-                               wmb();
+                               net_serialize_leave();
                                
                                if (filter)
                                        sk_filter_release(sk, filter);
@@ -515,6+514,16 @@ void sk_free(struct sock *sk)
+#ifdef CONFIG_FILTER
+       if (sk->filter) {
+               sk_filter_release(sk, sk->filter);
+               sk->filter = NULL;
+       }
+#endif
+
        if (atomic_read(&sk->omem_alloc))
                printk(KERN_DEBUG "sk_free: optmem leakage (%d bytes) detected.\n", atomic_read(&sk->omem_alloc));
 
        kmem_cache_free(sk_cachep, sk);
 }
 
index 43334d5..29786da 100644 (file)
@@ -32,6+32,9 @@ if [ "$CONFIG_FIREWALL" = "y" ]; then
        fi
     fi
     bool 'IP: always defragment (required for masquerading)' CONFIG_IP_ALWAYS_DEFRAG
+    if [ "$CONFIG_IP_MULTIPLE_TABLES" = "y" ]; then
+       bool 'IP: use FWMARK value as routing key' CONFIG_IP_ROUTE_FWMARK
+    fi
   fi
 fi
 if [ "$CONFIG_IP_FIREWALL" = "y" ]; then
index 60705bc..434dd61 100644 (file)
@@ -5,7+5,7 @@
  *
  *             PF_INET protocol family socket handler.
  *
- * Version:    $Id: af_inet.c,v 1.84 1999/03/15 22:16:47 davem Exp $
+ * Version:    $Id: af_inet.c,v 1.85 1999/03/21 05:22:28 davem Exp $
  *
  * Authors:    Ross Biro, <bir7@leland.Stanford.Edu>
  *             Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -176,8+176,6 @@ static __inline__ void kill_sk_now(struct sock *sk)
        if(sk->opt)
                kfree(sk->opt);
        dst_release(sk->dst_cache);
-       if (atomic_read(&sk->omem_alloc))
-               printk(KERN_DEBUG "kill_sk_now: optmem leakage (%d bytes) detected.\n", atomic_read(&sk->omem_alloc));
        sk_free(sk);
 }
 
index 84ec7ba..2c311f2 100644 (file)
@@ -1,6+1,6 @@
 /* linux/net/inet/arp.c
  *
- * Version:    $Id: arp.c,v 1.76 1999/03/09 14:10:07 davem Exp $
+ * Version:    $Id: arp.c,v 1.77 1999/03/21 05:22:30 davem Exp $
  *
  * Copyright (C) 1994 by Florian  La Roche
  *
@@ -294,7+294,7 @@ static int arp_constructor(struct neighbour *neigh)
 
 static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
 {
-       icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
+       dst_link_failure(skb);
        kfree_skb(skb);
 }
 
@@ -401,8+401,12 @@ int arp_bind_neighbour(struct dst_entry *dst)
 
        if (dev == NULL)
                return 0;
-       if (dst->neighbour == NULL)
-               dst->neighbour = __neigh_lookup(&arp_tbl, &((struct rtable*)dst)->rt_gateway, dev, 1);
+       if (dst->neighbour == NULL) {
+               u32 nexthop = ((struct rtable*)dst)->rt_gateway;
+               if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT))
+                       nexthop = 0;
+               dst->neighbour = __neigh_lookup(&arp_tbl, &nexthop, dev, 1);
+       }
        return (dst->neighbour != NULL);
 }
 
index b1aa1a0..c02171f 100644 (file)
@@ -1,7+1,7 @@
 /*
  *     NET3    IP device support routines.
  *
- *     Version: $Id: devinet.c,v 1.25 1999/01/04 20:14:33 davem Exp $
+ *     Version: $Id: devinet.c,v 1.26 1999/03/21 05:22:31 davem Exp $
  *
  *             This program is free software; you can redistribute it and/or
  *             modify it under the terms of the GNU General Public License
@@ -138,7+138,9 @@ static void inetdev_destroy(struct in_device *in_dev)
 #ifdef CONFIG_SYSCTL
        devinet_sysctl_unregister(&in_dev->cnf);
 #endif
+       net_serialize_enter();
        in_dev->dev->ip_ptr = NULL;
+       net_serialize_leave();
        neigh_parms_release(&arp_tbl, in_dev->arp_parms);
        kfree(in_dev);
 }
@@ -172,7+174,10 @@ inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy)
                                ifap1 = &ifa->ifa_next;
                                continue;
                        }
+                       net_serialize_enter();
                        *ifap1 = ifa->ifa_next;
+                       net_serialize_leave();
+
                        rtmsg_ifa(RTM_DELADDR, ifa);
                        notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa);
                        inet_free_ifa(ifa);
@@ -181,8+186,9 @@ inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy)
 
        /* 2. Unlink it */
 
+       net_serialize_enter();
        *ifap = ifa1->ifa_next;
-
+       net_serialize_leave();
 
        /* 3. Announce address deletion */
 
@@ -238,8+244,9 @@ inet_insert_ifa(struct in_device *in_dev, struct in_ifaddr *ifa)
        }
 
        ifa->ifa_next = *ifap;
-       /* ATOMIC_SET */
+       net_serialize_enter();
        *ifap = ifa;
+       net_serialize_leave();
 
        /* Send message first, then call notifier.
           Notifier will trigger FIB update, so that
@@ -650,8+657,25 @@ u32 inet_select_addr(struct device *dev, u32 dst, int scope)
                if (!dst || inet_ifa_match(dst, ifa))
                        return addr;
        } endfor_ifa(in_dev);
+       
+       if (addr || scope >= RT_SCOPE_LINK)
+               return addr;
 
-       return addr;
+       /* Non-loopback addresses on the loopback device should be
+          preferred in this case. It is important that lo is the first
+          interface in the dev_base list.
+        */
+       for (dev=dev_base; dev; dev=dev->next) {
+               if ((in_dev=dev->ip_ptr) == NULL)
+                       continue;
+
+               for_primary_ifa(in_dev) {
+                       if (ifa->ifa_scope <= scope)
+                               return ifa->ifa_local;
+               } endfor_ifa(in_dev);
+       }
+
+       return 0;
 }
 
 /*
index a3585cc..a174704 100644 (file)
@@ -5,7+5,7 @@
  *
  *             IPv4 Forwarding Information Base: FIB frontend.
  *
- * Version:    $Id: fib_frontend.c,v 1.14 1999/01/04 20:13:55 davem Exp $
+ * Version:    $Id: fib_frontend.c,v 1.15 1999/03/21 05:22:31 davem Exp $
  *
  * Authors:    Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  *
@@ -189,7+189,7 @@ unsigned inet_addr_type(u32 addr)
  */
 
 int fib_validate_source(u32 src, u32 dst, u8 tos, int oif,
-                       struct device *dev, u32 *spec_dst)
+                       struct device *dev, u32 *spec_dst, u32 *itag)
 {
        struct in_device *in_dev = dev->ip_ptr;
        struct rt_key key;
@@ -209,6+209,8 @@ int fib_validate_source(u32 src, u32 dst, u8 tos, int oif,
        if (res.type != RTN_UNICAST)
                return -EINVAL;
        *spec_dst = FIB_RES_PREFSRC(res);
+       if (itag)
+               fib_combine_itag(itag, &res);
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1)
 #else
@@ -231,6+233,7 @@ last_resort:
        if (IN_DEV_RPFILTER(in_dev))
                return -EINVAL;
        *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+       *itag = 0;
        return 0;
 }
 
@@ -354,7+357,7 @@ int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
        for (t=s_t; t<=RT_TABLE_MAX; t++) {
                if (t < s_t) continue;
                if (t > s_t)
-                       memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int));
+                       memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
                if ((tb = fib_get_table(t))==NULL)
                        continue;
                if (tb->tb_dump(tb, skb, cb) < 0) 
index 5232c61..ba8dc6d 100644 (file)
@@ -5,7+5,7 @@
  *
  *             IPv4 FIB: lookup engine and maintenance routines.
  *
- * Version:    $Id: fib_hash.c,v 1.6 1998/10/03 09:37:06 davem Exp $
+ * Version:    $Id: fib_hash.c,v 1.7 1999/03/21 05:22:32 davem Exp $
  *
  * Authors:    Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  *
@@ -302,6+302,90 @@ fn_hash_lookup(struct fib_table *tb, const struct rt_key *key, struct fib_result
        return 1;
 }
 
+static int fn_hash_last_dflt=-1;
+
+static int fib_detect_death(struct fib_info *fi, int order,
+                           struct fib_info **last_resort, int *last_idx)
+{
+       struct neighbour *n;
+       int state = NUD_NONE;
+
+       n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
+       if (n) {
+               state = n->nud_state;
+               neigh_release(n);
+       }
+       if (state==NUD_REACHABLE)
+               return 0;
+       if ((state&NUD_VALID) && order != fn_hash_last_dflt)
+               return 0;
+       if ((state&NUD_VALID) ||
+           (*last_idx<0 && order > fn_hash_last_dflt)) {
+               *last_resort = fi;
+               *last_idx = order;
+       }
+       return 1;
+}
+
+static void
+fn_hash_select_default(struct fib_table *tb, const struct rt_key *key, struct fib_result *res)
+{
+       int order, last_idx;
+       struct fib_node *f;
+       struct fib_info *fi = NULL;
+       struct fib_info *last_resort;
+       struct fn_hash *t = (struct fn_hash*)tb->tb_data;
+       struct fn_zone *fz = t->fn_zones[0];
+
+       if (fz == NULL)
+               return;
+
+       last_idx = -1;
+       last_resort = NULL;
+       order = -1;
+
+       for (f = fz->fz_hash[0]; f; f = f->fn_next) {
+               struct fib_info *next_fi = FIB_INFO(f);
+
+               if ((f->fn_state&FN_S_ZOMBIE) ||
+                   f->fn_scope != res->scope ||
+                   f->fn_type != RTN_UNICAST)
+                       continue;
+
+               if (next_fi->fib_priority > res->fi->fib_priority)
+                       break;
+               if (!next_fi->fib_nh[0].nh_gw || next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
+                       continue;
+               f->fn_state |= FN_S_ACCESSED;
+
+               if (fi == NULL) {
+                       if (next_fi != res->fi)
+                               break;
+               } else if (!fib_detect_death(fi, order, &last_resort, &last_idx)) {
+                       res->fi = fi;
+                       fn_hash_last_dflt = order;
+                       return;
+               }
+               fi = next_fi;
+               order++;
+       }
+
+       if (order<=0 || fi==NULL) {
+               fn_hash_last_dflt = -1;
+               return;
+       }
+
+       if (!fib_detect_death(fi, order, &last_resort, &last_idx)) {
+               res->fi = fi;
+               fn_hash_last_dflt = order;
+               return;
+       }
+
+       if (last_idx >= 0)
+               res->fi = last_resort;
+       fn_hash_last_dflt = last_idx;
+}
+
 #define FIB_SCAN(f, fp) \
 for ( ; ((f) = *(fp)) != NULL; (fp) = &(f)->fn_next)
 
@@ -476,14+560,16 @@ replace:
         */
 
        new_f->fn_next = f;
-       /* ATOMIC_SET */
        *fp = new_f;
        fz->fz_nent++;
 
        if (del_fp) {
                f = *del_fp;
                /* Unlink replaced node */
+               net_serialize_enter();
                *del_fp = f->fn_next;
+               net_serialize_leave();
+
                if (!(f->fn_state&FN_S_ZOMBIE))
                        rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req);
                if (f->fn_state&FN_S_ACCESSED)
@@ -570,7+656,10 @@ FTprint("tb(%d)_delete: %d %08x/%d %d\n", tb->tb_id, r->rtm_type, rta->rta_dst ?
                rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req);
 
                if (matched != 1) {
+                       net_serialize_enter();
                        *del_fp = f->fn_next;
+                       net_serialize_leave();
+
                        if (f->fn_state&FN_S_ACCESSED)
                                rt_cache_flush(-1);
                        fn_free_node(f);
@@ -600,7+689,10 @@ fn_flush_list(struct fib_node ** fp, int z, struct fn_hash *table)
                struct fib_info *fi = FIB_INFO(f);
 
                if (fi && ((f->fn_state&FN_S_ZOMBIE) || (fi->fib_flags&RTNH_F_DEAD))) {
+                       net_serialize_enter();
                        *fp = f->fn_next;
+                       net_serialize_leave();
+
                        fn_free_node(f);
                        found++;
                        continue;
@@ -710,7+802,7 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
        for (h=0; h < fz->fz_divisor; h++) {
                if (h < s_h) continue;
                if (h > s_h)
-                       memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(int));
+                       memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(cb->args[0]));
                if (fz->fz_hash == NULL || fz->fz_hash[h] == NULL)
                        continue;
                if (fn_hash_dump_bucket(skb, cb, tb, fz, fz->fz_hash[h]) < 0) {
@@ -732,7+824,7 @@ static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin
        for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) {
                if (m < s_m) continue;
                if (m > s_m)
-                       memset(&cb->args[2], 0, sizeof(cb->args) - 2*sizeof(int));
+                       memset(&cb->args[2], 0, sizeof(cb->args) - 2*sizeof(cb->args[0]));
                if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
                        cb->args[1] = m;
                        return -1;
@@ -784,6+876,7 @@ __initfunc(struct fib_table * fib_hash_init(int id))
        tb->tb_insert = fn_hash_insert;
        tb->tb_delete = fn_hash_delete;
        tb->tb_flush = fn_hash_flush;
+       tb->tb_select_default = fn_hash_select_default;
 #ifdef CONFIG_RTNETLINK
        tb->tb_dump = fn_hash_dump;
 #endif
index 70fa5d8..815c6d3 100644 (file)
@@ -5,7+5,7 @@
  *
  *             IPv4 Forwarding Information Base: policy rules.
  *
- * Version:    $Id: fib_rules.c,v 1.7 1998/10/03 09:37:09 davem Exp $
+ * Version:    $Id: fib_rules.c,v 1.8 1999/03/21 05:22:33 davem Exp $
  *
  * Authors:    Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  *
  *
  * Fixes:
  *             Rani Assaf      :       local_rule cannot be deleted
+ *             Marc Boucher    :       routing by fwmark
  */
 
 #include <linux/config.h>
@@ -63,6+64,9 @@ struct fib_rule
        u32             r_srcmap;
        u8              r_flags;
        u8              r_tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+       u32             r_fwmark;
+#endif
        int             r_ifindex;
 #ifdef CONFIG_NET_CLS_ROUTE
        __u32           r_tclassid;
@@ -88,13+92,18 @@ int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
                    rtm->rtm_dst_len == r->r_dst_len &&
                    (!rta[RTA_DST-1] || memcmp(RTA_DATA(rta[RTA_DST-1]), &r->r_dst, 4) == 0) &&
                    rtm->rtm_tos == r->r_tos &&
+#ifdef CONFIG_IP_ROUTE_FWMARK
+                   (!rta[RTA_PROTOINFO-1] || memcmp(RTA_DATA(rta[RTA_PROTOINFO-1]), &r->r_fwmark, 4) == 0) &&
+#endif
                    (!rtm->rtm_type || rtm->rtm_type == r->r_action) &&
                    (!rta[RTA_PRIORITY-1] || memcmp(RTA_DATA(rta[RTA_PRIORITY-1]), &r->r_preference, 4) == 0) &&
                    (!rta[RTA_IIF-1] || strcmp(RTA_DATA(rta[RTA_IIF-1]), r->r_ifname) == 0) &&
                    (!rtm->rtm_table || (r && rtm->rtm_table == r->r_table))) {
                        if (r == &local_rule)
                                return -EPERM;
+                       net_serialize_enter();
                        *rp = r->r_next;
+                       net_serialize_leave();
                        if (r != &default_rule && r != &main_rule)
                                kfree(r);
                        return 0;
@@ -155,6+164,10 @@ int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
        new_r->r_srcmask = inet_make_mask(rtm->rtm_src_len);
        new_r->r_dstmask = inet_make_mask(rtm->rtm_dst_len);
        new_r->r_tos = rtm->rtm_tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+       if (rta[RTA_PROTOINFO-1])
+               memcpy(&new_r->r_fwmark, RTA_DATA(rta[RTA_PROTOINFO-1]), 4);
+#endif
        new_r->r_action = rtm->rtm_type;
        new_r->r_flags = rtm->rtm_flags;
        if (rta[RTA_PRIORITY-1])
@@ -267,14+280,15 @@ FRprintk("Lookup: %08x <- %08x ", key->dst, key->src);
 #ifdef CONFIG_IP_ROUTE_TOS
                    (r->r_tos && r->r_tos != key->tos) ||
 #endif
+#ifdef CONFIG_IP_ROUTE_FWMARK
+                   (r->r_fwmark && r->r_fwmark != key->fwmark) ||
+#endif
                    (r->r_ifindex && r->r_ifindex != key->iif))
                        continue;
 
 FRprintk("tb %d r %d ", r->r_table, r->r_action);
                switch (r->r_action) {
                case RTN_UNICAST:
-                       policy = NULL;
-                       break;
                case RTN_NAT:
                        policy = r;
                        break;
@@ -295,14+309,23 @@ FRprintk("ok\n");
                        res->r = policy;
                        return 0;
                }
-               if (err < 0)
+               if (err < 0 && err != -EAGAIN)
                        return err;
-FRprintk("RCONT ");
        }
 FRprintk("FAILURE\n");
        return -ENETUNREACH;
 }
 
+void fib_select_default(const struct rt_key *key, struct fib_result *res)
+{
+       if (res->r && res->r->r_action == RTN_UNICAST &&
+           FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) {
+               struct fib_table *tb;
+               if ((tb = fib_get_table(res->r->r_table)) != NULL)
+                       tb->tb_select_default(tb, key, res);
+       }
+}
+
 static int fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr)
 {
        struct device *dev = ptr;
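
fib_select_default() is called from net/ipv4/route.c, which is outside this
section. A plausible call site, consistent with how later 2.x kernels wire
it up (an assumption, not code from this patch): only a pure default route
(prefix length 0, unicast, no forced output interface) gives the table a
chance to rotate to a live gateway:

	if (res.prefixlen == 0 && res.type == RTN_UNICAST && key.oif == 0)
		fib_select_default(&key, &res);
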
@@ -337,6+360,10 @@ extern __inline__ int inet_fill_rule(struct sk_buff *skb,
        rtm->rtm_dst_len = r->r_dst_len;
        rtm->rtm_src_len = r->r_src_len;
        rtm->rtm_tos = r->r_tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+       if (r->r_fwmark)
+               RTA_PUT(skb, RTA_PROTOINFO, 4, &r->r_fwmark);
+#endif
        rtm->rtm_table = r->r_table;
        rtm->rtm_protocol = 0;
        rtm->rtm_scope = 0;
index bd35f7b..b78f7eb 100644 (file)
@@ -5,7+5,7 @@
  *
  *             IPv4 Forwarding Information Base: semantics.
  *
- * Version:    $Id: fib_semantics.c,v 1.12 1999/01/26 05:33:44 davem Exp $
+ * Version:    $Id: fib_semantics.c,v 1.13 1999/03/21 05:22:34 davem Exp $
  *
  * Authors:    Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  *
@@ -89,7+89,7 @@ static struct
        { -EINVAL, RT_SCOPE_UNIVERSE},  /* RTN_BLACKHOLE */
        { -EHOSTUNREACH, RT_SCOPE_UNIVERSE},/* RTN_UNREACHABLE */
        { -EACCES, RT_SCOPE_UNIVERSE},  /* RTN_PROHIBIT */
-       { 1, RT_SCOPE_UNIVERSE},        /* RTN_THROW */
+       { -EAGAIN, RT_SCOPE_UNIVERSE},  /* RTN_THROW */
 #ifdef CONFIG_IP_ROUTE_NAT
        { 0, RT_SCOPE_HOST},            /* RTN_NAT */
 #else
@@ -420,7+420,7 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
                        unsigned flavor = attr->rta_type;
                        if (flavor) {
                                if (flavor > FIB_MAX_METRICS)
-                                       goto failure;
+                                       goto err_inval;
                                fi->fib_metrics[flavor-1] = *(unsigned*)RTA_DATA(attr);
                        }
                        attr = RTA_NEXT(attr, attrlen);
index 7b78afb..199550f 100644 (file)
@@ -3,7+3,7 @@
  *     
  *             Alan Cox, <alan@cymru.net>
  *
- *     Version: $Id: icmp.c,v 1.50 1999/03/17 01:53:55 davem Exp $
+ *     Version: $Id: icmp.c,v 1.52 1999/03/21 12:04:11 davem Exp $
  *
  *     This program is free software; you can redistribute it and/or
  *     modify it under the terms of the GNU General Public License
@@ -373,6+373,12 @@ struct socket *icmp_socket=&icmp_inode.u.socket_i;
  *     works for icmp destinations. This means the rate limiting information
  *     for one "ip object" is shared.
  *
+ *     Note that the same dst_entry fields are modified by functions in 
+ *     route.c too, but these work for packet destinations while xrlim_allow
+ *     works for icmp destinations. This means the rate limiting information
+ *     for one "ip object" is shared - and these ICMPs are twice limited:
+ *     by source and by destination.
+ *
  *     RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
  *                       SHOULD allow setting of rate limits 
  *
@@ -385,10+391,10 @@ int xrlim_allow(struct dst_entry *dst, int timeout)
 
        now = jiffies;
        dst->rate_tokens += now - dst->rate_last;
+       dst->rate_last = now;
        if (dst->rate_tokens > XRLIM_BURST_FACTOR*timeout)
                dst->rate_tokens = XRLIM_BURST_FACTOR*timeout;
        if (dst->rate_tokens >= timeout) {
-               dst->rate_last = now;
                dst->rate_tokens -= timeout;
                return 1;
        }
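
xrlim_allow() is a token bucket: elapsed jiffies become tokens, capped at
XRLIM_BURST_FACTOR*timeout, and each allowed ICMP spends one timeout's
worth. Updating rate_last unconditionally is the substance of the fix:
before, the same idle interval was added to the bucket again on every
refused attempt. A userspace restatement, illustrative only, with HZ and
the burst factor assumed:

	#include <stdio.h>

	#define HZ 100			/* assumed */
	#define XRLIM_BURST_FACTOR 6	/* as in icmp.c */

	static long rate_tokens, rate_last;

	static int xrlim_allow_sim(long now, int timeout)
	{
		rate_tokens += now - rate_last;
		rate_last = now;
		if (rate_tokens > XRLIM_BURST_FACTOR * timeout)
			rate_tokens = XRLIM_BURST_FACTOR * timeout;
		if (rate_tokens >= timeout) {
			rate_tokens -= timeout;
			return 1;
		}
		return 0;
	}

	int main(void)
	{
		long t;

		/* After a long idle period the bucket is full: a burst
		   of six is allowed, then further requests are dropped. */
		for (t = 10 * HZ; t <= 10 * HZ + 8; t++)
			printf("jiffy %ld: %s\n", t,
			       xrlim_allow_sim(t, HZ) ? "allow" : "drop");
		return 0;
	}
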
@@ -406,6+412,10 @@ static inline int icmpv4_xrlim_allow(struct rtable *rt, int type, int code)
        if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
                return 1;
 
+       /* Redirect has its own rate limit mechanism */
+       if (type == ICMP_REDIRECT)
+               return 1;
+
        /* No rate limit on loopback */
        if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
                return 1;
@@ -526,8+536,13 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info)
        /*
         *      Now check at the protocol level
         */
-       if (!rt)
+       if (!rt) {
+#ifndef CONFIG_IP_ALWAYS_DEFRAG
+               if (net_ratelimit())
+                       printk(KERN_DEBUG "icmp_send: destinationless packet\n");
+#endif
                return;
+       }
        if (rt->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST))
                return;
         
index b0e7b6d..3ae0936 100644 (file)
@@ -8,7+8,7 @@
  *     the older version didn't come out right using gcc 2.5.8, the newer one
  *     seems to fall out with gcc 2.6.2.
  *
- *     Version: $Id: igmp.c,v 1.28 1998/11/30 15:53:13 davem Exp $
+ *     Version: $Id: igmp.c,v 1.29 1999/03/21 05:22:36 davem Exp $
  *
  *     Authors:
  *             Alan Cox <Alan.Cox@linux.org>
 #include <linux/mroute.h>
 #endif
 
+#define IP_MAX_MEMBERSHIPS 20
 
 #ifdef CONFIG_IP_MULTICAST
 
@@ -462,7+463,9 @@ int ip_mc_dec_group(struct in_device *in_dev, u32 addr)
        for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) {
                if (i->multiaddr==addr) {
                        if (--i->users == 0) {
+                               net_serialize_enter();
                                *ip = i->next;
+                               net_serialize_leave();
                                igmp_group_dropped(i);
                                if (in_dev->dev->flags & IFF_UP)
                                        ip_rt_multicast_event(in_dev);
@@ -610,7+613,9 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
                        struct in_device *in_dev;
                        if (--iml->count)
                                return 0;
+                       net_serialize_enter();
                        *imlp = iml->next;
+                       net_serialize_leave();
                        in_dev = inetdev_by_index(iml->multi.imr_ifindex);
                        if (in_dev)
                                ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr);
@@ -684,6+689,8 @@ done:
        len-=(offset-begin);
        if(len>length)
                len=length;
+       if(len<0)
+               len=0;
        return len;
 }
 #endif
index b617bc3..08ebbc2 100644 (file)
@@ -5,7+5,7 @@
  *
  *             The IP forwarding functionality.
  *             
- * Version:    $Id: ip_forward.c,v 1.42 1998/10/03 09:37:19 davem Exp $
+ * Version:    $Id: ip_forward.c,v 1.43 1999/03/21 05:22:37 davem Exp $
  *
  * Authors:    see ip.c
  *
@@ -260,7+260,7 @@ skip_call_fw_firewall:
                if (rt->rt_flags&RTCF_FAST && !netdev_fastroute_obstacles) {
                        unsigned h = ((*(u8*)&rt->key.dst)^(*(u8*)&rt->key.src))&NETDEV_FASTROUTE_HMASK;
                        /* Time to switch to functional programming :-) */
-                       dst_release(xchg(&skb->dev->fastpath[h], dst_clone(&rt->u.dst)));
+                       dst_release_irqwait(xchg(&skb->dev->fastpath[h], dst_clone(&rt->u.dst)));
                }
 #endif
                ip_send(skb);
index 2aeca9d..f066e60 100644 (file)
@@ -5,7+5,7 @@
  *
  *             The IP fragmentation functionality.
  *             
- * Version:    $Id: ip_fragment.c,v 1.39 1998/08/26 10:35:26 davem Exp $
+ * Version:    $Id: ip_fragment.c,v 1.40 1999/03/20 23:58:34 davem Exp $
  *
  * Authors:    Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
  *             Alan Cox <Alan.Cox@linux.org>
index 6488e9d..7c7dc2a 100644 (file)
@@ -189,6+189,48 @@ static struct ip_tunnel * ipgre_tunnel_lookup(u32 remote, u32 local, u32 key)
        return NULL;
 }
 
+static struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t)
+{
+       u32 remote = t->parms.iph.daddr;
+       u32 local = t->parms.iph.saddr;
+       u32 key = t->parms.i_key;
+       unsigned h = HASH(key);
+       int prio = 0;
+
+       if (local)
+               prio |= 1;
+       if (remote && !MULTICAST(remote)) {
+               prio |= 2;
+               h ^= HASH(remote);
+       }
+
+       return &tunnels[prio][h];
+}
+
+static void ipgre_tunnel_link(struct ip_tunnel *t)
+{
+       struct ip_tunnel **tp = ipgre_bucket(t);
+
+       net_serialize_enter();
+       t->next = *tp;
+       *tp = t;
+       net_serialize_leave();
+}
+
+static void ipgre_tunnel_unlink(struct ip_tunnel *t)
+{
+       struct ip_tunnel **tp;
+
+       for (tp = ipgre_bucket(t); *tp; tp = &(*tp)->next) {
+               if (t == *tp) {
+                       net_serialize_enter();
+                       *tp = t->next;
+                       net_serialize_leave();
+                       break;
+               }
+       }
+}
+
 static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int create)
 {
        u32 remote = parms->iph.daddr;
@@ -241,10+283,7 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int
        if (register_netdevice(dev) < 0)
                goto failed;
 
-       start_bh_atomic();
-       nt->next = t;
-       *tp = nt;
-       end_bh_atomic();
+       ipgre_tunnel_link(nt);
        /* Do not decrement MOD_USE_COUNT here. */
        return nt;
 
@@ -256,28+295,11 @@ failed:
 
 static void ipgre_tunnel_destroy(struct device *dev)
 {
-       struct ip_tunnel *t, **tp;
-       struct ip_tunnel *t0 = (struct ip_tunnel*)dev->priv;
-       u32 remote = t0->parms.iph.daddr;
-       u32 local = t0->parms.iph.saddr;
-       unsigned h = HASH(t0->parms.i_key);
-       int prio = 0;
+       ipgre_tunnel_unlink((struct ip_tunnel*)dev->priv);
 
-       if (local)
-               prio |= 1;
-       if (remote && !MULTICAST(remote)) {
-               prio |= 2;
-               h ^= HASH(remote);
-       }
-       for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
-               if (t == t0) {
-                       *tp = t->next;
-                       if (dev != &ipgre_fb_tunnel_dev) {
-                               kfree(dev);
-                               MOD_DEC_USE_COUNT;
-                       }
-                       break;
-               }
+       if (dev != &ipgre_fb_tunnel_dev) {
+               kfree(dev);
+               MOD_DEC_USE_COUNT;
        }
 }
 
@@ -849,6+871,41 @@ ipgre_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd)
 
                t = ipgre_tunnel_locate(&p, cmd == SIOCADDTUNNEL);
 
+               if (dev != &ipgre_fb_tunnel_dev && cmd == SIOCCHGTUNNEL &&
+                   t != &ipgre_fb_tunnel) {
+                       if (t != NULL) {
+                               if (t->dev != dev) {
+                                       err = -EEXIST;
+                                       break;
+                               }
+                       } else {
+                               unsigned nflags=0;
+
+                               t = (struct ip_tunnel*)dev->priv;
+
+                               if (MULTICAST(p.iph.daddr))
+                                       nflags = IFF_BROADCAST;
+                               else if (p.iph.daddr)
+                                       nflags = IFF_POINTOPOINT;
+
+                               if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
+                                       err = -EINVAL;
+                                       break;
+                               }
+                               start_bh_atomic();
+                               ipgre_tunnel_unlink(t);
+                               t->parms.iph.saddr = p.iph.saddr;
+                               t->parms.iph.daddr = p.iph.daddr;
+                               t->parms.i_key = p.i_key;
+                               t->parms.o_key = p.o_key;
+                               memcpy(dev->dev_addr, &p.iph.saddr, 4);
+                               memcpy(dev->broadcast, &p.iph.daddr, 4);
+                               ipgre_tunnel_link(t);
+                               end_bh_atomic();
+                               netdev_state_change(dev);
+                       }
+               }
+
                if (t) {
                        err = 0;
                        if (cmd == SIOCCHGTUNNEL) {
index fbbfbbf..e7a864f 100644 (file)
@@ -5,7+5,7 @@
  *
  *             The Internet Protocol (IP) module.
  *
- * Version:    $Id: ip_input.c,v 1.35 1999/01/12 14:32:48 davem Exp $
+ * Version:    $Id: ip_input.c,v 1.36 1999/03/21 05:22:38 davem Exp $
  *
  * Authors:    Ross Biro, <bir7@leland.Stanford.Edu>
  *             Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -387,6+387,10 @@ int ip_local_deliver(struct sk_buff *skb)
 int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
 {
        struct iphdr *iph = skb->nh.iph;
+#ifdef CONFIG_FIREWALL
+       int fwres;
+       u16 rport;
+#endif /* CONFIG_FIREWALL */
 
        /*
         *      When the interface is in promisc. mode, drop all the crap
@@ -427,6+431,30 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
        __skb_trim(skb, len);
        }
        
+#ifdef CONFIG_IP_ALWAYS_DEFRAG
+       /* Won't send ICMP reply, since skb->dst == NULL. --RR */
+       if (iph->frag_off & htons(IP_MF|IP_OFFSET)) {
+               skb = ip_defrag(skb);
+               if (!skb)
+                       return 0;
+               iph = skb->nh.iph;
+               ip_send_check(iph);
+       }
+#endif
+
+#ifdef CONFIG_FIREWALL
+       /*
+        *      See if the firewall wants to dispose of the packet. 
+        *
+        * We can't do ICMP reply or local delivery before routing,
+        * so we delay those decisions until after route. --RR
+        */
+       fwres = call_in_firewall(PF_INET, dev, iph, &rport, &skb);
+       if (fwres < FW_ACCEPT && fwres != FW_REJECT)
+               goto drop;
+       iph = skb->nh.iph;
+#endif /* CONFIG_FIREWALL */
+
        /*
         *      Initialise the virtual path cache for the packet. It describes
         *      how the packet travels inside Linux networking.
@@ -442,13+470,13 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
 #endif
        }
 
-#ifdef CONFIG_IP_ALWAYS_DEFRAG
-       if (iph->frag_off & htons(IP_MF|IP_OFFSET)) {
-               skb = ip_defrag(skb);
-               if (!skb)
-                       return 0;
-               iph = skb->nh.iph;
-               ip_send_check(iph);
+#ifdef CONFIG_NET_CLS_ROUTE
+       if (skb->dst->tclassid) {
+               u32 idx = skb->dst->tclassid;
+               ip_rt_acct[idx&0xFF].o_packets++;
+               ip_rt_acct[idx&0xFF].o_bytes+=skb->len;
+               ip_rt_acct[(idx>>16)&0xFF].i_packets++;
+               ip_rt_acct[(idx>>16)&0xFF].i_bytes+=skb->len;
        }
 #endif
 
@@ -462,7+490,7 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
                   and running sniffer is extremely rare condition.
                                                      --ANK (980813)
                */
-                  
+
                skb = skb_cow(skb, skb_headroom(skb));
                if (skb == NULL)
                        return 0;
@@ -486,51+514,17 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
                }
        }
 
-       /*
-        *      See if the firewall wants to dispose of the packet. 
-        *
-        *      Note: the current standard firewall code expects that the 
-        *      destination address was already checked against the interface 
-        *      address lists.
-        *
-        *      If this code is ever moved in front of ip_route_input() you need
-        *      to fix the fw code [moving it might be a good idea anyways,
-        *      so that we can firewall against potentially bugs in the options
-        *      or routing code]
-        */
-       
-#ifdef CONFIG_FIREWALL
-        {
-               int fwres;
-               u16 rport;
-#ifdef  CONFIG_IP_ROUTE_TOS
-               u8  tos = iph->tos;
-#endif
-
-               if ((fwres=call_in_firewall(PF_INET, skb->dev, iph, &rport, &skb))<FW_ACCEPT) {
-                       if (fwres==FW_REJECT)
-                               icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
-                       goto drop;
-               }
-
+#ifdef CONFIG_FIREWALL
 #ifdef CONFIG_IP_TRANSPARENT_PROXY
-               if (fwres==FW_REDIRECT && (IPCB(skb)->redirport = rport) != 0)
-                       return ip_local_deliver(skb);
-#endif
-#ifdef CONFIG_IP_ROUTE_TOS
-               /* It is for 2.2 only. Firewalling should make smart
-                  rerouting itself, ideally, but now it is too late
-                  to teach it.                         --ANK (980905)
-                */
-               if (iph->tos != tos && ((struct rtable*)skb->dst)->rt_type == RTN_UNICAST) {
-                       dst_release(skb->dst);
-                       skb->dst = NULL;
-                       if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))
-                               goto drop; 
-               }
-#endif
+       if (fwres == FW_REDIRECT && (IPCB(skb)->redirport = rport) != 0)
+               return ip_local_deliver(skb);
+#endif /* CONFIG_IP_TRANSPARENT_PROXY */
+
+       if (fwres == FW_REJECT) {
+               icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+               goto drop;
        }
-#endif
+#endif /* CONFIG_FIREWALL */
 
        return skb->dst->input(skb);
 
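The CONFIG_NET_CLS_ROUTE hunk above charges each delivered packet to two realms packed into the route's 32-bit tclassid: the low half indexes the counters charged as output (destination realm), the high half those charged as input (source realm). A standalone sketch of that bookkeeping, assuming a flat 256-entry table indexed the same way as ip_rt_acct; names are illustrative, not the kernel's:

#include <stdint.h>
#include <stdio.h>

struct rt_acct {
        uint32_t o_bytes, o_packets;    /* charged via the low half  */
        uint32_t i_bytes, i_packets;    /* charged via the high half */
};

static struct rt_acct acct[256];

/* Mirror of the accounting in ip_rcv(): one packet, two realms. */
static void rt_acct_charge(uint32_t tclassid, uint32_t len)
{
        acct[tclassid & 0xFF].o_packets++;
        acct[tclassid & 0xFF].o_bytes += len;
        acct[(tclassid >> 16) & 0xFF].i_packets++;
        acct[(tclassid >> 16) & 0xFF].i_bytes += len;
}

int main(void)
{
        rt_acct_charge(0x00020001, 1500);       /* high half 2, low half 1 */
        printf("realm 1: %u out, realm 2: %u in\n",
               (unsigned)acct[1].o_packets, (unsigned)acct[2].i_packets);
        return 0;
}
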
index 9f9966b..5a1c6d7 100644 (file)
@@ -5,7+5,7 @@
  *
  *             Dumb Network Address Translation.
  *
- * Version:    $Id: ip_nat_dumb.c,v 1.7 1998/10/06 04:49:09 davem Exp $
+ * Version:    $Id: ip_nat_dumb.c,v 1.8 1999/03/21 05:22:40 davem Exp $
  *
  * Authors:    Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  *
@@ -89,6+89,8 @@ ip_do_nat(struct sk_buff *skb)
                {
                        struct icmphdr *icmph = (struct icmphdr*)((char*)iph + (iph->ihl<<2));
                        struct   iphdr *ciph;
+                       u32 idaddr, isaddr;
+                       int updated;
 
                        if ((icmph->type != ICMP_DEST_UNREACH) &&
                            (icmph->type != ICMP_TIME_EXCEEDED) &&
@@ -100,8+102,14 @@ ip_do_nat(struct sk_buff *skb)
                        if ((u8*)(ciph+1) > skb->tail)
                                goto truncated;
 
-                       if (rt->rt_flags&RTCF_DNAT && ciph->saddr == odaddr)
+                       isaddr = ciph->saddr;
+                       idaddr = ciph->daddr;
+                       updated = 0;
+
+                       if (rt->rt_flags&RTCF_DNAT && ciph->saddr == odaddr) {
                                ciph->saddr = iph->daddr;
+                               updated = 1;
+                       }
                        if (rt->rt_flags&RTCF_SNAT) {
                                if (ciph->daddr != osaddr) {
                                        struct   fib_result res;
@@ -115,16+123,27 @@ ip_do_nat(struct sk_buff *skb)
 #ifdef CONFIG_IP_ROUTE_TOS
                                        key.tos = RT_TOS(ciph->tos);
 #endif
+#ifdef CONFIG_IP_ROUTE_FWMARK
+                                       key.fwmark = 0;
+#endif
                                        /* Use fib_lookup() until we get our own
                                         * hash table of NATed hosts -- Rani
                                         */
-                                       if (fib_lookup(&key, &res) != 0)
-                                               return 0;
-                                       if (res.r)
+                                       if (fib_lookup(&key, &res) == 0 && res.r) {
                                                ciph->daddr = fib_rules_policy(ciph->daddr, &res, &flags);
-                               }
-                               else
+                                               if (ciph->daddr != idaddr)
+                                                       updated = 1;
+                                       }
+                               } else {
                                        ciph->daddr = iph->saddr;
+                                       updated = 1;
+                               }
+                       }
+                       if (updated) {
+                               cksum  = &icmph->checksum;
+                               /* Using tcpudp primitive. Why not? */
+                               check  = csum_tcpudp_magic(ciph->saddr, ciph->daddr, 0, 0, ~(*cksum));
+                               *cksum = csum_tcpudp_magic(~isaddr, ~idaddr, 0, 0, ~check);
                        }
                        break;
                }
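
The checksum fix-up above avoids recomputing the ICMP checksum after rewriting the embedded addresses: it folds the new addresses in with csum_tcpudp_magic, then folds the complemented old ones back out. That is the RFC 1624 incremental update, HC' = ~(~HC + ~m + m'), applied per 16-bit word. A self-contained user-space demonstration under that reading (the helpers are illustrative, not the kernel's csum primitives) which checks the shortcut against a full recompute:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* End-around-carry fold of a 32-bit accumulator. */
static uint32_t fold(uint32_t sum)
{
        while (sum >> 16)
                sum = (sum & 0xFFFF) + (sum >> 16);
        return sum;
}

/* RFC 1071: one's-complement checksum over 16-bit words. */
static uint16_t cksum(const uint16_t *p, size_t words)
{
        uint32_t sum = 0;

        while (words--)
                sum += *p++;
        return (uint16_t)~fold(sum);
}

/* RFC 1624: HC' = ~(~HC + ~m + m'). */
static uint16_t cksum_update(uint16_t hc, uint16_t m, uint16_t m_new)
{
        return (uint16_t)~fold((uint16_t)~hc + (uint16_t)~m + m_new);
}

int main(void)
{
        uint16_t pkt[4] = { 0xc0a8, 0x0001, 0x1234, 0xabcd };
        uint16_t hc = cksum(pkt, 4);
        uint16_t old = pkt[1];

        pkt[1] = 0x00fe;                        /* rewrite one address word */
        hc = cksum_update(hc, old, pkt[1]);     /* incremental fix-up       */

        printf("incremental %04x, recomputed %04x\n", hc, cksum(pkt, 4));
        return 0;
}
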
index 9250223..fae22cb 100644 (file)
@@ -5,7+5,7 @@
  *
  *             The options processing module for ip.c
  *
- * Version:    $Id: ip_options.c,v 1.15 1998/10/03 09:37:27 davem Exp $
+ * Version:    $Id: ip_options.c,v 1.16 1999/03/21 05:22:40 davem Exp $
  *
  * Authors:    A.N.Kuznetsov
  *             
@@ -137,17+137,17 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
                        if (sopt->ts_needtime) {
                                if (soffset + 3 > optlen)
                                        return -EINVAL;
-                               dopt->ts_needtime = 1;
-                               soffset += 4;
-                               if ((dptr[3]&0xF) == IPOPT_TS_PRESPEC) {
-                                       __u32 addr;
-                                       if (soffset + 3 > optlen)
-                                               return -EINVAL;
+                               if ((dptr[3]&0xF) != IPOPT_TS_PRESPEC) {
+                                       dopt->ts_needtime = 1;
                                        soffset += 4;
+                               } else {
+                                       dopt->ts_needtime = 0;
+
                                        if (soffset + 8 <= optlen) {
-                                               dopt->ts_needtime = 0;
+                                               __u32 addr;
+
                                                memcpy(&addr, sptr+soffset-1, 4);
-                                               if (inet_addr_type(addr) != RTN_UNICAST) {
+                                               if (inet_addr_type(addr) != RTN_LOCAL) {
                                                        dopt->ts_needtime = 1;
                                                        soffset += 8;
                                                }
@@ -471,19+471,21 @@ void ip_options_undo(struct ip_options * opt)
        }
        if (opt->rr_needaddr) {
                unsigned  char * optptr = opt->__data+opt->rr-sizeof(struct  iphdr);
-               memset(&optptr[optptr[2]-1], 0, 4);
                optptr[2] -= 4;
+               memset(&optptr[optptr[2]-1], 0, 4);
        }
        if (opt->ts) {
                unsigned  char * optptr = opt->__data+opt->ts-sizeof(struct  iphdr);
                if (opt->ts_needtime) {
-                       memset(&optptr[optptr[2]-1], 0, 4);
                        optptr[2] -= 4;
-               }
-               if (opt->ts_needaddr)
                        memset(&optptr[optptr[2]-1], 0, 4);
-               if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC)
+                       if ((optptr[3]&0xF) == IPOPT_TS_PRESPEC)
+                               optptr[2] -= 4;
+               }
+               if (opt->ts_needaddr) {
                        optptr[2] -= 4;
+                       memset(&optptr[optptr[2]-1], 0, 4);
+               }
        }
 }
 
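The reordering above matters because the option pointer is one-based and points just past the last recorded slot: the undo must step the pointer back first and only then zero the slot it lands on, otherwise the memset clears the unused slot beyond the data (and the PRESPEC timestamp case backs out its extra 4 bytes without zeroing the prespecified address). A toy model of that pointer discipline, with illustrative names and a simplified 1-based offset:

#include <stdio.h>
#include <string.h>

static unsigned char opt[16];

static void record(unsigned char *ptr, const void *addr)
{
        memcpy(&opt[*ptr - 1], addr, 4);        /* fill the slot...     */
        *ptr += 4;                              /* ...then advance      */
}

static void undo(unsigned char *ptr)
{
        *ptr -= 4;                              /* back the pointer out */
        memset(&opt[*ptr - 1], 0, 4);           /* then clear the slot  */
}

int main(void)
{
        unsigned char ptr = 1;                  /* 1-based, as in IP options */
        unsigned addr = 0xc0a80001;

        record(&ptr, &addr);
        undo(&ptr);
        printf("ptr=%u slot=%02x%02x%02x%02x\n", ptr,
               opt[0], opt[1], opt[2], opt[3]);
        return 0;
}
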
index d684c9c..a519c45 100644 (file)
@@ -5,7+5,7 @@
  *
  *             The Internet Protocol (IP) output module.
  *
- * Version:    $Id: ip_output.c,v 1.65 1999/01/21 13:37:34 davem Exp $
+ * Version:    $Id: ip_output.c,v 1.66 1999/03/21 05:22:41 davem Exp $
  *
  * Authors:    Ross Biro, <bir7@leland.Stanford.Edu>
  *             Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  *                                     for decreased register pressure on x86 
 *                                     and more readability.
  *             Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
- *                                     silently abort send instead of failing
- *                                     with -EPERM.
+ *                                     silently drop skb instead of failing with -EPERM.
  */
 
 #include <asm/uaccess.h>
@@ -132,8+131,16 @@ void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
        dev = rt->u.dst.dev;
 
 #ifdef CONFIG_FIREWALL
-       if (call_out_firewall(PF_INET, dev, iph, NULL, &skb) < FW_ACCEPT)
-               goto drop;
+       /* For now we have no better mechanism to report an error. */
+       switch (call_out_firewall(PF_INET, dev, iph, NULL, &skb)) {
+       case FW_REJECT:
+               icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+               /* Fall thru... */
+       case FW_BLOCK:
+       case FW_QUEUE:
+               kfree_skb(skb);
+               return;
+       }
 #endif
 
        ip_send_check(iph);
@@ -141,11+148,6 @@ void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
        /* Send it out. */
        skb->dst->output(skb);
        return;
-
-#ifdef CONFIG_FIREWALL
-drop:
-       kfree_skb(skb);
-#endif
 }
 
 int __ip_finish_output(struct sk_buff *skb)
@@ -292,8+294,17 @@ void ip_queue_xmit(struct sk_buff *skb)
        dev = rt->u.dst.dev;
 
 #ifdef CONFIG_FIREWALL
-       if (call_out_firewall(PF_INET, dev, iph, NULL, &skb) < FW_ACCEPT) 
-               goto drop;
+       /* For now we have no better mechanism to report an error. */
+       switch (call_out_firewall(PF_INET, dev, iph, NULL, &skb)) {
+       case FW_REJECT:
+               start_bh_atomic();
+               icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+               end_bh_atomic();
+               /* Fall thru... */
+       case FW_BLOCK:
+       case FW_QUEUE:
+               goto drop;
+       }
 #endif
 
        /* This can happen when the transport layer has segments queued
@@ -340,8+351,12 @@ fragment:
                 */
                iph->frag_off |= __constant_htons(IP_DF);
                printk(KERN_DEBUG "sending pkt_too_big to self\n");
+
+               /* icmp_send is not reentrant, hence the bh_atomic... --ANK */
+               start_bh_atomic();
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
                          htonl(rt->u.dst.pmtu));
+               end_bh_atomic();
                goto drop;
        }
        ip_fragment(skb, skb->dst->output);
@@ -402,14+417,13 @@ int ip_build_xmit_slow(struct sock *sk,
        if (ip_dont_fragment(sk, &rt->u.dst))
                df = htons(IP_DF);
   
-       if (!sk->ip_hdrincl)
-               length -= sizeof(struct iphdr);
+       length -= sizeof(struct iphdr);
 
        if (opt) {
                fragheaderlen = sizeof(struct iphdr) + opt->optlen;
                maxfraglen = ((mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen;
        } else {
-               fragheaderlen = sk->ip_hdrincl ? 0 : sizeof(struct iphdr);
+               fragheaderlen = sizeof(struct iphdr);
                
                /*
                 *      Fragheaderlen is the size of 'overhead' on each buffer. Now work
@@ -474,7+488,6 @@ int ip_build_xmit_slow(struct sock *sk,
         */
         
        do {
-               int error;
                char *data;
                struct sk_buff * skb;
 
@@ -482,15+495,10 @@ int ip_build_xmit_slow(struct sock *sk,
                 *      Get the memory we require with some space left for alignment.
                 */
 
-               skb = sock_alloc_send_skb(sk, fraglen+hh_len+15, 0, flags&MSG_DONTWAIT, &error);
-               if (skb == NULL) {
-                       ip_statistics.IpOutDiscards++;
-                       if(nfrags>1)
-                               ip_statistics.IpFragCreates++;                  
-                       dev_unlock_list();
-                       return(error);
-               }
-               
+               skb = sock_alloc_send_skb(sk, fraglen+hh_len+15, 0, flags&MSG_DONTWAIT, &err);
+               if (skb == NULL)
+                       goto error;
+
                /*
                 *      Fill in the control structures
                 */
@@ -510,7+518,7 @@ int ip_build_xmit_slow(struct sock *sk,
                 *      Only write IP header onto non-raw packets 
                 */
                 
-               if(!sk->ip_hdrincl) {
+               {
                        struct iphdr *iph = (struct iphdr *)data;
 
                        iph->version = 4;
@@ -547,53+555,46 @@ int ip_build_xmit_slow(struct sock *sk,
                 *      User data callback
                 */
 
-               err = 0;
-               if (getfrag(frag, data, offset, fraglen-fragheaderlen))
+               if (getfrag(frag, data, offset, fraglen-fragheaderlen)) {
                        err = -EFAULT;
-               
-               /*
-                *      Account for the fragment.
-                */
-
-#ifdef CONFIG_FIREWALL
-               if(!err) {
-                       int fw_res;
-
-                       fw_res = call_out_firewall(PF_INET, rt->u.dst.dev, skb->nh.iph, NULL, &skb);
-                       if(fw_res == FW_QUEUE) {
-                               kfree_skb(skb);
-                               skb = NULL;
-                       } else if(fw_res < FW_ACCEPT) {
-                               err = -EPERM;
-                       }
-               }
-#endif
-
-               if (err) { 
-                       ip_statistics.IpOutDiscards++;
                        kfree_skb(skb);
-                       dev_unlock_list();
-                       return err; 
+                       goto error;
                }
-                       
 
                offset -= (maxfraglen-fragheaderlen);
                fraglen = maxfraglen;
 
                nfrags++;
 
-               err = 0; 
-               if (skb && rt->u.dst.output(skb)) {
-                       err = -ENETDOWN;
-                       ip_statistics.IpOutDiscards++;  
-                       break;
+#ifdef CONFIG_FIREWALL
+               switch (call_out_firewall(PF_INET, rt->u.dst.dev, skb->nh.iph, NULL, &skb)) {
+               case FW_QUEUE:
+                       kfree_skb(skb);
+                       continue;
+               case FW_BLOCK:
+               case FW_REJECT:
+                       kfree_skb(skb);
+                       err = -EPERM;
+                       goto error;
                }
+#endif
+
+               err = -ENETDOWN;
+               if (rt->u.dst.output(skb))
+                       goto error;
        } while (offset >= 0);
 
        if (nfrags>1)
                ip_statistics.IpFragCreates += nfrags;
        dev_unlock_list();
-       return err;
+       return 0;
+
+error:
+       ip_statistics.IpOutDiscards++;
+       if (nfrags>1)
+               ip_statistics.IpFragCreates += nfrags;
+       dev_unlock_list();
+       return err; 
 }
 
 
@@ -621,14+622,20 @@ int ip_build_xmit(struct sock *sk,
         *      choice RAW frames within 20 bytes of maximum size(rare) to the long path
         */
 
-       if (!sk->ip_hdrincl)
+       if (!sk->ip_hdrincl) {
                length += sizeof(struct iphdr);
 
-       /*
-        *      Check for slow path.
-        */
-       if (length > rt->u.dst.pmtu || ipc->opt != NULL)  
-               return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags); 
+               /*
+                *      Check for slow path.
+                */
+               if (length > rt->u.dst.pmtu || ipc->opt != NULL)  
+                       return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags); 
+       } else {
+               if (length > rt->u.dst.dev->mtu) {
+                       ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, rt->u.dst.dev->mtu);
+                       return -EMSGSIZE;
+               }
+       }
 
        /*
         *      Do path mtu discovery if needed.
@@ -636,7+643,7 @@ int ip_build_xmit(struct sock *sk,
        df = 0;
        if (ip_dont_fragment(sk, &rt->u.dst))
                df = htons(IP_DF);
-               
+
        /* 
         *      Fast path for unfragmented frames without options. 
         */ 
@@ -679,31+686,27 @@ int ip_build_xmit(struct sock *sk,
 
        dev_unlock_list();
 
-       if (err) 
-               err = -EFAULT;
+       if (err)
+               goto error_fault;
 
 #ifdef CONFIG_FIREWALL
-       if(!err) {
-               int fw_res;
-
-               fw_res = call_out_firewall(PF_INET, rt->u.dst.dev, iph, NULL, &skb);
-               if(fw_res == FW_QUEUE) {
-                       /* re-queued elsewhere; silently abort this send */
-                       kfree_skb(skb);
-                       return 0;
-               }
-               if(fw_res < FW_ACCEPT)
-                       err = -EPERM;
-       }
-#endif
-
-       if (err) { 
+       switch (call_out_firewall(PF_INET, rt->u.dst.dev, iph, NULL, &skb)) {
+       case FW_QUEUE:
                kfree_skb(skb);
+               return 0;
+       case FW_BLOCK:
+       case FW_REJECT:
+               kfree_skb(skb);
+               err = -EPERM;
                goto error;
        }
-       
+#endif
+
        return rt->u.dst.output(skb);
 
+error_fault:
+       err = -EFAULT;
+       kfree_skb(skb);
 error:
        ip_statistics.IpOutDiscards++;
        return err; 
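
All three output paths above now converge on the same verdict handling: FW_QUEUE means the firewall took the packet over, so the local reference is dropped and the send reports success; FW_REJECT notifies the sender with an ICMP unreachable before discarding; FW_BLOCK discards silently. A compact standalone model of that policy (the enum ordering, helpers, and return convention are illustrative):

#include <stdio.h>

enum fw_verdict { FW_ACCEPT, FW_QUEUE, FW_BLOCK, FW_REJECT };

static void icmp_unreachable(void) { puts("icmp: port unreachable"); }
static void free_pkt(void)         { puts("packet freed"); }

/* Returns >0 if the caller may keep transmitting, 0 on a silent
 * successful hand-off (FW_QUEUE), <0 on refusal (stands in for -EPERM). */
static int fw_out_verdict(enum fw_verdict v)
{
        switch (v) {
        case FW_QUEUE:
                free_pkt();             /* someone else owns it now   */
                return 0;
        case FW_REJECT:
                icmp_unreachable();     /* tell the sender first...   */
                /* fall through */
        case FW_BLOCK:
                free_pkt();             /* ...then discard either way */
                return -1;
        case FW_ACCEPT:
                break;
        }
        return 1;                       /* proceed with dst->output() */
}

int main(void)
{
        return fw_out_verdict(FW_REJECT) < 0 ? 0 : 1;
}
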
index 1391cbd..017e6e6 100644 (file)
@@ -5,7+5,7 @@
  *
  *             The IP to API glue.
  *             
- * Version:    $Id: ip_sockglue.c,v 1.39 1998/10/03 09:37:33 davem Exp $
+ * Version:    $Id: ip_sockglue.c,v 1.40 1999/03/21 05:22:42 davem Exp $
  *
  * Authors:    see ip.c
  *
@@ -209,7+209,9 @@ int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct s
                                        kfree(new_ra);
                                return -EADDRINUSE;
                        }
+                       net_serialize_enter();
                        *rap = ra->next;
+                       net_serialize_leave();
                        if (ra->destructor)
                                ra->destructor(sk);
                        kfree(ra);
@@ -220,10+222,10 @@ int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct s
                return -ENOBUFS;
        new_ra->sk = sk;
        new_ra->destructor = destructor;
-       start_bh_atomic();
        new_ra->next = ra;
+       net_serialize_enter();
        *rap = new_ra;
-       end_bh_atomic();
+       net_serialize_leave();
        return 0;
 }
 
@@ -404,7+406,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt
                        err = ip_options_get(&opt, optval, optlen, 1);
                        if (err)
                                return err;
-                       start_bh_atomic();
+                       lock_sock(sk);
                        if (sk->type == SOCK_STREAM) {
                                struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
@@ -420,7+422,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt
 #endif
                        }
                        opt = xchg(&sk->opt, opt);
-                       end_bh_atomic();
+                       release_sock(sk);
                        if (opt)
                                kfree_s(opt, sizeof(struct ip_options) + opt->optlen);
                        return 0;
@@ -463,11+465,12 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt
                            !capable(CAP_NET_ADMIN))
                                return -EPERM;
                        if (sk->ip_tos != val) {
+                               lock_sock(sk);
                                sk->ip_tos=val;
                                sk->priority = rt_tos2priority(val);
                                dst_release(xchg(&sk->dst_cache, NULL)); 
+                               release_sock(sk);
                        }
-                       sk->priority = rt_tos2priority(val);
                        return 0;
                case IP_TTL:
                        if (optlen<1)
@@ -637,11+640,11 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *op
                        {
                                unsigned char optbuf[sizeof(struct ip_options)+40];
                                struct ip_options * opt = (struct ip_options*)optbuf;
-                               start_bh_atomic();
+                               lock_sock(sk);
                                opt->optlen = 0;
                                if (sk->opt)
                                        memcpy(optbuf, sk->opt, sizeof(struct ip_options)+sk->opt->optlen);
-                               end_bh_atomic();
+                               release_sock(sk);
                                if (opt->optlen == 0) 
                                        return put_user(0, optlen);
 
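The switch from start_bh_atomic() to lock_sock() above reflects that these setsockopt/getsockopt paths may sleep and need exclusion against other socket users, not just against bottom halves; the option block is swapped under the lock and the old one freed only after it is released. A user-space model of that pattern, with a pthread mutex standing in for the socket lock and illustrative names (compile with -pthread):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct sock_model {
        pthread_mutex_t lock;
        int *opt;                       /* stands in for struct ip_options * */
};

static void set_opt(struct sock_model *sk, int *new_opt)
{
        int *old;

        pthread_mutex_lock(&sk->lock);          /* lock_sock(sk)    */
        old = sk->opt;                          /* opt = xchg(...)  */
        sk->opt = new_opt;
        pthread_mutex_unlock(&sk->lock);        /* release_sock(sk) */

        free(old);                              /* kfree outside the lock */
}

int main(void)
{
        struct sock_model sk = { PTHREAD_MUTEX_INITIALIZER, NULL };
        int *o = malloc(sizeof *o);

        *o = 42;
        set_opt(&sk, o);
        printf("opt=%d\n", *sk.opt);
        free(sk.opt);
        return 0;
}
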
index 9175e6f..0bb5f25 100644 (file)
@@ -1,7+1,7 @@
 /*
  *     Linux NET3:     IP/IP protocol decoder. 
  *
- *     Version: $Id: ipip.c,v 1.24 1998/10/03 09:37:35 davem Exp $
+ *     Version: $Id: ipip.c,v 1.25 1999/03/21 05:22:43 davem Exp $
  *
  *     Authors:
  *             Sam Lantinga (slouken@cs.ucdavis.edu)  02/01/95
@@ -157,6+157,49 @@ static struct ip_tunnel * ipip_tunnel_lookup(u32 remote, u32 local)
        return NULL;
 }
 
+static struct ip_tunnel **ipip_bucket(struct ip_tunnel *t)
+{
+       u32 remote = t->parms.iph.daddr;
+       u32 local = t->parms.iph.saddr;
+       unsigned h = 0;
+       int prio = 0;
+
+       if (remote) {
+               prio |= 2;
+               h ^= HASH(remote);
+       }
+       if (local) {
+               prio |= 1;
+               h ^= HASH(local);
+       }
+       return &tunnels[prio][h];
+}
+
+
+static void ipip_tunnel_unlink(struct ip_tunnel *t)
+{
+       struct ip_tunnel **tp;
+
+       for (tp = ipip_bucket(t); *tp; tp = &(*tp)->next) {
+               if (t == *tp) {
+                       net_serialize_enter();
+                       *tp = t->next;
+                       net_serialize_leave();
+                       break;
+               }
+       }
+}
+
+static void ipip_tunnel_link(struct ip_tunnel *t)
+{
+       struct ip_tunnel **tp = ipip_bucket(t);
+
+       net_serialize_enter();
+       t->next = *tp;
+       *tp = t;
+       net_serialize_leave();
+}
+
 struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create)
 {
        u32 remote = parms->iph.daddr;
@@ -208,10+251,7 @@ struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create)
        if (register_netdevice(dev) < 0)
                goto failed;
 
-       start_bh_atomic();
-       nt->next = t;
-       *tp = nt;
-       end_bh_atomic();
+       ipip_tunnel_link(nt);
        /* Do not decrement MOD_USE_COUNT here. */
        return nt;
 
@@ -221,39+261,20 @@ failed:
        return NULL;
 }
 
+
 static void ipip_tunnel_destroy(struct device *dev)
 {
-       struct ip_tunnel *t, **tp;
-       struct ip_tunnel *t0 = (struct ip_tunnel*)dev->priv;
-       u32 remote = t0->parms.iph.daddr;
-       u32 local = t0->parms.iph.saddr;
-       unsigned h = 0;
-       int prio = 0;
-
        if (dev == &ipip_fb_tunnel_dev) {
+               net_serialize_enter();
                tunnels_wc[0] = NULL;
-               return;
-       }
-
-       if (remote) {
-               prio |= 2;
-               h ^= HASH(remote);
-       }
-       if (local) {
-               prio |= 1;
-               h ^= HASH(local);
-       }
-       for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
-               if (t == t0) {
-                       *tp = t->next;
-                       kfree(dev);
-                       MOD_DEC_USE_COUNT;
-                       break;
-               }
+               net_serialize_leave();
+       } else {
+               ipip_tunnel_unlink((struct ip_tunnel*)dev->priv);
+               kfree(dev);
+               MOD_DEC_USE_COUNT;
        }
 }
 
-
 void ipip_err(struct sk_buff *skb, unsigned char *dp, int len)
 {
 #ifndef I_WISH_WORLD_WERE_PERFECT
@@ -642,6+663,32 @@ ipip_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd)
 
                t = ipip_tunnel_locate(&p, cmd == SIOCADDTUNNEL);
 
+               if (dev != &ipip_fb_tunnel_dev && cmd == SIOCCHGTUNNEL &&
+                   t != &ipip_fb_tunnel) {
+                       if (t != NULL) {
+                               if (t->dev != dev) {
+                                       err = -EEXIST;
+                                       break;
+                               }
+                       } else {
+                               if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
+                                   (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
+                                       err = -EINVAL;
+                                       break;
+                               }
+                               t = (struct ip_tunnel*)dev->priv;
+                               start_bh_atomic();
+                               ipip_tunnel_unlink(t);
+                               t->parms.iph.saddr = p.iph.saddr;
+                               t->parms.iph.daddr = p.iph.daddr;
+                               memcpy(dev->dev_addr, &p.iph.saddr, 4);
+                               memcpy(dev->broadcast, &p.iph.daddr, 4);
+                               ipip_tunnel_link(t);
+                               end_bh_atomic();
+                               netdev_state_change(dev);
+                       }
+               }
+
                if (t) {
                        err = 0;
                        if (cmd == SIOCCHGTUNNEL) {
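
ipip_bucket() above centralizes the hashing that ipip_tunnel_destroy() used to duplicate: four chain classes keyed by which endpoint addresses are wildcarded, then an XOR-folded hash over the known addresses. A standalone sketch of the bucket selection; HASH here mirrors the shape of the kernel macro, not its exact constants:

#include <stdint.h>
#include <stdio.h>

#define HSIZE 16
#define HASH(a) (((a) ^ ((a) >> 16)) & (HSIZE - 1))

static int bucket(uint32_t remote, uint32_t local, unsigned *h)
{
        int prio = 0;

        *h = 0;
        if (remote) { prio |= 2; *h ^= HASH(remote); }  /* remote known */
        if (local)  { prio |= 1; *h ^= HASH(local);  }  /* local known  */
        return prio;    /* 3: both, 2: remote only, 1: local only, 0: wildcard */
}

int main(void)
{
        unsigned h;
        int prio = bucket(0x0a000001, 0, &h);   /* remote-only tunnel */

        printf("prio=%d hash=%u\n", prio, h);
        return 0;
}
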
index 99cda3e..a164025 100644 (file)
@@ -9,7+9,7 @@
  *     as published by the Free Software Foundation; either version
  *     2 of the License, or (at your option) any later version.
  *
- *     Version: $Id: ipmr.c,v 1.38 1999/01/12 14:34:40 davem Exp $
+ *     Version: $Id: ipmr.c,v 1.39 1999/03/21 05:22:44 davem Exp $
  *
  *     Fixes:
  *     Michael Chastain        :       Incorrect size of copying.
@@ -138,6+138,8 @@ static struct device * reg_dev;
 
 static int reg_vif_xmit(struct sk_buff *skb, struct device *dev)
 {
+       ((struct net_device_stats*)dev->priv)->tx_bytes += skb->len;
+       ((struct net_device_stats*)dev->priv)->tx_packets++;
        ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
        kfree_skb(skb);
        return 0;
@@ -449,6+451,9 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
        struct igmpmsg *msg;
        int ret;
 
+       if (mroute_socket==NULL)
+               return -EINVAL;
+
 #ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT)
                skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
@@ -656,7+661,9 @@ static void mrtsock_destruct(struct sock *sk)
 {
        if (sk == mroute_socket) {
                ipv4_devconf.mc_forwarding = 0;
+               net_serialize_enter();
                mroute_socket=NULL;
+               net_serialize_leave();
                mroute_close(sk);
        }
 }
@@ -1045,7+1052,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c,
 
        dev = rt->u.dst.dev;
 
-       if (skb->len+encap > rt->u.dst.pmtu /* && (ntohs(iph->frag_off) & IP_DF) */) {
+       if (skb->len+encap > rt->u.dst.pmtu && (ntohs(iph->frag_off) & IP_DF)) {
                /* Do not fragment multicasts. Alas, IPv4 does not
                   allow to send ICMP, so that packets will disappear
                   to blackhole.
@@ -1119,7+1126,10 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c,
         * not mrouter) cannot join to more than one interface - it will
         * result in receiving multiple packets.
         */
-       skb2->dst->output(skb2);
+       if (skb2->len <= rt->u.dst.pmtu)
+               skb2->dst->output(skb2);
+       else
+               ip_fragment(skb2, skb2->dst->output);
 }
 
 int ipmr_find_vif(struct device *dev)
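
The corrected test above re-enables the DF check that had been commented out: an oversized multicast is dropped only when DF forbids fragmentation (no ICMP can be sent back for multicast), and the new output path fragments instead of silently transmitting over-MTU frames. A minimal decision-table sketch of that logic, with illustrative names:

#include <stdio.h>

#define IP_DF 0x4000

static const char *forward(unsigned len, unsigned pmtu, unsigned frag_off)
{
        if (len > pmtu && (frag_off & IP_DF))
                return "drop (DF set, no ICMP for multicast)";
        if (len > pmtu)
                return "ip_fragment";
        return "output";
}

int main(void)
{
        printf("%s\n", forward(2000, 1500, 0));         /* fragment */
        printf("%s\n", forward(2000, 1500, IP_DF));     /* drop     */
        printf("%s\n", forward(1000, 1500, IP_DF));     /* output   */
        return 0;
}
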
index 62b2451..6c7e5c7 100644 (file)
@@ -5,7+5,7 @@
  *
  *             ROUTE - implementation of the IP router.
  *
- * Version:    $Id: route.c,v 1.62 1999/03/15 22:16:51 davem Exp $
+ * Version:    $Id: route.c,v 1.63 1999/03/21 05:22:45 davem Exp $
  *
  * Authors:    Ross Biro, <bir7@leland.Stanford.Edu>
  *             Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  *             Andi Kleen      :       Load-limit warning messages.
  *     Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  *     Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
+ *     Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
+ *     Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
+ *             Marc Boucher    :       routing by fwmark
  *
  *             This program is free software; you can redistribute it and/or
  *             modify it under the terms of the GNU General Public License
@@ -108,6+111,7 @@ int ip_rt_redirect_silence = ((HZ/50) << (9+1));
 int ip_rt_error_cost = HZ;
 int ip_rt_error_burst = 5*HZ;
 int ip_rt_gc_elasticity = 8;
+int ip_rt_mtu_expires = 10*60*HZ;
 
 static unsigned long rt_deadline = 0;
 
@@ -165,13+169,14 @@ __u8 ip_tos2prio[16] = {
        TC_PRIO_FILLER
 };
 
+
 /*
  * Route cache.
  */
 
 struct rtable  *rt_hash_table[RT_HASH_DIVISOR];
 
-static struct rtable * rt_intern_hash(unsigned hash, struct rtable * rth);
+static int rt_intern_hash(unsigned hash, struct rtable * rth, struct rtable ** res);
 
 static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
 {
@@ -249,6+254,12 @@ static __inline__ void rt_free(struct rtable *rt)
        dst_free(&rt->u.dst);
 }
 
+static __inline__ void rt_drop(struct rtable *rt)
+{
+       ip_rt_put(rt);
+       dst_free(&rt->u.dst);
+}
+
 static __inline__ int rt_fast_clean(struct rtable *rth)
 {
        /* Kill broadcast/multicast entries very aggressively, if they
@@ -257,6+268,27 @@ static __inline__ int rt_fast_clean(struct rtable *rth)
                && rth->key.iif && rth->u.rt_next);
 }
 
+static __inline__ int rt_valuable(struct rtable *rth)
+{
+       return ((rth->rt_flags&(RTCF_REDIRECTED|RTCF_NOTIFY))
+               || rth->u.dst.expires);
+}
+
+static __inline__ int rt_may_expire(struct rtable *rth, int tmo1, int tmo2)
+{
+       int age;
+
+       if (atomic_read(&rth->u.dst.use))
+               return 0;
+
+       age = jiffies - rth->u.dst.lastuse;
+       if (age <= tmo1 && !rt_fast_clean(rth))
+               return 0;
+       if (age <= tmo2 && rt_valuable(rth))
+               return 0;
+       return 1;
+}
+
 static void rt_check_expire(unsigned long dummy)
 {
        int i;
@@ -271,22+303,27 @@ static void rt_check_expire(unsigned long dummy)
                rthp = &rt_hash_table[rover];
 
                while ((rth = *rthp) != NULL) {
-                       /*
-                        * Cleanup aged off entries.
-                        */
-
-                       if (!atomic_read(&rth->u.dst.use) &&
-                           (now - rth->u.dst.lastuse > tmo
-                            || rt_fast_clean(rth))) {
-                               *rthp = rth->u.rt_next;
-                               rt_free(rth);
+                       if (rth->u.dst.expires) {
+                               /* Entry expires even if it is in use */
+                               if ((long)(now - rth->u.dst.expires) < tmo) {
+                                       tmo >>= 1;
+                                       rthp = &rth->u.rt_next;
+                                       continue;
+                               }
+                       } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
+                               tmo >>= 1;
+                               rthp = &rth->u.rt_next;
                                continue;
                        }
 
-                       tmo >>= 1;
-                       rthp = &rth->u.rt_next;
+                       /*
+                        * Clean up aged-off entries.
+                        */
+                       *rthp = rth->u.rt_next;
+                       rt_free(rth);
                }
 
+               /* Fallback loop breaker. */
                if ((jiffies - now) > 0)
                        break;
        }
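
rt_may_expire() above gives the scanner two horizons: tmo1 for ordinary entries and the longer tmo2 for entries worth keeping (redirected, watched, or carrying an expiry), while in-use entries are untouchable; each survivor halves the budget for the rest of its chain, so crowded chains are pruned harder. A standalone model of one chain scan (fields and values are illustrative):

#include <stdio.h>

struct entry {
        int in_use, valuable, noisy;    /* noisy ~ rt_fast_clean() */
        long age;
};

static int may_expire(const struct entry *e, long tmo1, long tmo2)
{
        if (e->in_use)
                return 0;
        if (e->age <= tmo1 && !e->noisy)
                return 0;               /* young and quiet: keep */
        if (e->age <= tmo2 && e->valuable)
                return 0;               /* worth a longer life   */
        return 1;
}

int main(void)
{
        struct entry chain[] = {
                { 0, 0, 0,  50 }, { 0, 1, 0, 150 },
                { 1, 0, 0, 900 }, { 0, 0, 0, 300 },
        };
        long tmo = 200;

        for (unsigned i = 0; i < 4; i++) {
                if (!may_expire(&chain[i], tmo, 600)) {
                        tmo >>= 1;      /* survivor: tighten the budget */
                        continue;
                }
                printf("expire entry %u (age %ld)\n", i, chain[i].age);
        }
        return 0;
}
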
@@ -301,16+338,21 @@ static void rt_run_flush(unsigned long dummy)
 
        rt_deadline = 0;
 
+       net_serialize_enter();
        for (i=0; i<RT_HASH_DIVISOR; i++) {
                if ((rth = xchg(&rt_hash_table[i], NULL)) == NULL)
                        continue;
+               net_serialize_leave();
 
                for (; rth; rth=next) {
                        next = rth->u.rt_next;
                        rth->u.rt_next = NULL;
                        rt_free(rth);
                }
+
+               net_serialize_enter();
        }
+       net_serialize_leave();
 }
   
 void rt_cache_flush(int delay)
@@ -354,60+396,137 @@ void rt_cache_flush(int delay)
        end_bh_atomic();
 }
 
+/*
+   Short description of GC goals.
+
+   We want to build an algorithm which keeps the routing cache
+   at an equilibrium point, where the number of aged-off entries
+   stays approximately equal to the number of newly generated ones.
+
+   The current expiration strength is the variable "expire".
+   We adjust it dynamically, so that when networking is idle
+   "expire" is large enough to keep plenty of warm entries, and
+   when load increases it shrinks to limit the cache size.
+ */
+
 static int rt_garbage_collect(void)
 {
-       int i;
-       static unsigned expire = RT_GC_TIMEOUT>>1;
+       static unsigned expire = RT_GC_TIMEOUT;
        static unsigned long last_gc;
+       static int rover;
+       static int equilibrium;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;
-
-       start_bh_atomic();
+       int goal;
 
        /*
         * Garbage collection is pretty expensive,
-        * do not make it too frequently, but just increase expire strength.
+        * so do not run it too frequently.
         */
-       if (now - last_gc < ip_rt_gc_min_interval)
-               goto out;
+       if (now - last_gc < ip_rt_gc_min_interval &&
+           atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+               return 0;
 
-       expire++;
+       /* Calculate the number of entries we want to expire now. */
+       goal = atomic_read(&ipv4_dst_ops.entries) - RT_HASH_DIVISOR*ip_rt_gc_elasticity;
+       if (goal <= 0) {
+               if (equilibrium < ipv4_dst_ops.gc_thresh)
+                       equilibrium = ipv4_dst_ops.gc_thresh;
+               goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+               if (goal > 0) {
+                       equilibrium += min(goal/2, RT_HASH_DIVISOR);
+                       goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+               }
+       } else {
+               /* We are in a dangerous area. Try to reduce the cache
+                * really aggressively.
+                */
+               goal = max(goal/2, RT_HASH_DIVISOR);
+               equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
+       }
 
-       for (i=0; i<RT_HASH_DIVISOR; i++) {
-               unsigned tmo;
-               if (!rt_hash_table[i])
-                       continue;
-               tmo = expire;
-               for (rthp=&rt_hash_table[i]; (rth=*rthp); rthp=&rth->u.rt_next) {
-                       if (atomic_read(&rth->u.dst.use) ||
-                           (now - rth->u.dst.lastuse < tmo && !rt_fast_clean(rth))) {
-                               tmo >>= 1;
-                               continue;
+       if (now - last_gc >= ip_rt_gc_min_interval)
+               last_gc = now;
+
+       if (goal <= 0) {
+               equilibrium += goal;
+               goto work_done;
+       }
+
+       do {
+               int i, k;
+
+               start_bh_atomic();
+               for (i=0, k=rover; i<RT_HASH_DIVISOR; i++) {
+                       unsigned tmo = expire;
+
+                       k = (k + 1) & (RT_HASH_DIVISOR-1);
+                       rthp = &rt_hash_table[k];
+                       while ((rth = *rthp) != NULL) {
+                               if (!rt_may_expire(rth, tmo, expire)) {
+                                       tmo >>= 1;
+                                       rthp = &rth->u.rt_next;
+                                       continue;
+                               }
+                               *rthp = rth->u.rt_next;
+                               rth->u.rt_next = NULL;
+                               rt_free(rth);
+                               goal--;
                        }
-                       *rthp = rth->u.rt_next;
-                       rth->u.rt_next = NULL;
-                       rt_free(rth);
-                       break;
+                       if (goal <= 0)
+                               break;
                }
-               if ((jiffies-now)>0)
+               rover = k;
+               end_bh_atomic();
+
+               if (goal <= 0)
+                       goto work_done;
+
+               /* Goal is not achieved. We stop the process if:
+
+                  - expire has been reduced to zero; otherwise expire is halved.
+                  - the table is not full.
+                  - we are called from an interrupt.
+                  - the jiffies check is just a fallback/debug loop breaker;
+                    we will not spin here for a long time in any case.
+                */
+
+               if (expire == 0)
                        break;
-       }
 
-       last_gc = now;
-       if (atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
-               expire = ip_rt_gc_timeout>>1;
+               expire >>= 1;
+#if RT_CACHE_DEBUG >= 2
+               printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, i);
+#endif
 
-out:
-       expire -= expire>>ip_rt_gc_elasticity;
-       end_bh_atomic();
-       return (atomic_read(&ipv4_dst_ops.entries) > ip_rt_max_size);
+               if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+                       return 0;
+       } while (!in_interrupt() && jiffies - now < 1);
+
+       if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+               return 0;
+       if (net_ratelimit())
+               printk("dst cache overflow\n");
+       return 1;
+
+work_done:
+       expire += ip_rt_gc_min_interval;
+       if (expire > ip_rt_gc_timeout ||
+           atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
+               expire = ip_rt_gc_timeout;
+#if RT_CACHE_DEBUG >= 2
+       printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, rover);
+#endif
+       return 0;
 }
 
-static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt)
+static int rt_intern_hash(unsigned hash, struct rtable * rt, struct rtable ** rp)
 {
        struct rtable   *rth, **rthp;
        unsigned long   now = jiffies;
+       int attempts = !in_interrupt();
 
+restart:
        start_bh_atomic();
 
        rthp = &rt_hash_table[hash];
@@ -424,9+543,9 @@ static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt)
                        rth->u.dst.lastuse = now;
                        end_bh_atomic();
 
-                       ip_rt_put(rt);
-                       rt_free(rt);
-                       return rth;
+                       rt_drop(rt);
+                       *rp = rth;
+                       return 0;
                }
 
                rthp = &rth->u.rt_next;
@@ -435,8+554,28 @@ static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt)
        /* Try to bind route to arp only if it is output
           route or unicast forwarding path.
         */
-       if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0)
-               arp_bind_neighbour(&rt->u.dst);
+       if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
+               if (!arp_bind_neighbour(&rt->u.dst)) {
+                       end_bh_atomic();
+
+                       /* The neighbour tables are full and nothing
+                          can be released. Try to shrink the route cache;
+                          most likely it holds some neighbour records.
+                        */
+                       if (attempts-- > 0) {
+                               int saved_elasticity = ip_rt_gc_elasticity;
+                               ip_rt_gc_elasticity = 1;
+                               rt_garbage_collect();
+                               ip_rt_gc_elasticity = saved_elasticity;
+                               goto restart;
+                       }
+
+                       rt_drop(rt);
+                       if (net_ratelimit())
+                               printk("neighbour table overflow\n");
+                       return -ENOBUFS;
+               }
+       }
 
        rt->u.rt_next = rt_hash_table[hash];
 #if RT_CACHE_DEBUG >= 2
@@ -449,9+588,9 @@ static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt)
        }
 #endif
        rt_hash_table[hash] = rt;
-
        end_bh_atomic();
-       return rt;
+       *rp = rt;
+       return 0;
 }
 
 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -537,17+676,15 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
                                    !(rt->u.dst.neighbour->nud_state&NUD_VALID)) {
                                        if (rt->u.dst.neighbour)
                                                neigh_event_send(rt->u.dst.neighbour, NULL);
-                                       ip_rt_put(rt);
                                        ip_rt_put(rth);
-                                       rt_free(rt);
+                                       rt_drop(rt);
                                        break;
                                }
 
                                *rthp = rth->u.rt_next;
-                               rt = rt_intern_hash(hash, rt);
-                               ip_rt_put(rt);
-                               ip_rt_put(rth);
-                               rt_free(rth);
+                               if (!rt_intern_hash(hash, rt, &rt))
+                                       ip_rt_put(rt);
+                               rt_drop(rth);
                                break;
                        }
                }
@@ -573,14+710,14 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
                        ip_rt_put(rt);
                        return NULL;
                }
-               if (rt->rt_flags&RTCF_REDIRECTED) {
+               if ((rt->rt_flags&RTCF_REDIRECTED) || rt->u.dst.expires) {
                        unsigned hash = rt_hash_code(rt->key.dst, rt->key.src^(rt->key.oif<<5), rt->key.tos);
                        struct rtable **rthp;
 #if RT_CACHE_DEBUG >= 1
                        printk(KERN_DEBUG "ip_rt_advice: redirect to %d.%d.%d.%d/%02x dropped\n", NIPQUAD(rt->rt_dst), rt->key.tos);
 #endif
-                       ip_rt_put(rt);
                        start_bh_atomic();
+                       ip_rt_put(rt);
                        for (rthp = &rt_hash_table[hash]; *rthp; rthp = &(*rthp)->u.rt_next) {
                                if (*rthp == rt) {
                                        *rthp = rt->u.rt_next;
@@ -614,6+751,10 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 void ip_rt_send_redirect(struct sk_buff *skb)
 {
        struct rtable *rt = (struct rtable*)skb->dst;
+       struct in_device *in_dev = (struct in_device*)rt->u.dst.dev->ip_ptr;
+
+       if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev))
+               return;
 
        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
@@ -637,7+778,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
                rt->u.dst.rate_last = jiffies;
                ++rt->u.dst.rate_tokens;
 #ifdef CONFIG_IP_ROUTE_VERBOSE
-               if (skb->dev->ip_ptr && IN_DEV_LOG_MARTIANS((struct in_device*)skb->dev->ip_ptr) &&
+               if (IN_DEV_LOG_MARTIANS(in_dev) &&
                    rt->u.dst.rate_tokens == ip_rt_redirect_number && net_ratelimit())
                        printk(KERN_WARNING "host %08x/if%d ignores redirects for %08x to %08x.\n",
                               rt->rt_src, rt->rt_iif, rt->rt_dst, rt->rt_gateway);
@@ -737,6+878,7 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
                                        if (mtu < rth->u.dst.pmtu) { 
                                                dst_confirm(&rth->u.dst);
                                                rth->u.dst.pmtu = mtu;
+                                               dst_set_expires(&rth->u.dst, ip_rt_mtu_expires);
                                        }
                                        est_mtu = mtu;
                                }
@@ -760,7+902,13 @@ static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst,
 
 static void ipv4_link_failure(struct sk_buff *skb)
 {
+       struct rtable *rt;
+
        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
+
+       rt = (struct rtable *) skb->dst;
+       if (rt)
+               dst_set_expires(&rt->u.dst, 0);
 }
 
 static int ip_rt_bug(struct sk_buff *skb)
@@ -794,7+942,17 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
        memcpy(addr, &src, 4);
 }
 
-static void rt_set_nexthop(struct rtable *rt, struct fib_result *res)
+#ifdef CONFIG_NET_CLS_ROUTE
+static void set_class_tag(struct rtable *rt, u32 tag)
+{
+       if (!(rt->u.dst.tclassid&0xFFFF))
+               rt->u.dst.tclassid |= tag&0xFFFF;
+       if (!(rt->u.dst.tclassid&0xFFFF0000))
+               rt->u.dst.tclassid |= tag&0xFFFF0000;
+}
+#endif
+
+static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
 {
        struct fib_info *fi = res->fi;
 
@@ -824,9+982,11 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res)
                rt->u.dst.window= 0;
                rt->u.dst.rtt   = TCP_TIMEOUT_INIT;
        }
-#if defined(CONFIG_NET_CLS_ROUTE) && defined(CONFIG_IP_MULTIPLE_TABLES)
-       if (rt->u.dst.tclassid == 0)
-               rt->u.dst.tclassid = fib_rules_tclass(res);
+#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+       set_class_tag(rt, fib_rules_tclass(res));
+#endif
+       set_class_tag(rt, itag);
 #endif
         rt->rt_type = res->type;
 }
@@ -839,6+999,7 @@ ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
        struct rtable *rth;
        u32 spec_dst;
        struct in_device *in_dev = dev->ip_ptr;
+       u32 itag = 0;
 
        /* Primary sanity checks. */
 
@@ -850,7+1011,7 @@ ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
                if (!LOCAL_MCAST(daddr))
                        return -EINVAL;
                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
-       } else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst) < 0)
+       } else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag) < 0)
                return -EINVAL;
 
        rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
@@ -863,12+1024,18 @@ ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
        rth->key.dst    = daddr;
        rth->rt_dst     = daddr;
        rth->key.tos    = tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+       rth->key.fwmark = skb->fwmark;
+#endif
        rth->key.src    = saddr;
        rth->rt_src     = saddr;
 #ifdef CONFIG_IP_ROUTE_NAT
        rth->rt_dst_map = daddr;
        rth->rt_src_map = saddr;
 #endif
+#ifdef CONFIG_NET_CLS_ROUTE
+       rth->u.dst.tclassid = itag;
+#endif
        rth->rt_iif     =
        rth->key.iif    = dev->ifindex;
        rth->u.dst.dev  = &loopback_dev;
@@ -888,8+1055,7 @@ ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
 #endif
 
        hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos);
-       skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth);
-       return 0;
+       return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
 }
 
 /*
@@ -910,6+1076,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
        struct in_device *in_dev = dev->ip_ptr;
        struct in_device *out_dev;
        unsigned        flags = 0;
+       u32             itag = 0;
        struct rtable * rth;
        unsigned        hash;
        u32             spec_dst;
@@ -925,6+1092,9 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
        key.dst = daddr;
        key.src = saddr;
        key.tos = tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+       key.fwmark = skb->fwmark;
+#endif
        key.iif = dev->ifindex;
        key.oif = 0;
        key.scope = RT_SCOPE_UNIVERSE;
@@ -983,9+1153,14 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
                goto brd_input;
 
        if (res.type == RTN_LOCAL) {
-               spec_dst = daddr;
-               if (inet_addr_type(saddr) != RTN_UNICAST)
+               int result;
+               result = fib_validate_source(saddr, daddr, tos, loopback_dev.ifindex,
+                                            dev, &spec_dst, &itag);
+               if (result < 0)
                        goto martian_source;
+               if (result)
+                       flags |= RTCF_DIRECTSRC;
+               spec_dst = daddr;
                goto local_input;
        }
 
@@ -1005,7+1180,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
                return -EINVAL;
        }
 
-       err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst);
+       err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst, &itag);
        if (err < 0)
                goto martian_source;
 
@@ -1033,6+1208,9 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
        rth->key.dst    = daddr;
        rth->rt_dst     = daddr;
        rth->key.tos    = tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+       rth->key.fwmark = skb->fwmark;
+#endif
        rth->key.src    = saddr;
        rth->rt_src     = saddr;
        rth->rt_gateway = daddr;
@@ -1051,7+1229,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
        rth->u.dst.input = ip_forward;
        rth->u.dst.output = ip_output;
 
-       rt_set_nexthop(rth, &res);
+       rt_set_nexthop(rth, &res, itag);
 
        rth->rt_flags = flags;
 
@@ -1066,8+1244,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
        }
 #endif
 
-       skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth);
-       return 0;
+       return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
 
 brd_input:
        if (skb->protocol != __constant_htons(ETH_P_IP))
@@ -1076,7+1253,7 @@ brd_input:
        if (ZERONET(saddr)) {
                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
        } else {
-               err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst);
+               err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag);
                if (err < 0)
                        goto martian_source;
                if (err)
@@ -1096,12+1273,18 @@ local_input:
        rth->key.dst    = daddr;
        rth->rt_dst     = daddr;
        rth->key.tos    = tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+       rth->key.fwmark = skb->fwmark;
+#endif
        rth->key.src    = saddr;
        rth->rt_src     = saddr;
 #ifdef CONFIG_IP_ROUTE_NAT
        rth->rt_dst_map = key.dst;
        rth->rt_src_map = key.src;
 #endif
+#ifdef CONFIG_NET_CLS_ROUTE
+       rth->u.dst.tclassid = itag;
+#endif
        rth->rt_iif     =
        rth->key.iif    = dev->ifindex;
        rth->u.dst.dev  = &loopback_dev;
@@ -1116,8+1299,7 @@ local_input:
                rth->rt_flags   &= ~RTCF_LOCAL;
        }
        rth->rt_type    = res.type;
-       skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth);
-       return 0;
+       return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
 
 no_route:
        spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
@@ -1170,6+1352,9 @@ int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
                    rth->key.src == saddr &&
                    rth->key.iif == iif &&
                    rth->key.oif == 0 &&
+#ifdef CONFIG_IP_ROUTE_FWMARK
+                   rth->key.fwmark == skb->fwmark &&
+#endif
                    rth->key.tos == tos) {
                        rth->u.dst.lastuse = jiffies;
                        atomic_inc(&rth->u.dst.use);
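
With CONFIG_IP_ROUTE_FWMARK the firewall mark joins the routing-cache key, here and in the slow-path key setup above, so two packets that differ only in fwmark miss each other's cache entries and can be routed differently. A tiny model of the widened comparison (the struct layout and names are illustrative, not the kernel's rt_key):

#include <stdint.h>
#include <stdio.h>

struct rt_key {
        uint32_t dst, src;
        int iif, oif;
        uint32_t fwmark;        /* only compiled in with IP_ROUTE_FWMARK */
        uint8_t tos;
};

static int key_match(const struct rt_key *k, const struct rt_key *pkt)
{
        return k->dst == pkt->dst && k->src == pkt->src &&
               k->iif == pkt->iif && k->oif == pkt->oif &&
               k->fwmark == pkt->fwmark &&     /* the new comparison */
               k->tos == pkt->tos;
}

int main(void)
{
        struct rt_key cached = { 1, 2, 3, 0, 7, 0 };
        struct rt_key same = cached, marked = cached;

        marked.fwmark = 9;
        printf("same=%d marked=%d\n", key_match(&cached, &same),
               key_match(&cached, &marked));
        return 0;
}
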
@@ -1344,43+1529,33 @@ int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int
        if (res.type == RTN_NAT)
                return -EINVAL;
 
-
-       if (!key.src) {
-               key.src = FIB_RES_PREFSRC(res);
-
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-               /*
-                * "Stabilization" of route.
-                * This step is necessary, if locally originated packets
-                * are subjected to policy routing, otherwise we could get
-                * route flapping.
-                */
-               if (fib_lookup(&key, &res))
-                       return -ENETUNREACH;
-#endif
+       if (res.type == RTN_LOCAL) {
+               if (!key.src)
+                       key.src = key.dst;
+               dev_out = &loopback_dev;
+               key.oif = dev_out->ifindex;
+               res.fi = NULL;
+               flags |= RTCF_LOCAL;
+               goto make_route;
        }
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res.fi->fib_nhs > 1 && key.oif == 0)
                fib_select_multipath(&key, &res);
+       else
 #endif
+       if (res.prefixlen==0 && res.type == RTN_UNICAST && key.oif == 0)
+               fib_select_default(&key, &res);
 
-       dev_out = FIB_RES_DEV(res);
-
-       if (res.type == RTN_LOCAL) {
-               dev_out = &loopback_dev;
-               key.oif = dev_out->ifindex;
-               res.fi = NULL;
-               flags |= RTCF_LOCAL;
-       }
+       if (!key.src)
+               key.src = FIB_RES_PREFSRC(res);
 
+       dev_out = FIB_RES_DEV(res);
        key.oif = dev_out->ifindex;
 
 make_route:
-       if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK)) {
-               printk(KERN_DEBUG "this guy talks to %08x from loopback\n", key.dst);
+       if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
                return -EINVAL;
-       }
 
        if (key.dst == 0xFFFFFFFF)
                res.type = RTN_BROADCAST;
@@ -1449,13+1624,12 @@ make_route:
 #endif
        }
 
-       rt_set_nexthop(rth, &res);
+       rt_set_nexthop(rth, &res, 0);
 
        rth->rt_flags = flags;
 
        hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
-       *rp = rt_intern_hash(hash, rth);
-       return 0;
+       return rt_intern_hash(hash, rth, rp);
 }
 
 int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
@@ -1507,7+1681,7 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int no
 
        nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
        r = NLMSG_DATA(nlh);
-       nlh->nlmsg_flags = nowait ? NLM_F_MULTI : 0;
+       nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
        r->rtm_family = AF_INET;
        r->rtm_dst_len = 32;
        r->rtm_src_len = 0;
@@ -1517,6+1691,8 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int no
        r->rtm_scope = RT_SCOPE_UNIVERSE;
        r->rtm_protocol = RTPROT_UNSPEC;
        r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED;
+       if (rt->rt_flags & RTCF_NOTIFY)
+               r->rtm_flags |= RTM_F_NOTIFY;
        RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
        if (rt->key.src) {
                r->rtm_src_len = 32;
@@ -1524,6+1700,10 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int no
        }
        if (rt->u.dst.dev)
                RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
+#ifdef CONFIG_NET_CLS_ROUTE
+       if (rt->u.dst.tclassid)
+               RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
+#endif
        if (rt->key.iif)
                RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
        else if (rt->rt_src != rt->key.src)
@@ -1546,7+1726,10 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int no
        ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
        ci.rta_used = atomic_read(&rt->u.dst.refcnt);
        ci.rta_clntref = atomic_read(&rt->u.dst.use);
-       ci.rta_expires = 0;
+       if (rt->u.dst.expires)
+               ci.rta_expires = rt->u.dst.expires - jiffies;
+       else
+               ci.rta_expires = 0;
        ci.rta_error = rt->u.dst.error;
 #ifdef CONFIG_IP_MROUTE
        eptr = (struct rtattr*)skb->tail;
@@ -1625,7+1808,7 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
                end_bh_atomic();
                rt = (struct rtable*)skb->dst;
                if (!err && rt->u.dst.error)
-                       err = rt->u.dst.error;
+                       err = -rt->u.dst.error;
        } else {
                int oif = 0;
                if (rta[RTA_OIF-1])
@@ -1667,7+1850,7 @@ int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
        for (h=0; h < RT_HASH_DIVISOR; h++) {
                if (h < s_h) continue;
                if (h > s_h)
-                       memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(int));
+                       memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0]));
                start_bh_atomic();
                for (rt = rt_hash_table[h], idx = 0; rt; rt = rt->u.rt_next, idx++) {
                        if (idx < s_idx)
@@ -1758,12+1941,45 @@ ctl_table ipv4_route_table[] = {
        {NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity",
          &ip_rt_gc_elasticity, sizeof(int), 0644, NULL,
          &proc_dointvec},
+       {NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires",
+         &ip_rt_mtu_expires, sizeof(int), 0644, NULL,
+         &proc_dointvec_jiffies},
         {0}
 };
 #endif
 
+#ifdef CONFIG_NET_CLS_ROUTE
+struct ip_rt_acct ip_rt_acct[256];
+
+#ifdef CONFIG_PROC_FS
+static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
+                          int length, int *eof, void *data)
+{
+       *start=buffer;
+
+       if (offset + length > sizeof(ip_rt_acct)) {
+               length = sizeof(ip_rt_acct) - offset;
+               *eof = 1;
+       }
+       if (length > 0) {
+               start_bh_atomic();
+               memcpy(buffer, ((u8*)&ip_rt_acct)+offset, length);
+               end_bh_atomic();
+               return length;
+       }
+       return 0;
+}
+#endif
+#endif
+
+
 __initfunc(void ip_rt_init(void))
 {
+#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_NET_CLS_ROUTE
+       struct proc_dir_entry *ent;
+#endif
+#endif
        devinet_init();
        ip_fib_init();
        rt_periodic_timer.function = rt_check_expire;
@@ -1781,5+1997,9 @@ __initfunc(void ip_rt_init(void))
                0, &proc_net_inode_operations,
                rt_cache_get_info
        });
+#ifdef CONFIG_NET_CLS_ROUTE
+       ent = create_proc_entry("net/rt_acct", 0, 0);
+       ent->read_proc = ip_rt_acct_read;
+#endif
 #endif
 }
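Since ip_rt_acct_read() hands back the raw ip_rt_acct[256] array, userspace can slurp the whole table in a single read. A hedged userspace sketch follows; the four-counter struct layout is an assumption (it is not shown in this patch).

    #include <stdio.h>

    struct ip_rt_acct {                     /* assumed layout, not from this patch */
            unsigned int o_bytes, o_packets, i_bytes, i_packets;
    };

    int main(void)
    {
            struct ip_rt_acct acct[256];
            int i;
            FILE *f = fopen("/proc/net/rt_acct", "rb");

            if (!f || fread(acct, sizeof(acct), 1, f) != 1)
                    return 1;
            for (i = 0; i < 256; i++)
                    if (acct[i].i_packets || acct[i].o_packets)
                            printf("class %3d: in %u pkts, out %u pkts\n",
                                   i, acct[i].i_packets, acct[i].o_packets);
            fclose(f);
            return 0;
    }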
index 647c156..3afdb55 100644 (file)
@@ -5,7+5,7 @@
  *
  *             Implementation of the Transmission Control Protocol(TCP).
  *
- * Version:    $Id: tcp_ipv4.c,v 1.169 1999/03/11 00:04:22 davem Exp $
+ * Version:    $Id: tcp_ipv4.c,v 1.170 1999/03/21 05:22:47 davem Exp $
  *
  *             IPv4 specific functions
  *
@@ -726,6+726,9 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip)
 {
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 
+       if (atomic_read(&sk->sock_readers))
+               return;
+
        /* We're not interested in TCP_LISTEN and open_requests (SYN-ACKs
         * sent out by Linux are always <576 bytes, so they should go through
         * unfragmented).
@@ -739,19+742,18 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip)
         * There is a small race when the user changes this flag in the
         * route, but I think that's acceptable.
         */
-       if (sk->ip_pmtudisc != IP_PMTUDISC_DONT && sk->dst_cache) {
-               if (tp->pmtu_cookie > sk->dst_cache->pmtu &&
-                   !atomic_read(&sk->sock_readers)) {
-                       tcp_sync_mss(sk, sk->dst_cache->pmtu);
-
-                       /* Resend the TCP packet because it's  
-                        * clear that the old packet has been
-                        * dropped. This is the new "fast" path mtu
-                        * discovery.
-                        */
-                       tcp_simple_retransmit(sk);
-               } /* else let the usual retransmit timer handle it */
-       }
+       if (sk->dst_cache &&
+           sk->ip_pmtudisc != IP_PMTUDISC_DONT &&
+           tp->pmtu_cookie > sk->dst_cache->pmtu) {
+               tcp_sync_mss(sk, sk->dst_cache->pmtu);
+
+               /* Resend the TCP packet because it's  
+                * clear that the old packet has been
+                * dropped. This is the new "fast" path mtu
+                * discovery.
+                */
+               tcp_simple_retransmit(sk);
+       } /* else let the usual retransmit timer handle it */
 }
 
 /*
@@ -778,6+780,11 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
        struct tcp_opt *tp;
        int type = skb->h.icmph->type;
        int code = skb->h.icmph->code;
+#if ICMP_MIN_LENGTH < 14
+       int no_flags = 0;
+#else
+#define no_flags 0
+#endif
        struct sock *sk;
        __u32 seq;
        int err;
@@ -786,6+793,10 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
                icmp_statistics.IcmpInErrors++; 
                return;
        }
+#if ICMP_MIN_LENGTH < 14
+       if (len < (iph->ihl << 2) + 14)
+               no_flags = 1;
+#endif
 
        th = (struct tcphdr*)(dp+(iph->ihl<<2));
 
@@ -852,7+863,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
                 * ACK should set the opening flag, but that is too
                 * complicated right now. 
                 */ 
-               if (!th->syn && !th->ack)
+               if (!no_flags && !th->syn && !th->ack)
                        return;
 
                req = tcp_v4_search_req(tp, iph, th, &prev); 
@@ -887,7+898,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
                break;
        case TCP_SYN_SENT:
        case TCP_SYN_RECV:  /* Cannot happen */ 
-               if (!th->syn)
+               if (!no_flags && !th->syn)
                        return;
                tcp_statistics.TcpAttemptFails++;
                sk->err = err;
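The no_flags dance above exists because RFC 792 only guarantees that an ICMP error quotes the first 8 bytes of the offending datagram, while the TCP flag bits sit in byte 13 of the TCP header. The test in isolation (hypothetical helper, kernel types assumed):

    static inline int icmp_quotes_tcp_flags(struct iphdr *iph, int len)
    {
            /* the flags are in byte 13 of the TCP header; require that
               much of the original datagram in the ICMP payload before
               trusting th->syn / th->ack */
            return len >= (iph->ihl << 2) + 14;
    }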
index 113b06e..de0a918 100644 (file)
@@ -5,7+5,7 @@
  *
  *             The User Datagram Protocol (UDP).
  *
- * Version:    $Id: udp.c,v 1.64 1998/11/08 11:17:07 davem Exp $
+ * Version:    $Id: udp.c,v 1.65 1999/03/21 05:22:49 davem Exp $
  *
  * Authors:    Ross Biro, <bir7@leland.Stanford.Edu>
  *             Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -570,7+570,6 @@ struct udpfakehdr
        struct udphdr uh;
        u32 saddr;
        u32 daddr;
-       u32 other;
        struct iovec *iov;
        u32 wcheck;
 };
@@ -778,7+777,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
                ufh.daddr = ipc.addr = rt->rt_dst;
        ufh.uh.len = htons(ulen);
        ufh.uh.check = 0;
-       ufh.other = (htons(ulen) << 16) + IPPROTO_UDP*256;
        ufh.iov = msg->msg_iov;
        ufh.wcheck = 0;
 
@@ -846,7+844,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
        return(0);
 }
 
-#if defined(CONFIG_FILTER) || !defined(HAVE_CSUM_COPY_USER) 
+#ifndef HAVE_CSUM_COPY_USER
 #undef CONFIG_UDP_DELAY_CSUM
 #endif
 
@@ -890,11+888,11 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len,
        err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
                                        copied);
 #else
-       if (sk->no_check || skb->ip_summed==CHECKSUM_UNNECESSARY) {
+       if (skb->ip_summed==CHECKSUM_UNNECESSARY) {
                err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
                                              copied);
        } else if (copied > msg->msg_iov[0].iov_len || (msg->msg_flags&MSG_TRUNC)) {
-               if (csum_fold(csum_partial(skb->h.raw, ntohs(skb->h.uh->len), skb->csum))) 
+               if ((unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum))) 
                        goto csum_copy_err;
                err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
                                              copied);
@@ -907,7+905,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len,
                                             copied, csum, &err);
                if (err)
                        goto out_free;
-               if (csum_fold(csum)) 
+               if ((unsigned short)csum_fold(csum)) 
                        goto csum_copy_err;
        }
 #endif
@@ -1030,6+1028,19 @@ static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
         *      Charge it to the socket, dropping if the queue is full.
         */
 
+#if defined(CONFIG_FILTER) && defined(CONFIG_UDP_DELAY_CSUM)
+       if (sk->filter && skb->ip_summed != CHECKSUM_UNNECESSARY) {
+               if ((unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum))) {
+                       udp_statistics.UdpInErrors++;
+                       ip_statistics.IpInDiscards++;
+                       ip_statistics.IpInDelivers--;
+                       kfree_skb(skb);
+                       return -1;
+               }
+               skb->ip_summed = CHECKSUM_UNNECESSARY;
+       }
+#endif
+
        if (sock_queue_rcv_skb(sk,skb)<0) {
                udp_statistics.UdpInErrors++;
                ip_statistics.IpInDiscards++;
@@ -1179,7+1190,7 @@ int udp_rcv(struct sk_buff *skb, unsigned short len)
        if (sk == NULL) {
 #ifdef CONFIG_UDP_DELAY_CSUM
                if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
-                   csum_fold(csum_partial((char*)uh, ulen, skb->csum))) 
+                   (unsigned short)csum_fold(csum_partial((char*)uh, ulen, skb->csum))) 
                        goto csum_error;
 #endif
                udp_statistics.UdpNoPorts++;
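The (unsigned short) casts sprinkled through the delayed-checksum paths guard against csum_fold() implementations that return the 16-bit fold in a wider type. A self-contained sketch of what the fold amounts to (not the arch code itself):

    static inline unsigned short fold_csum(unsigned int sum)
    {
            sum = (sum & 0xffff) + (sum >> 16);     /* fold the high half in */
            sum = (sum & 0xffff) + (sum >> 16);     /* absorb the carry */
            return (unsigned short)~sum;            /* zero means "checksum ok" */
    }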
index 362b606..e3257e0 100644 (file)
@@ -5,7+5,7 @@
  *     Authors:
  *     Pedro Roque             <roque@di.fc.ul.pt>     
  *
- *     $Id: addrconf.c,v 1.46 1999/01/12 14:34:47 davem Exp $
+ *     $Id: addrconf.c,v 1.47 1999/03/21 05:22:50 davem Exp $
  *
  *     This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
@@ -88,6+88,34 @@ static struct timer_list addr_chk_timer = {
        0, 0, addrconf_verify
 };
 
+/* These locks protect only against address deletions,
+   not against address adds or status updates.
+   That is OK. The only race is that an address may become invalid
+   immediately after it is selected, which is harmless: the address
+   could equally well have become invalid a few usecs earlier.
+
+   It is important that:
+
+   1. The result of ipv6_add_addr() is used only inside the lock
+      or from bh_atomic context.
+
+   2. ipv6_get_lladdr() is used only from bh-protected context.
+
+   3. The result of ipv6_chk_addr() is not used outside of bh-protected context.
+ */
+
+static __inline__ void addrconf_lock(void)
+{
+       atomic_inc(&addr_list_lock);
+       synchronize_bh();
+}
+
+static __inline__ void addrconf_unlock(void)
+{
+       atomic_dec(&addr_list_lock);
+}
+
 static int addrconf_ifdown(struct device *dev, int how);
 
 static void addrconf_dad_start(struct inet6_ifaddr *ifp);
@@ -188,7+216,7 @@ static struct inet6_dev * ipv6_add_dev(struct device *dev)
        if (dev->mtu < IPV6_MIN_MTU)
                return NULL;
 
-       ndev = kmalloc(sizeof(struct inet6_dev), gfp_any());
+       ndev = kmalloc(sizeof(struct inet6_dev), GFP_KERNEL);
 
        if (ndev) {
                memset(ndev, 0, sizeof(struct inet6_dev));
@@ -227,9+255,9 @@ static struct inet6_dev * ipv6_find_idev(struct device *dev)
                idev = ipv6_add_dev(dev);
                if (idev == NULL)
                        return NULL;
+               if (dev->flags&IFF_UP)
+                       ipv6_mc_up(idev);
        }
-       if (dev->flags&IFF_UP)
-               ipv6_mc_up(idev);
        return idev;
 }
 
@@ -260,13+288,13 @@ struct inet6_dev * ipv6_get_idev(struct device *dev)
        return NULL;
 }
 
-struct inet6_ifaddr * ipv6_add_addr(struct inet6_dev *idev, 
-                                   struct in6_addr *addr, int scope)
+static struct inet6_ifaddr *
+ipv6_add_addr(struct inet6_dev *idev, struct in6_addr *addr, int scope)
 {
        struct inet6_ifaddr *ifa;
        int hash;
 
-       ifa = kmalloc(sizeof(struct inet6_ifaddr), gfp_any());
+       ifa = kmalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC);
 
        if (ifa == NULL) {
                ADBG(("ipv6_add_addr: malloc failed\n"));
@@ -312,7+340,9 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
 
        for (; iter; iter = iter->lst_next) {
                if (iter == ifp) {
+                       net_serialize_enter();
                        *back = ifp->lst_next;
+                       net_serialize_leave();
                        ifp->lst_next = NULL;
                        break;
                }
@@ -324,7+354,9 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
 
        for (; iter; iter = iter->if_next) {
                if (iter == ifp) {
+                       net_serialize_enter();
                        *back = ifp->if_next;
+                       net_serialize_leave();
                        ifp->if_next = NULL;
                        break;
                }
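This is the unlink idiom used throughout the patch: walk the singly linked list with a pointer-to-pointer and splice the node out inside the serialized region, so bottom-half readers never observe a half-edited list. Sketched generically (hypothetical helper over the lst_next chain):

    static void addr_list_unlink(struct inet6_ifaddr **head,
                                 struct inet6_ifaddr *victim)
    {
            struct inet6_ifaddr **back;

            for (back = head; *back; back = &(*back)->lst_next) {
                    if (*back == victim) {
                            net_serialize_enter();
                            *back = victim->lst_next;
                            net_serialize_leave();
                            victim->lst_next = NULL;
                            break;
                    }
            }
    }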
@@ -343,24+375,23 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
  *     ii)     see if there is a specific route for the destination and use
  *             an address of the attached interface 
  *     iii)    don't use deprecated addresses
- *
- *     at the moment I believe only iii) is missing.
  */
-struct inet6_ifaddr * ipv6_get_saddr(struct dst_entry *dst,
-                                    struct in6_addr *daddr)
+int ipv6_get_saddr(struct dst_entry *dst,
+                  struct in6_addr *daddr, struct in6_addr *saddr)
 {
        int scope;
        struct inet6_ifaddr *ifp = NULL;
        struct inet6_ifaddr *match = NULL;
        struct device *dev = NULL;
        struct rt6_info *rt;
+       int err;
        int i;
 
        rt = (struct rt6_info *) dst;
        if (rt)
                dev = rt->rt6i_dev;
        
-       atomic_inc(&addr_list_lock);
+       addrconf_lock();
 
        scope = ipv6_addr_scope(daddr);
        if (rt && (rt->rt6i_flags & RTF_ALLONLINK)) {
@@ -388,10+419,10 @@ struct inet6_ifaddr * ipv6_get_saddr(struct dst_entry *dst,
                        if (idev->dev == dev) {
                                for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) {
                                        if (ifp->scope == scope) {
-                                               if (!(ifp->flags & ADDR_STATUS))
+                                               if (!(ifp->flags & (ADDR_STATUS|DAD_STATUS)))
                                                        goto out;
 
-                                               if (!(ifp->flags & ADDR_INVALID))
+                                               if (!(ifp->flags & (ADDR_INVALID|DAD_STATUS)))
                                                        match = ifp;
                                        }
                                }
@@ -410,10+441,10 @@ struct inet6_ifaddr * ipv6_get_saddr(struct dst_entry *dst,
        for (i=0; i < IN6_ADDR_HSIZE; i++) {
                for (ifp=inet6_addr_lst[i]; ifp; ifp=ifp->lst_next) {
                        if (ifp->scope == scope) {
-                               if (!(ifp->flags & ADDR_STATUS))
+                               if (!(ifp->flags & (ADDR_STATUS|DAD_STATUS)))
                                        goto out;
 
-                               if (!(ifp->flags & ADDR_INVALID))
+                               if (!(ifp->flags & (ADDR_INVALID|DAD_STATUS)))
                                        match = ifp;
                        }
                }
@@ -422,28+453,30 @@ struct inet6_ifaddr * ipv6_get_saddr(struct dst_entry *dst,
 out:
        if (ifp == NULL)
                ifp = match;
-       atomic_dec(&addr_list_lock);
-       return ifp;
+
+       err = -ENETUNREACH;
+       if (ifp) {
+               memcpy(saddr, &ifp->addr, sizeof(struct in6_addr));
+               err = 0;
+       }
+       addrconf_unlock();
+       return err;
 }
 
 struct inet6_ifaddr * ipv6_get_lladdr(struct device *dev)
 {
-       struct inet6_ifaddr *ifp;
+       struct inet6_ifaddr *ifp = NULL;
        struct inet6_dev *idev;
-       int hash;
-
-       hash = ipv6_devindex_hash(dev->ifindex);
 
-       for (idev = inet6_dev_lst[hash]; idev; idev=idev->next) {
-               if (idev->dev == dev) {
-                       for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) {
-                               if (ifp->scope == IFA_LINK)
-                                       return ifp;
-                       }
-                       break;
+       if ((idev = ipv6_get_idev(dev)) != NULL) {
+               addrconf_lock();
+               for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) {
+                       if (ifp->scope == IFA_LINK)
+                               break;
                }
+               addrconf_unlock();
        }
-       return NULL;
+       return ifp;
 }
 
 /*
@@ -461,7+494,7 @@ struct inet6_ifaddr * ipv6_chk_addr(struct in6_addr *addr, struct device *dev, i
        if (!nd)
                flags |= DAD_STATUS|ADDR_INVALID;
 
-       atomic_inc(&addr_list_lock);
+       addrconf_lock();
 
        hash = ipv6_addr_hash(addr);
        for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) {
@@ -472,7+505,7 @@ struct inet6_ifaddr * ipv6_chk_addr(struct in6_addr *addr, struct device *dev, i
                }
        }
 
-       atomic_dec(&addr_list_lock);
+       addrconf_unlock();
        return ifp;
 }
 
@@ -665,13+698,6 @@ void addrconf_prefix_rcv(struct device *dev, u8 *opt, int len)
        }
 
        /*
-        *      If we where using an "all destinations on link" route
-        *      delete it
-        */
-
-       rt6_purge_dflt_routers(RTF_ALLONLINK);
-
-       /*
         *      Two things going on here:
         *      1) Add routes for on-link prefixes
         *      2) Configure prefixes with the auto flag set
@@ -845,14+871,17 @@ static int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen)
 
        scope = ipv6_addr_scope(pfx);
 
-       if ((ifp = ipv6_add_addr(idev, pfx, scope)) == NULL)
-               return -ENOMEM;
-
-       ifp->prefix_len = plen;
-       ifp->flags |= ADDR_PERMANENT;
+       addrconf_lock();
+       if ((ifp = ipv6_add_addr(idev, pfx, scope)) != NULL) {
+               ifp->prefix_len = plen;
+               ifp->flags |= ADDR_PERMANENT;
+               addrconf_dad_start(ifp);
+               addrconf_unlock();
+               return 0;
+       }
+       addrconf_unlock();
 
-       addrconf_dad_start(ifp);
-       return 0;
+       return -ENOBUFS;
 }
 
 static int inet6_addr_del(int ifindex, struct in6_addr *pfx, int plen)
@@ -870,20+899,22 @@ static int inet6_addr_del(int ifindex, struct in6_addr *pfx, int plen)
 
        scope = ipv6_addr_scope(pfx);
 
+       start_bh_atomic();
        for (ifp = idev->addr_list; ifp; ifp=ifp->if_next) {
                if (ifp->scope == scope && ifp->prefix_len == plen &&
                    (!memcmp(pfx, &ifp->addr, sizeof(struct in6_addr)))) {
                        ipv6_del_addr(ifp);
+                       end_bh_atomic();
 
                        /* If the last address is deleted administratively,
                           disable IPv6 on this interface.
                         */
-                          
                        if (idev->addr_list == NULL)
                                addrconf_ifdown(idev->dev, 1);
                        return 0;
                }
        }
+       end_bh_atomic();
        return -EADDRNOTAVAIL;
 }
 
@@ -940,12+971,14 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
        }
 
        if (addr.s6_addr32[3]) {
+               addrconf_lock();
                ifp = ipv6_add_addr(idev, &addr, scope);
                if (ifp) {
                        ifp->flags |= ADDR_PERMANENT;
                        ifp->prefix_len = 128;
                        ipv6_ifa_notify(RTM_NEWADDR, ifp);
                }
+               addrconf_unlock();
                return;
        }
 
@@ -967,17+1000,17 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
                                        flag |= IFA_HOST;
                                }
 
+                               addrconf_lock();
                                ifp = ipv6_add_addr(idev, &addr, flag);
-                       
-                               if (ifp == NULL)
-                                       continue;
-
-                               if (idev->dev->flags&IFF_POINTOPOINT)
-                                       ifp->prefix_len = 10;
-                               else
-                                       ifp->prefix_len = 96;
-                               ifp->flags |= ADDR_PERMANENT;
-                               ipv6_ifa_notify(RTM_NEWADDR, ifp);
+                               if (ifp) {
+                                       if (idev->dev->flags&IFF_POINTOPOINT)
+                                               ifp->prefix_len = 10;
+                                       else
+                                               ifp->prefix_len = 96;
+                                       ifp->flags |= ADDR_PERMANENT;
+                                       ipv6_ifa_notify(RTM_NEWADDR, ifp);
+                               }
+                               addrconf_unlock();
                        }
                }
         }
@@ -999,31+1032,29 @@ static void init_loopback(struct device *dev)
                return;
        }
 
+       addrconf_lock();
        ifp = ipv6_add_addr(idev, &addr, IFA_HOST);
 
-       if (ifp == NULL) {
-               printk(KERN_DEBUG "init_loopback: add_addr failed\n");
-               return;
+       if (ifp) {
+               ifp->flags |= ADDR_PERMANENT;
+               ifp->prefix_len = 128;
+               ipv6_ifa_notify(RTM_NEWADDR, ifp);
        }
-
-       ifp->flags |= ADDR_PERMANENT;
-       ifp->prefix_len = 128;
-
-       ipv6_ifa_notify(RTM_NEWADDR, ifp);
+       addrconf_unlock();
 }
 
 static void addrconf_add_linklocal(struct inet6_dev *idev, struct in6_addr *addr)
 {
        struct inet6_ifaddr * ifp;
 
+       addrconf_lock();
        ifp = ipv6_add_addr(idev, addr, IFA_LINK);
-       if (ifp == NULL)
-               return;
-
-       ifp->flags = ADDR_PERMANENT;
-       ifp->prefix_len = 10;
-
-       addrconf_dad_start(ifp);
+       if (ifp) {
+               ifp->flags = ADDR_PERMANENT;
+               ifp->prefix_len = 10;
+               addrconf_dad_start(ifp);
+       }
+       addrconf_unlock();
 }
 
 static void addrconf_dev_config(struct device *dev)
@@ -1375,8+1406,12 @@ static int iface_proc_info(char *buffer, char **start, off_t offset,
        struct inet6_ifaddr *ifp;
        int i;
        int len = 0;
+       off_t pos=0;
+       off_t begin=0;
 
-       for (i=0; i < IN6_ADDR_HSIZE; i++)
+       addrconf_lock();
+
+       for (i=0; i < IN6_ADDR_HSIZE; i++) {
                for (ifp=inet6_addr_lst[i]; ifp; ifp=ifp->lst_next) {
                        int j;
 
@@ -1393,14+1428,25 @@ static int iface_proc_info(char *buffer, char **start, off_t offset,
                                       ifp->scope,
                                       ifp->flags,
                                       ifp->idev->dev->name);
+                       pos=begin+len;
+                       if(pos<offset) {
+                               len=0;
+                               begin=pos;
+                       }
+                       if(pos>offset+length)
+                               goto done;
                }
+       }
 
-       *start = buffer + offset;
-
-       len -= offset;
-
-       if (len > length)
-               len = length;
+done:
+       addrconf_unlock();
+
+       *start=buffer+(offset-begin);
+       len-=(offset-begin);
+       if(len>length)
+               len=length;
+       if(len<0)
+               len=0;
        return len;
 }
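The pos/begin bookkeeping added here is the classic get_info pagination: format every record, but keep only the [offset, offset+length) byte window the caller asked for. A runnable userspace demo of the same arithmetic (hypothetical, with eight fixed lines standing in for the address list):

    #include <stdio.h>
    #include <string.h>

    static int window_read(char *out, long offset, int length)
    {
            char buffer[256];
            long pos = 0, begin = 0;
            int i, len = 0;

            for (i = 0; i < 8; i++) {              /* stand-in record source */
                    len += sprintf(buffer + len, "line %d\n", i);
                    pos = begin + len;
                    if (pos < offset) {            /* wholly before the window */
                            len = 0;               /* discard what we formatted */
                            begin = pos;           /* buffer now "starts" at pos */
                    }
                    if (pos > offset + length)
                            break;                 /* window already covered */
            }
            len -= (offset - begin);
            if (len > length)
                    len = length;
            if (len < 0)
                    len = 0;
            if (len > 0)
                    memcpy(out, buffer + (offset - begin), len);
            return len;
    }

    int main(void)
    {
            char chunk[16];
            int n = window_read(chunk, 7, 14);     /* bytes 7..20 */

            printf("%.*s", n, chunk);              /* prints "line 1\nline 2\n" */
            return 0;
    }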
 
@@ -1423,6+1469,12 @@ void addrconf_verify(unsigned long foo)
        unsigned long now = jiffies;
        int i;
 
+       if (atomic_read(&addr_list_lock)) {
+               addr_chk_timer.expires = jiffies + 1*HZ;
+               add_timer(&addr_chk_timer);
+               return;
+       }
+
        for (i=0; i < IN6_ADDR_HSIZE; i++) {
                for (ifp=inet6_addr_lst[i]; ifp;) {
                        if (ifp->flags & ADDR_INVALID) {
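The lock discipline in this file is a counting reader lock: process-context users bump addr_list_lock and flush pending bottom halves, while the bh-context timer (addrconf_verify, above) merely tests the counter and reschedules itself when the list is busy. Sketched in isolation, with hypothetical names:

    static atomic_t list_lock = ATOMIC_INIT(0);

    static inline void list_lock_enter(void)       /* process context */
    {
            atomic_inc(&list_lock);                /* keep bh deleters away */
            synchronize_bh();                      /* let running bh's drain */
    }

    static inline void list_lock_leave(void)
    {
            atomic_dec(&list_lock);
    }

    static void timer_side(unsigned long data)     /* bh context */
    {
            if (atomic_read(&list_lock))
                    return;                        /* busy: retry on next tick */
            /* ... safe to delete list entries here ... */
    }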
index 8f49443..3760be8 100644 (file)
@@ -5,7+5,7 @@
  *     Authors:
  *     Pedro Roque             <roque@di.fc.ul.pt>
  *
- *     $Id: icmp.c,v 1.20 1998/10/03 09:38:31 davem Exp $
+ *     $Id: icmp.c,v 1.21 1999/03/21 05:22:51 davem Exp $
  *
  *     Based on net/ipv4/icmp.c
  *
@@ -200,9+200,11 @@ static inline int icmpv6_xrlim_allow(struct sock *sk, int type,
         * this lookup should be more aggressive (not longer than timeout).
         */
        dst = ip6_route_output(sk, fl);
-       if (dst->error)
+       if (dst->error) {
                ipv6_statistics.Ip6OutNoRoutes++;
-       else {
+       } else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) {
+               res = 1;
+       } else {
                struct rt6_info *rt = (struct rt6_info *)dst;
                int tmo = sysctl_icmpv6_time;
 
index bad3a13..089eeef 100644 (file)
@@ -5,7+5,7 @@
  *     Authors:
  *     Pedro Roque             <roque@di.fc.ul.pt>     
  *
- *     $Id: ip6_fib.c,v 1.15 1998/08/26 12:04:55 davem Exp $
+ *     $Id: ip6_fib.c,v 1.16 1999/03/21 05:22:52 davem Exp $
  *
  *     This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
@@ -103,8+103,8 @@ static struct fib6_walker_t fib6_walker_list = {
 static __inline__ u32 fib6_new_sernum(void)
 {
        u32 n = ++rt_sernum;
-       if (n == 0)
-               n = ++rt_sernum;
+       if ((__s32)n <= 0)
+               rt_sernum = n = 1;
        return n;
 }
 
@@ -1157,7+1157,6 @@ static int fib6_age(struct rt6_info *rt, void *arg)
                        return -1;
                }
                gc_args.more++;
-               return 0;
        }
 
        /*
@@ -1171,7+1170,6 @@ static int fib6_age(struct rt6_info *rt, void *arg)
                        return -1;
                }
                gc_args.more++;
-               return 0;
        }
 
        return 0;
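fib6_new_sernum() now keeps serial numbers strictly positive, so that -1 stays free as an out-of-band marker (ip6_link_failure, later in this patch, writes fn_sernum = -1 to force revalidation). The wrap handling in isolation:

    static unsigned int sernum;

    static unsigned int new_sernum(void)
    {
            unsigned int n = ++sernum;

            if ((int)n <= 0)        /* hit zero, or wrapped negative */
                    sernum = n = 1;
            return n;
    }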
index a9dfa97..1538c26 100644 (file)
@@ -5,7+5,7 @@
  *     Authors:
  *     Pedro Roque             <roque@di.fc.ul.pt>     
  *
- *     $Id: ip6_output.c,v 1.15 1998/10/03 09:38:34 davem Exp $
+ *     $Id: ip6_output.c,v 1.16 1999/03/21 05:22:54 davem Exp $
  *
  *     Based on linux/net/ipv4/ip_output.c
  *
@@ -77,11+77,14 @@ int ip6_output(struct sk_buff *skb)
                /* Alpha has disgusting memcpy. Help it. */
                u64 *aligned_hdr = (u64*)(skb->data - 16);
                u64 *aligned_hdr0 = hh->hh_data;
+               read_lock_irq(&hh->hh_lock);
                aligned_hdr[0] = aligned_hdr0[0];
                aligned_hdr[1] = aligned_hdr0[1];
 #else
+               read_lock_irq(&hh->hh_lock);
                memcpy(skb->data - 16, hh->hh_data, 16);
 #endif
+               read_unlock_irq(&hh->hh_lock);
                skb_push(skb, dev->hard_header_len);
                return hh->hh_output(skb);
        } else if (dst->neighbour)
@@ -164,7+167,9 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
        }
 
        printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
+       start_bh_atomic();
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst->pmtu, skb->dev);
+       end_bh_atomic();
        kfree_skb(skb);
        return -EMSGSIZE;
 }
@@ -427,6+432,7 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data,
        struct dst_entry *dst;
        int err = 0;
        unsigned int pktlength, jumbolen, mtu;
+       struct in6_addr saddr;
 
        if (opt && opt->srcrt) {
                struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
@@ -481,19+487,16 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data,
        }
 
        if (fl->nl_u.ip6_u.saddr == NULL) {
-               struct inet6_ifaddr *ifa;
-               
-               ifa = ipv6_get_saddr(dst, fl->nl_u.ip6_u.daddr);
+               err = ipv6_get_saddr(dst, fl->nl_u.ip6_u.daddr, &saddr);
 
-               if (ifa == NULL) {
+               if (err) {
 #if IP6_DEBUG >= 2
                        printk(KERN_DEBUG "ip6_build_xmit: "
                               "no availiable source address\n");
 #endif
-                       err = -ENETUNREACH;
                        goto out;
                }
-               fl->nl_u.ip6_u.saddr = &ifa->addr;
+               fl->nl_u.ip6_u.saddr = &saddr;
        }
        pktlength = length;
 
index 4b8089d..b92a138 100644 (file)
@@ -7,7+7,7 @@
  *
  *     Based on linux/net/ipv4/ip_sockglue.c
  *
- *     $Id: ipv6_sockglue.c,v 1.24 1998/10/03 09:38:37 davem Exp $
+ *     $Id: ipv6_sockglue.c,v 1.25 1999/03/21 05:22:54 davem Exp $
  *
  *     This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
@@ -86,7+86,9 @@ int ip6_ra_control(struct sock *sk, int sel, void (*destructor)(struct sock *))
                                        kfree(new_ra);
                                return -EADDRINUSE;
                        }
+                       net_serialize_enter();
                        *rap = ra->next;
+                       net_serialize_leave();
                        if (ra->destructor)
                                ra->destructor(sk);
                        kfree(ra);
@@ -136,15+138,16 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval,
                        if (sk->protocol != IPPROTO_UDP &&
                            sk->protocol != IPPROTO_TCP)
                                goto out;
-                       
+
+                       lock_sock(sk);
                        if (sk->state != TCP_ESTABLISHED) {
                                retv = ENOTCONN;
-                               goto out;
+                               goto addrform_done;
                        }
 
                        if (!(ipv6_addr_type(&np->daddr) & IPV6_ADDR_MAPPED)) {
                                retv = -EADDRNOTAVAIL;
-                               goto out;
+                               goto addrform_done;
                        }
 
                        if (sk->protocol == IPPROTO_TCP) {
@@ -166,6+169,9 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval,
                        if (pktopt)
                                kfree_skb(pktopt);
                        retv = 0;
+
+addrform_done:
+                       release_sock(sk);
                } else {
                        retv = -EINVAL;
                }
index 8895048..9b9ddbb 100644 (file)
@@ -5,7+5,7 @@
  *     Authors:
  *     Pedro Roque             <roque@di.fc.ul.pt>     
  *
- *     $Id: mcast.c,v 1.17 1998/08/26 12:05:06 davem Exp $
+ *     $Id: mcast.c,v 1.18 1999/03/21 05:22:55 davem Exp $
  *
  *     Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c 
  *
@@ -132,7+132,9 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, struct in6_addr *addr)
                if (mc_lst->ifindex == ifindex &&
                    ipv6_addr_cmp(&mc_lst->addr, addr) == 0) {
                        struct device *dev;
+                       net_serialize_enter();
                        *lnk = mc_lst->next;
+                       net_serialize_leave();
                        if ((dev = dev_get_by_index(ifindex)) != NULL)
                                ipv6_dev_mc_dec(dev, &mc_lst->addr);
                        sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
@@ -252,7+254,9 @@ static void ipv6_mca_remove(struct device *dev, struct ifmcaddr6 *ma)
                
                for (lnk = &idev->mc_list; (iter = *lnk) != NULL; lnk = &iter->if_next) {
                        if (iter == ma) {
+                               net_serialize_enter();
                                *lnk = iter->if_next;
+                               net_serialize_leave();
                                return;
                        }
                }
@@ -273,7+277,9 @@ int ipv6_dev_mc_dec(struct device *dev, struct in6_addr *addr)
                if (ipv6_addr_cmp(&ma->mca_addr, addr) == 0 && ma->dev == dev) {
                        if (atomic_dec_and_test(&ma->mca_users)) {
                                igmp6_group_dropped(ma);
+                               net_serialize_enter();
                                *lnk = ma->next;
+                               net_serialize_leave();
                                ipv6_mca_remove(dev, ma);
                                kfree(ma);
                        }
@@ -496,10+502,10 @@ static void igmp6_join_group(struct ifmcaddr6 *ma)
        if ((addr_type & (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_LOOPBACK)))
                return;
 
+       start_bh_atomic();
        igmp6_send(&ma->mca_addr, ma->dev, ICMPV6_MGM_REPORT);
 
        delay = net_random() % IGMP6_UNSOLICITED_IVAL;
-       start_bh_atomic();
        if (del_timer(&ma->mca_timer))
                delay = ma->mca_timer.expires - jiffies;
 
@@ -519,11+525,13 @@ static void igmp6_leave_group(struct ifmcaddr6 *ma)
        if ((addr_type & IPV6_ADDR_LINKLOCAL))
                return;
 
+       start_bh_atomic();
        if (ma->mca_flags & MAF_LAST_REPORTER)
                igmp6_send(&ma->mca_addr, ma->dev, ICMPV6_MGM_REDUCTION);
 
        if (ma->mca_flags & MAF_TIMER_RUNNING)
                del_timer(&ma->mca_timer);
+       end_bh_atomic();
 }
 
 void igmp6_timer_handler(unsigned long data)
@@ -577,10+585,22 @@ void ipv6_mc_up(struct inet6_dev *idev)
 
 void ipv6_mc_destroy_dev(struct inet6_dev *idev)
 {
-       struct ifmcaddr6 *i;
+       int hash;
+       struct ifmcaddr6 *i, **lnk;
 
        while ((i = idev->mc_list) != NULL) {
                idev->mc_list = i->if_next;
+
+               hash = ipv6_addr_hash(&i->mca_addr);
+
+               for (lnk = &inet6_mcast_lst[hash]; *lnk; lnk = &(*lnk)->next) {
+                       if (*lnk == i) {
+                               net_serialize_enter();
+                               *lnk = i->next;
+                               net_serialize_leave();
+                               break;
+                       }
+               }
                igmp6_group_dropped(i);
                kfree(i);
        }
@@ -631,6+651,8 @@ done:
        len-=(offset-begin);
        if(len>length)
                len=length;
+       if (len<0)
+               len=0;
        return len;
 }
 #endif
index c21e48d..0ba3325 100644 (file)
@@ -335,7+335,7 @@ void ndisc_send_na(struct device *dev, struct neighbour *neigh,
         msg->icmph.icmp6_unused = 0;
         msg->icmph.icmp6_router    = router;
         msg->icmph.icmp6_solicited = solicited;
-        msg->icmph.icmp6_override  = override;
+        msg->icmph.icmp6_override  = !!override;
 
         /* Set the target address. */
        ipv6_addr_copy(&msg->target, solicited_addr);
@@ -497,7+497,7 @@ static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb)
         *      "The sender MUST return an ICMP
         *       destination unreachable"
         */
-       icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
+       dst_link_failure(skb);
        kfree_skb(skb);
 }
 
@@ -604,6+604,13 @@ static void ndisc_router_discovery(struct sk_buff *skb)
                        return;
                }
                neigh->flags |= NTF_ROUTER;
+
+               /*
+                *      If we were using an "all destinations on link" route,
+                *      delete it
+                */
+
+               rt6_purge_dflt_routers(RTF_ALLONLINK);
        }
 
        if (rt)
@@ -989,7+996,7 @@ int ndisc_rcv(struct sk_buff *skb, unsigned long len)
 
                                        if (neigh) {
                                                ndisc_send_na(dev, neigh, saddr, &msg->target,
-                                                             0, 0, inc, inc);
+                                                             0, 1, 0, inc);
                                                neigh_release(neigh);
                                        }
                                } else {
@@ -1173,7+1180,6 @@ __initfunc(int ndisc_init(struct net_proto_family *ops))
        sk = ndisc_socket->sk;
        sk->allocation = GFP_ATOMIC;
        sk->net_pinfo.af_inet6.hop_limit = 255;
-       sk->net_pinfo.af_inet6.priority  = 15;
        /* Do not loopback ndisc messages */
        sk->net_pinfo.af_inet6.mc_loop = 0;
        sk->num = 256;
index 9ae8f63..04b49d8 100644 (file)
@@ -5,7+5,7 @@
  *     Authors:
  *     Pedro Roque             <roque@di.fc.ul.pt>     
  *
- *     $Id: route.c,v 1.34 1998/10/03 09:38:43 davem Exp $
+ *     $Id: route.c,v 1.35 1999/03/21 05:22:57 davem Exp $
  *
  *     This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
@@ -71,6+71,7 @@ int ip6_rt_gc_min_interval = 5*HZ;
 int ip6_rt_gc_timeout = 60*HZ;
 int ip6_rt_gc_interval = 30*HZ;
 int ip6_rt_gc_elasticity = 9;
+int ip6_rt_mtu_expires = 10*60*HZ;
 
 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
 static struct dst_entry        *ip6_dst_check(struct dst_entry *dst, u32 cookie);
@@ -97,7+98,7 @@ struct dst_ops ip6_dst_ops = {
 
 struct rt6_info ip6_null_entry = {
        {{NULL, ATOMIC_INIT(1), ATOMIC_INIT(1), &loopback_dev,
-         -1, 0, 0, 0, 0, 0, 0, 0,
+         -1, 0, 0, 0, 0, 0, 0, 0, 0,
          -ENETUNREACH, NULL, NULL,
          ip6_pkt_discard, ip6_pkt_discard,
 #ifdef CONFIG_NET_CLS_ROUTE
@@ -105,7+106,7 @@ struct rt6_info ip6_null_entry = {
 #endif
          &ip6_dst_ops}},
        NULL, {{{0}}}, RTF_REJECT|RTF_NONEXTHOP, ~0U,
-       255, 0, ATOMIC_INIT(1), {NULL}, {{{{0}}}, 0}, {{{{0}}}, 0}
+       255, ATOMIC_INIT(1), {NULL}, {{{{0}}}, 0}, {{{{0}}}, 0}
 };
 
 struct fib6_node ip6_routing_table = {
@@ -515,13+516,30 @@ static struct dst_entry *ip6_dst_reroute(struct dst_entry *dst, struct sk_buff *
 
 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
 {
-       dst_release(dst);
+       struct rt6_info *rt = (struct rt6_info *) dst;
+
+       if (rt) {
+               if (rt->rt6i_flags & RTF_CACHE)
+                       ip6_del_rt(rt);
+               dst_release(dst);
+       }
        return NULL;
 }
 
 static void ip6_link_failure(struct sk_buff *skb)
 {
+       struct rt6_info *rt;
+
        icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
+
+       rt = (struct rt6_info *) skb->dst;
+       if (rt) {
+               if (rt->rt6i_flags&RTF_CACHE) {
+                       dst_set_expires(&rt->u.dst, 0);
+                       rt->rt6i_flags |= RTF_EXPIRES;
+               } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
+                       rt->rt6i_node->fn_sernum = -1;
+       }
 }
 
 static int ip6_dst_gc()
@@ -1009,12+1027,10 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
           when the cache entry expires, the old pmtu
           would return automatically.
         */
-       if (rt->rt6i_dst.plen == 128) {
-               /*
-                *      host route
-                */
+       if (rt->rt6i_flags & RTF_CACHE) {
                rt->u.dst.pmtu = pmtu;
-               rt->rt6i_flags |= RTF_MODIFIED;
+               dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
+               rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
                goto out;
        }
 
@@ -1025,9+1041,12 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
         */
        if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
                nrt = rt6_cow(rt, daddr, saddr);
-               nrt->u.dst.pmtu = pmtu;
-               nrt->rt6i_flags |= RTF_DYNAMIC;
-               dst_release(&nrt->u.dst);
+               if (!nrt->u.dst.error) {
+                       nrt->u.dst.pmtu = pmtu;
+                       dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
+                       nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
+                       dst_release(&nrt->u.dst);
+               }
        } else {
                nrt = ip6_rt_copy(rt);
                if (nrt == NULL)
@@ -1035,7+1054,8 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
                ipv6_addr_copy(&nrt->rt6i_dst.addr, daddr);
                nrt->rt6i_dst.plen = 128;
                nrt->rt6i_nexthop = neigh_clone(rt->rt6i_nexthop);
-               nrt->rt6i_flags |= (RTF_DYNAMIC | RTF_CACHE);
+               dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
+               nrt->rt6i_flags |= RTF_DYNAMIC|RTF_CACHE|RTF_EXPIRES;
                nrt->u.dst.pmtu = pmtu;
                rt6_ins(nrt);
        }
@@ -1069,7+1089,7 @@ static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
 
                ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
                rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
-               rt->rt6i_metric = ort->rt6i_metric;
+               rt->rt6i_metric = 0;
 
                memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
 #ifdef CONFIG_IPV6_SUBTREES
@@ -1521,9+1541,9 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
        if (iif)
                RTA_PUT(skb, RTA_IIF, 4, &iif);
        else if (dst) {
-               struct inet6_ifaddr *ifp = ipv6_get_saddr(&rt->u.dst, dst);
-               if (ifp)
-                       RTA_PUT(skb, RTA_PREFSRC, 16, &ifp->addr);
+               struct in6_addr saddr_buf;
+               if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
+                       RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
        }
        mx = (struct rtattr*)skb->tail;
        RTA_PUT(skb, RTA_METRICS, 0, NULL);
@@ -1722,7+1742,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt)
        struct sk_buff *skb;
        int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
 
-       skb = alloc_skb(size, GFP_ATOMIC);
+       skb = alloc_skb(size, gfp_any());
        if (!skb) {
                netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS);
                return;
@@ -1733,7+1753,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt)
                return;
        }
        NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_ROUTE;
-       netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_ROUTE, GFP_ATOMIC);
+       netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_ROUTE, gfp_any());
 }
 
 #endif
@@ -1916,6+1936,9 @@ ctl_table ipv6_route_table[] = {
        {NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity",
          &ip6_rt_gc_elasticity, sizeof(int), 0644, NULL,
          &proc_dointvec_jiffies},
+       {NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires",
+         &ip6_rt_mtu_expires, sizeof(int), 0644, NULL,
+         &proc_dointvec_jiffies},
         {0}
 };
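gfp_any() is not defined anywhere in this patch; given how it is used here (allocation paths reachable from both process and interrupt context), a plausible sketch is:

    static inline int gfp_any(void)
    {
            return in_interrupt() ? GFP_ATOMIC : GFP_KERNEL;
    }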
 
index 850553d..058fd04 100644 (file)
@@ -6,7+6,7 @@
  *     Pedro Roque             <roque@di.fc.ul.pt>     
  *     Alexey Kuznetsov        <kuznet@ms2.inr.ac.ru>
  *
- *     $Id: sit.c,v 1.29 1998/10/03 09:38:47 davem Exp $
+ *     $Id: sit.c,v 1.30 1999/03/21 05:22:58 davem Exp $
  *
  *     This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
@@ -94,6+94,48 @@ static struct ip_tunnel * ipip6_tunnel_lookup(u32 remote, u32 local)
        return NULL;
 }
 
+static struct ip_tunnel ** ipip6_bucket(struct ip_tunnel *t)
+{
+       u32 remote = t->parms.iph.daddr;
+       u32 local = t->parms.iph.saddr;
+       unsigned h = 0;
+       int prio = 0;
+
+       if (remote) {
+               prio |= 2;
+               h ^= HASH(remote);
+       }
+       if (local) {
+               prio |= 1;
+               h ^= HASH(local);
+       }
+       return &tunnels[prio][h];
+}
+
+static void ipip6_tunnel_unlink(struct ip_tunnel *t)
+{
+       struct ip_tunnel **tp;
+
+       for (tp = ipip6_bucket(t); *tp; tp = &(*tp)->next) {
+               if (t == *tp) {
+                       net_serialize_enter();
+                       *tp = t->next;
+                       net_serialize_leave();
+                       break;
+               }
+       }
+}
+
+static void ipip6_tunnel_link(struct ip_tunnel *t)
+{
+       struct ip_tunnel **tp = ipip6_bucket(t);
+
+       net_serialize_enter();
+       t->next = *tp;
+       *tp = t;
+       net_serialize_leave();
+}
+
 struct ip_tunnel * ipip6_tunnel_locate(struct ip_tunnel_parm *parms, int create)
 {
        u32 remote = parms->iph.daddr;
@@ -145,10+187,7 @@ struct ip_tunnel * ipip6_tunnel_locate(struct ip_tunnel_parm *parms, int create)
        if (register_netdevice(dev) < 0)
                goto failed;
 
-       start_bh_atomic();
-       nt->next = t;
-       *tp = nt;
-       end_bh_atomic();
+       ipip6_tunnel_link(nt);
        /* Do not decrement MOD_USE_COUNT here. */
        return nt;
 
@@ -160,37+199,18 @@ failed:
 
 static void ipip6_tunnel_destroy(struct device *dev)
 {
-       struct ip_tunnel *t, **tp;
-       struct ip_tunnel *t0 = (struct ip_tunnel*)dev->priv;
-       u32 remote = t0->parms.iph.daddr;
-       u32 local = t0->parms.iph.saddr;
-       unsigned h = 0;
-       int prio = 0;
-
        if (dev == &ipip6_fb_tunnel_dev) {
+               net_serialize_enter();
                tunnels_wc[0] = NULL;
+               net_serialize_leave();
                return;
-       }
-
-       if (remote) {
-               prio |= 2;
-               h ^= HASH(remote);
-       }
-       if (local) {
-               prio |= 1;
-               h ^= HASH(local);
-       }
-       for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
-               if (t == t0) {
-                       *tp = t->next;
-                       kfree(dev);
-                       MOD_DEC_USE_COUNT;
-                       break;
-               }
+       } else {
+               ipip6_tunnel_unlink((struct ip_tunnel*)dev->priv);
+               kfree(dev);
+               MOD_DEC_USE_COUNT;
        }
 }
 
-
 void ipip6_err(struct sk_buff *skb, unsigned char *dp, int len)
 {
 #ifndef I_WISH_WORLD_WERE_PERFECT
@@ -571,6+591,32 @@ ipip6_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd)
 
                t = ipip6_tunnel_locate(&p, cmd == SIOCADDTUNNEL);
 
+               if (dev != &ipip6_fb_tunnel_dev && cmd == SIOCCHGTUNNEL &&
+                   t != &ipip6_fb_tunnel) {
+                       if (t != NULL) {
+                               if (t->dev != dev) {
+                                       err = -EEXIST;
+                                       break;
+                               }
+                       } else {
+                               if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
+                                   (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
+                                       err = -EINVAL;
+                                       break;
+                               }
+                               t = (struct ip_tunnel*)dev->priv;
+                               start_bh_atomic();
+                               ipip6_tunnel_unlink(t);
+                               t->parms.iph.saddr = p.iph.saddr;
+                               t->parms.iph.daddr = p.iph.daddr;
+                               memcpy(dev->dev_addr, &p.iph.saddr, 4);
+                               memcpy(dev->broadcast, &p.iph.daddr, 4);
+                               ipip6_tunnel_link(t);
+                               end_bh_atomic();
+                               netdev_state_change(dev);
+                       }
+               }
+
                if (t) {
                        err = 0;
                        if (cmd == SIOCCHGTUNNEL) {
index 9fa7abd..f27c614 100644 (file)
@@ -5,7+5,7 @@
  *     Authors:
  *     Pedro Roque             <roque@di.fc.ul.pt>     
  *
- *     $Id: tcp_ipv6.c,v 1.99 1999/03/11 00:04:26 davem Exp $
+ *     $Id: tcp_ipv6.c,v 1.100 1999/03/21 05:22:59 davem Exp $
  *
  *     Based on: 
  *     linux/net/ipv4/tcp.c
@@ -376,12+376,13 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
        struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
        struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
-       struct inet6_ifaddr *ifa;
        struct in6_addr *saddr = NULL;
+       struct in6_addr saddr_buf;
        struct flowi fl;
        struct dst_entry *dst;
        struct sk_buff *buff;
        int addr_type;
+       int err;
 
        if (sk->state != TCP_CLOSE) 
                return(-EISCONN);
@@ -428,7+429,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
        if (addr_type == IPV6_ADDR_MAPPED) {
                u32 exthdrlen = tp->ext_header_len;
                struct sockaddr_in sin;
-               int err;
 
                SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
 
@@ -472,9+472,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 
        dst = ip6_route_output(sk, &fl);
 
-       if (dst->error) {
+       if ((err = dst->error) != 0) {
                dst_release(dst);
-               return dst->error;
+               return err;
        }
 
        if (fl.oif == 0 && addr_type&IPV6_ADDR_LINKLOCAL) {
@@ -489,18+489,17 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
        ip6_dst_store(sk, dst, NULL);
 
        if (saddr == NULL) {
-               ifa = ipv6_get_saddr(dst, &np->daddr);
-
-               if (ifa == NULL)
-                       return -ENETUNREACH;
-               
-               saddr = &ifa->addr;
+               err = ipv6_get_saddr(dst, &np->daddr, &saddr_buf);
+               if (err)
+                       return err;
 
-               /* set the source address */
-               ipv6_addr_copy(&np->rcv_saddr, saddr);
-               ipv6_addr_copy(&np->saddr, saddr);
+               saddr = &saddr_buf;
        }
 
+       /* set the source address */
+       ipv6_addr_copy(&np->rcv_saddr, saddr);
+       ipv6_addr_copy(&np->saddr, saddr);
+
        tp->ext_header_len = 0;
        if (np->opt)
                tp->ext_header_len = np->opt->opt_flen+np->opt->opt_nflen;
@@ -602,11+601,14 @@ void tcp_v6_err(struct sk_buff *skb, struct ipv6hdr *hdr,
        np = &sk->net_pinfo.af_inet6;
        if (type == ICMPV6_PKT_TOOBIG) {
                struct dst_entry *dst = NULL;
-               /* icmp should have updated the destination cache entry */
+
+               if (atomic_read(&sk->sock_readers))
+                       return;
 
                if (sk->state == TCP_LISTEN)
                        return;
 
+               /* icmp should have updated the destination cache entry */
                if (sk->dst_cache)
                        dst = dst_check(&sk->dst_cache, np->dst_cookie);
 
@@ -631,8+633,7 @@ void tcp_v6_err(struct sk_buff *skb, struct ipv6hdr *hdr,
 
                if (dst->error) {
                        sk->err_soft = -dst->error;
-               } else if (tp->pmtu_cookie > dst->pmtu
-                          && !atomic_read(&sk->sock_readers)) {
+               } else if (tp->pmtu_cookie > dst->pmtu) {
                        tcp_sync_mss(sk, dst->pmtu);
                        tcp_simple_retransmit(sk);
                } /* else let the usual retransmit timer handle it */
@@ -1193,6+1194,11 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
        if (skb->protocol == __constant_htons(ETH_P_IP))
                return tcp_v4_do_rcv(sk, skb);
 
+#ifdef CONFIG_FILTER
+       if (sk->filter && sk_filter(skb, sk->filter))
+               goto discard;
+#endif /* CONFIG_FILTER */
+
        /*
         *      socket locking is here for SMP purposes as backlog rcv
         *      is currently called with bh processing disabled.
@@ -1421,6+1427,9 @@ static struct sock * tcp_v6_get_sock(struct sk_buff *skb, struct tcphdr *th)
        struct in6_addr *saddr;
        struct in6_addr *daddr;
 
+       if (skb->protocol == __constant_htons(ETH_P_IP))
+               return ipv4_specific.get_sock(skb, th);
+
        saddr = &skb->nh.ipv6h->saddr;
        daddr = &skb->nh.ipv6h->daddr;
        return tcp_v6_lookup(saddr, th->source, daddr, th->dest, tcp_v6_iif(skb));
index 0670e87..89cc80b 100644 (file)
@@ -7,7+7,7 @@
  *
  *     Based on linux/ipv4/udp.c
  *
- *     $Id: udp.c,v 1.37 1998/11/08 11:17:10 davem Exp $
+ *     $Id: udp.c,v 1.38 1999/03/21 05:23:00 davem Exp $
  *
  *     This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
@@ -201,8+201,8 @@ int udpv6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
        struct sockaddr_in6     *usin = (struct sockaddr_in6 *) uaddr;
        struct ipv6_pinfo       *np = &sk->net_pinfo.af_inet6;
        struct in6_addr         *daddr;
+       struct in6_addr         saddr;
        struct dst_entry        *dst;
-       struct inet6_ifaddr     *ifa;
        struct flowi            fl;
        int                     addr_type;
        int                     err;
@@ -284,28+284,29 @@ ipv4_connected:
 
        dst = ip6_route_output(sk, &fl);
 
-       if (dst->error) {
+       if ((err = dst->error) != 0) {
                dst_release(dst);
-               return dst->error;
+               return err;
        }
 
        ip6_dst_store(sk, dst, fl.nl_u.ip6_u.daddr);
 
        /* get the source address used on the appropriate device */
 
-       ifa = ipv6_get_saddr(dst, daddr);
+       err = ipv6_get_saddr(dst, daddr, &saddr);
 
-       if(ipv6_addr_any(&np->saddr))
-               ipv6_addr_copy(&np->saddr, &ifa->addr);
+       if (err == 0) {
+               if(ipv6_addr_any(&np->saddr))
+                       ipv6_addr_copy(&np->saddr, &saddr);
 
-       if(ipv6_addr_any(&np->rcv_saddr)) {
-               ipv6_addr_copy(&np->rcv_saddr, &ifa->addr);
-               sk->rcv_saddr = 0xffffffff;
+               if(ipv6_addr_any(&np->rcv_saddr)) {
+                       ipv6_addr_copy(&np->rcv_saddr, &saddr);
+                       sk->rcv_saddr = 0xffffffff;
+               }
+               sk->state = TCP_ESTABLISHED;
        }
 
-       sk->state = TCP_ESTABLISHED;
-
-       return(0);
+       return err;
 }
 
 static void udpv6_close(struct sock *sk, long timeout)
@@ -317,7+318,7 @@ static void udpv6_close(struct sock *sk, long timeout)
        destroy_sock(sk);
 }
 
-#if defined(CONFIG_FILTER) || !defined(HAVE_CSUM_COPY_USER)
+#ifndef HAVE_CSUM_COPY_USER
 #undef CONFIG_UDP_DELAY_CSUM
 #endif
 
@@ -352,11+353,11 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len,
        err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), 
                                      msg->msg_iov, copied);
 #else
-       if (sk->no_check || skb->ip_summed==CHECKSUM_UNNECESSARY) {
+       if (skb->ip_summed==CHECKSUM_UNNECESSARY) {
                err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
                                              copied);
        } else if (copied > msg->msg_iov[0].iov_len || (msg->msg_flags&MSG_TRUNC)) {
-               if (csum_fold(csum_partial(skb->h.raw, ntohs(skb->h.uh->len), skb->csum))) {
+               if ((unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum))) {
                        /* Error for blocking case is chosen to masquerade
                           as some normal condition.
                         */
@@ -373,7+374,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len,
                csum = csum_and_copy_to_user((char*)&skb->h.uh[1], msg->msg_iov[0].iov_base, copied, csum, &err);
                if (err)
                        goto out_free;
-               if (csum_fold(csum)) {
+               if ((unsigned short)csum_fold(csum)) {
                        /* Error for blocking case is chosen to masquerade
                           as some normal condition.
                         */
@@ -454,6+455,17 @@ void udpv6_err(struct sk_buff *skb, struct ipv6hdr *hdr,
 
 static inline int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
 {
+#if defined(CONFIG_FILTER) && defined(CONFIG_UDP_DELAY_CSUM)
+       if (sk->filter && skb->ip_summed != CHECKSUM_UNNECESSARY) {
+               if ((unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum))) {
+                       udp_stats_in6.UdpInErrors++;
+                       ipv6_statistics.Ip6InDiscards++;
+                       kfree_skb(skb);
+                       return 0;
+               }
+               skb->ip_summed = CHECKSUM_UNNECESSARY;
+       }
+#endif
        if (sock_queue_rcv_skb(sk,skb)<0) {
                udp_stats_in6.UdpInErrors++;
                ipv6_statistics.Ip6InDiscards++;
@@ -627,14+639,13 @@ int udpv6_rcv(struct sk_buff *skb, unsigned long len)
        if (sk == NULL) {
 #ifdef CONFIG_UDP_DELAY_CSUM
                if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
-                   csum_fold(csum_partial((char*)uh, len, skb->csum)))
+                   (unsigned short)csum_fold(csum_partial((char*)uh, len, skb->csum)))
                        goto discard;
 #endif
-               
                udp_stats_in6.UdpNoPorts++;
 
                icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, dev);
-               
+
                kfree_skb(skb);
                return(0);
        }
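
The udpv6 hunks above consistently truncate csum_fold()'s result to unsigned short before testing it: csum_fold() collapses a 32-bit one's-complement accumulator to 16 bits, and only those low bits decide whether the datagram is corrupt. Below is a minimal user-space sketch of that fold-and-test step; both functions are portable stand-ins for the arch-specific csum_partial()/csum_fold(), and the names are illustrative only.

#include <stdint.h>
#include <stddef.h>

/* Stand-in for csum_partial(): 16-bit one's-complement sum of a buffer,
   accumulated in 32 bits so carries are preserved. */
static uint32_t sketch_csum_partial(const uint8_t *buf, size_t len, uint32_t sum)
{
        while (len > 1) {
                sum += ((uint32_t)buf[0] << 8) | buf[1];
                buf += 2;
                len -= 2;
        }
        if (len)
                sum += (uint32_t)buf[0] << 8;
        return sum;
}

/* Stand-in for csum_fold(): fold the carries back in twice, then
   complement.  Only the low 16 bits are meaningful, which is why the
   callers above cast the result to unsigned short before the zero test. */
static uint16_t sketch_csum_fold(uint32_t sum)
{
        sum = (sum & 0xffff) + (sum >> 16);
        sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}

/* A datagram verifies iff the folded sum over data plus pseudo-header is
   zero, mirroring the delayed-checksum tests added above. */
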
index a281c96..45ec9d6 100644 (file)
@@ -55,6+55,13 @@ static struct socket *netlink_kernel[MAX_LINKS];
 static int netlink_dump(struct sock *sk);
 static void netlink_destroy_callback(struct netlink_callback *cb);
 
+/* Netlink table lock. It protects against sk list changes
+   during uninterruptible sleeps in netlink_broadcast.
+
+   This lock MUST NOT be taken from bh/irq on SMP kernels, because
+   it would result in a race in netlink_wait_on_table.
+ */
+
 extern __inline__ void
 netlink_wait_on_table(int protocol)
 {
@@ -69,16+76,16 @@ netlink_lock_table(int protocol)
 }
 
 extern __inline__ void
-netlink_unlock_table(int protocol, int wakeup)
+netlink_unlock_table(int protocol)
 {
 #if 0
        /* F...g gcc does not eat it! */
 
-       if (atomic_dec_and_test(&nl_table_lock[protocol]) && wakeup)
+       if (atomic_dec_and_test(&nl_table_lock[protocol]))
                wake_up(&nl_table_wait);
 #else
        atomic_dec(&nl_table_lock[protocol]);
-       if (atomic_read(&nl_table_lock[protocol]) && wakeup)
+       if (!atomic_read(&nl_table_lock[protocol]))
                wake_up(&nl_table_wait);
 #endif
 }
@@ -125,7+132,9 @@ static void netlink_remove(struct sock *sk)
        struct sock **skp;
        for (skp = &nl_table[sk->protocol]; *skp; skp = &((*skp)->next)) {
                if (*skp == sk) {
+                       start_bh_atomic();
                        *skp = sk->next;
+                       end_bh_atomic();
                        return;
                }
        }
@@ -186,7+195,7 @@ static int netlink_release(struct socket *sock, struct socket *peer)
           transport (and AF_UNIX datagram, once it is repaired).
           
           Someone could wait on our sock->wait now.
-          We cannot release socket until waiter will remove yourself
+          We cannot release the socket until the waiter removes itself
           from the wait queue. I choose the most conservative way of solving
           the problem.
 
@@ -218,8+227,6 @@ static int netlink_autobind(struct socket *sock)
        struct sock *sk = sock->sk;
        struct sock *osk;
 
-       netlink_wait_on_table(sk->protocol);
-
        sk->protinfo.af_netlink.groups = 0;
        sk->protinfo.af_netlink.pid = current->pid;
 
@@ -264,8+271,6 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len
                return 0;
        }
 
-       netlink_wait_on_table(sk->protocol);
-
        for (osk=nl_table[sk->protocol]; osk; osk=osk->next) {
                if (osk->protinfo.af_netlink.pid == nladdr->nl_pid)
                        return -EADDRINUSE;
@@ -332,7+337,7 @@ int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock
 retry:
        for (sk = nl_table[protocol]; sk; sk = sk->next) {
                if (sk->protinfo.af_netlink.pid != pid)
-                               continue;
+                       continue;
 
                netlink_lock(sk);
 
@@ -416,7+421,8 @@ void netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
 
        /* While we sleep in clone, do not allow the socket list to change */
 
-       netlink_lock_table(protocol);
+       if (allocation == GFP_KERNEL)
+               netlink_lock_table(protocol);
 
        for (sk = nl_table[protocol]; sk; sk = sk->next) {
                if (ssk == sk)
@@ -454,7+460,8 @@ void netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
                netlink_unlock(sk);
        }
 
-       netlink_unlock_table(protocol, allocation == GFP_KERNEL);
+       if (allocation == GFP_KERNEL)
+               netlink_unlock_table(protocol);
 
        if (skb2)
                kfree_skb(skb2);
@@ -475,7+482,7 @@ Nprintk("seterr");
                    !(sk->protinfo.af_netlink.groups&group))
                        continue;
 
-               sk->err = -code;
+               sk->err = code;
                sk->state_change(sk);
        }
 }
@@ -739,15+746,20 @@ int netlink_attach(int unit, int (*function)(int, struct sk_buff *skb))
 void netlink_detach(int unit)
 {
        struct socket *sock = netlink_kernel[unit];
+
+       net_serialize_enter();
        netlink_kernel[unit] = NULL;
+       net_serialize_leave();
        sock_release(sock);
 }
 
 int netlink_post(int unit, struct sk_buff *skb)
 {
-       if (netlink_kernel[unit]) {
+       struct socket *sock = netlink_kernel[unit];
+       barrier();
+       if (sock) {
                memset(skb->cb, 0, sizeof(skb->cb));
-               netlink_broadcast(netlink_kernel[unit]->sk, skb, 0, ~0, GFP_ATOMIC);
+               netlink_broadcast(sock->sk, skb, 0, ~0, GFP_ATOMIC);
                return 0;
        }
        return -EUNATCH;
@@ -800,6+812,8 @@ done:
        len-=(offset-begin);
        if(len>length)
                len=length;
+       if(len<0)
+               len=0;
        return len;
 }
 #endif
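
The table-lock comment introduced above describes a counted lock: netlink_lock_table() bumps a per-protocol counter, netlink_unlock_table() drops it and wakes sleepers once the count reaches zero, and netlink_wait_on_table() sleeps until the count is zero. A self-contained user-space sketch of the same pattern follows; the pthread-based helpers are my invention, while the kernel uses atomic counters and a wait queue.

#include <pthread.h>

static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cv = PTHREAD_COND_INITIALIZER;
static int table_users;                 /* cf. nl_table_lock[protocol] */

static void table_lock_enter(void)      /* cf. netlink_lock_table() */
{
        pthread_mutex_lock(&lk);
        table_users++;
        pthread_mutex_unlock(&lk);
}

static void table_lock_leave(void)      /* cf. netlink_unlock_table() */
{
        pthread_mutex_lock(&lk);
        if (--table_users == 0)
                pthread_cond_broadcast(&cv);    /* last user wakes waiters */
        pthread_mutex_unlock(&lk);
}

static void table_wait(void)            /* cf. netlink_wait_on_table() */
{
        pthread_mutex_lock(&lk);
        while (table_users != 0)
                pthread_cond_wait(&cv, &lk);
        pthread_mutex_unlock(&lk);
}
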
index 83a97aa..fdebe48 100644 (file)
@@ -428,6+428,7 @@ EXPORT_SYMBOL(register_netdevice);
 EXPORT_SYMBOL(unregister_netdevice);
 EXPORT_SYMBOL(register_netdev);
 EXPORT_SYMBOL(unregister_netdev);
+EXPORT_SYMBOL(netdev_state_change);
 EXPORT_SYMBOL(ether_setup);
 EXPORT_SYMBOL(dev_new_index);
 EXPORT_SYMBOL(dev_get_by_index);
@@ -482,6+483,7 @@ EXPORT_SYMBOL(qdisc_head);
 EXPORT_SYMBOL(qdisc_create_dflt);
 EXPORT_SYMBOL(noop_qdisc);
 #ifdef CONFIG_NET_SCHED
+PSCHED_EXPORTLIST;
 EXPORT_SYMBOL(pfifo_qdisc_ops);
 EXPORT_SYMBOL(register_qdisc);
 EXPORT_SYMBOL(unregister_qdisc);
index c7e7a67..e78e413 100644 (file)
@@ -5,7+5,7 @@
  *
  *             PACKET - implements raw packet sockets.
  *
- * Version:    $Id: af_packet.c,v 1.18 1998/10/03 15:55:24 freitag Exp $
+ * Version:    $Id: af_packet.c,v 1.19 1999/03/21 05:23:03 davem Exp $
  *
  * Authors:    Ross Biro, <bir7@leland.Stanford.Edu>
  *             Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -831,7+831,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, int len,
         *      Free or return the buffer as appropriate. Again this
         *      hides all the races and re-entrancy issues from us.
         */
-       err = copied;
+       err = (flags&MSG_TRUNC) ? skb->len : copied;
 
 out_free:
        skb_free_datagram(sk, skb);
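
The packet_recvmsg() change above makes the syscall report the true packet length when MSG_TRUNC is set, instead of only the number of bytes actually copied. A small user-space usage sketch (function name mine) of how a caller can detect truncation under this behaviour:

#include <sys/types.h>
#include <sys/socket.h>

/* Read one datagram; with MSG_TRUNC the return value is the packet's
   real size, so a result larger than buflen signals truncation. */
static ssize_t read_packet(int fd, void *buf, size_t buflen)
{
        ssize_t full = recv(fd, buf, buflen, MSG_TRUNC);

        if (full > (ssize_t)buflen) {
                /* only the first buflen bytes of the packet are in buf */
        }
        return full;
}
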
index 5d497a0..ffb7a48 100644 (file)
@@ -18,10+18,11 @@ if [ "$CONFIG_NET_QOS" = "y" ]; then
 fi
 bool 'Packet classifier API' CONFIG_NET_CLS
 if [ "$CONFIG_NET_CLS" = "y" ]; then
-  bool 'Routing tables based classifier' CONFIG_NET_CLS_ROUTE
-  if [ "$CONFIG_IP_FIREWALL" = "y" ]; then
-    bool 'Firewall based classifier' CONFIG_NET_CLS_FW
+  tristate 'Routing table based classifier' CONFIG_NET_CLS_ROUTE4
+  if [ "$CONFIG_NET_CLS_ROUTE4" != "n" ]; then
+    define_bool CONFIG_NET_CLS_ROUTE y
   fi
+  tristate 'Firewall based classifier' CONFIG_NET_CLS_FW
   tristate 'U32 classifier' CONFIG_NET_CLS_U32
   if [ "$CONFIG_NET_QOS" = "y" ]; then
     tristate 'Special RSVP classifier' CONFIG_NET_CLS_RSVP
index 21a1cf0..6e1169f 100644 (file)
@@ -125,12+125,20 @@ else
   endif
 endif
 
-ifeq ($(CONFIG_NET_CLS_ROUTE), y)
+ifeq ($(CONFIG_NET_CLS_ROUTE4), y)
 O_OBJS += cls_route.o
+else
+  ifeq ($(CONFIG_NET_CLS_ROUTE4), m)
+       M_OBJS += cls_route.o
+  endif
 endif
 
 ifeq ($(CONFIG_NET_CLS_FW), y)
 O_OBJS += cls_fw.o
+else
+  ifeq ($(CONFIG_NET_CLS_FW), m)
+       M_OBJS += cls_fw.o
+  endif
 endif
 
 endif
index 081896d..0d29ccf 100644 (file)
@@ -7,6+7,10 @@
  *             2 of the License, or (at your option) any later version.
  *
  * Authors:    Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Changes:
+ *
+ * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
  */
 
 #include <asm/uaccess.h>
 #include <linux/skbuff.h>
 #include <linux/rtnetlink.h>
 #include <linux/init.h>
+#include <linux/kmod.h>
 #include <net/sock.h>
 #include <net/pkt_sched.h>
 
@@ -87,21+92,13 @@ static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n,
 
 /* Select a new prio value from the range managed by the kernel. */
 
-static __inline__ u32 tcf_auto_prio(struct tcf_proto *tp, u32 prio)
+static __inline__ u32 tcf_auto_prio(struct tcf_proto *tp)
 {
        u32 first = TC_H_MAKE(0xC0000000U,0U);
 
-       if (!tp || tp->next == NULL)
-               return first;
-
-       if (prio == TC_H_MAKE(0xFFFF0000U,0U))
-               first = tp->prio+1; 
-       else
+       if (tp)
                first = tp->prio-1;
 
-       if (first == prio)
-               first = tp->prio;
-
        return first;
 }
 
@@ -129,10+126,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
                /* If no priority is given, the user wants us to allocate it. */
                if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE))
                        return -ENOENT;
-               if (n->nlmsg_flags&NLM_F_APPEND)
-                       prio = TC_H_MAKE(0xFFFF0000U,0U);
-               else
-                       prio = TC_H_MAKE(0x80000000U,0U);
+               prio = TC_H_MAKE(0x80000000U,0U);
        }
 
        /* Find head of filter chain. */
@@ -194,6+188,18 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
                if ((tp = kmalloc(sizeof(*tp), GFP_KERNEL)) == NULL)
                        goto errout;
                tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND-1]);
+#ifdef CONFIG_KMOD
+               if (tp_ops==NULL && tca[TCA_KIND-1] != NULL) {
+                       struct rtattr *kind = tca[TCA_KIND-1];
+                       char module_name[4 + IFNAMSIZ + 1];
+
+                       if (RTA_PAYLOAD(kind) <= IFNAMSIZ) {
+                               sprintf(module_name, "cls_%s", (char*)RTA_DATA(kind));
+                               request_module (module_name);
+                               tp_ops = tcf_proto_lookup_ops(kind);
+                       }
+               }
+#endif
                if (tp_ops == NULL) {
                        err = -EINVAL;
                        kfree(tp);
@@ -202,7+208,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
                memset(tp, 0, sizeof(*tp));
                tp->ops = tp_ops;
                tp->protocol = protocol;
-               tp->prio = nprio ? : tcf_auto_prio(*back, prio);
+               tp->prio = nprio ? : tcf_auto_prio(*back);
                tp->q = q;
                tp->classify = tp_ops->classify;
                tp->classid = parent;
@@ -220,7+226,9 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 
        if (fh == 0) {
                if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
+                       net_serialize_enter();
                        *back = tp->next;
+                       net_serialize_leave();
                        tp->ops->destroy(tp);
                        kfree(tp);
                        err = 0;
@@ -249,7+257,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
                }
        }
 
-       err = tp->ops->change(tp, t->tcm_handle, tca, &fh);
+       err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh);
        if (err == 0)
                tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER);
 
@@ -336,12+344,16 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
                return skb->len;
        if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
                return skb->len;
-       if ((q = qdisc_lookup(dev, tcm->tcm_parent)) == NULL)
+       if (!tcm->tcm_parent)
+               q = dev->qdisc_sleeping;
+       else
+               q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
+       if (q == NULL)
                return skb->len;
-       cops = q->ops->cl_ops;
+       if ((cops = q->ops->cl_ops) == NULL)
+               goto errout;
        if (TC_H_MIN(tcm->tcm_parent)) {
-               if (cops)
-                       cl = cops->get(q, tcm->tcm_parent);
+               cl = cops->get(q, tcm->tcm_parent);
                if (cl == 0)
                        goto errout;
        }
@@ -360,7+372,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
                    TC_H_MIN(tcm->tcm_info) != tp->protocol)
                        continue;
                if (t > s_t)
-                       memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int));
+                       memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
                if (cb->args[1] == 0) {
                        if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).pid,
                                          cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) {
@@ -418,8+430,8 @@ __initfunc(int tc_filter_init(void))
 #ifdef CONFIG_NET_CLS_U32
        INIT_TC_FILTER(u32);
 #endif
-#ifdef CONFIG_NET_CLS_ROUTE
-       INIT_TC_FILTER(route);
+#ifdef CONFIG_NET_CLS_ROUTE4
+       INIT_TC_FILTER(route4);
 #endif
 #ifdef CONFIG_NET_CLS_FW
        INIT_TC_FILTER(fw);
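
The kmod hook added to tc_ctl_tfilter() builds a module name by prefixing the requested classifier kind with "cls_" (bounded by IFNAMSIZ) and calls request_module() before retrying the ops lookup. A sketch of just the name-building step, with the helper name invented here:

#include <stdio.h>
#include <string.h>

#define IFNAMSIZ 16     /* same bound the hunk above enforces */

/* Build "cls_<kind>" as request_module() expects; reject kinds that
   would overflow, mirroring the RTA_PAYLOAD(kind) <= IFNAMSIZ test. */
static int build_cls_module_name(char dst[4 + IFNAMSIZ + 1], const char *kind)
{
        if (strlen(kind) > IFNAMSIZ)
                return -1;
        sprintf(dst, "cls_%s", kind);
        return 0;
}
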
index 0fab64d..bc4d4db 100644 (file)
@@ -1,5+1,5 @@
 /*
- * net/sched/cls_fw.c  Routing table based packet classifier.
+ * net/sched/cls_fw.c  Classifier mapping ipchains' fwmark to traffic class.
  *
  *             This program is free software; you can redistribute it and/or
  *             modify it under the terms of the GNU General Public License
@@ -9,6+9,7 @@
  * Authors:    Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  */
 
+#include <linux/config.h>
 #include <linux/module.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
 #include <net/sock.h>
 #include <net/pkt_sched.h>
 
+struct fw_head
+{
+       struct fw_filter *ht[256];
+};
+
+struct fw_filter
+{
+       struct fw_filter        *next;
+       u32                     id;
+       struct tcf_result       res;
+#ifdef CONFIG_NET_CLS_POLICE
+       struct tcf_police       *police;
+#endif
+};
+
+static __inline__ int fw_hash(u32 handle)
+{
+       return handle&0xFF;
+}
 
 static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,
                          struct tcf_result *res)
 {
-       u32 clid = skb->fwmark;
+       struct fw_head *head = (struct fw_head*)tp->root;
+       struct fw_filter *f;
+#ifdef CONFIG_IP_FIREWALL
+       u32 id = skb->fwmark;
+#else
+       u32 id = 0;
+#endif
 
-       if (clid && (TC_H_MAJ(clid) == 0 ||
-                    !(TC_H_MAJ(clid^tp->q->handle)))) {
-               res->classid = clid;
+       if (head == NULL)
+               goto old_method;
+
+       for (f=head->ht[fw_hash(id)]; f; f=f->next) {
+               if (f->id == id) {
+                       *res = f->res;
+#ifdef CONFIG_NET_CLS_POLICE
+                       if (f->police)
+                               return tcf_police(skb, f->police);
+#endif
+                       return 0;
+               }
+       }
+       return -1;
+
+old_method:
+       if (id && (TC_H_MAJ(id) == 0 ||
+                    !(TC_H_MAJ(id^tp->q->handle)))) {
+               res->classid = id;
                res->class = 0;
                return 0;
        }
@@ -51,6+93,16 @@ static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,
 
 static unsigned long fw_get(struct tcf_proto *tp, u32 handle)
 {
+       struct fw_head *head = (struct fw_head*)tp->root;
+       struct fw_filter *f;
+
+       if (head == NULL)
+               return 0;
+
+       for (f=head->ht[fw_hash(handle)]; f; f=f->next) {
+               if (f->id == handle)
+                       return (unsigned long)f;
+       }
        return 0;
 }
 
@@ -60,24+112,232 @@ static void fw_put(struct tcf_proto *tp, unsigned long f)
 
 static int fw_init(struct tcf_proto *tp)
 {
+       MOD_INC_USE_COUNT;
        return 0;
 }
 
 static void fw_destroy(struct tcf_proto *tp)
 {
+       struct fw_head *head = (struct fw_head*)xchg(&tp->root, NULL);
+       struct fw_filter *f;
+       int h;
+
+       if (head == NULL) {
+               MOD_DEC_USE_COUNT;
+               return;
+       }
+
+       for (h=0; h<256; h++) {
+               while ((f=head->ht[h]) != NULL) {
+                       unsigned long cl;
+                       head->ht[h] = f->next;
+
+                       if ((cl = cls_set_class(&f->res.class, 0)) != 0)
+                               tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
+#ifdef CONFIG_NET_CLS_POLICE
+                       tcf_police_release(f->police);
+#endif
+                       kfree(f);
+               }
+       }
+       kfree(head);
+       MOD_DEC_USE_COUNT;
 }
 
 static int fw_delete(struct tcf_proto *tp, unsigned long arg)
 {
+       struct fw_head *head = (struct fw_head*)xchg(&tp->root, NULL);
+       struct fw_filter *f = (struct fw_filter*)arg;
+       struct fw_filter **fp;
+
+       if (head == NULL || f == NULL)
+               return -EINVAL;
+
+       for (fp=&head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) {
+               if (*fp == f) {
+                       unsigned long cl;
+
+                       net_serialize_enter();
+                       *fp = f->next;
+                       net_serialize_leave();
+
+                       if ((cl = cls_set_class(&f->res.class, 0)) != 0)
+                               tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
+#ifdef CONFIG_NET_CLS_POLICE
+                       tcf_police_release(f->police);
+#endif
+                       kfree(f);
+                       return 0;
+               }
+       }
        return -EINVAL;
 }
 
-static int fw_change(struct tcf_proto *tp, u32 handle,
-                       struct rtattr **tca,
-                       unsigned long *arg)
+static int fw_change(struct tcf_proto *tp, unsigned long base,
+                    u32 handle,
+                    struct rtattr **tca,
+                    unsigned long *arg)
+{
+       struct fw_head *head = (struct fw_head*)tp->root;
+       struct fw_filter *f;
+       struct rtattr *opt = tca[TCA_OPTIONS-1];
+       struct rtattr *tb[TCA_FW_MAX];
+       int err;
+
+       if (!opt)
+               return handle ? -EINVAL : 0;
+
+       if (rtattr_parse(tb, TCA_FW_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0)
+               return -EINVAL;
+
+       if ((f = (struct fw_filter*)*arg) != NULL) {
+               /* Node exists: adjust only classid */
+
+               if (f->id != handle && handle)
+                       return -EINVAL;
+               if (tb[TCA_FW_CLASSID-1]) {
+                       unsigned long cl;
+
+                       f->res.classid = *(u32*)RTA_DATA(tb[TCA_FW_CLASSID-1]);
+                       cl = tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid);
+                       cl = cls_set_class(&f->res.class, cl);
+                       if (cl)
+                               tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
+               }
+#ifdef CONFIG_NET_CLS_POLICE
+               if (tb[TCA_FW_POLICE-1]) {
+                       struct tcf_police *police = tcf_police_locate(tb[TCA_FW_POLICE-1], tca[TCA_RATE-1]);
+                       net_serialize_enter();
+                       police = xchg(&f->police, police);
+                       net_serialize_leave();
+                       tcf_police_release(police);
+               }
+#endif
+               return 0;
+       }
+
+       if (!handle)
+               return -EINVAL;
+
+       if (head == NULL) {
+               head = kmalloc(sizeof(struct fw_head), GFP_KERNEL);
+               if (head == NULL)
+                       return -ENOBUFS;
+               memset(head, 0, sizeof(*head));
+               net_serialize_enter();
+               tp->root = head;
+               net_serialize_leave();
+       }
+
+       f = kmalloc(sizeof(struct fw_filter), GFP_KERNEL);
+       if (f == NULL)
+               return -ENOBUFS;
+       memset(f, 0, sizeof(*f));
+
+       f->id = handle;
+
+       if (tb[TCA_FW_CLASSID-1]) {
+               err = -EINVAL;
+               if (RTA_PAYLOAD(tb[TCA_FW_CLASSID-1]) != 4)
+                       goto errout;
+               f->res.classid = *(u32*)RTA_DATA(tb[TCA_FW_CLASSID-1]);
+               cls_set_class(&f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid));
+       }
+
+#ifdef CONFIG_NET_CLS_POLICE
+       if (tb[TCA_FW_POLICE-1])
+               f->police = tcf_police_locate(tb[TCA_FW_POLICE-1], tca[TCA_RATE-1]);
+#endif
+
+       f->next = head->ht[fw_hash(handle)];
+       net_serialize_enter();
+       head->ht[fw_hash(handle)] = f;
+       net_serialize_leave();
+       *arg = (unsigned long)f;
+       return 0;
+
+errout:
+       if (f)
+               kfree(f);
+       return err;
+}
+
+static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
 {
-       return handle ? -EINVAL : 0;
+       struct fw_head *head = (struct fw_head*)tp->root;
+       int h;
+
+       if (head == NULL)
+               arg->stop = 1;
+
+       if (arg->stop)
+               return;
+
+       for (h = 0; h < 256; h++) {
+               struct fw_filter *f;
+
+               for (f = head->ht[h]; f; f = f->next) {
+                       if (arg->count < arg->skip) {
+                               arg->count++;
+                               continue;
+                       }
+                       if (arg->fn(tp, (unsigned long)f, arg) < 0) {
+                               arg->stop = 1;
+                               break;
+                       }
+                       arg->count++;
+               }
+       }
+}
+
+#ifdef CONFIG_RTNETLINK
+static int fw_dump(struct tcf_proto *tp, unsigned long fh,
+                  struct sk_buff *skb, struct tcmsg *t)
+{
+       struct fw_filter *f = (struct fw_filter*)fh;
+       unsigned char    *b = skb->tail;
+       struct rtattr *rta;
+
+       if (f == NULL)
+               return skb->len;
+
+       t->tcm_handle = f->id;
+
+       if (!f->res.classid && !f->police)
+               return skb->len;
+
+       rta = (struct rtattr*)b;
+       RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
+
+       if (f->res.classid)
+               RTA_PUT(skb, TCA_FW_CLASSID, 4, &f->res.classid);
+#ifdef CONFIG_NET_CLS_POLICE
+       if (f->police) {
+               struct rtattr * p_rta = (struct rtattr*)skb->tail;
+
+               RTA_PUT(skb, TCA_FW_POLICE, 0, NULL);
+
+               if (tcf_police_dump(skb, f->police) < 0)
+                       goto rtattr_failure;
+
+               p_rta->rta_len = skb->tail - (u8*)p_rta;
+       }
+#endif
+
+       rta->rta_len = skb->tail - b;
+#ifdef CONFIG_NET_CLS_POLICE
+       if (f->police) {
+               RTA_PUT(skb, TCA_STATS, sizeof(struct tc_stats), &f->police->stats);
+       }
+#endif
+       return skb->len;
+
+rtattr_failure:
+       skb_trim(skb, b - skb->data);
+       return -1;
 }
+#endif
+
 
 struct tcf_proto_ops cls_fw_ops = {
        NULL,
@@ -90,5+350,22 @@ struct tcf_proto_ops cls_fw_ops = {
        fw_put,
        fw_change,
        fw_delete,
-       NULL,
+       fw_walk,
+#ifdef CONFIG_RTNETLINK
+       fw_dump
+#else
+       NULL
+#endif
 };
+
+#ifdef MODULE
+int init_module(void)
+{
+       return register_tcf_proto_ops(&cls_fw_ops);
+}
+
+void cleanup_module(void) 
+{
+       unregister_tcf_proto_ops(&cls_fw_ops);
+}
+#endif
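
For reference, the lookup path the new cls_fw code implements reduces to a 256-bucket chained hash keyed by the low byte of the fwmark, with a full-id match inside the bucket. A simplified stand-alone illustration follows; the structures are toy stand-ins, not the kernel's.

#include <stdint.h>
#include <stddef.h>

struct toy_fw_filter {
        struct toy_fw_filter *next;
        uint32_t id;            /* full fwmark to match */
        uint32_t classid;       /* result on a hit */
};

struct toy_fw_head {
        struct toy_fw_filter *ht[256];
};

static int toy_fw_hash(uint32_t handle)
{
        return handle & 0xFF;   /* same bucket rule as fw_hash() */
}

/* Returns the classid on a hit, 0 on a miss (fw_classify() returns -1). */
static uint32_t toy_fw_lookup(struct toy_fw_head *head, uint32_t fwmark)
{
        struct toy_fw_filter *f;

        for (f = head->ht[toy_fw_hash(fwmark)]; f; f = f->next)
                if (f->id == fwmark)
                        return f->classid;
        return 0;
}
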
index a78f209..ac17def 100644 (file)
@@ -1,5+1,5 @@
 /*
- * net/sched/cls_route.c       Routing table based packet classifier.
+ * net/sched/cls_route.c       ROUTE4 classifier.
  *
  *             This program is free software; you can redistribute it and/or
  *             modify it under the terms of the GNU General Public License
  */
 
 #include <linux/module.h>
+#include <linux/config.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
 #include <asm/bitops.h>
 #include <net/sock.h>
 #include <net/pkt_sched.h>
 
+/*
+   1. For now we assume that route tags < 256.
+      This allows direct table lookups instead of hash tables.
+   2. For now we assume that "from TAG" and "fromdev DEV" statements
+      are mutually exclusive.
+   3. "to TAG from ANY" has higher priority than "to ANY from XXX"
+ */
+
+struct route4_fastmap
+{
+       struct route4_filter    *filter;
+       u32                     id;
+       int                     iif;
+};
+
+struct route4_head
+{
+       struct route4_fastmap   fastmap[16];
+       struct route4_bucket    *table[256+1];
+};
+
+struct route4_bucket
+{
+       struct route4_filter    *ht[16+16+1];
+};
+
+struct route4_filter
+{
+       struct route4_filter    *next;
+       u32                     id;
+       int                     iif;
+
+       struct tcf_result       res;
+#ifdef CONFIG_NET_CLS_POLICE
+       struct tcf_police       *police;
+#endif
+
+       u32                     handle;
+       struct route4_bucket    *bkt;
+};
+
+#define ROUTE4_FAILURE ((struct route4_filter*)(-1L))
+
+static __inline__ int route4_fastmap_hash(u32 id, int iif)
+{
+       return id&0xF;
+}
+
+static void route4_reset_fastmap(struct route4_head *head, u32 id)
+{
+       start_bh_atomic();
+       memset(head->fastmap, 0, sizeof(head->fastmap));
+       end_bh_atomic();
+}
+
+static void __inline__
+route4_set_fastmap(struct route4_head *head, u32 id, int iif,
+                  struct route4_filter *f)
+{
+       int h = route4_fastmap_hash(id, iif);
+       head->fastmap[h].id = id;
+       head->fastmap[h].iif = iif;
+       head->fastmap[h].filter = f;
+}
+
+static __inline__ int route4_hash_to(u32 id)
+{
+       return id&0xFF;
+}
+
+static __inline__ int route4_hash_from(u32 id)
+{
+       return (id>>16)&0xF;
+}
+
+static __inline__ int route4_hash_iif(int iif)
+{
+       return 16 + ((iif>>16)&0xF);
+}
+
+static __inline__ int route4_hash_wild(void)
+{
+       return 32;
+}
 
-static int route_classify(struct sk_buff *skb, struct tcf_proto *tp,
-                         struct tcf_result *res)
+#ifdef CONFIG_NET_CLS_POLICE
+#define IF_ROUTE_POLICE \
+if (f->police) { \
+       int pol_res = tcf_police(skb, f->police); \
+       if (pol_res >= 0) return pol_res; \
+       dont_cache = 1; \
+       continue; \
+} \
+if (!dont_cache)
+#else
+#define IF_ROUTE_POLICE
+#endif
+
+
+static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp,
+                          struct tcf_result *res)
 {
-       struct dst_entry *dst = skb->dst;
+       struct route4_head *head = (struct route4_head*)tp->root;
+       struct dst_entry *dst;
+       struct route4_bucket *b;
+       struct route4_filter *f;
+#ifdef CONFIG_NET_CLS_POLICE
+       int dont_cache = 0;
+#endif
+       u32 id, h;
+       int iif;
+
+       if ((dst = skb->dst) == NULL)
+               goto failure;
+
+       id = dst->tclassid;
+       if (head == NULL)
+               goto old_method;
+
+       iif = ((struct rtable*)dst)->key.iif;
 
-       if (dst) {
-               u32 clid = dst->tclassid;
+       h = route4_fastmap_hash(id, iif);
+       if (id == head->fastmap[h].id &&
+           iif == head->fastmap[h].iif &&
+           (f = head->fastmap[h].filter) != NULL) {
+               if (f == ROUTE4_FAILURE)
+                       goto failure;
 
-               if (clid && (TC_H_MAJ(clid) == 0 ||
-                            !(TC_H_MAJ(clid^tp->q->handle)))) {
-                       res->classid = clid;
-                       res->class = 0;
+               *res = f->res;
+               return 0;
+       }
+
+       h = route4_hash_to(id);
+
+restart:
+       if ((b = head->table[h]) != NULL) {
+               f = b->ht[route4_hash_from(id)];
+
+               for ( ; f; f = f->next) {
+                       if (f->id == id) {
+                               *res = f->res;
+                               IF_ROUTE_POLICE route4_set_fastmap(head, id, iif, f);
+                               return 0;
+                       }
+               }
+
+               for (f = b->ht[route4_hash_iif(iif)]; f; f = f->next) {
+                       if (f->iif == iif) {
+                               *res = f->res;
+                               IF_ROUTE_POLICE route4_set_fastmap(head, id, iif, f);
+                               return 0;
+                       }
+               }
+
+               for (f = b->ht[route4_hash_wild()]; f; f = f->next) {
+                       *res = f->res;
+                       IF_ROUTE_POLICE route4_set_fastmap(head, id, iif, f);
                        return 0;
                }
+
+       }
+       if (h < 256) {
+               h = 256;
+               id &= ~0xFFFF;
+               goto restart;
+       }
+
+#ifdef CONFIG_NET_CLS_POLICE
+       if (!dont_cache)
+#endif
+               route4_set_fastmap(head, id, iif, ROUTE4_FAILURE);
+failure:
+       return -1;
+
+old_method:
+       if (id && (TC_H_MAJ(id) == 0 ||
+                  !(TC_H_MAJ(id^tp->q->handle)))) {
+               res->classid = id;
+               res->class = 0;
+               return 0;
        }
        return -1;
 }
 
-static unsigned long route_get(struct tcf_proto *tp, u32 handle)
+static u32 to_hash(u32 id)
+{
+       u32 h = id&0xFF;
+       if (id&0x8000)
+               h += 256;
+       return h;
+}
+
+static u32 from_hash(u32 id)
+{
+       id &= 0xFFFF;
+       if (id == 0xFFFF)
+               return 32;
+       if (!(id & 0x8000)) {
+               if (id > 255)
+                       return 256;
+               return id&0xF;
+       }
+       return 16 + (id&0xF);
+}
+
+static unsigned long route4_get(struct tcf_proto *tp, u32 handle)
+{
+       struct route4_head *head = (struct route4_head*)tp->root;
+       struct route4_bucket *b;
+       struct route4_filter *f;
+       unsigned h1, h2;
+
+       if (!head)
+               return 0;
+
+       h1 = to_hash(handle);
+       if (h1 > 256)
+               return 0;
+
+       h2 = from_hash(handle>>16);
+       if (h2 > 32)
+               return 0;
+
+       if ((b = head->table[h1]) != NULL) {
+               for (f = b->ht[h2]; f; f = f->next)
+                       if (f->handle == handle)
+                               return (unsigned long)f;
+       }
+       return 0;
+}
+
+static void route4_put(struct tcf_proto *tp, unsigned long f)
+{
+}
+
+static int route4_init(struct tcf_proto *tp)
 {
+       MOD_INC_USE_COUNT;
        return 0;
 }
 
-static void route_put(struct tcf_proto *tp, unsigned long f)
+static void route4_destroy(struct tcf_proto *tp)
 {
+       struct route4_head *head = xchg(&tp->root, NULL);
+       int h1, h2;
+
+       if (head == NULL) {
+               MOD_DEC_USE_COUNT;
+               return;
+       }
+
+       for (h1=0; h1<=256; h1++) {
+               struct route4_bucket *b;
+
+               if ((b = head->table[h1]) != NULL) {
+                       for (h2=0; h2<=32; h2++) {
+                               struct route4_filter *f;
+
+                               while ((f = b->ht[h2]) != NULL) {
+                                       unsigned long cl;
+
+                                       b->ht[h2] = f->next;
+                                       if ((cl = cls_set_class(&f->res.class, 0)) != 0)
+                                               tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
+#ifdef CONFIG_NET_CLS_POLICE
+                                       tcf_police_release(f->police);
+#endif
+                                       kfree(f);
+                               }
+                       }
+                       kfree(b);
+               }
+       }
+       kfree(head);
+       MOD_DEC_USE_COUNT;
 }
 
-static int route_init(struct tcf_proto *tp)
+static int route4_delete(struct tcf_proto *tp, unsigned long arg)
 {
+       struct route4_head *head = (struct route4_head*)tp->root;
+       struct route4_filter **fp, *f = (struct route4_filter*)arg;
+       unsigned h = f->handle;
+       struct route4_bucket *b;
+       int i;
+
+       if (!head || !f)
+               return -EINVAL;
+
+       b = f->bkt;
+
+       for (fp = &b->ht[from_hash(h>>16)]; *fp; fp = &(*fp)->next) {
+               if (*fp == f) {
+                       unsigned long cl;
+
+                       net_serialize_enter();
+                       *fp = f->next;
+                       net_serialize_leave();
+                       route4_reset_fastmap(head, f->id);
+
+                       if ((cl = cls_set_class(&f->res.class, 0)) != 0)
+                               tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
+
+#ifdef CONFIG_NET_CLS_POLICE
+                       tcf_police_release(f->police);
+#endif
+                       kfree(f);
+
+                       /* Strip tree */
+
+                       for (i=0; i<=32; i++)
+                               if (b->ht[i])
+                                       return 0;
+
+                       /* OK, session has no flows */
+                       net_serialize_enter();
+                       head->table[to_hash(h)] = NULL;
+                       net_serialize_leave();
+                       kfree(b);
+                       return 0;
+               }
+       }
        return 0;
 }
 
-static void route_destroy(struct tcf_proto *tp)
+static int route4_change(struct tcf_proto *tp, unsigned long base,
+                      u32 handle,
+                      struct rtattr **tca,
+                      unsigned long *arg)
 {
+       struct route4_head *head = tp->root;
+       struct route4_filter *f, *f1, **ins_f;
+       struct route4_bucket *b;
+       struct rtattr *opt = tca[TCA_OPTIONS-1];
+       struct rtattr *tb[TCA_ROUTE4_MAX];
+       unsigned h1, h2;
+       int err;
+
+       if (opt == NULL)
+               return handle ? -EINVAL : 0;
+
+       if (rtattr_parse(tb, TCA_ROUTE4_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0)
+               return -EINVAL;
+
+       if ((f = (struct route4_filter*)*arg) != NULL) {
+               /* Node exists: adjust only classid */
+
+               if (f->handle != handle && handle)
+                       return -EINVAL;
+               if (tb[TCA_ROUTE4_CLASSID-1]) {
+                       unsigned long cl;
+
+                       f->res.classid = *(u32*)RTA_DATA(tb[TCA_ROUTE4_CLASSID-1]);
+                       cl = cls_set_class(&f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid));
+                       if (cl)
+                               tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
+               }
+#ifdef CONFIG_NET_CLS_POLICE
+               if (tb[TCA_ROUTE4_POLICE-1]) {
+                       struct tcf_police *police = tcf_police_locate(tb[TCA_ROUTE4_POLICE-1], tca[TCA_RATE-1]);
+                       net_serialize_enter();
+                       police = xchg(&f->police, police);
+                       net_serialize_leave();
+                       tcf_police_release(police);
+               }
+#endif
+               return 0;
+       }
+
+       /* Now the more serious part... */
+
+       if (head == NULL) {
+               head = kmalloc(sizeof(struct route4_head), GFP_KERNEL);
+               if (head == NULL)
+                       return -ENOBUFS;
+               memset(head, 0, sizeof(struct route4_head));
+               net_serialize_enter();
+               tp->root = head;
+               net_serialize_leave();
+       }
+
+       f = kmalloc(sizeof(struct route4_filter), GFP_KERNEL);
+       if (f == NULL)
+               return -ENOBUFS;
+
+       memset(f, 0, sizeof(*f));
+
+       err = -EINVAL;
+       f->handle = 0x8000;
+       if (tb[TCA_ROUTE4_TO-1]) {
+               if (handle&0x8000)
+                       goto errout;
+               if (RTA_PAYLOAD(tb[TCA_ROUTE4_TO-1]) < 4)
+                       goto errout;
+               f->id = *(u32*)RTA_DATA(tb[TCA_ROUTE4_TO-1]);
+               if (f->id > 0xFF)
+                       goto errout;
+               f->handle = f->id;
+       }
+       if (tb[TCA_ROUTE4_FROM-1]) {
+               u32 sid;
+               if (tb[TCA_ROUTE4_IIF-1])
+                       goto errout;
+               if (RTA_PAYLOAD(tb[TCA_ROUTE4_FROM-1]) < 4)
+                       goto errout;
+               sid = (*(u32*)RTA_DATA(tb[TCA_ROUTE4_FROM-1]));
+               if (sid > 0xFF)
+                       goto errout;
+               f->handle |= sid<<16;
+               f->id |= sid<<16;
+       } else if (tb[TCA_ROUTE4_IIF-1]) {
+               if (RTA_PAYLOAD(tb[TCA_ROUTE4_IIF-1]) < 4)
+                       goto errout;
+               f->iif = *(u32*)RTA_DATA(tb[TCA_ROUTE4_IIF-1]);
+               if (f->iif > 0x7FFF)
+                       goto errout;
+               f->handle |= (f->iif|0x8000)<<16;
+       } else
+               f->handle |= 0xFFFF<<16;
+
+       if (handle) {
+               f->handle |= handle&0x7F00;
+               if (f->handle != handle)
+                       goto errout;
+       }
+
+       if (tb[TCA_ROUTE4_CLASSID-1]) {
+               if (RTA_PAYLOAD(tb[TCA_ROUTE4_CLASSID-1]) < 4)
+                       goto errout;
+               f->res.classid = *(u32*)RTA_DATA(tb[TCA_ROUTE4_CLASSID-1]);
+       }
+
+       h1 = to_hash(f->handle);
+       if ((b = head->table[h1]) == NULL) {
+               err = -ENOBUFS;
+               b = kmalloc(sizeof(struct route4_bucket), GFP_KERNEL);
+               if (b == NULL)
+                       goto errout;
+               memset(b, 0, sizeof(*b));
+               net_serialize_enter();
+               head->table[h1] = b;
+               net_serialize_leave();
+       }
+       f->bkt = b;
+
+       err = -EEXIST;
+       h2 = from_hash(f->handle>>16);
+       for (ins_f = &b->ht[h2]; (f1=*ins_f) != NULL; ins_f = &f1->next) {
+               if (f->handle < f1->handle)
+                       break;
+               if (f1->handle == f->handle)
+                       goto errout;
+       }
+
+       cls_set_class(&f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid));
+#ifdef CONFIG_NET_CLS_POLICE
+       if (tb[TCA_ROUTE4_POLICE-1])
+               f->police = tcf_police_locate(tb[TCA_ROUTE4_POLICE-1], tca[TCA_RATE-1]);
+#endif
+
+       f->next = f1;
+       net_serialize_enter();
+       *ins_f = f;
+       net_serialize_leave();
+       route4_reset_fastmap(head, f->id);
+       *arg = (unsigned long)f;
+       return 0;
+
+errout:
+       if (f)
+               kfree(f);
+       return err;
 }
 
-static int route_delete(struct tcf_proto *tp, unsigned long arg)
+static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
 {
-       return -EINVAL;
+       struct route4_head *head = tp->root;
+       unsigned h, h1;
+
+       if (head == NULL)
+               arg->stop = 1;
+
+       if (arg->stop)
+               return;
+
+       for (h = 0; h <= 256; h++) {
+               struct route4_bucket *b = head->table[h];
+
+               if (b) {
+                       for (h1 = 0; h1 <= 32; h1++) {
+                               struct route4_filter *f;
+
+                               for (f = b->ht[h1]; f; f = f->next) {
+                                       if (arg->count < arg->skip) {
+                                               arg->count++;
+                                               continue;
+                                       }
+                                       if (arg->fn(tp, (unsigned long)f, arg) < 0) {
+                                               arg->stop = 1;
+                                               break;
+                                       }
+                                       arg->count++;
+                               }
+                       }
+               }
+       }
 }
 
-static int route_change(struct tcf_proto *tp, u32 handle,
-                       struct rtattr **tca,
-                       unsigned long *arg)
+#ifdef CONFIG_RTNETLINK
+static int route4_dump(struct tcf_proto *tp, unsigned long fh,
+                      struct sk_buff *skb, struct tcmsg *t)
 {
-       return handle ? -EINVAL : 0;
+       struct route4_filter *f = (struct route4_filter*)fh;
+       unsigned char    *b = skb->tail;
+       struct rtattr *rta;
+       u32 id;
+
+       if (f == NULL)
+               return skb->len;
+
+       t->tcm_handle = f->handle;
+
+       rta = (struct rtattr*)b;
+       RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
+
+       if (!(f->handle&0x8000)) {
+               id = f->id&0xFF;
+               RTA_PUT(skb, TCA_ROUTE4_TO, sizeof(id), &id);
+       }
+       if (f->handle&0x80000000) {
+               if ((f->handle>>16) != 0xFFFF)
+                       RTA_PUT(skb, TCA_ROUTE4_IIF, sizeof(f->iif), &f->iif);
+       } else {
+               id = f->id>>16;
+               RTA_PUT(skb, TCA_ROUTE4_FROM, sizeof(id), &id);
+       }
+       if (f->res.classid)
+               RTA_PUT(skb, TCA_ROUTE4_CLASSID, 4, &f->res.classid);
+#ifdef CONFIG_NET_CLS_POLICE
+       if (f->police) {
+               struct rtattr * p_rta = (struct rtattr*)skb->tail;
+
+               RTA_PUT(skb, TCA_ROUTE4_POLICE, 0, NULL);
+
+               if (tcf_police_dump(skb, f->police) < 0)
+                       goto rtattr_failure;
+
+               p_rta->rta_len = skb->tail - (u8*)p_rta;
+       }
+#endif
+
+       rta->rta_len = skb->tail - b;
+#ifdef CONFIG_NET_CLS_POLICE
+       if (f->police) {
+               RTA_PUT(skb, TCA_STATS, sizeof(struct tc_stats), &f->police->stats);
+       }
+#endif
+       return skb->len;
+
+rtattr_failure:
+       skb_trim(skb, b - skb->data);
+       return -1;
 }
+#endif
 
-struct tcf_proto_ops cls_route_ops = {
+struct tcf_proto_ops cls_route4_ops = {
        NULL,
        "route",
-       route_classify,
-       route_init,
-       route_destroy,
-
-       route_get,
-       route_put,
-       route_change,
-       route_delete,
-       NULL,
+       route4_classify,
+       route4_init,
+       route4_destroy,
+
+       route4_get,
+       route4_put,
+       route4_change,
+       route4_delete,
+       route4_walk,
+#ifdef CONFIG_RTNETLINK
+       route4_dump
+#else
+       NULL
+#endif
 };
+
+#ifdef MODULE
+int init_module(void)
+{
+       return register_tcf_proto_ops(&cls_route4_ops);
+}
+
+void cleanup_module(void)
+{
+       unregister_tcf_proto_ops(&cls_route4_ops);
+}
+#endif
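
The ROUTE4 filter handles above pack all match criteria into one 32-bit value; the layout below is reconstructed from route4_change() and the to_hash()/from_hash() helpers, and the sketch's field split and function name are mine.

#include <stdint.h>

/* bits  0..7   destination ("to") realm tag; bit 15 set means "to ANY"
   bits 16..31  source ("from") realm tag, or (iif | 0x8000) for
                "fromdev DEV", or 0xFFFF for "from ANY"                  */
static uint32_t route4_sketch_handle(int have_to, uint32_t to,
                                     int have_from, uint32_t from,
                                     int have_iif, uint32_t iif)
{
        uint32_t handle = have_to ? (to & 0xFF) : 0x8000;

        if (have_from)
                handle |= (from & 0xFF) << 16;
        else if (have_iif)
                handle |= (iif | 0x8000) << 16; /* bit 31 marks fromdev */
        else
                handle |= 0xFFFFu << 16;        /* wildcard source */
        return handle;
}
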
index 4168f54..e587fad 100644 (file)
@@ -120,6+120,18 @@ static __inline__ unsigned hash_src(u32 *src)
        return h & 0xF;
 }
 
+#ifdef CONFIG_NET_CLS_POLICE
+#define RSVP_POLICE() \
+if (f->police) { \
+       int pol_res = tcf_police(skb, f->police); \
+       if (pol_res < 0) continue; \
+       if (pol_res) return pol_res; \
+}
+#else
+#define RSVP_POLICE()
+#endif
+
+
 static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp,
                         struct tcf_result *res)
 {
@@ -137,7+149,7 @@ static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp,
        struct iphdr *nhptr = skb->nh.iph;
 #endif
 
-#ifndef __i386__
+#if !defined(__i386__) && !defined(__m68k__)
        if ((unsigned long)nhptr & 3)
                return -1;
 #endif
@@ -181,13+193,12 @@ restart:
                                    && src[2] == f->src[2]
 #endif
                                    ) {
+
+                                       RSVP_POLICE();
+
 matched:
                                        if (f->tunnelhdr == 0) {
                                                *res = f->res;
-#ifdef CONFIG_NET_CLS_POLICE
-                                               if (f->police)
-                                                       return tcf_police(skb, f->police);
-#endif
                                                return 0;
                                        } else {
                                                tunnelid = f->res.classid;
@@ -198,8+209,10 @@ matched:
                        }
 
                        /* And wildcard bucket... */
-                       if ((f = s->ht[16]) != NULL)
+                       for (f = s->ht[16]; f; f = f->next) {
+                               RSVP_POLICE();
                                goto matched;
+                       }
                        return -1;
                }
        }
@@ -260,7+273,6 @@ static void rsvp_destroy(struct tcf_proto *tp)
                struct rsvp_session *s;
 
                while ((s = sht[h1]) != NULL) {
-
                        sht[h1] = s->next;
 
                        for (h2=0; h2<=16; h2++) {
@@ -270,7+282,7 @@ static void rsvp_destroy(struct tcf_proto *tp)
                                        unsigned long cl;
 
                                        s->ht[h2] = f->next;
-                                       if ((cl = xchg(&f->res.class, 0)) != 0)
+                                       if ((cl = cls_set_class(&f->res.class, 0)) != 0)
                                                tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
 #ifdef CONFIG_NET_CLS_POLICE
                                        tcf_police_release(f->police);
@@ -297,8+309,11 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
                if (*fp == f) {
                        unsigned long cl;
 
+                       net_serialize_enter();
                        *fp = f->next;
-                       if ((cl = xchg(&f->res.class, 0)) != 0)
+                       net_serialize_leave();
+
+                       if ((cl = cls_set_class(&f->res.class, 0)) != 0)
                                tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
 
 #ifdef CONFIG_NET_CLS_POLICE
@@ -317,12+332,14 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
                        for (sp = &((struct rsvp_head*)tp->root)->ht[h&0xFF];
                             *sp; sp = &(*sp)->next) {
                                if (*sp == s) {
+                                       net_serialize_enter();
                                        *sp = s->next;
+                                       net_serialize_leave();
                                        kfree(s);
                                        return 0;
                                }
                        }
-                       
+
                        return 0;
                }
        }
@@ -399,7+416,8 @@ static u32 gen_tunnel(struct rsvp_head *data)
        return 0;
 }
 
-static int rsvp_change(struct tcf_proto *tp, u32 handle,
+static int rsvp_change(struct tcf_proto *tp, unsigned long base,
+                      u32 handle,
                       struct rtattr **tca,
                       unsigned long *arg)
 {
@@ -425,17+443,20 @@ static int rsvp_change(struct tcf_proto *tp, u32 handle,
                if (f->handle != handle && handle)
                        return -EINVAL;
                if (tb[TCA_RSVP_CLASSID-1]) {
-                       unsigned long cl = xchg(&f->res.class, 0);
+                       unsigned long cl;
+
+                       f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]);
+                       cl = cls_set_class(&f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid));
                        if (cl)
                                tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
-                       f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]);
-                       f->res.class = tp->q->ops->cl_ops->bind_tcf(tp->q, f->res.classid);
                }
 #ifdef CONFIG_NET_CLS_POLICE
                if (tb[TCA_RSVP_POLICE-1]) {
-                       struct tcf_police *police = tcf_police_locate(tb[TCA_RSVP_POLICE-1]);
-
-                       tcf_police_release(xchg(&f->police, police));
+                       struct tcf_police *police = tcf_police_locate(tb[TCA_RSVP_POLICE-1], tca[TCA_RATE-1]);
+                       net_serialize_enter();
+                       police = xchg(&f->police, police);
+                       net_serialize_leave();
+                       tcf_police_release(police);
                }
 #endif
                return 0;
@@ -514,17+535,19 @@ insert:
 
                        f->sess = s;
                        if (f->tunnelhdr == 0)
-                               f->res.class = tp->q->ops->cl_ops->bind_tcf(tp->q, f->res.classid);
+                               cls_set_class(&f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid));
 #ifdef CONFIG_NET_CLS_POLICE
                        if (tb[TCA_RSVP_POLICE-1])
-                               f->police = tcf_police_locate(tb[TCA_RSVP_POLICE-1]);
+                               f->police = tcf_police_locate(tb[TCA_RSVP_POLICE-1], tca[TCA_RATE-1]);
 #endif
 
                        for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next)
                                if (((*fp)->spi.mask&f->spi.mask) != f->spi.mask)
                                        break;
                        f->next = *fp;
+                       net_serialize_enter();
                        *fp = f;
+                       net_serialize_leave();
                        *arg = (unsigned long)f;
                        return 0;
                }
@@ -546,7+569,10 @@ insert:
                        break;
        }
        s->next = *sp;
+       net_serialize_enter();
        *sp = s;
+       net_serialize_leave();
+
        goto insert;
 
 errout:
@@ -631,6+657,11 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
 #endif
 
        rta->rta_len = skb->tail - b;
+#ifdef CONFIG_NET_CLS_POLICE
+       if (f->police) {
+               RTA_PUT(skb, TCA_STATS, sizeof(struct tc_stats), &f->police->stats);
+       }
+#endif
        return skb->len;
 
 rtattr_failure:
index cb52e9d..5ca4708 100644 (file)
@@ -114,7+114,7 @@ static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_re
        int sel = 0;
        int i;
 
-#ifndef __i386__
+#if !defined(__i386__) && !defined(__m68k__)
        if ((unsigned long)ptr & 3)
                return -1;
 #endif
@@ -137,10+137,13 @@ check_terminal:
                        if (n->sel.flags&TC_U32_TERMINAL) {
                                *res = n->res;
 #ifdef CONFIG_NET_CLS_POLICE
-                               if (n->police)
-                                       return tcf_police(skb, n->police);
+                               if (n->police) {
+                                       int pol_res = tcf_police(skb, n->police);
+                                       if (pol_res >= 0)
+                                               return pol_res;
+                               } else
 #endif
-                               return 0;
+                                       return 0;
                        }
                        n = n->next;
                        goto next_knode;
@@ -304,7+307,7 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n)
 {
        unsigned long cl;
 
-       if ((cl = xchg(&n->res.class, 0)) != 0)
+       if ((cl = cls_set_class(&n->res.class, 0)) != 0)
                tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
 #ifdef CONFIG_NET_CLS_POLICE
        tcf_police_release(n->police);
@@ -323,7+326,10 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key)
        if (ht) {
                for (kp = &ht->ht[TC_U32_HASH(key->handle)]; *kp; kp = &(*kp)->next) {
                        if (*kp == key) {
+                               net_serialize_enter();
                                *kp = key->next;
+                               net_serialize_leave();
+
                                u32_destroy_key(tp, key);
                                return 0;
                        }
@@ -340,7+346,9 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
 
        for (h=0; h<=ht->divisor; h++) {
                while ((n = ht->ht[h]) != NULL) {
+                       net_serialize_enter();
                        ht->ht[h] = n->next;
+                       net_serialize_leave();
                        u32_destroy_key(tp, n);
                }
        }
@@ -402,6+410,7 @@ static void u32_destroy(struct tcf_proto *tp)
                kfree(tp_c);
        }
 
+       MOD_DEC_USE_COUNT;
        tp->data = NULL;
 }
 
@@ -437,8+446,10 @@ static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle)
        return handle|(i>0xFFF ? 0xFFF : i);
 }
 
-static int u32_set_parms(struct Qdisc *q, struct tc_u_hnode *ht,
-                        struct tc_u_knode *n, struct rtattr **tb)
+static int u32_set_parms(struct Qdisc *q, unsigned long base,
+                        struct tc_u_hnode *ht,
+                        struct tc_u_knode *n, struct rtattr **tb,
+                        struct rtattr *est)
 {
        if (tb[TCA_U32_LINK-1]) {
                u32 handle = *(u32*)RTA_DATA(tb[TCA_U32_LINK-1]);
@@ -455,29+466,34 @@ static int u32_set_parms(struct Qdisc *q, struct tc_u_hnode *ht,
                        ht_down->refcnt++;
                }
 
+               net_serialize_enter();
                ht_down = xchg(&n->ht_down, ht_down);
+               net_serialize_leave();
 
                if (ht_down)
                        ht_down->refcnt--;
        }
        if (tb[TCA_U32_CLASSID-1]) {
-               unsigned long cl = xchg(&n->res.class, 0);
+               unsigned long cl;
+
+               n->res.classid = *(u32*)RTA_DATA(tb[TCA_U32_CLASSID-1]);
+               cl = cls_set_class(&n->res.class, q->ops->cl_ops->bind_tcf(q, base, n->res.classid));
                if (cl)
                        q->ops->cl_ops->unbind_tcf(q, cl);
-               n->res.classid = *(u32*)RTA_DATA(tb[TCA_U32_CLASSID-1]);
-               n->res.class = q->ops->cl_ops->bind_tcf(q, n->res.classid);
        }
 #ifdef CONFIG_NET_CLS_POLICE
        if (tb[TCA_U32_POLICE-1]) {
-               struct tcf_police *police = tcf_police_locate(tb[TCA_U32_POLICE-1]);
-
-               tcf_police_release(xchg(&n->police, police));
+               struct tcf_police *police = tcf_police_locate(tb[TCA_U32_POLICE-1], est);
+               net_serialize_enter();
+               police = xchg(&n->police, police);
+               net_serialize_leave();
+               tcf_police_release(police);
        }
 #endif
        return 0;
 }
 
-static int u32_change(struct tcf_proto *tp, u32 handle,
+static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
                      struct rtattr **tca,
                      unsigned long *arg)
 {
@@ -500,7+516,7 @@ static int u32_change(struct tcf_proto *tp, u32 handle,
                if (TC_U32_KEY(n->handle) == 0)
                        return -EINVAL;
 
-               return u32_set_parms(tp->q, n->ht_up, n, tb);
+               return u32_set_parms(tp->q, base, n->ht_up, n, tb, tca[TCA_RATE-1]);
        }
 
        if (tb[TCA_U32_DIVISOR-1]) {
@@ -531,7+547,7 @@ static int u32_change(struct tcf_proto *tp, u32 handle,
 
        if (tb[TCA_U32_HASH-1]) {
                htid = *(unsigned*)RTA_DATA(tb[TCA_U32_HASH-1]);
-               if (TC_U32_HTID(handle) == TC_U32_ROOT) {
+               if (TC_U32_HTID(htid) == TC_U32_ROOT) {
                        ht = tp->root;
                        htid = ht->handle;
                } else {
@@ -550,8+566,6 @@ static int u32_change(struct tcf_proto *tp, u32 handle,
        if (handle) {
                if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid))
                        return -EINVAL;
-               if (TC_U32_HASH(handle) && TC_U32_HASH(handle^htid))
-                       return -EINVAL;
                handle = htid | TC_U32_NODE(handle);
        } else
                handle = gen_new_kid(ht, htid);
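
For reference when reading the htid checks above: the TC_U32_* macros from
include/linux/pkt_cls.h split the 32-bit handle into a 12-bit hash-table id,
an 8-bit bucket index, and a 12-bit node id.  A stand-alone sketch (the
sample handle is arbitrary):

	#include <stdio.h>

	#define TC_U32_HTID(h)  ((h) & 0xFFF00000)
	#define TC_U32_HASH(h)  (((h) >> 12) & 0xFF)
	#define TC_U32_NODE(h)  ((h) & 0xFFF)
	#define TC_U32_KEY(h)   ((h) & 0xFFFFF)

	int main(void)
	{
		unsigned int handle = 0x80100801;

		printf("htid %08x hash %02x node %03x key %05x\n",
		       TC_U32_HTID(handle), TC_U32_HASH(handle),
		       TC_U32_NODE(handle), TC_U32_KEY(handle));
		return 0;	/* htid 80100000 hash 00 node 801 key 00801 */
	}
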
@@ -568,14+582,16 @@ static int u32_change(struct tcf_proto *tp, u32 handle,
        memcpy(&n->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key));
        n->ht_up = ht;
        n->handle = handle;
-       err = u32_set_parms(tp->q, ht, n, tb);
+       err = u32_set_parms(tp->q, base, ht, n, tb, tca[TCA_RATE-1]);
        if (err == 0) {
                struct tc_u_knode **ins;
                for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next)
-                       if (TC_U32_NODE(handle) >= TC_U32_NODE((*ins)->handle))
+                       if (TC_U32_NODE(handle) < TC_U32_NODE((*ins)->handle))
                                break;
+               net_serialize_enter();
                n->next = *ins;
                *ins = n;
+               net_serialize_leave();
                *arg = (unsigned long)n;
                return 0;
        }
@@ -664,6+680,11 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh,
        }
 
        rta->rta_len = skb->tail - b;
+#ifdef CONFIG_NET_CLS_POLICE
+       if (TC_U32_KEY(n->handle) && n->police) {
+               RTA_PUT(skb, TCA_STATS, sizeof(struct tc_stats), &n->police->stats);
+       }
+#endif
        return skb->len;
 
 rtattr_failure:
index 4638796..a35a916 100644 (file)
@@ -171,8+171,9 @@ void qdisc_kill_estimator(struct tc_stats *stats)
                                pest = &est->next;
                                continue;
                        }
-                       /* ATOMIC_SET */
+                       net_serialize_enter();
                        *pest = est->next;
+                       net_serialize_leave();
                        kfree(est);
                        killed++;
                }
index 13599ac..89e58d8 100644 (file)
@@ -74,6+74,9 @@ void tcf_police_destroy(struct tcf_police *p)
        for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->next) {
                if (*p1p == p) {
                        *p1p = p->next;
+#ifdef CONFIG_NET_ESTIMATOR
+                       qdisc_kill_estimator(&p->stats);
+#endif
                        if (p->R_tab)
                                qdisc_put_rtab(p->R_tab);
                        if (p->P_tab)
@@ -85,7+88,7 @@ void tcf_police_destroy(struct tcf_police *p)
        BUG_TRAP(0);
 }
 
-struct tcf_police * tcf_police_locate(struct rtattr *rta)
+struct tcf_police * tcf_police_locate(struct rtattr *rta, struct rtattr *est)
 {
        unsigned h;
        struct tcf_police *p;
@@ -111,20+114,35 @@ struct tcf_police * tcf_police_locate(struct rtattr *rta)
 
        memset(p, 0, sizeof(*p));
        p->refcnt = 1;
-       if ((p->R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1])) == NULL)
-               goto failure;
-       if (parm->peakrate.rate &&
-           (p->P_tab = qdisc_get_rtab(&parm->peakrate, tb[TCA_POLICE_PEAKRATE-1])) == NULL)
-               goto failure;
+       if (parm->rate.rate) {
+               if ((p->R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1])) == NULL)
+                       goto failure;
+               if (parm->peakrate.rate &&
+                   (p->P_tab = qdisc_get_rtab(&parm->peakrate, tb[TCA_POLICE_PEAKRATE-1])) == NULL)
+                       goto failure;
+       }
+       if (tb[TCA_POLICE_RESULT-1])
+               p->result = *(int*)RTA_DATA(tb[TCA_POLICE_RESULT-1]);
+#ifdef CONFIG_NET_ESTIMATOR
+       if (tb[TCA_POLICE_AVRATE-1])
+               p->ewma_rate = *(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]);
+#endif
        p->toks = p->burst = parm->burst;
        p->mtu = parm->mtu;
-       if (p->mtu == 0)
-               p->mtu = 255<<p->R_tab->rate.cell_log;
+       if (p->mtu == 0) {
+               p->mtu = ~0;
+               if (p->R_tab)
+                       p->mtu = 255<<p->R_tab->rate.cell_log;
+       }
        if (p->P_tab)
                p->ptoks = L2T_P(p, p->mtu);
        PSCHED_GET_TIME(p->t_c);
        p->index = parm->index ? : tcf_police_new_index();
        p->action = parm->action;
+#ifdef CONFIG_NET_ESTIMATOR
+       if (est)
+               qdisc_new_estimator(&p->stats, est);
+#endif
        h = tcf_police_hash(p->index);
        p->next = tcf_police_ht[h];
        tcf_police_ht[h] = p;
@@ -143,7+161,20 @@ int tcf_police(struct sk_buff *skb, struct tcf_police *p)
        long toks;
        long ptoks = 0;
 
+       p->stats.bytes += skb->len;
+       p->stats.packets++;
+
+#ifdef CONFIG_NET_ESTIMATOR
+       if (p->ewma_rate && p->stats.bps >= p->ewma_rate) {
+               p->stats.overlimits++;
+               return p->action;
+       }
+#endif
+
        if (skb->len <= p->mtu) {
+               if (p->R_tab == NULL)
+                       return p->result;
+
                PSCHED_GET_TIME(now);
 
                toks = PSCHED_TDIFF_SAFE(now, p->t_c, p->burst, 0);
@@ -163,10+194,11 @@ int tcf_police(struct sk_buff *skb, struct tcf_police *p)
                        p->t_c = now;
                        p->toks = toks;
                        p->ptoks = ptoks;
-                       return TC_POLICE_OK;
+                       return p->result;
                }
        }
 
+       p->stats.overlimits++;
        return p->action;
 }
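
The conformance test above is a token bucket driven by rate tables.  A
user-space toy with PSCHED time reduced to plain microsecond integers and the
L2T table lookup replaced by a usec-per-byte constant (both simplifications
for illustration, not the kernel's representation):

	#include <stdio.h>

	struct bucket {
		long burst;		/* bucket depth, usec of idle time */
		long toks;		/* current tokens, usec */
		long last;		/* time of last update, usec */
		long usec_per_byte;	/* stands in for the L2T table */
	};

	static int conforms(struct bucket *b, long now, int len)
	{
		long toks = b->toks + (now - b->last);

		if (toks > b->burst)
			toks = b->burst;
		toks -= (long)len * b->usec_per_byte;
		if (toks >= 0) {
			b->last = now;
			b->toks = toks;
			return 1;	/* in profile: p->result */
		}
		return 0;		/* out of profile: p->action */
	}

	int main(void)
	{
		struct bucket b = { 10000, 10000, 0, 8 };  /* ~1 Mbit/s */

		printf("%d %d\n", conforms(&b, 0, 1000),   /* 1: 8000 usec spent */
		       conforms(&b, 0, 1000));             /* 0: only 2000 left */
		return 0;
	}
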
 
@@ -180,12+212,21 @@ int tcf_police_dump(struct sk_buff *skb, struct tcf_police *p)
        opt.action = p->action;
        opt.mtu = p->mtu;
        opt.burst = p->burst;
-       opt.rate = p->R_tab->rate;
+       if (p->R_tab)
+               opt.rate = p->R_tab->rate;
+       else
+               memset(&opt.rate, 0, sizeof(opt.rate));
        if (p->P_tab)
                opt.peakrate = p->P_tab->rate;
        else
                memset(&opt.peakrate, 0, sizeof(opt.peakrate));
        RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt);
+       if (p->result)
+               RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int), &p->result);
+#ifdef CONFIG_NET_ESTIMATOR
+       if (p->ewma_rate)
+               RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &p->ewma_rate);
+#endif
        return skb->len;
 
 rtattr_failure:
index f166380..0ced70b 100644 (file)
  * Fixes:
  *
  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
+ * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
  */
 
 #include <linux/config.h>
 #include <linux/rtnetlink.h>
 #include <linux/init.h>
 #include <linux/proc_fs.h>
+#include <linux/kmod.h>
 
 #include <net/sock.h>
 #include <net/pkt_sched.h>
 #define BUG_TRAP(x) if (!(x)) { printk("Assertion (" #x ") failed at " __FILE__ "(%d):" __FUNCTION__ "\n", __LINE__); }
 
 #ifdef CONFIG_RTNETLINK
-static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
+static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
                        struct Qdisc *old, struct Qdisc *new);
 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
                         struct Qdisc *q, unsigned long cl, int event);
@@ -116,6+118,10 @@ static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
    ---destroy
 
    destroys resources allocated by init and during lifetime of qdisc.
+
+   ---change
+
+   changes qdisc parameters.
  */
 
 /************************************************
@@ -177,22+183,22 @@ struct Qdisc *qdisc_lookup(struct device *dev, u32 handle)
        return NULL;
 }
 
-/* We know classid. Find qdisc among all qdisc's attached to device
-   (root qdisc, all its children, children of children etc.)
- */
-
-struct Qdisc *qdisc_lookup_class(struct device *dev, u32 classid)
+struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 {
-       struct Qdisc *q;
+       unsigned long cl;
+       struct Qdisc *leaf;
+       struct Qdisc_class_ops *cops = p->ops->cl_ops;
 
-       for (q = dev->qdisc_list; q; q = q->next) {
-               if (q->classid == classid)
-                       return q;
-       }
-       return NULL;
+       if (cops == NULL)
+               return NULL;
+       cl = cops->get(p, classid);
+       if (cl == 0)
+               return NULL;
+       leaf = cops->leaf(p, cl);
+       cops->put(p, cl);
+       return leaf;
 }
 
-
 /* Find queueing discipline by name */
 
 struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
@@ -268,6+274,37 @@ u32 qdisc_alloc_handle(struct device *dev)
        return i>0 ? autohandle : 0;
 }
 
+/* Attach toplevel qdisc to device dev */
+
+static struct Qdisc *
+dev_graft_qdisc(struct device *dev, struct Qdisc *qdisc)
+{
+       struct Qdisc *oqdisc;
+
+       if (dev->flags & IFF_UP)
+               dev_deactivate(dev);
+
+       start_bh_atomic();
+       oqdisc = dev->qdisc_sleeping;
+
+       /* Prune old scheduler */
+       if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
+               qdisc_reset(oqdisc);
+
+       /* ... and graft new one */
+       if (qdisc == NULL)
+               qdisc = &noop_qdisc;
+       dev->qdisc_sleeping = qdisc;
+       dev->qdisc = &noop_qdisc;
+       end_bh_atomic();
+
+       if (dev->flags & IFF_UP)
+               dev_activate(dev);
+
+       return oqdisc;
+}
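
Read together with qdisc_graft() below: assuming the call chain runs as
written, an RTM_NEWQDISC replacing the root now resolves roughly as
tc_modify_qdisc -> qdisc_create (refcnt starts at 1) -> qdisc_graft ->
dev_graft_qdisc (deactivate, swap dev->qdisc_sleeping inside
start_bh_atomic/end_bh_atomic, reactivate) -> qdisc_notify, with the
displaced qdisc destroyed once its refcount allows.
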
+
+
 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
    to device "dev".
 
@@ -280,17+317,10 @@ int qdisc_graft(struct device *dev, struct Qdisc *parent, u32 classid,
        int err = 0;
 
        if (parent == NULL) {
-               BUG_TRAP(classid == TC_H_ROOT);
-               if (new) {
-                       new->parent = NULL;
-                       new->classid = TC_H_ROOT;
-               }
-               *old = dev_set_scheduler(dev, new);
+               *old = dev_graft_qdisc(dev, new);
        } else {
                struct Qdisc_class_ops *cops = parent->ops->cl_ops;
 
-               BUG_TRAP(classid != TC_H_ROOT);
-
                err = -EINVAL;
 
                if (cops) {
@@ -313,22+343,30 @@ int qdisc_graft(struct device *dev, struct Qdisc *parent, u32 classid,
  */
 
 static struct Qdisc *
-qdisc_create(struct device *dev, struct Qdisc_ops *ops, u32 handle,
-            u32 parentid, struct rtattr **tca, int *errp)
+qdisc_create(struct device *dev, u32 handle, struct rtattr **tca, int *errp)
 {
        int err;
        struct rtattr *kind = tca[TCA_KIND-1];
        struct Qdisc *sch = NULL;
+       struct Qdisc_ops *ops;
        int size;
-       int new = 0;
 
-       if (ops == NULL) {
-               ops = qdisc_lookup_ops(kind);
-               err = -EINVAL;
-               if (ops == NULL)
-                       goto err_out;
-               new = 1;
+       ops = qdisc_lookup_ops(kind);
+#ifdef CONFIG_KMOD
+       if (ops==NULL && tca[TCA_KIND-1] != NULL) {
+               char module_name[4 + IFNAMSIZ + 1];
+
+               if (RTA_PAYLOAD(kind) <= IFNAMSIZ) {
+                       sprintf(module_name, "sch_%s", (char*)RTA_DATA(kind));
+                       request_module (module_name);
+                       ops = qdisc_lookup_ops(kind);
+               }
        }
+#endif
+
+       err = -EINVAL;
+       if (ops == NULL)
+               goto err_out;
 
        size = sizeof(*sch) + ops->priv_size;
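
The demand-loading added above builds the module name as "sch_" plus the
requested kind; the RTA_PAYLOAD(kind) <= IFNAMSIZ guard keeps the result, NUL
included, inside the 4 + IFNAMSIZ + 1 buffer.  A user-space sketch of the
same bounded construction (snprintf stands in only because this toy is not
kernel code):

	#include <stdio.h>
	#include <string.h>

	#define IFNAMSIZ 16

	static int build_module_name(char *out, size_t outlen,
				     const char *kind, size_t payload)
	{
		if (payload > IFNAMSIZ)	/* mirrors the RTA_PAYLOAD check */
			return -1;
		snprintf(out, outlen, "sch_%s", kind);
		return 0;
	}

	int main(void)
	{
		char name[4 + IFNAMSIZ + 1];

		if (build_module_name(name, sizeof(name), "cbq",
				      strlen("cbq") + 1) == 0)
			printf("%s\n", name);	/* prints: sch_cbq */
		return 0;
	}
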
 
@@ -340,13+378,8 @@ qdisc_create(struct device *dev, struct Qdisc_ops *ops, u32 handle,
        /* Grrr... Resolve race condition with module unload */
        
        err = -EINVAL;
-       if (new) {
-               if (ops != qdisc_lookup_ops(kind))
-                       goto err_out;
-       } else if (kind) {
-               if (rtattr_strcmp(kind, ops->id))
-                       goto err_out;
-       }
+       if (ops != qdisc_lookup_ops(kind))
+               goto err_out;
 
        memset(sch, 0, size);
 
@@ -355,6+388,7 @@ qdisc_create(struct device *dev, struct Qdisc_ops *ops, u32 handle,
        sch->enqueue = ops->enqueue;
        sch->dequeue = ops->dequeue;
        sch->dev = dev;
+       atomic_set(&sch->refcnt, 1);
        if (handle == 0) {
                handle = qdisc_alloc_handle(dev);
                err = -ENOMEM;
@@ -362,9+396,8 @@ qdisc_create(struct device *dev, struct Qdisc_ops *ops, u32 handle,
                        goto err_out;
        }
        sch->handle = handle;
-       sch->classid = parentid;
 
-       if (ops->init && (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
+       if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
                sch->next = dev->qdisc_list;
                dev->qdisc_list = sch;
 #ifdef CONFIG_NET_ESTIMATOR
@@ -381,135+414,241 @@ err_out:
        return NULL;
 }
 
+static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
+{
+       if (tca[TCA_OPTIONS-1]) {
+               int err;
+
+               if (sch->ops->change == NULL)
+                       return -EINVAL;
+               err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
+               if (err)
+                       return err;
+       }
+#ifdef CONFIG_NET_ESTIMATOR
+       if (tca[TCA_RATE-1]) {
+               qdisc_kill_estimator(&sch->stats);
+               qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]);
+       }
+#endif
+       return 0;
+}
+
+struct check_loop_arg
+{
+       struct qdisc_walker     w;
+       struct Qdisc            *p;
+       int                     depth;
+};
+
+static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
+
+static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
+{
+       struct check_loop_arg   arg;
+
+       if (q->ops->cl_ops == NULL)
+               return 0;
+
+       arg.w.stop = arg.w.skip = arg.w.count = 0;
+       arg.w.fn = check_loop_fn;
+       arg.depth = depth;
+       arg.p = p;
+       q->ops->cl_ops->walk(q, &arg.w);
+       return arg.w.stop ? -ELOOP : 0;
+}
+
+static int
+check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
+{
+       struct Qdisc *leaf;
+       struct Qdisc_class_ops *cops = q->ops->cl_ops;
+       struct check_loop_arg *arg = (struct check_loop_arg *)w;
+
+       leaf = cops->leaf(q, cl);
+       if (leaf) {
+               if (leaf == arg->p || arg->depth > 7)
+                       return -ELOOP;
+               return check_loop(leaf, arg->p, arg->depth + 1);
+       }
+       return 0;
+}
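
check_loop()/check_loop_fn() ride on the generic class walker: ->walk() calls
w->fn once per class, honouring w->skip and w->count, and aborts when fn
returns a negative value.  A minimal sketch of another walker under the same
protocol, merely counting classes (count_classes() is a hypothetical helper):

	struct count_arg {
		struct qdisc_walker	w;
		int			classes;
	};

	static int count_fn(struct Qdisc *q, unsigned long cl,
			    struct qdisc_walker *w)
	{
		((struct count_arg *)w)->classes++;
		return 0;	/* keep walking */
	}

	static int count_classes(struct Qdisc *q)
	{
		struct count_arg arg;

		if (q->ops->cl_ops == NULL)
			return 0;
		arg.w.stop = arg.w.skip = arg.w.count = 0;
		arg.w.fn = count_fn;
		arg.classes = 0;
		q->ops->cl_ops->walk(q, &arg.w);
		return arg.classes;
	}
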
 
 /*
-   Create/delete/change/get qdisc.
+   Delete/get qdisc.
  */
 
-static int tc_ctl_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 {
        struct tcmsg *tcm = NLMSG_DATA(n);
        struct rtattr **tca = arg;
        struct device *dev;
        u32 clid = tcm->tcm_parent;
-       struct Qdisc *old_q;
        struct Qdisc *q = NULL;
        struct Qdisc *p = NULL;
-       struct Qdisc *leaf = NULL;
-       struct Qdisc_ops *qops = NULL;
        int err;
 
-       /* Find device */
        if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
                return -ENODEV;
 
-       /* If parent is specified, it must exist
-          and tcm_parent selects a class in parent which
-          new qdisc will be attached to.
-
-          The place may be already busy by another qdisc,
-          remember this fact, if it was not auto-created discipline.
-        */
        if (clid) {
                if (clid != TC_H_ROOT) {
-                       p = qdisc_lookup(dev, TC_H_MAJ(clid));
-                       if (p == NULL)
+                       if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
                                return -ENOENT;
-                       leaf = qdisc_lookup_class(dev, clid);
+                       q = qdisc_leaf(p, clid);
                } else
-                       leaf = dev->qdisc_sleeping;
-
-               if (leaf && leaf->flags&TCQ_F_DEFAULT && n->nlmsg_type == RTM_NEWQDISC)
-                       leaf = NULL;
+                       q = dev->qdisc_sleeping;
 
-               /*
-                  Also, leaf may be exactly that qdisc, which we want
-                  to control. Remember this to avoid one more qdisc_lookup.
-                */
-
-               if (leaf && leaf->handle == tcm->tcm_handle)
-                       q = leaf;
-       }
+               if (!q)
+                       return -ENOENT;
 
-       /* Try to locate the discipline */
-       if (tcm->tcm_handle && q == NULL) {
-               if (TC_H_MIN(tcm->tcm_handle))
+               if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
                        return -EINVAL;
-               q = qdisc_lookup(dev, tcm->tcm_handle);
+       } else {
+               if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
+                       return -ENOENT;
        }
 
-       /* If discipline already exists, check that its real parent
-          matches to one selected by tcm_parent.
-        */
-          
-       if (q) {
-               if (clid && p != q->parent)
-                       return -EINVAL;
-               BUG_TRAP(!leaf || leaf == q);
-               if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
+       if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
+               return -EINVAL;
+
+       if (n->nlmsg_type == RTM_DELQDISC) {
+               if (!clid)
                        return -EINVAL;
-               clid = q->classid;
-               goto process_existing;
+               if (q->handle == 0)
+                       return -ENOENT;
+               if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
+                       return err;
+               if (q) {
+                       qdisc_notify(skb, n, clid, q, NULL);
+                       qdisc_destroy(q);
+               }
+       } else {
+               qdisc_notify(skb, n, clid, NULL, q);
        }
+       return 0;
+}
 
-       /* The discipline is known not to exist.
-          If parent was not selected too, return error.
-        */
-       if (clid == 0)
-               return tcm->tcm_handle ? -ENOENT : -EINVAL;
+/*
+   Create/change qdisc.
+ */
 
-       /* Check for the case when leaf is exactly the thing,
-          that you want.
-        */
+static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+{
+       struct tcmsg *tcm = NLMSG_DATA(n);
+       struct rtattr **tca = arg;
+       struct device *dev;
+       u32 clid = tcm->tcm_parent;
+       struct Qdisc *q = NULL;
+       struct Qdisc *p = NULL;
+       int err;
+
+       if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
+               return -ENODEV;
 
-       if (leaf && tcm->tcm_handle == 0) {
-               q = leaf;
-               if (!tca[TCA_KIND-1] || rtattr_strcmp(tca[TCA_KIND-1], q->ops->id) == 0)
-                       goto process_existing;
+       if (clid) {
+               if (clid != TC_H_ROOT) {
+                       if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
+                               return -ENOENT;
+                       q = qdisc_leaf(p, clid);
+               } else {
+                       q = dev->qdisc_sleeping;
+               }
+
+               /* It may be the default qdisc; ignore it */
+               if (q && q->handle == 0)
+                       q = NULL;
+
+               if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
+                       if (tcm->tcm_handle) {
+                               if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
+                                       return -EEXIST;
+                               if (TC_H_MIN(tcm->tcm_handle))
+                                       return -EINVAL;
+                               if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
+                                       goto create_n_graft;
+                               if (n->nlmsg_flags&NLM_F_EXCL)
+                                       return -EEXIST;
+                               if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
+                                       return -EINVAL;
+                               if (q == p ||
+                                   (p && check_loop(q, p, 0)))
+                                       return -ELOOP;
+                               atomic_inc(&q->refcnt);
+                               goto graft;
+                       } else {
+                               if (q == NULL)
+                                       goto create_n_graft;
+
+                               /* This magic test requires explanation.
+                                *
+                                *   We know that some child q is already
+                                *   attached to this parent and have a choice:
+                                *   either to change it or to create/graft a
+                                *   new one.
+                                *
+                                *   1. We are allowed to create/graft only
+                                *   if both the CREATE and REPLACE flags
+                                *   are set.
+                                *
+                                *   2. If EXCL is set, the requestor asserted
+                                *   that the qdisc tcm_handle is not expected
+                                *   to exist, so we choose create/graft too.
+                                *
+                                *   3. The last case is when no flags are
+                                *   set. Alas, this is a hole in the API: we
+                                *   cannot decide what to do unambiguously.
+                                *   For now we select create/graft if the
+                                *   user gave a KIND that does not match the
+                                *   existing qdisc.
+                                */
+                               if ((n->nlmsg_flags&NLM_F_CREATE) &&
+                                   (n->nlmsg_flags&NLM_F_REPLACE) &&
+                                   ((n->nlmsg_flags&NLM_F_EXCL) ||
+                                    (tca[TCA_KIND-1] &&
+                                     rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
+                                       goto create_n_graft;
+                       }
+               }
+       } else {
+               if (!tcm->tcm_handle)
+                       return -EINVAL;
+               q = qdisc_lookup(dev, tcm->tcm_handle);
        }
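
Condensed, the flag handling above for the case where a child qdisc already
occupies the slot and no new handle is given reduces to a small decision
function.  A toy rendering under that reading (hypothetical enum, not kernel
code):

	enum verdict { DO_CHANGE, DO_CREATE_GRAFT, ERR_EEXIST, ERR_EINVAL };

	static enum verdict decide(int create, int replace, int excl,
				   int kind_given, int kind_matches)
	{
		if (create && replace &&
		    (excl || (kind_given && !kind_matches)))
			return DO_CREATE_GRAFT;	/* fresh qdisc, then graft */
		if (excl)
			return ERR_EEXIST;	/* it was not supposed to exist */
		if (kind_given && !kind_matches)
			return ERR_EINVAL;	/* cannot morph into another kind */
		return DO_CHANGE;		/* fall through to qdisc_change() */
	}
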
 
-       if (n->nlmsg_type != RTM_NEWQDISC || !(n->nlmsg_flags&NLM_F_CREATE))
+       /* Change qdisc parameters */
+       if (q == NULL)
                return -ENOENT;
-       if (leaf && n->nlmsg_flags&NLM_F_EXCL)
+       if (n->nlmsg_flags&NLM_F_EXCL)
                return -EEXIST;
+       if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
+               return -EINVAL;
+       err = qdisc_change(q, tca);
+       if (err == 0)
+               qdisc_notify(skb, n, clid, NULL, q);
+       return err;
 
-create_and_graft:
-       q = qdisc_create(dev, qops, tcm->tcm_handle, clid, tca, &err);
+create_n_graft:
+       if (!(n->nlmsg_flags&NLM_F_CREATE))
+               return -ENOENT;
+       q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
        if (q == NULL)
                return err;
 
 graft:
-       err = qdisc_graft(dev, p, clid, q, &old_q);
-       if (err) {
-               if (q)
-                       qdisc_destroy(q);
-               return err;
+       if (1) {
+               struct Qdisc *old_q = NULL;
+               err = qdisc_graft(dev, p, clid, q, &old_q);
+               if (err) {
+                       if (q)
+                               qdisc_destroy(q);
+                       return err;
+               }
+               qdisc_notify(skb, n, clid, old_q, q);
+               if (old_q)
+                       qdisc_destroy(old_q);
        }
-       qdisc_notify(skb, n, old_q, q);
-       if (old_q)
-               qdisc_destroy(old_q);
        return 0;
-
-process_existing:
-
-       switch (n->nlmsg_type) {
-       case RTM_NEWQDISC:
-               if (n->nlmsg_flags&NLM_F_EXCL)
-                       return -EEXIST;
-               qops = q->ops;
-               goto create_and_graft;
-       case RTM_GETQDISC:      
-               qdisc_notify(skb, n, NULL, q);
-               return 0;
-       case RTM_DELQDISC:
-               q = NULL;
-               goto graft;
-       default:
-               return -EINVAL;
-       }
 }
 
-static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q,
+static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
                         u32 pid, u32 seq, unsigned flags, int event)
 {
        struct tcmsg *tcm;
@@ -521,9+660,9 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q,
        tcm = NLMSG_DATA(nlh);
        tcm->tcm_family = AF_UNSPEC;
        tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0;
-       tcm->tcm_parent = q->classid;
+       tcm->tcm_parent = clid;
        tcm->tcm_handle = q->handle;
-       tcm->tcm_info = 0;
+       tcm->tcm_info = atomic_read(&q->refcnt);
        RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
        if (q->ops->dump && q->ops->dump(q, skb) < 0)
                goto rtattr_failure;
@@ -539,7+678,7 @@ rtattr_failure:
 }
 
 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
-                        struct Qdisc *old, struct Qdisc *new)
+                       u32 clid, struct Qdisc *old, struct Qdisc *new)
 {
        struct sk_buff *skb;
        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
@@ -548,12+687,12 @@ static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
        if (!skb)
                return -ENOBUFS;
 
-       if (old && !(old->flags&TCQ_F_DEFAULT)) {
-               if (tc_fill_qdisc(skb, old, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
+       if (old && old->handle) {
+               if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
                        goto err_out;
        }
        if (new) {
-               if (tc_fill_qdisc(skb, new, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
+               if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
                        goto err_out;
        }
 
@@ -583,7+722,7 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
                     q = q->next, q_idx++) {
                        if (q_idx < s_q_idx)
                                continue;
-                       if (tc_fill_qdisc(skb, q, NETLINK_CB(cb->skb).pid,
+                       if (tc_fill_qdisc(skb, q, 0, NETLINK_CB(cb->skb).pid,
                                          cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
                                goto done;
                }
@@ -797,11+936,10 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
        for (q=dev->qdisc_list, t=0; q; q = q->next, t++) {
                if (t < s_t) continue;
                if (!q->ops->cl_ops) continue;
-               if (tcm->tcm_parent && TC_H_MAJ(tcm->tcm_parent) != q->handle
-                   && (tcm->tcm_parent != TC_H_ROOT || q->parent != NULL))
+               if (tcm->tcm_parent && TC_H_MAJ(tcm->tcm_parent) != q->handle)
                        continue;
                if (t > s_t)
-                       memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int));
+                       memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
                arg.w.fn = qdisc_class_dump;
                arg.skb = skb;
                arg.cb = cb;
@@ -846,6+984,20 @@ static int psched_read_proc(char *buffer, char **start, off_t offset,
 }
 #endif
 
+#if PSCHED_CLOCK_SOURCE == PSCHED_GETTIMEOFDAY
+int psched_tod_diff(int delta_sec, int bound)
+{
+       int delta;
+
+       if (bound <= 1000000 || delta_sec > (0x7FFFFFFF/1000000)-1)
+               return bound;
+       delta = delta_sec * 1000000;
+       if (delta > bound)
+               delta = bound;
+       return delta;
+}
+#endif
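
Worked example for psched_tod_diff(): with bound = 3000000 and delta_sec = 2
the result is 2000000 usec; with delta_sec = 4 it clamps to the 3000000
bound.  Any bound up to one second, or a delta_sec beyond
(0x7FFFFFFF/1000000)-1 = 2146, short-circuits to the bound before the
multiply could overflow.
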
+
 psched_time_t psched_time_base;
 
 #if PSCHED_CLOCK_SOURCE == PSCHED_CPU
@@ -866,7+1018,8 @@ static void psched_tick(unsigned long dummy)
 #if PSCHED_CLOCK_SOURCE == PSCHED_CPU
        psched_time_t dummy_stamp;
        PSCHED_GET_TIME(dummy_stamp);
-       psched_timer.expires = jiffies + 4*HZ;
+       /* This is OK for CPUs up to 4GHz */
+       psched_timer.expires = jiffies + 1*HZ;
 #else
        unsigned long now = jiffies;
        psched_time_base = ((u64)now)<<PSCHED_JSCALE;
@@ -891,7+1044,6 @@ __initfunc(int psched_calibrate_clock(void))
                return -1;
 #endif
 
-       start_bh_atomic();
 #ifdef PSCHED_WATCHER
        psched_tick(0);
 #endif
@@ -902,7+1054,6 @@ __initfunc(int psched_calibrate_clock(void))
                barrier();
        PSCHED_GET_TIME(stamp1);
        do_gettimeofday(&tv1);
-       end_bh_atomic();
 
        delay = PSCHED_TDIFF(stamp1, stamp);
        rdelay = tv1.tv_usec - tv.tv_usec;
@@ -921,6+1072,9 @@ __initfunc(int psched_calibrate_clock(void))
 
 __initfunc(int pktsched_init(void))
 {
+#ifdef CONFIG_RTNETLINK
+       struct rtnetlink_link *link_p;
+#endif
 #ifdef CONFIG_PROC_FS
        struct proc_dir_entry *ent;
 #endif
@@ -931,19+1085,22 @@ __initfunc(int pktsched_init(void))
 #elif PSCHED_CLOCK_SOURCE == PSCHED_JIFFIES
        psched_tick_per_us = HZ<<PSCHED_JSCALE;
        psched_us_per_tick = 1000000;
+#ifdef PSCHED_WATCHER
+       psched_tick(0);
+#endif
 #endif
 
 #ifdef CONFIG_RTNETLINK
-       struct rtnetlink_link *link_p = rtnetlink_links[PF_UNSPEC];
+       link_p = rtnetlink_links[PF_UNSPEC];
 
        /* Setup rtnetlink links. It is made here to avoid
           exporting large number of public symbols.
         */
 
        if (link_p) {
-               link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_ctl_qdisc;
-               link_p[RTM_DELQDISC-RTM_BASE].doit = tc_ctl_qdisc;
-               link_p[RTM_GETQDISC-RTM_BASE].doit = tc_ctl_qdisc;
+               link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
+               link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
+               link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
                link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
                link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
                link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
@@ -975,6+1132,12 @@ __initfunc(int pktsched_init(void))
 #ifdef CONFIG_NET_SCH_RED
        INIT_QDISC(red);
 #endif
+#ifdef CONFIG_NET_SCH_GRED
+       INIT_QDISC(gred);
+#endif
+#ifdef CONFIG_NET_SCH_DSMARK
+       INIT_QDISC(dsmark);
+#endif
 #ifdef CONFIG_NET_SCH_SFQ
        INIT_QDISC(sfq);
 #endif
index 9ae14c2..052ea8e 100644 (file)
 #include <net/sock.h>
 #include <net/pkt_sched.h>
 
+
 /*     Class-Based Queueing (CBQ) algorithm.
        =======================================
 
@@ -169,6+170,9 @@ struct cbq_sched_data
        struct cbq_class        *active[TC_CBQ_MAXPRIO+1];      /* List of all classes
                                                                   with backlog */
 
+#ifdef CONFIG_NET_CLS_POLICE
+       struct cbq_class        *rx_class;
+#endif
        struct cbq_class        *tx_class;
        struct cbq_class        *tx_borrowed;
        int                     tx_len;
@@ -269,17+273,21 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch)
                        else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
                                cl = defmap[TC_PRIO_BESTEFFORT];
 
-                       if (cl == NULL)
+                       if (cl == NULL || cl->level >= head->level)
                                goto fallback;
                }
 
-               if (cl->level == 0) {
 #ifdef CONFIG_NET_CLS_POLICE
-                       if (result)
-                               return cbq_reclassify(skb, cl);
+               switch (result) {
+               case TC_POLICE_RECLASSIFY:
+                       return cbq_reclassify(skb, cl);
+               case TC_POLICE_SHOT:
+                       return NULL;
+               default:
+                       break;
+               }
 #endif
+               if (cl->level == 0)
                        return cl;
-               }
 
                /*
                 * Step 3+n. If classifier selected a link sharing class,
@@ -321,11+329,9 @@ static __inline__ void cbq_activate_class(struct cbq_class *cl)
        if (cl_tail != NULL) {
                cl->next_alive = cl_tail->next_alive;
                cl_tail->next_alive = cl;
-               cl->deficit = 0;
        } else {
                cl->next_alive = cl;
                q->activemask |= (1<<prio);
-               cl->deficit = cl->quantum;
        }
 }
 
@@ -358,31+364,28 @@ static void cbq_deactivate_class(struct cbq_class *this)
                        }
 
                        cl = cl_prev->next_alive;
-                       cl->deficit += cl->quantum;
                        return;
                }
        } while ((cl_prev = cl) != q->active[prio]);
 }
 
-static __inline__ void
+static void
 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
 {
-       if (q->toplevel > 0) {
+       int toplevel = q->toplevel;
+
+       if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
                psched_time_t now;
                PSCHED_GET_TIME(now);
                if (PSCHED_TLESS(now, q->now))
                        now = q->now;
-               if (PSCHED_TLESS(cl->undertime, now)) {
-                       q->toplevel = 0;
-                       return;
-               }
-               while ((cl = cl->borrow) != NULL
-                      && q->toplevel > cl->level) {
-                       if (PSCHED_TLESS(cl->borrow->undertime, now)) {
+
+               do {
+                       if (PSCHED_TLESS(cl->undertime, now)) {
                                q->toplevel = cl->level;
                                return;
                        }
-               }
+               } while ((cl=cl->borrow) != NULL && toplevel > cl->level);
        }
 }
 
@@ -393,23+396,31 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
        struct cbq_class *cl = cbq_classify(skb, sch);
        int len = skb->len;
 
-       if (cl && cl->q->enqueue(skb, cl->q) == 1) {
-               sch->q.qlen++;
-               sch->stats.packets++;
-               cl->stats.packets++;
-               sch->stats.bytes+=len;
-               cl->stats.bytes+=len;
-               cbq_mark_toplevel(q, cl);
-               if (!cl->next_alive)
-                       cbq_activate_class(cl);
-               return 1;
+#ifdef CONFIG_NET_CLS_POLICE
+       q->rx_class = cl;
+#endif
+       if (cl) {
+#ifdef CONFIG_NET_CLS_POLICE
+               cl->q->__parent = sch;
+#endif
+               if (cl->q->enqueue(skb, cl->q) == 1) {
+                       sch->q.qlen++;
+                       sch->stats.packets++;
+                       sch->stats.bytes+=len;
+                       cbq_mark_toplevel(q, cl);
+                       if (!cl->next_alive)
+                               cbq_activate_class(cl);
+                       return 1;
+               }
        }
 
        sch->stats.drops++;
        if (cl == NULL)
                kfree_skb(skb);
-       else
+       else {
+               cbq_mark_toplevel(q, cl);
                cl->stats.drops++;
+       }
        return 0;
 }
 
@@ -426,9+437,14 @@ cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
        }
        q->tx_class = NULL;
 
+       cbq_mark_toplevel(q, cl);
+
+#ifdef CONFIG_NET_CLS_POLICE
+       q->rx_class = cl;
+       cl->q->__parent = sch;
+#endif
        if (cl->q->ops->requeue(skb, cl->q) == 1) {
                sch->q.qlen++;
-               cbq_mark_toplevel(q, cl);
                if (!cl->next_alive)
                        cbq_activate_class(cl);
                return 1;
@@ -445,11+461,9 @@ cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
 static void cbq_ovl_classic(struct cbq_class *cl)
 {
        struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data;
+       psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
 
        if (!cl->delayed) {
-               psched_tdiff_t delay;
-
-               delay = PSCHED_TDIFF(cl->undertime, q->now);
                delay += cl->offtime;
 
                /* 
@@ -463,15+477,35 @@ static void cbq_ovl_classic(struct cbq_class *cl)
                        delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
                if (cl->avgidle < cl->minidle)
                        cl->avgidle = cl->minidle;
-               if (delay < 0)
-                       delay = 0;
+               if (delay <= 0)
+                       delay = 1;
                PSCHED_TADD2(q->now, delay, cl->undertime);
 
-               if (q->wd_expires == 0 || q->wd_expires > delay)
-                       q->wd_expires = delay;
                cl->xstats.overactions++;
                cl->delayed = 1;
        }
+       if (q->wd_expires == 0 || q->wd_expires > delay)
+               q->wd_expires = delay;
+
+       /* Dirty work! We must schedule wakeups based on the
+          real available rate, rather than the leaf rate,
+          which may be tiny (even zero).
+        */
+       if (q->toplevel == TC_CBQ_MAXLEVEL) {
+               struct cbq_class *b;
+               psched_tdiff_t base_delay = q->wd_expires;
+
+               for (b = cl->borrow; b; b = b->borrow) {
+                       delay = PSCHED_TDIFF(b->undertime, q->now);
+                       if (delay < base_delay) {
+                               if (delay <= 0)
+                                       delay = 1;
+                               base_delay = delay;
+                       }
+               }
+
+               q->wd_expires = delay;
+       }
 }
 
 /* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when
@@ -481,15+515,18 @@ static void cbq_ovl_classic(struct cbq_class *cl)
 static void cbq_ovl_rclassic(struct cbq_class *cl)
 {
        struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data;
+       struct cbq_class *this = cl;
 
-       while (cl && cl->delayed) {
-               cl = cl->borrow;
-               if (cl->level > q->toplevel)
-                       return;
-       }
+       do {
+               if (cl->level > q->toplevel) {
+                       cl = NULL;
+                       break;
+               }
+       } while ((cl = cl->borrow) != NULL);
 
-       if (cl)
-               cbq_ovl_classic(cl);
+       if (cl == NULL)
+               cl = this;
+       cbq_ovl_classic(cl);
 }
 
 /* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */
@@ -497,12+534,11 @@ static void cbq_ovl_rclassic(struct cbq_class *cl)
 static void cbq_ovl_delay(struct cbq_class *cl)
 {
        struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data;
+       psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
 
        if (!cl->delayed) {
-               psched_tdiff_t delay;
                unsigned long sched = jiffies;
 
-               delay = PSCHED_TDIFF(cl->undertime, q->now);
                delay += cl->offtime;
                if (cl->avgidle < 0)
                        delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
@@ -521,8+557,12 @@ static void cbq_ovl_delay(struct cbq_class *cl)
                        add_timer(&q->delay_timer);
                        cl->delayed = 1;
                        cl->xstats.overactions++;
+                       return;
                }
+               delay = 1;
        }
+       if (q->wd_expires == 0 || q->wd_expires > delay)
+               q->wd_expires = delay;
 }
 
 /* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
@@ -555,6+595,7 @@ static void cbq_ovl_drop(struct cbq_class *cl)
 static void cbq_watchdog(unsigned long arg)
 {
        struct Qdisc *sch = (struct Qdisc*)arg;
+       sch->flags &= ~TCQ_F_THROTTLED;
        qdisc_wakeup(sch->dev);
 }
 
@@ -622,6+663,7 @@ static void cbq_undelay(unsigned long arg)
                add_timer(&q->delay_timer);
        }
 
+       sch->flags &= ~TCQ_F_THROTTLED;
        qdisc_wakeup(sch->dev);
 }
 
@@ -631,18+673,23 @@ static void cbq_undelay(unsigned long arg)
 static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
 {
        int len = skb->len;
-       struct Qdisc *sch = child->parent;
+       struct Qdisc *sch = child->__parent;
        struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
-       struct cbq_class *cl = cbq_class_lookup(q, child->classid);
+       struct cbq_class *cl = q->rx_class;
+
+       q->rx_class = NULL;
 
        if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
+
+               cbq_mark_toplevel(q, cl);
+
+               q->rx_class = cl;
+               cl->q->__parent = sch;
+
                if (cl->q->enqueue(skb, cl->q) == 1) {
                        sch->q.qlen++;
                        sch->stats.packets++;
-                       cl->stats.packets++;
                        sch->stats.bytes+=len;
-                       cl->stats.bytes+=len;
-                       cbq_mark_toplevel(q, cl);
                        if (!cl->next_alive)
                                cbq_activate_class(cl);
                        return 0;
@@ -656,21+703,42 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
 }
 #endif
 
+/*
+   This is a mission-critical procedure.
+
+   We "regenerate" the toplevel cutoff if the transmitting class
+   has backlog and is not being regulated. This is not part of the
+   original CBQ description, but it looks more reasonable.
+   It may well be wrong; the question needs further investigation.
+*/
+
 static __inline__ void
-cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
+cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
+                   struct cbq_class *borrowed)
 {
-       if (cl && q->toplevel >= cl->level) {
-               if (cl->q->q.qlen <= 1 || PSCHED_TLESS(q->now, cl->undertime))
-                       q->toplevel = TC_CBQ_MAXLEVEL;
-               else /* BUGGGG? if (cl != this) */
-                       q->toplevel = cl->level;
+       if (cl && q->toplevel >= borrowed->level) {
+               if (cl->q->q.qlen > 1) {
+                       do {
+                               if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) {
+                                       q->toplevel = borrowed->level;
+                                       return;
+                               }
+                       } while ((borrowed=borrowed->borrow) != NULL);
+               }
+#if 0  
+       /* This is not necessary now. Uncommenting it
+          would save CPU cycles but decrease fairness.
+        */
+               q->toplevel = TC_CBQ_MAXLEVEL;
+#endif
        }
 }
 
-static __inline__ void
+static void
 cbq_update(struct cbq_sched_data *q)
 {
-       struct cbq_class *cl = q->tx_class;
+       struct cbq_class *this = q->tx_class;
+       struct cbq_class *cl = this;
        int len = q->tx_len;
 
        q->tx_class = NULL;
@@ -679,6+747,9 @@ cbq_update(struct cbq_sched_data *q)
                long avgidle = cl->avgidle;
                long idle;
 
+               cl->stats.packets++;
+               cl->stats.bytes += len;
+
                /*
                   (now - last) is total time between packet right edges.
                   (last_pktlen/rate) is "virtual" busy time, so that
@@ -697,6+768,10 @@ cbq_update(struct cbq_sched_data *q)
 
                if (avgidle <= 0) {
                        /* Overlimit or at-limit */
+
+                       if (avgidle < cl->minidle)
+                               avgidle = cl->minidle;
+
                        cl->avgidle = avgidle;
 
                        /* Calculate expected time, when this class
@@ -732,12+807,11 @@ cbq_update(struct cbq_sched_data *q)
                                cl->avgidle = cl->maxidle;
                        else
                                cl->avgidle = avgidle;
-
                }
                cl->last = q->now;
        }
 
-       cbq_update_toplevel(q, q->tx_borrowed);
+       cbq_update_toplevel(q, this, q->tx_borrowed);
 }
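
The heart of the estimator trimmed around here is the classic CBQ average
idle: assuming the shift-based form this code traditionally uses, with
w = 2^-ewma_log the true update is avg <- (1-w)*avg + w*idle, but cl->avgidle
stores avg scaled by 1/w, which collapses the whole update to one shift.  A
sketch (ewma_update() is an illustrative name):

	static long ewma_update(long avgidle, long idle, int ewma_log,
				long minidle)
	{
		/* scaled EWMA: avgidle == true_avg / w */
		avgidle += idle - (avgidle >> ewma_log);
		if (avgidle < minidle)	/* negative avgidle == overlimit */
			avgidle = minidle;
		return avgidle;
	}
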
 
 static __inline__ struct cbq_class *
@@ -750,21+824,33 @@ cbq_under_limit(struct cbq_class *cl)
                return cl;
 
        if (PSCHED_IS_PASTPERFECT(cl->undertime) ||
-           PSCHED_TLESS(cl->undertime, q->now)) {
+           !PSCHED_TLESS(q->now, cl->undertime)) {
                cl->delayed = 0;
                return cl;
        }
 
-       while (!PSCHED_IS_PASTPERFECT(cl->undertime) &&
-              PSCHED_TLESS(q->now, cl->undertime)) {
-               if ((cl = cl->borrow) == NULL || cl->level > q->toplevel) {
+       do {
+               /* This is a very suspicious place. Currently the
+                  overlimit action is generated for unbounded classes
+                  only when the link is completely congested.
+                  Though this agrees with the ancestor-only paradigm,
+                  it looks very dubious: it means that this chunk of
+                  code will either never be called or will strongly
+                  amplify burstiness. Dangerous, silly, and yet no
+                  better solution exists.
+                */
+               if ((cl = cl->borrow) == NULL) {
                        this_cl->stats.overlimits++;
                        this_cl->overlimit(this_cl);
                        return NULL;
                }
-       }
-       this_cl->xstats.borrows++;
-       cl->xstats.borrows++;
+               if (cl->level > q->toplevel)
+                       return NULL;
+       } while (!PSCHED_IS_PASTPERFECT(cl->undertime) &&
+                PSCHED_TLESS(q->now, cl->undertime));
+
+       cl->delayed = 0;
        return cl;
 }
 
@@ -784,27+870,26 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)
 
                /* Start round */
                do {
-                       struct cbq_class *borrow;
+                       struct cbq_class *borrow = NULL;
 
-                       /* Class is empty */
-                       if (cl->q->q.qlen == 0)
-                               goto skip_class;
-
-                       if ((borrow = cbq_under_limit(cl)) == NULL)
+                       if (cl->q->q.qlen &&
+                           (borrow = cbq_under_limit(cl)) == NULL)
                                goto skip_class;
 
                        if (cl->deficit <= 0) {
-                               /* Class exhausted its allotment per this
-                                  round.
+                               /* Class exhausted its allotment per
+                                  this round. Switch to the next one.
                                 */
                                deficit = 1;
+                               cl->deficit += cl->quantum;
                                goto next_class;
                        }
 
                        skb = cl->q->dequeue(cl->q);
 
                        /* Class did not give us any skb :-(
-                          It could occur if cl->q == "tbf"
+                          It could occur even if cl->q->q.qlen != 0,
+                          e.g. if cl->q == "tbf"
                         */
                        if (skb == NULL)
                                goto skip_class;
@@ -812,6+897,15 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)
                        cl->deficit -= skb->len;
                        q->tx_class = cl;
                        q->tx_borrowed = borrow;
+                       if (borrow != cl) {
+#ifndef CBQ_XSTATS_BORROWS_BYTES
+                               borrow->xstats.borrows++;
+                               cl->xstats.borrows++;
+#else
+                               borrow->xstats.borrows += skb->len;
+                               cl->xstats.borrows += skb->len;
+#endif
+                       }
                        q->tx_len = skb->len;
 
                        if (cl->deficit <= 0) {
@@ -822,8+916,6 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)
                        return skb;
 
 skip_class:
-                       cl->deficit = 0;
-
                        if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
                                /* Class is empty or penalized.
                                   Unlink it from active chain.
@@ -857,7+949,6 @@ skip_class:
 next_class:
                        cl_prev = cl;
                        cl = cl->next_alive;
-                       cl->deficit += cl->quantum;
                } while (cl_prev != cl_tail);
        } while (deficit);
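
The deficit bookkeeping rearranged above is weighted round-robin in the DRR
style: a class may keep the link while its deficit is positive, and quantum
is added only once the deficit runs out.  A user-space toy of the same
accounting with two classes and fixed-size packets (all names hypothetical):

	#include <stdio.h>

	int main(void)
	{
		long deficit[2] = { 0, 0 };
		long quantum[2] = { 1500, 3000 };  /* class 1 gets 2x share */
		long sent[2] = { 0, 0 };
		int pkt = 1000, i, cl;

		for (i = 0; i < 3000; i++) {
			cl = i & 1;
			while (deficit[cl] > 0) {  /* send while in credit */
				deficit[cl] -= pkt;
				sent[cl] += pkt;
			}
			deficit[cl] += quantum[cl];  /* top up, next class */
		}
		printf("%ld %ld\n", sent[0], sent[1]);  /* converges to 1:2 */
		return 0;
	}
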
 
@@ -914,6+1005,7 @@ cbq_dequeue(struct Qdisc *sch)
                skb = cbq_dequeue_1(sch);
                if (skb) {
                        sch->q.qlen--;
+                       sch->flags &= ~TCQ_F_THROTTLED;
                        return skb;
                }
 
@@ -955,6+1047,7 @@ cbq_dequeue(struct Qdisc *sch)
                                delay = 1;
                        q->wd_timer.expires = jiffies + delay;
                        add_timer(&q->wd_timer);
+                       sch->flags |= TCQ_F_THROTTLED;
                }
        }
        return NULL;
@@ -1129,14+1222,18 @@ static void cbq_link_class(struct cbq_class *this)
 static int cbq_drop(struct Qdisc* sch)
 {
        struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
-       struct cbq_class *cl;
-       int h;
+       struct cbq_class *cl, *cl_head;
+       int prio;
 
-       for (h = TC_CBQ_MAXPRIO; h >= 0; h++) {
-               for (cl = q->classes[h]; cl; cl = cl->next) {
+       for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
+               if ((cl_head = q->active[prio]) == NULL)
+                       continue;
+
+               cl = cl_head;
+               do {
                        if (cl->q->ops->drop && cl->q->ops->drop(cl->q))
                                return 1;
-               }
+               } while ((cl = cl->next_alive) != cl_head);
        }
        return 0;
 }
@@ -1166,8+1263,8 @@ cbq_reset(struct Qdisc* sch)
 
                        cl->next_alive = NULL;
                        PSCHED_SET_PASTPERFECT(cl->undertime);
-                       cl->avgidle = 0;
-                       cl->deficit = 0;
+                       cl->avgidle = cl->maxidle;
+                       cl->deficit = cl->quantum;
                        cl->cpriority = cl->priority;
                }
        }
@@ -1187,8+1284,10 @@ static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
                cl->avpkt = lss->avpkt;
        if (lss->change&TCF_CBQ_LSS_MINIDLE)
                cl->minidle = -(long)lss->minidle;
-       if (lss->change&TCF_CBQ_LSS_MAXIDLE)
+       if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
                cl->maxidle = lss->maxidle;
+               cl->avgidle = lss->maxidle;
+       }
        if (lss->change&TCF_CBQ_LSS_OFFTIME)
                cl->offtime = lss->offtime;
        return 0;
@@ -1261,7+1360,7 @@ static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
 {
        cl->police = p->police;
 
-       if (!(cl->q->flags&TCQ_F_DEFAULT)) {
+       if (cl->q->handle) {
                if (p->police == TC_POLICE_RECLASSIFY)
                        cl->q->reshape_fail = cbq_reshape_fail;
                else
@@ -1300,6+1399,7 @@ static int cbq_init(struct Qdisc *sch, struct rtattr *opt)
                return -EINVAL;
        }
 
+       q->link.refcnt = 1;
        q->link.sibling = &q->link;
        q->link.classid = sch->handle;
        q->link.qdisc = sch;
@@ -1493,6+1593,7 @@ cbq_dump_class(struct Qdisc *sch, unsigned long arg,
        else
                tcm->tcm_parent = TC_H_ROOT;
        tcm->tcm_handle = cl->classid;
+       tcm->tcm_info = cl->q->handle;
 
        rta = (struct rtattr*)b;
        RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
@@ -1533,12+1634,20 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
                }
                if ((*old = xchg(&cl->q, new)) != NULL)
                        qdisc_reset(*old);
-                       
+
                return 0;
        }
        return -ENOENT;
 }
 
+static struct Qdisc *
+cbq_leaf(struct Qdisc *sch, unsigned long arg)
+{
+       struct cbq_class *cl = (struct cbq_class*)arg;
+
+       return cl ? cl->q : NULL;
+}
+
 static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
 {
        struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
@@ -1569,6+1678,7 @@ static void cbq_destroy_class(struct cbq_class *cl)
 #ifdef CONFIG_NET_ESTIMATOR
        qdisc_kill_estimator(&cl->stats);
 #endif
+       kfree(cl);
 }
 
 static void
@@ -1578,6+1688,9 @@ cbq_destroy(struct Qdisc* sch)
        struct cbq_class *cl;
        unsigned h;
 
+#ifdef CONFIG_NET_CLS_POLICE
+       q->rx_class = NULL;
+#endif
        for (h = 0; h < 16; h++) {
                for (cl = q->classes[h]; cl; cl = cl->next)
                        cbq_destroy_filters(cl);
@@ -1590,20+1703,29 @@ cbq_destroy(struct Qdisc* sch)
        }
 
        qdisc_put_rtab(q->link.R_tab);
+       MOD_DEC_USE_COUNT;
 }
 
-static void cbq_put(struct Qdisc *q, unsigned long arg)
+static void cbq_put(struct Qdisc *sch, unsigned long arg)
 {
+       struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
        struct cbq_class *cl = (struct cbq_class*)arg;
 
-       if (--cl->refcnt == 0)
+       start_bh_atomic();
+       if (--cl->refcnt == 0) {
+#ifdef CONFIG_NET_CLS_POLICE
+               if (q->rx_class == cl)
+                       q->rx_class = NULL;
+#endif
                cbq_destroy_class(cl);
+       }
+       end_bh_atomic();
        return;
 }
 
 static int
-cbq_change(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca,
-          unsigned long *arg)
+cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca,
+                unsigned long *arg)
 {
        int err;
        struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
@@ -1763,6+1885,7 @@ cbq_change(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca,
        cl->borrow = cl->tparent;
        if (cl->tparent != &q->link)
                cl->share = cl->tparent;
+       cbq_adjust_levels(parent);
        cl->minidle = -0x7FFFFFFF;
        cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
        cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
@@ -1781,7+1904,6 @@ cbq_change(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca,
 #endif
        if (tb[TCA_CBQ_FOPT-1])
                cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
-       cbq_adjust_levels(parent);
        end_bh_atomic();
 
 #ifdef CONFIG_NET_ESTIMATOR
@@ -1810,10+1932,16 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg)
        if (cl->next_alive)
                cbq_deactivate_class(cl);
 
-       if (q->tx_class == cl)
-               q->tx_class = cl->borrow;
        if (q->tx_borrowed == cl)
                q->tx_borrowed = q->tx_class;
+       if (q->tx_class == cl) {
+               q->tx_class = NULL;
+               q->tx_borrowed = NULL;
+       }
+#ifdef CONFIG_NET_CLS_POLICE
+       if (q->rx_class == cl)
+               q->rx_class = NULL;
+#endif
 
        cbq_unlink_class(cl);
        cbq_adjust_levels(cl->tparent);
@@ -1841,12+1969,16 @@ static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
        return &cl->filter_list;
 }
 
-static unsigned long cbq_bind_filter(struct Qdisc *sch, u32 classid)
+static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
+                                    u32 classid)
 {
        struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
+       struct cbq_class *p = (struct cbq_class*)parent;
        struct cbq_class *cl = cbq_class_lookup(q, classid);
 
        if (cl) {
+               if (p && p->level <= cl->level)
+                       return 0;
                cl->filters++;
                return (unsigned long)cl;
        }
@@ -1878,7+2010,7 @@ static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
                        }
                        if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
                                arg->stop = 1;
-                               break;
+                               return;
                        }
                        arg->count++;
                }
@@ -1888,9+2020,10 @@ static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 static struct Qdisc_class_ops cbq_class_ops =
 {
        cbq_graft,
+       cbq_leaf,
        cbq_get,
        cbq_put,
-       cbq_change,
+       cbq_change_class,
        cbq_delete,
        cbq_walk,
 
@@ -1918,6+2051,7 @@ struct Qdisc_ops cbq_qdisc_ops =
        cbq_init,
        cbq_reset,
        cbq_destroy,
+       NULL /* cbq_change */,
 
 #ifdef CONFIG_RTNETLINK
        cbq_dump,
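
The cbq_put() change above deserves a note: the reference drop now happens inside a BH-atomic section, and the cached q->rx_class pointer is cleared before the class is destroyed, so a policed packet in flight cannot dereference a freed class. A minimal sketch of that pattern, with hypothetical type and function names (illustrative only, not the patch's code):

	struct example_class { int refcnt; };
	struct example_sched { struct example_class *cached; /* like q->rx_class */ };

	static void example_destroy(struct example_class *cl);

	static void example_put(struct example_sched *q, struct example_class *cl)
	{
		start_bh_atomic();		/* keep softirqs out while we check */
		if (--cl->refcnt == 0) {
			if (q->cached == cl)	/* invalidate the cached pointer */
				q->cached = NULL;
			example_destroy(cl);	/* hypothetical destructor */
		}
		end_bh_atomic();
	}
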
index 9bdc656..2202fd8 100644 (file)
@@ -826,6+826,12 @@ static int csz_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
        return -EINVAL;
 }
 
+static struct Qdisc * csz_leaf(struct Qdisc *sch, unsigned long cl)
+{
+       return NULL;
+}
+
+
 static unsigned long csz_get(struct Qdisc *sch, u32 classid)
 {
        struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
@@ -840,6+846,12 @@ static unsigned long csz_get(struct Qdisc *sch, u32 classid)
        return band+1;
 }
 
+static unsigned long csz_bind(struct Qdisc *sch, unsigned long parent, u32 classid)
+{
+       return csz_get(sch, classid);
+}
+
+
 static void csz_put(struct Qdisc *sch, unsigned long cl)
 {
        return;
@@ -1006,6+1018,8 @@ static struct tcf_proto ** csz_find_tcf(struct Qdisc *sch, unsigned long cl)
 struct Qdisc_class_ops csz_class_ops =
 {
        csz_graft,
+       csz_leaf,
+
        csz_get,
        csz_put,
        csz_change,
@@ -1013,7+1027,7 @@ struct Qdisc_class_ops csz_class_ops =
        csz_walk,
 
        csz_find_tcf,
-       csz_get,
+       csz_bind,
        csz_put,
 
 #ifdef CONFIG_RTNETLINK
@@ -1036,6+1050,7 @@ struct Qdisc_ops csz_qdisc_ops =
        csz_init,
        csz_reset,
        csz_destroy,
+       NULL /* csz_change */,
 
 #ifdef CONFIG_RTNETLINK
        csz_dump,
index 14bc8bb..c93f206 100644 (file)
@@ -97,10+97,7 @@ fifo_drop(struct Qdisc* sch)
 static void
 fifo_reset(struct Qdisc* sch)
 {
-       struct sk_buff *skb;
-
-       while ((skb=__skb_dequeue(&sch->q)) != NULL)
-               kfree_skb(skb);
+       skb_queue_purge(&sch->q);
        sch->stats.backlog = 0;
 }
 
@@ -137,15+134,15 @@ pfifo_dequeue(struct Qdisc* sch)
        return __skb_dequeue(&sch->q);
 }
 
-
 static int fifo_init(struct Qdisc *sch, struct rtattr *opt)
 {
        struct fifo_sched_data *q = (void*)sch->data;
 
        if (opt == NULL) {
-               q->limit = sch->dev->tx_queue_len;
                if (sch->ops == &bfifo_qdisc_ops)
-                       q->limit *= sch->dev->mtu;
+                       q->limit = sch->dev->tx_queue_len*sch->dev->mtu;
+               else    
+                       q->limit = sch->dev->tx_queue_len;
        } else {
                struct tc_fifo_qopt *ctl = RTA_DATA(opt);
                if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
@@ -188,6+185,8 @@ struct Qdisc_ops pfifo_qdisc_ops =
        fifo_init,
        fifo_reset,
        NULL,
+       fifo_init,
+
 #ifdef CONFIG_RTNETLINK
        fifo_dump,
 #endif
@@ -208,6+207,7 @@ struct Qdisc_ops bfifo_qdisc_ops =
        fifo_init,
        fifo_reset,
        NULL,
+       fifo_init,
 #ifdef CONFIG_RTNETLINK
        fifo_dump,
 #endif
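
The reworked fifo_init() default makes the unit of the limit explicit: bfifo accounts in bytes, pfifo in packets. A one-line model of that rule (hypothetical helper name, for illustration only):

	static unsigned int fifo_default_limit(int is_bfifo,
					       unsigned int tx_queue_len,
					       unsigned int mtu)
	{
		/* bfifo counts bytes, pfifo counts packets */
		return is_bfifo ? tx_queue_len * mtu : tx_queue_len;
	}

With a hypothetical tx_queue_len of 100 and mtu of 1500, bfifo would default to 150000 bytes and pfifo to 100 packets.
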
index 7ba2e94..ba40033 100644 (file)
@@ -184,7+184,7 @@ struct Qdisc noop_qdisc =
         { NULL }, 
        noop_enqueue,
        noop_dequeue,
-       TCQ_F_DEFAULT|TCQ_F_BUILTIN,
+       TCQ_F_BUILTIN,
        &noop_qdisc_ops,        
 };
 
@@ -207,7+207,7 @@ struct Qdisc noqueue_qdisc =
         { NULL }, 
        NULL,
        NULL,
-       TCQ_F_DEFAULT|TCQ_F_BUILTIN,
+       TCQ_F_BUILTIN,
        &noqueue_qdisc_ops,
 };
 
@@ -322,8+322,8 @@ struct Qdisc * qdisc_create_dflt(struct device *dev, struct Qdisc_ops *ops)
        sch->enqueue = ops->enqueue;
        sch->dequeue = ops->dequeue;
        sch->dev = dev;
-       sch->flags |= TCQ_F_DEFAULT;
-       if (ops->init && ops->init(sch, NULL) == 0)
+       atomic_set(&sch->refcnt, 1);
+       if (!ops->init || ops->init(sch, NULL) == 0)
                return sch;
 
        kfree(sch);
@@ -342,6+342,10 @@ void qdisc_reset(struct Qdisc *qdisc)
 void qdisc_destroy(struct Qdisc *qdisc)
 {
        struct Qdisc_ops *ops = qdisc->ops;
+
+       if (!atomic_dec_and_test(&qdisc->refcnt))
+               return;
+
 #ifdef CONFIG_NET_SCHED
        if (qdisc->dev) {
                struct Qdisc *q, **qp;
@@ -444,30+448,3 @@ void dev_shutdown(struct device *dev)
        end_bh_atomic();
 }
 
-struct Qdisc * dev_set_scheduler(struct device *dev, struct Qdisc *qdisc)
-{
-       struct Qdisc *oqdisc;
-
-       if (dev->flags & IFF_UP)
-               dev_deactivate(dev);
-
-       start_bh_atomic();
-       oqdisc = dev->qdisc_sleeping;
-
-       /* Prune old scheduler */
-       if (oqdisc)
-               qdisc_reset(oqdisc);
-
-       /* ... and graft new one */
-       if (qdisc == NULL)
-               qdisc = &noop_qdisc;
-       dev->qdisc_sleeping = qdisc;
-       dev->qdisc = &noop_qdisc;
-       end_bh_atomic();
-
-       if (dev->flags & IFF_UP)
-               dev_activate(dev);
-
-       return oqdisc;
-}
-
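
With qdisc_create_dflt() now initializing refcnt to 1 and qdisc_destroy() returning early unless the count drops to zero, a qdisc can safely have more than one owner. A sketch of the resulting lifetime rule (illustrative fragment, not from the patch):

	/* A second owner pins the qdisc before sharing it. */
	atomic_inc(&sch->refcnt);	/* second owner takes a reference */
	qdisc_destroy(sch);		/* first owner's put: 2 -> 1, not freed */
	qdisc_destroy(sch);		/* last owner's put:  1 -> 0, torn down */
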
index 5b7b39f..ca2d02d 100644 (file)
@@ -49,17+49,19 @@ static __inline__ unsigned prio_classify(struct sk_buff *skb, struct Qdisc *sch)
 {
        struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
        struct tcf_result res;
+       u32 band;
 
-       res.classid = skb->priority;
-       if (TC_H_MAJ(res.classid) != sch->handle) {
+       band = skb->priority;
+       if (TC_H_MAJ(skb->priority) != sch->handle) {
                if (!q->filter_list || tc_classify(skb, q->filter_list, &res)) {
-                       if (TC_H_MAJ(res.classid))
-                               res.classid = 0;
-                       res.classid = q->prio2band[res.classid&TC_PRIO_MAX] + 1;
+                       if (TC_H_MAJ(band))
+                               band = 0;
+                       return q->prio2band[band&TC_PRIO_MAX];
                }
+               band = res.classid;
        }
-
-       return res.classid - 1;
+       band = TC_H_MIN(band) - 1;
+       return band < q->bands ? band : q->prio2band[0];
 }
 
 static int
@@ -160,38+162,74 @@ prio_destroy(struct Qdisc* sch)
        MOD_DEC_USE_COUNT;
 }
 
+static int prio_tune(struct Qdisc *sch, struct rtattr *opt)
+{
+       struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
+       struct tc_prio_qopt *qopt = RTA_DATA(opt);
+       int i;
+
+       if (opt->rta_len < RTA_LENGTH(sizeof(*qopt)))
+               return -EINVAL;
+       if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2)
+               return -EINVAL;
+
+       for (i=0; i<=TC_PRIO_MAX; i++) {
+               if (qopt->priomap[i] >= qopt->bands)
+                       return -EINVAL;
+       }
+
+       start_bh_atomic();
+       q->bands = qopt->bands;
+       memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
+
+       for (i=q->bands; i<TCQ_PRIO_BANDS; i++) {
+               struct Qdisc *child = xchg(&q->queues[i], &noop_qdisc);
+               if (child != &noop_qdisc)
+                       qdisc_destroy(child);
+       }
+       end_bh_atomic();
+
+       for (i=0; i<=TC_PRIO_MAX; i++) {
+               int band = q->prio2band[i];
+               if (q->queues[band] == &noop_qdisc) {
+                       struct Qdisc *child;
+                       child = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
+                       if (child) {
+                               net_serialize_enter();
+                               child = xchg(&q->queues[band], child);
+                               net_serialize_leave();
+                               if (child != &noop_qdisc)
+                                       qdisc_destroy(child);
+                       }
+               }
+       }
+       return 0;
+}
+
 static int prio_init(struct Qdisc *sch, struct rtattr *opt)
 {
        static const u8 prio2band[TC_PRIO_MAX+1] =
        { 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };
        struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
-       unsigned mask = 0;
        int i;
 
+       for (i=0; i<TCQ_PRIO_BANDS; i++)
+               q->queues[i] = &noop_qdisc;
+
        if (opt == NULL) {
                q->bands = 3;
                memcpy(q->prio2band, prio2band, sizeof(prio2band));
-               mask = 7;
+               for (i=0; i<3; i++) {
+                       struct Qdisc *child;
+                       child = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
+                       if (child)
+                               q->queues[i] = child;
+               }
        } else {
-               struct tc_prio_qopt *qopt = RTA_DATA(opt);
+               int err;
 
-               if (opt->rta_len < RTA_LENGTH(sizeof(*qopt)))
-                       return -EINVAL;
-               if (qopt->bands > TCQ_PRIO_BANDS)
-                       return -EINVAL;
-               q->bands = qopt->bands;
-               for (i=0; i<=TC_PRIO_MAX; i++) {
-                       if (qopt->priomap[i] >= q->bands)
-                               return -EINVAL;
-                       q->prio2band[i] = qopt->priomap[i];
-                       mask |= (1<<qopt->priomap[i]);
-               }
-       }
-       for (i=0; i<TCQ_PRIO_BANDS; i++) {
-               if (mask&(1<<i))
-                       q->queues[i] = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
-               if (q->queues[i] == NULL)
-                       q->queues[i] = &noop_qdisc;
+               if ((err = prio_tune(sch, opt)) != 0)
+                       return err;
        }
        MOD_INC_USE_COUNT;
        return 0;
@@ -232,6+270,18 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
        return 0;
 }
 
+static struct Qdisc *
+prio_leaf(struct Qdisc *sch, unsigned long arg)
+{
+       struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
+       unsigned long band = arg - 1;
+
+       if (band >= q->bands)
+               return NULL;
+
+       return q->queues[band];
+}
+
 static unsigned long prio_get(struct Qdisc *sch, u32 classid)
 {
        struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
@@ -242,6+292,12 @@ static unsigned long prio_get(struct Qdisc *sch, u32 classid)
        return band;
 }
 
+static unsigned long prio_bind(struct Qdisc *sch, unsigned long parent, u32 classid)
+{
+       return prio_get(sch, classid);
+}
+
+
 static void prio_put(struct Qdisc *q, unsigned long cl)
 {
        return;
@@ -267,12+323,15 @@ static int prio_delete(struct Qdisc *sch, unsigned long cl)
 
 
 #ifdef CONFIG_RTNETLINK
-static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm)
+static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb,
+                          struct tcmsg *tcm)
 {
        struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
 
        if (cl - 1 > q->bands)
                return -ENOENT;
+       if (q->queues[cl-1])
+               tcm->tcm_info = q->queues[cl-1]->handle;
        return 0;
 }
 #endif
@@ -310,6+369,8 @@ static struct tcf_proto ** prio_find_tcf(struct Qdisc *sch, unsigned long cl)
 static struct Qdisc_class_ops prio_class_ops =
 {
        prio_graft,
+       prio_leaf,
+
        prio_get,
        prio_put,
        prio_change,
@@ -317,7+378,7 @@ static struct Qdisc_class_ops prio_class_ops =
        prio_walk,
 
        prio_find_tcf,
-       prio_get,
+       prio_bind,
        prio_put,
 
 #ifdef CONFIG_RTNETLINK
@@ -340,6+401,7 @@ struct Qdisc_ops prio_qdisc_ops =
        prio_init,
        prio_reset,
        prio_destroy,
+       prio_tune,
 
 #ifdef CONFIG_RTNETLINK
        prio_dump,
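
The rewritten prio_classify() treats classids as 1-based and clamps the result: a minor number outside [1, q->bands] falls back to the band named by prio2band[0]. A minimal model of that fallback (hypothetical helper, illustration only):

	static unsigned int prio_band_of(u32 minor, unsigned int bands,
					 const u8 *prio2band)
	{
		unsigned int band = minor - 1;	/* classids are 1-based */
		return band < bands ? band : prio2band[0];
	}

Note that a minor of 0 wraps to a large unsigned value, so it also takes the fallback path.
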
index eac678b..30b537b 100644 (file)
@@ -193,8+193,8 @@ red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
        }
 
        if (q->qave < q->qth_min) {
-enqueue:
                q->qcount = -1;
+enqueue:
                if (sch->stats.backlog <= q->limit) {
                        __skb_queue_tail(&sch->q, skb);
                        sch->stats.backlog += skb->len;
@@ -375,6+376,7 @@ struct Qdisc_ops red_qdisc_ops =
        red_init,
        red_reset,
        red_destroy,
+       NULL /* red_change */,
 
 #ifdef CONFIG_RTNETLINK
        red_dump,
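
Moving the enqueue label below the qcount reset means qcount = -1 now happens only while the average queue sits under qth_min; between the thresholds, sch_red.c keeps counting accepted packets and drops once ((qave - qth_min) >> Wlog) * qcount reaches the random threshold qR, so the drop probability grows linearly with the averaged excess. A standalone model of that test (illustrative, not the kernel code):

	static int red_should_drop(unsigned long qave, unsigned long qth_min,
				   unsigned int Wlog, long qcount, long qR)
	{
		long excess = (long)((qave - qth_min) >> Wlog);
		/* e.g. excess = 16, qR ~ 8192: a drop roughly every
		 * 512 accepted packets while qave holds steady */
		return excess * qcount >= qR;
	}
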
index c6f43ba..74d45fe 100644 (file)
@@ -380,6+380,27 @@ static void sfq_perturbation(unsigned long arg)
        }
 }
 
+static int sfq_change(struct Qdisc *sch, struct rtattr *opt)
+{
+       struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data;
+       struct tc_sfq_qopt *ctl = RTA_DATA(opt);
+
+       if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
+               return -EINVAL;
+
+       start_bh_atomic();
+       q->quantum = ctl->quantum ? : psched_mtu(sch->dev);
+       q->perturb_period = ctl->perturb_period*HZ;
+
+       del_timer(&q->perturb_timer);
+       if (q->perturb_period) {
+               q->perturb_timer.expires = jiffies + q->perturb_period;
+               add_timer(&q->perturb_timer);
+       }
+       end_bh_atomic();
+       return 0;
+}
+
 static int sfq_init(struct Qdisc *sch, struct rtattr *opt)
 {
        struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data;
@@ -399,24+420,15 @@ static int sfq_init(struct Qdisc *sch, struct rtattr *opt)
        q->max_depth = 0;
        q->tail = SFQ_DEPTH;
        if (opt == NULL) {
-               q->quantum = sch->dev->mtu;
+               q->quantum = psched_mtu(sch->dev);
                q->perturb_period = 0;
-               if (sch->dev->hard_header)
-                       q->quantum += sch->dev->hard_header_len;
        } else {
-               struct tc_sfq_qopt *ctl = RTA_DATA(opt);
-               if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
-                       return -EINVAL;
-               q->quantum = ctl->quantum ? : psched_mtu(sch->dev);
-               q->perturb_period = ctl->perturb_period*HZ;
-               /* The rest is compiled in */
+               int err = sfq_change(sch, opt);
+               if (err)
+                       return err;
        }
        for (i=0; i<SFQ_DEPTH; i++)
                sfq_link(q, i);
-       if (q->perturb_period) {
-               q->perturb_timer.expires = jiffies + q->perturb_period;
-               add_timer(&q->perturb_timer);
-       }
        MOD_INC_USE_COUNT;
        return 0;
 }
@@ -467,6+479,7 @@ struct Qdisc_ops sfq_qdisc_ops =
        sfq_init,
        sfq_reset,
        sfq_destroy,
+       NULL, /* sfq_change */
 
 #ifdef CONFIG_RTNETLINK
        sfq_dump,
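
The new sfq_change() converts the userspace perturbation period from seconds to jiffies and re-arms the timer while BH-atomic. The conversion and re-arm, reduced to a sketch (hypothetical helper name):

	static void perturb_rearm(struct timer_list *t, int period_sec)
	{
		del_timer(t);
		if (period_sec) {
			/* seconds -> jiffies, as ctl->perturb_period*HZ above */
			t->expires = jiffies + (unsigned long)period_sec * HZ;
			add_timer(t);
		}
	}
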
index 83d6da8..a4d13b6 100644 (file)
@@ -114,6+114,7 @@ struct tbf_sched_data
        u32             limit;          /* Maximal length of backlog: bytes */
        u32             buffer;         /* Token bucket depth/rate: MUST BE >= MTU/B */
        u32             mtu;
+       u32             max_size;
        struct qdisc_rate_table *R_tab;
        struct qdisc_rate_table *P_tab;
 
@@ -132,6+133,8 @@ tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 {
        struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
 
+       if (skb->len > q->max_size)
+               goto drop;
        __skb_queue_tail(&sch->q, skb);
        if ((sch->stats.backlog += skb->len) <= q->limit) {
                sch->stats.bytes += skb->len;
@@ -145,6+148,8 @@ tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 
        __skb_unlink(skb, &sch->q);
        sch->stats.backlog -= skb->len;
+
+drop:
        sch->stats.drops++;
 #ifdef CONFIG_NET_CLS_POLICE
        if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch))
@@ -180,6+185,7 @@ static void tbf_watchdog(unsigned long arg)
 {
        struct Qdisc *sch = (struct Qdisc*)arg;
 
+       sch->flags &= ~TCQ_F_THROTTLED;
        qdisc_wakeup(sch->dev);
 }
 
@@ -216,6+222,7 @@ tbf_dequeue(struct Qdisc* sch)
                        q->tokens = toks;
                        q->ptokens = ptoks;
                        sch->stats.backlog -= skb->len;
+                       sch->flags &= ~TCQ_F_THROTTLED;
                        return skb;
                }
 
@@ -238,10+245,11 @@ tbf_dequeue(struct Qdisc* sch)
                   Really, if we split the flow into independent
                   subflows, it would be a very good solution.
                   This is the main idea of all FQ algorithms
-                  (cf. CSZ, HPFQ, HFCS)
+                  (cf. CSZ, HPFQ, HFSC)
                 */
                __skb_queue_head(&sch->q, skb);
 
+               sch->flags |= TCQ_F_THROTTLED;
                sch->stats.overlimits++;
        }
        return NULL;
@@ -258,53+266,86 @@ tbf_reset(struct Qdisc* sch)
        PSCHED_GET_TIME(q->t_c);
        q->tokens = q->buffer;
        q->ptokens = q->mtu;
+       sch->flags &= ~TCQ_F_THROTTLED;
        del_timer(&q->wd_timer);
 }
 
-static int tbf_init(struct Qdisc* sch, struct rtattr *opt)
+static int tbf_change(struct Qdisc* sch, struct rtattr *opt)
 {
+       int err = -EINVAL;
        struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
        struct rtattr *tb[TCA_TBF_PTAB];
        struct tc_tbf_qopt *qopt;
+       struct qdisc_rate_table *rtab = NULL;
+       struct qdisc_rate_table *ptab = NULL;
+       int max_size;
 
-       MOD_INC_USE_COUNT;
-
-       if (opt == NULL ||
-           rtattr_parse(tb, TCA_TBF_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ||
+       if (rtattr_parse(tb, TCA_TBF_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ||
            tb[TCA_TBF_PARMS-1] == NULL ||
-           RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt)) {
-               MOD_DEC_USE_COUNT;
-               return -EINVAL;
-       }
+           RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt))
+               goto done;
 
        qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]);
-       q->R_tab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]);
-       if (q->R_tab == NULL) {
-               MOD_DEC_USE_COUNT;
-               return -EINVAL;
-       }
+       rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]);
+       if (rtab == NULL)
+               goto done;
 
        if (qopt->peakrate.rate) {
-               q->P_tab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_PTAB-1]);
-               if (q->P_tab == NULL) {
-                       MOD_DEC_USE_COUNT;
-                       qdisc_put_rtab(q->R_tab);
-                       return -EINVAL;
+               if (qopt->peakrate.rate > qopt->rate.rate)
+                       ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]);
+               if (ptab == NULL)
+                       goto done;
+       }
+
+       max_size = psched_mtu(sch->dev);
+       if (ptab) {
+               int n = max_size>>qopt->peakrate.cell_log;
+               while (n>0 && ptab->data[n-1] > qopt->mtu) {
+                       max_size -= (1<<qopt->peakrate.cell_log);
+                       n--;
                }
        }
+       if (rtab->data[max_size>>qopt->rate.cell_log] > qopt->buffer)
+               goto done;
 
-       PSCHED_GET_TIME(q->t_c);
-       init_timer(&q->wd_timer);
-       q->wd_timer.function = tbf_watchdog;
-       q->wd_timer.data = (unsigned long)sch;
+       start_bh_atomic();
        q->limit = qopt->limit;
        q->mtu = qopt->mtu;
-       if (q->mtu == 0)
-               q->mtu = psched_mtu(sch->dev);
+       q->max_size = max_size;
        q->buffer = qopt->buffer;
        q->tokens = q->buffer;
        q->ptokens = q->mtu;
-       return 0;
+       rtab = xchg(&q->R_tab, rtab);
+       ptab = xchg(&q->P_tab, ptab);
+       end_bh_atomic();
+       err = 0;
+done:
+       if (rtab)
+               qdisc_put_rtab(rtab);
+       if (ptab)
+               qdisc_put_rtab(ptab);
+       return err;
+}
+
+static int tbf_init(struct Qdisc* sch, struct rtattr *opt)
+{
+       int err;
+       struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
+       
+       if (opt == NULL)
+               return -EINVAL;
+       
+       MOD_INC_USE_COUNT;
+       
+       PSCHED_GET_TIME(q->t_c);
+       init_timer(&q->wd_timer);
+       q->wd_timer.function = tbf_watchdog;
+       q->wd_timer.data = (unsigned long)sch;
+       
+       if ((err = tbf_change(sch, opt)) != 0) {
+               MOD_DEC_USE_COUNT;
+       }
+       return err;
 }
 
 static void tbf_destroy(struct Qdisc *sch)
@@ -328,10+369,10 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
        unsigned char    *b = skb->tail;
        struct rtattr *rta;
        struct tc_tbf_qopt opt;
-
+       
        rta = (struct rtattr*)b;
        RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
-
+       
        opt.limit = q->limit;
        opt.rate = q->R_tab->rate;
        if (q->P_tab)
@@ -366,6+407,7 @@ struct Qdisc_ops tbf_qdisc_ops =
        tbf_init,
        tbf_reset,
        tbf_destroy,
+       tbf_change,
 
 #ifdef CONFIG_RTNETLINK
        tbf_dump,
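
tbf_change() now derives max_size, the largest packet that can ever conform, and tbf_enqueue() drops anything bigger up front. The final sanity check refuses configurations in which even one max_size packet costs more tokens than the bucket holds (rtab->data[len >> cell_log] is the token cost of a len-byte packet). A sketch of that check, with a hypothetical signature:

	static int tbf_config_ok(const unsigned long *rtab_data, int cell_log,
				 int max_size, unsigned long buffer)
	{
		/* mirrors the rtab->data[...] > qopt->buffer rejection above */
		return rtab_data[max_size >> cell_log] <= buffer;
	}

For example, assuming microsecond token units, at a hypothetical 1 Mbit/s rate a 1500-byte packet costs 1500*8 = 12000 usec of tokens; with buffer = 10000 the configuration would be rejected, since a full-size packet could never be sent.
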
index 212e6f6..66040d5 100644 (file)
@@ -444,6+444,7 @@ static struct teql_master the_master = {
        teql_qdisc_init,
        teql_reset,
        teql_destroy,
+       NULL,
 },};
 
 
index ae33770..91e438b 100644 (file)
@@ -8,7+8,7 @@
  *             as published by the Free Software Foundation; either version
  *             2 of the License, or (at your option) any later version.
  *
- * Version:    $Id: af_unix.c,v 1.73 1999/01/15 06:55:48 davem Exp $
+ * Version:    $Id: af_unix.c,v 1.74 1999/03/21 05:23:16 davem Exp $
  *
  * Fixes:
  *             Linus Torvalds  :       Assorted bug cures.
  *                                     Lots of bug fixes.
 *          Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
 *                                     by above two patches.
+ *          Andrea Arcangeli   :       If possible we block in connect(2)
+ *                                     when the max backlog of the listen
+ *                                     socket has been reached. This won't
+ *                                     break old apps and it avoids keeping
+ *                                     a huge number of sockets hashed
+ *                                     (for unix_gc() performance reasons).
+ *                                     Security fix that limits the max
+ *                                     number of socks to 2*max_files and
+ *                                     the number of skbs queueable in the
+ *                                     dgram receiver.
  *
  * Known differences from reference BSD that was tested:
  *
 
 int sysctl_unix_delete_delay = HZ;
 int sysctl_unix_destroy_delay = 10*HZ;
+int sysctl_unix_max_dgram_qlen = 10;
 
 unix_socket *unix_socket_table[UNIX_HASH_SIZE+1];
+static atomic_t unix_nr_socks = ATOMIC_INIT(0);
+static struct wait_queue * unix_ack_wqueue = NULL;
+static struct wait_queue * unix_dgram_wqueue = NULL;
 
 #define unix_sockets_unbound   (unix_socket_table[UNIX_HASH_SIZE])
 
@@ -263,6+277,8 @@ static void unix_destroy_timer(unsigned long data)
        unix_socket *sk=(unix_socket *)data;
        if(!unix_locked(sk) && atomic_read(&sk->wmem_alloc) == 0)
        {
+               atomic_dec(&unix_nr_socks);
+
                sk_free(sk);
        
                /* socket destroyed, decrement count                  */
@@ -295,6+311,11 @@ static int unix_release_sock (unix_socket *sk)
        sk->dead=1;
        sk->socket = NULL;
 
+       if (sk->state == TCP_LISTEN)
+               wake_up_interruptible(&unix_ack_wqueue);
+       if (sk->type == SOCK_DGRAM)
+               wake_up_interruptible(&unix_dgram_wqueue);
+
        skpair=unix_peer(sk);
 
        if (skpair!=NULL)
@@ -347,6+368,8 @@ static void unix_destroy_socket(unix_socket *sk)
        
        if(!unix_locked(sk) && atomic_read(&sk->wmem_alloc) == 0)
        {
+               atomic_dec(&unix_nr_socks);
+               
                sk_free(sk);
        
                /* socket destroyed, decrement count                  */
@@ -371,6+394,8 @@ static int unix_listen(struct socket *sock, int backlog)
                return -EOPNOTSUPP;             /* Only stream sockets accept */
        if (!sk->protinfo.af_unix.addr)
                return -EINVAL;                 /* No listens on an unbound socket */
+       if ((unsigned) backlog > SOMAXCONN)
+               backlog = SOMAXCONN;
        sk->max_ack_backlog=backlog;
        sk->state=TCP_LISTEN;
        sock->flags |= SO_ACCEPTCON;
@@ -388,6+413,9 @@ static struct sock * unix_create1(struct socket *sock, int stream)
 {
        struct sock *sk;
 
+       if (atomic_read(&unix_nr_socks) >= 2*max_files)
+               return NULL;
+
        MOD_INC_USE_COUNT;
        sk = sk_alloc(PF_UNIX, GFP_KERNEL, 1);
        if (!sk) {
@@ -395,6+423,8 @@ static struct sock * unix_create1(struct socket *sock, int stream)
                return NULL;
        }
 
+       atomic_inc(&unix_nr_socks);
+
        sock_init_data(sock,sk);
 
        if (stream)
@@ -673,9+703,25 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
           we will have to recheck all again in any case.
         */
 
+restart:
        /*  Find listening sock */
        other=unix_find_other(sunaddr, addr_len, sk->type, hash, &err);
 
+       if (!other)
+               return -ECONNREFUSED;
+
+       while (other->ack_backlog >= other->max_ack_backlog) {
+               unix_unlock(other);
+               if (other->dead || other->state != TCP_LISTEN)
+                       return -ECONNREFUSED;
+               if (flags & O_NONBLOCK)
+                       return -EAGAIN;
+               interruptible_sleep_on(&unix_ack_wqueue);
+               if (signal_pending(current))
+                       return -ERESTARTSYS;
+               goto restart;
+        }
+
        /* create new sock for complete connection */
        newsk = unix_create1(NULL, 1);
 
@@ -704,7+750,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 
        /* Check that listener is in valid state. */
        err = -ECONNREFUSED;
-       if (other == NULL || other->dead || other->state != TCP_LISTEN)
+       if (other->dead || other->state != TCP_LISTEN)
                goto out;
 
        err = -ENOMEM;
@@ -815,11+861,10 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
                        continue;
                }
                tsk = skb->sk;
-               sk->ack_backlog--;
+               if (sk->max_ack_backlog == sk->ack_backlog--)
+                       wake_up_interruptible(&unix_ack_wqueue);
                kfree_skb(skb);
-               if (!tsk->dead) 
-                       break;
-               unix_release_sock(tsk);
+               break;
        }
 
 
@@ -947,6+992,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, int len,
                 *      Check with 1003.1g - what should
                 *      datagram error
                 */
+       dead:
                unix_unlock(other);
                unix_peer(sk)=NULL;
                other = NULL;
@@ -964,6+1010,29 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, int len,
                        goto out_unlock;
        }
 
+       while (skb_queue_len(&other->receive_queue) >=
+              sysctl_unix_max_dgram_qlen)
+       {
+               if (sock->file->f_flags & O_NONBLOCK)
+               {
+                       err = -EAGAIN;
+                       goto out_unlock;
+               }
+               interruptible_sleep_on(&unix_dgram_wqueue);
+               if (other->dead)
+                       goto dead;
+               if (sk->shutdown & SEND_SHUTDOWN)
+               {
+                       err = -EPIPE;
+                       goto out_unlock;
+               }
+               if (signal_pending(current))
+               {
+                       err = -ERESTARTSYS;
+                       goto out_unlock;
+               }
+       }
+
        skb_queue_tail(&other->receive_queue, skb);
        other->data_ready(other,len);
        
@@ -1126,6+1195,13 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, int size,
        if (!skb)
                goto out;
 
+       /*
+        * sysctl_unix_max_dgram_qlen may have changed while we were
+        * blocked in the waitqueue, so we must wake up waiters every
+        * time we shrink the receive queue. -arca
+        */
+       wake_up_interruptible(&unix_dgram_wqueue);
+
        if (msg->msg_name)
        {
                msg->msg_namelen = sizeof(short);
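
Seen from userspace, the connect-side backlog change means a SOCK_STREAM connect(2) against a full listen queue now blocks until accept(2) frees a slot, or fails with EAGAIN under O_NONBLOCK, instead of failing immediately. A hedged sketch of a client that copes with both (illustrative only):

	#include <errno.h>
	#include <sys/socket.h>
	#include <sys/un.h>
	#include <unistd.h>

	static int connect_retry(int fd, const struct sockaddr_un *sun,
				 socklen_t len)
	{
		for (;;) {
			if (connect(fd, (const struct sockaddr *)sun, len) == 0)
				return 0;
			if (errno != EAGAIN)
				return -1;	/* real refusal or error */
			usleep(10000);		/* hypothetical backoff */
		}
	}
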
index d492e8e..2f06a36 100644 (file)
 
 extern int sysctl_unix_destroy_delay;
 extern int sysctl_unix_delete_delay;
+extern int sysctl_unix_max_dgram_qlen;
 
 ctl_table unix_table[] = {
        {NET_UNIX_DESTROY_DELAY, "destroy_delay",
@@ -27,6+28,9 @@ ctl_table unix_table[] = {
        {NET_UNIX_DELETE_DELAY, "delete_delay",
        &sysctl_unix_delete_delay, sizeof(int), 0644, NULL, 
         &proc_dointvec_jiffies},
+       {NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen",
+       &sysctl_unix_max_dgram_qlen, sizeof(int), 0600, NULL, 
         &proc_dointvec},
        {0}
 };
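
The new entry surfaces as /proc/sys/net/unix/max_dgram_qlen alongside the existing delay knobs; note the 0600 mode, so only root can read or tune it. A minimal userspace reader, for illustration:

	#include <stdio.h>

	int main(void)
	{
		int qlen;
		FILE *f = fopen("/proc/sys/net/unix/max_dgram_qlen", "r");

		if (f && fscanf(f, "%d", &qlen) == 1)
			printf("max_dgram_qlen = %d\n", qlen);
		if (f)
			fclose(f);
		return 0;
	}
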
 