@@ -2251,6+2251,12 @@ CONFIG_IP_ROUTE_TOS you say Y here, you will be able to specify different routes for
packets with different TOS values.
+IP: use FWMARK value as routing key
+CONFIG_IP_ROUTE_FWMARK
+ If you say Y here, you will be able to specify different routes for
+ packets with different FWMARK ("firewalling mark") values
+ (see ipchains(8), "-m" argument).
+
IP: verbose route monitoring
CONFIG_IP_ROUTE_VERBOSE
If you say Y here, which is recommended, then the kernel will print
@@ -5103,7+5109,7 @@ CONFIG_NET_FASTROUTE *** networking options: especially CONFIG*FIREWALL. ***
However, it will work with all options in CONFIG_IP_ADVANCED_ROUTER
- section (except for CONFIG_IP_ROUTE_TOS). At the moment, few devices
+  section (except for CONFIG_IP_ROUTE_TOS and CONFIG_IP_ROUTE_FWMARK). At the moment, few devices
support fast switching (tulip is one of them, modified 8390 can be
found at ftp://ftp.inr.ac.ru/ip-routing/fastroute-8390.tar.gz).
\author{David van Leeuwen\\{\normalsize\tt david@ElseWare.cistron.nl}
\\{\footnotesize updated by Erik Andersen {\tt(andersee@debian.org)}}
\\{\footnotesize updated by Jens Axboe {\tt(axboe@image.dk)}}}
-\date{11 January 1999}
+\date{12 March 1999}
\maketitle
@@ -549,7+549,9 @@ non-supported $ioctl$s are: {\it CDROMREADMODE1, CDROMREADMODE2, CDROMREADAUDIO, CDROMREADRAW, CDROMREADCOOKED, CDROMSEEK,
CDROMPLAY\-BLK and CDROM\-READALL}.
+
\subsection{\cdrom\ capabilities}
+\label{capability}
Instead of just implementing some $ioctl$ calls, the interface in
\cdromc\ supplies the possibility to indicate the {\em capabilities\/}
@@ -944,6+946,13 @@ the current flags. \item[CDROM_CHANGER_NSLOTS] Returns the number of slots in a
juke-box.
\item[CDROMRESET] Reset the drive.
+\item[CDROM_GET_CAPABILITY] Returns the $capability$ flags for the
+ drive. Refer to section \ref{capability} for more information on
+ these flags.
+\item[CDROM_LOCKDOOR] Locks the door of the drive. $arg == \rm0$
+ unlocks the door, any other value locks it.
+\item[CDROM_DEBUG] Turns on debugging info. Only root is allowed
+ to do this. Same semantics as CDROM_LOCKDOOR.
\end{description}
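
For illustration, here is a minimal user-space sketch (not part of the
interface documentation proper; it assumes a device node such as
{\tt /dev/cdrom} and the constants from {\tt <linux/cdrom.h>}) that
exercises the new $ioctl$s:
\begin{verbatim}
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/cdrom.h>

int main(void)
{
        int caps;
        int fd = open("/dev/cdrom", O_RDONLY | O_NONBLOCK);

        if (fd < 0)
                return 1;
        /* capability flag word of the drive */
        caps = ioctl(fd, CDROM_GET_CAPABILITY, 0);
        if (caps >= 0 && (caps & CDC_LOCK))
                ioctl(fd, CDROM_LOCKDOOR, 1);   /* non-zero locks the door */
        ioctl(fd, CDROM_LOCKDOOR, 0);           /* 0 unlocks it again */
        close(fd);
        return 0;
}
\end{verbatim}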
\subsubsection{Device dependent $ioctl$s}
@@ -151,6+151,28 @@ Command line: /etc/fstab entry:
/dev/sdb5 /amiga/Workbench affs noauto,user,exec,verbose 0 0
+IMPORTANT NOTE
+==============
+
+If you boot Windows 95 (don't know about 3.x, 98 and NT) while you
+have an Amiga harddisk connected to your PC, it will overwrite
+the bytes 0x00dc..0x00df of block 0 with garbage, thus invalidating
+the Rigid Disk Block. Sheer luck has it that this is an unused
+area of the RDB, so only the checksum doesn't match anymore.
+Linux will ignore this garbage and recognize the RDB anyway, but
+before you connect that drive to your Amiga again, you must
+restore or repair your RDB. So please do make a backup copy of it
+before booting Windows!
+
+If the damage is already done, the following should fix the RDB
+(where <disk> is the device name).
+DO AT YOUR OWN RISK:
+
+ dd if=/dev/<disk> of=rdb.tmp count=1
+ cp rdb.tmp rdb.fixed
+ dd if=/dev/zero of=rdb.fixed bs=1 seek=220 count=4
+ dd if=rdb.fixed of=/dev/<disk>
+
Bugs, Restrictions, Caveats
===========================
@@ -185,9+207,8 @@ system crashes while an affs partition is mounted. There's currently no way to fix a garbled filesystem without an Amiga (disk validator)
or manually (who would do this?). Maybe later.
-A fsck.affs and mkfs.affs will probably be available in the future.
-If you mount them on system startup, you may want to tell fsck
-that the fs should not be checked (place a '0' in the sixth field
+If you mount affs partitions on system startup, you may want to tell
+fsck that the fs should not be checked (place a '0' in the sixth field
of /etc/fstab).
It's not possible to read floppy disks with a normal PC or workstation
@@ -997,15+997,15 @@ static void __init pcibios_fixup_peer_bridges(void) l != 0x0000 && l != 0xffff) {
#ifdef CONFIG_PCI_BIOS
if (pci_bios_present) {
- int succ, idx = 0;
+ int err, idx = 0;
u8 bios_bus, bios_dfn;
u16 d;
pcibios_read_config_word(n, i, PCI_DEVICE_ID, &d);
DBG("BIOS test for %02x:%02x (%04x:%04x)\n", n, i, l, d);
- while ((succ = pci_bios_find_device(l, d, idx, &bios_bus, &bios_dfn)) &&
+ while (!(err = pci_bios_find_device(l, d, idx, &bios_bus, &bios_dfn)) &&
(bios_bus != n || bios_dfn != i))
idx++;
- if (!succ)
+ if (err)
break;
}
#endif
@@ -210,6+210,10 @@ CONFIG_HAPPYMEAL=m CONFIG_SUNBMAC=m
CONFIG_SUNQE=m
CONFIG_MYRI_SBUS=m
+
+#
+# Unix98 PTY support
+#
CONFIG_UNIX98_PTYS=y
CONFIG_UNIX98_PTY_COUNT=256
-/* $Id: process.c,v 1.131 1999/01/19 07:54:33 davem Exp $
+/* $Id: process.c,v 1.132 1999/03/22 02:12:13 davem Exp $
* linux/arch/sparc/kernel/process.c
*
* Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
@@ -442,10+442,17 @@ clone_stackframe(struct sparc_stackf *dst, struct sparc_stackf *src) size = ((unsigned long)src->fp) - ((unsigned long)src);
sp = (struct sparc_stackf *)(((unsigned long)dst) - size);
+ /* do_fork() grabs the parent semaphore, we must release it
+ * temporarily so we can build the child clone stack frame
+ * without deadlocking.
+ */
+ up(&current->mm->mmap_sem);
if (copy_to_user(sp, src, size))
- return 0;
- if (put_user(dst, &sp->fp))
- return 0;
+ sp = (struct sparc_stackf *) 0;
+ else if (put_user(dst, &sp->fp))
+ sp = (struct sparc_stackf *) 0;
+ down(&current->mm->mmap_sem);
+
return sp;
}
-/* $Id: sparc_ksyms.c,v 1.76 1999/01/29 02:06:54 davem Exp $
+/* $Id: sparc_ksyms.c,v 1.77 1999/03/21 06:37:43 davem Exp $
* arch/sparc/kernel/ksyms.c: Sparc specific ksyms support.
*
* Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
@@ -66,6+66,7 @@ extern char saved_command_line[];
extern void bcopy (const char *, char *, int);
extern int __ashrdi3(int, int);
+extern int __lshrdi3(int, int);
extern void dump_thread(struct pt_regs *, struct user *);
@@ -271,6+272,7 @@ EXPORT_SYMBOL_NOVERS(memcpy); EXPORT_SYMBOL_NOVERS(memset);
EXPORT_SYMBOL_NOVERS(memmove);
EXPORT_SYMBOL_NOVERS(__ashrdi3);
+EXPORT_SYMBOL_NOVERS(__lshrdi3);
EXPORT_SYMBOL_DOT(rem);
EXPORT_SYMBOL_DOT(urem);
-/* $Id: sys_sparc.c,v 1.50 1999/01/07 19:06:57 jj Exp $
+/* $Id: sys_sparc.c,v 1.51 1999/03/20 22:02:00 davem Exp $
* linux/arch/sparc/kernel/sys_sparc.c
*
* This file contains various random system calls that
@@ -191,6+191,7 @@ asmlinkage unsigned long sys_mmap(unsigned long addr, unsigned long len, goto out;
}
retval = -ENOMEM;
+ len = PAGE_ALIGN(len);
if(!(flags & MAP_FIXED) && !addr) {
addr = get_unmapped_area(addr, len);
if(!addr)
@@ -204,6+205,7 @@ asmlinkage unsigned long sys_mmap(unsigned long addr, unsigned long len,
if(ARCH_SUN4C_SUN4) {
if(((addr >= 0x20000000) && (addr < 0xe0000000))) {
+ /* VM hole */
retval = current->mm->brk;
goto out_putf;
}
-/* $Id: systbls.S,v 1.81 1999/03/12 13:30:15 jj Exp $
+/* $Id: systbls.S,v 1.82 1999/03/20 22:01:59 davem Exp $
* systbls.S: System call entry point tables for OS compatibility.
* The native Linux system call table lives here also.
*
@@ -97,7+97,7 @@ sunos_sys_table: .long sunos_nosys, sys_symlink, sys_readlink
.long sys_execve, sys_umask, sys_chroot
.long sys_newfstat, sunos_nosys, sys_getpagesize
- .long sys_msync, sys_fork, sunos_nosys
+ .long sys_msync, sys_vfork, sunos_nosys
.long sunos_nosys, sunos_sbrk, sunos_sstk
.long sunos_mmap, sunos_vadvise, sys_munmap
.long sys_mprotect, sunos_madvise, sys_vhangup
-# $Id: Makefile,v 1.27 1999/01/02 16:45:45 davem Exp $
+# $Id: Makefile,v 1.28 1999/03/21 06:37:44 davem Exp $
# Makefile for Sparc library files..
#
OBJS = mul.o rem.o sdiv.o udiv.o umul.o urem.o ashrdi3.o memcpy.o memset.o \
strlen.o checksum.o blockops.o memscan.o memcmp.o strncmp.o \
strncpy_from_user.o divdi3.o udivdi3.o strlen_user.o \
- copy_user.o locks.o atomic.o bitops.o debuglocks.o
+ copy_user.o locks.o atomic.o bitops.o debuglocks.o lshrdi3.o
ifdef CONFIG_SMP
OBJS += irqlock.o
@@ -89,6+89,9 @@ urem.o: urem.S ashrdi3.o: ashrdi3.S
$(CC) -D__ASSEMBLY__ -c -o ashrdi3.o ashrdi3.S
+lshrdi3.o: lshrdi3.S
+ $(CC) -D__ASSEMBLY__ -c -o lshrdi3.o lshrdi3.S
+
dep:
include $(TOPDIR)/Rules.make
--- /dev/null
+/* $Id: lshrdi3.S,v 1.1 1999/03/21 06:37:45 davem Exp $ */
+
+#include <asm/cprefix.h>
+
+ .globl C_LABEL(__lshrdi3)
+C_LABEL(__lshrdi3):
+ cmp %o2, 0
+ be 3f
+ mov 0x20, %g2
+
+ sub %g2, %o2, %g2
+ cmp %g2, 0
+ bg 1f
+ srl %o0, %o2, %o4
+
+ clr %o4
+ neg %g2
+ b 2f
+ srl %o0, %g2, %o5
+1:
+ sll %o0, %g2, %g3
+ srl %o1, %o2, %g2
+ or %g2, %g3, %o5
+2:
+ mov %o4, %o0
+ mov %o5, %o1
+3:
+ retl
+ nop
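
For readers who do not speak SPARC assembly: the new __lshrdi3 helper
performs a 64-bit logical right shift with the value split into 32-bit
halves (%o0 high, %o1 low) and the shift count in %o2. A rough C
equivalent, purely illustrative and not part of the patch, is:

unsigned long long lshrdi3_sketch(unsigned int hi, unsigned int lo,
                                  unsigned int count)
{
        unsigned int rhi, rlo;

        if (count == 0) {                /* "be 3f" path: value unchanged */
                rhi = hi;
                rlo = lo;
        } else if (count < 32) {         /* label 1: bits cross the word boundary */
                rhi = hi >> count;
                rlo = (lo >> count) | (hi << (32 - count));
        } else {                         /* count >= 32: high word shifted away */
                rhi = 0;
                rlo = hi >> (count - 32);
        }
        return ((unsigned long long)rhi << 32) | rlo;
}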
-/* $Id: init.c,v 1.62 1999/01/07 14:13:00 jj Exp $
+/* $Id: init.c,v 1.63 1999/03/20 22:02:01 davem Exp $
* linux/arch/sparc/mm/init.c
*
* Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
@@ -330,11+330,18 @@ __initfunc(void mem_init(unsigned long start_mem, unsigned long end_mem)) initpages << (PAGE_SHIFT-10),
(unsigned long)PAGE_OFFSET, end_mem);
- freepages.min = nr_free_pages >> 7;
- if(freepages.min < 16)
- freepages.min = 16;
- freepages.low = freepages.min + (freepages.min >> 1);
- freepages.high = freepages.min + freepages.min;
+ /* NOTE NOTE NOTE NOTE
+ * Please keep track of things and make sure this
+ * always matches the code in mm/page_alloc.c -DaveM
+ */
+ i = nr_free_pages >> 7;
+ if (i < 48)
+ i = 48;
+ if (i > 256)
+ i = 256;
+ freepages.min = i;
+ freepages.low = i << 1;
+ freepages.high = freepages.low + i;
}
void free_initmem (void)
-/* $Id: srmmu.c,v 1.183 1999/03/16 11:36:16 davem Exp $
+/* $Id: srmmu.c,v 1.184 1999/03/20 22:02:03 davem Exp $
* srmmu.c: SRMMU specific routines for memory management.
*
* Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
@@ -465,7+465,8 @@ static inline pte_t *srmmu_s_pte_offset(pmd_t * dir, unsigned long address) /* This must update the context table entry for this process. */
static void srmmu_update_rootmmu_dir(struct task_struct *tsk, pgd_t *pgdp)
{
- if(tsk->mm->context != NO_CONTEXT) {
+ if(tsk->mm->context != NO_CONTEXT &&
+ tsk->mm->pgd != pgdp) {
flush_cache_mm(tsk->mm);
ctxd_set(&srmmu_context_table[tsk->mm->context], pgdp);
flush_tlb_mm(tsk->mm);
@@ -816,13+817,19 @@ static inline void free_context(int context)
static void srmmu_switch_to_context(struct task_struct *tsk)
{
+ int set = 0;
+
if(tsk->mm->context == NO_CONTEXT) {
alloc_context(tsk->mm);
flush_cache_mm(tsk->mm);
ctxd_set(&srmmu_context_table[tsk->mm->context], tsk->mm->pgd);
flush_tlb_mm(tsk->mm);
- }
- srmmu_set_context(tsk->mm->context);
+ set = 1;
+ } else if(tsk->mm != current->mm)
+ set = 1;
+
+ if(set != 0)
+ srmmu_set_context(tsk->mm->context);
}
static void srmmu_init_new_context(struct mm_struct *mm)
@@ -1335,7+1342,8 @@ static void hypersparc_update_rootmmu_dir(struct task_struct *tsk, pgd_t *pgdp) if(pgdp != swapper_pg_dir)
hypersparc_flush_page_to_ram(page);
- if(tsk->mm->context != NO_CONTEXT) {
+ if(tsk->mm->context != NO_CONTEXT &&
+ tsk->mm->pgd != pgdp) {
flush_cache_mm(tsk->mm);
ctxd_set(&srmmu_context_table[tsk->mm->context], pgdp);
flush_tlb_mm(tsk->mm);
@@ -1344,8+1352,10 @@ static void hypersparc_update_rootmmu_dir(struct task_struct *tsk, pgd_t *pgdp)
static void viking_update_rootmmu_dir(struct task_struct *tsk, pgd_t *pgdp)
{
- viking_flush_page((unsigned long)pgdp);
- if(tsk->mm->context != NO_CONTEXT) {
+ if(pgdp != swapper_pg_dir)
+ viking_flush_page((unsigned long)pgdp);
+ if(tsk->mm->context != NO_CONTEXT &&
+ tsk->mm->pgd != pgdp) {
flush_cache_mm(tsk->mm);
ctxd_set(&srmmu_context_table[tsk->mm->context], pgdp);
flush_tlb_mm(tsk->mm);
@@ -1358,6+1368,9 @@ static void cypress_update_rootmmu_dir(struct task_struct *tsk, pgd_t *pgdp) unsigned long page = ((unsigned long) pgdp) & PAGE_MASK;
unsigned long line;
+ if(pgdp == swapper_pg_dir)
+ goto skip_flush;
+
a = 0x20; b = 0x40; c = 0x60; d = 0x80; e = 0xa0; f = 0xc0; g = 0xe0;
page &= PAGE_MASK;
line = (page + PAGE_SIZE) - 0x100;
@@ -1378,8+1391,9 @@ static void cypress_update_rootmmu_dir(struct task_struct *tsk, pgd_t *pgdp) "r" (a), "r" (b), "r" (c), "r" (d),
"r" (e), "r" (f), "r" (g));
} while(line != page);
-
- if(tsk->mm->context != NO_CONTEXT) {
+skip_flush:
+ if(tsk->mm->context != NO_CONTEXT &&
+ tsk->mm->pgd != pgdp) {
flush_cache_mm(tsk->mm);
ctxd_set(&srmmu_context_table[tsk->mm->context], pgdp);
flush_tlb_mm(tsk->mm);
@@ -1388,6+1402,8 @@ static void cypress_update_rootmmu_dir(struct task_struct *tsk, pgd_t *pgdp)
static void hypersparc_switch_to_context(struct task_struct *tsk)
{
+ int set = 0;
+
if(tsk->mm->context == NO_CONTEXT) {
ctxd_t *ctxp;
@@ -1395,9+1411,14 @@ static void hypersparc_switch_to_context(struct task_struct *tsk) ctxp = &srmmu_context_table[tsk->mm->context];
srmmu_set_entry((pte_t *)ctxp, __pte((SRMMU_ET_PTD | (srmmu_v2p((unsigned long) tsk->mm->pgd) >> 4))));
hypersparc_flush_page_to_ram((unsigned long)ctxp);
+ set = 1;
+ } else if(tsk->mm != current->mm)
+ set = 1;
+
+ if(set != 0) {
+ hyper_flush_whole_icache();
+ srmmu_set_context(tsk->mm->context);
}
- hyper_flush_whole_icache();
- srmmu_set_context(tsk->mm->context);
}
static void hypersparc_init_new_context(struct mm_struct *mm)
@@ -1410,9+1431,10 @@ static void hypersparc_init_new_context(struct mm_struct *mm) srmmu_set_entry((pte_t *)ctxp, __pte((SRMMU_ET_PTD | (srmmu_v2p((unsigned long) mm->pgd) >> 4))));
hypersparc_flush_page_to_ram((unsigned long)ctxp);
- hyper_flush_whole_icache();
- if(mm == current->mm)
+ if(mm == current->mm) {
+ hyper_flush_whole_icache();
srmmu_set_context(mm->context);
+ }
}
static unsigned long mempool;
@@ -2022,6+2044,11 @@ static void srmmu_update_mmu_cache(struct vm_area_struct * vma, unsigned long ad static void srmmu_destroy_context(struct mm_struct *mm)
{
if(mm->context != NO_CONTEXT && atomic_read(&mm->count) == 1) {
+ /* XXX This could be drastically improved.
+ * XXX We are only called from __exit_mm and it just did
+ * XXX cache/tlb mm flush and right after this will (re-)
+ * XXX SET_PAGE_DIR to swapper_pg_dir. -DaveM
+ */
flush_cache_mm(mm);
ctxd_set(&srmmu_context_table[mm->context], swapper_pg_dir);
flush_tlb_mm(mm);
@@ -2680,15+2707,8 @@ __initfunc(static void init_viking(void))
/* Ahhh, the viking. SRMMU VLSI abortion number two... */
if(mreg & VIKING_MMODE) {
- unsigned long bpreg;
-
srmmu_name = "TI Viking";
viking_mxcc_present = 0;
-
- bpreg = viking_get_bpreg();
- bpreg &= ~(VIKING_ACTION_MIX);
- viking_set_bpreg(bpreg);
-
msi_set_sync();
BTFIXUPSET_CALL(set_pte, srmmu_set_pte_nocache_viking, BTFIXUPCALL_NORM);
@@ -247,6+247,10 @@ CONFIG_SUNQE=m CONFIG_MYRI_SBUS=m
CONFIG_DE4X5=m
CONFIG_VORTEX=m
+
+#
+# Unix 98 PTY support
+#
CONFIG_UNIX98_PTYS=y
CONFIG_UNIX98_PTY_COUNT=256
#include <linux/module.h>
-#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/errno.h>
#include <linux/signal.h>
#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/file.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
-#include <linux/file.h>
#include <linux/ptrace.h>
#include <linux/user.h>
#include <linux/malloc.h>
@@ -58,14+58,13 @@ static void set_brk(unsigned long start, unsigned long end) * macros to write out all the necessary info.
*/
#define DUMP_WRITE(addr,nr) \
-while (file.f_op->write(&file,(char *)(addr),(nr),&file.f_pos) != (nr)) \
- goto close_coredump
+while (file->f_op->write(file,(char *)(addr),(nr),&file->f_pos) != (nr)) goto close_coredump
#define DUMP_SEEK(offset) \
-if (file.f_op->llseek) { \
- if (file.f_op->llseek(&file,(offset),0) != (offset)) \
+if (file->f_op->llseek) { \
+ if (file->f_op->llseek(file,(offset),0) != (offset)) \
goto close_coredump; \
-} else file.f_pos = (offset)
+} else file->f_pos = (offset)
/*
* Routine writes a core dump image in the current directory.
@@ -82,7+81,7 @@ do_aout32_core_dump(long signr, struct pt_regs * regs) {
struct dentry * dentry = NULL;
struct inode * inode = NULL;
- struct file file;
+ struct file * file;
mm_segment_t fs;
int has_dumped = 0;
char corefile[6+sizeof(current->comm)];
@@ -106,29+105,16 @@ do_aout32_core_dump(long signr, struct pt_regs * regs) #else
corefile[4] = '\0';
#endif
- dentry = open_namei(corefile,O_CREAT | 2 | O_TRUNC | O_NOFOLLOW, 0600);
- if (IS_ERR(dentry)) {
- dentry = NULL;
+ file = filp_open(corefile,O_CREAT | 2 | O_TRUNC | O_NOFOLLOW, 0600);
+ if (IS_ERR(file))
goto end_coredump;
- }
+ dentry = file->f_dentry;
inode = dentry->d_inode;
if (!S_ISREG(inode->i_mode))
- goto end_coredump;
+ goto close_coredump;
if (!inode->i_op || !inode->i_op->default_file_ops)
- goto end_coredump;
- if (get_write_access(inode))
- goto end_coredump;
- file.f_mode = 3;
- file.f_flags = 0;
- file.f_count = 1;
- file.f_dentry = dentry;
- file.f_pos = 0;
- file.f_reada = 0;
- file.f_op = inode->i_op->default_file_ops;
- if (file.f_op->open)
- if (file.f_op->open(inode,&file))
- goto done_coredump;
- if (!file.f_op->write)
+ goto close_coredump;
+ if (!file->f_op->write)
goto close_coredump;
has_dumped = 1;
current->flags |= PF_DUMPCORE;
@@ -175,13+161,9 @@ do_aout32_core_dump(long signr, struct pt_regs * regs) set_fs(KERNEL_DS);
DUMP_WRITE(current,sizeof(*current));
close_coredump:
- if (file.f_op->release)
- file.f_op->release(inode,&file);
-done_coredump:
- put_write_access(inode);
+ close_fp(file, NULL);
end_coredump:
set_fs(fs);
- dput(dentry);
return has_dumped;
}
@@ -269,7+251,6 @@ static inline int do_load_aout32_binary(struct linux_binprm * bprm, return -ENOEXEC;
}
- current->personality = PER_LINUX;
fd_offset = N_TXTOFF(ex);
/* Check initial limits. This avoids letting people circumvent
@@ -288,6+269,8 @@ static inline int do_load_aout32_binary(struct linux_binprm * bprm, return retval;
/* OK, This is the point of no return */
+ current->personality = PER_LINUX;
+
current->mm->end_code = ex.a_text +
(current->mm->start_code = N_TXTADDR(ex));
current->mm->end_data = ex.a_data +
@@ -297,8+280,7 @@ static inline int do_load_aout32_binary(struct linux_binprm * bprm,
current->mm->rss = 0;
current->mm->mmap = NULL;
- current->suid = current->euid = current->fsuid = bprm->e_uid;
- current->sgid = current->egid = current->fsgid = bprm->e_gid;
+ compute_creds(bprm);
current->flags &= ~PF_FORKNOEXEC;
if (N_MAGIC(ex) == NMAGIC) {
/* Fuck me plenty... */
@@ -404,48+386,44 @@ static inline int do_load_aout32_library(int fd)
{
struct file * file;
- struct exec ex;
- struct dentry * dentry;
struct inode * inode;
- unsigned int len;
- unsigned int bss;
- unsigned int start_addr;
+ unsigned long bss, start_addr, len;
unsigned long error;
+ int retval;
+ loff_t offset = 0;
+ struct exec ex;
- file = fcheck(fd);
-
- if (!file || !file->f_op)
- return -EACCES;
-
- dentry = file->f_dentry;
- inode = dentry->d_inode;
-
- /* Seek into the file */
- if (file->f_op->llseek) {
- if ((error = file->f_op->llseek(file, 0, 0)) != 0)
- return -ENOEXEC;
- } else
- file->f_pos = 0;
+ retval = -EACCES;
+ file = fget(fd);
+ if (!file)
+ goto out;
+ if (!file->f_op)
+ goto out_putf;
+ inode = file->f_dentry->d_inode;
+ retval = -ENOEXEC;
+ /* N.B. Save current fs? */
set_fs(KERNEL_DS);
- error = file->f_op->read(file, (char *) &ex, sizeof(ex), &file->f_pos);
+ error = file->f_op->read(file, (char *) &ex, sizeof(ex), &offset);
set_fs(USER_DS);
if (error != sizeof(ex))
- return -ENOEXEC;
+ goto out_putf;
/* We come in here for the regular a.out style of shared libraries */
if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) ||
N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) ||
inode->i_size < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
- return -ENOEXEC;
+ goto out_putf;
}
+
if (N_MAGIC(ex) == ZMAGIC && N_TXTOFF(ex) &&
(N_TXTOFF(ex) < inode->i_sb->s_blocksize)) {
printk("N_TXTOFF < BLOCK_SIZE. Please convert library\n");
- return -ENOEXEC;
+ goto out_putf;
}
- if (N_FLAGS(ex)) return -ENOEXEC;
+ if (N_FLAGS(ex))
+ goto out_putf;
/* For QMAGIC, the starting address is 0x20 into the page. We mask
this off to get the starting address for the page */
@@ -457,18+435,26 @@ do_load_aout32_library(int fd) PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE,
N_TXTOFF(ex));
+ retval = error;
if (error != start_addr)
- return error;
+ goto out_putf;
+
len = PAGE_ALIGN(ex.a_text + ex.a_data);
bss = ex.a_text + ex.a_data + ex.a_bss;
if (bss > len) {
- error = do_mmap(NULL, start_addr + len, bss-len,
- PROT_READ|PROT_WRITE|PROT_EXEC,
- MAP_PRIVATE|MAP_FIXED, 0);
+ error = do_mmap(NULL, start_addr + len, bss - len,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_PRIVATE | MAP_FIXED, 0);
+ retval = error;
if (error != start_addr + len)
- return error;
+ goto out_putf;
}
- return 0;
+ retval = 0;
+
+out_putf:
+ fput(file);
+out:
+ return retval;
}
static int
@@ -88,7+88,7 @@ __initfunc(void cpu_probe(void)) if(i==NSPARCCHIPS) {
printk("DEBUG: manuf = 0x%x impl = 0x%x\n", manuf,
impl);
- sparc_cpu_type[cpuid] = "Unknow CPU";
+ sparc_cpu_type[cpuid] = "Unknown CPU";
}
for(i = 0; i<NSPARCFPU; i++) {
-/* $Id: process.c,v 1.89 1999/01/19 07:54:39 davem Exp $
+/* $Id: process.c,v 1.90 1999/03/22 02:12:16 davem Exp $
* arch/sparc64/kernel/process.c
*
* Copyright (C) 1995, 1996 David S. Miller (davem@caip.rutgers.edu)
@@ -456,6+456,11 @@ static unsigned long clone_stackframe(unsigned long csp, unsigned long psp) {
unsigned long fp, distance, rval;
+ /* do_fork() grabs the parent semaphore, we must release it
+ * temporarily so we can build the child clone stack frame
+ * without deadlocking.
+ */
+ up(&current->mm->mmap_sem);
if(!(current->tss.flags & SPARC_FLAG_32BIT)) {
csp += STACK_BIAS;
psp += STACK_BIAS;
@@ -472,17+477,20 @@ static unsigned long clone_stackframe(unsigned long csp, unsigned long psp) distance = fp - psp;
rval = (csp - distance);
if(copy_in_user(rval, psp, distance))
- return 0;
- if(current->tss.flags & SPARC_FLAG_32BIT) {
+ rval = 0;
+ else if(current->tss.flags & SPARC_FLAG_32BIT) {
if(put_user(((u32)csp), &(((struct reg_window32 *)rval)->ins[6])))
- return 0;
- return rval;
+ rval = 0;
} else {
if(put_user(((u64)csp - STACK_BIAS),
&(((struct reg_window *)rval)->ins[6])))
- return 0;
- return rval - STACK_BIAS;
+ rval = 0;
+ else
+ rval = rval - STACK_BIAS;
}
+ down(&current->mm->mmap_sem);
+
+ return rval;
}
/* Standard stuff. */
-/* $Id: systbls.S,v 1.51 1999/03/12 13:30:24 jj Exp $
+/* $Id: systbls.S,v 1.52 1999/03/20 22:02:05 davem Exp $
* systbls.S: System call entry point tables for OS compatibility.
* The native Linux system call table lives here also.
*
@@ -156,7+156,7 @@ sunos_sys_table: .word sunos_nosys, sys_symlink, sys_readlink
.word sys32_execve, sys_umask, sys_chroot
.word sys32_newfstat, sunos_nosys, sys_getpagesize
- .word sys_msync, sys_fork, sunos_nosys
+ .word sys_msync, sys_vfork, sunos_nosys
.word sunos_nosys, sunos_sbrk, sunos_sstk
.word sunos_mmap, sunos_vadvise, sys_munmap
.word sys_mprotect, sunos_madvise, sys_vhangup
@@ -867,19+867,20 @@ amiga_partition(struct gendisk *hd, kdev_t dev, unsigned long first_sector) int nr_sects;
int blk;
int part, res;
+ int old_blocksize;
+ int blocksize;
- /*
- * Don't bother touching M/O 2K media.
- */
-
- if (get_ptable_blocksize(dev) != 1024)
- return 0;
-
- set_blocksize(dev,512);
+ old_blocksize = get_ptable_blocksize(dev);
+ if (hardsect_size[MAJOR(dev)] != NULL)
+ blocksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
+ else
+ blocksize = 512;
+
+ set_blocksize(dev,blocksize);
res = 0;
for (blk = 0; blk < RDB_ALLOCATION_LIMIT; blk++) {
- if(!(bh = bread(dev,blk,512))) {
+ if(!(bh = bread(dev,blk,blocksize))) {
printk("Dev %s: unable to read RDB block %d\n",
kdevname(dev),blk);
goto rdb_done;
@@ -887,16+888,25 @@ amiga_partition(struct gendisk *hd, kdev_t dev, unsigned long first_sector) if (*(u32 *)bh->b_data == htonl(IDNAME_RIGIDDISK)) {
rdb = (struct RigidDiskBlock *)bh->b_data;
if (checksum_block((u32 *)bh->b_data,htonl(rdb->rdb_SummedLongs) & 0x7F)) {
- printk("Dev %s: RDB in block %d has bad checksum\n",
- kdevname(dev),blk);
- brelse(bh);
- continue;
+ /* Try again with 0xdc..0xdf zeroed, Windows might have
+ * trashed it.
+ */
+ *(u32 *)(&bh->b_data[0xdc]) = 0;
+ if (checksum_block((u32 *)bh->b_data,
+ htonl(rdb->rdb_SummedLongs) & 0x7F)) {
+ brelse(bh);
+ printk("Dev %s: RDB in block %d has bad checksum\n",
+ kdevname(dev),blk);
+ continue;
+ }
+ printk("Warning: Trashed word at 0xd0 in block %d "
+ "ignored in checksum calculation\n",kdevname(dev),blk);
}
printk(" RDSK");
blk = htonl(rdb->rdb_PartitionList);
brelse(bh);
for (part = 1; blk > 0 && part <= 16; part++) {
- if (!(bh = bread(dev,blk, 512))) {
+ if (!(bh = bread(dev,blk,blocksize))) {
printk("Dev %s: unable to read partition block %d\n",
kdevname(dev),blk);
goto rdb_done;
@@ -929,11+939,7 @@ amiga_partition(struct gendisk *hd, kdev_t dev, unsigned long first_sector) }
rdb_done:
- /*
- * FIXME: should restore the original size. Then we could clean
- * up the M/O skip. Amiga people ?
- */
- set_blocksize(dev,BLOCK_SIZE);
+ set_blocksize(dev,old_blocksize);
return res;
}
#endif /* CONFIG_AMIGA_PARTITION */
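
For context on the retry above: an Amiga Rigid Disk Block is considered
valid when the sum of its first rdb_SummedLongs big-endian 32-bit words
is zero, which is why zeroing the four trashed (normally unused, hence
zero) bytes can let the block verify again. A user-space sketch of that
check, illustrative only and not the kernel's checksum_block():

#include <arpa/inet.h>          /* ntohl() */

/* Returns non-zero if the RDB checksum does not verify. */
static int rdb_checksum_bad(const unsigned int *block, int summed_longs)
{
        unsigned int sum = 0;
        int i;

        for (i = 0; i < summed_longs; i++)
                sum += ntohl(block[i]); /* RDB fields are big-endian */
        return sum != 0;
}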
-- Added CDROM_DEBUG ioctl. Enable debug messages on-the-fly.
-- Added CDROM_GET_CAPABILITY ioctl. This relieves userspace programs
from parsing /proc/sys/dev/cdrom/info.
+
+ 2.54 Mar 15, 1999 - Jens Axboe <axboe@image.dk>
+ -- Check capability mask from low level driver when counting tracks as
+ per suggestion from Corey J. Scotts <cstotts@blue.weeg.uiowa.edu>.
-------------------------------------------------------------------------*/
-#define REVISION "Revision: 2.53"
-#define VERSION "Id: cdrom.c 2.53 1999/02/22"
+#define REVISION "Revision: 2.54"
+#define VERSION "Id: cdrom.c 2.54 1999/03/15"
/* I use an error-log mask to give fine grain control over the type of
messages dumped to the system logs. The available masks include: */
@@ -601,14+605,17 @@ void cdrom_count_tracks(struct cdrom_device_info *cdi, tracktype* tracks) tracks->xa=0;
tracks->error=0;
cdinfo(CD_COUNT_TRACKS, "entering cdrom_count_tracks\n");
- if (!(cdi->ops->capability & CDC_PLAY_AUDIO)) {
+ if (!(cdi->ops->capability & ~cdi->mask & CDC_PLAY_AUDIO)) {
tracks->error=CDS_NO_INFO;
return;
}
/* Grab the TOC header so we can see how many tracks there are */
- ret=cdi->ops->audio_ioctl(cdi, CDROMREADTOCHDR, &header);
+ ret = cdi->ops->audio_ioctl(cdi, CDROMREADTOCHDR, &header);
if (ret) {
- tracks->error=(ret == -ENOMEDIUM) ? CDS_NO_DISC : CDS_NO_INFO;
+ if (ret == -ENOMEDIUM)
+ tracks->error = CDS_NO_DISC;
+ else
+ tracks->error = CDS_NO_INFO;
return;
}
/* check what type of tracks are on this disc */
@@ -729,7+736,7 @@ int cdrom_ioctl(struct inode *ip, struct file *fp, cdinfo(CD_DO_IOCTL, "entering CDROMEJECT\n");
if (!(cdo->capability & ~cdi->mask & CDC_OPEN_TRAY))
return -ENOSYS;
- if (cdi->use_count != 1)
+ if (cdi->use_count != 1 || keeplocked)
return -EBUSY;
if (cdo->capability & ~cdi->mask & CDC_LOCK)
if ((ret=cdo->lock_door(cdi, 0)))
@@ -748,6+755,8 @@ int cdrom_ioctl(struct inode *ip, struct file *fp, cdinfo(CD_DO_IOCTL, "entering CDROMEJECT_SW\n");
if (!(cdo->capability & ~cdi->mask & CDC_OPEN_TRAY))
return -ENOSYS;
+ if (keeplocked)
+ return -EBUSY;
cdi->options &= ~(CDO_AUTO_CLOSE | CDO_AUTO_EJECT);
if (arg)
cdi->options |= CDO_AUTO_CLOSE | CDO_AUTO_EJECT;
@@ -778,6+787,8 @@ int cdrom_ioctl(struct inode *ip, struct file *fp, if (!(cdo->capability & ~cdi->mask & CDC_LOCK))
return -ENOSYS;
break;
+ case 0:
+ return cdi->options;
/* default is basically CDO_[AUTO_CLOSE|AUTO_EJECT] */
default:
if (!(cdo->capability & ~cdi->mask & arg))
@@ -814,31+825,30 @@ int cdrom_ioctl(struct inode *ip, struct file *fp, if (!(cdo->capability & ~cdi->mask & CDC_RESET))
return -ENOSYS;
return cdo->reset(cdi);
- }
+ }
case CDROM_LOCKDOOR: {
cdinfo(CD_DO_IOCTL, "%socking door.\n",arg?"L":"Unl");
- if (cdo->capability & ~cdi->mask & CDC_LOCK) {
+ if (!(cdo->capability & ~cdi->mask & CDC_LOCK)) {
+ return -EDRIVE_CANT_DO_THIS;
+ } else {
keeplocked = arg ? 1 : 0;
return cdo->lock_door(cdi, arg);
- } else
- return -EDRIVE_CANT_DO_THIS;
- }
+ }
+ }
case CDROM_DEBUG: {
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
cdinfo(CD_DO_IOCTL, "%sabling debug.\n",arg?"En":"Dis");
debug = arg ? 1 : 0;
- return 0;
- }
+ return debug;
+ }
case CDROM_GET_CAPABILITY: {
cdinfo(CD_DO_IOCTL, "entering CDROM_GET_CAPABILITY\n");
return cdo->capability;
- }
-
-
+ }
/* The following function is implemented, although very few audio
* discs give Universal Product Code information, which should just be
--- /dev/null
+/* 3c527.c: 3Com Etherlink/MC32 driver for Linux
+ *
+ * (c) Copyright 1998 Red Hat Software Inc
+ * Written by Alan Cox.
+ *
+ * Based on skeleton.c written 1993-94 by Donald Becker and ne2.c
+ * (for the MCA stuff) written by Wim Dumon.
+ *
+ * Thanks to 3Com for making this possible by providing me with the
+ * documentation.
+ *
+ * This software may be used and distributed according to the terms
+ * of the GNU Public License, incorporated herein by reference.
+ *
+ */
+
+static const char *version =
+ "3c527.c:v0.04 1999/03/16 Alan Cox (alan@redhat.com)\n";
+
+/*
+ * Things you need
+ * o The databook.
+ *
+ * Traps for the unwary
+ *
+ * The diagram (Figure 1-1) and the POS summary disagree with the
+ * "Interrupt Level" section in the manual.
+ *
+ * The documentation in places seems to miss things. In actual fact
+ * I've always eventually found everything is documented, it just
+ * requires careful study.
+ */
+
+#include <linux/module.h>
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/interrupt.h>
+#include <linux/ptrace.h>
+#include <linux/mca.h>
+#include <linux/ioport.h>
+#include <linux/in.h>
+#include <linux/malloc.h>
+#include <linux/string.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <asm/io.h>
+#include <asm/dma.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+
+#include "3c527.h"
+
+/*
+ * The name of the card. Is used for messages and in the requests for
+ * io regions, irqs and dma channels
+ */
+static const char* cardname = "3c527";
+
+/* use 0 for production, 1 for verification, >2 for debug */
+#ifndef NET_DEBUG
+#define NET_DEBUG 2
+#endif
+static unsigned int mc32_debug = NET_DEBUG;
+
+/* The number of low I/O ports used by the ethercard. */
+#define NETCARD_IO_EXTENT 8
+
+
+struct mc32_mailbox
+{
+ u16 mbox __attribute((packed));
+ u16 data[1] __attribute((packed));
+};
+
+/* Information that needs to be kept for each board. */
+
+#define TX_RING_MAX 16 /* Typically the card supports 37 */
+#define RX_RING_MAX 32 /* " " " */
+
+struct mc32_local
+{
+ struct net_device_stats net_stats;
+ int slot;
+ volatile struct mc32_mailbox *rx_box;
+ volatile struct mc32_mailbox *tx_box;
+ volatile struct mc32_mailbox *exec_box;
+ volatile u16 *stats;
+ u16 tx_chain;
+ u16 rx_chain;
+ u16 tx_len;
+ u16 rx_len;
+ u32 base;
+ u16 rx_halted;
+ u16 tx_halted;
+ u16 exec_pending;
+ u16 mc_reload_wait; /* a multicast load request is pending */
+ atomic_t tx_count; /* buffers left */
+ struct wait_queue *event;
+ struct sk_buff *tx_skb[TX_RING_MAX]; /* Transmit ring */
+ u16 tx_skb_top;
+ u16 tx_skb_end;
+ struct sk_buff *rx_skb[RX_RING_MAX]; /* Receive ring */
+ void *rx_ptr[RX_RING_MAX]; /* Data pointers */
+};
+
+/* The station (ethernet) address prefix, used for a sanity check. */
+#define SA_ADDR0 0x02
+#define SA_ADDR1 0x60
+#define SA_ADDR2 0xAC
+
+struct mca_adapters_t {
+ unsigned int id;
+ char *name;
+};
+
+const struct mca_adapters_t mc32_adapters[] = {
+ { 0x0041, "3COM EtherLink MC/32" },
+ { 0x8EF5, "IBM High Performance Lan Adapter" },
+ { 0x0000, NULL }
+};
+
+
+/* Index to functions, as function prototypes. */
+
+extern int mc32_probe(struct device *dev);
+
+static int mc32_probe1(struct device *dev, int ioaddr);
+static int mc32_open(struct device *dev);
+static int mc32_send_packet(struct sk_buff *skb, struct device *dev);
+static void mc32_interrupt(int irq, void *dev_id, struct pt_regs *regs);
+static int mc32_close(struct device *dev);
+static struct net_device_stats *mc32_get_stats(struct device *dev);
+static void mc32_set_multicast_list(struct device *dev);
+
+/*
+ * Check for a network adaptor of this type, and return '0' iff one exists.
+ * If dev->base_addr == 0, probe all likely locations.
+ * If dev->base_addr == 1, always return failure.
+ * If dev->base_addr == 2, allocate space for the device and return success
+ * (detachable devices only).
+ */
+
+__initfunc(int mc32_probe(struct device *dev))
+{
+ static int current_mca_slot = -1;
+ int i;
+ int adapter_found = 0;
+
+ /* Do not check any supplied i/o locations.
+ POS registers usually don't fail :) */
+
+ /* MCA cards have POS registers.
+ Autodetecting MCA cards is extremely simple.
+ Just search for the card. */
+
+ for(i = 0; (mc32_adapters[i].name != NULL) && !adapter_found; i++) {
+ current_mca_slot =
+ mca_find_unused_adapter(mc32_adapters[i].id, 0);
+
+ if((current_mca_slot != MCA_NOTFOUND) && !adapter_found) {
+ if(!mc32_probe1(dev, current_mca_slot))
+ {
+ mca_set_adapter_name(current_mca_slot,
+ mc32_adapters[i].name);
+ mca_mark_as_used(current_mca_slot);
+ return 0;
+ }
+
+ }
+ }
+ return -ENODEV;
+}
+
+/*
+ * This is the real probe routine. Linux has a history of friendly device
+ * probes on the ISA bus. A good device probe avoids doing writes, and
+ * verifies that the correct device exists and functions.
+ */
+__initfunc(static int mc32_probe1(struct device *dev, int slot))
+{
+ static unsigned version_printed = 0;
+ int i;
+ u8 POS;
+ u32 base;
+ struct mc32_local *lp;
+ static u16 mca_io_bases[]={
+ 0x7280,0x7290,
+ 0x7680,0x7690,
+ 0x7A80,0x7A90,
+ 0x7E80,0x7E90
+ };
+ static u32 mca_mem_bases[]={
+ 0x00C0000,
+ 0x00C4000,
+ 0x00C8000,
+ 0x00CC000,
+ 0x00D0000,
+ 0x00D4000,
+ 0x00D8000,
+ 0x00DC000
+ };
+ static char *failures[]={
+ "Processor instruction",
+ "Processor data bus",
+ "Processor data bus",
+ "Processor data bus",
+ "Adapter bus",
+ "ROM checksum",
+ "Base RAM",
+ "Extended RAM",
+ "82586 internal loopback",
+ "82586 initialisation failure",
+ "Adapter list configuration error"
+ };
+
+ /* Time to play MCA games */
+
+ if (mc32_debug && version_printed++ == 0)
+ printk(KERN_DEBUG "%s", version);
+
+ printk(KERN_INFO "%s: %s found in slot %d:", dev->name, cardname, slot);
+
+ POS = mca_read_stored_pos(slot, 2);
+
+ if(!(POS&1))
+ {
+ printk(" disabled.\n");
+ return -ENODEV;
+ }
+
+ /* Allocate a new 'dev' if needed. */
+ if (dev == NULL) {
+ /*
+ * Don't allocate the private data here, it is done later
+ * This makes it easier to free the memory when this driver
+ * is used as a module.
+ */
+ dev = init_etherdev(0, 0);
+ if (dev == NULL)
+ return -ENOMEM;
+ }
+
+ /* Fill in the 'dev' fields. */
+ dev->base_addr = mca_io_bases[(POS>>1)&7];
+ dev->mem_start = mca_mem_bases[(POS>>4)&7];
+
+ POS = mca_read_stored_pos(slot, 4);
+ if(!(POS&1))
+ {
+ printk("memory window disabled.\n");
+ return -ENODEV;
+ }
+
+ POS = mca_read_stored_pos(slot, 5);
+
+ i=(POS>>4)&3;
+ if(i==3)
+ {
+ printk("invalid memory window.\n");
+ return -ENODEV;
+ }
+
+ i*=16384;
+ i+=16384;
+
+ dev->mem_end=dev->mem_start + i;
+
+ dev->irq = ((POS>>2)&3)+9;
+
+ printk("io 0x%3lX irq %d mem 0x%lX (%dK)\n",
+ dev->base_addr, dev->irq, dev->mem_start, i/1024);
+
+
+ /* We ought to set the cache line size here.. */
+
+
+ /*
+ * Go PROM browsing
+ */
+
+ printk("%s: Address ", dev->name);
+
+ /* Retrieve and print the ethernet address. */
+ for (i = 0; i < 6; i++)
+ {
+ mca_write_pos(slot, 6, i+12);
+ mca_write_pos(slot, 7, 0);
+
+ printk(" %2.2x", dev->dev_addr[i] = mca_read_pos(slot,3));
+ }
+
+ mca_write_pos(slot, 6, 0);
+ mca_write_pos(slot, 7, 0);
+
+ POS = mca_read_stored_pos(slot, 4);
+
+ if(POS&2)
+ printk(" : BNC port selected.\n");
+ else
+ printk(" : AUI port selected.\n");
+
+ POS=inb(dev->base_addr+HOST_CTRL);
+ POS|=HOST_CTRL_ATTN|HOST_CTRL_RESET;
+ POS&=~HOST_CTRL_INTE;
+ outb(POS, dev->base_addr+HOST_CTRL);
+ /* Reset adapter */
+ udelay(100);
+ /* Reset off */
+ POS&=~(HOST_CTRL_ATTN|HOST_CTRL_RESET);
+ outb(POS, dev->base_addr+HOST_CTRL);
+
+ udelay(300);
+
+ /*
+ * Grab the IRQ
+ */
+
+ if(request_irq(dev->irq, &mc32_interrupt, 0, cardname, dev))
+ {
+ printk("%s: unable to get IRQ %d.\n",
+ dev->name, dev->irq);
+ return -EAGAIN;
+ }
+
+ /* Initialize the device structure. */
+ if (dev->priv == NULL) {
+ dev->priv = kmalloc(sizeof(struct mc32_local), GFP_KERNEL);
+ if (dev->priv == NULL)
+ {
+ free_irq(dev->irq, dev);
+ return -ENOMEM;
+ }
+ }
+
+ memset(dev->priv, 0, sizeof(struct mc32_local));
+ lp = (struct mc32_local *)dev->priv;
+ lp->slot = slot;
+
+ i=0;
+
+ base = inb(dev->base_addr);
+
+ while(base==0xFF)
+ {
+ i++;
+ if(i==1000)
+ {
+ printk("%s: failed to boot adapter.\n", dev->name);
+ free_irq(dev->irq, dev);
+ return -ENODEV;
+ }
+ udelay(1000);
+ if(inb(dev->base_addr+2)&(1<<5))
+ base = inb(dev->base_addr);
+ }
+
+ if(base>0)
+ {
+ if(base < 0x0C)
+ printk("%s: %s%s.\n", dev->name, failures[base-1],
+ base<0x0A?" test failure":"");
+ else
+ printk("%s: unknown failure %d.\n", dev->name, base);
+ free_irq(dev->irq, dev);
+ return -ENODEV;
+ }
+
+ base=0;
+ for(i=0;i<4;i++)
+ {
+ int n=0;
+
+ while(!(inb(dev->base_addr+2)&(1<<5)))
+ {
+ n++;
+ udelay(50);
+ if(n>100)
+ {
+ printk(KERN_ERR "%s: mailbox read fail (%d).\n", dev->name, i);
+ free_irq(dev->irq, dev);
+ return -ENODEV;
+ }
+ }
+
+ base|=(inb(dev->base_addr)<<(8*i));
+ }
+
+ lp->exec_box=bus_to_virt(dev->mem_start+base);
+
+ base=lp->exec_box->data[1]<<16|lp->exec_box->data[0];
+
+ lp->base = dev->mem_start+base;
+
+ lp->rx_box=bus_to_virt(lp->base + lp->exec_box->data[2]);
+ lp->tx_box=bus_to_virt(lp->base + lp->exec_box->data[3]);
+
+ lp->stats = bus_to_virt(lp->base + lp->exec_box->data[5]);
+
+ /*
+ * Descriptor chains (card relative)
+ */
+
+ lp->tx_chain = lp->exec_box->data[8];
+ lp->rx_chain = lp->exec_box->data[10];
+ lp->tx_len = lp->exec_box->data[9];
+ lp->rx_len = lp->exec_box->data[11];
+
+ printk("%s: %d RX buffers, %d TX buffers. Base of 0x%08X.\n",
+ dev->name, lp->rx_len, lp->tx_len, lp->base);
+
+ dev->open = mc32_open;
+ dev->stop = mc32_close;
+ dev->hard_start_xmit = mc32_send_packet;
+ dev->get_stats = mc32_get_stats;
+ dev->set_multicast_list = mc32_set_multicast_list;
+
+ lp->rx_halted = 1;
+ lp->tx_halted = 1;
+
+ /* Fill in the fields of the device structure with ethernet values. */
+ ether_setup(dev);
+ return 0;
+}
+
+
+/*
+ * Polled command stuff
+ */
+
+static void mc32_ring_poll(struct device *dev)
+{
+ int ioaddr = dev->base_addr;
+ while(!(inb(ioaddr+HOST_STATUS)&HOST_STATUS_CRR));
+}
+
+
+/*
+ * Send exec commands
+ */
+
+static int mc32_command(struct device *dev, u16 cmd, void *data, int len)
+{
+ struct mc32_local *lp = (struct mc32_local *)dev->priv;
+ int ioaddr = dev->base_addr;
+ unsigned long flags;
+
+ while(lp->exec_pending)
+ sleep_on(&lp->event);
+
+ lp->exec_pending=1;
+ lp->exec_box->mbox=0;
+ lp->exec_box->mbox=cmd;
+ memcpy((void *)lp->exec_box->data, data, len);
+ barrier(); /* the memcpy forgot the volatile so be sure */
+
+ /* Send the command */
+ while(!(inb(ioaddr+HOST_STATUS)&HOST_STATUS_CRR));
+ outb(1<<6, ioaddr+HOST_CMD);
+
+ save_flags(flags);
+ cli();
+ while(lp->exec_pending!=2)
+ sleep_on(&lp->event);
+ lp->exec_pending=0;
+ restore_flags(flags);
+
+ /*
+ * A multicast set got blocked - do it now
+ */
+
+ if(lp->mc_reload_wait)
+ mc32_set_multicast_list(dev);
+
+ if(lp->exec_box->data[0]&(1<<13))
+ return -1;
+ return 0;
+}
+
+/*
+ * RX abort
+ */
+
+static void mc32_rx_abort(struct device *dev)
+{
+ struct mc32_local *lp = (struct mc32_local *)dev->priv;
+ int ioaddr = dev->base_addr;
+
+ while(!(inb(ioaddr+HOST_STATUS)&HOST_STATUS_CRR));
+
+ lp->rx_box->mbox=0;
+ outb(3<<3, ioaddr+HOST_CMD); /* Suspend reception */
+}
+
+
+/*
+ * RX enable
+ */
+
+static void mc32_rx_begin(struct device *dev)
+{
+ struct mc32_local *lp = (struct mc32_local *)dev->priv;
+ int ioaddr = dev->base_addr;
+
+ while(!(inb(ioaddr+HOST_STATUS)&HOST_STATUS_CRR));
+
+ lp->rx_box->mbox=0;
+ outb(1<<3, ioaddr+HOST_CMD); /* GO */
+ mc32_ring_poll(dev);
+
+ lp->rx_halted=0;
+}
+
+static void mc32_tx_abort(struct device *dev)
+{
+ struct mc32_local *lp = (struct mc32_local *)dev->priv;
+ int ioaddr = dev->base_addr;
+
+ while(!(inb(ioaddr+HOST_STATUS)&HOST_STATUS_CRR));
+
+ lp->tx_box->mbox=0;
+ outb(3, ioaddr+HOST_CMD); /* Suspend */
+
+ /* Ring empty */
+
+ atomic_set(&lp->tx_count, lp->tx_len);
+
+ /* Flush */
+ if(lp->tx_skb_top!=lp->tx_skb_end)
+ {
+ int i;
+ if(lp->tx_skb_top<=lp->tx_skb_end)
+ {
+ for(i=lp->tx_skb_top;i<lp->tx_skb_end;i++)
+ {
+ dev_kfree_skb(lp->tx_skb[i]);
+ lp->tx_skb[i]=NULL;
+ }
+ }
+ else
+ {
+ for(i=lp->tx_skb_end;i<TX_RING_MAX;i++)
+ {
+ dev_kfree_skb(lp->tx_skb[i]);
+ lp->tx_skb[i]=NULL;
+ }
+ for(i=0;i<lp->tx_skb_top;i++)
+ {
+ dev_kfree_skb(lp->tx_skb[i]);
+ lp->tx_skb[i]=NULL;
+ }
+ }
+ }
+ lp->tx_skb_top=lp->tx_skb_end=0;
+}
+
+/*
+ * TX enable
+ */
+
+static void mc32_tx_begin(struct device *dev)
+{
+ struct mc32_local *lp = (struct mc32_local *)dev->priv;
+ int ioaddr = dev->base_addr;
+
+ while(!(inb(ioaddr+HOST_STATUS)&HOST_STATUS_CRR));
+
+ lp->tx_box->mbox=0;
+#if 0
+ outb(5, ioaddr+HOST_CMD); /* GO */
+ printk("TX=>5\n");
+ mc32_ring_poll(dev);
+ if(lp->tx_box->mbox&(1<<13))
+ printk("TX begin error!\n");
+#endif
+ lp->tx_halted=0;
+}
+
+
+/*
+ * Load the rx ring
+ */
+
+static int mc32_load_rx_ring(struct device *dev)
+{
+ struct mc32_local *lp = (struct mc32_local *)dev->priv;
+ int i;
+ u16 base;
+ volatile struct skb_header *p;
+
+ base = lp->rx_box->data[0];
+
+ /* Fix me - should use card size - also fix flush ! */
+
+ for(i=0;i<RX_RING_MAX;i++)
+ {
+ lp->rx_skb[i]=alloc_skb(1532, GFP_KERNEL);
+ if(lp->rx_skb[i]==NULL)
+ {
+ for(i--;i>=0;i--)
+ kfree_skb(lp->rx_skb[i]);
+ return -ENOBUFS;
+ }
+ lp->rx_ptr[i]=lp->rx_skb[i]->data+18;
+
+ p=bus_to_virt(lp->base+base);
+ p->control=0;
+ p->data = virt_to_bus(lp->rx_ptr[i]);
+ p->status=0;
+ p->length = 1532;
+ base = p->next;
+ }
+ p->control = (1<<6);
+ lp->rx_box->mbox = 0;
+ return 0;
+}
+
+static void mc32_flush_rx_ring(struct mc32_local *lp)
+{
+ int i;
+ for(i=0;i<RX_RING_MAX;i++)
+ kfree_skb(lp->rx_skb[i]);
+}
+
+static void mc32_flush_tx_ring(struct mc32_local *lp)
+{
+ int i;
+
+ if(lp->tx_skb_top <= lp->tx_skb_end)
+ {
+ for(i=lp->tx_skb_top;i<lp->tx_skb_end;i++)
+ dev_kfree_skb(lp->tx_skb[i]);
+ }
+ else
+ {
+ for(i=0;i<lp->tx_skb_end;i++)
+ dev_kfree_skb(lp->tx_skb[i]);
+ for(i=lp->tx_skb_top;i<TX_RING_MAX;i++)
+ dev_kfree_skb(lp->tx_skb[i]);
+ }
+}
+
+/*
+ * Open/initialize the board. This is called (in the current kernel)
+ * sometime after booting when the 'ifconfig' program is run.
+ */
+
+static int mc32_open(struct device *dev)
+{
+ int ioaddr = dev->base_addr;
+ u16 zero_word=0;
+ u8 one=1;
+ u8 regs;
+
+ dev->tbusy = 0;
+ dev->interrupt = 0;
+ dev->start = 1;
+
+ /*
+ * Interrupts enabled
+ */
+
+ regs=inb(ioaddr+HOST_CTRL);
+ regs|=HOST_CTRL_INTE;
+ outb(regs, ioaddr+HOST_CTRL);
+
+
+ /*
+ * Send the indications on command
+ */
+
+ mc32_command(dev, 4, &one, 2);
+
+
+ /*
+ * Send the command sequence "abort, resume" for RX and TX.
+ * The abort cleans up the buffer chains if needed.
+ */
+
+ mc32_rx_abort(dev);
+ mc32_tx_abort(dev);
+
+ /* Set Network Address */
+ mc32_command(dev, 1, dev->dev_addr, 6);
+
+ /* Set the filters */
+ mc32_set_multicast_list(dev);
+
+ /* Issue the 82586 workaround command - this is for "busy lans",
+ but basically means for all lans nowadays - has a performance
+ cost but best set */
+
+ mc32_command(dev, 0x0D, &zero_word, 2); /* 82586 bug workaround on */
+
+ /* Load the ring we just initialised */
+
+ if(mc32_load_rx_ring(dev))
+ {
+ mc32_close(dev);
+ return -ENOBUFS;
+ }
+
+ /* And the resume command goes last */
+
+ mc32_rx_begin(dev);
+ mc32_tx_begin(dev);
+
+ MOD_INC_USE_COUNT;
+
+ return 0;
+}
+
+static int mc32_send_packet(struct sk_buff *skb, struct device *dev)
+{
+ struct mc32_local *lp = (struct mc32_local *)dev->priv;
+
+ if (dev->tbusy) {
+ /*
+ * If we get here, some higher level has decided we are broken.
+ * There should really be a "kick me" function call instead.
+ */
+ int tickssofar = jiffies - dev->trans_start;
+ if (tickssofar < 5)
+ return 1;
+ printk(KERN_WARNING "%s: transmit timed out?\n", dev->name);
+ /* Try to restart the adaptor. */
+ dev->tbusy=0;
+ dev->trans_start = jiffies;
+ }
+
+ /*
+ * Block a timer-based transmit from overlapping. This could better be
+ * done with atomic_swap(1, dev->tbusy), but set_bit() works as well.
+ */
+ if (test_and_set_bit(0, (void*)&dev->tbusy) != 0)
+ {
+ printk(KERN_WARNING "%s: Transmitter access conflict.\n", dev->name);
+ dev_kfree_skb(skb);
+ }
+ else
+ {
+ unsigned long flags;
+
+ u16 tx_head;
+ volatile struct skb_header *p, *np;
+
+ save_flags(flags);
+ cli();
+
+ if(atomic_read(&lp->tx_count)==0)
+ {
+ dev->tbusy=1;
+ restore_flags(flags);
+ return 1;
+ }
+
+ tx_head = lp->tx_box->data[0];
+ atomic_dec(&lp->tx_count);
+
+ /* We will need this to flush the buffer out */
+
+ lp->tx_skb[lp->tx_skb_end] = skb;
+ lp->tx_skb_end++;
+ lp->tx_skb_end&=(TX_RING_MAX-1);
+
+ /* P is the last sending/sent buffer as a pointer */
+ p=(struct skb_header *)bus_to_virt(lp->base+tx_head);
+
+ /* NP is the buffer we will be loading */
+ np=(struct skb_header *)bus_to_virt(lp->base+p->next);
+
+ np->control |= (1<<6); /* EOL */
+ wmb();
+
+ np->length = skb->len;
+ np->data = virt_to_bus(skb->data);
+ np->status = 0;
+ np->control = (1<<7)|(1<<6); /* EOP EOL */
+ wmb();
+
+ p->status = 0;
+ p->control &= ~(1<<6);
+
+ dev->tbusy = 0; /* Keep feeding me */
+
+ lp->tx_box->mbox=0;
+ restore_flags(flags);
+ }
+ return 0;
+}
+
+static void mc32_update_stats(struct device *dev)
+{
+}
+
+
+static void mc32_rx_ring(struct device *dev)
+{
+ struct mc32_local *lp=dev->priv;
+ int ioaddr = dev->base_addr;
+ int x=0;
+ volatile struct skb_header *p;
+ u16 base;
+ u16 top;
+
+ top = base = lp->rx_box->data[0];
+ do
+ {
+ p=(struct skb_header *)bus_to_virt(base+lp->base);
+ if(!(p->status & (1<<7)))
+ break;
+ if(p->status & (1<<6))
+ {
+ u16 length = p->length;
+ struct sk_buff *skb=dev_alloc_skb(length+2);
+ if(skb!=NULL)
+ {
+ skb_reserve(skb,2);
+ /*printk("Frame at %p\n", bus_to_virt(p->data)); */
+ memcpy(skb_put(skb, length),
+ bus_to_virt(p->data), length);
+ skb->protocol=eth_type_trans(skb,dev);
+ skb->dev=dev;
+ lp->net_stats.rx_packets++;
+ lp->net_stats.rx_bytes+=skb->len;
+ netif_rx(skb);
+ }
+ else
+ lp->net_stats.rx_dropped++;
+ }
+ else
+ {
+ lp->net_stats.rx_errors++;
+ switch(p->status&0x0F)
+ {
+ case 1:
+ lp->net_stats.rx_crc_errors++;break;
+ case 2:
+ lp->net_stats.rx_fifo_errors++;break;
+ case 3:
+ lp->net_stats.rx_frame_errors++;break;
+ case 4:
+ lp->net_stats.rx_missed_errors++;break;
+ case 5:
+ lp->net_stats.rx_length_errors++;break;
+ }
+ }
+ p->length = 1532;
+ p->control &= ~(1<<6);
+ p->status = 0;
+ base = p->next;
+ }
+ while(x++<48);
+
+ /*
+ * This is curious. It seems the receive stop and receive continue
+ * commands race against each other, even though we poll for
+ * command ready to be issued. The delay is hackish but is a workaround
+ * while I investigate in depth
+ */
+
+ while(!(inb(ioaddr+HOST_STATUS)&HOST_STATUS_CRR));
+ lp->rx_box->mbox=0;
+ lp->rx_box->data[0] = top;
+ outb(1<<3, ioaddr+HOST_CMD);
+}
+
+
+/*
+ * The typical workload of the driver:
+ * Handle the network interface interrupts.
+ */
+static void mc32_interrupt(int irq, void *dev_id, struct pt_regs * regs)
+{
+ struct device *dev = dev_id;
+ struct mc32_local *lp;
+ int ioaddr, status, boguscount = 0;
+ int rx_event = 0;
+
+ if (dev == NULL) {
+ printk(KERN_WARNING "%s: irq %d for unknown device.\n", cardname, irq);
+ return;
+ }
+ dev->interrupt = 1;
+
+ ioaddr = dev->base_addr;
+ lp = (struct mc32_local *)dev->priv;
+
+ /* See what's cooking */
+
+ while((inb(ioaddr+2)&(1<<5)) && boguscount++<2000)
+ {
+ status=inb(ioaddr+HOST_CMD);
+
+#ifdef DEBUG_IRQ
+ printk("Status TX%d RX%d EX%d OV%d\n",
+ (status&7), (status>>3)&7, (status>>6)&1,
+ (status>>7)&1);
+#endif
+
+ switch(status&7)
+ {
+ case 0:
+ break;
+ case 6: /* TX fail */
+ lp->net_stats.tx_errors++;
+ case 2: /* TX ok */
+ lp->net_stats.tx_packets++;
+ /* Packets are sent in order - this is
+ basically a FIFO queue of buffers matching
+ the card ring */
+ lp->net_stats.tx_bytes+=lp->tx_skb[lp->tx_skb_top]->len;
+ dev_kfree_skb(lp->tx_skb[lp->tx_skb_top]);
+ lp->tx_skb[lp->tx_skb_top]=NULL;
+ lp->tx_skb_top++;
+ lp->tx_skb_top&=(TX_RING_MAX-1);
+ atomic_inc(&lp->tx_count);
+ dev->tbusy=0;
+ mark_bh(NET_BH);
+ break;
+ case 3: /* Halt */
+ case 4: /* Abort */
+ lp->tx_halted=1;
+ wake_up(&lp->event);
+ break;
+ case 5:
+ lp->tx_halted=0;
+ wake_up(&lp->event);
+ break;
+ default:
+ printk("%s: strange tx ack %d\n",
+ dev->name, status&7);
+ }
+ status>>=3;
+ switch(status&7)
+ {
+ case 0:
+ break;
+ case 2: /* RX */
+ rx_event=1;
+ break;
+ case 3:
+ case 4:
+ lp->rx_halted=1;
+ wake_up(&lp->event);
+ break;
+ case 5:
+ lp->rx_halted=0;
+ wake_up(&lp->event);
+ break;
+ case 6:
+ /* Out of RX buffers stat */
+ /* Must restart */
+ lp->net_stats.rx_dropped++;
+ rx_event = 1; /* To restart */
+ break;
+ default:
+ printk("%s: strange rx ack %d\n",
+ dev->name, status&7);
+
+ }
+ status>>=3;
+ if(status&1)
+ {
+ /* 0=no 1=yes 2=reply clearing */
+ lp->exec_pending=2;
+ wake_up(&lp->event);
+ }
+ if(status&2)
+ {
+ /*
+ * Update the stats as soon as
+ * we have it flagged and can
+ * send an immediate reply (CRR set)
+ */
+
+ if(inb(ioaddr+HOST_STATUS)&HOST_STATUS_CRR)
+ {
+ mc32_update_stats(dev);
+ outb(0, ioaddr+HOST_CMD);
+ }
+ }
+ }
+
+ /*
+ * Process and restart the receive ring.
+ */
+
+ if(rx_event)
+ mc32_rx_ring(dev);
+ dev->interrupt = 0;
+ return;
+}
+
+
+/* The inverse routine to mc32_open(). */
+
+static int mc32_close(struct device *dev)
+{
+ struct mc32_local *lp = (struct mc32_local *)dev->priv;
+ int ioaddr = dev->base_addr;
+ u8 regs;
+ u16 one=1;
+
+ /*
+ * Send the indications on command (handy debug check)
+ */
+
+ mc32_command(dev, 4, &one, 2);
+
+ /* Abort RX and Abort TX */
+
+ mc32_rx_abort(dev);
+ mc32_tx_abort(dev);
+
+ /* Catch any waiting commands */
+
+ while(lp->exec_pending==1)
+ sleep_on(&lp->event);
+
+ /* Ok the card is now stopping */
+
+ regs=inb(ioaddr+HOST_CTRL);
+ regs&=~HOST_CTRL_INTE;
+ outb(regs, ioaddr+HOST_CTRL);
+
+ mc32_flush_rx_ring(lp);
+ mc32_flush_tx_ring(lp);
+
+ dev->tbusy = 1;
+ dev->start = 0;
+
+ /* Update the statistics here. */
+
+ MOD_DEC_USE_COUNT;
+
+ return 0;
+
+}
+
+/*
+ * Get the current statistics.
+ * This may be called with the card open or closed.
+ */
+
+static struct net_device_stats *mc32_get_stats(struct device *dev)
+{
+ struct mc32_local *lp = (struct mc32_local *)dev->priv;
+ return &lp->net_stats;
+}
+
+/*
+ * Set or clear the multicast filter for this adaptor.
+ * num_addrs == -1 Promiscuous mode, receive all packets
+ * num_addrs == 0 Normal mode, clear multicast list
+ * num_addrs > 0 Multicast mode, receive normal and MC packets,
+ * and do best-effort filtering.
+ */
+static void mc32_set_multicast_list(struct device *dev)
+{
+ u16 filt;
+ if (dev->flags&IFF_PROMISC)
+ {
+ /* Enable promiscuous mode */
+ filt = 1;
+ mc32_command(dev, 0, &filt, 2);
+ }
+ else if((dev->flags&IFF_ALLMULTI) || dev->mc_count > 10)
+ {
+ dev->flags|=IFF_PROMISC;
+ filt = 1;
+ mc32_command(dev, 0, &filt, 2);
+ }
+ else if(dev->mc_count)
+ {
+ unsigned char block[62];
+ unsigned char *bp;
+ struct dev_mc_list *dmc=dev->mc_list;
+
+ int i;
+
+ filt = 0;
+ block[1]=0;
+ block[0]=dev->mc_count;
+ bp=block+2;
+
+ for(i=0;i<dev->mc_count;i++)
+ {
+ memcpy(bp, dmc->dmi_addr, 6);
+ bp+=6;
+ dmc=dmc->next;
+ }
+ mc32_command(dev, 2, block, 2+6*dev->mc_count);
+ mc32_command(dev, 0, &filt, 2);
+ }
+ else
+ {
+ filt = 0;
+ mc32_command(dev, 0, &filt, 2);
+ }
+}
+
+#ifdef MODULE
+
+static char devicename[9] = { 0, };
+static struct device this_device = {
+ devicename, /* will be inserted by linux/drivers/net/mc32_init.c */
+ 0, 0, 0, 0,
+ 0, 0, /* I/O address, IRQ */
+ 0, 0, 0, NULL, mc32_probe };
+
+int init_module(void)
+{
+ int result;
+
+ if ((result = register_netdev(&this_device)) != 0)
+ return result;
+
+ return 0;
+}
+
+void cleanup_module(void)
+{
+ int slot;
+
+ /* No need to check MOD_IN_USE, as sys_delete_module() checks. */
+ unregister_netdev(&this_device);
+
+ /*
+ * If we don't do this, we can't re-insmod it later.
+ * Release irq/dma here, when you have jumpered versions and
+ * allocate them in mc32_probe1().
+ */
+
+ if (this_device.priv)
+ {
+ struct mc32_local *lp=this_device.priv;
+ slot = lp->slot;
+ mca_mark_as_unused(slot);
+ mca_set_adapter_name(slot, NULL);
+ kfree_s(this_device.priv, sizeof(struct mc32_local));
+ }
+ free_irq(this_device.irq, &this_device);
+}
+
+#endif /* MODULE */
--- /dev/null
+/*
+ * 3COM "EtherLink MC/32" Descriptions
+ */
+
+/*
+ * Registers
+ */
+
+#define HOST_CMD 0
+
+#define HOST_STATUS 2
+#define HOST_STATUS_CRR (1<<6)
+#define HOST_STATUS_CWR (1<<5)
+
+#define HOST_CTRL 6
+#define HOST_CTRL_ATTN (1<<7)
+#define HOST_CTRL_RESET (1<<6)
+#define HOST_CTRL_INTE (1<<2)
+
+#define HOST_RAMPAGE 8
+
+struct skb_header
+{
+ u8 status __attribute((packed));
+ u8 control __attribute((packed));
+ u16 next __attribute((packed)); /* Do not change! */
+ u16 length __attribute((packed));
+ u32 data __attribute((packed));
+};
+
+#define STATUS_MASK 0x0F
+#define COMPLETED 0x80
+#define COMPLETED_OK 0x40
+#define BUFFER_BUSY 0x20
+
+#define CONTROL_EOP 0x80 /* End Of Packet */
+#define CONTROL_EL 0x40 /* End of List */
+
+
+#define MCA_MC32_ID 0x0041 /* Our MCA ident */
\ No newline at end of file
@@ -53,6+53,7 @@ if [ "$CONFIG_NET_ETHERNET" = "y" ]; then tristate '3c507 support' CONFIG_EL16
if [ "$CONFIG_MCA" = "y" ]; then
tristate '3c523 support' CONFIG_ELMC
+ tristate '3c527 support' CONFIG_ELMC_II
fi
fi
tristate '3c509/3c579 support' CONFIG_EL3
endif
endif
+ifeq ($(CONFIG_ELMC_II),y)
+L_OBJS += 3c527.o
+else
+ ifeq ($(CONFIG_ELMC_II),m)
+ M_OBJS += 3c527.o
+ endif
+endif
+
ifeq ($(CONFIG_EL3),y)
L_OBJS += 3c509.o
else
@@ -135,6+135,7 @@ struct lance_private { struct Linux_SBus_DMA *ledma; /* if set this points to ledma and arch=4m */
int burst_sizes; /* ledma SBus burst sizes */
#endif
+ struct timer_list multicast_timer;
};
#define TX_BUFFS_AVAIL ((lp->tx_old<=lp->tx_new)?\
@@ -527,6+528,7 @@ static int lance_close (struct device *dev)
dev->start = 0;
dev->tbusy = 1;
+ del_timer(&lp->multicast_timer);
/* Stop the card */
ll->rap = LE_CSR0;
@@ -706,12+708,20 @@ static void lance_set_multicast (struct device *dev) volatile struct lance_init_block *ib = lp->init_block;
volatile struct lance_regs *ll = lp->ll;
- while (dev->tbusy)
- schedule();
+ if (!dev->start)
+ return;
+
+ if (dev->tbusy) {
+ mod_timer(&lp->multicast_timer, jiffies + 2);
+ return;
+ }
set_bit (0, (void *) &dev->tbusy);
- while (lp->tx_old != lp->tx_new)
- schedule();
+ if (lp->tx_old != lp->tx_new) {
+ mod_timer(&lp->multicast_timer, jiffies + 4);
+ dev->tbusy = 0;
+ return;
+ }
ll->rap = LE_CSR0;
ll->rdp = LE_C0_STOP;
@@ -726,6+736,7 @@ static void lance_set_multicast (struct device *dev) load_csrs (lp);
init_restart_lance (lp);
dev->tbusy = 0;
+ mark_bh(NET_BH);
}
@@ -795,6+806,11 @@ __initfunc(int a2065_probe(struct device *dev)) dev->dma = 0;
ether_setup(dev);
+ init_timer(&priv->multicast_timer);
+ priv->multicast_timer.data = (unsigned long) dev;
+ priv->multicast_timer.function =
+ (void (*)(unsigned long)) &lance_set_multicast;
+
zorro_config_board(key, 0);
return(0);
}
-/* $Id: sunlance.c,v 1.84 1999/03/11 12:30:22 anton Exp $
+/* $Id: sunlance.c,v 1.85 1999/03/21 05:22:05 davem Exp $
* lance.c: Linux/Sparc/Lance driver
*
* Written 1995, 1996 by Miguel de Icaza
@@ -331,8+331,6 @@ static void lance_init_ring (struct device *dev) lp->rx_new = lp->tx_new = 0;
lp->rx_old = lp->tx_old = 0;
- ib->mode = 0;
-
/* Copy the ethernet address to the lance init block
* Note that on the sparc you need to swap the ethernet address.
* Note also we want the CPU ptr of the init_block here.
@@ -389,10+387,6 @@ static void lance_init_ring (struct device *dev) ib->tx_ptr = leptr;
if (ZERO)
printk ("TX ptr: %8.8x\n", leptr);
-
- /* Clear the multicast filter */
- ib->filter [0] = 0;
- ib->filter [1] = 0;
}
static int init_restart_lance (struct lance_private *lp)
@@ -673,6+667,7 @@ static int lance_open (struct device *dev) {
struct lance_private *lp = (struct lance_private *)dev->priv;
volatile struct lance_regs *ll = lp->ll;
+ volatile struct lance_init_block *ib = lp->init_block;
int status = 0;
last_dev = dev;
@@ -691,6+686,16 @@ static int lance_open (struct device *dev) if (lp->ledma)
lp->ledma->regs->dma_test = ((__u32) lp->init_block_dvma) & 0xff000000;
+ /* Set mode and clear multicast filter only at device open,
+ so that lance_init_ring() called at any error will not
+ forget multicast filters.
+
+ BTW it is common bug in all lance drivers! --ANK
+ */
+ ib->mode = 0;
+ ib->filter [0] = 0;
+ ib->filter [1] = 0;
+
lance_init_ring (dev);
load_csrs (lp);
@@ -747,6+752,7 @@ static int lance_close (struct device *dev)
dev->start = 0;
dev->tbusy = 1;
+ del_timer(&lp->multicast_timer);
/* Stop the card */
ll->rap = LE_CSR0;
@@ -916,14+922,31 @@ static void lance_set_multicast (struct device *dev) volatile struct lance_init_block *ib = lp->init_block;
volatile struct lance_regs *ll = lp->ll;
+ if (!dev->start)
+ return;
+
if (dev->tbusy) {
mod_timer(&lp->multicast_timer, jiffies + 2);
return;
}
+	/* This CANNOT be correct. The chip is running
+	   and dev->tbusy may change at any moment.
+	   It is useless to set it.
+
+	   Generally, the usage of dev->tbusy in this driver is
+	   completely wrong.
+
+	   I protected calls to this function
+	   with start_bh_atomic, so that set_multicast_list
+	   and hard_start_xmit are now serialized by the top level. --ANK
+
+	   The same is true of a2065.
+	 */
set_bit (0, (void *) &dev->tbusy);
if (lp->tx_old != lp->tx_new) {
mod_timer(&lp->multicast_timer, jiffies + 4);
+ dev->tbusy = 0;
return;
}
@@ -940,6+963,7 @@ static void lance_set_multicast (struct device *dev) load_csrs (lp);
init_restart_lance (lp);
dev->tbusy = 0;
+ mark_bh(NET_BH);
}
__initfunc(static int
@@ -636,6+636,28 @@ static int sparcaudio_mixer_ioctl(struct inode * inode, struct file * file,
k = arg;
+ if(cmd == SOUND_MIXER_INFO) {
+ audio_device_t tmp;
+ mixer_info info;
+ int retval = -EINVAL;
+
+ if(drv->ops->sunaudio_getdev) {
+ drv->ops->sunaudio_getdev(drv, &tmp);
+ memset(&info, 0, sizeof(info));
+ strncpy(info.id, tmp.name, sizeof(info.id));
+ strncpy(info.name, "Sparc Audio", sizeof(info.name));
+
+ /* XXX do this right... */
+ info.modify_counter = 0;
+
+ if(copy_to_user((char *)arg, &info, sizeof(info)))
+ retval = -EFAULT;
+ else
+ retval = 0;
+ }
+ return retval;
+ }
+
switch (cmd) {
case SOUND_MIXER_WRITE_RECLEV:
case SOUND_MIXER_WRITE_MIC:
/* --------------------------------------------------------------------- */
#undef OSS_DOCUMENTED_MIXER_SEMANTICS
+#define DBG(x) {}
+/*#define DBG(x) {x}*/
/* --------------------------------------------------------------------- */
@@ -1019,7+1021,7 @@ static int drain_dac1(struct es1370_state *s, int nonblock) tmo = (count * HZ) / dac1_samplerate[(s->ctrl & CTRL_WTSRSEL) >> CTRL_SH_WTSRSEL];
tmo >>= sample_shift[(s->sctrl & SCTRL_P1FMT) >> SCTRL_SH_P1FMT];
if (!schedule_timeout(tmo ? : 1) && tmo)
- printk(KERN_DEBUG "es1370: dma timed out??\n");
+ DBG(printk(KERN_DEBUG "es1370: dma timed out??\n");)
}
remove_wait_queue(&s->dma_dac1.wait, &wait);
current->state = TASK_RUNNING;
@@ -1054,7+1056,7 @@ static int drain_dac2(struct es1370_state *s, int nonblock) tmo = (count * HZ) / DAC2_DIVTOSR((s->ctrl & CTRL_PCLKDIV) >> CTRL_SH_PCLKDIV);
tmo >>= sample_shift[(s->sctrl & SCTRL_P2FMT) >> SCTRL_SH_P2FMT];
if (!schedule_timeout(tmo ? : 1) && tmo)
- printk(KERN_DEBUG "es1370: dma timed out??\n");
+ DBG(printk(KERN_DEBUG "es1370: dma timed out??\n");)
}
remove_wait_queue(&s->dma_dac2.wait, &wait);
current->state = TASK_RUNNING;
@@ -2189,7+2191,7 @@ static int es1370_midi_release(struct inode *inode, struct file *file) }
tmo = (count * HZ) / 3100;
if (!schedule_timeout(tmo ? : 1) && tmo)
- printk(KERN_DEBUG "es1370: midi timed out??\n");
+ DBG(printk(KERN_DEBUG "es1370: midi timed out??\n");)
}
remove_wait_queue(&s->midi.owait, &wait);
current->state = TASK_RUNNING;
@@ -84,7+84,7 @@ do_midi_msg(int synthno, unsigned char *msg, int mlen)
case 0xE0:
STORE(SEQ_BENDER(synthno, msg[0] & 0x0f,
- (msg[1] % 0x7f) | ((msg[2] & 0x7f) << 7)));
+ (msg[1] & 0x7f) | ((msg[2] & 0x7f) << 7)));
break;
default:
@@ -28,6+28,15 @@ Known bugs:
Please direct bug reports to: hjw@zvw.de
+Version 3.10
+------------
+
+- Changed partition checker to allow devices
+ with physical blocks != 512 bytes.
+
+- The partition checker now also ignores the
+ word at 0xd0 that Windows likes to write to.
+
Version 3.9
-----------
@@ -86,6+86,7 @@ void sysv_free_inode(struct inode * inode) return;
}
raw_inode = (struct sysv_inode *) bh->b_data + ((ino-1) & sb->sv_inodes_per_block_1);
+ clear_inode(inode);
lock_super(sb);
if (*sb->sv_sb_fic_count < sb->sv_fic_size)
*sv_sb_fic_inode(sb,(*sb->sv_sb_fic_count)++) = ino;
@@ -97,7+98,6 @@ void sysv_free_inode(struct inode * inode) mark_buffer_dirty(bh, 1);
unlock_super(sb);
brelse(bh);
- clear_inode(inode);
}
struct inode * sysv_new_inode(const struct inode * dir)
@@ -55,10+55,8 @@ void sysv_print_inode(struct inode * inode) }
#endif
-void sysv_put_inode(struct inode *inode)
+static void sysv_delete_inode(struct inode *inode)
{
- if (inode->i_nlink)
- return;
inode->i_size = 0;
sysv_truncate(inode);
sysv_free_inode(inode);
@@ -68,8+66,8 @@ void sysv_put_inode(struct inode *inode) static struct super_operations sysv_sops = {
sysv_read_inode,
sysv_write_inode,
- sysv_put_inode,
- NULL, /* delete_inode */
+ NULL, /* nothing special on put_inode() */
+ sysv_delete_inode,
sysv_notify_change,
sysv_put_super,
sysv_write_super,
@@ -576,6+576,7 @@ int sysv_link(struct dentry * old_dentry, struct inode * dir, oldinode->i_nlink++;
oldinode->i_ctime = CURRENT_TIME;
mark_inode_dirty(oldinode);
+ inode->i_count++;
d_instantiate(dentry, oldinode);
return 0;
}
-/* $Id: checksum.h,v 1.28 1998/04/17 02:37:25 davem Exp $ */
+/* $Id: checksum.h,v 1.29 1999/03/21 05:22:07 davem Exp $ */
#ifndef __SPARC_CHECKSUM_H
#define __SPARC_CHECKSUM_H
@@ -117,7+117,10 @@ csum_partial_copy_to_user(const char *src, char *dst, int len, return ret;
}
}
-
+
+#define HAVE_CSUM_COPY_USER
+#define csum_and_copy_to_user csum_partial_copy_to_user
+
/* ihl is always 5 or greater, almost always is 5, and iph is word aligned
* the majority of the time.
*/
@@ -99,6+99,7 @@ static int FDC2=-1; /* Routines unique to each controller type on a Sun. */
static unsigned char sun_82072_fd_inb(int port)
{
+ udelay(5);
switch(port & 7) {
default:
printk("floppy: Asked to read unknown port %d\n", port);
@@ -115,6+116,7 @@ static unsigned char sun_82072_fd_inb(int port)
static void sun_82072_fd_outb(unsigned char value, int port)
{
+ udelay(5);
switch(port & 7) {
default:
printk("floppy: Asked to write to unknown port %d\n", port);
@@ -150,6+152,7 @@ static void sun_82072_fd_outb(unsigned char value, int port)
static unsigned char sun_82077_fd_inb(int port)
{
+ udelay(5);
switch(port & 7) {
default:
printk("floppy: Asked to read unknown port %d\n", port);
@@ -167,6+170,7 @@ static unsigned char sun_82077_fd_inb(int port)
static void sun_82077_fd_outb(unsigned char value, int port)
{
+ udelay(5);
switch(port & 7) {
default:
printk("floppy: Asked to write to unknown port %d\n", port);
@@ -22,11+22,9 @@ BTFIXUPDEF_CALL(void, destroy_context, struct mm_struct *)
#define destroy_context(mm) BTFIXUP_CALL(destroy_context)(mm)
-/*
- * After we have set current->mm to a new value, this activates
+/* After we have set current->mm to a new value, this activates
* the context for the new mm so we see the new mappings.
- * XXX this presumably needs a sensible implementation - paulus.
*/
-#define activate_context(tsk) do { } while(0)
+#define activate_context(tsk) switch_to_context(tsk)
#endif /* !(__SPARC_MMU_CONTEXT_H) */
-/* $Id: floppy.h,v 1.17 1998/12/02 12:42:23 davem Exp $
+/* $Id: floppy.h,v 1.18 1999/03/21 10:51:38 davem Exp $
* asm-sparc64/floppy.h: Sparc specific parts of the Floppy driver.
*
* Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
@@ -114,6+114,7 @@ static int FDC2 = -1;
static unsigned char sun_82077_fd_inb(unsigned long port)
{
+ udelay(5);
switch(port & 7) {
default:
printk("floppy: Asked to read unknown port %lx\n", port);
@@ -131,6+132,7 @@ static unsigned char sun_82077_fd_inb(unsigned long port)
static void sun_82077_fd_outb(unsigned char value, unsigned long port)
{
+ udelay(5);
switch(port & 7) {
default:
printk("floppy: Asked to write to unknown port %lx\n", port);
@@ -343,14+343,15 @@ struct cdrom_blk #define CDC_IOCTLS 0x400 /* driver has non-standard ioctls */
#define CDC_DRIVE_STATUS 0x800 /* driver implements drive status */
-/* drive status possibilities used with the uniform CD-ROM driver */
-#define CDS_NO_INFO 0 /* if not implemented */
+/* drive status possibilities returned by CDROM_DRIVE_STATUS ioctl */
+#define CDS_NO_INFO 0 /* if not implemented */
#define CDS_NO_DISC 1
#define CDS_TRAY_OPEN 2
#define CDS_DRIVE_NOT_READY 3
#define CDS_DISC_OK 4
-/* disc status possibilities, other than CDS_NO_DISC and CDS_NO_INFO */
+/* return values for the CDROM_DISC_STATUS ioctl */
+/* can also return CDS_NO_[INFO|DISC], from above */
#define CDS_AUDIO 100
#define CDS_DATA_1 101
#define CDS_DATA_2 102
@@ -80,7+80,6 @@ struct in_addr { /* These need to appear somewhere around here */
#define IP_DEFAULT_MULTICAST_TTL 1
#define IP_DEFAULT_MULTICAST_LOOP 1
-#define IP_MAX_MEMBERSHIPS 20
/* Request struct for multicast socket ops */
#define MAX_HEADER (LL_MAX_HEADER + 48)
#endif
-struct neighbour;
-struct neigh_parms;
-struct sk_buff;
-
-/*
- * We tag multicasts with these structures.
- */
-
-struct dev_mc_list
-{
- struct dev_mc_list *next;
- __u8 dmi_addr[MAX_ADDR_LEN];
- unsigned char dmi_addrlen;
- int dmi_users;
- int dmi_gusers;
-};
-
-struct hh_cache
-{
- struct hh_cache *hh_next; /* Next entry */
- atomic_t hh_refcnt; /* number of users */
- unsigned short hh_type; /* protocol identifier, f.e ETH_P_IP */
- int (*hh_output)(struct sk_buff *skb);
- /* cached hardware header; allow for machine alignment needs. */
- unsigned long hh_data[16/sizeof(unsigned long)];
-};
-
/*
* Network device statistics. Akin to the 2.0 ether stats but
* with byte counters.
@@ -157,6+130,35 @@ extern const char *if_port_text[];
#include <linux/skbuff.h>
+struct neighbour;
+struct neigh_parms;
+struct sk_buff;
+
+/*
+ * We tag multicasts with these structures.
+ */
+
+struct dev_mc_list
+{
+ struct dev_mc_list *next;
+ __u8 dmi_addr[MAX_ADDR_LEN];
+ unsigned char dmi_addrlen;
+ int dmi_users;
+ int dmi_gusers;
+};
+
+struct hh_cache
+{
+ struct hh_cache *hh_next; /* Next entry */
+ atomic_t hh_refcnt; /* number of users */
+ unsigned short hh_type; /* protocol identifier, f.e ETH_P_IP */
+ int (*hh_output)(struct sk_buff *skb);
+ rwlock_t hh_lock;
+ /* cached hardware header; allow for machine alignment needs. */
+ unsigned long hh_data[16/sizeof(unsigned long)];
+};
+
+
/*
* The DEVICE structure.
* Actually, this whole structure is a big mistake. It mixes I/O
@@ -432,6+434,7 @@ extern int dev_mc_add(struct device *dev, void *addr, int alen, int newonly); extern void dev_mc_discard(struct device *dev);
extern void dev_set_promiscuity(struct device *dev, int inc);
extern void dev_set_allmulti(struct device *dev, int inc);
+extern void netdev_state_change(struct device *dev);
/* Load a device via the kmod */
extern void dev_load(const char *name);
extern void dev_mcast_init(void);
TCA_POLICE_TBF,
TCA_POLICE_RATE,
TCA_POLICE_PEAKRATE,
+ TCA_POLICE_AVRATE,
+ TCA_POLICE_RESULT
+#define TCA_POLICE_RESULT TCA_POLICE_RESULT
};
-#define TCA_POLICE_MAX TCA_POLICE_PEAKRATE
+#define TCA_POLICE_MAX TCA_POLICE_RESULT
/* U32 filters */
@@ -114,4+117,30 @@ struct tc_rsvp_pinfo __u8 tunnelhdr;
};
+/* ROUTE filter */
+
+enum
+{
+ TCA_ROUTE4_UNSPEC,
+ TCA_ROUTE4_CLASSID,
+ TCA_ROUTE4_TO,
+ TCA_ROUTE4_FROM,
+ TCA_ROUTE4_IIF,
+ TCA_ROUTE4_POLICE,
+};
+
+#define TCA_ROUTE4_MAX TCA_ROUTE4_POLICE
+
+
+/* FW filter */
+
+enum
+{
+ TCA_FW_UNSPEC,
+ TCA_FW_CLASSID,
+ TCA_FW_POLICE,
+};
+
+#define TCA_FW_MAX TCA_FW_POLICE
+
#endif
#define RTPROT_RA 9 /* RDISC/ND router advertisments */
#define RTPROT_MRT 10 /* Merit MRT */
#define RTPROT_ZEBRA 11 /* Zebra */
+#define RTPROT_BIRD 12 /* BIRD */
/* rtm_scope
enum
{
NET_UNIX_DESTROY_DELAY=1,
- NET_UNIX_DELETE_DELAY=2
+ NET_UNIX_DELETE_DELAY=2,
+ NET_UNIX_MAX_DGRAM_QLEN=3,
};
/* /proc/sys/net/ipv4 */
@@ -225,7+226,8 @@ enum { NET_IPV4_ROUTE_REDIRECT_SILENCE=11,
NET_IPV4_ROUTE_ERROR_COST=12,
NET_IPV4_ROUTE_ERROR_BURST=13,
- NET_IPV4_ROUTE_GC_ELASTICITY=14
+ NET_IPV4_ROUTE_GC_ELASTICITY=14,
+ NET_IPV4_ROUTE_MTU_EXPIRES=15
};
enum
@@ -265,7+267,8 @@ enum { NET_IPV6_ROUTE_GC_MIN_INTERVAL=4,
NET_IPV6_ROUTE_GC_TIMEOUT=5,
NET_IPV6_ROUTE_GC_INTERVAL=6,
- NET_IPV6_ROUTE_GC_ELASTICITY=7
+ NET_IPV6_ROUTE_GC_ELASTICITY=7,
+ NET_IPV6_ROUTE_MTU_EXPIRES=8
};
enum {
@@ -398,7+398,6 @@ extern void sysv_write_super(struct super_block *); extern void sysv_read_inode(struct inode *);
extern int sysv_notify_change(struct dentry *, struct iattr *);
extern void sysv_write_inode(struct inode *);
-extern void sysv_put_inode(struct inode *);
extern int sysv_statfs(struct super_block *, struct statfs *, int);
extern int sysv_sync_inode(struct inode *);
extern int sysv_sync_file(struct file *, struct dentry *);
@@ -53,8+53,9 @@ extern int addrconf_set_dstaddr(void *arg);
extern struct inet6_ifaddr * ipv6_chk_addr(struct in6_addr *addr,
struct device *dev, int nd);
-extern struct inet6_ifaddr * ipv6_get_saddr(struct dst_entry *dst,
- struct in6_addr *daddr);
+extern int ipv6_get_saddr(struct dst_entry *dst,
+ struct in6_addr *daddr,
+ struct in6_addr *saddr);
extern struct inet6_ifaddr * ipv6_get_lladdr(struct device *dev);
/*
@@ -36,9+36,10 @@ struct dst_entry struct device *dev;
int obsolete;
unsigned long lastuse;
+ unsigned long expires;
unsigned mxlock;
- unsigned window;
unsigned pmtu;
+ unsigned window;
unsigned rtt;
unsigned long rate_last; /* rate limiting for ICMP */
unsigned long rate_tokens;
@@ -98,6+99,19 @@ void dst_release(struct dst_entry * dst) atomic_dec(&dst->use);
}
+/* The following primitive should be used if and only if
+   a destination entry has just been removed from a location
+   accessed directly by hard irq.
+ */
+extern __inline__
+void dst_release_irqwait(struct dst_entry * dst)
+{
+ if (dst) {
+ synchronize_irq();
+ atomic_dec(&dst->use);
+ }
+}
+
extern __inline__
struct dst_entry * dst_check(struct dst_entry ** dst_p, u32 cookie)
{
@@ -152,6+166,17 @@ extern __inline__ void dst_link_failure(struct sk_buff *skb) if (dst && dst->ops && dst->ops->link_failure)
dst->ops->link_failure(skb);
}
+
+extern __inline__ void dst_set_expires(struct dst_entry *dst, int timeout)
+{
+ unsigned long expires = jiffies + timeout;
+
+ if (expires == 0)
+ expires = 1;
+
+ if (dst->expires == 0 || (long)(dst->expires - expires) > 0)
+ dst->expires = expires;
+}
#endif
#endif /* _NET_DST_H */
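The dst_set_expires() helper added above keeps the earliest pending deadline: a dst->expires of 0 means "no expiry set", and a computed deadline that happens to land on 0 is bumped to 1 so it can never be mistaken for "unset". A minimal stand-alone sketch of the same arithmetic (the function name is invented for illustration and is not part of the patch):

	/* Sketch: keep the earliest non-zero deadline, treating 0 as "unset". */
	static unsigned long earliest_expiry(unsigned long cur, unsigned long now,
					     int timeout)
	{
		unsigned long expires = now + timeout;

		if (expires == 0)		/* 0 is reserved for "no expiry" */
			expires = 1;
		if (cur == 0 || (long)(cur - expires) > 0)
			cur = expires;		/* new deadline is earlier */
		return cur;
	}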
@@ -147,13+147,14 @@ extern __inline__ int ip_finish_output(struct sk_buff *skb) skb->protocol = __constant_htons(ETH_P_IP);
if (hh) {
+ read_lock_irq(&hh->hh_lock);
memcpy(skb->data - 16, hh->hh_data, 16);
+ read_unlock_irq(&hh->hh_lock);
skb_push(skb, dev->hard_header_len);
return hh->hh_output(skb);
} else if (dst->neighbour)
return dst->neighbour->output(skb);
- printk(KERN_DEBUG "khm\n");
kfree_skb(skb);
return -EINVAL;
}
@@ -59,6+59,7 @@ struct rt6_info
#define rt6i_dev u.dst.dev
#define rt6i_nexthop u.dst.neighbour
+#define rt6i_expires u.dst.expires
struct fib6_node *rt6i_node;
@@ -67,7+68,6 @@ struct rt6_info u32 rt6i_flags;
u32 rt6i_metric;
u8 rt6i_hoplimit;
- unsigned long rt6i_expires;
atomic_t rt6i_ref;
union {
@@ -127,6+127,8 @@ struct fib_table int (*tb_flush)(struct fib_table *table);
int (*tb_get_info)(struct fib_table *table, char *buf,
int first, int count);
+ void (*tb_select_default)(struct fib_table *table,
+ const struct rt_key *key, struct fib_result *res);
unsigned char tb_data[0];
};
@@ -156,6+158,12 @@ extern __inline__ int fib_lookup(const struct rt_key *key, struct fib_result *re return 0;
}
+extern __inline__ void fib_select_default(const struct rt_key *key, struct fib_result *res)
+{
+ if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
+ main_table->tb_select_default(main_table, key, res);
+}
+
#else /* CONFIG_IP_MULTIPLE_TABLES */
#define local_table (fib_tables[RT_TABLE_LOCAL])
#define main_table (fib_tables[RT_TABLE_MAIN])
@@ -179,6+187,9 @@ extern __inline__ struct fib_table *fib_new_table(int id)
return fib_tables[id] ? : __fib_new_table(id);
}
+
+extern void fib_select_default(const struct rt_key *key, struct fib_result *res);
+
#endif /* CONFIG_IP_MULTIPLE_TABLES */
/* Exported by fib_frontend.c */
@@ -189,7+200,7 @@ extern int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *ar extern int inet_rtm_getroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg);
extern int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb);
extern int fib_validate_source(u32 src, u32 dst, u8 tos, int oif,
- struct device *dev, u32 *spec_dst);
+ struct device *dev, u32 *spec_dst, u32 *itag);
extern void fib_select_multipath(const struct rt_key *key, struct fib_result *res);
/* Exported by fib_semantics.c */
@@ -227,4+238,20 @@ extern u32 fib_rules_policy(u32 saddr, struct fib_result *res, unsigned *flags); extern void fib_rules_init(void);
#endif
+extern __inline__ void fib_combine_itag(u32 *itag, struct fib_result *res)
+{
+#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ u32 rtag;
+#endif
+ *itag = FIB_RES_NH(*res).nh_tclassid<<16;
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ rtag = fib_rules_tclass(res);
+ if (*itag == 0)
+ *itag = (rtag<<16);
+ *itag |= (rtag>>16);
+#endif
+#endif
+}
+
#endif _NET_FIB_H
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: ipv6.h,v 1.14 1998/10/03 09:36:45 davem Exp $
+ * $Id: ipv6.h,v 1.15 1999/03/21 05:22:16 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -186,14+186,6 @@ extern __inline__ int ipv6_addr_any(struct in6_addr *a) a->s6_addr32[2] | a->s6_addr32[3] ) == 0);
}
-extern __inline__ int gfp_any(void)
-{
- int pri = GFP_KERNEL;
- if (in_interrupt())
- pri = GFP_ATOMIC;
- return pri;
-}
-
/*
* Prototypes exported by ipv6
*/
@@ -226,7+226,6 @@ extern __inline__ void neigh_confirm(struct neighbour *neigh) neigh->confirmed = jiffies;
}
-
extern __inline__ struct neighbour *
neigh_lookup(struct neigh_table *tbl, const void *pkey, struct device *dev)
{
@@ -258,6+257,7 @@ extern __inline__ int neigh_event_send(struct neighbour *neigh, struct sk_buff * extern __inline__ void neigh_table_lock(struct neigh_table *tbl)
{
atomic_inc(&tbl->lock);
+ synchronize_bh();
}
extern __inline__ void neigh_table_unlock(struct neigh_table *tbl)
@@ -50,7+50,7 @@ struct tcf_proto_ops
unsigned long (*get)(struct tcf_proto*, u32 handle);
void (*put)(struct tcf_proto*, unsigned long);
- int (*change)(struct tcf_proto*, u32 handle, struct rtattr **, unsigned long *);
+ int (*change)(struct tcf_proto*, unsigned long, u32 handle, struct rtattr **, unsigned long *);
int (*delete)(struct tcf_proto*, unsigned long);
void (*walk)(struct tcf_proto*, struct tcf_walker *arg);
@@ -77,6+77,14 @@ extern __inline__ int tc_classify(struct sk_buff *skb, struct tcf_proto *tp, str return -1;
}
+extern __inline__ unsigned long cls_set_class(unsigned long *clp, unsigned long cl)
+{
+ net_serialize_enter();
+ cl = xchg(clp, cl);
+ net_serialize_leave();
+ return cl;
+}
+
extern int register_tcf_proto_ops(struct tcf_proto_ops *ops);
extern int unregister_tcf_proto_ops(struct tcf_proto_ops *ops);
#define PSCHED_JIFFIES 2
#define PSCHED_CPU 3
-#define PSCHED_CLOCK_SOURCE PSCHED_GETTIMEOFDAY
+#define PSCHED_CLOCK_SOURCE PSCHED_JIFFIES
#include <linux/pkt_sched.h>
#include <net/pkt_cls.h>
@@ -25,6+25,7 @@ struct Qdisc_class_ops {
/* Child qdisc manipulation */
int (*graft)(struct Qdisc *, unsigned long cl, struct Qdisc *, struct Qdisc **);
+ struct Qdisc * (*leaf)(struct Qdisc *, unsigned long cl);
/* Class manipulation routines */
unsigned long (*get)(struct Qdisc *, u32 classid);
@@ -35,7+36,7 @@ struct Qdisc_class_ops
/* Filter manipulation */
struct tcf_proto ** (*tcf_chain)(struct Qdisc *, unsigned long);
- unsigned long (*bind_tcf)(struct Qdisc *, u32 classid);
+ unsigned long (*bind_tcf)(struct Qdisc *, unsigned long, u32 classid);
void (*unbind_tcf)(struct Qdisc *, unsigned long);
/* rtnetlink specific */
@@ -57,6+58,7 @@ struct Qdisc_ops int (*init)(struct Qdisc *, struct rtattr *arg);
void (*reset)(struct Qdisc *);
void (*destroy)(struct Qdisc *);
+ int (*change)(struct Qdisc *, struct rtattr *arg);
int (*dump)(struct Qdisc *, struct sk_buff *);
};
@@ -74,13+76,12 @@ struct Qdisc int (*enqueue)(struct sk_buff *skb, struct Qdisc *dev);
struct sk_buff * (*dequeue)(struct Qdisc *dev);
unsigned flags;
-#define TCQ_F_DEFAULT 1
-#define TCQ_F_BUILTIN 2
+#define TCQ_F_BUILTIN 1
+#define TCQ_F_THROTTLED 2
struct Qdisc_ops *ops;
struct Qdisc *next;
u32 handle;
- u32 classid;
- struct Qdisc *parent;
+ atomic_t refcnt;
struct sk_buff_head q;
struct device *dev;
@@ -89,6+90,11 @@ struct Qdisc unsigned long tx_last;
int (*reshape_fail)(struct sk_buff *skb, struct Qdisc *q);
+	/* This field is deprecated, but it is still used by CBQ
+	 * and it will live until a better solution is invented.
+	 */
+ struct Qdisc *__parent;
+
char data[0];
};
@@ -129,6+135,15 @@ struct qdisc_rate_table which have fast and precise clock source, but it is too expensive.
*/
+/* General note about the internal clock.
+
+   Any clock source returns time intervals, measured in units
+   close to 1usec. With the PSCHED_GETTIMEOFDAY source it is precisely
+   microseconds, otherwise something close but different, chosen to minimize
+   arithmetic cost. The ratio usec/internal units, in the form
+   numerator/denominator, may be read from /proc/net/psched.
+ */
+
#if PSCHED_CLOCK_SOURCE == PSCHED_GETTIMEOFDAY
@@ -138,8+153,12 @@ typedef long psched_tdiff_t; #define PSCHED_GET_TIME(stamp) do_gettimeofday(&(stamp))
#define PSCHED_US2JIFFIE(usecs) (((usecs)+(1000000/HZ-1))/(1000000/HZ))
+#define PSCHED_EXPORTLIST EXPORT_SYMBOL(psched_tod_diff);
+
#else /* PSCHED_CLOCK_SOURCE != PSCHED_GETTIMEOFDAY */
+#define PSCHED_EXPORTLIST PSCHED_EXPORTLIST_1 PSCHED_EXPORTLIST_2
+
typedef u64 psched_time_t;
typedef long psched_tdiff_t;
@@ -147,10+166,6 @@ extern psched_time_t psched_time_base;
#if PSCHED_CLOCK_SOURCE == PSCHED_JIFFIES
-#define PSCHED_WATCHER unsigned long
-
-extern PSCHED_WATCHER psched_time_mark;
-
#if HZ == 100
#define PSCHED_JSCALE 13
#elif HZ == 1024
@@ -159,22+174,45 @@ extern PSCHED_WATCHER psched_time_mark; #define PSCHED_JSCALE 0
#endif
+#define PSCHED_EXPORTLIST_2
+
+#if ~0UL == 0xFFFFFFFF
+
+#define PSCHED_WATCHER unsigned long
+
+extern PSCHED_WATCHER psched_time_mark;
+
#define PSCHED_GET_TIME(stamp) ((stamp) = psched_time_base + (((unsigned long)(jiffies-psched_time_mark))<<PSCHED_JSCALE))
-#define PSCHED_US2JIFFIE(delay) ((delay)>>PSCHED_JSCALE)
+
+#define PSCHED_EXPORTLIST_1 EXPORT_SYMBOL(psched_time_base); \
+ EXPORT_SYMBOL(psched_time_mark);
+
+#else
+
+#define PSCHED_GET_TIME(stamp) ((stamp) = (jiffies<<PSCHED_JSCALE))
+
+#define PSCHED_EXPORTLIST_1
+
+#endif
+
+#define PSCHED_US2JIFFIE(delay) (((delay)+(1<<PSCHED_JSCALE)-1)>>PSCHED_JSCALE)
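To make the new rounding concrete: with the jiffies clock source and HZ == 100, PSCHED_JSCALE is 13, so one jiffy corresponds to 1<<13 = 8192 internal units (roughly 1.22 usec per unit at 10 ms per jiffy). The old macro truncated, so small delays collapsed to zero jiffies; the new one rounds up, so any non-zero delay costs at least one jiffy. A short worked example (editorial illustration, not part of the patch):

	delay = 1 unit:
		old: 1 >> 13                    = 0 jiffies
		new: (1 + 8192 - 1) >> 13       = 1 jiffy
	delay = 8191 units:
		old: 8191 >> 13                 = 0 jiffies
		new: (8191 + 8192 - 1) >> 13    = 1 jiffy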
#elif PSCHED_CLOCK_SOURCE == PSCHED_CPU
extern psched_tdiff_t psched_clock_per_hz;
extern int psched_clock_scale;
+#define PSCHED_EXPORTLIST_2 EXPORT_SYMBOL(psched_clock_per_hz); \
+ EXPORT_SYMBOL(psched_clock_scale);
+
#define PSCHED_US2JIFFIE(delay) (((delay)+psched_clock_per_hz-1)/psched_clock_per_hz)
#if CPU == 586 || CPU == 686
#define PSCHED_GET_TIME(stamp) \
-({ u32 hi, lo; \
- __asm__ __volatile__ (".byte 0x0f,0x31" :"=a" (lo), "=d" (hi)); \
- (stamp) = ((((u64)hi)<<32) + lo)>>psched_clock_scale; \
+({ u64 __cur; \
+ __asm__ __volatile__ (".byte 0x0f,0x31" :"=A" (__cur)); \
+ (stamp) = __cur>>psched_clock_scale; \
})
#elif defined (__alpha__)
@@ -191,6+229,9 @@ extern PSCHED_WATCHER psched_time_mark; (stamp) = (psched_time_base + __res)>>psched_clock_scale; \
})
+#define PSCHED_EXPORTLIST_1 EXPORT_SYMBOL(psched_time_base); \
+ EXPORT_SYMBOL(psched_time_mark);
+
#else
#error PSCHED_CLOCK_SOURCE=PSCHED_CPU is not supported on this arch.
@@ -219,13+260,15 @@ extern PSCHED_WATCHER psched_time_mark; __delta; \
})
+extern int psched_tod_diff(int delta_sec, int bound);
+
#define PSCHED_TDIFF_SAFE(tv1, tv2, bound, guard) \
({ \
int __delta_sec = (tv1).tv_sec - (tv2).tv_sec; \
int __delta = (tv1).tv_usec - (tv2).tv_usec; \
switch (__delta_sec) { \
default: \
- __delta = (bound); guard; break; \
+ __delta = psched_tod_diff(__delta_sec, bound); guard; break; \
case 2: \
__delta += 1000000; \
case 1: \
@@ -290,6+333,8 @@ struct tcf_police u32 index;
int action;
+ int result;
+ u32 ewma_rate;
u32 burst;
u32 mtu;
@@ -298,10+343,12 @@ struct tcf_police psched_time_t t_c;
struct qdisc_rate_table *R_tab;
struct qdisc_rate_table *P_tab;
+
+ struct tc_stats stats;
};
extern void tcf_police_destroy(struct tcf_police *p);
-extern struct tcf_police * tcf_police_locate(struct rtattr *rta);
+extern struct tcf_police * tcf_police_locate(struct rtattr *rta, struct rtattr *est);
extern int tcf_police_dump(struct sk_buff *skb, struct tcf_police *p);
extern int tcf_police(struct sk_buff *skb, struct tcf_police *p);
@@ -327,7+374,6 @@ void dev_deactivate(struct device *dev); void qdisc_reset(struct Qdisc *qdisc);
void qdisc_destroy(struct Qdisc *qdisc);
struct Qdisc * qdisc_create_dflt(struct device *dev, struct Qdisc_ops *ops);
-struct Qdisc * dev_set_scheduler(struct device *dev, struct Qdisc *qdisc);
int qdisc_new_estimator(struct tc_stats *stats, struct rtattr *opt);
void qdisc_kill_estimator(struct tc_stats *stats);
struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab);
@@ -57,6+57,9 @@ struct rt_key __u32 src;
int iif;
int oif;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ __u32 fwmark;
+#endif
__u8 tos;
__u8 scope;
};
@@ -93,6+96,16 @@ struct rtable
extern struct rtable *rt_hash_table[RT_HASH_DIVISOR];
+struct ip_rt_acct
+{
+ __u32 o_bytes;
+ __u32 o_packets;
+ __u32 i_bytes;
+ __u32 i_packets;
+};
+
+extern struct ip_rt_acct ip_rt_acct[256];
+
extern void ip_rt_init(void);
extern void ip_rt_redirect(u32 old_gw, u32 dst, u32 new_gw,
u32 src, u8 tos, struct device *dev);
@@ -918,6+918,18 @@ extern void net_delete_timer (struct sock *); extern void net_reset_timer (struct sock *, int, unsigned long);
extern void net_timer (unsigned long);
+extern __inline__ int gfp_any(void)
+{
+ return in_interrupt() ? GFP_ATOMIC : GFP_KERNEL;
+}
+
+#ifdef __SMP__
+#define net_serialize_enter() start_bh_atomic()
+#define net_serialize_leave() end_bh_atomic()
+#else
+#define net_serialize_enter() barrier();
+#define net_serialize_leave() barrier();
+#endif
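The rest of this patch uses these macros in one recurring shape: unlinking an entry from a singly linked list that bottom halves may be walking concurrently. A minimal sketch of that pattern, using a hypothetical struct foo list (the names are placeholders, not taken from any file below):

	/* Unlink 'victim' from a NULL-terminated singly linked list so that
	 * a bottom half walking the list never sees a half-updated chain.
	 * 'struct foo' and 'foo_list' are hypothetical.
	 */
	struct foo { struct foo *next; };
	static struct foo *foo_list;

	static void foo_unlink(struct foo *victim)
	{
		struct foo **fp;

		for (fp = &foo_list; *fp; fp = &(*fp)->next) {
			if (*fp == victim) {
				net_serialize_enter();	/* block BHs on SMP */
				*fp = victim->next;
				net_serialize_leave();
				victim->next = NULL;
				break;
			}
		}
	}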
/*
* Enable debug/info messages
@@ -265,7+265,6 @@ printk("SIG queue (%s:%d): %d ", t->comm, t->pid, sig); && ((sig != SIGCONT) || (current->session != t->session))
&& (current->euid ^ t->suid) && (current->euid ^ t->uid)
&& (current->uid ^ t->suid) && (current->uid ^ t->uid)
- && (cap_issubset(t->cap_permitted, current->cap_permitted))
&& !capable(CAP_SYS_ADMIN))
goto out_nolock;
@@ -62,9+62,9 @@ asmlinkage void do_bottom_half(void) if (hardirq_trylock(cpu)) {
__sti();
run_bottom_halves();
+ __cli();
hardirq_endlock(cpu);
}
- __cli();
softirq_endlock(cpu);
}
}
@@ -232,7+232,9 @@ void dev_remove_pack(struct packet_type *pt) {
if(pt==(*pt1))
{
+ net_serialize_enter();
*pt1=pt->next;
+ net_serialize_leave();
#ifdef CONFIG_NET_FASTROUTE
if (pt->data)
netdev_fastroute_obstacles--;
@@ -328,6+330,12 @@ struct device *dev_alloc(const char *name, int *err) return dev;
}
+void netdev_state_change(struct device *dev)
+{
+ if (dev->flags&IFF_UP)
+ notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
+}
+
/*
* Find and possibly load an interface.
@@ -422,7+430,7 @@ static __inline__ void dev_do_clear_fastroute(struct device *dev) int i;
for (i=0; i<=NETDEV_FASTROUTE_HMASK; i++)
- dst_release(xchg(dev->fastpath+i, NULL));
+ dst_release_irqwait(xchg(dev->fastpath+i, NULL));
}
}
@@ -895,22+903,6 @@ void net_bh(void) #endif
/*
- * Fetch the packet protocol ID.
- */
-
- type = skb->protocol;
-
-
-#ifdef CONFIG_BRIDGE
- /*
- * If we are bridging then pass the frame up to the
- * bridging code (if this protocol is to be bridged).
- * If it is bridged then move on
- */
- handle_bridge(skb, type);
-#endif
-
- /*
* Bump the pointer to the next structure.
*
* On entry to the protocol layer. skb->data and
@@ -927,11+919,26 @@ void net_bh(void) }
/*
+ * Fetch the packet protocol ID.
+ */
+
+ type = skb->protocol;
+
+#ifdef CONFIG_BRIDGE
+ /*
+ * If we are bridging then pass the frame up to the
+ * bridging code (if this protocol is to be bridged).
+ * If it is bridged then move on
+ */
+ handle_bridge(skb, type);
+#endif
+
+ /*
* We got a packet ID. Now loop over the "known protocols"
* list. There are two lists. The ptype_all list of taps (normally empty)
* and the main protocol list which is hashed perfectly for normal protocols.
*/
-
+
pt_prev = NULL;
for (ptype = ptype_all; ptype!=NULL; ptype=ptype->next)
{
@@ -1536,8+1543,7 @@ static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) return 0;
case SIOCSIFTXQLEN:
- /* Why <2? 0 and 1 are valid values. --ANK (980807) */
- if(/*ifr->ifr_qlen<2 ||*/ ifr->ifr_qlen>1024)
+ if(ifr->ifr_qlen<0)
return -EINVAL;
dev->tx_queue_len = ifr->ifr_qlen;
return 0;
@@ -1817,8+1823,11 @@ int unregister_netdevice(struct device *dev) /* And unlink it from device chain. */
for (dp = &dev_base; (d=*dp) != NULL; dp=&d->next) {
if (d == dev) {
+ net_serialize_enter();
*dp = d->next;
+ net_serialize_leave();
d->next = NULL;
+
if (dev->destructor)
dev->destructor(dev);
return 0;
@@ -1977,7+1986,9 @@ __initfunc(int net_dev_init(void)) /*
* It failed to come up. Unhook it.
*/
+ net_serialize_enter();
*dp = dev->next;
+ net_serialize_leave();
}
else
{
* protocols without doing damage to the protocols when it deletes the
* entries. It also helps IP as it tracks overlapping maps.
*
- * BUGGGG! IPv6 calls dev_mac_add/delete from BH, it means
- * that all the functions in this file are racy. [NOT FIXED] --ANK
+ * Device mc lists are changed by bh at least if IPv6 is enabled,
+ * so they must be bh protected.
*/
-
/*
* Update the multicast list into the physical NIC controller.
@@ -77,11+76,13 @@ void dev_mc_upload(struct device *dev) /*
* Devices with no set multicast don't get set
*/
-
+
if(dev->set_multicast_list==NULL)
return;
-
+
+ start_bh_atomic();
dev->set_multicast_list(dev);
+ end_bh_atomic();
}
/*
@@ -90,8+91,10 @@ void dev_mc_upload(struct device *dev)
int dev_mc_delete(struct device *dev, void *addr, int alen, int glbl)
{
+ int err = 0;
struct dev_mc_list *dmi, **dmip;
+ start_bh_atomic();
for (dmip=&dev->mc_list; (dmi=*dmip)!=NULL; dmip=&dmi->next) {
/*
* Find the entry we want to delete. The device could
@@ -102,10+105,10 @@ int dev_mc_delete(struct device *dev, void *addr, int alen, int glbl) int old_glbl = dmi->dmi_gusers;
dmi->dmi_gusers = 0;
if (old_glbl == 0)
- return -ENOENT;
+ break;
}
if(--dmi->dmi_users)
- return 0;
+ goto done;
/*
* Last user. So delete the entry.
@@ -117,11+120,15 @@ int dev_mc_delete(struct device *dev, void *addr, int alen, int glbl) * We have altered the list, so the card
* loaded filter is now wrong. Fix it
*/
+ end_bh_atomic();
dev_mc_upload(dev);
return 0;
}
}
- return -ENOENT;
+ err = -ENOENT;
+done:
+ end_bh_atomic();
+ return err;
}
/*
@@ -130,30+137,27 @@ int dev_mc_delete(struct device *dev, void *addr, int alen, int glbl)
int dev_mc_add(struct device *dev, void *addr, int alen, int glbl)
{
- struct dev_mc_list *dmi;
+ int err = 0;
+ struct dev_mc_list *dmi, *dmi1;
+
+ dmi1 = (struct dev_mc_list *)kmalloc(sizeof(*dmi), gfp_any());
+ start_bh_atomic();
for(dmi=dev->mc_list; dmi!=NULL; dmi=dmi->next) {
if (memcmp(dmi->dmi_addr,addr,dmi->dmi_addrlen)==0 && dmi->dmi_addrlen==alen) {
if (glbl) {
int old_glbl = dmi->dmi_gusers;
dmi->dmi_gusers = 1;
if (old_glbl)
- return 0;
+ goto done;
}
dmi->dmi_users++;
- return 0;
+ goto done;
}
}
- /* GFP_ATOMIC!! It is used by IPv6 from interrupt,
- when new address arrives.
-
- Particularly, it means that this part of code is weirdly
- racy, and needs numerous *_bh_atomic --ANK
- */
- dmi=(struct dev_mc_list *)kmalloc(sizeof(*dmi), GFP_ATOMIC);
- if (dmi==NULL)
- return -ENOBUFS;
+	if ((dmi=dmi1)==NULL) {
+		end_bh_atomic();
+		return -ENOMEM;
+	}
memcpy(dmi->dmi_addr, addr, alen);
dmi->dmi_addrlen=alen;
dmi->next=dev->mc_list;
@@ -161,8+165,15 @@ int dev_mc_add(struct device *dev, void *addr, int alen, int glbl) dmi->dmi_gusers=glbl ? 1 : 0;
dev->mc_list=dmi;
dev->mc_count++;
+ end_bh_atomic();
dev_mc_upload(dev);
return 0;
+
+done:
+ end_bh_atomic();
+ if (dmi1)
+ kfree(dmi1);
+ return err;
}
/*
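The dev_mc_add() rework above follows a pre-allocate-then-lock discipline: the new list entry is allocated with gfp_any() before start_bh_atomic(), so nothing sleeps inside the BH-atomic section, and the spare buffer is freed on the paths that turned out not to need it. A condensed sketch of the same shape (hypothetical names, illustration only):

	spare = kmalloc(sizeof(*spare), gfp_any());	/* may sleep if not in irq */
	start_bh_atomic();
	for (e = list_head; e; e = e->next)
		if (entry_matches(e)) {
			e->users++;
			goto done;			/* spare not needed */
		}
	if ((e = spare) == NULL) {
		end_bh_atomic();
		return -ENOMEM;
	}
	/* ... fill in and link the new entry ... */
	end_bh_atomic();
	return 0;
done:
	end_bh_atomic();
	if (spare)
		kfree(spare);
	return 0;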
@@ -171,6+182,7 @@ int dev_mc_add(struct device *dev, void *addr, int alen, int glbl)
void dev_mc_discard(struct device *dev)
{
+ start_bh_atomic();
while (dev->mc_list!=NULL) {
struct dev_mc_list *tmp=dev->mc_list;
dev->mc_list=tmp->next;
@@ -179,6+191,7 @@ void dev_mc_discard(struct device *dev) kfree_s(tmp,sizeof(*tmp));
}
dev->mc_count=0;
+ end_bh_atomic();
}
#ifdef CONFIG_PROC_FS
@@ -189,7+202,9 @@ static int dev_mc_read_proc(char *buffer, char **start, off_t offset, struct dev_mc_list *m;
int len=0;
struct device *dev;
-
+
+ start_bh_atomic();
+
for (dev = dev_base; dev; dev = dev->next) {
for (m = dev->mc_list; m; m = m->next) {
int i;
@@ -214,10+229,13 @@ static int dev_mc_read_proc(char *buffer, char **start, off_t offset, *eof = 1;
done:
+ end_bh_atomic();
*start=buffer+(offset-begin);
len-=(offset-begin);
if(len>length)
len=length;
+ if(len<0)
+ len=0;
return len;
}
#endif
@@ -441,8+441,9 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
if ((err = sk_chk_filter(fp->insns, fp->len))==0) {
struct sk_filter *old_fp = sk->filter;
+ net_serialize_enter();
sk->filter = fp;
- wmb();
+ net_serialize_leave();
fp = old_fp;
}
#endif
#include <net/neighbour.h>
#include <net/dst.h>
+#include <net/sock.h>
#include <linux/rtnetlink.h>
+/*
+ NOTE. The most unpleasant question is serialization of
+ accesses to resolved addresses. The problem is that addresses
+ are modified by bh, but they are referenced from a normal
+ kernel thread. Before today no locking was done.
+ My reasoning was that a corrupted address token would be copied
+ to a packet only with cosmologically small probability
+ (it is even difficult to estimate such a small number)
+ and that it is very silly to waste cycles in the fast path to lock them.
+
+ But now I have changed my mind, though not because the previous
+ statement is wrong. Actually, neigh->ha MAY BE not an opaque
+ byte array, but a reference to some private data. In this case
+ even a negligible corruption probability becomes a bug.
+
+ - the hh cache is protected by a rwlock. It assumes that
+   the hh cache update procedure is short and fast, and that
+   read_lock is cheaper than start_bh_atomic().
+ - ha tokens, saved in neighbour entries, are protected
+   by bh_atomic().
+ - no protection is made for /proc reading. It is OK, because
+   /proc is broken by design in any case, and
+   corrupted output is normal behaviour there.
+
+ --ANK (981025)
+ */
+
#define NEIGH_DEBUG 1
#define NEIGH_PRINTK(x...) printk(x)
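The locking scheme described in the note above appears concretely elsewhere in this patch: ip_finish_output() takes hh_lock for reading around the memcpy() of the cached header, and neigh_update_hhs() takes it for writing around header_cache_update(). A condensed sketch of the two sides, assuming the hh_cache layout from the netdevice.h hunk earlier:

	/* Reader side: copy the cached hardware header under the read lock. */
	read_lock_irq(&hh->hh_lock);
	memcpy(skb->data - 16, hh->hh_data, 16);
	read_unlock_irq(&hh->hh_lock);

	/* Writer side: refresh the cached header under the write lock. */
	write_lock_irq(&hh->hh_lock);
	update(hh, neigh->dev, neigh->ha);
	write_unlock_irq(&hh->hh_lock);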
@@ -48,6+76,7 @@ static void neigh_timer_handler(unsigned long arg); #ifdef CONFIG_ARPD
static void neigh_app_notify(struct neighbour *n);
#endif
+static int pneigh_ifdown(struct neigh_table *tbl, struct device *dev);
static int neigh_glbl_allocs;
static struct neigh_table *neigh_tables;
@@ -83,8+112,20 @@ static int neigh_forced_gc(struct neigh_table *tbl)
np = &tbl->hash_buckets[i];
while ((n = *np) != NULL) {
+	/* A neighbour record may be discarded if:
+	   - nobody refers to it.
+	   - it is not permanent
+	   - (NEW and probably wrong)
+	     INCOMPLETE entries are kept at least for
+	     n->parms->retrans_time, otherwise we could
+	     flood the network with resolution requests.
+	     It is not clear which is worse: table overflow
+	     or flooding.
+	 */
if (atomic_read(&n->refcnt) == 0 &&
- !(n->nud_state&NUD_PERMANENT)) {
+ !(n->nud_state&NUD_PERMANENT) &&
+ (n->nud_state != NUD_INCOMPLETE ||
+ jiffies - n->used > n->parms->retrans_time)) {
*np = n->next;
n->tbl = NULL;
tbl->entries--;
@@ -149,6+190,7 @@ int neigh_ifdown(struct neigh_table *tbl, struct device *dev)
del_timer(&tbl->proxy_timer);
skb_queue_purge(&tbl->proxy_queue);
+ pneigh_ifdown(tbl, dev);
end_bh_atomic();
return 0;
}
@@ -295,7+337,9 @@ int pneigh_delete(struct neigh_table *tbl, const void *pkey, struct device *dev)
for (np = &tbl->phash_buckets[hash_val]; (n=*np) != NULL; np = &n->next) {
if (memcmp(n->key, pkey, key_len) == 0 && n->dev == dev) {
+ net_serialize_enter();
*np = n->next;
+ net_serialize_leave();
if (tbl->pdestructor)
tbl->pdestructor(n);
kfree(n);
@@ -305,6+349,30 @@ int pneigh_delete(struct neigh_table *tbl, const void *pkey, struct device *dev) return -ENOENT;
}
+static int pneigh_ifdown(struct neigh_table *tbl, struct device *dev)
+{
+ struct pneigh_entry *n, **np;
+ u32 h;
+
+ for (h=0; h<=PNEIGH_HASHMASK; h++) {
+ np = &tbl->phash_buckets[h];
+ for (np = &tbl->phash_buckets[h]; (n=*np) != NULL; np = &n->next) {
+ if (n->dev == dev || dev == NULL) {
+ net_serialize_enter();
+ *np = n->next;
+ net_serialize_leave();
+ if (tbl->pdestructor)
+ tbl->pdestructor(n);
+ kfree(n);
+ continue;
+ }
+ np = &n->next;
+ }
+ }
+ return -ENOENT;
+}
+
+
/*
* neighbour must already be out of the table;
*
@@ -516,11+584,11 @@ static void neigh_timer_handler(unsigned long arg) return;
}
- neigh->probes++;
neigh->timer.expires = now + neigh->parms->retrans_time;
add_timer(&neigh->timer);
neigh->ops->solicit(neigh, skb_peek(&neigh->arp_queue));
+ neigh->probes++;
}
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
@@ -542,6+610,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) add_timer(&neigh->timer);
neigh->ops->solicit(neigh, skb);
+ neigh->probes++;
} else {
neigh->nud_state = NUD_FAILED;
if (skb)
@@ -581,8+650,11 @@ static __inline__ void neigh_update_hhs(struct neighbour *neigh) neigh->dev->header_cache_update;
if (update) {
- for (hh=neigh->hh; hh; hh=hh->hh_next)
+ for (hh=neigh->hh; hh; hh=hh->hh_next) {
+ write_lock_irq(&hh->hh_lock);
update(hh, neigh->dev, neigh->ha);
+ write_unlock_irq(&hh->hh_lock);
+ }
}
}
@@ -655,7+727,7 @@ int neigh_update(struct neighbour *neigh, u8 *lladdr, u8 new, int override, int del_timer(&neigh->timer);
neigh->nud_state = new;
if (lladdr != neigh->ha) {
- memcpy(neigh->ha, lladdr, dev->addr_len);
+ memcpy(&neigh->ha, lladdr, dev->addr_len);
neigh_update_hhs(neigh);
neigh->confirmed = jiffies - (neigh->parms->base_reachable_time<<1);
#ifdef CONFIG_ARPD
@@ -764,14+836,20 @@ int neigh_resolve_output(struct sk_buff *skb) __skb_pull(skb, skb->nh.raw - skb->data);
if (neigh_event_send(neigh, skb) == 0) {
+ int err;
struct device *dev = neigh->dev;
- if (dev->hard_header_cache) {
+ if (dev->hard_header_cache && dst->hh == NULL) {
start_bh_atomic();
if (dst->hh == NULL)
neigh_hh_init(neigh, dst, dst->ops->protocol);
+ err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len);
+ end_bh_atomic();
+ } else {
+ start_bh_atomic();
+ err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len);
end_bh_atomic();
}
- if (dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len) >= 0)
+ if (err >= 0)
return neigh->ops->queue_xmit(skb);
kfree_skb(skb);
return -EINVAL;
@@ -788,13+866,17 @@ discard:
int neigh_connected_output(struct sk_buff *skb)
{
+ int err;
struct dst_entry *dst = skb->dst;
struct neighbour *neigh = dst->neighbour;
struct device *dev = neigh->dev;
__skb_pull(skb, skb->nh.raw - skb->data);
- if (dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len) >= 0)
+ start_bh_atomic();
+ err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len);
+ end_bh_atomic();
+ if (err >= 0)
return neigh->ops->queue_xmit(skb);
kfree_skb(skb);
return -EINVAL;
@@ -868,7+950,6 @@ struct neigh_parms *neigh_parms_alloc(struct device *dev, struct neigh_table *tb }
}
p->next = tbl->parms.next;
- /* ATOMIC_SET */
tbl->parms.next = p;
}
return p;
@@ -882,8+963,9 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) return;
for (p = &tbl->parms.next; *p; p = &(*p)->next) {
if (*p == parms) {
- /* ATOMIC_SET */
+ net_serialize_enter();
*p = parms->next;
+ net_serialize_leave();
#ifdef CONFIG_SYSCTL
neigh_sysctl_unregister(parms);
#endif
@@ -926,14+1008,15 @@ int neigh_table_clear(struct neigh_table *tbl) del_timer(&tbl->gc_timer);
del_timer(&tbl->proxy_timer);
skb_queue_purge(&tbl->proxy_queue);
- if (tbl->entries)
- neigh_ifdown(tbl, NULL);
+ neigh_ifdown(tbl, NULL);
end_bh_atomic();
if (tbl->entries)
printk(KERN_CRIT "neighbour leakage\n");
for (tp = &neigh_tables; *tp; tp = &(*tp)->next) {
if (*tp == tbl) {
+ net_serialize_enter();
*tp = tbl->next;
+ net_serialize_leave();
break;
}
}
@@ -976,7+1059,7 @@ int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) return -EINVAL;
start_bh_atomic();
- n = neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev);
+ n = __neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 0);
if (n) {
err = neigh_update(n, NULL, NUD_FAILED, 1, 0);
neigh_release(n);
@@ -1020,7+1103,7 @@ int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) nda[NDA_LLADDR-1]->rta_len != RTA_LENGTH(dev->addr_len))
return -EINVAL;
start_bh_atomic();
- n = neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev);
+ n = __neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 0);
if (n) {
if (nlh->nlmsg_flags&NLM_F_EXCL)
err = -EEXIST;
@@ -1091,7+1174,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct for (h=0; h <= NEIGH_HASHMASK; h++) {
if (h < s_h) continue;
if (h > s_h)
- memset(&cb->args[2], 0, sizeof(cb->args) - 2*sizeof(int));
+ memset(&cb->args[2], 0, sizeof(cb->args) - 2*sizeof(cb->args[0]));
start_bh_atomic();
for (n = tbl->hash_buckets[h], idx = 0; n;
n = n->next, idx++) {
@@ -1125,7+1208,7 @@ int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) if (family && tbl->family != family)
continue;
if (t > s_t)
- memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int));
+ memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
if (neigh_dump_table(tbl, skb, cb) < 0)
break;
}
#include <linux/inet.h>
#include <linux/netdevice.h>
-#include <net/pkt_sched.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/sock.h>
+#include <net/pkt_sched.h>
atomic_t rtnl_rlockct;
struct wait_queue *rtnl_wait;
* handler for protocols to use and generic option handler.
*
*
- * Version: $Id: sock.c,v 1.76 1999/02/23 08:12:29 davem Exp $
+ * Version: $Id: sock.c,v 1.77 1999/03/21 05:22:26 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -251,12+251,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname, break;
case SO_PRIORITY:
- if (val >= 0 && val <= 7)
- {
- if(val==7 && !capable(CAP_NET_ADMIN))
- return -EPERM;
+ if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
sk->priority = val;
- }
+ else
+ return(-EPERM);
break;
case SO_LINGER:
@@ -348,8+346,9 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
filter = sk->filter;
+ net_serialize_enter();
sk->filter = NULL;
- wmb();
+ net_serialize_leave();
if (filter)
sk_filter_release(sk, filter);
@@ -515,6+514,16 @@ void sk_free(struct sock *sk) if (atomic_read(&sk->omem_alloc))
printk(KERN_DEBUG "sk_free: optmem leakage (%d bytes) detected.\n", atomic_read(&sk->omem_alloc));
+#ifdef CONFIG_FILTER
+ if (sk->filter) {
+ sk_filter_release(sk, sk->filter);
+ sk->filter = NULL;
+ }
+#endif
+
+ if (atomic_read(&sk->omem_alloc))
+ printk(KERN_DEBUG "sk_free: optmem leakage (%d bytes) detected.\n", atomic_read(&sk->omem_alloc));
+
kmem_cache_free(sk_cachep, sk);
}
@@ -32,6+32,9 @@ if [ "$CONFIG_FIREWALL" = "y" ]; then fi
fi
bool 'IP: always defragment (required for masquerading)' CONFIG_IP_ALWAYS_DEFRAG
+ if [ "$CONFIG_IP_MULTIPLE_TABLES" = "y" ]; then
+ bool 'IP: use FWMARK value as routing key' CONFIG_IP_ROUTE_FWMARK
+ fi
fi
fi
if [ "$CONFIG_IP_FIREWALL" = "y" ]; then
*
* PF_INET protocol family socket handler.
*
- * Version: $Id: af_inet.c,v 1.84 1999/03/15 22:16:47 davem Exp $
+ * Version: $Id: af_inet.c,v 1.85 1999/03/21 05:22:28 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -176,8+176,6 @@ static __inline__ void kill_sk_now(struct sock *sk) if(sk->opt)
kfree(sk->opt);
dst_release(sk->dst_cache);
- if (atomic_read(&sk->omem_alloc))
- printk(KERN_DEBUG "kill_sk_now: optmem leakage (%d bytes) detected.\n", atomic_read(&sk->omem_alloc));
sk_free(sk);
}
/* linux/net/inet/arp.c
*
- * Version: $Id: arp.c,v 1.76 1999/03/09 14:10:07 davem Exp $
+ * Version: $Id: arp.c,v 1.77 1999/03/21 05:22:30 davem Exp $
*
* Copyright (C) 1994 by Florian La Roche
*
@@ -294,7+294,7 @@ static int arp_constructor(struct neighbour *neigh)
static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
{
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
+ dst_link_failure(skb);
kfree_skb(skb);
}
@@ -401,8+401,12 @@ int arp_bind_neighbour(struct dst_entry *dst)
if (dev == NULL)
return 0;
- if (dst->neighbour == NULL)
- dst->neighbour = __neigh_lookup(&arp_tbl, &((struct rtable*)dst)->rt_gateway, dev, 1);
+ if (dst->neighbour == NULL) {
+ u32 nexthop = ((struct rtable*)dst)->rt_gateway;
+ if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT))
+ nexthop = 0;
+ dst->neighbour = __neigh_lookup(&arp_tbl, &nexthop, dev, 1);
+ }
return (dst->neighbour != NULL);
}
/*
* NET3 IP device support routines.
*
- * Version: $Id: devinet.c,v 1.25 1999/01/04 20:14:33 davem Exp $
+ * Version: $Id: devinet.c,v 1.26 1999/03/21 05:22:31 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -138,7+138,9 @@ static void inetdev_destroy(struct in_device *in_dev) #ifdef CONFIG_SYSCTL
devinet_sysctl_unregister(&in_dev->cnf);
#endif
+ net_serialize_enter();
in_dev->dev->ip_ptr = NULL;
+ net_serialize_leave();
neigh_parms_release(&arp_tbl, in_dev->arp_parms);
kfree(in_dev);
}
@@ -172,7+174,10 @@ inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy) ifap1 = &ifa->ifa_next;
continue;
}
+ net_serialize_enter();
*ifap1 = ifa->ifa_next;
+ net_serialize_leave();
+
rtmsg_ifa(RTM_DELADDR, ifa);
notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa);
inet_free_ifa(ifa);
@@ -181,8+186,9 @@ inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy)
/* 2. Unlink it */
+ net_serialize_enter();
*ifap = ifa1->ifa_next;
-
+ net_serialize_leave();
/* 3. Announce address deletion */
@@ -238,8+244,9 @@ inet_insert_ifa(struct in_device *in_dev, struct in_ifaddr *ifa) }
ifa->ifa_next = *ifap;
- /* ATOMIC_SET */
+ net_serialize_enter();
*ifap = ifa;
+ net_serialize_leave();
/* Send message first, then call notifier.
Notifier will trigger FIB update, so that
@@ -650,8+657,25 @@ u32 inet_select_addr(struct device *dev, u32 dst, int scope) if (!dst || inet_ifa_match(dst, ifa))
return addr;
} endfor_ifa(in_dev);
+
+ if (addr || scope >= RT_SCOPE_LINK)
+ return addr;
- return addr;
+	/* Non-loopback addresses on the loopback device should be preferred
+	   in this case. It is important that lo is the first interface
+	   in the dev_base list.
+	 */
+ for (dev=dev_base; dev; dev=dev->next) {
+ if ((in_dev=dev->ip_ptr) == NULL)
+ continue;
+
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_scope <= scope)
+ return ifa->ifa_local;
+ } endfor_ifa(in_dev);
+ }
+
+ return 0;
}
/*
*
* IPv4 Forwarding Information Base: FIB frontend.
*
- * Version: $Id: fib_frontend.c,v 1.14 1999/01/04 20:13:55 davem Exp $
+ * Version: $Id: fib_frontend.c,v 1.15 1999/03/21 05:22:31 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
@@ -189,7+189,7 @@ unsigned inet_addr_type(u32 addr) */
int fib_validate_source(u32 src, u32 dst, u8 tos, int oif,
- struct device *dev, u32 *spec_dst)
+ struct device *dev, u32 *spec_dst, u32 *itag)
{
struct in_device *in_dev = dev->ip_ptr;
struct rt_key key;
@@ -209,6+209,8 @@ int fib_validate_source(u32 src, u32 dst, u8 tos, int oif, if (res.type != RTN_UNICAST)
return -EINVAL;
*spec_dst = FIB_RES_PREFSRC(res);
+ if (itag)
+ fib_combine_itag(itag, &res);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1)
#else
@@ -231,6+233,7 @@ last_resort: if (IN_DEV_RPFILTER(in_dev))
return -EINVAL;
*spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+ *itag = 0;
return 0;
}
@@ -354,7+357,7 @@ int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) for (t=s_t; t<=RT_TABLE_MAX; t++) {
if (t < s_t) continue;
if (t > s_t)
- memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int));
+ memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
if ((tb = fib_get_table(t))==NULL)
continue;
if (tb->tb_dump(tb, skb, cb) < 0)
*
* IPv4 FIB: lookup engine and maintenance routines.
*
- * Version: $Id: fib_hash.c,v 1.6 1998/10/03 09:37:06 davem Exp $
+ * Version: $Id: fib_hash.c,v 1.7 1999/03/21 05:22:32 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
@@ -302,6+302,90 @@ fn_hash_lookup(struct fib_table *tb, const struct rt_key *key, struct fib_result return 1;
}
+static int fn_hash_last_dflt=-1;
+
+static int fib_detect_death(struct fib_info *fi, int order,
+ struct fib_info **last_resort, int *last_idx)
+{
+ struct neighbour *n;
+ int state = NUD_NONE;
+
+ n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
+ if (n) {
+ state = n->nud_state;
+ neigh_release(n);
+ }
+ if (state==NUD_REACHABLE)
+ return 0;
+ if ((state&NUD_VALID) && order != fn_hash_last_dflt)
+ return 0;
+ if ((state&NUD_VALID) ||
+ (*last_idx<0 && order > fn_hash_last_dflt)) {
+ *last_resort = fi;
+ *last_idx = order;
+ }
+ return 1;
+}
+
+static void
+fn_hash_select_default(struct fib_table *tb, const struct rt_key *key, struct fib_result *res)
+{
+ int order, last_idx;
+ struct fib_node *f;
+ struct fib_info *fi = NULL;
+ struct fib_info *last_resort;
+ struct fn_hash *t = (struct fn_hash*)tb->tb_data;
+ struct fn_zone *fz = t->fn_zones[0];
+
+ if (fz == NULL)
+ return;
+
+ last_idx = -1;
+ last_resort = NULL;
+ order = -1;
+
+ for (f = fz->fz_hash[0]; f; f = f->fn_next) {
+ struct fib_info *next_fi = FIB_INFO(f);
+
+ if ((f->fn_state&FN_S_ZOMBIE) ||
+ f->fn_scope != res->scope ||
+ f->fn_type != RTN_UNICAST)
+ continue;
+
+ if (next_fi->fib_priority > res->fi->fib_priority)
+ break;
+ if (!next_fi->fib_nh[0].nh_gw || next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
+ continue;
+ f->fn_state |= FN_S_ACCESSED;
+
+ if (fi == NULL) {
+ if (next_fi != res->fi)
+ break;
+ } else if (!fib_detect_death(fi, order, &last_resort, &last_idx)) {
+ res->fi = fi;
+ fn_hash_last_dflt = order;
+ return;
+ }
+ fi = next_fi;
+ order++;
+ }
+
+ if (order<=0 || fi==NULL) {
+ fn_hash_last_dflt = -1;
+ return;
+ }
+
+ if (!fib_detect_death(fi, order, &last_resort, &last_idx)) {
+ res->fi = fi;
+ fn_hash_last_dflt = order;
+ return;
+ }
+
+ if (last_idx >= 0)
+ res->fi = last_resort;
+ fn_hash_last_dflt = last_idx;
+}
+
#define FIB_SCAN(f, fp) \
for ( ; ((f) = *(fp)) != NULL; (fp) = &(f)->fn_next)
@@ -476,14+560,16 @@ replace: */
new_f->fn_next = f;
- /* ATOMIC_SET */
*fp = new_f;
fz->fz_nent++;
if (del_fp) {
f = *del_fp;
/* Unlink replaced node */
+ net_serialize_enter();
*del_fp = f->fn_next;
+ net_serialize_leave();
+
if (!(f->fn_state&FN_S_ZOMBIE))
rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req);
if (f->fn_state&FN_S_ACCESSED)
@@ -570,7+656,10 @@ FTprint("tb(%d)_delete: %d %08x/%d %d\n", tb->tb_id, r->rtm_type, rta->rta_dst ? rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req);
if (matched != 1) {
+ net_serialize_enter();
*del_fp = f->fn_next;
+ net_serialize_leave();
+
if (f->fn_state&FN_S_ACCESSED)
rt_cache_flush(-1);
fn_free_node(f);
@@ -600,7+689,10 @@ fn_flush_list(struct fib_node ** fp, int z, struct fn_hash *table) struct fib_info *fi = FIB_INFO(f);
if (fi && ((f->fn_state&FN_S_ZOMBIE) || (fi->fib_flags&RTNH_F_DEAD))) {
+ net_serialize_enter();
*fp = f->fn_next;
+ net_serialize_leave();
+
fn_free_node(f);
found++;
continue;
@@ -710,7+802,7 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb, for (h=0; h < fz->fz_divisor; h++) {
if (h < s_h) continue;
if (h > s_h)
- memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(int));
+ memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(cb->args[0]));
if (fz->fz_hash == NULL || fz->fz_hash[h] == NULL)
continue;
if (fn_hash_dump_bucket(skb, cb, tb, fz, fz->fz_hash[h]) < 0) {
@@ -732,7+824,7 @@ static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) {
if (m < s_m) continue;
if (m > s_m)
- memset(&cb->args[2], 0, sizeof(cb->args) - 2*sizeof(int));
+ memset(&cb->args[2], 0, sizeof(cb->args) - 2*sizeof(cb->args[0]));
if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
cb->args[1] = m;
return -1;
@@ -784,6+876,7 @@ __initfunc(struct fib_table * fib_hash_init(int id)) tb->tb_insert = fn_hash_insert;
tb->tb_delete = fn_hash_delete;
tb->tb_flush = fn_hash_flush;
+ tb->tb_select_default = fn_hash_select_default;
#ifdef CONFIG_RTNETLINK
tb->tb_dump = fn_hash_dump;
#endif
*
* IPv4 Forwarding Information Base: policy rules.
*
- * Version: $Id: fib_rules.c,v 1.7 1998/10/03 09:37:09 davem Exp $
+ * Version: $Id: fib_rules.c,v 1.8 1999/03/21 05:22:33 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
*
* Fixes:
* Rani Assaf : local_rule cannot be deleted
+ * Marc Boucher : routing by fwmark
*/
#include <linux/config.h>
@@ -63,6+64,9 @@ struct fib_rule u32 r_srcmap;
u8 r_flags;
u8 r_tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ u32 r_fwmark;
+#endif
int r_ifindex;
#ifdef CONFIG_NET_CLS_ROUTE
__u32 r_tclassid;
@@ -88,13+92,18 @@ int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) rtm->rtm_dst_len == r->r_dst_len &&
(!rta[RTA_DST-1] || memcmp(RTA_DATA(rta[RTA_DST-1]), &r->r_dst, 4) == 0) &&
rtm->rtm_tos == r->r_tos &&
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ (!rta[RTA_PROTOINFO-1] || memcmp(RTA_DATA(rta[RTA_PROTOINFO-1]), &r->r_fwmark, 4) == 0) &&
+#endif
(!rtm->rtm_type || rtm->rtm_type == r->r_action) &&
(!rta[RTA_PRIORITY-1] || memcmp(RTA_DATA(rta[RTA_PRIORITY-1]), &r->r_preference, 4) == 0) &&
(!rta[RTA_IIF-1] || strcmp(RTA_DATA(rta[RTA_IIF-1]), r->r_ifname) == 0) &&
(!rtm->rtm_table || (r && rtm->rtm_table == r->r_table))) {
if (r == &local_rule)
return -EPERM;
+ net_serialize_enter();
*rp = r->r_next;
+ net_serialize_leave();
if (r != &default_rule && r != &main_rule)
kfree(r);
return 0;
@@ -155,6+164,10 @@ int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) new_r->r_srcmask = inet_make_mask(rtm->rtm_src_len);
new_r->r_dstmask = inet_make_mask(rtm->rtm_dst_len);
new_r->r_tos = rtm->rtm_tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ if (rta[RTA_PROTOINFO-1])
+ memcpy(&new_r->r_fwmark, RTA_DATA(rta[RTA_PROTOINFO-1]), 4);
+#endif
new_r->r_action = rtm->rtm_type;
new_r->r_flags = rtm->rtm_flags;
if (rta[RTA_PRIORITY-1])
@@ -267,14+280,15 @@ FRprintk("Lookup: %08x <- %08x ", key->dst, key->src); #ifdef CONFIG_IP_ROUTE_TOS
(r->r_tos && r->r_tos != key->tos) ||
#endif
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ (r->r_fwmark && r->r_fwmark != key->fwmark) ||
+#endif
(r->r_ifindex && r->r_ifindex != key->iif))
continue;
FRprintk("tb %d r %d ", r->r_table, r->r_action);
switch (r->r_action) {
case RTN_UNICAST:
- policy = NULL;
- break;
case RTN_NAT:
policy = r;
break;
@@ -295,14+309,23 @@ FRprintk("ok\n"); res->r = policy;
return 0;
}
- if (err < 0)
+ if (err < 0 && err != -EAGAIN)
return err;
-FRprintk("RCONT ");
}
FRprintk("FAILURE\n");
return -ENETUNREACH;
}
+void fib_select_default(const struct rt_key *key, struct fib_result *res)
+{
+ if (res->r && res->r->r_action == RTN_UNICAST &&
+ FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) {
+ struct fib_table *tb;
+ if ((tb = fib_get_table(res->r->r_table)) != NULL)
+ tb->tb_select_default(tb, key, res);
+ }
+}
+
static int fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct device *dev = ptr;
@@ -337,6+360,10 @@ extern __inline__ int inet_fill_rule(struct sk_buff *skb, rtm->rtm_dst_len = r->r_dst_len;
rtm->rtm_src_len = r->r_src_len;
rtm->rtm_tos = r->r_tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ if (r->r_fwmark)
+ RTA_PUT(skb, RTA_PROTOINFO, 4, &r->r_fwmark);
+#endif
rtm->rtm_table = r->r_table;
rtm->rtm_protocol = 0;
rtm->rtm_scope = 0;
*
* IPv4 Forwarding Information Base: semantics.
*
- * Version: $Id: fib_semantics.c,v 1.12 1999/01/26 05:33:44 davem Exp $
+ * Version: $Id: fib_semantics.c,v 1.13 1999/03/21 05:22:34 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
@@ -89,7+89,7 @@ static struct { -EINVAL, RT_SCOPE_UNIVERSE}, /* RTN_BLACKHOLE */
{ -EHOSTUNREACH, RT_SCOPE_UNIVERSE},/* RTN_UNREACHABLE */
{ -EACCES, RT_SCOPE_UNIVERSE}, /* RTN_PROHIBIT */
- { 1, RT_SCOPE_UNIVERSE}, /* RTN_THROW */
+ { -EAGAIN, RT_SCOPE_UNIVERSE}, /* RTN_THROW */
#ifdef CONFIG_IP_ROUTE_NAT
{ 0, RT_SCOPE_HOST}, /* RTN_NAT */
#else
@@ -420,7+420,7 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta, unsigned flavor = attr->rta_type;
if (flavor) {
if (flavor > FIB_MAX_METRICS)
- goto failure;
+ goto err_inval;
fi->fib_metrics[flavor-1] = *(unsigned*)RTA_DATA(attr);
}
attr = RTA_NEXT(attr, attrlen);
*
* Alan Cox, <alan@cymru.net>
*
- * Version: $Id: icmp.c,v 1.50 1999/03/17 01:53:55 davem Exp $
+ * Version: $Id: icmp.c,v 1.52 1999/03/21 12:04:11 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -373,6+373,12 @@ struct socket *icmp_socket=&icmp_inode.u.socket_i; * works for icmp destinations. This means the rate limiting information
* for one "ip object" is shared.
*
+ * Note that the same dst_entry fields are modified by functions in
+ * route.c too, but these work for packet destinations while xrlim_allow
+ * works for icmp destinations. This means the rate limiting information
+ * for one "ip object" is shared - and these ICMPs are twice limited:
+ * by source and by destination.
+ *
* RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
* SHOULD allow setting of rate limits
*
@@ -385,10+391,10 @@ int xrlim_allow(struct dst_entry *dst, int timeout)
now = jiffies;
dst->rate_tokens += now - dst->rate_last;
+ dst->rate_last = now;
if (dst->rate_tokens > XRLIM_BURST_FACTOR*timeout)
dst->rate_tokens = XRLIM_BURST_FACTOR*timeout;
if (dst->rate_tokens >= timeout) {
- dst->rate_last = now;
dst->rate_tokens -= timeout;
return 1;
}
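
xrlim_allow() is a token-bucket limiter: elapsed jiffies become tokens, the bucket is capped at XRLIM_BURST_FACTOR*timeout, and each ICMP sent spends one "timeout" worth of tokens. The hunk moves the rate_last update out of the success branch so that elapsed time is credited exactly once; before, a source that kept being refused re-credited the same interval on every call. A stand-alone sketch of the corrected logic, with invented names and no kernel types:

    #define BURST_FACTOR 6

    struct limiter {
            unsigned long tokens;
            unsigned long last;     /* tick count at the previous call */
    };

    /* Allow roughly one event per 'timeout' ticks, with bursts of up to
     * BURST_FACTOR events.  'now' is a monotonically increasing tick
     * counter (jiffies in the kernel). */
    static int rate_allow(struct limiter *l, unsigned long now,
                          unsigned long timeout)
    {
            l->tokens += now - l->last;
            l->last = now;                  /* credit elapsed time once */
            if (l->tokens > BURST_FACTOR * timeout)
                    l->tokens = BURST_FACTOR * timeout;
            if (l->tokens >= timeout) {
                    l->tokens -= timeout;
                    return 1;
            }
            return 0;
    }

Updating "last" unconditionally is the whole point of the fix: a caller that is refused still consumes the elapsed interval, so the bucket refills at real time and no faster.
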
@@ -406,6+412,10 @@ static inline int icmpv4_xrlim_allow(struct rtable *rt, int type, int code) if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
return 1;
+ /* Redirect has its own rate limit mechanism */
+ if (type == ICMP_REDIRECT)
+ return 1;
+
/* No rate limit on loopback */
if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
return 1;
@@ -526,8+536,13 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info) /*
* Now check at the protocol level
*/
- if (!rt)
+ if (!rt) {
+#ifndef CONFIG_IP_ALWAYS_DEFRAG
+ if (net_ratelimit())
+ printk(KERN_DEBUG "icmp_send: destinationless packet\n");
+#endif
return;
+ }
if (rt->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST))
return;
* the older version didn't come out right using gcc 2.5.8, the newer one
* seems to fall out with gcc 2.6.2.
*
- * Version: $Id: igmp.c,v 1.28 1998/11/30 15:53:13 davem Exp $
+ * Version: $Id: igmp.c,v 1.29 1999/03/21 05:22:36 davem Exp $
*
* Authors:
* Alan Cox <Alan.Cox@linux.org>
#include <linux/mroute.h>
#endif
+#define IP_MAX_MEMBERSHIPS 20
#ifdef CONFIG_IP_MULTICAST
@@ -462,7+463,9 @@ int ip_mc_dec_group(struct in_device *in_dev, u32 addr) for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) {
if (i->multiaddr==addr) {
if (--i->users == 0) {
+ net_serialize_enter();
*ip = i->next;
+ net_serialize_leave();
igmp_group_dropped(i);
if (in_dev->dev->flags & IFF_UP)
ip_rt_multicast_event(in_dev);
@@ -610,7+613,9 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) struct in_device *in_dev;
if (--iml->count)
return 0;
+ net_serialize_enter();
*imlp = iml->next;
+ net_serialize_leave();
in_dev = inetdev_by_index(iml->multi.imr_ifindex);
if (in_dev)
ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr);
len-=(offset-begin);
if(len>length)
len=length;
+ if(len<0)
+ len=0;
return len;
}
#endif
*
* The IP forwarding functionality.
*
- * Version: $Id: ip_forward.c,v 1.42 1998/10/03 09:37:19 davem Exp $
+ * Version: $Id: ip_forward.c,v 1.43 1999/03/21 05:22:37 davem Exp $
*
* Authors: see ip.c
*
@@ -260,7+260,7 @@ skip_call_fw_firewall: if (rt->rt_flags&RTCF_FAST && !netdev_fastroute_obstacles) {
unsigned h = ((*(u8*)&rt->key.dst)^(*(u8*)&rt->key.src))&NETDEV_FASTROUTE_HMASK;
/* Time to switch to functional programming :-) */
- dst_release(xchg(&skb->dev->fastpath[h], dst_clone(&rt->u.dst)));
+ dst_release_irqwait(xchg(&skb->dev->fastpath[h], dst_clone(&rt->u.dst)));
}
#endif
ip_send(skb);
*
* The IP fragmentation functionality.
*
- * Version: $Id: ip_fragment.c,v 1.39 1998/08/26 10:35:26 davem Exp $
+ * Version: $Id: ip_fragment.c,v 1.40 1999/03/20 23:58:34 davem Exp $
*
* Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
* Alan Cox <Alan.Cox@linux.org>
@@ -189,6+189,48 @@ static struct ip_tunnel * ipgre_tunnel_lookup(u32 remote, u32 local, u32 key) return NULL;
}
+static struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t)
+{
+ u32 remote = t->parms.iph.daddr;
+ u32 local = t->parms.iph.saddr;
+ u32 key = t->parms.i_key;
+ unsigned h = HASH(key);
+ int prio = 0;
+
+ if (local)
+ prio |= 1;
+ if (remote && !MULTICAST(remote)) {
+ prio |= 2;
+ h ^= HASH(remote);
+ }
+
+ return &tunnels[prio][h];
+}
+
+static void ipgre_tunnel_link(struct ip_tunnel *t)
+{
+ struct ip_tunnel **tp = ipgre_bucket(t);
+
+ net_serialize_enter();
+ t->next = *tp;
+ *tp = t;
+ net_serialize_leave();
+}
+
+static void ipgre_tunnel_unlink(struct ip_tunnel *t)
+{
+ struct ip_tunnel **tp;
+
+ for (tp = ipgre_bucket(t); *tp; tp = &(*tp)->next) {
+ if (t == *tp) {
+ net_serialize_enter();
+ *tp = t->next;
+ net_serialize_leave();
+ break;
+ }
+ }
+}
+
static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int create)
{
u32 remote = parms->iph.daddr;
@@ -241,10+283,7 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int if (register_netdevice(dev) < 0)
goto failed;
- start_bh_atomic();
- nt->next = t;
- *tp = nt;
- end_bh_atomic();
+ ipgre_tunnel_link(nt);
/* Do not decrement MOD_USE_COUNT here. */
return nt;
@@ -256,28+295,11 @@ failed:
static void ipgre_tunnel_destroy(struct device *dev)
{
- struct ip_tunnel *t, **tp;
- struct ip_tunnel *t0 = (struct ip_tunnel*)dev->priv;
- u32 remote = t0->parms.iph.daddr;
- u32 local = t0->parms.iph.saddr;
- unsigned h = HASH(t0->parms.i_key);
- int prio = 0;
+ ipgre_tunnel_unlink((struct ip_tunnel*)dev->priv);
- if (local)
- prio |= 1;
- if (remote && !MULTICAST(remote)) {
- prio |= 2;
- h ^= HASH(remote);
- }
- for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
- if (t == t0) {
- *tp = t->next;
- if (dev != &ipgre_fb_tunnel_dev) {
- kfree(dev);
- MOD_DEC_USE_COUNT;
- }
- break;
- }
+ if (dev != &ipgre_fb_tunnel_dev) {
+ kfree(dev);
+ MOD_DEC_USE_COUNT;
}
}
@@ -849,6+871,41 @@ ipgre_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd)
t = ipgre_tunnel_locate(&p, cmd == SIOCADDTUNNEL);
+ if (dev != &ipgre_fb_tunnel_dev && cmd == SIOCCHGTUNNEL &&
+ t != &ipgre_fb_tunnel) {
+ if (t != NULL) {
+ if (t->dev != dev) {
+ err = -EEXIST;
+ break;
+ }
+ } else {
+ unsigned nflags=0;
+
+ t = (struct ip_tunnel*)dev->priv;
+
+ if (MULTICAST(p.iph.daddr))
+ nflags = IFF_BROADCAST;
+ else if (p.iph.daddr)
+ nflags = IFF_POINTOPOINT;
+
+ if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
+ err = -EINVAL;
+ break;
+ }
+ start_bh_atomic();
+ ipgre_tunnel_unlink(t);
+ t->parms.iph.saddr = p.iph.saddr;
+ t->parms.iph.daddr = p.iph.daddr;
+ t->parms.i_key = p.i_key;
+ t->parms.o_key = p.o_key;
+ memcpy(dev->dev_addr, &p.iph.saddr, 4);
+ memcpy(dev->broadcast, &p.iph.daddr, 4);
+ ipgre_tunnel_link(t);
+ end_bh_atomic();
+ netdev_state_change(dev);
+ }
+ }
+
if (t) {
err = 0;
if (cmd == SIOCCHGTUNNEL) {
*
* The Internet Protocol (IP) module.
*
- * Version: $Id: ip_input.c,v 1.35 1999/01/12 14:32:48 davem Exp $
+ * Version: $Id: ip_input.c,v 1.36 1999/03/21 05:22:38 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -387,6+387,10 @@ int ip_local_deliver(struct sk_buff *skb) int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
{
struct iphdr *iph = skb->nh.iph;
+#ifdef CONFIG_FIREWALL
+ int fwres;
+ u16 rport;
+#endif /* CONFIG_FIREWALL */
/*
* When the interface is in promisc. mode, drop all the crap
@@ -427,6+431,30 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) __skb_trim(skb, len);
}
+#ifdef CONFIG_IP_ALWAYS_DEFRAG
+ /* Won't send ICMP reply, since skb->dst == NULL. --RR */
+ if (iph->frag_off & htons(IP_MF|IP_OFFSET)) {
+ skb = ip_defrag(skb);
+ if (!skb)
+ return 0;
+ iph = skb->nh.iph;
+ ip_send_check(iph);
+ }
+#endif
+
+#ifdef CONFIG_FIREWALL
+ /*
+ * See if the firewall wants to dispose of the packet.
+ *
+ * We can't do ICMP reply or local delivery before routing,
+ * so we delay those decisions until after route. --RR
+ */
+ fwres = call_in_firewall(PF_INET, dev, iph, &rport, &skb);
+ if (fwres < FW_ACCEPT && fwres != FW_REJECT)
+ goto drop;
+ iph = skb->nh.iph;
+#endif /* CONFIG_FIREWALL */
+
/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
@@ -442,13+470,13 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) #endif
}
-#ifdef CONFIG_IP_ALWAYS_DEFRAG
- if (iph->frag_off & htons(IP_MF|IP_OFFSET)) {
- skb = ip_defrag(skb);
- if (!skb)
- return 0;
- iph = skb->nh.iph;
- ip_send_check(iph);
+#ifdef CONFIG_NET_CLS_ROUTE
+ if (skb->dst->tclassid) {
+ u32 idx = skb->dst->tclassid;
+ ip_rt_acct[idx&0xFF].o_packets++;
+ ip_rt_acct[idx&0xFF].o_bytes+=skb->len;
+ ip_rt_acct[(idx>>16)&0xFF].i_packets++;
+ ip_rt_acct[(idx>>16)&0xFF].i_bytes+=skb->len;
}
#endif
@@ -462,7+490,7 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) and running sniffer is extremely rare condition.
--ANK (980813)
*/
-
+
skb = skb_cow(skb, skb_headroom(skb));
if (skb == NULL)
return 0;
@@ -486,51+514,17 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) }
}
- /*
- * See if the firewall wants to dispose of the packet.
- *
- * Note: the current standard firewall code expects that the
- * destination address was already checked against the interface
- * address lists.
- *
- * If this code is ever moved in front of ip_route_input() you need
- * to fix the fw code [moving it might be a good idea anyways,
- * so that we can firewall against potentially bugs in the options
- * or routing code]
- */
-
-#ifdef CONFIG_FIREWALL
- {
- int fwres;
- u16 rport;
-#ifdef CONFIG_IP_ROUTE_TOS
- u8 tos = iph->tos;
-#endif
-
- if ((fwres=call_in_firewall(PF_INET, skb->dev, iph, &rport, &skb))<FW_ACCEPT) {
- if (fwres==FW_REJECT)
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
- goto drop;
- }
-
+#ifdef CONFIG_FIREWALL
#ifdef CONFIG_IP_TRANSPARENT_PROXY
- if (fwres==FW_REDIRECT && (IPCB(skb)->redirport = rport) != 0)
- return ip_local_deliver(skb);
-#endif
-#ifdef CONFIG_IP_ROUTE_TOS
- /* It is for 2.2 only. Firewalling should make smart
- rerouting itself, ideally, but now it is too late
- to teach it. --ANK (980905)
- */
- if (iph->tos != tos && ((struct rtable*)skb->dst)->rt_type == RTN_UNICAST) {
- dst_release(skb->dst);
- skb->dst = NULL;
- if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))
- goto drop;
- }
-#endif
+ if (fwres == FW_REDIRECT && (IPCB(skb)->redirport = rport) != 0)
+ return ip_local_deliver(skb);
+#endif /* CONFIG_IP_TRANSPARENT_PROXY */
+
+ if (fwres == FW_REJECT) {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+ goto drop;
}
-#endif
+#endif /* CONFIG_FIREWALL */
return skb->dst->input(skb);
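
Taken together, the ip_input.c hunks reorder receive processing: defragmentation (under CONFIG_IP_ALWAYS_DEFRAG) and the input firewall now run before ip_route_input(), while the REDIRECT and REJECT verdicts are acted on afterwards, because local delivery and the ICMP error both need the route. The outline below is only a sketch of that control flow; the types, helper names and verdict values are stand-ins, not the kernel's.

    enum verdict { V_BLOCK = 0, V_REJECT = 1, V_ACCEPT = 2, V_REDIRECT = 3 };

    struct pkt;                             /* opaque packet */

    /* Stand-ins for the real helpers. */
    extern int is_fragment(const struct pkt *p);
    extern struct pkt *defrag(struct pkt *p);
    extern enum verdict input_firewall(struct pkt *p);
    extern int route_input(struct pkt *p);
    extern int local_deliver(struct pkt *p);
    extern void send_icmp_unreach(struct pkt *p);
    extern int dst_input(struct pkt *p);
    extern void drop_pkt(struct pkt *p);

    int rcv_outline(struct pkt *p)
    {
            enum verdict v;

            if (is_fragment(p)) {           /* "always defrag" case */
                    p = defrag(p);
                    if (!p)
                            return 0;       /* waiting for more fragments */
            }

            v = input_firewall(p);          /* before routing */
            if (v < V_ACCEPT && v != V_REJECT) {
                    drop_pkt(p);            /* BLOCK: silent drop */
                    return 0;
            }

            if (route_input(p) < 0) {
                    drop_pkt(p);
                    return 0;
            }

            if (v == V_REDIRECT)            /* transparent proxy */
                    return local_deliver(p);
            if (v == V_REJECT) {
                    send_icmp_unreach(p);   /* possible only now: dst is set */
                    drop_pkt(p);
                    return 0;
            }
            return dst_input(p);
    }
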
*
* Dumb Network Address Translation.
*
- * Version: $Id: ip_nat_dumb.c,v 1.7 1998/10/06 04:49:09 davem Exp $
+ * Version: $Id: ip_nat_dumb.c,v 1.8 1999/03/21 05:22:40 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
@@ -89,6+89,8 @@ ip_do_nat(struct sk_buff *skb) {
struct icmphdr *icmph = (struct icmphdr*)((char*)iph + (iph->ihl<<2));
struct iphdr *ciph;
+ u32 idaddr, isaddr;
+ int updated;
if ((icmph->type != ICMP_DEST_UNREACH) &&
(icmph->type != ICMP_TIME_EXCEEDED) &&
@@ -100,8+102,14 @@ ip_do_nat(struct sk_buff *skb) if ((u8*)(ciph+1) > skb->tail)
goto truncated;
- if (rt->rt_flags&RTCF_DNAT && ciph->saddr == odaddr)
+ isaddr = ciph->saddr;
+ idaddr = ciph->daddr;
+ updated = 0;
+
+ if (rt->rt_flags&RTCF_DNAT && ciph->saddr == odaddr) {
ciph->saddr = iph->daddr;
+ updated = 1;
+ }
if (rt->rt_flags&RTCF_SNAT) {
if (ciph->daddr != osaddr) {
struct fib_result res;
@@ -115,16+123,27 @@ ip_do_nat(struct sk_buff *skb) #ifdef CONFIG_IP_ROUTE_TOS
key.tos = RT_TOS(ciph->tos);
#endif
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ key.fwmark = 0;
+#endif
/* Use fib_lookup() until we get our own
* hash table of NATed hosts -- Rani
*/
- if (fib_lookup(&key, &res) != 0)
- return 0;
- if (res.r)
+ if (fib_lookup(&key, &res) == 0 && res.r) {
ciph->daddr = fib_rules_policy(ciph->daddr, &res, &flags);
- }
- else
+ if (ciph->daddr != idaddr)
+ updated = 1;
+ }
+ } else {
ciph->daddr = iph->saddr;
+ updated = 1;
+ }
+ }
+ if (updated) {
+ cksum = &icmph->checksum;
+ /* Using tcpudp primitive. Why not? */
+ check = csum_tcpudp_magic(ciph->saddr, ciph->daddr, 0, 0, ~(*cksum));
+ *cksum = csum_tcpudp_magic(~isaddr, ~idaddr, 0, 0, ~check);
}
break;
}
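
The new code above repairs the checksum of the embedded ICMP header after the inner addresses have been rewritten, reusing csum_tcpudp_magic() to fold the old addresses out and the new ones in. For readers who want the arithmetic spelled out, here is a generic incremental update in the style of RFC 1624; it is an illustration only (the function name and types are not from the kernel), not a replacement for the helper used in the patch.

    #include <stdint.h>

    /* Update a 16-bit Internet checksum after one 32-bit word covered by
     * it changes from 'from' to 'to':  HC' = ~(~HC + ~m + m')  (RFC 1624),
     * folding each 32-bit value in as two 16-bit words. */
    static uint16_t csum_adjust32(uint16_t csum, uint32_t from, uint32_t to)
    {
            uint32_t sum = (uint16_t)~csum;

            sum += (uint16_t)(~from >> 16);
            sum += (uint16_t)~from;
            sum += (uint16_t)(to >> 16);
            sum += (uint16_t)to;

            while (sum >> 16)               /* fold carries back in */
                    sum = (sum & 0xffff) + (sum >> 16);
            return (uint16_t)~sum;
    }

Rewriting both the inner source and destination address, as the hunk does, simply means applying such an update once per changed word.
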
*
* The options processing module for ip.c
*
- * Version: $Id: ip_options.c,v 1.15 1998/10/03 09:37:27 davem Exp $
+ * Version: $Id: ip_options.c,v 1.16 1999/03/21 05:22:40 davem Exp $
*
* Authors: A.N.Kuznetsov
*
@@ -137,17+137,17 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) if (sopt->ts_needtime) {
if (soffset + 3 > optlen)
return -EINVAL;
- dopt->ts_needtime = 1;
- soffset += 4;
- if ((dptr[3]&0xF) == IPOPT_TS_PRESPEC) {
- __u32 addr;
- if (soffset + 3 > optlen)
- return -EINVAL;
+ if ((dptr[3]&0xF) != IPOPT_TS_PRESPEC) {
+ dopt->ts_needtime = 1;
soffset += 4;
+ } else {
+ dopt->ts_needtime = 0;
+
if (soffset + 8 <= optlen) {
- dopt->ts_needtime = 0;
+ __u32 addr;
+
memcpy(&addr, sptr+soffset-1, 4);
- if (inet_addr_type(addr) != RTN_UNICAST) {
+ if (inet_addr_type(addr) != RTN_LOCAL) {
dopt->ts_needtime = 1;
soffset += 8;
}
@@ -471,19+471,21 @@ void ip_options_undo(struct ip_options * opt) }
if (opt->rr_needaddr) {
unsigned char * optptr = opt->__data+opt->rr-sizeof(struct iphdr);
- memset(&optptr[optptr[2]-1], 0, 4);
optptr[2] -= 4;
+ memset(&optptr[optptr[2]-1], 0, 4);
}
if (opt->ts) {
unsigned char * optptr = opt->__data+opt->ts-sizeof(struct iphdr);
if (opt->ts_needtime) {
- memset(&optptr[optptr[2]-1], 0, 4);
optptr[2] -= 4;
- }
- if (opt->ts_needaddr)
memset(&optptr[optptr[2]-1], 0, 4);
- if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC)
+ if ((optptr[3]&0xF) == IPOPT_TS_PRESPEC)
+ optptr[2] -= 4;
+ }
+ if (opt->ts_needaddr) {
optptr[2] -= 4;
+ memset(&optptr[optptr[2]-1], 0, 4);
+ }
}
}
*
* The Internet Protocol (IP) output module.
*
- * Version: $Id: ip_output.c,v 1.65 1999/01/21 13:37:34 davem Exp $
+ * Version: $Id: ip_output.c,v 1.66 1999/03/21 05:22:41 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* for decreased register pressure on x86
* and more readibility.
* Marc Boucher : When call_out_firewall returns FW_QUEUE,
- * silently abort send instead of failing
- * with -EPERM.
+ * silently drop skb instead of failing with -EPERM.
*/
#include <asm/uaccess.h>
@@ -132,8+131,16 @@ void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, dev = rt->u.dst.dev;
#ifdef CONFIG_FIREWALL
- if (call_out_firewall(PF_INET, dev, iph, NULL, &skb) < FW_ACCEPT)
- goto drop;
+ /* Now we have no better mechanism to notify about error. */
+ switch (call_out_firewall(PF_INET, dev, iph, NULL, &skb)) {
+ case FW_REJECT:
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+ /* Fall thru... */
+ case FW_BLOCK:
+ case FW_QUEUE:
+ kfree_skb(skb);
+ return;
+ }
#endif
ip_send_check(iph);
@@ -141,11+148,6 @@ void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, /* Send it out. */
skb->dst->output(skb);
return;
-
-#ifdef CONFIG_FIREWALL
-drop:
- kfree_skb(skb);
-#endif
}
int __ip_finish_output(struct sk_buff *skb)
@@ -292,8+294,17 @@ void ip_queue_xmit(struct sk_buff *skb) dev = rt->u.dst.dev;
#ifdef CONFIG_FIREWALL
- if (call_out_firewall(PF_INET, dev, iph, NULL, &skb) < FW_ACCEPT)
- goto drop;
+ /* Now we have no better mechanism to notify about error. */
+ switch (call_out_firewall(PF_INET, dev, iph, NULL, &skb)) {
+ case FW_REJECT:
+ start_bh_atomic();
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+ end_bh_atomic();
+ /* Fall thru... */
+ case FW_BLOCK:
+ case FW_QUEUE:
+ goto drop;
+ }
#endif
/* This can happen when the transport layer has segments queued
@@ -340,8+351,12 @@ fragment: */
iph->frag_off |= __constant_htons(IP_DF);
printk(KERN_DEBUG "sending pkt_too_big to self\n");
+
+	/* icmp_send is not reentrant, hence the bh_atomic... --ANK */
+ start_bh_atomic();
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(rt->u.dst.pmtu));
+ end_bh_atomic();
goto drop;
}
ip_fragment(skb, skb->dst->output);
@@ -402,14+417,13 @@ int ip_build_xmit_slow(struct sock *sk, if (ip_dont_fragment(sk, &rt->u.dst))
df = htons(IP_DF);
- if (!sk->ip_hdrincl)
- length -= sizeof(struct iphdr);
+ length -= sizeof(struct iphdr);
if (opt) {
fragheaderlen = sizeof(struct iphdr) + opt->optlen;
maxfraglen = ((mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen;
} else {
- fragheaderlen = sk->ip_hdrincl ? 0 : sizeof(struct iphdr);
+ fragheaderlen = sizeof(struct iphdr);
/*
* Fragheaderlen is the size of 'overhead' on each buffer. Now work
@@ -474,7+488,6 @@ int ip_build_xmit_slow(struct sock *sk, */
do {
- int error;
char *data;
struct sk_buff * skb;
@@ -482,15+495,10 @@ int ip_build_xmit_slow(struct sock *sk, * Get the memory we require with some space left for alignment.
*/
- skb = sock_alloc_send_skb(sk, fraglen+hh_len+15, 0, flags&MSG_DONTWAIT, &error);
- if (skb == NULL) {
- ip_statistics.IpOutDiscards++;
- if(nfrags>1)
- ip_statistics.IpFragCreates++;
- dev_unlock_list();
- return(error);
- }
-
+ skb = sock_alloc_send_skb(sk, fraglen+hh_len+15, 0, flags&MSG_DONTWAIT, &err);
+ if (skb == NULL)
+ goto error;
+
/*
* Fill in the control structures
*/
@@ -510,7+518,7 @@ int ip_build_xmit_slow(struct sock *sk, * Only write IP header onto non-raw packets
*/
- if(!sk->ip_hdrincl) {
+ {
struct iphdr *iph = (struct iphdr *)data;
iph->version = 4;
@@ -547,53+555,46 @@ int ip_build_xmit_slow(struct sock *sk, * User data callback
*/
- err = 0;
- if (getfrag(frag, data, offset, fraglen-fragheaderlen))
+ if (getfrag(frag, data, offset, fraglen-fragheaderlen)) {
err = -EFAULT;
-
- /*
- * Account for the fragment.
- */
-
-#ifdef CONFIG_FIREWALL
- if(!err) {
- int fw_res;
-
- fw_res = call_out_firewall(PF_INET, rt->u.dst.dev, skb->nh.iph, NULL, &skb);
- if(fw_res == FW_QUEUE) {
- kfree_skb(skb);
- skb = NULL;
- } else if(fw_res < FW_ACCEPT) {
- err = -EPERM;
- }
- }
-#endif
-
- if (err) {
- ip_statistics.IpOutDiscards++;
kfree_skb(skb);
- dev_unlock_list();
- return err;
+ goto error;
}
-
offset -= (maxfraglen-fragheaderlen);
fraglen = maxfraglen;
nfrags++;
- err = 0;
- if (skb && rt->u.dst.output(skb)) {
- err = -ENETDOWN;
- ip_statistics.IpOutDiscards++;
- break;
+#ifdef CONFIG_FIREWALL
+ switch (call_out_firewall(PF_INET, rt->u.dst.dev, skb->nh.iph, NULL, &skb)) {
+ case FW_QUEUE:
+ kfree_skb(skb);
+ continue;
+ case FW_BLOCK:
+ case FW_REJECT:
+ kfree_skb(skb);
+ err = -EPERM;
+ goto error;
}
+#endif
+
+ err = -ENETDOWN;
+ if (rt->u.dst.output(skb))
+ goto error;
} while (offset >= 0);
if (nfrags>1)
ip_statistics.IpFragCreates += nfrags;
dev_unlock_list();
- return err;
+ return 0;
+
+error:
+ ip_statistics.IpOutDiscards++;
+ if (nfrags>1)
+ ip_statistics.IpFragCreates += nfrags;
+ dev_unlock_list();
+ return err;
}
@@ -621,14+622,20 @@ int ip_build_xmit(struct sock *sk, * choice RAW frames within 20 bytes of maximum size(rare) to the long path
*/
- if (!sk->ip_hdrincl)
+ if (!sk->ip_hdrincl) {
length += sizeof(struct iphdr);
- /*
- * Check for slow path.
- */
- if (length > rt->u.dst.pmtu || ipc->opt != NULL)
- return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags);
+ /*
+ * Check for slow path.
+ */
+ if (length > rt->u.dst.pmtu || ipc->opt != NULL)
+ return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags);
+ } else {
+ if (length > rt->u.dst.dev->mtu) {
+ ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, rt->u.dst.dev->mtu);
+ return -EMSGSIZE;
+ }
+ }
/*
* Do path mtu discovery if needed.
@@ -636,7+643,7 @@ int ip_build_xmit(struct sock *sk, df = 0;
if (ip_dont_fragment(sk, &rt->u.dst))
df = htons(IP_DF);
-
+
/*
* Fast path for unfragmented frames without options.
*/
@@ -679,31+686,27 @@ int ip_build_xmit(struct sock *sk,
dev_unlock_list();
- if (err)
- err = -EFAULT;
+ if (err)
+ goto error_fault;
#ifdef CONFIG_FIREWALL
- if(!err) {
- int fw_res;
-
- fw_res = call_out_firewall(PF_INET, rt->u.dst.dev, iph, NULL, &skb);
- if(fw_res == FW_QUEUE) {
- /* re-queued elsewhere; silently abort this send */
- kfree_skb(skb);
- return 0;
- }
- if(fw_res < FW_ACCEPT)
- err = -EPERM;
- }
-#endif
-
- if (err) {
+ switch (call_out_firewall(PF_INET, rt->u.dst.dev, iph, NULL, &skb)) {
+ case FW_QUEUE:
kfree_skb(skb);
+ return 0;
+ case FW_BLOCK:
+ case FW_REJECT:
+ kfree_skb(skb);
+ err = -EPERM;
goto error;
}
-
+#endif
+
return rt->u.dst.output(skb);
+error_fault:
+ err = -EFAULT;
+ kfree_skb(skb);
error:
ip_statistics.IpOutDiscards++;
return err;
*
* The IP to API glue.
*
- * Version: $Id: ip_sockglue.c,v 1.39 1998/10/03 09:37:33 davem Exp $
+ * Version: $Id: ip_sockglue.c,v 1.40 1999/03/21 05:22:42 davem Exp $
*
* Authors: see ip.c
*
@@ -209,7+209,9 @@ int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct s kfree(new_ra);
return -EADDRINUSE;
}
+ net_serialize_enter();
*rap = ra->next;
+ net_serialize_leave();
if (ra->destructor)
ra->destructor(sk);
kfree(ra);
@@ -220,10+222,10 @@ int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct s return -ENOBUFS;
new_ra->sk = sk;
new_ra->destructor = destructor;
- start_bh_atomic();
new_ra->next = ra;
+ net_serialize_enter();
*rap = new_ra;
- end_bh_atomic();
+ net_serialize_leave();
return 0;
}
@@ -404,7+406,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt err = ip_options_get(&opt, optval, optlen, 1);
if (err)
return err;
- start_bh_atomic();
+ lock_sock(sk);
if (sk->type == SOCK_STREAM) {
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
@@ -420,7+422,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt #endif
}
opt = xchg(&sk->opt, opt);
- end_bh_atomic();
+ release_sock(sk);
if (opt)
kfree_s(opt, sizeof(struct ip_options) + opt->optlen);
return 0;
@@ -463,11+465,12 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt !capable(CAP_NET_ADMIN))
return -EPERM;
if (sk->ip_tos != val) {
+ lock_sock(sk);
sk->ip_tos=val;
sk->priority = rt_tos2priority(val);
dst_release(xchg(&sk->dst_cache, NULL));
+ release_sock(sk);
}
- sk->priority = rt_tos2priority(val);
return 0;
case IP_TTL:
if (optlen<1)
@@ -637,11+640,11 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *op {
unsigned char optbuf[sizeof(struct ip_options)+40];
struct ip_options * opt = (struct ip_options*)optbuf;
- start_bh_atomic();
+ lock_sock(sk);
opt->optlen = 0;
if (sk->opt)
memcpy(optbuf, sk->opt, sizeof(struct ip_options)+sk->opt->optlen);
- end_bh_atomic();
+ release_sock(sk);
if (opt->optlen == 0)
return put_user(0, optlen);
/*
* Linux NET3: IP/IP protocol decoder.
*
- * Version: $Id: ipip.c,v 1.24 1998/10/03 09:37:35 davem Exp $
+ * Version: $Id: ipip.c,v 1.25 1999/03/21 05:22:43 davem Exp $
*
* Authors:
* Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95
@@ -157,6+157,49 @@ static struct ip_tunnel * ipip_tunnel_lookup(u32 remote, u32 local) return NULL;
}
+static struct ip_tunnel **ipip_bucket(struct ip_tunnel *t)
+{
+ u32 remote = t->parms.iph.daddr;
+ u32 local = t->parms.iph.saddr;
+ unsigned h = 0;
+ int prio = 0;
+
+ if (remote) {
+ prio |= 2;
+ h ^= HASH(remote);
+ }
+ if (local) {
+ prio |= 1;
+ h ^= HASH(local);
+ }
+ return &tunnels[prio][h];
+}
+
+
+static void ipip_tunnel_unlink(struct ip_tunnel *t)
+{
+ struct ip_tunnel **tp;
+
+ for (tp = ipip_bucket(t); *tp; tp = &(*tp)->next) {
+ if (t == *tp) {
+ net_serialize_enter();
+ *tp = t->next;
+ net_serialize_leave();
+ break;
+ }
+ }
+}
+
+static void ipip_tunnel_link(struct ip_tunnel *t)
+{
+ struct ip_tunnel **tp = ipip_bucket(t);
+
+ net_serialize_enter();
+ t->next = *tp;
+ *tp = t;
+ net_serialize_leave();
+}
+
struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create)
{
u32 remote = parms->iph.daddr;
@@ -208,10+251,7 @@ struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create) if (register_netdevice(dev) < 0)
goto failed;
- start_bh_atomic();
- nt->next = t;
- *tp = nt;
- end_bh_atomic();
+ ipip_tunnel_link(nt);
/* Do not decrement MOD_USE_COUNT here. */
return nt;
@@ -221,39+261,20 @@ failed: return NULL;
}
+
static void ipip_tunnel_destroy(struct device *dev)
{
- struct ip_tunnel *t, **tp;
- struct ip_tunnel *t0 = (struct ip_tunnel*)dev->priv;
- u32 remote = t0->parms.iph.daddr;
- u32 local = t0->parms.iph.saddr;
- unsigned h = 0;
- int prio = 0;
-
if (dev == &ipip_fb_tunnel_dev) {
+ net_serialize_enter();
tunnels_wc[0] = NULL;
- return;
- }
-
- if (remote) {
- prio |= 2;
- h ^= HASH(remote);
- }
- if (local) {
- prio |= 1;
- h ^= HASH(local);
- }
- for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
- if (t == t0) {
- *tp = t->next;
- kfree(dev);
- MOD_DEC_USE_COUNT;
- break;
- }
+ net_serialize_leave();
+ } else {
+ ipip_tunnel_unlink((struct ip_tunnel*)dev->priv);
+ kfree(dev);
+ MOD_DEC_USE_COUNT;
}
}
-
void ipip_err(struct sk_buff *skb, unsigned char *dp, int len)
{
#ifndef I_WISH_WORLD_WERE_PERFECT
@@ -642,6+663,32 @@ ipip_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd)
t = ipip_tunnel_locate(&p, cmd == SIOCADDTUNNEL);
+ if (dev != &ipip_fb_tunnel_dev && cmd == SIOCCHGTUNNEL &&
+ t != &ipip_fb_tunnel) {
+ if (t != NULL) {
+ if (t->dev != dev) {
+ err = -EEXIST;
+ break;
+ }
+ } else {
+ if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
+ (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
+ err = -EINVAL;
+ break;
+ }
+ t = (struct ip_tunnel*)dev->priv;
+ start_bh_atomic();
+ ipip_tunnel_unlink(t);
+ t->parms.iph.saddr = p.iph.saddr;
+ t->parms.iph.daddr = p.iph.daddr;
+ memcpy(dev->dev_addr, &p.iph.saddr, 4);
+ memcpy(dev->broadcast, &p.iph.daddr, 4);
+ ipip_tunnel_link(t);
+ end_bh_atomic();
+ netdev_state_change(dev);
+ }
+ }
+
if (t) {
err = 0;
if (cmd == SIOCCHGTUNNEL) {
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * Version: $Id: ipmr.c,v 1.38 1999/01/12 14:34:40 davem Exp $
+ * Version: $Id: ipmr.c,v 1.39 1999/03/21 05:22:44 davem Exp $
*
* Fixes:
* Michael Chastain : Incorrect size of copying.
@@ -138,6+138,8 @@ static struct device * reg_dev;
static int reg_vif_xmit(struct sk_buff *skb, struct device *dev)
{
+ ((struct net_device_stats*)dev->priv)->tx_bytes += skb->len;
+ ((struct net_device_stats*)dev->priv)->tx_packets++;
ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
kfree_skb(skb);
return 0;
@@ -449,6+451,9 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) struct igmpmsg *msg;
int ret;
+ if (mroute_socket==NULL)
+ return -EINVAL;
+
#ifdef CONFIG_IP_PIMSM
if (assert == IGMPMSG_WHOLEPKT)
skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
@@ -656,7+661,9 @@ static void mrtsock_destruct(struct sock *sk) {
if (sk == mroute_socket) {
ipv4_devconf.mc_forwarding = 0;
+ net_serialize_enter();
mroute_socket=NULL;
+ net_serialize_leave();
mroute_close(sk);
}
}
@@ -1045,7+1052,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c,
dev = rt->u.dst.dev;
- if (skb->len+encap > rt->u.dst.pmtu /* && (ntohs(iph->frag_off) & IP_DF) */) {
+ if (skb->len+encap > rt->u.dst.pmtu && (ntohs(iph->frag_off) & IP_DF)) {
/* Do not fragment multicasts. Alas, IPv4 does not
allow to send ICMP, so that packets will disappear
to blackhole.
@@ -1119,7+1126,10 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, * not mrouter) cannot join to more than one interface - it will
* result in receiving multiple packets.
*/
- skb2->dst->output(skb2);
+ if (skb2->len <= rt->u.dst.pmtu)
+ skb2->dst->output(skb2);
+ else
+ ip_fragment(skb2, skb2->dst->output);
}
int ipmr_find_vif(struct device *dev)
*
* ROUTE - implementation of the IP router.
*
- * Version: $Id: route.c,v 1.62 1999/03/15 22:16:51 davem Exp $
+ * Version: $Id: route.c,v 1.63 1999/03/21 05:22:45 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Andi Kleen : Load-limit warning messages.
* Vitaly E. Lavrov : Transparent proxy revived after year coma.
* Vitaly E. Lavrov : Race condition in ip_route_input_slow.
+ * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
+ * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
+ * Marc Boucher : routing by fwmark
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -108,6+111,7 @@ int ip_rt_redirect_silence = ((HZ/50) << (9+1)); int ip_rt_error_cost = HZ;
int ip_rt_error_burst = 5*HZ;
int ip_rt_gc_elasticity = 8;
+int ip_rt_mtu_expires = 10*60*HZ;
static unsigned long rt_deadline = 0;
@@ -165,13+169,14 @@ __u8 ip_tos2prio[16] = { TC_PRIO_FILLER
};
+
/*
* Route cache.
*/
struct rtable *rt_hash_table[RT_HASH_DIVISOR];
-static struct rtable * rt_intern_hash(unsigned hash, struct rtable * rth);
+static int rt_intern_hash(unsigned hash, struct rtable * rth, struct rtable ** res);
static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
@@ -249,6+254,12 @@ static __inline__ void rt_free(struct rtable *rt) dst_free(&rt->u.dst);
}
+static __inline__ void rt_drop(struct rtable *rt)
+{
+ ip_rt_put(rt);
+ dst_free(&rt->u.dst);
+}
+
static __inline__ int rt_fast_clean(struct rtable *rth)
{
/* Kill broadcast/multicast entries very aggresively, if they
@@ -257,6+268,27 @@ static __inline__ int rt_fast_clean(struct rtable *rth) && rth->key.iif && rth->u.rt_next);
}
+static __inline__ int rt_valuable(struct rtable *rth)
+{
+ return ((rth->rt_flags&(RTCF_REDIRECTED|RTCF_NOTIFY))
+ || rth->u.dst.expires);
+}
+
+static __inline__ int rt_may_expire(struct rtable *rth, int tmo1, int tmo2)
+{
+ int age;
+
+ if (atomic_read(&rth->u.dst.use))
+ return 0;
+
+ age = jiffies - rth->u.dst.lastuse;
+ if (age <= tmo1 && !rt_fast_clean(rth))
+ return 0;
+ if (age <= tmo2 && rt_valuable(rth))
+ return 0;
+ return 1;
+}
+
static void rt_check_expire(unsigned long dummy)
{
int i;
@@ -271,22+303,27 @@ static void rt_check_expire(unsigned long dummy) rthp = &rt_hash_table[rover];
while ((rth = *rthp) != NULL) {
- /*
- * Cleanup aged off entries.
- */
-
- if (!atomic_read(&rth->u.dst.use) &&
- (now - rth->u.dst.lastuse > tmo
- || rt_fast_clean(rth))) {
- *rthp = rth->u.rt_next;
- rt_free(rth);
+ if (rth->u.dst.expires) {
+			/* An expired entry is removed even if it is still in use */
+ if ((long)(now - rth->u.dst.expires) < tmo) {
+ tmo >>= 1;
+ rthp = &rth->u.rt_next;
+ continue;
+ }
+ } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
+ tmo >>= 1;
+ rthp = &rth->u.rt_next;
continue;
}
- tmo >>= 1;
- rthp = &rth->u.rt_next;
+ /*
+ * Cleanup aged off entries.
+ */
+ *rthp = rth->u.rt_next;
+ rt_free(rth);
}
+ /* Fallback loop breaker. */
if ((jiffies - now) > 0)
break;
}
@@ -301,16+338,21 @@ static void rt_run_flush(unsigned long dummy)
rt_deadline = 0;
+ net_serialize_enter();
for (i=0; i<RT_HASH_DIVISOR; i++) {
if ((rth = xchg(&rt_hash_table[i], NULL)) == NULL)
continue;
+ net_serialize_leave();
for (; rth; rth=next) {
next = rth->u.rt_next;
rth->u.rt_next = NULL;
rt_free(rth);
}
+
+ net_serialize_enter();
}
+ net_serialize_leave();
}
void rt_cache_flush(int delay)
@@ -354,60+396,137 @@ void rt_cache_flush(int delay) end_bh_atomic();
}
+/*
+ Short description of GC goals.
+
+ We want to build algorithm, which will keep routing cache
+ We want to build an algorithm which keeps the routing cache
+ at an equilibrium point, where the number of aged-off entries
+ stays approximately equal to the number of newly generated ones.
+
+ The current expiration strength is the variable "expire".
+ We try to adjust it dynamically, so that when the network is idle
+ "expire" stays large enough to keep plenty of warm entries, and
+ when the load increases it shrinks to limit the cache size.
+
static int rt_garbage_collect(void)
{
- int i;
- static unsigned expire = RT_GC_TIMEOUT>>1;
+ static unsigned expire = RT_GC_TIMEOUT;
static unsigned long last_gc;
+ static int rover;
+ static int equilibrium;
struct rtable *rth, **rthp;
unsigned long now = jiffies;
-
- start_bh_atomic();
+ int goal;
/*
* Garbage collection is pretty expensive,
- * do not make it too frequently, but just increase expire strength.
+ * do not make it too frequently.
*/
- if (now - last_gc < ip_rt_gc_min_interval)
- goto out;
+ if (now - last_gc < ip_rt_gc_min_interval &&
+ atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+ return 0;
- expire++;
+ /* Calculate number of entries, which we want to expire now. */
+ goal = atomic_read(&ipv4_dst_ops.entries) - RT_HASH_DIVISOR*ip_rt_gc_elasticity;
+ if (goal <= 0) {
+ if (equilibrium < ipv4_dst_ops.gc_thresh)
+ equilibrium = ipv4_dst_ops.gc_thresh;
+ goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+ if (goal > 0) {
+ equilibrium += min(goal/2, RT_HASH_DIVISOR);
+ goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+ }
+ } else {
+ /* We are in dangerous area. Try to reduce cache really
+		/* We are in a dangerous area. Try to reduce the cache really
+		 * aggressively.
+ goal = max(goal/2, RT_HASH_DIVISOR);
+ equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
+ }
- for (i=0; i<RT_HASH_DIVISOR; i++) {
- unsigned tmo;
- if (!rt_hash_table[i])
- continue;
- tmo = expire;
- for (rthp=&rt_hash_table[i]; (rth=*rthp); rthp=&rth->u.rt_next) {
- if (atomic_read(&rth->u.dst.use) ||
- (now - rth->u.dst.lastuse < tmo && !rt_fast_clean(rth))) {
- tmo >>= 1;
- continue;
+ if (now - last_gc >= ip_rt_gc_min_interval)
+ last_gc = now;
+
+ if (goal <= 0) {
+ equilibrium += goal;
+ goto work_done;
+ }
+
+ do {
+ int i, k;
+
+ start_bh_atomic();
+ for (i=0, k=rover; i<RT_HASH_DIVISOR; i++) {
+ unsigned tmo = expire;
+
+ k = (k + 1) & (RT_HASH_DIVISOR-1);
+ rthp = &rt_hash_table[k];
+ while ((rth = *rthp) != NULL) {
+ if (!rt_may_expire(rth, tmo, expire)) {
+ tmo >>= 1;
+ rthp = &rth->u.rt_next;
+ continue;
+ }
+ *rthp = rth->u.rt_next;
+ rth->u.rt_next = NULL;
+ rt_free(rth);
+ goal--;
}
- *rthp = rth->u.rt_next;
- rth->u.rt_next = NULL;
- rt_free(rth);
- break;
+ if (goal <= 0)
+ break;
}
- if ((jiffies-now)>0)
+ rover = k;
+ end_bh_atomic();
+
+ if (goal <= 0)
+ goto work_done;
+
+		/* The goal was not achieved. We stop the process when:
+
+		   - "expire" has been reduced to zero (otherwise it is halved);
+		   - the table is not yet full;
+		   - we are called from interrupt context;
+		   - the jiffies check is just a fallback/debug loop breaker.
+		   We will not spin here for a long time in any case.
+		 */
+
+ if (expire == 0)
break;
- }
- last_gc = now;
- if (atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
- expire = ip_rt_gc_timeout>>1;
+ expire >>= 1;
+#if RT_CACHE_DEBUG >= 2
+ printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, i);
+#endif
-out:
- expire -= expire>>ip_rt_gc_elasticity;
- end_bh_atomic();
- return (atomic_read(&ipv4_dst_ops.entries) > ip_rt_max_size);
+ if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+ return 0;
+ } while (!in_interrupt() && jiffies - now < 1);
+
+ if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+ return 0;
+ if (net_ratelimit())
+ printk("dst cache overflow\n");
+ return 1;
+
+work_done:
+ expire += ip_rt_gc_min_interval;
+ if (expire > ip_rt_gc_timeout ||
+ atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
+ expire = ip_rt_gc_timeout;
+#if RT_CACHE_DEBUG >= 2
+ printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, rover);
+#endif
+ return 0;
}
-static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt)
+static int rt_intern_hash(unsigned hash, struct rtable * rt, struct rtable ** rp)
{
struct rtable *rth, **rthp;
unsigned long now = jiffies;
+ int attempts = !in_interrupt();
+restart:
start_bh_atomic();
rthp = &rt_hash_table[hash];
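
The "GC goals" comment earlier in this hunk describes an equilibrium scheme: each pass works out how many entries have to go, prunes the hash chains round-robin with a per-chain timeout that halves for every entry it keeps, and then raises or lowers the expiration strength depending on whether the goal was met. Reduced to the goal computation alone, and with illustrative names in place of the kernel's globals, the idea is roughly:

    /* How many cache entries should this GC pass try to remove?
     * 'entries' is the current cache size; 'equilibrium' is the running
     * target size that the scheme adjusts between passes. */
    static int gc_goal(int entries, int hash_size, int elasticity,
                       int gc_thresh, int *equilibrium)
    {
            int goal = entries - hash_size * elasticity;

            if (goal <= 0) {
                    /* Under the elastic limit: let the target drift up. */
                    if (*equilibrium < gc_thresh)
                            *equilibrium = gc_thresh;
                    goal = entries - *equilibrium;
                    if (goal > 0) {
                            int step = goal / 2;
                            if (step > hash_size)
                                    step = hash_size;
                            *equilibrium += step;
                            goal = entries - *equilibrium;
                    }
            } else {
                    /* Over the elastic limit: shrink aggressively. */
                    if (goal / 2 > hash_size)
                            goal /= 2;
                    else
                            goal = hash_size;
                    *equilibrium = entries - goal;
            }
            return goal;
    }
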
@@ -424,9+543,9 @@ static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt) rth->u.dst.lastuse = now;
end_bh_atomic();
- ip_rt_put(rt);
- rt_free(rt);
- return rth;
+ rt_drop(rt);
+ *rp = rth;
+ return 0;
}
rthp = &rth->u.rt_next;
@@ -435,8+554,28 @@ static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt) /* Try to bind route to arp only if it is output
route or unicast forwarding path.
*/
- if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0)
- arp_bind_neighbour(&rt->u.dst);
+ if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
+ if (!arp_bind_neighbour(&rt->u.dst)) {
+ end_bh_atomic();
+
+			/* The neighbour tables are full and nothing
+			   can be released. Try to shrink the route cache;
+			   it most likely holds some neighbour records.
+ */
+ if (attempts-- > 0) {
+ int saved_elasticity = ip_rt_gc_elasticity;
+ ip_rt_gc_elasticity = 1;
+ rt_garbage_collect();
+ ip_rt_gc_elasticity = saved_elasticity;
+ goto restart;
+ }
+
+ rt_drop(rt);
+ if (net_ratelimit())
+ printk("neighbour table overflow\n");
+ return -ENOBUFS;
+ }
+ }
rt->u.rt_next = rt_hash_table[hash];
#if RT_CACHE_DEBUG >= 2
@@ -449,9+588,9 @@ static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt) }
#endif
rt_hash_table[hash] = rt;
-
end_bh_atomic();
- return rt;
+ *rp = rt;
+ return 0;
}
void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -537,17+676,15 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, !(rt->u.dst.neighbour->nud_state&NUD_VALID)) {
if (rt->u.dst.neighbour)
neigh_event_send(rt->u.dst.neighbour, NULL);
- ip_rt_put(rt);
ip_rt_put(rth);
- rt_free(rt);
+ rt_drop(rt);
break;
}
*rthp = rth->u.rt_next;
- rt = rt_intern_hash(hash, rt);
- ip_rt_put(rt);
- ip_rt_put(rth);
- rt_free(rth);
+ if (!rt_intern_hash(hash, rt, &rt))
+ ip_rt_put(rt);
+ rt_drop(rth);
break;
}
}
@@ -573,14+710,14 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) ip_rt_put(rt);
return NULL;
}
- if (rt->rt_flags&RTCF_REDIRECTED) {
+ if ((rt->rt_flags&RTCF_REDIRECTED) || rt->u.dst.expires) {
unsigned hash = rt_hash_code(rt->key.dst, rt->key.src^(rt->key.oif<<5), rt->key.tos);
struct rtable **rthp;
#if RT_CACHE_DEBUG >= 1
printk(KERN_DEBUG "ip_rt_advice: redirect to %d.%d.%d.%d/%02x dropped\n", NIPQUAD(rt->rt_dst), rt->key.tos);
#endif
- ip_rt_put(rt);
start_bh_atomic();
+ ip_rt_put(rt);
for (rthp = &rt_hash_table[hash]; *rthp; rthp = &(*rthp)->u.rt_next) {
if (*rthp == rt) {
*rthp = rt->u.rt_next;
@@ -614,6+751,10 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) void ip_rt_send_redirect(struct sk_buff *skb)
{
struct rtable *rt = (struct rtable*)skb->dst;
+ struct in_device *in_dev = (struct in_device*)rt->u.dst.dev->ip_ptr;
+
+ if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev))
+ return;
/* No redirected packets during ip_rt_redirect_silence;
* reset the algorithm.
@@ -637,7+778,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) rt->u.dst.rate_last = jiffies;
++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
- if (skb->dev->ip_ptr && IN_DEV_LOG_MARTIANS((struct in_device*)skb->dev->ip_ptr) &&
+ if (IN_DEV_LOG_MARTIANS(in_dev) &&
rt->u.dst.rate_tokens == ip_rt_redirect_number && net_ratelimit())
printk(KERN_WARNING "host %08x/if%d ignores redirects for %08x to %08x.\n",
rt->rt_src, rt->rt_iif, rt->rt_dst, rt->rt_gateway);
@@ -737,6+878,7 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) if (mtu < rth->u.dst.pmtu) {
dst_confirm(&rth->u.dst);
rth->u.dst.pmtu = mtu;
+ dst_set_expires(&rth->u.dst, ip_rt_mtu_expires);
}
est_mtu = mtu;
}
@@ -760,7+902,13 @@ static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst,
static void ipv4_link_failure(struct sk_buff *skb)
{
+ struct rtable *rt;
+
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
+
+ rt = (struct rtable *) skb->dst;
+ if (rt)
+ dst_set_expires(&rt->u.dst, 0);
}
static int ip_rt_bug(struct sk_buff *skb)
@@ -794,7+942,17 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) memcpy(addr, &src, 4);
}
-static void rt_set_nexthop(struct rtable *rt, struct fib_result *res)
+#ifdef CONFIG_NET_CLS_ROUTE
+static void set_class_tag(struct rtable *rt, u32 tag)
+{
+ if (!(rt->u.dst.tclassid&0xFFFF))
+ rt->u.dst.tclassid |= tag&0xFFFF;
+ if (!(rt->u.dst.tclassid&0xFFFF0000))
+ rt->u.dst.tclassid |= tag&0xFFFF0000;
+}
+#endif
+
+static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
struct fib_info *fi = res->fi;
@@ -824,9+982,11 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res) rt->u.dst.window= 0;
rt->u.dst.rtt = TCP_TIMEOUT_INIT;
}
-#if defined(CONFIG_NET_CLS_ROUTE) && defined(CONFIG_IP_MULTIPLE_TABLES)
- if (rt->u.dst.tclassid == 0)
- rt->u.dst.tclassid = fib_rules_tclass(res);
+#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ set_class_tag(rt, fib_rules_tclass(res));
+#endif
+ set_class_tag(rt, itag);
#endif
rt->rt_type = res->type;
}
@@ -839,6+999,7 @@ ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr, struct rtable *rth;
u32 spec_dst;
struct in_device *in_dev = dev->ip_ptr;
+ u32 itag = 0;
/* Primary sanity checks. */
@@ -850,7+1011,7 @@ ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr, if (!LOCAL_MCAST(daddr))
return -EINVAL;
spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
- } else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst) < 0)
+ } else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag) < 0)
return -EINVAL;
rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
@@ -863,12+1024,18 @@ ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr, rth->key.dst = daddr;
rth->rt_dst = daddr;
rth->key.tos = tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->key.fwmark = skb->fwmark;
+#endif
rth->key.src = saddr;
rth->rt_src = saddr;
#ifdef CONFIG_IP_ROUTE_NAT
rth->rt_dst_map = daddr;
rth->rt_src_map = saddr;
#endif
+#ifdef CONFIG_NET_CLS_ROUTE
+ rth->u.dst.tclassid = itag;
+#endif
rth->rt_iif =
rth->key.iif = dev->ifindex;
rth->u.dst.dev = &loopback_dev;
@@ -888,8+1055,7 @@ ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr, #endif
hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos);
- skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth);
- return 0;
+ return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
}
/*
@@ -910,6+1076,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, struct in_device *in_dev = dev->ip_ptr;
struct in_device *out_dev;
unsigned flags = 0;
+ u32 itag = 0;
struct rtable * rth;
unsigned hash;
u32 spec_dst;
@@ -925,6+1092,9 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, key.dst = daddr;
key.src = saddr;
key.tos = tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ key.fwmark = skb->fwmark;
+#endif
key.iif = dev->ifindex;
key.oif = 0;
key.scope = RT_SCOPE_UNIVERSE;
@@ -983,9+1153,14 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, goto brd_input;
if (res.type == RTN_LOCAL) {
- spec_dst = daddr;
- if (inet_addr_type(saddr) != RTN_UNICAST)
+ int result;
+ result = fib_validate_source(saddr, daddr, tos, loopback_dev.ifindex,
+ dev, &spec_dst, &itag);
+ if (result < 0)
goto martian_source;
+ if (result)
+ flags |= RTCF_DIRECTSRC;
+ spec_dst = daddr;
goto local_input;
}
@@ -1005,7+1180,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, return -EINVAL;
}
- err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst);
+ err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst, &itag);
if (err < 0)
goto martian_source;
@@ -1033,6+1208,9 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, rth->key.dst = daddr;
rth->rt_dst = daddr;
rth->key.tos = tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->key.fwmark = skb->fwmark;
+#endif
rth->key.src = saddr;
rth->rt_src = saddr;
rth->rt_gateway = daddr;
@@ -1051,7+1229,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, rth->u.dst.input = ip_forward;
rth->u.dst.output = ip_output;
- rt_set_nexthop(rth, &res);
+ rt_set_nexthop(rth, &res, itag);
rth->rt_flags = flags;
@@ -1066,8+1244,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, }
#endif
- skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth);
- return 0;
+ return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
brd_input:
if (skb->protocol != __constant_htons(ETH_P_IP))
@@ -1076,7+1253,7 @@ brd_input: if (ZERONET(saddr)) {
spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
} else {
- err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst);
+ err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag);
if (err < 0)
goto martian_source;
if (err)
@@ -1096,12+1273,18 @@ local_input: rth->key.dst = daddr;
rth->rt_dst = daddr;
rth->key.tos = tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->key.fwmark = skb->fwmark;
+#endif
rth->key.src = saddr;
rth->rt_src = saddr;
#ifdef CONFIG_IP_ROUTE_NAT
rth->rt_dst_map = key.dst;
rth->rt_src_map = key.src;
#endif
+#ifdef CONFIG_NET_CLS_ROUTE
+ rth->u.dst.tclassid = itag;
+#endif
rth->rt_iif =
rth->key.iif = dev->ifindex;
rth->u.dst.dev = &loopback_dev;
@@ -1116,8+1299,7 @@ local_input: rth->rt_flags &= ~RTCF_LOCAL;
}
rth->rt_type = res.type;
- skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth);
- return 0;
+ return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
no_route:
spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
@@ -1170,6+1352,9 @@ int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr, rth->key.src == saddr &&
rth->key.iif == iif &&
rth->key.oif == 0 &&
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->key.fwmark == skb->fwmark &&
+#endif
rth->key.tos == tos) {
rth->u.dst.lastuse = jiffies;
atomic_inc(&rth->u.dst.use);
@@ -1344,43+1529,33 @@ int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int if (res.type == RTN_NAT)
return -EINVAL;
-
- if (!key.src) {
- key.src = FIB_RES_PREFSRC(res);
-
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- /*
- * "Stabilization" of route.
- * This step is necessary, if locally originated packets
- * are subjected to policy routing, otherwise we could get
- * route flapping.
- */
- if (fib_lookup(&key, &res))
- return -ENETUNREACH;
-#endif
+ if (res.type == RTN_LOCAL) {
+ if (!key.src)
+ key.src = key.dst;
+ dev_out = &loopback_dev;
+ key.oif = dev_out->ifindex;
+ res.fi = NULL;
+ flags |= RTCF_LOCAL;
+ goto make_route;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (res.fi->fib_nhs > 1 && key.oif == 0)
fib_select_multipath(&key, &res);
+ else
#endif
+ if (res.prefixlen==0 && res.type == RTN_UNICAST && key.oif == 0)
+ fib_select_default(&key, &res);
- dev_out = FIB_RES_DEV(res);
-
- if (res.type == RTN_LOCAL) {
- dev_out = &loopback_dev;
- key.oif = dev_out->ifindex;
- res.fi = NULL;
- flags |= RTCF_LOCAL;
- }
+ if (!key.src)
+ key.src = FIB_RES_PREFSRC(res);
+ dev_out = FIB_RES_DEV(res);
key.oif = dev_out->ifindex;
make_route:
- if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK)) {
- printk(KERN_DEBUG "this guy talks to %08x from loopback\n", key.dst);
+ if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
return -EINVAL;
- }
if (key.dst == 0xFFFFFFFF)
res.type = RTN_BROADCAST;
@@ -1449,13+1624,12 @@ make_route: #endif
}
- rt_set_nexthop(rth, &res);
+ rt_set_nexthop(rth, &res, 0);
rth->rt_flags = flags;
hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
- *rp = rt_intern_hash(hash, rth);
- return 0;
+ return rt_intern_hash(hash, rth, rp);
}
int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
@@ -1507,7+1681,7 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int no
nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
r = NLMSG_DATA(nlh);
- nlh->nlmsg_flags = nowait ? NLM_F_MULTI : 0;
+ nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
r->rtm_family = AF_INET;
r->rtm_dst_len = 32;
r->rtm_src_len = 0;
@@ -1517,6+1691,8 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int no r->rtm_scope = RT_SCOPE_UNIVERSE;
r->rtm_protocol = RTPROT_UNSPEC;
r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED;
+ if (rt->rt_flags & RTCF_NOTIFY)
+ r->rtm_flags |= RTM_F_NOTIFY;
RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
if (rt->key.src) {
r->rtm_src_len = 32;
@@ -1524,6+1700,10 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int no }
if (rt->u.dst.dev)
RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
+#ifdef CONFIG_NET_CLS_ROUTE
+ if (rt->u.dst.tclassid)
+ RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
+#endif
if (rt->key.iif)
RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
else if (rt->rt_src != rt->key.src)
@@ -1546,7+1726,10 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int no ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
ci.rta_used = atomic_read(&rt->u.dst.refcnt);
ci.rta_clntref = atomic_read(&rt->u.dst.use);
- ci.rta_expires = 0;
+ if (rt->u.dst.expires)
+ ci.rta_expires = rt->u.dst.expires - jiffies;
+ else
+ ci.rta_expires = 0;
ci.rta_error = rt->u.dst.error;
#ifdef CONFIG_IP_MROUTE
eptr = (struct rtattr*)skb->tail;
@@ -1625,7+1808,7 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) end_bh_atomic();
rt = (struct rtable*)skb->dst;
if (!err && rt->u.dst.error)
- err = rt->u.dst.error;
+ err = -rt->u.dst.error;
} else {
int oif = 0;
if (rta[RTA_OIF-1])
@@ -1667,7+1850,7 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) for (h=0; h < RT_HASH_DIVISOR; h++) {
if (h < s_h) continue;
if (h > s_h)
- memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(int));
+ memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0]));
start_bh_atomic();
for (rt = rt_hash_table[h], idx = 0; rt; rt = rt->u.rt_next, idx++) {
if (idx < s_idx)
@@ -1758,12+1941,45 @@ ctl_table ipv4_route_table[] = { {NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity",
&ip_rt_gc_elasticity, sizeof(int), 0644, NULL,
&proc_dointvec},
+ {NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires",
+ &ip_rt_mtu_expires, sizeof(int), 0644, NULL,
+ &proc_dointvec_jiffies},
{0}
};
#endif
+#ifdef CONFIG_NET_CLS_ROUTE
+struct ip_rt_acct ip_rt_acct[256];
+
+#ifdef CONFIG_PROC_FS
+static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
+ int length, int *eof, void *data)
+{
+ *start=buffer;
+
+ if (offset + length > sizeof(ip_rt_acct)) {
+ length = sizeof(ip_rt_acct) - offset;
+ *eof = 1;
+ }
+ if (length > 0) {
+ start_bh_atomic();
+ memcpy(buffer, ((u8*)&ip_rt_acct)+offset, length);
+ end_bh_atomic();
+ return length;
+ }
+ return 0;
+}
+#endif
+#endif
+
+
__initfunc(void ip_rt_init(void))
{
+#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_NET_CLS_ROUTE
+ struct proc_dir_entry *ent;
+#endif
+#endif
devinet_init();
ip_fib_init();
rt_periodic_timer.function = rt_check_expire;
@@ -1781,5+1997,9 @@ __initfunc(void ip_rt_init(void)) 0, &proc_net_inode_operations,
rt_cache_get_info
});
+#ifdef CONFIG_NET_CLS_ROUTE
+ ent = create_proc_entry("net/rt_acct", 0, 0);
+ ent->read_proc = ip_rt_acct_read;
+#endif
#endif
}
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_ipv4.c,v 1.169 1999/03/11 00:04:22 davem Exp $
+ * Version: $Id: tcp_ipv4.c,v 1.170 1999/03/21 05:22:47 davem Exp $
*
* IPv4 specific functions
*
@@ -726,6+726,9 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip) {
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ if (atomic_read(&sk->sock_readers))
+ return;
+
/* Don't interested in TCP_LISTEN and open_requests (SYN-ACKs
* send out by Linux are always <576bytes so they should go through
* unfragmented).
@@ -739,19+742,18 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip) * There is a small race when the user changes this flag in the
* route, but I think that's acceptable.
*/
- if (sk->ip_pmtudisc != IP_PMTUDISC_DONT && sk->dst_cache) {
- if (tp->pmtu_cookie > sk->dst_cache->pmtu &&
- !atomic_read(&sk->sock_readers)) {
- tcp_sync_mss(sk, sk->dst_cache->pmtu);
-
- /* Resend the TCP packet because it's
- * clear that the old packet has been
- * dropped. This is the new "fast" path mtu
- * discovery.
- */
- tcp_simple_retransmit(sk);
- } /* else let the usual retransmit timer handle it */
- }
+ if (sk->dst_cache &&
+ sk->ip_pmtudisc != IP_PMTUDISC_DONT &&
+ tp->pmtu_cookie > sk->dst_cache->pmtu) {
+ tcp_sync_mss(sk, sk->dst_cache->pmtu);
+
+ /* Resend the TCP packet because it's
+ * clear that the old packet has been
+ * dropped. This is the new "fast" path mtu
+ * discovery.
+ */
+ tcp_simple_retransmit(sk);
+ } /* else let the usual retransmit timer handle it */
}
/*
@@ -778,6+780,11 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) struct tcp_opt *tp;
int type = skb->h.icmph->type;
int code = skb->h.icmph->code;
+#if ICMP_MIN_LENGTH < 14
+ int no_flags = 0;
+#else
+#define no_flags 0
+#endif
struct sock *sk;
__u32 seq;
int err;
@@ -786,6+793,10 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) icmp_statistics.IcmpInErrors++;
return;
}
+#if ICMP_MIN_LENGTH < 14
+ if (len < (iph->ihl << 2) + 14)
+ no_flags = 1;
+#endif
th = (struct tcphdr*)(dp+(iph->ihl<<2));
@@ -852,7+863,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) * ACK should set the opening flag, but that is too
* complicated right now.
*/
- if (!th->syn && !th->ack)
+ if (!no_flags && !th->syn && !th->ack)
return;
req = tcp_v4_search_req(tp, iph, th, &prev);
@@ -887,7+898,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) break;
case TCP_SYN_SENT:
case TCP_SYN_RECV: /* Cannot happen */
- if (!th->syn)
+ if (!no_flags && !th->syn)
return;
tcp_statistics.TcpAttemptFails++;
sk->err = err;
*
* The User Datagram Protocol (UDP).
*
- * Version: $Id: udp.c,v 1.64 1998/11/08 11:17:07 davem Exp $
+ * Version: $Id: udp.c,v 1.65 1999/03/21 05:22:49 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -570,7+570,6 @@ struct udpfakehdr struct udphdr uh;
u32 saddr;
u32 daddr;
- u32 other;
struct iovec *iov;
u32 wcheck;
};
@@ -778,7+777,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) ufh.daddr = ipc.addr = rt->rt_dst;
ufh.uh.len = htons(ulen);
ufh.uh.check = 0;
- ufh.other = (htons(ulen) << 16) + IPPROTO_UDP*256;
ufh.iov = msg->msg_iov;
ufh.wcheck = 0;
@@ -846,7+844,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) return(0);
}
-#if defined(CONFIG_FILTER) || !defined(HAVE_CSUM_COPY_USER)
+#ifndef HAVE_CSUM_COPY_USER
#undef CONFIG_UDP_DELAY_CSUM
#endif
@@ -890,11+888,11 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len, err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
copied);
#else
- if (sk->no_check || skb->ip_summed==CHECKSUM_UNNECESSARY) {
+ if (skb->ip_summed==CHECKSUM_UNNECESSARY) {
err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
copied);
} else if (copied > msg->msg_iov[0].iov_len || (msg->msg_flags&MSG_TRUNC)) {
- if (csum_fold(csum_partial(skb->h.raw, ntohs(skb->h.uh->len), skb->csum)))
+ if ((unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum)))
goto csum_copy_err;
err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
copied);
@@ -907,7+905,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len, copied, csum, &err);
if (err)
goto out_free;
- if (csum_fold(csum))
+ if ((unsigned short)csum_fold(csum))
goto csum_copy_err;
}
#endif
@@ -1030,6+1028,19 @@ static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) * Charge it to the socket, dropping if the queue is full.
*/
+#if defined(CONFIG_FILTER) && defined(CONFIG_UDP_DELAY_CSUM)
+ if (sk->filter && skb->ip_summed != CHECKSUM_UNNECESSARY) {
+ if ((unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum))) {
+ udp_statistics.UdpInErrors++;
+ ip_statistics.IpInDiscards++;
+ ip_statistics.IpInDelivers--;
+ kfree_skb(skb);
+ return -1;
+ }
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+#endif
+
if (sock_queue_rcv_skb(sk,skb)<0) {
udp_statistics.UdpInErrors++;
ip_statistics.IpInDiscards++;
@@ -1179,7+1190,7 @@ int udp_rcv(struct sk_buff *skb, unsigned short len) if (sk == NULL) {
#ifdef CONFIG_UDP_DELAY_CSUM
if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
- csum_fold(csum_partial((char*)uh, ulen, skb->csum)))
+ (unsigned short)csum_fold(csum_partial((char*)uh, ulen, skb->csum)))
goto csum_error;
#endif
udp_statistics.UdpNoPorts++;
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: addrconf.c,v 1.46 1999/01/12 14:34:47 davem Exp $
+ * $Id: addrconf.c,v 1.47 1999/03/21 05:22:50 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -88,6+88,34 @@ static struct timer_list addr_chk_timer = { 0, 0, addrconf_verify
};
+/* These locks protect only against address deletions,
+ not against address additions or status updates.
+ That is OK. The only race is when an address is selected
+ and then becomes invalid immediately after selection.
+ It is harmless, because that address could already have become
+ invalid a few microseconds earlier anyway.
+
+ It is important that:
+
+ 1. The result of inet6_add_addr() is used only inside lock
+ or from bh_atomic context.
+
+ 2. inet6_get_lladdr() is used only from bh protected context.
+
+ 3. The result of ipv6_chk_addr() is not used outside of bh protected context.
+ */
+
+static __inline__ void addrconf_lock(void)
+{
+ atomic_inc(&addr_list_lock);
+ synchronize_bh();
+}
+
+static __inline__ void addrconf_unlock(void)
+{
+ atomic_dec(&addr_list_lock);
+}
+
static int addrconf_ifdown(struct device *dev, int how);
static void addrconf_dad_start(struct inet6_ifaddr *ifp);
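
addrconf_lock()/addrconf_unlock() above only keep entries from being deleted
while a list is walked; additions and status updates can still happen, so a
selected address may turn invalid right after the unlock. A minimal usage
sketch, in the style of the patched ipv6_get_lladdr() below; find_by_scope()
is an illustrative name and kernel context is assumed.

    static struct inet6_ifaddr *find_by_scope(struct inet6_dev *idev, int scope)
    {
            struct inet6_ifaddr *ifp;

            addrconf_lock();                /* deletions are blocked from here on  */
            for (ifp = idev->addr_list; ifp; ifp = ifp->if_next) {
                    if (ifp->scope == scope)
                            break;          /* may become invalid after the unlock */
            }
            addrconf_unlock();
            return ifp;
    }
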
@@ -188,7+216,7 @@ static struct inet6_dev * ipv6_add_dev(struct device *dev) if (dev->mtu < IPV6_MIN_MTU)
return NULL;
- ndev = kmalloc(sizeof(struct inet6_dev), gfp_any());
+ ndev = kmalloc(sizeof(struct inet6_dev), GFP_KERNEL);
if (ndev) {
memset(ndev, 0, sizeof(struct inet6_dev));
@@ -227,9+255,9 @@ static struct inet6_dev * ipv6_find_idev(struct device *dev) idev = ipv6_add_dev(dev);
if (idev == NULL)
return NULL;
+ if (dev->flags&IFF_UP)
+ ipv6_mc_up(idev);
}
- if (dev->flags&IFF_UP)
- ipv6_mc_up(idev);
return idev;
}
@@ -260,13+288,13 @@ struct inet6_dev * ipv6_get_idev(struct device *dev) return NULL;
}
-struct inet6_ifaddr * ipv6_add_addr(struct inet6_dev *idev,
- struct in6_addr *addr, int scope)
+static struct inet6_ifaddr *
+ipv6_add_addr(struct inet6_dev *idev, struct in6_addr *addr, int scope)
{
struct inet6_ifaddr *ifa;
int hash;
- ifa = kmalloc(sizeof(struct inet6_ifaddr), gfp_any());
+ ifa = kmalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC);
if (ifa == NULL) {
ADBG(("ipv6_add_addr: malloc failed\n"));
@@ -312,7+340,9 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
for (; iter; iter = iter->lst_next) {
if (iter == ifp) {
+ net_serialize_enter();
*back = ifp->lst_next;
+ net_serialize_leave();
ifp->lst_next = NULL;
break;
}
@@ -324,7+354,9 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
for (; iter; iter = iter->if_next) {
if (iter == ifp) {
+ net_serialize_enter();
*back = ifp->if_next;
+ net_serialize_leave();
ifp->if_next = NULL;
break;
}
@@ -343,24+375,23 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) * ii) see if there is a specific route for the destination and use
* an address of the attached interface
* iii) don't use deprecated addresses
- *
- * at the moment I believe only iii) is missing.
*/
-struct inet6_ifaddr * ipv6_get_saddr(struct dst_entry *dst,
- struct in6_addr *daddr)
+int ipv6_get_saddr(struct dst_entry *dst,
+ struct in6_addr *daddr, struct in6_addr *saddr)
{
int scope;
struct inet6_ifaddr *ifp = NULL;
struct inet6_ifaddr *match = NULL;
struct device *dev = NULL;
struct rt6_info *rt;
+ int err;
int i;
rt = (struct rt6_info *) dst;
if (rt)
dev = rt->rt6i_dev;
- atomic_inc(&addr_list_lock);
+ addrconf_lock();
scope = ipv6_addr_scope(daddr);
if (rt && (rt->rt6i_flags & RTF_ALLONLINK)) {
@@ -388,10+419,10 @@ struct inet6_ifaddr * ipv6_get_saddr(struct dst_entry *dst, if (idev->dev == dev) {
for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) {
if (ifp->scope == scope) {
- if (!(ifp->flags & ADDR_STATUS))
+ if (!(ifp->flags & (ADDR_STATUS|DAD_STATUS)))
goto out;
- if (!(ifp->flags & ADDR_INVALID))
+ if (!(ifp->flags & (ADDR_INVALID|DAD_STATUS)))
match = ifp;
}
}
@@ -410,10+441,10 @@ struct inet6_ifaddr * ipv6_get_saddr(struct dst_entry *dst, for (i=0; i < IN6_ADDR_HSIZE; i++) {
for (ifp=inet6_addr_lst[i]; ifp; ifp=ifp->lst_next) {
if (ifp->scope == scope) {
- if (!(ifp->flags & ADDR_STATUS))
+ if (!(ifp->flags & (ADDR_STATUS|DAD_STATUS)))
goto out;
- if (!(ifp->flags & ADDR_INVALID))
+ if (!(ifp->flags & (ADDR_INVALID|DAD_STATUS)))
match = ifp;
}
}
@@ -422,28+453,30 @@ struct inet6_ifaddr * ipv6_get_saddr(struct dst_entry *dst, out:
if (ifp == NULL)
ifp = match;
- atomic_dec(&addr_list_lock);
- return ifp;
+
+ err = -ENETUNREACH;
+ if (ifp) {
+ memcpy(saddr, &ifp->addr, sizeof(struct in6_addr));
+ err = 0;
+ }
+ addrconf_unlock();
+ return err;
}
struct inet6_ifaddr * ipv6_get_lladdr(struct device *dev)
{
- struct inet6_ifaddr *ifp;
+ struct inet6_ifaddr *ifp = NULL;
struct inet6_dev *idev;
- int hash;
-
- hash = ipv6_devindex_hash(dev->ifindex);
- for (idev = inet6_dev_lst[hash]; idev; idev=idev->next) {
- if (idev->dev == dev) {
- for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) {
- if (ifp->scope == IFA_LINK)
- return ifp;
- }
- break;
+ if ((idev = ipv6_get_idev(dev)) != NULL) {
+ addrconf_lock();
+ for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) {
+ if (ifp->scope == IFA_LINK)
+ break;
}
+ addrconf_unlock();
}
- return NULL;
+ return ifp;
}
/*
@@ -461,7+494,7 @@ struct inet6_ifaddr * ipv6_chk_addr(struct in6_addr *addr, struct device *dev, i if (!nd)
flags |= DAD_STATUS|ADDR_INVALID;
- atomic_inc(&addr_list_lock);
+ addrconf_lock();
hash = ipv6_addr_hash(addr);
for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) {
@@ -472,7+505,7 @@ struct inet6_ifaddr * ipv6_chk_addr(struct in6_addr *addr, struct device *dev, i }
}
- atomic_dec(&addr_list_lock);
+ addrconf_unlock();
return ifp;
}
@@ -665,13+698,6 @@ void addrconf_prefix_rcv(struct device *dev, u8 *opt, int len) }
/*
- * If we where using an "all destinations on link" route
- * delete it
- */
-
- rt6_purge_dflt_routers(RTF_ALLONLINK);
-
- /*
* Two things going on here:
* 1) Add routes for on-link prefixes
* 2) Configure prefixes with the auto flag set
@@ -845,14+871,17 @@ static int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen)
scope = ipv6_addr_scope(pfx);
- if ((ifp = ipv6_add_addr(idev, pfx, scope)) == NULL)
- return -ENOMEM;
-
- ifp->prefix_len = plen;
- ifp->flags |= ADDR_PERMANENT;
+ addrconf_lock();
+ if ((ifp = ipv6_add_addr(idev, pfx, scope)) != NULL) {
+ ifp->prefix_len = plen;
+ ifp->flags |= ADDR_PERMANENT;
+ addrconf_dad_start(ifp);
+ addrconf_unlock();
+ return 0;
+ }
+ addrconf_unlock();
- addrconf_dad_start(ifp);
- return 0;
+ return -ENOBUFS;
}
static int inet6_addr_del(int ifindex, struct in6_addr *pfx, int plen)
@@ -870,20+899,22 @@ static int inet6_addr_del(int ifindex, struct in6_addr *pfx, int plen)
scope = ipv6_addr_scope(pfx);
+ start_bh_atomic();
for (ifp = idev->addr_list; ifp; ifp=ifp->if_next) {
if (ifp->scope == scope && ifp->prefix_len == plen &&
(!memcmp(pfx, &ifp->addr, sizeof(struct in6_addr)))) {
ipv6_del_addr(ifp);
+ end_bh_atomic();
/* If the last address is deleted administratively,
disable IPv6 on this interface.
*/
-
if (idev->addr_list == NULL)
addrconf_ifdown(idev->dev, 1);
return 0;
}
}
+ end_bh_atomic();
return -EADDRNOTAVAIL;
}
@@ -940,12+971,14 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) }
if (addr.s6_addr32[3]) {
+ addrconf_lock();
ifp = ipv6_add_addr(idev, &addr, scope);
if (ifp) {
ifp->flags |= ADDR_PERMANENT;
ifp->prefix_len = 128;
ipv6_ifa_notify(RTM_NEWADDR, ifp);
}
+ addrconf_unlock();
return;
}
@@ -967,17+1000,17 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) flag |= IFA_HOST;
}
+ addrconf_lock();
ifp = ipv6_add_addr(idev, &addr, flag);
-
- if (ifp == NULL)
- continue;
-
- if (idev->dev->flags&IFF_POINTOPOINT)
- ifp->prefix_len = 10;
- else
- ifp->prefix_len = 96;
- ifp->flags |= ADDR_PERMANENT;
- ipv6_ifa_notify(RTM_NEWADDR, ifp);
+ if (ifp) {
+ if (idev->dev->flags&IFF_POINTOPOINT)
+ ifp->prefix_len = 10;
+ else
+ ifp->prefix_len = 96;
+ ifp->flags |= ADDR_PERMANENT;
+ ipv6_ifa_notify(RTM_NEWADDR, ifp);
+ }
+ addrconf_unlock();
}
}
}
@@ -999,31+1032,29 @@ static void init_loopback(struct device *dev) return;
}
+ addrconf_lock();
ifp = ipv6_add_addr(idev, &addr, IFA_HOST);
- if (ifp == NULL) {
- printk(KERN_DEBUG "init_loopback: add_addr failed\n");
- return;
+ if (ifp) {
+ ifp->flags |= ADDR_PERMANENT;
+ ifp->prefix_len = 128;
+ ipv6_ifa_notify(RTM_NEWADDR, ifp);
}
-
- ifp->flags |= ADDR_PERMANENT;
- ifp->prefix_len = 128;
-
- ipv6_ifa_notify(RTM_NEWADDR, ifp);
+ addrconf_unlock();
}
static void addrconf_add_linklocal(struct inet6_dev *idev, struct in6_addr *addr)
{
struct inet6_ifaddr * ifp;
+ addrconf_lock();
ifp = ipv6_add_addr(idev, addr, IFA_LINK);
- if (ifp == NULL)
- return;
-
- ifp->flags = ADDR_PERMANENT;
- ifp->prefix_len = 10;
-
- addrconf_dad_start(ifp);
+ if (ifp) {
+ ifp->flags = ADDR_PERMANENT;
+ ifp->prefix_len = 10;
+ addrconf_dad_start(ifp);
+ }
+ addrconf_unlock();
}
static void addrconf_dev_config(struct device *dev)
@@ -1375,8+1406,12 @@ static int iface_proc_info(char *buffer, char **start, off_t offset, struct inet6_ifaddr *ifp;
int i;
int len = 0;
+ off_t pos=0;
+ off_t begin=0;
- for (i=0; i < IN6_ADDR_HSIZE; i++)
+ addrconf_lock();
+
+ for (i=0; i < IN6_ADDR_HSIZE; i++) {
for (ifp=inet6_addr_lst[i]; ifp; ifp=ifp->lst_next) {
int j;
@@ -1393,14+1428,25 @@ static int iface_proc_info(char *buffer, char **start, off_t offset, ifp->scope,
ifp->flags,
ifp->idev->dev->name);
+ pos=begin+len;
+ if(pos<offset) {
+ len=0;
+ begin=pos;
+ }
+ if(pos>offset+length)
+ goto done;
}
+ }
- *start = buffer + offset;
-
- len -= offset;
-
- if (len > length)
- len = length;
+done:
+ addrconf_unlock();
+
+ *start=buffer+(offset-begin);
+ len-=(offset-begin);
+ if(len>length)
+ len=length;
+ if(len<0)
+ len=0;
return len;
}
@@ -1423,6+1469,12 @@ void addrconf_verify(unsigned long foo) unsigned long now = jiffies;
int i;
+ if (atomic_read(&addr_list_lock)) {
+ addr_chk_timer.expires = jiffies + 1*HZ;
+ add_timer(&addr_chk_timer);
+ return;
+ }
+
for (i=0; i < IN6_ADDR_HSIZE; i++) {
for (ifp=inet6_addr_lst[i]; ifp;) {
if (ifp->flags & ADDR_INVALID) {
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: icmp.c,v 1.20 1998/10/03 09:38:31 davem Exp $
+ * $Id: icmp.c,v 1.21 1999/03/21 05:22:51 davem Exp $
*
* Based on net/ipv4/icmp.c
*
@@ -200,9+200,11 @@ static inline int icmpv6_xrlim_allow(struct sock *sk, int type, * this lookup should be more aggressive (not longer than timeout).
*/
dst = ip6_route_output(sk, fl);
- if (dst->error)
+ if (dst->error) {
ipv6_statistics.Ip6OutNoRoutes++;
- else {
+ } else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) {
+ res = 1;
+ } else {
struct rt6_info *rt = (struct rt6_info *)dst;
int tmo = sysctl_icmpv6_time;
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: ip6_fib.c,v 1.15 1998/08/26 12:04:55 davem Exp $
+ * $Id: ip6_fib.c,v 1.16 1999/03/21 05:22:52 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -103,8+103,8 @@ static struct fib6_walker_t fib6_walker_list = { static __inline__ u32 fib6_new_sernum(void)
{
u32 n = ++rt_sernum;
- if (n == 0)
- n = ++rt_sernum;
+ if ((__s32)n <= 0)
+ rt_sernum = n = 1;
return n;
}
@@ -1157,7+1157,6 @@ static int fib6_age(struct rt6_info *rt, void *arg) return -1;
}
gc_args.more++;
- return 0;
}
/*
@@ -1171,7+1170,6 @@ static int fib6_age(struct rt6_info *rt, void *arg) return -1;
}
gc_args.more++;
- return 0;
}
return 0;
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: ip6_output.c,v 1.15 1998/10/03 09:38:34 davem Exp $
+ * $Id: ip6_output.c,v 1.16 1999/03/21 05:22:54 davem Exp $
*
* Based on linux/net/ipv4/ip_output.c
*
@@ -77,11+77,14 @@ int ip6_output(struct sk_buff *skb) /* Alpha has disguisting memcpy. Help it. */
u64 *aligned_hdr = (u64*)(skb->data - 16);
u64 *aligned_hdr0 = hh->hh_data;
+ read_lock_irq(&hh->hh_lock);
aligned_hdr[0] = aligned_hdr0[0];
aligned_hdr[1] = aligned_hdr0[1];
#else
+ read_lock_irq(&hh->hh_lock);
memcpy(skb->data - 16, hh->hh_data, 16);
#endif
+ read_unlock_irq(&hh->hh_lock);
skb_push(skb, dev->hard_header_len);
return hh->hh_output(skb);
} else if (dst->neighbour)
@@ -164,7+167,9 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, }
printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
+ start_bh_atomic();
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst->pmtu, skb->dev);
+ end_bh_atomic();
kfree_skb(skb);
return -EMSGSIZE;
}
@@ -427,6+432,7 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, struct dst_entry *dst;
int err = 0;
unsigned int pktlength, jumbolen, mtu;
+ struct in6_addr saddr;
if (opt && opt->srcrt) {
struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
@@ -481,19+487,16 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, }
if (fl->nl_u.ip6_u.saddr == NULL) {
- struct inet6_ifaddr *ifa;
-
- ifa = ipv6_get_saddr(dst, fl->nl_u.ip6_u.daddr);
+ err = ipv6_get_saddr(dst, fl->nl_u.ip6_u.daddr, &saddr);
- if (ifa == NULL) {
+ if (err) {
#if IP6_DEBUG >= 2
printk(KERN_DEBUG "ip6_build_xmit: "
"no availiable source address\n");
#endif
- err = -ENETUNREACH;
goto out;
}
- fl->nl_u.ip6_u.saddr = &ifa->addr;
+ fl->nl_u.ip6_u.saddr = &saddr;
}
pktlength = length;
*
* Based on linux/net/ipv4/ip_sockglue.c
*
- * $Id: ipv6_sockglue.c,v 1.24 1998/10/03 09:38:37 davem Exp $
+ * $Id: ipv6_sockglue.c,v 1.25 1999/03/21 05:22:54 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -86,7+86,9 @@ int ip6_ra_control(struct sock *sk, int sel, void (*destructor)(struct sock *)) kfree(new_ra);
return -EADDRINUSE;
}
+ net_serialize_enter();
*rap = ra->next;
+ net_serialize_leave();
if (ra->destructor)
ra->destructor(sk);
kfree(ra);
@@ -136,15+138,16 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, if (sk->protocol != IPPROTO_UDP &&
sk->protocol != IPPROTO_TCP)
goto out;
-
+
+ lock_sock(sk);
if (sk->state != TCP_ESTABLISHED) {
retv = ENOTCONN;
- goto out;
+ goto addrform_done;
}
if (!(ipv6_addr_type(&np->daddr) & IPV6_ADDR_MAPPED)) {
retv = -EADDRNOTAVAIL;
- goto out;
+ goto addrform_done;
}
if (sk->protocol == IPPROTO_TCP) {
@@ -166,6+169,9 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, if (pktopt)
kfree_skb(pktopt);
retv = 0;
+
+addrform_done:
+ release_sock(sk);
} else {
retv = -EINVAL;
}
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: mcast.c,v 1.17 1998/08/26 12:05:06 davem Exp $
+ * $Id: mcast.c,v 1.18 1999/03/21 05:22:55 davem Exp $
*
* Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c
*
@@ -132,7+132,9 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, struct in6_addr *addr) if (mc_lst->ifindex == ifindex &&
ipv6_addr_cmp(&mc_lst->addr, addr) == 0) {
struct device *dev;
+ net_serialize_enter();
*lnk = mc_lst->next;
+ net_serialize_leave();
if ((dev = dev_get_by_index(ifindex)) != NULL)
ipv6_dev_mc_dec(dev, &mc_lst->addr);
sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
@@ -252,7+254,9 @@ static void ipv6_mca_remove(struct device *dev, struct ifmcaddr6 *ma)
for (lnk = &idev->mc_list; (iter = *lnk) != NULL; lnk = &iter->if_next) {
if (iter == ma) {
+ net_serialize_enter();
*lnk = iter->if_next;
+ net_serialize_leave();
return;
}
}
@@ -273,7+277,9 @@ int ipv6_dev_mc_dec(struct device *dev, struct in6_addr *addr) if (ipv6_addr_cmp(&ma->mca_addr, addr) == 0 && ma->dev == dev) {
if (atomic_dec_and_test(&ma->mca_users)) {
igmp6_group_dropped(ma);
+ net_serialize_enter();
*lnk = ma->next;
+ net_serialize_leave();
ipv6_mca_remove(dev, ma);
kfree(ma);
}
@@ -496,10+502,10 @@ static void igmp6_join_group(struct ifmcaddr6 *ma) if ((addr_type & (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_LOOPBACK)))
return;
+ start_bh_atomic();
igmp6_send(&ma->mca_addr, ma->dev, ICMPV6_MGM_REPORT);
delay = net_random() % IGMP6_UNSOLICITED_IVAL;
- start_bh_atomic();
if (del_timer(&ma->mca_timer))
delay = ma->mca_timer.expires - jiffies;
@@ -519,11+525,13 @@ static void igmp6_leave_group(struct ifmcaddr6 *ma) if ((addr_type & IPV6_ADDR_LINKLOCAL))
return;
+ start_bh_atomic();
if (ma->mca_flags & MAF_LAST_REPORTER)
igmp6_send(&ma->mca_addr, ma->dev, ICMPV6_MGM_REDUCTION);
if (ma->mca_flags & MAF_TIMER_RUNNING)
del_timer(&ma->mca_timer);
+ end_bh_atomic();
}
void igmp6_timer_handler(unsigned long data)
@@ -577,10+585,22 @@ void ipv6_mc_up(struct inet6_dev *idev)
void ipv6_mc_destroy_dev(struct inet6_dev *idev)
{
- struct ifmcaddr6 *i;
+ int hash;
+ struct ifmcaddr6 *i, **lnk;
while ((i = idev->mc_list) != NULL) {
idev->mc_list = i->if_next;
+
+ hash = ipv6_addr_hash(&i->mca_addr);
+
+ for (lnk = &inet6_mcast_lst[hash]; *lnk; lnk = &(*lnk)->next) {
+ if (*lnk == i) {
+ net_serialize_enter();
+ *lnk = i->next;
+ net_serialize_leave();
+ break;
+ }
+ }
igmp6_group_dropped(i);
kfree(i);
}
len-=(offset-begin);
if(len>length)
len=length;
+ if (len<0)
+ len=0;
return len;
}
#endif
@@ -335,7+335,7 @@ void ndisc_send_na(struct device *dev, struct neighbour *neigh, msg->icmph.icmp6_unused = 0;
msg->icmph.icmp6_router = router;
msg->icmph.icmp6_solicited = solicited;
- msg->icmph.icmp6_override = override;
+ msg->icmph.icmp6_override = !!override;
/* Set the target address. */
ipv6_addr_copy(&msg->target, solicited_addr);
@@ -497,7+497,7 @@ static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb) * "The sender MUST return an ICMP
* destination unreachable"
*/
- icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
+ dst_link_failure(skb);
kfree_skb(skb);
}
@@ -604,6+604,13 @@ static void ndisc_router_discovery(struct sk_buff *skb) return;
}
neigh->flags |= NTF_ROUTER;
+
+ /*
+ * If we where using an "all destinations on link" route
+ * delete it
+ */
+
+ rt6_purge_dflt_routers(RTF_ALLONLINK);
}
if (rt)
@@ -989,7+996,7 @@ int ndisc_rcv(struct sk_buff *skb, unsigned long len)
if (neigh) {
ndisc_send_na(dev, neigh, saddr, &msg->target,
- 0, 0, inc, inc);
+ 0, 1, 0, inc);
neigh_release(neigh);
}
} else {
@@ -1173,7+1180,6 @@ __initfunc(int ndisc_init(struct net_proto_family *ops)) sk = ndisc_socket->sk;
sk->allocation = GFP_ATOMIC;
sk->net_pinfo.af_inet6.hop_limit = 255;
- sk->net_pinfo.af_inet6.priority = 15;
/* Do not loopback ndisc messages */
sk->net_pinfo.af_inet6.mc_loop = 0;
sk->num = 256;
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: route.c,v 1.34 1998/10/03 09:38:43 davem Exp $
+ * $Id: route.c,v 1.35 1999/03/21 05:22:57 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -71,6+71,7 @@ int ip6_rt_gc_min_interval = 5*HZ; int ip6_rt_gc_timeout = 60*HZ;
int ip6_rt_gc_interval = 30*HZ;
int ip6_rt_gc_elasticity = 9;
+int ip6_rt_mtu_expires = 10*60*HZ;
static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
@@ -97,7+98,7 @@ struct dst_ops ip6_dst_ops = {
struct rt6_info ip6_null_entry = {
{{NULL, ATOMIC_INIT(1), ATOMIC_INIT(1), &loopback_dev,
- -1, 0, 0, 0, 0, 0, 0, 0,
+ -1, 0, 0, 0, 0, 0, 0, 0, 0,
-ENETUNREACH, NULL, NULL,
ip6_pkt_discard, ip6_pkt_discard,
#ifdef CONFIG_NET_CLS_ROUTE
@@ -105,7+106,7 @@ struct rt6_info ip6_null_entry = { #endif
&ip6_dst_ops}},
NULL, {{{0}}}, RTF_REJECT|RTF_NONEXTHOP, ~0U,
- 255, 0, ATOMIC_INIT(1), {NULL}, {{{{0}}}, 0}, {{{{0}}}, 0}
+ 255, ATOMIC_INIT(1), {NULL}, {{{{0}}}, 0}, {{{{0}}}, 0}
};
struct fib6_node ip6_routing_table = {
@@ -515,13+516,30 @@ static struct dst_entry *ip6_dst_reroute(struct dst_entry *dst, struct sk_buff *
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
- dst_release(dst);
+ struct rt6_info *rt = (struct rt6_info *) dst;
+
+ if (rt) {
+ if (rt->rt6i_flags & RTF_CACHE)
+ ip6_del_rt(rt);
+ dst_release(dst);
+ }
return NULL;
}
static void ip6_link_failure(struct sk_buff *skb)
{
+ struct rt6_info *rt;
+
icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
+
+ rt = (struct rt6_info *) skb->dst;
+ if (rt) {
+ if (rt->rt6i_flags&RTF_CACHE) {
+ dst_set_expires(&rt->u.dst, 0);
+ rt->rt6i_flags |= RTF_EXPIRES;
+ } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
+ rt->rt6i_node->fn_sernum = -1;
+ }
}
static int ip6_dst_gc()
@@ -1009,12+1027,10 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, when cache entry will expire old pmtu
would return automatically.
*/
- if (rt->rt6i_dst.plen == 128) {
- /*
- * host route
- */
+ if (rt->rt6i_flags & RTF_CACHE) {
rt->u.dst.pmtu = pmtu;
- rt->rt6i_flags |= RTF_MODIFIED;
+ dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
+ rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
goto out;
}
@@ -1025,9+1041,12 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, */
if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
nrt = rt6_cow(rt, daddr, saddr);
- nrt->u.dst.pmtu = pmtu;
- nrt->rt6i_flags |= RTF_DYNAMIC;
- dst_release(&nrt->u.dst);
+ if (!nrt->u.dst.error) {
+ nrt->u.dst.pmtu = pmtu;
+ dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
+ nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
+ dst_release(&nrt->u.dst);
+ }
} else {
nrt = ip6_rt_copy(rt);
if (nrt == NULL)
@@ -1035,7+1054,8 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, ipv6_addr_copy(&nrt->rt6i_dst.addr, daddr);
nrt->rt6i_dst.plen = 128;
nrt->rt6i_nexthop = neigh_clone(rt->rt6i_nexthop);
- nrt->rt6i_flags |= (RTF_DYNAMIC | RTF_CACHE);
+ dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
+ nrt->rt6i_flags |= RTF_DYNAMIC|RTF_CACHE|RTF_EXPIRES;
nrt->u.dst.pmtu = pmtu;
rt6_ins(nrt);
}
@@ -1069,7+1089,7 @@ static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
- rt->rt6i_metric = ort->rt6i_metric;
+ rt->rt6i_metric = 0;
memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
@@ -1521,9+1541,9 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, if (iif)
RTA_PUT(skb, RTA_IIF, 4, &iif);
else if (dst) {
- struct inet6_ifaddr *ifp = ipv6_get_saddr(&rt->u.dst, dst);
- if (ifp)
- RTA_PUT(skb, RTA_PREFSRC, 16, &ifp->addr);
+ struct in6_addr saddr_buf;
+ if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
+ RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
}
mx = (struct rtattr*)skb->tail;
RTA_PUT(skb, RTA_METRICS, 0, NULL);
@@ -1722,7+1742,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt) struct sk_buff *skb;
int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
- skb = alloc_skb(size, GFP_ATOMIC);
+ skb = alloc_skb(size, gfp_any());
if (!skb) {
netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS);
return;
@@ -1733,7+1753,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt) return;
}
NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_ROUTE;
- netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_ROUTE, GFP_ATOMIC);
+ netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_ROUTE, gfp_any());
}
#endif
@@ -1916,6+1936,9 @@ ctl_table ipv6_route_table[] = { {NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity",
&ip6_rt_gc_elasticity, sizeof(int), 0644, NULL,
&proc_dointvec_jiffies},
+ {NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires",
+ &ip6_rt_mtu_expires, sizeof(int), 0644, NULL,
+ &proc_dointvec_jiffies},
{0}
};
* Pedro Roque <roque@di.fc.ul.pt>
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
*
- * $Id: sit.c,v 1.29 1998/10/03 09:38:47 davem Exp $
+ * $Id: sit.c,v 1.30 1999/03/21 05:22:58 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -94,6+94,48 @@ static struct ip_tunnel * ipip6_tunnel_lookup(u32 remote, u32 local) return NULL;
}
+static struct ip_tunnel ** ipip6_bucket(struct ip_tunnel *t)
+{
+ u32 remote = t->parms.iph.daddr;
+ u32 local = t->parms.iph.saddr;
+ unsigned h = 0;
+ int prio = 0;
+
+ if (remote) {
+ prio |= 2;
+ h ^= HASH(remote);
+ }
+ if (local) {
+ prio |= 1;
+ h ^= HASH(local);
+ }
+ return &tunnels[prio][h];
+}
+
+static void ipip6_tunnel_unlink(struct ip_tunnel *t)
+{
+ struct ip_tunnel **tp;
+
+ for (tp = ipip6_bucket(t); *tp; tp = &(*tp)->next) {
+ if (t == *tp) {
+ net_serialize_enter();
+ *tp = t->next;
+ net_serialize_leave();
+ break;
+ }
+ }
+}
+
+static void ipip6_tunnel_link(struct ip_tunnel *t)
+{
+ struct ip_tunnel **tp = ipip6_bucket(t);
+
+ net_serialize_enter();
+ t->next = *tp;
+ *tp = t;
+ net_serialize_leave();
+}
+
struct ip_tunnel * ipip6_tunnel_locate(struct ip_tunnel_parm *parms, int create)
{
u32 remote = parms->iph.daddr;
@@ -145,10+187,7 @@ struct ip_tunnel * ipip6_tunnel_locate(struct ip_tunnel_parm *parms, int create) if (register_netdevice(dev) < 0)
goto failed;
- start_bh_atomic();
- nt->next = t;
- *tp = nt;
- end_bh_atomic();
+ ipip6_tunnel_link(nt);
/* Do not decrement MOD_USE_COUNT here. */
return nt;
@@ -160,37+199,18 @@ failed:
static void ipip6_tunnel_destroy(struct device *dev)
{
- struct ip_tunnel *t, **tp;
- struct ip_tunnel *t0 = (struct ip_tunnel*)dev->priv;
- u32 remote = t0->parms.iph.daddr;
- u32 local = t0->parms.iph.saddr;
- unsigned h = 0;
- int prio = 0;
-
if (dev == &ipip6_fb_tunnel_dev) {
+ net_serialize_enter();
tunnels_wc[0] = NULL;
+ net_serialize_leave();
return;
- }
-
- if (remote) {
- prio |= 2;
- h ^= HASH(remote);
- }
- if (local) {
- prio |= 1;
- h ^= HASH(local);
- }
- for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
- if (t == t0) {
- *tp = t->next;
- kfree(dev);
- MOD_DEC_USE_COUNT;
- break;
- }
+ } else {
+ ipip6_tunnel_unlink((struct ip_tunnel*)dev->priv);
+ kfree(dev);
+ MOD_DEC_USE_COUNT;
}
}
-
void ipip6_err(struct sk_buff *skb, unsigned char *dp, int len)
{
#ifndef I_WISH_WORLD_WERE_PERFECT
@@ -571,6+591,32 @@ ipip6_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd)
t = ipip6_tunnel_locate(&p, cmd == SIOCADDTUNNEL);
+ if (dev != &ipip6_fb_tunnel_dev && cmd == SIOCCHGTUNNEL &&
+ t != &ipip6_fb_tunnel) {
+ if (t != NULL) {
+ if (t->dev != dev) {
+ err = -EEXIST;
+ break;
+ }
+ } else {
+ if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
+ (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
+ err = -EINVAL;
+ break;
+ }
+ t = (struct ip_tunnel*)dev->priv;
+ start_bh_atomic();
+ ipip6_tunnel_unlink(t);
+ t->parms.iph.saddr = p.iph.saddr;
+ t->parms.iph.daddr = p.iph.daddr;
+ memcpy(dev->dev_addr, &p.iph.saddr, 4);
+ memcpy(dev->broadcast, &p.iph.daddr, 4);
+ ipip6_tunnel_link(t);
+ end_bh_atomic();
+ netdev_state_change(dev);
+ }
+ }
+
if (t) {
err = 0;
if (cmd == SIOCCHGTUNNEL) {
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: tcp_ipv6.c,v 1.99 1999/03/11 00:04:26 davem Exp $
+ * $Id: tcp_ipv6.c,v 1.100 1999/03/21 05:22:59 davem Exp $
*
* Based on:
* linux/net/ipv4/tcp.c
@@ -376,12+376,13 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- struct inet6_ifaddr *ifa;
struct in6_addr *saddr = NULL;
+ struct in6_addr saddr_buf;
struct flowi fl;
struct dst_entry *dst;
struct sk_buff *buff;
int addr_type;
+ int err;
if (sk->state != TCP_CLOSE)
return(-EISCONN);
@@ -428,7+429,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (addr_type == IPV6_ADDR_MAPPED) {
u32 exthdrlen = tp->ext_header_len;
struct sockaddr_in sin;
- int err;
SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
@@ -472,9+472,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
dst = ip6_route_output(sk, &fl);
- if (dst->error) {
+ if ((err = dst->error) != 0) {
dst_release(dst);
- return dst->error;
+ return err;
}
if (fl.oif == 0 && addr_type&IPV6_ADDR_LINKLOCAL) {
@@ -489,18+489,17 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, ip6_dst_store(sk, dst, NULL);
if (saddr == NULL) {
- ifa = ipv6_get_saddr(dst, &np->daddr);
-
- if (ifa == NULL)
- return -ENETUNREACH;
-
- saddr = &ifa->addr;
+ err = ipv6_get_saddr(dst, &np->daddr, &saddr_buf);
+ if (err)
+ return err;
- /* set the source address */
- ipv6_addr_copy(&np->rcv_saddr, saddr);
- ipv6_addr_copy(&np->saddr, saddr);
+ saddr = &saddr_buf;
}
+ /* set the source address */
+ ipv6_addr_copy(&np->rcv_saddr, saddr);
+ ipv6_addr_copy(&np->saddr, saddr);
+
tp->ext_header_len = 0;
if (np->opt)
tp->ext_header_len = np->opt->opt_flen+np->opt->opt_nflen;
@@ -602,11+601,14 @@ void tcp_v6_err(struct sk_buff *skb, struct ipv6hdr *hdr, np = &sk->net_pinfo.af_inet6;
if (type == ICMPV6_PKT_TOOBIG) {
struct dst_entry *dst = NULL;
- /* icmp should have updated the destination cache entry */
+
+ if (atomic_read(&sk->sock_readers))
+ return;
if (sk->state == TCP_LISTEN)
return;
+ /* icmp should have updated the destination cache entry */
if (sk->dst_cache)
dst = dst_check(&sk->dst_cache, np->dst_cookie);
@@ -631,8+633,7 @@ void tcp_v6_err(struct sk_buff *skb, struct ipv6hdr *hdr,
if (dst->error) {
sk->err_soft = -dst->error;
- } else if (tp->pmtu_cookie > dst->pmtu
- && !atomic_read(&sk->sock_readers)) {
+ } else if (tp->pmtu_cookie > dst->pmtu) {
tcp_sync_mss(sk, dst->pmtu);
tcp_simple_retransmit(sk);
} /* else let the usual retransmit timer handle it */
@@ -1193,6+1194,11 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (skb->protocol == __constant_htons(ETH_P_IP))
return tcp_v4_do_rcv(sk, skb);
+#ifdef CONFIG_FILTER
+ if (sk->filter && sk_filter(skb, sk->filter))
+ goto discard;
+#endif /* CONFIG_FILTER */
+
/*
* socket locking is here for SMP purposes as backlog rcv
* is currently called with bh processing disabled.
@@ -1421,6+1427,9 @@ static struct sock * tcp_v6_get_sock(struct sk_buff *skb, struct tcphdr *th) struct in6_addr *saddr;
struct in6_addr *daddr;
+ if (skb->protocol == __constant_htons(ETH_P_IP))
+ return ipv4_specific.get_sock(skb, th);
+
saddr = &skb->nh.ipv6h->saddr;
daddr = &skb->nh.ipv6h->daddr;
return tcp_v6_lookup(saddr, th->source, daddr, th->dest, tcp_v6_iif(skb));
*
* Based on linux/ipv4/udp.c
*
- * $Id: udp.c,v 1.37 1998/11/08 11:17:10 davem Exp $
+ * $Id: udp.c,v 1.38 1999/03/21 05:23:00 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -201,8+201,8 @@ int udpv6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
struct in6_addr *daddr;
+ struct in6_addr saddr;
struct dst_entry *dst;
- struct inet6_ifaddr *ifa;
struct flowi fl;
int addr_type;
int err;
@@ -284,28+284,29 @@ ipv4_connected:
dst = ip6_route_output(sk, &fl);
- if (dst->error) {
+ if ((err = dst->error) != 0) {
dst_release(dst);
- return dst->error;
+ return err;
}
ip6_dst_store(sk, dst, fl.nl_u.ip6_u.daddr);
/* get the source adddress used in the apropriate device */
- ifa = ipv6_get_saddr(dst, daddr);
+ err = ipv6_get_saddr(dst, daddr, &saddr);
- if(ipv6_addr_any(&np->saddr))
- ipv6_addr_copy(&np->saddr, &ifa->addr);
+ if (err == 0) {
+ if(ipv6_addr_any(&np->saddr))
+ ipv6_addr_copy(&np->saddr, &saddr);
- if(ipv6_addr_any(&np->rcv_saddr)) {
- ipv6_addr_copy(&np->rcv_saddr, &ifa->addr);
- sk->rcv_saddr = 0xffffffff;
+ if(ipv6_addr_any(&np->rcv_saddr)) {
+ ipv6_addr_copy(&np->rcv_saddr, &saddr);
+ sk->rcv_saddr = 0xffffffff;
+ }
+ sk->state = TCP_ESTABLISHED;
}
- sk->state = TCP_ESTABLISHED;
-
- return(0);
+ return err;
}
static void udpv6_close(struct sock *sk, long timeout)
@@ -317,7+318,7 @@ static void udpv6_close(struct sock *sk, long timeout) destroy_sock(sk);
}
-#if defined(CONFIG_FILTER) || !defined(HAVE_CSUM_COPY_USER)
+#ifndef HAVE_CSUM_COPY_USER
#undef CONFIG_UDP_DELAY_CSUM
#endif
@@ -352,11+353,11 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
msg->msg_iov, copied);
#else
- if (sk->no_check || skb->ip_summed==CHECKSUM_UNNECESSARY) {
+ if (skb->ip_summed==CHECKSUM_UNNECESSARY) {
err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
copied);
} else if (copied > msg->msg_iov[0].iov_len || (msg->msg_flags&MSG_TRUNC)) {
- if (csum_fold(csum_partial(skb->h.raw, ntohs(skb->h.uh->len), skb->csum))) {
+ if ((unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum))) {
/* Error for blocking case is chosen to masquerade
as some normal condition.
*/
@@ -373,7+374,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, csum = csum_and_copy_to_user((char*)&skb->h.uh[1], msg->msg_iov[0].iov_base, copied, csum, &err);
if (err)
goto out_free;
- if (csum_fold(csum)) {
+ if ((unsigned short)csum_fold(csum)) {
/* Error for blocking case is chosen to masquerade
as some normal condition.
*/
@@ -454,6+455,17 @@ void udpv6_err(struct sk_buff *skb, struct ipv6hdr *hdr,
static inline int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
{
+#if defined(CONFIG_FILTER) && defined(CONFIG_UDP_DELAY_CSUM)
+ if (sk->filter && skb->ip_summed != CHECKSUM_UNNECESSARY) {
+ if ((unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum))) {
+ udp_stats_in6.UdpInErrors++;
+ ipv6_statistics.Ip6InDiscards++;
+ kfree_skb(skb);
+ return 0;
+ }
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+#endif
if (sock_queue_rcv_skb(sk,skb)<0) {
udp_stats_in6.UdpInErrors++;
ipv6_statistics.Ip6InDiscards++;
@@ -627,14+639,13 @@ int udpv6_rcv(struct sk_buff *skb, unsigned long len) if (sk == NULL) {
#ifdef CONFIG_UDP_DELAY_CSUM
if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
- csum_fold(csum_partial((char*)uh, len, skb->csum)))
+ (unsigned short)csum_fold(csum_partial((char*)uh, len, skb->csum)))
goto discard;
#endif
-
udp_stats_in6.UdpNoPorts++;
icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, dev);
-
+
kfree_skb(skb);
return(0);
}
@@ -55,6+55,13 @@ static struct socket *netlink_kernel[MAX_LINKS]; static int netlink_dump(struct sock *sk);
static void netlink_destroy_callback(struct netlink_callback *cb);
+/* Netlink table lock. It protects against sk list changes
+ during uninterruptible sleeps in netlink_broadcast.
+
+ This lock MUST NOT be used from bh/irq on SMP kernels, because
+ it would result in a race in netlink_wait_on_table.
+ */
+
extern __inline__ void
netlink_wait_on_table(int protocol)
{
@@ -69,16+76,16 @@ netlink_lock_table(int protocol) }
extern __inline__ void
-netlink_unlock_table(int protocol, int wakeup)
+netlink_unlock_table(int protocol)
{
#if 0
/* F...g gcc does not eat it! */
- if (atomic_dec_and_test(&nl_table_lock[protocol]) && wakeup)
+ if (atomic_dec_and_test(&nl_table_lock[protocol]))
wake_up(&nl_table_wait);
#else
atomic_dec(&nl_table_lock[protocol]);
- if (atomic_read(&nl_table_lock[protocol]) && wakeup)
+ if (!atomic_read(&nl_table_lock[protocol]))
wake_up(&nl_table_wait);
#endif
}
@@ -125,7+132,9 @@ static void netlink_remove(struct sock *sk) struct sock **skp;
for (skp = &nl_table[sk->protocol]; *skp; skp = &((*skp)->next)) {
if (*skp == sk) {
+ start_bh_atomic();
*skp = sk->next;
+ end_bh_atomic();
return;
}
}
@@ -186,7+195,7 @@ static int netlink_release(struct socket *sock, struct socket *peer) transport (and AF_UNIX datagram, when it will be repaired).
Someone could wait on our sock->wait now.
- We cannot release socket until waiter will remove yourself
+ We cannot release socket until waiter will remove itself
from wait queue. I choose the most conservetive way of solving
the problem.
@@ -218,8+227,6 @@ static int netlink_autobind(struct socket *sock) struct sock *sk = sock->sk;
struct sock *osk;
- netlink_wait_on_table(sk->protocol);
-
sk->protinfo.af_netlink.groups = 0;
sk->protinfo.af_netlink.pid = current->pid;
@@ -264,8+271,6 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len return 0;
}
- netlink_wait_on_table(sk->protocol);
-
for (osk=nl_table[sk->protocol]; osk; osk=osk->next) {
if (osk->protinfo.af_netlink.pid == nladdr->nl_pid)
return -EADDRINUSE;
@@ -332,7+337,7 @@ int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock retry:
for (sk = nl_table[protocol]; sk; sk = sk->next) {
if (sk->protinfo.af_netlink.pid != pid)
- continue;
+ continue;
netlink_lock(sk);
@@ -416,7+421,8 @@ void netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
/* While we sleep in clone, do not allow to change socket list */
- netlink_lock_table(protocol);
+ if (allocation == GFP_KERNEL)
+ netlink_lock_table(protocol);
for (sk = nl_table[protocol]; sk; sk = sk->next) {
if (ssk == sk)
@@ -454,7+460,8 @@ void netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, netlink_unlock(sk);
}
- netlink_unlock_table(protocol, allocation == GFP_KERNEL);
+ if (allocation == GFP_KERNEL)
+ netlink_unlock_table(protocol);
if (skb2)
kfree_skb(skb2);
@@ -475,7+482,7 @@ Nprintk("seterr"); !(sk->protinfo.af_netlink.groups&group))
continue;
- sk->err = -code;
+ sk->err = code;
sk->state_change(sk);
}
}
@@ -739,15+746,20 @@ int netlink_attach(int unit, int (*function)(int, struct sk_buff *skb)) void netlink_detach(int unit)
{
struct socket *sock = netlink_kernel[unit];
+
+ net_serialize_enter();
netlink_kernel[unit] = NULL;
+ net_serialize_leave();
sock_release(sock);
}
int netlink_post(int unit, struct sk_buff *skb)
{
- if (netlink_kernel[unit]) {
+ struct socket *sock = netlink_kernel[unit];
+ barrier();
+ if (sock) {
memset(skb->cb, 0, sizeof(skb->cb));
- netlink_broadcast(netlink_kernel[unit]->sk, skb, 0, ~0, GFP_ATOMIC);
+ netlink_broadcast(sock->sk, skb, 0, ~0, GFP_ATOMIC);
return 0;
}
 return -EUNATCH;
len-=(offset-begin);
if(len>length)
len=length;
+ if(len<0)
+ len=0;
return len;
}
#endif
@@ -428,6+428,7 @@ EXPORT_SYMBOL(register_netdevice); EXPORT_SYMBOL(unregister_netdevice);
EXPORT_SYMBOL(register_netdev);
EXPORT_SYMBOL(unregister_netdev);
+EXPORT_SYMBOL(netdev_state_change);
EXPORT_SYMBOL(ether_setup);
EXPORT_SYMBOL(dev_new_index);
EXPORT_SYMBOL(dev_get_by_index);
@@ -482,6+483,7 @@ EXPORT_SYMBOL(qdisc_head); EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(noop_qdisc);
#ifdef CONFIG_NET_SCHED
+PSCHED_EXPORTLIST;
EXPORT_SYMBOL(pfifo_qdisc_ops);
EXPORT_SYMBOL(register_qdisc);
EXPORT_SYMBOL(unregister_qdisc);
*
* PACKET - implements raw packet sockets.
*
- * Version: $Id: af_packet.c,v 1.18 1998/10/03 15:55:24 freitag Exp $
+ * Version: $Id: af_packet.c,v 1.19 1999/03/21 05:23:03 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -831,7+831,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, int len, * Free or return the buffer as appropriate. Again this
* hides all the races and re-entrancy issues from us.
*/
- err = copied;
+ err = (flags&MSG_TRUNC) ? skb->len : copied;
out_free:
skb_free_datagram(sk, skb);
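
With the change above, a PF_PACKET receive reports the original frame length
when MSG_TRUNC is set, instead of the number of bytes actually copied, so a
reader can detect that its buffer was too small. A user-space sketch; fd is
assumed to be an already-open packet socket.

    #include <sys/types.h>
    #include <sys/socket.h>

    static ssize_t read_frame(int fd, char *buf, size_t buflen)
    {
            ssize_t n = recvfrom(fd, buf, buflen, MSG_TRUNC, NULL, NULL);

            if (n > (ssize_t) buflen) {
                    /* frame was longer than buf: only buflen bytes were copied,
                       but n reports the true on-wire length                     */
            }
            return n;
    }
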
@@ -18,10+18,11 @@ if [ "$CONFIG_NET_QOS" = "y" ]; then fi
bool 'Packet classifier API' CONFIG_NET_CLS
if [ "$CONFIG_NET_CLS" = "y" ]; then
- bool 'Routing tables based classifier' CONFIG_NET_CLS_ROUTE
- if [ "$CONFIG_IP_FIREWALL" = "y" ]; then
- bool 'Firewall based classifier' CONFIG_NET_CLS_FW
+ tristate 'Routing table based classifier' CONFIG_NET_CLS_ROUTE4
+ if [ "$CONFIG_NET_CLS_ROUTE4" != "n" ]; then
+ define_bool CONFIG_NET_CLS_ROUTE y
fi
+ tristate 'Firewall based classifier' CONFIG_NET_CLS_FW
tristate 'U32 classifier' CONFIG_NET_CLS_U32
if [ "$CONFIG_NET_QOS" = "y" ]; then
tristate 'Special RSVP classifier' CONFIG_NET_CLS_RSVP
endif
endif
-ifeq ($(CONFIG_NET_CLS_ROUTE), y)
+ifeq ($(CONFIG_NET_CLS_ROUTE4), y)
O_OBJS += cls_route.o
+else
+ ifeq ($(CONFIG_NET_CLS_ROUTE4), m)
+ M_OBJS += cls_route.o
+ endif
endif
ifeq ($(CONFIG_NET_CLS_FW), y)
O_OBJS += cls_fw.o
+else
+ ifeq ($(CONFIG_NET_CLS_FW), m)
+ M_OBJS += cls_fw.o
+ endif
endif
endif
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Changes:
+ *
+ * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
*/
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
+#include <linux/kmod.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
@@ -87,21+92,13 @@ static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n,
/* Select new prio value from the range, managed by kernel. */
-static __inline__ u32 tcf_auto_prio(struct tcf_proto *tp, u32 prio)
+static __inline__ u32 tcf_auto_prio(struct tcf_proto *tp)
{
u32 first = TC_H_MAKE(0xC0000000U,0U);
- if (!tp || tp->next == NULL)
- return first;
-
- if (prio == TC_H_MAKE(0xFFFF0000U,0U))
- first = tp->prio+1;
- else
+ if (tp)
first = tp->prio-1;
- if (first == prio)
- first = tp->prio;
-
return first;
}
@@ -129,10+126,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) /* If no priority is given, user wants we allocated it. */
if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE))
return -ENOENT;
- if (n->nlmsg_flags&NLM_F_APPEND)
- prio = TC_H_MAKE(0xFFFF0000U,0U);
- else
- prio = TC_H_MAKE(0x80000000U,0U);
+ prio = TC_H_MAKE(0x80000000U,0U);
}
/* Find head of filter chain. */
@@ -194,6+188,18 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) if ((tp = kmalloc(sizeof(*tp), GFP_KERNEL)) == NULL)
goto errout;
tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND-1]);
+#ifdef CONFIG_KMOD
+ if (tp_ops==NULL && tca[TCA_KIND-1] != NULL) {
+ struct rtattr *kind = tca[TCA_KIND-1];
+ char module_name[4 + IFNAMSIZ + 1];
+
+ if (RTA_PAYLOAD(kind) <= IFNAMSIZ) {
+ sprintf(module_name, "cls_%s", (char*)RTA_DATA(kind));
+ request_module (module_name);
+ tp_ops = tcf_proto_lookup_ops(kind);
+ }
+ }
+#endif
if (tp_ops == NULL) {
err = -EINVAL;
kfree(tp);
@@ -202,7+208,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) memset(tp, 0, sizeof(*tp));
tp->ops = tp_ops;
tp->protocol = protocol;
- tp->prio = nprio ? : tcf_auto_prio(*back, prio);
+ tp->prio = nprio ? : tcf_auto_prio(*back);
tp->q = q;
tp->classify = tp_ops->classify;
tp->classid = parent;
@@ -220,7+226,9 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
if (fh == 0) {
if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
+ net_serialize_enter();
*back = tp->next;
+ net_serialize_leave();
tp->ops->destroy(tp);
kfree(tp);
err = 0;
@@ -249,7+257,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) }
}
- err = tp->ops->change(tp, t->tcm_handle, tca, &fh);
+ err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh);
if (err == 0)
tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER);
@@ -336,12+344,16 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) return skb->len;
if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
return skb->len;
- if ((q = qdisc_lookup(dev, tcm->tcm_parent)) == NULL)
+ if (!tcm->tcm_parent)
+ q = dev->qdisc_sleeping;
+ else
+ q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
+ if (q == NULL)
return skb->len;
- cops = q->ops->cl_ops;
+ if ((cops = q->ops->cl_ops) == NULL)
+ goto errout;
if (TC_H_MIN(tcm->tcm_parent)) {
- if (cops)
- cl = cops->get(q, tcm->tcm_parent);
+ cl = cops->get(q, tcm->tcm_parent);
if (cl == 0)
goto errout;
}
@@ -360,7+372,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) TC_H_MIN(tcm->tcm_info) != tp->protocol)
continue;
if (t > s_t)
- memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int));
+ memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
if (cb->args[1] == 0) {
if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) {
@@ -418,8+430,8 @@ __initfunc(int tc_filter_init(void)) #ifdef CONFIG_NET_CLS_U32
INIT_TC_FILTER(u32);
#endif
-#ifdef CONFIG_NET_CLS_ROUTE
- INIT_TC_FILTER(route);
+#ifdef CONFIG_NET_CLS_ROUTE4
+ INIT_TC_FILTER(route4);
#endif
#ifdef CONFIG_NET_CLS_FW
INIT_TC_FILTER(fw);
/*
- * net/sched/cls_fw.c Routing table based packet classifier.
+ * net/sched/cls_fw.c Classifier mapping ipchains' fwmark to traffic class.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
+#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
+struct fw_head
+{
+ struct fw_filter *ht[256];
+};
+
+struct fw_filter
+{
+ struct fw_filter *next;
+ u32 id;
+ struct tcf_result res;
+#ifdef CONFIG_NET_CLS_POLICE
+ struct tcf_police *police;
+#endif
+};
+
+static __inline__ int fw_hash(u32 handle)
+{
+ return handle&0xFF;
+}
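
fw_hash() spreads fwmark values over 256 chains by their low byte; marks that
share a low byte land on the same chain and are told apart by the exact f->id
comparison in fw_classify() below. A small illustration with arbitrary mark
values:

    int ha = fw_hash(0x0001);       /* == 1 */
    int hb = fw_hash(0x0101);       /* == 1: same chain, distinguished by f->id */
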
static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,
struct tcf_result *res)
{
- u32 clid = skb->fwmark;
+ struct fw_head *head = (struct fw_head*)tp->root;
+ struct fw_filter *f;
+#ifdef CONFIG_IP_FIREWALL
+ u32 id = skb->fwmark;
+#else
+ u32 id = 0;
+#endif
- if (clid && (TC_H_MAJ(clid) == 0 ||
- !(TC_H_MAJ(clid^tp->q->handle)))) {
- res->classid = clid;
+ if (head == NULL)
+ goto old_method;
+
+ for (f=head->ht[fw_hash(id)]; f; f=f->next) {
+ if (f->id == id) {
+ *res = f->res;
+#ifdef CONFIG_NET_CLS_POLICE
+ if (f->police)
+ return tcf_police(skb, f->police);
+#endif
+ return 0;
+ }
+ }
+ return -1;
+
+old_method:
+ if (id && (TC_H_MAJ(id) == 0 ||
+ !(TC_H_MAJ(id^tp->q->handle)))) {
+ res->classid = id;
res->class = 0;
return 0;
}
@@ -51,6+93,16 @@ static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,
static unsigned long fw_get(struct tcf_proto *tp, u32 handle)
{
+ struct fw_head *head = (struct fw_head*)tp->root;
+ struct fw_filter *f;
+
+ if (head == NULL)
+ return 0;
+
+ for (f=head->ht[fw_hash(handle)]; f; f=f->next) {
+ if (f->id == handle)
+ return (unsigned long)f;
+ }
return 0;
}
@@ -60,24+112,232 @@ static void fw_put(struct tcf_proto *tp, unsigned long f)
static int fw_init(struct tcf_proto *tp)
{
+ MOD_INC_USE_COUNT;
return 0;
}
static void fw_destroy(struct tcf_proto *tp)
{
+ struct fw_head *head = (struct fw_head*)xchg(&tp->root, NULL);
+ struct fw_filter *f;
+ int h;
+
+ if (head == NULL) {
+ MOD_DEC_USE_COUNT;
+ return;
+ }
+
+ for (h=0; h<256; h++) {
+ while ((f=head->ht[h]) != NULL) {
+ unsigned long cl;
+ head->ht[h] = f->next;
+
+ if ((cl = cls_set_class(&f->res.class, 0)) != 0)
+ tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
+#ifdef CONFIG_NET_CLS_POLICE
+ tcf_police_release(f->police);
+#endif
+ kfree(f);
+ }
+ }
+ kfree(head);
+ MOD_DEC_USE_COUNT;
}
static int fw_delete(struct tcf_proto *tp, unsigned long arg)
{
+ struct fw_head *head = (struct fw_head*)tp->root;
+ struct fw_filter *f = (struct fw_filter*)arg;
+ struct fw_filter **fp;
+
+ if (head == NULL || f == NULL)
+ return -EINVAL;
+
+ for (fp=&head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) {
+ if (*fp == f) {
+ unsigned long cl;
+
+ net_serialize_enter();
+ *fp = f->next;
+ net_serialize_leave();
+
+ if ((cl = cls_set_class(&f->res.class, 0)) != 0)
+ tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
+#ifdef CONFIG_NET_CLS_POLICE
+ tcf_police_release(f->police);
+#endif
+ kfree(f);
+ return 0;
+ }
+ }
return -EINVAL;
}
-static int fw_change(struct tcf_proto *tp, u32 handle,
- struct rtattr **tca,
- unsigned long *arg)
+static int fw_change(struct tcf_proto *tp, unsigned long base,
+ u32 handle,
+ struct rtattr **tca,
+ unsigned long *arg)
+{
+ struct fw_head *head = (struct fw_head*)tp->root;
+ struct fw_filter *f;
+ struct rtattr *opt = tca[TCA_OPTIONS-1];
+ struct rtattr *tb[TCA_FW_MAX];
+ int err;
+
+ if (!opt)
+ return handle ? -EINVAL : 0;
+
+ if (rtattr_parse(tb, TCA_FW_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0)
+ return -EINVAL;
+
+ if ((f = (struct fw_filter*)*arg) != NULL) {
+ /* Node exists: adjust only classid */
+
+ if (f->id != handle && handle)
+ return -EINVAL;
+ if (tb[TCA_FW_CLASSID-1]) {
+ unsigned long cl;
+
+ f->res.classid = *(u32*)RTA_DATA(tb[TCA_FW_CLASSID-1]);
+ cl = tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid);
+ cl = cls_set_class(&f->res.class, cl);
+ if (cl)
+ tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
+ }
+#ifdef CONFIG_NET_CLS_POLICE
+ if (tb[TCA_FW_POLICE-1]) {
+ struct tcf_police *police = tcf_police_locate(tb[TCA_FW_POLICE-1], tca[TCA_RATE-1]);
+ net_serialize_enter();
+ police = xchg(&f->police, police);
+ net_serialize_leave();
+ tcf_police_release(police);
+ }
+#endif
+ return 0;
+ }
+
+ if (!handle)
+ return -EINVAL;
+
+ if (head == NULL) {
+ head = kmalloc(sizeof(struct fw_head), GFP_KERNEL);
+ if (head == NULL)
+ return -ENOBUFS;
+ memset(head, 0, sizeof(*head));
+ net_serialize_enter();
+ tp->root = head;
+ net_serialize_leave();
+ }
+
+ f = kmalloc(sizeof(struct fw_filter), GFP_KERNEL);
+ if (f == NULL)
+ return -ENOBUFS;
+ memset(f, 0, sizeof(*f));
+
+ f->id = handle;
+
+ if (tb[TCA_FW_CLASSID-1]) {
+ err = -EINVAL;
+ if (RTA_PAYLOAD(tb[TCA_FW_CLASSID-1]) != 4)
+ goto errout;
+ f->res.classid = *(u32*)RTA_DATA(tb[TCA_FW_CLASSID-1]);
+ cls_set_class(&f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid));
+ }
+
+#ifdef CONFIG_NET_CLS_POLICE
+ if (tb[TCA_FW_POLICE-1])
+ f->police = tcf_police_locate(tb[TCA_FW_POLICE-1], tca[TCA_RATE-1]);
+#endif
+
+ f->next = head->ht[fw_hash(handle)];
+ net_serialize_enter();
+ head->ht[fw_hash(handle)] = f;
+ net_serialize_leave();
+ *arg = (unsigned long)f;
+ return 0;
+
+errout:
+ if (f)
+ kfree(f);
+ return err;
+}
+
+static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
- return handle ? -EINVAL : 0;
+ struct fw_head *head = (struct fw_head*)tp->root;
+ int h;
+
+ if (head == NULL)
+ arg->stop = 1;
+
+ if (arg->stop)
+ return;
+
+ for (h = 0; h < 256; h++) {
+ struct fw_filter *f;
+
+ for (f = head->ht[h]; f; f = f->next) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(tp, (unsigned long)f, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+ }
+}
+
+#ifdef CONFIG_RTNETLINK
+static int fw_dump(struct tcf_proto *tp, unsigned long fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct fw_filter *f = (struct fw_filter*)fh;
+ unsigned char *b = skb->tail;
+ struct rtattr *rta;
+
+ if (f == NULL)
+ return skb->len;
+
+ t->tcm_handle = f->id;
+
+ if (!f->res.classid && !f->police)
+ return skb->len;
+
+ rta = (struct rtattr*)b;
+ RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
+
+ if (f->res.classid)
+ RTA_PUT(skb, TCA_FW_CLASSID, 4, &f->res.classid);
+#ifdef CONFIG_NET_CLS_POLICE
+ if (f->police) {
+ struct rtattr * p_rta = (struct rtattr*)skb->tail;
+
+ RTA_PUT(skb, TCA_FW_POLICE, 0, NULL);
+
+ if (tcf_police_dump(skb, f->police) < 0)
+ goto rtattr_failure;
+
+ p_rta->rta_len = skb->tail - (u8*)p_rta;
+ }
+#endif
+
+ rta->rta_len = skb->tail - b;
+#ifdef CONFIG_NET_CLS_POLICE
+ if (f->police) {
+ RTA_PUT(skb, TCA_STATS, sizeof(struct tc_stats), &f->police->stats);
+ }
+#endif
+ return skb->len;
+
+rtattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
}
+#endif
+
struct tcf_proto_ops cls_fw_ops = {
NULL,
@@ -90,5+350,22 @@ struct tcf_proto_ops cls_fw_ops = { fw_put,
fw_change,
fw_delete,
- NULL,
+ fw_walk,
+#ifdef CONFIG_RTNETLINK
+ fw_dump
+#else
+ NULL
+#endif
};
+
+#ifdef MODULE
+int init_module(void)
+{
+ return register_tcf_proto_ops(&cls_fw_ops);
+}
+
+void cleanup_module(void)
+{
+ unregister_tcf_proto_ops(&cls_fw_ops);
+}
+#endif
/*
- * net/sched/cls_route.c Routing table based packet classifier.
+ * net/sched/cls_route.c ROUTE4 classifier.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
*/
#include <linux/module.h>
+#include <linux/config.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
+/*
+ 1. For now we assume that route tags < 256.
+    This allows direct table lookups instead of hash tables.
+ 2. For now we assume that "from TAG" and "fromdev DEV" statements
+ are mutually exclusive.
+ 3. "to TAG from ANY" has higher priority than "to ANY from XXX".
+ */
+
+struct route4_fastmap
+{
+ struct route4_filter *filter;
+ u32 id;
+ int iif;
+};
+
+struct route4_head
+{
+ struct route4_fastmap fastmap[16];
+ struct route4_bucket *table[256+1];
+};
+
+struct route4_bucket
+{
+ struct route4_filter *ht[16+16+1];
+};
+
+struct route4_filter
+{
+ struct route4_filter *next;
+ u32 id;
+ int iif;
+
+ struct tcf_result res;
+#ifdef CONFIG_NET_CLS_POLICE
+ struct tcf_police *police;
+#endif
+
+ u32 handle;
+ struct route4_bucket *bkt;
+};
+
+#define ROUTE4_FAILURE ((struct route4_filter*)(-1L))
+
+static __inline__ int route4_fastmap_hash(u32 id, int iif)
+{
+ return id&0xF;
+}
+
+static void route4_reset_fastmap(struct route4_head *head, u32 id)
+{
+ start_bh_atomic();
+ memset(head->fastmap, 0, sizeof(head->fastmap));
+ end_bh_atomic();
+}
+
+static void __inline__
+route4_set_fastmap(struct route4_head *head, u32 id, int iif,
+ struct route4_filter *f)
+{
+ int h = route4_fastmap_hash(id, iif);
+ head->fastmap[h].id = id;
+ head->fastmap[h].iif = iif;
+ head->fastmap[h].filter = f;
+}
+
+static __inline__ int route4_hash_to(u32 id)
+{
+ return id&0xFF;
+}
+
+static __inline__ int route4_hash_from(u32 id)
+{
+ return (id>>16)&0xF;
+}
+
+static __inline__ int route4_hash_iif(int iif)
+{
+ return 16 + ((iif>>16)&0xF);
+}
+
+static __inline__ int route4_hash_wild(void)
+{
+ return 32;
+}
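
The classify-time hashing above assumes the routing-realm encoding of
dst->tclassid, with the destination realm in the low 16 bits and the source
realm in the next 16. The destination realm selects one of 256 buckets
(bucket 256 holds the "to ANY" filters and is retried by route4_classify()
when the destination bucket yields nothing); within a bucket, slots 0..15
match the source realm, slots 16..31 hold incoming-interface matches, and
slot 32 is the "from anywhere" chain. A worked example with an arbitrary id:

    u32 id = 0x00050007;                    /* from realm 5, to realm 7          */

    int to_bucket = route4_hash_to(id);     /* == 7:  head->table[7]             */
    int from_slot = route4_hash_from(id);   /* == 5:  bucket->ht[5]              */
    int wild_slot = route4_hash_wild();     /* == 32: the "to 7, from ANY" chain */
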
-static int route_classify(struct sk_buff *skb, struct tcf_proto *tp,
- struct tcf_result *res)
+#ifdef CONFIG_NET_CLS_POLICE
+#define IF_ROUTE_POLICE \
+if (f->police) { \
+ int pol_res = tcf_police(skb, f->police); \
+ if (pol_res >= 0) return pol_res; \
+ dont_cache = 1; \
+ continue; \
+} \
+if (!dont_cache)
+#else
+#define IF_ROUTE_POLICE
+#endif
+
+
+static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp,
+ struct tcf_result *res)
{
- struct dst_entry *dst = skb->dst;
+ struct route4_head *head = (struct route4_head*)tp->root;
+ struct dst_entry *dst;
+ struct route4_bucket *b;
+ struct route4_filter *f;
+#ifdef CONFIG_NET_CLS_POLICE
+ int dont_cache = 0;
+#endif
+ u32 id, h;
+ int iif;
+
+ if ((dst = skb->dst) == NULL)
+ goto failure;
+
+ id = dst->tclassid;
+ if (head == NULL)
+ goto old_method;
+
+ iif = ((struct rtable*)dst)->key.iif;
- if (dst) {
- u32 clid = dst->tclassid;
+ h = route4_fastmap_hash(id, iif);
+ if (id == head->fastmap[h].id &&
+ iif == head->fastmap[h].iif &&
+ (f = head->fastmap[h].filter) != NULL) {
+ if (f == ROUTE4_FAILURE)
+ goto failure;
- if (clid && (TC_H_MAJ(clid) == 0 ||
- !(TC_H_MAJ(clid^tp->q->handle)))) {
- res->classid = clid;
- res->class = 0;
+ *res = f->res;
+ return 0;
+ }
+
+ h = route4_hash_to(id);
+
+restart:
+ if ((b = head->table[h]) != NULL) {
+ f = b->ht[route4_hash_from(id)];
+
+ for ( ; f; f = f->next) {
+ if (f->id == id) {
+ *res = f->res;
+ IF_ROUTE_POLICE route4_set_fastmap(head, id, iif, f);
+ return 0;
+ }
+ }
+
+ for (f = b->ht[route4_hash_iif(iif)]; f; f = f->next) {
+ if (f->iif == iif) {
+ *res = f->res;
+ IF_ROUTE_POLICE route4_set_fastmap(head, id, iif, f);
+ return 0;
+ }
+ }
+
+ for (f = b->ht[route4_hash_wild()]; f; f = f->next) {
+ *res = f->res;
+ IF_ROUTE_POLICE route4_set_fastmap(head, id, iif, f);
return 0;
}
+
+ }
+ if (h < 256) {
+ h = 256;
+ id &= ~0xFFFF;
+ goto restart;
+ }
+
+#ifdef CONFIG_NET_CLS_POLICE
+ if (!dont_cache)
+#endif
+ route4_set_fastmap(head, id, iif, ROUTE4_FAILURE);
+failure:
+ return -1;
+
+old_method:
+ if (id && (TC_H_MAJ(id) == 0 ||
+ !(TC_H_MAJ(id^tp->q->handle)))) {
+ res->classid = id;
+ res->class = 0;
+ return 0;
}
return -1;
}
-static unsigned long route_get(struct tcf_proto *tp, u32 handle)
+static u32 to_hash(u32 id)
+{
+ u32 h = id&0xFF;
+ if (id&0x8000)
+ h += 256;
+ return h;
+}
+
+static u32 from_hash(u32 id)
+{
+ id &= 0xFFFF;
+ if (id == 0xFFFF)
+ return 32;
+ if (!(id & 0x8000)) {
+ if (id > 255)
+ return 256;
+ return id&0xF;
+ }
+ return 16 + (id&0xF);
+}
+
+static unsigned long route4_get(struct tcf_proto *tp, u32 handle)
+{
+ struct route4_head *head = (struct route4_head*)tp->root;
+ struct route4_bucket *b;
+ struct route4_filter *f;
+ unsigned h1, h2;
+
+ if (!head)
+ return 0;
+
+ h1 = to_hash(handle);
+ if (h1 > 256)
+ return 0;
+
+ h2 = from_hash(handle>>16);
+ if (h2 > 32)
+ return 0;
+
+ if ((b = head->table[h1]) != NULL) {
+ for (f = b->ht[h2]; f; f = f->next)
+ if (f->handle == handle)
+ return (unsigned long)f;
+ }
+ return 0;
+}
+
+static void route4_put(struct tcf_proto *tp, unsigned long f)
+{
+}
+
+static int route4_init(struct tcf_proto *tp)
{
+ MOD_INC_USE_COUNT;
return 0;
}
-static void route_put(struct tcf_proto *tp, unsigned long f)
+static void route4_destroy(struct tcf_proto *tp)
{
+ struct route4_head *head = xchg(&tp->root, NULL);
+ int h1, h2;
+
+ if (head == NULL) {
+ MOD_DEC_USE_COUNT;
+ return;
+ }
+
+ for (h1=0; h1<=256; h1++) {
+ struct route4_bucket *b;
+
+ if ((b = head->table[h1]) != NULL) {
+ for (h2=0; h2<=32; h2++) {
+ struct route4_filter *f;
+
+ while ((f = b->ht[h2]) != NULL) {
+ unsigned long cl;
+
+ b->ht[h2] = f->next;
+ if ((cl = cls_set_class(&f->res.class, 0)) != 0)
+ tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
+#ifdef CONFIG_NET_CLS_POLICE
+ tcf_police_release(f->police);
+#endif
+ kfree(f);
+ }
+ }
+ kfree(b);
+ }
+ }
+ kfree(head);
+ MOD_DEC_USE_COUNT;
}
-static int route_init(struct tcf_proto *tp)
+static int route4_delete(struct tcf_proto *tp, unsigned long arg)
{
+ struct route4_head *head = (struct route4_head*)tp->root;
+ struct route4_filter **fp, *f = (struct route4_filter*)arg;
+ unsigned h = f->handle;
+ struct route4_bucket *b;
+ int i;
+
+ if (!head || !f)
+ return -EINVAL;
+
+ b = f->bkt;
+
+ for (fp = &b->ht[from_hash(h>>16)]; *fp; fp = &(*fp)->next) {
+ if (*fp == f) {
+ unsigned long cl;
+
+ net_serialize_enter();
+ *fp = f->next;
+ net_serialize_leave();
+ route4_reset_fastmap(head, f->id);
+
+ if ((cl = cls_set_class(&f->res.class, 0)) != 0)
+ tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
+
+#ifdef CONFIG_NET_CLS_POLICE
+ tcf_police_release(f->police);
+#endif
+ kfree(f);
+
+ /* Strip tree */
+
+ for (i=0; i<=32; i++)
+ if (b->ht[i])
+ return 0;
+
+ /* OK, session has no flows */
+ net_serialize_enter();
+ head->table[to_hash(h)] = NULL;
+ net_serialize_leave();
+ kfree(b);
+ return 0;
+ }
+ }
return 0;
}
-static void route_destroy(struct tcf_proto *tp)
+static int route4_change(struct tcf_proto *tp, unsigned long base,
+ u32 handle,
+ struct rtattr **tca,
+ unsigned long *arg)
{
+ struct route4_head *head = tp->root;
+ struct route4_filter *f, *f1, **ins_f;
+ struct route4_bucket *b;
+ struct rtattr *opt = tca[TCA_OPTIONS-1];
+ struct rtattr *tb[TCA_ROUTE4_MAX];
+ unsigned h1, h2;
+ int err;
+
+ if (opt == NULL)
+ return handle ? -EINVAL : 0;
+
+ if (rtattr_parse(tb, TCA_ROUTE4_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0)
+ return -EINVAL;
+
+ if ((f = (struct route4_filter*)*arg) != NULL) {
+ /* Node exists: adjust only classid */
+
+ if (f->handle != handle && handle)
+ return -EINVAL;
+ if (tb[TCA_ROUTE4_CLASSID-1]) {
+ unsigned long cl;
+
+ f->res.classid = *(u32*)RTA_DATA(tb[TCA_ROUTE4_CLASSID-1]);
+ cl = cls_set_class(&f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid));
+ if (cl)
+ tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
+ }
+#ifdef CONFIG_NET_CLS_POLICE
+ if (tb[TCA_ROUTE4_POLICE-1]) {
+ struct tcf_police *police = tcf_police_locate(tb[TCA_ROUTE4_POLICE-1], tca[TCA_RATE-1]);
+ net_serialize_enter();
+ police = xchg(&f->police, police);
+ net_serialize_leave();
+ tcf_police_release(police);
+ }
+#endif
+ return 0;
+ }
+
+ /* Now more serious part... */
+
+ if (head == NULL) {
+ head = kmalloc(sizeof(struct route4_head), GFP_KERNEL);
+ if (head == NULL)
+ return -ENOBUFS;
+ memset(head, 0, sizeof(struct route4_head));
+ net_serialize_enter();
+ tp->root = head;
+ net_serialize_leave();
+ }
+
+ f = kmalloc(sizeof(struct route4_filter), GFP_KERNEL);
+ if (f == NULL)
+ return -ENOBUFS;
+
+ memset(f, 0, sizeof(*f));
+
+ err = -EINVAL;
+ f->handle = 0x8000;
+ if (tb[TCA_ROUTE4_TO-1]) {
+ if (handle&0x8000)
+ goto errout;
+ if (RTA_PAYLOAD(tb[TCA_ROUTE4_TO-1]) < 4)
+ goto errout;
+ f->id = *(u32*)RTA_DATA(tb[TCA_ROUTE4_TO-1]);
+ if (f->id > 0xFF)
+ goto errout;
+ f->handle = f->id;
+ }
+ if (tb[TCA_ROUTE4_FROM-1]) {
+ u32 sid;
+ if (tb[TCA_ROUTE4_IIF-1])
+ goto errout;
+ if (RTA_PAYLOAD(tb[TCA_ROUTE4_FROM-1]) < 4)
+ goto errout;
+ sid = (*(u32*)RTA_DATA(tb[TCA_ROUTE4_FROM-1]));
+ if (sid > 0xFF)
+ goto errout;
+ f->handle |= sid<<16;
+ f->id |= sid<<16;
+ } else if (tb[TCA_ROUTE4_IIF-1]) {
+ if (RTA_PAYLOAD(tb[TCA_ROUTE4_IIF-1]) < 4)
+ goto errout;
+ f->iif = *(u32*)RTA_DATA(tb[TCA_ROUTE4_IIF-1]);
+ if (f->iif > 0x7FFF)
+ goto errout;
+ f->handle |= (f->iif|0x8000)<<16;
+ } else
+ f->handle |= 0xFFFF<<16;
+
+ if (handle) {
+ f->handle |= handle&0x7F00;
+ if (f->handle != handle)
+ goto errout;
+ }
+
+ if (tb[TCA_ROUTE4_CLASSID-1]) {
+ if (RTA_PAYLOAD(tb[TCA_ROUTE4_CLASSID-1]) < 4)
+ goto errout;
+ f->res.classid = *(u32*)RTA_DATA(tb[TCA_ROUTE4_CLASSID-1]);
+ }
+
+ h1 = to_hash(f->handle);
+ if ((b = head->table[h1]) == NULL) {
+ err = -ENOBUFS;
+ b = kmalloc(sizeof(struct route4_bucket), GFP_KERNEL);
+ if (b == NULL)
+ goto errout;
+ memset(b, 0, sizeof(*b));
+ net_serialize_enter();
+ head->table[h1] = b;
+ net_serialize_leave();
+ }
+ f->bkt = b;
+
+ err = -EEXIST;
+ h2 = from_hash(f->handle>>16);
+ for (ins_f = &b->ht[h2]; (f1=*ins_f) != NULL; ins_f = &f1->next) {
+ if (f->handle < f1->handle)
+ break;
+ if (f1->handle == f->handle)
+ goto errout;
+ }
+
+ cls_set_class(&f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid));
+#ifdef CONFIG_NET_CLS_POLICE
+ if (tb[TCA_ROUTE4_POLICE-1])
+ f->police = tcf_police_locate(tb[TCA_ROUTE4_POLICE-1], tca[TCA_RATE-1]);
+#endif
+
+ f->next = f1;
+ net_serialize_enter();
+ *ins_f = f;
+ net_serialize_leave();
+ route4_reset_fastmap(head, f->id);
+ *arg = (unsigned long)f;
+ return 0;
+
+errout:
+ if (f)
+ kfree(f);
+ return err;
}
-static int route_delete(struct tcf_proto *tp, unsigned long arg)
+static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
- return -EINVAL;
+ struct route4_head *head = tp->root;
+ unsigned h, h1;
+
+ if (head == NULL)
+ arg->stop = 1;
+
+ if (arg->stop)
+ return;
+
+ for (h = 0; h <= 256; h++) {
+ struct route4_bucket *b = head->table[h];
+
+ if (b) {
+ for (h1 = 0; h1 <= 32; h1++) {
+ struct route4_filter *f;
+
+ for (f = b->ht[h1]; f; f = f->next) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(tp, (unsigned long)f, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+ }
+ }
+ }
}
-static int route_change(struct tcf_proto *tp, u32 handle,
- struct rtattr **tca,
- unsigned long *arg)
+#ifdef CONFIG_RTNETLINK
+static int route4_dump(struct tcf_proto *tp, unsigned long fh,
+ struct sk_buff *skb, struct tcmsg *t)
{
- return handle ? -EINVAL : 0;
+ struct route4_filter *f = (struct route4_filter*)fh;
+ unsigned char *b = skb->tail;
+ struct rtattr *rta;
+ u32 id;
+
+ if (f == NULL)
+ return skb->len;
+
+ t->tcm_handle = f->handle;
+
+ rta = (struct rtattr*)b;
+ RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
+
+ if (!(f->handle&0x8000)) {
+ id = f->id&0xFF;
+ RTA_PUT(skb, TCA_ROUTE4_TO, sizeof(id), &id);
+ }
+ if (f->handle&0x80000000) {
+ if ((f->handle>>16) != 0xFFFF)
+ RTA_PUT(skb, TCA_ROUTE4_IIF, sizeof(f->iif), &f->iif);
+ } else {
+ id = f->id>>16;
+ RTA_PUT(skb, TCA_ROUTE4_FROM, sizeof(id), &id);
+ }
+ if (f->res.classid)
+ RTA_PUT(skb, TCA_ROUTE4_CLASSID, 4, &f->res.classid);
+#ifdef CONFIG_NET_CLS_POLICE
+ if (f->police) {
+ struct rtattr * p_rta = (struct rtattr*)skb->tail;
+
+ RTA_PUT(skb, TCA_ROUTE4_POLICE, 0, NULL);
+
+ if (tcf_police_dump(skb, f->police) < 0)
+ goto rtattr_failure;
+
+ p_rta->rta_len = skb->tail - (u8*)p_rta;
+ }
+#endif
+
+ rta->rta_len = skb->tail - b;
+#ifdef CONFIG_NET_CLS_POLICE
+ if (f->police) {
+ RTA_PUT(skb, TCA_STATS, sizeof(struct tc_stats), &f->police->stats);
+ }
+#endif
+ return skb->len;
+
+rtattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
}
+#endif
-struct tcf_proto_ops cls_route_ops = {
+struct tcf_proto_ops cls_route4_ops = {
NULL,
"route",
- route_classify,
- route_init,
- route_destroy,
-
- route_get,
- route_put,
- route_change,
- route_delete,
- NULL,
+ route4_classify,
+ route4_init,
+ route4_destroy,
+
+ route4_get,
+ route4_put,
+ route4_change,
+ route4_delete,
+ route4_walk,
+#ifdef CONFIG_RTNETLINK
+ route4_dump
+#else
+ NULL
+#endif
};
+
+#ifdef MODULE
+int init_module(void)
+{
+ return register_tcf_proto_ops(&cls_route4_ops);
+}
+
+void cleanup_module(void)
+{
+ unregister_tcf_proto_ops(&cls_route4_ops);
+}
+#endif
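
For reference, the route classifier above folds its keys into a single 32-bit filter
handle: the destination realm ("to", 0..255) occupies the low byte, bit 15 is set when
no destination realm is given, and the upper 16 bits carry either the source realm
("from"), the incoming interface number with bit 15 set, or 0xFFFF for a wildcard
source. to_hash() and from_hash() then turn the two halves into the bucket indices
walked by route4_classify() and route4_get(). The stand-alone sketch below is only an
illustration: to_hash()/from_hash() are copied from the code above, while make_handle()
is an invented helper mimicking what route4_change() does (the optional user-supplied
0x7F00 nibble is ignored here).

  #include <stdio.h>

  typedef unsigned int u32;

  static u32 to_hash(u32 id)              /* as in cls_route.c above */
  {
      u32 h = id & 0xFF;
      if (id & 0x8000)
          h += 256;
      return h;
  }

  static u32 from_hash(u32 id)            /* as in cls_route.c above */
  {
      id &= 0xFFFF;
      if (id == 0xFFFF)
          return 32;                      /* wildcard-source slot */
      if (!(id & 0x8000))
          return id & 0xF;                /* keyed by source realm */
      return 16 + (id & 0xF);             /* keyed by incoming interface */
  }

  /* Invented helper: compose a handle the way route4_change() does.
     Pass -1 for keys that are absent. */
  static u32 make_handle(int to, int from, int iif)
  {
      u32 h = (to >= 0) ? (u32)(to & 0xFF) : 0x8000;

      if (from >= 0)
          h |= (u32)(from & 0xFF) << 16;
      else if (iif >= 0)
          h |= (u32)((iif & 0x7FFF) | 0x8000) << 16;
      else
          h |= 0xFFFFu << 16;             /* wildcard source */
      return h;
  }

  int main(void)
  {
      u32 h = make_handle(5, 3, -1);      /* "from realm 3, to realm 5" */

      printf("handle %08x -> buckets %u/%u\n",
             h, to_hash(h), from_hash(h >> 16));
      return 0;
  }

Running it prints "handle 00030005 -> buckets 5/3", i.e. such a filter sits in
head->table[5]->ht[3].
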
@@ -120,6+120,18 @@ static __inline__ unsigned hash_src(u32 *src) return h & 0xF;
}
+#ifdef CONFIG_NET_CLS_POLICE
+#define RSVP_POLICE() \
+if (f->police) { \
+ int pol_res = tcf_police(skb, f->police); \
+ if (pol_res < 0) continue; \
+ if (pol_res) return pol_res; \
+}
+#else
+#define RSVP_POLICE()
+#endif
+
+
static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp,
struct tcf_result *res)
{
@@ -137,7+149,7 @@ static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp, struct iphdr *nhptr = skb->nh.iph;
#endif
-#ifndef __i386__
+#if !defined( __i386__) && !defined(__m68k__)
if ((unsigned long)nhptr & 3)
return -1;
#endif
@@ -181,13+193,12 @@ restart: && src[2] == f->src[2]
#endif
) {
+
+ RSVP_POLICE();
+
matched:
if (f->tunnelhdr == 0) {
*res = f->res;
-#ifdef CONFIG_NET_CLS_POLICE
- if (f->police)
- return tcf_police(skb, f->police);
-#endif
return 0;
} else {
tunnelid = f->res.classid;
@@ -198,8+209,10 @@ matched: }
/* And wildcard bucket... */
- if ((f = s->ht[16]) != NULL)
+ for (f = s->ht[16]; f; f = f->next) {
+ RSVP_POLICE();
goto matched;
+ }
return -1;
}
}
@@ -260,7+273,6 @@ static void rsvp_destroy(struct tcf_proto *tp) struct rsvp_session *s;
while ((s = sht[h1]) != NULL) {
-
sht[h1] = s->next;
for (h2=0; h2<=16; h2++) {
@@ -270,7+282,7 @@ static void rsvp_destroy(struct tcf_proto *tp) unsigned long cl;
s->ht[h2] = f->next;
- if ((cl = xchg(&f->res.class, 0)) != 0)
+ if ((cl = cls_set_class(&f->res.class, 0)) != 0)
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
#ifdef CONFIG_NET_CLS_POLICE
tcf_police_release(f->police);
@@ -297,8+309,11 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) if (*fp == f) {
unsigned long cl;
+ net_serialize_enter();
*fp = f->next;
- if ((cl = xchg(&f->res.class, 0)) != 0)
+ net_serialize_leave();
+
+ if ((cl = cls_set_class(&f->res.class, 0)) != 0)
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
#ifdef CONFIG_NET_CLS_POLICE
@@ -317,12+332,14 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) for (sp = &((struct rsvp_head*)tp->root)->ht[h&0xFF];
*sp; sp = &(*sp)->next) {
if (*sp == s) {
+ net_serialize_enter();
*sp = s->next;
+ net_serialize_leave();
kfree(s);
return 0;
}
}
-
+
return 0;
}
}
@@ -399,7+416,8 @@ static u32 gen_tunnel(struct rsvp_head *data) return 0;
}
-static int rsvp_change(struct tcf_proto *tp, u32 handle,
+static int rsvp_change(struct tcf_proto *tp, unsigned long base,
+ u32 handle,
struct rtattr **tca,
unsigned long *arg)
{
@@ -425,17+443,20 @@ static int rsvp_change(struct tcf_proto *tp, u32 handle, if (f->handle != handle && handle)
return -EINVAL;
if (tb[TCA_RSVP_CLASSID-1]) {
- unsigned long cl = xchg(&f->res.class, 0);
+ unsigned long cl;
+
+ f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]);
+ cl = cls_set_class(&f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid));
if (cl)
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
- f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]);
- f->res.class = tp->q->ops->cl_ops->bind_tcf(tp->q, f->res.classid);
}
#ifdef CONFIG_NET_CLS_POLICE
if (tb[TCA_RSVP_POLICE-1]) {
- struct tcf_police *police = tcf_police_locate(tb[TCA_RSVP_POLICE-1]);
-
- tcf_police_release(xchg(&f->police, police));
+ struct tcf_police *police = tcf_police_locate(tb[TCA_RSVP_POLICE-1], tca[TCA_RATE-1]);
+ net_serialize_enter();
+ police = xchg(&f->police, police);
+ net_serialize_leave();
+ tcf_police_release(police);
}
#endif
return 0;
@@ -514,17+535,19 @@ insert:
f->sess = s;
if (f->tunnelhdr == 0)
- f->res.class = tp->q->ops->cl_ops->bind_tcf(tp->q, f->res.classid);
+ cls_set_class(&f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid));
#ifdef CONFIG_NET_CLS_POLICE
if (tb[TCA_RSVP_POLICE-1])
- f->police = tcf_police_locate(tb[TCA_RSVP_POLICE-1]);
+ f->police = tcf_police_locate(tb[TCA_RSVP_POLICE-1], tca[TCA_RATE-1]);
#endif
for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next)
if (((*fp)->spi.mask&f->spi.mask) != f->spi.mask)
break;
f->next = *fp;
+ net_serialize_enter();
*fp = f;
+ net_serialize_leave();
*arg = (unsigned long)f;
return 0;
}
@@ -546,7+569,10 @@ insert: break;
}
s->next = *sp;
+ net_serialize_enter();
*sp = s;
+ net_serialize_leave();
+
goto insert;
errout:
@@ -631,6+657,11 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, #endif
rta->rta_len = skb->tail - b;
+#ifdef CONFIG_NET_CLS_POLICE
+ if (f->police) {
+ RTA_PUT(skb, TCA_STATS, sizeof(struct tc_stats), &f->police->stats);
+ }
+#endif
return skb->len;
rtattr_failure:
@@ -114,7+114,7 @@ static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_re int sel = 0;
int i;
-#ifndef __i386__
+#if !defined(__i386__) && !defined(__m68k__)
if ((unsigned long)ptr & 3)
return -1;
#endif
@@ -137,10+137,13 @@ check_terminal: if (n->sel.flags&TC_U32_TERMINAL) {
*res = n->res;
#ifdef CONFIG_NET_CLS_POLICE
- if (n->police)
- return tcf_police(skb, n->police);
+ if (n->police) {
+ int pol_res = tcf_police(skb, n->police);
+ if (pol_res >= 0)
+ return pol_res;
+ } else
#endif
- return 0;
+ return 0;
}
n = n->next;
goto next_knode;
@@ -304,7+307,7 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n) {
unsigned long cl;
- if ((cl = xchg(&n->res.class, 0)) != 0)
+ if ((cl = cls_set_class(&n->res.class, 0)) != 0)
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
#ifdef CONFIG_NET_CLS_POLICE
tcf_police_release(n->police);
@@ -323,7+326,10 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key) if (ht) {
for (kp = &ht->ht[TC_U32_HASH(key->handle)]; *kp; kp = &(*kp)->next) {
if (*kp == key) {
+ net_serialize_enter();
*kp = key->next;
+ net_serialize_leave();
+
u32_destroy_key(tp, key);
return 0;
}
@@ -340,7+346,9 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
for (h=0; h<=ht->divisor; h++) {
while ((n = ht->ht[h]) != NULL) {
+ net_serialize_enter();
ht->ht[h] = n->next;
+ net_serialize_leave();
u32_destroy_key(tp, n);
}
}
@@ -402,6+410,7 @@ static void u32_destroy(struct tcf_proto *tp) kfree(tp_c);
}
+ MOD_DEC_USE_COUNT;
tp->data = NULL;
}
@@ -437,8+446,10 @@ static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle) return handle|(i>0xFFF ? 0xFFF : i);
}
-static int u32_set_parms(struct Qdisc *q, struct tc_u_hnode *ht,
- struct tc_u_knode *n, struct rtattr **tb)
+static int u32_set_parms(struct Qdisc *q, unsigned long base,
+ struct tc_u_hnode *ht,
+ struct tc_u_knode *n, struct rtattr **tb,
+ struct rtattr *est)
{
if (tb[TCA_U32_LINK-1]) {
u32 handle = *(u32*)RTA_DATA(tb[TCA_U32_LINK-1]);
@@ -455,29+466,34 @@ static int u32_set_parms(struct Qdisc *q, struct tc_u_hnode *ht, ht_down->refcnt++;
}
+ net_serialize_enter();
ht_down = xchg(&n->ht_down, ht_down);
+ net_serialize_leave();
if (ht_down)
ht_down->refcnt--;
}
if (tb[TCA_U32_CLASSID-1]) {
- unsigned long cl = xchg(&n->res.class, 0);
+ unsigned long cl;
+
+ n->res.classid = *(u32*)RTA_DATA(tb[TCA_U32_CLASSID-1]);
+ cl = cls_set_class(&n->res.class, q->ops->cl_ops->bind_tcf(q, base, n->res.classid));
if (cl)
q->ops->cl_ops->unbind_tcf(q, cl);
- n->res.classid = *(u32*)RTA_DATA(tb[TCA_U32_CLASSID-1]);
- n->res.class = q->ops->cl_ops->bind_tcf(q, n->res.classid);
}
#ifdef CONFIG_NET_CLS_POLICE
if (tb[TCA_U32_POLICE-1]) {
- struct tcf_police *police = tcf_police_locate(tb[TCA_U32_POLICE-1]);
-
- tcf_police_release(xchg(&n->police, police));
+ struct tcf_police *police = tcf_police_locate(tb[TCA_U32_POLICE-1], est);
+ net_serialize_enter();
+ police = xchg(&n->police, police);
+ net_serialize_leave();
+ tcf_police_release(police);
}
#endif
return 0;
}
-static int u32_change(struct tcf_proto *tp, u32 handle,
+static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
struct rtattr **tca,
unsigned long *arg)
{
@@ -500,7+516,7 @@ static int u32_change(struct tcf_proto *tp, u32 handle, if (TC_U32_KEY(n->handle) == 0)
return -EINVAL;
- return u32_set_parms(tp->q, n->ht_up, n, tb);
+ return u32_set_parms(tp->q, base, n->ht_up, n, tb, tca[TCA_RATE-1]);
}
if (tb[TCA_U32_DIVISOR-1]) {
@@ -531,7+547,7 @@ static int u32_change(struct tcf_proto *tp, u32 handle,
if (tb[TCA_U32_HASH-1]) {
htid = *(unsigned*)RTA_DATA(tb[TCA_U32_HASH-1]);
- if (TC_U32_HTID(handle) == TC_U32_ROOT) {
+ if (TC_U32_HTID(htid) == TC_U32_ROOT) {
ht = tp->root;
htid = ht->handle;
} else {
@@ -550,8+566,6 @@ static int u32_change(struct tcf_proto *tp, u32 handle, if (handle) {
if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid))
return -EINVAL;
- if (TC_U32_HASH(handle) && TC_U32_HASH(handle^htid))
- return -EINVAL;
handle = htid | TC_U32_NODE(handle);
} else
handle = gen_new_kid(ht, htid);
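
The handle manipulation above relies on the three bit fields of a u32 filter handle.
The extractor macros below restate, as an assumption of this sketch, the TC_U32_*
definitions from linux/pkt_cls.h: the hash-table id sits in the top 12 bits
(TC_U32_ROOT, 0xFFF00000, names the root table), the bucket index in the next 8 bits,
and the key-node id in the low 12 bits. The example value is hypothetical.

  #include <stdio.h>

  typedef unsigned int u32;

  #define TC_U32_HTID(h)  ((h) & 0xFFF00000)
  #define TC_U32_HASH(h)  (((h) >> 12) & 0xFF)
  #define TC_U32_NODE(h)  ((h) & 0xFFF)

  int main(void)
  {
      u32 handle = (0x801u << 20) | (5u << 12) | 1u;  /* table 0x801, bucket 5, node 1 */

      printf("htid %08x hash %u node %u\n",
             TC_U32_HTID(handle), TC_U32_HASH(handle), TC_U32_NODE(handle));
      return 0;
  }
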
@@ -568,14+582,16 @@ static int u32_change(struct tcf_proto *tp, u32 handle, memcpy(&n->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key));
n->ht_up = ht;
n->handle = handle;
- err = u32_set_parms(tp->q, ht, n, tb);
+ err = u32_set_parms(tp->q, base, ht, n, tb, tca[TCA_RATE-1]);
if (err == 0) {
struct tc_u_knode **ins;
for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next)
- if (TC_U32_NODE(handle) >= TC_U32_NODE((*ins)->handle))
+ if (TC_U32_NODE(handle) < TC_U32_NODE((*ins)->handle))
break;
+ net_serialize_enter();
n->next = *ins;
*ins = n;
+ net_serialize_leave();
*arg = (unsigned long)n;
return 0;
}
@@ -664,6+680,11 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh, }
rta->rta_len = skb->tail - b;
+#ifdef CONFIG_NET_CLS_POLICE
+ if (TC_U32_KEY(n->handle) && n->police) {
+ RTA_PUT(skb, TCA_STATS, sizeof(struct tc_stats), &n->police->stats);
+ }
+#endif
return skb->len;
rtattr_failure:
@@ -171,8+171,9 @@ void qdisc_kill_estimator(struct tc_stats *stats) pest = &est->next;
continue;
}
- /* ATOMIC_SET */
+ net_serialize_enter();
*pest = est->next;
+ net_serialize_leave();
kfree(est);
killed++;
}
@@ -74,6+74,9 @@ void tcf_police_destroy(struct tcf_police *p) for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->next) {
if (*p1p == p) {
*p1p = p->next;
+#ifdef CONFIG_NET_ESTIMATOR
+ qdisc_kill_estimator(&p->stats);
+#endif
if (p->R_tab)
qdisc_put_rtab(p->R_tab);
if (p->P_tab)
@@ -85,7+88,7 @@ void tcf_police_destroy(struct tcf_police *p) BUG_TRAP(0);
}
-struct tcf_police * tcf_police_locate(struct rtattr *rta)
+struct tcf_police * tcf_police_locate(struct rtattr *rta, struct rtattr *est)
{
unsigned h;
struct tcf_police *p;
@@ -111,20+114,35 @@ struct tcf_police * tcf_police_locate(struct rtattr *rta)
memset(p, 0, sizeof(*p));
p->refcnt = 1;
- if ((p->R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1])) == NULL)
- goto failure;
- if (parm->peakrate.rate &&
- (p->P_tab = qdisc_get_rtab(&parm->peakrate, tb[TCA_POLICE_PEAKRATE-1])) == NULL)
- goto failure;
+ if (parm->rate.rate) {
+ if ((p->R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1])) == NULL)
+ goto failure;
+ if (parm->peakrate.rate &&
+ (p->P_tab = qdisc_get_rtab(&parm->peakrate, tb[TCA_POLICE_PEAKRATE-1])) == NULL)
+ goto failure;
+ }
+ if (tb[TCA_POLICE_RESULT-1])
+ p->result = *(int*)RTA_DATA(tb[TCA_POLICE_RESULT-1]);
+#ifdef CONFIG_NET_ESTIMATOR
+ if (tb[TCA_POLICE_AVRATE-1])
+ p->ewma_rate = *(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]);
+#endif
p->toks = p->burst = parm->burst;
p->mtu = parm->mtu;
- if (p->mtu == 0)
- p->mtu = 255<<p->R_tab->rate.cell_log;
+ if (p->mtu == 0) {
+ p->mtu = ~0;
+ if (p->R_tab)
+ p->mtu = 255<<p->R_tab->rate.cell_log;
+ }
if (p->P_tab)
p->ptoks = L2T_P(p, p->mtu);
PSCHED_GET_TIME(p->t_c);
p->index = parm->index ? : tcf_police_new_index();
p->action = parm->action;
+#ifdef CONFIG_NET_ESTIMATOR
+ if (est)
+ qdisc_new_estimator(&p->stats, est);
+#endif
h = tcf_police_hash(p->index);
p->next = tcf_police_ht[h];
tcf_police_ht[h] = p;
@@ -143,7+161,20 @@ int tcf_police(struct sk_buff *skb, struct tcf_police *p) long toks;
long ptoks = 0;
+ p->stats.bytes += skb->len;
+ p->stats.packets++;
+
+#ifdef CONFIG_NET_ESTIMATOR
+ if (p->ewma_rate && p->stats.bps >= p->ewma_rate) {
+ p->stats.overlimits++;
+ return p->action;
+ }
+#endif
+
if (skb->len <= p->mtu) {
+ if (p->R_tab == NULL)
+ return p->result;
+
PSCHED_GET_TIME(now);
toks = PSCHED_TDIFF_SAFE(now, p->t_c, p->burst, 0);
@@ -163,10+194,11 @@ int tcf_police(struct sk_buff *skb, struct tcf_police *p) p->t_c = now;
p->toks = toks;
p->ptoks = ptoks;
- return TC_POLICE_OK;
+ return p->result;
}
}
+ p->stats.overlimits++;
return p->action;
}
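
Taken together, the changes to tcf_police() above yield this decision order: account
the packet, fail early when a configured average-rate estimate (ewma_rate) is already
exceeded, pass conforming packets with the configured "result" code (previously the
hard-wired TC_POLICE_OK), treat a policer without a rate table as a plain "always
return result" match, and fall back to the overlimit action otherwise. Below is a
simplified stand-alone sketch of that order; the type and field names are invented and
the PSCHED time/token arithmetic is abstracted into a single caller-supplied value.

  #include <stdio.h>

  /* Toy stand-in for struct tcf_police: only what the decision needs. */
  struct toy_policer {
      unsigned int ewma_rate;   /* average-rate bound, 0 = unused          */
      unsigned int est_bps;     /* current rate estimate (stats.bps)       */
      unsigned int mtu;         /* packets above this are always overlimit */
      int have_rate_table;      /* non-zero when an R_tab is configured    */
      int result;               /* verdict for conforming packets          */
      int action;               /* verdict for exceeding packets           */
  };

  /* "tokens_after_charge" stands for the token-bucket computation done
     with PSCHED timestamps in the real code: >= 0 means the packet fits. */
  static int toy_police(struct toy_policer *p, unsigned int len,
                        long tokens_after_charge)
  {
      if (p->ewma_rate && p->est_bps >= p->ewma_rate)
          return p->action;                 /* estimated rate too high */

      if (len <= p->mtu) {
          if (!p->have_rate_table)
              return p->result;             /* rateless policer */
          if (tokens_after_charge >= 0)
              return p->result;             /* conforms to the bucket */
      }
      return p->action;                     /* oversized or out of tokens */
  }

  int main(void)
  {
      struct toy_policer p = { 0, 0, 1500, 0, 0, 2 };

      printf("%d\n", toy_police(&p, 1000, 0));  /* rateless: prints 0 (result) */
      return 0;
  }
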
@@ -180,12+212,21 @@ int tcf_police_dump(struct sk_buff *skb, struct tcf_police *p) opt.action = p->action;
opt.mtu = p->mtu;
opt.burst = p->burst;
- opt.rate = p->R_tab->rate;
+ if (p->R_tab)
+ opt.rate = p->R_tab->rate;
+ else
+ memset(&opt.rate, 0, sizeof(opt.rate));
if (p->P_tab)
opt.peakrate = p->P_tab->rate;
else
memset(&opt.peakrate, 0, sizeof(opt.peakrate));
RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt);
+ if (p->result)
+ RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int), &p->result);
+#ifdef CONFIG_NET_ESTIMATOR
+ if (p->ewma_rate)
+ RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &p->ewma_rate);
+#endif
return skb->len;
rtattr_failure:
* Fixes:
*
* Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
+ * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
*/
#include <linux/config.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
+#include <linux/kmod.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#define BUG_TRAP(x) if (!(x)) { printk("Assertion (" #x ") failed at " __FILE__ "(%d):" __FUNCTION__ "\n", __LINE__); }
#ifdef CONFIG_RTNETLINK
-static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
+static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
struct Qdisc *q, unsigned long cl, int event);
@@ -116,6+118,10 @@ static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, ---destroy
destroys resources allocated by init and during lifetime of qdisc.
+
+ ---change
+
+ changes qdisc parameters.
*/
/************************************************
@@ -177,22+183,22 @@ struct Qdisc *qdisc_lookup(struct device *dev, u32 handle) return NULL;
}
-/* We know classid. Find qdisc among all qdisc's attached to device
- (root qdisc, all its children, children of children etc.)
- */
-
-struct Qdisc *qdisc_lookup_class(struct device *dev, u32 classid)
+struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
- struct Qdisc *q;
+ unsigned long cl;
+ struct Qdisc *leaf;
+ struct Qdisc_class_ops *cops = p->ops->cl_ops;
- for (q = dev->qdisc_list; q; q = q->next) {
- if (q->classid == classid)
- return q;
- }
- return NULL;
+ if (cops == NULL)
+ return NULL;
+ cl = cops->get(p, classid);
+ if (cl == 0)
+ return NULL;
+ leaf = cops->leaf(p, cl);
+ cops->put(p, cl);
+ return leaf;
}
-
/* Find queueing discipline by name */
struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
@@ -268,6+274,37 @@ u32 qdisc_alloc_handle(struct device *dev) return i>0 ? autohandle : 0;
}
+/* Attach toplevel qdisc to device dev */
+
+static struct Qdisc *
+dev_graft_qdisc(struct device *dev, struct Qdisc *qdisc)
+{
+ struct Qdisc *oqdisc;
+
+ if (dev->flags & IFF_UP)
+ dev_deactivate(dev);
+
+ start_bh_atomic();
+ oqdisc = dev->qdisc_sleeping;
+
+ /* Prune old scheduler */
+ if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
+ qdisc_reset(oqdisc);
+
+ /* ... and graft new one */
+ if (qdisc == NULL)
+ qdisc = &noop_qdisc;
+ dev->qdisc_sleeping = qdisc;
+ dev->qdisc = &noop_qdisc;
+ end_bh_atomic();
+
+ if (dev->flags & IFF_UP)
+ dev_activate(dev);
+
+ return oqdisc;
+}
+
+
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
to device "dev".
@@ -280,17+317,10 @@ int qdisc_graft(struct device *dev, struct Qdisc *parent, u32 classid, int err = 0;
if (parent == NULL) {
- BUG_TRAP(classid == TC_H_ROOT);
- if (new) {
- new->parent = NULL;
- new->classid = TC_H_ROOT;
- }
- *old = dev_set_scheduler(dev, new);
+ *old = dev_graft_qdisc(dev, new);
} else {
struct Qdisc_class_ops *cops = parent->ops->cl_ops;
- BUG_TRAP(classid != TC_H_ROOT);
-
err = -EINVAL;
if (cops) {
@@ -313,22+343,30 @@ int qdisc_graft(struct device *dev, struct Qdisc *parent, u32 classid, */
static struct Qdisc *
-qdisc_create(struct device *dev, struct Qdisc_ops *ops, u32 handle,
- u32 parentid, struct rtattr **tca, int *errp)
+qdisc_create(struct device *dev, u32 handle, struct rtattr **tca, int *errp)
{
int err;
struct rtattr *kind = tca[TCA_KIND-1];
struct Qdisc *sch = NULL;
+ struct Qdisc_ops *ops;
int size;
- int new = 0;
- if (ops == NULL) {
- ops = qdisc_lookup_ops(kind);
- err = -EINVAL;
- if (ops == NULL)
- goto err_out;
- new = 1;
+ ops = qdisc_lookup_ops(kind);
+#ifdef CONFIG_KMOD
+ if (ops==NULL && tca[TCA_KIND-1] != NULL) {
+ char module_name[4 + IFNAMSIZ + 1];
+
+ if (RTA_PAYLOAD(kind) <= IFNAMSIZ) {
+ sprintf(module_name, "sch_%s", (char*)RTA_DATA(kind));
+ request_module (module_name);
+ ops = qdisc_lookup_ops(kind);
+ }
}
+#endif
+
+ err = -EINVAL;
+ if (ops == NULL)
+ goto err_out;
size = sizeof(*sch) + ops->priv_size;
@@ -340,13+378,8 @@ qdisc_create(struct device *dev, struct Qdisc_ops *ops, u32 handle, /* Grrr... Resolve race condition with module unload */
err = -EINVAL;
- if (new) {
- if (ops != qdisc_lookup_ops(kind))
- goto err_out;
- } else if (kind) {
- if (rtattr_strcmp(kind, ops->id))
- goto err_out;
- }
+ if (ops != qdisc_lookup_ops(kind))
+ goto err_out;
memset(sch, 0, size);
@@ -355,6+388,7 @@ qdisc_create(struct device *dev, struct Qdisc_ops *ops, u32 handle, sch->enqueue = ops->enqueue;
sch->dequeue = ops->dequeue;
sch->dev = dev;
+ atomic_set(&sch->refcnt, 1);
if (handle == 0) {
handle = qdisc_alloc_handle(dev);
err = -ENOMEM;
@@ -362,9+396,8 @@ qdisc_create(struct device *dev, struct Qdisc_ops *ops, u32 handle, goto err_out;
}
sch->handle = handle;
- sch->classid = parentid;
- if (ops->init && (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
+ if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
sch->next = dev->qdisc_list;
dev->qdisc_list = sch;
#ifdef CONFIG_NET_ESTIMATOR
return NULL;
}
+static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
+{
+ if (tca[TCA_OPTIONS-1]) {
+ int err;
+
+ if (sch->ops->change == NULL)
+ return -EINVAL;
+ err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
+ if (err)
+ return err;
+ }
+#ifdef CONFIG_NET_ESTIMATOR
+ if (tca[TCA_RATE-1]) {
+ qdisc_kill_estimator(&sch->stats);
+ qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]);
+ }
+#endif
+ return 0;
+}
+
+struct check_loop_arg
+{
+ struct qdisc_walker w;
+ struct Qdisc *p;
+ int depth;
+};
+
+static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
+
+static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
+{
+ struct check_loop_arg arg;
+
+ if (q->ops->cl_ops == NULL)
+ return 0;
+
+ arg.w.stop = arg.w.skip = arg.w.count = 0;
+ arg.w.fn = check_loop_fn;
+ arg.depth = depth;
+ arg.p = p;
+ q->ops->cl_ops->walk(q, &arg.w);
+ return arg.w.stop ? -ELOOP : 0;
+}
+
+static int
+check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
+{
+ struct Qdisc *leaf;
+ struct Qdisc_class_ops *cops = q->ops->cl_ops;
+ struct check_loop_arg *arg = (struct check_loop_arg *)w;
+
+ leaf = cops->leaf(q, cl);
+ if (leaf) {
+ if (leaf == arg->p || arg->depth > 7)
+ return -ELOOP;
+ return check_loop(leaf, arg->p, arg->depth + 1);
+ }
+ return 0;
+}
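
check_loop() and check_loop_fn() above exist to reject a graft that would put a qdisc
somewhere inside its own subtree: before attaching an existing qdisc q under parent p,
every leaf reachable from q's classes is visited (at most 7 levels deep) and -ELOOP is
returned if p is found. The toy model below shows the same walk without the
Qdisc_class_ops walker plumbing; all names are invented for illustration.

  #include <stdio.h>

  /* Toy node: "children" are the leaf qdiscs of this node's classes. */
  struct toy_qdisc {
      struct toy_qdisc **children;
      int nchildren;
  };

  /* Non-zero if p occurs in q's subtree (or the tree is deeper than 7
     levels), mirroring the -ELOOP condition above. */
  static int toy_check_loop(struct toy_qdisc *q, struct toy_qdisc *p, int depth)
  {
      int i;

      for (i = 0; i < q->nchildren; i++) {
          struct toy_qdisc *leaf = q->children[i];

          if (leaf == NULL)
              continue;
          if (leaf == p || depth > 7)
              return 1;
          if (toy_check_loop(leaf, p, depth + 1))
              return 1;
      }
      return 0;
  }

  int main(void)
  {
      struct toy_qdisc leaf = { 0, 0 };
      struct toy_qdisc *kids[1] = { &leaf };
      struct toy_qdisc root = { kids, 1 };

      /* Grafting "root" under its own leaf would loop: prints 1. */
      printf("%d\n", toy_check_loop(&root, &leaf, 0));
      return 0;
  }
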
/*
- Create/delete/change/get qdisc.
+ * Delete/get qdisc.
*/
-static int tc_ctl_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
struct tcmsg *tcm = NLMSG_DATA(n);
struct rtattr **tca = arg;
struct device *dev;
u32 clid = tcm->tcm_parent;
- struct Qdisc *old_q;
struct Qdisc *q = NULL;
struct Qdisc *p = NULL;
- struct Qdisc *leaf = NULL;
- struct Qdisc_ops *qops = NULL;
int err;
- /* Find device */
if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
return -ENODEV;
- /* If parent is specified, it must exist
- and tcm_parent selects a class in parent which
- new qdisc will be attached to.
-
- The place may be already busy by another qdisc,
- remember this fact, if it was not auto-created discipline.
- */
if (clid) {
if (clid != TC_H_ROOT) {
- p = qdisc_lookup(dev, TC_H_MAJ(clid));
- if (p == NULL)
+ if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
return -ENOENT;
- leaf = qdisc_lookup_class(dev, clid);
+ q = qdisc_leaf(p, clid);
} else
- leaf = dev->qdisc_sleeping;
-
- if (leaf && leaf->flags&TCQ_F_DEFAULT && n->nlmsg_type == RTM_NEWQDISC)
- leaf = NULL;
+ q = dev->qdisc_sleeping;
- /*
- Also, leaf may be exactly that qdisc, which we want
- to control. Remember this to avoid one more qdisc_lookup.
- */
-
- if (leaf && leaf->handle == tcm->tcm_handle)
- q = leaf;
- }
+ if (!q)
+ return -ENOENT;
- /* Try to locate the discipline */
- if (tcm->tcm_handle && q == NULL) {
- if (TC_H_MIN(tcm->tcm_handle))
+ if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
return -EINVAL;
- q = qdisc_lookup(dev, tcm->tcm_handle);
+ } else {
+ if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
+ return -ENOENT;
}
- /* If discipline already exists, check that its real parent
- matches to one selected by tcm_parent.
- */
-
- if (q) {
- if (clid && p != q->parent)
- return -EINVAL;
- BUG_TRAP(!leaf || leaf == q);
- if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
+ if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
+ return -EINVAL;
+
+ if (n->nlmsg_type == RTM_DELQDISC) {
+ if (!clid)
return -EINVAL;
- clid = q->classid;
- goto process_existing;
+ if (q->handle == 0)
+ return -ENOENT;
+ if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
+ return err;
+ if (q) {
+ qdisc_notify(skb, n, clid, q, NULL);
+ qdisc_destroy(q);
+ }
+ } else {
+ qdisc_notify(skb, n, clid, NULL, q);
}
+ return 0;
+}
- /* The discipline is known not to exist.
- If parent was not selected too, return error.
- */
- if (clid == 0)
- return tcm->tcm_handle ? -ENOENT : -EINVAL;
+/*
+ Create/change qdisc.
+ */
- /* Check for the case when leaf is exactly the thing,
- that you want.
- */
+static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+{
+ struct tcmsg *tcm = NLMSG_DATA(n);
+ struct rtattr **tca = arg;
+ struct device *dev;
+ u32 clid = tcm->tcm_parent;
+ struct Qdisc *q = NULL;
+ struct Qdisc *p = NULL;
+ int err;
+
+ if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
+ return -ENODEV;
- if (leaf && tcm->tcm_handle == 0) {
- q = leaf;
- if (!tca[TCA_KIND-1] || rtattr_strcmp(tca[TCA_KIND-1], q->ops->id) == 0)
- goto process_existing;
+ if (clid) {
+ if (clid != TC_H_ROOT) {
+ if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
+ return -ENOENT;
+ q = qdisc_leaf(p, clid);
+ } else {
+ q = dev->qdisc_sleeping;
+ }
+
+ /* It may be the default qdisc; ignore it */
+ if (q && q->handle == 0)
+ q = NULL;
+
+ if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
+ if (tcm->tcm_handle) {
+ if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
+ return -EEXIST;
+ if (TC_H_MIN(tcm->tcm_handle))
+ return -EINVAL;
+ if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
+ goto create_n_graft;
+ if (n->nlmsg_flags&NLM_F_EXCL)
+ return -EEXIST;
+ if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
+ return -EINVAL;
+ if (q == p ||
+ (p && check_loop(q, p, 0)))
+ return -ELOOP;
+ atomic_inc(&q->refcnt);
+ goto graft;
+ } else {
+ if (q == NULL)
+ goto create_n_graft;
+
+ /* This magic test requires explanation.
+ *
+ * We know that some child qdisc is already
+ * attached to this parent and we have a choice:
+ * either to change it or to create/graft a new one.
+ *
+ * 1. We are allowed to create/graft only
+ * if both the CREATE and REPLACE flags are set.
+ *
+ * 2. If EXCL is set, the requestor asserted that
+ * the qdisc tcm_handle is not expected
+ * to exist, so we choose create/graft too.
+ *
+ * 3. The last case is when no flags are set.
+ * Alas, this is a sort of hole in the API: we
+ * cannot decide what to do unambiguously.
+ * For now we select create/graft if the user
+ * gave a KIND which does not match the existing one.
+ */
+ if ((n->nlmsg_flags&NLM_F_CREATE) &&
+ (n->nlmsg_flags&NLM_F_REPLACE) &&
+ ((n->nlmsg_flags&NLM_F_EXCL) ||
+ (tca[TCA_KIND-1] &&
+ rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
+ goto create_n_graft;
+ }
+ }
+ } else {
+ if (!tcm->tcm_handle)
+ return -EINVAL;
+ q = qdisc_lookup(dev, tcm->tcm_handle);
}
- if (n->nlmsg_type != RTM_NEWQDISC || !(n->nlmsg_flags&NLM_F_CREATE))
+ /* Change qdisc parameters */
+ if (q == NULL)
return -ENOENT;
- if (leaf && n->nlmsg_flags&NLM_F_EXCL)
+ if (n->nlmsg_flags&NLM_F_EXCL)
return -EEXIST;
+ if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
+ return -EINVAL;
+ err = qdisc_change(q, tca);
+ if (err == 0)
+ qdisc_notify(skb, n, clid, NULL, q);
+ return err;
-create_and_graft:
- q = qdisc_create(dev, qops, tcm->tcm_handle, clid, tca, &err);
+create_n_graft:
+ if (!(n->nlmsg_flags&NLM_F_CREATE))
+ return -ENOENT;
+ q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
if (q == NULL)
return err;
graft:
- err = qdisc_graft(dev, p, clid, q, &old_q);
- if (err) {
- if (q)
- qdisc_destroy(q);
- return err;
+ if (1) {
+ struct Qdisc *old_q = NULL;
+ err = qdisc_graft(dev, p, clid, q, &old_q);
+ if (err) {
+ if (q)
+ qdisc_destroy(q);
+ return err;
+ }
+ qdisc_notify(skb, n, clid, old_q, q);
+ if (old_q)
+ qdisc_destroy(old_q);
}
- qdisc_notify(skb, n, old_q, q);
- if (old_q)
- qdisc_destroy(old_q);
return 0;
-
-process_existing:
-
- switch (n->nlmsg_type) {
- case RTM_NEWQDISC:
- if (n->nlmsg_flags&NLM_F_EXCL)
- return -EEXIST;
- qops = q->ops;
- goto create_and_graft;
- case RTM_GETQDISC:
- qdisc_notify(skb, n, NULL, q);
- return 0;
- case RTM_DELQDISC:
- q = NULL;
- goto graft;
- default:
- return -EINVAL;
- }
}
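
The outcome of the flag test discussed in the long comment inside tc_modify_qdisc()
above can be condensed into a single predicate. The helper name below is invented; the
NLM_F_* values are the standard rtnetlink flags from linux/netlink.h.

  #include <stdio.h>
  #include <linux/netlink.h>    /* NLM_F_CREATE, NLM_F_REPLACE, NLM_F_EXCL */

  /* When a child qdisc already exists under the selected parent and no
     explicit handle was supplied: non-zero means "create and graft a new
     qdisc", zero means "just change the existing one". */
  static int want_create_n_graft(unsigned int nlmsg_flags,
                                 int kind_matches_existing)
  {
      return (nlmsg_flags & NLM_F_CREATE) &&
             (nlmsg_flags & NLM_F_REPLACE) &&
             ((nlmsg_flags & NLM_F_EXCL) || !kind_matches_existing);
  }

  int main(void)
  {
      printf("%d\n", want_create_n_graft(NLM_F_CREATE | NLM_F_REPLACE, 1)); /* 0: change */
      printf("%d\n", want_create_n_graft(NLM_F_CREATE | NLM_F_REPLACE |
                                         NLM_F_EXCL, 1));                   /* 1: create */
      return 0;
  }
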
-static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q,
+static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
u32 pid, u32 seq, unsigned flags, int event)
{
struct tcmsg *tcm;
@@ -521,9+660,9 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, tcm = NLMSG_DATA(nlh);
tcm->tcm_family = AF_UNSPEC;
tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0;
- tcm->tcm_parent = q->classid;
+ tcm->tcm_parent = clid;
tcm->tcm_handle = q->handle;
- tcm->tcm_info = 0;
+ tcm->tcm_info = atomic_read(&q->refcnt);
RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
if (q->ops->dump && q->ops->dump(q, skb) < 0)
goto rtattr_failure;
@@ -539,7+678,7 @@ rtattr_failure: }
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
- struct Qdisc *old, struct Qdisc *new)
+ u32 clid, struct Qdisc *old, struct Qdisc *new)
{
struct sk_buff *skb;
u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
@@ -548,12+687,12 @@ static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, if (!skb)
return -ENOBUFS;
- if (old && !(old->flags&TCQ_F_DEFAULT)) {
- if (tc_fill_qdisc(skb, old, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
+ if (old && old->handle) {
+ if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
goto err_out;
}
if (new) {
- if (tc_fill_qdisc(skb, new, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
+ if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
goto err_out;
}
@@ -583,7+722,7 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) q = q->next, q_idx++) {
if (q_idx < s_q_idx)
continue;
- if (tc_fill_qdisc(skb, q, NETLINK_CB(cb->skb).pid,
+ if (tc_fill_qdisc(skb, q, 0, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
goto done;
}
@@ -797,11+936,10 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) for (q=dev->qdisc_list, t=0; q; q = q->next, t++) {
if (t < s_t) continue;
if (!q->ops->cl_ops) continue;
- if (tcm->tcm_parent && TC_H_MAJ(tcm->tcm_parent) != q->handle
- && (tcm->tcm_parent != TC_H_ROOT || q->parent != NULL))
+ if (tcm->tcm_parent && TC_H_MAJ(tcm->tcm_parent) != q->handle)
continue;
if (t > s_t)
- memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int));
+ memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
arg.w.fn = qdisc_class_dump;
arg.skb = skb;
arg.cb = cb;
@@ -846,6+984,20 @@ static int psched_read_proc(char *buffer, char **start, off_t offset, }
#endif
+#if PSCHED_CLOCK_SOURCE == PSCHED_GETTIMEOFDAY
+int psched_tod_diff(int delta_sec, int bound)
+{
+ int delta;
+
+ if (bound <= 1000000 || delta_sec > (0x7FFFFFFF/1000000)-1)
+ return bound;
+ delta = delta_sec * 1000000;
+ if (delta > bound)
+ delta = bound;
+ return delta;
+}
+#endif
+
psched_time_t psched_time_base;
#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
@@ -866,7+1018,8 @@ static void psched_tick(unsigned long dummy) #if PSCHED_CLOCK_SOURCE == PSCHED_CPU
psched_time_t dummy_stamp;
PSCHED_GET_TIME(dummy_stamp);
- psched_timer.expires = jiffies + 4*HZ;
+ /* This is OK for CPUs up to 4GHz */
+ psched_timer.expires = jiffies + 1*HZ;
#else
unsigned long now = jiffies;
psched_time_base = ((u64)now)<<PSCHED_JSCALE;
@@ -891,7+1044,6 @@ __initfunc(int psched_calibrate_clock(void)) return -1;
#endif
- start_bh_atomic();
#ifdef PSCHED_WATCHER
psched_tick(0);
#endif
@@ -902,7+1054,6 @@ __initfunc(int psched_calibrate_clock(void)) barrier();
PSCHED_GET_TIME(stamp1);
do_gettimeofday(&tv1);
- end_bh_atomic();
delay = PSCHED_TDIFF(stamp1, stamp);
rdelay = tv1.tv_usec - tv.tv_usec;
@@ -921,6+1072,9 @@ __initfunc(int psched_calibrate_clock(void))
__initfunc(int pktsched_init(void))
{
+#ifdef CONFIG_RTNETLINK
+ struct rtnetlink_link *link_p;
+#endif
#ifdef CONFIG_PROC_FS
struct proc_dir_entry *ent;
#endif
@@ -931,19+1085,22 @@ __initfunc(int pktsched_init(void)) #elif PSCHED_CLOCK_SOURCE == PSCHED_JIFFIES
psched_tick_per_us = HZ<<PSCHED_JSCALE;
psched_us_per_tick = 1000000;
+#ifdef PSCHED_WATCHER
+ psched_tick(0);
+#endif
#endif
#ifdef CONFIG_RTNETLINK
- struct rtnetlink_link *link_p = rtnetlink_links[PF_UNSPEC];
+ link_p = rtnetlink_links[PF_UNSPEC];
/* Setup rtnetlink links. It is made here to avoid
exporting large number of public symbols.
*/
if (link_p) {
- link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_ctl_qdisc;
- link_p[RTM_DELQDISC-RTM_BASE].doit = tc_ctl_qdisc;
- link_p[RTM_GETQDISC-RTM_BASE].doit = tc_ctl_qdisc;
+ link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
+ link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
+ link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
@@ -975,6+1132,12 @@ __initfunc(int pktsched_init(void)) #ifdef CONFIG_NET_SCH_RED
INIT_QDISC(red);
#endif
+#ifdef CONFIG_NET_SCH_GRED
+ INIT_QDISC(gred);
+#endif
+#ifdef CONFIG_NET_SCH_DSMARK
+ INIT_QDISC(dsmark);
+#endif
#ifdef CONFIG_NET_SCH_SFQ
INIT_QDISC(sfq);
#endif
#include <net/sock.h>
#include <net/pkt_sched.h>
+
/* Class-Based Queueing (CBQ) algorithm.
=======================================
@@ -169,6+170,9 @@ struct cbq_sched_data struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes
with backlog */
+#ifdef CONFIG_NET_CLS_POLICE
+ struct cbq_class *rx_class;
+#endif
struct cbq_class *tx_class;
struct cbq_class *tx_borrowed;
int tx_len;
@@ -269,17+273,21 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch) else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
cl = defmap[TC_PRIO_BESTEFFORT];
- if (cl == NULL)
+ if (cl == NULL || cl->level >= head->level)
goto fallback;
}
- if (cl->level == 0) {
#ifdef CONFIG_NET_CLS_POLICE
- if (result)
- return cbq_reclassify(skb, cl);
+ switch (result) {
+ case TC_POLICE_RECLASSIFY:
+ return cbq_reclassify(skb, cl);
+ case TC_POLICE_SHOT:
+ return NULL;
+ default:
+ }
#endif
+ if (cl->level == 0)
return cl;
- }
/*
* Step 3+n. If classifier selected a link sharing class,
@@ -321,11+329,9 @@ static __inline__ void cbq_activate_class(struct cbq_class *cl) if (cl_tail != NULL) {
cl->next_alive = cl_tail->next_alive;
cl_tail->next_alive = cl;
- cl->deficit = 0;
} else {
cl->next_alive = cl;
q->activemask |= (1<<prio);
- cl->deficit = cl->quantum;
}
}
@@ -358,31+364,28 @@ static void cbq_deactivate_class(struct cbq_class *this) }
cl = cl_prev->next_alive;
- cl->deficit += cl->quantum;
return;
}
} while ((cl_prev = cl) != q->active[prio]);
}
-static __inline__ void
+static void
cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
{
- if (q->toplevel > 0) {
+ int toplevel = q->toplevel;
+
+ if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
psched_time_t now;
PSCHED_GET_TIME(now);
if (PSCHED_TLESS(now, q->now))
now = q->now;
- if (PSCHED_TLESS(cl->undertime, now)) {
- q->toplevel = 0;
- return;
- }
- while ((cl = cl->borrow) != NULL
- && q->toplevel > cl->level) {
- if (PSCHED_TLESS(cl->borrow->undertime, now)) {
+
+ do {
+ if (PSCHED_TLESS(cl->undertime, now)) {
q->toplevel = cl->level;
return;
}
- }
+ } while ((cl=cl->borrow) != NULL && toplevel > cl->level);
}
}
@@ -393,23+396,31 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) struct cbq_class *cl = cbq_classify(skb, sch);
int len = skb->len;
- if (cl && cl->q->enqueue(skb, cl->q) == 1) {
- sch->q.qlen++;
- sch->stats.packets++;
- cl->stats.packets++;
- sch->stats.bytes+=len;
- cl->stats.bytes+=len;
- cbq_mark_toplevel(q, cl);
- if (!cl->next_alive)
- cbq_activate_class(cl);
- return 1;
+#ifdef CONFIG_NET_CLS_POLICE
+ q->rx_class = cl;
+#endif
+ if (cl) {
+#ifdef CONFIG_NET_CLS_POLICE
+ cl->q->__parent = sch;
+#endif
+ if (cl->q->enqueue(skb, cl->q) == 1) {
+ sch->q.qlen++;
+ sch->stats.packets++;
+ sch->stats.bytes+=len;
+ cbq_mark_toplevel(q, cl);
+ if (!cl->next_alive)
+ cbq_activate_class(cl);
+ return 1;
+ }
}
sch->stats.drops++;
if (cl == NULL)
kfree_skb(skb);
- else
+ else {
+ cbq_mark_toplevel(q, cl);
cl->stats.drops++;
+ }
return 0;
}
@@ -426,9+437,14 @@ cbq_requeue(struct sk_buff *skb, struct Qdisc *sch) }
q->tx_class = NULL;
+ cbq_mark_toplevel(q, cl);
+
+#ifdef CONFIG_NET_CLS_POLICE
+ q->rx_class = cl;
+ cl->q->__parent = sch;
+#endif
if (cl->q->ops->requeue(skb, cl->q) == 1) {
sch->q.qlen++;
- cbq_mark_toplevel(q, cl);
if (!cl->next_alive)
cbq_activate_class(cl);
return 1;
@@ -445,11+461,9 @@ cbq_requeue(struct sk_buff *skb, struct Qdisc *sch) static void cbq_ovl_classic(struct cbq_class *cl)
{
struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data;
+ psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
if (!cl->delayed) {
- psched_tdiff_t delay;
-
- delay = PSCHED_TDIFF(cl->undertime, q->now);
delay += cl->offtime;
/*
@@ -463,15+477,35 @@ static void cbq_ovl_classic(struct cbq_class *cl) delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
if (cl->avgidle < cl->minidle)
cl->avgidle = cl->minidle;
- if (delay < 0)
- delay = 0;
+ if (delay <= 0)
+ delay = 1;
PSCHED_TADD2(q->now, delay, cl->undertime);
- if (q->wd_expires == 0 || q->wd_expires > delay)
- q->wd_expires = delay;
cl->xstats.overactions++;
cl->delayed = 1;
}
+ if (q->wd_expires == 0 || q->wd_expires > delay)
+ q->wd_expires = delay;
+
+ /* Dirty work! We must schedule wakeups based on the
+ real available rate, rather than the leaf rate,
+ which may be tiny (even zero).
+ */
+ if (q->toplevel == TC_CBQ_MAXLEVEL) {
+ struct cbq_class *b;
+ psched_tdiff_t base_delay = q->wd_expires;
+
+ for (b = cl->borrow; b; b = b->borrow) {
+ delay = PSCHED_TDIFF(b->undertime, q->now);
+ if (delay < base_delay) {
+ if (delay <= 0)
+ delay = 1;
+ base_delay = delay;
+ }
+ }
+
+ q->wd_expires = base_delay;
+ }
}
/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when
@@ -481,15+515,18 @@ static void cbq_ovl_classic(struct cbq_class *cl) static void cbq_ovl_rclassic(struct cbq_class *cl)
{
struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data;
+ struct cbq_class *this = cl;
- while (cl && cl->delayed) {
- cl = cl->borrow;
- if (cl->level > q->toplevel)
- return;
- }
+ do {
+ if (cl->level > q->toplevel) {
+ cl = NULL;
+ break;
+ }
+ } while ((cl = cl->borrow) != NULL);
- if (cl)
- cbq_ovl_classic(cl);
+ if (cl == NULL)
+ cl = this;
+ cbq_ovl_classic(cl);
}
/* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */
@@ -497,12+534,11 @@ static void cbq_ovl_rclassic(struct cbq_class *cl) static void cbq_ovl_delay(struct cbq_class *cl)
{
struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data;
+ psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
if (!cl->delayed) {
- psched_tdiff_t delay;
unsigned long sched = jiffies;
- delay = PSCHED_TDIFF(cl->undertime, q->now);
delay += cl->offtime;
if (cl->avgidle < 0)
delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
@@ -521,8+557,12 @@ static void cbq_ovl_delay(struct cbq_class *cl) add_timer(&q->delay_timer);
cl->delayed = 1;
cl->xstats.overactions++;
+ return;
}
+ delay = 1;
}
+ if (q->wd_expires == 0 || q->wd_expires > delay)
+ q->wd_expires = delay;
}
/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
@@ -555,6+595,7 @@ static void cbq_ovl_drop(struct cbq_class *cl) static void cbq_watchdog(unsigned long arg)
{
struct Qdisc *sch = (struct Qdisc*)arg;
+ sch->flags &= ~TCQ_F_THROTTLED;
qdisc_wakeup(sch->dev);
}
@@ -622,6+663,7 @@ static void cbq_undelay(unsigned long arg) add_timer(&q->delay_timer);
}
+ sch->flags &= ~TCQ_F_THROTTLED;
qdisc_wakeup(sch->dev);
}
@@ -631,18+673,23 @@ static void cbq_undelay(unsigned long arg) static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
{
int len = skb->len;
- struct Qdisc *sch = child->parent;
+ struct Qdisc *sch = child->__parent;
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
- struct cbq_class *cl = cbq_class_lookup(q, child->classid);
+ struct cbq_class *cl = q->rx_class;
+
+ q->rx_class = NULL;
if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
+
+ cbq_mark_toplevel(q, cl);
+
+ q->rx_class = cl;
+ cl->q->__parent = sch;
+
if (cl->q->enqueue(skb, cl->q) == 1) {
sch->q.qlen++;
sch->stats.packets++;
- cl->stats.packets++;
sch->stats.bytes+=len;
- cl->stats.bytes+=len;
- cbq_mark_toplevel(q, cl);
if (!cl->next_alive)
cbq_activate_class(cl);
return 0;
@@ -656,21+703,42 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) }
#endif
+/*
+ This is a mission-critical procedure.
+
+ We "regenerate" the toplevel cutoff if the transmitting class
+ has backlog and is not regulated. This is not part of the
+ original CBQ description, but it looks more reasonable.
+ It may well be wrong; the question needs further investigation.
+*/
+
static __inline__ void
-cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
+cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
+ struct cbq_class *borrowed)
{
- if (cl && q->toplevel >= cl->level) {
- if (cl->q->q.qlen <= 1 || PSCHED_TLESS(q->now, cl->undertime))
- q->toplevel = TC_CBQ_MAXLEVEL;
- else /* BUGGGG? if (cl != this) */
- q->toplevel = cl->level;
+ if (cl && q->toplevel >= borrowed->level) {
+ if (cl->q->q.qlen > 1) {
+ do {
+ if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) {
+ q->toplevel = borrowed->level;
+ return;
+ }
+ } while ((borrowed=borrowed->borrow) != NULL);
+ }
+#if 0
+ /* This is not necessary now. Uncommenting it
+ would save CPU cycles, but decrease fairness.
+ */
+ q->toplevel = TC_CBQ_MAXLEVEL;
+#endif
}
}
-static __inline__ void
+static void
cbq_update(struct cbq_sched_data *q)
{
- struct cbq_class *cl = q->tx_class;
+ struct cbq_class *this = q->tx_class;
+ struct cbq_class *cl = this;
int len = q->tx_len;
q->tx_class = NULL;
@@ -679,6+747,9 @@ cbq_update(struct cbq_sched_data *q) long avgidle = cl->avgidle;
long idle;
+ cl->stats.packets++;
+ cl->stats.bytes += len;
+
/*
(now - last) is total time between packet right edges.
(last_pktlen/rate) is "virtual" busy time, so that
@@ -697,6+768,10 @@ cbq_update(struct cbq_sched_data *q)
if (avgidle <= 0) {
/* Overlimit or at-limit */
+
+ if (avgidle < cl->minidle)
+ avgidle = cl->minidle;
+
cl->avgidle = avgidle;
/* Calculate expected time, when this class
@@ -732,12+807,11 @@ cbq_update(struct cbq_sched_data *q) cl->avgidle = cl->maxidle;
else
cl->avgidle = avgidle;
-
}
cl->last = q->now;
}
- cbq_update_toplevel(q, q->tx_borrowed);
+ cbq_update_toplevel(q, this, q->tx_borrowed);
}
static __inline__ struct cbq_class *
@@ -750,21+824,33 @@ cbq_under_limit(struct cbq_class *cl) return cl;
if (PSCHED_IS_PASTPERFECT(cl->undertime) ||
- PSCHED_TLESS(cl->undertime, q->now)) {
+ !PSCHED_TLESS(q->now, cl->undertime)) {
cl->delayed = 0;
return cl;
}
- while (!PSCHED_IS_PASTPERFECT(cl->undertime) &&
- PSCHED_TLESS(q->now, cl->undertime)) {
- if ((cl = cl->borrow) == NULL || cl->level > q->toplevel) {
+ do {
+ /* This is a very suspicious place. Currently the overlimit
+ action is generated for non-bounded classes
+ only if the link is completely congested.
+ Though this agrees with the ancestor-only paradigm,
+ it looks very stupid. In particular,
+ it means that this chunk of code will either
+ never be called or will strongly amplify
+ burstiness. Dangerous and silly, but
+ no other solution exists.
+ */
+ if ((cl = cl->borrow) == NULL) {
this_cl->stats.overlimits++;
this_cl->overlimit(this_cl);
return NULL;
}
- }
- this_cl->xstats.borrows++;
- cl->xstats.borrows++;
+ if (cl->level > q->toplevel)
+ return NULL;
+ } while (!PSCHED_IS_PASTPERFECT(cl->undertime) &&
+ PSCHED_TLESS(q->now, cl->undertime));
+
+ cl->delayed = 0;
return cl;
}
@@ -784,27+870,26 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)
/* Start round */
do {
- struct cbq_class *borrow;
+ struct cbq_class *borrow = NULL;
- /* Class is empty */
- if (cl->q->q.qlen == 0)
- goto skip_class;
-
- if ((borrow = cbq_under_limit(cl)) == NULL)
+ if (cl->q->q.qlen &&
+ (borrow = cbq_under_limit(cl)) == NULL)
goto skip_class;
if (cl->deficit <= 0) {
- /* Class exhausted its allotment per this
- round.
+ /* Class exhausted its allotment per
+ this round. Switch to the next one.
*/
deficit = 1;
+ cl->deficit += cl->quantum;
goto next_class;
}
skb = cl->q->dequeue(cl->q);
/* Class did not give us any skb :-(
- It could occur if cl->q == "tbf"
+ It could occur even if cl->q->q.qlen != 0
+ e.g. if cl->q == "tbf"
*/
if (skb == NULL)
goto skip_class;
@@ -812,6+897,15 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio) cl->deficit -= skb->len;
q->tx_class = cl;
q->tx_borrowed = borrow;
+ if (borrow != cl) {
+#ifndef CBQ_XSTATS_BORROWS_BYTES
+ borrow->xstats.borrows++;
+ cl->xstats.borrows++;
+#else
+ borrow->xstats.borrows += skb->len;
+ cl->xstats.borrows += skb->len;
+#endif
+ }
q->tx_len = skb->len;
if (cl->deficit <= 0) {
@@ -822,8+916,6 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio) return skb;
skip_class:
- cl->deficit = 0;
-
if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
/* Class is empty or penalized.
Unlink it from active chain.
@@ -857,7+949,6 @@ skip_class: next_class:
cl_prev = cl;
cl = cl->next_alive;
- cl->deficit += cl->quantum;
} while (cl_prev != cl_tail);
} while (deficit);
@@ -914,6+1005,7 @@ cbq_dequeue(struct Qdisc *sch) skb = cbq_dequeue_1(sch);
if (skb) {
sch->q.qlen--;
+ sch->flags &= ~TCQ_F_THROTTLED;
return skb;
}
@@ -955,6+1047,7 @@ cbq_dequeue(struct Qdisc *sch) delay = 1;
q->wd_timer.expires = jiffies + delay;
add_timer(&q->wd_timer);
+ sch->flags |= TCQ_F_THROTTLED;
}
}
return NULL;
@@ -1129,14+1222,18 @@ static void cbq_link_class(struct cbq_class *this) static int cbq_drop(struct Qdisc* sch)
{
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
- struct cbq_class *cl;
- int h;
+ struct cbq_class *cl, *cl_head;
+ int prio;
- for (h = TC_CBQ_MAXPRIO; h >= 0; h++) {
- for (cl = q->classes[h]; cl; cl = cl->next) {
+ for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
+ if ((cl_head = q->active[prio]) == NULL)
+ continue;
+
+ cl = cl_head;
+ do {
if (cl->q->ops->drop && cl->q->ops->drop(cl->q))
return 1;
- }
+ } while ((cl = cl->next_alive) != cl_head);
}
return 0;
}
@@ -1166,8+1263,8 @@ cbq_reset(struct Qdisc* sch)
cl->next_alive = NULL;
PSCHED_SET_PASTPERFECT(cl->undertime);
- cl->avgidle = 0;
- cl->deficit = 0;
+ cl->avgidle = cl->maxidle;
+ cl->deficit = cl->quantum;
cl->cpriority = cl->priority;
}
}
@@ -1187,8+1284,10 @@ static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss) cl->avpkt = lss->avpkt;
if (lss->change&TCF_CBQ_LSS_MINIDLE)
cl->minidle = -(long)lss->minidle;
- if (lss->change&TCF_CBQ_LSS_MAXIDLE)
+ if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
cl->maxidle = lss->maxidle;
+ cl->avgidle = lss->maxidle;
+ }
if (lss->change&TCF_CBQ_LSS_OFFTIME)
cl->offtime = lss->offtime;
return 0;
@@ -1261,7+1360,7 @@ static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p) {
cl->police = p->police;
- if (!(cl->q->flags&TCQ_F_DEFAULT)) {
+ if (cl->q->handle) {
if (p->police == TC_POLICE_RECLASSIFY)
cl->q->reshape_fail = cbq_reshape_fail;
else
@@ -1300,6+1399,7 @@ static int cbq_init(struct Qdisc *sch, struct rtattr *opt) return -EINVAL;
}
+ q->link.refcnt = 1;
q->link.sibling = &q->link;
q->link.classid = sch->handle;
q->link.qdisc = sch;
@@ -1493,6+1593,7 @@ cbq_dump_class(struct Qdisc *sch, unsigned long arg, else
tcm->tcm_parent = TC_H_ROOT;
tcm->tcm_handle = cl->classid;
+ tcm->tcm_info = cl->q->handle;
rta = (struct rtattr*)b;
RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
@@ -1533,12+1634,20 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, }
if ((*old = xchg(&cl->q, new)) != NULL)
qdisc_reset(*old);
-
+
return 0;
}
return -ENOENT;
}
+static struct Qdisc *
+cbq_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct cbq_class *cl = (struct cbq_class*)arg;
+
+ return cl ? cl->q : NULL;
+}
+
static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
{
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
@@ -1569,6+1678,7 @@ static void cbq_destroy_class(struct cbq_class *cl) #ifdef CONFIG_NET_ESTIMATOR
qdisc_kill_estimator(&cl->stats);
#endif
+ kfree(cl);
}
static void
@@ -1578,6+1688,9 @@ cbq_destroy(struct Qdisc* sch) struct cbq_class *cl;
unsigned h;
+#ifdef CONFIG_NET_CLS_POLICE
+ q->rx_class = NULL;
+#endif
for (h = 0; h < 16; h++) {
for (cl = q->classes[h]; cl; cl = cl->next)
cbq_destroy_filters(cl);
@@ -1590,20+1703,29 @@ cbq_destroy(struct Qdisc* sch) }
qdisc_put_rtab(q->link.R_tab);
+ MOD_DEC_USE_COUNT;
}
-static void cbq_put(struct Qdisc *q, unsigned long arg)
+static void cbq_put(struct Qdisc *sch, unsigned long arg)
{
+ struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
struct cbq_class *cl = (struct cbq_class*)arg;
- if (--cl->refcnt == 0)
+ start_bh_atomic();
+ if (--cl->refcnt == 0) {
+#ifdef CONFIG_NET_CLS_POLICE
+ if (q->rx_class == cl)
+ q->rx_class = NULL;
+#endif
cbq_destroy_class(cl);
+ }
+ end_bh_atomic();
return;
}
static int
-cbq_change(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca,
- unsigned long *arg)
+cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca,
+ unsigned long *arg)
{
int err;
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
@@ -1763,6+1885,7 @@ cbq_change(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca, cl->borrow = cl->tparent;
if (cl->tparent != &q->link)
cl->share = cl->tparent;
+ cbq_adjust_levels(parent);
cl->minidle = -0x7FFFFFFF;
cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
@@ -1781,7+1904,6 @@ cbq_change(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca, #endif
if (tb[TCA_CBQ_FOPT-1])
cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
- cbq_adjust_levels(parent);
end_bh_atomic();
#ifdef CONFIG_NET_ESTIMATOR
@@ -1810,10+1932,16 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg) if (cl->next_alive)
cbq_deactivate_class(cl);
- if (q->tx_class == cl)
- q->tx_class = cl->borrow;
if (q->tx_borrowed == cl)
q->tx_borrowed = q->tx_class;
+ if (q->tx_class == cl) {
+ q->tx_class = NULL;
+ q->tx_borrowed = NULL;
+ }
+#ifdef CONFIG_NET_CLS_POLICE
+ if (q->rx_class == cl)
+ q->rx_class = NULL;
+#endif
cbq_unlink_class(cl);
cbq_adjust_levels(cl->tparent);
@@ -1841,12+1969,16 @@ static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg) return &cl->filter_list;
}
-static unsigned long cbq_bind_filter(struct Qdisc *sch, u32 classid)
+static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
{
struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
+ struct cbq_class *p = (struct cbq_class*)parent;
struct cbq_class *cl = cbq_class_lookup(q, classid);
if (cl) {
+ if (p && p->level <= cl->level)
+ return 0;
cl->filters++;
return (unsigned long)cl;
}
@@ -1878,7+2010,7 @@ static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) }
if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
arg->stop = 1;
- break;
+ return;
}
arg->count++;
}
@@ -1888,9+2020,10 @@ static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) static struct Qdisc_class_ops cbq_class_ops =
{
cbq_graft,
+ cbq_leaf,
cbq_get,
cbq_put,
- cbq_change,
+ cbq_change_class,
cbq_delete,
cbq_walk,
@@ -1918,6+2051,7 @@ struct Qdisc_ops cbq_qdisc_ops = cbq_init,
cbq_reset,
cbq_destroy,
+ NULL /* cbq_change */,
#ifdef CONFIG_RTNETLINK
cbq_dump,
@@ -826,6+826,12 @@ static int csz_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, return -EINVAL;
}
+static struct Qdisc * csz_leaf(struct Qdisc *sch, unsigned long cl)
+{
+ return NULL;
+}
+
+
static unsigned long csz_get(struct Qdisc *sch, u32 classid)
{
struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
@@ -840,6+846,12 @@ static unsigned long csz_get(struct Qdisc *sch, u32 classid) return band+1;
}
+static unsigned long csz_bind(struct Qdisc *sch, unsigned long parent, u32 classid)
+{
+ return csz_get(sch, classid);
+}
+
+
static void csz_put(struct Qdisc *sch, unsigned long cl)
{
return;
@@ -1006,6+1018,8 @@ static struct tcf_proto ** csz_find_tcf(struct Qdisc *sch, unsigned long cl) struct Qdisc_class_ops csz_class_ops =
{
csz_graft,
+ csz_leaf,
+
csz_get,
csz_put,
csz_change,
@@ -1013,7+1027,7 @@ struct Qdisc_class_ops csz_class_ops = csz_walk,
csz_find_tcf,
- csz_get,
+ csz_bind,
csz_put,
#ifdef CONFIG_RTNETLINK
@@ -1036,6+1050,7 @@ struct Qdisc_ops csz_qdisc_ops = csz_init,
csz_reset,
csz_destroy,
+ NULL /* csz_change */,
#ifdef CONFIG_RTNETLINK
csz_dump,
@@ -97,10+97,7 @@ fifo_drop(struct Qdisc* sch) static void
fifo_reset(struct Qdisc* sch)
{
- struct sk_buff *skb;
-
- while ((skb=__skb_dequeue(&sch->q)) != NULL)
- kfree_skb(skb);
+ skb_queue_purge(&sch->q);
sch->stats.backlog = 0;
}
@@ -137,15+134,15 @@ pfifo_dequeue(struct Qdisc* sch) return __skb_dequeue(&sch->q);
}
-
static int fifo_init(struct Qdisc *sch, struct rtattr *opt)
{
struct fifo_sched_data *q = (void*)sch->data;
if (opt == NULL) {
- q->limit = sch->dev->tx_queue_len;
if (sch->ops == &bfifo_qdisc_ops)
- q->limit *= sch->dev->mtu;
+ q->limit = sch->dev->tx_queue_len*sch->dev->mtu;
+ else
+ q->limit = sch->dev->tx_queue_len;
} else {
struct tc_fifo_qopt *ctl = RTA_DATA(opt);
if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
@@ -188,6+185,8 @@ struct Qdisc_ops pfifo_qdisc_ops = fifo_init,
fifo_reset,
NULL,
+ fifo_init,
+
#ifdef CONFIG_RTNETLINK
fifo_dump,
#endif
@@ -208,6+207,7 @@ struct Qdisc_ops bfifo_qdisc_ops = fifo_init,
fifo_reset,
NULL,
+ fifo_init,
#ifdef CONFIG_RTNETLINK
fifo_dump,
#endif
@@ -184,7+184,7 @@ struct Qdisc noop_qdisc = { NULL },
noop_enqueue,
noop_dequeue,
- TCQ_F_DEFAULT|TCQ_F_BUILTIN,
+ TCQ_F_BUILTIN,
&noop_qdisc_ops,
};
@@ -207,7+207,7 @@ struct Qdisc noqueue_qdisc = { NULL },
NULL,
NULL,
- TCQ_F_DEFAULT|TCQ_F_BUILTIN,
+ TCQ_F_BUILTIN,
&noqueue_qdisc_ops,
};
@@ -322,8+322,8 @@ struct Qdisc * qdisc_create_dflt(struct device *dev, struct Qdisc_ops *ops) sch->enqueue = ops->enqueue;
sch->dequeue = ops->dequeue;
sch->dev = dev;
- sch->flags |= TCQ_F_DEFAULT;
- if (ops->init && ops->init(sch, NULL) == 0)
+ atomic_set(&sch->refcnt, 1);
+ if (!ops->init || ops->init(sch, NULL) == 0)
return sch;
kfree(sch);
@@ -342,6+342,10 @@ void qdisc_reset(struct Qdisc *qdisc) void qdisc_destroy(struct Qdisc *qdisc)
{
struct Qdisc_ops *ops = qdisc->ops;
+
+ if (!atomic_dec_and_test(&qdisc->refcnt))
+ return;
+
#ifdef CONFIG_NET_SCHED
if (qdisc->dev) {
struct Qdisc *q, **qp;
@@ -444,30+448,3 @@ void dev_shutdown(struct device *dev) end_bh_atomic();
}
-struct Qdisc * dev_set_scheduler(struct device *dev, struct Qdisc *qdisc)
-{
- struct Qdisc *oqdisc;
-
- if (dev->flags & IFF_UP)
- dev_deactivate(dev);
-
- start_bh_atomic();
- oqdisc = dev->qdisc_sleeping;
-
- /* Prune old scheduler */
- if (oqdisc)
- qdisc_reset(oqdisc);
-
- /* ... and graft new one */
- if (qdisc == NULL)
- qdisc = &noop_qdisc;
- dev->qdisc_sleeping = qdisc;
- dev->qdisc = &noop_qdisc;
- end_bh_atomic();
-
- if (dev->flags & IFF_UP)
- dev_activate(dev);
-
- return oqdisc;
-}
-
@@ -49,17+49,19 @@ static __inline__ unsigned prio_classify(struct sk_buff *skb, struct Qdisc *sch) {
struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
struct tcf_result res;
+ u32 band;
- res.classid = skb->priority;
- if (TC_H_MAJ(res.classid) != sch->handle) {
+ band = skb->priority;
+ if (TC_H_MAJ(skb->priority) != sch->handle) {
if (!q->filter_list || tc_classify(skb, q->filter_list, &res)) {
- if (TC_H_MAJ(res.classid))
- res.classid = 0;
- res.classid = q->prio2band[res.classid&TC_PRIO_MAX] + 1;
+ if (TC_H_MAJ(band))
+ band = 0;
+ return q->prio2band[band&TC_PRIO_MAX];
}
+ band = res.classid;
}
-
- return res.classid - 1;
+ band = TC_H_MIN(band) - 1;
+ return band < q->bands ? band : q->prio2band[0];
}
static int
@@ -160,38+162,74 @@ prio_destroy(struct Qdisc* sch) MOD_DEC_USE_COUNT;
}
+static int prio_tune(struct Qdisc *sch, struct rtattr *opt)
+{
+ struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
+ struct tc_prio_qopt *qopt = RTA_DATA(opt);
+ int i;
+
+ if (opt->rta_len < RTA_LENGTH(sizeof(*qopt)))
+ return -EINVAL;
+ if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2)
+ return -EINVAL;
+
+ for (i=0; i<=TC_PRIO_MAX; i++) {
+ if (qopt->priomap[i] >= qopt->bands)
+ return -EINVAL;
+ }
+
+ start_bh_atomic();
+ q->bands = qopt->bands;
+ memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
+
+ for (i=q->bands; i<TCQ_PRIO_BANDS; i++) {
+ struct Qdisc *child = xchg(&q->queues[i], &noop_qdisc);
+ if (child != &noop_qdisc)
+ qdisc_destroy(child);
+ }
+ end_bh_atomic();
+
+ for (i=0; i<=TC_PRIO_MAX; i++) {
+ int band = q->prio2band[i];
+ if (q->queues[band] == &noop_qdisc) {
+ struct Qdisc *child;
+ child = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
+ if (child) {
+ net_serialize_enter();
+ child = xchg(&q->queues[band], child);
+ net_serialize_leave();
+ if (child != &noop_qdisc)
+ qdisc_destroy(child);
+ }
+ }
+ }
+ return 0;
+}
+
static int prio_init(struct Qdisc *sch, struct rtattr *opt)
{
static const u8 prio2band[TC_PRIO_MAX+1] =
{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };
struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
- unsigned mask = 0;
int i;
+ for (i=0; i<TCQ_PRIO_BANDS; i++)
+ q->queues[i] = &noop_qdisc;
+
if (opt == NULL) {
q->bands = 3;
memcpy(q->prio2band, prio2band, sizeof(prio2band));
- mask = 7;
+ for (i=0; i<3; i++) {
+ struct Qdisc *child;
+ child = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
+ if (child)
+ q->queues[i] = child;
+ }
} else {
- struct tc_prio_qopt *qopt = RTA_DATA(opt);
+ int err;
- if (opt->rta_len < RTA_LENGTH(sizeof(*qopt)))
- return -EINVAL;
- if (qopt->bands > TCQ_PRIO_BANDS)
- return -EINVAL;
- q->bands = qopt->bands;
- for (i=0; i<=TC_PRIO_MAX; i++) {
- if (qopt->priomap[i] >= q->bands)
- return -EINVAL;
- q->prio2band[i] = qopt->priomap[i];
- mask |= (1<<qopt->priomap[i]);
- }
- }
- for (i=0; i<TCQ_PRIO_BANDS; i++) {
- if (mask&(1<<i))
- q->queues[i] = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
- if (q->queues[i] == NULL)
- q->queues[i] = &noop_qdisc;
+ if ((err= prio_tune(sch, opt)) != 0)
+ return err;
}
MOD_INC_USE_COUNT;
return 0;
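
The default priomap installed by prio_init above can be read directly: skb->priority is masked with TC_PRIO_MAX and used as an index into prio2band, so sixteen priority values collapse into three bands. Below is a small self-contained model of that lookup (plain userspace C, for illustration only; which band is served first on dequeue is not shown in this hunk):

    /* Model of the prio2band lookup used by prio_classify/prio_init above.
     * Illustration only, not kernel code. */
    #include <stdio.h>

    #define TC_PRIO_MAX 15

    static const unsigned char prio2band[TC_PRIO_MAX + 1] =
        { 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

    int main(void)
    {
        unsigned prio;

        /* e.g. priorities 6 and 7 map to band 0, priorities 1-3 and 5 to band 2 */
        for (prio = 0; prio <= TC_PRIO_MAX; prio++)
            printf("priority %2u -> band %u\n",
                   prio, (unsigned)prio2band[prio & TC_PRIO_MAX]);
        return 0;
    }
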
@@ -232,6+270,18 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, return 0;
}
+static struct Qdisc *
+prio_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
+ unsigned long band = arg - 1;
+
+ if (band >= q->bands)
+ return NULL;
+
+ return q->queues[band];
+}
+
static unsigned long prio_get(struct Qdisc *sch, u32 classid)
{
struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
@@ -242,6+292,12 @@ static unsigned long prio_get(struct Qdisc *sch, u32 classid) return band;
}
+static unsigned long prio_bind(struct Qdisc *sch, unsigned long parent, u32 classid)
+{
+ return prio_get(sch, classid);
+}
+
+
static void prio_put(struct Qdisc *q, unsigned long cl)
{
return;
@@ -267,12+323,15 @@ static int prio_delete(struct Qdisc *sch, unsigned long cl)
#ifdef CONFIG_RTNETLINK
-static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm)
+static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb,
+ struct tcmsg *tcm)
{
struct prio_sched_data *q = (struct prio_sched_data *)sch->data;
if (cl - 1 > q->bands)
return -ENOENT;
+ if (q->queues[cl-1])
+ tcm->tcm_info = q->queues[cl-1]->handle;
return 0;
}
#endif
@@ -310,6+369,8 @@ static struct tcf_proto ** prio_find_tcf(struct Qdisc *sch, unsigned long cl) static struct Qdisc_class_ops prio_class_ops =
{
prio_graft,
+ prio_leaf,
+
prio_get,
prio_put,
prio_change,
@@ -317,7+378,7 @@ static struct Qdisc_class_ops prio_class_ops = prio_walk,
prio_find_tcf,
- prio_get,
+ prio_bind,
prio_put,
#ifdef CONFIG_RTNETLINK
@@ -340,6+401,7 @@ struct Qdisc_ops prio_qdisc_ops = prio_init,
prio_reset,
prio_destroy,
+ prio_tune,
#ifdef CONFIG_RTNETLINK
prio_dump,
@@ -193,8+193,8 @@ red_enqueue(struct sk_buff *skb, struct Qdisc* sch) }
if (q->qave < q->qth_min) {
-enqueue:
q->qcount = -1;
+enqueue:
if (sch->stats.backlog <= q->limit) {
__skb_queue_tail(&sch->q, skb);
sch->stats.backlog += skb->len;
*/
if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
goto enqueue;
+printk(KERN_DEBUG "Drop %d\n", q->qcount);
q->qcount = 0;
q->qR = net_random()&q->Rmask;
sch->stats.overlimits++;
@@ -375,6+376,7 @@ struct Qdisc_ops red_qdisc_ops = red_init,
red_reset,
red_destroy,
+ NULL /* red_change */,
#ifdef CONFIG_RTNETLINK
red_dump,
@@ -380,6+380,27 @@ static void sfq_perturbation(unsigned long arg) }
}
+static int sfq_change(struct Qdisc *sch, struct rtattr *opt)
+{
+ struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data;
+ struct tc_sfq_qopt *ctl = RTA_DATA(opt);
+
+ if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
+ return -EINVAL;
+
+ start_bh_atomic();
+ q->quantum = ctl->quantum ? : psched_mtu(sch->dev);
+ q->perturb_period = ctl->perturb_period*HZ;
+
+ del_timer(&q->perturb_timer);
+ if (q->perturb_period) {
+ q->perturb_timer.expires = jiffies + q->perturb_period;
+ add_timer(&q->perturb_timer);
+ }
+ end_bh_atomic();
+ return 0;
+}
+
static int sfq_init(struct Qdisc *sch, struct rtattr *opt)
{
struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data;
@@ -399,24+420,15 @@ static int sfq_init(struct Qdisc *sch, struct rtattr *opt) q->max_depth = 0;
q->tail = SFQ_DEPTH;
if (opt == NULL) {
- q->quantum = sch->dev->mtu;
+ q->quantum = psched_mtu(sch->dev);
q->perturb_period = 0;
- if (sch->dev->hard_header)
- q->quantum += sch->dev->hard_header_len;
} else {
- struct tc_sfq_qopt *ctl = RTA_DATA(opt);
- if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
- return -EINVAL;
- q->quantum = ctl->quantum ? : psched_mtu(sch->dev);
- q->perturb_period = ctl->perturb_period*HZ;
- /* The rest is compiled in */
+ int err = sfq_change(sch, opt);
+ if (err)
+ return err;
}
for (i=0; i<SFQ_DEPTH; i++)
sfq_link(q, i);
- if (q->perturb_period) {
- q->perturb_timer.expires = jiffies + q->perturb_period;
- add_timer(&q->perturb_timer);
- }
MOD_INC_USE_COUNT;
return 0;
}
@@ -467,6+479,7 @@ struct Qdisc_ops sfq_qdisc_ops = sfq_init,
sfq_reset,
sfq_destroy,
+ NULL, /* sfq_change */
#ifdef CONFIG_RTNETLINK
sfq_dump,
@@ -114,6+114,7 @@ struct tbf_sched_data u32 limit; /* Maximal length of backlog: bytes */
u32 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */
u32 mtu;
+ u32 max_size;
struct qdisc_rate_table *R_tab;
struct qdisc_rate_table *P_tab;
@@ -132,6+133,8 @@ tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch) {
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
+ if (skb->len > q->max_size)
+ goto drop;
__skb_queue_tail(&sch->q, skb);
if ((sch->stats.backlog += skb->len) <= q->limit) {
sch->stats.bytes += skb->len;
@@ -145,6+148,8 @@ tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
__skb_unlink(skb, &sch->q);
sch->stats.backlog -= skb->len;
+
+drop:
sch->stats.drops++;
#ifdef CONFIG_NET_CLS_POLICE
if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch))
@@ -180,6+185,7 @@ static void tbf_watchdog(unsigned long arg) {
struct Qdisc *sch = (struct Qdisc*)arg;
+ sch->flags &= ~TCQ_F_THROTTLED;
qdisc_wakeup(sch->dev);
}
@@ -216,6+222,7 @@ tbf_dequeue(struct Qdisc* sch) q->tokens = toks;
q->ptokens = ptoks;
sch->stats.backlog -= skb->len;
+ sch->flags &= ~TCQ_F_THROTTLED;
return skb;
}
@@ -238,10+245,11 @@ tbf_dequeue(struct Qdisc* sch) Really, if we split the flow into independent
subflows, it would be a very good solution.
This is the main idea of all FQ algorithms
- (cf. CSZ, HPFQ, HFCS)
+ (cf. CSZ, HPFQ, HFSC)
*/
__skb_queue_head(&sch->q, skb);
+ sch->flags |= TCQ_F_THROTTLED;
sch->stats.overlimits++;
}
return NULL;
@@ -258,53+266,86 @@ tbf_reset(struct Qdisc* sch) PSCHED_GET_TIME(q->t_c);
q->tokens = q->buffer;
q->ptokens = q->mtu;
+ sch->flags &= ~TCQ_F_THROTTLED;
del_timer(&q->wd_timer);
}
-static int tbf_init(struct Qdisc* sch, struct rtattr *opt)
+static int tbf_change(struct Qdisc* sch, struct rtattr *opt)
{
+ int err = -EINVAL;
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
struct rtattr *tb[TCA_TBF_PTAB];
struct tc_tbf_qopt *qopt;
+ struct qdisc_rate_table *rtab = NULL;
+ struct qdisc_rate_table *ptab = NULL;
+ int max_size;
- MOD_INC_USE_COUNT;
-
- if (opt == NULL ||
- rtattr_parse(tb, TCA_TBF_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ||
+ if (rtattr_parse(tb, TCA_TBF_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ||
tb[TCA_TBF_PARMS-1] == NULL ||
- RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt)) {
- MOD_DEC_USE_COUNT;
- return -EINVAL;
- }
+ RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt))
+ goto done;
qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]);
- q->R_tab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]);
- if (q->R_tab == NULL) {
- MOD_DEC_USE_COUNT;
- return -EINVAL;
- }
+ rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]);
+ if (rtab == NULL)
+ goto done;
if (qopt->peakrate.rate) {
- q->P_tab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_PTAB-1]);
- if (q->P_tab == NULL) {
- MOD_DEC_USE_COUNT;
- qdisc_put_rtab(q->R_tab);
- return -EINVAL;
+ if (qopt->peakrate.rate > qopt->rate.rate)
+ ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]);
+ if (ptab == NULL)
+ goto done;
+ }
+
+ max_size = psched_mtu(sch->dev);
+ if (ptab) {
+ int n = max_size>>qopt->peakrate.cell_log;
+ while (n>0 && ptab->data[n-1] > qopt->mtu) {
+ max_size -= (1<<qopt->peakrate.cell_log);
+ n--;
}
}
+ if (rtab->data[max_size>>qopt->rate.cell_log] > qopt->buffer)
+ goto done;
- PSCHED_GET_TIME(q->t_c);
- init_timer(&q->wd_timer);
- q->wd_timer.function = tbf_watchdog;
- q->wd_timer.data = (unsigned long)sch;
+ start_bh_atomic();
q->limit = qopt->limit;
q->mtu = qopt->mtu;
- if (q->mtu == 0)
- q->mtu = psched_mtu(sch->dev);
+ q->max_size = max_size;
q->buffer = qopt->buffer;
q->tokens = q->buffer;
q->ptokens = q->mtu;
- return 0;
+ rtab = xchg(&q->R_tab, rtab);
+ ptab = xchg(&q->P_tab, ptab);
+ end_bh_atomic();
+ err = 0;
+done:
+ if (rtab)
+ qdisc_put_rtab(rtab);
+ if (ptab)
+ qdisc_put_rtab(ptab);
+ return err;
+}
+
+static int tbf_init(struct Qdisc* sch, struct rtattr *opt)
+{
+ int err;
+ struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ MOD_INC_USE_COUNT;
+
+ PSCHED_GET_TIME(q->t_c);
+ init_timer(&q->wd_timer);
+ q->wd_timer.function = tbf_watchdog;
+ q->wd_timer.data = (unsigned long)sch;
+
+ if ((err = tbf_change(sch, opt)) != 0) {
+ MOD_DEC_USE_COUNT;
+ }
+ return err;
}
static void tbf_destroy(struct Qdisc *sch)
@@ -328,10+369,10 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) unsigned char *b = skb->tail;
struct rtattr *rta;
struct tc_tbf_qopt opt;
-
+
rta = (struct rtattr*)b;
RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
-
+
opt.limit = q->limit;
opt.rate = q->R_tab->rate;
if (q->P_tab)
@@ -366,6+407,7 @@ struct Qdisc_ops tbf_qdisc_ops = tbf_init,
tbf_reset,
tbf_destroy,
+ tbf_change,
#ifdef CONFIG_RTNETLINK
tbf_dump,
@@ -444,6+444,7 @@ static struct teql_master the_master = { teql_qdisc_init,
teql_reset,
teql_destroy,
+ NULL,
},};
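
One pattern recurs in the scheduler hunks above: every positional Qdisc_ops initializer gains one slot after its destroy entry, filled either with a real reconfiguration handler (prio_tune, tbf_change, fifo_init) or with a NULL placeholder. The following self-contained userspace model, with invented names only (nothing here is kernel API), shows why the NULL entries are harmless as long as the caller tests the slot before dispatching:

    /* Model of an ops table with an optional "change" callback.
     * Names are illustrative, not taken from the kernel. */
    #include <stdio.h>

    struct demo_ops {
        int (*init)(int arg);
        int (*change)(int arg);   /* may be NULL: not reconfigurable in place */
    };

    static int demo_init(int arg)   { printf("init(%d)\n", arg); return 0; }
    static int demo_change(int arg) { printf("change(%d)\n", arg); return 0; }

    static struct demo_ops tunable = { demo_init, demo_change };
    static struct demo_ops fixed   = { demo_init, NULL /* no change handler */ };

    static int reconfigure(struct demo_ops *ops, int arg)
    {
        if (ops->change == NULL)
            return -1;            /* in this model, simply report failure */
        return ops->change(arg);
    }

    int main(void)
    {
        printf("tunable -> %d\n", reconfigure(&tunable, 3));
        printf("fixed   -> %d\n", reconfigure(&fixed, 3));
        return 0;
    }
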
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * Version: $Id: af_unix.c,v 1.73 1999/01/15 06:55:48 davem Exp $
+ * Version: $Id: af_unix.c,v 1.74 1999/03/21 05:23:16 davem Exp $
*
* Fixes:
* Linus Torvalds : Assorted bug cures.
* Lots of bug fixes.
 * Alexey Kuznetsov : Repaired (I hope) bugs introduced
 * by the above two patches.
 * Andrea Arcangeli : If possible we block in connect(2)
 * if the max backlog of the listen socket
 * has been reached. This won't break
 * old apps and it avoids hashing a huge
 * number of socks (done for unix_gc()
 * performance reasons).
 * Security fix that limits the max
 * number of socks to 2*max_files and
 * the number of skbs queueable in the
 * dgram receiver.
*
* Known differences from reference BSD that was tested:
*
int sysctl_unix_delete_delay = HZ;
int sysctl_unix_destroy_delay = 10*HZ;
+int sysctl_unix_max_dgram_qlen = 10;
unix_socket *unix_socket_table[UNIX_HASH_SIZE+1];
+static atomic_t unix_nr_socks = ATOMIC_INIT(0);
+static struct wait_queue * unix_ack_wqueue = NULL;
+static struct wait_queue * unix_dgram_wqueue = NULL;
#define unix_sockets_unbound (unix_socket_table[UNIX_HASH_SIZE])
@@ -263,6+277,8 @@ static void unix_destroy_timer(unsigned long data) unix_socket *sk=(unix_socket *)data;
if(!unix_locked(sk) && atomic_read(&sk->wmem_alloc) == 0)
{
+ atomic_dec(&unix_nr_socks);
+
sk_free(sk);
/* socket destroyed, decrement count */
@@ -295,6+311,11 @@ static int unix_release_sock (unix_socket *sk) sk->dead=1;
sk->socket = NULL;
+ if (sk->state == TCP_LISTEN)
+ wake_up_interruptible(&unix_ack_wqueue);
+ if (sk->type == SOCK_DGRAM)
+ wake_up_interruptible(&unix_dgram_wqueue);
+
skpair=unix_peer(sk);
if (skpair!=NULL)
@@ -347,6+368,8 @@ static void unix_destroy_socket(unix_socket *sk)
if(!unix_locked(sk) && atomic_read(&sk->wmem_alloc) == 0)
{
+ atomic_dec(&unix_nr_socks);
+
sk_free(sk);
/* socket destroyed, decrement count */
@@ -371,6+394,8 @@ static int unix_listen(struct socket *sock, int backlog) return -EOPNOTSUPP; /* Only stream sockets accept */
if (!sk->protinfo.af_unix.addr)
return -EINVAL; /* No listens on an unbound socket */
+ if ((unsigned) backlog > SOMAXCONN)
+ backlog = SOMAXCONN;
sk->max_ack_backlog=backlog;
sk->state=TCP_LISTEN;
sock->flags |= SO_ACCEPTCON;
@@ -388,6+413,9 @@ static struct sock * unix_create1(struct socket *sock, int stream) {
struct sock *sk;
+ if (atomic_read(&unix_nr_socks) >= 2*max_files)
+ return NULL;
+
MOD_INC_USE_COUNT;
sk = sk_alloc(PF_UNIX, GFP_KERNEL, 1);
if (!sk) {
@@ -395,6+423,8 @@ static struct sock * unix_create1(struct socket *sock, int stream) return NULL;
}
+ atomic_inc(&unix_nr_socks);
+
sock_init_data(sock,sk);
if (stream)
@@ -673,9+703,25 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, we will have to recheck all again in any case.
*/
+restart:
/* Find listening sock */
other=unix_find_other(sunaddr, addr_len, sk->type, hash, &err);
+ if (!other)
+ return -ECONNREFUSED;
+
+ while (other->ack_backlog >= other->max_ack_backlog) {
+ unix_unlock(other);
+ if (other->dead || other->state != TCP_LISTEN)
+ return -ECONNREFUSED;
+ if (flags & O_NONBLOCK)
+ return -EAGAIN;
+ interruptible_sleep_on(&unix_ack_wqueue);
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+ goto restart;
+ }
+
/* create new sock for complete connection */
newsk = unix_create1(NULL, 1);
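
The loop just added above is the connect(2) blocking described in the changelog: while the listener's ack_backlog has reached max_ack_backlog, a blocking connect sleeps on unix_ack_wqueue and a non-blocking one fails with EAGAIN. A minimal userspace sketch of the visible effect (not part of this patch; the path and the backlog value are arbitrary):

    /* Sketch only: fill a tiny listen backlog that is never accept()ed and
     * watch further non-blocking connects fail with EAGAIN. */
    #include <stdio.h>
    #include <string.h>
    #include <errno.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/socket.h>
    #include <sys/un.h>

    int main(void)
    {
        struct sockaddr_un a;
        int srv, c, i;

        memset(&a, 0, sizeof(a));
        a.sun_family = AF_UNIX;
        strcpy(a.sun_path, "/tmp/backlog-test");
        unlink(a.sun_path);

        srv = socket(AF_UNIX, SOCK_STREAM, 0);
        bind(srv, (struct sockaddr *)&a, sizeof(a));
        listen(srv, 1);                 /* backlog of one, never accepted */

        for (i = 0; i < 4; i++) {
            c = socket(AF_UNIX, SOCK_STREAM, 0);
            fcntl(c, F_SETFL, O_NONBLOCK);
            if (connect(c, (struct sockaddr *)&a, sizeof(a)) < 0)
                /* with this patch: EAGAIN once the backlog is full;
                 * a blocking socket would sleep here instead */
                printf("connect %d: %s\n", i, strerror(errno));
        }
        return 0;
    }
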
@@ -704,7+750,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
/* Check that listener is in valid state. */
err = -ECONNREFUSED;
- if (other == NULL || other->dead || other->state != TCP_LISTEN)
+ if (other->dead || other->state != TCP_LISTEN)
goto out;
err = -ENOMEM;
@@ -815,11+861,10 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags) continue;
}
tsk = skb->sk;
- sk->ack_backlog--;
+ if (sk->max_ack_backlog == sk->ack_backlog--)
+ wake_up_interruptible(&unix_ack_wqueue);
kfree_skb(skb);
- if (!tsk->dead)
- break;
- unix_release_sock(tsk);
+ break;
}
@@ -947,6+992,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, int len, * Check with 1003.1g - what should
* datagram error
*/
+ dead:
unix_unlock(other);
unix_peer(sk)=NULL;
other = NULL;
@@ -964,6+1010,29 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, int len, goto out_unlock;
}
+ while (skb_queue_len(&other->receive_queue) >=
+ sysctl_unix_max_dgram_qlen)
+ {
+ if (sock->file->f_flags & O_NONBLOCK)
+ {
+ err = -EAGAIN;
+ goto out_unlock;
+ }
+ interruptible_sleep_on(&unix_dgram_wqueue);
+ if (other->dead)
+ goto dead;
+ if (sk->shutdown & SEND_SHUTDOWN)
+ {
+ err = -EPIPE;
+ goto out_unlock;
+ }
+ if (signal_pending(current))
+ {
+ err = -ERESTARTSYS;
+ goto out_unlock;
+ }
+ }
+
skb_queue_tail(&other->receive_queue, skb);
other->data_ready(other,len);
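
Together with sysctl_unix_max_dgram_qlen (default 10) above, this loop bounds how many datagrams may sit unread in a receiver's queue: a blocking sender sleeps on unix_dgram_wqueue, a non-blocking one gets EAGAIN. A minimal userspace sketch of that limit (not part of this patch; sizes are arbitrary):

    /* Sketch only: with nothing reading fd[1], the eleventh one-byte
     * datagram should fail with EAGAIN under the default limit of 10. */
    #include <stdio.h>
    #include <string.h>
    #include <errno.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/socket.h>

    int main(void)
    {
        int fd[2], i;

        socketpair(AF_UNIX, SOCK_DGRAM, 0, fd);
        fcntl(fd[0], F_SETFL, O_NONBLOCK);

        for (i = 0; i < 12; i++) {
            if (write(fd[0], "x", 1) < 0) {
                printf("write %d: %s\n", i, strerror(errno));
                break;
            }
        }
        return 0;
    }
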
@@ -1126,6+1195,13 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, int size, if (!skb)
goto out;
+ /*
+ * sysctl_unix_max_dgram_qlen may change while senders are blocked in
+ * the waitqueue, so we must wake them up every time we shrink the
+ * receive queue. -arca
+ */
+ wake_up_interruptible(&unix_dgram_wqueue);
+
if (msg->msg_name)
{
msg->msg_namelen = sizeof(short);
extern int sysctl_unix_destroy_delay;
extern int sysctl_unix_delete_delay;
+extern int sysctl_unix_max_dgram_qlen;
ctl_table unix_table[] = {
{NET_UNIX_DESTROY_DELAY, "destroy_delay",
@@ -27,6+28,9 @@ ctl_table unix_table[] = { {NET_UNIX_DELETE_DELAY, "delete_delay",
&sysctl_unix_delete_delay, sizeof(int), 0644, NULL,
&proc_dointvec_jiffies},
+ {NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen",
+ &sysctl_unix_max_dgram_qlen, sizeof(int), 0600, NULL,
+ &proc_dointvec_jiffies},
{0}
};