X-Git-Url: https://repo.or.cz/davej-history.git/blobdiff_plain/0939a7a4e831adf18e76198bcfec0cea57e97bfe..9af6f6e4860e86507da2d470dd6a3bee34bf58c2:/net/unix/af_unix.c diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 6b6a081ca..d4bec3a5d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -8,6 +8,8 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * + * Version: $Id: af_unix.c,v 1.84 1999/09/08 03:47:18 davem Exp $ + * * Fixes: * Linus Torvalds : Assorted bug cures. * Niibe Yutaka : async I/O support. @@ -24,6 +26,26 @@ * Alan Cox : Started proper garbage collector * Heiko EiBfeldt : Missing verify_area check * Alan Cox : Started POSIXisms + * Andreas Schwab : Replace inode by dentry for proper + * reference counting + * Kirk Petersen : Made this a module + * Christoph Rohland : Elegant non-blocking accept/connect algorithm. + * Lots of bug fixes. + * Alexey Kuznetosv : Repaired (I hope) bugs introduces + * by above two patches. + * Andrea Arcangeli : If possible we block in connect(2) + * if the max backlog of the listen socket + * is been reached. This won't break + * old apps and it will avoid huge amount + * of socks hashed (this for unix_gc() + * performances reasons). + * Security fix that limits the max + * number of socks to 2*max_files and + * the number of skb queueable in the + * dgram receiver. + * Artur Skawina : Hash function optimizations + * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8) + * * * Known differences from reference BSD that was tested: * @@ -55,6 +77,7 @@ * with BSD names. */ +#include #include #include #include @@ -67,7 +90,6 @@ #include #include #include -#include #include #include #include @@ -81,24 +103,36 @@ #include #include #include +#include +#include +#include #include #define min(a,b) (((a)<(b))?(a):(b)) +int sysctl_unix_max_dgram_qlen = 10; unix_socket *unix_socket_table[UNIX_HASH_SIZE+1]; +rwlock_t unix_table_lock = RW_LOCK_UNLOCKED; +static atomic_t unix_nr_socks = ATOMIC_INIT(0); #define unix_sockets_unbound (unix_socket_table[UNIX_HASH_SIZE]) #define UNIX_ABSTRACT(sk) ((sk)->protinfo.af_unix.addr->hash!=UNIX_HASH_SIZE) +/* + SMP locking strategy. + * hash table is protceted with rwlock unix_table_lock + * each socket state is protected by separate rwlock. + + */ + extern __inline__ unsigned unix_hash_fold(unsigned hash) { hash ^= hash>>16; hash ^= hash>>8; - hash ^= hash>>4; - return hash; + return hash&(UNIX_HASH_SIZE-1); } #define unix_peer(sk) ((sk)->pair) @@ -110,34 +144,27 @@ extern __inline__ int unix_our_peer(unix_socket *sk, unix_socket *osk) extern __inline__ int unix_may_send(unix_socket *sk, unix_socket *osk) { - return (sk->type==osk->type); -} - -extern __inline__ void unix_lock(unix_socket *sk) -{ - sk->sock_readers++; + return (unix_peer(osk) == NULL || unix_our_peer(sk, osk)); } -extern __inline__ int unix_unlock(unix_socket *sk) +static __inline__ unix_socket * unix_peer_get(unix_socket *s) { - return sk->sock_readers--; -} + unix_socket *peer; -extern __inline__ int unix_locked(unix_socket *sk) -{ - return sk->sock_readers; + unix_state_rlock(s); + peer = unix_peer(s); + if (peer) + sock_hold(peer); + unix_state_runlock(s); + return peer; } extern __inline__ void unix_release_addr(struct unix_address *addr) { - if (addr) - { - if (atomic_dec_and_test(&addr->refcnt)) - kfree(addr); - } + if (atomic_dec_and_test(&addr->refcnt)) + kfree(addr); } - /* * Check unix socket name: * - should be not zero length. @@ -153,8 +180,16 @@ static int unix_mkname(struct sockaddr_un * sunaddr, int len, unsigned *hashp) return -EINVAL; if (sunaddr->sun_path[0]) { - if (len >= sizeof(*sunaddr)) - len = sizeof(*sunaddr)-1; + /* + * This may look like an off by one error but it is + * a bit more subtle. 108 is the longest valid AF_UNIX + * path for a binding. sun_path[108] doesnt as such + * exist. However in kernel space we are guaranteed that + * it is a valid memory location in our kernel + * address buffer. + */ + if (len > sizeof(*sunaddr)) + len = sizeof(*sunaddr); ((char *)sunaddr)[len]=0; len = strlen(sunaddr->sun_path)+1+sizeof(short); return len; @@ -164,241 +199,314 @@ static int unix_mkname(struct sockaddr_un * sunaddr, int len, unsigned *hashp) return len; } -static void unix_remove_socket(unix_socket *sk) +static void __unix_remove_socket(unix_socket *sk) { unix_socket **list = sk->protinfo.af_unix.list; - if (sk->next) - sk->next->prev = sk->prev; - if (sk->prev) - sk->prev->next = sk->next; - if (*list == sk) - *list = sk->next; - sk->protinfo.af_unix.list = NULL; - sk->prev = NULL; - sk->next = NULL; + if (list) { + if (sk->next) + sk->next->prev = sk->prev; + if (sk->prev) + sk->prev->next = sk->next; + if (*list == sk) + *list = sk->next; + sk->protinfo.af_unix.list = NULL; + sk->prev = NULL; + sk->next = NULL; + __sock_put(sk); + } } -static void unix_insert_socket(unix_socket *sk) +static void __unix_insert_socket(unix_socket **list, unix_socket *sk) { - unix_socket **list = sk->protinfo.af_unix.list; + BUG_TRAP(sk->protinfo.af_unix.list==NULL); + + sk->protinfo.af_unix.list = list; sk->prev = NULL; sk->next = *list; if (*list) (*list)->prev = sk; *list=sk; + sock_hold(sk); } -static unix_socket *unix_find_socket_byname(struct sockaddr_un *sunname, - int len, int type, unsigned hash) +static __inline__ void unix_remove_socket(unix_socket *sk) +{ + write_lock(&unix_table_lock); + __unix_remove_socket(sk); + write_unlock(&unix_table_lock); +} + +static __inline__ void unix_insert_socket(unix_socket **list, unix_socket *sk) +{ + write_lock(&unix_table_lock); + __unix_insert_socket(list, sk); + write_unlock(&unix_table_lock); +} + +static unix_socket *__unix_find_socket_byname(struct sockaddr_un *sunname, + int len, int type, unsigned hash) { unix_socket *s; - for (s=unix_socket_table[(hash^type)&0xF]; s; s=s->next) - { + for (s=unix_socket_table[hash^type]; s; s=s->next) { if(s->protinfo.af_unix.addr->len==len && - memcmp(s->protinfo.af_unix.addr->name, sunname, len) == 0 && - s->type == type) - { - unix_lock(s); - return(s); - } + memcmp(s->protinfo.af_unix.addr->name, sunname, len) == 0) + return s; } - return(NULL); + return NULL; +} + +static __inline__ unix_socket * +unix_find_socket_byname(struct sockaddr_un *sunname, + int len, int type, unsigned hash) +{ + unix_socket *s; + + read_lock(&unix_table_lock); + s = __unix_find_socket_byname(sunname, len, type, hash); + if (s) + sock_hold(s); + read_unlock(&unix_table_lock); + return s; } static unix_socket *unix_find_socket_byinode(struct inode *i) { unix_socket *s; - for (s=unix_socket_table[i->i_ino & 0xF]; s; s=s->next) + read_lock(&unix_table_lock); + for (s=unix_socket_table[i->i_ino & (UNIX_HASH_SIZE-1)]; s; s=s->next) { - if(s->protinfo.af_unix.inode==i) + struct dentry *dentry = s->protinfo.af_unix.dentry; + + if(dentry && dentry->d_inode == i) { - unix_lock(s); - return(s); + sock_hold(s); + break; } } - return(NULL); + read_unlock(&unix_table_lock); + return s; } -/* - * Delete a unix socket. We have to allow for deferring this on a timer. - */ +static __inline__ int unix_writable(struct sock *sk) +{ + return ((atomic_read(&sk->wmem_alloc)<<2) <= sk->sndbuf); +} -static void unix_destroy_timer(unsigned long data) +static void unix_write_space(struct sock *sk) { - unix_socket *sk=(unix_socket *)data; - if(!unix_locked(sk) && sk->wmem_alloc==0) - { - unix_release_addr(sk->protinfo.af_unix.addr); - sk_free(sk); - return; + read_lock(&sk->callback_lock); + if (!sk->dead && unix_writable(sk)) { + wake_up_interruptible(sk->sleep); + sock_wake_async(sk->socket, 2); } - - /* - * Retry; - */ - - sk->timer.expires=jiffies+10*HZ; /* No real hurry try it every 10 seconds or so */ - add_timer(&sk->timer); + read_unlock(&sk->callback_lock); } - - -static void unix_delayed_delete(unix_socket *sk) + +static void unix_sock_destructor(struct sock *sk) { - sk->timer.data=(unsigned long)sk; - sk->timer.expires=jiffies+HZ; /* Normally 1 second after will clean up. After that we try every 10 */ - sk->timer.function=unix_destroy_timer; - add_timer(&sk->timer); + skb_queue_purge(&sk->receive_queue); + + BUG_TRAP(atomic_read(&sk->wmem_alloc) == 0); + BUG_TRAP(sk->protinfo.af_unix.list==NULL); + BUG_TRAP(sk->socket==NULL); + if (sk->dead==0) { + printk("Attempt to release alive unix socket: %p\n", sk); + return; + } + + if (sk->protinfo.af_unix.addr) + unix_release_addr(sk->protinfo.af_unix.addr); + + atomic_dec(&unix_nr_socks); +#ifdef UNIX_REFCNT_DEBUG + printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, atomic_read(&unix_nr_socks)); +#endif + MOD_DEC_USE_COUNT; } - -static void unix_destroy_socket(unix_socket *sk) + +static int unix_release_sock (unix_socket *sk, int embrion) { + struct dentry *dentry; + unix_socket *skpair; struct sk_buff *skb; + int state; unix_remove_socket(sk); - - while((skb=skb_dequeue(&sk->receive_queue))!=NULL) - { - if(sk->state==TCP_LISTEN) - { - unix_socket *osk=skb->sk; - osk->state=TCP_CLOSE; - kfree_skb(skb, FREE_WRITE); /* Now surplus - free the skb first before the socket */ - osk->state_change(osk); /* So the connect wakes and cleans up (if any) */ - /* osk will be destroyed when it gets to close or the timer fires */ - } - else - { - /* passed fds are erased in the kfree_skb hook */ - kfree_skb(skb,FREE_WRITE); + + /* Clear state */ + unix_state_wlock(sk); + write_lock(&sk->callback_lock); + sk->dead = 1; + sk->socket = NULL; + write_unlock(&sk->callback_lock); + sk->shutdown = SHUTDOWN_MASK; + dentry = sk->protinfo.af_unix.dentry; + sk->protinfo.af_unix.dentry=NULL; + state = sk->state; + sk->state = TCP_CLOSE; + unix_state_wunlock(sk); + + wake_up_interruptible(sk->sleep); + wake_up_interruptible(&sk->protinfo.af_unix.peer_wait); + + skpair=unix_peer(sk); + + if (skpair!=NULL) { + if (sk->type==SOCK_STREAM) { + unix_state_wlock(skpair); + skpair->shutdown=SHUTDOWN_MASK; /* No more writes*/ + if (!skb_queue_empty(&sk->receive_queue) || embrion) + skpair->err = ECONNRESET; + unix_state_wunlock(skpair); + sk->data_ready(skpair,0); } + sock_put(skpair); /* It may now die */ + unix_peer(sk) = NULL; } - - if(sk->protinfo.af_unix.inode!=NULL) - { - iput(sk->protinfo.af_unix.inode); - sk->protinfo.af_unix.inode=NULL; - } - - if(!unix_unlock(sk) && sk->wmem_alloc==0) + + /* Try to flush out this socket. Throw out buffers at least */ + + while((skb=skb_dequeue(&sk->receive_queue))!=NULL) { - unix_release_addr(sk->protinfo.af_unix.addr); - sk_free(sk); + if (state==TCP_LISTEN) + unix_release_sock(skb->sk, 1); + /* passed fds are erased in the kfree_skb hook */ + kfree_skb(skb); } - else - { - sk->dead=1; - unix_delayed_delete(sk); /* Try every so often until buffers are all freed */ + + if (dentry) { + lock_kernel(); + dput(dentry); + unlock_kernel(); } + + sock_put(sk); + + /* ---- Socket is dead now and most probably destroyed ---- */ + + /* + * Fixme: BSD difference: In BSD all sockets connected to use get + * ECONNRESET and we die on the spot. In Linux we behave + * like files and pipes do and wait for the last + * dereference. + * + * Can't we simply set sock->err? + * + * What the above comment does talk about? --ANK(980817) + */ + + if (atomic_read(&unix_tot_inflight)) + unix_gc(); /* Garbage collect fds */ + + return 0; } static int unix_listen(struct socket *sock, int backlog) { + int err; struct sock *sk = sock->sk; - if (sock->state != SS_UNCONNECTED) - return(-EINVAL); + err = -EOPNOTSUPP; if (sock->type!=SOCK_STREAM) - return -EOPNOTSUPP; /* Only stream sockets accept */ + goto out; /* Only stream sockets accept */ + err = -EINVAL; if (!sk->protinfo.af_unix.addr) - return -EINVAL; /* No listens on an unbound socket */ + goto out; /* No listens on an unbound socket */ + unix_state_wlock(sk); + if (sk->state != TCP_CLOSE && sk->state != TCP_LISTEN) + goto out_unlock; + if (backlog > sk->max_ack_backlog) + wake_up_interruptible(&sk->protinfo.af_unix.peer_wait); sk->max_ack_backlog=backlog; - if (sk->ack_backlog < backlog) - sk->state_change(sk); sk->state=TCP_LISTEN; sock->flags |= SO_ACCEPTCON; - return 0; + /* set credentials so connect can copy them */ + sk->peercred.pid = current->pid; + sk->peercred.uid = current->euid; + sk->peercred.gid = current->egid; + err = 0; + +out_unlock: + unix_state_wunlock(sk); +out: + return err; } extern struct proto_ops unix_stream_ops; extern struct proto_ops unix_dgram_ops; -static int unix_create(struct socket *sock, int protocol) +static struct sock * unix_create1(struct socket *sock) { struct sock *sk; - sock->state = SS_UNCONNECTED; + if (atomic_read(&unix_nr_socks) >= 2*max_files) + return NULL; + + MOD_INC_USE_COUNT; + sk = sk_alloc(PF_UNIX, GFP_KERNEL, 1); + if (!sk) { + MOD_DEC_USE_COUNT; + return NULL; + } + + atomic_inc(&unix_nr_socks); + sock_init_data(sock,sk); + + sk->write_space = unix_write_space; + + sk->max_ack_backlog = sysctl_unix_max_dgram_qlen; + sk->destruct = unix_sock_destructor; + sk->protinfo.af_unix.dentry=NULL; + sk->protinfo.af_unix.lock = RW_LOCK_UNLOCKED; + atomic_set(&sk->protinfo.af_unix.inflight, 0); + init_MUTEX(&sk->protinfo.af_unix.readsem);/* single task reading lock */ + init_waitqueue_head(&sk->protinfo.af_unix.peer_wait); + sk->protinfo.af_unix.list=NULL; + unix_insert_socket(&unix_sockets_unbound, sk); + + return sk; +} + +static int unix_create(struct socket *sock, int protocol) +{ if (protocol && protocol != PF_UNIX) return -EPROTONOSUPPORT; - switch (sock->type) - { - case SOCK_STREAM: - sock->ops = &unix_stream_ops; - break; + sock->state = SS_UNCONNECTED; + + switch (sock->type) { + case SOCK_STREAM: + sock->ops = &unix_stream_ops; + break; /* * Believe it or not BSD has AF_UNIX, SOCK_RAW though * nothing uses it. */ - case SOCK_RAW: - sock->type=SOCK_DGRAM; - case SOCK_DGRAM: - sock->ops = &unix_dgram_ops; - break; - default: - return -ESOCKTNOSUPPORT; + case SOCK_RAW: + sock->type=SOCK_DGRAM; + case SOCK_DGRAM: + sock->ops = &unix_dgram_ops; + break; + default: + return -ESOCKTNOSUPPORT; } - sk = sk_alloc(GFP_KERNEL); - if (!sk) - return -ENOMEM; - sock_init_data(sock,sk); - - sk->protinfo.af_unix.family=AF_UNIX; - sk->protinfo.af_unix.inode=NULL; - sk->sock_readers=1; /* Us */ - sk->protinfo.af_unix.readsem=MUTEX; /* single task reading lock */ - sk->mtu=4096; - sk->protinfo.af_unix.list=&unix_sockets_unbound; - unix_insert_socket(sk); - return 0; -} - -static int unix_dup(struct socket *newsock, struct socket *oldsock) -{ - return unix_create(newsock, 0); + return unix_create1(sock) ? 0 : -ENOMEM; } -static int unix_release(struct socket *sock, struct socket *peer) +static int unix_release(struct socket *sock) { unix_socket *sk = sock->sk; - unix_socket *skpair; if (!sk) return 0; - - if (sock->state != SS_UNCONNECTED) - sock->state = SS_DISCONNECTING; - sk->state_change(sk); - sk->dead=1; - skpair=unix_peer(sk); - if (sock->type==SOCK_STREAM && skpair) - { - if (unix_our_peer(sk, skpair)) - skpair->shutdown=SHUTDOWN_MASK; /* No more writes */ - if (skpair->state!=TCP_LISTEN) - skpair->state_change(skpair); /* Wake any blocked writes */ - } - if (skpair!=NULL) - unix_unlock(skpair); /* It may now die */ - unix_peer(sk)=NULL; /* No pair */ - unix_destroy_socket(sk); /* Try to flush out this socket. Throw out buffers at least */ - unix_gc(); /* Garbage collect fds */ + sock->sk = NULL; - /* - * FIXME: BSD difference: In BSD all sockets connected to use get ECONNRESET and we die on the spot. In - * Linux we behave like files and pipes do and wait for the last dereference. - */ - if (sk->socket) - { - sk->socket = NULL; - sock->sk = NULL; - } - - return 0; + return unix_release_sock (sk, 0); } static int unix_autobind(struct socket *sock) @@ -406,64 +514,78 @@ static int unix_autobind(struct socket *sock) struct sock *sk = sock->sk; static u32 ordernum = 1; struct unix_address * addr; - unix_socket *osk; + int err; + + down(&sk->protinfo.af_unix.readsem); + + err = 0; + if (sk->protinfo.af_unix.addr) + goto out; + err = -ENOMEM; addr = kmalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL); if (!addr) - return -ENOBUFS; - if (sk->protinfo.af_unix.addr || sk->protinfo.af_unix.inode) - { - kfree(addr); - return -EINVAL; - } + goto out; + memset(addr, 0, sizeof(*addr) + sizeof(short) + 16); addr->name->sun_family = AF_UNIX; - addr->refcnt = 1; + atomic_set(&addr->refcnt, 1); retry: - addr->len = sprintf(addr->name->sun_path+1, "%08x", ordernum) + 1 + sizeof(short); + addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short); addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0)); - ordernum++; - if ((osk=unix_find_socket_byname(addr->name, addr->len, sock->type, - addr->hash)) != NULL) - { - unix_unlock(osk); + write_lock(&unix_table_lock); + ordernum = (ordernum+1)&0xFFFFF; + + if (__unix_find_socket_byname(addr->name, addr->len, sock->type, + addr->hash)) { + write_unlock(&unix_table_lock); + /* Sanity yield. It is unusual case, but yet... */ + if (!(ordernum&0xFF)) { + current->policy |= SCHED_YIELD; + schedule(); + } goto retry; } + addr->hash ^= sk->type; + __unix_remove_socket(sk); sk->protinfo.af_unix.addr = addr; - unix_remove_socket(sk); - sk->protinfo.af_unix.list = &unix_socket_table[(addr->hash ^ sk->type)&0xF]; - unix_insert_socket(sk); - return 0; + __unix_insert_socket(&unix_socket_table[addr->hash], sk); + write_unlock(&unix_table_lock); + err = 0; + +out: + up(&sk->protinfo.af_unix.readsem); + return err; } static unix_socket *unix_find_other(struct sockaddr_un *sunname, int len, int type, unsigned hash, int *error) { - int old_fs; - int err; - struct inode *inode; unix_socket *u; if (sunname->sun_path[0]) { - old_fs=get_fs(); - set_fs(get_ds()); - err = open_namei(sunname->sun_path, 2, S_IFSOCK, &inode, NULL); - set_fs(old_fs); - if(err<0) - { - *error=err; + struct dentry *dentry; + + /* Do not believe to VFS, grab kernel lock */ + lock_kernel(); + dentry = open_namei(sunname->sun_path, 2, S_IFSOCK); + if (IS_ERR(dentry)) { + *error = PTR_ERR(dentry); + unlock_kernel(); return NULL; } - u=unix_find_socket_byinode(inode); - iput(inode); + u=unix_find_socket_byinode(dentry->d_inode); + dput(dentry); + unlock_kernel(); + if (u && u->type != type) { *error=-EPROTOTYPE; - unix_unlock(u); + sock_put(u); return NULL; } } @@ -483,84 +605,85 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr; - struct inode * inode; - int old_fs; + struct dentry * dentry = NULL; int err; unsigned hash; struct unix_address *addr; - - if (sk->protinfo.af_unix.addr || sk->protinfo.af_unix.inode || - sunaddr->sun_family != AF_UNIX) - return -EINVAL; + unix_socket **list; + + err = -EINVAL; + if (sunaddr->sun_family != AF_UNIX) + goto out; + + if (addr_len==sizeof(short)) { + err = unix_autobind(sock); + goto out; + } - if (addr_len==sizeof(short)) - return unix_autobind(sock); + err = unix_mkname(sunaddr, addr_len, &hash); + if (err < 0) + goto out; + addr_len = err; - addr_len = unix_mkname(sunaddr, addr_len, &hash); - if (addr_len < 0) - return addr_len; + down(&sk->protinfo.af_unix.readsem); + + err = -EINVAL; + if (sk->protinfo.af_unix.addr) + goto out_up; + err = -ENOMEM; addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); if (!addr) - return -ENOBUFS; + goto out_up; - /* We sleeped; recheck ... */ + memcpy(addr->name, sunaddr, addr_len); + addr->len = addr_len; + addr->hash = hash^sk->type; + atomic_set(&addr->refcnt, 1); + + if (sunaddr->sun_path[0]) { + lock_kernel(); + dentry = do_mknod(sunaddr->sun_path, S_IFSOCK|sock->inode->i_mode, 0); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + unlock_kernel(); + if (err==-EEXIST) + err=-EADDRINUSE; + unix_release_addr(addr); + goto out_up; + } + unlock_kernel(); - if (sk->protinfo.af_unix.addr || sk->protinfo.af_unix.inode) - { - kfree(addr); - return -EINVAL; /* Already bound */ + addr->hash = UNIX_HASH_SIZE; } - memcpy(addr->name, sunaddr, addr_len); - addr->len = addr_len; - addr->hash = hash; - addr->refcnt = 1; + write_lock(&unix_table_lock); - if (!sunaddr->sun_path[0]) - { - unix_socket *osk = unix_find_socket_byname(sunaddr, addr_len, - sk->type, hash); - if (osk) - { - unix_unlock(osk); - kfree(addr); - return -EADDRINUSE; + if (!sunaddr->sun_path[0]) { + err = -EADDRINUSE; + if (__unix_find_socket_byname(sunaddr, addr_len, + sk->type, hash)) { + unix_release_addr(addr); + goto out_unlock; } - unix_remove_socket(sk); - sk->protinfo.af_unix.addr = addr; - sk->protinfo.af_unix.list = &unix_socket_table[(hash^sk->type)&0xF]; - unix_insert_socket(sk); - return 0; + + list = &unix_socket_table[addr->hash]; + } else { + list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)]; + sk->protinfo.af_unix.dentry = dentry; } - addr->hash = UNIX_HASH_SIZE; + err = 0; + __unix_remove_socket(sk); sk->protinfo.af_unix.addr = addr; - - old_fs=get_fs(); - set_fs(get_ds()); + __unix_insert_socket(list, sk); - err=do_mknod(sunaddr->sun_path, S_IFSOCK|S_IRWXUGO, 0); - if (!err) - err=open_namei(sunaddr->sun_path, 2, S_IFSOCK, &inode, NULL); - - set_fs(old_fs); - - if(err<0) - { - unix_release_addr(addr); - sk->protinfo.af_unix.addr = NULL; - if (err==-EEXIST) - return -EADDRINUSE; - else - return err; - } - unix_remove_socket(sk); - sk->protinfo.af_unix.list = &unix_socket_table[inode->i_ino & 0xF]; - sk->protinfo.af_unix.inode = inode; - unix_insert_socket(sk); - - return 0; +out_unlock: + write_unlock(&unix_table_lock); +out_up: + up(&sk->protinfo.af_unix.readsem); +out: + return err; } static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, @@ -572,179 +695,233 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, unsigned hash; int err; - /* - * 1003.1g breaking connected state with AF_UNSPEC - */ + if (addr->sa_family != AF_UNSPEC) { + err = unix_mkname(sunaddr, alen, &hash); + if (err < 0) + goto out; + alen = err; - if(addr->sa_family==AF_UNSPEC) - { - if(unix_peer(sk)) - { - unix_unlock(unix_peer(sk)); - unix_peer(sk) = NULL; - sock->state=SS_UNCONNECTED; - } - return 0; - } - - alen = unix_mkname(sunaddr, alen, &hash); - if (alen < 0) - return alen; + if (sock->passcred && !sk->protinfo.af_unix.addr && + (err = unix_autobind(sock)) != 0) + goto out; - other=unix_find_other(sunaddr, alen, sock->type, hash, &err); - if (!other) - return err; - if (!unix_may_send(sk, other)) - { - unix_unlock(other); - return -EINVAL; + other=unix_find_other(sunaddr, alen, sock->type, hash, &err); + if (!other) + goto out; + + unix_state_wlock(sk); + + err = -EPERM; + if (!unix_may_send(sk, other)) + goto out_unlock; + } else { + /* + * 1003.1g breaking connected state with AF_UNSPEC + */ + other = NULL; + unix_state_wlock(sk); } /* * If it was connected, reconnect. */ - if (unix_peer(sk)) - { - unix_unlock(unix_peer(sk)); + if (unix_peer(sk)) { + sock_put(unix_peer(sk)); unix_peer(sk)=NULL; } unix_peer(sk)=other; - if (sock->passcred && !sk->protinfo.af_unix.addr) - unix_autobind(sock); + unix_state_wunlock(sk); return 0; + +out_unlock: + unix_state_wunlock(sk); + sock_put(other); +out: + return err; } -static int unix_stream_connect1(struct socket *sock, struct msghdr *msg, - int len, struct unix_skb_parms *cmsg, int nonblock) +static void unix_wait_for_peer(unix_socket *other) { - struct sockaddr_un *sunaddr=(struct sockaddr_un *)msg->msg_name; + int sched; + DECLARE_WAITQUEUE(wait, current); + + __set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&other->protinfo.af_unix.peer_wait, &wait); + + sched = (!other->dead && + !(other->shutdown&RCV_SHUTDOWN) && + !signal_pending(current) && + skb_queue_len(&other->receive_queue) >= other->max_ack_backlog); + + unix_state_runlock(other); + + if (sched) + schedule(); + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&other->protinfo.af_unix.peer_wait, &wait); +} + +static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr; struct sock *sk = sock->sk; - unix_socket *other; - struct sk_buff *skb; - int err; + struct sock *newsk = NULL; + unix_socket *other = NULL; + struct sk_buff *skb = NULL; unsigned hash; - int addr_len; + int st; + int err; - addr_len = unix_mkname(sunaddr, msg->msg_namelen, &hash); - if (addr_len < 0) - return addr_len; + err = unix_mkname(sunaddr, addr_len, &hash); + if (err < 0) + goto out; + addr_len = err; - switch (sock->state) - { - case SS_UNCONNECTED: - /* This is ok... continue with connect */ - break; - case SS_CONNECTED: - /* Socket is already connected */ - return -EISCONN; - case SS_CONNECTING: - /* Not yet connected... we will check this. */ - break; - default: - return(-EINVAL); + if (sock->passcred && !sk->protinfo.af_unix.addr && + (err = unix_autobind(sock)) != 0) + goto out; + + /* First of all allocate resources. + If we will make it after state is locked, + we will have to recheck all again in any case. + */ + + err = -ENOMEM; + + /* create new sock for complete connection */ + newsk = unix_create1(NULL); + if (newsk == NULL) + goto out; + + /* Allocate skb for sending to listening sock */ + skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); + if (skb == NULL) + goto out; + +restart: + /* Find listening sock. */ + other=unix_find_other(sunaddr, addr_len, sk->type, hash, &err); + if (!other) + goto out; + + /* Latch state of peer */ + unix_state_rlock(other); + + /* Apparently VFS overslept socket death. Retry. */ + if (other->dead) { + unix_state_runlock(other); + sock_put(other); + goto restart; } + err = -ECONNREFUSED; + if (other->state != TCP_LISTEN) + goto out_unlock; - if (unix_peer(sk)) - { - if (sock->state==SS_CONNECTING && sk->state==TCP_ESTABLISHED) - { - sock->state=SS_CONNECTED; - if (!sk->protinfo.af_unix.addr) - unix_autobind(sock); - return 0; - } - if (sock->state==SS_CONNECTING && sk->state == TCP_CLOSE) - { - sock->state=SS_UNCONNECTED; - return -ECONNREFUSED; - } - if (sock->state!=SS_CONNECTING) - return -EISCONN; - if (nonblock) - return -EALREADY; - /* - * Drop through the connect up logic to the wait. - */ + if (skb_queue_len(&other->receive_queue) >= other->max_ack_backlog) { + err = -EAGAIN; + if (flags & O_NONBLOCK) + goto out_unlock; + + unix_wait_for_peer(other); + + err = -ERESTARTSYS; + if (signal_pending(current)) + goto out; + sock_put(other); + goto restart; + } + + /* Latch our state. + + It is tricky place. We need to grab write lock and cannot + drop lock on peer. It is dangerous because deadlock is + possible. Connect to self case and simultaneous + attempt to connect are eliminated by checking socket + state. other is TCP_LISTEN, if sk is TCP_LISTEN we + check this before attempt to grab lock. + + Well, and we have to recheck the state after socket locked. + */ + st = sk->state; + + switch (st) { + case TCP_CLOSE: + /* This is ok... continue with connect */ + break; + case TCP_ESTABLISHED: + /* Socket is already connected */ + err = -EISCONN; + goto out_unlock; + default: + err = -EINVAL; + goto out_unlock; } - if (sock->state==SS_UNCONNECTED) - { - /* - * Now ready to connect - */ - - skb=sock_alloc_send_skb(sk, len, 0, nonblock, &err); /* Marker object */ - if(skb==NULL) - return err; - memcpy(&UNIXCB(skb), cmsg, sizeof(*cmsg)); - if (len) - memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); - sk->state=TCP_CLOSE; - other=unix_find_other(sunaddr, addr_len, sk->type, hash, &err); - if(other==NULL) - { - kfree_skb(skb, FREE_WRITE); - return err; - } - other->ack_backlog++; - unix_peer(sk)=other; - skb_queue_tail(&other->receive_queue,skb); - sk->state=TCP_SYN_SENT; - sock->state=SS_CONNECTING; - other->data_ready(other,0); /* Wake up ! */ + unix_state_wlock(sk); + + if (sk->state != st) { + unix_state_wunlock(sk); + unix_state_runlock(other); + sock_put(other); + goto restart; } - - - /* Wait for an accept */ - - while(sk->state==TCP_SYN_SENT) + + /* The way is open! Fastly set all the necessary fields... */ + + sock_hold(sk); + unix_peer(newsk)=sk; + newsk->state=TCP_ESTABLISHED; + newsk->type=SOCK_STREAM; + newsk->peercred.pid = current->pid; + newsk->peercred.uid = current->euid; + newsk->peercred.gid = current->egid; + newsk->sleep = &newsk->protinfo.af_unix.peer_wait; + + /* copy address information from listening to new sock*/ + if (other->protinfo.af_unix.addr) { - if(nonblock) - return -EINPROGRESS; - interruptible_sleep_on(sk->sleep); - if(current->signal & ~current->blocked) - return -ERESTARTSYS; + atomic_inc(&other->protinfo.af_unix.addr->refcnt); + newsk->protinfo.af_unix.addr=other->protinfo.af_unix.addr; } - - /* - * Has the other end closed on us ? - */ - - if(sk->state==TCP_CLOSE) - { - unix_unlock(unix_peer(sk)); - unix_peer(sk)=NULL; - sock->state=SS_UNCONNECTED; - return -ECONNREFUSED; + if (other->protinfo.af_unix.dentry) { + /* Damn, even dget is not SMP safe. It becomes ridiculous... */ + lock_kernel(); + newsk->protinfo.af_unix.dentry=dget(other->protinfo.af_unix.dentry); + unlock_kernel(); } - - /* - * Amazingly it has worked - */ - + + /* Set credentials */ + sk->peercred = other->peercred; + + sock_hold(newsk); + unix_peer(sk)=newsk; sock->state=SS_CONNECTED; - if (!sk->protinfo.af_unix.addr) - unix_autobind(sock); - return 0; -} + sk->state=TCP_ESTABLISHED; + unix_state_wunlock(sk); -static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, - int addr_len, int flags) -{ - struct msghdr msg; - struct unix_skb_parms cmsg; - - msg.msg_name = uaddr; - msg.msg_namelen = addr_len; - cmsg.fp = NULL; - cmsg.attr = MSG_SYN; - cmsg.creds.pid = current->pid; - cmsg.creds.uid = current->euid; - cmsg.creds.gid = current->egid; - - return unix_stream_connect1(sock, &msg, 0, &cmsg, flags&O_NONBLOCK); + /* take ten and and send info to listening sock */ + skb_queue_tail(&other->receive_queue,skb); + unix_state_runlock(other); + other->data_ready(other, 0); + sock_put(other); + return 0; + +out_unlock: + if (other) + unix_state_runlock(other); + +out: + if (skb) + kfree_skb(skb); + if (newsk) + unix_release_sock(newsk, 0); + if (other) + sock_put(other); + return err; } static int unix_socketpair(struct socket *socka, struct socket *sockb) @@ -752,8 +929,8 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb) struct sock *ska=socka->sk, *skb = sockb->sk; /* Join our sockets back to back */ - unix_lock(ska); - unix_lock(skb); + sock_hold(ska); + sock_hold(skb); unix_peer(ska)=skb; unix_peer(skb)=ska; @@ -770,70 +947,42 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb) static int unix_accept(struct socket *sock, struct socket *newsock, int flags) { unix_socket *sk = sock->sk; - unix_socket *newsk = newsock->sk; unix_socket *tsk; struct sk_buff *skb; - - if (sock->state != SS_UNCONNECTED) - return(-EINVAL); - if (!(sock->flags & SO_ACCEPTCON)) - return(-EINVAL); + int err; + err = -EOPNOTSUPP; if (sock->type!=SOCK_STREAM) - return -EOPNOTSUPP; + goto out; + + err = -EINVAL; if (sk->state!=TCP_LISTEN) - return -EINVAL; - - if (sk->protinfo.af_unix.addr) - { - atomic_inc(&sk->protinfo.af_unix.addr->refcnt); - newsk->protinfo.af_unix.addr=sk->protinfo.af_unix.addr; - } - if (sk->protinfo.af_unix.inode) - { - sk->protinfo.af_unix.inode->i_count++; - newsk->protinfo.af_unix.inode=sk->protinfo.af_unix.inode; - } - - for (;;) - { - skb=skb_dequeue(&sk->receive_queue); - if(skb==NULL) - { - if(flags&O_NONBLOCK) - return -EAGAIN; - interruptible_sleep_on(sk->sleep); - if(current->signal & ~current->blocked) - return -ERESTARTSYS; - continue; - } - if (!(UNIXCB(skb).attr & MSG_SYN)) - { - tsk=skb->sk; - tsk->state_change(tsk); - kfree_skb(skb, FREE_WRITE); - continue; - } - break; - } + goto out; - tsk=skb->sk; - sk->ack_backlog--; - unix_peer(newsk)=tsk; - unix_peer(tsk)=newsk; - tsk->state=TCP_ESTABLISHED; - newsk->state=TCP_ESTABLISHED; - memcpy(&newsk->peercred, UNIXCREDS(skb), sizeof(struct ucred)); - tsk->peercred.pid = current->pid; - tsk->peercred.uid = current->euid; - tsk->peercred.gid = current->egid; - unix_lock(newsk); /* Swap lock over */ - unix_unlock(sk); /* Locked to child socket not master */ - unix_lock(tsk); /* Back lock */ - kfree_skb(skb, FREE_WRITE); /* The buffer is just used as a tag */ - tsk->state_change(tsk); /* Wake up any sleeping connect */ - sock_wake_async(tsk->socket, 0); + /* If socket state is TCP_LISTEN it cannot change, + so that no locks are necessary. + */ + + skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err); + if (!skb) + goto out; + + tsk = skb->sk; + if (skb_queue_len(&sk->receive_queue) <= sk->max_ack_backlog/2) + wake_up_interruptible(&sk->protinfo.af_unix.peer_wait); + skb_free_datagram(sk, skb); + + /* attach accepted sock to socket */ + unix_state_wlock(tsk); + newsock->state = SS_CONNECTED; + newsock->sk = tsk; + tsk->sleep = &newsock->wait; + tsk->socket = newsock; + unix_state_wunlock(tsk); return 0; + +out: + return err; } @@ -841,23 +990,34 @@ static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_ { struct sock *sk = sock->sk; struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr; - - if (peer) - { - if (!unix_peer(sk)) - return -ENOTCONN; - sk=unix_peer(sk); + int err = 0; + + if (peer) { + sk = unix_peer_get(sk); + + err = -ENOTCONN; + if (!sk) + goto out; + err = 0; + } else { + sock_hold(sk); } - if (!sk->protinfo.af_unix.addr) - { + + unix_state_rlock(sk); + if (!sk->protinfo.af_unix.addr) { sunaddr->sun_family = AF_UNIX; sunaddr->sun_path[0] = 0; *uaddr_len = sizeof(short); - return 0; /* Not bound */ + } else { + struct unix_address *addr = sk->protinfo.af_unix.addr; + + *uaddr_len = addr->len; + memcpy(sunaddr, addr->name, *uaddr_len); } - *uaddr_len = sk->protinfo.af_unix.addr->len; - memcpy(sunaddr, sk->protinfo.af_unix.addr->name, *uaddr_len); - return 0; + unix_state_runlock(sk); + sock_put(sk); +out: + return err; } static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) @@ -877,7 +1037,11 @@ static void unix_destruct_fds(struct sk_buff *skb) struct scm_cookie scm; memset(&scm, 0, sizeof(scm)); unix_detach_fds(&scm, skb); + + /* Alas, it calls VFS */ + lock_kernel(); scm_destroy(&scm); + unlock_kernel(); sock_wfree(skb); } @@ -891,7 +1055,6 @@ static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) scm->fp = NULL; } - /* * Send AF_UNIX data. */ @@ -900,82 +1063,130 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, int len, struct scm_cookie *scm) { struct sock *sk = sock->sk; - unix_socket *other; struct sockaddr_un *sunaddr=msg->msg_name; + unix_socket *other = NULL; int namelen = 0; /* fake GCC */ int err; unsigned hash; struct sk_buff *skb; + err = -EOPNOTSUPP; if (msg->msg_flags&MSG_OOB) - return -EOPNOTSUPP; + goto out; - if (msg->msg_flags&~MSG_DONTWAIT) - return -EINVAL; + err = -EINVAL; + if (msg->msg_flags&~(MSG_DONTWAIT|MSG_NOSIGNAL)) + goto out; if (msg->msg_namelen) { - namelen = unix_mkname(sunaddr, msg->msg_namelen, &hash); - if (namelen < 0) - return namelen; + err = unix_mkname(sunaddr, msg->msg_namelen, &hash); + if (err < 0) + goto out; + namelen = err; } else { sunaddr = NULL; - if (!unix_peer(sk)) - return -ENOTCONN; + err = -ENOTCONN; + other = unix_peer_get(sk); + if (!other) + goto out; } - if (sock->passcred && !sk->protinfo.af_unix.addr) - unix_autobind(sock); + if (sock->passcred && !sk->protinfo.af_unix.addr && + (err = unix_autobind(sock)) != 0) + goto out; skb = sock_alloc_send_skb(sk, len, 0, msg->msg_flags&MSG_DONTWAIT, &err); - if (skb==NULL) - return err; + goto out; memcpy(UNIXCREDS(skb), &scm->creds, sizeof(struct ucred)); - UNIXCB(skb).attr = msg->msg_flags; if (scm->fp) unix_attach_fds(scm, skb); - memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); + skb->h.raw = skb->data; + err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); + if (err) + goto out_free; - other = unix_peer(sk); - if (other && other->dead) - { +restart: + if (!other) { + err = -ECONNRESET; + if (sunaddr == NULL) + goto out_free; + + other = unix_find_other(sunaddr, namelen, sk->type, hash, &err); + if (other==NULL) + goto out_free; + } + + unix_state_rlock(other); + err = -EPERM; + if (!unix_may_send(sk, other)) + goto out_unlock; + + if (other->dead) { /* * Check with 1003.1g - what should * datagram error */ - unix_unlock(other); - unix_peer(sk)=NULL; - other = NULL; - if (sunaddr == NULL) { - kfree_skb(skb, FREE_WRITE); - return -ECONNRESET; + unix_state_runlock(other); + sock_put(other); + + err = 0; + unix_state_wlock(sk); + if (unix_peer(sk) == other) { + sock_put(other); + unix_peer(sk)=NULL; + err = -ECONNREFUSED; } + unix_state_wunlock(sk); + + other = NULL; + if (err) + goto out_free; + goto restart; } - if (!other) - { - other = unix_find_other(sunaddr, namelen, sk->type, hash, &err); - - if (other==NULL) - { - kfree_skb(skb, FREE_WRITE); - return err; - } - if (!unix_may_send(sk, other)) - { - unix_unlock(other); - kfree_skb(skb, FREE_WRITE); - return -EINVAL; + + err = -EPIPE; + if (other->shutdown&RCV_SHUTDOWN) + goto out_unlock; + + if (0/*other->user_callback && + other->user_callback(other->user_data, skb) == 0*/) { + unix_state_runlock(other); + sock_put(other); + return len; + } + + if (skb_queue_len(&other->receive_queue) >= other->max_ack_backlog) { + if (msg->msg_flags & MSG_DONTWAIT) { + err = -EAGAIN; + goto out_unlock; } + + unix_wait_for_peer(other); + + err = -ERESTARTSYS; + if (signal_pending(current)) + goto out_free; + + goto restart; } skb_queue_tail(&other->receive_queue, skb); - other->data_ready(other,len); - - if (!unix_peer(sk)) - unix_unlock(other); + unix_state_runlock(other); + other->data_ready(other, len); + sock_put(other); return len; + +out_unlock: + unix_state_runlock(other); +out_free: + kfree_skb(skb); +out: + if (other) + sock_put(other); + return err; } @@ -983,37 +1194,34 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len, struct scm_cookie *scm) { struct sock *sk = sock->sk; - unix_socket *other; + unix_socket *other = NULL; struct sockaddr_un *sunaddr=msg->msg_name; int err,size; struct sk_buff *skb; int limit=0; int sent=0; - if (sock->flags & SO_ACCEPTCON) - return(-EINVAL); - + err = -EOPNOTSUPP; if (msg->msg_flags&MSG_OOB) - return -EOPNOTSUPP; + goto out_err; - if (msg->msg_flags&~MSG_DONTWAIT) - return -EINVAL; + err = -EINVAL; + if (msg->msg_flags&~(MSG_DONTWAIT|MSG_NOSIGNAL)) + goto out_err; if (msg->msg_namelen) { - if (sk->state==TCP_ESTABLISHED) - return -EISCONN; - else - return -EOPNOTSUPP; + err = (sk->state==TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP); + goto out_err; } else { sunaddr = NULL; - if (!unix_peer(sk)) - return -ENOTCONN; + err = -ENOTCONN; + other = unix_peer_get(sk); + if (!other) + goto out_err; } - if (sk->shutdown&SEND_SHUTDOWN) { - send_sig(SIGPIPE,current,0); - return -EPIPE; - } + if (sk->shutdown&SEND_SHUTDOWN) + goto pipe_err; while(sent < len) { @@ -1021,11 +1229,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len, * Optimisation for the fact that under 0.01% of X messages typically * need breaking up. */ - + size=len-sent; - if (size>(sk->sndbuf-sizeof(struct sk_buff))/2) /* Keep two messages in the pipe so it schedules better */ - size=(sk->sndbuf-sizeof(struct sk_buff))/2; + /* Keep two messages in the pipe so it schedules better */ + if (size > sk->sndbuf/2 - 16) + size = sk->sndbuf/2 - 16; /* * Keep to page sized kmalloc()'s as various people @@ -1033,8 +1242,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len, * much. */ - if (size > 3500) - limit = 3500; /* Fall back to a page if we can't grab a big buffer this instant */ + if (size > 4096-16) + limit = 4096-16; /* Fall back to a page if we can't grab a big buffer this instant */ else limit = 0; /* Otherwise just grab and wait */ @@ -1043,13 +1252,9 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len, */ skb=sock_alloc_send_skb(sk,size,limit,msg->msg_flags&MSG_DONTWAIT, &err); - + if (skb==NULL) - { - if (sent) - return sent; - return err; - } + goto out_err; /* * If you pass two values to the sock_alloc_send_skb @@ -1061,41 +1266,48 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len, size = min(size, skb_tailroom(skb)); memcpy(UNIXCREDS(skb), &scm->creds, sizeof(struct ucred)); - UNIXCB(skb).attr = msg->msg_flags; if (scm->fp) unix_attach_fds(scm, skb); - memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size); + if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) { + kfree_skb(skb); + goto out_err; + } - other=unix_peer(sk); + unix_state_rlock(other); - if (other->dead || (sk->shutdown & SEND_SHUTDOWN)) - { - kfree_skb(skb, FREE_WRITE); - if(sent) - return sent; - send_sig(SIGPIPE,current,0); - return -EPIPE; - } + if (other->dead || (other->shutdown & RCV_SHUTDOWN)) + goto pipe_err_free; skb_queue_tail(&other->receive_queue, skb); - other->data_ready(other,size); + unix_state_runlock(other); + other->data_ready(other, size); sent+=size; } + sock_put(other); return sent; + +pipe_err_free: + kfree_skb(skb); + unix_state_runlock(other); +pipe_err: + if (sent==0 && !(msg->msg_flags&MSG_NOSIGNAL)) + send_sig(SIGPIPE,current,0); + err = -EPIPE; +out_err: + if (other) + sock_put(other); + return sent ? : err; } -/* - * Sleep until data has arrive. But check for races.. - */ - -static void unix_data_wait(unix_socket * sk) +static void unix_copy_addr(struct msghdr *msg, struct sock *sk) { - if (!skb_peek(&sk->receive_queue)) - { - sk->socket->flags |= SO_WAITDATA; - interruptible_sleep_on(sk->sleep); - sk->socket->flags &= ~SO_WAITDATA; + msg->msg_namelen = sizeof(short); + if (sk->protinfo.af_unix.addr) { + msg->msg_namelen=sk->protinfo.af_unix.addr->len; + memcpy(msg->msg_name, + sk->protinfo.af_unix.addr->name, + sk->protinfo.af_unix.addr->len); } } @@ -1105,48 +1317,32 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, int size, struct sock *sk = sock->sk; int noblock = flags & MSG_DONTWAIT; struct sk_buff *skb; + int err; + err = -EOPNOTSUPP; if (flags&MSG_OOB) - return -EOPNOTSUPP; + goto out; msg->msg_namelen = 0; -retry: - skb=skb_dequeue(&sk->receive_queue); + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; - if (skb==NULL) - { - if (sk->shutdown & RCV_SHUTDOWN) - return 0; - if (noblock) - return -EAGAIN; - unix_data_wait(sk); - if (current->signal & ~current->blocked) - return -ERESTARTSYS; - goto retry; - } + if (skb_queue_len(&sk->receive_queue) <= sk->max_ack_backlog/2) + wake_up_interruptible(&sk->protinfo.af_unix.peer_wait); if (msg->msg_name) - { - if (skb->sk->protinfo.af_unix.addr) - { - memcpy(msg->msg_name, skb->sk->protinfo.af_unix.addr->name, - skb->sk->protinfo.af_unix.addr->len); - msg->msg_namelen=skb->sk->protinfo.af_unix.addr->len; - } - else - msg->msg_namelen=sizeof(short); - } + unix_copy_addr(msg, skb->sk); if (size > skb->len) size = skb->len; else if (size < skb->len) msg->msg_flags |= MSG_TRUNC; - if (memcpy_toiovec(msg->msg_iov, skb->data, size)) { - skb_queue_head(&sk->receive_queue, skb); - return -EFAULT; - } + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size); + if (err) + goto out_free; scm->creds = *UNIXCREDS(skb); @@ -1154,9 +1350,9 @@ retry: { if (UNIXCB(skb).fp) unix_detach_fds(scm, skb); - kfree_skb(skb, FREE_WRITE); - return size; - } else + } + else + { /* It is questionable: on PEEK we could: - do not return fds - good, but too simple 8) - return fds, and do not return them on read (old strategy, @@ -1171,12 +1367,50 @@ retry: */ if (UNIXCB(skb).fp) scm->fp = scm_fp_dup(UNIXCB(skb).fp); + } + err = size; + +out_free: + skb_free_datagram(sk,skb); +out: + return err; +} + +/* + * Sleep until data has arrive. But check for races.. + */ + +static void unix_stream_data_wait(unix_socket * sk) +{ + DECLARE_WAITQUEUE(wait, current); + + unix_state_rlock(sk); + + add_wait_queue(sk->sleep, &wait); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + if (skb_queue_len(&sk->receive_queue) || + sk->err || + (sk->shutdown & RCV_SHUTDOWN) || + signal_pending(current)) + break; + + sk->socket->flags |= SO_WAITDATA; + unix_state_runlock(sk); + schedule(); + unix_state_rlock(sk); + sk->socket->flags &= ~SO_WAITDATA; + } - skb_queue_head(&sk->receive_queue, skb); - return size; + __set_current_state(TASK_RUNNING); + remove_wait_queue(sk->sleep, &wait); + unix_state_runlock(sk); } + static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, int size, int flags, struct scm_cookie *scm) { @@ -1186,16 +1420,20 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, int size int copied = 0; int check_creds = 0; int target = 1; + int err = 0; - if (sock->flags & SO_ACCEPTCON) - return(-EINVAL); + err = -EINVAL; + if (sk->state != TCP_ESTABLISHED) + goto out; + err = -EOPNOTSUPP; if (flags&MSG_OOB) - return -EOPNOTSUPP; - if(flags&MSG_WAITALL) + goto out; + + if (flags&MSG_WAITALL) target = size; - - + + msg->msg_namelen = 0; /* Lock the socket to prevent queue disordering @@ -1215,52 +1453,58 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, int size if (copied >= target) break; - if (sk->err) - return sock_error(sk); - + /* + * POSIX 1003.1g mandates this order. + */ + + if ((err = sock_error(sk)) != 0) + break; if (sk->shutdown & RCV_SHUTDOWN) break; - up(&sk->protinfo.af_unix.readsem); + err = -EAGAIN; if (noblock) - return -EAGAIN; - unix_data_wait(sk); - if (current->signal & ~current->blocked) - return -ERESTARTSYS; + break; + up(&sk->protinfo.af_unix.readsem); + + unix_stream_data_wait(sk); + + if (signal_pending(current)) { + err = -ERESTARTSYS; + goto out; + } down(&sk->protinfo.af_unix.readsem); continue; } - /* Never glue messages from different writers */ - if (check_creds && - memcmp(UNIXCREDS(skb), &scm->creds, sizeof(scm->creds)) != 0) - { - skb_queue_head(&sk->receive_queue, skb); - break; + if (check_creds) { + /* Never glue messages from different writers */ + if (memcmp(UNIXCREDS(skb), &scm->creds, sizeof(scm->creds)) != 0) { + skb_queue_head(&sk->receive_queue, skb); + break; + } + } else { + /* Copy credentials */ + scm->creds = *UNIXCREDS(skb); + check_creds = 1; } /* Copy address just once */ if (sunaddr) { - if (skb->sk->protinfo.af_unix.addr) - { - memcpy(sunaddr, skb->sk->protinfo.af_unix.addr->name, - skb->sk->protinfo.af_unix.addr->len); - msg->msg_namelen=skb->sk->protinfo.af_unix.addr->len; - } - else - msg->msg_namelen=sizeof(short); + unix_copy_addr(msg, skb->sk); sunaddr = NULL; } chunk = min(skb->len, size); - memcpy_toiovec(msg->msg_iov, skb->data, chunk); + if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) { + skb_queue_head(&sk->receive_queue, skb); + if (copied == 0) + copied = -EFAULT; + break; + } copied += chunk; size -= chunk; - /* Copy credentials */ - scm->creds = *UNIXCREDS(skb); - check_creds = 1; - /* Mark read part of skb as used */ if (!(flags & MSG_PEEK)) { @@ -1276,7 +1520,7 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, int size break; } - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); if (scm->fp) break; @@ -1284,7 +1528,6 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, int size else { /* It is questionable, see note in unix_dgram_recvmsg. - */ if (UNIXCB(skb).fp) scm->fp = scm_fp_dup(UNIXCB(skb).fp); @@ -1296,38 +1539,43 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, int size } while (size); up(&sk->protinfo.af_unix.readsem); - return copied; +out: + return copied ? : err; } static int unix_shutdown(struct socket *sock, int mode) { struct sock *sk = sock->sk; - unix_socket *other=unix_peer(sk); - - mode++; + unix_socket *other; - if (mode&SEND_SHUTDOWN) - { - sk->shutdown|=SEND_SHUTDOWN; - sk->state_change(sk); - if(other && sk->type == SOCK_STREAM && other->state != TCP_LISTEN) - { - if (unix_our_peer(sk, other)) - other->shutdown|=RCV_SHUTDOWN; - other->state_change(other); - } - } - other=unix_peer(sk); - if(mode&RCV_SHUTDOWN) - { - sk->shutdown|=RCV_SHUTDOWN; + mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN); + + if (mode) { + unix_state_wlock(sk); + sk->shutdown |= mode; + other=unix_peer(sk); + if (other) + sock_hold(other); + unix_state_wunlock(sk); sk->state_change(sk); - if(other && sk->type != SOCK_DGRAM && other->state != TCP_LISTEN) - { - if (unix_our_peer(sk, other)) - other->shutdown|=SEND_SHUTDOWN; - other->state_change(other); + + if (other && sk->type == SOCK_STREAM) { + int peer_mode = 0; + + if (mode&RCV_SHUTDOWN) + peer_mode |= SEND_SHUTDOWN; + if (mode&SEND_SHUTDOWN) + peer_mode |= RCV_SHUTDOWN; + unix_state_wlock(other); + other->shutdown |= peer_mode; + unix_state_wunlock(other); + if (peer_mode&RCV_SHUTDOWN) + other->data_ready(other,0); + else + other->state_change(other); } + if (other) + sock_put(other); } return 0; } @@ -1337,36 +1585,73 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; long amount=0; - + int err; + switch(cmd) { case TIOCOUTQ: - amount=sk->sndbuf-sk->wmem_alloc; + amount = sk->sndbuf - atomic_read(&sk->wmem_alloc); if(amount<0) amount=0; - return put_user(amount, (int *)arg); + err = put_user(amount, (int *)arg); + break; case TIOCINQ: { struct sk_buff *skb; - if(sk->state==TCP_LISTEN) - return -EINVAL; - /* - * These two are safe on current systems as - * only user tasks fiddle here - */ + if (sk->state==TCP_LISTEN) { + err = -EINVAL; + break; + } + + spin_lock(&sk->receive_queue.lock); if((skb=skb_peek(&sk->receive_queue))!=NULL) amount=skb->len; - return put_user(amount, (int *)arg); + spin_unlock(&sk->receive_queue.lock); + err = put_user(amount, (int *)arg); + break; } default: - return -EINVAL; + err = -EINVAL; + break; } - /*NOTREACHED*/ - return(0); + return err; } +static unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait) +{ + struct sock *sk = sock->sk; + unsigned int mask; + + poll_wait(file, sk->sleep, wait); + mask = 0; + + /* exceptional events? */ + if (sk->err) + mask |= POLLERR; + if (sk->shutdown & RCV_SHUTDOWN) + mask |= POLLHUP; + + /* readable? */ + if (!skb_queue_empty(&sk->receive_queue)) + mask |= POLLIN | POLLRDNORM; + + /* Connection-based need to check for termination and startup */ + if (sk->type == SOCK_STREAM && sk->state==TCP_CLOSE) + mask |= POLLHUP; + + /* + * we set writable also when the other side has shut down the + * connection. This prevents stuck sockets. + */ + if (unix_writable(sk)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + + return mask; +} + + #ifdef CONFIG_PROC_FS static int unix_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data) @@ -1379,16 +1664,21 @@ static int unix_read_proc(char *buffer, char **start, off_t offset, len+= sprintf(buffer,"Num RefCount Protocol Flags Type St " "Inode Path\n"); - + + read_lock(&unix_table_lock); forall_unix_sockets (i,s) { - len+=sprintf(buffer+len,"%p: %08X %08X %08lX %04X %02X %5ld", + unix_state_rlock(s); + + len+=sprintf(buffer+len,"%p: %08X %08X %08X %04X %02X %5ld", s, - s->sock_readers, + atomic_read(&s->refcnt), 0, - s->socket ? s->socket->flags : 0, + s->state == TCP_LISTEN ? SO_ACCEPTCON : 0, s->type, - s->socket ? s->socket->state : 0, + s->socket ? + (s->state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : + (s->state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING), s->socket ? s->socket->inode->i_ino : 0); if (s->protinfo.af_unix.addr) @@ -1402,9 +1692,11 @@ static int unix_read_proc(char *buffer, char **start, off_t offset, buffer[len] = '@'; len += s->protinfo.af_unix.addr->len - sizeof(short); } + unix_state_runlock(s); + buffer[len++]='\n'; - pos=begin+len; + pos = begin + len; if(poslength) len=length; + if (len < 0) + len = 0; return len; } #endif struct proto_ops unix_stream_ops = { - AF_UNIX, + PF_UNIX, - unix_dup, unix_release, unix_bind, unix_stream_connect, unix_socketpair, unix_accept, unix_getname, - datagram_poll, + unix_poll, unix_ioctl, unix_listen, unix_shutdown, @@ -1441,18 +1735,18 @@ struct proto_ops unix_stream_ops = { sock_no_getsockopt, sock_no_fcntl, unix_stream_sendmsg, - unix_stream_recvmsg + unix_stream_recvmsg, + sock_no_mmap }; struct proto_ops unix_dgram_ops = { - AF_UNIX, + PF_UNIX, - unix_dup, unix_release, unix_bind, unix_dgram_connect, unix_socketpair, - NULL, + sock_no_accept, unix_getname, datagram_poll, unix_ioctl, @@ -1462,31 +1756,67 @@ struct proto_ops unix_dgram_ops = { sock_no_getsockopt, sock_no_fcntl, unix_dgram_sendmsg, - unix_dgram_recvmsg + unix_dgram_recvmsg, + sock_no_mmap }; struct net_proto_family unix_family_ops = { - AF_UNIX, + PF_UNIX, unix_create }; -void unix_proto_init(struct net_proto *pro) +#ifdef MODULE +#ifdef CONFIG_SYSCTL +extern void unix_sysctl_register(void); +extern void unix_sysctl_unregister(void); +#endif + +int init_module(void) +#else +void __init unix_proto_init(struct net_proto *pro) +#endif { struct sk_buff *dummy_skb; struct proc_dir_entry *ent; - printk(KERN_INFO "NET3: Unix domain sockets 0.15 for Linux NET3.038.\n"); + printk(KERN_INFO "NET4: Unix domain sockets 1.0/SMP for Linux NET4.0.\n"); if (sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)) { printk(KERN_CRIT "unix_proto_init: panic\n"); +#ifdef MODULE + return -1; +#else return; +#endif } sock_register(&unix_family_ops); #ifdef CONFIG_PROC_FS ent = create_proc_entry("net/unix", 0, 0); ent->read_proc = unix_read_proc; #endif + +#ifdef MODULE +#ifdef CONFIG_SYSCTL + unix_sysctl_register(); +#endif + + return 0; +#endif } + +#ifdef MODULE +void cleanup_module(void) +{ + sock_unregister(PF_UNIX); +#ifdef CONFIG_SYSCTL + unix_sysctl_unregister(); +#endif +#ifdef CONFIG_PROC_FS + remove_proc_entry("net/unix", 0); +#endif +} +#endif + /* * Local variables: * compile-command: "gcc -g -D__KERNEL__ -Wall -O6 -I/usr/src/linux/include -c af_unix.c"