/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also entry.S and others).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/memory.c': 'copy_page_tables()'
 */

#include <linux/config.h>
#include <linux/malloc.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/smp_lock.h>
#include <linux/module.h>
#include <linux/vmalloc.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>

/* The idle threads do not count.. */
int nr_threads;

int max_threads;
unsigned long total_forks;	/* Handle normal Linux uptimes. */
int last_pid;

struct task_struct *pidhash[PIDHASH_SZ];

void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
{
	unsigned long flags;

	wq_write_lock_irqsave(&q->lock, flags);
	wait->flags = 0;
	__add_wait_queue(q, wait);
	wq_write_unlock_irqrestore(&q->lock, flags);
}

void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
{
	unsigned long flags;

	wq_write_lock_irqsave(&q->lock, flags);
	wait->flags = WQ_FLAG_EXCLUSIVE;
	__add_wait_queue_tail(q, wait);
	wq_write_unlock_irqrestore(&q->lock, flags);
}

void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
{
	unsigned long flags;

	wq_write_lock_irqsave(&q->lock, flags);
	__remove_wait_queue(q, wait);
	wq_write_unlock_irqrestore(&q->lock, flags);
}
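/*
 * A minimal usage sketch for the helpers above (not part of this file):
 * a typical 2.4-era caller open-codes an interruptible sleep.  The wait
 * queue head "my_waitq" and the wakeup "condition" are hypothetical
 * placeholders.
 *
 *	DECLARE_WAITQUEUE(wait, current);
 *
 *	add_wait_queue(&my_waitq, &wait);
 *	for (;;) {
 *		set_current_state(TASK_INTERRUPTIBLE);
 *		if (condition)
 *			break;
 *		schedule();
 *	}
 *	set_current_state(TASK_RUNNING);
 *	remove_wait_queue(&my_waitq, &wait);
 *
 * add_wait_queue_exclusive() queues at the tail with WQ_FLAG_EXCLUSIVE so
 * that a wake-up can stop after waking a single exclusive sleeper.
 */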
void __init fork_init(unsigned long mempages)
{
	/*
	 * The default maximum number of threads is set to a safe
	 * value: the thread structures can take up at most half
	 * of memory.
	 */
	max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 2;

	init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
	init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
}
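/*
 * Worked example (hypothetical numbers): with 4KB pages and an 8KB
 * THREAD_SIZE (THREAD_SIZE/PAGE_SIZE == 2), a 128MB machine passes
 * mempages == 32768, so max_threads == 32768 / 2 / 2 == 8192 and the
 * default per-user RLIMIT_NPROC comes out at 4096.
 */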
/* Protects next_safe and last_pid. */
spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;

static int get_pid(unsigned long flags)
{
	static int next_safe = PID_MAX;
	struct task_struct *p;

	if (flags & CLONE_PID)
		return current->pid;

	spin_lock(&lastpid_lock);
	if ((++last_pid) & 0xffff8000) {
		last_pid = 300;		/* Skip daemons etc. */
		goto inside;
	}
	if (last_pid >= next_safe) {
inside:
		next_safe = PID_MAX;
		read_lock(&tasklist_lock);
	repeat:
		for_each_task(p) {
			if (p->pid == last_pid	||
			    p->pgrp == last_pid	||
			    p->session == last_pid) {
				if (++last_pid >= next_safe) {
					if (last_pid & 0xffff8000)
						last_pid = 300;
					next_safe = PID_MAX;
				}
				goto repeat;
			}
			if (p->pid > last_pid && next_safe > p->pid)
				next_safe = p->pid;
			if (p->pgrp > last_pid && next_safe > p->pgrp)
				next_safe = p->pgrp;
			if (p->session > last_pid && next_safe > p->session)
				next_safe = p->session;
		}
		read_unlock(&tasklist_lock);
	}
	spin_unlock(&lastpid_lock);

	return last_pid;
}
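/*
 * Note on the ranges above: PID_MAX is 0x8000, so the 0xffff8000 test
 * catches any candidate >= 32768.  After the first wrap, allocated PIDs
 * stay in [300, 32767]; only early boot-time threads get PIDs below 300.
 */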
static inline int dup_mmap(struct mm_struct * mm)
{
	struct vm_area_struct * mpnt, *tmp, **pprev;
	int retval;

	flush_cache_mm(current->mm);
	mm->locked_vm = 0;
	mm->mmap = NULL;
	mm->mmap_avl = NULL;
	mm->mmap_cache = NULL;
	mm->map_count = 0;
	mm->cpu_vm_mask = 0;
	mm->swap_cnt = 0;
	mm->swap_address = 0;
	pprev = &mm->mmap;
	for (mpnt = current->mm->mmap; mpnt; mpnt = mpnt->vm_next) {
		struct file *file;

		retval = -ENOMEM;
		if (mpnt->vm_flags & VM_DONTCOPY)
			continue;
		tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
		if (!tmp)
			goto fail_nomem;
		*tmp = *mpnt;
		tmp->vm_flags &= ~VM_LOCKED;
		tmp->vm_mm = mm;
		mm->map_count++;
		tmp->vm_next = NULL;
		file = tmp->vm_file;
		if (file) {
			struct inode *inode = file->f_dentry->d_inode;
			get_file(file);
			if (tmp->vm_flags & VM_DENYWRITE)
				atomic_dec(&inode->i_writecount);

			/* insert tmp into the share list, just after mpnt */
			spin_lock(&inode->i_mapping->i_shared_lock);
			if ((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
				mpnt->vm_next_share->vm_pprev_share =
					&tmp->vm_next_share;
			mpnt->vm_next_share = tmp;
			tmp->vm_pprev_share = &mpnt->vm_next_share;
			spin_unlock(&inode->i_mapping->i_shared_lock);
		}

		/* Copy the pages, but defer checking for errors */
		retval = copy_page_range(mm, current->mm, tmp);
		if (!retval && tmp->vm_ops && tmp->vm_ops->open)
			tmp->vm_ops->open(tmp);

		/*
		 * Link in the new vma even if an error occurred,
		 * so that exit_mmap() can clean up the mess.
		 */
		*pprev = tmp;
		pprev = &tmp->vm_next;

		if (retval)
			goto fail_nomem;
	}
	retval = 0;
	if (mm->map_count >= AVL_MIN_MAP_COUNT)
		build_mmap_avl(mm);

fail_nomem:
	flush_tlb_mm(current->mm);
	return retval;
}

#define allocate_mm()	(kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
static struct mm_struct * mm_init(struct mm_struct * mm)
{
	atomic_set(&mm->mm_users, 1);
	atomic_set(&mm->mm_count, 1);
	init_MUTEX(&mm->mmap_sem);
	mm->page_table_lock = SPIN_LOCK_UNLOCKED;
	mm->pgd = pgd_alloc();
	if (mm->pgd)
		return mm;
	free_mm(mm);
	return NULL;
}

/*
 * Allocate and initialize an mm_struct.
 */
struct mm_struct * mm_alloc(void)
{
	struct mm_struct * mm;

	mm = allocate_mm();
	if (mm) {
		memset(mm, 0, sizeof(*mm));
		return mm_init(mm);
	}
	return NULL;
}
/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 */
inline void __mmdrop(struct mm_struct *mm)
{
	if (mm == &init_mm) BUG();
	pgd_free(mm->pgd);
	destroy_context(mm);
	free_mm(mm);
}

/*
 * Decrement the use count and release all resources for an mm.
 */
void mmput(struct mm_struct *mm)
{
	if (atomic_dec_and_test(&mm->mm_users)) {
		exit_mmap(mm);
		mmdrop(mm);
	}
}
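/*
 * Background note (not in the original comments): mm_users counts tasks
 * actually using the address space, while mm_count also counts "lazy TLB"
 * references from kernel threads that borrowed the mm.  mmput() drops a
 * user reference; only when mmdrop() releases the final mm_count reference
 * does __mmdrop() above free the page directory and the structure itself.
 */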
/* Please note the differences between mmput and mm_release.
 * mmput is called whenever we stop holding onto a mm_struct,
 * error or success, whatever.
 *
 * mm_release is called after a mm_struct has been removed
 * from the current process.
 *
 * This difference is important for error handling, when we
 * only half set up a mm_struct for a new process and need to restore
 * the old one.  Because we mmput the new mm_struct before
 * restoring the old one. . .
 * Eric Biederman 10 January 1998
 */
void mm_release(void)
{
	struct task_struct *tsk = current;

	/* notify parent sleeping on vfork() */
	if (tsk->flags & PF_VFORK) {
		tsk->flags &= ~PF_VFORK;
		up(tsk->p_opptr->vfork_sem);
	}
}
static inline int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
{
	struct mm_struct * mm;
	int retval;

	tsk->min_flt = tsk->maj_flt = 0;
	tsk->cmin_flt = tsk->cmaj_flt = 0;
	tsk->nswap = tsk->cnswap = 0;

	tsk->mm = NULL;
	tsk->active_mm = NULL;

	/*
	 * Are we cloning a kernel thread?
	 *
	 * We need to steal an active VM for that..
	 */
	mm = current->mm;
	if (!mm)
		return 0;

	if (clone_flags & CLONE_VM) {
		atomic_inc(&mm->mm_users);
		goto good_mm;
	}

	retval = -ENOMEM;
	mm = allocate_mm();
	if (!mm)
		goto fail_nomem;

	/* Copy the current MM stuff.. */
	memcpy(mm, current->mm, sizeof(*mm));
	if (!mm_init(mm))
		goto fail_nomem;

	tsk->mm = mm;
	tsk->active_mm = mm;

	down(&current->mm->mmap_sem);
	retval = dup_mmap(mm);
	up(&current->mm->mmap_sem);
	if (retval)
		goto free_pt;

	/*
	 * child gets a private LDT (if there was an LDT in the parent)
	 */
	copy_segments(tsk, mm);

	if (init_new_context(tsk,mm))
		goto free_pt;

good_mm:
	tsk->mm = mm;
	tsk->active_mm = mm;
	return 0;

free_pt:
	mmput(mm);
fail_nomem:
	return retval;
}
static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
{
	struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
	/* We don't need to lock fs - think why ;-) */
	if (fs) {
		atomic_set(&fs->count, 1);
		fs->lock = RW_LOCK_UNLOCKED;
		fs->umask = old->umask;
		read_lock(&old->lock);
		fs->rootmnt = mntget(old->rootmnt);
		fs->root = dget(old->root);
		fs->pwdmnt = mntget(old->pwdmnt);
		fs->pwd = dget(old->pwd);
		if (old->altroot) {
			fs->altrootmnt = mntget(old->altrootmnt);
			fs->altroot = dget(old->altroot);
		} else {
			fs->altrootmnt = NULL;
			fs->altroot = NULL;
		}
		read_unlock(&old->lock);
	}
	return fs;
}

struct fs_struct *copy_fs_struct(struct fs_struct *old)
{
	return __copy_fs_struct(old);
}

static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
{
	if (clone_flags & CLONE_FS) {
		atomic_inc(&current->fs->count);
		return 0;
	}
	tsk->fs = __copy_fs_struct(current->fs);
	if (!tsk->fs)
		return -1;
	return 0;
}
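/*
 * From user space this is what clone(2)'s CLONE_FS flag controls: with the
 * flag, parent and child keep sharing one fs_struct, so a chdir() or
 * umask() in one is seen by the other; without it the child gets the
 * private copy built by __copy_fs_struct() above.  Hypothetical sketch
 * using the glibc clone() wrapper (names are placeholders):
 *
 *	static char child_stack[16384];
 *
 *	static int child_fn(void *arg)
 *	{
 *		chdir("/tmp");
 *		(the parent's cwd changes too only if CLONE_FS was passed)
 *		return 0;
 *	}
 *
 *	clone(child_fn, child_stack + sizeof(child_stack),
 *	      CLONE_FS | SIGCHLD, NULL);
 */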
static int count_open_files(struct files_struct *files, int size)
{
	int i;

	/* Find the last open fd */
	for (i = size/(8*sizeof(long)); i > 0; ) {
		if (files->open_fds->fds_bits[--i])
			break;
	}
	i = (i+1) * 8 * sizeof(long);
	return i;
}
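/*
 * Worked example (hypothetical, 32-bit longs): with size == 1024 the scan
 * starts at word 31; if the highest open fd is 70, word 2 (fds 64..95) is
 * the last non-zero word, so the function returns (2+1)*32 == 96 -- the
 * open-fd count rounded up to a whole word of the bitmap.
 */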
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
	struct files_struct *oldf, *newf;
	struct file **old_fds, **new_fds;
	int open_files, nfds, size, i, error = 0;

	/*
	 * A background process may not have any files ...
	 */
	oldf = current->files;
	if (!oldf)
		goto out;

	if (clone_flags & CLONE_FILES) {
		atomic_inc(&oldf->count);
		goto out;
	}

	tsk->files = NULL;
	error = -ENOMEM;
	newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
	if (!newf)
		goto out;

	atomic_set(&newf->count, 1);

	newf->file_lock	    = RW_LOCK_UNLOCKED;
	newf->next_fd	    = 0;
	newf->max_fds	    = NR_OPEN_DEFAULT;
	newf->max_fdset	    = __FD_SETSIZE;
	newf->close_on_exec = &newf->close_on_exec_init;
	newf->open_fds	    = &newf->open_fds_init;
	newf->fd	    = &newf->fd_array[0];

	/* We don't yet have the oldf readlock, but even if the old
	   fdset gets grown now, we'll only copy up to "size" fds */
	size = oldf->max_fdset;
	if (size > __FD_SETSIZE) {
		newf->max_fdset = 0;
		write_lock(&newf->file_lock);
		error = expand_fdset(newf, size);
		write_unlock(&newf->file_lock);
		if (error)
			goto out_release;
	}
	read_lock(&oldf->file_lock);

	open_files = count_open_files(oldf, size);

	/*
	 * Check whether we need to allocate a larger fd array.
	 * Note: we're not a clone task, so the open count won't
	 * change.
	 */
	nfds = NR_OPEN_DEFAULT;
	if (open_files > nfds) {
		read_unlock(&oldf->file_lock);
		newf->max_fds = 0;
		write_lock(&newf->file_lock);
		error = expand_fd_array(newf, open_files);
		write_unlock(&newf->file_lock);
		if (error)
			goto out_release;
		nfds = newf->max_fds;
		read_lock(&oldf->file_lock);
	}

	old_fds = oldf->fd;
	new_fds = newf->fd;

	memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
	memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);

	for (i = open_files; i != 0; i--) {
		struct file *f = *old_fds++;
		if (f)
			get_file(f);
		*new_fds++ = f;
	}
	read_unlock(&oldf->file_lock);

	/* compute the remainder to be cleared */
	size = (newf->max_fds - open_files) * sizeof(struct file *);

	/* This is long word aligned thus could use an optimized version */
	memset(new_fds, 0, size);

	if (newf->max_fdset > open_files) {
		int left = (newf->max_fdset-open_files)/8;
		int start = open_files / (8 * sizeof(unsigned long));

		memset(&newf->open_fds->fds_bits[start], 0, left);
		memset(&newf->close_on_exec->fds_bits[start], 0, left);
	}

	tsk->files = newf;
	error = 0;
out:
	return error;

out_release:
	free_fdset(newf->close_on_exec, newf->max_fdset);
	free_fdset(newf->open_fds, newf->max_fdset);
	kmem_cache_free(files_cachep, newf);
	goto out;
}
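/*
 * Continuing the hypothetical example above: with open_files == 96 only
 * 96/8 == 12 bytes of the open_fds and close_on_exec bitmaps are copied,
 * 96 struct file pointers are duplicated (the non-NULL ones pinned via
 * get_file()), and the remainder of the new array and bitmaps is cleared
 * by the memsets.
 */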
static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
{
	struct signal_struct *sig;

	if (clone_flags & CLONE_SIGHAND) {
		atomic_inc(&current->sig->count);
		return 0;
	}
	sig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL);
	tsk->sig = sig;
	if (!sig)
		return -1;
	spin_lock_init(&sig->siglock);
	atomic_set(&sig->count, 1);
	memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
	return 0;
}
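/*
 * Practical consequence (not stated above): with CLONE_SIGHAND the child
 * shares the parent's signal_struct, so a later sigaction() in either task
 * changes the handler table for both; without the flag each task gets its
 * own copy of the action array and can change handlers independently.
 */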
static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
	unsigned long new_flags = p->flags;

	new_flags &= ~(PF_SUPERPRIV | PF_USEDFPU | PF_VFORK);
	new_flags |= PF_FORKNOEXEC;
	if (!(clone_flags & CLONE_PTRACE))
		p->ptrace = 0;
	if (clone_flags & CLONE_VFORK)
		new_flags |= PF_VFORK;
	p->flags = new_flags;
}
/*
 *  Ok, this is the main fork-routine. It copies the system process
 * information (task[nr]) and sets up the necessary registers. It also
 * copies the data segment in its entirety.  The "stack_start" and
 * "stack_top" arguments are simply passed along to the platform
 * specific copy_thread() routine.  Most platforms ignore stack_top.
 * For an example that's using stack_top, see
 * arch/ia64/kernel/process.c.
 */
int do_fork(unsigned long clone_flags, unsigned long stack_start,
	    struct pt_regs *regs, unsigned long stack_size)
{
	int retval = -ENOMEM;
	struct task_struct *p;
	DECLARE_MUTEX_LOCKED(sem);

	if (clone_flags & CLONE_PID) {
		/* This is only allowed from the boot up thread */
		if (current->pid)
			return -EPERM;
	}

	current->vfork_sem = &sem;

	p = alloc_task_struct();
	if (!p)
		goto fork_out;

	*p = *current;

	retval = -EAGAIN;
	if (atomic_read(&p->user->processes) >= p->rlim[RLIMIT_NPROC].rlim_cur)
		goto bad_fork_free;
	atomic_inc(&p->user->__count);
	atomic_inc(&p->user->processes);

	/*
	 * Counter increases are protected by
	 * the kernel lock so nr_threads can't
	 * increase under us (but it may decrease).
	 */
	if (nr_threads >= max_threads)
		goto bad_fork_cleanup_count;

	get_exec_domain(p->exec_domain);

	if (p->binfmt && p->binfmt->module)
		__MOD_INC_USE_COUNT(p->binfmt->module);

	p->state = TASK_UNINTERRUPTIBLE;

	copy_flags(clone_flags, p);
	p->pid = get_pid(clone_flags);

	p->run_list.next = NULL;
	p->run_list.prev = NULL;

	if ((clone_flags & CLONE_VFORK) || !(clone_flags & CLONE_PARENT)) {
		p->p_opptr = current;
		if (!(p->ptrace & PT_PTRACED))
			p->p_pptr = current;
	}
	init_waitqueue_head(&p->wait_chldexit);
	spin_lock_init(&p->alloc_lock);

	init_sigpending(&p->pending);

	p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
	p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
	init_timer(&p->real_timer);
	p->real_timer.data = (unsigned long) p;

	p->leader = 0;		/* session leadership doesn't inherit */
	p->times.tms_utime = p->times.tms_stime = 0;
	p->times.tms_cutime = p->times.tms_cstime = 0;
#ifdef CONFIG_SMP
	{
		int i;
		p->has_cpu = 0;
		p->processor = current->processor;
		/* ?? should we just memset this ?? */
		for (i = 0; i < smp_num_cpus; i++)
			p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
		spin_lock_init(&p->sigmask_lock);
	}
#endif
	p->lock_depth = -1;		/* -1 = no lock */
	p->start_time = jiffies;

	retval = -ENOMEM;
	/* copy all the process information */
	if (copy_files(clone_flags, p))
		goto bad_fork_cleanup;
	if (copy_fs(clone_flags, p))
		goto bad_fork_cleanup_files;
	if (copy_sighand(clone_flags, p))
		goto bad_fork_cleanup_fs;
	if (copy_mm(clone_flags, p))
		goto bad_fork_cleanup_sighand;
	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
	if (retval)
		goto bad_fork_cleanup_sighand;

	/* Our parent execution domain becomes current domain
	   These must match for thread signalling to apply */

	p->parent_exec_id = p->self_exec_id;

	/* ok, now we should be set up.. */
	p->exit_signal = clone_flags & CSIGNAL;

	/*
	 * "share" dynamic priority between parent and child, thus the
	 * total amount of dynamic priorities in the system doesn't change,
	 * more scheduling fairness. This is only important in the first
	 * timeslice, on the long run the scheduling behaviour is unchanged.
	 */
	p->counter = (current->counter + 1) >> 1;
	current->counter >>= 1;
	if (!current->counter)
		current->need_resched = 1;

	/*
	 * Ok, add it to the run-queues and make it
	 * visible to the rest of the system.
	 *
	 * Let it rip!
	 */
	retval = p->pid;
	p->tgid = retval;
	INIT_LIST_HEAD(&p->thread_group);
	write_lock_irq(&tasklist_lock);
	if (clone_flags & CLONE_THREAD) {
		p->tgid = current->tgid;
		list_add(&p->thread_group, &current->thread_group);
	}
	SET_LINKS(p);
	hash_pid(p);
	nr_threads++;
	write_unlock_irq(&tasklist_lock);

	if (p->ptrace & PT_PTRACED)
		send_sig(SIGSTOP, p, 1);

	wake_up_process(p);		/* do this last */
	++total_forks;

fork_out:
	if ((clone_flags & CLONE_VFORK) && (retval > 0))
		down(&sem);
	return retval;

bad_fork_cleanup_sighand:
	exit_sighand(p);
bad_fork_cleanup_fs:
	exit_fs(p); /* blocking */
bad_fork_cleanup_files:
	exit_files(p); /* blocking */
bad_fork_cleanup:
	put_exec_domain(p->exec_domain);
	if (p->binfmt && p->binfmt->module)
		__MOD_DEC_USE_COUNT(p->binfmt->module);
bad_fork_cleanup_count:
	atomic_dec(&p->user->processes);
	free_uid(p->user);
bad_fork_free:
	free_task_struct(p);
	goto fork_out;
}
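/*
 * How do_fork() gets called (illustration, not part of this file): the
 * architecture system-call stubs pass their register frame straight
 * through.  On i386 (arch/i386/kernel/process.c in 2.4) they look roughly
 * like:
 *
 *	asmlinkage int sys_fork(struct pt_regs regs)
 *	{
 *		return do_fork(SIGCHLD, regs.esp, &regs, 0);
 *	}
 *
 *	asmlinkage int sys_vfork(struct pt_regs regs)
 *	{
 *		return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD,
 *			       regs.esp, &regs, 0);
 *	}
 *
 * sys_clone() passes the user-supplied clone_flags and, if given, a new
 * stack pointer in place of regs.esp.
 */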
/* SLAB cache for signal_struct structures (tsk->sig) */
kmem_cache_t *sigact_cachep;

/* SLAB cache for files_struct structures (tsk->files) */
kmem_cache_t *files_cachep;

/* SLAB cache for fs_struct structures (tsk->fs) */
kmem_cache_t *fs_cachep;

/* SLAB cache for vm_area_struct structures */
kmem_cache_t *vm_area_cachep;

/* SLAB cache for mm_struct structures (tsk->mm) */
kmem_cache_t *mm_cachep;

void __init proc_caches_init(void)
{
	sigact_cachep = kmem_cache_create("signal_act",
			sizeof(struct signal_struct), 0,
			SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!sigact_cachep)
		panic("Cannot create signal action SLAB cache");

	files_cachep = kmem_cache_create("files_cache",
			sizeof(struct files_struct), 0,
			SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!files_cachep)
		panic("Cannot create files SLAB cache");

	fs_cachep = kmem_cache_create("fs_cache",
			sizeof(struct fs_struct), 0,
			SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!fs_cachep)
		panic("Cannot create fs_struct SLAB cache");

	vm_area_cachep = kmem_cache_create("vm_area_struct",
			sizeof(struct vm_area_struct), 0,
			SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!vm_area_cachep)
		panic("vma_init: Cannot alloc vm_area_struct SLAB cache");

	mm_cachep = kmem_cache_create("mm_struct",
			sizeof(struct mm_struct), 0,
			SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!mm_cachep)
		panic("vma_init: Cannot alloc mm_struct SLAB cache");
}