/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also system_call.s).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/mm.c': 'copy_page_tables()'
 */

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/unistd.h>
#include <linux/ptrace.h>
#include <linux/malloc.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/module.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
#include <asm/uaccess.h>

int nr_tasks = 1;
int nr_running = 1;
unsigned long int total_forks = 0;	/* Handle normal Linux uptimes. */
int last_pid = 0;

/* SLAB cache for mm_struct's. */
kmem_cache_t *mm_cachep;

/* SLAB cache for files structs */
kmem_cache_t *files_cachep;

struct task_struct *pidhash[PIDHASH_SZ];

struct task_struct **tarray_freelist = NULL;
spinlock_t taskslot_lock = SPIN_LOCK_UNLOCKED;

/* UID task count cache, to prevent walking entire process list every
 * single fork() operation.
 */
#define UIDHASH_SZ	(PIDHASH_SZ >> 2)

static struct uid_taskcount {
        struct uid_taskcount *next, **pprev;
        unsigned short uid;
        int task_count;
} *uidhash[UIDHASH_SZ];

#ifdef __SMP__
static spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED;
#endif

kmem_cache_t *uid_cachep;

#define uidhashfn(uid)	(((uid >> 8) ^ uid) & (UIDHASH_SZ - 1))
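
/*
 * The helpers below maintain the per-UID hash chains.  Insertion,
 * removal and lookup are all done under uidhash_lock; on non-SMP
 * builds the spin_lock()/spin_unlock() calls compile away, which is
 * why the lock itself is only declared for __SMP__.
 */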
static inline void uid_hash_insert(struct uid_taskcount *up, unsigned int hashent)
{
        spin_lock(&uidhash_lock);
        if ((up->next = uidhash[hashent]) != NULL)
                uidhash[hashent]->pprev = &up->next;
        up->pprev = &uidhash[hashent];
        uidhash[hashent] = up;
        spin_unlock(&uidhash_lock);
}

static inline void uid_hash_remove(struct uid_taskcount *up)
{
        spin_lock(&uidhash_lock);
        if (up->next)
                up->next->pprev = up->pprev;
        *up->pprev = up->next;
        spin_unlock(&uidhash_lock);
}

static inline struct uid_taskcount *uid_find(unsigned short uid, unsigned int hashent)
{
        struct uid_taskcount *up;

        spin_lock(&uidhash_lock);
        for (up = uidhash[hashent]; (up && up->uid != uid); up = up->next)
                ;
        spin_unlock(&uidhash_lock);
        return up;
}
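
/*
 * charge_uid() adjusts the per-UID task count by 'count': the fork
 * path below charges +1 and the fork failure path gives the slot back
 * with -1.  It returns -EAGAIN once the new count would exceed the
 * caller's RLIMIT_NPROC limit, and frees the hash entry again when
 * the count drops to zero.
 */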
int charge_uid(struct task_struct *p, int count)
{
        unsigned int hashent = uidhashfn(p->uid);
        struct uid_taskcount *up = uid_find(p->uid, hashent);

        if (up) {
                int limit = p->rlim[RLIMIT_NPROC].rlim_cur;
                int newcnt = up->task_count + count;

                if (newcnt > limit)
                        return -EAGAIN;
                else if (newcnt == 0) {
                        uid_hash_remove(up);
                        kmem_cache_free(uid_cachep, up);
                        return 0;
                }
        } else {
                up = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
                if (!up)
                        return -EAGAIN;
                up->uid = p->uid;
                up->task_count = 0;
                uid_hash_insert(up, hashent);
        }
        up->task_count += count;
        return 0;
}

__initfunc(void uidcache_init(void))
{
        int i;

        uid_cachep = kmem_cache_create("uid_cache", sizeof(struct uid_taskcount),
                                       0,
                                       SLAB_HWCACHE_ALIGN, NULL, NULL);
        if (!uid_cachep)
                panic("Cannot create uid taskcount SLAB cache\n");

        for (i = 0; i < UIDHASH_SZ; i++)
                uidhash[i] = 0;
}
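
/*
 * find_empty_process() enforces the process limits before a new task
 * is created: non-root callers are charged against their per-UID
 * count and may not consume the last MIN_TASKS_LEFT_FOR_ROOT slots.
 * On success it returns the index of a free slot in the task[] array.
 */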
static inline int find_empty_process(void)
{
        struct task_struct **tslot;

        if (current->uid) {
                int error;

                if (nr_tasks >= NR_TASKS - MIN_TASKS_LEFT_FOR_ROOT)
                        return -EAGAIN;
                if ((error = charge_uid(current, 1)) < 0)
                        return error;
        }
        tslot = get_free_taskslot();
        if (tslot)
                return tslot - &task[0];
        return -EAGAIN;
}

#ifdef __SMP__
/* Protects next_safe and last_pid. */
static spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;
#endif
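
/*
 * get_pid() hands out the pid for a new task.  With CLONE_PID the
 * child simply inherits the caller's pid.  Otherwise last_pid is
 * advanced (wrapping back to 300 to skip the low pids used by
 * daemons), and the task list is rescanned only when the cached
 * next_safe boundary is reached, so most forks avoid the full walk.
 * A candidate is rejected if any task already uses it as a pid,
 * process group or session id.
 */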
static int get_pid(unsigned long flags)
{
        static int next_safe = PID_MAX;
        struct task_struct *p;

        if (flags & CLONE_PID)
                return current->pid;

        spin_lock(&lastpid_lock);
        if ((++last_pid) & 0xffff8000) {
                last_pid = 300;		/* Skip daemons etc. */
                goto inside;
        }
        if (last_pid >= next_safe) {
inside:
                next_safe = PID_MAX;
                read_lock(&tasklist_lock);
repeat:
                for_each_task(p) {
                        if (p->pid == last_pid ||
                            p->pgrp == last_pid ||
                            p->session == last_pid) {
                                if (++last_pid >= next_safe) {
                                        if (last_pid & 0xffff8000)
                                                last_pid = 300;
                                        next_safe = PID_MAX;
                                }
                                goto repeat;
                        }
                        if (p->pid > last_pid && next_safe > p->pid)
                                next_safe = p->pid;
                        if (p->pgrp > last_pid && next_safe > p->pgrp)
                                next_safe = p->pgrp;
                        if (p->session > last_pid && next_safe > p->session)
                                next_safe = p->session;
                }
                read_unlock(&tasklist_lock);
        }
        spin_unlock(&lastpid_lock);

        return last_pid;
}
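
/*
 * dup_mmap() duplicates the parent's vma list into the new mm: each
 * vma is copied, file references and shared-mapping links are
 * updated, and copy_page_range() clones the corresponding page table
 * entries.  On failure the partially built list is left linked so
 * that exit_mmap() can tear it down.
 */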
static inline int dup_mmap(struct mm_struct * mm)
{
        struct vm_area_struct * mpnt, *tmp, **pprev;
        int retval;

        flush_cache_mm(current->mm);
        pprev = &mm->mmap;
        for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
                struct file *file;

                retval = -ENOMEM;
                tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
                if (!tmp)
                        goto fail_nomem;
                *tmp = *mpnt;
                tmp->vm_flags &= ~VM_LOCKED;
                tmp->vm_mm = mm;
                mm->map_count++;
                tmp->vm_next = NULL;
                file = tmp->vm_file;
                if (file) {
                        file->f_count++;
                        if (tmp->vm_flags & VM_DENYWRITE)
                                file->f_dentry->d_inode->i_writecount--;

                        /* insert tmp into the share list, just after mpnt */
                        if ((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
                                mpnt->vm_next_share->vm_pprev_share =
                                        &tmp->vm_next_share;
                        mpnt->vm_next_share = tmp;
                        tmp->vm_pprev_share = &mpnt->vm_next_share;
                }

                /* Copy the pages, but defer checking for errors */
                retval = copy_page_range(mm, current->mm, tmp);
                if (!retval && tmp->vm_ops && tmp->vm_ops->open)
                        tmp->vm_ops->open(tmp);

                /*
                 * Link in the new vma even if an error occurred,
                 * so that exit_mmap() can clean up the mess.
                 */
                if ((tmp->vm_next = *pprev) != NULL)
                        (*pprev)->vm_pprev = &tmp->vm_next;
                *pprev = tmp;
                tmp->vm_pprev = pprev;

                pprev = &tmp->vm_next;
                if (retval)
                        goto fail_nomem;
        }
        retval = 0;

fail_nomem:
        flush_tlb_mm(current->mm);
        return retval;
}

/*
 * Allocate and initialize an mm_struct.
 *
 * NOTE! The mm mutex will be locked until the
 * caller decides that all systems are go..
 */
struct mm_struct * mm_alloc(void)
{
        struct mm_struct * mm;

        mm = kmem_cache_alloc(mm_cachep, SLAB_KERNEL);
        if (mm) {
                *mm = *current->mm;
                init_new_context(mm);
                mm->count = 1;
                mm->map_count = 0;
                mm->def_flags = 0;
                mm->mmap_sem = MUTEX_LOCKED;
                /*
                 * Leave mm->pgd set to the parent's pgd
                 * so that pgd_offset() is always valid.
                 */
                mm->mmap = mm->mmap_cache = NULL;

                /* It has not run yet, so cannot be present in anyone's
                 * cache or tlb.
                 */
                mm->cpu_vm_mask = 0;
        }
        return mm;
}

/*
 * Decrement the use count and release all resources for an mm.
 */
void mmput(struct mm_struct *mm)
{
        if (!--mm->count) {
                exit_mmap(mm);
                free_page_tables(mm);
                kmem_cache_free(mm_cachep, mm);
        }
}
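
/*
 * copy_mm(): with CLONE_VM the child shares the parent's mm_struct
 * (just bump the reference count and reuse the page directory);
 * otherwise allocate a fresh mm, build new page tables and duplicate
 * the parent's mappings via dup_mmap().
 */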
static inline int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
{
        struct mm_struct * mm;
        int retval;

        if (clone_flags & CLONE_VM) {
                mmget(current->mm);
                SET_PAGE_DIR(tsk, current->mm->pgd);
                return 0;
        }

        retval = -ENOMEM;
        mm = mm_alloc();
        if (!mm)
                goto fail_nomem;

        tsk->mm = mm;
        tsk->min_flt = tsk->maj_flt = 0;
        tsk->cmin_flt = tsk->cmaj_flt = 0;
        tsk->nswap = tsk->cnswap = 0;
        retval = new_page_tables(tsk);
        if (retval)
                goto free_mm;
        retval = dup_mmap(mm);
        if (retval)
                goto free_pt;
        up(&mm->mmap_sem);
        return 0;

free_mm:
        mm->pgd = NULL;
free_pt:
        tsk->mm = NULL;
        mmput(mm);
fail_nomem:
        return retval;
}

static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
{
        if (clone_flags & CLONE_FS) {
                current->fs->count++;
                return 0;
        }
        tsk->fs = kmalloc(sizeof(*tsk->fs), GFP_KERNEL);
        if (!tsk->fs)
                return -1;
        tsk->fs->count = 1;
        tsk->fs->umask = current->fs->umask;
        tsk->fs->root = dget(current->fs->root);
        tsk->fs->pwd = dget(current->fs->pwd);
        return 0;
}

/* return value is only accurate by +-sizeof(long)*8 fds */
/* XXX make this architecture specific */
static inline int __copy_fdset(unsigned long *d, unsigned long *src)
{
        int i;
        unsigned long *p = src;
        unsigned long *max = src;

        for (i = __FDSET_LONGS; i; --i) {
                if ((*d++ = *p++) != 0)
                        max = p;
        }
        return (max - src)*sizeof(long)*8;
}

static inline int copy_fdset(fd_set *dst, fd_set *src)
{
        return __copy_fdset(dst->fds_bits, src->fds_bits);
}
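
/*
 * copy_files(): with CLONE_FILES the open-file table is shared by
 * reference; otherwise a new files_struct and fd array are allocated
 * and every open struct file gets its f_count bumped.
 */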
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
        struct files_struct *oldf, *newf;
        struct file **old_fds, **new_fds;
        int size, i, error = 0;

        /*
         * A background process may not have any files ...
         */
        oldf = current->files;
        if (!oldf)
                goto out;

        if (clone_flags & CLONE_FILES) {
                oldf->count++;
                goto out;
        }

        tsk->files = NULL;
        error = -ENOMEM;
        newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
        if (!newf)
                goto out;

        /*
         * Allocate the fd array, using get_free_page() if possible.
         * Eventually we want to make the array size variable ...
         */
        size = NR_OPEN * sizeof(struct file *);
        if (size == PAGE_SIZE)
                new_fds = (struct file **) __get_free_page(GFP_KERNEL);
        else
                new_fds = (struct file **) kmalloc(size, GFP_KERNEL);
        if (!new_fds)
                goto out_release;
        memset((void *) new_fds, 0, size);

        newf->count = 1;
        newf->max_fds = NR_OPEN;
        newf->fd = new_fds;
        newf->close_on_exec = oldf->close_on_exec;
        i = copy_fdset(&newf->open_fds, &oldf->open_fds);

        old_fds = oldf->fd;
        for (; i != 0; i--) {
                struct file * f = *old_fds;
                old_fds++;
                *new_fds = f;
                if (f)
                        f->f_count++;
                new_fds++;
        }
        tsk->files = newf;
        error = 0;
out:
        return error;

out_release:
        kmem_cache_free(files_cachep, newf);
        goto out;
}
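
/*
 * copy_sighand(): with CLONE_SIGHAND the signal handler table is
 * shared by reference; otherwise the child gets a private copy of the
 * parent's action array.
 */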
static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
{
        if (clone_flags & CLONE_SIGHAND) {
                atomic_inc(&current->sig->count);
                return 0;
        }
        tsk->sig = kmalloc(sizeof(*tsk->sig), GFP_KERNEL);
        if (!tsk->sig)
                return -1;
        spin_lock_init(&tsk->sig->siglock);
        atomic_set(&tsk->sig->count, 1);
        memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
        return 0;
}

/*
 *  Ok, this is the main fork-routine. It copies the system process
 * information (task[nr]) and sets up the necessary registers. It
 * also copies the data segment in its entirety.
 */
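/*
 * Note: do_fork() itself is not a system call entry point; it is
 * normally invoked from the architecture-specific sys_fork() and
 * sys_clone() wrappers, which supply the clone flags, the new stack
 * pointer and the saved register frame for the child.
 */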
int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs)
{
        int nr;
        int error = -ENOMEM;
        struct task_struct *p;

        lock_kernel();
        p = alloc_task_struct();
        if (!p)
                goto bad_fork;

        error = -EAGAIN;
        nr = find_empty_process();
        if (nr < 0)
                goto bad_fork_free;

        *p = *current;

        if (p->exec_domain && p->exec_domain->module)
                __MOD_INC_USE_COUNT(p->exec_domain->module);
        if (p->binfmt && p->binfmt->module)
                __MOD_INC_USE_COUNT(p->binfmt->module);

        p->did_exec = 0;
        p->swappable = 0;
        p->state = TASK_UNINTERRUPTIBLE;
        p->flags &= ~(PF_PTRACED|PF_TRACESYS|PF_SUPERPRIV);
        p->sigpending = 0;
        p->flags |= PF_FORKNOEXEC;
        p->pid = get_pid(clone_flags);
        p->next_run = NULL;
        p->prev_run = NULL;
        p->p_pptr = p->p_opptr = current;
        p->p_cptr = NULL;
        init_waitqueue(&p->wait_chldexit);
        sigemptyset(&p->signal);
        p->sigqueue = NULL;
        p->sigqueue_tail = &p->sigqueue;
        p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
        p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
        init_timer(&p->real_timer);
        p->real_timer.data = (unsigned long) p;
        p->leader = 0;		/* session leadership doesn't inherit */
        p->tty_old_pgrp = 0;
        p->times.tms_utime = p->times.tms_stime = 0;
        p->times.tms_cutime = p->times.tms_cstime = 0;
#ifdef __SMP__
        {
                int i;
                p->has_cpu = 0;
                p->processor = NO_PROC_ID;
                /* ?? should we just memset this ?? */
                for (i = 0; i < smp_num_cpus; i++)
                        p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
        }
#endif
        p->lock_depth = 0;
        p->start_time = jiffies;
        p->tarray_ptr = &task[nr];
        *p->tarray_ptr = p;

        {
                unsigned long flags;
                write_lock_irqsave(&tasklist_lock, flags);
                SET_LINKS(p);
                hash_pid(p);
                write_unlock_irqrestore(&tasklist_lock, flags);
        }

        nr_tasks++;

        error = -ENOMEM;
        /* copy all the process information */
        if (copy_files(clone_flags, p))
                goto bad_fork_cleanup;
        if (copy_fs(clone_flags, p))
                goto bad_fork_cleanup_files;
        if (copy_sighand(clone_flags, p))
                goto bad_fork_cleanup_fs;
        if (copy_mm(clone_flags, p))
                goto bad_fork_cleanup_sighand;
        error = copy_thread(nr, clone_flags, usp, p, regs);
        if (error)
                goto bad_fork_cleanup_sighand;
        p->semundo = NULL;

        /* ok, now we should be set up.. */
        p->swappable = 1;
        p->exit_signal = clone_flags & CSIGNAL;
        p->pdeath_signal = 0;

        /*
         * "share" dynamic priority between parent and child, thus the
         * total amount of dynamic priorities in the system doesn't change,
         * more scheduling fairness. This is only important in the first
         * timeslice, on the long run the scheduling behaviour is unchanged.
         */
        current->counter >>= 1;
        p->counter = current->counter;

        if (p->pid) {
                wake_up_process(p);		/* do this last, just in case */
        } else {
                p->state = TASK_RUNNING;
                p->next_run = p->prev_run = p;
        }
        ++total_forks;
        error = p->pid;
bad_fork:
        unlock_kernel();
        return error;

bad_fork_cleanup_sighand:
        exit_sighand(p);
bad_fork_cleanup_fs:
        exit_fs(p); /* blocking */
bad_fork_cleanup_files:
        exit_files(p); /* blocking */
bad_fork_cleanup:
        charge_uid(current, -1);
        if (p->exec_domain && p->exec_domain->module)
                __MOD_DEC_USE_COUNT(p->exec_domain->module);
        if (p->binfmt && p->binfmt->module)
                __MOD_DEC_USE_COUNT(p->binfmt->module);
        add_free_taskslot(p->tarray_ptr);
        {
                unsigned long flags;
                write_lock_irqsave(&tasklist_lock, flags);
                unhash_pid(p);
                REMOVE_LINKS(p);
                write_unlock_irqrestore(&tasklist_lock, flags);
        }
        nr_tasks--;
bad_fork_free:
        free_task_struct(p);
        goto bad_fork;
}
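
/*
 * Slab constructor for files_struct objects: start every object out
 * zeroed so a freshly allocated table has no stale file pointers.
 */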
static void files_ctor(void *fp, kmem_cache_t *cachep, unsigned long flags)
{
        struct files_struct *f = fp;

        memset(f, 0, sizeof(*f));
}

__initfunc(void filescache_init(void))
{
        files_cachep = kmem_cache_create("files_cache",
                                         sizeof(struct files_struct),
                                         0,
                                         SLAB_HWCACHE_ALIGN,
                                         files_ctor, NULL);
        if (!files_cachep)
                panic("Cannot create files cache");
}