/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also system_call.s).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/mm.c': 'copy_page_tables()'
 */

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/unistd.h>
#include <linux/ptrace.h>
#include <linux/malloc.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/module.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
#include <asm/uaccess.h>

int nr_tasks = 1;
int nr_running = 1;
unsigned long int total_forks = 0;	/* Handle normal Linux uptimes. */
int last_pid = 0;

/* SLAB cache for mm_struct's. */
kmem_cache_t *mm_cachep;

/* SLAB cache for files structs */
kmem_cache_t *files_cachep;

struct task_struct *pidhash[PIDHASH_SZ];

struct task_struct **tarray_freelist = NULL;
spinlock_t taskslot_lock = SPIN_LOCK_UNLOCKED;

/* UID task count cache, to prevent walking entire process list every
 * single fork() operation.
 */
#define UIDHASH_SZ	(PIDHASH_SZ >> 2)

static struct uid_taskcount {
        struct uid_taskcount *next, **pprev;
        unsigned short uid;
        int task_count;
} *uidhash[UIDHASH_SZ];

#ifdef __SMP__
static spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED;
#endif

kmem_cache_t *uid_cachep;

#define uidhashfn(uid)	(((uid >> 8) ^ uid) & (UIDHASH_SZ - 1))
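
/*
 * The helpers below maintain the per-UID hash chains.  Insertion,
 * removal and lookup are all done under uidhash_lock; on non-SMP
 * builds the spin_lock()/spin_unlock() calls compile away, which is
 * why the lock itself is only declared for __SMP__.
 */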
static inline void uid_hash_insert(struct uid_taskcount *up, unsigned int hashent)
{
        spin_lock(&uidhash_lock);
        if ((up->next = uidhash[hashent]) != NULL)
                uidhash[hashent]->pprev = &up->next;
        up->pprev = &uidhash[hashent];
        uidhash[hashent] = up;
        spin_unlock(&uidhash_lock);
}

static inline void uid_hash_remove(struct uid_taskcount *up)
{
        spin_lock(&uidhash_lock);
        if (up->next)
                up->next->pprev = up->pprev;
        *up->pprev = up->next;
        spin_unlock(&uidhash_lock);
}

static inline struct uid_taskcount *uid_find(unsigned short uid, unsigned int hashent)
{
        struct uid_taskcount *up;

        spin_lock(&uidhash_lock);
        for (up = uidhash[hashent]; (up && up->uid != uid); up = up->next)
                ;
        spin_unlock(&uidhash_lock);
        return up;
}
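
/*
 * charge_uid() adjusts the per-UID task count by 'count': the fork
 * path below charges +1 and the fork failure path gives the slot back
 * with -1.  It returns -EAGAIN once the new count would exceed the
 * caller's RLIMIT_NPROC limit, and frees the hash entry again when
 * the count drops to zero.
 */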
int charge_uid(struct task_struct *p, int count)
{
        unsigned int hashent = uidhashfn(p->uid);
        struct uid_taskcount *up = uid_find(p->uid, hashent);

        if (up) {
                int limit = p->rlim[RLIMIT_NPROC].rlim_cur;
                int newcnt = up->task_count + count;

                if (newcnt > limit)
                        return -EAGAIN;
                else if (newcnt == 0) {
                        uid_hash_remove(up);
                        kmem_cache_free(uid_cachep, up);
                        return 0;
                }
        } else {
                up = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
                if (!up)
                        return -EAGAIN;
                up->uid = p->uid;
                up->task_count = 0;
                uid_hash_insert(up, hashent);
        }
        up->task_count += count;
        return 0;
}

__initfunc(void uidcache_init(void))
{
        int i;

        uid_cachep = kmem_cache_create("uid_cache", sizeof(struct uid_taskcount),
                                       0,
                                       SLAB_HWCACHE_ALIGN, NULL, NULL);
        if (!uid_cachep)
                panic("Cannot create uid taskcount SLAB cache\n");

        for (i = 0; i < UIDHASH_SZ; i++)
                uidhash[i] = 0;
}
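
/*
 * find_empty_process() enforces the process limits before a new task
 * is created: non-root callers are charged against their per-UID
 * count and may not consume the last MIN_TASKS_LEFT_FOR_ROOT slots.
 * On success it returns the index of a free slot in the task[] array.
 */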
static inline int find_empty_process(void)
{
        struct task_struct **tslot;

        if (current->uid) {
                int error;

                if (nr_tasks >= NR_TASKS - MIN_TASKS_LEFT_FOR_ROOT)
                        return -EAGAIN;
                if ((error = charge_uid(current, 1)) < 0)
                        return error;
        }
        tslot = get_free_taskslot();
        if (tslot)
                return tslot - &task[0];
        return -EAGAIN;
}

#ifdef __SMP__
/* Protects next_safe and last_pid. */
static spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;
#endif
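
/*
 * get_pid() hands out the pid for a new task.  With CLONE_PID the
 * child simply inherits the caller's pid.  Otherwise last_pid is
 * advanced (wrapping back to 300 to skip the low pids used by
 * daemons), and the task list is rescanned only when the cached
 * next_safe boundary is reached, so most forks avoid the full walk.
 * A candidate is rejected if any task already uses it as a pid,
 * process group or session id.
 */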
static int get_pid(unsigned long flags)
{
        static int next_safe = PID_MAX;
        struct task_struct *p;

        if (flags & CLONE_PID)
                return current->pid;

        spin_lock(&lastpid_lock);
        if ((++last_pid) & 0xffff8000) {
                last_pid = 300;		/* Skip daemons etc. */
                goto inside;
        }
        if (last_pid >= next_safe) {
inside:
                next_safe = PID_MAX;
                read_lock(&tasklist_lock);
repeat:
                for_each_task(p) {
                        if (p->pid == last_pid ||
                            p->pgrp == last_pid ||
                            p->session == last_pid) {
                                if (++last_pid >= next_safe) {
                                        if (last_pid & 0xffff8000)
                                                last_pid = 300;
                                        next_safe = PID_MAX;
                                }
                                goto repeat;
                        }
                        if (p->pid > last_pid && next_safe > p->pid)
                                next_safe = p->pid;
                        if (p->pgrp > last_pid && next_safe > p->pgrp)
                                next_safe = p->pgrp;
                        if (p->session > last_pid && next_safe > p->session)
                                next_safe = p->session;
                }
                read_unlock(&tasklist_lock);
        }
        spin_unlock(&lastpid_lock);

        return last_pid;
}
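
/*
 * dup_mmap() duplicates the parent's vma list into the new mm: each
 * vma is copied, file references and shared-mapping links are
 * updated, and copy_page_range() clones the corresponding page table
 * entries.  On failure the partially built list is left linked so
 * that exit_mmap() can tear it down.
 */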
static inline int dup_mmap(struct mm_struct * mm)
{
        struct vm_area_struct * mpnt, *tmp, **pprev;
        int retval;

        flush_cache_mm(current->mm);
        pprev = &mm->mmap;
        for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
                struct file *file;

                retval = -ENOMEM;
                tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
                if (!tmp)
                        goto fail_nomem;
                *tmp = *mpnt;
                tmp->vm_flags &= ~VM_LOCKED;
                tmp->vm_mm = mm;
                mm->map_count++;
                tmp->vm_next = NULL;
                file = tmp->vm_file;
                if (file) {
                        file->f_count++;
                        if (tmp->vm_flags & VM_DENYWRITE)
                                file->f_dentry->d_inode->i_writecount--;

                        /* insert tmp into the share list, just after mpnt */
                        if ((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
                                mpnt->vm_next_share->vm_pprev_share =
                                        &tmp->vm_next_share;
                        mpnt->vm_next_share = tmp;
                        tmp->vm_pprev_share = &mpnt->vm_next_share;
                }

                /* Copy the pages, but defer checking for errors */
                retval = copy_page_range(mm, current->mm, tmp);
                if (!retval && tmp->vm_ops && tmp->vm_ops->open)
                        tmp->vm_ops->open(tmp);

                /*
                 * Link in the new vma even if an error occurred,
                 * so that exit_mmap() can clean up the mess.
                 */
                if ((tmp->vm_next = *pprev) != NULL)
                        (*pprev)->vm_pprev = &tmp->vm_next;
                *pprev = tmp;
                tmp->vm_pprev = pprev;

                pprev = &tmp->vm_next;
                if (retval)
                        goto fail_nomem;
        }
        retval = 0;

fail_nomem:
        flush_tlb_mm(current->mm);
        return retval;
}

/*
 * Allocate and initialize an mm_struct.
 *
 * NOTE! The mm mutex will be locked until the
 * caller decides that all systems are go..
 */
struct mm_struct * mm_alloc(void)
{
        struct mm_struct * mm;

        mm = kmem_cache_alloc(mm_cachep, SLAB_KERNEL);
        if (mm) {
                *mm = *current->mm;
                init_new_context(mm);
                mm->count = 1;
                mm->map_count = 0;
                mm->def_flags = 0;
                mm->mmap_sem = MUTEX_LOCKED;
                /*
                 * Leave mm->pgd set to the parent's pgd
                 * so that pgd_offset() is always valid.
                 */
                mm->mmap = mm->mmap_cache = NULL;

                /* It has not run yet, so cannot be present in anyone's
                 * cache or tlb.
                 */
                mm->cpu_vm_mask = 0;
        }
        return mm;
}

/*
 * Decrement the use count and release all resources for an mm.
 */
void mmput(struct mm_struct *mm)
{
        if (!--mm->count) {
                exit_mmap(mm);
                free_page_tables(mm);
                kmem_cache_free(mm_cachep, mm);
        }
}
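
/*
 * copy_mm(): with CLONE_VM the child shares the parent's mm_struct
 * (just bump the reference count and reuse the page directory);
 * otherwise allocate a fresh mm, build new page tables and duplicate
 * the parent's mappings via dup_mmap().
 */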
static inline int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
{
        struct mm_struct * mm;
        int retval;

        if (clone_flags & CLONE_VM) {
                mmget(current->mm);
                SET_PAGE_DIR(tsk, current->mm->pgd);
                return 0;
        }

        retval = -ENOMEM;
        mm = mm_alloc();
        if (!mm)
                goto fail_nomem;

        tsk->mm = mm;
        tsk->min_flt = tsk->maj_flt = 0;
        tsk->cmin_flt = tsk->cmaj_flt = 0;
        tsk->nswap = tsk->cnswap = 0;
        retval = new_page_tables(tsk);
        if (retval)
                goto free_mm;
        retval = dup_mmap(mm);
        if (retval)
                goto free_pt;
        up(&mm->mmap_sem);
        return 0;

free_mm:
        mm->pgd = NULL;
free_pt:
        tsk->mm = NULL;
        mmput(mm);
fail_nomem:
        return retval;
}

static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
{
        if (clone_flags & CLONE_FS) {
                current->fs->count++;
                return 0;
        }
        tsk->fs = kmalloc(sizeof(*tsk->fs), GFP_KERNEL);
        if (!tsk->fs)
                return -1;
        tsk->fs->count = 1;
        tsk->fs->umask = current->fs->umask;
        tsk->fs->root = dget(current->fs->root);
        tsk->fs->pwd = dget(current->fs->pwd);
        return 0;
}

/* return value is only accurate by +-sizeof(long)*8 fds */
/* XXX make this architecture specific */
static inline int __copy_fdset(unsigned long *d, unsigned long *src)
{
        int i;
        unsigned long *p = src;
        unsigned long *max = src;

        for (i = __FDSET_LONGS; i; --i) {
                if ((*d++ = *p++) != 0)
                        max = p;
        }
        return (max - src)*sizeof(long)*8;
}

static inline int copy_fdset(fd_set *dst, fd_set *src)
{
        return __copy_fdset(dst->fds_bits, src->fds_bits);
}
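
/*
 * copy_files(): with CLONE_FILES the open-file table is shared by
 * reference; otherwise a new files_struct and fd array are allocated
 * and every open struct file gets its f_count bumped.
 */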
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
        struct files_struct *oldf, *newf;
        struct file **old_fds, **new_fds;
        int size, i, error = 0;

        /*
         * A background process may not have any files ...
         */
        oldf = current->files;
        if (!oldf)
                goto out;

        if (clone_flags & CLONE_FILES) {
                oldf->count++;
                goto out;
        }

        tsk->files = NULL;
        error = -ENOMEM;
        newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
        if (!newf)
                goto out;

        /*
         * Allocate the fd array, using get_free_page() if possible.
         * Eventually we want to make the array size variable ...
         */
        size = NR_OPEN * sizeof(struct file *);
        if (size == PAGE_SIZE)
                new_fds = (struct file **) __get_free_page(GFP_KERNEL);
        else
                new_fds = (struct file **) kmalloc(size, GFP_KERNEL);
        if (!new_fds)
                goto out_release;
        memset((void *) new_fds, 0, size);

        newf->count = 1;
        newf->max_fds = NR_OPEN;
        newf->fd = new_fds;
        newf->close_on_exec = oldf->close_on_exec;
        i = copy_fdset(&newf->open_fds, &oldf->open_fds);

        old_fds = oldf->fd;
        for (; i != 0; i--) {
                struct file * f = *old_fds;
                old_fds++;
                *new_fds = f;
                if (f)
                        f->f_count++;
                new_fds++;
        }
        tsk->files = newf;
        error = 0;
out:
        return error;

out_release:
        kmem_cache_free(files_cachep, newf);
        goto out;
}
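
/*
 * copy_sighand(): with CLONE_SIGHAND the signal handler table is
 * shared by reference; otherwise the child gets a private copy of the
 * parent's action array.
 */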
static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
{
        if (clone_flags & CLONE_SIGHAND) {
                atomic_inc(&current->sig->count);
                return 0;
        }
        tsk->sig = kmalloc(sizeof(*tsk->sig), GFP_KERNEL);
        if (!tsk->sig)
                return -1;
        spin_lock_init(&tsk->sig->siglock);
        atomic_set(&tsk->sig->count, 1);
        memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
        return 0;
}

/*
 *  Ok, this is the main fork-routine. It copies the system process
 * information (task[nr]) and sets up the necessary registers. It
 * also copies the data segment in its entirety.
 */
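/*
 * Note: do_fork() itself is not a system call entry point; it is
 * normally invoked from the architecture-specific sys_fork() and
 * sys_clone() wrappers, which supply the clone flags, the new stack
 * pointer and the saved register frame for the child.
 */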
int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs)
{
        int nr;
        int error = -ENOMEM;
        struct task_struct *p;

        lock_kernel();
        p = alloc_task_struct();
        if (!p)
                goto bad_fork;

        error = -EAGAIN;
        nr = find_empty_process();
        if (nr < 0)
                goto bad_fork_free;

        *p = *current;

        if (p->exec_domain && p->exec_domain->module)
                __MOD_INC_USE_COUNT(p->exec_domain->module);
        if (p->binfmt && p->binfmt->module)
                __MOD_INC_USE_COUNT(p->binfmt->module);

        p->did_exec = 0;
        p->swappable = 0;
        p->state = TASK_UNINTERRUPTIBLE;
        p->flags &= ~(PF_PTRACED|PF_TRACESYS|PF_SUPERPRIV);
        p->sigpending = 0;
        p->flags |= PF_FORKNOEXEC;
        p->pid = get_pid(clone_flags);
        p->next_run = NULL;
        p->prev_run = NULL;
        p->p_pptr = p->p_opptr = current;
        p->p_cptr = NULL;
        init_waitqueue(&p->wait_chldexit);
        sigemptyset(&p->signal);
        p->sigqueue = NULL;
        p->sigqueue_tail = &p->sigqueue;
        p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
        p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
        init_timer(&p->real_timer);
        p->real_timer.data = (unsigned long) p;
        p->leader = 0;		/* session leadership doesn't inherit */
        p->tty_old_pgrp = 0;
        p->times.tms_utime = p->times.tms_stime = 0;
        p->times.tms_cutime = p->times.tms_cstime = 0;
#ifdef __SMP__
        {
                int i;
                p->has_cpu = 0;
                p->processor = NO_PROC_ID;
                /* ?? should we just memset this ?? */
                for (i = 0; i < smp_num_cpus; i++)
                        p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
        }
#endif
        p->lock_depth = 0;
        p->start_time = jiffies;
        p->tarray_ptr = &task[nr];
        *p->tarray_ptr = p;

        {
                unsigned long flags;
                write_lock_irqsave(&tasklist_lock, flags);
                SET_LINKS(p);
                hash_pid(p);
                write_unlock_irqrestore(&tasklist_lock, flags);
        }

        nr_tasks++;

        error = -ENOMEM;
        /* copy all the process information */
        if (copy_files(clone_flags, p))
                goto bad_fork_cleanup;
        if (copy_fs(clone_flags, p))
                goto bad_fork_cleanup_files;
        if (copy_sighand(clone_flags, p))
                goto bad_fork_cleanup_fs;
        if (copy_mm(clone_flags, p))
                goto bad_fork_cleanup_sighand;
        error = copy_thread(nr, clone_flags, usp, p, regs);
        if (error)
                goto bad_fork_cleanup_sighand;
        p->semundo = NULL;

        /* ok, now we should be set up.. */
        p->swappable = 1;
        p->exit_signal = clone_flags & CSIGNAL;
        p->pdeath_signal = 0;

        /*
         * "share" dynamic priority between parent and child, thus the
         * total amount of dynamic priorities in the system doesn't change,
         * more scheduling fairness. This is only important in the first
         * timeslice, on the long run the scheduling behaviour is unchanged.
         */
        current->counter >>= 1;
        p->counter = current->counter;

        if (p->pid) {
                wake_up_process(p);		/* do this last, just in case */
        } else {
                p->state = TASK_RUNNING;
                p->next_run = p->prev_run = p;
        }
        ++total_forks;
        error = p->pid;
bad_fork:
        unlock_kernel();
        return error;

bad_fork_cleanup_sighand:
        exit_sighand(p);
bad_fork_cleanup_fs:
        exit_fs(p); /* blocking */
bad_fork_cleanup_files:
        exit_files(p); /* blocking */
bad_fork_cleanup:
        charge_uid(current, -1);
        if (p->exec_domain && p->exec_domain->module)
                __MOD_DEC_USE_COUNT(p->exec_domain->module);
        if (p->binfmt && p->binfmt->module)
                __MOD_DEC_USE_COUNT(p->binfmt->module);
        add_free_taskslot(p->tarray_ptr);
        {
                unsigned long flags;
                write_lock_irqsave(&tasklist_lock, flags);
                unhash_pid(p);
                REMOVE_LINKS(p);
                write_unlock_irqrestore(&tasklist_lock, flags);
        }
        nr_tasks--;
bad_fork_free:
        free_task_struct(p);
        goto bad_fork;
}
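
/*
 * Slab constructor for files_struct objects: start every object out
 * zeroed so a freshly allocated table has no stale file pointers.
 */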
static void files_ctor(void *fp, kmem_cache_t *cachep, unsigned long flags)
{
        struct files_struct *f = fp;

        memset(f, 0, sizeof(*f));
}

__initfunc(void filescache_init(void))
{
        files_cachep = kmem_cache_create("files_cache",
                                         sizeof(struct files_struct),
                                         0,
                                         SLAB_HWCACHE_ALIGN,
                                         files_ctor, NULL);
        if (!files_cachep)
                panic("Cannot create files cache");
}