/*
 *  linux/fs/exec.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * #!-checking implemented by tytso.
 */
/*
 * Demand-loading implemented 01.12.91 - no need to read anything but
 * the header into memory. The inode of the executable is put into
 * "current->executable", and page faults do the actual loading. Clean.
 *
 * Once more I can proudly say that linux stood up to being changed: it
 * was less than 2 hours work to get demand-loading completely implemented.
 *
 * Demand loading changed July 1993 by Eric Youngdale.   Use mmap instead,
 * current->executable is only used by the procfs.  This allows a dispatch
 * table to check for several different types of binary formats.  We keep
 * trying until we recognize the file or we run out of supported binary
 * formats.
 */

#include <linux/config.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/mman.h>
#include <linux/a.out.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#define __NO_VERSION__
#include <linux/module.h>

#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/mmu_context.h>

#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif

static struct linux_binfmt *formats;
static rwlock_t binfmt_lock = RW_LOCK_UNLOCKED;

int register_binfmt(struct linux_binfmt * fmt)
{
	struct linux_binfmt ** tmp = &formats;

	if (!fmt)
		return -EINVAL;
	if (fmt->next)
		return -EBUSY;
	write_lock(&binfmt_lock);
	while (*tmp) {
		if (fmt == *tmp) {
			write_unlock(&binfmt_lock);
			return -EBUSY;
		}
		tmp = &(*tmp)->next;
	}
	fmt->next = formats;
	formats = fmt;
	write_unlock(&binfmt_lock);
	return 0;
}
int unregister_binfmt(struct linux_binfmt * fmt)
{
	struct linux_binfmt ** tmp = &formats;

	write_lock(&binfmt_lock);
	while (*tmp) {
		if (fmt == *tmp) {
			*tmp = fmt->next;
			write_unlock(&binfmt_lock);
			return 0;
		}
		tmp = &(*tmp)->next;
	}
	write_unlock(&binfmt_lock);
	return -EINVAL;
}
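/*
 * Usage sketch (illustrative, not part of this file): a format handler
 * fills in a struct linux_binfmt and registers it at init time, and
 * unregisters it on module unload.  The names example_format and
 * load_example_binary below are hypothetical.
 *
 *	static struct linux_binfmt example_format = {
 *		NULL, THIS_MODULE, load_example_binary, NULL, NULL, 0
 *	};
 *
 *	static int __init init_example_binfmt(void)
 *	{
 *		return register_binfmt(&example_format);
 *	}
 */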
static inline void put_binfmt(struct linux_binfmt * fmt)
{
	if (fmt->module)
		__MOD_DEC_USE_COUNT(fmt->module);
}

/*
 * Note that a shared library must be both readable and executable due to
 * security reasons.
 *
 * Also note that we take the address to load from from the file itself.
 */
asmlinkage long sys_uselib(const char * library)
{
	struct file * file;
	struct nameidata nd;
	int error;

	error = user_path_walk(library, &nd);
	if (error)
		goto out;

	error = -EINVAL;
	if (!S_ISREG(nd.dentry->d_inode->i_mode))
		goto exit;

	error = permission(nd.dentry->d_inode, MAY_READ | MAY_EXEC);
	if (error)
		goto exit;

	file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
	error = PTR_ERR(file);
	if (IS_ERR(file))
		goto out;

	error = -ENOEXEC;
	if (file->f_op && file->f_op->read) {
		struct linux_binfmt * fmt;

		read_lock(&binfmt_lock);
		for (fmt = formats ; fmt ; fmt = fmt->next) {
			if (!fmt->load_shlib)
				continue;
			if (!try_inc_mod_count(fmt->module))
				continue;
			read_unlock(&binfmt_lock);
			error = fmt->load_shlib(file);
			read_lock(&binfmt_lock);
			put_binfmt(fmt);
			if (error != -ENOEXEC)
				break;
		}
		read_unlock(&binfmt_lock);
	}
	fput(file);
out:
	return error;
exit:
	path_release(&nd);
	goto out;
}
/*
 * count() counts the number of arguments/envelopes
 */
static int count(char ** argv, int max)
{
	int i = 0;

	if (argv != NULL) {
		for (;;) {
			char * p;
			int error;

			error = get_user(p, argv);
			if (error)
				return error;
			if (!p)
				break;
			argv++;
			if (++i > max)
				return -E2BIG;
		}
	}
	return i;
}
/*
 * 'copy_strings()' copies argument/envelope strings from user
 * memory to free pages in kernel mem. These are in a format ready
 * to be put directly into the top of new user memory.
 */
int copy_strings(int argc, char ** argv, struct linux_binprm *bprm)
{
	while (argc-- > 0) {
		char *str;
		int len;
		unsigned long pos;

		if (get_user(str, argv+argc) || !str || !(len = strnlen_user(str, bprm->p)))
			return -EFAULT;
		if (bprm->p < len)
			return -E2BIG;

		bprm->p -= len;
		/* XXX: add architecture specific overflow check here. */

		pos = bprm->p;
		while (len > 0) {
			char *kaddr;
			int i, new, err;
			struct page *page;
			int offset, bytes_to_copy;

			offset = pos % PAGE_SIZE;
			i = pos/PAGE_SIZE;
			page = bprm->page[i];
			new = 0;
			if (!page) {
				page = alloc_page(GFP_HIGHUSER);
				bprm->page[i] = page;
				if (!page)
					return -ENOMEM;
				new = 1;
			}
			kaddr = kmap(page);

			if (new && offset)
				memset(kaddr, 0, offset);
			bytes_to_copy = PAGE_SIZE - offset;
			if (bytes_to_copy > len) {
				bytes_to_copy = len;
				if (new)
					memset(kaddr+offset+len, 0, PAGE_SIZE-offset-len);
			}
			err = copy_from_user(kaddr + offset, str, bytes_to_copy);
			kunmap(page);

			if (err)
				return -EFAULT;

			pos += bytes_to_copy;
			str += bytes_to_copy;
			len -= bytes_to_copy;
		}
	}
	return 0;
}
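/*
 * Note on the layout built above: bprm->p starts at the top of the
 * MAX_ARG_PAGES window (see do_execve()) and is decremented by each
 * string's length before the copy, so the strings are laid out from the
 * top down.  setup_arg_pages() later just relocates the whole window to
 * the top of the new user stack.
 */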
/*
 * Like copy_strings, but get argv and its values from kernel memory.
 */
int copy_strings_kernel(int argc, char ** argv, struct linux_binprm *bprm)
{
	int r;
	mm_segment_t oldfs = get_fs();
	set_fs(KERNEL_DS);
	r = copy_strings(argc, argv, bprm);
	set_fs(oldfs);
	return r;
}
/*
 * This routine is used to map in a page into an address space: needed by
 * execve() for the initial stack and environment pages.
 */
void put_dirty_page(struct task_struct * tsk, struct page *page, unsigned long address)
{
	pgd_t * pgd;
	pmd_t * pmd;
	pte_t * pte;

	if (page_count(page) != 1)
		printk("mem_map disagrees with %p at %08lx\n", page, address);
	pgd = pgd_offset(tsk->mm, address);
	pmd = pmd_alloc(pgd, address);
	if (!pmd) {
		__free_page(page);
		force_sig(SIGKILL, tsk);
		return;
	}
	pte = pte_alloc(pmd, address);
	if (!pte) {
		__free_page(page);
		force_sig(SIGKILL, tsk);
		return;
	}
	if (!pte_none(*pte)) {
		pte_ERROR(*pte);
		__free_page(page);
		return;
	}
	flush_dcache_page(page);
	flush_page_to_ram(page);
	set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY))));
	/* no need for flush_tlb */
}
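/*
 * The pte above is made writable and dirty even though the vma uses the
 * copy-on-write protection PAGE_COPY: these pages were just filled by the
 * kernel and belong solely to the new mm, so taking a COW fault on the
 * first userspace write would be pointless.
 */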
int setup_arg_pages(struct linux_binprm *bprm)
{
	unsigned long stack_base;
	struct vm_area_struct *mpnt;
	int i;

	stack_base = STACK_TOP - MAX_ARG_PAGES*PAGE_SIZE;

	bprm->p += stack_base;
	if (bprm->loader)
		bprm->loader += stack_base;
	bprm->exec += stack_base;

	mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (!mpnt)
		return -ENOMEM;

	down(&current->mm->mmap_sem);
	{
		mpnt->vm_mm = current->mm;
		mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
		mpnt->vm_end = STACK_TOP;
		mpnt->vm_page_prot = PAGE_COPY;
		mpnt->vm_flags = VM_STACK_FLAGS;
		mpnt->vm_ops = NULL;
		mpnt->vm_pgoff = 0;
		mpnt->vm_file = NULL;
		mpnt->vm_private_data = (void *) 0;
		insert_vm_struct(current->mm, mpnt);
		current->mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
	}

	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
		struct page *page = bprm->page[i];
		if (page) {
			bprm->page[i] = NULL;
			current->mm->rss++;
			put_dirty_page(current, page, stack_base);
		}
		stack_base += PAGE_SIZE;
	}
	up(&current->mm->mmap_sem);

	return 0;
}
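/*
 * After this returns, the vma spans [bprm->p & PAGE_MASK, STACK_TOP) and
 * every page that copy_strings() filled has been wired in with
 * put_dirty_page(), so the new image starts life with argv/envp already
 * sitting at the top of its stack.
 */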
struct file *open_exec(const char *name)
{
	struct nameidata nd;
	struct inode *inode;
	struct file *file;
	int err = 0;

	if (path_init(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd))
		err = path_walk(name, &nd);
	file = ERR_PTR(err);
	if (!err) {
		inode = nd.dentry->d_inode;
		file = ERR_PTR(-EACCES);
		if (!IS_NOEXEC(inode) && S_ISREG(inode->i_mode)) {
			int err = permission(inode, MAY_EXEC);
			file = ERR_PTR(err);
			if (!err) {
				file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
				if (!IS_ERR(file)) {
					err = deny_write_access(file);
					if (err) {
						fput(file);
						file = ERR_PTR(err);
					}
				}
out:
				return file;
			}
		}
		path_release(&nd);
	}
	goto out;
}
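/*
 * deny_write_access() makes subsequent writers see -ETXTBSY while the
 * file is being executed; every failure path after a successful
 * open_exec() must balance it with allow_write_access() (as do_execve()
 * and search_binary_handler() do below).
 */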
int kernel_read(struct file *file, unsigned long offset,
	char * addr, unsigned long count)
{
	mm_segment_t old_fs;
	loff_t pos = offset;
	int result = -ENOSYS;

	if (!file->f_op->read)
		goto fail;
	old_fs = get_fs();
	set_fs(get_ds());
	result = file->f_op->read(file, addr, count, &pos);
	set_fs(old_fs);
fail:
	return result;
}
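/*
 * Typical use (illustrative): a load_binary handler that needs more than
 * the BINPRM_BUF_SIZE bytes already read into bprm->buf can fetch further
 * header data itself; hdr_off, hdr and hdr_len here are hypothetical:
 *
 *	retval = kernel_read(bprm->file, hdr_off, (char *) hdr, hdr_len);
 *	if (retval != hdr_len)
 *		return -ENOEXEC;
 */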
static int exec_mmap(void)
{
	struct mm_struct * mm, * old_mm;

	old_mm = current->mm;
	if (old_mm && atomic_read(&old_mm->mm_users) == 1) {
		flush_cache_mm(old_mm);
		mm_release();
		exit_mmap(old_mm);
		flush_tlb_mm(old_mm);
		return 0;
	}

	mm = mm_alloc();
	if (mm) {
		struct mm_struct *active_mm = current->active_mm;

		if (init_new_context(current, mm)) {
			mmdrop(mm);
			return -ENOMEM;
		}
		task_lock(current);
		current->mm = mm;
		current->active_mm = mm;
		task_unlock(current);
		activate_mm(active_mm, mm);
		mm_release();
		if (old_mm) {
			if (active_mm != old_mm) BUG();
			mmput(old_mm);
			return 0;
		}
		mmdrop(active_mm);
		return 0;
	}
	return -ENOMEM;
}
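/*
 * Two cases above: if nobody else is using the old mm, it is flushed and
 * emptied in place with exit_mmap() and reused; if it is shared (e.g.
 * after vfork() or clone(CLONE_VM)), a fresh mm_struct is allocated and
 * switched in, and the old one merely loses this task's reference.
 */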
/*
 * This function makes sure the current process has its own signal table,
 * so that flush_signal_handlers can later reset the handlers without
 * disturbing other processes.  (Other processes might share the signal
 * table via the CLONE_SIGNAL option to clone().)
 */

static inline int make_private_signals(void)
{
	struct signal_struct * newsig;

	if (atomic_read(&current->sig->count) <= 1)
		return 0;
	newsig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL);
	if (newsig == NULL)
		return -ENOMEM;
	spin_lock_init(&newsig->siglock);
	atomic_set(&newsig->count, 1);
	memcpy(newsig->action, current->sig->action, sizeof(newsig->action));
	spin_lock_irq(&current->sigmask_lock);
	current->sig = newsig;
	spin_unlock_irq(&current->sigmask_lock);
	return 0;
}
/*
 * If make_private_signals() made a copy of the signal table, decrement the
 * refcount of the original table, and free it if necessary.
 * We don't do that in make_private_signals() so that we can back off
 * in flush_old_exec() if an error occurs after calling make_private_signals().
 */

static inline void release_old_signals(struct signal_struct * oldsig)
{
	if (current->sig == oldsig)
		return;
	if (atomic_dec_and_test(&oldsig->count))
		kmem_cache_free(sigact_cachep, oldsig);
}
/*
 * These functions flush out all traces of the currently running executable
 * so that a new one can be started
 */

static inline void flush_old_files(struct files_struct * files)
{
	long j = -1;

	write_lock(&files->file_lock);
	for (;;) {
		unsigned long set, i;

		j++;
		i = j * __NFDBITS;
		if (i >= files->max_fds || i >= files->max_fdset)
			break;
		set = files->close_on_exec->fds_bits[j];
		if (!set)
			continue;
		files->close_on_exec->fds_bits[j] = 0;
		write_unlock(&files->file_lock);
		for ( ; set ; i++, set >>= 1) {
			if (set & 1)
				sys_close(i);
		}
		write_lock(&files->file_lock);
	}
	write_unlock(&files->file_lock);
}
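/*
 * Note the locking dance above: each word of close-on-exec bits is
 * claimed and cleared under files->file_lock, but the lock is dropped
 * around the sys_close() calls, since closing a file may sleep.
 */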
/*
 * An execve() will automatically "de-thread" the process.
 * Note: we don't have to hold the tasklist_lock to test
 * whether we might need to do this. If we're not part of
 * a thread group, there is no way we can become one
 * dynamically. And if we are, we only need to protect the
 * unlink - even if we race with the last other thread exit,
 * at worst the list_del_init() might end up being a no-op.
 */
static inline void de_thread(struct task_struct *tsk)
{
	if (!list_empty(&tsk->thread_group)) {
		write_lock_irq(&tasklist_lock);
		list_del_init(&tsk->thread_group);
		write_unlock_irq(&tasklist_lock);
	}

	/* Minor oddity: this might stay the same. */
	tsk->tgid = tsk->pid;
}
int flush_old_exec(struct linux_binprm * bprm)
{
	char * name;
	int i, ch, retval;
	struct signal_struct * oldsig;

	/*
	 * Make sure we have a private signal table
	 */
	oldsig = current->sig;
	retval = make_private_signals();
	if (retval) goto flush_failed;

	/*
	 * Release all of the old mmap stuff
	 */
	retval = exec_mmap();
	if (retval) goto mmap_failed;

	/* This is the point of no return */
	release_old_signals(oldsig);

	current->sas_ss_sp = current->sas_ss_size = 0;

	if (current->euid == current->uid && current->egid == current->gid)
		current->dumpable = 1;
	name = bprm->filename;
	for (i = 0; (ch = *(name++)) != '\0';) {
		if (ch == '/')
			i = 0;
		else
			if (i < 15)
				current->comm[i++] = ch;
	}
	current->comm[i] = '\0';

	flush_thread();

	de_thread(current);

	if (bprm->e_uid != current->euid || bprm->e_gid != current->egid ||
	    permission(bprm->file->f_dentry->d_inode, MAY_READ))
		current->dumpable = 0;

	/* An exec changes our domain. We are no longer part of the thread
	   group */

	current->self_exec_id++;

	flush_signal_handlers(current);
	flush_old_files(current->files);

	return 0;

mmap_failed:
flush_failed:
	spin_lock_irq(&current->sigmask_lock);
	if (current->sig != oldsig)
		kmem_cache_free(sigact_cachep, current->sig);
	current->sig = oldsig;
	spin_unlock_irq(&current->sigmask_lock);
	return retval;
}
/*
 * We mustn't allow tracing of suid binaries, unless
 * the tracer has the capability to trace anything..
 */
static inline int must_not_trace_exec(struct task_struct * p)
{
	return (p->ptrace & PT_PTRACED) && !cap_raised(p->p_pptr->cap_effective, CAP_SYS_PTRACE);
}
/*
 * Fill the binprm structure from the inode.
 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
 */
int prepare_binprm(struct linux_binprm *bprm)
{
	int mode;
	struct inode * inode = bprm->file->f_dentry->d_inode;

	mode = inode->i_mode;
	/* Huh? We had already checked for MAY_EXEC, WTF do we check this? */
	if (!(mode & 0111))	/* with at least _one_ execute bit set */
		return -EACCES;
	if (bprm->file->f_op == NULL)
		return -EACCES;

	bprm->e_uid = current->euid;
	bprm->e_gid = current->egid;

	if (!IS_NOSUID(inode)) {
		/* Set-uid? */
		if (mode & S_ISUID)
			bprm->e_uid = inode->i_uid;

		/* Set-gid? */
		/*
		 * If setgid is set but no group execute bit then this
		 * is a candidate for mandatory locking, not a setgid
		 * executable.
		 */
		if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
			bprm->e_gid = inode->i_gid;
	}

	/* We don't have VFS support for capabilities yet */
	cap_clear(bprm->cap_inheritable);
	cap_clear(bprm->cap_permitted);
	cap_clear(bprm->cap_effective);

	/*  To support inheritance of root-permissions and suid-root
	 *  executables under compatibility mode, we raise all three
	 *  capability sets for the file.
	 *
	 *  If only the real uid is 0, we only raise the inheritable
	 *  and permitted sets of the executable file.
	 */

	if (!issecure(SECURE_NOROOT)) {
		if (bprm->e_uid == 0 || current->uid == 0) {
			cap_set_full(bprm->cap_inheritable);
			cap_set_full(bprm->cap_permitted);
		}
		if (bprm->e_uid == 0)
			cap_set_full(bprm->cap_effective);
	}

	memset(bprm->buf, 0, BINPRM_BUF_SIZE);
	return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE);
}
/*
 * This function is used to produce the new IDs and capabilities
 * from the old ones and the file's capabilities.
 *
 * The formula used for evolving capabilities is:
 *
 *       pI' = pI
 * (***) pP' = (fP & X) | (fI & pI)
 *       pE' = pP' & fE          [NB. fE is 0 or ~0]
 *
 * I=Inheritable, P=Permitted, E=Effective // p=process, f=file
 * ' indicates post-exec(), and X is the global 'cap_bset'.
 */

void compute_creds(struct linux_binprm *bprm)
{
	kernel_cap_t new_permitted, working;
	int do_unlock = 0;

	new_permitted = cap_intersect(bprm->cap_permitted, cap_bset);
	working = cap_intersect(bprm->cap_inheritable,
				current->cap_inheritable);
	new_permitted = cap_combine(new_permitted, working);

	if (bprm->e_uid != current->uid || bprm->e_gid != current->gid ||
	    !cap_issubset(new_permitted, current->cap_permitted)) {
		current->dumpable = 0;

		lock_kernel();
		if (must_not_trace_exec(current)
		    || atomic_read(&current->fs->count) > 1
		    || atomic_read(&current->files->count) > 1
		    || atomic_read(&current->sig->count) > 1) {
			if (!capable(CAP_SETUID)) {
				bprm->e_uid = current->uid;
				bprm->e_gid = current->gid;
			}
			if (!capable(CAP_SETPCAP)) {
				new_permitted = cap_intersect(new_permitted,
							current->cap_permitted);
			}
		}
		do_unlock = 1;
	}

	/* For init, we want to retain the capabilities set
	 * in the init_task struct. Thus we skip the usual
	 * capability rules */
	if (current->pid != 1) {
		current->cap_permitted = new_permitted;
		current->cap_effective =
			cap_intersect(new_permitted, bprm->cap_effective);
	}

	/* AUD: Audit candidate if current->cap_effective is set */

	current->suid = current->euid = current->fsuid = bprm->e_uid;
	current->sgid = current->egid = current->fsgid = bprm->e_gid;

	if (do_unlock)
		unlock_kernel();
	current->keep_capabilities = 0;
}
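/*
 * Worked example of the (***) formula (hypothetical values): with
 * X = cap_bset = ~0, a file carrying fP = {CAP_NET_RAW}, fI = 0, fE = ~0,
 * and a process with pI = 0:
 *
 *	pP' = (fP & X) | (fI & pI) = {CAP_NET_RAW}
 *	pE' = pP' & fE             = {CAP_NET_RAW}
 *
 * which is exactly what the cap_intersect()/cap_combine() calls above
 * compute in the non-setuid, non-traced case.
 */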
void remove_arg_zero(struct linux_binprm *bprm)
{
	if (bprm->argc) {
		unsigned long offset;
		char * kaddr;
		struct page *page;

		offset = bprm->p % PAGE_SIZE;
		goto inside;

		while (bprm->p++, *(kaddr+offset++)) {
			if (offset != PAGE_SIZE)
				continue;
			offset = 0;
			kunmap(page);
inside:
			page = bprm->page[bprm->p/PAGE_SIZE];
			kaddr = kmap(page);
		}
		kunmap(page);
		bprm->argc--;
	}
}
/*
 * cycle the list of binary formats handler, until one recognizes the image
 */
int search_binary_handler(struct linux_binprm *bprm, struct pt_regs *regs)
{
	int try, retval = 0;
	struct linux_binfmt *fmt;
#ifdef __alpha__
	/* handle /sbin/loader.. */
	{
	    struct exec * eh = (struct exec *) bprm->buf;

	    if (!bprm->loader && eh->fh.f_magic == 0x183 &&
		(eh->fh.f_flags & 0x3000) == 0x3000)
	    {
		char * dynloader[] = { "/sbin/loader" };
		struct file * file;
		unsigned long loader;

		allow_write_access(bprm->file);
		fput(bprm->file);
		bprm->file = NULL;

		loader = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);

		file = open_exec(dynloader[0]);
		retval = PTR_ERR(file);
		if (IS_ERR(file))
			return retval;
		bprm->file = file;
		bprm->loader = loader;
		retval = prepare_binprm(bprm);
		if (retval < 0)
			return retval;
		/* should call search_binary_handler recursively here,
		   but it does not matter */
	    }
	}
#endif
	for (try = 0; try < 2; try++) {
		read_lock(&binfmt_lock);
		for (fmt = formats ; fmt ; fmt = fmt->next) {
			int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
			if (!fn)
				continue;
			if (!try_inc_mod_count(fmt->module))
				continue;
			read_unlock(&binfmt_lock);
			retval = fn(bprm, regs);
			if (retval >= 0) {
				put_binfmt(fmt);
				allow_write_access(bprm->file);
				if (bprm->file)
					fput(bprm->file);
				bprm->file = NULL;
				current->did_exec = 1;
				return retval;
			}
			read_lock(&binfmt_lock);
			put_binfmt(fmt);
			if (retval != -ENOEXEC)
				break;
			if (!bprm->file) {
				read_unlock(&binfmt_lock);
				return retval;
			}
		}
		read_unlock(&binfmt_lock);
		if (retval != -ENOEXEC) {
			break;
#ifdef CONFIG_KMOD
		} else {
#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
			char modname[20];
			if (printable(bprm->buf[0]) &&
			    printable(bprm->buf[1]) &&
			    printable(bprm->buf[2]) &&
			    printable(bprm->buf[3]))
				break; /* -ENOEXEC */
			sprintf(modname, "binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
			request_module(modname);
#endif
		}
	}
	return retval;
}
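/*
 * The outer try<2 loop exists for the CONFIG_KMOD case: if no registered
 * handler recognizes the image on the first pass and the header does not
 * look like printable text, request_module("binfmt-XXXX") gets one chance
 * to load a matching format module before the list is walked again.
 */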
/*
 * sys_execve() executes a new program.
 */
int do_execve(char * filename, char ** argv, char ** envp, struct pt_regs * regs)
{
	struct linux_binprm bprm;
	struct file *file;
	int retval;
	int i;

	file = open_exec(filename);

	retval = PTR_ERR(file);
	if (IS_ERR(file))
		return retval;

	bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
	memset(bprm.page, 0, MAX_ARG_PAGES*sizeof(bprm.page[0]));

	bprm.file = file;
	bprm.filename = filename;
	bprm.sh_bang = 0;
	bprm.loader = 0;
	bprm.exec = 0;
	if ((bprm.argc = count(argv, bprm.p / sizeof(void *))) < 0) {
		allow_write_access(file);
		fput(file);
		return bprm.argc;
	}

	if ((bprm.envc = count(envp, bprm.p / sizeof(void *))) < 0) {
		allow_write_access(file);
		fput(file);
		return bprm.envc;
	}

	retval = prepare_binprm(&bprm);
	if (retval < 0)
		goto out;

	retval = copy_strings_kernel(1, &bprm.filename, &bprm);
	if (retval < 0)
		goto out;

	bprm.exec = bprm.p;
	retval = copy_strings(bprm.envc, envp, &bprm);
	if (retval < 0)
		goto out;

	retval = copy_strings(bprm.argc, argv, &bprm);
	if (retval < 0)
		goto out;

	retval = search_binary_handler(&bprm, regs);
	if (retval >= 0)
		/* execve success */
		return retval;
out:
	/* Something went wrong, return the inode and free the argument pages */
	allow_write_access(bprm.file);
	if (bprm.file)
		fput(bprm.file);

	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
		struct page * page = bprm.page[i];
		if (page)
			__free_page(page);
	}

	return retval;
}
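/*
 * Ordering note: argv/envp are counted and copied into bprm.page[] before
 * any handler runs, so a load_binary() that fails early leaves the old
 * image intact; only once a handler has passed flush_old_exec() does the
 * exec become irreversible.
 */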
void set_binfmt(struct linux_binfmt *new)
{
	struct linux_binfmt *old = current->binfmt;

	if (new && new->module)
		__MOD_INC_USE_COUNT(new->module);
	current->binfmt = new;
	if (old && old->module)
		__MOD_DEC_USE_COUNT(old->module);
}
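/*
 * Core dumping is delegated to the binary format: do_coredump() below
 * only checks dumpability, RLIMIT_CORE and the safety of the target file
 * (regular, single link, opened with O_NOFOLLOW) before handing off to
 * binfmt->core_dump().
 */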
int do_coredump(long signr, struct pt_regs * regs)
{
	struct linux_binfmt * binfmt;
	char corename[6+sizeof(current->comm)];
	struct file * file;
	struct inode * inode;

	lock_kernel();
	binfmt = current->binfmt;
	if (!binfmt || !binfmt->core_dump)
		goto fail;
	if (!current->dumpable || atomic_read(&current->mm->mm_users) != 1)
		goto fail;
	current->dumpable = 0;
	if (current->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump)
		goto fail;

	memcpy(corename, "core.", 5);
#if 0
	memcpy(corename+5, current->comm, sizeof(current->comm));
#else
	corename[4] = '\0';
#endif
	file = filp_open(corename, O_CREAT | 2 | O_TRUNC | O_NOFOLLOW, 0600);
	if (IS_ERR(file))
		goto fail;
	inode = file->f_dentry->d_inode;
	if (inode->i_nlink > 1)
		goto close_fail;	/* multiple links - don't dump */

	if (!S_ISREG(inode->i_mode))
		goto close_fail;
	if (!file->f_op)
		goto close_fail;
	if (!file->f_op->write)
		goto close_fail;
	if (!binfmt->core_dump(signr, regs, file))
		goto close_fail;
	unlock_kernel();
	filp_close(file, NULL);
	return 1;

close_fail:
	filp_close(file, NULL);
fail:
	unlock_kernel();
	return 0;
}