/*
 * Annotated Linux 1.0 kernel source: linux/fs/exec.c
 *
 * Date: 2009-05-24   Source: taozhijiangscu
 */

/********************************************
*Created By: Prometheus
*Date        : 2009-5-24
********************************************/
/*
* linux/fs/exec.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/ /*
* #!-checking implemented by tytso.
*/ /*
* Demand-loading implemented 01.12.91 - no need to read anything but
* the header into memory. The inode of the executable is put into
* "current->executable", and page faults do the actual loading. Clean.
*
* Once more I can proudly say that linux stood up to being changed: it
* was less than 2 hours work to get demand-loading completely implemented.
*
* Demand loading changed July 1993 by Eric Youngdale.   Use mmap instead,
* current->executable is only used by the procfs. This allows a dispatch
* table to check for several different types of binary formats. We keep
* trying until we recognize the file or we run out of supported binary
* formats.
*/ #include <linux/fs.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/a.out.h>
#include <linux/errno.h>
#include <linux/signal.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/ptrace.h>
#include <linux/user.h>
#include <linux/segment.h>
/*
 * One directive per line: text that follows the header name on an
 * #include line is not treated as another directive or declaration.
 */
#include <linux/malloc.h>
#include <asm/system.h>
#include <linux/binfmts.h>
#include <asm/segment.h>
#include <asm/system.h>

/* System-call prototypes. */
asmlinkage int sys_exit(int exit_code);
asmlinkage int sys_close(unsigned fd);
asmlinkage int sys_open(const char *, int, int);
asmlinkage int sys_brk(unsigned long);
extern void shm_exit (void);

/*
 * open_inode() - attach an inode to the lowest free file descriptor of
 * the current task.
 *
 * inode: inode to open; must provide i_op->default_file_ops.
 * mode:  open flags; stored in f_flags, and ((mode+1) & O_ACCMODE)
 *        becomes f_mode.
 *
 * Returns the new descriptor number, or a negative errno:
 *   -EINVAL  the inode has no default file operations
 *   -EMFILE  get_empty_filp() returned no file structure
 *   -ENFILE  every descriptor slot of the task is in use
 *   or the non-zero value returned by the file's open() method.
 *
 * On success the inode's i_count is incremented.
 *
 * NOTE(review): -EMFILE and -ENFILE look swapped relative to their usual
 * meanings (per-process limit vs. system-wide table); left unchanged
 * because callers see these values - confirm before changing.
 */
int open_inode(struct inode * inode, int mode)
{
	int error, fd;
	struct file *f, **fpp;

	if (!inode->i_op || !inode->i_op->default_file_ops)
		return -EINVAL;
	f = get_empty_filp();
	if (!f)
		return -EMFILE;

	/* Look for the lowest unused slot in the task's descriptor table. */
	fd = 0;
	fpp = current->filp;
	for (;;) {
		if (!*fpp)
			break;
		/*
		 * Descriptors run from 0 to NR_OPEN-1 (assumes current->filp
		 * has NR_OPEN entries), so give up as soon as fd reaches
		 * NR_OPEN; testing with '>' would let the slot one past the
		 * table be examined and possibly written.  The file
		 * structure obtained above is handed back, exactly as on the
		 * open() failure path below.
		 */
		if (++fd >= NR_OPEN) {
			f->f_count--;
			return -ENFILE;
		}
		fpp++;
	}
	*fpp = f;
	f->f_flags = mode;
	f->f_mode = (mode+1) & O_ACCMODE;
	f->f_inode = inode;
	f->f_pos = 0;
	f->f_reada = 0;
	f->f_op = inode->i_op->default_file_ops;
	if (f->f_op->open) {
		error = f->f_op->open(inode,f);
		if (error) {
			/* Undo the slot assignment and drop the file structure. */
			*fpp = NULL;
			f->f_count--;
			return error;
		}
	}
	inode->i_count++;
	return fd;
}

/*
 * These are the only things you should do on a core-file: use only these
 * macros to write out all the necessary info.
 */
/*
 * DUMP_WRITE(addr, nr): write nr bytes starting at addr to the core file;
 * jumps to close_coredump unless exactly nr bytes were written.
 * DUMP_SEEK(offset): position the core file at offset, through the lseek
 * method when there is one (jumping to close_coredump if it does not
 * return offset), otherwise by setting f_pos directly.
 *
 * Both expect `inode`, `file` and the label `close_coredump` to exist in
 * the calling function, and both may evaluate their arguments twice.
 * Each #define sits on its own line and is wrapped in do { } while (0)
 * so that it expands to exactly one statement.
 */
#define DUMP_WRITE(addr,nr) \
do { \
	if (file.f_op->write(inode,&file,(char *)(addr),(nr)) != (nr)) \
		goto close_coredump; \
} while (0)

#define DUMP_SEEK(offset) \
do { \
	if (file.f_op->lseek) { \
		if (file.f_op->lseek(inode,&file,(offset),0) != (offset)) \
			goto close_coredump; \
	} else \
		file.f_pos = (offset); \
} while (0)

/*
 * Routine writes a core dump image in the current directory.
 * Currently only a stub-function.
 *
 * Note that setuid/setgid files won't make a core-dump if the uid/gid
 * changed due to the set[u|g]id. It's enforced by the "current->dumpable"
 * field, which also makes sure the core-dumps won't be recursive if the
 * dumping of the process results in another error..
 */
/*
 * core_dump() - write a core image of the current task to a file named
 * "core".
 *
 * signr: signal number, stored in the dump header (dump.signal).
 * regs:  register snapshot, copied into the dump header (dump.regs).
 *
 * Returns has_dumped: 1 once the core file has been opened and found to
 * have a write method, 0 otherwise (including when the task is not
 * dumpable or RLIMIT_CORE is smaller than one page).
 */
int core_dump(long signr, struct pt_regs * regs)
{
	struct inode * inode = NULL;
	struct file file;
	unsigned short fs;
	int has_dumped = 0;
	char corefile[6+sizeof(current->comm)];
	int i;
	register int dump_start, dump_size;
	struct user dump;

	if (!current->dumpable)
		return 0;
	/* Cleared first so that an error while dumping cannot start another dump. */
	current->dumpable = 0;

	/* See if we have enough room to write the upage. */
	if (current->rlim[RLIMIT_CORE].rlim_cur < PAGE_SIZE)
		return 0;
	/* fs is saved here and restored at end_coredump. */
	fs = get_fs();
	set_fs(KERNEL_DS);
	memcpy(corefile,"core.",5);
#if 0
	memcpy(corefile+5,current->comm,sizeof(current->comm));
#else
	/* Overwrites the '.', leaving the name "core". */
	corefile[4] = '\0';
#endif
	/* Annotator's note: the literal 2 in the flags is O_RDWR (read/write). */
	if (open_namei(corefile,O_CREAT | 2 | O_TRUNC,0600,&inode,NULL)) {
		inode = NULL;
		goto end_coredump;
	}
	if (!S_ISREG(inode->i_mode))
		goto end_coredump;
	if (!inode->i_op || !inode->i_op->default_file_ops)
		goto end_coredump;
	/* Build a file structure on the stack; no descriptor slot is used. */
	file.f_mode = 3;
	file.f_flags = 0;
	file.f_count = 1;
	file.f_inode = inode;
	file.f_pos = 0;
	file.f_reada = 0;
	file.f_op = inode->i_op->default_file_ops;
	if (file.f_op->open)
		if (file.f_op->open(inode,&file))
			goto end_coredump;
	if (!file.f_op->write)
		goto close_coredump;
	/* Note: set before anything is written, so 1 does not imply a complete dump. */
	has_dumped = 1;
	/* changed the size calculations - should hopefully work better. lbt */
	dump.magic = CMAGIC;	/* Code indicating core file. */
	dump.start_code = 0;
	dump.start_stack = regs->esp & ~(PAGE_SIZE - 1);
	dump.u_tsize = ((unsigned long) current->end_code) >> 12;
	dump.u_dsize = ((unsigned long) (current->brk + (PAGE_SIZE-1))) >> 12;
	dump.u_dsize -= dump.u_tsize;	/* the text pages counted above are not data */
	dump.u_ssize = 0;	/* Stack segment size (pages) */
	for(i=0; i<8; i++) dump.u_debugreg[i] = current->debugreg[i];
	if (dump.start_stack < TASK_SIZE)
		dump.u_ssize = ((unsigned long) (TASK_SIZE - dump.start_stack)) >> 12;

	/*
	 * For data and stack the process limit is checked first: an area is
	 * recorded if the limit allows it, otherwise its size is set to 0.
	 * The order of the checks means the stack is kept in preference to
	 * the data area.
	 */
	/* If the size of the dump file exceeds the rlimit, then see what would happen
	   if we wrote the stack, but not the data area. */
	if ((dump.u_dsize+dump.u_ssize+1) * PAGE_SIZE >
	    current->rlim[RLIMIT_CORE].rlim_cur)
		dump.u_dsize = 0;
	/* Make sure we have enough room to write the stack and data areas. */
	if ((dump.u_ssize+1) * PAGE_SIZE >
	    current->rlim[RLIMIT_CORE].rlim_cur)
		dump.u_ssize = 0;
	strncpy(dump.u_comm, current->comm, sizeof(current->comm));	/* user command */
	/* u_ar0 holds the byte offset of regs inside the header, not a usable pointer. */
	dump.u_ar0 = (struct pt_regs *)(((int)(&dump.regs)) -((int)(&dump)));
	dump.signal = signr;
	dump.regs = *regs;
	/* Flag indicating the math stuff is valid. We don't support this for the
	   soft-float routines yet */
	if (hard_math) {
		if ((dump.u_fpvalid = current->used_math) != 0) {	/* floating point was used */
			/* This task's state is live in the FPU: save straight from it. */
			if (last_task_used_math == current)
				__asm__("clts ; fnsave %0": :"m" (dump.i387));
			else
				memcpy(&dump.i387,&current->tss.i387.hard,sizeof(dump.i387));
		}
	} else {
		/* we should dump the emulator state here, but we need to
		   convert it into standard 387 format first.. */
		dump.u_fpvalid = 0;
	}
	/* The header is a local (kernel) object. */
	set_fs(KERNEL_DS);
	/* struct user */
	DUMP_WRITE(&dump,sizeof(dump));
	/* Now dump all of the user data. Include malloced stuff as well */
	DUMP_SEEK(PAGE_SIZE);
	/* now we start writing out the user space info */
	set_fs(USER_DS);
	/* Dump the data area */
	if (dump.u_dsize != 0) {
		dump_start = dump.u_tsize << 12;
		dump_size = dump.u_dsize << 12;
		DUMP_WRITE(dump_start,dump_size);
	};
	/* Now prepare to dump the stack area */
	if (dump.u_ssize != 0) {
		dump_start = dump.start_stack;
		dump_size = dump.u_ssize << 12;
		DUMP_WRITE(dump_start,dump_size);
	};
	/* Finally dump the task struct. Not be used by gdb, but could be useful */
	set_fs(KERNEL_DS);
	DUMP_WRITE(current,sizeof(*current));
close_coredump:
	if (file.f_op->release)
		file.f_op->release(inode,&file);
end_coredump:
	set_fs(fs);
	iput(inode);
	return has_dumped;
}

/*
 * Note that a shared library must be both readable and executable due to
 * security reasons.
 *
 * Also note that we take the address to load from from the file itself.
 */
/* Load a shared library; the argument is the library name. Returns 0 on success. */
/*
 * sys_uselib() - open `library` and offer it to each registered binary
 * format's load_shlib handler in table order.
 *
 * Handlers are tried while they answer -ENOEXEC; the walk ends at the
 * first other result or at the table's NULL terminator.  Returns the
 * negative value from sys_open() if the file cannot be opened, otherwise
 * the last handler result (-ENOEXEC if no handler was tried).  The
 * descriptor is closed again before returning.
 */
asmlinkage int sys_uselib(const char * library)
{
	struct linux_binfmt *candidate;
	struct file *lib_file;
	int result;
	int handle;

	handle = sys_open(library, 0, 0);
	if (handle < 0)
		return handle;

	result = -ENOEXEC;
	lib_file = current->filp[handle];
	if (lib_file && lib_file->f_inode && lib_file->f_op && lib_file->f_op->read) {
		/* formats[] ends with an entry whose handlers are NULL. */
		for (candidate = formats; candidate->load_shlib; candidate++) {
			result = candidate->load_shlib(handle);
			if (result != -ENOEXEC)
				break;
		}
	}

	sys_close(handle);
	return result;
}

/*
 * create_tables() parses the env- and arg-strings in new user
 * memory and creates the pointer tables from them, and puts their
 * addresses on the "stack", returning the new stack pointer value.
 * (Annotator's remark: "more or less".)
 */
/*
 * create_tables() - build the argc/argv/envp tables below the strings that
 * start at user address p.
 *
 * p:    user address of the first argument string; the strings are
 *       walked with get_fs_byte() and are expected to be NUL-terminated
 *       and contiguous (arguments first, then environment).
 * argc: number of argument strings.
 * envc: number of environment strings.
 * ibcs: when zero, the addresses of the argv and envp tables are pushed
 *       as well; when non-zero only argc is pushed above the tables.
 *
 * Also records a stack vm_area for [page of p, TASK_SIZE) when kmalloc()
 * succeeds, and fills in current->arg_start/arg_end/env_start/env_end.
 * Returns the resulting stack pointer (it points at the stored argc).
 */
unsigned long * create_tables(char * p,int argc,int envc,int ibcs)
{
	unsigned long *argv,*envp;
	unsigned long * sp;
	struct vm_area_struct *mpnt;

	mpnt = (struct vm_area_struct *)kmalloc(sizeof(*mpnt), GFP_KERNEL);
	/* An allocation failure is silently tolerated: no stack area is recorded. */
	if (mpnt) {
		mpnt->vm_task = current;
		mpnt->vm_start = PAGE_MASK & (unsigned long) p;
		mpnt->vm_end = TASK_SIZE;
		mpnt->vm_page_prot = PAGE_PRIVATE|PAGE_DIRTY;
		mpnt->vm_share = NULL;
		mpnt->vm_inode = NULL;
		mpnt->vm_offset = 0;
		mpnt->vm_ops = NULL;
		insert_vm_struct(current, mpnt);
		current->stk_vma = mpnt;
	}
	/* Round p down to a 4-byte boundary and carve the tables out below it. */
	sp = (unsigned long *) (0xfffffffc & (unsigned long) p);
	sp -= envc+1;	/* one extra slot for the terminating NULL pointer */
	envp = sp;
	sp -= argc+1;
	argv = sp;
	if (!ibcs) {
		put_fs_long((unsigned long)envp,--sp);
		put_fs_long((unsigned long)argv,--sp);
	}
	/*
	 * argv and envp point at different positions of the area below sp.
	 * The tables only hold string pointers: p walks over the strings and
	 * the address of the start of each one is stored in its table slot.
	 */
	put_fs_long((unsigned long)argc,--sp);
	current->arg_start = (unsigned long) p;
	while (argc-->0) {
		put_fs_long((unsigned long) p,argv++);
		while (get_fs_byte(p++)) /* nothing */ ;
	}
	put_fs_long(0,argv);
	current->arg_end = current->env_start = (unsigned long) p;
	while (envc-->0) {
		put_fs_long((unsigned long) p,envp++);
		while (get_fs_byte(p++)) /* nothing */ ;
	}
	put_fs_long(0,envp);
	current->env_end = (unsigned long) p;
	return sp;	/* the new top-of-stack address */
}

/*
 * count() counts the number of arguments/envelopes
 */
/*
 * count() - number of entries in a NULL-terminated pointer array that is
 * read through the fs segment with get_fs_long().
 *
 * Each entry is read as one long (the annotator notes these are regular
 * 4-byte slots).  A NULL array yields 0.
 */
static int count(char ** argv)
{
	char **cursor = argv;
	int n = 0;

	if (!cursor)
		return 0;
	while (get_fs_long((unsigned long *) cursor)) {
		cursor++;
		n++;
	}
	return n;
}

/*
 * 'copy_string()' copies argument/envelope strings from user
 * memory to free pages in kernel mem. These are in a format ready
 * to be put directly into the top of new user memory.
 *
 * Modified by TYT, 11/24/91 to add the from_kmem argument, which specifies
 * whether the string and the string array are from user or kernel segments:
 *
 * from_kmem     argv *          argv **
 *    0          user space      user space
 *    1          kernel space    user space
 *    2          kernel space    kernel space
 *
 * We do this by playing games with the fs segment register. Since it
 * is expensive to load a segment register, we try to avoid calling
 * set_fs() unless we absolutely have to.
 */
/* Annotator's summary: copies data from user mode into kernel mode. */
/*
 * copy_strings() - copy argc strings into the page[] array, filling it
 * downwards from byte offset p.
 *
 * argc:      number of strings to copy.
 * argv:      array of string pointers; entries are taken from the last
 *            to the first.
 * page:      array of page addresses indexed by p/PAGE_SIZE; missing
 *            pages are allocated with get_free_page(GFP_USER).
 * p:         current offset into the argument area; 0 means an earlier
 *            failure and is passed straight back.
 * from_kmem: 0, 1 or 2 - selects which of the pointer array and the
 *            string bytes are read with fs pointing at the kernel segment.
 *
 * Returns the new offset (the start of the last string copied), or 0 if
 * the strings do not fit or a page cannot be allocated.
 */
unsigned long copy_strings(int argc,char ** argv,unsigned long *page,
		unsigned long p, int from_kmem)
{
	char *tmp, *pag = NULL;
	int len, offset = 0;
	unsigned long old_fs, new_fs;

	if (!p)
		return 0;	/* bullet-proofing */
	new_fs = get_ds();
	old_fs = get_fs();
	if (from_kmem==2)	/* kernel space to kernel space */
		set_fs(new_fs);	/* fs -> kernel segment */
	while (argc-- > 0) {
		if (from_kmem == 1)	/* the pointer array is in kernel space: point fs there */
			set_fs(new_fs);
		/*
		 * Fetch the address of the string; depending on from_kmem the
		 * pointer itself is read from kernel or from user space.
		 */
		if (!(tmp = (char *)get_fs_long(((unsigned long *)argv)+argc)))
			panic("VFS: argc is wrong");
		/*
		 * Restore fs right away when it was switched just above.  With
		 * from_kmem==2 fs stays on the kernel segment for the whole
		 * copy, so no switch/restore happens here; with from_kmem==0
		 * fs is left as it was on entry and nothing needs changing.
		 */
		if (from_kmem == 1)
			set_fs(old_fs);
		len=0;		/* remember zero-padding */
		do {
			len++;
		} while (get_fs_byte(tmp++));
		/* do_execve() starts p at PAGE_SIZE*MAX_ARG_PAGES-4. */
		if (p < len) {	/* this shouldn't happen - 128kB */
			set_fs(old_fs);
			return 0;
		}
		/*
		 * The string bytes are read with the kernel fs only when
		 * from_kmem==2; otherwise fs has its entry value here.
		 */
		while (len) {
			--p; --tmp; --len;
			/* offset is the position inside the current page; below 0 means move to the page that holds p */
			if (--offset < 0) {
				offset = p % PAGE_SIZE;
				if (from_kmem==2)
					set_fs(old_fs);
				if (!(pag = (char *) page[p/PAGE_SIZE]) &&
				    !(pag = (char *) page[p/PAGE_SIZE] =
				      (unsigned long *) get_free_page(GFP_USER)))
					return 0;	/* page allocation failed */
				if (from_kmem==2)
					set_fs(new_fs);
			}
			*(pag + offset) = get_fs_byte(tmp);	/* copy one byte */
		}
	}
	if (from_kmem==2)
		set_fs(old_fs);
	return p;
}

/*
 * change_ldt() - install the collected argument pages at the top of the
 * task's address range.
 *
 * Despite the name, the body shown here touches no descriptor table: it
 * sets current->start_code to 0 and hands every non-empty page[] entry to
 * put_dirty_page() at descending addresses, starting one page below
 * TASK_SIZE, bumping current->rss for each.  text_size and code_limit
 * are not used.  Returns TASK_SIZE.
 */
unsigned long change_ldt(unsigned long text_size,unsigned long * page)
{
	unsigned long code_limit,data_limit,code_base,data_base;
	int i;

	code_limit = TASK_SIZE;
	data_limit = TASK_SIZE;
	code_base = data_base = 0;
	current->start_code = code_base;	/* 0 */
	data_base += data_limit;
	for (i=MAX_ARG_PAGES-1 ; i>=0 ; i--) {
		/* Page start address; pages are laid out downwards from TASK_SIZE (the annotator reads this as the end of the 3G user range). */
		data_base -= PAGE_SIZE;
		if (page[i]) {
			current->rss++;
			put_dirty_page(current,page[i],data_base);
		}
	}
	return data_limit;
}

/*
 * Read in the complete executable. This is used for "-N" files
 * that aren't on a block boundary, and for files on filesystems
 * without bmap support.
 */
int read_exec(struct inode *inode, unsigned long offset,
char * addr, unsigned long count)
{
struct file file;
int result = -ENOEXEC; if (!inode->i_op || !inode->i_op->default_file_ops)
  goto end_readexec;
file.f_mode = 1;
file.f_flags = 0;
file.f_count = 1;
file.f_inode = inode;
file.f_pos = 0;
file.f_reada = 0;
file.f_op = inode->i_op->default_file_ops;
if (file.f_op->open)
  if (file.f_op->open(inode,&file))
   goto end_readexec;
if (!file.f_op || !file.f_op->read)
  goto close_readexec;
if (file.f_op->lseek) {
  if (file.f_op->lseek(inode,&file,offset,0) != offset) //从头偏移到指定的offset位置
    goto close_readexec;
} else
  file.f_pos = offset; //呃，如果能直接这样设置偏移?那为什么不用呢？
if (get_fs() == USER_DS) {
  result = verify_area(VERIFY_WRITE, addr, count);
  if (result)
   goto close_readexec;
}
result = file.f_op->read(inode, &file, addr, count); //读取到addr指向的地址
close_readexec:
if (file.f_op->release)
  file.f_op->release(inode,&file);
end_readexec:
return result;
}
/*
 * This function flushes out all traces of the currently running executable so
 * that a new one can be started
 */
/*
 * Annotator's summary: releases most of the resources held by the old
 * process image.
 *
 * bprm: description of the new program; its filename supplies the new
 *       command name, and e_uid/e_gid/inode decide whether the task stays
 *       dumpable.
 */
void flush_old_exec(struct linux_binprm * bprm)
{
	int i;
	int ch;
	char * name;
	struct vm_area_struct * mpnt, *mpnt1;

	current->dumpable = 1;
	/* comm becomes the last path component of the filename. */
	name = bprm->filename;
	for (i=0; (ch = *(name++)) != '\0';) {
		if (ch == '/')
			i = 0;
		else
			if (i < 15)	/* at most 15 characters of the name are kept */
				current->comm[i++] = ch;
	}
	current->comm[i] = '\0';
	if (current->shm)
		shm_exit();
	if (current->executable) {
		iput(current->executable);
		current->executable = NULL;	/* drop the current executable image */
	}
	/* Release all of the old mmap stuff. */
	mpnt = current->mmap;
	current->mmap = NULL;
	current->stk_vma = NULL;
	while (mpnt) {
		/* Fetch the successor before the node is freed. */
		mpnt1 = mpnt->vm_next;
		if (mpnt->vm_ops && mpnt->vm_ops->close)
			mpnt->vm_ops->close(mpnt);
		kfree(mpnt);
		mpnt = mpnt1;
	}

	/* Flush the old ldt stuff... */
	if (current->ldt) {
		free_page((unsigned long) current->ldt);
		current->ldt = NULL;
		for (i=1 ; i<NR_TASKS ; i++) {
			if (task[i] == current) {
				/* switch back to the default local descriptor table */
				set_ldt_desc(gdt+(i<<1)+
					     FIRST_LDT_ENTRY,&default_ldt, 1);
				load_ldt(i);
			}
		}
	}

	for (i=0 ; i<8 ; i++) current->debugreg[i] = 0;

	/* No core dumps when the ids change or the file is not readable. */
	if (bprm->e_uid != current->euid || bprm->e_gid != current->egid ||
	    !permission(bprm->inode,MAY_READ))
		current->dumpable = 0;
	current->signal = 0;
	/* Handlers are cleared unless the signal is being ignored (SIG_IGN is kept). */
	for (i=0 ; i<32 ; i++) {
		current->sigaction[i].sa_mask = 0;
		current->sigaction[i].sa_flags = 0;
		if (current->sigaction[i].sa_handler != SIG_IGN)
			current->sigaction[i].sa_handler = NULL;
	}
	for (i=0 ; i<NR_OPEN ; i++)
		if (FD_ISSET(i,&current->close_on_exec))
			sys_close(i);	/* so close on exec */
	FD_ZERO(&current->close_on_exec);	/* the flags start out cleared for the new image */
	clear_page_tables(current);
	if (last_task_used_math == current)
		last_task_used_math = NULL;
	current->used_math = 0;
	current->elf_executable = 0;
}

/*
 * sys_execve() executes a new program.
 */
/*
 * do_execve() - worker behind sys_execve().
 *
 * filename: name of the program; passed to open_namei() and later copied
 *           with the kernel-space setting of copy_strings(), i.e. it is
 *           used as a kernel address.
 * argv, envp: argument and environment pointer arrays, counted and
 *           copied through the fs segment (user space).
 * regs:     caller's registers; regs->cs must be USER_CS, and the
 *           structure is handed on to the binary-format loaders.
 *
 * Checks file type and permissions, handles one level of "#!"
 * interpreter, copies the strings into bprm.page[] and then offers the
 * file to each entry of formats[] until one does not answer -ENOEXEC.
 * Returns 0 when a loader succeeded, otherwise a negative errno after
 * releasing the inode and the collected argument pages.
 */
static int do_execve(char * filename, char ** argv, char ** envp, struct pt_regs * regs)
{
	struct linux_binprm bprm;
	struct linux_binfmt * fmt;
	unsigned long old_fs;
	int i;
	int retval;
	int sh_bang = 0;

	if (regs->cs != USER_CS)	/* must be entered from the user code segment */
		return -EINVAL;
	bprm.p = PAGE_SIZE*MAX_ARG_PAGES-4;
	for (i=0 ; i<MAX_ARG_PAGES ; i++)	/* clear page-table */
		bprm.page[i] = 0;
	retval = open_namei(filename, 0, 0, &bprm.inode, NULL);	/* open the executable file */
	if (retval)
		return retval;
	bprm.filename = filename;
	bprm.argc = count(argv);
	bprm.envc = count(envp);

restart_interp:
	if (!S_ISREG(bprm.inode->i_mode)) {	/* must be regular file */
		retval = -EACCES;
		goto exec_error2;
	}
	if (IS_NOEXEC(bprm.inode)) {		/* FS mustn't be mounted noexec */
		retval = -EPERM;
		goto exec_error2;
	}
	if (!bprm.inode->i_sb) {
		retval = -EACCES;
		goto exec_error2;
	}
	i = bprm.inode->i_mode;
	/*
	 * IS_NOSUID: presumably the filesystem is flagged to ignore the
	 * set-id bits.  Annotator's reading: a set-id file is meant to run
	 * with its owner's privileges, so when the bits are to be ignored
	 * and the caller is neither the owner/group member nor the
	 * superuser, the exec is simply refused.
	 */
	if (IS_NOSUID(bprm.inode) && (
	    ((i & S_ISUID) && bprm.inode->i_uid != current->euid)
	    || ((i & S_ISGID) && !in_group_p(bprm.inode->i_gid))
	    )
	    &&!suser())
	{
		retval = -EPERM;
		goto exec_error2;
	}
	/* make sure we don't let suid, sgid files be ptraced. */
	if (current->flags & PF_PTRACED) {
		bprm.e_uid = current->euid;
		bprm.e_gid = current->egid;
	} else {
		/* otherwise take the file owner's uid/gid where the set-id bit is set */
		bprm.e_uid = (i & S_ISUID) ? bprm.inode->i_uid : current->euid;
		bprm.e_gid = (i & S_ISGID) ? bprm.inode->i_gid : current->egid;
	}

	/* Shift the matching permission triplet (owner, group or other) into the low bits. */
	if (current->euid == bprm.inode->i_uid)
		i >>= 6;
	else if (in_group_p(bprm.inode->i_gid))
		i >>= 3;
	/* Need execute permission, or any execute bit at all when superuser. */
	if (!(i & 1) &&
	    !((bprm.inode->i_mode & 0111) && suser())
	) {
		retval = -EACCES;
		goto exec_error2;
	}
	memset(bprm.buf,0,sizeof(bprm.buf));
	old_fs = get_fs();
	set_fs(get_ds());
	retval = read_exec(bprm.inode,0,bprm.buf,128);	/* read the start of the executable */
	set_fs(old_fs);
	if (retval < 0)
		goto exec_error2;
	/*
	 * sh_bang acts as a flag: it is incremented once the interpreter has
	 * been set up, so this section is not entered a second time.
	 */
	if ((bprm.buf[0] == '#') && (bprm.buf[1] == '!') && (!sh_bang)) {	/* a script: an interpreter has to be loaded */
		/*
		 * This section does the #! interpretation.
		 * Sorta complicated, but hopefully it will work. -TYT
		 */

		char *cp, *interp, *i_name, *i_arg;

		/* From here on the inode is released, so errors go to exec_error1. */
		iput(bprm.inode);
		bprm.buf[127] = '\0';
		/* Cut at the first newline; without one, at the last of the 128 bytes. */
		if ((cp = strchr(bprm.buf, '\n')) == NULL)
			cp = bprm.buf+127;
		*cp = '\0';
		while (cp > bprm.buf) {
			cp--;
			if ((*cp == ' ') || (*cp == '\t'))
				*cp = '\0';	/* strip trailing white space */
			else
				break;
		}
		/* skip the "#!" and any white space that follows it */
		for (cp = bprm.buf+2; (*cp == ' ') || (*cp == '\t'); cp++);
		if (!cp || *cp == '\0') {
			retval = -ENOEXEC;	/* No interpreter name found */
			goto exec_error1;
		}
		interp = i_name = cp;
		i_arg = 0;
		for ( ; *cp && (*cp != ' ') && (*cp != '\t'); cp++) {
			if (*cp == '/')
				i_name = cp+1;	/* ends up at the last path component */
		}
		while ((*cp == ' ') || (*cp == '\t'))
			*cp++ = '\0';
		if (*cp)	/* an argument follows the interpreter name */
			i_arg = cp;
		/*
		 * OK, we've parsed out the interpreter name and
		 * (optional) argument.
		 */
		if (sh_bang++ == 0) {
			/*
			 * Copy the environment and the arguments (without the
			 * original argv[0]); p is passed in and comes back
			 * updated.
			 */
			bprm.p = copy_strings(bprm.envc, envp, bprm.page, bprm.p, 0);
			bprm.p = copy_strings(--bprm.argc, argv+1, bprm.page, bprm.p, 0);
		}
		/*
		 * Splice in (1) the interpreter's name for argv[0]
		 *           (2) (optional) argument to interpreter
		 *           (3) filename of shell script
		 *
		 * This is done in reverse order, because of how the
		 * user environment and arguments are stored.
		 */
		bprm.p = copy_strings(1, &bprm.filename, bprm.page, bprm.p, 2);	/* kernel -> kernel: the script's file name */
		bprm.argc++;
		if (i_arg) {
			bprm.p = copy_strings(1, &i_arg, bprm.page, bprm.p, 2);	/* the interpreter's argument */
			bprm.argc++;
		}
		bprm.p = copy_strings(1, &i_name, bprm.page, bprm.p, 2);	/* the interpreter's name */
		bprm.argc++;
		/* A single check suffices: copy_strings() passes a 0 offset straight through. */
		if (!bprm.p) {
			retval = -E2BIG;
			goto exec_error1;
		}
		/*
		 * OK, now restart the process with the interpreter's inode.
		 * Note that we use open_namei() as the name is now in kernel
		 * space, and we don't need to copy it.
		 */
		/* the interpreter itself can now be opened */
		retval = open_namei(interp, 0, 0, &bprm.inode, NULL);
		if (retval)
			goto exec_error1;
		/*
		 * Start over: the checks above were made on the script, which
		 * is not itself a loadable binary.  open_namei() has put the
		 * interpreter's inode into bprm.inode, so the same checks and
		 * setup are repeated for that file.
		 */
		goto restart_interp;
	}
	/* When no "#!" pass ran, the arguments and environment have not been copied yet; do it here. */
	if (!sh_bang) {
		bprm.p = copy_strings(bprm.envc,envp,bprm.page,bprm.p,0);	/* user -> kernel */
		bprm.p = copy_strings(bprm.argc,argv,bprm.page,bprm.p,0);
		if (!bprm.p) {
			retval = -E2BIG;
			goto exec_error2;
		}
	}

	bprm.sh_bang = sh_bang;
	/* Try each format's loader until one does not answer -ENOEXEC. */
	fmt = formats;
	do {
		int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
		if (!fn)
			break;
		retval = fn(&bprm, regs);
		if (retval == 0) {
			iput(bprm.inode);
			current->did_exec = 1;
			/*
			 * The executable has been loaded.  Annotator's note:
			 * once this returns, the updated eip is restored and
			 * the task runs the new image.
			 */
			return 0;
		}
		fmt++;
	} while (retval == -ENOEXEC);
exec_error2:
	iput(bprm.inode);
exec_error1:
	for (i=0 ; i<MAX_ARG_PAGES ; i++)
		free_page(bprm.page[i]);
	return(retval);
}

/*
 * sys_execve() executes a new program.
 */
/*
 * sys_execve() - system-call entry: fetch the program name with getname()
 * and hand over to do_execve().
 *
 * The name pointer comes from regs.ebx, argv from regs.ecx and envp from
 * regs.edx.  Returns the getname() error, or the do_execve() result; the
 * name buffer is released with putname() either way once do_execve() has
 * come back.
 */
asmlinkage int sys_execve(struct pt_regs regs)
{
	char *kernel_name;
	int status;

	status = getname((char *) regs.ebx, &kernel_name);
	if (!status) {
		status = do_execve(kernel_name, (char **) regs.ecx,
				   (char **) regs.edx, &regs);
		putname(kernel_name);
	}
	return status;
}

/*
 * These are the prototypes for the functions in the dispatch table, as
 * well as the dispatch table itself.
 */
extern int load_aout_binary(struct linux_binprm *,
			    struct pt_regs * regs);
extern int load_aout_library(int fd); #ifdef CONFIG_BINFMT_ELF
extern int load_elf_binary(struct linux_binprm *,
       struct pt_regs * regs);
extern int load_elf_library(int fd);
#endif #ifdef CONFIG_BINFMT_COFF
extern int load_coff_binary(struct linux_binprm *,
       struct pt_regs * regs);
extern int load_coff_library(int fd);
#endif /* Here are the actual binaries that will be accepted */
struct linux_binfmt formats[] = {
{load_aout_binary, load_aout_library},
#ifdef CONFIG_BINFMT_ELF
{load_elf_binary, load_elf_library},
#endif
#ifdef CONFIG_BINFMT_COFF
{load_coff_binary, load_coff_library},
#endif
{NULL, NULL}
}; /*
* These are the functions used to load a.out style executables and shared
* libraries. There is no binary dependent code anywhere else.
*/ //a.out执行文件的格式
// 8 struct exec
// 9 {
// 10   unsigned long a_info;         /* Use macros N_MAGIC, etc for access */
// 11   unsigned a_text;              /* length of text, in bytes */
// 12   unsigned a_data;              /* length of data, in bytes */
// 13   unsigned a_bss;               /* length of uninitialized data area for file, in bytes */
// 14   unsigned a_syms;              /* length of symbol table data in file, in bytes */
// 15   unsigned a_entry;             /* start address */
// 16   unsigned a_trsize;            /* length of relocation info for text, in bytes */
// 17   unsigned a_drsize;            /* length of relocation info for data, in bytes */
// 18 };
/*
 * load_aout_binary() - binary-format handler for a.out executables.
 *
 * bprm: prepared program description; bprm->buf holds the start of the
 *       file (read as a struct exec), bprm->page/p/argc/envc the copied
 *       argument strings.
 * regs: register frame; eip and esp are set to the new entry point and
 *       stack.
 *
 * Returns -ENOEXEC when the header is not an acceptable a.out header (so
 * another format can be tried).  After flush_old_exec() has run it
 * returns 0 - also when a mapping failed and SIGSEGV was queued - or the
 * negative value from open_inode().
 */
int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
{
	struct exec ex;
	struct file * file;
	int fd, error;
	unsigned long p = bprm->p;

	ex = *((struct exec *) bprm->buf);	/* exec-header */
	/* Relocation info must be absent, and the file must be long enough for what the header claims. */
	if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
	     N_MAGIC(ex) != QMAGIC) ||
	    ex.a_trsize || ex.a_drsize ||
	    bprm->inode->i_size < ex.a_text+ex.a_data+ex.a_syms+N_TXTOFF(ex)) {
		return -ENOEXEC;
	}

	if (N_MAGIC(ex) == ZMAGIC &&
	    (N_TXTOFF(ex) < bprm->inode->i_sb->s_blocksize)) {
		printk("N_TXTOFF < BLOCK_SIZE. Please convert binary.");
		return -ENOEXEC;
	}

	/*
	 * Applies when the file uses the ZMAGIC flavour of a.out; the library
	 * loader performs a similar check.
	 */
	if (N_TXTOFF(ex) != BLOCK_SIZE && N_MAGIC(ex) == ZMAGIC) {
		printk("N_TXTOFF != BLOCK_SIZE. See a.out.h.");
		return -ENOEXEC;
	}

	/* OK, This is the point of no return */
	flush_old_exec(bprm);

	current->end_code = N_TXTADDR(ex) + ex.a_text;
	current->end_data = ex.a_data + current->end_code;
	/* brk starts at the end of the data segment; later allocations may move it */
	current->start_brk = current->brk = current->end_data;
	current->start_code += N_TXTADDR(ex);
	current->rss = 0;

	current->suid = current->euid = bprm->e_uid;
	current->mmap = NULL;
	current->executable = NULL;	/* for OMAGIC files */
	current->sgid = current->egid = bprm->e_gid;
	if (N_MAGIC(ex) == OMAGIC) {
		/* map text and data as one anonymous area, then read the file into it */
		do_mmap(NULL, 0, ex.a_text+ex.a_data,
			PROT_READ|PROT_WRITE|PROT_EXEC,
			MAP_FIXED|MAP_PRIVATE, 0);
		/* skip the fixed 32-byte a.out header; the text follows it directly */
		read_exec(bprm->inode, 32, (char *) 0, ex.a_text+ex.a_data);
	} else {
		/* The other formats load text and data page by page, so both should be page aligned. */
		if (ex.a_text & 0xfff || ex.a_data & 0xfff)
			printk("%s: executable not page aligned\n", current->comm);

		fd = open_inode(bprm->inode, O_RDONLY);

		if (fd < 0)
			return fd;
		file = current->filp[fd];
		/* No mmap method: fall back to an anonymous mapping filled by read_exec(). */
		if (!file->f_op || !file->f_op->mmap)
		{
			sys_close(fd);
			do_mmap(NULL, 0, ex.a_text+ex.a_data,
				PROT_READ|PROT_WRITE|PROT_EXEC,
				MAP_FIXED|MAP_PRIVATE, 0);
			read_exec(bprm->inode, N_TXTOFF(ex),
				  (char *) N_TXTADDR(ex), ex.a_text+ex.a_data);
			goto beyond_if;
		}
		/* map the read-only text segment */
		error = do_mmap(file, N_TXTADDR(ex), ex.a_text,
				PROT_READ | PROT_EXEC,
				MAP_FIXED | MAP_SHARED, N_TXTOFF(ex));

		if (error != N_TXTADDR(ex)) {
			sys_close(fd);
			send_sig(SIGSEGV, current, 0);
			return 0;
		};
		/* map the readable and writable data segment */
		error = do_mmap(file, N_TXTADDR(ex) + ex.a_text, ex.a_data,
				PROT_READ | PROT_WRITE | PROT_EXEC,
				MAP_FIXED | MAP_PRIVATE, N_TXTOFF(ex) + ex.a_text);
		sys_close(fd);
		if (error != N_TXTADDR(ex) + ex.a_text) {
			send_sig(SIGSEGV, current, 0);
			return 0;
		};
		/* Keep a counted reference to the inode as the task's executable. */
		current->executable = bprm->inode;
		bprm->inode->i_count++;
	}
beyond_if:
	sys_brk(current->brk+ex.a_bss);	/* extend the data segment by the bss */

	/* Place the argument pages, then turn the offset p into a user address. */
	p += change_ldt(ex.a_text,bprm->page);
	p -= MAX_ARG_PAGES*PAGE_SIZE;
	p = (unsigned long) create_tables((char *)p,bprm->argc,bprm->envc,0);
	current->start_stack = p;
	regs->eip = ex.a_entry;		/* eip, magic happens :-) */
	regs->esp = p;			/* stack pointer */
	if (current->flags & PF_PTRACED)
		send_sig(SIGTRAP, current, 0);
	return 0;
}

/*
 * Annotator's notes on the a.out flavours:
 * a.out is an executable format that comes in OMAGIC, NMAGIC, QMAGIC and
 * ZMAGIC variants.
 * OMAGIC: the segments follow the header directly; text and data are
 *         stored together.
 * NMAGIC: much like OMAGIC, but the data segment is loaded right after
 *         the text segment and the text is marked read-only.
 * ZMAGIC: adds demand-paging support; header and text are on separate
 *         pages.
 * QMAGIC: the header is merged into the first page of the text, which
 *         saves space.
 * So for ZMAGIC the entry address is page aligned, while for QMAGIC it is
 * offset by the 32-byte header.
 */
/*
 * load_aout_library() - shared-library handler for a.out (ZMAGIC/QMAGIC)
 * libraries.
 *
 * fd: descriptor of the already opened library; current->filp[fd] is used
 *     without further checks (sys_uselib() validates it before calling).
 *
 * Reads the a.out header, validates it, maps text+data privately at the
 * page containing a_entry and adds an anonymous mapping for any bss that
 * extends beyond the page-aligned end of text+data.
 *
 * Returns 0 on success, -EACCES if the header cannot be read in full,
 * -ENOEXEC if the header is not acceptable, or the do_mmap() result if
 * the main mapping did not land at the requested address.
 */
int load_aout_library(int fd)
{
	struct file * file;
	struct exec ex;
	struct inode * inode;
	unsigned int len;
	unsigned int bss;
	unsigned int start_addr;
	int error;
	int nread;

	file = current->filp[fd];
	inode = file->f_inode;

	/*
	 * The header is read into a kernel buffer, so fs must point at the
	 * kernel segment for the duration of the read.  fs is switched back
	 * before the result is examined, so that the short-read error path
	 * no longer returns to the caller with fs still set to KERNEL_DS.
	 */
	set_fs(KERNEL_DS);
	nread = file->f_op->read(inode, file, (char *) &ex, sizeof(ex));
	set_fs(USER_DS);
	if (nread != sizeof(ex))
		return -EACCES;

	/* We come in here for the regular a.out style of shared libraries */
	if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || ex.a_trsize ||
	    ex.a_drsize || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) ||
	    inode->i_size < ex.a_text+ex.a_data+ex.a_syms+N_TXTOFF(ex)) {
		return -ENOEXEC;
	}
	if (N_MAGIC(ex) == ZMAGIC && N_TXTOFF(ex) &&
	    (N_TXTOFF(ex) < inode->i_sb->s_blocksize)) {
		/* annotator's note: the two should be equal */
		printk("N_TXTOFF < BLOCK_SIZE. Please convert library\n");
		return -ENOEXEC;
	}

	if (N_FLAGS(ex))
		return -ENOEXEC;

	/* For QMAGIC, the starting address is 0x20 into the page. We mask
	   this off to get the starting address for the page */
	start_addr = ex.a_entry & 0xfffff000;	/* page address */

	/* Now use mmap to map the library into memory. */
	error = do_mmap(file, start_addr, ex.a_text + ex.a_data,
			PROT_READ | PROT_WRITE | PROT_EXEC, MAP_FIXED | MAP_PRIVATE,
			N_TXTOFF(ex));
	if (error != start_addr)
		return error;

	len = PAGE_ALIGN(ex.a_text + ex.a_data);
	bss = ex.a_text + ex.a_data + ex.a_bss;
	/* bss reaches past the pages mapped above: add an anonymous mapping for the rest. */
	if (bss > len)
		do_mmap(NULL, start_addr + len, bss-len,
			PROT_READ|PROT_WRITE|PROT_EXEC,
			MAP_PRIVATE|MAP_FIXED, 0);
	return 0;
}
/* Document URL: http://blogimg.chinaunix.net/blog/upfile2/090524204040.pdf */