(转载)Linux 内核文件系统与设备操作流程分析-2

时间：2010-12-29 来源：tuohuang0303

到这里才是查找对应 struct dentry 的具体操作，此函数首先从缓存中尝试获取
struct dentry 结构。如果获取失败，则调用 real_lookup() 函数使用实际文件
系统方法来读取 inode 信息。这里要明确 struct dentry 中包含了 struct inode
信息。

/*
 * Look up one path component `name` relative to nd->dentry and report the
 * result through `path`.
 *
 * Returns 0 on success with path->mnt and path->dentry filled in; on
 * failure returns the error code carried by the dentry pointer
 * (PTR_ERR(dentry)).
 */
static int do_lookup(struct nameidata *nd, struct qstr *name,
             struct path *path)
{
    struct vfsmount *mnt = nd->mnt;

        //
        // Fast path: ask __d_lookup() for the dentry. The article
        // describes this as a search of the cached hash list (hlist);
        // __d_lookup() itself is not shown in this excerpt.
        //
    struct dentry *dentry = __d_lookup(nd->dentry, name);

        //
        // Nothing returned: fall back to real_lookup(), which goes
        // through the actual filesystem's lookup method.
        // A returned dentry whose d_op provides d_revalidate must be
        // revalidated before it is used.
        //
    if (!dentry)
        goto need_lookup;
    if (dentry->d_op && dentry->d_op->d_revalidate)
        goto need_revalidate;
done:
        //
        // A usable dentry is available (from either path): fill in
        // struct path, call __follow_mount() on it and return success.
        //
    path->mnt = mnt;
    path->dentry = dentry;
    __follow_mount(path);
    return 0;

need_lookup:

        //
        // Use the actual filesystem's method to obtain the entry
        // (per the article, this reads the inode information from disk).
        //
    dentry = real_lookup(nd->dentry, name, nd);
    if (IS_ERR(dentry))
        goto fail;
    goto done;

need_revalidate:
    dentry = do_revalidate(dentry, nd);

        //
        // Cached branch: if do_revalidate() returns NULL the dentry is
        // treated as unusable and real_lookup() is tried; an error
        // pointer is propagated through the fail path.
        //
    if (!dentry)
        goto need_lookup;
    if (IS_ERR(dentry))
        goto fail;
    goto done;

fail:
    return PTR_ERR(dentry);
}

在分析 real_lookup() 函数前，我们先来看一下 ext3 文件系统的 inode
结构。很明显可以看出 lookup 指向了 ext3_lookup() 函数。

/*
 * Inode operations table for ext3 directories (excerpt).
 * Only the field relevant to the discussion is listed.
 */
struct inode_operations ext3_dir_inode_operations = {

        //
        // For clarity, only the fields of interest are shown;
        // the "......" lines mark omitted members.
        //

        ......

        // The lookup method points at ext3_lookup().
    .lookup        = ext3_lookup,

        ......
};

此函数先从缓存中查找对应的 inode，如果没有则新分配一个 struct dentry
结构，然后调用 parent->d_inode->i_op->lookup 即调用了 ext3_lookup()
函数来查找 inode。

/*
 * Slow-path lookup of `name` under `parent` (abridged excerpt).
 *
 * Tries d_lookup() first; if that returns nothing, allocates a new dentry
 * and asks the parent directory inode's i_op->lookup method to fill it in.
 * The "......" lines mark code omitted by the article; the lock that pairs
 * with the mutex_unlock() below and the path taken when d_lookup()
 * succeeds are not visible here.
 */
static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
{
    struct dentry * result;

        //
        // For brevity, only the relevant code of this function is listed.
        //

        //
        // Inode of the parent directory. The path is resolved one
        // component at a time, so (as the article notes) the parent's
        // inode is expected to exist already.
        //
    struct inode *dir = parent->d_inode;

        ......

        //
        // Try the cache first.
        //
    result = d_lookup(parent, name);
    if (!result) {

                //
                // Not found: allocate a new struct dentry.
                // Note that a dentry is allocated here for whatever is
                // being looked up, i.e. every directory or file needs
                // its own dentry.
                //
        struct dentry * dentry = d_alloc(parent, name);

                // Default result if the allocation failed.
        result = ERR_PTR(-ENOMEM);
        if (dentry) {

                        //
                        // For an ext3 directory this calls ext3_lookup()
                        // (see ext3_dir_inode_operations).
                        //
            result = dir->i_op->lookup(dir, dentry, nd);

                        // A non-NULL return from ->lookup replaces the
                        // dentry allocated above, which is released;
                        // otherwise the new dentry is the result.
            if (result)
                dput(dentry);
            else
                result = dentry;
        }
        mutex_unlock(&dir->i_mutex);
        return result;
    }

      .......
}

这里到了实际文件系统的查找函数。首先根据第一个参数，也就是上级的 dentry
从 ext3_dir_entry_2 中得到新的 dentry 结构，并从其中得到相关的 inode number，
再调用 iget() 函数去获取相应的 struct inode 结构，最后将此 inode 与 dentry
进行关联。

/*
 * ext3 implementation of the directory ->lookup method.
 *
 * Searches directory `dir` for the name held in `dentry`. If an entry is
 * found, its inode number is validated and the inode is fetched with
 * iget(); the inode (or NULL when no entry exists) is then attached to the
 * dentry via d_splice_alias().
 *
 * Returns the value of d_splice_alias(), ERR_PTR(-ENAMETOOLONG) when the
 * name exceeds EXT3_NAME_LEN, or ERR_PTR(-EACCES) when an entry was found
 * but no inode could be obtained for it.
 */
static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
{
    struct ext3_dir_entry_2 *entry;
    struct buffer_head *buf;
    struct inode *found = NULL;    /* stays NULL when the name has no entry */
    unsigned long ino;

    if (dentry->d_name.len > EXT3_NAME_LEN)
        return ERR_PTR(-ENAMETOOLONG);

    /*
     * Locate the on-disk directory entry for this name. On success the
     * buffer holding it is returned and `entry` points at the entry,
     * which carries the inode number.
     */
    buf = ext3_find_entry(dentry, &entry);
    if (buf) {
        ino = le32_to_cpu(entry->inode);
        brelse (buf);

        /*
         * Only fetch the inode when the inode number is valid for this
         * super block; otherwise report the corruption and leave
         * `found` as NULL.
         */
        if (ext3_valid_inum(dir->i_sb, ino)) {
            found = iget(dir->i_sb, ino);
        } else {
            ext3_error(dir->i_sb, "ext3_lookup",
                   "bad inode number: %lu", ino);
        }

        if (!found)
            return ERR_PTR(-EACCES);
    }

    /* Associate the inode with the dentry and hand the result back. */
    return d_splice_alias(found, dentry);
}

在分析 iget() 函数之前，有必要先了解下超级块(super block)中的
相关字段与函数。

/*
 * struct super_block (excerpt): only the members relevant to the
 * discussion are listed; "......" marks omitted members.
 */
struct super_block {

        //
        // For clarity, only the fields of interest are shown.
        //

        ......

        //
        // Filesystem type descriptor. The article covers it in detail
        // in its discussion of mounting a filesystem.
        //
    struct file_system_type    *s_type;

        //
        // Super block operations table.
        //
    struct super_operations    *s_op;

        ......
};

下面是 ext3 文件系统的超级块(super block)函数集结构

/*
 * Super block operations table of the ext3 filesystem (excerpt);
 * "......" marks omitted members.
 */
static struct super_operations ext3_sops = {

        //
        // For clarity, only the fields of interest are shown.
        //

        ......

        //
        // read_inode is implemented by ext3_read_inode(); iget() invokes
        // it through sb->s_op->read_inode.
        //
    .read_inode    = ext3_read_inode,

        ......
};

终于走到了最终的读取函数！这个函数非常简单，在判断一些有效性后，直接调用
超级块(super block)函数集中的 read_inode 方法，也就是我们前面介绍的 ext3_sops
函数集中的 ext3_read_inode() 函数。

/*
 * Obtain the inode numbered `ino` on super block `sb`.
 *
 * The inode comes from iget_locked(). When it is flagged I_NEW, it is
 * filled in through the super block's read_inode method (ext3_read_inode()
 * for ext3) and then released with unlock_new_inode().
 *
 * Returns whatever iget_locked() returned, which may be NULL.
 */
static inline struct inode *iget(struct super_block *sb, unsigned long ino)
{
    struct inode *inode = iget_locked(sb, ino);

    /* No inode, or one that is not new: nothing to read. */
    if (!inode || !(inode->i_state & I_NEW))
        return inode;

    /* New inode: this is where ext3_read_inode() gets called for ext3. */
    sb->s_op->read_inode(inode);
    unlock_new_inode(inode);

    return inode;
}

到这里我们可以解释 ext3_read_inode() 函数是何时调用的了，可以说是
open_namei() 函数在路径转换时间接的调用了 iget() 函数，而 iget() 函
数则是调用了已经注册好的超级块(super block)函数集 ext3_sops 中的
ext3_read_inode() 函数来获取相应的 inode。其实这也就可以解释为什么
在 struct inode->i_fop 中（也就是 ext3_file_operations 函数集中）
open 操作函数 generic_file_open() 是个空操作。因为其对应的 inode
已经在 open_namei()->iget() 中得到了，得到了一个 inode 其实在实际
文件系统中就是一个打开操作，得到了 inode 当然就可以对它进行读/写
操作了。之所以提供了一个 generic_file_open() 应该是占位用的，占位
的目的应该是为了可以使用用户提供的操作方法。也就是说，如果你自己
写了一个 open 操作并赋值给 struct inode->i_fop->open 的话，系统会
调用你所提供的这个 open 操作。我们在上面分析 __dentry_open() 函数时
已经指出了这个调用点。以上的疑问都得到了解答，但这里又再次引出了一
个疑问，那就是这个已经注册好了的超级块(super block)函数集 ext3_sops
是什么时候注册的？要解答这个疑问我们只能从头，也就是 mount 文件系统
时进行分析。

在分析 mount 前我们首先来了解下如下结构，这个结构是在注册新的文件
系统时被作为参数传递的，注册文件系统的函数为 register_filesystem()。

/*
 * Descriptor of a filesystem type. According to the article, an instance
 * is passed to register_filesystem() when a new filesystem is registered.
 */
struct file_system_type {

        //
        // Filesystem name, e.g. "ext3".
        //
    const char *name;
    int fs_flags;

        //
        // Callback of the concrete filesystem that obtains the super
        // block. Invoked at mount time (see vfs_kern_mount()).
        //
    int (*get_sb) (struct file_system_type *, int,
               const char *, void *, struct vfsmount *);
    void (*kill_sb) (struct super_block *);

        //
        // Owning module.
        //
    struct module *owner;

        //
        // Pointer to the next filesystem type.
        //
    struct file_system_type * next;
    struct list_head fs_supers;
    struct lock_class_key s_lock_key;
    struct lock_class_key s_umount_key;
};

我们再来看下 ext3 文件系统是如何填充这个结构的。

/*
 * Filesystem type descriptor for ext3, listed in the same order as the
 * members of struct file_system_type.
 */
static struct file_system_type ext3_fs_type = {
    .name     = "ext3",
    .fs_flags = FS_REQUIRES_DEV,

    /* The get_sb callback points at ext3_get_sb(). */
    .get_sb   = ext3_get_sb,
    .kill_sb  = kill_block_super,
    .owner    = THIS_MODULE,
};

最终使用  register_filesystem( &ext3_fs_type ); 完成文件系统的注册。
这里仅是注册了文件系统，我们知道要使用一个文件系统首先要 mount 才可
使用。我们清楚了以上结构后，接着来看 vfs_kern_mount() 函数，这个函数
是内核最终实现 mount 的函数，这个函数的第一个参数即是上面提到的
file_system_type 结构，在 ext3 文件系统下传递的是 ext3_fs_type。函数
中调用的 type->get_sb 即触发了 ext3_get_sb() 函数。

/*
 * Create a mount of filesystem `type`.
 *
 * Allocates a vfsmount for `name`, optionally copies security mount data
 * from `data`, and calls the filesystem's get_sb callback to obtain the
 * super block.
 *
 * Returns the new vfsmount on success, or an ERR_PTR() on failure:
 * -ENODEV when `type` is NULL, -ENOMEM when an allocation fails, otherwise
 * the error reported by the failing call.
 */
struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
    struct vfsmount *mnt;
    char *secdata = NULL;
    int error;

    if (!type)
        return ERR_PTR(-ENODEV);

        // Default error for the allocation failures below.
    error = -ENOMEM;

        //
        // Allocate a new vfsmount for the given name.
        //
    mnt = alloc_vfsmnt(name);
    if (!mnt)
        goto out;

        // Security data is only allocated and copied when mount data
        // was supplied.
    if (data) {
        secdata = alloc_secdata();
        if (!secdata)
            goto out_mnt;

        error = security_sb_copy_data(type, data, secdata);
        if (error)
            goto out_free_secdata;
    }

        //
        // Call the get_sb callback of the registered filesystem type.
        // For ext3 this is ext3_get_sb() (see ext3_fs_type).
        //
    error = type->get_sb(type, flags, name, data, mnt);
    if (error < 0)
        goto out_free_secdata;

    error = security_sb_kern_mount(mnt->mnt_sb, secdata);
    if (error)
        goto out_sb;

        //
        // The mount point is a dentry; here it is set to the mount's
        // own root dentry.
        //
    mnt->mnt_mountpoint = mnt->mnt_root;

        //
        // The new vfsmount is made its own parent. (The article adds
        // that the parent link can be used to walk mounted filesystems;
        // that is not shown in this excerpt.)
        //
    mnt->mnt_parent = mnt;
    up_write(&mnt->mnt_sb->s_umount);
    free_secdata(secdata);
    return mnt;

//
// The code below is reached only on error. The labels fall through, so
// each entry point undoes its own step and everything acquired before it.
//
out_sb:
    dput(mnt->mnt_root);
    up_write(&mnt->mnt_sb->s_umount);
    deactivate_super(mnt->mnt_sb);
out_free_secdata:
    free_secdata(secdata);
out_mnt:
    free_vfsmnt(mnt);
out:
    return ERR_PTR(error);
}

下面的 ext3_get_sb() 函数仅是个简单的封装，直接调用的 get_sb_bdev()
函数，但这里要注意 get_sb_bdev() 函数不是严格按照 ext3_get_sb() 函数
进行传递的，它本身多出了一个 ext3_fill_super 参数，而这个参数是以一个
回调函数形式提供的。

/*
 * get_sb callback of the ext3 filesystem type.
 *
 * Thin wrapper that forwards all of its arguments to get_sb_bdev() and
 * additionally supplies ext3_fill_super() as the fill-super callback.
 * Returns the result of get_sb_bdev() unchanged.
 */
static int ext3_get_sb(struct file_system_type *fs_type,
    int flags, const char *dev_name, void *data, struct vfsmount *mnt)
{
    int rc;

    /* Note the extra callback argument: ext3_fill_super(). */
    rc = get_sb_bdev(fs_type, flags, dev_name, data,
                     ext3_fill_super, mnt);

    return rc;
}

了解了以上结构我们再来看 ext3_fill_super() 函数的具体实现，这个函数的第
一个参数即是一个超级块(super block)结构。在此函数中将上面提到的 ext3 超级
块(super block) 函数集 ext3_sops 赋给了此结构。然后调用 iget() 函数触发
超级块(super block) 函数集。

/*
 * Fill in super block `sb` for an ext3 filesystem (heavily abridged
 * excerpt). Only the parts that install the operation tables and set up
 * the root dentry are listed; the declarations of `root` and `sbi`, the
 * failed_mount4 label and the return statement are among the omitted code.
 */
static int ext3_fill_super (struct super_block *sb, void *data, int silent)
{

        //
        // For brevity, only the relevant code of this function is listed.
        //

        //
        // Install the super block operation tables. This is where
        // ext3_sops becomes sb->s_op.
        //
    sb->s_op = &ext3_sops;
    sb->s_export_op = &ext3_export_ops;
    sb->s_xattr = ext3_xattr_handlers;
#ifdef CONFIG_QUOTA
    sb->s_qcop = &ext3_qctl_operations;
    sb->dq_op = &ext3_quota_operations;
#endif
    INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */

    sb->s_root = NULL;

        //
        // Fetch the root inode (EXT3_ROOT_INO) through iget().
        //
    root = iget(sb, EXT3_ROOT_INO);

        //
        // Allocate the super block's s_root from the root inode;
        // s_root is a struct dentry.
        //
    sb->s_root = d_alloc_root(root);

        //
        // No root dentry: log an error, drop the inode reference and
        // jump to the failure path.
        //
    if (!sb->s_root) {
        printk(KERN_ERR "EXT3-fs: get root inode failed\n");
        iput(root);
        goto failed_mount4;
    }

        //
        // The root inode must be a directory with non-zero i_blocks and
        // non-zero i_size; otherwise release the root dentry, log an
        // error and jump to the failure path.
        //
    if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
        dput(sb->s_root);
        sb->s_root = NULL;
        printk(KERN_ERR "EXT3-fs: corrupt root inode, run e2fsck\n");
        goto failed_mount4;
    }

}

至此所有流程都走到了，疑问也被一个个打破。我们再整体地梳理一下流程。在内核
sys_open 被调用打开一个文件或者设备驱动时，调用 filp_open()->do_filp_open()
函数，在 do_filp_open() 函数中，首先利用 open_namei() 函数得到一个 struct
nameidata 结构，那么在这个过程中 __path_lookup_intent_open() 函数设置了
struct nameidata->intent.open 相关字段，然后调用 do_path_lookup() 函数，在这
个函数中设置了 struct nameidata->mnt 与 struct nameidata->dentry 相关字段后
调用了 __link_path_walk() 函数开始分解路径，并依次调用 do_lookup() 函数来
获得路径中各目录与最终文件的 struct inode。do_lookup() 函数先从 inode 缓存
即 hlist 中查找 inode，如果没有找到则调用 real_lookup() 函数，此函数分配
了一个 struct dentry 结构，然后使用上层目录的 struct inode->i_op->lookup()
方法来继续查找，这样就触发了 ext3_lookup() 函数，而此函数得到 struct dentry
与 inode number 后调用 iget() 函数来返回 struct inode。（这里有必要强调一点，
那就是不仅目录才有 struct dentry 结构，一个文件也拥有一个 struct dentry 结
构，这个从上面具体代码分析中可以看到）而 iget() 函数是使用 struct inode 超
级块(super block)中的函数 ext3_read_inode() 来最终完成从磁盘读取 inode 操
作，读到一个 in core 类型的 struct inode 后为了提供文件与设备读/写等操作设
置了 struct inode->i_op 与 struct inode->i_fop 函数集。其实以上步骤按照提供
的系统调用以及内核操作流程来理解等于是打开了一个文件或目录。这也就是为什么
在 ext3_file_operations 函数集中只有读/写等操作，而打开是空操作的原因。至于
为什么提供一个空操作函数，在上面分析时已经给出了，这里不再阐述。到此
struct inode，struct dentry, struct nameidata 结构都已完全填充好。在
open_namei() 调用返回后将得到的 nameidata 结构作为参数调用
nameidata_to_filp() 函数，在此函数当中使用 struct dentry 作参数调用了
__dentry_open() 函数，在这个函数中会动态初始化一个 struct file 结构，并使用
struct inode->i_fop 函数集来填充 struct file->f_op （别忘了，我们前面的 inode
结构中相关域都已经准备好了，这里直接拿来使用即可）。那么不管是文件还是设备驱
动，可以看出来是走到具体文件系统这里才开始区分的。如果是目录/文件/连接则直接
使用 ext3_file_xxx 或 ext3_dir_xxx 等函数集。如果操作对象是一个设备驱动的话
则使用 init_special_inode 来初始化不同的设备驱动，如果是一个字符设备驱动的
话则调用 chrdev_open() 函数来对应 struct file 操作集。而上面提到的超级块
(super block) 函数是在注册文件系统时由 register_filesystem() 函数注册，
在 mount 时由 vfs_kern_mount() 函数间接调用 ext3_fill_super() 函数时进行关联
的。具体可以看上面的代码分析，这里不再详述。所有流程清晰后我们再说一下 struct
inode 中的几个函数集的区别与作用。我们这里仅以文件/目录为例进行解释，struct
inode_operations 操作是对文件(inode)的建立/查找(打开)/删除/重命名操作，struct
file_operations 操作是对已经存在的文件的读/写/刷新/列目录(readdir)/发送控制字
操作。

参考：linux kernel source 2.6.19.1

/usr/fs/ext3/inode.c
/usr/fs/ext3/namei.c
/usr/fs/ext3/super.c
/usr/fs/ext3/file.c
/usr/fs/ext3/dir.c
/usr/fs/block_dev.c
/usr/fs/open.c
/usr/fs/dcache.h
/usr/fs/inode.c
/usr/fs/namei.c
/usr/fs/super.c
/include/linux/mount.h

感谢 zhuzj 与我探讨。

感谢我的发小齐佳佳，并将本次劳动成果献给齐佳佳。