sys_umount系统调用的实现注释。2.4版内核 | 趁着年轻

首页 > LINUX内核 > sys_umount系统调用的实现注释。2.4版内核

sys_umount系统调用的实现注释。2.4版内核

2012年7月1日 kulv 发表评论 | 阅读评论 | 5914次阅读

/*
 * Now umount can handle mount points as well as block devices.
 * This is important for filesystems which use unnamed block devices.
 *
 * We now support a flag for forced unmount like the other 'big iron'
 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
 */
/*
 * sys_umount - system call entry point for unmounting a filesystem.
 * @name:  user-space pointer to the path to unmount.
 * @flags: umount flags, passed through unchanged to do_umount().
 *
 * Runs entirely under the big kernel lock.  Returns 0 on success or a
 * negative errno: the getname()/path_walk() error, -EINVAL when the path is
 * not the root of a mount, -EPERM when the caller is neither privileged nor
 * the mount owner, or whatever do_umount() returns.
 */
asmlinkage long sys_umount(char * name, int flags)
{
	struct nameidata nd;
	char *kname;
	int retval;

	lock_kernel();

	kname = getname(name);
	if (IS_ERR(kname)) {
		retval = PTR_ERR(kname);
		goto out;
	}

	/* Resolve the path; on success nd holds the target's dentry and vfsmount. */
	if (path_init(kname, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &nd))
		retval = path_walk(kname, &nd);
	else
		retval = 0;
	putname(kname);
	if (retval)
		goto out;

	/* Only the root dentry of a mount may be unmounted. */
	if (nd.dentry != nd.mnt->mnt_root) {
		retval = -EINVAL;
		goto dput_and_out;
	}

	/* Caller needs CAP_SYS_ADMIN or must match the owner recorded on the mount. */
	if (!capable(CAP_SYS_ADMIN) && current->uid != nd.mnt->mnt_owner) {
		retval = -EPERM;
		goto dput_and_out;
	}

	/*
	 * Drop our dentry reference here; the reference on nd.mnt is released
	 * by do_umount(), which calls mntput() on every path.
	 */
	dput(nd.dentry);
	down(&mount_sem);
	retval = do_umount(nd.mnt, 0, flags);
	up(&mount_sem);
	goto out;

dput_and_out:
	path_release(&nd);
out:
	unlock_kernel();
	return retval;
}

/*
 * do_umount - detach one mount and, if it was the only one, shut down its
 * superblock.
 * @mnt:         the vfsmount to remove.  A reference on it is dropped with
 *               mntput() on every path through this function.
 * @umount_root: when zero, a request aimed at the caller's own root mount is
 *               turned into a read-only remount instead.  The value is also
 *               handed on to kill_super().  sys_umount() passes 0.
 * @flags:       umount flags from sys_umount(); only MNT_FORCE is looked at.
 *
 * Returns 0 on success, -EBUSY when the mount is still in use, or the result
 * of do_remount_sb() in the root-mount case.
 */
static int do_umount(struct vfsmount *mnt, int umount_root, int flags)
{
	struct super_block * sb = mnt->mnt_sb;// superblock behind this mount

	/*
	 * No sense to grab the lock for this test, but test itself looks
	 * somewhat bogus. Suggestions for better replacement?
	 * Ho-hum... In principle, we might treat that as umount + switch
	 * to rootfs. GC would eventually take care of the old vfsmount.
	 * The problem being: we have to implement rootfs and GC for that ;-)
	 * Actually it makes sense, especially if rootfs would contain a
	 * /reboot - static binary that would close all descriptors and
	 * call reboot(9). Then init(8) could umount root and exec /reboot.
	 */
	if (mnt == current->fs->rootmnt && !umount_root) {
		int retval = 0;// target is the calling process's root mount
		/*
		 * Special case for "unmounting" root ...
		 * we just try to remount it readonly.
		 */
		mntput(mnt);
		if (!(sb->s_flags & MS_RDONLY))// s_flags is the superblock's state, not the 'flags' argument: remount only if not already read-only
			retval = do_remount_sb(sb, MS_RDONLY, 0);
		return retval;
	}

	spin_lock(&dcache_lock);

	// next != prev on mnt_instances means this vfsmount is not the only
	// entry on that list, i.e. presumably the same superblock is mounted
	// more than once -- TODO confirm against the list's owner.
	if (mnt->mnt_instances.next != mnt->mnt_instances.prev) {
		if (atomic_read(&mnt->mnt_count) > 2) {// more than the two references we hold ourselves: still in use
			spin_unlock(&dcache_lock);
			mntput(mnt);// just give back our reference
			return -EBUSY;
		}
		if (sb->s_type->fs_flags & FS_SINGLE)// FS_SINGLE filesystem types get an extra put_filesystem() here
			put_filesystem(sb->s_type);
		/* We hold two references, so mntput() is safe */
		mntput(mnt);
		// Other mounts of this superblock remain, so the superblock itself is
		// left alone.  remove_vfsmnt() unlinks this vfsmount from its lists,
		// drops the references it holds, frees it and releases dcache_lock.
		remove_vfsmnt(mnt);
		return 0;
	}
	spin_unlock(&dcache_lock);

	// From here on this is the only mount of the superblock.
	/*
	 * Before checking whether the filesystem is still busy,
	 * make sure the kernel doesn't hold any quota files open
	 * on the device. If the umount fails, too bad -- there
	 * are no quotas running any more. Just turn them on again.
	 */
	DQUOT_OFF(sb);// turn quotas off for this superblock (see the comment above)
	acct_auto_close(sb->s_dev);

	/*
	 * If we may have to abort operations to get out of this
	 * mount, and they will themselves hold resources we must
	 * allow the fs to do things. In the Unix tradition of
	 * 'Gee thats tricky lets do it in userspace' the umount_begin
	 * might fail to complete on the first run through as other tasks
	 * must return, and the like. Thats for the mount program to worry
	 * about for the moment.
	 */

	if( (flags&MNT_FORCE) && sb->s_op->umount_begin)
		sb->s_op->umount_begin(sb);// forced unmount: let the filesystem start aborting outstanding operations
		// umount_begin is an optional hook in sb->s_op, hence the NULL test above

	/*
	 * Shrink dcache, then fsync. This guarantees that if the
	 * filesystem is quiescent at this point, then (a) only the
	 * root entry should be in use and (b) that root entry is
	 * clean.
	 */
	shrink_dcache_sb(sb);// free this superblock's unused dentries from dentry_unused
	fsync_dev(sb->s_dev);// write cached buffers, superblock and inodes of this device back out

	// Any state bit still set on the root inode after the sync is treated as busy.
	if (sb->s_root->d_inode->i_state) {
		mntput(mnt);
		return -EBUSY;
	}

	/* Something might grab it again - redo checks */

	spin_lock(&dcache_lock);
	if (atomic_read(&mnt->mnt_count) > 2) {
		spin_unlock(&dcache_lock);
		mntput(mnt);
		return -EBUSY;
	}

	/* OK, that's the point of no return */
	mntput(mnt);
	remove_vfsmnt(mnt);
	// The vfsmount is now unlinked from every list it was on and freed;
	// remove_vfsmnt() also released dcache_lock.

	// Finally shut the superblock itself down; umount_root is passed through
	// to kill_super().
	kill_super(sb, umount_root);
	return 0;
}

/*
 * Called with spinlock held, releases it.
 */
static void remove_vfsmnt(struct vfsmount *mnt)
{
	struct vfsmount *parent;

	/*
	 * Unhook the vfsmount from every list it sits on while dcache_lock
	 * is still held by the caller: the per-superblock instance list, the
	 * mount-point clash list, the global mount list and the parent's
	 * child list.
	 */
	list_del(&mnt->mnt_instances);
	list_del(&mnt->mnt_clash);
	list_del(&mnt->mnt_list);
	list_del(&mnt->mnt_child);
	spin_unlock(&dcache_lock);

	/* Unreachable from the lists now; release what the mount was holding. */
	parent = mnt->mnt_parent;
	if (parent != mnt)	/* skipped when the mount is its own parent */
		mntput(parent);

	dput(mnt->mnt_mountpoint);	/* dentry the mount was attached to */
	dput(mnt->mnt_root);		/* root dentry of the mounted tree */

	if (mnt->mnt_devname)
		kfree(mnt->mnt_devname);
	kfree(mnt);
}

/*
 * Shrink the dcache for the specified super block.
 * This allows us to unmount a device without disturbing
 * the dcache for the other devices.
 *
 * This implementation makes just two traversals of the
 * unused list.  On the first pass we move the selected
 * dentries to the most recent end, and on the second
 * pass we free them.  The second pass must restart after
 * each dput(), but since the target dentries are all at
 * the end, it's really just a single traversal.
 */

/**
 * shrink_dcache_sb - shrink dcache for a superblock
 * @sb: superblock
 *
 * Shrink the dcache for the specified super block. This
 * is used to free the dcache before unmounting a file
 * system
 */
// Locking: takes dcache_lock on entry and releases it before returning.
void shrink_dcache_sb(struct super_block * sb)
{
	struct list_head *pos, *following;
	struct dentry *victim;

	spin_lock(&dcache_lock);

	/*
	 * Pass one: walk the global dentry_unused list and re-insert every
	 * dentry belonging to sb directly after the list head, so they end
	 * up grouped together at the front.
	 */
	for (pos = dentry_unused.next; pos != &dentry_unused; pos = following) {
		following = pos->next;	/* fetched first: pos may be moved below */
		victim = list_entry(pos, struct dentry, d_lru);
		if (victim->d_sb == sb) {
			list_del(pos);
			list_add(pos, &dentry_unused);
		}
	}

	/*
	 * Pass two: free one matching, unreferenced dentry at a time.
	 * prune_one_dentry() re-takes dcache_lock just before returning, so
	 * the lock was presumably dropped in between and the list may have
	 * changed; hence the scan restarts from the head after every prune.
	 */
rescan:
	for (pos = dentry_unused.next; pos != &dentry_unused; pos = following) {
		following = pos->next;
		victim = list_entry(pos, struct dentry, d_lru);
		/* Skip dentries of other superblocks and ones still referenced. */
		if (victim->d_sb != sb || atomic_read(&victim->d_count))
			continue;
		dentry_stat.nr_unused--;
		list_del(pos);
		INIT_LIST_HEAD(pos);	/* leave d_lru as an empty, self-linked node */
		prune_one_dentry(victim);
		goto rescan;
	}
	spin_unlock(&dcache_lock);
}

/*
 * Tear down a single dentry: unhash it, unlink it from its parent's child
 * list, hand it to dentry_iput(), free it with d_free(), and finally dput()
 * its parent.
 *
 * Returns with dcache_lock held (it is taken on the last line).
 * NOTE(review): shrink_dcache_sb() calls this with dcache_lock already held,
 * so one of the helpers used here presumably releases it -- confirm in
 * dentry_iput().
 */
static inline void prune_one_dentry(struct dentry * dentry)
{
	struct dentry * parent;

	list_del_init(&dentry->d_hash);
	list_del(&dentry->d_child);
	dentry_iput(dentry);
	parent = dentry->d_parent;// saved now because dentry is freed on the next line
	d_free(dentry);
	if (parent != dentry)// presumably only a root dentry is its own parent -- TODO confirm
		dput(parent);
	spin_lock(&dcache_lock);
}
/*
 * fsync_dev - flush everything cached for a device back to it.
 * @dev: device number; the helpers below treat 0 as "all devices".
 *
 * Returns the result of the final, waiting sync_buffers() pass.
 */
int fsync_dev(kdev_t dev)
{
	int err;

	/* First buffer pass: wait == 0, so locked buffers are skipped, not waited for. */
	sync_buffers(dev, 0);

	/* Superblock, inode and quota write-back all run under the big kernel lock. */
	lock_kernel();
	sync_supers(dev);	/* calls write_super on matching dirty superblocks */
	sync_inodes(dev);	/* writes back the inodes on each matching s_dirty list */
	DQUOT_SYNC(dev);
	unlock_kernel();

	/* Final buffer pass: wait == 1, so this may sleep on locked buffers. */
	err = sync_buffers(dev, 1);
	return err;
}

/* Call sync_buffers with wait!=0 to ensure that the call does not
 * return until all buffer writes have completed.  Sync() may return
 * before the writes have finished; fsync() may not.
 */

/* Godamity-damn.  Some buffers (bitmaps for filesystems)
 * spontaneously dirty themselves without ever brelse being called.
 * We will ultimately want to put these in a separate list, but for
 * now we search all of the lists for dirty buffers.
 */
/*
 * sync_buffers - write out dirty buffers for a device.
 * @dev:  device to sync; 0 matches every device.
 * @wait: 0 for a single non-blocking pass, non-zero to keep going (up to
 *        three passes) until the buffers have been written and unlocked.
 *
 * Returns 0, or -EIO if a requested buffer was found neither locked, dirty
 * nor up to date during a waiting call.
 */
static int sync_buffers(kdev_t dev, int wait)
{
	int i, retry, pass = 0, err = 0;
	struct buffer_head * bh, *next;

	/* One pass for no-wait, three for wait:
	 * 0) write out all dirty, unlocked buffers;
	 * 1) write out all dirty buffers, waiting if locked;
	 * 2) wait for completion by waiting for all buffers to unlock.
	 */
	do {
		retry = 0;

		/* We search all lists as a failsafe mechanism, not because we expect
		 * there to be dirty buffers on any of the other lists.
		 */
repeat:
		spin_lock(&lru_list_lock);
		bh = lru_list[BUF_DIRTY];	/* head of the dirty list */
		if (!bh)
			goto repeat2;

		/*
		 * The list is followed through b_next_free with an iteration
		 * cap of twice the recorded number of dirty buffers.
		 */
		for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
			next = bh->b_next_free;

			/* Defensive: stop if the dirty list has become empty. */
			if (!lru_list[BUF_DIRTY])
				break;
			/* dev == 0 means "any device"; otherwise skip foreign buffers. */
			if (dev && bh->b_dev != dev)
				continue;
			if (buffer_locked(bh)) {
				/* Buffer is locked; skip it unless wait is
				 * requested AND pass > 0.
				 */
				if (!wait || !pass) {
					retry = 1;	/* remember that something is left over */
					continue;
				}
				/* Pin the buffer, then sleep without the list lock. */
				atomic_inc(&bh->b_count);
				spin_unlock(&lru_list_lock);
				wait_on_buffer (bh);
				atomic_dec(&bh->b_count);
				/* The list may have changed while unlocked: rescan. */
				goto repeat;
			}

			/* If an unlocked buffer is not uptodate, there has
			 * been an IO error. Skip it.
			 */
			if (wait && buffer_req(bh) && !buffer_locked(bh) &&
			    !buffer_dirty(bh) && !buffer_uptodate(bh)) {
				err = -EIO;
				continue;
			}

			/* Don't write clean buffers.  Don't write ANY buffers
			 * on the third pass.
			 */
			if (!buffer_dirty(bh) || pass >= 2)
				continue;

			atomic_inc(&bh->b_count);
			spin_unlock(&lru_list_lock);
			ll_rw_block(WRITE, 1, &bh);	/* submit the write */
			atomic_dec(&bh->b_count);
			retry = 1;
			/* The lock was dropped for the write, so rescan from the head. */
			goto repeat;
		}

repeat2:
		/* Second list: buffers that are locked. */
		bh = lru_list[BUF_LOCKED];
		if (!bh) {
			spin_unlock(&lru_list_lock);
			break;
		}
		for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
			next = bh->b_next_free;

			if (!lru_list[BUF_LOCKED])
				break;
			if (dev && bh->b_dev != dev)
				continue;
			if (buffer_locked(bh)) {
				/* Buffer is locked; skip it unless wait is
				 * requested AND pass > 0.
				 */
				if (!wait || !pass) {
					retry = 1;
					continue;
				}
				atomic_inc(&bh->b_count);
				spin_unlock(&lru_list_lock);
				wait_on_buffer (bh);	/* may sleep until the buffer is unlocked */
				spin_lock(&lru_list_lock);
				atomic_dec(&bh->b_count);
				/* The list may have changed while we slept: rescan. */
				goto repeat2;
			}
		}
		spin_unlock(&lru_list_lock);

		/* If we are waiting for the sync to succeed, and if any dirty
		 * blocks were written, then repeat; on the second pass, only
		 * wait for buffers being written (do not pass to write any
		 * more buffers on the second pass).
		 */
	} while (wait && retry && ++pass <= 2);
	return err;
}

/*
 * sync_supers - write back dirty superblocks.
 * @dev: device whose superblock should be written; 0 matches every device.
 *
 * Walks the global super_blocks list and calls the filesystem's write_super
 * method, when it provides one, on each matching dirty superblock.
 */
void sync_supers(kdev_t dev)
{
	struct super_block * sb;

	for (sb = sb_entry(super_blocks.next);
	     sb != sb_entry(&super_blocks);
	     sb = sb_entry(sb->s_list.next)) {
		if (!sb->s_dev)
			continue;
		if (dev && sb->s_dev != dev)	/* not the device we were asked about */
			continue;
		if (!sb->s_dirt)
			continue;
		lock_super(sb);
		/*
		 * Re-check under the superblock lock, then let the filesystem
		 * write its superblock if it implements write_super.
		 * NOTE(review): the original annotation says this resolves to
		 * ext2_write_super for ext2; ext2_sops is not part of this
		 * excerpt -- confirm there.
		 */
		if (sb->s_dev && sb->s_dirt && (!dev || dev == sb->s_dev))
			if (sb->s_op && sb->s_op->write_super)
				sb->s_op->write_super(sb);
		unlock_super(sb);
	}
}

/**
 *	sync_inodes
 *	@dev: device to sync the inodes from.
 *
 *	sync_inodes goes through the super block's dirty list,
 *	writes them out, and puts them back on the normal list.
 */

void sync_inodes(kdev_t dev)
{
	struct super_block *sb = sb_entry(super_blocks.next);

	/*
	 * Walk the global super_blocks list under inode_lock and write back
	 * the dirty inodes of every superblock that matches dev (0 matches
	 * all devices).
	 */
	spin_lock(&inode_lock);
	while (sb != sb_entry(&super_blocks)) {
		int wanted = sb->s_dev && (!dev || sb->s_dev == dev);

		if (wanted) {
			/* s_dirty is this superblock's list of dirty inodes. */
			sync_list(&sb->s_dirty);

			/* A specific device was requested and handled: done. */
			if (dev)
				break;
		}
		sb = sb_entry(sb->s_list.next);
	}
	spin_unlock(&inode_lock);
}

/*
 * Drain an inode list from its tail: each call to sync_one() either moves
 * the tail inode off this list after writing it back, or waits for it when
 * it is locked, so the loop ends once the list is empty.
 */
static inline void sync_list(struct list_head *head)
{
	struct list_head *last;

	for (;;) {
		last = head->prev;
		if (last == head)
			break;
		sync_one(list_entry(last, struct inode, i_list), 0);
	}
}

/*
 * sync_one - write back a single inode.
 * @inode: inode to write back.
 * @sync:  passed through to write_inode().
 *
 * Entered with inode_lock held and returns with it held; the lock is
 * dropped around anything that may block.
 */
static inline void sync_one(struct inode *inode, int sync)
{
	struct list_head *dest;
	unsigned was_dirty;

	if (inode->i_state & I_LOCK) {
		/*
		 * The inode is locked: take a reference so it stays around,
		 * wait for it without holding inode_lock, then drop the
		 * reference and let the caller look at the list again.
		 */
		__iget(inode);
		spin_unlock(&inode_lock);
		__wait_on_inode(inode);
		iput(inode);
		spin_lock(&inode_lock);
		return;
	}

	/* Move the inode off its current list onto the in-use or unused list. */
	list_del(&inode->i_list);
	if (atomic_read(&inode->i_count))
		dest = &inode_in_use;
	else
		dest = &inode_unused;
	list_add(&inode->i_list, dest);

	/* Set I_LOCK, reset I_DIRTY -- remembering which dirty bits were set. */
	was_dirty = inode->i_state & I_DIRTY;
	inode->i_state |= I_LOCK;
	inode->i_state &= ~I_DIRTY;
	spin_unlock(&inode_lock);

	/* Start write-out of the inode's mapping. */
	filemap_fdatasync(inode->i_mapping);

	/* Don't write the inode if only I_DIRTY_PAGES was set */
	if (was_dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC))
		write_inode(inode, sync);

	/* Wait on the mapping before the inode is unlocked again. */
	filemap_fdatawait(inode->i_mapping);

	spin_lock(&inode_lock);
	inode->i_state &= ~I_LOCK;
	wake_up(&inode->i_wait);	/* wake anyone sleeping on this inode */
}

分类: LINUX内核 | 标签: linux, 内核原理

评论 (0) Trackbacks (0) 发表评论 Trackback

本文目前尚无任何评论.

本文目前尚无任何 trackbacks 和 pingbacks.

上一篇: linux 内核处理缺页异常函数：do_page_fault，2.4.0版 | 下一篇: 金山卫士开源代码——消息机制浅析