详解sys_read和sys_write

来源:互联网 发布:现在开淘宝店晚吗 编辑:程序博客网 时间:2024/06/01 07:20

    内核源码:linux-2.6.38.8.tar.bz2

    目标平台:ARM体系结构

 

    在Linux内核中,系统调用read和write的定义如下所示:

/* arch/arm/include/asm/posix_types.h */#ifdef __GNUC__typedef long long__kernel_loff_t;#endif/* include/linux/types.h */#if defined(__GNUC__)typedef __kernel_loff_tloff_t;#endif/* fs/read_write.c */SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count){struct file *file;ssize_t ret = -EBADF;int fput_needed;file = fget_light(fd, &fput_needed);if (file) {loff_t pos = file_pos_read(file); //返回文件偏移量file->f_posret = vfs_read(file, buf, count, &pos);file_pos_write(file, pos); //重置文件偏移量file->f_pos = posfput_light(file, fput_needed);}return ret;}SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,size_t, count){struct file *file;ssize_t ret = -EBADF;int fput_needed;file = fget_light(fd, &fput_needed);if (file) {loff_t pos = file_pos_read(file); //在ARM平台上loff_t的长度都为64位ret = vfs_write(file, buf, count, &pos);file_pos_write(file, pos);fput_light(file, fput_needed); //这里主要是配合fget_light函数递减文件指针的引用计数}return ret;}
    fget_light函数用于根据文件描述符fd获得相应的文件指针,即struct file结构体实例。而fput_light函数在这里的主要作用是递减文件指针的引用计数。源代码如下所示:
/* fs/file_table.c */struct file *fget_light(unsigned int fd, int *fput_needed){struct file *file;struct files_struct *files = current->files; //从当前进程获取struct files_struct结构体实例*fput_needed = 0;if (atomic_read(&files->count) == 1) { //struct files_struct结构体实例的引用计数,大于1表示多个进程共享该实例file = fcheck_files(files, fd); //即files->fdt->->fd[fd]} else {rcu_read_lock();file = fcheck_files(files, fd);if (file) {if (atomic_long_inc_not_zero(&file->f_count)) //文件指针引用计数如果不为零则递增且返回真值,为零则直接返回假且不递增*fput_needed = 1;else/* Didn't get the reference, someone's freed */file = NULL;}rcu_read_unlock();}return file;}
/* include/linux/file.h */

/*
 * Counterpart of fget_light(): drops the file reference only when
 * fget_light() reported (via fput_needed) that it took one.
 */
static inline void fput_light(struct file *file, int fput_needed)
{
	if (fput_needed)
		fput(file);
}

/* fs/file_table.c */

/*
 * Drop one reference on file; when the count reaches zero, hand the file
 * to __fput().
 */
void fput(struct file *file)
{
	/* True when f_count becomes zero after the decrement. */
	if (atomic_long_dec_and_test(&file->f_count))
		__fput(file);
}
    紧接着,各自调用vfs_read或vfs_write函数。源代码如下所示:
/* fs/read_write.c */

/*
 * VFS-level read.
 *
 * Validates the file mode, the presence of a read method, the user buffer
 * and the requested range, then dispatches to the file's read method (or
 * to do_sync_read() when only aio_read is provided).
 *
 * Returns the result of the read method, or a negative errno:
 *   -EBADF   file not opened for reading
 *   -EINVAL  no usable read method
 *   -EFAULT  user buffer fails access_ok()
 *   or whatever rw_verify_area() reports.
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	/* The file must have been opened for reading. */
	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	/* At least one of ->read / ->aio_read must be provided. */
	if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
		return -EINVAL;
	/*
	 * VERIFY_WRITE because data is about to be written INTO user memory.
	 * (The ARM access_ok() macro does not use the type argument.)
	 */
	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
		return -EFAULT;

	ret = rw_verify_area(READ, file, pos, count);
	if (ret >= 0) {
		/* rw_verify_area() returns the (possibly clamped) byte count. */
		count = ret;
		if (file->f_op->read)
			/* Call the filesystem's own read method. */
			ret = file->f_op->read(file, buf, count, pos);
		else
			/* Synchronous read implemented on top of aio_read. */
			ret = do_sync_read(file, buf, count, pos);
		if (ret > 0) {
			/* File-event notification (IN_ACCESS). */
			fsnotify_access(file);
			/* Account the bytes read to the current task. */
			add_rchar(current, ret);
		}
		/* Count one more read syscall for the current task. */
		inc_syscr(current);
	}

	return ret;
}

/*
 * VFS-level write.
 *
 * Validates the file mode, the presence of a write method, the user buffer
 * and the requested range, then dispatches to the file's write method (or
 * to do_sync_write() when only aio_write is provided).
 *
 * Returns the result of the write method, or a negative errno:
 *   -EBADF   file not opened for writing
 *   -EINVAL  no usable write method
 *   -EFAULT  user buffer fails access_ok()
 *   or whatever rw_verify_area() reports.
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	/* The file must have been opened for writing. */
	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	/* At least one of ->write / ->aio_write must be provided. */
	if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
		return -EINVAL;
	/*
	 * VERIFY_READ because data is about to be read FROM user memory.
	 * (The ARM access_ok() macro does not use the type argument.)
	 */
	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
		return -EFAULT;

	ret = rw_verify_area(WRITE, file, pos, count);
	if (ret >= 0) {
		/* rw_verify_area() returns the (possibly clamped) byte count. */
		count = ret;
		if (file->f_op->write)
			/* Call the filesystem's own write method. */
			ret = file->f_op->write(file, buf, count, pos);
		else
			/* Synchronous write implemented on top of aio_write. */
			ret = do_sync_write(file, buf, count, pos);
		if (ret > 0) {
			/* File-event notification (IN_MODIFY). */
			fsnotify_modify(file);
			/* Account the bytes written to the current task. */
			add_wchar(current, ret);
		}
		/* Count one more write syscall for the current task. */
		inc_syscw(current);
	}

	return ret;
}
    1、access_ok函数用于确认所传入的用户空间内存[buf,buf+count]是否有效。源代码如下所示:
/* arch/arm/include/asm/uaccess.h */

/* The access type argument is not used; only the address range is checked. */
#define access_ok(type,addr,size)	(__range_ok(addr,size) == 0)

/*
 * We use 33-bit arithmetic here...
 *
 * __range_ok(addr, size) evaluates to 0 when the range is acceptable and
 * to a non-zero value (the untouched addr_limit) otherwise.
 *
 * __chk_user_ptr() is only a declaration used by the Sparse checker.
 *
 * The "0" input constraint ties current_thread_info()->addr_limit to
 * operand 0, so flag starts out holding addr_limit.  Then:
 *
 *   adds   roksum, addr, size     roksum = addr + size, carry set on
 *                                 unsigned overflow (sum > 0xffffffff)
 *   sbcccs roksum, roksum, flag   only if no overflow occurred:
 *                                 roksum = roksum - flag - 1 (borrow-in,
 *                                 since carry is clear), updating flags
 *   movcc  flag, #0               only if that subtraction borrowed,
 *                                 i.e. addr + size <= addr_limit
 *
 * In short: flag becomes 0 when addr + size <= addr_limit without
 * overflow; otherwise flag keeps its initial value addr_limit.
 * The value of flag is the value of the whole statement expression.
 */
#define __range_ok(addr,size) ({ \
	unsigned long flag, roksum; \
	__chk_user_ptr(addr);	\
	__asm__("adds %1, %2, %3; sbcccs %1, %1, %0; movcc %0, #0" \
		: "=&r" (flag), "=&r" (roksum) \
		: "r" (addr), "Ir" (size), "0" (current_thread_info()->addr_limit) \
		: "cc"); \
	flag; })

#define USER_DS			TASK_SIZE

/* arch/arm/include/asm/memory.h */
#define TASK_SIZE		(UL(CONFIG_PAGE_OFFSET) - UL(0x01000000))
    其中当前进程addr_limit的值在程序执行时通常在start_thread函数中通过调用set_fs函数将其初始化为USER_DS值。如果CONFIG_PAGE_OFFSET的值配置为0xc0000000,则USER_DS的值为0xbf000000,也就是所传入的用户空间地址buf+count不得超过该值(即buf+count小于等于0xbf000000,且相加时不发生32位溢出)。

    2、rw_verify_area函数主要用于确认文件可读写的区域。源代码如下所示:

/* arch/arm/include/asm/page.h */#define PAGE_SHIFT12#define PAGE_SIZE(_AC(1,UL) << PAGE_SHIFT)#define PAGE_MASK(~(PAGE_SIZE-1))/* include/linux/pagemap.h */#define PAGE_CACHE_MASKPAGE_MASK/* include/linux/kernel.h */#define INT_MAX((int)(~0U>>1))/* include/linux/fs.h */#define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK)/* fs/read_write.c */int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count){struct inode *inode;loff_t pos; //loff_t被定义为long longint retval = -EINVAL;inode = file->f_path.dentry->d_inode;if (unlikely((ssize_t) count < 0)) //读写字节数不能小于零return retval;pos = *ppos;if (unlikely(pos < 0)) {if (!unsigned_offsets(file)) //未配置FMODE_UNSIGNED_OFFSET标志,即不能将pos当作无符号整型return retval;if (count >= -pos) //pos加count的算术结果超过了unsigned long long能容纳的最大值(这里将pos当作无符号整型)return -EOVERFLOW;} else if (unlikely((loff_t) (pos + count) < 0)) { //pos加count的和超过了loff_t所能表达的最大正整数if (!unsigned_offsets(file))return retval;}if (unlikely(inode->i_flock && mandatory_lock(inode))) { //文件系统支持强制锁并且文件的S_ISGID置位但S_IXGRP必须未置位retval = locks_mandatory_area( //只被rw_verify_area和locks_verify_truncate等两个函数所调用,用于检查锁冲突read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,inode, file, pos, count);if (retval < 0)return retval;}retval = security_file_permission(file,read_write == READ ? MAY_READ : MAY_WRITE); //安全模块检查if (retval)return retval;return count > MAX_RW_COUNT ? MAX_RW_COUNT : count; //这里的MAX_RW_COUNT等于0x7ffff000}
    3、do_sync_read和do_sync_write等两个函数是使用相应的异步I/O来实现同步读写。源代码如下:
/* fs/read_write.c */ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos){struct iovec iov = { .iov_base = buf, .iov_len = len }; //保存将要存储数据的内存空间struct kiocb kiocb;ssize_t ret;//初始化kiocbinit_sync_kiocb(&kiocb, filp);kiocb.ki_pos = *ppos;kiocb.ki_left = len;kiocb.ki_nbytes = len;for (;;) {ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);if (ret != -EIOCBRETRY) //等于EIOCBRETRY时将触发下一次读取尝试break;wait_on_retry_sync_kiocb(&kiocb);}if (-EIOCBQUEUED == ret) //将获得完成事件ret = wait_on_sync_kiocb(&kiocb);*ppos = kiocb.ki_pos;return ret;}ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos){struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len }; //保存将要写入的数据struct kiocb kiocb;ssize_t ret;//初始化kiocbinit_sync_kiocb(&kiocb, filp);kiocb.ki_pos = *ppos;kiocb.ki_left = len;kiocb.ki_nbytes = len;for (;;) {ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);if (ret != -EIOCBRETRY) //等于EIOCBRETRY时将触发下一次写入尝试break;wait_on_retry_sync_kiocb(&kiocb);}if (-EIOCBQUEUED == ret)ret = wait_on_sync_kiocb(&kiocb);*ppos = kiocb.ki_pos;return ret;}/* fs/read_write.c */static void wait_on_retry_sync_kiocb(struct kiocb *iocb){set_current_state(TASK_UNINTERRUPTIBLE); //设置进程为不可被中断的睡眠状态if (!kiocbIsKicked(iocb))schedule(); //让出CPU时间elsekiocbClearKicked(iocb);__set_current_state(TASK_RUNNING); //设置进程为可运行状态}/* fs/aio.c */ssize_t wait_on_sync_kiocb(struct kiocb *iocb){while (iocb->ki_users) {set_current_state(TASK_UNINTERRUPTIBLE);if (!iocb->ki_users)break;io_schedule();}__set_current_state(TASK_RUNNING);return iocb->ki_user_data; //返回已成功读写的字节数}
    有些文件系统在定义普通文件的操作函数struct file_operations时,将成员read指向do_sync_read函数,而成员aio_read指向通用函数generic_file_aio_read;成员write指向do_sync_write函数,而成员aio_write指向通用函数generic_file_aio_write。

    综合上文的分析可知,文件读写的大部分操作实际上是在文件系统层来完成的,VFS层所做的操作非常之少。

原创粉丝点击