当前位置：首页 > news >正文

做安防在哪个网站做广告呢网站制作综述

news 2026/4/19 11:34:58

做安防在哪个网站做广告呢,网站制作综述,大连企业网站制作,电子商务网站建设与管理技能实训 结构体 fd fd也就是文件描述符，用于标识已经打开的文件、管道、socket等。是进程和内核的桥梁，允许进程执行各种文件操作 struct fd {struct file *file;unsigned int flags; };file Linux内核中表示打开文件的结构体，包含了文件操作所需… 结构体 fd fd也就是文件描述符，用于标识已经打开的文件、管道、socket等。是进程和内核的桥梁，允许进程执行各种文件操作 struct fd {struct file *file;unsigned int flags; };file Linux内核中表示打开文件的结构体，包含了文件操作所需的各种信息和元数据。这是文件系统操作的核心结构之一，允许内核跟踪每个打开的文件及其相关的状态。 struct file {// 用于链接或者引用计数union {// 链表节点struct llist_node fu_llist;// Read-Copy-Update头struct rcu_head fu_rcuhead;} f_u;// 文件路径信息struct path f_path;// 文件的 inode 结构体表示文件的具体内容和属性struct inode *f_inode; /* cached value */// 指向文件操作结构体的指针包含与文件相关的各种操作函数指针如读、写、打开、关闭等const struct file_operations *f_op;/** Protects f_ep_links, f_flags.* Must not be taken from IRQ context.*/spinlock_t f_lock;enum rw_hint f_write_hint;// 引用计数表示有多少引用指向这个文件结构体atomic_long_t f_count;// 文件标志描述文件的各种属性如只读、只写、非阻塞等unsigned int f_flags;// 文件模式指示文件的打开模式如读、写、执行等fmode_t f_mode;// 位置锁用于保护文件读写位置的锁struct mutex f_pos_lock;// 文件的读写位置偏移量loff_t f_pos;// 文件所有者结构体包含文件的拥有者和访问权限信息struct fown_struct f_owner;const struct cred *f_cred;// 文件预读取状态结构体包含文件预读取的相关信息struct file_ra_state f_ra;// 文件版本号表示文件的版本信息。u64 f_version; #ifdef CONFIG_SECURITYvoid *f_security; #endif/* needed for tty driver, and maybe others */void *private_data;#ifdef CONFIG_EPOLL/* Used by fs/eventpoll.c to link all the hooks to this file */// 用于事件轮询epoll系统调用的链表结struct list_head f_ep_links;struct list_head f_tfile_llink; #endif /* #ifdef CONFIG_EPOLL */// 地址空间指针表示文件的内存映射状态struct address_space *f_mapping;// 写回错误序列号用于跟踪文件写回操作的错误errseq_t f_wb_err; } __randomize_layout__attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ inode inode包含文件的所有元数据，支撑访问控制、文件操作、同步、状态管理和特定文件类型支持 /** Keep mostly read-only and often accessed (especially for* the RCU path lookup and stat data) fields at the beginning* of the struct inode*/ struct inode {// 文件的模式包括文件类型和文件权限umode_t i_mode;// 操作标志标识文件系统特定的操作unsigned short i_opflags;// 文件所有者的用户 IDkuid_t i_uid;// 文件所有者的组 
IDkgid_t i_gid;// 文件标志unsigned int i_flags;#ifdef CONFIG_FS_POSIX_ACLstruct posix_acl *i_acl;struct posix_acl *i_default_acl; #endif// 指向 inode 操作函数的指针const struct inode_operations *i_op;// 指向文件系统超级块struct super_block *i_sb;// 地址空间描述文件内容在内存中的映射struct address_space *i_mapping;#ifdef CONFIG_SECURITYvoid *i_security; #endif/* Stat data, not accessed from path walking */// inode 号唯一标识一个文件unsigned long i_ino;/** Filesystems may only read i_nlink directly. They shall use the* following functions for modification:** (set|clear|inc|drop)_nlink* inode_(inc|dec)_link_count*/// 链接数表示有多少个目录项指向此 inodeunion {const unsigned int i_nlink;unsigned int __i_nlink;};// 设备号对于设备文件有效dev_t i_rdev;// 文件大小loff_t i_size;struct timespec64 i_atime;struct timespec64 i_mtime;struct timespec64 i_ctime;spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */// 文件字节数、块大小位数、写入提示、文件占用块数unsigned short i_bytes;u8 i_blkbits;u8 i_write_hint;blkcnt_t i_blocks;#ifdef __NEED_I_SIZE_ORDEREDseqcount_t i_size_seqcount; #endif/* Misc */// 文件状态unsigned long i_state;// 读写信号量用于同步struct rw_semaphore i_rwsem;unsigned long dirtied_when; /* jiffies of first dirtying */unsigned long dirtied_time_when;// 特定结构hash链表节点、IO列表struct hlist_node i_hash;struct list_head i_io_list; /* backing dev IO list */ #ifdef CONFIG_CGROUP_WRITEBACKstruct bdi_writeback *i_wb; /* the associated cgroup wb *//* foreign inode detection, see wbc_detach_inode() */int i_wb_frn_winner;u16 i_wb_frn_avg_time;u16 i_wb_frn_history; #endif// 用于缓存回收的LRU列表struct list_head i_lru; /* inode LRU list */// 用于管理同一超级块中的 inode的超级块链表struct list_head i_sb_list;// 用于写回缓冲的写回列表struct list_head i_wb_list; /* backing dev writeback list */union {struct hlist_head i_dentry;struct rcu_head i_rcu;};// inode版本号atomic64_t i_version;// 引用计数atomic_t i_count;// 直接IO计数atomic_t i_dio_count;// 写操作计数atomic_t i_writecount; #if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)atomic_t i_readcount; /* struct files open RO */ #endifunion {const struct file_operations *i_fop; /* former 
-i_op-default_file_ops */void (*free_inode)(struct inode *);};struct file_lock_context *i_flctx;struct address_space i_data;struct list_head i_devices;union {struct pipe_inode_info *i_pipe;struct block_device *i_bdev;struct cdev *i_cdev;char *i_link;unsigned i_dir_seq;};__u32 i_generation;#ifdef CONFIG_FSNOTIFY__u32 i_fsnotify_mask; /* all events this inode cares about */struct fsnotify_mark_connector __rcu *i_fsnotify_marks; #endif#ifdef CONFIG_FS_ENCRYPTIONstruct fscrypt_info *i_crypt_info; #endif#ifdef CONFIG_FS_VERITYstruct fsverity_info *i_verity_info; #endifvoid *i_private; /* fs or device private pointer */ } __randomize_layout;写入——从write()到vfs write()系统调用在内核的实现为sys_write。本部分在真正文件系统操作调用之外只是获取释放文件描述符、更新位置指针、写入前检查等操作 ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) {// 获取文件描述符fdstruct fd f fdget_pos(fd);ssize_t ret -EBADF;if (f.file) {// 获取文件当前位置指针loff_t pos, *ppos file_ppos(f.file);if (ppos) {pos *ppos;ppos pos;}// VFS执行实际写操作ret vfs_write(f.file, buf, count, ppos);// 更新文件指针位置if (ret 0 ppos)f.file-f_pos pos;// 释放文件描述符减少其引用计数fdput_pos(f);}return ret; }接着进入vfsvfs实际也是调用真正文件系统的接口实现 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) {ssize_t ret;// 检查文件是否可写if (!(file-f_mode FMODE_WRITE))return -EBADF; // 文件不可写返回错误码 EBADFif (!(file-f_mode FMODE_CAN_WRITE))return -EINVAL; // 文件模式不支持写操作返回错误码 EINVAL// 检查用户空间指针是否有效if (unlikely(!access_ok(buf, count)))return -EFAULT; // 用户空间指针无效返回错误码 EFAULT// 验证写操作范围ret rw_verify_area(WRITE, file, pos, count);if (!ret) {// 限制最大写入字节数if (count MAX_RW_COUNT)count MAX_RW_COUNT;// 开始文件写入file_start_write(file);// 实际执行写操作ret __vfs_write(file, buf, count, pos);// 如果写入成功发送文件系统通知并更新写字节数if (ret 0) {fsnotify_modify(file);add_wchar(current, ret);}// 更新系统调用写计数inc_syscw(current);// 结束文件写入file_end_write(file);}return ret; }static ssize_t __vfs_write(struct file *file, const char __user *p,size_t count, loff_t *pos) {// 首先检查文件操作结构是否有write方法有直接用if (file-f_op-write)return file-f_op-write(file, 
p, count, pos);else if (file-f_op-write_iter)return new_sync_write(file, p, count, pos);elsereturn -EINVAL; }以下是ext4文件系统实现vfs接口的方法 const struct file_operations ext4_file_operations {.llseek ext4_llseek,.read_iter ext4_file_read_iter,.write_iter ext4_file_write_iter,.unlocked_ioctl ext4_ioctl, #ifdef CONFIG_COMPAT.compat_ioctl ext4_compat_ioctl, #endif.mmap ext4_file_mmap,.mmap_supported_flags MAP_SYNC,.open ext4_file_open,.release ext4_release_file,.fsync ext4_sync_file,.get_unmapped_area thp_get_unmapped_area,.splice_read generic_file_splice_read,.splice_write iter_file_splice_write,.fallocate ext4_fallocate, };ext4 buffered or direct 在Linux中存在几种不同的IO写入方式： DAX：字节级别的操作，要求额外的硬件支持。 DIO：直接从用户态写入数据到硬盘中，跳过内核缓冲区，减少了上下文切换和数据复制开销；属于块级别操作，数据的读写需要是设备的块大小和linux系统的页大小的整数倍。 BIO：默认标准方式。数据会先从应用程序的地址空间拷贝到操作系统内核地址空间的页缓存，然后再写入磁盘。根据Linux的延迟写机制，当数据写到操作系统内核地址空间的页缓存，就意味着write调用已经完成。 缓冲写入操作通常是异步的：数据首先写入页缓存，后续由内核的回写线程（旧内核中的pdflush守护进程，较新内核中的kworker线程）将缓存数据写入磁盘。直接I/O则是同步的，数据直接写入磁盘。 static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) {// 获取文件关联的inodestruct inode *inode file_inode(iocb-ki_filp);if (unlikely(ext4_forced_shutdown(EXT4_SB(inode-i_sb))))return -EIO;// 如果文件系统配置支持直接访问且inode也允许则进行直接写入 #ifdef CONFIG_FS_DAXif (IS_DAX(inode))return ext4_dax_write_iter(iocb, from); #endif// 如果IO控制块设置了IOCB_DIRECT则执行直接IO写入绕过页缓存if (iocb-ki_flags IOCB_DIRECT)return ext4_dio_write_iter(iocb, from);// 否则进行缓存写入return ext4_buffered_write_iter(iocb, from); }extent 在以下代码中出现了extent，那么extent是什么呢？ extent是一段连续的物理块，表示文件数据在磁盘上的位置和长度（起始块、物理块、长度）。每个文件都有一个与之关联的 extent 树，其根节点存储在 inode 中。树中的节点包含 extent 或指向子节点的指针：叶子节点存储实际的 extent 信息（起始块、物理块和长度），内部节点存储指向下一级节点的指针。 内联数据：内联数据适用于包含大量小文件的场景，将小文件数据直接储存到文件系统的元数据结构中，可以减少空间浪费。 孤儿列表：孤儿列表用于跟踪在文件操作中可能会被中途删除或者截断的文件，确保即使在系统崩溃的情况下也能被正确处理。比如在文件删除中，inode被更新，表示文件被删除了，但是系统中途崩溃了，而实际删除工作在后面进行，就会导致这些文件变为孤儿文件：元数据仍然存在，可是文件本身被逻辑删除了。 ext4 buffered IO buffered IO部分主要做了以下事情：锁定inode，防止并发修改，保证page缓存的一致性；检查写入操作是否合法并进行一些预处理；写入。 static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,struct iov_iter *from) {ssize_t ret;struct inode *inode 
file_inode(iocb-ki_filp);// 如果 iocb 的标志中包含 IOCB_NOWAIT则返回不支持的操作错误if (iocb-ki_flags IOCB_NOWAIT)return -EOPNOTSUPP;// 加锁以保护 inode 数据结构inode_lock(inode);// 检查写入操作是否合法并进行一些预处理ret ext4_write_checks(iocb, from);if (ret 0)goto out;// 设置当前进程的 backing_dev_info 为 inode 对应的设备current-backing_dev_info inode_to_bdi(inode);// 执行通用的写入操作将数据写入到文件中ret generic_perform_write(iocb-ki_filp, from, iocb-ki_pos);// 清除当前进程的 backing_dev_infocurrent-backing_dev_info NULL;out:// 解锁 inodeinode_unlock(inode);// 如果写入操作成功则更新文件位置并同步写入数据if (likely(ret 0)) {iocb-ki_pos ret;ret generic_write_sync(iocb, ret);}// 返回写入的字节数或者错误码return ret; }写入的执行最后还是回到了VFS。generic_perform_write处理从用户空间到文件的写入数据方法是遍历数据块、与页面缓存交互以定位或分配页面、将数据复制到这些页面、更新文件的元数据、将页面标记为脏页面以便稍后回写到存储以及确保整个过程中的数据完整性和错误处理。 ssize_t generic_perform_write(struct file *file,struct iov_iter *i, loff_t pos) {struct address_space *mapping file-f_mapping;const struct address_space_operations *a_ops mapping-a_ops;long status 0;ssize_t written 0;unsigned int flags 0;do {struct page *page;unsigned long offset; /* Offset into pagecache page */unsigned long bytes; /* Bytes to write to page */size_t copied; /* Bytes copied from user */void *fsdata;offset (pos (PAGE_SIZE - 1));bytes min_t(unsigned long, PAGE_SIZE - offset,iov_iter_count(i));again:/** Bring in the user page that we will copy from _first_.* Otherwise theres a nasty deadlock on copying from the* same page as were writing to, without it being marked* up-to-date.** Not only is this an optimisation, but it is also required* to check that the address is actually valid, when atomic* usercopies are used, below.*/// 错误处理和信号检测if (unlikely(iov_iter_fault_in_readable(i, bytes))) {status -EFAULT;break;}if (fatal_signal_pending(current)) {status -EINTR;break;}// 负责将目标文件对应的页加载到内存中准备好缓冲区以便写入数据。这个函数可能会涉及到文件系统特定的逻辑例如预分配块或者处理写入锁。status a_ops-write_begin(file, mapping, pos, bytes, flags,page, fsdata);if (unlikely(status 0))break;// 如果页面映射到用户空间并且可能被写入则确保缓存一致性以防止缓存中的旧数据与内存中的新数据冲突。if 
(mapping_writably_mapped(mapping))flush_dcache_page(page);// 从用户空间缓冲区复制数据到内核页面缓存copied iov_iter_copy_from_user_atomic(page, i, offset, bytes);flush_dcache_page(page);// 负责处理写操作后的收尾工作例如更新文件大小、标记页面脏、解除页面锁定等status a_ops-write_end(file, mapping, pos, bytes, copied,page, fsdata);if (unlikely(status 0))break;copied status;cond_resched();iov_iter_advance(i, copied);if (unlikely(copied 0)) {/** If we were unable to copy any data at all, we must* fall back to a single segment length write.** If we didnt fallback here, we could livelock* because not all segments in the iov can be copied at* once without a pagefault.*/bytes min_t(unsigned long, PAGE_SIZE - offset,iov_iter_single_seg_count(i));goto again;}pos copied;written copied;balance_dirty_pages_ratelimited(mapping);} while (iov_iter_count(i));return written ? written : status; } EXPORT_SYMBOL(generic_perform_write);ext4 write begin ext4_write_begin处理将数据写入文件的准备工作。确保正确设置数据结构和状态以便实际的数据写入操作顺利进行锁定inode、在页面缓存中分配页面以及初始化日志事务以确保文件系统的一致性、确定需要修改的特定块并在必要时从磁盘读取任何现有数据以避免覆盖块的未初始化部分。 static int ext4_write_begin(struct file *file, struct address_space *mapping,loff_t pos, unsigned len, unsigned flags,struct page **pagep, void **fsdata) {struct inode *inode mapping-host;int ret, needed_blocks;handle_t *handle;int retries 0;struct page *page;pgoff_t index;unsigned from, to;// 检查文件系统是否被强制关闭if (unlikely(ext4_forced_shutdown(EXT4_SB(inode-i_sb))))return -EIO;// 记录写入操作的跟踪信息trace_ext4_write_begin(inode, pos, len, flags);/** Reserve one block more for addition to orphan list in case* we allocate blocks but write fails for some reason*/// 计算写操作所需的块数包括一个额外的块用于孤儿列表orphan list的添加needed_blocks ext4_writepage_trans_blocks(inode) 1;// 计算写入位置的页索引、起始偏移量和结束偏移量index pos PAGE_SHIFT;from pos (PAGE_SIZE - 1);to from len;// 如果文件可能包含内联数据尝试写入内联数据if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {ret ext4_try_to_write_inline_data(mapping, inode, pos, len,flags, pagep);if (ret 0)return ret;if (ret 1)return 0;}// 进行事务处理之前需要先调用 
grab_cache_page_write_begin() 获取页面。这样做可以避免在高系统负载或内存压力下造成的长时间等待同时允许更灵活的内存分配从而减少潜在的死锁风险。这种策略有助于确保文件系统写入操作的效率和可靠性 retry_grab:// 获取要写入的缓存页。如果获取失败page grab_cache_page_write_begin(mapping, index, flags);if (!page)return -ENOMEM;unlock_page(page);retry_journal:// 开始一个新的Ext4事务handle ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);if (IS_ERR(handle)) {put_page(page);return PTR_ERR(handle);}// 锁定页面并确保页面稳定。如果页面映射发生变化重新获取页面lock_page(page);if (page-mapping ! mapping) {/* The page got truncated from under us */unlock_page(page);put_page(page);ext4_journal_stop(handle);goto retry_grab;}/* In case writeback began while the page was unlocked */wait_for_stable_page(page);// 根据文件系统状态选择写入方法并执行实际的写入操作 #ifdef CONFIG_FS_ENCRYPTIONif (ext4_should_dioread_nolock(inode))ret ext4_block_write_begin(page, pos, len,ext4_get_block_unwritten);elseret ext4_block_write_begin(page, pos, len,ext4_get_block); #elseif (ext4_should_dioread_nolock(inode))ret __block_write_begin(page, pos, len,ext4_get_block_unwritten);elseret __block_write_begin(page, pos, len, ext4_get_block); #endifif (!ret ext4_should_journal_data(inode)) {ret ext4_walk_page_buffers(handle, page_buffers(page),from, to, NULL,do_journal_get_write_access);}// 处理写入过程中出现的错误。如果需要重试分配块重新开始事务if (ret) {bool extended (pos len inode-i_size) !ext4_verity_in_progress(inode);unlock_page(page);/** __block_write_begin may have instantiated a few blocks* outside i_size. Trim these off again. 
Dont need* i_size_read because we hold i_mutex.** Add inode to orphan list in case we crash before* truncate finishes*/if (extended ext4_can_truncate(inode))ext4_orphan_add(handle, inode);ext4_journal_stop(handle);if (extended) {ext4_truncate_failed_write(inode);/** If truncate failed early the inode might* still be on the orphan list; we need to* make sure the inode is removed from the* orphan list in that case.*/if (inode-i_nlink)ext4_orphan_del(NULL, inode);}if (ret -ENOSPC ext4_should_retry_alloc(inode-i_sb, retries))goto retry_journal;put_page(page);return ret;}*pagep page;return ret; }block_write_begin通过映射页中必要的块来准备要写的页。遍历每个块确保将其映射并标记为最新的如果有必要还会对需要从磁盘读取的块发起读取以避免覆盖未初始化的数据 // • page: 需要写入数据的页面。 // • pos: 写操作的起始位置。 // • len: 写入数据的长度。 // • get_block: 用于映射逻辑块号到物理块号的回调函数。 // • iomap: I/O 映射结构体用于描述 I/O 操作。 int __block_write_begin(struct page *page, loff_t pos, unsigned len,get_block_t *get_block) {return __block_write_begin_int(page, pos, len, get_block, NULL); } EXPORT_SYMBOL(__block_write_begin);int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,get_block_t *get_block, struct iomap *iomap) {unsigned from pos (PAGE_SIZE - 1);unsigned to from len;struct inode *inode page-mapping-host;unsigned block_start, block_end;sector_t block;int err 0;unsigned blocksize, bbits;struct buffer_head *bh, *head, *wait[2], **wait_bhwait;BUG_ON(!PageLocked(page));BUG_ON(from PAGE_SIZE);BUG_ON(to PAGE_SIZE);BUG_ON(from to);// 为页面分配缓冲头并设置缓冲区大小和块大小的位数head create_page_buffers(page, inode, 0);blocksize head-b_size;bbits block_size_bits(blocksize);block (sector_t)page-index (PAGE_SHIFT - bbits);// 遍历页面的每个缓冲头buffer head处理每个块for(bh head, block_start 0; bh ! 
head || !block_start;block, block_startblock_end, bh bh-b_this_page) {block_end block_start blocksize;if (block_end from || block_start to) {if (PageUptodate(page)) {if (!buffer_uptodate(bh))set_buffer_uptodate(bh);}continue;}if (buffer_new(bh))clear_buffer_new(bh);// 如果缓冲区尚未映射则调用get_block或者iomap_to_bh进行块映射if (!buffer_mapped(bh)) {WARN_ON(bh-b_size ! blocksize);if (get_block) {err get_block(inode, block, bh, 1);if (err)break;} else {iomap_to_bh(inode, block, bh, iomap);}if (buffer_new(bh)) {clean_bdev_bh_alias(bh);if (PageUptodate(page)) {clear_buffer_new(bh);set_buffer_uptodate(bh);mark_buffer_dirty(bh);continue;}if (block_end to || block_start from)zero_user_segments(page,to, block_end,block_start, from);continue;}}if (PageUptodate(page)) {if (!buffer_uptodate(bh))set_buffer_uptodate(bh);continue; }// 如果缓冲区未更新且未延迟也未写入则从磁盘读取块数据if (!buffer_uptodate(bh) !buffer_delay(bh) !buffer_unwritten(bh) (block_start from || block_end to)) {ll_rw_block(REQ_OP_READ, 0, 1, bh);*wait_bhbh;}}/** If we issued read requests - let them complete.*/// 等待所有读取操作完成while(wait_bh wait) {wait_on_buffer(*--wait_bh);if (!buffer_uptodate(*wait_bh))err -EIO;}if (unlikely(err))page_zero_new_buffers(page, from, to);return err; }ext4 write end ext4_write_end对页的数据写入做收尾工作。如果写入扩展了文件则更新inode大小必要时将inode标记为脏的并处理任何清理包括处理日志事务如果写入部分失败则截断超出新文件大小的未初始化块。保证写操作后数据的完整性和一致性。 /** We need to pick up the new inode size which generic_commit_write gave us* file can be NULL - eg, when called from page_symlink().** ext4 never places buffers on inode-i_mapping-private_list. 
metadata* buffers are managed internally.*/ static int ext4_write_end(struct file *file,struct address_space *mapping,loff_t pos, unsigned len, unsigned copied,struct page *page, void *fsdata) {handle_t *handle ext4_journal_current_handle();struct inode *inode mapping-host;loff_t old_size inode-i_size;int ret 0, ret2;int i_size_changed 0;int inline_data ext4_has_inline_data(inode);bool verity ext4_verity_in_progress(inode);trace_ext4_write_end(inode, pos, len, copied);// 包含内联数据处理内联数据写入否则进行块写入if (inline_data) {ret ext4_write_inline_data_end(inode, pos, len,copied, page);if (ret 0) {unlock_page(page);put_page(page);goto errout;}copied ret;} elsecopied block_write_end(file, mapping, pos,len, copied, page, fsdata);/** its important to update i_size while still holding page lock:* page writeout could otherwise come in and zero beyond i_size.** If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree* blocks are being written past EOF, so skip the i_size update.*/if (!verity)i_size_changed ext4_update_inode_size(inode, pos copied);unlock_page(page);put_page(page);// 如果旧文件大小小于写入位置且没有正在进行的文件校验扩展操作更新页面缓存的文件大小if (old_size pos !verity)pagecache_isize_extended(inode, old_size, pos);/** Dont mark the inode dirty under page lock. First, it unnecessarily* makes the holding time of page lock longer. Second, it forces lock* ordering of page lock and transaction start for journaling* filesystems.*/// 如果文件大小发生变化或包含内联数据标记inode为脏if (i_size_changed || inline_data)ext4_mark_inode_dirty(handle, inode);// 如果写入位置加上写入长度超过了文件大小并且文件系统允许截断添加inode到孤儿列表if (pos len inode-i_size !verity ext4_can_truncate(inode))/* if we have allocated more blocks and copied* less. We will have blocks allocated outside* inode-i_size. 
So truncate them*/ext4_orphan_add(handle, inode); errout:ret2 ext4_journal_stop(handle);if (!ret)ret ret2;if (pos len inode-i_size !verity) {ext4_truncate_failed_write(inode);/** If truncate failed early the inode might still be* on the orphan list; we need to make sure the inode* is removed from the orphan list in that case.*/if (inode-i_nlink)ext4_orphan_del(NULL, inode);}return ret ? ret : copied; }ext4 direct IO static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) {ssize_t ret;size_t count;loff_t offset;handle_t *handle;struct inode *inode file_inode(iocb-ki_filp);bool extend false, overwrite false, unaligned_aio false;// 锁定inode。if (iocb-ki_flags IOCB_NOWAIT) {if (!inode_trylock(inode))return -EAGAIN;} else {inode_lock(inode);}// 检查是否支持直接IOif (!ext4_dio_supported(inode)) {inode_unlock(inode);/** Fallback to buffered I/O if the inode does not support* direct I/O.*/return ext4_buffered_write_iter(iocb, from);}// 写入前检查ret ext4_write_checks(iocb, from);if (ret 0) {inode_unlock(inode);return ret;}// 同步未对齐的异步direct IO防止数据损坏offset iocb-ki_pos;count iov_iter_count(from);if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) !is_sync_kiocb(iocb) ext4_unaligned_aio(inode, from, offset)) {unaligned_aio true;inode_dio_wait(inode);}/** Determine whether the I/O will overwrite allocated and initialized* blocks. 
If so, check to see whether it is possible to take the* dioread_nolock path.*/// 如果IO对齐且I/O覆盖已分配和初始化的块且 inode 支持无锁直接读取则设置 overwrite 并降级写锁if (!unaligned_aio ext4_overwrite_io(inode, offset, count) ext4_should_dioread_nolock(inode)) {overwrite true;downgrade_write(inode-i_rwsem);}// 检查写操作的结束(offset count)是否超过了inode的当前磁盘大小启动一个日志句柄来安全地管理对inode的更改将该inode添加到孤立列表中以处理写操作期间可能发生的崩溃并设置一个标志(extend)表示将扩展inode的大小。然后停止日志记录句柄if (offset count EXT4_I(inode)-i_disksize) {handle ext4_journal_start(inode, EXT4_HT_INODE, 2);if (IS_ERR(handle)) {ret PTR_ERR(handle);goto out;}ret ext4_orphan_add(handle, inode);if (ret) {ext4_journal_stop(handle);goto out;}extend true;ext4_journal_stop(handle);}// 执行直接 I/O 写入ret iomap_dio_rw(iocb, from, ext4_iomap_ops, ext4_dio_write_ops,is_sync_kiocb(iocb) || unaligned_aio || extend);// 如果是扩展操作需要再次启动一个事务并更新磁盘大小if (extend)ret ext4_handle_inode_extension(inode, offset, ret, count);out:if (overwrite)inode_unlock_shared(inode);elseinode_unlock(inode);if (ret 0 iov_iter_count(from)) {ssize_t err;loff_t endbyte;// 回退到缓冲IOoffset iocb-ki_pos;err ext4_buffered_write_iter(iocb, from);if (err 0)return err;// 在当前 I/O 操作覆盖的范围内确保页面缓存中的页面被写入磁盘并失效即使缓存无效。这是为了在必要时回退到缓冲 I/O 时尽量保持直接 I/O 的语义ret err;endbyte offset err - 1;err filemap_write_and_wait_range(iocb-ki_filp-f_mapping,offset, endbyte);if (!err)invalidate_mapping_pages(iocb-ki_filp-f_mapping,offset PAGE_SHIFT,endbyte PAGE_SHIFT);}return ret; }ext4 BIO与DIO代码有感 ext4 BIO(Buffered IO)与DIO(Direct IO)： ext4 BIO与DIO都会对inode进行锁定。不同的是，DIO还支持无等待（IOCB_NOWAIT）：如果锁已经被其他持有者获取，就直接返回-EAGAIN而不阻塞；而BIO遇到IOCB_NOWAIT会直接返回-EOPNOTSUPP。 BIO经过内核page缓存，而DIO则直接从用户空间写入到设备。 DIO在回退到缓冲IO时，还会确保写入操作覆盖范围内的缓存页面被写入磁盘并失效，以尽量保持直接 I/O 语义；对于未对齐的异步直接 I/O 写入，则会先等待在途的直接IO完成（inode_dio_wait），防止数据损坏。 Ref https://elixir.bootlin.com/linux/v5.5-rc2/source

查看全文

http://www.hkea.cn/news/14327264/