Filipe Manana
2014-Aug-30 17:00 UTC
[PATCH] Btrfs: fix fsync data loss after a ranged fsync
While we're doing a full fsync (when the inode has the flag BTRFS_INODE_NEEDS_FULL_SYNC set) that is ranged too (covers only a portion of the file), we might have ordered operations that are started while we're logging the inode and that fall outside the fsync range. This means we can get extent maps outside our range added to the inode's extent map tree's modified list for which the corresponding ordered operation wasn't captured by our call to btrfs_get_logged_extents() - the fill delalloc callbacks, inode.c:cow_file_range() and inode.c:submit_compressed_extents() add an extent map to the modified list before creating the respective ordered operation - and they do this without holding the inode's mutex nor the inode's log mutex. Therefore when a full ranged fsync finishes don't remove every extent map from the modified list of extent maps - as for some of them, that fall outside our fsync range, we might have not waited for their respective ordered operation to finish (meaning the corresponding file extent item wasn't inserted into the fs/subvol tree yet), and we must let the next fsync (very likely a fast one that checks only the modified list) see this extent map and log a matching file extent item to the log btree and wait for its ordered operation to finish (if it's still ongoing). Signed-off-by: Filipe Manana <fdmanana@suse.com> --- fs/btrfs/file.c | 2 +- fs/btrfs/tree-log.c | 50 ++++++++++++++++++++++++++++++++++++++++---------- fs/btrfs/tree-log.h | 2 ++ 3 files changed, 43 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 66c4076..e5534c1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1979,7 +1979,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) btrfs_init_log_ctx(&ctx); - ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx); + ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ ret = 1; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5a917a6..8b18a2d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -94,8 +94,10 @@ #define LOG_WALK_REPLAY_ALL 3 static int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - int inode_only); + struct btrfs_root *root, struct inode *inode, + int inode_only, + const loff_t start, + const loff_t end); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); @@ -3856,8 +3858,10 @@ process: * This handles both files and directories. */ static int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - int inode_only) + struct btrfs_root *root, struct inode *inode, + int inode_only, + const loff_t start, + const loff_t end) { struct btrfs_path *path; struct btrfs_path *dst_path; @@ -4050,8 +4054,27 @@ log_extents: struct extent_map *em, *n; write_lock(&tree->lock); - list_for_each_entry_safe(em, n, &tree->modified_extents, list) + /* + * We can't just remove every em if we're called for a ranged + * fsync - that is, one that doesn't cover the whole possible + * file range (0 to LLONG_MAX). This is because we can have + * em's that fall outside the range and therefore their ordered + * operations haven't completed yet (btrfs_finish_ordered_io() + * not invoked yet). Their ordered operations might have started + * after we called btrfs_get_logged_extents() too, so we don't + * end up waiting for them to complete when syncing the log. + * Removing every em outside the range would make a subsequent + * fsync that does a "fast search" (BTRFS_INODE_NEEDS_FULL_SYNC + * flag not set) not log the extent represented by an em, + * therefore making us lose data after a log replay. + */ + list_for_each_entry_safe(em, n, &tree->modified_extents, list) { + if (em->mod_start > end) + continue; + if (em->mod_start + em->mod_len <= start) + continue; list_del_init(&em->list); + } write_unlock(&tree->lock); } @@ -4158,7 +4181,10 @@ out: */ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - struct dentry *parent, int exists_only, + struct dentry *parent, + const loff_t start, + const loff_t end, + int exists_only, struct btrfs_log_ctx *ctx) { int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; @@ -4204,7 +4230,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - ret = btrfs_log_inode(trans, root, inode, inode_only); + ret = btrfs_log_inode(trans, root, inode, inode_only, start, end); if (ret) goto end_trans; @@ -4232,7 +4258,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { - ret = btrfs_log_inode(trans, root, inode, inode_only); + ret = btrfs_log_inode(trans, root, inode, inode_only, + 0, LLONG_MAX); if (ret) goto end_trans; } @@ -4266,13 +4293,15 @@ end_no_trans: */ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry, + const loff_t start, + const loff_t end, struct btrfs_log_ctx *ctx) { struct dentry *parent = dget_parent(dentry); int ret; ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, - 0, ctx); + start, end, 0, ctx); dput(parent); return ret; @@ -4509,6 +4538,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, root->fs_info->last_trans_committed)) return 0; - return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL); + return btrfs_log_inode_parent(trans, root, inode, parent, 0, + LLONG_MAX, 1, NULL); } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 7f5b41b..e2e798a 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -59,6 +59,8 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry, + const loff_t start, + const loff_t end, struct btrfs_log_ctx *ctx); int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, -- 1.9.1 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html