Miao Xie
2013-Jan-29 09:59 UTC
[PATCH V2 01/10] Btrfs: use atomic for btrfs_fs_info->generation
fs_info->generation is a 64-bit variable, and it can be accessed by
multiple tasks concurrently. If there is no lock or other method to
protect it, we might get a wrong value, especially on a 32-bit machine.

For example, assuming ->generation is 0x0000 0000 ffff ffff at the
beginning, we increase it by 1, so ->generation becomes
0x0000 0001 0000 0000 in the registers, and then we store it back into
memory. If some task accesses it at that moment, like this:

	Task0				Task1
	set low 32 bits
					load low 32 bits
					load high 32 bits
	set high 32 bits

Task1 will get 0, which is a wrong number.

We fix this problem by using an atomic operation.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
Changelog v1 -> v2:
- modify the changelog and make it clearer and more rigorous.
---
 fs/btrfs/ctree.c             |  7 ++++---
 fs/btrfs/ctree.h             |  2 +-
 fs/btrfs/disk-io.c           |  8 ++++----
 fs/btrfs/file.c              |  6 ++++--
 fs/btrfs/inode.c             |  5 +++--
 fs/btrfs/qgroup.c            |  2 +-
 fs/btrfs/transaction.c       |  4 ++--
 include/trace/events/btrfs.h |  3 ++-
 8 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index eea5da7..4a36c03 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1365,10 +1365,11 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		       (unsigned long long)
 		       root->fs_info->running_transaction->transid);
 
-	if (trans->transid != root->fs_info->generation)
+	if (trans->transid != atomic64_read(&root->fs_info->generation))
 		WARN(1, KERN_CRIT "trans %llu running %llu\n",
 		     (unsigned long long)trans->transid,
-		     (unsigned long long)root->fs_info->generation);
+		     (unsigned long long)atomic64_read(
+						&root->fs_info->generation));
 
 	if (!should_cow_block(trans, root, buf)) {
 		*cow_ret = buf;
@@ -1465,7 +1466,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		return 0;
 
 	WARN_ON(trans->transaction != root->fs_info->running_transaction);
-	WARN_ON(trans->transid != root->fs_info->generation);
+	WARN_ON(trans->transid != atomic64_read(&root->fs_info->generation));
 
 	parent_nritems = btrfs_header_nritems(parent);
 	blocksize = btrfs_level_size(root, parent_level - 1);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 547b7b0..c3edb22 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1278,7 +1278,7 @@ struct btrfs_fs_info {
 
 	struct btrfs_block_rsv empty_block_rsv;
 
-	u64 generation;
+	atomic64_t generation;
 	u64 last_trans_committed;
 
 	/*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 65f0367..f03aebc 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1200,7 +1200,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	memset(&root->root_item, 0, sizeof(root->root_item));
 	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
 	memset(&root->root_kobj, 0, sizeof(root->root_kobj));
-	root->defrag_trans_start = fs_info->generation;
+	root->defrag_trans_start = atomic64_read(&fs_info->generation);
 	init_completion(&root->kobj_unregister);
 	root->defrag_running = 0;
 	root->root_key.objectid = objectid;
@@ -2501,7 +2501,7 @@ retry_root_backup:
 		fs_info->pending_quota_state = 1;
 	}
 
-	fs_info->generation = generation;
+	atomic64_set(&fs_info->generation, generation);
 	fs_info->last_trans_committed = generation;
 
 	ret = btrfs_recover_balance(fs_info);
@@ -3436,12 +3436,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 	int was_dirty;
 
 	btrfs_assert_tree_locked(buf);
-	if (transid != root->fs_info->generation)
+	if (transid != atomic64_read(&root->fs_info->generation))
 		WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
 		       "found %llu running %llu\n",
 			(unsigned long long)buf->start,
 			(unsigned long long)transid,
-			(unsigned long long)root->fs_info->generation);
+			(u64)atomic64_read(&root->fs_info->generation));
 	was_dirty = set_extent_buffer_dirty(buf);
 	if (!was_dirty) {
 		spin_lock(&root->fs_info->delalloc_lock);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 841cfe3..02409b6 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1588,7 +1588,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	 * otherwise subsequent syncs to a file that's been synced in this
 	 * transaction will appear to have already occured.
 	 */
-	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+	BTRFS_I(inode)->last_trans = atomic64_read(&root->fs_info->generation) +
+				     1;
 	BTRFS_I(inode)->last_sub_trans = root->log_transid;
 	if (num_written > 0 || num_written == -EIOCBQUEUED) {
 		err = generic_write_sync(file, pos, num_written);
@@ -1679,7 +1680,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	 * syncing
 	 */
 	smp_mb();
-	if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
+	if (btrfs_inode_in_log(inode,
+			       atomic64_read(&root->fs_info->generation)) ||
 	    BTRFS_I(inode)->last_trans <=
 		root->fs_info->last_trans_committed) {
 		BTRFS_I(inode)->last_trans = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ca7ace7..35c4dda 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2644,7 +2644,8 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	 * idea about which extents were modified before we were evicted from
 	 * cache.
 	 */
-	if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
+	if (BTRFS_I(inode)->last_trans ==
+	    atomic64_read(&root->fs_info->generation))
 		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 			&BTRFS_I(inode)->runtime_flags);
 
@@ -6903,7 +6904,7 @@ again:
 	set_page_dirty(page);
 	SetPageUptodate(page);
 
-	BTRFS_I(inode)->last_trans = root->fs_info->generation;
+	BTRFS_I(inode)->last_trans = atomic64_read(&root->fs_info->generation);
 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
 
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index a5c8562..dd002ae 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -290,7 +290,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
 			goto out;
 		}
 		if (btrfs_qgroup_status_generation(l, ptr) !=
-		    fs_info->generation) {
+		    atomic64_read(&fs_info->generation)) {
 			flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
 			printk(KERN_ERR
 				"btrfs: qgroup generation mismatch, "
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f154946..105d642 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -161,8 +161,8 @@ loop:
 	list_add_tail(&cur_trans->list, &fs_info->trans_list);
 	extent_io_tree_init(&cur_trans->dirty_pages,
 			     fs_info->btree_inode->i_mapping);
-	fs_info->generation++;
-	cur_trans->transid = fs_info->generation;
+	atomic64_inc(&fs_info->generation);
+	cur_trans->transid = atomic64_read(&fs_info->generation);
 	fs_info->running_transaction = cur_trans;
 	cur_trans->aborted = 0;
 	spin_unlock(&fs_info->trans_lock);
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index ea546a4..09dc72f 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -71,7 +71,8 @@ TRACE_EVENT(btrfs_transaction_commit,
 	),
 
 	TP_fast_assign(
-		__entry->generation	= root->fs_info->generation;
+		__entry->generation	=
+				atomic64_read(&root->fs_info->generation);
 		__entry->root_objectid	= root->root_key.objectid;
 	),
 
-- 
1.7.11.7
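The torn read described in this changelog is easy to demonstrate outside the kernel. Below is a minimal, hypothetical user-space analogue in C11 — stdatomic stands in for the kernel's atomic64_t API, and all names are made up for illustration — showing why an indivisible 64-bit load can never observe a half-updated value:

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* user-space stand-in for fs_info->generation */
static atomic_uint_fast64_t generation = 0xffffffffULL;

static void *bump(void *arg)
{
	(void)arg;
	/* analogous to atomic64_inc(&fs_info->generation) */
	atomic_fetch_add(&generation, 1);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, bump, NULL);
	/*
	 * analogous to atomic64_read(): the load is indivisible, so it
	 * returns either 0xffffffff or 0x100000000, never a torn value
	 * such as 0 assembled from one old and one new 32-bit half.
	 */
	printf("generation = %#llx\n",
	       (unsigned long long)atomic_load(&generation));
	pthread_join(t, NULL);
	return 0;
}

Built with "cc -pthread", the printed value is always one of the two legal values; with a plain uint64_t on a 32-bit target, a torn result would also be possible.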
Miao Xie
2013-Jan-29 10:00 UTC
[PATCH V2 02/10] Btrfs: use atomic for fs_info->last_trans_committed
fs_info->last_trans_committed is a 64-bit variable, and it can be
accessed by multiple tasks concurrently. If there is no lock or other
method to protect it, we might get a wrong value, especially on a
32-bit machine. (Even on a 64-bit machine, it is possible that the
compiler splits a 64-bit operation into two 32-bit operations.)

For example, assuming ->last_trans_committed is 0x00000000ffffffff at
the beginning, and we want to set it to 0x0000000100000000:

	Task0				Task1
	set low 32 bits
					load low 32 bits
					load high 32 bits
	set high 32 bits

Task1 will get 0, which is a wrong number.

We fix this problem by using an atomic operation.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
Changelog v1 -> v2:
- modify the changelog and make it clearer and more rigorous.
---
 fs/btrfs/ctree.h        |  2 +-
 fs/btrfs/disk-io.c      |  2 +-
 fs/btrfs/file.c         |  2 +-
 fs/btrfs/ioctl.c        |  2 +-
 fs/btrfs/ordered-data.c |  2 +-
 fs/btrfs/scrub.c        |  2 +-
 fs/btrfs/transaction.c  |  5 +++--
 fs/btrfs/tree-log.c     | 16 +++++++++-------
 8 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c3edb22..34a60a8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1279,7 +1279,7 @@ struct btrfs_fs_info {
 	struct btrfs_block_rsv empty_block_rsv;
 
 	atomic64_t generation;
-	u64 last_trans_committed;
+	atomic64_t last_trans_committed;
 
 	/*
 	 * this is updated to the current trans every time a full commit
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f03aebc..87ed05a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2502,7 +2502,7 @@ retry_root_backup:
 	}
 
 	atomic64_set(&fs_info->generation, generation);
-	fs_info->last_trans_committed = generation;
+	atomic64_set(&fs_info->last_trans_committed, generation);
 
 	ret = btrfs_recover_balance(fs_info);
 	if (ret) {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 02409b6..910ea99 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1683,7 +1683,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	if (btrfs_inode_in_log(inode,
 			       atomic64_read(&root->fs_info->generation)) ||
 	    BTRFS_I(inode)->last_trans <=
-		root->fs_info->last_trans_committed) {
+		atomic64_read(&root->fs_info->last_trans_committed)) {
 		BTRFS_I(inode)->last_trans = 0;
 
 	/*
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index afbf3ac..3b6c339 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3114,7 +3114,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
 			return PTR_ERR(trans);
 
 		/* No running transaction, don't bother */
-		transid = root->fs_info->last_trans_committed;
+		transid = atomic64_read(&root->fs_info->last_trans_committed);
 		goto out;
 	}
 	transid = trans->transid;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index f107312..f376621 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -975,7 +975,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
 	 * if this file hasn't been changed since the last transaction
 	 * commit, we can safely return without doing anything
 	 */
-	if (last_mod < root->fs_info->last_trans_committed)
+	if (last_mod < atomic64_read(&root->fs_info->last_trans_committed))
 		return;
 
 	spin_lock(&root->fs_info->ordered_extent_lock);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index bdbb94f..af0b566 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2703,7 +2703,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
 	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
 		return -EIO;
 
-	gen = root->fs_info->last_trans_committed;
+	gen = atomic64_read(&root->fs_info->last_trans_committed);
 
 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 		bytenr = btrfs_sb_offset(i);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 105d642..29fdf1c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -459,7 +459,8 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
 	int ret = 0;
 
 	if (transid) {
-		if (transid <= root->fs_info->last_trans_committed)
+		if (transid <=
+		    atomic64_read(&root->fs_info->last_trans_committed))
 			goto out;
 
 		ret = -EINVAL;
@@ -1730,7 +1731,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	cur_trans->commit_done = 1;
 
-	root->fs_info->last_trans_committed = cur_trans->transid;
+	atomic64_set(&root->fs_info->last_trans_committed, cur_trans->transid);
 
 	wake_up(&cur_trans->commit_wait);
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9027bb1..7f42a53 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3397,7 +3397,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 	INIT_LIST_HEAD(&extents);
 
 	write_lock(&tree->lock);
-	test_gen = root->fs_info->last_trans_committed;
+	test_gen = atomic64_read(&root->fs_info->last_trans_committed);
 
 	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
 		list_del_init(&em->list);
@@ -3502,7 +3502,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
 	/* Only run delayed items if we are a dir or a new file */
 	if (S_ISDIR(inode->i_mode) ||
-	    BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
+	    BTRFS_I(inode)->generation >
+	    atomic64_read(&root->fs_info->last_trans_committed)) {
 		ret = btrfs_commit_inode_delayed_items(trans, inode);
 		if (ret) {
 			btrfs_free_path(path);
@@ -3744,7 +3745,8 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 	struct super_block *sb;
 	struct dentry *old_parent = NULL;
 	int ret = 0;
-	u64 last_committed = root->fs_info->last_trans_committed;
+	u64 last_committed = atomic64_read(
+				&root->fs_info->last_trans_committed);
 
 	sb = inode->i_sb;
 
@@ -3754,7 +3756,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 	}
 
 	if (root->fs_info->last_trans_log_full_commit >
-	    root->fs_info->last_trans_committed) {
+	    atomic64_read(&root->fs_info->last_trans_committed)) {
 		ret = 1;
 		goto end_no_trans;
 	}
@@ -3806,7 +3808,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 			break;
 
 		if (BTRFS_I(inode)->generation >
-		    root->fs_info->last_trans_committed) {
+		    atomic64_read(&root->fs_info->last_trans_committed)) {
 			ret = btrfs_log_inode(trans, root, inode, inode_only);
 			if (ret)
 				goto end_trans;
@@ -4069,9 +4071,9 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
 	 * from hasn't been logged, we don't need to log it
 	 */
 	if (BTRFS_I(inode)->logged_trans <=
-	    root->fs_info->last_trans_committed &&
+	    atomic64_read(&root->fs_info->last_trans_committed) &&
 	    (!old_dir || BTRFS_I(old_dir)->logged_trans <=
-	    root->fs_info->last_trans_committed))
+	    atomic64_read(&root->fs_info->last_trans_committed)))
 		return 0;
 
 	return btrfs_log_inode_parent(trans, root, inode, parent, 1);
-- 
1.7.11.7
Miao Xie
2013-Jan-29 10:03 UTC
[PATCH V2 03/10] Btrfs: use atomic for fs_info->last_trans_log_full_commit
fs_info->last_trans_log_full_commit is a 64-bit variable, and it can be
accessed by multiple tasks concurrently. If there is no lock or other
method to protect it, we might get a wrong value, especially on a
32-bit machine. (Even on a 64-bit machine, it is possible that the
compiler splits a 64-bit operation into two 32-bit operations.)

For example, assuming ->last_trans_log_full_commit is
0x00000000ffffffff at the beginning, and we want to set it to
0x0000000100000000:

	Task0				Task1
	set low 32 bits
					load low 32 bits
					load high 32 bits
	set high 32 bits

Task1 will get 0, which is a wrong number.

We fix this problem by using an atomic operation.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
Changelog v1 -> v2:
- modify the changelog and make it clearer and more rigorous.
---
 fs/btrfs/ctree.h       |  2 +-
 fs/btrfs/extent-tree.c |  3 ++-
 fs/btrfs/inode.c       |  3 ++-
 fs/btrfs/tree-log.c    | 32 +++++++++++++++++++-------------
 4 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 34a60a8..745e7ad 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1285,7 +1285,7 @@ struct btrfs_fs_info {
 	 * this is updated to the current trans every time a full commit
 	 * is required instead of the faster short fsync log commits
 	 */
-	u64 last_trans_log_full_commit;
+	atomic64_t last_trans_log_full_commit;
 	unsigned long mount_opt;
 	unsigned long compress_type:4;
 	u64 max_inline;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 85b8454..ef61a4a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7868,7 +7868,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
 	extent_root = root->fs_info->extent_root;
 
-	root->fs_info->last_trans_log_full_commit = trans->transid;
+	atomic64_set(&root->fs_info->last_trans_log_full_commit,
+		     trans->transid);
 
 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
 	if (!cache)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 35c4dda..803be87 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7433,7 +7433,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
 		/* force full log commit if subvolume involved. */
-		root->fs_info->last_trans_log_full_commit = trans->transid;
+		atomic64_set(&root->fs_info->last_trans_log_full_commit,
+			     trans->transid);
 	} else {
 		ret = btrfs_insert_inode_ref(trans, dest,
 					     new_dentry->d_name.name,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7f42a53..bb7c01b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2227,14 +2227,14 @@ static int wait_log_commit(struct btrfs_trans_handle *trans,
 				&wait, TASK_UNINTERRUPTIBLE);
 		mutex_unlock(&root->log_mutex);
 
-		if (root->fs_info->last_trans_log_full_commit !=
+		if (atomic64_read(&root->fs_info->last_trans_log_full_commit) !=
 		    trans->transid && root->log_transid < transid + 2 &&
 		    atomic_read(&root->log_commit[index]))
 			schedule();
 
 		finish_wait(&root->log_commit_wait[index], &wait);
 		mutex_lock(&root->log_mutex);
-	} while (root->fs_info->last_trans_log_full_commit !=
+	} while (atomic64_read(&root->fs_info->last_trans_log_full_commit) !=
 		 trans->transid && root->log_transid < transid + 2 &&
 		 atomic_read(&root->log_commit[index]));
 	return 0;
@@ -2244,12 +2244,12 @@ static void wait_for_writer(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root)
 {
 	DEFINE_WAIT(wait);
-	while (root->fs_info->last_trans_log_full_commit !=
+	while (atomic64_read(&root->fs_info->last_trans_log_full_commit) !=
 	       trans->transid && atomic_read(&root->log_writers)) {
 		prepare_to_wait(&root->log_writer_wait,
 				&wait, TASK_UNINTERRUPTIBLE);
 		mutex_unlock(&root->log_mutex);
-		if (root->fs_info->last_trans_log_full_commit !=
+		if (atomic64_read(&root->fs_info->last_trans_log_full_commit) !=
 		    trans->transid && atomic_read(&root->log_writers))
 			schedule();
 		mutex_lock(&root->log_mutex);
@@ -2306,7 +2306,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	}
 
 	/* bail out if we need to do a full commit */
-	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+	if (atomic64_read(&root->fs_info->last_trans_log_full_commit) ==
+	    trans->transid) {
 		ret = -EAGAIN;
 		mutex_unlock(&root->log_mutex);
 		goto out;
@@ -2361,7 +2362,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		mutex_unlock(&log_root_tree->log_mutex);
 		goto out;
 	}
-	root->fs_info->last_trans_log_full_commit = trans->transid;
+	atomic64_set(&root->fs_info->last_trans_log_full_commit,
+		     trans->transid);
 	btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
 	mutex_unlock(&log_root_tree->log_mutex);
 	ret = -EAGAIN;
@@ -2390,7 +2392,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 * now that we've moved on to the tree of log tree roots,
 	 * check the full commit flag again
 	 */
-	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+	if (atomic64_read(&root->fs_info->last_trans_log_full_commit) ==
+	    trans->transid) {
 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
 		mutex_unlock(&log_root_tree->log_mutex);
 		ret = -EAGAIN;
@@ -2614,7 +2617,8 @@ fail:
 out_unlock:
 	mutex_unlock(&BTRFS_I(dir)->log_mutex);
 	if (ret == -ENOSPC) {
-		root->fs_info->last_trans_log_full_commit = trans->transid;
+		atomic64_set(&root->fs_info->last_trans_log_full_commit,
+			     trans->transid);
 		ret = 0;
 	} else if (ret < 0)
 		btrfs_abort_transaction(trans, root, ret);
@@ -2647,7 +2651,8 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
 				  dirid, &index);
 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
 	if (ret == -ENOSPC) {
-		root->fs_info->last_trans_log_full_commit = trans->transid;
+		atomic64_set(&root->fs_info->last_trans_log_full_commit,
+			     trans->transid);
 		ret = 0;
 	} else if (ret < 0 && ret != -ENOENT)
 		btrfs_abort_transaction(trans, root, ret);
@@ -3708,8 +3713,8 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
 			 * make sure any commits to the log are forced
 			 * to be full commits
 			 */
-			root->fs_info->last_trans_log_full_commit =
-				trans->transid;
+			atomic64_set(&root->fs_info->last_trans_log_full_commit,
+				     trans->transid);
 			ret = 1;
 			break;
 		}
@@ -3755,7 +3760,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 		goto end_no_trans;
 	}
 
-	if (root->fs_info->last_trans_log_full_commit >
+	if (atomic64_read(&root->fs_info->last_trans_log_full_commit) >
 	    atomic64_read(&root->fs_info->last_trans_committed)) {
 		ret = 1;
 		goto end_no_trans;
@@ -3825,7 +3830,8 @@ end_trans:
 	dput(old_parent);
 	if (ret < 0) {
 		WARN_ON(ret != -ENOSPC);
-		root->fs_info->last_trans_log_full_commit = trans->transid;
+		atomic64_set(&root->fs_info->last_trans_log_full_commit,
+			     trans->transid);
 		ret = 1;
 	}
 	btrfs_end_log_trans(root);
-- 
1.7.11.7
Miao Xie
2013-Jan-29 10:05 UTC
[PATCH V2 04/10] Btrfs: add a comment for fs_info->max_inline
Though ->max_inline is a 64-bit variable and may be accessed by
multiple tasks, it is only a suggestive number, so we need not add
anything to protect fs_info->max_inline; we just add a comment to
explain why we don't use a lock to protect it.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
Changelog v1 -> v2:
- modify the changelog and make it clearer.
---
 fs/btrfs/ctree.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 745e7ad..3e672916 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1288,6 +1288,12 @@ struct btrfs_fs_info {
 	atomic64_t last_trans_log_full_commit;
 	unsigned long mount_opt;
 	unsigned long compress_type:4;
+	/*
+	 * It is a suggestive number, the read side is safe even it gets a
+	 * wrong number because we will write out the data into a regular
+	 * extent. The write side(mount/remount) is under ->s_umount lock,
+	 * so it is also safe.
+	 */
 	u64 max_inline;
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
-- 
1.7.11.7
Miao Xie
2013-Jan-29
[PATCH V2 05/10] Btrfs: protect fs_info->alloc_start
fs_info->alloc_start is a 64-bit variable that can be accessed by
multiple tasks, but it is not protected strictly — it can be changed
while we are accessing it. On a 32-bit machine, we may get a wrong
value because we access it with two instructions. (In fact, the same
problem is possible on a 64-bit machine, because the compiler may
split the 64-bit operation into two 32-bit operations.)

For example, assuming ->alloc_start is 0x0000 0000 0001 0000 at the
beginning, and then we remount and set ->alloc_start to
0x0000 0100 0000 0000:

	Task0				Task1
					load high 32 bits
	set high 32 bits
	set low 32 bits
					load low 32 bits

Task1 will get 0.

This patch fixes the problem by using two locks to protect it:
	fs_info->chunk_mutex
	sb->s_umount
On the read side, we only need to take one of these two locks; on the
write side, we must take both of them.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
Changelog v1 -> v2:
- modify the changelog and make it clearer and more rigorous.
---
 fs/btrfs/ctree.h | 10 ++++++++++
 fs/btrfs/super.c |  4 ++++
 2 files changed, 14 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3e672916..201be7d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1295,6 +1295,16 @@ struct btrfs_fs_info {
 	 * so it is also safe.
 	 */
 	u64 max_inline;
+	/*
+	 * Protected by ->chunk_mutex and sb->s_umount.
+	 *
+	 * The reason that we use two lock to protect it is because only
+	 * remount and mount operations can change it and these two operations
+	 * are under sb->s_umount, but the read side (chunk allocation) can not
+	 * acquire sb->s_umount or the deadlock would happen. So we use two
+	 * locks to protect it. On the write side, we must acquire two locks,
+	 * and on the read side, we just need acquire one of them.
+	 */
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
 	wait_queue_head_t transaction_throttle;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d8982e9..c96f132 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -519,7 +519,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		case Opt_alloc_start:
 			num = match_strdup(&args[0]);
 			if (num) {
+				mutex_lock(&info->chunk_mutex);
 				info->alloc_start = memparse(num, NULL);
+				mutex_unlock(&info->chunk_mutex);
 				kfree(num);
 				printk(KERN_INFO
 					"btrfs: allocations start at %llu\n",
@@ -1289,7 +1291,9 @@ restore:
 	fs_info->mount_opt = old_opts;
 	fs_info->compress_type = old_compress_type;
 	fs_info->max_inline = old_max_inline;
+	mutex_lock(&fs_info->chunk_mutex);
 	fs_info->alloc_start = old_alloc_start;
+	mutex_unlock(&fs_info->chunk_mutex);
 	btrfs_resize_thread_pool(fs_info,
 		old_thread_pool_size, fs_info->thread_pool_size);
 	fs_info->metadata_ratio = old_metadata_ratio;
-- 
1.7.11.7
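To make the asymmetric two-lock rule concrete, here is a hedged user-space sketch of the discipline this changelog describes — pthread primitives stand in for sb->s_umount (an rwsem) and fs_info->chunk_mutex, and the helper names are hypothetical:

#include <pthread.h>
#include <stdint.h>

/* 'umount_sem' models sb->s_umount, 'chunk_mutex' models
 * fs_info->chunk_mutex */
static pthread_rwlock_t umount_sem = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t chunk_mutex = PTHREAD_MUTEX_INITIALIZER;
static uint64_t alloc_start;

/* write side (mount/remount): holds umount_sem for writing already,
 * takes chunk_mutex as well, so every possible reader is excluded */
static void set_alloc_start(uint64_t start)
{
	pthread_rwlock_wrlock(&umount_sem);
	pthread_mutex_lock(&chunk_mutex);
	alloc_start = start;
	pthread_mutex_unlock(&chunk_mutex);
	pthread_rwlock_unlock(&umount_sem);
}

/* read side (chunk allocation): must not touch umount_sem (deadlock
 * risk), but holding chunk_mutex alone already excludes the writer,
 * because the writer takes both locks */
static uint64_t get_alloc_start(void)
{
	uint64_t v;

	pthread_mutex_lock(&chunk_mutex);
	v = alloc_start;
	pthread_mutex_unlock(&chunk_mutex);
	return v;
}

int main(void)
{
	set_alloc_start(1ULL << 40);
	return (int)(get_alloc_start() >> 40);	/* 1 */
}

The writer taking the union of the locks is what lets each reader get away with only the one lock it can safely acquire.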
Miao Xie
2013-Jan-29 10:09 UTC
[PATCH V2 06/10] Btrfs: use percpu counter for dirty metadata count
->dirty_metadata_bytes is accessed very frequently, so use a percpu
counter instead of the u64 variable to reduce the contention of the
lock.

This patch also fixes the problem that we accessed it without lock
protection in __btrfs_btree_balance_dirty(), which could cause us to
skip the dirty-page flush.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
Changelog v1 -> v2:
- modify the changelog and make it clearer and more rigorous.
---
 fs/btrfs/ctree.h     |  9 ++++----
 fs/btrfs/disk-io.c   | 64 ++++++++++++++++++++++++++++------------------------
 fs/btrfs/extent_io.c |  9 +++-----
 3 files changed, 42 insertions(+), 40 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 201be7d..1dcbbfd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -191,6 +191,8 @@ static int btrfs_csum_sizes[] = { 4, 0 };
 /* ioprio of readahead is set to idle */
 #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
 
+#define BTRFS_DIRTY_METADATA_THRESH	(32 * 1024 * 1024)
+
 /*
  * The key defines the order in the tree, and so it also defines (optimal)
  * block layout.
@@ -1439,10 +1441,9 @@ struct btrfs_fs_info {
 
 	u64 total_pinned;
 
-	/* protected by the delalloc lock, used to keep from writing
-	 * metadata until there is a nice batch
-	 */
-	u64 dirty_metadata_bytes;
+	/* used to keep from writing metadata until there is a nice batch */
+	struct percpu_counter dirty_metadata_bytes;
+	s32 dirty_metadata_batch;
 	struct list_head dirty_cowonly_roots;
 
 	struct btrfs_fs_devices *fs_devices;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 87ed05a..961ac58 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -946,18 +946,20 @@ static int btree_writepages(struct address_space *mapping,
 			    struct writeback_control *wbc)
 {
 	struct extent_io_tree *tree;
+	struct btrfs_fs_info *fs_info;
+	int ret;
+
 	tree = &BTRFS_I(mapping->host)->io_tree;
 	if (wbc->sync_mode == WB_SYNC_NONE) {
-		struct btrfs_root *root = BTRFS_I(mapping->host)->root;
-		u64 num_dirty;
-		unsigned long thresh = 32 * 1024 * 1024;
 
 		if (wbc->for_kupdate)
 			return 0;
 
+		fs_info = BTRFS_I(mapping->host)->root->fs_info;
 		/* this is a bit racy, but that's ok */
-		num_dirty = root->fs_info->dirty_metadata_bytes;
-		if (num_dirty < thresh)
+		ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
+					     BTRFS_DIRTY_METADATA_THRESH);
+		if (ret < 0)
 			return 0;
 	}
 	return btree_write_cache_pages(mapping, wbc);
@@ -1125,24 +1127,16 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		      struct extent_buffer *buf)
 {
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
 	if (btrfs_header_generation(buf) ==
-	    root->fs_info->running_transaction->transid) {
+	    fs_info->running_transaction->transid) {
 		btrfs_assert_tree_locked(buf);
 
 		if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
-			spin_lock(&root->fs_info->delalloc_lock);
-			if (root->fs_info->dirty_metadata_bytes >= buf->len)
-				root->fs_info->dirty_metadata_bytes -= buf->len;
-			else {
-				spin_unlock(&root->fs_info->delalloc_lock);
-				btrfs_panic(root->fs_info, -EOVERFLOW,
-					  "Can't clear %lu bytes from "
-					  " dirty_mdatadata_bytes (%llu)",
-					  buf->len,
-					  root->fs_info->dirty_metadata_bytes);
-			}
-			spin_unlock(&root->fs_info->delalloc_lock);
-
+			__percpu_counter_add(&fs_info->dirty_metadata_bytes,
+					     -buf->len,
+					     fs_info->dirty_metadata_batch);
 			/* ugh, clear_extent_buffer_dirty needs to lock the page */
 			btrfs_set_lock_blocking(buf);
 			clear_extent_buffer_dirty(buf);
@@ -2004,10 +1998,18 @@ int open_ctree(struct super_block *sb,
 		goto fail_srcu;
 	}
 
+	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);
+	if (ret) {
+		err = ret;
+		goto fail_bdi;
+	}
+	fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
+					(1 + ilog2(nr_cpu_ids));
+
 	fs_info->btree_inode = new_inode(sb);
 	if (!fs_info->btree_inode) {
 		err = -ENOMEM;
-		goto fail_bdi;
+		goto fail_dirty_metadata_bytes;
 	}
 
 	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -2261,6 +2263,7 @@ int open_ctree(struct super_block *sb,
 	leafsize = btrfs_super_leafsize(disk_super);
 	sectorsize = btrfs_super_sectorsize(disk_super);
 	stripesize = btrfs_super_stripesize(disk_super);
+	fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids));
 
 	/*
 	 * mixed block groups end up with duplicate but slightly offset
@@ -2723,6 +2726,8 @@ fail_iput:
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 
 	iput(fs_info->btree_inode);
+fail_dirty_metadata_bytes:
+	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
 fail_bdi:
 	bdi_destroy(&fs_info->bdi);
 fail_srcu:
@@ -3401,6 +3406,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
+	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
 	bdi_destroy(&fs_info->bdi);
 
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
@@ -3443,11 +3449,10 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 			(unsigned long long)transid,
 			(u64)atomic64_read(&root->fs_info->generation));
 	was_dirty = set_extent_buffer_dirty(buf);
-	if (!was_dirty) {
-		spin_lock(&root->fs_info->delalloc_lock);
-		root->fs_info->dirty_metadata_bytes += buf->len;
-		spin_unlock(&root->fs_info->delalloc_lock);
-	}
+	if (!was_dirty)
+		__percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
+				     buf->len,
+				     root->fs_info->dirty_metadata_batch);
 }
 
 static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
@@ -3457,8 +3462,7 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
 	 * looks as though older kernels can get into trouble with
 	 * this code, they end up stuck in balance_dirty_pages forever
 	 */
-	u64 num_dirty;
-	unsigned long thresh = 32 * 1024 * 1024;
+	int ret;
 
 	if (current->flags & PF_MEMALLOC)
 		return;
@@ -3466,9 +3470,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
 	if (flush_delayed)
 		btrfs_balance_delayed_items(root);
 
-	num_dirty = root->fs_info->dirty_metadata_bytes;
-
-	if (num_dirty > thresh) {
+	ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
+				     BTRFS_DIRTY_METADATA_THRESH);
+	if (ret > 0) {
 		balance_dirty_pages_ratelimited_nr(
 				   root->fs_info->btree_inode->i_mapping, 1);
 	}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1b319df..2d4bdfd 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3124,12 +3124,9 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
 		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
 		spin_unlock(&eb->refs_lock);
 		btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
-		spin_lock(&fs_info->delalloc_lock);
-		if (fs_info->dirty_metadata_bytes >= eb->len)
-			fs_info->dirty_metadata_bytes -= eb->len;
-		else
-			WARN_ON(1);
-		spin_unlock(&fs_info->delalloc_lock);
+		__percpu_counter_add(&fs_info->dirty_metadata_bytes,
+				     -eb->len,
+				     fs_info->dirty_metadata_batch);
 		ret = 1;
 	} else {
 		spin_unlock(&eb->refs_lock);
-- 
1.7.11.7
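The batch argument is what keeps the counter cheap: each CPU accumulates a local delta and only folds it into the shared count when the delta reaches the batch size. A rough user-space model of __percpu_counter_add()'s behaviour (hypothetical and simplified — the real kernel version handles preemption and takes the counter's spinlock when folding) looks like this:

#include <stdint.h>
#include <stdio.h>

struct percpu_counter_model {
	int64_t count;		/* shared count (locked in the kernel) */
	int32_t batch;
	int32_t *per_cpu;	/* one slot per CPU */
};

/* model of __percpu_counter_add(fbc, amount, batch) */
static void counter_add(struct percpu_counter_model *c, int cpu,
			int64_t amount)
{
	int64_t delta = c->per_cpu[cpu] + amount;

	if (delta >= c->batch || delta <= -c->batch) {
		/* rare path: fold the local delta into the shared count */
		c->count += delta;
		c->per_cpu[cpu] = 0;
	} else {
		/* common case: touch only this CPU's own slot */
		c->per_cpu[cpu] = (int32_t)delta;
	}
}

int main(void)
{
	int32_t slots[4] = {0};
	struct percpu_counter_model c = {
		.count = 0, .batch = 32, .per_cpu = slots,
	};

	counter_add(&c, 0, 10);	/* stays in the per-CPU slot */
	counter_add(&c, 0, 30);	/* 40 >= batch: folded into c.count */
	printf("shared=%lld cpu0=%d\n", (long long)c.count, slots[0]);
	return 0;
}

Since the shared count may lag reality by up to roughly nr_cpus * batch, percpu_counter_compare() first compares against the cheap approximate value and only performs the expensive exact sum when the approximation is too close to the threshold to decide — which is exactly what the 32MB threshold checks above rely on.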
Miao Xie
2013-Jan-29 10:10 UTC
[PATCH V2 07/10] Btrfs: use percpu counter for fs_info->delalloc_bytes
fs_info->delalloc_bytes is accessed very frequently, so use a percpu
counter instead of the u64 variable to reduce the lock contention.

This patch also fixes the problem that we accessed the variable without
lock protection. At worst, we would fail to flush the delalloc inodes
and just return an ENOSPC error even though there was still some free
space in the fs.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
Changelog v1 -> v2:
- modify the changelog and make it clearer and more rigorous.
---
 fs/btrfs/ctree.h       |  7 ++++---
 fs/btrfs/disk-io.c     | 18 ++++++++++++++----
 fs/btrfs/extent-tree.c |  6 ++++--
 fs/btrfs/inode.c       |  6 ++++--
 4 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1dcbbfd..51515a3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1383,6 +1383,7 @@ struct btrfs_fs_info {
 	 */
 	struct list_head ordered_extents;
 
+	spinlock_t delalloc_lock;
 	/*
 	 * all of the inodes that have delalloc bytes.  It is possible for
 	 * this list to be empty even when there is still dirty data=ordered
@@ -1443,7 +1444,10 @@ struct btrfs_fs_info {
 
 	/* used to keep from writing metadata until there is a nice batch */
 	struct percpu_counter dirty_metadata_bytes;
+	struct percpu_counter delalloc_bytes;
 	s32 dirty_metadata_batch;
+	s32 delalloc_batch;
+
 	struct list_head dirty_cowonly_roots;
 
 	struct btrfs_fs_devices *fs_devices;
@@ -1459,9 +1463,6 @@ struct btrfs_fs_info {
 
 	struct reloc_control *reloc_ctl;
 
-	spinlock_t delalloc_lock;
-	u64 delalloc_bytes;
-
 	/* data_alloc_cluster is only used in ssd mode */
 	struct btrfs_free_cluster data_alloc_cluster;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 961ac58..29d52af 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2006,10 +2006,16 @@ int open_ctree(struct super_block *sb,
 	fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
 					(1 + ilog2(nr_cpu_ids));
 
+	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0);
+	if (ret) {
+		err = ret;
+		goto fail_dirty_metadata_bytes;
+	}
+
 	fs_info->btree_inode = new_inode(sb);
 	if (!fs_info->btree_inode) {
 		err = -ENOMEM;
-		goto fail_dirty_metadata_bytes;
+		goto fail_delalloc_bytes;
 	}
 
 	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -2264,6 +2270,7 @@ int open_ctree(struct super_block *sb,
 	sectorsize = btrfs_super_sectorsize(disk_super);
 	stripesize = btrfs_super_stripesize(disk_super);
 	fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids));
+	fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
 
 	/*
 	 * mixed block groups end up with duplicate but slightly offset
@@ -2726,6 +2733,8 @@ fail_iput:
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 
 	iput(fs_info->btree_inode);
+fail_delalloc_bytes:
+	percpu_counter_destroy(&fs_info->delalloc_bytes);
 fail_dirty_metadata_bytes:
 	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
 fail_bdi:
@@ -3357,9 +3366,9 @@ int close_ctree(struct btrfs_root *root)
 
 	btrfs_free_qgroup_config(root->fs_info);
 
-	if (fs_info->delalloc_bytes) {
-		printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
-		       (unsigned long long)fs_info->delalloc_bytes);
+	if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
+		printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n",
+		       percpu_counter_sum(&fs_info->delalloc_bytes));
 	}
 
 	free_extent_buffer(fs_info->extent_root->node);
@@ -3407,6 +3416,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
 	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
+	percpu_counter_destroy(&fs_info->delalloc_bytes);
 	bdi_destroy(&fs_info->bdi);
 
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ef61a4a..f4f0b1e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3724,7 +3724,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 	space_info = block_rsv->space_info;
 
 	smp_mb();
-	delalloc_bytes = root->fs_info->delalloc_bytes;
+	delalloc_bytes = percpu_counter_sum_positive(
+						&root->fs_info->delalloc_bytes);
 	if (delalloc_bytes == 0) {
 		if (trans)
 			return;
@@ -3766,7 +3767,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 			break;
 		}
 		smp_mb();
-		delalloc_bytes = root->fs_info->delalloc_bytes;
+		delalloc_bytes = percpu_counter_sum_positive(
+						&root->fs_info->delalloc_bytes);
 	}
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 803be87..d98b508 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1510,7 +1510,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
 
 		spin_lock(&root->fs_info->delalloc_lock);
 		BTRFS_I(inode)->delalloc_bytes += len;
-		root->fs_info->delalloc_bytes += len;
+		__percpu_counter_add(&root->fs_info->delalloc_bytes, len,
+				     root->fs_info->delalloc_batch);
 		if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
 			list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
 				      &root->fs_info->delalloc_inodes);
@@ -1551,7 +1552,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
 		    && do_list)
 			btrfs_free_reserved_data_space(inode, len);
 		spin_lock(&root->fs_info->delalloc_lock);
-		root->fs_info->delalloc_bytes -= len;
+		__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
+				     root->fs_info->delalloc_batch);
 		BTRFS_I(inode)->delalloc_bytes -= len;
 
 		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
-- 
1.7.11.7
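Note that the patch reads the counter with percpu_counter_sum_positive() rather than the cheaper percpu_counter_read(). The following kernel-context sketch (a summary of the trade-off between the existing percpu_counter read helpers, not a standalone program, and the function name is hypothetical) shows why:

/* The three ways to read a percpu_counter and what they cost. */
static s64 delalloc_bytes_snapshot(struct btrfs_fs_info *fs_info)
{
	/* cheap but approximate: reads only the shared count, which
	 * may lag the truth by up to about nr_cpus * batch */
	s64 approx = percpu_counter_read(&fs_info->delalloc_bytes);

	/* exact but O(nr_cpus): folds in every CPU's pending delta */
	s64 exact = percpu_counter_sum(&fs_info->delalloc_bytes);

	pr_debug("delalloc: approx=%lld exact=%lld\n", approx, exact);

	/* exact and clamped to >= 0; shrink_delalloc() wants this form
	 * because a transiently negative sum is meaningless for "bytes
	 * of outstanding delalloc" */
	return percpu_counter_sum_positive(&fs_info->delalloc_bytes);
}

shrink_delalloc() is a slow path that must decide whether any flushing work remains, so it pays for the exact sum; the hot update paths keep using the batched add.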
Miao Xie
2013-Jan-29 10:11 UTC
[PATCH V2 08/10] Btrfs: use the inode own lock to protect its delalloc_bytes
We need not use a global lock to protect the inode's delalloc_bytes;
its own lock is enough. In this way, we reduce the lock contention,
and ->delalloc_lock now only protects the delalloc inode list.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
Changelog v1 -> v2:
- none.
---
 fs/btrfs/btrfs_inode.h |  1 +
 fs/btrfs/disk-io.c     |  2 ++
 fs/btrfs/inode.c       | 47 ++++++++++++++++++++++++++++++-------------
 3 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 2a8c242..c935a77 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -40,6 +40,7 @@
 #define BTRFS_INODE_HAS_ASYNC_EXTENT		6
 #define BTRFS_INODE_NEEDS_FULL_SYNC		7
 #define BTRFS_INODE_COPY_EVERYTHING		8
+#define BTRFS_INODE_IN_DELALLOC_LIST		9
 
 /* in memory btrfs inode */
 struct btrfs_inode {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 29d52af..abf1f10 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3685,6 +3685,8 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
 					      delalloc_inodes);
 
 		list_del_init(&btrfs_inode->delalloc_inodes);
+		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+			  &btrfs_inode->runtime_flags);
 
 		btrfs_invalidate_inodes(btrfs_inode->root);
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d98b508..4f92b35 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1508,15 +1508,22 @@ static void btrfs_set_bit_hook(struct inode *inode,
 			spin_unlock(&BTRFS_I(inode)->lock);
 		}
 
-		spin_lock(&root->fs_info->delalloc_lock);
-		BTRFS_I(inode)->delalloc_bytes += len;
 		__percpu_counter_add(&root->fs_info->delalloc_bytes, len,
 				     root->fs_info->delalloc_batch);
-		if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
-			list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
-				      &root->fs_info->delalloc_inodes);
+		spin_lock(&BTRFS_I(inode)->lock);
+		BTRFS_I(inode)->delalloc_bytes += len;
+		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+					 &BTRFS_I(inode)->runtime_flags)) {
+			spin_lock(&root->fs_info->delalloc_lock);
+			if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+				list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
+					      &root->fs_info->delalloc_inodes);
+				set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+					&BTRFS_I(inode)->runtime_flags);
+			}
+			spin_unlock(&root->fs_info->delalloc_lock);
 		}
-		spin_unlock(&root->fs_info->delalloc_lock);
+		spin_unlock(&BTRFS_I(inode)->lock);
 	}
 }
@@ -1551,16 +1558,22 @@
 		    && do_list)
 			btrfs_free_reserved_data_space(inode, len);
 
-		spin_lock(&root->fs_info->delalloc_lock);
 		__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
 				     root->fs_info->delalloc_batch);
+		spin_lock(&BTRFS_I(inode)->lock);
 		BTRFS_I(inode)->delalloc_bytes -= len;
-
 		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
-		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
-			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+			     &BTRFS_I(inode)->runtime_flags)) {
+			spin_lock(&root->fs_info->delalloc_lock);
+			if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+				list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+				clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+					  &BTRFS_I(inode)->runtime_flags);
+			}
+			spin_unlock(&root->fs_info->delalloc_lock);
 		}
-		spin_unlock(&root->fs_info->delalloc_lock);
+		spin_unlock(&BTRFS_I(inode)->lock);
 	}
 }
@@ -7316,14 +7329,19 @@ fail:
 static int btrfs_getattr(struct vfsmount *mnt,
 			 struct dentry *dentry, struct kstat *stat)
 {
+	u64 delalloc_bytes;
 	struct inode *inode = dentry->d_inode;
 	u32 blocksize = inode->i_sb->s_blocksize;
 
 	generic_fillattr(inode, stat);
 	stat->dev = BTRFS_I(inode)->root->anon_dev;
 	stat->blksize = PAGE_CACHE_SIZE;
+
+	spin_lock(&BTRFS_I(inode)->lock);
+	delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
+	spin_unlock(&BTRFS_I(inode)->lock);
 	stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
-		ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9;
+			ALIGN(delalloc_bytes, blocksize)) >> 9;
 	return 0;
 }
@@ -7611,8 +7629,11 @@ again:
 		list_del_init(&binode->delalloc_inodes);
 
 		inode = igrab(&binode->vfs_inode);
-		if (!inode)
+		if (!inode) {
+			clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+				  &binode->runtime_flags);
 			continue;
+		}
 
 		list_add_tail(&binode->delalloc_inodes,
 			      &root->fs_info->delalloc_inodes);
-- 
1.7.11.7
Miao Xie
2013-Jan-29 10:13 UTC
[PATCH V2 09/10] Btrfs: use seqlock to protect fs_info->avail_{data, metadata, system}_alloc_bits
There is no lock to protect
fs_info->avail_{data, metadata, system}_alloc_bits; this may introduce
problems such as returning wrong profile information, so we add a
seqlock to protect them.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
Changelog v1 -> v2:
- none.
---
 fs/btrfs/ctree.h       |  2 ++
 fs/btrfs/disk-io.c     |  1 +
 fs/btrfs/extent-tree.c | 22 ++++++++++++++------
 fs/btrfs/volumes.c     | 56 +++++++++++++++++++++++++++-----------------------
 4 files changed, 49 insertions(+), 32 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 51515a3..c95b539 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1474,6 +1474,8 @@ struct btrfs_fs_info {
 	struct rb_root defrag_inodes;
 	atomic_t defrag_running;
 
+	/* Used to protect avail_{data, metadata, system}_alloc_bits */
+	seqlock_t profiles_lock;
 	/*
 	 * these three are in extended format (availability of single
 	 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index abf1f10..a7797ed 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2036,6 +2036,7 @@ int open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->tree_mod_seq_lock);
 	rwlock_init(&fs_info->tree_mod_log_lock);
 	mutex_init(&fs_info->reloc_mutex);
+	seqlock_init(&fs_info->profiles_lock);
 
 	init_completion(&fs_info->kobj_unregister);
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f4f0b1e..bbbfa72 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3223,12 +3223,14 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 	u64 extra_flags = chunk_to_extended(flags) &
 				BTRFS_EXTENDED_PROFILE_MASK;
 
+	write_seqlock(&fs_info->profiles_lock);
 	if (flags & BTRFS_BLOCK_GROUP_DATA)
 		fs_info->avail_data_alloc_bits |= extra_flags;
 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
 		fs_info->avail_metadata_alloc_bits |= extra_flags;
 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 		fs_info->avail_system_alloc_bits |= extra_flags;
+	write_sequnlock(&fs_info->profiles_lock);
 }
 
 /*
@@ -3320,12 +3322,18 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 
 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
 {
-	if (flags & BTRFS_BLOCK_GROUP_DATA)
-		flags |= root->fs_info->avail_data_alloc_bits;
-	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-		flags |= root->fs_info->avail_system_alloc_bits;
-	else if (flags & BTRFS_BLOCK_GROUP_METADATA)
-		flags |= root->fs_info->avail_metadata_alloc_bits;
+	unsigned seq;
+
+	do {
+		seq = read_seqbegin(&root->fs_info->profiles_lock);
+
+		if (flags & BTRFS_BLOCK_GROUP_DATA)
+			flags |= root->fs_info->avail_data_alloc_bits;
+		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+			flags |= root->fs_info->avail_system_alloc_bits;
+		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+			flags |= root->fs_info->avail_metadata_alloc_bits;
+	} while (read_seqretry(&root->fs_info->profiles_lock, seq));
 
 	return btrfs_reduce_alloc_profile(root, flags);
 }
@@ -7937,12 +7945,14 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 	u64 extra_flags = chunk_to_extended(flags) &
 				BTRFS_EXTENDED_PROFILE_MASK;
 
+	write_seqlock(&fs_info->profiles_lock);
 	if (flags & BTRFS_BLOCK_GROUP_DATA)
 		fs_info->avail_data_alloc_bits &= ~extra_flags;
 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
 		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 		fs_info->avail_system_alloc_bits &= ~extra_flags;
+	write_sequnlock(&fs_info->profiles_lock);
 }
 
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 15f6efd..65f22c2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1372,14 +1372,19 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	u64 devid;
 	u64 num_devices;
 	u8 *dev_uuid;
+	unsigned seq;
 	int ret = 0;
 	bool clear_super = false;
 
 	mutex_lock(&uuid_mutex);
 
-	all_avail = root->fs_info->avail_data_alloc_bits |
-		    root->fs_info->avail_system_alloc_bits |
-		    root->fs_info->avail_metadata_alloc_bits;
+	do {
+		seq = read_seqbegin(&root->fs_info->profiles_lock);
+
+		all_avail = root->fs_info->avail_data_alloc_bits |
+			    root->fs_info->avail_system_alloc_bits |
+			    root->fs_info->avail_metadata_alloc_bits;
+	} while (read_seqretry(&root->fs_info->profiles_lock, seq));
 
 	num_devices = root->fs_info->fs_devices->num_devices;
 	btrfs_dev_replace_lock(&root->fs_info->dev_replace);
@@ -2984,6 +2989,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 	int mixed = 0;
 	int ret;
 	u64 num_devices;
+	unsigned seq;
 
 	if (btrfs_fs_closing(fs_info) ||
 	    atomic_read(&fs_info->balance_pause_req) ||
@@ -3067,22 +3073,26 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 	/* allow to reduce meta or sys integrity only if force set */
 	allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
 			BTRFS_BLOCK_GROUP_RAID10;
-	if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
-	     (fs_info->avail_system_alloc_bits & allowed) &&
-	     !(bctl->sys.target & allowed)) ||
-	    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
-	     (fs_info->avail_metadata_alloc_bits & allowed) &&
-	     !(bctl->meta.target & allowed))) {
-		if (bctl->flags & BTRFS_BALANCE_FORCE) {
-			printk(KERN_INFO "btrfs: force reducing metadata "
-			       "integrity\n");
-		} else {
-			printk(KERN_ERR "btrfs: balance will reduce metadata "
-			       "integrity, use force if you want this\n");
-			ret = -EINVAL;
-			goto out;
+	do {
+		seq = read_seqbegin(&fs_info->profiles_lock);
+
+		if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+		     (fs_info->avail_system_alloc_bits & allowed) &&
+		     !(bctl->sys.target & allowed)) ||
+		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+		     (fs_info->avail_metadata_alloc_bits & allowed) &&
+		     !(bctl->meta.target & allowed))) {
+			if (bctl->flags & BTRFS_BALANCE_FORCE) {
+				printk(KERN_INFO "btrfs: force reducing metadata "
+				       "integrity\n");
+			} else {
+				printk(KERN_ERR "btrfs: balance will reduce metadata "
+				       "integrity, use force if you want this\n");
+				ret = -EINVAL;
+				goto out;
+			}
 		}
-	}
+	} while (read_seqretry(&fs_info->profiles_lock, seq));
 
 	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
 		int num_tolerated_disk_barrier_failures;
@@ -3886,10 +3896,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 	if (ret)
 		return ret;
 
-	alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
-			fs_info->avail_metadata_alloc_bits;
-	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
-
+	alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
 	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
 				  &stripe_size, chunk_offset, alloc_profile);
 	if (ret)
@@ -3897,10 +3904,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 
 	sys_chunk_offset = chunk_offset + chunk_size;
 
-	alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
-			fs_info->avail_system_alloc_bits;
-	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
-
+	alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
 	ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
 				  &sys_chunk_size, &sys_stripe_size,
 				  sys_chunk_offset, alloc_profile);
-- 
1.7.11.7
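All of the read_seqbegin()/read_seqretry() loops added above follow the same lock-free reader pattern: take a snapshot, then retry if a writer intervened. Below is a toy, single-writer user-space model of a seqlock (hypothetical, built on C11 atomics with sequentially consistent ordering for simplicity — the kernel's seqlock_t is more careful about memory ordering and also serializes concurrent writers):

#include <stdatomic.h>
#include <stdint.h>

/* Toy single-writer seqlock: an odd sequence number means "write in
 * progress". */
static _Atomic unsigned seq;
static _Atomic uint64_t avail_bits;	/* the protected value */

static void write_bits(uint64_t extra)	/* cf. write_seqlock() side */
{
	atomic_fetch_add(&seq, 1);	/* seq becomes odd */
	atomic_store(&avail_bits, atomic_load(&avail_bits) | extra);
	atomic_fetch_add(&seq, 1);	/* seq becomes even again */
}

static uint64_t read_bits(void)	/* cf. read_seqbegin()/read_seqretry() */
{
	unsigned s;
	uint64_t v;

	do {
		/* wait for an even (stable) sequence, then snapshot */
		while ((s = atomic_load(&seq)) & 1)
			;
		v = atomic_load(&avail_bits);
		/* a writer ran while we were reading: retry */
	} while (atomic_load(&seq) != s);
	return v;
}

Readers take no lock and never block the writer; they only pay a retry when an update races with the snapshot — a good fit for these read-mostly profile bits.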
Miao Xie
2013-Jan-29
[PATCH V2 10/10] Btrfs: use bit operation for fs_info->fs_state
There is no lock to protect fs_info->fs_state; this will introduce
some problems, such as the value being overwritten by another task
when several tasks modify it concurrently. For example:

	Task0 - CPU0			Task1 - CPU1
	mov %fs_state rax
	or $0x1 rax
					mov %fs_state rax
					or $0x2 rax
	mov rax %fs_state
					mov rax %fs_state

The expected value is 3, but in fact it is 2.

Though this problem does not happen now (because there is currently
only one flag), the code is error prone: if we add other flags, the
above problem is certain to happen.

Now we use bit operations for it to fix the above problem. In this
way, we make the code more robust and easier to extend with new
flags.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
Changelog v1 -> v2:
- modify the changelog and make it clearer and more rigorous.
---
 fs/btrfs/ctree.h       | 4 +++-
 fs/btrfs/disk-io.c     | 5 +++--
 fs/btrfs/file.c        | 2 +-
 fs/btrfs/scrub.c       | 2 +-
 fs/btrfs/super.c       | 4 ++--
 fs/btrfs/transaction.c | 9 ++++-----
 6 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c95b539..c34e36e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -338,7 +338,9 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 /*
  * File system states
  */
+#define BTRFS_FS_STATE_ERROR		0
 
+/* Super block flags */
 /* Errors detected */
 #define BTRFS_SUPER_FLAG_ERROR		(1ULL << 2)
 
@@ -1540,7 +1542,7 @@ struct btrfs_fs_info {
 	u64 qgroup_seq;
 
 	/* filesystem state */
-	u64 fs_state;
+	unsigned long fs_state;
 
 	struct btrfs_delayed_root *delayed_root;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a7797ed..caf329c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2196,7 +2196,8 @@ int open_ctree(struct super_block *sb,
 		goto fail_alloc;
 
 	/* check FS state, whether FS is broken. */
-	fs_info->fs_state |= btrfs_super_flags(disk_super);
+	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
+		set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
 
 	ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
 	if (ret) {
@@ -3354,7 +3355,7 @@ int close_ctree(struct btrfs_root *root)
 			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
 	}
 
-	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
 		btrfs_error_commit_super(root);
 
 	btrfs_put_block_group_cache(fs_info);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 910ea99..796fd79 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1531,7 +1531,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	 * although we have opened a file as writable, we have
 	 * to stop this write operation to ensure FS consistency.
 	 */
-	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
 		mutex_unlock(&inode->i_mutex);
 		err = -EROFS;
 		goto out;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index af0b566..2e91b56 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2700,7 +2700,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
 	int	ret;
 	struct btrfs_root *root = sctx->dev_root;
 
-	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
 		return -EIO;
 
 	gen = atomic64_read(&root->fs_info->last_trans_committed);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c96f132..6528482 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -98,7 +98,7 @@ static void __save_error_info(struct btrfs_fs_info *fs_info)
 	 * today we only save the error info into ram.  Long term we'll
 	 * also send it down to the disk
 	 */
-	fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
+	set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
 }
 
 static void save_error_info(struct btrfs_fs_info *fs_info)
@@ -114,7 +114,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
 	if (sb->s_flags & MS_RDONLY)
 		return;
 
-	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
 		sb->s_flags |= MS_RDONLY;
 		printk(KERN_INFO "btrfs is forced readonly\n");
 		/*
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 29fdf1c..50437b4 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -62,7 +62,7 @@ static noinline int join_transaction(struct btrfs_root *root, int type)
 	spin_lock(&fs_info->trans_lock);
 loop:
 	/* The file system has been taken offline. No new transactions. */
-	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
 		spin_unlock(&fs_info->trans_lock);
 		return -EROFS;
 	}
@@ -114,7 +114,7 @@ loop:
 		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
 		cur_trans = fs_info->running_transaction;
 		goto loop;
-	} else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+	} else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
 		spin_unlock(&fs_info->trans_lock);
 		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
 		return -EROFS;
@@ -302,7 +302,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type,
 	int ret;
 	u64 qgroup_reserved = 0;
 
-	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
 		return ERR_PTR(-EROFS);
 
 	if (current->journal_info) {
@@ -635,9 +635,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		btrfs_run_delayed_iputs(root);
 
 	if (trans->aborted ||
-	    root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+	    test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
 		err = -EIO;
-	}
 	assert_qgroups_uptodate(trans);
 
 	memset(trans, 0, sizeof(*trans));
-- 
1.7.11.7
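The lost-update interleaving shown in the changelog is exactly what an atomic read-modify-write rules out. Here is a small, hypothetical user-space demonstration, with C11 atomic_fetch_or() standing in for the kernel's set_bit() (note set_bit() takes a bit index, while atomic_fetch_or() takes a mask) and a plain load plus mask standing in for test_bit():

#include <stdatomic.h>
#include <stdio.h>

#define FS_STATE_ERROR		0
#define FS_STATE_REMOUNTING	1	/* hypothetical second flag */

static _Atomic unsigned long fs_state;

int main(void)
{
	/* like set_bit(): each or is one indivisible read-modify-write,
	 * so there is no window in which one update can overwrite the
	 * other — even if these ran on two CPUs concurrently */
	atomic_fetch_or(&fs_state, 1UL << FS_STATE_ERROR);
	atomic_fetch_or(&fs_state, 1UL << FS_STATE_REMOUNTING);

	/* like test_bit() */
	if (atomic_load(&fs_state) & (1UL << FS_STATE_ERROR))
		printf("fs_state = %#lx\n", atomic_load(&fs_state)); /* 0x3 */
	return 0;
}

With plain loads, ors, and stores, the 0x2 result from the changelog's interleaving would be possible; with the atomic form, the final value is always 0x3.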