David Rientjes
2010-Aug-24 10:50 UTC
[patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
Add kmalloc_nofail(), kcalloc_nofail(), and kzalloc_nofail().  These
functions are equivalent to kmalloc(), kcalloc(), and kzalloc(),
respectively, except that they will never return NULL and instead loop
forever trying to allocate memory.

If the first allocation attempt fails, a warning will be emitted,
including a call trace.  Subsequent failures will suppress this warning.

These were added as helper functions for documentation and auditability.
No future callers should be added.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 drivers/md/dm-region-hash.c |    2 +-
 fs/btrfs/inode.c            |    2 +-
 fs/gfs2/log.c               |    2 +-
 fs/gfs2/rgrp.c              |   18 ++++---------
 fs/jbd/transaction.c        |   11 ++------
 fs/reiserfs/journal.c       |    3 +-
 include/linux/slab.h        |   55 +++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 68 insertions(+), 25 deletions(-)

diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -290,7 +290,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
 
 	nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
 	if (unlikely(!nreg))
-		nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL);
+		nreg = kmalloc_nofail(sizeof(*nreg), GFP_NOIO);
 
 	nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
 		      DM_RH_CLEAN : DM_RH_NOSYNC;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1967,7 +1967,7 @@ void btrfs_add_delayed_iput(struct inode *inode)
 	if (atomic_add_unless(&inode->i_count, -1, 1))
 		return;
 
-	delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
+	delayed = kmalloc_nofail(sizeof(*delayed), GFP_NOFS);
 	delayed->inode = inode;
 
 	spin_lock(&fs_info->delayed_iput_lock);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -709,7 +709,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 	}
 	trace_gfs2_log_flush(sdp, 1);
 
-	ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
+	ai = kzalloc_nofail(sizeof(struct gfs2_ail), GFP_NOFS);
 	INIT_LIST_HEAD(&ai->ai_ail1_list);
 	INIT_LIST_HEAD(&ai->ai_ail2_list);
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1440,8 +1440,8 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 		rgrp_blk++;
 
 		if (!bi->bi_clone) {
-			bi->bi_clone = kmalloc(bi->bi_bh->b_size,
-					       GFP_NOFS | __GFP_NOFAIL);
+			bi->bi_clone = kmalloc_nofail(bi->bi_bh->b_size,
+						      GFP_NOFS);
 			memcpy(bi->bi_clone + bi->bi_offset,
 			       bi->bi_bh->b_data + bi->bi_offset,
 			       bi->bi_len);
@@ -1759,9 +1759,6 @@ fail:
  * @block: the block
  *
  * Figure out what RG a block belongs to and add that RG to the list
- *
- * FIXME: Don't use NOFAIL
- *
 */
 
 void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
@@ -1789,8 +1786,8 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
 	if (rlist->rl_rgrps == rlist->rl_space) {
 		new_space = rlist->rl_space + 10;
 
-		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
-			      GFP_NOFS | __GFP_NOFAIL);
+		tmp = kcalloc_nofail(new_space, sizeof(struct gfs2_rgrpd *),
+				     GFP_NOFS);
 
 		if (rlist->rl_rgd) {
 			memcpy(tmp, rlist->rl_rgd,
@@ -1811,17 +1808,14 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
  * @rlist: the list of resource groups
  * @state: the lock state to acquire the RG lock in
  * @flags: the modifier flags for the holder structures
- *
- * FIXME: Don't use NOFAIL
- *
 */
 
 void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state)
 {
 	unsigned int x;
 
-	rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder),
-				GFP_NOFS | __GFP_NOFAIL);
+	rlist->rl_ghs = kcalloc_nofail(rlist->rl_rgrps,
+				       sizeof(struct gfs2_holder), GFP_NOFS);
 	for (x = 0; x < rlist->rl_rgrps; x++)
 		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, state, 0,
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -98,14 +98,9 @@ static int start_this_handle(journal_t *journal, handle_t *handle)
 	}
 
 alloc_transaction:
-	if (!journal->j_running_transaction) {
-		new_transaction = kzalloc(sizeof(*new_transaction),
-					  GFP_NOFS|__GFP_NOFAIL);
-		if (!new_transaction) {
-			ret = -ENOMEM;
-			goto out;
-		}
-	}
+	if (!journal->j_running_transaction)
+		new_transaction = kzalloc_nofail(sizeof(*new_transaction),
+						 GFP_NOFS);
 
 	jbd_debug(3, "New handle %p going live.\n", handle);
 
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2593,8 +2593,7 @@ static int journal_read(struct super_block *sb)
 static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
 {
 	struct reiserfs_journal_list *jl;
-	jl = kzalloc(sizeof(struct reiserfs_journal_list),
-		     GFP_NOFS | __GFP_NOFAIL);
+	jl = kzalloc_nofail(sizeof(struct reiserfs_journal_list), GFP_NOFS);
 	INIT_LIST_HEAD(&jl->j_list);
 	INIT_LIST_HEAD(&jl->j_working_list);
 	INIT_LIST_HEAD(&jl->j_tail_bh_list);
diff --git a/include/linux/slab.h b/include/linux/slab.h
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -334,6 +334,61 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
 	return kmalloc_node(size, flags | __GFP_ZERO, node);
 }
 
+/**
+ * kmalloc_nofail - infinitely loop until kmalloc() succeeds.
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate (see kmalloc).
+ *
+ * NOTE: no new callers of this function should be implemented!
+ * All memory allocations should be failable whenever possible.
+ */
+static inline void *kmalloc_nofail(size_t size, gfp_t flags)
+{
+	void *ret;
+
+	for (;;) {
+		ret = kmalloc(size, flags);
+		if (ret)
+			return ret;
+		WARN_ONCE(1, "Out of memory, no fallback implemented "
+			     "(size=%lu flags=0x%x)\n",
+			     size, flags);
+	}
+}
+
+/**
+ * kcalloc_nofail - infinitely loop until kcalloc() succeeds.
+ * @n: number of elements.
+ * @size: element size.
+ * @flags: the type of memory to allocate (see kcalloc).
+ *
+ * NOTE: no new callers of this function should be implemented!
+ * All memory allocations should be failable whenever possible.
+ */
+static inline void *kcalloc_nofail(size_t n, size_t size, gfp_t flags)
+{
+	void *ret;
+
+	for (;;) {
+		ret = kcalloc(n, size, flags);
+		if (ret)
+			return ret;
+		WARN_ONCE(1, "Out of memory, no fallback implemented "
+			     "(n=%lu size=%lu flags=0x%x)\n",
+			     n, size, flags);
+	}
+}
+
+/**
+ * kzalloc_nofail - infinitely loop until kzalloc() succeeds.
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate (see kzalloc).
+ */
+static inline void *kzalloc_nofail(size_t size, gfp_t flags)
+{
+	return kmalloc_nofail(size, flags | __GFP_ZERO);
+}
+
 void __init kmem_cache_init_late(void);
 
 #endif /* _LINUX_SLAB_H */
David Rientjes
2010-Aug-24 10:50 UTC
[patch 4/5] btrfs: add nofail variant of set_extent_dirty
Add set_extent_dirty_nofail().  This function is equivalent to
set_extent_dirty(), except that it will never fail because of allocation
failure and instead loop forever trying to allocate memory.

If the first allocation attempt fails, a warning will be emitted,
including a call trace.  Subsequent failures will suppress this warning.

This was added as a helper function for documentation and auditability.
No future callers should be added.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 fs/btrfs/extent-tree.c |    8 ++++----
 fs/btrfs/extent_io.c   |   19 +++++++++++++++++++
 fs/btrfs/extent_io.h   |    2 ++
 3 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3831,9 +3831,9 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			spin_unlock(&cache->lock);
 			spin_unlock(&cache->space_info->lock);
 
-			set_extent_dirty(info->pinned_extents,
+			set_extent_dirty_nofail(info->pinned_extents,
 					 bytenr, bytenr + num_bytes - 1,
-					 GFP_NOFS | __GFP_NOFAIL);
+					 GFP_NOFS);
 		}
 		btrfs_put_block_group(cache);
 		total -= num_bytes;
@@ -3872,8 +3872,8 @@ static int pin_down_extent(struct btrfs_root *root,
 	spin_unlock(&cache->lock);
 	spin_unlock(&cache->space_info->lock);
 
-	set_extent_dirty(root->fs_info->pinned_extents, bytenr,
-			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
+	set_extent_dirty_nofail(root->fs_info->pinned_extents, bytenr,
+				bytenr + num_bytes - 1, GFP_NOFS);
 	return 0;
 }
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -940,6 +940,25 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 			      NULL, mask);
 }
 
+/*
+ * NOTE: no new callers of this function should be implemented!
+ * All memory allocations should be failable whenever possible.
+ */
+int set_extent_dirty_nofail(struct extent_io_tree *tree, u64 start, u64 end,
+			    gfp_t mask)
+{
+	int ret;
+
+	for (;;) {
+		ret = set_extent_dirty(tree, start, end, mask);
+		if (ret != -ENOMEM)
+			return ret;
+		WARN_ONCE(1, "Out of memory, no fallback implemented "
+			     "(flags=0x%x)\n",
+			     mask);
+	}
+}
+
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits,
 		    gfp_t mask)
 {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -197,6 +197,8 @@ int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 		   gfp_t mask);
 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask);
+int set_extent_dirty_nofail(struct extent_io_tree *tree, u64 start, u64 end,
+			    gfp_t mask);
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask);
 int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
Jan Kara
2010-Aug-24 12:15 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Tue 24-08-10 03:50:19, David Rientjes wrote:
> Add kmalloc_nofail(), kcalloc_nofail(), and kzalloc_nofail().  These
> functions are equivalent to kmalloc(), kcalloc(), and kzalloc(),
> respectively, except that they will never return NULL and instead loop
> forever trying to allocate memory.
>
> If the first allocation attempt fails, a warning will be emitted,
> including a call trace.  Subsequent failures will suppress this warning.
>
> These were added as helper functions for documentation and auditability.
> No future callers should be added.

This looks like a cleaner approach than the previous one. You can add
Acked-by: Jan Kara <jack@suse.cz>
for the JBD part.

								Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
Peter Zijlstra
2010-Aug-24 13:29 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Tue, 2010-08-24 at 03:50 -0700, David Rientjes wrote:
> These were added as helper functions for documentation and auditability.
> No future callers should be added.

git grep GFP_NOFAIL isn't auditable enough?

Might as well declare these functions deprecated if you really want to
do this.

FWIW I think mason said he'd fix btrfs to not suck like this 'soon'.
Peter Zijlstra
2010-Aug-24 13:30 UTC
Re: [patch 4/5] btrfs: add nofail variant of set_extent_dirty
On Tue, 2010-08-24 at 03:50 -0700, David Rientjes wrote:
> Add set_extent_dirty_nofail().  This function is equivalent to
> set_extent_dirty(), except that it will never fail because of allocation
> failure and instead loop forever trying to allocate memory.
>
> If the first allocation attempt fails, a warning will be emitted,
> including a call trace.  Subsequent failures will suppress this warning.
>
> This was added as a helper function for documentation and auditability.
> No future callers should be added.
>
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>  fs/btrfs/extent-tree.c |    8 ++++----
>  fs/btrfs/extent_io.c   |   19 +++++++++++++++++++
>  fs/btrfs/extent_io.h   |    2 ++
>  3 files changed, 25 insertions(+), 4 deletions(-)

I'd much rather someone help mason to clean this up.
Jens Axboe
2010-Aug-24 13:33 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On 2010-08-24 15:29, Peter Zijlstra wrote:
> On Tue, 2010-08-24 at 03:50 -0700, David Rientjes wrote:
>> These were added as helper functions for documentation and auditability.
>> No future callers should be added.
>
> git grep GFP_NOFAIL isn't auditable enough?
>
> might as well declare these functions deprecated if you really want to
> do this.

Agree, adding a helper function would seem to be going in the reverse
direction imho. Nobody will read that comment on top of the function.

Should be possible to warn at build time for anyone using __GFP_NOFAIL
without wrapping it in a function.

--
Jens Axboe
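[For illustration only: one way such a build-time warning could be wired up
is sketched below. kmalloc_checked() and __gfp_nofail_used_directly() are
hypothetical names, and the sketch assumes GCC's warning attribute; nothing
like this is in the posted series.]

/* Sketch: flag direct __GFP_NOFAIL usage at compile time. */
void __gfp_nofail_used_directly(void)
	__attribute__((warning("__GFP_NOFAIL passed directly; use the *_nofail() helpers")));

static inline void *kmalloc_checked(size_t size, gfp_t flags)
{
	/*
	 * The branch is constant-folded away for well-behaved callers;
	 * callers that pass __GFP_NOFAIL directly keep the call and get
	 * a warning when the translation unit is compiled.
	 */
	if (__builtin_constant_p(flags) && (flags & __GFP_NOFAIL))
		__gfp_nofail_used_directly();
	return kmalloc(size, flags);
}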
Dave Chinner
2010-Aug-24 13:55 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Tue, Aug 24, 2010 at 03:29:18PM +0200, Peter Zijlstra wrote:
> On Tue, 2010-08-24 at 03:50 -0700, David Rientjes wrote:
> > These were added as helper functions for documentation and auditability.
> > No future callers should be added.
>
> git grep GFP_NOFAIL isn't auditable enough?
>
> might as well declare these functions deprecated if you really want to
> do this.

Also, if you are going to add tight loops, you might want to put a
backoff in the loops like "congestion_wait(BLK_RW_ASYNC, HZ/50);" so
that they don't spin....

FWIW, in all this "allocations can't fail" churn, no one has noticed
that XFS has been doing these "allocations can't fail" loops in
kmem_alloc() and kmem_zone_alloc(), well, forever. I can't ever remember
seeing it report a potential deadlock, though....

Cheers,

Dave.
--
Dave Chinner
david@fromorbit.com
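[For illustration only: a minimal sketch of what such a backed-off loop might
look like. kmalloc_nofail_backoff() is a hypothetical variant of the posted
kmalloc_nofail(), and it assumes the caller is allowed to sleep.]

#include <linux/slab.h>
#include <linux/backing-dev.h>

static inline void *kmalloc_nofail_backoff(size_t size, gfp_t flags)
{
	void *ret;

	for (;;) {
		ret = kmalloc(size, flags);
		if (ret)
			return ret;
		WARN_ONCE(1, "Out of memory, no fallback implemented\n");
		/*
		 * Don't spin: sleep for ~20ms or until writeback congestion
		 * clears, as Dave suggests above.
		 */
		congestion_wait(BLK_RW_ASYNC, HZ / 50);
	}
}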
Peter Zijlstra
2010-Aug-24 14:03 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Tue, 2010-08-24 at 23:55 +1000, Dave Chinner wrote:
> no one has noticed
> that XFS has been doing these "allocations can't fail" loops in
> kmem_alloc() and kmem_zone_alloc(), well, forever

I occasionally check if it's still there and cry a little ;-)
David Rientjes
2010-Aug-24 20:08 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Tue, 24 Aug 2010, Peter Zijlstra wrote:
> > These were added as helper functions for documentation and auditability.
> > No future callers should be added.
>
> git grep GFP_NOFAIL isn't auditable enough?

I'm trying to remove that bit, and __GFP_REPEAT, to simplify and remove
several branches from the page allocator.

> might as well declare these functions deprecated if you really want to
> do this.
>
> FWIW I think mason said he'd fix btrfs to not suck like this 'soon'.

Great!
David Rientjes
2010-Aug-24 20:11 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Tue, 24 Aug 2010, Jens Axboe wrote:
> Should be possible to warn at build time for anyone using __GFP_NOFAIL
> without wrapping it in a function.

We could make these __deprecated functions as Peter suggested if you
think build-time warnings for existing users would be helpful.
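[For illustration only: deprecating the helper would amount to adding the
existing __deprecated annotation to the function proposed in patch 1/5, as
sketched below. Existing callers would then see a warning at compile time.]

static inline void * __deprecated kmalloc_nofail(size_t size, gfp_t flags)
{
	void *ret;

	for (;;) {
		ret = kmalloc(size, flags);
		if (ret)
			return ret;
		WARN_ONCE(1, "Out of memory, no fallback implemented\n");
	}
}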
David Rientjes
2010-Aug-24 20:12 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Tue, 24 Aug 2010, Dave Chinner wrote:
> > git grep GFP_NOFAIL isn't auditable enough?
> >
> > might as well declare these functions deprecated if you really want to
> > do this.
>
> Also, if you are going to add tight loops, you might want to put a
> backoff in the loops like "congestion_wait(BLK_RW_ASYNC, HZ/50);" so
> that they don't spin....

These loops don't actually loop at all: all users are passing orders
below PAGE_ALLOC_COSTLY_ORDER, for which the page allocator already
loops forever internally without killing anything (they are all GFP_NOIO
or GFP_NOFS, so the oom killer isn't involved).
Ted Ts'o
2010-Aug-25 11:24 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Tue, Aug 24, 2010 at 01:11:26PM -0700, David Rientjes wrote:
> On Tue, 24 Aug 2010, Jens Axboe wrote:
>
> > Should be possible to warn at build time for anyone using __GFP_NOFAIL
> > without wrapping it in a function.
>
> We could make these __deprecated functions as Peter suggested if you
> think build-time warnings for existing users would be helpful.

Let me take a few steps backwards and look at the problem from a
somewhat higher level.

Part of the problem is that we have a few places in the kernel where
failure is really not an option --- or rather, if we're going to fail
while we're in the middle of doing a commit, our choices really are
(a) retry the loop in the jbd layer (which Andrew really doesn't like),
(b) keep our own private cache of free memory so we don't fail and/or
loop, (c) fail the file system and mark it read-only, or (d) panic.

There are other places where we can fail safely (for example, in jbd's
start_this_handle, although that just pushes the problem up the stack,
and ultimately, to userspace, where most userspace programs don't really
expect ENOMEM to get returned by a random disk write --- how much do
_you_ trust a random GNOME or KDE developer to do correct error checking
and do something sane at the application level?)

So we can mark the retry loop helper function as deprecated, and that
will make some of these cases go away, but ultimately if we're going to
fail the memory allocation, something bad is going to happen, and the
only question is whether we want to have something bad happen by looping
in the memory allocator, or to force the file system to panic/oops the
system, or have random applications die and/or lose user data because
they don't expect write() to return ENOMEM.

So at some level it would be nice if we had a few different levels of
"we *really* need this memory".

Level 0 might be, "if we can't get it, no biggie, we'll figure out some
other way around it." I doubt there is much at level 0, but in theory we
could have some good citizens that fall in that camp and who simply will
bypass some optimization if they can't get the memory.

Level 1 might be, if you can't get the memory, we will propagate a
failure up to userspace, but it's probably a relatively "safe" place to
fail (i.e., when the user is opening a file).

Level 2 might be, "if you can't get the memory, we will propagate a
failure up to userspace, but it's at a place where most applications are
lazy f*ckers, and this may lead to serious application errors" (example:
close(2), and this is a file system that only pushes the file to the
server at close time, e.g. AFS).

Level 3 might be, "if you can't get the memory, I'm going to fail the
file system, or some other global subsystem, which will probably result
in the system crashing or needing to be rebooted".

We can ignore this problem and pretend it doesn't exist at the memory
allocator level, but that means the callers are going to be doing their
own thing to try to avoid having really bad things happen at
really-low-memory occasions. And this may mean looping, whether we mark
the function as deprecated or not.

This is becoming even more of an issue now given that with
containerization, we have jokers who are forcing tasks to run in very
small memory containers, which means that failures can happen far more
frequently --- and in some cases, just because the process running the
task happens to be in an extremely constrained memory cgroup doesn't
mean that failing the entire system is really such a great idea.

Or maybe that means memory cgroups are kinda busted. :-)

						- Ted
Peter Zijlstra
2010-Aug-25 11:35 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 2010-08-25 at 07:24 -0400, Ted Ts'o wrote:
> Part of the problem is that we have a few places in the kernel where
> failure is really not an option --- or rather, if we're going to fail
> while we're in the middle of doing a commit, our choices really are
> (a) retry the loop in the jbd layer (which Andrew really doesn't like),
> (b) keep our own private cache of free memory so we don't fail and/or
> loop, (c) fail the file system and mark it read-only, or (d) panic.

d) do the allocation before you're committed to going fwd and can still
fail and back out.
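[For illustration only: a trivial sketch of the "allocate before you are
committed" pattern. example_item and example_op() are made-up names, not
from any posted patch; allocation happens while -ENOMEM can still be
returned cleanly, and state is only modified afterwards.]

#include <linux/list.h>
#include <linux/slab.h>

struct example_item {
	struct list_head list;
	int val;
};

static int example_op(struct list_head *head, int val)
{
	struct example_item *item;

	item = kmalloc(sizeof(*item), GFP_NOFS);
	if (!item)
		return -ENOMEM;		/* nothing modified yet, safe to back out */

	item->val = val;
	list_add_tail(&item->list, head);	/* past this point no allocation is needed */
	return 0;
}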
Ted Ts'o
2010-Aug-25 11:57 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, Aug 25, 2010 at 01:35:32PM +0200, Peter Zijlstra wrote:
> On Wed, 2010-08-25 at 07:24 -0400, Ted Ts'o wrote:
> > Part of the problem is that we have a few places in the kernel where
> > failure is really not an option --- or rather, if we're going to fail
> > while we're in the middle of doing a commit, our choices really are
> > (a) retry the loop in the jbd layer (which Andrew really doesn't like),
> > (b) keep our own private cache of free memory so we don't fail and/or
> > loop, (c) fail the file system and mark it read-only, or (d) panic.
>
> d) do the allocation before you're committed to going fwd and can still
> fail and back out.

Sure, in some cases that can be done, but the commit has to happen at
some point, or we run out of journal space, at which point we're back
to (c) or (d).

						- Ted
Peter Zijlstra
2010-Aug-25 12:48 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 2010-08-25 at 07:57 -0400, Ted Ts'o wrote:
> On Wed, Aug 25, 2010 at 01:35:32PM +0200, Peter Zijlstra wrote:
> > d) do the allocation before you're committed to going fwd and can still
> > fail and back out.
>
> Sure, in some cases that can be done, but the commit has to happen at
> some point, or we run out of journal space, at which point we're back
> to (c) or (d).

Well, (b) sounds a lot saner than either of those. Simply revert to a
state that is sub-optimal but has bounded memory use and reserve that
memory up-front. That way you can always get out of a tight memory spot.

It's what the block layer has always done to avoid the memory deadlock
situation: it has a private stash of BIOs that is big enough to always
service some IO, and as long as IO is happening stuff keeps moving
forward and we don't deadlock.

Filesystems might have a slightly harder time creating such a bounded
state because there might be more involved, like journals and the like,
but still it should be possible to create something like that (my swap
over nfs patches created such a state for the network rx side of
things).

Also, we cannot let our fear of crappy userspace get in the way of doing
sensible things. Your example of write(2) returning -ENOMEM is not
correct though: the syscall (and the page_mkwrite callback for mmap()s)
happens before we actually dirty the data and need to write things out,
so we can always simply wait for memory to become available to dirty.
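[For illustration only: the bounded, pre-allocated reserve described above is
roughly what a mempool provides. example_item, the names and the pool size of
16 are made up; this is a sketch, not the block layer's actual BIO reserve.]

#include <linux/mempool.h>
#include <linux/slab.h>

struct example_item {
	int payload;
};

static struct kmem_cache *example_cache;
static mempool_t *example_pool;

static int __init example_pool_init(void)
{
	example_cache = kmem_cache_create("example_item",
					  sizeof(struct example_item),
					  0, 0, NULL);
	if (!example_cache)
		return -ENOMEM;
	/* Pre-allocate a reserve large enough to guarantee forward progress. */
	example_pool = mempool_create_slab_pool(16, example_cache);
	return example_pool ? 0 : -ENOMEM;
}

static struct example_item *example_alloc(void)
{
	/*
	 * With a sleeping gfp mask this waits for an element to be returned
	 * to the reserve instead of failing outright.
	 */
	return mempool_alloc(example_pool, GFP_NOFS);
}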
Peter Zijlstra
2010-Aug-25 12:52 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 2010-08-25 at 14:48 +0200, Peter Zijlstra wrote:
> > > On Wed, 2010-08-25 at 07:24 -0400, Ted Ts'o wrote:
> > > > (a) retry the loop in the jbd layer (which Andrew really doesn't like),
> > > > (b) keep our own private cache of free memory so we don't fail and/or
> > > > loop,
>
> Well, (b) sounds a lot saner than either of those. Simply revert to a
> state that is sub-optimal but has bounded memory use and reserve that
> memory up-front. That way you can always get out of a tight memory spot.

Also, there's a good reason for disliking (a): it's a deadlock scenario.
Suppose we need to write out data to get free pages, but the writing out
is blocked on requiring free pages.

There's really nothing the page allocator can do to help you there; it's
a situation you have to avoid getting into.
Theodore Tso
2010-Aug-25 13:20 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Aug 25, 2010, at 8:52 AM, Peter Zijlstra wrote:
> Also, there's a good reason for disliking (a): it's a deadlock scenario.
> Suppose we need to write out data to get free pages, but the writing out
> is blocked on requiring free pages.
>
> There's really nothing the page allocator can do to help you there; it's
> a situation you have to avoid getting into.

Well, if all of these users start having their own private pools of
emergency memory, I'm not sure that's such a great idea either.

And in some cases, there *is* extra memory. For example, if the reason
why the page allocator failed is because there isn't enough memory in
the current process's cgroup, maybe it's important enough that the
kernel code might decide to say, "violate the cgroup constraints ---
it's more important that we not bring down the entire system" than to
honor whatever yahoo decided that a particular cgroup has been set down
to something ridiculous like 512mb, when there's plenty of free physical
memory --- but not in that cgroup.

-- Ted
Dave Chinner
2010-Aug-25 13:24 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, Aug 25, 2010 at 02:48:36PM +0200, Peter Zijlstra wrote:
> On Wed, 2010-08-25 at 07:57 -0400, Ted Ts'o wrote:
> > Sure, in some cases that can be done, but the commit has to happen at
> > some point, or we run out of journal space, at which point we're back
> > to (c) or (d).
>
> Well, (b) sounds a lot saner than either of those. Simply revert to a
> state that is sub-optimal but has bounded memory use and reserve that
> memory up-front. That way you can always get out of a tight memory spot.
>
> It's what the block layer has always done to avoid the memory deadlock
> situation: it has a private stash of BIOs that is big enough to always
> service some IO, and as long as IO is happening stuff keeps moving
> forward and we don't deadlock.
>
> Filesystems might have a slightly harder time creating such a bounded
> state because there might be more involved, like journals and the like,
> but still it should be possible to create something like that (my swap
> over nfs patches created such a state for the network rx side of
> things).

Filesystems are way more complex than the block layer - the block layer
simply doesn't have to handle situations where thread X is holding A, B
and C, while thread Y needs C to complete the transaction. Thread Y is
the user of the low memory pool, but has almost depleted it, and so even
if we switch to thread X, the pool does not have enough memory for X to
complete and allow us to switch back to Y and have it complete, freeing
the memory from the pool that it holds.

That is, the guarantee that we will always make progress simply does
not exist in filesystems, so a mempool-like concept seems to me to be
doomed from the start....

Cheers,

Dave.
--
Dave Chinner
david@fromorbit.com
Peter Zijlstra
2010-Aug-25 13:31 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 2010-08-25 at 09:20 -0400, Theodore Tso wrote:
> Well, if all of these users start having their own private pools of
> emergency memory, I'm not sure that's such a great idea either.
>
> And in some cases, there *is* extra memory. For example, if the reason
> why the page allocator failed is because there isn't enough memory in
> the current process's cgroup, maybe it's important enough that the
> kernel code might decide to say, "violate the cgroup constraints ---
> it's more important that we not bring down the entire system" than to
> honor whatever yahoo decided that a particular cgroup has been set down
> to something ridiculous like 512mb, when there's plenty of free physical
> memory --- but not in that cgroup.

I'm not sure, but I think the cgroup thing doesn't account kernel
allocations, in which case the above problem doesn't exist.

For the cpuset case we punch through the cpuset constraints for kernel
allocations (unless __GFP_HARDWALL).
Peter Zijlstra
2010-Aug-25 13:34 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 2010-08-25 at 09:20 -0400, Theodore Tso wrote:
> Well, if all of these users start having their own private pools of
> emergency memory, I'm not sure that's such a great idea either.

That's a secondary problem, and could be reduced by something like the
memory reservation system I provided in the swap-over-nfs patches.

Of course, when all these users can actually happen concurrently there's
really nothing much you can do about it.
Peter Zijlstra
2010-Aug-25 13:35 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 2010-08-25 at 23:24 +1000, Dave Chinner wrote:
>
> That is, the guarantee that we will always make progress simply does
> not exist in filesystems, so a mempool-like concept seems to me to be
> doomed from the start....

While I appreciate that it might be somewhat (a lot) harder for a
filesystem to provide that guarantee, I'd be deeply worried about your
claim that it's impossible.

It would render a system without swap very prone to deadlocks. Even with
the very tight dirty page accounting we currently have you can fill all
your memory with anonymous pages, at which point there's nothing free
and you require writeout of dirty pages to succeed.
Peter Zijlstra
2010-Aug-25 14:13 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 2010-08-25 at 07:24 -0400, Ted Ts'o wrote:
> There are other places where we can fail safely (for example, in jbd's
> start_this_handle, although that just pushes the problem up the stack,
> and ultimately, to userspace, where most userspace programs don't
> really expect ENOMEM to get returned by a random disk write

While talking with Chris about this: if you can indeed push the error
out that far, you can basically ensure this particular fs-op does not
complicate the journal commit and thereby limit the number of extra
entries in your journal, and thus the amount of memory required.

So once stuff starts failing, push those ops back out of the filesystem
code, force a journal commit, and then let these ops retry. There is no
need to actually push the -ENOMEM all the way back to userspace.
David Rientjes
2010-Aug-25 20:43 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, Peter Zijlstra wrote:
> I'm not sure, but I think the cgroup thing doesn't account kernel
> allocations, in which case the above problem doesn't exist.

Right, the only cgroup that does is cpusets, because it binds the memory
allocations to a set of nodes.

> For the cpuset case we punch through the cpuset constraints for kernel
> allocations (unless __GFP_HARDWALL).

__GFP_HARDWALL doesn't mean that the allocation won't be constrained;
this is a common misconception. __GFP_HARDWALL only prevents us from
looking at our cpuset.mem_exclusive flag and checking our nearest common
ancestor cpuset if we can block.

The cpusets case is actually the easiest to fix: use GFP_ATOMIC.
GFP_ATOMIC allocations aren't bound by any cpuset and, in the general
case, can allocate below the min watermark because of ALLOC_HARD |
ALLOC_HARDER in the page allocator, which creates the notion of "memory
reserves" available to these tasks. Then, success really depends on the
setting of the watermarks instead.
Ted Ts'o
2010-Aug-25 20:53 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, Aug 25, 2010 at 03:35:42PM +0200, Peter Zijlstra wrote:
>
> While I appreciate that it might be somewhat (a lot) harder for a
> filesystem to provide that guarantee, I'd be deeply worried about your
> claim that it's impossible.
>
> It would render a system without swap very prone to deadlocks. Even with
> the very tight dirty page accounting we currently have you can fill all
> your memory with anonymous pages, at which point there's nothing free
> and you require writeout of dirty pages to succeed.

For file systems that do delayed allocation, the situation is very
similar to swapping over NFS. Sometimes in order to make some free
memory, you need to spend some free memory... which implies that for
these file systems, being more aggressive about triggering writeout,
and being more aggressive about throttling processes which are creating
too many dirty pages, especially dirty delayed allocation pages
(regardless of whether this is via write(2) or accessing mmapped
memory), is a really good idea.

A pool of free pages which is reserved for routines that are doing page
cleaning would probably also be a good idea. Maybe that's just retrying
with GFP_ATOMIC if a normal allocation fails, or maybe we need our own
special pool, or maybe we need to dynamically resize the GFP_ATOMIC pool
based on how many subsystems might need to use it....

Just brainstorming here; what do people think?

						- Ted
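[For illustration only: the "retry with GFP_ATOMIC" idea could look like the
sketch below. writeback_alloc() is a made-up name, and whether dipping into
the atomic reserves like this is acceptable is exactly what the rest of the
thread debates.]

#include <linux/slab.h>

static void *writeback_alloc(size_t size)
{
	void *p;

	/* Normal attempt first: may enter reclaim, may fail. */
	p = kmalloc(size, GFP_NOFS | __GFP_NOWARN);
	if (p)
		return p;
	/*
	 * Fall back to the reserves kept for atomic allocations; this can
	 * still return NULL if those are exhausted too.
	 */
	return kmalloc(size, GFP_ATOMIC);
}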
Peter Zijlstra
2010-Aug-25 20:55 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 2010-08-25 at 13:43 -0700, David Rientjes wrote:
> The cpusets case is actually the easiest to fix: use GFP_ATOMIC.

I don't think that's a valid usage of GFP_ATOMIC; I think we should fall
back to outside the cpuset for kernel allocations by default.
David Rientjes
2010-Aug-25 20:58 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, Peter Zijlstra wrote:
> While I appreciate that it might be somewhat (a lot) harder for a
> filesystem to provide that guarantee, I'd be deeply worried about your
> claim that it's impossible.
>
> It would render a system without swap very prone to deadlocks. Even with
> the very tight dirty page accounting we currently have you can fill all
> your memory with anonymous pages, at which point there's nothing free
> and you require writeout of dirty pages to succeed.

While I'd really love for callers to be able to gracefully handle
getting a NULL back from the page allocator in all cases, it's not a
prerequisite for this patchset. This patchset actually does nothing
interesting except removing the __GFP_NOFAIL bit from their gfp mask.
All of these allocations already loop looking for memory because they
have orders that are less than PAGE_ALLOC_COSTLY_ORDER (which defaults
to 3). So the loops in kzalloc_nofail(), etc., never actually loop.

Demanding that the page allocator return order-3 memory in any context
is a non-starter, so I'm not really interested in that. I'm more
concerned about proper error handling being implemented for these
callers iff someone redefines PAGE_ALLOC_COSTLY_ORDER to something else,
perhaps even 0.

Callers can, when desperate for memory, use GFP_ATOMIC to use some
memory reserves across zones, hopefully order-0 and not an egregious
amount. But when this is depleted, or when higher order allocs are
needed, the remainder of the burden really is back on the caller to fix
things in a way that doesn't rely on memory that doesn't exist. That's
an implementation choice by the caller, and I agree that some failsafe
behavior is the only way that we don't get really bad results like
corrupted user data or filesystems.
David Rientjes
2010-Aug-25 20:59 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, Ted Ts'o wrote:
> A pool of free pages which is reserved for routines that are doing
> page cleaning would probably also be a good idea. Maybe that's just
> retrying with GFP_ATOMIC if a normal allocation fails, or maybe we
> need our own special pool, or maybe we need to dynamically resize the
> GFP_ATOMIC pool based on how many subsystems might need to use it....

PF_MEMALLOC is used for those tasks, not GFP_ATOMIC.
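[For illustration only: the PF_MEMALLOC mechanism referred to here looks
roughly like the sketch below in a page-cleaning path. example_pageout() is
a made-up name; real callers save and restore the flag as shown.]

#include <linux/sched.h>

static void example_pageout(void)
{
	const unsigned long pflags = current->flags & PF_MEMALLOC;

	/*
	 * Mark the task as doing memory cleaning so that allocations it
	 * makes to get pages written out may dip into the emergency
	 * reserves instead of recursing into reclaim.
	 */
	current->flags |= PF_MEMALLOC;

	/* ... write out dirty pages here ... */

	if (!pflags)
		current->flags &= ~PF_MEMALLOC;
}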
Christoph Lameter
2010-Aug-25 21:11 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
We already have __GFP_NOFAIL behavior for slab allocations since a __GFP_NOFAIL flag is passed through to the page allocator if no objects are available. The page allocator will do the same as when called directly with __GFP_NOFAIL and so we have consistent behavior regardless of the allocator used.
David Rientjes
2010-Aug-25 21:11 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, Peter Zijlstra wrote:
> > The cpusets case is actually the easiest to fix: use GFP_ATOMIC.
>
> I don't think that's a valid usage of GFP_ATOMIC; I think we should fall
> back to outside the cpuset for kernel allocations by default.

Cpusets doesn't enforce isolation for only user memory; it has always
bound _all_ allocations that aren't atomic or in irq context (or from
oom killed tasks). Allowing slab, for example, to be allocated in other
cpusets could cause them to oom themselves since they are bound by the
same memory isolation policy that all other cpusets are. We'd get random
oom conditions in cpusets depending only on where the slab happened to
be allocated, through no fault of the applications themselves, and
that's certainly not a situation we want. The memory controller cgroup
also has slab accounting on its TODO list.

If you think GFP_ATOMIC is inappropriate in these contexts, then they
are by definition blockable. So this seems like a good candidate for
using memory compaction, since we're talking only about
PAGE_ALLOC_COSTLY_ORDER and higher allocs, even though it's only
currently configurable for hugepages. There's still no hard guarantee
that the memory will be allocatable (GFP_KERNEL, then compaction, then
GFP_ATOMIC could all still fail), but I don't see how continuously
looping the page allocator is possibly supposed to help in these
situations.
Peter Zijlstra
2010-Aug-25 21:21 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 2010-08-25 at 16:11 -0500, Christoph Lameter wrote:
> We already have __GFP_NOFAIL behavior for slab allocations since
> a __GFP_NOFAIL flag is passed through to the page allocator if no objects
> are available.
>
> The page allocator will do the same as when called directly with
> __GFP_NOFAIL and so we have consistent behavior regardless of the
> allocator used.

Thanks for quoting the context to which you're replying, that really
helps parsing things...
David Rientjes
2010-Aug-25 21:23 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, Christoph Lameter wrote:
> We already have __GFP_NOFAIL behavior for slab allocations since
> a __GFP_NOFAIL flag is passed through to the page allocator if no objects
> are available.

It all depends on what flags are passed to kmalloc(); neither slab nor
slub enforces __GFP_NOFAIL behavior itself. In slab, cache_grow() will
return NULL depending on whether the page allocator returns NULL, and
that would only happen for __GFP_NORETRY or
cachep->gfp->gfporder >= PAGE_ALLOC_COSTLY_ORDER. In slub, the default
order is tried with __GFP_NORETRY and, if it returns NULL, the higher
order alloc will fail under the same circumstances. So the nofail
behavior for slab depends only on the flags passed from the caller.
Peter Zijlstra
2010-Aug-25 21:27 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 2010-08-25 at 14:11 -0700, David Rientjes wrote:
>
> There's still no hard guarantee that the memory will be allocatable
> (GFP_KERNEL, then compaction, then GFP_ATOMIC could all still fail), but
> I don't see how continuously looping the page allocator is possibly
> supposed to help in these situations.

Why do you think I'm a proponent of that behaviour?

I've been arguing that the existence of GFP_NOFAIL is the bug, and I
started the whole discussion because your patchset didn't outline the
purpose of its existence; it merely changes __GFP_NOFAIL usage into
$foo_nofail() functions, which on its own is a rather daft change.

Optimizing the page allocator by moving those conditionals from its
innards into an outer loop not used by most callers seems a fine goal,
but you didn't state that.

Also, I like the postfix proposed by Andi better: _i_suck() :-)
Christoph Lameter
2010-Aug-25 21:35 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, David Rientjes wrote:
> It all depends on what flags are passed to kmalloc(); neither slab nor
> slub enforces __GFP_NOFAIL behavior itself. In slab, cache_grow() will
> return NULL depending on whether the page allocator returns NULL, and
> that would only happen for __GFP_NORETRY or
> cachep->gfp->gfporder >= PAGE_ALLOC_COSTLY_ORDER. In slub, the default
> order is tried with __GFP_NORETRY and, if it returns NULL, the higher
> order alloc will fail under the same circumstances. So the nofail
> behavior for slab depends only on the flags passed from the caller.

If the higher order fails in slub then an order 0 alloc is attempted
without __GFP_NORETRY. In both cases the nofail behavior of the page
allocator determines the outcome.

True, if the caller mixes in __GFP_NORETRY then you may still get NULL.
But that is an issue that can be resolved by the caller.
Peter Zijlstra
2010-Aug-25 21:35 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 2010-08-25 at 16:53 -0400, Ted Ts'o wrote:
> On Wed, Aug 25, 2010 at 03:35:42PM +0200, Peter Zijlstra wrote:
> >
> > While I appreciate that it might be somewhat (a lot) harder for a
> > filesystem to provide that guarantee, I'd be deeply worried about your
> > claim that it's impossible.
> >
> > It would render a system without swap very prone to deadlocks. Even with
> > the very tight dirty page accounting we currently have you can fill all
> > your memory with anonymous pages, at which point there's nothing free
> > and you require writeout of dirty pages to succeed.
>
> For file systems that do delayed allocation, the situation is very
> similar to swapping over NFS. Sometimes in order to make some free
> memory, you need to spend some free memory...

Which means you need to be able to compute a bounded amount of that
memory.

> which implies that for
> these file systems, being more aggressive about triggering writeout,
> and being more aggressive about throttling processes which are
> creating too many dirty pages, especially dirty delayed allocation
> pages (regardless of whether this is via write(2) or accessing mmapped
> memory), is a really good idea.

That seems unrelated; the VM has a strict dirty limit and controls
writeback when needed. That part works.

> A pool of free pages which is reserved for routines that are doing
> page cleaning would probably also be a good idea. Maybe that's just
> retrying with GFP_ATOMIC if a normal allocation fails, or maybe we
> need our own special pool, or maybe we need to dynamically resize the
> GFP_ATOMIC pool based on how many subsystems might need to use it....

We have a smallish reserve, accessible with PF_MEMALLOC, but its use is
not regulated nor bounded; it just mostly works well enough.
David Rientjes
2010-Aug-25 23:05 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, Christoph Lameter wrote:
> If the higher order fails in slub then an order 0 alloc is attempted
> without __GFP_NORETRY. In both cases the nofail behavior of the page
> allocator determines the outcome.

Right, I thought you said the slab layer passes __GFP_NOFAIL when there
are no objects available.
David Rientjes
2010-Aug-25 23:11 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, Peter Zijlstra wrote:

> > There's still no hard guarantee that the memory will be allocatable
> > (GFP_KERNEL, then compaction, then GFP_ATOMIC could all still fail), but I
> > don't see how continuously looping the page allocator is possibly supposed
> > to help in these situations.
>
> Why do you think I'm a proponent of that behaviour?
>

I don't, I was agreeing with what you're saying :)

> I've been arguing that the existence of GFP_NOFAIL is the bug, and I
> started the whole discussion because your patchset didn't outline the
> purpose of its existence, it merely changes __GFP_NOFAIL usage into
> $foo_nofail() functions, which on its own is a rather daft change.
>

I originally pushed these to the callers, but Andrew thought it would be
better so that we could audit the existing users that have no fallback
behavior, even though they could go implement it on their own (like
getblk() which actually does try _some_ memory freeing). It eliminates
the flag from the page allocator and saves branches in the slowpath.

We don't need this logic in the allocator itself, it can exist at a
higher level and, with deprecation, will hopefully be incentive enough
for those subsystems to fix the issue.

I'll repropose the patchset with __deprecated as you suggested. Thanks!
Dave Chinner
2010-Aug-26 00:09 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, Aug 25, 2010 at 03:35:42PM +0200, Peter Zijlstra wrote:
> On Wed, 2010-08-25 at 23:24 +1000, Dave Chinner wrote:
> >
> > That is, the guarantee that we will always make progress simply does
> > not exist in filesystems, so a mempool-like concept seems to me to
> > be doomed from the start....
>
> While I appreciate that it might be somewhat (a lot) harder for a
> filesystem to provide that guarantee, I'd be deeply worried about your
> claim that it's impossible.

I didn't say impossible, just that there's no way we can always
guarantee forward progress with a specific, bounded pool of memory.

Sure, we know the worst case amount of log space needed for each
transaction (i.e. how many pages will be dirtied), but that does not
take into account all the blocks that need to be read to make those
modifications, the memory needed for stuff like btree cursors, log
tickets, transaction commit vectors, btree blocks needed to do the
searches, etc. A typical transaction reservation on a 4k block
filesystem is between 200-400k (it's worst case), and if you add in all
the other allocations that might be required, we're on the order of
requiring megabytes of RAM to guarantee a single transaction will
succeed in low memory conditions.

The exact requirement is very difficult to quantify algorithmically,
but for a single transaction it should be possible. However, consider
the case of running a thousand concurrent transactions and in the
middle of that the system runs out of memory. All the transactions need
memory allocation to succeed, some are blocked waiting for resources
held in other transactions, etc.

Firstly, how do you stop all the transactions from making further
progress to serialise access to the low memory pool? Secondly, how do
you select which transaction you want to use the low memory pool? What
do you do if the selected transaction then blocks on a resource held by
another transaction (which you can't know ahead of time)? Do you switch
to another thread and hope the pool doesn't run dry? What do you do
when (not if) the memory pool runs dry?

I'm sure this could be done, but it's a lot of difficult, unrewarding
work that greatly increases code complexity, touches a massive amount
of the filesystem code base, exponentially increases the test matrix,
is likely to have significant operational overhead, and even then
there's no guarantee that we've got it right. That doesn't sound like a
good solution to me.

> It would render a system without swap very prone to deadlocks. Even with
> the very tight dirty page accounting we currently have you can fill all
> your memory with anonymous pages, at which point there's nothing free
> and you require writeout of dirty pages to succeed.

Then don't allow anonymous pages to fill all of memory when there is no
swap available - i.e. keep a larger pool of free memory when there is
no swap available. That's a much simpler solution than turning all the
filesystems upside down to try to make them not need allocation....

Cheers,

Dave.
--
Dave Chinner
david@fromorbit.com
Ted Ts'o
2010-Aug-26 00:19 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, Aug 25, 2010 at 04:11:38PM -0700, David Rientjes wrote:
>
> I'll repropose the patchset with __deprecated as you suggested. Thanks!

And what Dave and I are saying is that we'll either need to do our own
loop to avoid the deprecation warning, or the deprecated function will
probably be used forever...

					- Ted
David Rientjes
2010-Aug-26 00:30 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, Ted Ts'o wrote:

> > I'll repropose the patchset with __deprecated as you suggested. Thanks!
>
> And what Dave and I are saying is that we'll either need to do our own
> loop to avoid the deprecation warning, or the deprecated function will
> probably be used forever...
>

We certainly hope that nobody will reimplement the same function without
the __deprecated warning, especially for order < PAGE_ALLOC_COSTLY_ORDER
where there's no looping at a higher level. So perhaps the best
alternative is to implement the same _nofail() functions but do a
WARN_ON(get_order(size) > PAGE_ALLOC_COSTLY_ORDER) instead?

I think it's really sad that the caller can't know what the upper bounds
of its memory requirement are ahead of time or at least be able to
implement a memory freeing function when kmalloc() returns NULL.
Christoph Lameter
2010-Aug-26 01:30 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, David Rientjes wrote:

> > If the higher order fails in slub then an order 0 alloc is attempted
> > without __GFP_NORETRY. In both cases the nofail behavior of the page
> > allocator determines the outcome.
> >
>
> Right, I thought you said the slab layer passes __GFP_NOFAIL when there
> are no objects available.

Yes, the slab layer calls the page allocator when there are no objects
available and passes the __GFP_NOFAIL that the user may have set in the
call to the page allocator.

Why then add new functions that do the same?
Ted Ts'o
2010-Aug-26 01:48 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, Aug 25, 2010 at 05:30:42PM -0700, David Rientjes wrote:
>
> We certainly hope that nobody will reimplement the same function without
> the __deprecated warning, especially for order < PAGE_ALLOC_COSTLY_ORDER
> where there's no looping at a higher level. So perhaps the best
> alternative is to implement the same _nofail() functions but do a
> WARN_ON(get_order(size) > PAGE_ALLOC_COSTLY_ORDER) instead?

Yeah, that sounds better.

> I think it's really sad that the caller can't know what the upper bounds
> of its memory requirement are ahead of time or at least be able to
> implement a memory freeing function when kmalloc() returns NULL.

Oh, we can determine an upper bound. You might just not like it.
Actually ext3/ext4 shouldn't be as bad as XFS, which Dave estimated to
be around 400k for a transaction. My guess is that the worst case for
ext3/ext4 is probably around 256k or so; like XFS, most of the time, it
would be a lot less. (At least, if data != journalled; if we are doing
data journalling and every single data block begins with 0xc03b3998U,
we'll need to allocate a 4k page for every single data block written.)
We could dynamically calculate an upper bound if we had to. Of course,
if ext3/ext4 is attached to a network block device, then it could get a
lot worse than 256k, of course.

					- Ted
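The 0xc03b3998U value Ted mentions is the jbd journal magic: a data block
that happens to begin with it must be escaped, and therefore copied, before
it can be written to the journal, costing an extra page allocation at
commit time. A rough sketch of the check, assuming the usual
JFS_MAGIC_NUMBER definition from jbd rather than quoting the exact
journal_write_metadata_buffer() code:

    #define JFS_MAGIC_NUMBER 0xc03b3998U    /* as defined in include/linux/jbd.h */

    /*
     * If the first word of the buffer collides with the journal magic, the
     * block must be escaped: copy it out, zero the first word in the copy,
     * and record the escape so it can be undone during recovery.
     */
    static int buffer_needs_escape(const void *data)
    {
            return *((const __be32 *)data) == cpu_to_be32(JFS_MAGIC_NUMBER);
    }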
David Rientjes
2010-Aug-26 03:09 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, Ted Ts'o wrote:

> > We certainly hope that nobody will reimplement the same function without
> > the __deprecated warning, especially for order < PAGE_ALLOC_COSTLY_ORDER
> > where there's no looping at a higher level. So perhaps the best
> > alternative is to implement the same _nofail() functions but do a
> > WARN_ON(get_order(size) > PAGE_ALLOC_COSTLY_ORDER) instead?
>
> Yeah, that sounds better.
>

Ok, and we'll make it a WARN_ON_ONCE() to be nice to the kernel log.
Although the current patchset does this with WARN_ON_ONCE(1, ...)
instead, this serves to ensure that we aren't dependent on the page
allocator's implementation to always loop for order <
PAGE_ALLOC_COSTLY_ORDER, in which case the loop in the _nofail()
functions would actually do something.

> > I think it's really sad that the caller can't know what the upper bounds
> > of its memory requirement are ahead of time or at least be able to
> > implement a memory freeing function when kmalloc() returns NULL.
>
> Oh, we can determine an upper bound. You might just not like it.
> Actually ext3/ext4 shouldn't be as bad as XFS, which Dave estimated to
> be around 400k for a transaction. My guess is that the worst case for
> ext3/ext4 is probably around 256k or so; like XFS, most of the time,
> it would be a lot less. (At least, if data != journalled; if we are
> doing data journalling and every single data block begins with
> 0xc03b3998U, we'll need to allocate a 4k page for every single data
> block written.) We could dynamically calculate an upper bound if we
> had to. Of course, if ext3/ext4 is attached to a network block
> device, then it could get a lot worse than 256k, of course.
>

On my 8GB machine, /proc/zoneinfo says the min watermark for ZONE_NORMAL
is 5086 pages, or ~20MB. GFP_ATOMIC would allow access to ~12MB of that,
so perhaps we should consider this an acceptable abuse of GFP_ATOMIC as
a fallback behavior when GFP_NOFS or GFP_NOIO fails?
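The ~12MB figure follows from how the allocator trims the min watermark for
atomic allocations, as I understand the slowpath's watermark handling
(ALLOC_HIGH halves it, ALLOC_HARDER removes a further quarter). A minimal
runnable sketch of that arithmetic, using the 5086-page figure above and
assuming 4k pages:

    #include <stdio.h>

    int main(void)
    {
            long page_size = 4096;
            long min_pages = 5086;              /* min watermark from /proc/zoneinfo */
            long min = min_pages * page_size;   /* ~20MB */

            long atomic_min = min;
            atomic_min -= atomic_min / 2;       /* ALLOC_HIGH: __GFP_HIGH is set in GFP_ATOMIC */
            atomic_min -= atomic_min / 4;       /* ALLOC_HARDER: atomic (non-waiting) allocations */

            /* Memory below the normal watermark that GFP_ATOMIC may still reach. */
            printf("reserve usable by GFP_ATOMIC: ~%ld MB of ~%ld MB\n",
                   (min - atomic_min) >> 20, min >> 20);
            return 0;
    }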
David Rientjes
2010-Aug-26 03:12 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, Christoph Lameter wrote:

> > Right, I thought you said the slab layer passes __GFP_NOFAIL when there
> > are no objects available.
>
> Yes, the slab layer calls the page allocator when there are no objects
> available and passes the __GFP_NOFAIL that the user may have set in the
> call to the page allocator.
>
> Why then add new functions that do the same?
>

Because we can remove the flag, remove branches from the page allocator
slowpath, and none of these allocations actually are dependent on
__GFP_NOFAIL since they are all under PAGE_ALLOC_COSTLY_ORDER.
Dave Chinner
2010-Aug-26 06:38 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, Aug 25, 2010 at 09:48:47PM -0400, Ted Ts'o wrote:
> On Wed, Aug 25, 2010 at 05:30:42PM -0700, David Rientjes wrote:
> >
> > We certainly hope that nobody will reimplement the same function without
> > the __deprecated warning, especially for order < PAGE_ALLOC_COSTLY_ORDER
> > where there's no looping at a higher level. So perhaps the best
> > alternative is to implement the same _nofail() functions but do a
> > WARN_ON(get_order(size) > PAGE_ALLOC_COSTLY_ORDER) instead?
>
> Yeah, that sounds better.
>
> > I think it's really sad that the caller can't know what the upper bounds
> > of its memory requirement are ahead of time or at least be able to
> > implement a memory freeing function when kmalloc() returns NULL.
>
> Oh, we can determine an upper bound. You might just not like it.
> Actually ext3/ext4 shouldn't be as bad as XFS, which Dave estimated to
> be around 400k for a transaction.

For a 4k block size filesystem. If I use 64k block size directories
(which XFS can even on 4k page size machines), the maximum transaction
reservation goes up to around 3MB, and that's just for blocks being
_modified_. It's not the limit on the amount of memory that may need to
be allocated during a transaction....

> My guess is that the worst case for
> ext3/ext4 is probably around 256k or so; like XFS, most of the time,
> it would be a lot less.

Right, it usually is a lot less, but one of the big problems is that
during low memory situations memory reclaim of the metadata page cache
actually causes _more_ memory allocation during transactions than
otherwise would occur.......

Cheers,

Dave.
--
Dave Chinner
david@fromorbit.com
Dave Chinner
2010-Aug-26 07:06 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, Aug 25, 2010 at 08:09:21PM -0700, David Rientjes wrote:
> On Wed, 25 Aug 2010, Ted Ts'o wrote:
> > > I think it's really sad that the caller can't know what the upper bounds
> > > of its memory requirement are ahead of time or at least be able to
> > > implement a memory freeing function when kmalloc() returns NULL.
> >
> > Oh, we can determine an upper bound. You might just not like it.
> > Actually ext3/ext4 shouldn't be as bad as XFS, which Dave estimated to
> > be around 400k for a transaction. My guess is that the worst case for
> > ext3/ext4 is probably around 256k or so; like XFS, most of the time,
> > it would be a lot less. (At least, if data != journalled; if we are
> > doing data journalling and every single data block begins with
> > 0xc03b3998U, we'll need to allocate a 4k page for every single data
> > block written.) We could dynamically calculate an upper bound if we
> > had to. Of course, if ext3/ext4 is attached to a network block
> > device, then it could get a lot worse than 256k, of course.
> >
>
> On my 8GB machine, /proc/zoneinfo says the min watermark for ZONE_NORMAL
> is 5086 pages, or ~20MB. GFP_ATOMIC would allow access to ~12MB of that,
> so perhaps we should consider this an acceptable abuse of GFP_ATOMIC as
> a fallback behavior when GFP_NOFS or GFP_NOIO fails?

It would take a handful of concurrent transactions in XFS with worst
case memory allocation requirements to exhaust that pool, and then we
really would be in trouble. Alternatively, it would take a few
allocations from each of a couple of thousand concurrent transactions
to get to the same point. Bounded memory pools only work when
serialised access to the pool can be enforced and there are no
dependencies on other operations in progress for completion of the work
and freeing of the memory. This is where it becomes exceedingly
difficult to guarantee progress.

One of the ideas that has floated around (I think Mel Gorman came up
with it first) was that if hardening the filesystem is so difficult,
why not just harden a single path via a single thread? e.g. we allow
the bdi flusher thread to have a separate reserve pool of free pages,
and when memory allocations start to fail, then that thread can dip
into its pool to complete the writeback of the dirty pages being
flushed.

When a filesystem attaches to a bdi, it can specify the size of the
reserve pool it needs. This can be easily tested for during allocation
(say a PF_ flag) and switched to the reserve pool as necessary. Because
it is per-thread, access to the pool is guaranteed to be serialised.
Memory reclaim can then refill these pools before putting pages on
freelists.

This could give us a mechanism for ensuring that allocations succeed in
the ->writepage path without needing to care about filesystem
implementation details. And in the case of ext3/4, a pool could be
attached to the jbd thread as well so that it never starves of memory
when commits are required...

So, rather than turning filesystems upside down, maybe we should
revisit per-thread reserve pools for threads that are tasked with
cleaning pages for the VM?

Cheers,

Dave.
--
Dave Chinner
david@fromorbit.com
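To make the idea concrete, here is a hypothetical sketch of what such a
per-thread reserve might look like; PF_BDI_RESERVE, struct task_reserve,
current_reserve() and take_from_reserve() are invented names for
illustration only, not an existing kernel API:

    /* Hypothetical per-thread page reserve for writeback threads. */
    struct task_reserve {
            struct list_head pages;     /* free pages set aside for this thread */
            unsigned int nr_pages;      /* current size of the reserve */
            unsigned int min_pages;     /* size requested by the filesystem's bdi */
    };

    static struct page *alloc_page_writeback(gfp_t gfp_mask)
    {
            struct page *page = alloc_page(gfp_mask | __GFP_NOWARN);

            if (page)
                    return page;

            /* Only the designated flusher thread may dip into its reserve. */
            if (current->flags & PF_BDI_RESERVE)
                    page = take_from_reserve(current_reserve());

            return page;    /* reclaim refills the reserve before the freelists */
    }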
Peter Zijlstra
2010-Aug-26 08:29 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 2010-08-25 at 20:09 -0700, David Rientjes wrote:
> > Oh, we can determine an upper bound. You might just not like it.
> > Actually ext3/ext4 shouldn't be as bad as XFS, which Dave estimated to
> > be around 400k for a transaction. My guess is that the worst case for
> > ext3/ext4 is probably around 256k or so; like XFS, most of the time,
> > it would be a lot less. (At least, if data != journalled; if we are
> > doing data journalling and every single data block begins with
> > 0xc03b3998U, we'll need to allocate a 4k page for every single data
> > block written.) We could dynamically calculate an upper bound if we
> > had to. Of course, if ext3/ext4 is attached to a network block
> > device, then it could get a lot worse than 256k, of course.
>
> On my 8GB machine, /proc/zoneinfo says the min watermark for ZONE_NORMAL
> is 5086 pages, or ~20MB. GFP_ATOMIC would allow access to ~12MB of that,
> so perhaps we should consider this an acceptable abuse of GFP_ATOMIC as
> a fallback behavior when GFP_NOFS or GFP_NOIO fails?

Agreed with the fact that 400k isn't much to worry about.

Not agreed with the GFP_ATOMIC stmt. Direct reclaim already has
PF_MEMALLOC, but then we also need a concurrency limit on that,
otherwise you can still easily blow through your reserves by having 100
concurrent users of it, resulting in an upper bound of 40000k instead,
which will be too much.

There were patches to limit the direct reclaim contexts, not sure they
ever got anywhere.. It is something to consider in the re-design of the
whole direct-reclaim/writeback paths though..
Christoph Lameter
2010-Aug-26 14:16 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, David Rientjes wrote:

> Because we can remove the flag, remove branches from the page allocator
> slowpath, and none of these allocations actually are dependent on
> __GFP_NOFAIL since they are all under PAGE_ALLOC_COSTLY_ORDER.

Then we can simply remove __GFP_NOFAIL? Functions are only needed for
higher order allocs that can fail?
David Rientjes
2010-Aug-26 22:31 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Thu, 26 Aug 2010, Christoph Lameter wrote:

> > Because we can remove the flag, remove branches from the page allocator
> > slowpath, and none of these allocations actually are dependent on
> > __GFP_NOFAIL since they are all under PAGE_ALLOC_COSTLY_ORDER.
>
> Then we can simply remove __GFP_NOFAIL? Functions are only needed for
> higher order allocs that can fail?
>

Yes, that's the intent. We'd like to add the
WARN_ON_ONCE(get_order(size) >= PAGE_ALLOC_COSTLY_ORDER) warning,
though, so we're ensured that redefinition of that #define doesn't cause
allocations that lack appropriate error handling to fail, and that
future callers don't use higher order allocs. The _nofail() functions
help that and do some due diligence in ensuring that we aren't changing
gfp flags based only on the current page allocator implementation, which
may later change with very specialized corner cases.
David Rientjes
2010-Sep-02 01:02 UTC
[patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
Add kmalloc_nofail(), kcalloc_nofail(), and kzalloc_nofail(). These functions are equivalent to kmalloc(), kcalloc(), and kzalloc(), respectively, except that they will never return NULL and instead loop forever trying to allocate memory. If the first allocation attempt fails because the page allocator doesn''t implicitly loop, a warning will be emitted, including a call trace. Subsequent failures will suppress this warning. These were added as helper functions for documentation and auditability. No future callers should be added. Signed-off-by: David Rientjes <rientjes@google.com> --- drivers/md/dm-region-hash.c | 2 +- fs/btrfs/inode.c | 2 +- fs/gfs2/log.c | 2 +- fs/gfs2/rgrp.c | 18 +++++---------- fs/jbd/transaction.c | 11 ++------ fs/reiserfs/journal.c | 3 +- include/linux/slab.h | 51 +++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 64 insertions(+), 25 deletions(-) diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -290,7 +290,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region) nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); if (unlikely(!nreg)) - nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL); + nreg = kmalloc_nofail(sizeof(*nreg), GFP_NOIO); nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? DM_RH_CLEAN : DM_RH_NOSYNC; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1967,7 +1967,7 @@ void btrfs_add_delayed_iput(struct inode *inode) if (atomic_add_unless(&inode->i_count, -1, 1)) return; - delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); + delayed = kmalloc_nofail(sizeof(*delayed), GFP_NOFS); delayed->inode = inode; spin_lock(&fs_info->delayed_iput_lock); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -709,7 +709,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) } trace_gfs2_log_flush(sdp, 1); - ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL); + ai = kzalloc_nofail(sizeof(struct gfs2_ail), GFP_NOFS); INIT_LIST_HEAD(&ai->ai_ail1_list); INIT_LIST_HEAD(&ai->ai_ail2_list); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1440,8 +1440,8 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, rgrp_blk++; if (!bi->bi_clone) { - bi->bi_clone = kmalloc(bi->bi_bh->b_size, - GFP_NOFS | __GFP_NOFAIL); + bi->bi_clone = kmalloc_nofail(bi->bi_bh->b_size, + GFP_NOFS); memcpy(bi->bi_clone + bi->bi_offset, bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); @@ -1759,9 +1759,6 @@ fail: * @block: the block * * Figure out what RG a block belongs to and add that RG to the list - * - * FIXME: Don''t use NOFAIL - * */ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, @@ -1789,8 +1786,8 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, if (rlist->rl_rgrps == rlist->rl_space) { new_space = rlist->rl_space + 10; - tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *), - GFP_NOFS | __GFP_NOFAIL); + tmp = kcalloc_nofail(new_space, sizeof(struct gfs2_rgrpd *), + GFP_NOFS); if (rlist->rl_rgd) { memcpy(tmp, rlist->rl_rgd, @@ -1811,17 +1808,14 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, * @rlist: the list of resource groups * @state: the lock state to acquire the RG lock in * @flags: the modifier flags for the holder structures - * - * FIXME: Don''t use NOFAIL - * */ void gfs2_rlist_alloc(struct 
gfs2_rgrp_list *rlist, unsigned int state) { unsigned int x; - rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder), - GFP_NOFS | __GFP_NOFAIL); + rlist->rl_ghs = kcalloc_nofail(rlist->rl_rgrps, + sizeof(struct gfs2_holder), GFP_NOFS); for (x = 0; x < rlist->rl_rgrps; x++) gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, state, 0, diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -98,14 +98,9 @@ static int start_this_handle(journal_t *journal, handle_t *handle) } alloc_transaction: - if (!journal->j_running_transaction) { - new_transaction = kzalloc(sizeof(*new_transaction), - GFP_NOFS|__GFP_NOFAIL); - if (!new_transaction) { - ret = -ENOMEM; - goto out; - } - } + if (!journal->j_running_transaction) + new_transaction = kzalloc_nofail(sizeof(*new_transaction), + GFP_NOFS); jbd_debug(3, "New handle %p going live.\n", handle); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2593,8 +2593,7 @@ static int journal_read(struct super_block *sb) static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) { struct reiserfs_journal_list *jl; - jl = kzalloc(sizeof(struct reiserfs_journal_list), - GFP_NOFS | __GFP_NOFAIL); + jl = kzalloc_nofail(sizeof(struct reiserfs_journal_list), GFP_NOFS); INIT_LIST_HEAD(&jl->j_list); INIT_LIST_HEAD(&jl->j_working_list); INIT_LIST_HEAD(&jl->j_tail_bh_list); diff --git a/include/linux/slab.h b/include/linux/slab.h --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -334,6 +334,57 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node) return kmalloc_node(size, flags | __GFP_ZERO, node); } +/** + * kmalloc_nofail - infinitely loop until kmalloc() succeeds. + * @size: how many bytes of memory are required. + * @flags: the type of memory to allocate (see kmalloc). + * + * NOTE: no new callers of this function should be implemented! + * All memory allocations should be failable whenever possible. + */ +static inline void *kmalloc_nofail(size_t size, gfp_t flags) +{ + void *ret; + + for (;;) { + ret = kmalloc(size, flags); + if (ret) + return ret; + WARN_ON_ONCE(get_order(size) > PAGE_ALLOC_COSTLY_ORDER); + } +} + +/** + * kcalloc_nofail - infinitely loop until kcalloc() succeeds. + * @n: number of elements. + * @size: element size. + * @flags: the type of memory to allocate (see kcalloc). + * + * NOTE: no new callers of this function should be implemented! + * All memory allocations should be failable whenever possible. + */ +static inline void *kcalloc_nofail(size_t n, size_t size, gfp_t flags) +{ + void *ret; + + for (;;) { + ret = kcalloc(n, size, flags); + if (ret) + return ret; + WARN_ON_ONCE(get_order(size) > PAGE_ALLOC_COSTLY_ORDER); + } +} + +/** + * kzalloc_nofail - infinitely loop until kzalloc() succeeds. + * @size: how many bytes of memory are required. + * @flags: the type of memory to allocate (see kzalloc). + */ +static inline void *kzalloc_nofail(size_t size, gfp_t flags) +{ + return kmalloc_nofail(size, flags | __GFP_ZERO); +} + void __init kmem_cache_init_late(void); #endif /* _LINUX_SLAB_H */
David Rientjes
2010-Sep-02 01:03 UTC
[patch v2 4/5] btrfs: add nofail variant of set_extent_dirty
Add set_extent_dirty_nofail(). This function is equivalent to set_extent_dirty(), except that it will never fail because of allocation failure and instead loop forever trying to allocate memory. If the first allocation attempt fails because the page allocator doesn''t implicitly loop, a warning will be emitted, including a call trace. Subsequent failures will suppress this warning. This was added as a helper function for documentation and auditability. No future callers should be added. Signed-off-by: David Rientjes <rientjes@google.com> --- fs/btrfs/extent-tree.c | 8 ++++---- fs/btrfs/extent_io.c | 18 ++++++++++++++++++ fs/btrfs/extent_io.h | 2 ++ 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3831,9 +3831,9 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - set_extent_dirty(info->pinned_extents, + set_extent_dirty_nofail(info->pinned_extents, bytenr, bytenr + num_bytes - 1, - GFP_NOFS | __GFP_NOFAIL); + GFP_NOFS); } btrfs_put_block_group(cache); total -= num_bytes; @@ -3872,8 +3872,8 @@ static int pin_down_extent(struct btrfs_root *root, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - set_extent_dirty(root->fs_info->pinned_extents, bytenr, - bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); + set_extent_dirty_nofail(root->fs_info->pinned_extents, bytenr, + bytenr + num_bytes - 1, GFP_NOFS); return 0; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -940,6 +940,24 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, NULL, mask); } +/* + * NOTE: no new callers of this function should be implemented! + * All memory allocations should be failable whenever possible. + */ +int set_extent_dirty_nofail(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + int ret; + + for (;;) { + ret = set_extent_dirty(tree, start, end, mask); + if (ret != -ENOMEM) + return ret; + WARN_ON_ONCE(get_order(sizeof(struct extent_state)) > + PAGE_ALLOC_COSTLY_ORDER); + } +} + int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -197,6 +197,8 @@ int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int set_extent_dirty_nofail(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Jiri Slaby
2010-Sep-02 07:59 UTC
Re: [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On 09/02/2010 03:02 AM, David Rientjes wrote:
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -334,6 +334,57 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
>  	return kmalloc_node(size, flags | __GFP_ZERO, node);
>  }
>
> +/**
> + * kmalloc_nofail - infinitely loop until kmalloc() succeeds.
> + * @size: how many bytes of memory are required.
> + * @flags: the type of memory to allocate (see kmalloc).
> + *
> + * NOTE: no new callers of this function should be implemented!
> + * All memory allocations should be failable whenever possible.
> + */
> +static inline void *kmalloc_nofail(size_t size, gfp_t flags)
> +{
> +	void *ret;
> +
> +	for (;;) {
> +		ret = kmalloc(size, flags);
> +		if (ret)
> +			return ret;
> +		WARN_ON_ONCE(get_order(size) > PAGE_ALLOC_COSTLY_ORDER);

This doesn't work as you expect. kmalloc will warn every time it fails.
__GFP_NOFAIL used to disable the warning. Actually what's wrong with
__GFP_NOFAIL? I cannot find a reason in the changelogs why the patches
are needed.

> +	}

--
js
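The warning Jiri refers to is the page allocator's own failure report,
which fires on any failed allocation unless __GFP_NOWARN is passed. A
paraphrase of that failure path from memory of the 2.6.3x slowpath, wrapped
in a helper function purely for presentation:

    /* Paraphrase of the failure-path warning in the page allocator slowpath. */
    static struct page *warn_alloc_failed_and_give_up(gfp_t gfp_mask, int order)
    {
            if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
                    printk(KERN_WARNING "%s: page allocation failure."
                            " order:%d, mode:0x%x\n",
                            current->comm, order, gfp_mask);
                    dump_stack();
                    show_mem();
            }
            return NULL;
    }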
Jan Kara
2010-Sep-02 14:51 UTC
Re: [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Thu 02-09-10 09:59:13, Jiri Slaby wrote:

> On 09/02/2010 03:02 AM, David Rientjes wrote:
> > --- a/include/linux/slab.h
> > +++ b/include/linux/slab.h
> > @@ -334,6 +334,57 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
> >  	return kmalloc_node(size, flags | __GFP_ZERO, node);
> >  }
> >
> > +/**
> > + * kmalloc_nofail - infinitely loop until kmalloc() succeeds.
> > + * @size: how many bytes of memory are required.
> > + * @flags: the type of memory to allocate (see kmalloc).
> > + *
> > + * NOTE: no new callers of this function should be implemented!
> > + * All memory allocations should be failable whenever possible.
> > + */
> > +static inline void *kmalloc_nofail(size_t size, gfp_t flags)
> > +{
> > +	void *ret;
> > +
> > +	for (;;) {
> > +		ret = kmalloc(size, flags);
> > +		if (ret)
> > +			return ret;
> > +		WARN_ON_ONCE(get_order(size) > PAGE_ALLOC_COSTLY_ORDER);
>
> This doesn't work as you expect. kmalloc will warn every time it fails.
> __GFP_NOFAIL used to disable the warning. Actually what's wrong with
> __GFP_NOFAIL? I cannot find a reason in the changelogs why the patches
> are needed.

David should probably add the reasoning to the changelogs so that he
doesn't have to explain again and again ;). But if I understood it
correctly, the concern is that the looping checks slightly impact the
fast path of the callers which do not need it. Generally, looping for a
long time inside the allocator isn't a nice thing either, but some
callers aren't able to do better for now, so the patch is imperfect in
this sense...

								Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
Neil Brown
2010-Sep-02 21:15 UTC
Re: [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Thu, 2 Sep 2010 16:51:41 +0200 Jan Kara <jack@suse.cz> wrote:

> On Thu 02-09-10 09:59:13, Jiri Slaby wrote:
> > On 09/02/2010 03:02 AM, David Rientjes wrote:
> > > --- a/include/linux/slab.h
> > > +++ b/include/linux/slab.h
> > > @@ -334,6 +334,57 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
> > >  	return kmalloc_node(size, flags | __GFP_ZERO, node);
> > >  }
> > >
> > > +/**
> > > + * kmalloc_nofail - infinitely loop until kmalloc() succeeds.
> > > + * @size: how many bytes of memory are required.
> > > + * @flags: the type of memory to allocate (see kmalloc).
> > > + *
> > > + * NOTE: no new callers of this function should be implemented!
> > > + * All memory allocations should be failable whenever possible.
> > > + */
> > > +static inline void *kmalloc_nofail(size_t size, gfp_t flags)
> > > +{
> > > +	void *ret;
> > > +
> > > +	for (;;) {
> > > +		ret = kmalloc(size, flags);
> > > +		if (ret)
> > > +			return ret;
> > > +		WARN_ON_ONCE(get_order(size) > PAGE_ALLOC_COSTLY_ORDER);
> >
> > This doesn't work as you expect. kmalloc will warn every time it fails.
> > __GFP_NOFAIL used to disable the warning. Actually what's wrong with
> > __GFP_NOFAIL? I cannot find a reason in the changelogs why the patches
> > are needed.
>
> David should probably add the reasoning to the changelogs so that he
> doesn't have to explain again and again ;). But if I understood it
> correctly, the concern is that the looping checks slightly impact the
> fast path of the callers which do not need it. Generally, looping for a
> long time inside the allocator isn't a nice thing either, but some
> callers aren't able to do better for now, so the patch is imperfect in
> this sense...

I'm actually a bit confused about this too.

I thought David said he was removing a branch on the *slow* path - which
makes sense as you wouldn't even test NOFAIL until you had a failure.
Why are branches on the slow-path an issue??

This is an important question to me because I still hope to see the
swap-over-nfs patches merged eventually and they add a branch on the
slow path (if I remember correctly).

NeilBrown
David Rientjes
2010-Sep-05 23:01 UTC
Re: [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Thu, 2 Sep 2010, Jiri Slaby wrote:

> > @@ -334,6 +334,57 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
> >  	return kmalloc_node(size, flags | __GFP_ZERO, node);
> >  }
> >
> > +/**
> > + * kmalloc_nofail - infinitely loop until kmalloc() succeeds.
> > + * @size: how many bytes of memory are required.
> > + * @flags: the type of memory to allocate (see kmalloc).
> > + *
> > + * NOTE: no new callers of this function should be implemented!
> > + * All memory allocations should be failable whenever possible.
> > + */
> > +static inline void *kmalloc_nofail(size_t size, gfp_t flags)
> > +{
> > +	void *ret;
> > +
> > +	for (;;) {
> > +		ret = kmalloc(size, flags);
> > +		if (ret)
> > +			return ret;
> > +		WARN_ON_ONCE(get_order(size) > PAGE_ALLOC_COSTLY_ORDER);
>
> This doesn't work as you expect. kmalloc will warn every time it fails.

It actually does work as I expect since the WARN_ON_ONCE() never even
gets triggered here for any of kmalloc_nofail()'s callers since they all
have sizes small enough that the conditional is never true. The page
allocator implicitly loops forever if the order <=
PAGE_ALLOC_COSTLY_ORDER, so this warning is only emitted if the page
allocator's implementation of that behavior, or the #define itself,
changes. That's intended, we want to know the consequences of our
change.

> __GFP_NOFAIL used to disable the warning.

No, it didn't, it was unnecessary for all of the kmalloc_nofail()
callers since they already implicitly loop. No warning is emitted
(these are all GFP_NOFS or GFP_NOIO users, there are no order-4 or
larger GFP_ATOMIC allocators using this interface).

> Actually what's wrong with
> __GFP_NOFAIL? I cannot find a reason in the changelogs why the patches
> are needed.
>

A couple of reasons:

 - it's unnecessary in the page allocator, it can be implemented at a
   higher level, which allows us to remove these branches from the
   slowpath,

 - it's mostly unnecessary since all users have orders that will
   implicitly loop forever anyway (although __GFP_NOFAIL does guarantee
   that it won't fail even if we change the looping behavior internally,
   these warnings help to isolate cases where it's needed), and
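The implicit looping David describes comes from the retry decision in the
slowpath; a simplified rendering of should_alloc_retry() as it looked in
that era (the exact comments and ordering in mainline may differ):

    static inline int should_alloc_retry(gfp_t gfp_mask, unsigned int order,
                                         unsigned long pages_reclaimed)
    {
            /* Callers asking not to retry get the failure back immediately. */
            if (gfp_mask & __GFP_NORETRY)
                    return 0;

            /*
             * Small orders loop forever in this implementation, which is why
             * __GFP_NOFAIL is redundant for them today -- but only today.
             */
            if (order <= PAGE_ALLOC_COSTLY_ORDER)
                    return 1;

            /* Costly orders retry only while reclaim keeps making progress... */
            if ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))
                    return 1;

            /* ...unless the caller explicitly demanded that we never fail. */
            if (gfp_mask & __GFP_NOFAIL)
                    return 1;

            return 0;
    }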
David Rientjes
2010-Sep-05 23:03 UTC
Re: [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Fri, 3 Sep 2010, Neil Brown wrote:

> I'm actually a bit confused about this too.
> I thought David said he was removing a branch on the *slow* path - which
> makes sense as you wouldn't even test NOFAIL until you had a failure.
> Why are branches on the slow-path an issue??

They aren't necessarily an issue in the performance sense; this is a
cleanup series, since all callers converted to these new functions (and
the eventual removal of __GFP_NOFAIL entirely) are using the bit
unnecessarily: they all have orders that implicitly loop in the page
allocator forever already, with or without the flag.
David Rientjes
2010-Sep-06 09:05 UTC
Re: [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 1 Sep 2010, David Rientjes wrote:

> Add kmalloc_nofail(), kcalloc_nofail(), and kzalloc_nofail(). These
> functions are equivalent to kmalloc(), kcalloc(), and kzalloc(),
> respectively, except that they will never return NULL and instead loop
> forever trying to allocate memory.
>
> If the first allocation attempt fails because the page allocator doesn't
> implicitly loop, a warning will be emitted, including a call trace.
> Subsequent failures will suppress this warning.
>
> These were added as helper functions for documentation and auditability.
> No future callers should be added.
>

Are there any objections to merging this series through -mm with the
exception of the fifth patch for ntfs? That particular patch needs to
have its WARN_ON_ONCE() condition rewritten since it falls back to
vmalloc for high order allocs.

Thanks.