David Rientjes
2010-Aug-24 10:50 UTC
[patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
Add kmalloc_nofail(), kcalloc_nofail(), and kzalloc_nofail().  These
functions are equivalent to kmalloc(), kcalloc(), and kzalloc(),
respectively, except that they will never return NULL and instead loop
forever trying to allocate memory.

If the first allocation attempt fails, a warning will be emitted,
including a call trace.  Subsequent failures will suppress this warning.

These were added as helper functions for documentation and auditability.
No future callers should be added.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 drivers/md/dm-region-hash.c |    2 +-
 fs/btrfs/inode.c            |    2 +-
 fs/gfs2/log.c               |    2 +-
 fs/gfs2/rgrp.c              |   18 ++++---------
 fs/jbd/transaction.c        |   11 ++------
 fs/reiserfs/journal.c       |    3 +-
 include/linux/slab.h        |   55 +++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 68 insertions(+), 25 deletions(-)

diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -290,7 +290,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
 
 	nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
 	if (unlikely(!nreg))
-		nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL);
+		nreg = kmalloc_nofail(sizeof(*nreg), GFP_NOIO);
 
 	nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
 		      DM_RH_CLEAN : DM_RH_NOSYNC;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1967,7 +1967,7 @@ void btrfs_add_delayed_iput(struct inode *inode)
 	if (atomic_add_unless(&inode->i_count, -1, 1))
 		return;
 
-	delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
+	delayed = kmalloc_nofail(sizeof(*delayed), GFP_NOFS);
 	delayed->inode = inode;
 
 	spin_lock(&fs_info->delayed_iput_lock);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -709,7 +709,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 	}
 	trace_gfs2_log_flush(sdp, 1);
 
-	ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
+	ai = kzalloc_nofail(sizeof(struct gfs2_ail), GFP_NOFS);
 	INIT_LIST_HEAD(&ai->ai_ail1_list);
 	INIT_LIST_HEAD(&ai->ai_ail2_list);
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1440,8 +1440,8 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 		rgrp_blk++;
 
 		if (!bi->bi_clone) {
-			bi->bi_clone = kmalloc(bi->bi_bh->b_size,
-					       GFP_NOFS | __GFP_NOFAIL);
+			bi->bi_clone = kmalloc_nofail(bi->bi_bh->b_size,
+						      GFP_NOFS);
 			memcpy(bi->bi_clone + bi->bi_offset,
 			       bi->bi_bh->b_data + bi->bi_offset,
 			       bi->bi_len);
@@ -1759,9 +1759,6 @@ fail:
  * @block: the block
  *
  * Figure out what RG a block belongs to and add that RG to the list
- *
- * FIXME: Don't use NOFAIL
- *
 */
 
 void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
@@ -1789,8 +1786,8 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
 	if (rlist->rl_rgrps == rlist->rl_space) {
 		new_space = rlist->rl_space + 10;
 
-		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
-			      GFP_NOFS | __GFP_NOFAIL);
+		tmp = kcalloc_nofail(new_space, sizeof(struct gfs2_rgrpd *),
+				     GFP_NOFS);
 
 		if (rlist->rl_rgd) {
 			memcpy(tmp, rlist->rl_rgd,
@@ -1811,17 +1808,14 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
  * @rlist: the list of resource groups
  * @state: the lock state to acquire the RG lock in
  * @flags: the modifier flags for the holder structures
- *
- * FIXME: Don't use NOFAIL
- *
 */
 
 void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state)
 {
 	unsigned int x;
 
-	rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder),
-				GFP_NOFS | __GFP_NOFAIL);
+	rlist->rl_ghs = kcalloc_nofail(rlist->rl_rgrps,
+				       sizeof(struct gfs2_holder), GFP_NOFS);
 	for (x = 0; x < rlist->rl_rgrps; x++)
 		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, state, 0,
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -98,14 +98,9 @@ static int start_this_handle(journal_t *journal, handle_t *handle)
 	}
 
 alloc_transaction:
-	if (!journal->j_running_transaction) {
-		new_transaction = kzalloc(sizeof(*new_transaction),
-					  GFP_NOFS|__GFP_NOFAIL);
-		if (!new_transaction) {
-			ret = -ENOMEM;
-			goto out;
-		}
-	}
+	if (!journal->j_running_transaction)
+		new_transaction = kzalloc_nofail(sizeof(*new_transaction),
+						 GFP_NOFS);
 
 	jbd_debug(3, "New handle %p going live.\n", handle);
 
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2593,8 +2593,7 @@ static int journal_read(struct super_block *sb)
 static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
 {
 	struct reiserfs_journal_list *jl;
-	jl = kzalloc(sizeof(struct reiserfs_journal_list),
-		     GFP_NOFS | __GFP_NOFAIL);
+	jl = kzalloc_nofail(sizeof(struct reiserfs_journal_list), GFP_NOFS);
 	INIT_LIST_HEAD(&jl->j_list);
 	INIT_LIST_HEAD(&jl->j_working_list);
 	INIT_LIST_HEAD(&jl->j_tail_bh_list);
diff --git a/include/linux/slab.h b/include/linux/slab.h
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -334,6 +334,61 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
 	return kmalloc_node(size, flags | __GFP_ZERO, node);
 }
 
+/**
+ * kmalloc_nofail - infinitely loop until kmalloc() succeeds.
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate (see kmalloc).
+ *
+ * NOTE: no new callers of this function should be implemented!
+ * All memory allocations should be failable whenever possible.
+ */
+static inline void *kmalloc_nofail(size_t size, gfp_t flags)
+{
+	void *ret;
+
+	for (;;) {
+		ret = kmalloc(size, flags);
+		if (ret)
+			return ret;
+		WARN_ONCE(1, "Out of memory, no fallback implemented "
+			     "(size=%lu flags=0x%x)\n",
+			     size, flags);
+	}
+}
+
+/**
+ * kcalloc_nofail - infinitely loop until kcalloc() succeeds.
+ * @n: number of elements.
+ * @size: element size.
+ * @flags: the type of memory to allocate (see kcalloc).
+ *
+ * NOTE: no new callers of this function should be implemented!
+ * All memory allocations should be failable whenever possible.
+ */
+static inline void *kcalloc_nofail(size_t n, size_t size, gfp_t flags)
+{
+	void *ret;
+
+	for (;;) {
+		ret = kcalloc(n, size, flags);
+		if (ret)
+			return ret;
+		WARN_ONCE(1, "Out of memory, no fallback implemented "
+			     "(n=%lu size=%lu flags=0x%x)\n",
+			     n, size, flags);
+	}
+}
+
+/**
+ * kzalloc_nofail - infinitely loop until kzalloc() succeeds.
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate (see kzalloc).
+ */
+static inline void *kzalloc_nofail(size_t size, gfp_t flags)
+{
+	return kmalloc_nofail(size, flags | __GFP_ZERO);
+}
+
 void __init kmem_cache_init_late(void);
 
 #endif /* _LINUX_SLAB_H */
David Rientjes
2010-Aug-24 10:50 UTC
[patch 4/5] btrfs: add nofail variant of set_extent_dirty
Add set_extent_dirty_nofail().  This function is equivalent to
set_extent_dirty(), except that it will never fail because of allocation
failure and instead loop forever trying to allocate memory.

If the first allocation attempt fails, a warning will be emitted,
including a call trace.  Subsequent failures will suppress this warning.

This was added as a helper function for documentation and auditability.
No future callers should be added.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 fs/btrfs/extent-tree.c |    8 ++++----
 fs/btrfs/extent_io.c   |   19 +++++++++++++++++++
 fs/btrfs/extent_io.h   |    2 ++
 3 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3831,9 +3831,9 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			spin_unlock(&cache->lock);
 			spin_unlock(&cache->space_info->lock);
 
-			set_extent_dirty(info->pinned_extents,
+			set_extent_dirty_nofail(info->pinned_extents,
 					 bytenr, bytenr + num_bytes - 1,
-					 GFP_NOFS | __GFP_NOFAIL);
+					 GFP_NOFS);
 		}
 		btrfs_put_block_group(cache);
 		total -= num_bytes;
@@ -3872,8 +3872,8 @@ static int pin_down_extent(struct btrfs_root *root,
 	spin_unlock(&cache->lock);
 	spin_unlock(&cache->space_info->lock);
 
-	set_extent_dirty(root->fs_info->pinned_extents, bytenr,
-			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
+	set_extent_dirty_nofail(root->fs_info->pinned_extents, bytenr,
+				bytenr + num_bytes - 1, GFP_NOFS);
 	return 0;
 }
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -940,6 +940,25 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 			      NULL, mask);
 }
 
+/*
+ * NOTE: no new callers of this function should be implemented!
+ * All memory allocations should be failable whenever possible.
+ */
+int set_extent_dirty_nofail(struct extent_io_tree *tree, u64 start, u64 end,
+			    gfp_t mask)
+{
+	int ret;
+
+	for (;;) {
+		ret = set_extent_dirty(tree, start, end, mask);
+		if (ret != -ENOMEM)
+			return ret;
+		WARN_ONCE(1, "Out of memory, no fallback implemented "
+			     "(flags=0x%x)\n",
+			     mask);
+	}
+}
+
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits,
 		    gfp_t mask)
 {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -197,6 +197,8 @@ int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 		   gfp_t mask);
 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask);
+int set_extent_dirty_nofail(struct extent_io_tree *tree, u64 start, u64 end,
+			    gfp_t mask);
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask);
 int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
Jan Kara
2010-Aug-24 12:15 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Tue 24-08-10 03:50:19, David Rientjes wrote:
> Add kmalloc_nofail(), kcalloc_nofail(), and kzalloc_nofail().  These
> functions are equivalent to kmalloc(), kcalloc(), and kzalloc(),
> respectively, except that they will never return NULL and instead loop
> forever trying to allocate memory.
>
> If the first allocation attempt fails, a warning will be emitted,
> including a call trace.  Subsequent failures will suppress this warning.
>
> These were added as helper functions for documentation and auditability.
> No future callers should be added.

This looks like a cleaner approach than the previous one. You can add
Acked-by: Jan Kara <jack@suse.cz>
for the JBD part.

								Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
Peter Zijlstra
2010-Aug-24 13:29 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Tue, 2010-08-24 at 03:50 -0700, David Rientjes wrote:
> These were added as helper functions for documentation and auditability.
> No future callers should be added.

git grep GFP_NOFAIL isn't auditable enough?

Might as well declare these functions deprecated if you really want to
do this.

FWIW I think mason said he'd fix btrfs to not suck like this 'soon'.
Peter Zijlstra
2010-Aug-24 13:30 UTC
Re: [patch 4/5] btrfs: add nofail variant of set_extent_dirty
On Tue, 2010-08-24 at 03:50 -0700, David Rientjes wrote:
> Add set_extent_dirty_nofail().  This function is equivalent to
> set_extent_dirty(), except that it will never fail because of allocation
> failure and instead loop forever trying to allocate memory.
>
> If the first allocation attempt fails, a warning will be emitted,
> including a call trace.  Subsequent failures will suppress this warning.
>
> This was added as a helper function for documentation and auditability.
> No future callers should be added.
>
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>  fs/btrfs/extent-tree.c |    8 ++++----
>  fs/btrfs/extent_io.c   |   19 +++++++++++++++++++
>  fs/btrfs/extent_io.h   |    2 ++
>  3 files changed, 25 insertions(+), 4 deletions(-)

I'd much rather someone help mason to clean this up.
Jens Axboe
2010-Aug-24 13:33 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On 2010-08-24 15:29, Peter Zijlstra wrote:
> On Tue, 2010-08-24 at 03:50 -0700, David Rientjes wrote:
>> These were added as helper functions for documentation and auditability.
>> No future callers should be added.
>
> git grep GFP_NOFAIL isn't auditable enough?
>
> might as well declare these functions deprecated if you really want to
> do this.

Agree, adding a helper function would seem to be going in the reverse
direction imho. Nobody will read that comment on top of the function.

Should be possible to warn at build time for anyone using __GFP_NOFAIL
without wrapping it in a function.

--
Jens Axboe
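[For illustration only: one way such a build-time warning could be wired up
is sketched below. kmalloc_checked() and __gfp_nofail_used_directly() are
hypothetical names, and the sketch assumes GCC's warning attribute; nothing
like this is in the posted series.]

/* Sketch: flag direct __GFP_NOFAIL usage at compile time. */
void __gfp_nofail_used_directly(void)
	__attribute__((warning("__GFP_NOFAIL passed directly; use the *_nofail() helpers")));

static inline void *kmalloc_checked(size_t size, gfp_t flags)
{
	/*
	 * The branch is constant-folded away for well-behaved callers;
	 * callers that pass __GFP_NOFAIL directly keep the call and get
	 * a warning when the translation unit is compiled.
	 */
	if (__builtin_constant_p(flags) && (flags & __GFP_NOFAIL))
		__gfp_nofail_used_directly();
	return kmalloc(size, flags);
}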
Dave Chinner
2010-Aug-24 13:55 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Tue, Aug 24, 2010 at 03:29:18PM +0200, Peter Zijlstra wrote:
> On Tue, 2010-08-24 at 03:50 -0700, David Rientjes wrote:
> > These were added as helper functions for documentation and auditability.
> > No future callers should be added.
>
> git grep GFP_NOFAIL isn't auditable enough?
>
> might as well declare these functions deprecated if you really want to
> do this.

Also, if you are going to add tight loops, you might want to put a
backoff in the loops like "congestion_wait(BLK_RW_ASYNC, HZ/50);" so
that they don't spin....

FWIW, in all this "allocations can't fail" churn, no one has noticed
that XFS has been doing these "allocations can't fail" loops in
kmem_alloc() and kmem_zone_alloc(), well, forever. I can't ever remember
seeing it report a potential deadlock, though....

Cheers,

Dave.
--
Dave Chinner
david@fromorbit.com
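[For illustration only: a minimal sketch of what such a backed-off loop might
look like. kmalloc_nofail_backoff() is a hypothetical variant of the posted
kmalloc_nofail(), and it assumes the caller is allowed to sleep.]

#include <linux/slab.h>
#include <linux/backing-dev.h>

static inline void *kmalloc_nofail_backoff(size_t size, gfp_t flags)
{
	void *ret;

	for (;;) {
		ret = kmalloc(size, flags);
		if (ret)
			return ret;
		WARN_ONCE(1, "Out of memory, no fallback implemented\n");
		/*
		 * Don't spin: sleep for ~20ms or until writeback congestion
		 * clears, as Dave suggests above.
		 */
		congestion_wait(BLK_RW_ASYNC, HZ / 50);
	}
}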
Peter Zijlstra
2010-Aug-24 14:03 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Tue, 2010-08-24 at 23:55 +1000, Dave Chinner wrote:
> no one has noticed
> that XFS has been doing these "allocations can't fail" loops in
> kmem_alloc() and kmem_zone_alloc(), well, forever

I occasionally check if it's still there and cry a little ;-)
David Rientjes
2010-Aug-24 20:08 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Tue, 24 Aug 2010, Peter Zijlstra wrote:
> > These were added as helper functions for documentation and auditability.
> > No future callers should be added.
>
> git grep GFP_NOFAIL isn't auditable enough?

I'm trying to remove that bit, and __GFP_REPEAT, to simplify and remove
several branches from the page allocator.

> might as well declare these functions deprecated if you really want to
> do this.
>
> FWIW I think mason said he'd fix btrfs to not suck like this 'soon'.

Great!
David Rientjes
2010-Aug-24 20:11 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Tue, 24 Aug 2010, Jens Axboe wrote:
> Should be possible to warn at build time for anyone using __GFP_NOFAIL
> without wrapping it in a function.

We could make these __deprecated functions as Peter suggested if you
think build-time warnings for existing users would be helpful.
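[For illustration only: deprecating the helper would amount to adding the
existing __deprecated annotation to the function proposed in patch 1/5, as
sketched below. Existing callers would then see a warning at compile time.]

static inline void * __deprecated kmalloc_nofail(size_t size, gfp_t flags)
{
	void *ret;

	for (;;) {
		ret = kmalloc(size, flags);
		if (ret)
			return ret;
		WARN_ONCE(1, "Out of memory, no fallback implemented\n");
	}
}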
David Rientjes
2010-Aug-24 20:12 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Tue, 24 Aug 2010, Dave Chinner wrote:
> > git grep GFP_NOFAIL isn't auditable enough?
> >
> > might as well declare these functions deprecated if you really want to
> > do this.
>
> Also, if you are going to add tight loops, you might want to put a
> backoff in the loops like "congestion_wait(BLK_RW_ASYNC, HZ/50);" so
> that they don't spin....

These loops don't actually loop at all: all users are passing orders
below PAGE_ALLOC_COSTLY_ORDER, for which the page allocator already
loops forever internally without killing anything (they are all GFP_NOIO
or GFP_NOFS, so the oom killer isn't involved).
Ted Ts'o
2010-Aug-25 11:24 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Tue, Aug 24, 2010 at 01:11:26PM -0700, David Rientjes wrote:
> On Tue, 24 Aug 2010, Jens Axboe wrote:
>
> > Should be possible to warn at build time for anyone using __GFP_NOFAIL
> > without wrapping it in a function.
>
> We could make these __deprecated functions as Peter suggested if you
> think build-time warnings for existing users would be helpful.

Let me take a few steps backwards and look at the problem from a
somewhat higher level.

Part of the problem is that we have a few places in the kernel where
failure is really not an option --- or rather, if we're going to fail
while we're in the middle of doing a commit, our choices really are
(a) retry the loop in the jbd layer (which Andrew really doesn't like),
(b) keep our own private cache of free memory so we don't fail and/or
loop, (c) fail the file system and mark it read-only, or (d) panic.

There are other places where we can fail safely (for example, in jbd's
start_this_handle, although that just pushes the problem up the stack,
and ultimately, to userspace, where most userspace programs don't really
expect ENOMEM to get returned by a random disk write --- how much do
_you_ trust a random GNOME or KDE developer to do correct error checking
and do something sane at the application level?)

So we can mark the retry loop helper function as deprecated, and that
will make some of these cases go away, but ultimately if we're going to
fail the memory allocation, something bad is going to happen, and the
only question is whether we want to have something bad happen by looping
in the memory allocator, or to force the file system to panic/oops the
system, or have random applications die and/or lose user data because
they don't expect write() to return ENOMEM.

So at some level it would be nice if we had a few different levels of
"we *really* need this memory".

Level 0 might be, "if we can't get it, no biggie, we'll figure out some
other way around it." I doubt there is much at level 0, but in theory we
could have some good citizens that fall in that camp and who simply will
bypass some optimization if they can't get the memory.

Level 1 might be, if you can't get the memory, we will propagate a
failure up to userspace, but it's probably a relatively "safe" place to
fail (i.e., when the user is opening a file).

Level 2 might be, "if you can't get the memory, we will propagate a
failure up to userspace, but it's at a place where most applications are
lazy f*ckers, and this may lead to serious application errors" (example:
close(2), and this is a file system that only pushes the file to the
server at close time, e.g. AFS).

Level 3 might be, "if you can't get the memory, I'm going to fail the
file system, or some other global subsystem, which will probably result
in the system crashing or needing to be rebooted".

We can ignore this problem and pretend it doesn't exist at the memory
allocator level, but that means the callers are going to be doing their
own thing to try to avoid having really bad things happen at
really-low-memory occasions. And this may mean looping, whether we mark
the function as deprecated or not.

This is becoming even more of an issue now given that with
containerization, we have jokers who are forcing tasks to run in very
small memory containers, which means that failures can happen far more
frequently --- and in some cases, just because the process running the
task happens to be in an extremely constrained memory cgroup doesn't
mean that failing the entire system is really such a great idea.

Or maybe that means memory cgroups are kinda busted. :-)

						- Ted
Peter Zijlstra
2010-Aug-25 11:35 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 2010-08-25 at 07:24 -0400, Ted Ts'o wrote:
> Part of the problem is that we have a few places in the kernel where
> failure is really not an option --- or rather, if we're going to fail
> while we're in the middle of doing a commit, our choices really are
> (a) retry the loop in the jbd layer (which Andrew really doesn't like),
> (b) keep our own private cache of free memory so we don't fail and/or
> loop, (c) fail the file system and mark it read-only, or (d) panic.

d) do the allocation before you're committed to going fwd and can still
fail and back out.
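[For illustration only: a trivial sketch of the "allocate before you are
committed" pattern. example_item and example_op() are made-up names, not
from any posted patch; allocation happens while -ENOMEM can still be
returned cleanly, and state is only modified afterwards.]

#include <linux/list.h>
#include <linux/slab.h>

struct example_item {
	struct list_head list;
	int val;
};

static int example_op(struct list_head *head, int val)
{
	struct example_item *item;

	item = kmalloc(sizeof(*item), GFP_NOFS);
	if (!item)
		return -ENOMEM;		/* nothing modified yet, safe to back out */

	item->val = val;
	list_add_tail(&item->list, head);	/* past this point no allocation is needed */
	return 0;
}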
Ted Ts'o
2010-Aug-25 11:57 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, Aug 25, 2010 at 01:35:32PM +0200, Peter Zijlstra wrote:
> On Wed, 2010-08-25 at 07:24 -0400, Ted Ts'o wrote:
> > Part of the problem is that we have a few places in the kernel where
> > failure is really not an option --- or rather, if we're going to fail
> > while we're in the middle of doing a commit, our choices really are
> > (a) retry the loop in the jbd layer (which Andrew really doesn't like),
> > (b) keep our own private cache of free memory so we don't fail and/or
> > loop, (c) fail the file system and mark it read-only, or (d) panic.
>
> d) do the allocation before you're committed to going fwd and can still
> fail and back out.

Sure, in some cases that can be done, but the commit has to happen at
some point, or we run out of journal space, at which point we're back
to (c) or (d).

						- Ted
Peter Zijlstra
2010-Aug-25 12:48 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 2010-08-25 at 07:57 -0400, Ted Ts'o wrote:
> On Wed, Aug 25, 2010 at 01:35:32PM +0200, Peter Zijlstra wrote:
> > d) do the allocation before you're committed to going fwd and can still
> > fail and back out.
>
> Sure, in some cases that can be done, but the commit has to happen at
> some point, or we run out of journal space, at which point we're back
> to (c) or (d).

Well, (b) sounds a lot saner than either of those. Simply revert to a
state that is sub-optimal but has bounded memory use and reserve that
memory up-front. That way you can always get out of a tight memory spot.

It's what the block layer has always done to avoid the memory deadlock
situation: it has a private stash of BIOs that is big enough to always
service some IO, and as long as IO is happening stuff keeps moving
forward and we don't deadlock.

Filesystems might have a slightly harder time creating such a bounded
state because there might be more involved, like journals and the like,
but still it should be possible to create something like that (my swap
over nfs patches created such a state for the network rx side of
things).

Also, we cannot let our fear of crappy userspace get in the way of doing
sensible things. Your example of write(2) returning -ENOMEM is not
correct though: the syscall (and the page_mkwrite callback for mmap()s)
happens before we actually dirty the data and need to write things out,
so we can always simply wait for memory to become available to dirty.
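[For illustration only: the bounded, pre-allocated reserve described above is
roughly what a mempool provides. example_item, the names and the pool size of
16 are made up; this is a sketch, not the block layer's actual BIO reserve.]

#include <linux/mempool.h>
#include <linux/slab.h>

struct example_item {
	int payload;
};

static struct kmem_cache *example_cache;
static mempool_t *example_pool;

static int __init example_pool_init(void)
{
	example_cache = kmem_cache_create("example_item",
					  sizeof(struct example_item),
					  0, 0, NULL);
	if (!example_cache)
		return -ENOMEM;
	/* Pre-allocate a reserve large enough to guarantee forward progress. */
	example_pool = mempool_create_slab_pool(16, example_cache);
	return example_pool ? 0 : -ENOMEM;
}

static struct example_item *example_alloc(void)
{
	/*
	 * With a sleeping gfp mask this waits for an element to be returned
	 * to the reserve instead of failing outright.
	 */
	return mempool_alloc(example_pool, GFP_NOFS);
}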
Peter Zijlstra
2010-Aug-25 12:52 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 2010-08-25 at 14:48 +0200, Peter Zijlstra wrote:
> > > On Wed, 2010-08-25 at 07:24 -0400, Ted Ts'o wrote:
> > > > (a) retry the loop in the jbd layer (which Andrew really doesn't like),
> > > > (b) keep our own private cache of free memory so we don't fail and/or
> > > > loop,
>
> Well, (b) sounds a lot saner than either of those. Simply revert to a
> state that is sub-optimal but has bounded memory use and reserve that
> memory up-front. That way you can always get out of a tight memory spot.

Also, there's a good reason for disliking (a): it's a deadlock scenario.
Suppose we need to write out data to get free pages, but the writing out
is blocked on requiring free pages.

There's really nothing the page allocator can do to help you there; it's
a situation you have to avoid getting into.
Theodore Tso
2010-Aug-25 13:20 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Aug 25, 2010, at 8:52 AM, Peter Zijlstra wrote:
> Also, there's a good reason for disliking (a): it's a deadlock scenario.
> Suppose we need to write out data to get free pages, but the writing out
> is blocked on requiring free pages.
>
> There's really nothing the page allocator can do to help you there; it's
> a situation you have to avoid getting into.

Well, if all of these users start having their own private pools of
emergency memory, I'm not sure that's such a great idea either.

And in some cases, there *is* extra memory. For example, if the reason
why the page allocator failed is because there isn't enough memory in
the current process's cgroup, maybe it's important enough that the
kernel code might decide to say, "violate the cgroup constraints ---
it's more important that we not bring down the entire system" than to
honor whatever yahoo decided that a particular cgroup has been set down
to something ridiculous like 512mb, when there's plenty of free physical
memory --- but not in that cgroup.

-- Ted
Dave Chinner
2010-Aug-25 13:24 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, Aug 25, 2010 at 02:48:36PM +0200, Peter Zijlstra wrote:
> On Wed, 2010-08-25 at 07:57 -0400, Ted Ts'o wrote:
> > Sure, in some cases that can be done, but the commit has to happen at
> > some point, or we run out of journal space, at which point we're back
> > to (c) or (d).
>
> Well, (b) sounds a lot saner than either of those. Simply revert to a
> state that is sub-optimal but has bounded memory use and reserve that
> memory up-front. That way you can always get out of a tight memory spot.
>
> It's what the block layer has always done to avoid the memory deadlock
> situation: it has a private stash of BIOs that is big enough to always
> service some IO, and as long as IO is happening stuff keeps moving
> forward and we don't deadlock.
>
> Filesystems might have a slightly harder time creating such a bounded
> state because there might be more involved, like journals and the like,
> but still it should be possible to create something like that (my swap
> over nfs patches created such a state for the network rx side of
> things).

Filesystems are way more complex than the block layer - the block layer
simply doesn't have to handle situations where thread X is holding A, B
and C, while thread Y needs C to complete the transaction. Thread Y is
the user of the low memory pool, but has almost depleted it, and so even
if we switch to thread X, the pool does not have enough memory for X to
complete and allow us to switch back to Y and have it complete, freeing
the memory from the pool that it holds.

That is, the guarantee that we will always make progress simply does
not exist in filesystems, so a mempool-like concept seems to me to be
doomed from the start....

Cheers,

Dave.
--
Dave Chinner
david@fromorbit.com
Peter Zijlstra
2010-Aug-25 13:31 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 2010-08-25 at 09:20 -0400, Theodore Tso wrote:
> Well, if all of these users start having their own private pools of
> emergency memory, I'm not sure that's such a great idea either.
>
> And in some cases, there *is* extra memory. For example, if the reason
> why the page allocator failed is because there isn't enough memory in
> the current process's cgroup, maybe it's important enough that the
> kernel code might decide to say, "violate the cgroup constraints ---
> it's more important that we not bring down the entire system" than to
> honor whatever yahoo decided that a particular cgroup has been set down
> to something ridiculous like 512mb, when there's plenty of free physical
> memory --- but not in that cgroup.

I'm not sure, but I think the cgroup thing doesn't account kernel
allocations, in which case the above problem doesn't exist.

For the cpuset case we punch through the cpuset constraints for kernel
allocations (unless __GFP_HARDWALL).
Peter Zijlstra
2010-Aug-25 13:34 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 2010-08-25 at 09:20 -0400, Theodore Tso wrote:
> Well, if all of these users start having their own private pools of
> emergency memory, I'm not sure that's such a great idea either.

That's a secondary problem, and could be reduced by something like the
memory reservation system I provided in the swap-over-nfs patches.

Of course, when all these users can actually happen concurrently there's
really nothing much you can do about it.
Peter Zijlstra
2010-Aug-25 13:35 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 2010-08-25 at 23:24 +1000, Dave Chinner wrote:
>
> That is, the guarantee that we will always make progress simply does
> not exist in filesystems, so a mempool-like concept seems to me to be
> doomed from the start....

While I appreciate that it might be somewhat (a lot) harder for a
filesystem to provide that guarantee, I'd be deeply worried about your
claim that it's impossible.

It would render a system without swap very prone to deadlocks. Even with
the very tight dirty page accounting we currently have you can fill all
your memory with anonymous pages, at which point there's nothing free
and you require writeout of dirty pages to succeed.
Peter Zijlstra
2010-Aug-25 14:13 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 2010-08-25 at 07:24 -0400, Ted Ts'o wrote:
> There are other places where we can fail safely (for example, in jbd's
> start_this_handle, although that just pushes the problem up the stack,
> and ultimately, to userspace, where most userspace programs don't
> really expect ENOMEM to get returned by a random disk write

While talking with Chris about this: if you can indeed push the error
out that far, you can basically ensure this particular fs-op does not
complicate the journal commit and thereby limit the number of extra
entries in your journal, and thus the amount of memory required.

So once stuff starts failing, push those ops back out of the filesystem
code, force a journal commit, and then let these ops retry. There is no
need to actually push the -ENOMEM all the way back to userspace.
David Rientjes
2010-Aug-25 20:43 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, Peter Zijlstra wrote:
> I'm not sure, but I think the cgroup thing doesn't account kernel
> allocations, in which case the above problem doesn't exist.

Right, the only cgroup that does is cpusets, because it binds the memory
allocations to a set of nodes.

> For the cpuset case we punch through the cpuset constraints for kernel
> allocations (unless __GFP_HARDWALL).

__GFP_HARDWALL doesn't mean that the allocation won't be constrained;
this is a common misconception. __GFP_HARDWALL only prevents us from
looking at our cpuset.mem_exclusive flag and checking our nearest common
ancestor cpuset if we can block.

The cpusets case is actually the easiest to fix: use GFP_ATOMIC.
GFP_ATOMIC allocations aren't bound by any cpuset and, in the general
case, can allocate below the min watermark because of ALLOC_HARD |
ALLOC_HARDER in the page allocator, which creates the notion of "memory
reserves" available to these tasks. Then, success really depends on the
setting of the watermarks instead.
Ted Ts'o
2010-Aug-25 20:53 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, Aug 25, 2010 at 03:35:42PM +0200, Peter Zijlstra wrote:
>
> While I appreciate that it might be somewhat (a lot) harder for a
> filesystem to provide that guarantee, I'd be deeply worried about your
> claim that it's impossible.
>
> It would render a system without swap very prone to deadlocks. Even with
> the very tight dirty page accounting we currently have you can fill all
> your memory with anonymous pages, at which point there's nothing free
> and you require writeout of dirty pages to succeed.

For file systems that do delayed allocation, the situation is very
similar to swapping over NFS. Sometimes in order to make some free
memory, you need to spend some free memory... which implies that for
these file systems, being more aggressive about triggering writeout,
and being more aggressive about throttling processes which are creating
too many dirty pages, especially dirty delayed allocation pages
(regardless of whether this is via write(2) or accessing mmapped
memory), is a really good idea.

A pool of free pages which is reserved for routines that are doing page
cleaning would probably also be a good idea. Maybe that's just retrying
with GFP_ATOMIC if a normal allocation fails, or maybe we need our own
special pool, or maybe we need to dynamically resize the GFP_ATOMIC pool
based on how many subsystems might need to use it....

Just brainstorming here; what do people think?

						- Ted
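[For illustration only: the "retry with GFP_ATOMIC" idea could look like the
sketch below. writeback_alloc() is a made-up name, and whether dipping into
the atomic reserves like this is acceptable is exactly what the rest of the
thread debates.]

#include <linux/slab.h>

static void *writeback_alloc(size_t size)
{
	void *p;

	/* Normal attempt first: may enter reclaim, may fail. */
	p = kmalloc(size, GFP_NOFS | __GFP_NOWARN);
	if (p)
		return p;
	/*
	 * Fall back to the reserves kept for atomic allocations; this can
	 * still return NULL if those are exhausted too.
	 */
	return kmalloc(size, GFP_ATOMIC);
}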
Peter Zijlstra
2010-Aug-25 20:55 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 2010-08-25 at 13:43 -0700, David Rientjes wrote:
> The cpusets case is actually the easiest to fix: use GFP_ATOMIC.

I don't think that's a valid usage of GFP_ATOMIC; I think we should fall
back to outside the cpuset for kernel allocations by default.
David Rientjes
2010-Aug-25 20:58 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, Peter Zijlstra wrote:
> While I appreciate that it might be somewhat (a lot) harder for a
> filesystem to provide that guarantee, I'd be deeply worried about your
> claim that it's impossible.
>
> It would render a system without swap very prone to deadlocks. Even with
> the very tight dirty page accounting we currently have you can fill all
> your memory with anonymous pages, at which point there's nothing free
> and you require writeout of dirty pages to succeed.

While I'd really love for callers to be able to gracefully handle
getting a NULL back from the page allocator in all cases, it's not a
prerequisite for this patchset. This patchset actually does nothing
interesting except removing the __GFP_NOFAIL bit from their gfp mask.
All of these allocations already loop looking for memory because they
have orders that are less than PAGE_ALLOC_COSTLY_ORDER (which defaults
to 3). So the loops in kzalloc_nofail(), etc., never actually loop.

Demanding that the page allocator return order-3 memory in any context
is a non-starter, so I'm not really interested in that. I'm more
concerned about proper error handling being implemented for these
callers iff someone redefines PAGE_ALLOC_COSTLY_ORDER to something else,
perhaps even 0.

Callers can, when desperate for memory, use GFP_ATOMIC to use some
memory reserves across zones, hopefully order-0 and not an egregious
amount. But when this is depleted, or when higher order allocs are
needed, the remainder of the burden really is back on the caller to fix
things in a way that doesn't rely on memory that doesn't exist. That's
an implementation choice by the caller, and I agree that some failsafe
behavior is the only way that we don't get really bad results like
corrupted user data or filesystems.
David Rientjes
2010-Aug-25 20:59 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, Ted Ts'o wrote:
> A pool of free pages which is reserved for routines that are doing
> page cleaning would probably also be a good idea. Maybe that's just
> retrying with GFP_ATOMIC if a normal allocation fails, or maybe we
> need our own special pool, or maybe we need to dynamically resize the
> GFP_ATOMIC pool based on how many subsystems might need to use it....

PF_MEMALLOC is used for those tasks, not GFP_ATOMIC.
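[For illustration only: the PF_MEMALLOC mechanism referred to here looks
roughly like the sketch below in a page-cleaning path. example_pageout() is
a made-up name; real callers save and restore the flag as shown.]

#include <linux/sched.h>

static void example_pageout(void)
{
	const unsigned long pflags = current->flags & PF_MEMALLOC;

	/*
	 * Mark the task as doing memory cleaning so that allocations it
	 * makes to get pages written out may dip into the emergency
	 * reserves instead of recursing into reclaim.
	 */
	current->flags |= PF_MEMALLOC;

	/* ... write out dirty pages here ... */

	if (!pflags)
		current->flags &= ~PF_MEMALLOC;
}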
Christoph Lameter
2010-Aug-25 21:11 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
We already have __GFP_NOFAIL behavior for slab allocations since a __GFP_NOFAIL flag is passed through to the page allocator if no objects are available. The page allocator will do the same as when called directly with __GFP_NOFAIL and so we have consistent behavior regardless of the allocator used.
David Rientjes
2010-Aug-25 21:11 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, Peter Zijlstra wrote:
> > The cpusets case is actually the easiest to fix: use GFP_ATOMIC.
>
> I don't think that's a valid usage of GFP_ATOMIC; I think we should fall
> back to outside the cpuset for kernel allocations by default.

Cpusets doesn't enforce isolation for only user memory; it has always
bound _all_ allocations that aren't atomic or in irq context (or from
oom killed tasks). Allowing slab, for example, to be allocated in other
cpusets could cause them to oom themselves since they are bound by the
same memory isolation policy that all other cpusets are. We'd get random
oom conditions in cpusets depending only on where the slab happened to
be allocated, through no fault of the applications themselves, and
that's certainly not a situation we want. The memory controller cgroup
also has slab accounting on its TODO list.

If you think GFP_ATOMIC is inappropriate in these contexts, then they
are by definition blockable. So this seems like a good candidate for
using memory compaction, since we're talking only about
PAGE_ALLOC_COSTLY_ORDER and higher allocs, even though it's only
currently configurable for hugepages. There's still no hard guarantee
that the memory will be allocatable (GFP_KERNEL, then compaction, then
GFP_ATOMIC could all still fail), but I don't see how continuously
looping the page allocator is possibly supposed to help in these
situations.
Peter Zijlstra
2010-Aug-25 21:21 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 2010-08-25 at 16:11 -0500, Christoph Lameter wrote:
> We already have __GFP_NOFAIL behavior for slab allocations since
> a __GFP_NOFAIL flag is passed through to the page allocator if no objects
> are available.
>
> The page allocator will do the same as when called directly with
> __GFP_NOFAIL and so we have consistent behavior regardless of the
> allocator used.

Thanks for quoting the context to which you're replying, that really
helps parsing things...
David Rientjes
2010-Aug-25 21:23 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, Christoph Lameter wrote:
> We already have __GFP_NOFAIL behavior for slab allocations since
> a __GFP_NOFAIL flag is passed through to the page allocator if no objects
> are available.

It all depends on what flags are passed to kmalloc(); neither slab nor
slub enforces __GFP_NOFAIL behavior itself. In slab, cache_grow() will
return NULL depending on whether the page allocator returns NULL, and
that would only happen for __GFP_NORETRY or
cachep->gfp->gfporder >= PAGE_ALLOC_COSTLY_ORDER. In slub, the default
order is tried with __GFP_NORETRY and, if it returns NULL, the higher
order alloc will fail under the same circumstances. So the nofail
behavior for slab depends only on the flags passed from the caller.
Peter Zijlstra
2010-Aug-25 21:27 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 2010-08-25 at 14:11 -0700, David Rientjes wrote:
>
> There's still no hard guarantee that the memory will be allocatable
> (GFP_KERNEL, then compaction, then GFP_ATOMIC could all still fail), but
> I don't see how continuously looping the page allocator is possibly
> supposed to help in these situations.

Why do you think I'm a proponent of that behaviour?

I've been arguing that the existence of GFP_NOFAIL is the bug, and I
started the whole discussion because your patchset didn't outline the
purpose of its existence; it merely changes __GFP_NOFAIL usage into
$foo_nofail() functions, which on its own is a rather daft change.

Optimizing the page allocator by moving those conditionals from its
innards into an outer loop not used by most callers seems a fine goal,
but you didn't state that.

Also, I like the postfix proposed by Andi better: _i_suck() :-)
Christoph Lameter
2010-Aug-25 21:35 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, David Rientjes wrote:
> It all depends on what flags are passed to kmalloc(); neither slab nor
> slub enforces __GFP_NOFAIL behavior itself. In slab, cache_grow() will
> return NULL depending on whether the page allocator returns NULL, and
> that would only happen for __GFP_NORETRY or
> cachep->gfp->gfporder >= PAGE_ALLOC_COSTLY_ORDER. In slub, the default
> order is tried with __GFP_NORETRY and, if it returns NULL, the higher
> order alloc will fail under the same circumstances. So the nofail
> behavior for slab depends only on the flags passed from the caller.

If the higher order fails in slub then an order 0 alloc is attempted
without __GFP_NORETRY. In both cases the nofail behavior of the page
allocator determines the outcome.

True, if the caller mixes in __GFP_NORETRY then you may still get NULL.
But that is an issue that can be resolved by the caller.
Peter Zijlstra
2010-Aug-25 21:35 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 2010-08-25 at 16:53 -0400, Ted Ts'o wrote:
> On Wed, Aug 25, 2010 at 03:35:42PM +0200, Peter Zijlstra wrote:
> >
> > While I appreciate that it might be somewhat (a lot) harder for a
> > filesystem to provide that guarantee, I'd be deeply worried about your
> > claim that it's impossible.
> >
> > It would render a system without swap very prone to deadlocks. Even with
> > the very tight dirty page accounting we currently have you can fill all
> > your memory with anonymous pages, at which point there's nothing free
> > and you require writeout of dirty pages to succeed.
>
> For file systems that do delayed allocation, the situation is very
> similar to swapping over NFS. Sometimes in order to make some free
> memory, you need to spend some free memory...

Which means you need to be able to compute a bounded amount of that
memory.

> which implies that for
> these file systems, being more aggressive about triggering writeout,
> and being more aggressive about throttling processes which are
> creating too many dirty pages, especially dirty delayed allocation
> pages (regardless of whether this is via write(2) or accessing mmapped
> memory), is a really good idea.

That seems unrelated; the VM has a strict dirty limit and controls
writeback when needed. That part works.

> A pool of free pages which is reserved for routines that are doing
> page cleaning would probably also be a good idea. Maybe that's just
> retrying with GFP_ATOMIC if a normal allocation fails, or maybe we
> need our own special pool, or maybe we need to dynamically resize the
> GFP_ATOMIC pool based on how many subsystems might need to use it....

We have a smallish reserve, accessible with PF_MEMALLOC, but its use is
not regulated nor bounded; it just mostly works well enough.
David Rientjes
2010-Aug-25 23:05 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, Christoph Lameter wrote:
> If the higher order fails in slub then an order 0 alloc is attempted
> without __GFP_NORETRY. In both cases the nofail behavior of the page
> allocator determines the outcome.

Right, I thought you said the slab layer passes __GFP_NOFAIL when there
are no objects available.
David Rientjes
2010-Aug-25 23:11 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, Peter Zijlstra wrote:

> > There's still no hard guarantee that the memory will be allocatable
> > (GFP_KERNEL, then compaction, then GFP_ATOMIC could all still fail), but I
> > don't see how continuously looping the page allocator is possibly supposed
> > to help in these situations.
>
> Why do you think I'm a proponent of that behaviour?
>

I don't, I was agreeing with what you're saying :)

> I've been arguing that the existence of GFP_NOFAIL is the bug, and I
> started the whole discussion because your patchset didn't outline the
> purpose of its existence, it merely changes __GFP_NOFAIL usage into
> $foo_nofail() functions, which on its own is a rather daft change.
>

I originally pushed these to the callers, but Andrew thought it would be
better so that we could audit the existing users that have no fallback
behavior, even though they could go implement it on their own (like
getblk() which actually does try _some_ memory freeing). It eliminates
the flag from the page allocator and saves branches in the slowpath.

We don't need this logic in the allocator itself, it can exist at a
higher level and, with deprecation, will hopefully be incentive enough
for those subsystems to fix the issue.

I'll repropose the patchset with __deprecated as you suggested. Thanks!
Dave Chinner
2010-Aug-26 00:09 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, Aug 25, 2010 at 03:35:42PM +0200, Peter Zijlstra wrote:
> On Wed, 2010-08-25 at 23:24 +1000, Dave Chinner wrote:
> >
> > That is, the guarantee that we will always make progress simply does
> > not exist in filesystems, so a mempool-like concept seems to me to
> > be doomed from the start....
>
> While I appreciate that it might be somewhat (a lot) harder for a
> filesystem to provide that guarantee, I'd be deeply worried about your
> claim that it's impossible.

I didn't say impossible, just that there's no way we can always
guarantee forward progress with a specific, bounded pool of memory.

Sure, we know the worst case amount of log space needed for each
transaction (i.e. how many pages will be dirtied), but that does not
take into account all the blocks that need to be read to make those
modifications, the memory needed for stuff like btree cursors, log
tickets, transaction commit vectors, btree blocks needed to do the
searches, etc. A typical transaction reservation on a 4k block
filesystem is between 200-400k (it's worst case), and if you add in all
the other allocations that might be required, we're on the order of
requiring megabytes of RAM to guarantee a single transaction will
succeed in low memory conditions.

The exact requirement is very difficult to quantify algorithmically,
but for a single transaction it should be possible. However, consider
the case of running a thousand concurrent transactions and in the
middle of that the system runs out of memory. All the transactions need
memory allocation to succeed, some are blocked waiting for resources
held in other transactions, etc.

Firstly, how do you stop all the transactions from making further
progress to serialise access to the low memory pool? Secondly, how do
you select which transaction you want to use the low memory pool? What
do you do if the selected transaction then blocks on a resource held by
another transaction (which you can't know ahead of time)? Do you switch
to another thread and hope the pool doesn't run dry? What do you do
when (not if) the memory pool runs dry?

I'm sure this could be done, but it's a lot of difficult, unrewarding
work that greatly increases code complexity, touches a massive amount
of the filesystem code base, exponentially increases the test matrix,
is likely to have significant operational overhead, and even then
there's no guarantee that we've got it right. That doesn't sound like a
good solution to me.

> It would render a system without swap very prone to deadlocks. Even with
> the very tight dirty page accounting we currently have you can fill all
> your memory with anonymous pages, at which point there's nothing free
> and you require writeout of dirty pages to succeed.

Then don't allow anonymous pages to fill all of memory when there is no
swap available - i.e. keep a larger pool of free memory when there is
no swap available. That's a much simpler solution than turning all the
filesystems upside down to try to make them not need allocation....

Cheers,

Dave.
--
Dave Chinner
david@fromorbit.com
Ted Ts'o
2010-Aug-26 00:19 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, Aug 25, 2010 at 04:11:38PM -0700, David Rientjes wrote:
>
> I'll repropose the patchset with __deprecated as you suggested. Thanks!

And what Dave and I are saying is that we'll either need to do our own
loop to avoid the deprecation warning, or the deprecated function will
probably be used forever...

					- Ted
David Rientjes
2010-Aug-26 00:30 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, Ted Ts'o wrote:

> > I'll repropose the patchset with __deprecated as you suggested. Thanks!
>
> And what Dave and I are saying is that we'll either need to do our own
> loop to avoid the deprecation warning, or the deprecated function will
> probably be used forever...
>

We certainly hope that nobody will reimplement the same function without
the __deprecated warning, especially for order < PAGE_ALLOC_COSTLY_ORDER
where there's no looping at a higher level. So perhaps the best
alternative is to implement the same _nofail() functions but do a
WARN_ON(get_order(size) > PAGE_ALLOC_COSTLY_ORDER) instead?

I think it's really sad that the caller can't know what the upper bounds
of its memory requirement are ahead of time or at least be able to
implement a memory freeing function when kmalloc() returns NULL.
Christoph Lameter
2010-Aug-26 01:30 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, David Rientjes wrote:

> > If the higher order fails in slub then an order 0 alloc is attempted
> > without __GFP_NORETRY. In both cases the nofail behavior of the page
> > allocator determines the outcome.
> >
>
> Right, I thought you said the slab layer passes __GFP_NOFAIL when there
> are no objects available.

Yes, the slab layer calls the page allocator when there are no objects
available and passes the __GFP_NOFAIL that the user may have set in the
call to the page allocator.

Why then add new functions that do the same?
Ted Ts'o
2010-Aug-26 01:48 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, Aug 25, 2010 at 05:30:42PM -0700, David Rientjes wrote:
>
> We certainly hope that nobody will reimplement the same function without
> the __deprecated warning, especially for order < PAGE_ALLOC_COSTLY_ORDER
> where there's no looping at a higher level. So perhaps the best
> alternative is to implement the same _nofail() functions but do a
> WARN_ON(get_order(size) > PAGE_ALLOC_COSTLY_ORDER) instead?

Yeah, that sounds better.

> I think it's really sad that the caller can't know what the upper bounds
> of its memory requirement are ahead of time or at least be able to
> implement a memory freeing function when kmalloc() returns NULL.

Oh, we can determine an upper bound. You might just not like it.
Actually ext3/ext4 shouldn't be as bad as XFS, which Dave estimated to
be around 400k for a transaction. My guess is that the worst case for
ext3/ext4 is probably around 256k or so; like XFS, most of the time, it
would be a lot less. (At least, if data != journalled; if we are doing
data journalling and every single data block begins with 0xc03b3998U,
we'll need to allocate a 4k page for every single data block written.)
We could dynamically calculate an upper bound if we had to. Of course,
if ext3/ext4 is attached to a network block device, then it could get a
lot worse than 256k, of course.

					- Ted
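The 0xc03b3998U value Ted mentions is the jbd journal magic: a data block
that happens to begin with it must be escaped, and therefore copied, before
it can be written to the journal, costing an extra page allocation at
commit time. A rough sketch of the check, assuming the usual
JFS_MAGIC_NUMBER definition from jbd rather than quoting the exact
journal_write_metadata_buffer() code:

    #define JFS_MAGIC_NUMBER 0xc03b3998U    /* as defined in include/linux/jbd.h */

    /*
     * If the first word of the buffer collides with the journal magic, the
     * block must be escaped: copy it out, zero the first word in the copy,
     * and record the escape so it can be undone during recovery.
     */
    static int buffer_needs_escape(const void *data)
    {
            return *((const __be32 *)data) == cpu_to_be32(JFS_MAGIC_NUMBER);
    }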
David Rientjes
2010-Aug-26 03:09 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, Ted Ts'o wrote:

> > We certainly hope that nobody will reimplement the same function without
> > the __deprecated warning, especially for order < PAGE_ALLOC_COSTLY_ORDER
> > where there's no looping at a higher level. So perhaps the best
> > alternative is to implement the same _nofail() functions but do a
> > WARN_ON(get_order(size) > PAGE_ALLOC_COSTLY_ORDER) instead?
>
> Yeah, that sounds better.
>

Ok, and we'll make it a WARN_ON_ONCE() to be nice to the kernel log.
Although the current patchset does this with WARN_ON_ONCE(1, ...)
instead, this serves to ensure that we aren't dependent on the page
allocator's implementation to always loop for order <
PAGE_ALLOC_COSTLY_ORDER, in which case the loop in the _nofail()
functions would actually do something.

> > I think it's really sad that the caller can't know what the upper bounds
> > of its memory requirement are ahead of time or at least be able to
> > implement a memory freeing function when kmalloc() returns NULL.
>
> Oh, we can determine an upper bound. You might just not like it.
> Actually ext3/ext4 shouldn't be as bad as XFS, which Dave estimated to
> be around 400k for a transaction. My guess is that the worst case for
> ext3/ext4 is probably around 256k or so; like XFS, most of the time,
> it would be a lot less. (At least, if data != journalled; if we are
> doing data journalling and every single data block begins with
> 0xc03b3998U, we'll need to allocate a 4k page for every single data
> block written.) We could dynamically calculate an upper bound if we
> had to. Of course, if ext3/ext4 is attached to a network block
> device, then it could get a lot worse than 256k, of course.
>

On my 8GB machine, /proc/zoneinfo says the min watermark for ZONE_NORMAL
is 5086 pages, or ~20MB. GFP_ATOMIC would allow access to ~12MB of that,
so perhaps we should consider this an acceptable abuse of GFP_ATOMIC as
a fallback behavior when GFP_NOFS or GFP_NOIO fails?
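The ~12MB figure follows from how the allocator trims the min watermark for
atomic allocations, as I understand the slowpath's watermark handling
(ALLOC_HIGH halves it, ALLOC_HARDER removes a further quarter). A minimal
runnable sketch of that arithmetic, using the 5086-page figure above and
assuming 4k pages:

    #include <stdio.h>

    int main(void)
    {
            long page_size = 4096;
            long min_pages = 5086;              /* min watermark from /proc/zoneinfo */
            long min = min_pages * page_size;   /* ~20MB */

            long atomic_min = min;
            atomic_min -= atomic_min / 2;       /* ALLOC_HIGH: __GFP_HIGH is set in GFP_ATOMIC */
            atomic_min -= atomic_min / 4;       /* ALLOC_HARDER: atomic (non-waiting) allocations */

            /* Memory below the normal watermark that GFP_ATOMIC may still reach. */
            printf("reserve usable by GFP_ATOMIC: ~%ld MB of ~%ld MB\n",
                   (min - atomic_min) >> 20, min >> 20);
            return 0;
    }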
David Rientjes
2010-Aug-26 03:12 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, Christoph Lameter wrote:

> > Right, I thought you said the slab layer passes __GFP_NOFAIL when there
> > are no objects available.
>
> Yes, the slab layer calls the page allocator when there are no objects
> available and passes the __GFP_NOFAIL that the user may have set in the
> call to the page allocator.
>
> Why then add new functions that do the same?
>

Because we can remove the flag, remove branches from the page allocator
slowpath, and none of these allocations actually are dependent on
__GFP_NOFAIL since they are all under PAGE_ALLOC_COSTLY_ORDER.
Dave Chinner
2010-Aug-26 06:38 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, Aug 25, 2010 at 09:48:47PM -0400, Ted Ts'o wrote:
> On Wed, Aug 25, 2010 at 05:30:42PM -0700, David Rientjes wrote:
> >
> > We certainly hope that nobody will reimplement the same function without
> > the __deprecated warning, especially for order < PAGE_ALLOC_COSTLY_ORDER
> > where there's no looping at a higher level. So perhaps the best
> > alternative is to implement the same _nofail() functions but do a
> > WARN_ON(get_order(size) > PAGE_ALLOC_COSTLY_ORDER) instead?
>
> Yeah, that sounds better.
>
> > I think it's really sad that the caller can't know what the upper bounds
> > of its memory requirement are ahead of time or at least be able to
> > implement a memory freeing function when kmalloc() returns NULL.
>
> Oh, we can determine an upper bound. You might just not like it.
> Actually ext3/ext4 shouldn't be as bad as XFS, which Dave estimated to
> be around 400k for a transaction.

For a 4k block size filesystem. If I use 64k block size directories
(which XFS can even on 4k page size machines), the maximum transaction
reservation goes up to around 3MB, and that's just for blocks being
_modified_. It's not the limit on the amount of memory that may need to
be allocated during a transaction....

> My guess is that the worst case for
> ext3/ext4 is probably around 256k or so; like XFS, most of the time,
> it would be a lot less.

Right, it usually is a lot less, but one of the big problems is that
during low memory situations memory reclaim of the metadata page cache
actually causes _more_ memory allocation during transactions than
otherwise would occur.......

Cheers,

Dave.
--
Dave Chinner
david@fromorbit.com
Dave Chinner
2010-Aug-26 07:06 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, Aug 25, 2010 at 08:09:21PM -0700, David Rientjes wrote:
> On Wed, 25 Aug 2010, Ted Ts'o wrote:
> > > I think it's really sad that the caller can't know what the upper bounds
> > > of its memory requirement are ahead of time or at least be able to
> > > implement a memory freeing function when kmalloc() returns NULL.
> >
> > Oh, we can determine an upper bound. You might just not like it.
> > Actually ext3/ext4 shouldn't be as bad as XFS, which Dave estimated to
> > be around 400k for a transaction. My guess is that the worst case for
> > ext3/ext4 is probably around 256k or so; like XFS, most of the time,
> > it would be a lot less. (At least, if data != journalled; if we are
> > doing data journalling and every single data block begins with
> > 0xc03b3998U, we'll need to allocate a 4k page for every single data
> > block written.) We could dynamically calculate an upper bound if we
> > had to. Of course, if ext3/ext4 is attached to a network block
> > device, then it could get a lot worse than 256k, of course.
> >
>
> On my 8GB machine, /proc/zoneinfo says the min watermark for ZONE_NORMAL
> is 5086 pages, or ~20MB. GFP_ATOMIC would allow access to ~12MB of that,
> so perhaps we should consider this an acceptable abuse of GFP_ATOMIC as
> a fallback behavior when GFP_NOFS or GFP_NOIO fails?

It would take a handful of concurrent transactions in XFS with worst
case memory allocation requirements to exhaust that pool, and then we
really would be in trouble. Alternatively, it would take a few
allocations from each of a couple of thousand concurrent transactions
to get to the same point. Bounded memory pools only work when
serialised access to the pool can be enforced and there are no
dependencies on other operations in progress for completion of the work
and freeing of the memory. This is where it becomes exceedingly
difficult to guarantee progress.

One of the ideas that has floated around (I think Mel Gorman came up
with it first) was that if hardening the filesystem is so difficult,
why not just harden a single path via a single thread? e.g. we allow
the bdi flusher thread to have a separate reserve pool of free pages,
and when memory allocations start to fail, then that thread can dip
into its pool to complete the writeback of the dirty pages being
flushed.

When a filesystem attaches to a bdi, it can specify the size of the
reserve pool it needs. This can be easily tested for during allocation
(say a PF_ flag) and switched to the reserve pool as necessary. Because
it is per-thread, access to the pool is guaranteed to be serialised.
Memory reclaim can then refill these pools before putting pages on
freelists.

This could give us a mechanism for ensuring that allocations succeed in
the ->writepage path without needing to care about filesystem
implementation details. And in the case of ext3/4, a pool could be
attached to the jbd thread as well so that it never starves of memory
when commits are required...

So, rather than turning filesystems upside down, maybe we should
revisit per-thread reserve pools for threads that are tasked with
cleaning pages for the VM?

Cheers,

Dave.
--
Dave Chinner
david@fromorbit.com
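To make the idea concrete, here is a hypothetical sketch of what such a
per-thread reserve might look like; PF_BDI_RESERVE, struct task_reserve,
current_reserve() and take_from_reserve() are invented names for
illustration only, not an existing kernel API:

    /* Hypothetical per-thread page reserve for writeback threads. */
    struct task_reserve {
            struct list_head pages;     /* free pages set aside for this thread */
            unsigned int nr_pages;      /* current size of the reserve */
            unsigned int min_pages;     /* size requested by the filesystem's bdi */
    };

    static struct page *alloc_page_writeback(gfp_t gfp_mask)
    {
            struct page *page = alloc_page(gfp_mask | __GFP_NOWARN);

            if (page)
                    return page;

            /* Only the designated flusher thread may dip into its reserve. */
            if (current->flags & PF_BDI_RESERVE)
                    page = take_from_reserve(current_reserve());

            return page;    /* reclaim refills the reserve before the freelists */
    }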
Peter Zijlstra
2010-Aug-26 08:29 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 2010-08-25 at 20:09 -0700, David Rientjes wrote:
> > Oh, we can determine an upper bound. You might just not like it.
> > Actually ext3/ext4 shouldn't be as bad as XFS, which Dave estimated to
> > be around 400k for a transaction. My guess is that the worst case for
> > ext3/ext4 is probably around 256k or so; like XFS, most of the time,
> > it would be a lot less. (At least, if data != journalled; if we are
> > doing data journalling and every single data block begins with
> > 0xc03b3998U, we'll need to allocate a 4k page for every single data
> > block written.) We could dynamically calculate an upper bound if we
> > had to. Of course, if ext3/ext4 is attached to a network block
> > device, then it could get a lot worse than 256k, of course.
>
> On my 8GB machine, /proc/zoneinfo says the min watermark for ZONE_NORMAL
> is 5086 pages, or ~20MB. GFP_ATOMIC would allow access to ~12MB of that,
> so perhaps we should consider this an acceptable abuse of GFP_ATOMIC as
> a fallback behavior when GFP_NOFS or GFP_NOIO fails?

Agreed with the fact that 400k isn't much to worry about.

Not agreed with the GFP_ATOMIC stmt. Direct reclaim already has
PF_MEMALLOC, but then we also need a concurrency limit on that,
otherwise you can still easily blow through your reserves by having 100
concurrent users of it, resulting in an upper bound of 40000k instead,
which will be too much.

There were patches to limit the direct reclaim contexts, not sure they
ever got anywhere.. It is something to consider in the re-design of the
whole direct-reclaim/writeback paths though..
Christoph Lameter
2010-Aug-26 14:16 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 25 Aug 2010, David Rientjes wrote:

> Because we can remove the flag, remove branches from the page allocator
> slowpath, and none of these allocations actually are dependent on
> __GFP_NOFAIL since they are all under PAGE_ALLOC_COSTLY_ORDER.

Then we can simply remove __GFP_NOFAIL? Functions are only needed for
higher order allocs that can fail?
David Rientjes
2010-Aug-26 22:31 UTC
Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Thu, 26 Aug 2010, Christoph Lameter wrote:

> > Because we can remove the flag, remove branches from the page allocator
> > slowpath, and none of these allocations actually are dependent on
> > __GFP_NOFAIL since they are all under PAGE_ALLOC_COSTLY_ORDER.
>
> Then we can simply remove __GFP_NOFAIL? Functions are only needed for
> higher order allocs that can fail?
>

Yes, that's the intent. We'd like to add the
WARN_ON_ONCE(get_order(size) >= PAGE_ALLOC_COSTLY_ORDER) warning,
though, so we're ensured that redefinition of that #define doesn't cause
allocations that lack appropriate error handling to fail, and that
future callers don't use higher order allocs. The _nofail() functions
help that and do some due diligence in ensuring that we aren't changing
gfp flags based only on the current page allocator implementation, which
may later change with very specialized corner cases.
David Rientjes
2010-Sep-02 01:02 UTC
[patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
Add kmalloc_nofail(), kcalloc_nofail(), and kzalloc_nofail(). These functions are equivalent to kmalloc(), kcalloc(), and kzalloc(), respectively, except that they will never return NULL and instead loop forever trying to allocate memory. If the first allocation attempt fails because the page allocator doesn''t implicitly loop, a warning will be emitted, including a call trace. Subsequent failures will suppress this warning. These were added as helper functions for documentation and auditability. No future callers should be added. Signed-off-by: David Rientjes <rientjes@google.com> --- drivers/md/dm-region-hash.c | 2 +- fs/btrfs/inode.c | 2 +- fs/gfs2/log.c | 2 +- fs/gfs2/rgrp.c | 18 +++++---------- fs/jbd/transaction.c | 11 ++------ fs/reiserfs/journal.c | 3 +- include/linux/slab.h | 51 +++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 64 insertions(+), 25 deletions(-) diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -290,7 +290,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region) nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); if (unlikely(!nreg)) - nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL); + nreg = kmalloc_nofail(sizeof(*nreg), GFP_NOIO); nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? DM_RH_CLEAN : DM_RH_NOSYNC; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1967,7 +1967,7 @@ void btrfs_add_delayed_iput(struct inode *inode) if (atomic_add_unless(&inode->i_count, -1, 1)) return; - delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); + delayed = kmalloc_nofail(sizeof(*delayed), GFP_NOFS); delayed->inode = inode; spin_lock(&fs_info->delayed_iput_lock); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -709,7 +709,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) } trace_gfs2_log_flush(sdp, 1); - ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL); + ai = kzalloc_nofail(sizeof(struct gfs2_ail), GFP_NOFS); INIT_LIST_HEAD(&ai->ai_ail1_list); INIT_LIST_HEAD(&ai->ai_ail2_list); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1440,8 +1440,8 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, rgrp_blk++; if (!bi->bi_clone) { - bi->bi_clone = kmalloc(bi->bi_bh->b_size, - GFP_NOFS | __GFP_NOFAIL); + bi->bi_clone = kmalloc_nofail(bi->bi_bh->b_size, + GFP_NOFS); memcpy(bi->bi_clone + bi->bi_offset, bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); @@ -1759,9 +1759,6 @@ fail: * @block: the block * * Figure out what RG a block belongs to and add that RG to the list - * - * FIXME: Don''t use NOFAIL - * */ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, @@ -1789,8 +1786,8 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, if (rlist->rl_rgrps == rlist->rl_space) { new_space = rlist->rl_space + 10; - tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *), - GFP_NOFS | __GFP_NOFAIL); + tmp = kcalloc_nofail(new_space, sizeof(struct gfs2_rgrpd *), + GFP_NOFS); if (rlist->rl_rgd) { memcpy(tmp, rlist->rl_rgd, @@ -1811,17 +1808,14 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, * @rlist: the list of resource groups * @state: the lock state to acquire the RG lock in * @flags: the modifier flags for the holder structures - * - * FIXME: Don''t use NOFAIL - * */ void gfs2_rlist_alloc(struct 
gfs2_rgrp_list *rlist, unsigned int state) { unsigned int x; - rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder), - GFP_NOFS | __GFP_NOFAIL); + rlist->rl_ghs = kcalloc_nofail(rlist->rl_rgrps, + sizeof(struct gfs2_holder), GFP_NOFS); for (x = 0; x < rlist->rl_rgrps; x++) gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, state, 0, diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -98,14 +98,9 @@ static int start_this_handle(journal_t *journal, handle_t *handle) } alloc_transaction: - if (!journal->j_running_transaction) { - new_transaction = kzalloc(sizeof(*new_transaction), - GFP_NOFS|__GFP_NOFAIL); - if (!new_transaction) { - ret = -ENOMEM; - goto out; - } - } + if (!journal->j_running_transaction) + new_transaction = kzalloc_nofail(sizeof(*new_transaction), + GFP_NOFS); jbd_debug(3, "New handle %p going live.\n", handle); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2593,8 +2593,7 @@ static int journal_read(struct super_block *sb) static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) { struct reiserfs_journal_list *jl; - jl = kzalloc(sizeof(struct reiserfs_journal_list), - GFP_NOFS | __GFP_NOFAIL); + jl = kzalloc_nofail(sizeof(struct reiserfs_journal_list), GFP_NOFS); INIT_LIST_HEAD(&jl->j_list); INIT_LIST_HEAD(&jl->j_working_list); INIT_LIST_HEAD(&jl->j_tail_bh_list); diff --git a/include/linux/slab.h b/include/linux/slab.h --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -334,6 +334,57 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node) return kmalloc_node(size, flags | __GFP_ZERO, node); } +/** + * kmalloc_nofail - infinitely loop until kmalloc() succeeds. + * @size: how many bytes of memory are required. + * @flags: the type of memory to allocate (see kmalloc). + * + * NOTE: no new callers of this function should be implemented! + * All memory allocations should be failable whenever possible. + */ +static inline void *kmalloc_nofail(size_t size, gfp_t flags) +{ + void *ret; + + for (;;) { + ret = kmalloc(size, flags); + if (ret) + return ret; + WARN_ON_ONCE(get_order(size) > PAGE_ALLOC_COSTLY_ORDER); + } +} + +/** + * kcalloc_nofail - infinitely loop until kcalloc() succeeds. + * @n: number of elements. + * @size: element size. + * @flags: the type of memory to allocate (see kcalloc). + * + * NOTE: no new callers of this function should be implemented! + * All memory allocations should be failable whenever possible. + */ +static inline void *kcalloc_nofail(size_t n, size_t size, gfp_t flags) +{ + void *ret; + + for (;;) { + ret = kcalloc(n, size, flags); + if (ret) + return ret; + WARN_ON_ONCE(get_order(size) > PAGE_ALLOC_COSTLY_ORDER); + } +} + +/** + * kzalloc_nofail - infinitely loop until kzalloc() succeeds. + * @size: how many bytes of memory are required. + * @flags: the type of memory to allocate (see kzalloc). + */ +static inline void *kzalloc_nofail(size_t size, gfp_t flags) +{ + return kmalloc_nofail(size, flags | __GFP_ZERO); +} + void __init kmem_cache_init_late(void); #endif /* _LINUX_SLAB_H */
David Rientjes
2010-Sep-02 01:03 UTC
[patch v2 4/5] btrfs: add nofail variant of set_extent_dirty
Add set_extent_dirty_nofail(). This function is equivalent to set_extent_dirty(), except that it will never fail because of allocation failure and instead loop forever trying to allocate memory. If the first allocation attempt fails because the page allocator doesn''t implicitly loop, a warning will be emitted, including a call trace. Subsequent failures will suppress this warning. This was added as a helper function for documentation and auditability. No future callers should be added. Signed-off-by: David Rientjes <rientjes@google.com> --- fs/btrfs/extent-tree.c | 8 ++++---- fs/btrfs/extent_io.c | 18 ++++++++++++++++++ fs/btrfs/extent_io.h | 2 ++ 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3831,9 +3831,9 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - set_extent_dirty(info->pinned_extents, + set_extent_dirty_nofail(info->pinned_extents, bytenr, bytenr + num_bytes - 1, - GFP_NOFS | __GFP_NOFAIL); + GFP_NOFS); } btrfs_put_block_group(cache); total -= num_bytes; @@ -3872,8 +3872,8 @@ static int pin_down_extent(struct btrfs_root *root, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - set_extent_dirty(root->fs_info->pinned_extents, bytenr, - bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); + set_extent_dirty_nofail(root->fs_info->pinned_extents, bytenr, + bytenr + num_bytes - 1, GFP_NOFS); return 0; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -940,6 +940,24 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, NULL, mask); } +/* + * NOTE: no new callers of this function should be implemented! + * All memory allocations should be failable whenever possible. + */ +int set_extent_dirty_nofail(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + int ret; + + for (;;) { + ret = set_extent_dirty(tree, start, end, mask); + if (ret != -ENOMEM) + return ret; + WARN_ON_ONCE(get_order(sizeof(struct extent_state)) > + PAGE_ALLOC_COSTLY_ORDER); + } +} + int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -197,6 +197,8 @@ int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int set_extent_dirty_nofail(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Jiri Slaby
2010-Sep-02 07:59 UTC
Re: [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On 09/02/2010 03:02 AM, David Rientjes wrote:
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -334,6 +334,57 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
>  	return kmalloc_node(size, flags | __GFP_ZERO, node);
>  }
>
> +/**
> + * kmalloc_nofail - infinitely loop until kmalloc() succeeds.
> + * @size: how many bytes of memory are required.
> + * @flags: the type of memory to allocate (see kmalloc).
> + *
> + * NOTE: no new callers of this function should be implemented!
> + * All memory allocations should be failable whenever possible.
> + */
> +static inline void *kmalloc_nofail(size_t size, gfp_t flags)
> +{
> +	void *ret;
> +
> +	for (;;) {
> +		ret = kmalloc(size, flags);
> +		if (ret)
> +			return ret;
> +		WARN_ON_ONCE(get_order(size) > PAGE_ALLOC_COSTLY_ORDER);

This doesn't work as you expect. kmalloc will warn every time it fails.
__GFP_NOFAIL used to disable the warning. Actually what's wrong with
__GFP_NOFAIL? I cannot find a reason in the changelogs why the patches
are needed.

> +	}

--
js
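The warning Jiri refers to is the page allocator's own failure report,
which fires on any failed allocation unless __GFP_NOWARN is passed. A
paraphrase of that failure path from memory of the 2.6.3x slowpath, wrapped
in a helper function purely for presentation:

    /* Paraphrase of the failure-path warning in the page allocator slowpath. */
    static struct page *warn_alloc_failed_and_give_up(gfp_t gfp_mask, int order)
    {
            if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
                    printk(KERN_WARNING "%s: page allocation failure."
                            " order:%d, mode:0x%x\n",
                            current->comm, order, gfp_mask);
                    dump_stack();
                    show_mem();
            }
            return NULL;
    }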
Jan Kara
2010-Sep-02 14:51 UTC
Re: [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Thu 02-09-10 09:59:13, Jiri Slaby wrote:

> On 09/02/2010 03:02 AM, David Rientjes wrote:
> > --- a/include/linux/slab.h
> > +++ b/include/linux/slab.h
> > @@ -334,6 +334,57 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
> >  	return kmalloc_node(size, flags | __GFP_ZERO, node);
> >  }
> >
> > +/**
> > + * kmalloc_nofail - infinitely loop until kmalloc() succeeds.
> > + * @size: how many bytes of memory are required.
> > + * @flags: the type of memory to allocate (see kmalloc).
> > + *
> > + * NOTE: no new callers of this function should be implemented!
> > + * All memory allocations should be failable whenever possible.
> > + */
> > +static inline void *kmalloc_nofail(size_t size, gfp_t flags)
> > +{
> > +	void *ret;
> > +
> > +	for (;;) {
> > +		ret = kmalloc(size, flags);
> > +		if (ret)
> > +			return ret;
> > +		WARN_ON_ONCE(get_order(size) > PAGE_ALLOC_COSTLY_ORDER);
>
> This doesn't work as you expect. kmalloc will warn every time it fails.
> __GFP_NOFAIL used to disable the warning. Actually what's wrong with
> __GFP_NOFAIL? I cannot find a reason in the changelogs why the patches
> are needed.

David should probably add the reasoning to the changelogs so that he
doesn't have to explain again and again ;). But if I understood it
correctly, the concern is that the looping checks slightly impact the
fast path of the callers which do not need it. Generally, looping for a
long time inside the allocator isn't a nice thing either, but some
callers aren't able to do better for now, so the patch is imperfect in
this sense...

								Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
Neil Brown
2010-Sep-02 21:15 UTC
Re: [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Thu, 2 Sep 2010 16:51:41 +0200 Jan Kara <jack@suse.cz> wrote:

> On Thu 02-09-10 09:59:13, Jiri Slaby wrote:
> > On 09/02/2010 03:02 AM, David Rientjes wrote:
> > > --- a/include/linux/slab.h
> > > +++ b/include/linux/slab.h
> > > @@ -334,6 +334,57 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
> > >  	return kmalloc_node(size, flags | __GFP_ZERO, node);
> > >  }
> > >
> > > +/**
> > > + * kmalloc_nofail - infinitely loop until kmalloc() succeeds.
> > > + * @size: how many bytes of memory are required.
> > > + * @flags: the type of memory to allocate (see kmalloc).
> > > + *
> > > + * NOTE: no new callers of this function should be implemented!
> > > + * All memory allocations should be failable whenever possible.
> > > + */
> > > +static inline void *kmalloc_nofail(size_t size, gfp_t flags)
> > > +{
> > > +	void *ret;
> > > +
> > > +	for (;;) {
> > > +		ret = kmalloc(size, flags);
> > > +		if (ret)
> > > +			return ret;
> > > +		WARN_ON_ONCE(get_order(size) > PAGE_ALLOC_COSTLY_ORDER);
> >
> > This doesn't work as you expect. kmalloc will warn every time it fails.
> > __GFP_NOFAIL used to disable the warning. Actually what's wrong with
> > __GFP_NOFAIL? I cannot find a reason in the changelogs why the patches
> > are needed.
>
> David should probably add the reasoning to the changelogs so that he
> doesn't have to explain again and again ;). But if I understood it
> correctly, the concern is that the looping checks slightly impact the
> fast path of the callers which do not need it. Generally, looping for a
> long time inside the allocator isn't a nice thing either, but some
> callers aren't able to do better for now, so the patch is imperfect in
> this sense...

I'm actually a bit confused about this too.

I thought David said he was removing a branch on the *slow* path - which
makes sense as you wouldn't even test NOFAIL until you had a failure.
Why are branches on the slow-path an issue??

This is an important question to me because I still hope to see the
swap-over-nfs patches merged eventually and they add a branch on the
slow path (if I remember correctly).

NeilBrown
David Rientjes
2010-Sep-05 23:01 UTC
Re: [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Thu, 2 Sep 2010, Jiri Slaby wrote:

> > @@ -334,6 +334,57 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
> >  	return kmalloc_node(size, flags | __GFP_ZERO, node);
> >  }
> >
> > +/**
> > + * kmalloc_nofail - infinitely loop until kmalloc() succeeds.
> > + * @size: how many bytes of memory are required.
> > + * @flags: the type of memory to allocate (see kmalloc).
> > + *
> > + * NOTE: no new callers of this function should be implemented!
> > + * All memory allocations should be failable whenever possible.
> > + */
> > +static inline void *kmalloc_nofail(size_t size, gfp_t flags)
> > +{
> > +	void *ret;
> > +
> > +	for (;;) {
> > +		ret = kmalloc(size, flags);
> > +		if (ret)
> > +			return ret;
> > +		WARN_ON_ONCE(get_order(size) > PAGE_ALLOC_COSTLY_ORDER);
>
> This doesn't work as you expect. kmalloc will warn every time it fails.

It actually does work as I expect since the WARN_ON_ONCE() never even
gets triggered here for any of kmalloc_nofail()'s callers since they all
have sizes small enough that the conditional is never true. The page
allocator implicitly loops forever if the order <=
PAGE_ALLOC_COSTLY_ORDER, so this warning is only emitted if the page
allocator's implementation of that behavior, or the #define itself,
changes. That's intended, we want to know the consequences of our
change.

> __GFP_NOFAIL used to disable the warning.

No, it didn't, it was unnecessary for all of the kmalloc_nofail()
callers since they already implicitly loop. No warning is emitted
(these are all GFP_NOFS or GFP_NOIO users, there are no order-4 or
larger GFP_ATOMIC allocators using this interface).

> Actually what's wrong with
> __GFP_NOFAIL? I cannot find a reason in the changelogs why the patches
> are needed.
>

A couple of reasons:

 - it's unnecessary in the page allocator, it can be implemented at a
   higher level, which allows us to remove these branches from the
   slowpath,

 - it's mostly unnecessary since all users have orders that will
   implicitly loop forever anyway (although __GFP_NOFAIL does guarantee
   that it won't fail even if we change the looping behavior internally,
   these warnings help to isolate cases where it's needed), and
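The implicit looping David describes comes from the retry decision in the
slowpath; a simplified rendering of should_alloc_retry() as it looked in
that era (the exact comments and ordering in mainline may differ):

    static inline int should_alloc_retry(gfp_t gfp_mask, unsigned int order,
                                         unsigned long pages_reclaimed)
    {
            /* Callers asking not to retry get the failure back immediately. */
            if (gfp_mask & __GFP_NORETRY)
                    return 0;

            /*
             * Small orders loop forever in this implementation, which is why
             * __GFP_NOFAIL is redundant for them today -- but only today.
             */
            if (order <= PAGE_ALLOC_COSTLY_ORDER)
                    return 1;

            /* Costly orders retry only while reclaim keeps making progress... */
            if ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))
                    return 1;

            /* ...unless the caller explicitly demanded that we never fail. */
            if (gfp_mask & __GFP_NOFAIL)
                    return 1;

            return 0;
    }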
David Rientjes
2010-Sep-05 23:03 UTC
Re: [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Fri, 3 Sep 2010, Neil Brown wrote:

> I'm actually a bit confused about this too.
> I thought David said he was removing a branch on the *slow* path - which
> makes sense as you wouldn't even test NOFAIL until you had a failure.
> Why are branches on the slow-path an issue??

They aren't necessarily an issue in the performance sense; this is a
cleanup series, since all callers converted to these new functions (and
the eventual removal of __GFP_NOFAIL entirely) are using the bit
unnecessarily: they all have orders that implicitly loop in the page
allocator forever already, with or without the flag.
David Rientjes
2010-Sep-06 09:05 UTC
Re: [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
On Wed, 1 Sep 2010, David Rientjes wrote:

> Add kmalloc_nofail(), kcalloc_nofail(), and kzalloc_nofail(). These
> functions are equivalent to kmalloc(), kcalloc(), and kzalloc(),
> respectively, except that they will never return NULL and instead loop
> forever trying to allocate memory.
>
> If the first allocation attempt fails because the page allocator doesn't
> implicitly loop, a warning will be emitted, including a call trace.
> Subsequent failures will suppress this warning.
>
> These were added as helper functions for documentation and auditability.
> No future callers should be added.
>

Are there any objections to merging this series through -mm with the
exception of the fifth patch for ntfs? That particular patch needs to
have its WARN_ON_ONCE() condition rewritten since it falls back to
vmalloc for high order allocs.

Thanks.