Dan Magenheimer
2010-Mar-09 23:20 UTC
[Xen-devel] [PATCH] [post-4.0] tmem: add page deduplication
(This is for post-4.0, but I'm posting now for feedback.)

Add a "page deduplication" capability to the Xen side of tmem (transparent to tmem-enabled guests). Ephemeral pages that have exactly the same content are "combined" so that only one page frame is needed. Since ephemeral pages are essentially read-only, no copy-on-write (and thus no equivalent of swapping) is necessary.

Anybody know of any good fast (SSE2?) assembly memory compare routines? (See tmh_page_cmp() in the patch; a rough sketch of one possible SSE2 approach appears after the patch.)

Points of interest:
- Modifications to the LRU eviction algorithm to accommodate dedup'ed pages.
- New data structures to allow lookup of matching pages and to track references. (The algorithm is similar to that used by KSM in KVM/Linux: no hashing required.)
- The lock (and rbtree) is chosen by the first byte of page data, allowing reasonably high concurrency without greatly complicating lock management.
- Statistics added so the "dedup ratio" can be monitored. (A note on reading the new fields follows the patch.)
- Dedup is enabled/disabled by a Xen command line option (see the note after the patch).

I'm seeing a 1.08-1.52 dedup ratio for two self-ballooned guests simultaneously/continuously building Linux; that's up to a 34% reduction in the physical ephemeral pages used by tmem. Clearly this is very workload-dependent; YMMV. To obtain this savings, approximately double the time is spent in tmem (increasing from roughly 0.1% to roughly 0.2%). This compares favorably to compression, which costs approximately 10x for an approximate savings of 50%.

Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>

diff -r b8d2a4134a68 tools/misc/xen-tmem-list-parse.c
--- a/tools/misc/xen-tmem-list-parse.c  Wed Mar 03 17:41:58 2010 +0000
+++ b/tools/misc/xen-tmem-list-parse.c  Tue Mar 09 14:58:37 2010 -0700
@@ -110,13 +110,20 @@ void parse_global(char *s)
     unsigned long long rtree_node_max = parse(s,"Nm");
     unsigned long long pgp_count = parse(s,"Pc");
     unsigned long long pgp_max = parse(s,"Pm");
+    unsigned long long page_count = parse(s,"Fc");
+    unsigned long long max_page_count = parse(s,"Fm");
+    unsigned long long pcd_count = parse(s,"Sc");
+    unsigned long long max_pcd_count = parse(s,"Sm");
 
     printf("total tmem ops=%llu (errors=%llu) -- tmem pages avail=%llu\n",
            total_ops, errored_ops, avail_pages);
     printf("datastructs: objs=%llu (max=%llu) pgps=%llu (max=%llu) "
-           "nodes=%llu (max=%llu)\n",
+           "nodes=%llu (max=%llu) pages=%llu (max=%llu) pcds=%llu (max=%llu) "
+           "dedup ratio=%f\n",
            obj_count, obj_max, pgp_count, pgp_max,
-           rtree_node_count, rtree_node_max);
+           rtree_node_count, rtree_node_max,
+           page_count,max_page_count,pcd_count,max_pcd_count,
+           (pcd_count==0)?0.0:(global_eph_count*1.0)/pcd_count);
     printf("misc: failed_copies=%llu alloc_failed=%llu alloc_page_failed=%llu "
            "low_mem=%llu evicted=%llu/%llu relinq=%llu/%llu, "
            "max_evicts_per_relinq=%llu, flush_pools=%llu, "
diff -r b8d2a4134a68 xen/common/tmem.c
--- a/xen/common/tmem.c Wed Mar 03 17:41:58 2010 +0000
+++ b/xen/common/tmem.c Tue Mar 09 14:58:37 2010 -0700
@@ -79,6 +79,7 @@ static unsigned long low_on_memory = 0;
 static unsigned long low_on_memory = 0;
 static int global_obj_count_max = 0;
 static int global_pgp_count_max = 0;
+static int global_pcd_count_max = 0;
 static int global_page_count_max = 0;
 static int global_rtree_node_count_max = 0;
 static long global_eph_count_max = 0;
@@ -108,6 +109,7 @@ DECL_CYC_COUNTER(decompress);
 
 struct tm_pool;
 struct tmem_page_descriptor;
+struct tmem_page_content_descriptor;
 struct client {
     struct list_head client_list;
     struct tm_pool *pools[MAX_POOLS_PER_DOMAIN];
@@ -219,12 +221,17 @@ struct tmem_page_descriptor {
         obj_t *obj;
         uint64_t inv_oid;  /* used for invalid list only */
     };
-    uint32_t index;
     size_t size; /* 0 == PAGE_SIZE (pfp), -1 == data invalid,
                     else compressed data (cdata) */
+    uint32_t index;
+    /* must hold pcd_tree_rwlocks[firstbyte] to use pcd pointer/siblings */
+    uint16_t firstbyte; /* NOT_SHAREABLE->pfp  otherwise->pcd */
+    bool_t eviction_attempted;  /* CHANGE TO lifetimes? (settable) */
+    struct list_head pcd_siblings;
     union {
         pfp_t *pfp;   /* page frame pointer */
         char *cdata;  /* compressed data */
+        struct tmem_page_content_descriptor *pcd;  /* page dedup */
     };
     union {
         uint64_t timestamp;
@@ -233,6 +240,16 @@ struct tmem_page_descriptor {
     DECL_SENTINEL
 };
 typedef struct tmem_page_descriptor pgp_t;
+
+struct tmem_page_content_descriptor {
+    pfp_t *pfp;
+    struct list_head pgp_list;
+    struct rb_node pcd_rb_tree_node;
+    uint32_t count;
+};
+typedef struct tmem_page_content_descriptor pcd_t;
+struct rb_root pcd_tree_roots[256]; /* choose based on first byte of page */
+rwlock_t pcd_tree_rwlocks[256]; /* poor man's concurrency for now */
 
 static LIST_HEAD(global_ephemeral_page_list); /* all pages in ephemeral pools */
 
@@ -267,6 +284,7 @@ static long global_eph_count = 0; /* ato
 static long global_eph_count = 0; /* atomicity depends on eph_lists_spinlock */
 static atomic_t global_obj_count = ATOMIC_INIT(0);
 static atomic_t global_pgp_count = ATOMIC_INIT(0);
+static atomic_t global_pcd_count = ATOMIC_INIT(0);
 static atomic_t global_page_count = ATOMIC_INIT(0);
 static atomic_t global_rtree_node_count = ATOMIC_INIT(0);
 
@@ -336,6 +354,103 @@ static NOINLINE void tmem_page_free(pool
     atomic_dec_and_assert(global_page_count);
 }
 
+/************ PAGE CONTENT DESCRIPTOR MANIPULATION ROUTINES ***********/
+
+#define NOT_SHAREABLE ((uint16_t)-1UL)
+
+/* ensure pgp no longer points to pcd, nor vice-versa */
+/* take pcd rwlock unless have_pcd_rwlock is set, always unlock when done */
+static NOINLINE void pcd_disassociate(pgp_t *pgp, pool_t *pool, bool_t have_pcd_rwlock)
+{
+    pcd_t *pcd = pgp->pcd;
+    pfp_t *pfp = pgp->pcd->pfp;
+    uint16_t firstbyte = pgp->firstbyte;
+
+    ASSERT(tmh_dedup_enabled());
+    ASSERT(firstbyte != NOT_SHAREABLE);
+    ASSERT(firstbyte < 256);
+    ASSERT(!pgp->size);
+
+    if ( have_pcd_rwlock )
+        ASSERT_WRITELOCK(&pcd_tree_rwlocks[firstbyte]);
+    else
+        tmem_write_lock(&pcd_tree_rwlocks[firstbyte]);
+    list_del_init(&pgp->pcd_siblings);
+    pgp->pcd = NULL;
+    pgp->firstbyte = NOT_SHAREABLE;
+    pgp->size = -1;
+    if ( --pcd->count )
+    {
+        tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
+        return;
+    }
+
+    /* no more references to this pcd, recycle it and the physical page */
+    ASSERT(list_empty(&pcd->pgp_list));
+    pcd->pfp = NULL;
+    /* remove pcd from rbtree */
+    rb_erase(&pcd->pcd_rb_tree_node,&pcd_tree_roots[firstbyte]);
+    /* reinit the struct for safety for now */
+    RB_CLEAR_NODE(&pcd->pcd_rb_tree_node);
+    /* now free up the pcd memory */
+    tmem_free(pcd,sizeof(pcd_t),NULL);
+    atomic_dec_and_assert(global_pcd_count);
+    tmem_page_free(pool,pfp);
+    tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
+}
+
+static NOINLINE int pcd_associate(pgp_t *pgp, uint8_t firstbyte)
+{
+    struct rb_node **new, *parent = NULL;
+    struct rb_root *root;
+    pcd_t *pcd;
+    int cmp;
+
+    if ( !tmh_dedup_enabled() )
+        return 0;
+    ASSERT(pgp->pfp != NULL);
+    ASSERT(pgp->obj != NULL);
+    ASSERT(pgp->obj->pool != NULL);
+    ASSERT(!pgp->obj->pool->persistent);
+    tmem_write_lock(&pcd_tree_rwlocks[firstbyte]);
+    /* look for page match */
+    root = &pcd_tree_roots[firstbyte];
+    new = &(root->rb_node);
+    while ( *new )
+    {
+        pcd = container_of(*new, pcd_t, pcd_rb_tree_node);
+        parent = *new;
+        if ( (cmp = tmh_page_cmp(pgp->pfp,pcd->pfp)) < 0 )
+            new = &((*new)->rb_left);
+        else if ( cmp > 0 )
+            new = &((*new)->rb_right);
+        else
+        {
+            /* match! free the no-longer-needed page frame from the pgp */
+            tmem_page_free(pgp->obj->pool,pgp->pfp);
+            goto match;
+        }
+    }
+    /* no match, alloc a pcd and put it in the tree */
+    if ( (pcd = tmem_malloc(pcd_t, NULL)) == NULL )
+    {
+        tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
+        return -ENOMEM;
+    }
+    atomic_inc_and_max(global_pcd_count);
+    RB_CLEAR_NODE(&pcd->pcd_rb_tree_node);  /* is this necessary */
+    INIT_LIST_HEAD(&pcd->pgp_list);  /* is this necessary */
+    pcd->count = 0;
+    pcd->pfp = pgp->pfp;
+    rb_link_node(&pcd->pcd_rb_tree_node, parent, new);
+    rb_insert_color(&pcd->pcd_rb_tree_node, root);
+match:
+    pcd->count++;
+    list_add(&pgp->pcd_siblings,&pcd->pgp_list);
+    pgp->firstbyte = firstbyte;
+    pgp->eviction_attempted = 0;
+    pgp->pcd = pcd;
+    tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
+    return 0;
+}
+
 /************ PAGE DESCRIPTOR MANIPULATION ROUTINES *******************/
 
 /* allocate a pgp_t and associate it with an object */
@@ -353,6 +468,12 @@ static NOINLINE pgp_t *pgp_alloc(obj_t *
     INIT_LIST_HEAD(&pgp->global_eph_pages);
     INIT_LIST_HEAD(&pgp->client_eph_pages);
     pgp->pfp = NULL;
+    if ( tmh_dedup_enabled() )
+    {
+        pgp->firstbyte = NOT_SHAREABLE;
+        pgp->eviction_attempted = 0;
+        INIT_LIST_HEAD(&pgp->pcd_siblings);
+    }
     pgp->size = -1;
     pgp->index = -1;
     pgp->timestamp = get_cycles();
@@ -376,7 +497,9 @@ static NOINLINE void pgp_free_data(pgp_t
 {
     if ( pgp->pfp == NULL )
         return;
-    if ( !pgp->size )
+    if ( tmh_dedup_enabled() && pgp->firstbyte != NOT_SHAREABLE )
+        pcd_disassociate(pgp,pool,0);
+    else if ( !pgp->size )
         tmem_page_free(pgp->obj->pool,pgp->pfp);
     else
     {
@@ -987,10 +1110,56 @@ static void client_freeze(client_t *clie
 
 /************ MEMORY REVOCATION ROUTINES *******************************/
 
+static bool_t tmem_try_to_evict_pgp(pgp_t *pgp, bool_t *hold_pool_rwlock)
+{
+    obj_t *obj = pgp->obj;
+    pool_t *pool = obj->pool;
+    client_t *client = pool->client;
+    uint16_t firstbyte = pgp->firstbyte;
+
+    if ( pool->is_dying )
+        return 0;
+    if ( tmh_lock_all && !obj->no_evict )
+        return 1;
+    if ( tmem_spin_trylock(&obj->obj_spinlock) )
+    {
+        if ( tmh_dedup_enabled() )
+        {
+            firstbyte = pgp->firstbyte;
+            if ( firstbyte == NOT_SHAREABLE )
+                goto obj_unlock;
+            ASSERT(firstbyte < 256);
+            if ( !tmem_write_trylock(&pcd_tree_rwlocks[firstbyte]) )
+                goto obj_unlock;
+            if ( pgp->pcd->count > 1 && !pgp->eviction_attempted )
+            {
+                pgp->eviction_attempted++;
+                list_del(&pgp->global_eph_pages);
+                list_add_tail(&pgp->global_eph_pages,&global_ephemeral_page_list);
+                list_del(&pgp->client_eph_pages);
+                list_add_tail(&pgp->client_eph_pages,&client->ephemeral_page_list);
+                goto pcd_unlock;
+            }
+        }
+        if ( obj->pgp_count > 1 )
+            return 1;
+        if ( tmem_write_trylock(&pool->pool_rwlock) )
+        {
+            *hold_pool_rwlock = 1;
+            return 1;
+        }
+pcd_unlock:
+        tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
+obj_unlock:
+        tmem_spin_unlock(&obj->obj_spinlock);
+    }
+    return 0;
+}
+
 static int tmem_evict(void)
 {
     client_t *client = tmh_client_from_current();
-    pgp_t *pgp = NULL, *pgp_del;
+    pgp_t *pgp = NULL, *pgp2, *pgp_del;
     obj_t *obj;
     pool_t *pool;
     int ret = 0;
@@ -1001,49 +1170,15 @@ static int tmem_evict(void)
     if ( (client != NULL) && client_over_quota(client) &&
          !list_empty(&client->ephemeral_page_list) )
     {
-        list_for_each_entry(pgp,&client->ephemeral_page_list,client_eph_pages)
-        {
-            obj = pgp->obj;
-            pool = obj->pool;
-            if ( pool->is_dying )
-                continue;
-            if ( tmh_lock_all && !obj->no_evict )
+        list_for_each_entry_safe(pgp,pgp2,&client->ephemeral_page_list,client_eph_pages)
+            if ( tmem_try_to_evict_pgp(pgp,&hold_pool_rwlock) )
                 goto found;
-            if ( tmem_spin_trylock(&obj->obj_spinlock) )
-            {
-                if ( obj->pgp_count > 1 )
-                    goto found;
-                if ( tmem_write_trylock(&pool->pool_rwlock) )
-                {
-                    hold_pool_rwlock = 1;
-                    goto found;
-                }
-                tmem_spin_unlock(&obj->obj_spinlock);
-            }
-        }
     } else if ( list_empty(&global_ephemeral_page_list) ) {
         goto out;
     } else {
-        list_for_each_entry(pgp,&global_ephemeral_page_list,global_eph_pages)
-        {
-            obj = pgp->obj;
-            pool = obj->pool;
-            if ( pool->is_dying )
-                continue;
-            if ( tmh_lock_all && !obj->no_evict )
+        list_for_each_entry_safe(pgp,pgp2,&global_ephemeral_page_list,global_eph_pages)
+            if ( tmem_try_to_evict_pgp(pgp,&hold_pool_rwlock) )
                 goto found;
-            if ( tmem_spin_trylock(&obj->obj_spinlock) )
-            {
-                if ( obj->pgp_count > 1 )
-                    goto found;
-                if ( tmem_write_trylock(&pool->pool_rwlock) )
-                {
-                    hold_pool_rwlock = 1;
-                    goto found;
-                }
-                tmem_spin_unlock(&obj->obj_spinlock);
-            }
-        }
     }
 
     ret = 0;
@@ -1057,10 +1192,16 @@ found:
     ASSERT(obj->no_evict == 0);
     ASSERT(obj->pool != NULL);
     ASSERT_SENTINEL(obj,OBJ);
+    pool = obj->pool;
     ASSERT_SPINLOCK(&obj->obj_spinlock);
     pgp_del = pgp_delete_from_obj(obj, pgp->index);
     ASSERT(pgp_del == pgp);
+    if ( tmh_dedup_enabled() && pgp->firstbyte != NOT_SHAREABLE )
+    {
+        ASSERT(pgp->pcd->count == 1 || pgp->eviction_attempted);
+        pcd_disassociate(pgp,pool,1);
+    }
     pgp_delete(pgp,1);
     if ( obj->pgp_count == 0 )
     {
@@ -1197,6 +1338,11 @@ copy_uncompressed:
     ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,0);
     if ( ret == -EFAULT )
         goto bad_copy;
+    if ( tmh_dedup_enabled() && !is_persistent(pool) )
+    {
+        if ( pcd_associate(pgp,tmh_get_first_byte(pgp->pfp)) == -ENOMEM )
+            goto failed_dup;
+    }
     pgp->size = 0;
 
 done:
@@ -1308,13 +1454,16 @@ copy_uncompressed:
 copy_uncompressed:
     if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
     {
-        ret == -ENOMEM;
+        ret = -ENOMEM;
         goto delete_and_free;
     }
     /* tmh_copy_from_client properly handles len==0 (TMEM_NEW_PAGE) */
     ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,cva);
     if ( ret == -EFAULT )
         goto bad_copy;
+    if ( tmh_dedup_enabled() && !is_persistent(pool) )
+        if ( pcd_associate(pgp,tmh_get_first_byte(pgp->pfp)) == -ENOMEM )
+            goto delete_and_free;
     pgp->size = 0;
 
 insert_page:
@@ -1411,6 +1560,19 @@ static NOINLINE int do_tmem_get(pool_t *
                                   pgp->size, cva) == -EFAULT )
             goto bad_copy;
         END_CYC_COUNTER(decompress);
+    }
+    else if ( tmh_dedup_enabled() && !is_persistent(pool) &&
+              pgp->firstbyte != NOT_SHAREABLE )
+    {
+        uint16_t firstbyte = pgp->firstbyte;
+        int ret;
+
+        tmem_read_lock(&pcd_tree_rwlocks[firstbyte]);
+        ret = tmh_copy_to_client(cmfn, pgp->pcd->pfp, tmem_offset,
+                                 pfn_offset, len, cva);
+        tmem_read_unlock(&pcd_tree_rwlocks[firstbyte]);
+        if ( ret == -EFAULT )
+            goto bad_copy;
     }
     else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset,
                                  pfn_offset, len, cva) == -EFAULT)
@@ -1855,11 +2017,14 @@ static int tmemc_list_global(tmem_cli_va
          total_flush_pool, use_long ? ',' : '\n');
     if (use_long)
         n += scnprintf(info+n,BSIZE-n,
-            "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d\n",
+            "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d,"
+            "Fc:%d,Fm:%d,Sc:%d,Sm:%d\n",
             global_eph_count, global_eph_count_max,
             _atomic_read(global_obj_count), global_obj_count_max,
             _atomic_read(global_rtree_node_count), global_rtree_node_count_max,
-            _atomic_read(global_pgp_count), global_pgp_count_max);
+            _atomic_read(global_pgp_count), global_pgp_count_max,
+            _atomic_read(global_page_count), global_page_count_max,
+            _atomic_read(global_pcd_count), global_pcd_count_max);
     if ( sum + n >= len )
         return sum;
     tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
@@ -2569,10 +2734,18 @@ EXPORT void *tmem_relinquish_pages(unsig
 
 /* called at hypervisor startup */
 EXPORT void init_tmem(void)
 {
+    int i;
     if ( !tmh_enabled() )
         return;
 
     radix_tree_init();
+    if ( tmh_dedup_enabled() )
+        for (i = 0; i < 256; i++ )
+        {
+            pcd_tree_roots[i] = RB_ROOT;
+            rwlock_init(&pcd_tree_rwlocks[i]);
+        }
+
     if ( tmh_init() )
     {
         printk("tmem: initialized comp=%d global-lock=%d\n",
diff -r b8d2a4134a68 xen/common/tmem_xen.c
--- a/xen/common/tmem_xen.c Wed Mar 03 17:41:58 2010 +0000
+++ b/xen/common/tmem_xen.c Tue Mar 09 14:58:37 2010 -0700
@@ -19,6 +19,9 @@ boolean_param("tmem", opt_tmem);
 
 EXPORT int opt_tmem_compress = 0;
 boolean_param("tmem_compress", opt_tmem_compress);
+
+EXPORT int opt_tmem_dedup = 1;
+boolean_param("tmem_dedup", opt_tmem_dedup);
 
 EXPORT int opt_tmem_shared_auth = 0;
 boolean_param("tmem_shared_auth", opt_tmem_shared_auth);
diff -r b8d2a4134a68 xen/include/xen/tmem_xen.h
--- a/xen/include/xen/tmem_xen.h    Wed Mar 03 17:41:58 2010 +0000
+++ b/xen/include/xen/tmem_xen.h    Tue Mar 09 14:58:37 2010 -0700
@@ -52,6 +52,12 @@ static inline int tmh_compression_enable
 static inline int tmh_compression_enabled(void)
 {
     return opt_tmem_compress;
+}
+
+extern int opt_tmem_dedup;
+static inline int tmh_dedup_enabled(void)
+{
+    return opt_tmem_dedup;
 }
 
 extern int opt_tmem_shared_auth;
@@ -326,6 +332,28 @@ static inline bool_t tmh_current_is_priv
     return IS_PRIV(current->domain);
 }
 
+static inline uint8_t tmh_get_first_byte(pfp_t *pfp)
+{
+    void *p = __map_domain_page(pfp);
+
+    return (uint8_t)(*(char *)p);
+}
+
+static inline int tmh_page_cmp(pfp_t *pfp1, pfp_t *pfp2)
+{
+    const uint64_t *p1 = (uint64_t *)__map_domain_page(pfp1);
+    const uint64_t *p2 = (uint64_t *)__map_domain_page(pfp2);
+    int i;
+
+    // FIXME: code in assembly?
+    for ( i = PAGE_SIZE/sizeof(uint64_t); i && *p1 == *p2; i--, p1++, p2++ );
+    if ( !i )
+        return 0;
+    if ( *p1 < *p2 )
+        return -1;
+    return 1;
+}
+
 /* these typedefs are in the public/tmem.h interface
 typedef XEN_GUEST_HANDLE(void) cli_mfn_t;
 typedef XEN_GUEST_HANDLE(char) cli_va_t;
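On the memory-compare question above: here is a rough, untested sketch of what an SSE2-assisted page compare might look like. It is written userspace-style for readability; sse2_page_cmp() is a made-up name and is not part of the patch, and inside the hypervisor the SSE register state would have to be saved/restored around it (or the loop kept to general-purpose registers), so treat it only as a starting point. Any consistent total order is good enough for the rbtree, so falling back to memcmp() byte order on the first mismatching 16-byte chunk works even though tmh_page_cmp() above orders by 64-bit words.

#include <emmintrin.h>
#include <string.h>

/* Hypothetical sketch only: compare two 4KiB pages 16 bytes at a time with
 * SSE2, deferring to memcmp() just to order the first chunk that differs.
 * Assumes the caller has already mapped both pages, as tmh_page_cmp() does. */
static int sse2_page_cmp(const void *a, const void *b)
{
    const char *c1 = a, *c2 = b;
    unsigned int i;

    for ( i = 0; i < 4096; i += 16 )
    {
        __m128i v1 = _mm_loadu_si128((const __m128i *)(c1 + i));
        __m128i v2 = _mm_loadu_si128((const __m128i *)(c2 + i));

        /* movemask is 0xffff iff all 16 byte-compares were equal */
        if ( _mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) != 0xffff )
            return memcmp(c1 + i, c2 + i, 16);
    }
    return 0;
}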
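A note on the new statistics: the parse tool computes dedup ratio = Ec/Sc, i.e. ephemeral pgps per unique page content descriptor, so the page-frame savings is 1 - 1/ratio. For a hypothetical run with 100000 ephemeral pgps backed by 65789 pcds, the ratio is 1.52 and about 34% of the frames are saved; the 1.08 figure quoted above corresponds to only about a 7% savings.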
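Since opt_tmem_dedup defaults to 1, dedup is on whenever tmem is enabled; for an A/B comparison it should be possible to boot with something like "tmem tmem_dedup=0" on the Xen command line (tmem_dedup is just the boolean_param added in tmem_xen.c, so the usual Xen boolean option syntax applies).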